sys/vm/vm_map.c

   1 /*
   2  * (MPSAFE)
   3  *
   4  * Copyright (c) 1991, 1993
   5  *      The Regents of the University of California.  All rights reserved.
   6  *
   7  * This code is derived from software contributed to Berkeley by
   8  * The Mach Operating System project at Carnegie-Mellon University.
   9  *
  10  * Redistribution and use in source and binary forms, with or without
  11  * modification, are permitted provided that the following conditions
  12  * are met:
  13  * 1. Redistributions of source code must retain the above copyright
  14  *    notice, this list of conditions and the following disclaimer.
  15  * 2. Redistributions in binary form must reproduce the above copyright
  16  *    notice, this list of conditions and the following disclaimer in the
  17  *    documentation and/or other materials provided with the distribution.
  18  * 3. Neither the name of the University nor the names of its contributors
  19  *    may be used to endorse or promote products derived from this software
  20  *    without specific prior written permission.
  21  *
  22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  32  * SUCH DAMAGE.
  33  *
  34  *      from: @(#)vm_map.c      8.3 (Berkeley) 1/12/94
  35  *
  36  *
  37  * Copyright (c) 1987, 1990 Carnegie-Mellon University.
  38  * All rights reserved.
  39  *
  40  * Authors: Avadis Tevanian, Jr., Michael Wayne Young
  41  *
  42  * Permission to use, copy, modify and distribute this software and
  43  * its documentation is hereby granted, provided that both the copyright
  44  * notice and this permission notice appear in all copies of the
  45  * software, derivative works or modified versions, and any portions
  46  * thereof, and that both notices appear in supporting documentation.
  47  *
  48  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
  49  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
  50  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
  51  *
  52  * Carnegie Mellon requests users of this software to return to
  53  *
  54  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
  55  *  School of Computer Science
  56  *  Carnegie Mellon University
  57  *  Pittsburgh PA 15213-3890
  58  *
  59  * any improvements or extensions that they make and grant Carnegie the
  60  * rights to redistribute these changes.
  61  *
  62  * $FreeBSD: src/sys/vm/vm_map.c,v 1.187.2.19 2003/05/27 00:47:02 alc Exp $
  63  */
  64
  65 /*
  66  *      Virtual memory mapping module.
  67  */
  68
  69 #include <sys/param.h>
  70 #include <sys/systm.h>
  71 #include <sys/kernel.h>
  72 #include <sys/proc.h>
  73 #include <sys/serialize.h>
  74 #include <sys/lock.h>
  75 #include <sys/vmmeter.h>
  76 #include <sys/mman.h>
  77 #include <sys/vnode.h>
  78 #include <sys/resourcevar.h>
  79 #include <sys/shm.h>
  80 #include <sys/tree.h>
  81 #include <sys/malloc.h>
  82 #include <sys/objcache.h>
  83
  84 #include <vm/vm.h>
  85 #include <vm/vm_param.h>
  86 #include <vm/pmap.h>
  87 #include <vm/vm_map.h>
  88 #include <vm/vm_page.h>
  89 #include <vm/vm_object.h>
  90 #include <vm/vm_pager.h>
  91 #include <vm/vm_kern.h>
  92 #include <vm/vm_extern.h>
  93 #include <vm/swap_pager.h>
  94 #include <vm/vm_zone.h>
  95
  96 #include <sys/random.h>
  97 #include <sys/sysctl.h>
  98 #include <sys/spinlock.h>
  99
 100 #include <sys/thread2.h>
 101 #include <sys/spinlock2.h>
 102
/*
 * Virtual memory maps provide for the mapping, protection, and sharing
 * of virtual memory objects.  In addition, this module provides for an
 * efficient virtual copy of memory from one map to another.
 *
 * Synchronization is required prior to most operations.
 *
 * Maps consist of an ordered doubly-linked list of simple entries.
 * A hint and an RB tree are used to speed up lookups.
 *
 * Callers looking to modify maps specify start/end addresses which cause
 * the related map entry to be clipped if necessary, and then later
 * recombined if the pieces remained compatible.
 *
 * Virtual copy operations are performed by copying VM object references
 * from one map to another, and then marking both regions as copy-on-write.
 */
 120 static boolean_t vmspace_ctor(void *obj, void *privdata, int ocflags);
 121 static void vmspace_dtor(void *obj, void *privdata);
 122 static void vmspace_terminate(struct vmspace *vm, int final);
 123
 124 MALLOC_DEFINE(M_VMSPACE, "vmspace", "vmspace objcache backingstore");
 125 static struct objcache *vmspace_cache;
 126
 127 /*
 128  * per-cpu page table cross mappings are initialized in early boot
 129  * and might require a considerable number of vm_map_entry structures.
 130  */
 131 #define MAPENTRYBSP_CACHE       (MAXCPU+1)
 132 #define MAPENTRYAP_CACHE        8
 133
 134 static struct vm_zone mapentzone_store;
 135 static vm_zone_t mapentzone;
 136
 137 static struct vm_map_entry map_entry_init[MAX_MAPENT];
 138 static struct vm_map_entry cpu_map_entry_init_bsp[MAPENTRYBSP_CACHE];
 139 static struct vm_map_entry cpu_map_entry_init_ap[MAXCPU][MAPENTRYAP_CACHE];
 140
 141 static int randomize_mmap;
 142 SYSCTL_INT(_vm, OID_AUTO, randomize_mmap, CTLFLAG_RW, &randomize_mmap, 0,
 143     "Randomize mmap offsets");
 144 static int vm_map_relock_enable = 1;
 145 SYSCTL_INT(_vm, OID_AUTO, map_relock_enable, CTLFLAG_RW,
 146            &vm_map_relock_enable, 0, "Randomize mmap offsets");
 147
 148 static void vmspace_drop_notoken(struct vmspace *vm);
 149 static void vm_map_entry_shadow(vm_map_entry_t entry, int addref);
 150 static vm_map_entry_t vm_map_entry_create(vm_map_t map, int *);
 151 static void vm_map_entry_dispose (vm_map_t map, vm_map_entry_t entry, int *);
 152 static void _vm_map_clip_end (vm_map_t, vm_map_entry_t, vm_offset_t, int *);
 153 static void _vm_map_clip_start (vm_map_t, vm_map_entry_t, vm_offset_t, int *);
 154 static void vm_map_entry_delete (vm_map_t, vm_map_entry_t, int *);
 155 static void vm_map_entry_unwire (vm_map_t, vm_map_entry_t);
 156 static void vm_map_copy_entry (vm_map_t, vm_map_t, vm_map_entry_t,
 157                 vm_map_entry_t);
 158 static void vm_map_unclip_range (vm_map_t map, vm_map_entry_t start_entry, vm_offset_t start, vm_offset_t end, int *count, int flags);
 159
 160 /*
 161  * Initialize the vm_map module.  Must be called before any other vm_map
 162  * routines.
 163  *
 164  * Map and entry structures are allocated from the general purpose
 165  * memory pool with some exceptions:
 166  *
 167  *      - The kernel map is allocated statically.
 168  *      - Initial kernel map entries are allocated out of a static pool.
 169  *      - We must set ZONE_SPECIAL here or the early boot code can get
 170  *        stuck if there are >63 cores.
 171  *
 172  *      These restrictions are necessary since malloc() uses the
 173  *      maps and requires map entries.
 174  *
 175  * Called from the low level boot code only.
 176  */
 177 void
 178 vm_map_startup(void)
 179 {
 180         mapentzone = &mapentzone_store;
 181         zbootinit(mapentzone, "MAP ENTRY", sizeof (struct vm_map_entry),
 182                   map_entry_init, MAX_MAPENT);
 183         mapentzone_store.zflags |= ZONE_SPECIAL;
 184 }
 185
 186 /*
 187  * Called prior to any vmspace allocations.
 188  *
 189  * Called from the low level boot code only.
 190  */
 191 void
 192 vm_init2(void)
 193 {
 194         vmspace_cache = objcache_create_mbacked(M_VMSPACE,
 195                                                 sizeof(struct vmspace),
 196                                                 0, ncpus * 4,
 197                                                 vmspace_ctor, vmspace_dtor,
 198                                                 NULL);
 199         zinitna(mapentzone, NULL, 0, 0, ZONE_USE_RESERVE | ZONE_SPECIAL);
 200         pmap_init2();
 201         vm_object_init2();
 202 }
 203
 204 /*
 205  * objcache support.  We leave the pmap root cached as long as possible
 206  * for performance reasons.
 207  */
 208 static
 209 boolean_t
 210 vmspace_ctor(void *obj, void *privdata, int ocflags)
 211 {
 212         struct vmspace *vm = obj;
 213
 214         bzero(vm, sizeof(*vm));
 215         vm->vm_refcnt = VM_REF_DELETED;
 216
 217         return 1;
 218 }
 219
 220 static
 221 void
 222 vmspace_dtor(void *obj, void *privdata)
 223 {
 224         struct vmspace *vm = obj;
 225
 226         KKASSERT(vm->vm_refcnt == VM_REF_DELETED);
 227         pmap_puninit(vmspace_pmap(vm));
 228 }
 229
 230 /*
 231  * Red black tree functions
 232  *
 233  * The caller must hold the related map lock.
 234  */
 235 static int rb_vm_map_compare(vm_map_entry_t a, vm_map_entry_t b);
 236 RB_GENERATE(vm_map_rb_tree, vm_map_entry, rb_entry, rb_vm_map_compare);
 237
 238 /* a->start is address, and the only field has to be initialized */
 239 static int
 240 rb_vm_map_compare(vm_map_entry_t a, vm_map_entry_t b)
 241 {
 242         if (a->start < b->start)
 243                 return(-1);
 244         else if (a->start > b->start)
 245                 return(1);
 246         return(0);
 247 }
 248
 249 /*
 250  * Initialize vmspace ref/hold counts vmspace0.  There is a holdcnt for
 251  * every refcnt.
 252  */
 253 void
 254 vmspace_initrefs(struct vmspace *vm)
 255 {
 256         vm->vm_refcnt = 1;
 257         vm->vm_holdcnt = 1;
 258 }
 259
 260 /*
 261  * Allocate a vmspace structure, including a vm_map and pmap.
 262  * Initialize numerous fields.  While the initial allocation is zerod,
 263  * subsequence reuse from the objcache leaves elements of the structure
 264  * intact (particularly the pmap), so portions must be zerod.
 265  *
 266  * Returns a referenced vmspace.
 267  *
 268  * No requirements.
 269  */
 270 struct vmspace *
 271 vmspace_alloc(vm_offset_t min, vm_offset_t max)
 272 {
 273         struct vmspace *vm;
 274
 275         vm = objcache_get(vmspace_cache, M_WAITOK);
 276
 277         bzero(&vm->vm_startcopy,
 278               (char *)&vm->vm_endcopy - (char *)&vm->vm_startcopy);
 279         vm_map_init(&vm->vm_map, min, max, NULL);       /* initializes token */
 280
 281         /*
 282          * NOTE: hold to acquires token for safety.
 283          *
 284          * On return vmspace is referenced (refs=1, hold=1).  That is,
 285          * each refcnt also has a holdcnt.  There can be additional holds
 286          * (holdcnt) above and beyond the refcnt.  Finalization is handled in
 287          * two stages, one on refs 1->0, and the the second on hold 1->0.
 288          */
 289         KKASSERT(vm->vm_holdcnt == 0);
 290         KKASSERT(vm->vm_refcnt == VM_REF_DELETED);
 291         vmspace_initrefs(vm);
 292         vmspace_hold(vm);
 293         pmap_pinit(vmspace_pmap(vm));           /* (some fields reused) */
 294         vm->vm_map.pmap = vmspace_pmap(vm);     /* XXX */
 295         vm->vm_shm = NULL;
 296         vm->vm_flags = 0;
 297         cpu_vmspace_alloc(vm);
 298         vmspace_drop(vm);
 299
 300         return (vm);
 301 }
 302
 303 /*
 304  * NOTE: Can return 0 if the vmspace is exiting.
 305  */
 306 int
 307 vmspace_getrefs(struct vmspace *vm)
 308 {
 309         int32_t n;
 310
 311         n = vm->vm_refcnt;
 312         cpu_ccfence();
 313         if (n & VM_REF_DELETED)
 314                 n = -1;
 315         return n;
 316 }
 317
 318 void
 319 vmspace_hold(struct vmspace *vm)
 320 {
 321         atomic_add_int(&vm->vm_holdcnt, 1);
 322         lwkt_gettoken(&vm->vm_map.token);
 323 }
 324
 325 /*
 326  * Drop with final termination interlock.
 327  */
 328 void
 329 vmspace_drop(struct vmspace *vm)
 330 {
 331         lwkt_reltoken(&vm->vm_map.token);
 332         vmspace_drop_notoken(vm);
 333 }
 334
 335 static void
 336 vmspace_drop_notoken(struct vmspace *vm)
 337 {
 338         if (atomic_fetchadd_int(&vm->vm_holdcnt, -1) == 1) {
 339                 if (vm->vm_refcnt & VM_REF_DELETED)
 340                         vmspace_terminate(vm, 1);
 341         }
 342 }
 343
 344 /*
 345  * A vmspace object must not be in a terminated state to be able to obtain
 346  * additional refs on it.
 347  *
 348  * These are official references to the vmspace, the count is used to check
 349  * for vmspace sharing.  Foreign accessors should use 'hold' and not 'ref'.
 350  *
 351  * XXX we need to combine hold & ref together into one 64-bit field to allow
 352  * holds to prevent stage-1 termination.
 353  */
 354 void
 355 vmspace_ref(struct vmspace *vm)
 356 {
 357         uint32_t n;
 358
 359         atomic_add_int(&vm->vm_holdcnt, 1);
 360         n = atomic_fetchadd_int(&vm->vm_refcnt, 1);
 361         KKASSERT((n & VM_REF_DELETED) == 0);
 362 }
 363
 364 /*
 365  * Release a ref on the vmspace.  On the 1->0 transition we do stage-1
 366  * termination of the vmspace.  Then, on the final drop of the hold we
 367  * will do stage-2 final termination.
 368  */
 369 void
 370 vmspace_rel(struct vmspace *vm)
 371 {
 372         uint32_t n;
 373
 374         /*
 375          * Drop refs.  Each ref also has a hold which is also dropped.
 376          *
 377          * When refs hits 0 compete to get the VM_REF_DELETED flag (hold
 378          * prevent finalization) to start termination processing.
 379          * Finalization occurs when the last hold count drops to 0.
 380          */
 381         n = atomic_fetchadd_int(&vm->vm_refcnt, -1) - 1;
 382         while (n == 0) {
 383                 if (atomic_cmpset_int(&vm->vm_refcnt, 0, VM_REF_DELETED)) {
 384                         vmspace_terminate(vm, 0);
 385                         break;
 386                 }
 387                 n = vm->vm_refcnt;
 388                 cpu_ccfence();
 389         }
 390         vmspace_drop_notoken(vm);
 391 }
 392
 393 /*
 394  * This is called during exit indicating that the vmspace is no
 395  * longer in used by an exiting process, but the process has not yet
 396  * been reaped.
 397  *
 398  * We drop refs, allowing for stage-1 termination, but maintain a holdcnt
 399  * to prevent stage-2 until the process is reaped.  Note hte order of
 400  * operation, we must hold first.
 401  *
 402  * No requirements.
 403  */
 404 void
 405 vmspace_relexit(struct vmspace *vm)
 406 {
 407         atomic_add_int(&vm->vm_holdcnt, 1);
 408         vmspace_rel(vm);
 409 }
 410
 411 /*
 412  * Called during reap to disconnect the remainder of the vmspace from
 413  * the process.  On the hold drop the vmspace termination is finalized.
 414  *
 415  * No requirements.
 416  */
 417 void
 418 vmspace_exitfree(struct proc *p)
 419 {
 420         struct vmspace *vm;
 421
 422         vm = p->p_vmspace;
 423         p->p_vmspace = NULL;
 424         vmspace_drop_notoken(vm);
 425 }
 426
 427 /*
 428  * Called in two cases:
 429  *
 430  * (1) When the last refcnt is dropped and the vmspace becomes inactive,
 431  *     called with final == 0.  refcnt will be (u_int)-1 at this point,
 432  *     and holdcnt will still be non-zero.
 433  *
 434  * (2) When holdcnt becomes 0, called with final == 1.  There should no
 435  *     longer be anyone with access to the vmspace.
 436  *
 437  * VMSPACE_EXIT1 flags the primary deactivation
 438  * VMSPACE_EXIT2 flags the last reap
 439  */
 440 static void
 441 vmspace_terminate(struct vmspace *vm, int final)
 442 {
 443         int count;
 444
 445         lwkt_gettoken(&vm->vm_map.token);
 446         if (final == 0) {
 447                 KKASSERT((vm->vm_flags & VMSPACE_EXIT1) == 0);
 448                 vm->vm_flags |= VMSPACE_EXIT1;
 449
 450                 /*
 451                  * Get rid of most of the resources.  Leave the kernel pmap
 452                  * intact.
 453                  *
 454                  * If the pmap does not contain wired pages we can bulk-delete
 455                  * the pmap as a performance optimization before removing the
 456                  * related mappings.
 457                  *
 458                  * If the pmap contains wired pages we cannot do this
 459                  * pre-optimization because currently vm_fault_unwire()
 460                  * expects the pmap pages to exist and will not decrement
 461                  * p->wire_count if they do not.
 462                  */
 463                 shmexit(vm);
 464                 if (vmspace_pmap(vm)->pm_stats.wired_count) {
 465                         vm_map_remove(&vm->vm_map, VM_MIN_USER_ADDRESS,
 466                                       VM_MAX_USER_ADDRESS);
 467                         pmap_remove_pages(vmspace_pmap(vm), VM_MIN_USER_ADDRESS,
 468                                           VM_MAX_USER_ADDRESS);
 469                 } else {
 470                         pmap_remove_pages(vmspace_pmap(vm), VM_MIN_USER_ADDRESS,
 471                                           VM_MAX_USER_ADDRESS);
 472                         vm_map_remove(&vm->vm_map, VM_MIN_USER_ADDRESS,
 473                                       VM_MAX_USER_ADDRESS);
 474                 }
 475                 lwkt_reltoken(&vm->vm_map.token);
 476         } else {
 477                 KKASSERT((vm->vm_flags & VMSPACE_EXIT1) != 0);
 478                 KKASSERT((vm->vm_flags & VMSPACE_EXIT2) == 0);
 479
 480                 /*
 481                  * Get rid of remaining basic resources.
 482                  */
 483                 vm->vm_flags |= VMSPACE_EXIT2;
 484                 shmexit(vm);
 485
 486                 count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
 487                 vm_map_lock(&vm->vm_map);
 488                 cpu_vmspace_free(vm);
 489
 490                 /*
 491                  * Lock the map, to wait out all other references to it.
 492                  * Delete all of the mappings and pages they hold, then call
 493                  * the pmap module to reclaim anything left.
 494                  */
 495                 vm_map_delete(&vm->vm_map, vm->vm_map.min_offset,
 496                               vm->vm_map.max_offset, &count);
 497                 vm_map_unlock(&vm->vm_map);
 498                 vm_map_entry_release(count);
 499
 500                 pmap_release(vmspace_pmap(vm));
 501                 lwkt_reltoken(&vm->vm_map.token);
 502                 objcache_put(vmspace_cache, vm);
 503         }
 504 }
 505
 506 /*
 507  * Swap useage is determined by taking the proportional swap used by
 508  * VM objects backing the VM map.  To make up for fractional losses,
 509  * if the VM object has any swap use at all the associated map entries
 510  * count for at least 1 swap page.
 511  *
 512  * No requirements.
 513  */
 514 vm_offset_t
 515 vmspace_swap_count(struct vmspace *vm)
 516 {
 517         vm_map_t map = &vm->vm_map;
 518         vm_map_entry_t cur;
 519         vm_object_t object;
 520         vm_offset_t count = 0;
 521         vm_offset_t n;
 522
 523         vmspace_hold(vm);
 524         for (cur = map->header.next; cur != &map->header; cur = cur->next) {
 525                 switch(cur->maptype) {
 526                 case VM_MAPTYPE_NORMAL:
 527                 case VM_MAPTYPE_VPAGETABLE:
 528                         if ((object = cur->object.vm_object) == NULL)
 529                                 break;
 530                         if (object->swblock_count) {
 531                                 n = (cur->end - cur->start) / PAGE_SIZE;
 532                                 count += object->swblock_count *
 533                                     SWAP_META_PAGES * n / object->size + 1;
 534                         }
 535                         break;
 536                 default:
 537                         break;
 538                 }
 539         }
 540         vmspace_drop(vm);
 541
 542         return(count);
 543 }
 544
 545 /*
 546  * Calculate the approximate number of anonymous pages in use by
 547  * this vmspace.  To make up for fractional losses, we count each
 548  * VM object as having at least 1 anonymous page.
 549  *
 550  * No requirements.
 551  */
 552 vm_offset_t
 553 vmspace_anonymous_count(struct vmspace *vm)
 554 {
 555         vm_map_t map = &vm->vm_map;
 556         vm_map_entry_t cur;
 557         vm_object_t object;
 558         vm_offset_t count = 0;
 559
 560         vmspace_hold(vm);
 561         for (cur = map->header.next; cur != &map->header; cur = cur->next) {
 562                 switch(cur->maptype) {
 563                 case VM_MAPTYPE_NORMAL:
 564                 case VM_MAPTYPE_VPAGETABLE:
 565                         if ((object = cur->object.vm_object) == NULL)
 566                                 break;
 567                         if (object->type != OBJT_DEFAULT &&
 568                             object->type != OBJT_SWAP) {
 569                                 break;
 570                         }
 571                         count += object->resident_page_count;
 572                         break;
 573                 default:
 574                         break;
 575                 }
 576         }
 577         vmspace_drop(vm);
 578
 579         return(count);
 580 }
 581
 582 /*
 583  * Initialize an existing vm_map structure such as that in the vmspace
 584  * structure.  The pmap is initialized elsewhere.
 585  *
 586  * No requirements.
 587  */
 588 void
 589 vm_map_init(struct vm_map *map, vm_offset_t min, vm_offset_t max, pmap_t pmap)
 590 {
 591         map->header.next = map->header.prev = &map->header;
 592         RB_INIT(&map->rb_root);
 593         spin_init(&map->ilock_spin, "ilock");
 594         map->ilock_base = NULL;
 595         map->nentries = 0;
 596         map->size = 0;
 597         map->system_map = 0;
 598         map->min_offset = min;
 599         map->max_offset = max;
 600         map->pmap = pmap;
 601         map->first_free = &map->header;
 602         map->hint = &map->header;
 603         map->timestamp = 0;
 604         map->flags = 0;
 605         lwkt_token_init(&map->token, "vm_map");
 606         lockinit(&map->lock, "vm_maplk", (hz + 9) / 10, 0);
 607 }
 608
 609 /*
 610  * Shadow the vm_map_entry's object.  This typically needs to be done when
 611  * a write fault is taken on an entry which had previously been cloned by
 612  * fork().  The shared object (which might be NULL) must become private so
 613  * we add a shadow layer above it.
 614  *
 615  * Object allocation for anonymous mappings is defered as long as possible.
 616  * When creating a shadow, however, the underlying object must be instantiated
 617  * so it can be shared.
 618  *
 619  * If the map segment is governed by a virtual page table then it is
 620  * possible to address offsets beyond the mapped area.  Just allocate
 621  * a maximally sized object for this case.
 622  *
 623  * If addref is non-zero an additional reference is added to the returned
 624  * entry.  This mechanic exists because the additional reference might have
 625  * to be added atomically and not after return to prevent a premature
 626  * collapse.
 627  *
 628  * The vm_map must be exclusively locked.
 629  * No other requirements.
 630  */
 631 static
 632 void
 633 vm_map_entry_shadow(vm_map_entry_t entry, int addref)
 634 {
 635         if (entry->maptype == VM_MAPTYPE_VPAGETABLE) {
 636                 vm_object_shadow(&entry->object.vm_object, &entry->offset,
 637                                  0x7FFFFFFF, addref);   /* XXX */
 638         } else {
 639                 vm_object_shadow(&entry->object.vm_object, &entry->offset,
 640                                  atop(entry->end - entry->start), addref);
 641         }
 642         entry->eflags &= ~MAP_ENTRY_NEEDS_COPY;
 643 }
 644
 645 /*
 646  * Allocate an object for a vm_map_entry.
 647  *
 648  * Object allocation for anonymous mappings is defered as long as possible.
 649  * This function is called when we can defer no longer, generally when a map
 650  * entry might be split or forked or takes a page fault.
 651  *
 652  * If the map segment is governed by a virtual page table then it is
 653  * possible to address offsets beyond the mapped area.  Just allocate
 654  * a maximally sized object for this case.
 655  *
 656  * The vm_map must be exclusively locked.
 657  * No other requirements.
 658  */
 659 void
 660 vm_map_entry_allocate_object(vm_map_entry_t entry)
 661 {
 662         vm_object_t obj;
 663
 664         if (entry->maptype == VM_MAPTYPE_VPAGETABLE) {
 665                 obj = vm_object_allocate(OBJT_DEFAULT, 0x7FFFFFFF); /* XXX */
 666         } else {
 667                 obj = vm_object_allocate(OBJT_DEFAULT,
 668                                          atop(entry->end - entry->start));
 669         }
 670         entry->object.vm_object = obj;
 671         entry->offset = 0;
 672 }
 673
 674 /*
 675  * Set an initial negative count so the first attempt to reserve
 676  * space preloads a bunch of vm_map_entry's for this cpu.  Also
 677  * pre-allocate 2 vm_map_entries which will be needed by zalloc() to
 678  * map a new page for vm_map_entry structures.  SMP systems are
 679  * particularly sensitive.
 680  *
 681  * This routine is called in early boot so we cannot just call
 682  * vm_map_entry_reserve().
 683  *
 684  * Called from the low level boot code only (for each cpu)
 685  *
 686  * WARNING! Take care not to have too-big a static/BSS structure here
 687  *          as MAXCPU can be 256+, otherwise the loader's 64MB heap
 688  *          can get blown out by the kernel plus the initrd image.
 689  */
 690 void
 691 vm_map_entry_reserve_cpu_init(globaldata_t gd)
 692 {
 693         vm_map_entry_t entry;
 694         int count;
 695         int i;
 696
 697         gd->gd_vme_avail -= MAP_RESERVE_COUNT * 2;
 698         if (gd->gd_cpuid == 0) {
 699                 entry = &cpu_map_entry_init_bsp[0];
 700                 count = MAPENTRYBSP_CACHE;
 701         } else {
 702                 entry = &cpu_map_entry_init_ap[gd->gd_cpuid][0];
 703                 count = MAPENTRYAP_CACHE;
 704         }
 705         for (i = 0; i < count; ++i, ++entry) {
 706                 entry->next = gd->gd_vme_base;
 707                 gd->gd_vme_base = entry;
 708         }
 709 }
 710
 711 /*
 712  * Reserves vm_map_entry structures so code later on can manipulate
 713  * map_entry structures within a locked map without blocking trying
 714  * to allocate a new vm_map_entry.
 715  *
 716  * No requirements.
 717  */
 718 int
 719 vm_map_entry_reserve(int count)
 720 {
 721         struct globaldata *gd = mycpu;
 722         vm_map_entry_t entry;
 723
 724         /*
 725          * Make sure we have enough structures in gd_vme_base to handle
 726          * the reservation request.
 727          *
 728          * The critical section protects access to the per-cpu gd.
 729          */
 730         crit_enter();
 731         while (gd->gd_vme_avail < count) {
 732                 entry = zalloc(mapentzone);
 733                 entry->next = gd->gd_vme_base;
 734                 gd->gd_vme_base = entry;
 735                 ++gd->gd_vme_avail;
 736         }
 737         gd->gd_vme_avail -= count;
 738         crit_exit();
 739
 740         return(count);
 741 }
 742
 743 /*
 744  * Releases previously reserved vm_map_entry structures that were not
 745  * used.  If we have too much junk in our per-cpu cache clean some of
 746  * it out.
 747  *
 748  * No requirements.
 749  */
 750 void
 751 vm_map_entry_release(int count)
 752 {
 753         struct globaldata *gd = mycpu;
 754         vm_map_entry_t entry;
 755
 756         crit_enter();
 757         gd->gd_vme_avail += count;
 758         while (gd->gd_vme_avail > MAP_RESERVE_SLOP) {
 759                 entry = gd->gd_vme_base;
 760                 KKASSERT(entry != NULL);
 761                 gd->gd_vme_base = entry->next;
 762                 --gd->gd_vme_avail;
 763                 crit_exit();
 764                 zfree(mapentzone, entry);
 765                 crit_enter();
 766         }
 767         crit_exit();
 768 }
 769
 770 /*
 771  * Reserve map entry structures for use in kernel_map itself.  These
 772  * entries have *ALREADY* been reserved on a per-cpu basis when the map
 773  * was inited.  This function is used by zalloc() to avoid a recursion
 774  * when zalloc() itself needs to allocate additional kernel memory.
 775  *
 776  * This function works like the normal reserve but does not load the
 777  * vm_map_entry cache (because that would result in an infinite
 778  * recursion).  Note that gd_vme_avail may go negative.  This is expected.
 779  *
 780  * Any caller of this function must be sure to renormalize after
 781  * potentially eating entries to ensure that the reserve supply
 782  * remains intact.
 783  *
 784  * No requirements.
 785  */
 786 int
 787 vm_map_entry_kreserve(int count)
 788 {
 789         struct globaldata *gd = mycpu;
 790
 791         crit_enter();
 792         gd->gd_vme_avail -= count;
 793         crit_exit();
 794         KASSERT(gd->gd_vme_base != NULL,
 795                 ("no reserved entries left, gd_vme_avail = %d",
 796                 gd->gd_vme_avail));
 797         return(count);
 798 }
 799
 800 /*
 801  * Release previously reserved map entries for kernel_map.  We do not
 802  * attempt to clean up like the normal release function as this would
 803  * cause an unnecessary (but probably not fatal) deep procedure call.
 804  *
 805  * No requirements.
 806  */
 807 void
 808 vm_map_entry_krelease(int count)
 809 {
 810         struct globaldata *gd = mycpu;
 811
 812         crit_enter();
 813         gd->gd_vme_avail += count;
 814         crit_exit();
 815 }
 816
 817 /*
 818  * Allocates a VM map entry for insertion.  No entry fields are filled in.
 819  *
 820  * The entries should have previously been reserved.  The reservation count
 821  * is tracked in (*countp).
 822  *
 823  * No requirements.
 824  */
 825 static vm_map_entry_t
 826 vm_map_entry_create(vm_map_t map, int *countp)
 827 {
 828         struct globaldata *gd = mycpu;
 829         vm_map_entry_t entry;
 830
 831         KKASSERT(*countp > 0);
 832         --*countp;
 833         crit_enter();
 834         entry = gd->gd_vme_base;
 835         KASSERT(entry != NULL, ("gd_vme_base NULL! count %d", *countp));
 836         gd->gd_vme_base = entry->next;
 837         crit_exit();
 838
 839         return(entry);
 840 }
 841
 842 /*
 843  * Dispose of a vm_map_entry that is no longer being referenced.
 844  *
 845  * No requirements.
 846  */
 847 static void
 848 vm_map_entry_dispose(vm_map_t map, vm_map_entry_t entry, int *countp)
 849 {
 850         struct globaldata *gd = mycpu;
 851
 852         KKASSERT(map->hint != entry);
 853         KKASSERT(map->first_free != entry);
 854
 855         ++*countp;
 856         crit_enter();
 857         entry->next = gd->gd_vme_base;
 858         gd->gd_vme_base = entry;
 859         crit_exit();
 860 }
 861
 862
 863 /*
 864  * Insert/remove entries from maps.
 865  *
 866  * The related map must be exclusively locked.
 867  * The caller must hold map->token
 868  * No other requirements.
 869  */
 870 static __inline void
 871 vm_map_entry_link(vm_map_t map,
 872                   vm_map_entry_t after_where,
 873                   vm_map_entry_t entry)
 874 {
 875         ASSERT_VM_MAP_LOCKED(map);
 876
 877         map->nentries++;
 878         entry->prev = after_where;
 879         entry->next = after_where->next;
 880         entry->next->prev = entry;
 881         after_where->next = entry;
 882         if (vm_map_rb_tree_RB_INSERT(&map->rb_root, entry))
 883                 panic("vm_map_entry_link: dup addr map %p ent %p", map, entry);
 884 }
 885
 886 static __inline void
 887 vm_map_entry_unlink(vm_map_t map,
 888                     vm_map_entry_t entry)
 889 {
 890         vm_map_entry_t prev;
 891         vm_map_entry_t next;
 892
 893         ASSERT_VM_MAP_LOCKED(map);
 894
 895         if (entry->eflags & MAP_ENTRY_IN_TRANSITION) {
 896                 panic("vm_map_entry_unlink: attempt to mess with "
 897                       "locked entry! %p", entry);
 898         }
 899         prev = entry->prev;
 900         next = entry->next;
 901         next->prev = prev;
 902         prev->next = next;
 903         vm_map_rb_tree_RB_REMOVE(&map->rb_root, entry);
 904         map->nentries--;
 905 }
 906
 907 /*
 908  * Finds the map entry containing (or immediately preceding) the specified
 909  * address in the given map.  The entry is returned in (*entry).
 910  *
 911  * The boolean result indicates whether the address is actually contained
 912  * in the map.
 913  *
 914  * The related map must be locked.
 915  * No other requirements.
 916  */
 917 boolean_t
 918 vm_map_lookup_entry(vm_map_t map, vm_offset_t address, vm_map_entry_t *entry)
 919 {
 920         vm_map_entry_t tmp;
 921         vm_map_entry_t last;
 922
 923         ASSERT_VM_MAP_LOCKED(map);
 924 #if 0
 925         /*
 926          * XXX TEMPORARILY DISABLED.  For some reason our attempt to revive
 927          * the hint code with the red-black lookup meets with system crashes
 928          * and lockups.  We do not yet know why.
 929          *
 930          * It is possible that the problem is related to the setting
 931          * of the hint during map_entry deletion, in the code specified
 932          * at the GGG comment later on in this file.
 933          *
 934          * YYY More likely it's because this function can be called with
 935          * a shared lock on the map, resulting in map->hint updates possibly
 936          * racing.  Fixed now but untested.
 937          */
 938         /*
 939          * Quickly check the cached hint, there's a good chance of a match.
 940          */
 941         tmp = map->hint;
 942         cpu_ccfence();
 943         if (tmp != &map->header) {
 944                 if (address >= tmp->start && address < tmp->end) {
 945                         *entry = tmp;
 946                         return(TRUE);
 947                 }
 948         }
 949 #endif
 950
 951         /*
 952          * Locate the record from the top of the tree.  'last' tracks the
 953          * closest prior record and is returned if no match is found, which
 954          * in binary tree terms means tracking the most recent right-branch
 955          * taken.  If there is no prior record, &map->header is returned.
 956          */
 957         last = &map->header;
 958         tmp = RB_ROOT(&map->rb_root);
 959
 960         while (tmp) {
 961                 if (address >= tmp->start) {
 962                         if (address < tmp->end) {
 963                                 *entry = tmp;
 964                                 map->hint = tmp;
 965                                 return(TRUE);
 966                         }
 967                         last = tmp;
 968                         tmp = RB_RIGHT(tmp, rb_entry);
 969                 } else {
 970                         tmp = RB_LEFT(tmp, rb_entry);
 971                 }
 972         }
 973         *entry = last;
 974         return (FALSE);
 975 }
 976
/*
 * Inserts the given whole VM object into the target map at the specified
 * address range.  The object's size should match that of the address range.
 *
 * Arguments:
 *   map         target map
 *   countp      caller's count of reserved vm_map_entry structures; it is
 *               handed to vm_map_entry_create() and vm_map_simplify_entry()
 *   map_object  pointer stored in the new entry's object field.  It is only
 *               treated as a vm_object when maptype is not
 *               VM_MAPTYPE_UKSMAP.
 *   map_aux     stored in the new entry's aux.map_aux
 *   offset      offset stored in the new entry
 *   start, end  address range to insert; 'end' is exclusive
 *   maptype, id, prot, max
 *               copied into the new entry
 *   cow         MAP_* flags; translated into MAP_ENTRY_* eflags and also
 *               used to request page table pre-population
 *
 * Returns KERN_INVALID_ADDRESS if the range is empty, inverted or lies
 * outside [map->min_offset, map->max_offset], KERN_NO_SPACE if it collides
 * with an existing entry, and KERN_SUCCESS otherwise.
 *
 * The map must be exclusively locked.
 * The object must be held.
 * The caller must have reserved sufficient vm_map_entry structures.
 *
 * If object is non-NULL, ref count must be bumped by caller prior to
 * making call to account for the new entry.
 */
int
vm_map_insert(vm_map_t map, int *countp, void *map_object, void *map_aux,
              vm_ooffset_t offset, vm_offset_t start, vm_offset_t end,
              vm_maptype_t maptype, vm_subsys_t id,
              vm_prot_t prot, vm_prot_t max, int cow)
{
        vm_map_entry_t new_entry;
        vm_map_entry_t prev_entry;
        vm_map_entry_t temp_entry;
        vm_eflags_t protoeflags;
        int must_drop = 0;      /* set when this function took its own hold */
        vm_object_t object;

        /*
         * For UKSMAP mappings map_object is not used as a vm_object here,
         * so all vm_object handling below is skipped for them.
         */
        if (maptype == VM_MAPTYPE_UKSMAP)
                object = NULL;
        else
                object = map_object;

        ASSERT_VM_MAP_LOCKED(map);
        if (object)
                ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));

        /*
         * Check that the start and end points are not bogus.
         */
        if ((start < map->min_offset) || (end > map->max_offset) ||
            (start >= end))
                return (KERN_INVALID_ADDRESS);

        /*
         * Find the entry prior to the proposed starting address; if it's part
         * of an existing entry, this range is bogus.
         */
        if (vm_map_lookup_entry(map, start, &temp_entry))
                return (KERN_NO_SPACE);

        prev_entry = temp_entry;

        /*
         * Assert that the next entry doesn't overlap the end point.
         */

        if ((prev_entry->next != &map->header) &&
            (prev_entry->next->start < end))
                return (KERN_NO_SPACE);

        /*
         * Translate the caller's MAP_* request flags into the MAP_ENTRY_*
         * flags the new entry will carry.
         */
        protoeflags = 0;

        if (cow & MAP_COPY_ON_WRITE)
                protoeflags |= MAP_ENTRY_COW|MAP_ENTRY_NEEDS_COPY;

        if (cow & MAP_NOFAULT) {
                protoeflags |= MAP_ENTRY_NOFAULT;

                KASSERT(object == NULL,
                        ("vm_map_insert: paradoxical MAP_NOFAULT request"));
        }
        if (cow & MAP_DISABLE_SYNCER)
                protoeflags |= MAP_ENTRY_NOSYNC;
        if (cow & MAP_DISABLE_COREDUMP)
                protoeflags |= MAP_ENTRY_NOCOREDUMP;
        if (cow & MAP_IS_STACK)
                protoeflags |= MAP_ENTRY_STACK;
        if (cow & MAP_IS_KSTACK)
                protoeflags |= MAP_ENTRY_KSTACK;

        /*
         * map->token is held from here on and released before each of the
         * two remaining returns.
         */
        lwkt_gettoken(&map->token);

        if (object) {
                /*
                 * When object is non-NULL, it could be shared with another
                 * process.  We have to set or clear OBJ_ONEMAPPING
                 * appropriately.
                 *
                 * NOTE: This flag is only applicable to DEFAULT and SWAP
                 *       objects and will already be clear in other types
                 *       of objects, so a shared object lock is ok for
                 *       VNODE objects.
                 */
                if ((object->ref_count > 1) || (object->shadow_count != 0)) {
                        vm_object_clear_flag(object, OBJ_ONEMAPPING);
                }
        }
        else if ((prev_entry != &map->header) &&
                 (prev_entry->eflags == protoeflags) &&
                 (prev_entry->end == start) &&
                 (prev_entry->wired_count == 0) &&
                 (prev_entry->id == id) &&
                 prev_entry->maptype == maptype &&
                 maptype == VM_MAPTYPE_NORMAL &&
                 ((prev_entry->object.vm_object == NULL) ||
                  vm_object_coalesce(prev_entry->object.vm_object,
                                     OFF_TO_IDX(prev_entry->offset),
                                     (vm_size_t)(prev_entry->end - prev_entry->start),
                                     (vm_size_t)(end - prev_entry->end)))) {
                /*
                 * We were able to extend the object.  Determine if we
                 * can extend the previous map entry to include the
                 * new range as well.
                 */
                if ((prev_entry->inheritance == VM_INHERIT_DEFAULT) &&
                    (prev_entry->protection == prot) &&
                    (prev_entry->max_protection == max)) {
                        /*
                         * No new entry is needed, just grow the previous
                         * one in place and account for the added size.
                         */
                        map->size += (end - prev_entry->end);
                        prev_entry->end = end;
                        vm_map_simplify_entry(map, prev_entry, countp);
                        lwkt_reltoken(&map->token);
                        return (KERN_SUCCESS);
                }

                /*
                 * If we can extend the object but cannot extend the
                 * map entry, we have to create a new map entry.  We
                 * must bump the ref count on the extended object to
                 * account for it.  object may be NULL.
                 *
                 * XXX if object is NULL should we set offset to 0 here ?
                 */
                object = prev_entry->object.vm_object;
                offset = prev_entry->offset +
                        (prev_entry->end - prev_entry->start);
                if (object) {
                        /*
                         * This hold was taken by us (not the caller), so
                         * remember to drop it on the way out.
                         */
                        vm_object_hold(object);
                        vm_object_chain_wait(object, 0);
                        vm_object_reference_locked(object);
                        must_drop = 1;
                        map_object = object;
                }
        }

        /*
         * NOTE: if conditionals fail, object can be NULL here.  This occurs
         * in things like the buffer map where we manage kva but do not manage
         * backing objects.
         */

        /*
         * Create a new entry
         */

        new_entry = vm_map_entry_create(map, countp);
        new_entry->start = start;
        new_entry->end = end;
        new_entry->id = id;

        new_entry->maptype = maptype;
        new_entry->eflags = protoeflags;
        new_entry->object.map_object = map_object;
        new_entry->aux.master_pde = 0;          /* in case size is different */
        new_entry->aux.map_aux = map_aux;
        new_entry->offset = offset;

        new_entry->inheritance = VM_INHERIT_DEFAULT;
        new_entry->protection = prot;
        new_entry->max_protection = max;
        new_entry->wired_count = 0;

        /*
         * Insert the new entry into the list
         */

        vm_map_entry_link(map, prev_entry, new_entry);
        map->size += new_entry->end - new_entry->start;

        /*
         * Update the free space hint.  Entries cannot overlap.
         * An exact comparison is needed to avoid matching
         * against the map->header.
         */
        if ((map->first_free == prev_entry) &&
            (prev_entry->end == new_entry->start)) {
                map->first_free = new_entry;
        }

#if 0
        /*
         * Temporarily removed to avoid MAP_STACK panic, due to
         * MAP_STACK being a huge hack.  Will be added back in
         * when MAP_STACK (and the user stack mapping) is fixed.
         */
        /*
         * It may be possible to simplify the entry
         */
        vm_map_simplify_entry(map, new_entry, countp);
#endif

        /*
         * Try to pre-populate the page table.  Mappings governed by virtual
         * page tables cannot be prepopulated without a lot of work, so
         * don't try.
         */
        if ((cow & (MAP_PREFAULT|MAP_PREFAULT_PARTIAL)) &&
            maptype != VM_MAPTYPE_VPAGETABLE &&
            maptype != VM_MAPTYPE_UKSMAP) {
                int dorelock = 0;
                /*
                 * When relocking is enabled and requested the object hold
                 * is dropped around pmap_object_init_pt() and re-acquired
                 * afterwards.
                 *
                 * NOTE(review): 'object' can be NULL on this path (see the
                 * note above the entry creation).  Confirm that
                 * vm_object_lock_swap(), vm_object_drop() and
                 * vm_object_hold() are safe for a NULL object when
                 * MAP_PREFAULT_RELOCK is passed.
                 */
                if (vm_map_relock_enable && (cow & MAP_PREFAULT_RELOCK)) {
                        dorelock = 1;
                        vm_object_lock_swap();
                        vm_object_drop(object);
                }
                pmap_object_init_pt(map->pmap, start, prot,
                                    object, OFF_TO_IDX(offset), end - start,
                                    cow & MAP_PREFAULT_PARTIAL);
                if (dorelock) {
                        vm_object_hold(object);
                        vm_object_lock_swap();
                }
        }
        /* Release the hold taken in the coalesce path above, if any. */
        if (must_drop)
                vm_object_drop(object);

        lwkt_reltoken(&map->token);
        return (KERN_SUCCESS);
}
1202
1203 /*
1204  * Find sufficient space for `length' bytes in the given map, starting at
1205  * `start'.  Returns 0 on success, 1 on no space.
1206  *
1207  * This function will returned an arbitrarily aligned pointer.  If no
1208  * particular alignment is required you should pass align as 1.  Note that
1209  * the map may return PAGE_SIZE aligned pointers if all the lengths used in
1210  * the map are a multiple of PAGE_SIZE, even if you pass a smaller align
1211  * argument.
1212  *
1213  * 'align' should be a power of 2 but is not required to be.
1214  *
1215  * The map must be exclusively locked.
1216  * No other requirements.
1217  */
1218 int
1219 vm_map_findspace(vm_map_t map, vm_offset_t start, vm_size_t length,
1220                  vm_size_t align, int flags, vm_offset_t *addr)
1221 {
1222         vm_map_entry_t entry, next;
1223         vm_offset_t end;
1224         vm_offset_t align_mask;
1225
1226         if (start < map->min_offset)
1227                 start = map->min_offset;
1228         if (start > map->max_offset)
1229                 return (1);
1230
1231         /*
1232          * If the alignment is not a power of 2 we will have to use
1233          * a mod/division, set align_mask to a special value.
1234          */
1235         if ((align | (align - 1)) + 1 != (align << 1))
1236                 align_mask = (vm_offset_t)-1;
1237         else
1238                 align_mask = align - 1;
1239
1240         /*
1241          * Look for the first possible address; if there's already something
1242          * at this address, we have to start after it.
1243          */
1244         if (start == map->min_offset) {
1245                 if ((entry = map->first_free) != &map->header)
1246                         start = entry->end;
1247         } else {
1248                 vm_map_entry_t tmp;
1249
1250                 if (vm_map_lookup_entry(map, start, &tmp))
1251                         start = tmp->end;
1252                 entry = tmp;
1253         }
1254
1255         /*
1256          * Look through the rest of the map, trying to fit a new region in the
1257          * gap between existing regions, or after the very last region.
1258          */
1259         for (;; start = (entry = next)->end) {
1260                 /*
1261                  * Adjust the proposed start by the requested alignment,
1262                  * be sure that we didn't wrap the address.
1263                  */
1264                 if (align_mask == (vm_offset_t)-1)
1265                         end = roundup(start, align);
1266                 else
1267                         end = (start + align_mask) & ~align_mask;
1268                 if (end < start)
1269                         return (1);
1270                 start = end;
1271                 /*
1272                  * Find the end of the proposed new region.  Be sure we didn't
1273                  * go beyond the end of the map, or wrap around the address.
1274                  * Then check to see if this is the last entry or if the
1275                  * proposed end fits in the gap between this and the next
1276                  * entry.
1277                  */
1278                 end = start + length;
1279                 if (end > map->max_offset || end < start)
1280                         return (1);
1281                 next = entry->next;
1282
1283                 /*
1284                  * If the next entry's start address is beyond the desired
1285                  * end address we may have found a good entry.
1286                  *
1287                  * If the next entry is a stack mapping we do not map into
1288                  * the stack's reserved space.
1289                  *
1290                  * XXX continue to allow mapping into the stack's reserved
1291                  * space if doing a MAP_STACK mapping inside a MAP_STACK
1292                  * mapping, for backwards compatibility.  But the caller
1293                  * really should use MAP_STACK | MAP_TRYFIXED if they
1294                  * want to do that.
1295                  */
1296                 if (next == &map->header)
1297                         break;
1298                 if (next->start >= end) {
1299                         if ((next->eflags & MAP_ENTRY_STACK) == 0)
1300                                 break;
1301                         if (flags & MAP_STACK)
1302                                 break;
1303                         if (next->start - next->aux.avail_ssize >= end)
1304                                 break;
1305                 }
1306         }
1307         map->hint = entry;
1308
1309         /*
1310          * Grow the kernel_map if necessary.  pmap_growkernel() will panic
1311          * if it fails.  The kernel_map is locked and nothing can steal
1312          * our address space if pmap_growkernel() blocks.
1313          *
1314          * NOTE: This may be unconditionally called for kldload areas on
1315          *       x86_64 because these do not bump kernel_vm_end (which would
1316          *       fill 128G worth of page tables!).  Therefore we must not
1317          *       retry.
1318          */
1319         if (map == &kernel_map) {
1320                 vm_offset_t kstop;
1321
1322                 kstop = round_page(start + length);
1323                 if (kstop > kernel_vm_end)
1324                         pmap_growkernel(start, kstop);
1325         }
1326         *addr = start;
1327         return (0);
1328 }
1329
1330 /*
1331  * vm_map_find finds an unallocated region in the target address map with
1332  * the given length and allocates it.  The search is defined to be first-fit
1333  * from the specified address; the region found is returned in the same
1334  * parameter.
1335  *
1336  * If object is non-NULL, ref count must be bumped by caller
1337  * prior to making call to account for the new entry.
1338  *
1339  * No requirements.  This function will lock the map temporarily.
1340  */
1341 int
1342 vm_map_find(vm_map_t map, void *map_object, void *map_aux,
1343             vm_ooffset_t offset, vm_offset_t *addr,
1344             vm_size_t length, vm_size_t align, boolean_t fitit,
1345             vm_maptype_t maptype, vm_subsys_t id,
1346             vm_prot_t prot, vm_prot_t max, int cow)
1347 {
1348         vm_offset_t start;
1349         vm_object_t object;
1350         int result;
1351         int count;
1352
1353         if (maptype == VM_MAPTYPE_UKSMAP)
1354                 object = NULL;
1355         else
1356                 object = map_object;
1357
1358         start = *addr;
1359
1360         count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
1361         vm_map_lock(map);
1362         if (object)
1363                 vm_object_hold_shared(object);
1364         if (fitit) {
1365                 if (vm_map_findspace(map, start, length, align, 0, addr)) {
1366                         if (object)
1367                                 vm_object_drop(object);
1368                         vm_map_unlock(map);
1369                         vm_map_entry_release(count);
1370                         return (KERN_NO_SPACE);
1371                 }
1372                 start = *addr;
1373         }
1374         result = vm_map_insert(map, &count, map_object, map_aux,
1375                                offset, start, start + length,
1376                                maptype, id, prot, max, cow);
1377         if (object)
1378                 vm_object_drop(object);
1379         vm_map_unlock(map);
1380         vm_map_entry_release(count);
1381
1382         return (result);
1383 }
1384
1385 /*
1386  * Simplify the given map entry by merging with either neighbor.  This
1387  * routine also has the ability to merge with both neighbors.
1388  *
1389  * This routine guarentees that the passed entry remains valid (though
1390  * possibly extended).  When merging, this routine may delete one or
1391  * both neighbors.  No action is taken on entries which have their
1392  * in-transition flag set.
1393  *
1394  * The map must be exclusively locked.
1395  */
1396 void
1397 vm_map_simplify_entry(vm_map_t map, vm_map_entry_t entry, int *countp)
1398 {
1399         vm_map_entry_t next, prev;
1400         vm_size_t prevsize, esize;
1401
1402         if (entry->eflags & MAP_ENTRY_IN_TRANSITION) {
1403                 ++mycpu->gd_cnt.v_intrans_coll;
1404                 return;
1405         }
1406
1407         if (entry->maptype == VM_MAPTYPE_SUBMAP)
1408                 return;
1409         if (entry->maptype == VM_MAPTYPE_UKSMAP)
1410                 return;
1411
1412         prev = entry->prev;
1413         if (prev != &map->header) {
1414                 prevsize = prev->end - prev->start;
1415                 if ( (prev->end == entry->start) &&
1416                      (prev->maptype == entry->maptype) &&
1417                      (prev->object.vm_object == entry->object.vm_object) &&
1418                      (!prev->object.vm_object ||
1419                         (prev->offset + prevsize == entry->offset)) &&
1420                      (prev->eflags == entry->eflags) &&
1421                      (prev->protection == entry->protection) &&
1422                      (prev->max_protection == entry->max_protection) &&
1423                      (prev->inheritance == entry->inheritance) &&
1424                      (prev->id == entry->id) &&
1425                      (prev->wired_count == entry->wired_count)) {
1426                         if (map->first_free == prev)
1427                                 map->first_free = entry;
1428                         if (map->hint == prev)
1429                                 map->hint = entry;
1430                         vm_map_entry_unlink(map, prev);
1431                         entry->start = prev->start;
1432                         entry->offset = prev->offset;
1433                         if (prev->object.vm_object)
1434                                 vm_object_deallocate(prev->object.vm_object);
1435                         vm_map_entry_dispose(map, prev, countp);
1436                 }
1437         }
1438
1439         next = entry->next;
1440         if (next != &map->header) {
1441                 esize = entry->end - entry->start;
1442                 if ((entry->end == next->start) &&
1443                     (next->maptype == entry->maptype) &&
1444                     (next->object.vm_object == entry->object.vm_object) &&
1445                      (!entry->object.vm_object ||
1446                         (entry->offset + esize == next->offset)) &&
1447                     (next->eflags == entry->eflags) &&
1448                     (next->protection == entry->protection) &&
1449                     (next->max_protection == entry->max_protection) &&
1450                     (next->inheritance == entry->inheritance) &&
1451                     (next->id == entry->id) &&
1452                     (next->wired_count == entry->wired_count)) {
1453                         if (map->first_free == next)
1454                                 map->first_free = entry;
1455                         if (map->hint == next)
1456                                 map->hint = entry;
1457                         vm_map_entry_unlink(map, next);
1458                         entry->end = next->end;
1459                         if (next->object.vm_object)
1460                                 vm_object_deallocate(next->object.vm_object);
1461                         vm_map_entry_dispose(map, next, countp);
1462                 }
1463         }
1464 }
1465
/*
 * Asserts that the given entry begins at or after the specified address.
 * If necessary, it splits the entry into two.
 *
 * The macro arguments are parenthesized so that expression arguments
 * (e.g. a conditional expression for startaddr) expand with the intended
 * precedence.  'entry' and 'startaddr' are evaluated more than once, so
 * callers must not pass expressions with side effects.
 */
#define vm_map_clip_start(map, entry, startaddr, countp)                \
{                                                                       \
        if ((startaddr) > (entry)->start)                               \
                _vm_map_clip_start((map), (entry), (startaddr), (countp)); \
}
1475
1476 /*
1477  * This routine is called only when it is known that the entry must be split.
1478  *
1479  * The map must be exclusively locked.
1480  */
1481 static void
1482 _vm_map_clip_start(vm_map_t map, vm_map_entry_t entry, vm_offset_t start,
1483                    int *countp)
1484 {
1485         vm_map_entry_t new_entry;
1486
1487         /*
1488          * Split off the front portion -- note that we must insert the new
1489          * entry BEFORE this one, so that this entry has the specified
1490          * starting address.
1491          */
1492
1493         vm_map_simplify_entry(map, entry, countp);
1494
1495         /*
1496          * If there is no object backing this entry, we might as well create
1497          * one now.  If we defer it, an object can get created after the map
1498          * is clipped, and individual objects will be created for the split-up
1499          * map.  This is a bit of a hack, but is also about the best place to
1500          * put this improvement.
1501          */
1502         if (entry->object.vm_object == NULL && !map->system_map) {
1503                 vm_map_entry_allocate_object(entry);
1504         }
1505
1506         new_entry = vm_map_entry_create(map, countp);
1507         *new_entry = *entry;
1508
1509         new_entry->end = start;
1510         entry->offset += (start - entry->start);
1511         entry->start = start;
1512
1513         vm_map_entry_link(map, entry->prev, new_entry);
1514
1515         switch(entry->maptype) {
1516         case VM_MAPTYPE_NORMAL:
1517         case VM_MAPTYPE_VPAGETABLE:
1518                 if (new_entry->object.vm_object) {
1519                         vm_object_hold(new_entry->object.vm_object);
1520                         vm_object_chain_wait(new_entry->object.vm_object, 0);
1521                         vm_object_reference_locked(new_entry->object.vm_object);
1522                         vm_object_drop(new_entry->object.vm_object);
1523                 }
1524                 break;
1525         default:
1526                 break;
1527         }
1528 }
1529
/*
 * Asserts that the given entry ends at or before the specified address.
 * If necessary, it splits the entry into two.
 *
 * The macro arguments are parenthesized so that expression arguments
 * (e.g. a conditional expression for endaddr) expand with the intended
 * precedence.  'entry' and 'endaddr' are evaluated more than once, so
 * callers must not pass expressions with side effects.
 *
 * The map must be exclusively locked.
 */
#define vm_map_clip_end(map, entry, endaddr, countp)            \
{                                                               \
        if ((endaddr) < (entry)->end)                           \
                _vm_map_clip_end((map), (entry), (endaddr), (countp)); \
}
1541
1542 /*
1543  * This routine is called only when it is known that the entry must be split.
1544  *
1545  * The map must be exclusively locked.
1546  */
1547 static void
1548 _vm_map_clip_end(vm_map_t map, vm_map_entry_t entry, vm_offset_t end,
1549                  int *countp)
1550 {
1551         vm_map_entry_t new_entry;
1552
1553         /*
1554          * If there is no object backing this entry, we might as well create
1555          * one now.  If we defer it, an object can get created after the map
1556          * is clipped, and individual objects will be created for the split-up
1557          * map.  This is a bit of a hack, but is also about the best place to
1558          * put this improvement.
1559          */
1560
1561         if (entry->object.vm_object == NULL && !map->system_map) {
1562                 vm_map_entry_allocate_object(entry);
1563         }
1564
1565         /*
1566          * Create a new entry and insert it AFTER the specified entry
1567          */
1568
1569         new_entry = vm_map_entry_create(map, countp);
1570         *new_entry = *entry;
1571
1572         new_entry->start = entry->end = end;
1573         new_entry->offset += (end - entry->start);
1574
1575         vm_map_entry_link(map, entry, new_entry);
1576
1577         switch(entry->maptype) {
1578         case VM_MAPTYPE_NORMAL:
1579         case VM_MAPTYPE_VPAGETABLE:
1580                 if (new_entry->object.vm_object) {
1581                         vm_object_hold(new_entry->object.vm_object);
1582                         vm_object_chain_wait(new_entry->object.vm_object, 0);
1583                         vm_object_reference_locked(new_entry->object.vm_object);
1584                         vm_object_drop(new_entry->object.vm_object);
1585                 }
1586                 break;
1587         default:
1588                 break;
1589         }
1590 }
1591
/*
 * Clamps the starting and ending region addresses so that they fall
 * within the valid range for the map: 'start' is raised to vm_map_min(),
 * 'end' is lowered to vm_map_max(), and a 'start' that still lies beyond
 * 'end' is pulled back to 'end'.
 *
 * 'start' and 'end' are modified in place and must be lvalues.
 */
#define VM_MAP_RANGE_CHECK(map, start, end)     \
{                                               \
        if ((start) < vm_map_min(map))          \
                (start) = vm_map_min(map);      \
        if ((end) > vm_map_max(map))            \
                (end) = vm_map_max(map);        \
        if ((start) > (end))                    \
                (start) = (end);                \
}
1605
/*
 * Used to block when an in-transition collision occurs.  The map
 * is unlocked for the sleep and relocked before the return.
 *
 * Because the map is unlocked while sleeping, entries the caller was
 * looking at may have been clipped or removed by the time this returns;
 * callers have to redo their lookup afterwards.
 */
void
vm_map_transition_wait(vm_map_t map)
{
        /*
         * The interlock is set up on 'map' BEFORE the map lock is dropped
         * and the sleep is then issued with PINTERLOCKED.
         * NOTE(review): presumably this ordering is what prevents a wakeup
         * issued between the unlock and the tsleep() from being lost;
         * confirm against the tsleep_interlock() documentation.
         */
        tsleep_interlock(map, 0);
        vm_map_unlock(map);
        tsleep(map, PINTERLOCKED, "vment", 0);
        vm_map_lock(map);
}
1618
/*
 * When we do blocking operations with the map lock held it is
 * possible that a clip might have occurred on our in-transit entry,
 * requiring an adjustment to the entry in our loop.  These macros
 * help the pageable and clip_range code deal with the case.  The
 * conditional costs virtually nothing if no clipping has occurred.
 *
 * CLIP_CHECK_BACK walks 'entry' backwards (via ->prev) until its start
 * matches save_start; CLIP_CHECK_FWD walks it forwards (via ->next) until
 * its end matches save_end.  Reaching the map header is asserted against.
 *
 * NOTE: Both macros reassign 'entry', which must be an lvalue, and both
 *       reference a variable named 'map' from the invoking scope for the
 *       &map->header check.  The save_* arguments are parenthesized so
 *       that expression arguments compare with the intended precedence.
 */

#define CLIP_CHECK_BACK(entry, save_start)              \
    do {                                                \
            while ((entry)->start != (save_start)) {    \
                    (entry) = (entry)->prev;            \
                    KASSERT((entry) != &map->header, ("bad entry clip")); \
            }                                           \
    } while(0)

#define CLIP_CHECK_FWD(entry, save_end)                 \
    do {                                                \
            while ((entry)->end != (save_end)) {        \
                    (entry) = (entry)->next;            \
                    KASSERT((entry) != &map->header, ("bad entry clip")); \
            }                                           \
    } while(0)
1642
1643
1644 /*
1645  * Clip the specified range and return the base entry.  The
1646  * range may cover several entries starting at the returned base
1647  * and the first and last entry in the covering sequence will be
1648  * properly clipped to the requested start and end address.
1649  *
1650  * If no holes are allowed you should pass the MAP_CLIP_NO_HOLES
1651  * flag.
1652  *
1653  * The MAP_ENTRY_IN_TRANSITION flag will be set for the entries
1654  * covered by the requested range.
1655  *
1656  * The map must be exclusively locked on entry and will remain locked
1657  * on return. If no range exists or the range contains holes and you
1658  * specified that no holes were allowed, NULL will be returned.  This
1659  * routine may temporarily unlock the map in order avoid a deadlock when
1660  * sleeping.
1661  */
1662 static
1663 vm_map_entry_t
1664 vm_map_clip_range(vm_map_t map, vm_offset_t start, vm_offset_t end,
1665                   int *countp, int flags)
1666 {
1667         vm_map_entry_t start_entry;
1668         vm_map_entry_t entry;
1669
1670         /*
1671          * Locate the entry and effect initial clipping.  The in-transition
1672          * case does not occur very often so do not try to optimize it.
1673          */
1674 again:
1675         if (vm_map_lookup_entry(map, start, &start_entry) == FALSE)
1676                 return (NULL);
1677         entry = start_entry;
1678         if (entry->eflags & MAP_ENTRY_IN_TRANSITION) {
1679                 entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
1680                 ++mycpu->gd_cnt.v_intrans_coll;
1681                 ++mycpu->gd_cnt.v_intrans_wait;
1682                 vm_map_transition_wait(map);
1683                 /*
1684                  * entry and/or start_entry may have been clipped while
1685                  * we slept, or may have gone away entirely.  We have
1686                  * to restart from the lookup.
1687                  */
1688                 goto again;
1689         }
1690
1691         /*
1692          * Since we hold an exclusive map lock we do not have to restart
1693          * after clipping, even though clipping may block in zalloc.
1694          */
1695         vm_map_clip_start(map, entry, start, countp);
1696         vm_map_clip_end(map, entry, end, countp);
1697         entry->eflags |= MAP_ENTRY_IN_TRANSITION;
1698
1699         /*
1700          * Scan entries covered by the range.  When working on the next
1701          * entry a restart need only re-loop on the current entry which
1702          * we have already locked, since 'next' may have changed.  Also,
1703          * even though entry is safe, it may have been clipped so we
1704          * have to iterate forwards through the clip after sleeping.
1705          */
1706         while (entry->next != &map->header && entry->next->start < end) {
1707                 vm_map_entry_t next = entry->next;
1708
1709                 if (flags & MAP_CLIP_NO_HOLES) {
1710                         if (next->start > entry->end) {
1711                                 vm_map_unclip_range(map, start_entry,
1712                                         start, entry->end, countp, flags);
1713                                 return(NULL);
1714                         }
1715                 }
1716
1717                 if (next->eflags & MAP_ENTRY_IN_TRANSITION) {
1718                         vm_offset_t save_end = entry->end;
1719                         next->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
1720                         ++mycpu->gd_cnt.v_intrans_coll;
1721                         ++mycpu->gd_cnt.v_intrans_wait;
1722                         vm_map_transition_wait(map);
1723
1724                         /*
1725                          * clips might have occured while we blocked.
1726                          */
1727                         CLIP_CHECK_FWD(entry, save_end);
1728                         CLIP_CHECK_BACK(start_entry, start);
1729                         continue;
1730                 }
1731                 /*
1732                  * No restart necessary even though clip_end may block, we
1733                  * are holding the map lock.
1734                  */
1735                 vm_map_clip_end(map, next, end, countp);
1736                 next->eflags |= MAP_ENTRY_IN_TRANSITION;
1737                 entry = next;
1738         }
1739         if (flags & MAP_CLIP_NO_HOLES) {
1740                 if (entry->end != end) {
1741                         vm_map_unclip_range(map, start_entry,
1742                                 start, entry->end, countp, flags);
1743                         return(NULL);
1744                 }
1745         }
1746         return(start_entry);
1747 }
1748
1749 /*
1750  * Undo the effect of vm_map_clip_range().  You should pass the same
1751  * flags and the same range that you passed to vm_map_clip_range().
1752  * This code will clear the in-transition flag on the entries and
1753  * wake up anyone waiting.  This code will also simplify the sequence
1754  * and attempt to merge it with entries before and after the sequence.
1755  *
1756  * The map must be locked on entry and will remain locked on return.
1757  *
1758  * Note that you should also pass the start_entry returned by
1759  * vm_map_clip_range().  However, if you block between the two calls
1760  * with the map unlocked please be aware that the start_entry may
1761  * have been clipped and you may need to scan it backwards to find
1762  * the entry corresponding with the original start address.  You are
1763  * responsible for this, vm_map_unclip_range() expects the correct
1764  * start_entry to be passed to it and will KASSERT otherwise.
1765  */
1766 static
1767 void
1768 vm_map_unclip_range(vm_map_t map, vm_map_entry_t start_entry,
1769                     vm_offset_t start, vm_offset_t end,
1770                     int *countp, int flags)
1771 {
1772         vm_map_entry_t entry;
1773
1774         entry = start_entry;
1775
1776         KASSERT(entry->start == start, ("unclip_range: illegal base entry"));
1777         while (entry != &map->header && entry->start < end) {
1778                 KASSERT(entry->eflags & MAP_ENTRY_IN_TRANSITION,
1779                         ("in-transition flag not set during unclip on: %p",
1780                         entry));
1781                 KASSERT(entry->end <= end,
1782                         ("unclip_range: tail wasn't clipped"));
1783                 entry->eflags &= ~MAP_ENTRY_IN_TRANSITION;
1784                 if (entry->eflags & MAP_ENTRY_NEEDS_WAKEUP) {
1785                         entry->eflags &= ~MAP_ENTRY_NEEDS_WAKEUP;
1786                         wakeup(map);
1787                 }
1788                 entry = entry->next;
1789         }
1790
1791         /*
1792          * Simplification does not block so there is no restart case.
1793          */
1794         entry = start_entry;
1795         while (entry != &map->header && entry->start < end) {
1796                 vm_map_simplify_entry(map, entry, countp);
1797                 entry = entry->next;
1798         }
1799 }
1800
1801 /*
1802  * Mark the given range as handled by a subordinate map.
1803  *
1804  * This range must have been created with vm_map_find(), and no other
1805  * operations may have been performed on this range prior to calling
1806  * vm_map_submap().
1807  *
1808  * Submappings cannot be removed.
1809  *
1810  * No requirements.
1811  */
1812 int
1813 vm_map_submap(vm_map_t map, vm_offset_t start, vm_offset_t end, vm_map_t submap)
1814 {
1815         vm_map_entry_t entry;
1816         int result = KERN_INVALID_ARGUMENT;
1817         int count;
1818
1819         count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
1820         vm_map_lock(map);
1821
1822         VM_MAP_RANGE_CHECK(map, start, end);
1823
1824         if (vm_map_lookup_entry(map, start, &entry)) {
1825                 vm_map_clip_start(map, entry, start, &count);
1826         } else {
1827                 entry = entry->next;
1828         }
1829
1830         vm_map_clip_end(map, entry, end, &count);
1831
1832         if ((entry->start == start) && (entry->end == end) &&
1833             ((entry->eflags & MAP_ENTRY_COW) == 0) &&
1834             (entry->object.vm_object == NULL)) {
1835                 entry->object.sub_map = submap;
1836                 entry->maptype = VM_MAPTYPE_SUBMAP;
1837                 result = KERN_SUCCESS;
1838         }
1839         vm_map_unlock(map);
1840         vm_map_entry_release(count);
1841
1842         return (result);
1843 }
1844
1845 /*
1846  * Sets the protection of the specified address region in the target map.
1847  * If "set_max" is specified, the maximum protection is to be set;
1848  * otherwise, only the current protection is affected.
1849  *
1850  * The protection is not applicable to submaps, but is applicable to normal
1851  * maps and maps governed by virtual page tables.  For example, when operating
1852  * on a virtual page table our protection basically controls how COW occurs
1853  * on the backing object, whereas the virtual page table abstraction itself
1854  * is an abstraction for userland.
1855  *
1856  * No requirements.
1857  */
1858 int
1859 vm_map_protect(vm_map_t map, vm_offset_t start, vm_offset_t end,
1860                vm_prot_t new_prot, boolean_t set_max)
1861 {
1862         vm_map_entry_t current;
1863         vm_map_entry_t entry;
1864         int count;
1865
1866         count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
1867         vm_map_lock(map);
1868
1869         VM_MAP_RANGE_CHECK(map, start, end);
1870
1871         if (vm_map_lookup_entry(map, start, &entry)) {
1872                 vm_map_clip_start(map, entry, start, &count);
1873         } else {
1874                 entry = entry->next;
1875         }
1876
1877         /*
1878          * Make a first pass to check for protection violations.
1879          */
1880         current = entry;
1881         while ((current != &map->header) && (current->start < end)) {
1882                 if (current->maptype == VM_MAPTYPE_SUBMAP) {
1883                         vm_map_unlock(map);
1884                         vm_map_entry_release(count);
1885                         return (KERN_INVALID_ARGUMENT);
1886                 }
1887                 if ((new_prot & current->max_protection) != new_prot) {
1888                         vm_map_unlock(map);
1889                         vm_map_entry_release(count);
1890                         return (KERN_PROTECTION_FAILURE);
1891                 }
1892                 current = current->next;
1893         }
1894
1895         /*
1896          * Go back and fix up protections. [Note that clipping is not
1897          * necessary the second time.]
1898          */
1899         current = entry;
1900
1901         while ((current != &map->header) && (current->start < end)) {
1902                 vm_prot_t old_prot;
1903
1904                 vm_map_clip_end(map, current, end, &count);
1905
1906                 old_prot = current->protection;
1907                 if (set_max) {
1908                         current->max_protection = new_prot;
1909                         current->protection = new_prot & old_prot;
1910                 } else {
1911                         current->protection = new_prot;
1912                 }
1913
1914                 /*
1915                  * Update physical map if necessary. Worry about copy-on-write
1916                  * here -- CHECK THIS XXX
1917                  */
1918
1919                 if (current->protection != old_prot) {
1920 #define MASK(entry)     (((entry)->eflags & MAP_ENTRY_COW) ? ~VM_PROT_WRITE : \
1921                                                         VM_PROT_ALL)
1922
1923                         pmap_protect(map->pmap, current->start,
1924                             current->end,
1925                             current->protection & MASK(current));
1926 #undef  MASK
1927                 }
1928
1929                 vm_map_simplify_entry(map, current, &count);
1930
1931                 current = current->next;
1932         }
1933
1934         vm_map_unlock(map);
1935         vm_map_entry_release(count);
1936         return (KERN_SUCCESS);
1937 }
1938
1939 /*
1940  * This routine traverses a processes map handling the madvise
1941  * system call.  Advisories are classified as either those effecting
1942  * the vm_map_entry structure, or those effecting the underlying
1943  * objects.
1944  *
1945  * The <value> argument is used for extended madvise calls.
1946  *
1947  * No requirements.
1948  */
1949 int
1950 vm_map_madvise(vm_map_t map, vm_offset_t start, vm_offset_t end,
1951                int behav, off_t value)
1952 {
1953         vm_map_entry_t current, entry;
1954         int modify_map = 0;
1955         int error = 0;
1956         int count;
1957
1958         /*
1959          * Some madvise calls directly modify the vm_map_entry, in which case
1960          * we need to use an exclusive lock on the map and we need to perform
1961          * various clipping operations.  Otherwise we only need a read-lock
1962          * on the map.
1963          */
1964         count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
1965
1966         switch(behav) {
1967         case MADV_NORMAL:
1968         case MADV_SEQUENTIAL:
1969         case MADV_RANDOM:
1970         case MADV_NOSYNC:
1971         case MADV_AUTOSYNC:
1972         case MADV_NOCORE:
1973         case MADV_CORE:
1974         case MADV_SETMAP:
1975                 modify_map = 1;
1976                 vm_map_lock(map);
1977                 break;
1978         case MADV_INVAL:
1979         case MADV_WILLNEED:
1980         case MADV_DONTNEED:
1981         case MADV_FREE:
1982                 vm_map_lock_read(map);
1983                 break;
1984         default:
1985                 vm_map_entry_release(count);
1986                 return (EINVAL);
1987         }
1988
1989         /*
1990          * Locate starting entry and clip if necessary.
1991          */
1992
1993         VM_MAP_RANGE_CHECK(map, start, end);
1994
1995         if (vm_map_lookup_entry(map, start, &entry)) {
1996                 if (modify_map)
1997                         vm_map_clip_start(map, entry, start, &count);
1998         } else {
1999                 entry = entry->next;
2000         }
2001
2002         if (modify_map) {
2003                 /*
2004                  * madvise behaviors that are implemented in the vm_map_entry.
2005                  *
2006                  * We clip the vm_map_entry so that behavioral changes are
2007                  * limited to the specified address range.
2008                  */
2009                 for (current = entry;
2010                      (current != &map->header) && (current->start < end);
2011                      current = current->next
2012                 ) {
2013                         if (current->maptype == VM_MAPTYPE_SUBMAP)
2014                                 continue;
2015
2016                         vm_map_clip_end(map, current, end, &count);
2017
2018                         switch (behav) {
2019                         case MADV_NORMAL:
2020                                 vm_map_entry_set_behavior(current, MAP_ENTRY_BEHAV_NORMAL);
2021                                 break;
2022                         case MADV_SEQUENTIAL:
2023                                 vm_map_entry_set_behavior(current, MAP_ENTRY_BEHAV_SEQUENTIAL);
2024                                 break;
2025                         case MADV_RANDOM:
2026                                 vm_map_entry_set_behavior(current, MAP_ENTRY_BEHAV_RANDOM);
2027                                 break;
2028                         case MADV_NOSYNC:
2029                                 current->eflags |= MAP_ENTRY_NOSYNC;
2030                                 break;
2031                         case MADV_AUTOSYNC:
2032                                 current->eflags &= ~MAP_ENTRY_NOSYNC;
2033                                 break;
2034                         case MADV_NOCORE:
2035                                 current->eflags |= MAP_ENTRY_NOCOREDUMP;
2036                                 break;
2037                         case MADV_CORE:
2038                                 current->eflags &= ~MAP_ENTRY_NOCOREDUMP;
2039                                 break;
2040                         case MADV_SETMAP:
2041                                 /*
2042                                  * Set the page directory page for a map
2043                                  * governed by a virtual page table.  Mark
2044                                  * the entry as being governed by a virtual
2045                                  * page table if it is not.
2046                                  *
2047                                  * XXX the page directory page is stored
2048                                  * in the avail_ssize field if the map_entry.
2049                                  *
2050                                  * XXX the map simplification code does not
2051                                  * compare this field so weird things may
2052                                  * happen if you do not apply this function
2053                                  * to the entire mapping governed by the
2054                                  * virtual page table.
2055                                  */
2056                                 if (current->maptype != VM_MAPTYPE_VPAGETABLE) {
2057                                         error = EINVAL;
2058                                         break;
2059                                 }
2060                                 current->aux.master_pde = value;
2061                                 pmap_remove(map->pmap,
2062                                             current->start, current->end);
2063                                 break;
2064                         case MADV_INVAL:
2065                                 /*
2066                                  * Invalidate the related pmap entries, used
2067                                  * to flush portions of the real kernel's
2068                                  * pmap when the caller has removed or
2069                                  * modified existing mappings in a virtual
2070                                  * page table.
2071                                  *
2072                                  * (exclusive locked map version does not
2073                                  * need the range interlock).
2074                                  */
2075                                 pmap_remove(map->pmap,
2076                                             current->start, current->end);
2077                                 break;
2078                         default:
2079                                 error = EINVAL;
2080                                 break;
2081                         }
2082                         vm_map_simplify_entry(map, current, &count);
2083                 }
2084                 vm_map_unlock(map);
2085         } else {
2086                 vm_pindex_t pindex;
2087                 vm_pindex_t delta;
2088
2089                 /*
2090                  * madvise behaviors that are implemented in the underlying
2091                  * vm_object.
2092                  *
2093                  * Since we don't clip the vm_map_entry, we have to clip
2094                  * the vm_object pindex and count.
2095                  *
2096                  * NOTE!  These functions are only supported on normal maps,
2097                  *        except MADV_INVAL which is also supported on
2098                  *        virtual page tables.
2099                  */
2100                 for (current = entry;
2101                      (current != &map->header) && (current->start < end);
2102                      current = current->next
2103                 ) {
2104                         vm_offset_t useStart;
2105
2106                         if (current->maptype != VM_MAPTYPE_NORMAL &&
2107                             (current->maptype != VM_MAPTYPE_VPAGETABLE ||
2108                              behav != MADV_INVAL)) {
2109                                 continue;
2110                         }
2111
2112                         pindex = OFF_TO_IDX(current->offset);
2113                         delta = atop(current->end - current->start);
2114                         useStart = current->start;
2115
2116                         if (current->start < start) {
2117                                 pindex += atop(start - current->start);
2118                                 delta -= atop(start - current->start);
2119                                 useStart = start;
2120                         }
2121                         if (current->end > end)
2122                                 delta -= atop(current->end - end);
2123
2124                         if ((vm_spindex_t)delta <= 0)
2125                                 continue;
2126
2127                         if (behav == MADV_INVAL) {
2128                                 /*
2129                                  * Invalidate the related pmap entries, used
2130                                  * to flush portions of the real kernel's
2131                                  * pmap when the caller has removed or
2132                                  * modified existing mappings in a virtual
2133                                  * page table.
2134                                  *
2135                                  * (shared locked map version needs the
2136                                  * interlock, see vm_fault()).
2137                                  */
2138                                 struct vm_map_ilock ilock;
2139
2140                                 KASSERT(useStart >= VM_MIN_USER_ADDRESS &&
2141                                             useStart + ptoa(delta) <=
2142                                             VM_MAX_USER_ADDRESS,
2143                                          ("Bad range %016jx-%016jx (%016jx)",
2144                                          useStart, useStart + ptoa(delta),
2145                                          delta));
2146                                 vm_map_interlock(map, &ilock,
2147                                                  useStart,
2148                                                  useStart + ptoa(delta));
2149                                 pmap_remove(map->pmap,
2150                                             useStart,
2151                                             useStart + ptoa(delta));
2152                                 vm_map_deinterlock(map, &ilock);
2153                         } else {
2154                                 vm_object_madvise(current->object.vm_object,
2155                                                   pindex, delta, behav);
2156                         }
2157
2158                         /*
2159                          * Try to populate the page table.  Mappings governed
2160                          * by virtual page tables cannot be pre-populated
2161                          * without a lot of work so don't try.
2162                          */
2163                         if (behav == MADV_WILLNEED &&
2164                             current->maptype != VM_MAPTYPE_VPAGETABLE) {
2165                                 pmap_object_init_pt(
2166                                     map->pmap,
2167                                     useStart,
2168                                     current->protection,
2169                                     current->object.vm_object,
2170                                     pindex,
2171                                     (count << PAGE_SHIFT),
2172                                     MAP_PREFAULT_MADVISE
2173                                 );
2174                         }
2175                 }
2176                 vm_map_unlock_read(map);
2177         }
2178         vm_map_entry_release(count);
2179         return(error);
2180 }
2181
2182
2183 /*
2184  * Sets the inheritance of the specified address range in the target map.
2185  * Inheritance affects how the map will be shared with child maps at the
2186  * time of vm_map_fork.
2187  */
2188 int
2189 vm_map_inherit(vm_map_t map, vm_offset_t start, vm_offset_t end,
2190                vm_inherit_t new_inheritance)
2191 {
2192         vm_map_entry_t entry;
2193         vm_map_entry_t temp_entry;
2194         int count;
2195
2196         switch (new_inheritance) {
2197         case VM_INHERIT_NONE:
2198         case VM_INHERIT_COPY:
2199         case VM_INHERIT_SHARE:
2200                 break;
2201         default:
2202                 return (KERN_INVALID_ARGUMENT);
2203         }
2204
2205         count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
2206         vm_map_lock(map);
2207
2208         VM_MAP_RANGE_CHECK(map, start, end);
2209
2210         if (vm_map_lookup_entry(map, start, &temp_entry)) {
2211                 entry = temp_entry;
2212                 vm_map_clip_start(map, entry, start, &count);
2213         } else
2214                 entry = temp_entry->next;
2215
2216         while ((entry != &map->header) && (entry->start < end)) {
2217                 vm_map_clip_end(map, entry, end, &count);
2218
2219                 entry->inheritance = new_inheritance;
2220
2221                 vm_map_simplify_entry(map, entry, &count);
2222
2223                 entry = entry->next;
2224         }
2225         vm_map_unlock(map);
2226         vm_map_entry_release(count);
2227         return (KERN_SUCCESS);
2228 }
2229
2230 /*
2231  * Implement the semantics of mlock
2232  */
2233 int
2234 vm_map_unwire(vm_map_t map, vm_offset_t start, vm_offset_t real_end,
2235               boolean_t new_pageable)
2236 {
2237         vm_map_entry_t entry;
2238         vm_map_entry_t start_entry;
2239         vm_offset_t end;
2240         int rv = KERN_SUCCESS;
2241         int count;
2242
2243         count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
2244         vm_map_lock(map);
2245         VM_MAP_RANGE_CHECK(map, start, real_end);
2246         end = real_end;
2247
2248         start_entry = vm_map_clip_range(map, start, end, &count,
2249                                         MAP_CLIP_NO_HOLES);
2250         if (start_entry == NULL) {
2251                 vm_map_unlock(map);
2252                 vm_map_entry_release(count);
2253                 return (KERN_INVALID_ADDRESS);
2254         }
2255
2256         if (new_pageable == 0) {
2257                 entry = start_entry;
2258                 while ((entry != &map->header) && (entry->start < end)) {
2259                         vm_offset_t save_start;
2260                         vm_offset_t save_end;
2261
2262                         /*
2263                          * Already user wired or hard wired (trivial cases)
2264                          */
2265                         if (entry->eflags & MAP_ENTRY_USER_WIRED) {
2266                                 entry = entry->next;
2267                                 continue;
2268                         }
2269                         if (entry->wired_count != 0) {
2270                                 entry->wired_count++;
2271                                 entry->eflags |= MAP_ENTRY_USER_WIRED;
2272                                 entry = entry->next;
2273                                 continue;
2274                         }
2275
2276                         /*
2277                          * A new wiring requires instantiation of appropriate
2278                          * management structures and the faulting in of the
2279                          * page.
2280                          */
2281                         if (entry->maptype == VM_MAPTYPE_NORMAL ||
2282                             entry->maptype == VM_MAPTYPE_VPAGETABLE) {
2283                                 int copyflag = entry->eflags &
2284                                                MAP_ENTRY_NEEDS_COPY;
2285                                 if (copyflag && ((entry->protection &
2286                                                   VM_PROT_WRITE) != 0)) {
2287                                         vm_map_entry_shadow(entry, 0);
2288                                 } else if (entry->object.vm_object == NULL &&
2289                                            !map->system_map) {
2290                                         vm_map_entry_allocate_object(entry);
2291                                 }
2292                         }
2293                         entry->wired_count++;
2294                         entry->eflags |= MAP_ENTRY_USER_WIRED;
2295
2296                         /*
2297                          * Now fault in the area.  Note that vm_fault_wire()
2298                          * may release the map lock temporarily, it will be
2299                          * relocked on return.  The in-transition
2300                          * flag protects the entries.
2301                          */
2302                         save_start = entry->start;
2303                         save_end = entry->end;
2304                         rv = vm_fault_wire(map, entry, TRUE, 0);
2305                         if (rv) {
2306                                 CLIP_CHECK_BACK(entry, save_start);
2307                                 for (;;) {
2308                                         KASSERT(entry->wired_count == 1, ("bad wired_count on entry"));
2309                                         entry->eflags &= ~MAP_ENTRY_USER_WIRED;
2310                                         entry->wired_count = 0;
2311                                         if (entry->end == save_end)
2312                                                 break;
2313                                         entry = entry->next;
2314                                         KASSERT(entry != &map->header, ("bad entry clip during backout"));
2315                                 }
2316                                 end = save_start;       /* unwire the rest */
2317                                 break;
2318                         }
2319                         /*
2320                          * note that even though the entry might have been
2321                          * clipped, the USER_WIRED flag we set prevents
2322                          * duplication so we do not have to do a
2323                          * clip check.
2324                          */
2325                         entry = entry->next;
2326                 }
2327
2328                 /*
2329                  * If we failed fall through to the unwiring section to
2330                  * unwire what we had wired so far.  'end' has already
2331                  * been adjusted.
2332                  */
2333                 if (rv)
2334                         new_pageable = 1;
2335
2336                 /*
2337                  * start_entry might have been clipped if we unlocked the
2338                  * map and blocked.  No matter how clipped it has gotten
2339                  * there should be a fragment that is on our start boundary.
2340                  */
2341                 CLIP_CHECK_BACK(start_entry, start);
2342         }
2343
2344         /*
2345          * Deal with the unwiring case.
2346          */
2347         if (new_pageable) {
2348                 /*
2349                  * This is the unwiring case.  We must first ensure that the
2350                  * range to be unwired is really wired down.  We know there
2351                  * are no holes.
2352                  */
2353                 entry = start_entry;
2354                 while ((entry != &map->header) && (entry->start < end)) {
2355                         if ((entry->eflags & MAP_ENTRY_USER_WIRED) == 0) {
2356                                 rv = KERN_INVALID_ARGUMENT;
2357                                 goto done;
2358                         }
2359                         KASSERT(entry->wired_count != 0, ("wired count was 0 with USER_WIRED set! %p", entry));
2360                         entry = entry->next;
2361                 }
2362
2363                 /*
2364                  * Now decrement the wiring count for each region. If a region
2365                  * becomes completely unwired, unwire its physical pages and
2366                  * mappings.
2367                  */
2368                 /*
2369                  * The map entries are processed in a loop, checking to
2370                  * make sure the entry is wired and asserting it has a wired
2371                  * count. However, another loop was inserted more-or-less in
2372                  * the middle of the unwiring path. This loop picks up the
2373                  * "entry" loop variable from the first loop without first
2374                  * setting it to start_entry. Naturally, the second loop
2375                  * is never entered and the pages backing the entries are
2376                  * never unwired. This can lead to a leak of wired pages.
2377                  */
2378                 entry = start_entry;
2379                 while ((entry != &map->header) && (entry->start < end)) {
2380                         KASSERT(entry->eflags & MAP_ENTRY_USER_WIRED,
2381                                 ("expected USER_WIRED on entry %p", entry));
2382                         entry->eflags &= ~MAP_ENTRY_USER_WIRED;
2383                         entry->wired_count--;
2384                         if (entry->wired_count == 0)
2385                                 vm_fault_unwire(map, entry);
2386                         entry = entry->next;
2387                 }
2388         }
2389 done:
2390         vm_map_unclip_range(map, start_entry, start, real_end, &count,
2391                 MAP_CLIP_NO_HOLES);
2392         map->timestamp++;
2393         vm_map_unlock(map);
2394         vm_map_entry_release(count);
2395         return (rv);
2396 }
2397
2398 /*
2399  * Sets the pageability of the specified address range in the target map.
2400  * Regions specified as not pageable require locked-down physical
2401  * memory and physical page maps.
2402  *
2403  * The map must not be locked, but a reference must remain to the map
2404  * throughout the call.
2405  *
2406  * This function may be called via the zalloc path and must properly
2407  * reserve map entries for kernel_map.
2408  *
2409  * No requirements.
2410  */
2411 int
2412 vm_map_wire(vm_map_t map, vm_offset_t start, vm_offset_t real_end, int kmflags)
2413 {
2414         vm_map_entry_t entry;
2415         vm_map_entry_t start_entry;
2416         vm_offset_t end;
2417         int rv = KERN_SUCCESS;
2418         int count;
2419
2420         if (kmflags & KM_KRESERVE)
2421                 count = vm_map_entry_kreserve(MAP_RESERVE_COUNT);
2422         else
2423                 count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
2424         vm_map_lock(map);
2425         VM_MAP_RANGE_CHECK(map, start, real_end);
2426         end = real_end;
2427
2428         start_entry = vm_map_clip_range(map, start, end, &count,
2429                                         MAP_CLIP_NO_HOLES);
2430         if (start_entry == NULL) {
2431                 vm_map_unlock(map);
2432                 rv = KERN_INVALID_ADDRESS;
2433                 goto failure;
2434         }
2435         if ((kmflags & KM_PAGEABLE) == 0) {
2436                 /*
2437                  * Wiring.
2438                  *
2439                  * 1.  Holding the write lock, we create any shadow or zero-fill
2440                  * objects that need to be created. Then we clip each map
2441                  * entry to the region to be wired and increment its wiring
2442                  * count.  We create objects before clipping the map entries
2443                  * to avoid object proliferation.
2444                  *
2445                  * 2.  We downgrade to a read lock, and call vm_fault_wire to
2446                  * fault in the pages for any newly wired area (wired_count is
2447                  * 1).
2448                  *
2449                  * Downgrading to a read lock for vm_fault_wire avoids a
2450                  * possible deadlock with another process that may have faulted
2451                  * on one of the pages to be wired (it would mark the page busy,
2452                  * blocking us, then in turn block on the map lock that we
2453                  * hold).  Because of problems in the recursive lock package,
2454                  * we cannot upgrade to a write lock in vm_map_lookup.  Thus,
2455                  * any actions that require the write lock must be done
2456                  * beforehand.  Because we keep the read lock on the map, the
2457                  * copy-on-write status of the entries we modify here cannot
2458                  * change.
2459                  */
2460                 entry = start_entry;
2461                 while ((entry != &map->header) && (entry->start < end)) {
2462                         /*
2463                          * Trivial case if the entry is already wired
2464                          */
2465                         if (entry->wired_count) {
2466                                 entry->wired_count++;
2467                                 entry = entry->next;
2468                                 continue;
2469                         }
2470
2471                         /*
2472                          * The entry is being newly wired, we have to setup
2473                          * appropriate management structures.  A shadow
2474                          * object is required for a copy-on-write region,
2475                          * or a normal object for a zero-fill region.  We
2476                          * do not have to do this for entries that point to sub
2477                          * maps because we won't hold the lock on the sub map.
2478                          */
2479                         if (entry->maptype == VM_MAPTYPE_NORMAL ||
2480                             entry->maptype == VM_MAPTYPE_VPAGETABLE) {
2481                                 int copyflag = entry->eflags &
2482                                                MAP_ENTRY_NEEDS_COPY;
2483                                 if (copyflag && ((entry->protection &
2484                                                   VM_PROT_WRITE) != 0)) {
2485                                         vm_map_entry_shadow(entry, 0);
2486                                 } else if (entry->object.vm_object == NULL &&
2487                                            !map->system_map) {
2488                                         vm_map_entry_allocate_object(entry);
2489                                 }
2490                         }
2491
2492                         entry->wired_count++;
2493                         entry = entry->next;
2494                 }
2495
2496                 /*
2497                  * Pass 2.
2498                  */
2499
2500                 /*
2501                  * HACK HACK HACK HACK
2502                  *
2503                  * vm_fault_wire() temporarily unlocks the map to avoid
2504                  * deadlocks.  The in-transition flag from vm_map_clip_range
2505                  * call should protect us from changes while the map is
2506                  * unlocked.  T
2507                  *
2508                  * NOTE: Previously this comment stated that clipping might
2509                  *       still occur while the entry is unlocked, but from
2510                  *       what I can tell it actually cannot.
2511                  *
2512                  *       It is unclear whether the CLIP_CHECK_*() calls
2513                  *       are still needed but we keep them in anyway.
2514                  *
2515                  * HACK HACK HACK HACK
2516                  */
2517
2518                 entry = start_entry;
2519                 while (entry != &map->header && entry->start < end) {
2520                         /*
2521                          * If vm_fault_wire fails for any page we need to undo
2522                          * what has been done.  We decrement the wiring count
2523                          * for those pages which have not yet been wired (now)
2524                          * and unwire those that have (later).
2525                          */
2526                         vm_offset_t save_start = entry->start;
2527                         vm_offset_t save_end = entry->end;
2528
2529                         if (entry->wired_count == 1)
2530                                 rv = vm_fault_wire(map, entry, FALSE, kmflags);
2531                         if (rv) {
2532                                 CLIP_CHECK_BACK(entry, save_start);
2533                                 for (;;) {
2534                                         KASSERT(entry->wired_count == 1, ("wired_count changed unexpectedly"));
2535                                         entry->wired_count = 0;
2536                                         if (entry->end == save_end)
2537                                                 break;
2538                                         entry = entry->next;
2539                                         KASSERT(entry != &map->header, ("bad entry clip during backout"));
2540                                 }
2541                                 end = save_start;
2542                                 break;
2543                         }
2544                         CLIP_CHECK_FWD(entry, save_end);
2545                         entry = entry->next;
2546                 }
2547
2548                 /*
2549                  * If a failure occured undo everything by falling through
2550                  * to the unwiring code.  'end' has already been adjusted
2551                  * appropriately.
2552                  */
2553                 if (rv)
2554                         kmflags |= KM_PAGEABLE;
2555
2556                 /*
2557                  * start_entry is still IN_TRANSITION but may have been
2558                  * clipped since vm_fault_wire() unlocks and relocks the
2559                  * map.  No matter how clipped it has gotten there should
2560                  * be a fragment that is on our start boundary.
2561                  */
2562                 CLIP_CHECK_BACK(start_entry, start);
2563         }
2564
2565         if (kmflags & KM_PAGEABLE) {
2566                 /*
2567                  * This is the unwiring case.  We must first ensure that the
2568                  * range to be unwired is really wired down.  We know there
2569                  * are no holes.
2570                  */
2571                 entry = start_entry;
2572                 while ((entry != &map->header) && (entry->start < end)) {
2573                         if (entry->wired_count == 0) {
2574                                 rv = KERN_INVALID_ARGUMENT;
2575                                 goto done;
2576                         }
2577                         entry = entry->next;
2578                 }
2579
2580                 /*
2581                  * Now decrement the wiring count for each region. If a region
2582                  * becomes completely unwired, unwire its physical pages and
2583                  * mappings.
2584                  */
2585                 entry = start_entry;
2586                 while ((entry != &map->header) && (entry->start < end)) {
2587                         entry->wired_count--;
2588                         if (entry->wired_count == 0)
2589                                 vm_fault_unwire(map, entry);
2590                         entry = entry->next;
2591                 }
2592         }
2593 done:
2594         vm_map_unclip_range(map, start_entry, start, real_end,
2595                             &count, MAP_CLIP_NO_HOLES);
2596         map->timestamp++;
2597         vm_map_unlock(map);
2598 failure:
2599         if (kmflags & KM_KRESERVE)
2600                 vm_map_entry_krelease(count);
2601         else
2602                 vm_map_entry_release(count);
2603         return (rv);
2604 }
2605
2606 /*
2607  * Mark a newly allocated address range as wired but do not fault in
2608  * the pages.  The caller is expected to load the pages into the object.
2609  *
2610  * The map must be locked on entry and will remain locked on return.
2611  * No other requirements.
2612  */
2613 void
2614 vm_map_set_wired_quick(vm_map_t map, vm_offset_t addr, vm_size_t size,
2615                        int *countp)
2616 {
2617         vm_map_entry_t scan;
2618         vm_map_entry_t entry;
2619
2620         entry = vm_map_clip_range(map, addr, addr + size,
2621                                   countp, MAP_CLIP_NO_HOLES);
2622         for (scan = entry;
2623              scan != &map->header && scan->start < addr + size;
2624              scan = scan->next) {
2625             KKASSERT(scan->wired_count == 0);
2626             scan->wired_count = 1;
2627         }
2628         vm_map_unclip_range(map, entry, addr, addr + size,
2629                             countp, MAP_CLIP_NO_HOLES);
2630 }
2631
2632 /*
2633  * Push any dirty cached pages in the address range to their pager.
2634  * If syncio is TRUE, dirty pages are written synchronously.
2635  * If invalidate is TRUE, any cached pages are freed as well.
2636  *
2637  * This routine is called by sys_msync()
2638  *
2639  * Returns an error if any part of the specified range is not mapped.
2640  *
2641  * No requirements.
2642  */
2643 int
2644 vm_map_clean(vm_map_t map, vm_offset_t start, vm_offset_t end,
2645              boolean_t syncio, boolean_t invalidate)
2646 {
2647         vm_map_entry_t current;
2648         vm_map_entry_t entry;
2649         vm_size_t size;
2650         vm_object_t object;
2651         vm_object_t tobj;
2652         vm_ooffset_t offset;
2653
2654         vm_map_lock_read(map);
2655         VM_MAP_RANGE_CHECK(map, start, end);
2656         if (!vm_map_lookup_entry(map, start, &entry)) {
2657                 vm_map_unlock_read(map);
2658                 return (KERN_INVALID_ADDRESS);
2659         }
2660         lwkt_gettoken(&map->token);
2661
2662         /*
2663          * Make a first pass to check for holes.
2664          */
2665         for (current = entry; current->start < end; current = current->next) {
2666                 if (current->maptype == VM_MAPTYPE_SUBMAP) {
2667                         lwkt_reltoken(&map->token);
2668                         vm_map_unlock_read(map);
2669                         return (KERN_INVALID_ARGUMENT);
2670                 }
2671                 if (end > current->end &&
2672                     (current->next == &map->header ||
2673                         current->end != current->next->start)) {
2674                         lwkt_reltoken(&map->token);
2675                         vm_map_unlock_read(map);
2676                         return (KERN_INVALID_ADDRESS);
2677                 }
2678         }
2679
2680         if (invalidate)
2681                 pmap_remove(vm_map_pmap(map), start, end);
2682
2683         /*
2684          * Make a second pass, cleaning/uncaching pages from the indicated
2685          * objects as we go.
2686          */
2687         for (current = entry; current->start < end; current = current->next) {
2688                 offset = current->offset + (start - current->start);
2689                 size = (end <= current->end ? end : current->end) - start;
2690
2691                 switch(current->maptype) {
2692                 case VM_MAPTYPE_SUBMAP:
2693                 {
2694                         vm_map_t smap;
2695                         vm_map_entry_t tentry;
2696                         vm_size_t tsize;
2697
2698                         smap = current->object.sub_map;
2699                         vm_map_lock_read(smap);
2700                         vm_map_lookup_entry(smap, offset, &tentry);
2701                         tsize = tentry->end - offset;
2702                         if (tsize < size)
2703                                 size = tsize;
2704                         object = tentry->object.vm_object;
2705                         offset = tentry->offset + (offset - tentry->start);
2706                         vm_map_unlock_read(smap);
2707                         break;
2708                 }
2709                 case VM_MAPTYPE_NORMAL:
2710                 case VM_MAPTYPE_VPAGETABLE:
2711                         object = current->object.vm_object;
2712                         break;
2713                 default:
2714                         object = NULL;
2715                         break;
2716                 }
2717
2718                 if (object)
2719                         vm_object_hold(object);
2720
2721                 /*
2722                  * Note that there is absolutely no sense in writing out
2723                  * anonymous objects, so we track down the vnode object
2724                  * to write out.
2725                  * We invalidate (remove) all pages from the address space
2726                  * anyway, for semantic correctness.
2727                  *
2728                  * note: certain anonymous maps, such as MAP_NOSYNC maps,
2729                  * may start out with a NULL object.
2730                  */
2731                 while (object && (tobj = object->backing_object) != NULL) {
2732                         vm_object_hold(tobj);
2733                         if (tobj == object->backing_object) {
2734                                 vm_object_lock_swap();
2735                                 offset += object->backing_object_offset;
2736                                 vm_object_drop(object);
2737                                 object = tobj;
2738                                 if (object->size < OFF_TO_IDX(offset + size))
2739                                         size = IDX_TO_OFF(object->size) -
2740                                                offset;
2741                                 break;
2742                         }
2743                         vm_object_drop(tobj);
2744                 }
2745                 if (object && (object->type == OBJT_VNODE) &&
2746                     (current->protection & VM_PROT_WRITE) &&
2747                     (object->flags & OBJ_NOMSYNC) == 0) {
2748                         /*
2749                          * Flush pages if writing is allowed, invalidate them
2750                          * if invalidation requested.  Pages undergoing I/O
2751                          * will be ignored by vm_object_page_remove().
2752                          *
2753                          * We cannot lock the vnode and then wait for paging
2754                          * to complete without deadlocking against vm_fault.
2755                          * Instead we simply call vm_object_page_remove() and
2756                          * allow it to block internally on a page-by-page
2757                          * basis when it encounters pages undergoing async
2758                          * I/O.
2759                          */
2760                         int flags;
2761
2762                         /* no chain wait needed for vnode objects */
2763                         vm_object_reference_locked(object);
2764                         vn_lock(object->handle, LK_EXCLUSIVE | LK_RETRY);
2765                         flags = (syncio || invalidate) ? OBJPC_SYNC : 0;
2766                         flags |= invalidate ? OBJPC_INVAL : 0;
2767
2768                         /*
2769                          * When operating on a virtual page table just
2770                          * flush the whole object.  XXX we probably ought
2771                          * to
2772                          */
2773                         switch(current->maptype) {
2774                         case VM_MAPTYPE_NORMAL:
2775                                 vm_object_page_clean(object,
2776                                     OFF_TO_IDX(offset),
2777                                     OFF_TO_IDX(offset + size + PAGE_MASK),
2778                                     flags);
2779                                 break;
2780                         case VM_MAPTYPE_VPAGETABLE:
2781                                 vm_object_page_clean(object, 0, 0, flags);
2782                                 break;
2783                         }
2784                         vn_unlock(((struct vnode *)object->handle));
2785                         vm_object_deallocate_locked(object);
2786                 }
2787                 if (object && invalidate &&
2788                    ((object->type == OBJT_VNODE) ||
2789                     (object->type == OBJT_DEVICE) ||
2790                     (object->type == OBJT_MGTDEVICE))) {
2791                         int clean_only =
2792                                 ((object->type == OBJT_DEVICE) ||
2793                                 (object->type == OBJT_MGTDEVICE)) ? FALSE : TRUE;
2794                         /* no chain wait needed for vnode/device objects */
2795                         vm_object_reference_locked(object);
2796                         switch(current->maptype) {
2797                         case VM_MAPTYPE_NORMAL:
2798                                 vm_object_page_remove(object,
2799                                     OFF_TO_IDX(offset),
2800                                     OFF_TO_IDX(offset + size + PAGE_MASK),
2801                                     clean_only);
2802                                 break;
2803                         case VM_MAPTYPE_VPAGETABLE:
2804                                 vm_object_page_remove(object, 0, 0, clean_only);
2805                                 break;
2806                         }
2807                         vm_object_deallocate_locked(object);
2808                 }
2809                 start += size;
2810                 if (object)
2811                         vm_object_drop(object);
2812         }
2813
2814         lwkt_reltoken(&map->token);
2815         vm_map_unlock_read(map);
2816
2817         return (KERN_SUCCESS);
2818 }
2819
2820 /*
2821  * Make the region specified by this entry pageable.
2822  *
2823  * The vm_map must be exclusively locked.
2824  */
2825 static void
2826 vm_map_entry_unwire(vm_map_t map, vm_map_entry_t entry)
2827 {
2828         entry->eflags &= ~MAP_ENTRY_USER_WIRED;
2829         entry->wired_count = 0;
2830         vm_fault_unwire(map, entry);
2831 }
2832
2833 /*
2834  * Deallocate the given entry from the target map.
2835  *
2836  * The vm_map must be exclusively locked.
2837  */
2838 static void
2839 vm_map_entry_delete(vm_map_t map, vm_map_entry_t entry, int *countp)
2840 {
2841         vm_map_entry_unlink(map, entry);
2842         map->size -= entry->end - entry->start;
2843
2844         switch(entry->maptype) {
2845         case VM_MAPTYPE_NORMAL:
2846         case VM_MAPTYPE_VPAGETABLE:
2847         case VM_MAPTYPE_SUBMAP:
2848                 vm_object_deallocate(entry->object.vm_object);
2849                 break;
2850         case VM_MAPTYPE_UKSMAP:
2851                 /* XXX TODO */
2852                 break;
2853         default:
2854                 break;
2855         }
2856
2857         vm_map_entry_dispose(map, entry, countp);
2858 }
2859
2860 /*
2861  * Deallocates the given address range from the target map.
2862  *
2863  * The vm_map must be exclusively locked.
2864  */
2865 int
2866 vm_map_delete(vm_map_t map, vm_offset_t start, vm_offset_t end, int *countp)
2867 {
2868         vm_object_t object;
2869         vm_map_entry_t entry;
2870         vm_map_entry_t first_entry;
2871
2872         ASSERT_VM_MAP_LOCKED(map);
2873         lwkt_gettoken(&map->token);
2874 again:
2875         /*
2876          * Find the start of the region, and clip it.  Set entry to point
2877          * at the first record containing the requested address or, if no
2878          * such record exists, the next record with a greater address.  The
2879          * loop will run from this point until a record beyond the termination
2880          * address is encountered.
2881          *
2882          * map->hint must be adjusted to not point to anything we delete,
2883          * so set it to the entry prior to the one being deleted.
2884          *
2885          * GGG see other GGG comment.
2886          */
2887         if (vm_map_lookup_entry(map, start, &first_entry)) {
2888                 entry = first_entry;
2889                 vm_map_clip_start(map, entry, start, countp);
2890                 map->hint = entry->prev;        /* possible problem XXX */
2891         } else {
2892                 map->hint = first_entry;        /* possible problem XXX */
2893                 entry = first_entry->next;
2894         }
2895
2896         /*
2897          * If a hole opens up prior to the current first_free then
2898          * adjust first_free.  As with map->hint, map->first_free
2899          * cannot be left set to anything we might delete.
2900          */
2901         if (entry == &map->header) {
2902                 map->first_free = &map->header;
2903         } else if (map->first_free->start >= start) {
2904                 map->first_free = entry->prev;
2905         }
2906
2907         /*
2908          * Step through all entries in this region
2909          */
2910         while ((entry != &map->header) && (entry->start < end)) {
2911                 vm_map_entry_t next;
2912                 vm_offset_t s, e;
2913                 vm_pindex_t offidxstart, offidxend, count;
2914
2915                 /*
2916                  * If we hit an in-transition entry we have to sleep and
2917                  * retry.  It's easier (and not really slower) to just retry
2918                  * since this case occurs so rarely and the hint is already
2919                  * pointing at the right place.  We have to reset the
2920                  * start offset so as not to accidently delete an entry
2921                  * another process just created in vacated space.
2922                  */
2923                 if (entry->eflags & MAP_ENTRY_IN_TRANSITION) {
2924                         entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
2925                         start = entry->start;
2926                         ++mycpu->gd_cnt.v_intrans_coll;
2927                         ++mycpu->gd_cnt.v_intrans_wait;
2928                         vm_map_transition_wait(map);
2929                         goto again;
2930                 }
2931                 vm_map_clip_end(map, entry, end, countp);
2932
2933                 s = entry->start;
2934                 e = entry->end;
2935                 next = entry->next;
2936
2937                 offidxstart = OFF_TO_IDX(entry->offset);
2938                 count = OFF_TO_IDX(e - s);
2939
2940                 switch(entry->maptype) {
2941                 case VM_MAPTYPE_NORMAL:
2942                 case VM_MAPTYPE_VPAGETABLE:
2943                 case VM_MAPTYPE_SUBMAP:
2944                         object = entry->object.vm_object;
2945                         break;
2946                 default:
2947                         object = NULL;
2948                         break;
2949                 }
2950
2951                 /*
2952                  * Unwire before removing addresses from the pmap; otherwise,
2953                  * unwiring will put the entries back in the pmap.
2954                  */
2955                 if (entry->wired_count != 0)
2956                         vm_map_entry_unwire(map, entry);
2957
2958                 offidxend = offidxstart + count;
2959
2960                 if (object == &kernel_object) {
2961                         vm_object_hold(object);
2962                         vm_object_page_remove(object, offidxstart,
2963                                               offidxend, FALSE);
2964                         vm_object_drop(object);
2965                 } else if (object && object->type != OBJT_DEFAULT &&
2966                            object->type != OBJT_SWAP) {
2967                         /*
2968                          * vnode object routines cannot be chain-locked,
2969                          * but since we aren't removing pages from the
2970                          * object here we can use a shared hold.
2971                          */
2972                         vm_object_hold_shared(object);
2973                         pmap_remove(map->pmap, s, e);
2974                         vm_object_drop(object);
2975                 } else if (object) {
2976                         vm_object_hold(object);
2977                         vm_object_chain_acquire(object, 0);
2978                         pmap_remove(map->pmap, s, e);
2979
2980                         if (object != NULL &&
2981                             object->ref_count != 1 &&
2982                             (object->flags & (OBJ_NOSPLIT|OBJ_ONEMAPPING)) ==
2983                              OBJ_ONEMAPPING &&
2984                             (object->type == OBJT_DEFAULT ||
2985                              object->type == OBJT_SWAP)) {
2986                                 vm_object_collapse(object, NULL);
2987                                 vm_object_page_remove(object, offidxstart,
2988                                                       offidxend, FALSE);
2989                                 if (object->type == OBJT_SWAP) {
2990                                         swap_pager_freespace(object,
2991                                                              offidxstart,
2992                                                              count);
2993                                 }
2994                                 if (offidxend >= object->size &&
2995                                     offidxstart < object->size) {
2996                                         object->size = offidxstart;
2997                                 }
2998                         }
2999                         vm_object_chain_release(object);
3000                         vm_object_drop(object);
3001                 } else if (entry->maptype == VM_MAPTYPE_UKSMAP) {
3002                         pmap_remove(map->pmap, s, e);
3003                 }
3004
3005                 /*
3006                  * Delete the entry (which may delete the object) only after
3007                  * removing all pmap entries pointing to its pages.
3008                  * (Otherwise, its page frames may be reallocated, and any
3009                  * modify bits will be set in the wrong object!)
3010                  */
3011                 vm_map_entry_delete(map, entry, countp);
3012                 entry = next;
3013         }
3014         lwkt_reltoken(&map->token);
3015         return (KERN_SUCCESS);
3016 }
3017
3018 /*
3019  * Remove the given address range from the target map.
3020  * This is the exported form of vm_map_delete.
3021  *
3022  * No requirements.
3023  */
3024 int
3025 vm_map_remove(vm_map_t map, vm_offset_t start, vm_offset_t end)
3026 {
3027         int result;
3028         int count;
3029
3030         count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
3031         vm_map_lock(map);
3032         VM_MAP_RANGE_CHECK(map, start, end);
3033         result = vm_map_delete(map, start, end, &count);
3034         vm_map_unlock(map);
3035         vm_map_entry_release(count);
3036
3037         return (result);
3038 }
3039
3040 /*
3041  * Check that the whole range from start up to end is mapped without
3042  * holes and that every entry covering it grants all of the requested
3043  * protection bits.  Returns TRUE on success, FALSE otherwise.
3044  *
3045  * The caller states via have_lock whether it already holds the map lock;
3046  * when have_lock is FALSE a read lock is taken for the duration.
3047  */
3048 boolean_t
3049 vm_map_check_protection(vm_map_t map, vm_offset_t start, vm_offset_t end,
3050                         vm_prot_t protection, boolean_t have_lock)
3051 {
3052         vm_map_entry_t cur;
3053         boolean_t ok;
3054
3055         if (have_lock == FALSE)
3056                 vm_map_lock_read(map);
3057
3058         if (vm_map_lookup_entry(map, start, &cur)) {
3059                 /*
3060                  * Walk forward one entry at a time.  Running off the end
3061                  * of the map, hitting a hole, or finding an entry that
3062                  * lacks a requested protection bit fails the check.
3063                  */
3064                 ok = TRUE;
3065                 for (; start < end; start = cur->end, cur = cur->next) {
3066                         if (cur == &map->header ||
3067                             start < cur->start ||
3068                             (cur->protection & protection) != protection) {
3069                                 ok = FALSE;
3070                                 break;
3071                         }
3072                 }
3073         } else {
3074                 /* The lookup found no entry containing start. */
3075                 ok = FALSE;
3076         }
3077
3078         if (have_lock == FALSE)
3079                 vm_map_unlock_read(map);
3080         return (ok);
3081 }
3095
3096 /*
3097  * If appropriate this function shadows the original object with a new object
3098  * and moves the VM pages from the original object to the new object.
3099  * The original object will also be collapsed, if possible.
3100  *
3101  * We can only do this for normal memory objects with a single mapping, and
3102  * it only makes sense to do it if there are 2 or more refs on the original
3103  * object.  i.e. typically a memory object that has been extended into
3104  * multiple vm_map_entry's with non-overlapping ranges.
3105  *
3106  * This makes it easier to remove unused pages and keeps object inheritance
3107  * from being a negative impact on memory usage.
3108  *
3109  * On return the (possibly new) entry->object.vm_object will have an
3110  * additional ref on it for the caller to dispose of (usually by cloning
3111  * the vm_map_entry).  The additional ref had to be done in this routine
3112  * to avoid racing a collapse.  The object's ONEMAPPING flag will also be
3113  * cleared.
3114  *
3115  * The vm_map must be locked and its token held.
3116  */
3117 static void
3118 vm_map_split(vm_map_entry_t entry)
3119 {
3120         /* OPTIMIZED */
3121         vm_object_t oobject, nobject, bobject;
3122         vm_offset_t s, e;
3123         vm_page_t m;
3124         vm_pindex_t offidxstart, offidxend, idx;
3125         vm_size_t size;
3126         vm_ooffset_t offset;
3127         int useshadowlist;
3128
3129         /*
3130          * Optimize away object locks for vnode objects.  Important exit/exec
3131          * critical path.
3132          *
3133          * OBJ_ONEMAPPING doesn't apply to vnode objects but clear the flag
3134          * anyway.
3135          */
3136         oobject = entry->object.vm_object;
3137         if (oobject->type != OBJT_DEFAULT && oobject->type != OBJT_SWAP) {
3138                 vm_object_reference_quick(oobject);
3139                 vm_object_clear_flag(oobject, OBJ_ONEMAPPING);
3140                 return;
3141         }
3142
3143         /*
3144          * Setup.  Chain lock the original object throughout the entire
3145          * routine to prevent new page faults from occurring.
3146          *
3147          * XXX can madvise WILLNEED interfere with us too?
3148          */
3149         vm_object_hold(oobject);
3150         vm_object_chain_acquire(oobject, 0);
3151
3152         /*
3153          * Original object cannot be split?  Might have also changed state.
3154          */
3155         if (oobject->handle == NULL || (oobject->type != OBJT_DEFAULT &&
3156                                         oobject->type != OBJT_SWAP)) {
3157                 vm_object_chain_release(oobject);
3158                 vm_object_reference_locked(oobject);
3159                 vm_object_clear_flag(oobject, OBJ_ONEMAPPING);
3160                 vm_object_drop(oobject);
3161                 return;
3162         }
3163
3164         /*
3165          * Collapse original object with its backing store as an
3166          * optimization to reduce chain lengths when possible.
3167          *
3168          * If ref_count <= 1 there aren't other non-overlapping vm_map_entry's
3169          * for oobject, so there's no point collapsing it.
3170          *
3171          * Then re-check whether the object can be split.
3172          */
3173         vm_object_collapse(oobject, NULL);
3174
3175         if (oobject->ref_count <= 1 ||
3176             (oobject->type != OBJT_DEFAULT && oobject->type != OBJT_SWAP) ||
3177             (oobject->flags & (OBJ_NOSPLIT|OBJ_ONEMAPPING)) != OBJ_ONEMAPPING) {
3178                 vm_object_chain_release(oobject);
3179                 vm_object_reference_locked(oobject);
3180                 vm_object_clear_flag(oobject, OBJ_ONEMAPPING);
3181                 vm_object_drop(oobject);
3182                 return;
3183         }
3184
3185         /*
3186          * Acquire the chain lock on the backing object and assert that it
3187          * is still oobject's backing object once we own it.
3188          * Give bobject an additional ref count for when it will be shadowed
3189          * by nobject.
3190          */
3191         useshadowlist = 0;
3192         if ((bobject = oobject->backing_object) != NULL) {
3193                 if (bobject->type != OBJT_VNODE) {
3194                         useshadowlist = 1;
3195                         vm_object_hold(bobject);
3196                         vm_object_chain_wait(bobject, 0);
3197                         /* ref for shadowing below */
3198                         vm_object_reference_locked(bobject);
3199                         vm_object_chain_acquire(bobject, 0);
3200                         KKASSERT(oobject->backing_object == bobject);
3201                         KKASSERT((bobject->flags & OBJ_DEAD) == 0);
3202                 } else {
3203                         /*
3204                          * vnodes are not placed on the shadow list but
3205                          * they still get another ref for the backing_object
3206                          * reference.
3207                          */
3208                         vm_object_reference_quick(bobject);
3209                 }
3210         }
3211
3212         /*
3213          * Calculate the object page range and allocate the new object.
3214          */
3215         offset = entry->offset;
3216         s = entry->start;
3217         e = entry->end;
3218
3219         offidxstart = OFF_TO_IDX(offset);
3220         offidxend = offidxstart + OFF_TO_IDX(e - s);
3221         size = offidxend - offidxstart;
3222
3223         switch(oobject->type) {
3224         case OBJT_DEFAULT:
3225                 nobject = default_pager_alloc(NULL, IDX_TO_OFF(size),
3226                                               VM_PROT_ALL, 0);
3227                 break;
3228         case OBJT_SWAP:
3229                 nobject = swap_pager_alloc(NULL, IDX_TO_OFF(size),
3230                                            VM_PROT_ALL, 0);
3231                 break;
3232         default:
3233                 /* not reached */
3234                 nobject = NULL;
3235                 KKASSERT(0);
3236         }
3237
3238         if (nobject == NULL) {
3239                 if (bobject) {
3240                         if (useshadowlist) {
3241                                 vm_object_chain_release(bobject);
3242                                 vm_object_deallocate(bobject);
3243                                 vm_object_drop(bobject);
3244                         } else {
3245                                 vm_object_deallocate(bobject);
3246                         }
3247                 }
3248                 vm_object_chain_release(oobject);
3249                 vm_object_reference_locked(oobject);
3250                 vm_object_clear_flag(oobject, OBJ_ONEMAPPING);
3251                 vm_object_drop(oobject);
3252                 return;
3253         }
3254
3255         /*
3256          * The new object will replace entry->object.vm_object so it needs
3257          * a second reference (the caller expects an additional ref).
3258          */
3259         vm_object_hold(nobject);
3260         vm_object_reference_locked(nobject);
3261         vm_object_chain_acquire(nobject, 0);
3262
3263         /*
3264          * nobject shadows bobject (oobject already shadows bobject).
3265          *
3266          * Adding an object to bobject's shadow list requires refing bobject
3267          * which we did above in the useshadowlist case.
3268          */
3269         if (bobject) {
3270                 nobject->backing_object_offset =
3271                     oobject->backing_object_offset + IDX_TO_OFF(offidxstart);
3272                 nobject->backing_object = bobject;
3273                 if (useshadowlist) {
3274                         bobject->shadow_count++;
3275                         atomic_add_int(&bobject->generation, 1);
3276                         LIST_INSERT_HEAD(&bobject->shadow_head,
3277                                          nobject, shadow_list);
3278                         vm_object_clear_flag(bobject, OBJ_ONEMAPPING); /*XXX*/
3279                         /* bobject was held and chain-locked only once; */
3280                         /* both are released in the cleanup below. */
3281                         vm_object_set_flag(nobject, OBJ_ONSHADOW);
3282                 }
3283         }
3284
3285         /*
3286          * Move the VM pages from oobject to nobject
3287          */
3288         for (idx = 0; idx < size; idx++) {
3291                 m = vm_page_lookup_busy_wait(oobject, offidxstart + idx,
3292                                              TRUE, "vmpg");
3293                 if (m == NULL)
3294                         continue;
3295
3296                 /*
3297                  * We must wait for pending I/O to complete before we can
3298                  * rename the page.
3299                  *
3300                  * We do not have to VM_PROT_NONE the page as mappings should
3301                  * not be changed by this operation.
3302                  *
3303                  * NOTE: The act of renaming a page updates chaingen for both
3304                  *       objects.
3305                  */
3306                 vm_page_rename(m, nobject, idx);
3307                 /* page automatically made dirty by rename and cache handled */
3308                 /* page remains busy */
3309         }
3310
3311         if (oobject->type == OBJT_SWAP) {
3312                 vm_object_pip_add(oobject, 1);
3313                 /*
3314                  * copy oobject pages into nobject and destroy unneeded
3315                  * pages in shadow object.
3316                  */
3317                 swap_pager_copy(oobject, nobject, offidxstart, 0);
3318                 vm_object_pip_wakeup(oobject);
3319         }
3320
3321         /*
3322          * Wakeup the pages we played with.  No spl protection is needed
3323          * for a simple wakeup.
3324          */
3325         for (idx = 0; idx < size; idx++) {
3326                 m = vm_page_lookup(nobject, idx);
3327                 if (m) {
3328                         KKASSERT(m->flags & PG_BUSY);
3329                         vm_page_wakeup(m);
3330                 }
3331         }
3332         entry->object.vm_object = nobject;
3333         entry->offset = 0LL;
3334
3335         /*
3336          * Cleanup
3337          *
3338          * NOTE: There is no need to remove OBJ_ONEMAPPING from oobject, the
3339          *       related pages were moved and are no longer applicable to the
3340          *       original object.
3341          *
3342          * NOTE: Deallocate oobject (due to its entry->object.vm_object being
3343          *       replaced by nobject).
3344          */
3345         vm_object_chain_release(nobject);
3346         vm_object_drop(nobject);
3347         if (bobject && useshadowlist) {
3348                 vm_object_chain_release(bobject);
3349                 vm_object_drop(bobject);
3350         }
3351         vm_object_chain_release(oobject);
3352         /*vm_object_clear_flag(oobject, OBJ_ONEMAPPING);*/
3353         vm_object_deallocate_locked(oobject);
3354         vm_object_drop(oobject);
3355 }
3356
3357 /*
3358  * Copy the contents of src_entry to dst_entry.  The entries *must* be
3359  * aligned properly.
3360  *
3361  * Both vm_maps must be exclusively locked and the vm_map's token must
3362  * be held.  With the maps locked no faults can be in progress while we
3363  * work.
3364  */
3365 static void
3366 vm_map_copy_entry(vm_map_t src_map, vm_map_t dst_map,
3367                   vm_map_entry_t src_entry, vm_map_entry_t dst_entry)
3368 {
3369         /*
3370          * Submaps and user-kernel shared maps are never copied, whichever
3371          * side they appear on.
3372          */
3373         if (dst_entry->maptype == VM_MAPTYPE_SUBMAP ||
3374             dst_entry->maptype == VM_MAPTYPE_UKSMAP ||
3375             src_entry->maptype == VM_MAPTYPE_SUBMAP ||
3376             src_entry->maptype == VM_MAPTYPE_UKSMAP) {
3377                 return;
3378         }
3379
3380         /*
3381          * Wired down pages can't be set copy-on-write.  Copy them into
3382          * the new map by simulating faults (the new pages are pageable).
3383          */
3384         if (src_entry->wired_count != 0) {
3385                 vm_fault_copy_entry(dst_map, src_map, dst_entry, src_entry);
3386                 return;
3387         }
3388
3389         /*
3390          * Write-protect the source range unless the entry is already
3391          * marked needs_copy, in which case it is write-protected already.
3392          */
3393         if (!(src_entry->eflags & MAP_ENTRY_NEEDS_COPY)) {
3394                 pmap_protect(src_map->pmap, src_entry->start, src_entry->end,
3395                              src_entry->protection & ~VM_PROT_WRITE);
3396         }
3397
3398         if (src_entry->object.vm_object == NULL) {
3399                 /* The source has no object, so the copy gets none either. */
3400                 dst_entry->object.vm_object = NULL;
3401                 dst_entry->offset = 0;
3402         } else {
3403                 /*
3404                  * vm_map_split() may replace src_entry's object and offset
3405                  * and leaves an extra ref on the object, which the
3406                  * destination entry takes over.
3407                  *
3408                  * src_map->timestamp must be bumped when setting
3409                  * MAP_ENTRY_NEEDS_COPY so that a concurrent fault retries
3410                  * instead of installing a RW pte where a RO (COW) pte is
3411                  * required.  The race exists because a vnode-backed fault
3412                  * may have to temporarily release the map lock.
3413                  */
3414                 vm_map_split(src_entry);
3415                 dst_entry->object.vm_object = src_entry->object.vm_object;
3416                 dst_entry->offset = src_entry->offset;
3417                 src_entry->eflags |= MAP_ENTRY_COW | MAP_ENTRY_NEEDS_COPY;
3418                 dst_entry->eflags |= MAP_ENTRY_COW | MAP_ENTRY_NEEDS_COPY;
3419                 ++src_map->timestamp;
3420         }
3421
3422         pmap_copy(dst_map->pmap, src_map->pmap, dst_entry->start,
3423                   dst_entry->end - dst_entry->start, src_entry->start);
3424 }
3434
3435 /*
3436  * vmspace_fork:
3437  * Build a new vmspace and vm_map for a child process from an existing
3438  * one.  Each entry of the old map is handed to a per-maptype helper,
3439  * which propagates it according to the entry's inheritance setting.
3440  *
3441  * Returns the new vmspace.
3442  *
3443  * The source map must not be locked.  No requirements.
3444  */
3445 static void vmspace_fork_normal_entry(vm_map_t old_map, vm_map_t new_map,
3446                           vm_map_entry_t old_entry, int *countp);
3447 static void vmspace_fork_uksmap_entry(vm_map_t old_map, vm_map_t new_map,
3448                           vm_map_entry_t old_entry, int *countp);
3449
3450 struct vmspace *
3451 vmspace_fork(struct vmspace *vm1)
3452 {
3453         vm_map_t old_map = &vm1->vm_map;
3454         vm_map_t new_map;
3455         vm_map_entry_t scan;
3456         struct vmspace *vm2;
3457         int count;
3458
3459         lwkt_gettoken(&old_map->token);
3460         vm_map_lock(old_map);
3461
3462         vm2 = vmspace_alloc(old_map->min_offset, old_map->max_offset);
3463         new_map = &vm2->vm_map; /* XXX */
3464         lwkt_gettoken(&new_map->token);
3465         bcopy(&vm1->vm_startcopy, &vm2->vm_startcopy,
3466             (caddr_t)&vm1->vm_endcopy - (caddr_t)&vm1->vm_startcopy);
3467         new_map->timestamp = 1;
3468
3469         vm_map_lock(new_map);
3470
3471         /*
3472          * Count the source entries so that the reservation below is
3473          * sized to the number of entries in the old map.
3474          */
3475         count = 0;
3476         for (scan = old_map->header.next; scan != &old_map->header;
3477              scan = scan->next)
3478                 ++count;
3479         count = vm_map_entry_reserve(count + MAP_RESERVE_COUNT);
3480
3481         for (scan = old_map->header.next; scan != &old_map->header;
3482              scan = scan->next) {
3483                 switch(scan->maptype) {
3484                 case VM_MAPTYPE_SUBMAP:
3485                         panic("vm_map_fork: encountered a submap");
3486                         break;
3487                 case VM_MAPTYPE_UKSMAP:
3488                         vmspace_fork_uksmap_entry(old_map, new_map,
3489                                                   scan, &count);
3490                         break;
3491                 case VM_MAPTYPE_NORMAL:
3492                 case VM_MAPTYPE_VPAGETABLE:
3493                         vmspace_fork_normal_entry(old_map, new_map,
3494                                                   scan, &count);
3495                         break;
3496                 }
3497         }
3498
3499         new_map->size = old_map->size;
3500         vm_map_unlock(old_map);
3501         vm_map_unlock(new_map);
3502         vm_map_entry_release(count);
3503
3504         lwkt_reltoken(&new_map->token);
3505         lwkt_reltoken(&old_map->token);
3506
3507         return (vm2);
3508 }
3509
     /*
      * Propagate one map entry of old_map into new_map according to
      * old_entry->inheritance:
      *
      *  VM_INHERIT_NONE  - the entry is not propagated.
      *  VM_INHERIT_SHARE - the entry is cloned and shares old_entry's
      *                     (possibly newly shadowed) vm_object, on which
      *                     an additional ref is taken first.
      *  VM_INHERIT_COPY  - the entry is cloned without an object and
      *                     vm_map_copy_entry() sets up the copy.
      *
      * Wiring is not carried over: the clone's wired_count and its
      * MAP_ENTRY_USER_WIRED flag are cleared.  countp is handed to
      * vm_map_entry_create() when the clone is allocated.
      */
3510 static
3511 void
3512 vmspace_fork_normal_entry(vm_map_t old_map, vm_map_t new_map,
3513                           vm_map_entry_t old_entry, int *countp)
3514 {
3515         vm_map_entry_t new_entry;
3516         vm_object_t object;
3517
3518         switch (old_entry->inheritance) {
3519         case VM_INHERIT_NONE:
3520                 break;
3521         case VM_INHERIT_SHARE:
3522                 /*
3523                  * Clone the entry, creating the shared object if
3524                  * necessary.
3525                  */
3526                 if (old_entry->object.vm_object == NULL)
3527                         vm_map_entry_allocate_object(old_entry);
3528
3529                 if (old_entry->eflags & MAP_ENTRY_NEEDS_COPY) {
3530                         /*
3531                          * Shadow a map_entry which needs a copy,
3532                          * replacing its object with a new object
3533                          * that points to the old one.  Ask the
3534                          * shadow code to automatically add an
3535                          * additional ref.  We can't do it afterwards
3536                          * because we might race a collapse.  The call
3537                          * to vm_map_entry_shadow() will also clear
3538                          * OBJ_ONEMAPPING.
3539                          */
3540                         vm_map_entry_shadow(old_entry, 1);
3541                 } else if (old_entry->object.vm_object) {
3542                         /*
3543                          * We will make a shared copy of the object,
3544                          * and must clear OBJ_ONEMAPPING.
3545                          *
3546                          * Optimize vnode objects.  OBJ_ONEMAPPING
3547                          * is non-applicable but clear it anyway,
3548                          * and it's terminal so we don't have to deal
3549                          * with chains.  Reduces SMP conflicts.
3550                          *
3551                          * XXX assert that object.vm_object != NULL
3552                          *     since we allocate it above.
3553                          */
3554                         object = old_entry->object.vm_object;
3555                         if (object->type == OBJT_VNODE) {
3556                                 vm_object_reference_quick(object);
3557                                 vm_object_clear_flag(object,
3558                                                      OBJ_ONEMAPPING);
3559                         } else {
3560                                 vm_object_hold(object);
3561                                 vm_object_chain_wait(object, 0);
3562                                 vm_object_reference_locked(object);
3563                                 vm_object_clear_flag(object,
3564                                                      OBJ_ONEMAPPING);
3565                                 vm_object_drop(object);
3566                         }
3567                 }
3568
3569                 /*
3570                  * Clone the entry.  We've already bumped the ref on
3571                  * any vm_object.  The clone starts out unwired.
3572                  */
3573                 new_entry = vm_map_entry_create(new_map, countp);
3574                 *new_entry = *old_entry;
3575                 new_entry->eflags &= ~MAP_ENTRY_USER_WIRED;
3576                 new_entry->wired_count = 0;
3577
3578                 /*
3579                  * Insert the entry into the new map -- we know we're
3580                  * inserting at the end of the new map.
3581                  */
3582
3583                 vm_map_entry_link(new_map, new_map->header.prev,
3584                                   new_entry);
3585
3586                 /*
3587                  * Update the physical map
3588                  */
3589                 pmap_copy(new_map->pmap, old_map->pmap,
3590                           new_entry->start,
3591                           (old_entry->end - old_entry->start),
3592                           old_entry->start);
3593                 break;
3594         case VM_INHERIT_COPY:
3595                 /*
3596                  * Clone the entry and link into the map.  The clone
3597                  * starts out unwired and without an object;
3598                  * vm_map_copy_entry() fills in the object and offset.
3599                  */
3598                 new_entry = vm_map_entry_create(new_map, countp);
3599                 *new_entry = *old_entry;
3600                 new_entry->eflags &= ~MAP_ENTRY_USER_WIRED;
3601                 new_entry->wired_count = 0;
3602                 new_entry->object.vm_object = NULL;
3603                 vm_map_entry_link(new_map, new_map->header.prev,
3604                                   new_entry);
3605                 vm_map_copy_entry(old_map, new_map, old_entry,
3606                                   new_entry);
3607                 break;
3608         }
3609 }
3610
3611 /*
3612  * Fork a user-kernel shared map entry.  The mapping might change in the
3613  * child, so only the vm_map_entry is cloned (unwired) and linked at the
3614  * end of new_map; the underlying pmap entries are not copied.
3615  */
3616 static
3617 void
3618 vmspace_fork_uksmap_entry(vm_map_t old_map, vm_map_t new_map,
3619                           vm_map_entry_t old_entry, int *countp)
3620 {
3621         vm_map_entry_t clone = vm_map_entry_create(new_map, countp);
3622
3623         *clone = *old_entry;
3624         clone->wired_count = 0;
3625         clone->eflags &= ~MAP_ENTRY_USER_WIRED;
3626         vm_map_entry_link(new_map, new_map->header.prev, clone);
3627 }
3629
3630 /*
3631  * Create an auto-grow stack entry
3632  *
3633  * Room for max_ssize bytes is required at addrbos (or wherever
3634  * vm_map_findspace() places it when neither MAP_FIXED nor MAP_TRYFIXED
3635  * is set), but only the top init_ssize bytes are mapped up front.  The
3636  * remainder is recorded in the new entry's aux.avail_ssize.
3637  *
3638  * Returns KERN_NO_SPACE when the range cannot be used, otherwise the
3639  * result of vm_map_insert().
3640  *
3641  * No requirements.
3642  */
3643 int
3644 vm_map_stack (vm_map_t map, vm_offset_t addrbos, vm_size_t max_ssize,
3645               int flags, vm_prot_t prot, vm_prot_t max, int cow)
3646 {
3647         vm_map_entry_t  below;
3648         vm_map_entry_t  stack;
3649         vm_offset_t     found;
3650         vm_size_t       init_ssize;
3651         int             count;
3652         int             rv;
3653
3654         cow |= MAP_IS_STACK;
3655         init_ssize = (max_ssize < sgrowsiz) ? max_ssize : sgrowsiz;
3656
3657         count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
3658         vm_map_lock(map);
3659
3660         /* Every bail-out before the insert reports KERN_NO_SPACE. */
3661         rv = KERN_NO_SPACE;
3662
3663         /*
3664          * Find space for the mapping
3665          */
3666         if ((flags & (MAP_FIXED | MAP_TRYFIXED)) == 0) {
3667                 if (vm_map_findspace(map, addrbos, max_ssize, 1,
3668                                      flags, &found))
3669                         goto done;
3670                 addrbos = found;
3671         }
3672
3673         /* If addr is already mapped, no go */
3674         if (vm_map_lookup_entry(map, addrbos, &below))
3675                 goto done;
3676
3677 #if 0
3678         /* XXX already handled by kern_mmap() */
3679         /* If we would blow our VMEM resource limit, no go */
3680         if (map->size + init_ssize >
3681             curproc->p_rlimit[RLIMIT_VMEM].rlim_cur)
3682                 goto done;
3683 #endif
3684
3685         /*
3686          * If we can't accommodate max_ssize in the current mapping,
3687          * no go.  However, we need to be aware that subsequent user
3688          * mappings might map into the space we have reserved for
3689          * stack, and currently this space is not protected.
3690          *
3691          * Hopefully we will at least detect this condition
3692          * when we try to grow the stack.
3693          */
3694         if (below->next != &map->header &&
3695             below->next->start < addrbos + max_ssize)
3696                 goto done;
3697
3698         /*
3699          * We initially map a stack of only init_ssize.  We will
3700          * grow as needed later.  Since this is to be a grow
3701          * down stack, we map at the top of the range.
3702          *
3703          * Note: we would normally expect prot and max to be
3704          * VM_PROT_ALL, and cow to be 0.  Possibly we should
3705          * eliminate these as input parameters, and just
3706          * pass these values here in the insert call.
3707          */
3708         rv = vm_map_insert(map, &count, NULL, NULL,
3709                            0, addrbos + max_ssize - init_ssize,
3710                            addrbos + max_ssize,
3711                            VM_MAPTYPE_NORMAL,
3712                            VM_SUBSYS_STACK, prot, max, cow);
3713         if (rv != KERN_SUCCESS)
3714                 goto done;
3715
3716         /* Now set the avail_ssize amount */
3717         if (below != &map->header) {
3718                 vm_map_clip_end(map, below,
3719                                 addrbos + max_ssize - init_ssize, &count);
3720         }
3721         stack = below->next;
3722         if (stack->end   != addrbos + max_ssize ||
3723             stack->start != addrbos + max_ssize - init_ssize)
3724                 panic ("Bad entry start/end for new stack entry");
3725         else
3726                 stack->aux.avail_ssize = max_ssize - init_ssize;
3727
3728 done:
3729         vm_map_unlock(map);
3730         vm_map_entry_release(count);
3731         return (rv);
3732 }
3735
3736 /*
3737  * Attempts to grow a vm stack entry.  Returns KERN_SUCCESS if the
3738  * desired address is already mapped, or if we successfully grow
3739  * the stack.  Also returns KERN_SUCCESS if addr is outside the
3740  * stack range (this is strange, but preserves compatibility with
3741  * the grow function in vm_machdep.c).
3742  *
3743  * No requirements.
3744  */
3745 int
3746 vm_map_growstack (vm_map_t map, vm_offset_t addr)
3747 {
3748         vm_map_entry_t prev_entry;
3749         vm_map_entry_t stack_entry;
3750         vm_map_entry_t new_stack_entry;
3751         struct vmspace *vm;
3752         struct lwp *lp;
3753         struct proc *p;
3754         vm_offset_t    end;
3755         int grow_amount;
3756         int rv = KERN_SUCCESS;
3757         int is_procstack;
3758         int use_read_lock = 1;
3759         int count;
3760
3761         /*
3762          * Find the vm
3763          */
3764         lp = curthread->td_lwp;
3765         p = curthread->td_proc;
3766         KKASSERT(lp != NULL);
3767         vm = lp->lwp_vmspace;
3768
3769         /*
3770          * Growstack is only allowed on the current process.  We disallow
3771          * other use cases, e.g. trying to access memory via procfs that
3772          * the stack hasn't grown into.
3773          */
3774         if (map != &vm->vm_map) {
3775                 return KERN_FAILURE;
3776         }
3777
3778         count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
3779 Retry:
3780         if (use_read_lock)
3781                 vm_map_lock_read(map);
3782         else
3783                 vm_map_lock(map);
3784
3785         /* If addr is already in the entry range, no need to grow.*/
3786         if (vm_map_lookup_entry(map, addr, &prev_entry))
3787                 goto done;
3788
3789         if ((stack_entry = prev_entry->next) == &map->header)
3790                 goto done;
3791         if (prev_entry == &map->header)
3792                 end = stack_entry->start - stack_entry->aux.avail_ssize;
3793         else
3794                 end = prev_entry->end;
3795
3796         /*
3797          * This next test mimics the old grow function in vm_machdep.c.
3798          * It really doesn't quite make sense, but we do it anyway
3799          * for compatibility.
3800          *
3801          * If not growable stack, return success.  This signals the
3802          * caller to proceed as he would normally with normal vm.
3803          */
3804         if (stack_entry->aux.avail_ssize < 1 ||
3805             addr >= stack_entry->start ||
3806             addr <  stack_entry->start - stack_entry->aux.avail_ssize) {
3807                 goto done;
3808         }
3809
3810         /* Find the minimum grow amount */
3811         grow_amount = roundup (stack_entry->start - addr, PAGE_SIZE);
3812         if (grow_amount > stack_entry->aux.avail_ssize) {
3813                 rv = KERN_NO_SPACE;
3814                 goto done;
3815         }
3816
3817         /*
3818          * If there is no longer enough space between the entries
3819          * nogo, and adjust the available space.  Note: this
3820          * should only happen if the user has mapped into the
3821          * stack area after the stack was created, and is
3822          * probably an error.
3823          *
3824          * This also effectively destroys any guard page the user
3825          * might have intended by limiting the stack size.
3826          */
3827         if (grow_amount > stack_entry->start - end) {
3828                 if (use_read_lock && vm_map_lock_upgrade(map)) {
3829                         /* lost lock */
3830                         use_read_lock = 0;
3831                         goto Retry;
3832                 }
3833                 use_read_lock = 0;
3834                 stack_entry->aux.avail_ssize = stack_entry->start - end;
3835                 rv = KERN_NO_SPACE;
3836                 goto done;
3837         }
3838
3839         is_procstack = addr >= (vm_offset_t)vm->vm_maxsaddr;
3840
3841         /* If this is the main process stack, see if we're over the
3842          * stack limit.
3843          */
3844         if (is_procstack && (ctob(vm->vm_ssize) + grow_amount >
3845                              p->p_rlimit[RLIMIT_STACK].rlim_cur)) {
3846                 rv = KERN_NO_SPACE;
3847                 goto done;
3848         }
3849
3850         /* Round up the grow amount modulo SGROWSIZ */
3851         grow_amount = roundup (grow_amount, sgrowsiz);
3852         if (grow_amount > stack_entry->aux.avail_ssize) {
3853                 grow_amount = stack_entry->aux.avail_ssize;
3854         }
3855         if (is_procstack && (ctob(vm->vm_ssize) + grow_amount >
3856                              p->p_rlimit[RLIMIT_STACK].rlim_cur)) {
3857                 grow_amount = p->p_rlimit[RLIMIT_STACK].rlim_cur -
3858                               ctob(vm->vm_ssize);
3859         }
3860
3861         /* If we would blow our VMEM resource limit, no go */
3862         if (map->size + grow_amount > p->p_rlimit[RLIMIT_VMEM].rlim_cur) {
3863                 rv = KERN_NO_SPACE;
3864                 goto done;
3865         }
3866
3867         if (use_read_lock && vm_map_lock_upgrade(map)) {
3868                 /* lost lock */
3869                 use_read_lock = 0;
3870                 goto Retry;
3871         }
3872         use_read_lock = 0;
3873
3874         /* Get the preliminary new entry start value */
3875         addr = stack_entry->start - grow_amount;
3876
3877         /* If this puts us into the previous entry, cut back our growth
3878          * to the available space.  Also, see the note above.
3879          */
3880         if (addr < end) {
3881                 stack_entry->aux.avail_ssize = stack_entry->start - end;
3882                 addr = end;
3883         }
3884
3885         rv = vm_map_insert(map, &count, NULL, NULL,
3886                            0, addr, stack_entry->start,
3887                            VM_MAPTYPE_NORMAL,
3888                            VM_SUBSYS_STACK, VM_PROT_ALL, VM_PROT_ALL, 0);
3889
3890         /* Adjust the available stack space by the amount we grew. */
3891         if (rv == KERN_SUCCESS) {
3892                 if (prev_entry != &map->header)
3893                         vm_map_clip_end(map, prev_entry, addr, &count);
3894                 new_stack_entry = prev_entry->next;
3895                 if (new_stack_entry->end   != stack_entry->start  ||
3896                     new_stack_entry->start != addr)
3897                         panic ("Bad stack grow start/end in new stack entry");
3898                 else {
3899                         new_stack_entry->aux.avail_ssize =
3900                                 stack_entry->aux.avail_ssize -
3901                                 (new_stack_entry->end - new_stack_entry->start);
3902                         if (is_procstack)
3903                                 vm->vm_ssize += btoc(new_stack_entry->end -
3904                                                      new_stack_entry->start);
3905                 }
3906
3907                 if (map->flags & MAP_WIREFUTURE)
3908                         vm_map_unwire(map, new_stack_entry->start,
3909                                       new_stack_entry->end, FALSE);
3910         }
3911
3912 done:
3913         if (use_read_lock)
3914                 vm_map_unlock_read(map);
3915         else
3916                 vm_map_unlock(map);
3917         vm_map_entry_release(count);
3918         return (rv);
3919 }
3920
3921 /*
3922  * Unshare the specified VM space for exec.  If other processes are
3923  * mapped to it, then create a new one.  The new vmspace is null.
3924  *
3925  * No requirements.
3926  */
3927 void
3928 vmspace_exec(struct proc *p, struct vmspace *vmcopy)
3929 {
3930         struct vmspace *oldvmspace = p->p_vmspace;
3931         struct vmspace *newvmspace;
3932         vm_map_t map = &p->p_vmspace->vm_map;
3933
3934         /*
3935          * If we are execing a resident vmspace we fork it, otherwise
3936          * we create a new vmspace.  Note that exitingcnt is not
3937          * copied to the new vmspace.
3938          */
3939         lwkt_gettoken(&oldvmspace->vm_map.token);
3940         if (vmcopy)  {
3941                 newvmspace = vmspace_fork(vmcopy);
3942                 lwkt_gettoken(&newvmspace->vm_map.token);
3943         } else {
3944                 newvmspace = vmspace_alloc(map->min_offset, map->max_offset);
3945                 lwkt_gettoken(&newvmspace->vm_map.token);
3946                 bcopy(&oldvmspace->vm_startcopy, &newvmspace->vm_startcopy,
3947                       (caddr_t)&oldvmspace->vm_endcopy -
3948                        (caddr_t)&oldvmspace->vm_startcopy);
3949         }
3950
3951         /*
3952          * Finish initializing the vmspace before assigning it
3953          * to the process.  The vmspace will become the current vmspace
3954          * if p == curproc.
3955          */
3956         pmap_pinit2(vmspace_pmap(newvmspace));
3957         pmap_replacevm(p, newvmspace, 0);
3958         lwkt_reltoken(&newvmspace->vm_map.token);
3959         lwkt_reltoken(&oldvmspace->vm_map.token);
3960         vmspace_rel(oldvmspace);
3961 }
3962
3963 /*
3964  * Unshare the specified VM space for forcing COW.  This
3965  * is called by rfork, for the (RFMEM|RFPROC) == 0 case.
3966  */
3967 void
3968 vmspace_unshare(struct proc *p)
3969 {
3970         struct vmspace *oldvmspace = p->p_vmspace;
3971         struct vmspace *newvmspace;
3972
3973         lwkt_gettoken(&oldvmspace->vm_map.token);
3974         if (vmspace_getrefs(oldvmspace) == 1) {
3975                 lwkt_reltoken(&oldvmspace->vm_map.token);
3976                 return;
3977         }
3978         newvmspace = vmspace_fork(oldvmspace);
3979         lwkt_gettoken(&newvmspace->vm_map.token);
3980         pmap_pinit2(vmspace_pmap(newvmspace));
3981         pmap_replacevm(p, newvmspace, 0);
3982         lwkt_reltoken(&newvmspace->vm_map.token);
3983         lwkt_reltoken(&oldvmspace->vm_map.token);
3984         vmspace_rel(oldvmspace);
3985 }
3986
3987 /*
3988  * vm_map_hint: return the beginning of the best area suitable for
3989  * creating a new mapping with "prot" protection.
3990  *
3991  * No requirements.
3992  */
3993 vm_offset_t
3994 vm_map_hint(struct proc *p, vm_offset_t addr, vm_prot_t prot)
3995 {
3996         struct vmspace *vms = p->p_vmspace;
3997
3998         if (!randomize_mmap || addr != 0) {
3999                 /*
4000                  * Set a reasonable start point for the hint if it was
4001                  * not specified or if it falls within the heap space.
4002                  * Hinted mmap()s do not allocate out of the heap space.
4003                  */
4004                 if (addr == 0 ||
4005                     (addr >= round_page((vm_offset_t)vms->vm_taddr) &&
4006                      addr < round_page((vm_offset_t)vms->vm_daddr + maxdsiz))) {
4007                         addr = round_page((vm_offset_t)vms->vm_daddr + maxdsiz);
4008                 }
4009
4010                 return addr;
4011         }
4012         addr = (vm_offset_t)vms->vm_daddr + MAXDSIZ;
4013         addr += karc4random() & (MIN((256 * 1024 * 1024), MAXDSIZ) - 1);
4014
4015         return (round_page(addr));
4016 }
4017
/*
 * Finds the VM object, offset, and protection for a given virtual address
 * in the specified map, assuming a page fault of the type specified.
 *
 * Leaves the map in question locked for read; return values are guaranteed
 * until a vm_map_lookup_done call is performed.  Note that the map argument
 * is in/out; the returned map must be used in the call to vm_map_lookup_done.
 *
 * A handle (out_entry) is returned for use in vm_map_lookup_done, to make
 * that fast.
 *
 * If a lookup is requested with "write protection" specified, the map may
 * be changed to perform virtual copying operations, although the data
 * referenced will remain the same.
 *
 * Arguments:
 *      var_map     - in: map to search.  out: replaced by the submap when
 *                    the lookup descends through a VM_MAPTYPE_SUBMAP entry.
 *      vaddr       - faulting virtual address.
 *      fault_typea - requested access; may carry VM_PROT_OVERRIDE_WRITE.
 *      out_entry   - receives the map entry containing vaddr.
 *      object      - receives the entry's vm_object, or NULL for map types
 *                    other than NORMAL/VPAGETABLE.
 *      pindex      - receives the page index of vaddr relative to the
 *                    entry's start and offset.
 *      out_prot    - receives the protection to apply to the page.
 *      wired       - receives TRUE if the entry has a non-zero wired_count.
 *
 * Returns:
 *      KERN_SUCCESS            - map is left read-locked.
 *      KERN_INVALID_ADDRESS    - no entry contains vaddr.
 *      KERN_PROTECTION_FAILURE - access not permitted by the entry, or a
 *                                non-override write to a user-wired COW
 *                                entry.
 *      KERN_FAILURE_NOFAULT    - a COW shadow would be needed but the
 *                                thread has TDF_NOFAULT set.
 *
 * On any failure the map is returned unlocked.  *pindex and *out_prot are
 * only written on success; *wired is not written on the early failure
 * paths.
 *
 * No requirements.
 */
int
vm_map_lookup(vm_map_t *var_map,                /* IN/OUT */
              vm_offset_t vaddr,
              vm_prot_t fault_typea,
              vm_map_entry_t *out_entry,        /* OUT */
              vm_object_t *object,              /* OUT */
              vm_pindex_t *pindex,              /* OUT */
              vm_prot_t *out_prot,              /* OUT */
              boolean_t *wired)                 /* OUT */
{
        vm_map_entry_t entry;
        vm_map_t map = *var_map;
        vm_prot_t prot;
        vm_prot_t fault_type = fault_typea;
        /*
         * Tracks which kind of lock is currently held on (map): non-zero
         * for a read lock, zero for an exclusive lock.  The exit path
         * relies on this being accurate.
         */
        int use_read_lock = 1;
        int rv = KERN_SUCCESS;

RetryLookup:
        if (use_read_lock)
                vm_map_lock_read(map);
        else
                vm_map_lock(map);

        /*
         * If the map has an interesting hint, try it before calling full
         * blown lookup routine.
         */
        entry = map->hint;
        cpu_ccfence();
        *out_entry = entry;
        *object = NULL;

        if ((entry == &map->header) ||
            (vaddr < entry->start) || (vaddr >= entry->end)) {
                vm_map_entry_t tmp_entry;

                /*
                 * Entry was either not a valid hint, or the vaddr was not
                 * contained in the entry, so do a full lookup.
                 */
                if (!vm_map_lookup_entry(map, vaddr, &tmp_entry)) {
                        rv = KERN_INVALID_ADDRESS;
                        goto done;
                }

                entry = tmp_entry;
                *out_entry = entry;
        }

        /*
         * Handle submaps.  The caller's map pointer is redirected to the
         * submap, the parent map is unlocked, and the lookup restarts in
         * the submap with a read lock.
         */
        if (entry->maptype == VM_MAPTYPE_SUBMAP) {
                vm_map_t old_map = map;

                *var_map = map = entry->object.sub_map;
                if (use_read_lock)
                        vm_map_unlock_read(old_map);
                else
                        vm_map_unlock(old_map);
                use_read_lock = 1;
                goto RetryLookup;
        }

        /*
         * Check whether this task is allowed to have this page.
         * Note the special case for MAP_ENTRY_COW pages with an override.
         * This is to implement a forced COW for debuggers.
         */
        if (fault_type & VM_PROT_OVERRIDE_WRITE)
                prot = entry->max_protection;
        else
                prot = entry->protection;

        /* Strip everything but the basic access bits before comparing. */
        fault_type &= (VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE);
        if ((fault_type & prot) != fault_type) {
                rv = KERN_PROTECTION_FAILURE;
                goto done;
        }

        /*
         * A write fault on a user-wired copy-on-write entry is refused
         * unless the caller asked for the write override.
         */
        if ((entry->eflags & MAP_ENTRY_USER_WIRED) &&
            (entry->eflags & MAP_ENTRY_COW) &&
            (fault_type & VM_PROT_WRITE) &&
            (fault_typea & VM_PROT_OVERRIDE_WRITE) == 0) {
                rv = KERN_PROTECTION_FAILURE;
                goto done;
        }

        /*
         * If this page is not pageable, we have to get it for all possible
         * accesses.
         */
        *wired = (entry->wired_count != 0);
        if (*wired)
                prot = fault_type = entry->protection;

        /*
         * Virtual page tables may need to update the accessed (A) bit
         * in a page table entry.  Upgrade the fault to a write fault for
         * that case if the map will support it.  If the map does not support
         * it the page table entry simply will not be updated.
         */
        if (entry->maptype == VM_MAPTYPE_VPAGETABLE) {
                if (prot & VM_PROT_WRITE)
                        fault_type |= VM_PROT_WRITE;
        }

        /*
         * NOTE(review): when the current lwp's pmap reports A/D bit
         * emulation, the fault is upgraded to a write fault only if the
         * mapping is NOT writable.  That is the opposite sense of the
         * vpagetable test just above -- confirm this is intended.
         */
        if (curthread->td_lwp && curthread->td_lwp->lwp_vmspace &&
            pmap_emulate_ad_bits(&curthread->td_lwp->lwp_vmspace->vm_pmap)) {
                if ((prot & VM_PROT_WRITE) == 0)
                        fault_type |= VM_PROT_WRITE;
        }

        /*
         * Only NORMAL and VPAGETABLE maps are object-based.  UKSMAPs are not.
         */
        if (entry->maptype != VM_MAPTYPE_NORMAL &&
            entry->maptype != VM_MAPTYPE_VPAGETABLE) {
                *object = NULL;
                goto skip;
        }

        /*
         * If the entry was copy-on-write, we either ...
         */
        if (entry->eflags & MAP_ENTRY_NEEDS_COPY) {
                /*
                 * If we want to write the page, we may as well handle that
                 * now since we've got the map locked.
                 *
                 * If we don't need to write the page, we just demote the
                 * permissions allowed.
                 */

                if (fault_type & VM_PROT_WRITE) {
                        /*
                         * Not allowed if TDF_NOFAULT is set as the shadowing
                         * operation can deadlock against the faulting
                         * function due to the copy-on-write.
                         */
                        if (curthread->td_flags & TDF_NOFAULT) {
                                rv = KERN_FAILURE_NOFAULT;
                                goto done;
                        }

                        /*
                         * Make a new object, and place it in the object
                         * chain.  Note that no new references have appeared
                         * -- one just moved from the map to the new
                         * object.
                         */

                        /*
                         * Shadowing modifies the entry, so an exclusive
                         * lock is needed.  If the upgrade fails the lock
                         * was lost; restart the whole lookup, this time
                         * taking the exclusive lock directly.
                         */
                        if (use_read_lock && vm_map_lock_upgrade(map)) {
                                /* lost lock */
                                use_read_lock = 0;
                                goto RetryLookup;
                        }
                        use_read_lock = 0;

                        vm_map_entry_shadow(entry, 0);
                } else {
                        /*
                         * We're attempting to read a copy-on-write page --
                         * don't allow writes.
                         */

                        prot &= ~VM_PROT_WRITE;
                }
        }

        /*
         * Create an object if necessary.  Skipped for system maps.  Same
         * upgrade-or-retry protocol as for the shadow case above.
         */
        if (entry->object.vm_object == NULL && !map->system_map) {
                if (use_read_lock && vm_map_lock_upgrade(map))  {
                        /* lost lock */
                        use_read_lock = 0;
                        goto RetryLookup;
                }
                use_read_lock = 0;
                vm_map_entry_allocate_object(entry);
        }

        /*
         * Return the object/offset from this entry.  If the entry was
         * copy-on-write or empty, it has been fixed up.
         */
        *object = entry->object.vm_object;

skip:
        *pindex = OFF_TO_IDX((vaddr - entry->start) + entry->offset);

        /*
         * Report the protection computed above.  On success we return
         * with a read lock held on the map.  On failure we return with
         * the map unlocked.
         */
        *out_prot = prot;
done:
        if (rv == KERN_SUCCESS) {
                /* Success always hands back a read lock. */
                if (use_read_lock == 0)
                        vm_map_lock_downgrade(map);
        } else if (use_read_lock) {
                vm_map_unlock_read(map);
        } else {
                vm_map_unlock(map);
        }
        return (rv);
}
4244
4245 /*
4246  * Releases locks acquired by a vm_map_lookup()
4247  * (according to the handle returned by that lookup).
4248  *
4249  * No other requirements.
4250  */
4251 void
4252 vm_map_lookup_done(vm_map_t map, vm_map_entry_t entry, int count)
4253 {
4254         /*
4255          * Unlock the main-level map
4256          */
4257         vm_map_unlock_read(map);
4258         if (count)
4259                 vm_map_entry_release(count);
4260 }
4261
4262 /*
4263  * Quick hack, needs some help to make it more SMP friendly.
4264  */
4265 void
4266 vm_map_interlock(vm_map_t map, struct vm_map_ilock *ilock,
4267                  vm_offset_t ran_beg, vm_offset_t ran_end)
4268 {
4269         struct vm_map_ilock *scan;
4270
4271         ilock->ran_beg = ran_beg;
4272         ilock->ran_end = ran_end;
4273         ilock->flags = 0;
4274
4275         spin_lock(&map->ilock_spin);
4276 restart:
4277         for (scan = map->ilock_base; scan; scan = scan->next) {
4278                 if (ran_end > scan->ran_beg && ran_beg < scan->ran_end) {
4279                         scan->flags |= ILOCK_WAITING;
4280                         ssleep(scan, &map->ilock_spin, 0, "ilock", 0);
4281                         goto restart;
4282                 }
4283         }
4284         ilock->next = map->ilock_base;
4285         map->ilock_base = ilock;
4286         spin_unlock(&map->ilock_spin);
4287 }
4288
4289 void
4290 vm_map_deinterlock(vm_map_t map, struct  vm_map_ilock *ilock)
4291 {
4292         struct vm_map_ilock *scan;
4293         struct vm_map_ilock **scanp;
4294
4295         spin_lock(&map->ilock_spin);
4296         scanp = &map->ilock_base;
4297         while ((scan = *scanp) != NULL) {
4298                 if (scan == ilock) {
4299                         *scanp = ilock->next;
4300                         spin_unlock(&map->ilock_spin);
4301                         if (ilock->flags & ILOCK_WAITING)
4302                                 wakeup(ilock);
4303                         return;
4304                 }
4305                 scanp = &scan->next;
4306         }
4307         spin_unlock(&map->ilock_spin);
4308         panic("vm_map_deinterlock: missing ilock!");
4309 }
4310
4311 #include "opt_ddb.h"
4312 #ifdef DDB
4313 #include <ddb/ddb.h>
4314
4315 /*
4316  * Debugging only
4317  */
4318 DB_SHOW_COMMAND(map, vm_map_print)
4319 {
4320         static int nlines;
4321         /* XXX convert args. */
4322         vm_map_t map = (vm_map_t)addr;
4323         boolean_t full = have_addr;
4324
4325         vm_map_entry_t entry;
4326
4327         db_iprintf("Task map %p: pmap=%p, nentries=%d, version=%u\n",
4328             (void *)map,
4329             (void *)map->pmap, map->nentries, map->timestamp);
4330         nlines++;
4331
4332         if (!full && db_indent)
4333                 return;
4334
4335         db_indent += 2;
4336         for (entry = map->header.next; entry != &map->header;
4337             entry = entry->next) {
4338                 db_iprintf("map entry %p: start=%p, end=%p\n",
4339                     (void *)entry, (void *)entry->start, (void *)entry->end);
4340                 nlines++;
4341                 {
4342                         static char *inheritance_name[4] =
4343                         {"share", "copy", "none", "donate_copy"};
4344
4345                         db_iprintf(" prot=%x/%x/%s",
4346                             entry->protection,
4347                             entry->max_protection,
4348                             inheritance_name[(int)(unsigned char)
4349                                                 entry->inheritance]);
4350                         if (entry->wired_count != 0)
4351                                 db_printf(", wired");
4352                 }
4353                 switch(entry->maptype) {
4354                 case VM_MAPTYPE_SUBMAP:
4355                         /* XXX no %qd in kernel.  Truncate entry->offset. */
4356                         db_printf(", share=%p, offset=0x%lx\n",
4357                             (void *)entry->object.sub_map,
4358                             (long)entry->offset);
4359                         nlines++;
4360                         if ((entry->prev == &map->header) ||
4361                             (entry->prev->object.sub_map !=
4362                                 entry->object.sub_map)) {
4363                                 db_indent += 2;
4364                                 vm_map_print((db_expr_t)(intptr_t)
4365                                              entry->object.sub_map,
4366                                              full, 0, NULL);
4367                                 db_indent -= 2;
4368                         }
4369                         break;
4370                 case VM_MAPTYPE_NORMAL:
4371                 case VM_MAPTYPE_VPAGETABLE:
4372                         /* XXX no %qd in kernel.  Truncate entry->offset. */
4373                         db_printf(", object=%p, offset=0x%lx",
4374                             (void *)entry->object.vm_object,
4375                             (long)entry->offset);
4376                         if (entry->eflags & MAP_ENTRY_COW)
4377                                 db_printf(", copy (%s)",
4378                                     (entry->eflags & MAP_ENTRY_NEEDS_COPY) ? "needed" : "done");
4379                         db_printf("\n");
4380                         nlines++;
4381
4382                         if ((entry->prev == &map->header) ||
4383                             (entry->prev->object.vm_object !=
4384                                 entry->object.vm_object)) {
4385                                 db_indent += 2;
4386                                 vm_object_print((db_expr_t)(intptr_t)
4387                                                 entry->object.vm_object,
4388                                                 full, 0, NULL);
4389                                 nlines += 4;
4390                                 db_indent -= 2;
4391                         }
4392                         break;
4393                 case VM_MAPTYPE_UKSMAP:
4394                         db_printf(", uksmap=%p, offset=0x%lx",
4395                             (void *)entry->object.uksmap,
4396                             (long)entry->offset);
4397                         if (entry->eflags & MAP_ENTRY_COW)
4398                                 db_printf(", copy (%s)",
4399                                     (entry->eflags & MAP_ENTRY_NEEDS_COPY) ? "needed" : "done");
4400                         db_printf("\n");
4401                         nlines++;
4402                         break;
4403                 default:
4404                         break;
4405                 }
4406         }
4407         db_indent -= 2;
4408         if (db_indent == 0)
4409                 nlines = 0;
4410 }
4411
4412 /*
4413  * Debugging only
4414  */
4415 DB_SHOW_COMMAND(procvm, procvm)
4416 {
4417         struct proc *p;
4418
4419         if (have_addr) {
4420                 p = (struct proc *) addr;
4421         } else {
4422                 p = curproc;
4423         }
4424
4425         db_printf("p = %p, vmspace = %p, map = %p, pmap = %p\n",
4426             (void *)p, (void *)p->p_vmspace, (void *)&p->p_vmspace->vm_map,
4427             (void *)vmspace_pmap(p->p_vmspace));
4428
4429         vm_map_print((db_expr_t)(intptr_t)&p->p_vmspace->vm_map, 1, 0, NULL);
4430 }
4431
4432 #endif /* DDB */