sys/vm/vm_map.c

   1 /*
   2  * Copyright (c) 1991, 1993
   3  *      The Regents of the University of California.  All rights reserved.
   4  * Copyright (c) 2003-2017 The DragonFly Project.  All rights reserved.
   5  *
   6  * This code is derived from software contributed to Berkeley by
   7  * The Mach Operating System project at Carnegie-Mellon University.
   8  *
   9  * This code is derived from software contributed to The DragonFly Project
  10  * by Matthew Dillon <dillon@backplane.com>
  11  *
  12  * Redistribution and use in source and binary forms, with or without
  13  * modification, are permitted provided that the following conditions
  14  * are met:
  15  * 1. Redistributions of source code must retain the above copyright
  16  *    notice, this list of conditions and the following disclaimer.
  17  * 2. Redistributions in binary form must reproduce the above copyright
  18  *    notice, this list of conditions and the following disclaimer in the
  19  *    documentation and/or other materials provided with the distribution.
  20  * 3. Neither the name of the University nor the names of its contributors
  21  *    may be used to endorse or promote products derived from this software
  22  *    without specific prior written permission.
  23  *
  24  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  25  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  26  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  27  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  28  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  29  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  30  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  31  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  32  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  33  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  34  * SUCH DAMAGE.
  35  *
  36  *      from: @(#)vm_map.c      8.3 (Berkeley) 1/12/94
  37  *
  38  *
  39  * Copyright (c) 1987, 1990 Carnegie-Mellon University.
  40  * All rights reserved.
  41  *
  42  * Authors: Avadis Tevanian, Jr., Michael Wayne Young
  43  *
  44  * Permission to use, copy, modify and distribute this software and
  45  * its documentation is hereby granted, provided that both the copyright
  46  * notice and this permission notice appear in all copies of the
  47  * software, derivative works or modified versions, and any portions
  48  * thereof, and that both notices appear in supporting documentation.
  49  *
  50  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
  51  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
  52  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
  53  *
  54  * Carnegie Mellon requests users of this software to return to
  55  *
  56  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
  57  *  School of Computer Science
  58  *  Carnegie Mellon University
  59  *  Pittsburgh PA 15213-3890
  60  *
  61  * any improvements or extensions that they make and grant Carnegie the
  62  * rights to redistribute these changes.
  63  *
  64  * $FreeBSD: src/sys/vm/vm_map.c,v 1.187.2.19 2003/05/27 00:47:02 alc Exp $
  65  */
  66
  67 /*
  68  *      Virtual memory mapping module.
  69  */
  70
  71 #include <sys/param.h>
  72 #include <sys/systm.h>
  73 #include <sys/kernel.h>
  74 #include <sys/proc.h>
  75 #include <sys/serialize.h>
  76 #include <sys/lock.h>
  77 #include <sys/vmmeter.h>
  78 #include <sys/mman.h>
  79 #include <sys/vnode.h>
  80 #include <sys/resourcevar.h>
  81 #include <sys/shm.h>
  82 #include <sys/tree.h>
  83 #include <sys/malloc.h>
  84 #include <sys/objcache.h>
  85
  86 #include <vm/vm.h>
  87 #include <vm/vm_param.h>
  88 #include <vm/pmap.h>
  89 #include <vm/vm_map.h>
  90 #include <vm/vm_page.h>
  91 #include <vm/vm_object.h>
  92 #include <vm/vm_pager.h>
  93 #include <vm/vm_kern.h>
  94 #include <vm/vm_extern.h>
  95 #include <vm/swap_pager.h>
  96 #include <vm/vm_zone.h>
  97
  98 #include <sys/random.h>
  99 #include <sys/sysctl.h>
 100 #include <sys/spinlock.h>
 101
 102 #include <sys/thread2.h>
 103 #include <sys/spinlock2.h>
 104
 105 /*
 106  * Virtual memory maps provide for the mapping, protection, and sharing
 107  * of virtual memory objects.  In addition, this module provides for an
 108  * efficient virtual copy of memory from one map to another.
 109  *
 110  * Synchronization is required prior to most operations.
 111  *
 112  * Maps consist of an ordered doubly-linked list of simple entries.
 113  * A hint and a RB tree is used to speed-up lookups.
 114  *
 115  * Callers looking to modify maps specify start/end addresses which cause
 116  * the related map entry to be clipped if necessary, and then later
 117  * recombined if the pieces remained compatible.
 118  *
 119  * Virtual copy operations are performed by copying VM object references
 120  * from one map to another, and then marking both regions as copy-on-write.
 121  */
 122 static boolean_t vmspace_ctor(void *obj, void *privdata, int ocflags);
 123 static void vmspace_dtor(void *obj, void *privdata);
 124 static void vmspace_terminate(struct vmspace *vm, int final);
 125
 126 MALLOC_DEFINE(M_VMSPACE, "vmspace", "vmspace objcache backingstore");
 127 static struct objcache *vmspace_cache;
 128
 129 /*
 130  * per-cpu page table cross mappings are initialized in early boot
 131  * and might require a considerable number of vm_map_entry structures.
 132  */
 133 #define MAPENTRYBSP_CACHE       (MAXCPU+1)
 134 #define MAPENTRYAP_CACHE        8
 135
 136 /*
 137  * Partioning threaded programs with large anonymous memory areas can
 138  * improve concurrent fault performance.
 139  */
 140 #define MAP_ENTRY_PARTITION_SIZE        ((vm_offset_t)(32 * 1024 * 1024))
 141 #define MAP_ENTRY_PARTITION_MASK        (MAP_ENTRY_PARTITION_SIZE - 1)
 142
 143 #define VM_MAP_ENTRY_WITHIN_PARTITION(entry)    \
 144         ((((entry)->start ^ (entry)->end) & ~MAP_ENTRY_PARTITION_MASK) == 0)
 145
 146 static struct vm_zone mapentzone_store;
 147 static vm_zone_t mapentzone;
 148
 149 static struct vm_map_entry map_entry_init[MAX_MAPENT];
 150 static struct vm_map_entry cpu_map_entry_init_bsp[MAPENTRYBSP_CACHE];
 151 static struct vm_map_entry cpu_map_entry_init_ap[MAXCPU][MAPENTRYAP_CACHE];
 152
 153 static int randomize_mmap;
 154 SYSCTL_INT(_vm, OID_AUTO, randomize_mmap, CTLFLAG_RW, &randomize_mmap, 0,
 155     "Randomize mmap offsets");
 156 static int vm_map_relock_enable = 1;
 157 SYSCTL_INT(_vm, OID_AUTO, map_relock_enable, CTLFLAG_RW,
 158            &vm_map_relock_enable, 0, "insert pop pgtable optimization");
 159 static int vm_map_partition_enable = 1;
 160 SYSCTL_INT(_vm, OID_AUTO, map_partition_enable, CTLFLAG_RW,
 161            &vm_map_partition_enable, 0, "Break up larger vm_map_entry's");
 162
 163 static void vmspace_drop_notoken(struct vmspace *vm);
 164 static void vm_map_entry_shadow(vm_map_entry_t entry, int addref);
 165 static vm_map_entry_t vm_map_entry_create(vm_map_t map, int *);
 166 static void vm_map_entry_dispose (vm_map_t map, vm_map_entry_t entry, int *);
 167 static void _vm_map_clip_end (vm_map_t, vm_map_entry_t, vm_offset_t, int *);
 168 static void _vm_map_clip_start (vm_map_t, vm_map_entry_t, vm_offset_t, int *);
 169 static void vm_map_entry_delete (vm_map_t, vm_map_entry_t, int *);
 170 static void vm_map_entry_unwire (vm_map_t, vm_map_entry_t);
 171 static void vm_map_copy_entry (vm_map_t, vm_map_t, vm_map_entry_t,
 172                 vm_map_entry_t);
 173 static void vm_map_unclip_range (vm_map_t map, vm_map_entry_t start_entry,
 174                 vm_offset_t start, vm_offset_t end, int *countp, int flags);
 175 static void vm_map_entry_partition(vm_map_t map, vm_map_entry_t entry,
 176                 vm_offset_t vaddr, int *countp);
 177
 178 /*
 179  * Initialize the vm_map module.  Must be called before any other vm_map
 180  * routines.
 181  *
 182  * Map and entry structures are allocated from the general purpose
 183  * memory pool with some exceptions:
 184  *
 185  *      - The kernel map is allocated statically.
 186  *      - Initial kernel map entries are allocated out of a static pool.
 187  *      - We must set ZONE_SPECIAL here or the early boot code can get
 188  *        stuck if there are >63 cores.
 189  *
 190  *      These restrictions are necessary since malloc() uses the
 191  *      maps and requires map entries.
 192  *
 193  * Called from the low level boot code only.
 194  */
 195 void
 196 vm_map_startup(void)
 197 {
 198         mapentzone = &mapentzone_store;
 199         zbootinit(mapentzone, "MAP ENTRY", sizeof (struct vm_map_entry),
 200                   map_entry_init, MAX_MAPENT);
 201         mapentzone_store.zflags |= ZONE_SPECIAL;
 202 }
 203
 204 /*
 205  * Called prior to any vmspace allocations.
 206  *
 207  * Called from the low level boot code only.
 208  */
 209 void
 210 vm_init2(void)
 211 {
 212         vmspace_cache = objcache_create_mbacked(M_VMSPACE,
 213                                                 sizeof(struct vmspace),
 214                                                 0, ncpus * 4,
 215                                                 vmspace_ctor, vmspace_dtor,
 216                                                 NULL);
 217         zinitna(mapentzone, NULL, 0, 0, ZONE_USE_RESERVE | ZONE_SPECIAL);
 218         pmap_init2();
 219         vm_object_init2();
 220 }
 221
 222 /*
 223  * objcache support.  We leave the pmap root cached as long as possible
 224  * for performance reasons.
 225  */
 226 static
 227 boolean_t
 228 vmspace_ctor(void *obj, void *privdata, int ocflags)
 229 {
 230         struct vmspace *vm = obj;
 231
 232         bzero(vm, sizeof(*vm));
 233         vm->vm_refcnt = VM_REF_DELETED;
 234
 235         return 1;
 236 }
 237
 238 static
 239 void
 240 vmspace_dtor(void *obj, void *privdata)
 241 {
 242         struct vmspace *vm = obj;
 243
 244         KKASSERT(vm->vm_refcnt == VM_REF_DELETED);
 245         pmap_puninit(vmspace_pmap(vm));
 246 }
 247
 248 /*
 249  * Red black tree functions
 250  *
 251  * The caller must hold the related map lock.
 252  */
 253 static int rb_vm_map_compare(vm_map_entry_t a, vm_map_entry_t b);
 254 RB_GENERATE(vm_map_rb_tree, vm_map_entry, rb_entry, rb_vm_map_compare);
 255
 256 /* a->start is address, and the only field has to be initialized */
 257 static int
 258 rb_vm_map_compare(vm_map_entry_t a, vm_map_entry_t b)
 259 {
 260         if (a->start < b->start)
 261                 return(-1);
 262         else if (a->start > b->start)
 263                 return(1);
 264         return(0);
 265 }
 266
 267 /*
 268  * Initialize vmspace ref/hold counts vmspace0.  There is a holdcnt for
 269  * every refcnt.
 270  */
 271 void
 272 vmspace_initrefs(struct vmspace *vm)
 273 {
 274         vm->vm_refcnt = 1;
 275         vm->vm_holdcnt = 1;
 276 }
 277
 278 /*
 279  * Allocate a vmspace structure, including a vm_map and pmap.
 280  * Initialize numerous fields.  While the initial allocation is zerod,
 281  * subsequence reuse from the objcache leaves elements of the structure
 282  * intact (particularly the pmap), so portions must be zerod.
 283  *
 284  * Returns a referenced vmspace.
 285  *
 286  * No requirements.
 287  */
 288 struct vmspace *
 289 vmspace_alloc(vm_offset_t min, vm_offset_t max)
 290 {
 291         struct vmspace *vm;
 292
 293         vm = objcache_get(vmspace_cache, M_WAITOK);
 294
 295         bzero(&vm->vm_startcopy,
 296               (char *)&vm->vm_endcopy - (char *)&vm->vm_startcopy);
 297         vm_map_init(&vm->vm_map, min, max, NULL);       /* initializes token */
 298
 299         /*
 300          * NOTE: hold to acquires token for safety.
 301          *
 302          * On return vmspace is referenced (refs=1, hold=1).  That is,
 303          * each refcnt also has a holdcnt.  There can be additional holds
 304          * (holdcnt) above and beyond the refcnt.  Finalization is handled in
 305          * two stages, one on refs 1->0, and the the second on hold 1->0.
 306          */
 307         KKASSERT(vm->vm_holdcnt == 0);
 308         KKASSERT(vm->vm_refcnt == VM_REF_DELETED);
 309         vmspace_initrefs(vm);
 310         vmspace_hold(vm);
 311         pmap_pinit(vmspace_pmap(vm));           /* (some fields reused) */
 312         vm->vm_map.pmap = vmspace_pmap(vm);     /* XXX */
 313         vm->vm_shm = NULL;
 314         vm->vm_flags = 0;
 315         cpu_vmspace_alloc(vm);
 316         vmspace_drop(vm);
 317
 318         return (vm);
 319 }
 320
 321 /*
 322  * NOTE: Can return 0 if the vmspace is exiting.
 323  */
 324 int
 325 vmspace_getrefs(struct vmspace *vm)
 326 {
 327         int32_t n;
 328
 329         n = vm->vm_refcnt;
 330         cpu_ccfence();
 331         if (n & VM_REF_DELETED)
 332                 n = -1;
 333         return n;
 334 }
 335
 336 void
 337 vmspace_hold(struct vmspace *vm)
 338 {
 339         atomic_add_int(&vm->vm_holdcnt, 1);
 340         lwkt_gettoken(&vm->vm_map.token);
 341 }
 342
 343 /*
 344  * Drop with final termination interlock.
 345  */
 346 void
 347 vmspace_drop(struct vmspace *vm)
 348 {
 349         lwkt_reltoken(&vm->vm_map.token);
 350         vmspace_drop_notoken(vm);
 351 }
 352
 353 static void
 354 vmspace_drop_notoken(struct vmspace *vm)
 355 {
 356         if (atomic_fetchadd_int(&vm->vm_holdcnt, -1) == 1) {
 357                 if (vm->vm_refcnt & VM_REF_DELETED)
 358                         vmspace_terminate(vm, 1);
 359         }
 360 }
 361
 362 /*
 363  * A vmspace object must not be in a terminated state to be able to obtain
 364  * additional refs on it.
 365  *
 366  * These are official references to the vmspace, the count is used to check
 367  * for vmspace sharing.  Foreign accessors should use 'hold' and not 'ref'.
 368  *
 369  * XXX we need to combine hold & ref together into one 64-bit field to allow
 370  * holds to prevent stage-1 termination.
 371  */
 372 void
 373 vmspace_ref(struct vmspace *vm)
 374 {
 375         uint32_t n;
 376
 377         atomic_add_int(&vm->vm_holdcnt, 1);
 378         n = atomic_fetchadd_int(&vm->vm_refcnt, 1);
 379         KKASSERT((n & VM_REF_DELETED) == 0);
 380 }
 381
 382 /*
 383  * Release a ref on the vmspace.  On the 1->0 transition we do stage-1
 384  * termination of the vmspace.  Then, on the final drop of the hold we
 385  * will do stage-2 final termination.
 386  */
 387 void
 388 vmspace_rel(struct vmspace *vm)
 389 {
 390         uint32_t n;
 391
 392         /*
 393          * Drop refs.  Each ref also has a hold which is also dropped.
 394          *
 395          * When refs hits 0 compete to get the VM_REF_DELETED flag (hold
 396          * prevent finalization) to start termination processing.
 397          * Finalization occurs when the last hold count drops to 0.
 398          */
 399         n = atomic_fetchadd_int(&vm->vm_refcnt, -1) - 1;
 400         while (n == 0) {
 401                 if (atomic_cmpset_int(&vm->vm_refcnt, 0, VM_REF_DELETED)) {
 402                         vmspace_terminate(vm, 0);
 403                         break;
 404                 }
 405                 n = vm->vm_refcnt;
 406                 cpu_ccfence();
 407         }
 408         vmspace_drop_notoken(vm);
 409 }
 410
 411 /*
 412  * This is called during exit indicating that the vmspace is no
 413  * longer in used by an exiting process, but the process has not yet
 414  * been reaped.
 415  *
 416  * We drop refs, allowing for stage-1 termination, but maintain a holdcnt
 417  * to prevent stage-2 until the process is reaped.  Note hte order of
 418  * operation, we must hold first.
 419  *
 420  * No requirements.
 421  */
 422 void
 423 vmspace_relexit(struct vmspace *vm)
 424 {
 425         atomic_add_int(&vm->vm_holdcnt, 1);
 426         vmspace_rel(vm);
 427 }
 428
 429 /*
 430  * Called during reap to disconnect the remainder of the vmspace from
 431  * the process.  On the hold drop the vmspace termination is finalized.
 432  *
 433  * No requirements.
 434  */
 435 void
 436 vmspace_exitfree(struct proc *p)
 437 {
 438         struct vmspace *vm;
 439
 440         vm = p->p_vmspace;
 441         p->p_vmspace = NULL;
 442         vmspace_drop_notoken(vm);
 443 }
 444
 445 /*
 446  * Called in two cases:
 447  *
 448  * (1) When the last refcnt is dropped and the vmspace becomes inactive,
 449  *     called with final == 0.  refcnt will be (u_int)-1 at this point,
 450  *     and holdcnt will still be non-zero.
 451  *
 452  * (2) When holdcnt becomes 0, called with final == 1.  There should no
 453  *     longer be anyone with access to the vmspace.
 454  *
 455  * VMSPACE_EXIT1 flags the primary deactivation
 456  * VMSPACE_EXIT2 flags the last reap
 457  */
 458 static void
 459 vmspace_terminate(struct vmspace *vm, int final)
 460 {
 461         int count;
 462
 463         lwkt_gettoken(&vm->vm_map.token);
 464         if (final == 0) {
 465                 KKASSERT((vm->vm_flags & VMSPACE_EXIT1) == 0);
 466                 vm->vm_flags |= VMSPACE_EXIT1;
 467
 468                 /*
 469                  * Get rid of most of the resources.  Leave the kernel pmap
 470                  * intact.
 471                  *
 472                  * If the pmap does not contain wired pages we can bulk-delete
 473                  * the pmap as a performance optimization before removing the
 474                  * related mappings.
 475                  *
 476                  * If the pmap contains wired pages we cannot do this
 477                  * pre-optimization because currently vm_fault_unwire()
 478                  * expects the pmap pages to exist and will not decrement
 479                  * p->wire_count if they do not.
 480                  */
 481                 shmexit(vm);
 482                 if (vmspace_pmap(vm)->pm_stats.wired_count) {
 483                         vm_map_remove(&vm->vm_map, VM_MIN_USER_ADDRESS,
 484                                       VM_MAX_USER_ADDRESS);
 485                         pmap_remove_pages(vmspace_pmap(vm), VM_MIN_USER_ADDRESS,
 486                                           VM_MAX_USER_ADDRESS);
 487                 } else {
 488                         pmap_remove_pages(vmspace_pmap(vm), VM_MIN_USER_ADDRESS,
 489                                           VM_MAX_USER_ADDRESS);
 490                         vm_map_remove(&vm->vm_map, VM_MIN_USER_ADDRESS,
 491                                       VM_MAX_USER_ADDRESS);
 492                 }
 493                 lwkt_reltoken(&vm->vm_map.token);
 494         } else {
 495                 KKASSERT((vm->vm_flags & VMSPACE_EXIT1) != 0);
 496                 KKASSERT((vm->vm_flags & VMSPACE_EXIT2) == 0);
 497
 498                 /*
 499                  * Get rid of remaining basic resources.
 500                  */
 501                 vm->vm_flags |= VMSPACE_EXIT2;
 502                 shmexit(vm);
 503
 504                 count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
 505                 vm_map_lock(&vm->vm_map);
 506                 cpu_vmspace_free(vm);
 507
 508                 /*
 509                  * Lock the map, to wait out all other references to it.
 510                  * Delete all of the mappings and pages they hold, then call
 511                  * the pmap module to reclaim anything left.
 512                  */
 513                 vm_map_delete(&vm->vm_map, vm->vm_map.min_offset,
 514                               vm->vm_map.max_offset, &count);
 515                 vm_map_unlock(&vm->vm_map);
 516                 vm_map_entry_release(count);
 517
 518                 pmap_release(vmspace_pmap(vm));
 519                 lwkt_reltoken(&vm->vm_map.token);
 520                 objcache_put(vmspace_cache, vm);
 521         }
 522 }
 523
 524 /*
 525  * Swap useage is determined by taking the proportional swap used by
 526  * VM objects backing the VM map.  To make up for fractional losses,
 527  * if the VM object has any swap use at all the associated map entries
 528  * count for at least 1 swap page.
 529  *
 530  * No requirements.
 531  */
 532 vm_offset_t
 533 vmspace_swap_count(struct vmspace *vm)
 534 {
 535         vm_map_t map = &vm->vm_map;
 536         vm_map_entry_t cur;
 537         vm_object_t object;
 538         vm_offset_t count = 0;
 539         vm_offset_t n;
 540
 541         vmspace_hold(vm);
 542         for (cur = map->header.next; cur != &map->header; cur = cur->next) {
 543                 switch(cur->maptype) {
 544                 case VM_MAPTYPE_NORMAL:
 545                 case VM_MAPTYPE_VPAGETABLE:
 546                         if ((object = cur->object.vm_object) == NULL)
 547                                 break;
 548                         if (object->swblock_count) {
 549                                 n = (cur->end - cur->start) / PAGE_SIZE;
 550                                 count += object->swblock_count *
 551                                     SWAP_META_PAGES * n / object->size + 1;
 552                         }
 553                         break;
 554                 default:
 555                         break;
 556                 }
 557         }
 558         vmspace_drop(vm);
 559
 560         return(count);
 561 }
 562
 563 /*
 564  * Calculate the approximate number of anonymous pages in use by
 565  * this vmspace.  To make up for fractional losses, we count each
 566  * VM object as having at least 1 anonymous page.
 567  *
 568  * No requirements.
 569  */
 570 vm_offset_t
 571 vmspace_anonymous_count(struct vmspace *vm)
 572 {
 573         vm_map_t map = &vm->vm_map;
 574         vm_map_entry_t cur;
 575         vm_object_t object;
 576         vm_offset_t count = 0;
 577
 578         vmspace_hold(vm);
 579         for (cur = map->header.next; cur != &map->header; cur = cur->next) {
 580                 switch(cur->maptype) {
 581                 case VM_MAPTYPE_NORMAL:
 582                 case VM_MAPTYPE_VPAGETABLE:
 583                         if ((object = cur->object.vm_object) == NULL)
 584                                 break;
 585                         if (object->type != OBJT_DEFAULT &&
 586                             object->type != OBJT_SWAP) {
 587                                 break;
 588                         }
 589                         count += object->resident_page_count;
 590                         break;
 591                 default:
 592                         break;
 593                 }
 594         }
 595         vmspace_drop(vm);
 596
 597         return(count);
 598 }
 599
 600 /*
 601  * Initialize an existing vm_map structure such as that in the vmspace
 602  * structure.  The pmap is initialized elsewhere.
 603  *
 604  * No requirements.
 605  */
 606 void
 607 vm_map_init(struct vm_map *map, vm_offset_t min, vm_offset_t max, pmap_t pmap)
 608 {
 609         map->header.next = map->header.prev = &map->header;
 610         RB_INIT(&map->rb_root);
 611         spin_init(&map->ilock_spin, "ilock");
 612         map->ilock_base = NULL;
 613         map->nentries = 0;
 614         map->size = 0;
 615         map->system_map = 0;
 616         map->min_offset = min;
 617         map->max_offset = max;
 618         map->pmap = pmap;
 619         map->timestamp = 0;
 620         map->flags = 0;
 621         bzero(&map->freehint, sizeof(map->freehint));
 622         lwkt_token_init(&map->token, "vm_map");
 623         lockinit(&map->lock, "vm_maplk", (hz + 9) / 10, 0);
 624 }
 625
 626 /*
 627  * Find the first possible free address for the specified request length.
 628  * Returns 0 if we don't have one cached.
 629  */
 630 static
 631 vm_offset_t
 632 vm_map_freehint_find(vm_map_t map, vm_size_t length, vm_size_t align)
 633 {
 634         vm_map_freehint_t *scan;
 635
 636         scan = &map->freehint[0];
 637         while (scan < &map->freehint[VM_MAP_FFCOUNT]) {
 638                 if (scan->length == length && scan->align == align)
 639                         return(scan->start);
 640                 ++scan;
 641         }
 642         return 0;
 643 }
 644
 645 /*
 646  * Unconditionally set the freehint.  Called by vm_map_findspace() after
 647  * it finds an address.  This will help us iterate optimally on the next
 648  * similar findspace.
 649  */
 650 static
 651 void
 652 vm_map_freehint_update(vm_map_t map, vm_offset_t start,
 653                        vm_size_t length, vm_size_t align)
 654 {
 655         vm_map_freehint_t *scan;
 656
 657         scan = &map->freehint[0];
 658         while (scan < &map->freehint[VM_MAP_FFCOUNT]) {
 659                 if (scan->length == length && scan->align == align) {
 660                         scan->start = start;
 661                         return;
 662                 }
 663                 ++scan;
 664         }
 665         scan = &map->freehint[map->freehint_newindex & VM_MAP_FFMASK];
 666         scan->start = start;
 667         scan->align = align;
 668         scan->length = length;
 669         ++map->freehint_newindex;
 670 }
 671
 672 /*
 673  * Update any existing freehints (for any alignment), for the hole we just
 674  * added.
 675  */
 676 static
 677 void
 678 vm_map_freehint_hole(vm_map_t map, vm_offset_t start, vm_size_t length)
 679 {
 680         vm_map_freehint_t *scan;
 681
 682         scan = &map->freehint[0];
 683         while (scan < &map->freehint[VM_MAP_FFCOUNT]) {
 684                 if (scan->length <= length && scan->start > start)
 685                         scan->start = start;
 686                 ++scan;
 687         }
 688 }
 689
 690 /*
 691  * Shadow the vm_map_entry's object.  This typically needs to be done when
 692  * a write fault is taken on an entry which had previously been cloned by
 693  * fork().  The shared object (which might be NULL) must become private so
 694  * we add a shadow layer above it.
 695  *
 696  * Object allocation for anonymous mappings is defered as long as possible.
 697  * When creating a shadow, however, the underlying object must be instantiated
 698  * so it can be shared.
 699  *
 700  * If the map segment is governed by a virtual page table then it is
 701  * possible to address offsets beyond the mapped area.  Just allocate
 702  * a maximally sized object for this case.
 703  *
 704  * If addref is non-zero an additional reference is added to the returned
 705  * entry.  This mechanic exists because the additional reference might have
 706  * to be added atomically and not after return to prevent a premature
 707  * collapse.
 708  *
 709  * The vm_map must be exclusively locked.
 710  * No other requirements.
 711  */
 712 static
 713 void
 714 vm_map_entry_shadow(vm_map_entry_t entry, int addref)
 715 {
 716         if (entry->maptype == VM_MAPTYPE_VPAGETABLE) {
 717                 vm_object_shadow(&entry->object.vm_object, &entry->offset,
 718                                  0x7FFFFFFF, addref);   /* XXX */
 719         } else {
 720                 vm_object_shadow(&entry->object.vm_object, &entry->offset,
 721                                  atop(entry->end - entry->start), addref);
 722         }
 723         entry->eflags &= ~MAP_ENTRY_NEEDS_COPY;
 724 }
 725
 726 /*
 727  * Allocate an object for a vm_map_entry.
 728  *
 729  * Object allocation for anonymous mappings is defered as long as possible.
 730  * This function is called when we can defer no longer, generally when a map
 731  * entry might be split or forked or takes a page fault.
 732  *
 733  * If the map segment is governed by a virtual page table then it is
 734  * possible to address offsets beyond the mapped area.  Just allocate
 735  * a maximally sized object for this case.
 736  *
 737  * The vm_map must be exclusively locked.
 738  * No other requirements.
 739  */
 740 void
 741 vm_map_entry_allocate_object(vm_map_entry_t entry)
 742 {
 743         vm_object_t obj;
 744
 745         if (entry->maptype == VM_MAPTYPE_VPAGETABLE) {
 746                 obj = vm_object_allocate(OBJT_DEFAULT, 0x7FFFFFFF); /* XXX */
 747         } else {
 748                 obj = vm_object_allocate(OBJT_DEFAULT,
 749                                          atop(entry->end - entry->start));
 750         }
 751         entry->object.vm_object = obj;
 752         entry->offset = 0;
 753 }
 754
 755 /*
 756  * Set an initial negative count so the first attempt to reserve
 757  * space preloads a bunch of vm_map_entry's for this cpu.  Also
 758  * pre-allocate 2 vm_map_entries which will be needed by zalloc() to
 759  * map a new page for vm_map_entry structures.  SMP systems are
 760  * particularly sensitive.
 761  *
 762  * This routine is called in early boot so we cannot just call
 763  * vm_map_entry_reserve().
 764  *
 765  * Called from the low level boot code only (for each cpu)
 766  *
 767  * WARNING! Take care not to have too-big a static/BSS structure here
 768  *          as MAXCPU can be 256+, otherwise the loader's 64MB heap
 769  *          can get blown out by the kernel plus the initrd image.
 770  */
 771 void
 772 vm_map_entry_reserve_cpu_init(globaldata_t gd)
 773 {
 774         vm_map_entry_t entry;
 775         int count;
 776         int i;
 777
 778         atomic_add_int(&gd->gd_vme_avail, -MAP_RESERVE_COUNT * 2);
 779         if (gd->gd_cpuid == 0) {
 780                 entry = &cpu_map_entry_init_bsp[0];
 781                 count = MAPENTRYBSP_CACHE;
 782         } else {
 783                 entry = &cpu_map_entry_init_ap[gd->gd_cpuid][0];
 784                 count = MAPENTRYAP_CACHE;
 785         }
 786         for (i = 0; i < count; ++i, ++entry) {
 787                 entry->next = gd->gd_vme_base;
 788                 gd->gd_vme_base = entry;
 789         }
 790 }
 791
 792 /*
 793  * Reserves vm_map_entry structures so code later-on can manipulate
 794  * map_entry structures within a locked map without blocking trying
 795  * to allocate a new vm_map_entry.
 796  *
 797  * No requirements.
 798  *
 799  * WARNING!  We must not decrement gd_vme_avail until after we have
 800  *           ensured that sufficient entries exist, otherwise we can
 801  *           get into an endless call recursion in the zalloc code
 802  *           itself.
 803  */
 804 int
 805 vm_map_entry_reserve(int count)
 806 {
 807         struct globaldata *gd = mycpu;
 808         vm_map_entry_t entry;
 809
 810         /*
 811          * Make sure we have enough structures in gd_vme_base to handle
 812          * the reservation request.
 813          *
 814          * Use a critical section to protect against VM faults.  It might
 815          * not be needed, but we have to be careful here.
 816          */
 817         if (gd->gd_vme_avail < count) {
 818                 crit_enter();
 819                 while (gd->gd_vme_avail < count) {
 820                         entry = zalloc(mapentzone);
 821                         entry->next = gd->gd_vme_base;
 822                         gd->gd_vme_base = entry;
 823                         atomic_add_int(&gd->gd_vme_avail, 1);
 824                 }
 825                 crit_exit();
 826         }
 827         atomic_add_int(&gd->gd_vme_avail, -count);
 828
 829         return(count);
 830 }
 831
 832 /*
 833  * Releases previously reserved vm_map_entry structures that were not
 834  * used.  If we have too much junk in our per-cpu cache clean some of
 835  * it out.
 836  *
 837  * No requirements.
 838  */
 839 void
 840 vm_map_entry_release(int count)
 841 {
 842         struct globaldata *gd = mycpu;
 843         vm_map_entry_t entry;
 844         vm_map_entry_t efree;
 845
 846         count = atomic_fetchadd_int(&gd->gd_vme_avail, count) + count;
 847         if (gd->gd_vme_avail > MAP_RESERVE_SLOP) {
 848                 efree = NULL;
 849                 crit_enter();
 850                 while (gd->gd_vme_avail > MAP_RESERVE_HYST) {
 851                         entry = gd->gd_vme_base;
 852                         KKASSERT(entry != NULL);
 853                         gd->gd_vme_base = entry->next;
 854                         atomic_add_int(&gd->gd_vme_avail, -1);
 855                         entry->next = efree;
 856                         efree = entry;
 857                 }
 858                 crit_exit();
 859                 while ((entry = efree) != NULL) {
 860                         efree = efree->next;
 861                         zfree(mapentzone, entry);
 862                 }
 863         }
 864 }
 865
 866 /*
 867  * Reserve map entry structures for use in kernel_map itself.  These
 868  * entries have *ALREADY* been reserved on a per-cpu basis when the map
 869  * was inited.  This function is used by zalloc() to avoid a recursion
 870  * when zalloc() itself needs to allocate additional kernel memory.
 871  *
 872  * This function works like the normal reserve but does not load the
 873  * vm_map_entry cache (because that would result in an infinite
 874  * recursion).  Note that gd_vme_avail may go negative.  This is expected.
 875  *
 876  * Any caller of this function must be sure to renormalize after
 877  * potentially eating entries to ensure that the reserve supply
 878  * remains intact.
 879  *
 880  * No requirements.
 881  */
 882 int
 883 vm_map_entry_kreserve(int count)
 884 {
 885         struct globaldata *gd = mycpu;
 886
 887         atomic_add_int(&gd->gd_vme_avail, -count);
 888         KASSERT(gd->gd_vme_base != NULL,
 889                 ("no reserved entries left, gd_vme_avail = %d",
 890                 gd->gd_vme_avail));
 891         return(count);
 892 }
 893
 894 /*
 895  * Release previously reserved map entries for kernel_map.  We do not
 896  * attempt to clean up like the normal release function as this would
 897  * cause an unnecessary (but probably not fatal) deep procedure call.
 898  *
 899  * No requirements.
 900  */
 901 void
 902 vm_map_entry_krelease(int count)
 903 {
 904         struct globaldata *gd = mycpu;
 905
 906         atomic_add_int(&gd->gd_vme_avail, count);
 907 }
 908
 909 /*
 910  * Allocates a VM map entry for insertion.  No entry fields are filled in.
 911  *
 912  * The entries should have previously been reserved.  The reservation count
 913  * is tracked in (*countp).
 914  *
 915  * No requirements.
 916  */
 917 static vm_map_entry_t
 918 vm_map_entry_create(vm_map_t map, int *countp)
 919 {
 920         struct globaldata *gd = mycpu;
 921         vm_map_entry_t entry;
 922
 923         KKASSERT(*countp > 0);
 924         --*countp;
 925         crit_enter();
 926         entry = gd->gd_vme_base;
 927         KASSERT(entry != NULL, ("gd_vme_base NULL! count %d", *countp));
 928         gd->gd_vme_base = entry->next;
 929         crit_exit();
 930
 931         return(entry);
 932 }
 933
 934 /*
 935  * Dispose of a vm_map_entry that is no longer being referenced.
 936  *
 937  * No requirements.
 938  */
 939 static void
 940 vm_map_entry_dispose(vm_map_t map, vm_map_entry_t entry, int *countp)
 941 {
 942         struct globaldata *gd = mycpu;
 943
 944         ++*countp;
 945         crit_enter();
 946         entry->next = gd->gd_vme_base;
 947         gd->gd_vme_base = entry;
 948         crit_exit();
 949 }
 950
 951
 952 /*
 953  * Insert/remove entries from maps.
 954  *
 955  * The related map must be exclusively locked.
 956  * The caller must hold map->token
 957  * No other requirements.
 958  */
 959 static __inline void
 960 vm_map_entry_link(vm_map_t map,
 961                   vm_map_entry_t after_where,
 962                   vm_map_entry_t entry)
 963 {
 964         ASSERT_VM_MAP_LOCKED(map);
 965
 966         map->nentries++;
 967         entry->prev = after_where;
 968         entry->next = after_where->next;
 969         entry->next->prev = entry;
 970         after_where->next = entry;
 971         if (vm_map_rb_tree_RB_INSERT(&map->rb_root, entry))
 972                 panic("vm_map_entry_link: dup addr map %p ent %p", map, entry);
 973 }
 974
 975 static __inline void
 976 vm_map_entry_unlink(vm_map_t map,
 977                     vm_map_entry_t entry)
 978 {
 979         vm_map_entry_t prev;
 980         vm_map_entry_t next;
 981
 982         ASSERT_VM_MAP_LOCKED(map);
 983
 984         if (entry->eflags & MAP_ENTRY_IN_TRANSITION) {
 985                 panic("vm_map_entry_unlink: attempt to mess with "
 986                       "locked entry! %p", entry);
 987         }
 988         prev = entry->prev;
 989         next = entry->next;
 990         next->prev = prev;
 991         prev->next = next;
 992         vm_map_rb_tree_RB_REMOVE(&map->rb_root, entry);
 993         map->nentries--;
 994 }
 995
 996 /*
 997  * Finds the map entry containing (or immediately preceding) the specified
 998  * address in the given map.  The entry is returned in (*entry).
 999  *
1000  * The boolean result indicates whether the address is actually contained
1001  * in the map.
1002  *
1003  * The related map must be locked.
1004  * No other requirements.
1005  */
1006 boolean_t
1007 vm_map_lookup_entry(vm_map_t map, vm_offset_t address, vm_map_entry_t *entry)
1008 {
1009         vm_map_entry_t tmp;
1010         vm_map_entry_t last;
1011
1012         ASSERT_VM_MAP_LOCKED(map);
1013
1014         /*
1015          * Locate the record from the top of the tree.  'last' tracks the
1016          * closest prior record and is returned if no match is found, which
1017          * in binary tree terms means tracking the most recent right-branch
1018          * taken.  If there is no prior record, &map->header is returned.
1019          */
1020         last = &map->header;
1021         tmp = RB_ROOT(&map->rb_root);
1022
1023         while (tmp) {
1024                 if (address >= tmp->start) {
1025                         if (address < tmp->end) {
1026                                 *entry = tmp;
1027                                 return(TRUE);
1028                         }
1029                         last = tmp;
1030                         tmp = RB_RIGHT(tmp, rb_entry);
1031                 } else {
1032                         tmp = RB_LEFT(tmp, rb_entry);
1033                 }
1034         }
1035         *entry = last;
1036         return (FALSE);
1037 }
1038
1039 /*
1040  * Inserts the given whole VM object into the target map at the specified
1041  * address range.  The object's size should match that of the address range.
1042  *
1043  * The map must be exclusively locked.
1044  * The object must be held.
1045  * The caller must have reserved sufficient vm_map_entry structures.
1046  *
1047  * If object is non-NULL, ref count must be bumped by caller prior to
1048  * making call to account for the new entry.
1049  */
1050 int
1051 vm_map_insert(vm_map_t map, int *countp, void *map_object, void *map_aux,
1052               vm_ooffset_t offset, vm_offset_t start, vm_offset_t end,
1053               vm_maptype_t maptype, vm_subsys_t id,
1054               vm_prot_t prot, vm_prot_t max, int cow)
1055 {
1056         vm_map_entry_t new_entry;
1057         vm_map_entry_t prev_entry;
1058         vm_map_entry_t temp_entry;
1059         vm_eflags_t protoeflags;
1060         int must_drop = 0;
1061         vm_object_t object;
1062
1063         if (maptype == VM_MAPTYPE_UKSMAP)
1064                 object = NULL;
1065         else
1066                 object = map_object;
1067
1068         ASSERT_VM_MAP_LOCKED(map);
1069         if (object)
1070                 ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
1071
1072         /*
1073          * Check that the start and end points are not bogus.
1074          */
1075         if ((start < map->min_offset) || (end > map->max_offset) ||
1076             (start >= end))
1077                 return (KERN_INVALID_ADDRESS);
1078
1079         /*
1080          * Find the entry prior to the proposed starting address; if it's part
1081          * of an existing entry, this range is bogus.
1082          */
1083         if (vm_map_lookup_entry(map, start, &temp_entry))
1084                 return (KERN_NO_SPACE);
1085
1086         prev_entry = temp_entry;
1087
1088         /*
1089          * Assert that the next entry doesn't overlap the end point.
1090          */
1091
1092         if ((prev_entry->next != &map->header) &&
1093             (prev_entry->next->start < end))
1094                 return (KERN_NO_SPACE);
1095
1096         protoeflags = 0;
1097
1098         if (cow & MAP_COPY_ON_WRITE)
1099                 protoeflags |= MAP_ENTRY_COW|MAP_ENTRY_NEEDS_COPY;
1100
1101         if (cow & MAP_NOFAULT) {
1102                 protoeflags |= MAP_ENTRY_NOFAULT;
1103
1104                 KASSERT(object == NULL,
1105                         ("vm_map_insert: paradoxical MAP_NOFAULT request"));
1106         }
1107         if (cow & MAP_DISABLE_SYNCER)
1108                 protoeflags |= MAP_ENTRY_NOSYNC;
1109         if (cow & MAP_DISABLE_COREDUMP)
1110                 protoeflags |= MAP_ENTRY_NOCOREDUMP;
1111         if (cow & MAP_IS_STACK)
1112                 protoeflags |= MAP_ENTRY_STACK;
1113         if (cow & MAP_IS_KSTACK)
1114                 protoeflags |= MAP_ENTRY_KSTACK;
1115
1116         lwkt_gettoken(&map->token);
1117
1118         if (object) {
1119                 /*
1120                  * When object is non-NULL, it could be shared with another
1121                  * process.  We have to set or clear OBJ_ONEMAPPING
1122                  * appropriately.
1123                  *
1124                  * NOTE: This flag is only applicable to DEFAULT and SWAP
1125                  *       objects and will already be clear in other types
1126                  *       of objects, so a shared object lock is ok for
1127                  *       VNODE objects.
1128                  */
1129                 if ((object->ref_count > 1) || (object->shadow_count != 0)) {
1130                         vm_object_clear_flag(object, OBJ_ONEMAPPING);
1131                 }
1132         }
1133         else if ((prev_entry != &map->header) &&
1134                  (prev_entry->eflags == protoeflags) &&
1135                  (prev_entry->end == start) &&
1136                  (prev_entry->wired_count == 0) &&
1137                  (prev_entry->id == id) &&
1138                  prev_entry->maptype == maptype &&
1139                  maptype == VM_MAPTYPE_NORMAL &&
1140                  ((prev_entry->object.vm_object == NULL) ||
1141                   vm_object_coalesce(prev_entry->object.vm_object,
1142                                      OFF_TO_IDX(prev_entry->offset),
1143                                      (vm_size_t)(prev_entry->end - prev_entry->start),
1144                                      (vm_size_t)(end - prev_entry->end)))) {
1145                 /*
1146                  * We were able to extend the object.  Determine if we
1147                  * can extend the previous map entry to include the
1148                  * new range as well.
1149                  */
1150                 if ((prev_entry->inheritance == VM_INHERIT_DEFAULT) &&
1151                     (prev_entry->protection == prot) &&
1152                     (prev_entry->max_protection == max)) {
1153                         map->size += (end - prev_entry->end);
1154                         prev_entry->end = end;
1155                         vm_map_simplify_entry(map, prev_entry, countp);
1156                         lwkt_reltoken(&map->token);
1157                         return (KERN_SUCCESS);
1158                 }
1159
1160                 /*
1161                  * If we can extend the object but cannot extend the
1162                  * map entry, we have to create a new map entry.  We
1163                  * must bump the ref count on the extended object to
1164                  * account for it.  object may be NULL.
1165                  *
1166                  * XXX if object is NULL should we set offset to 0 here ?
1167                  */
1168                 object = prev_entry->object.vm_object;
1169                 offset = prev_entry->offset +
1170                         (prev_entry->end - prev_entry->start);
1171                 if (object) {
1172                         vm_object_hold(object);
1173                         vm_object_chain_wait(object, 0);
1174                         vm_object_reference_locked(object);
1175                         must_drop = 1;
1176                         map_object = object;
1177                 }
1178         }
1179
1180         /*
1181          * NOTE: if conditionals fail, object can be NULL here.  This occurs
1182          * in things like the buffer map where we manage kva but do not manage
1183          * backing objects.
1184          */
1185
1186         /*
1187          * Create a new entry
1188          */
1189
1190         new_entry = vm_map_entry_create(map, countp);
1191         new_entry->start = start;
1192         new_entry->end = end;
1193         new_entry->id = id;
1194
1195         new_entry->maptype = maptype;
1196         new_entry->eflags = protoeflags;
1197         new_entry->object.map_object = map_object;
1198         new_entry->aux.master_pde = 0;          /* in case size is different */
1199         new_entry->aux.map_aux = map_aux;
1200         new_entry->offset = offset;
1201
1202         new_entry->inheritance = VM_INHERIT_DEFAULT;
1203         new_entry->protection = prot;
1204         new_entry->max_protection = max;
1205         new_entry->wired_count = 0;
1206
1207         /*
1208          * Insert the new entry into the list
1209          */
1210
1211         vm_map_entry_link(map, prev_entry, new_entry);
1212         map->size += new_entry->end - new_entry->start;
1213
1214         /*
1215          * Don't worry about updating freehint[] when inserting, allow
1216          * addresses to be lower than the actual first free spot.
1217          */
1218 #if 0
1219         /*
1220          * Temporarily removed to avoid MAP_STACK panic, due to
1221          * MAP_STACK being a huge hack.  Will be added back in
1222          * when MAP_STACK (and the user stack mapping) is fixed.
1223          */
1224         /*
1225          * It may be possible to simplify the entry
1226          */
1227         vm_map_simplify_entry(map, new_entry, countp);
1228 #endif
1229
1230         /*
1231          * Try to pre-populate the page table.  Mappings governed by virtual
1232          * page tables cannot be prepopulated without a lot of work, so
1233          * don't try.
1234          */
1235         if ((cow & (MAP_PREFAULT|MAP_PREFAULT_PARTIAL)) &&
1236             maptype != VM_MAPTYPE_VPAGETABLE &&
1237             maptype != VM_MAPTYPE_UKSMAP) {
1238                 int dorelock = 0;
1239                 if (vm_map_relock_enable && (cow & MAP_PREFAULT_RELOCK)) {
1240                         dorelock = 1;
1241                         vm_object_lock_swap();
1242                         vm_object_drop(object);
1243                 }
1244                 pmap_object_init_pt(map->pmap, start, prot,
1245                                     object, OFF_TO_IDX(offset), end - start,
1246                                     cow & MAP_PREFAULT_PARTIAL);
1247                 if (dorelock) {
1248                         vm_object_hold(object);
1249                         vm_object_lock_swap();
1250                 }
1251         }
1252         if (must_drop)
1253                 vm_object_drop(object);
1254
1255         lwkt_reltoken(&map->token);
1256         return (KERN_SUCCESS);
1257 }
1258
1259 /*
1260  * Find sufficient space for `length' bytes in the given map, starting at
1261  * `start'.  Returns 0 on success, 1 on no space.
1262  *
1263  * This function will returned an arbitrarily aligned pointer.  If no
1264  * particular alignment is required you should pass align as 1.  Note that
1265  * the map may return PAGE_SIZE aligned pointers if all the lengths used in
1266  * the map are a multiple of PAGE_SIZE, even if you pass a smaller align
1267  * argument.
1268  *
1269  * 'align' should be a power of 2 but is not required to be.
1270  *
1271  * The map must be exclusively locked.
1272  * No other requirements.
1273  */
1274 int
1275 vm_map_findspace(vm_map_t map, vm_offset_t start, vm_size_t length,
1276                  vm_size_t align, int flags, vm_offset_t *addr)
1277 {
1278         vm_map_entry_t entry, next;
1279         vm_map_entry_t tmp;
1280         vm_offset_t hole_start;
1281         vm_offset_t end;
1282         vm_offset_t align_mask;
1283
1284         if (start < map->min_offset)
1285                 start = map->min_offset;
1286         if (start > map->max_offset)
1287                 return (1);
1288
1289         /*
1290          * If the alignment is not a power of 2 we will have to use
1291          * a mod/division, set align_mask to a special value.
1292          */
1293         if ((align | (align - 1)) + 1 != (align << 1))
1294                 align_mask = (vm_offset_t)-1;
1295         else
1296                 align_mask = align - 1;
1297
1298         /*
1299          * Use freehint to adjust the start point, hopefully reducing
1300          * the iteration to O(1).
1301          */
1302         hole_start = vm_map_freehint_find(map, length, align);
1303         if (start < hole_start)
1304                 start = hole_start;
1305         if (vm_map_lookup_entry(map, start, &tmp))
1306                 start = tmp->end;
1307         entry = tmp;
1308
1309         /*
1310          * Look through the rest of the map, trying to fit a new region in the
1311          * gap between existing regions, or after the very last region.
1312          */
1313         for (;; start = (entry = next)->end) {
1314                 /*
1315                  * Adjust the proposed start by the requested alignment,
1316                  * be sure that we didn't wrap the address.
1317                  */
1318                 if (align_mask == (vm_offset_t)-1)
1319                         end = roundup(start, align);
1320                 else
1321                         end = (start + align_mask) & ~align_mask;
1322                 if (end < start)
1323                         return (1);
1324                 start = end;
1325
1326                 /*
1327                  * Find the end of the proposed new region.  Be sure we didn't
1328                  * go beyond the end of the map, or wrap around the address.
1329                  * Then check to see if this is the last entry or if the
1330                  * proposed end fits in the gap between this and the next
1331                  * entry.
1332                  */
1333                 end = start + length;
1334                 if (end > map->max_offset || end < start)
1335                         return (1);
1336                 next = entry->next;
1337
1338                 /*
1339                  * If the next entry's start address is beyond the desired
1340                  * end address we may have found a good entry.
1341                  *
1342                  * If the next entry is a stack mapping we do not map into
1343                  * the stack's reserved space.
1344                  *
1345                  * XXX continue to allow mapping into the stack's reserved
1346                  * space if doing a MAP_STACK mapping inside a MAP_STACK
1347                  * mapping, for backwards compatibility.  But the caller
1348                  * really should use MAP_STACK | MAP_TRYFIXED if they
1349                  * want to do that.
1350                  */
1351                 if (next == &map->header)
1352                         break;
1353                 if (next->start >= end) {
1354                         if ((next->eflags & MAP_ENTRY_STACK) == 0)
1355                                 break;
1356                         if (flags & MAP_STACK)
1357                                 break;
1358                         if (next->start - next->aux.avail_ssize >= end)
1359                                 break;
1360                 }
1361         }
1362
1363         /*
1364          * Update the freehint
1365          */
1366         vm_map_freehint_update(map, start, length, align);
1367
1368         /*
1369          * Grow the kernel_map if necessary.  pmap_growkernel() will panic
1370          * if it fails.  The kernel_map is locked and nothing can steal
1371          * our address space if pmap_growkernel() blocks.
1372          *
1373          * NOTE: This may be unconditionally called for kldload areas on
1374          *       x86_64 because these do not bump kernel_vm_end (which would
1375          *       fill 128G worth of page tables!).  Therefore we must not
1376          *       retry.
1377          */
1378         if (map == &kernel_map) {
1379                 vm_offset_t kstop;
1380
1381                 kstop = round_page(start + length);
1382                 if (kstop > kernel_vm_end)
1383                         pmap_growkernel(start, kstop);
1384         }
1385         *addr = start;
1386         return (0);
1387 }
1388
1389 /*
1390  * vm_map_find finds an unallocated region in the target address map with
1391  * the given length and allocates it.  The search is defined to be first-fit
1392  * from the specified address; the region found is returned in the same
1393  * parameter.
1394  *
1395  * If object is non-NULL, ref count must be bumped by caller
1396  * prior to making call to account for the new entry.
1397  *
1398  * No requirements.  This function will lock the map temporarily.
1399  */
1400 int
1401 vm_map_find(vm_map_t map, void *map_object, void *map_aux,
1402             vm_ooffset_t offset, vm_offset_t *addr,
1403             vm_size_t length, vm_size_t align, boolean_t fitit,
1404             vm_maptype_t maptype, vm_subsys_t id,
1405             vm_prot_t prot, vm_prot_t max, int cow)
1406 {
1407         vm_offset_t start;
1408         vm_object_t object;
1409         int result;
1410         int count;
1411
1412         if (maptype == VM_MAPTYPE_UKSMAP)
1413                 object = NULL;
1414         else
1415                 object = map_object;
1416
1417         start = *addr;
1418
1419         count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
1420         vm_map_lock(map);
1421         if (object)
1422                 vm_object_hold_shared(object);
1423         if (fitit) {
1424                 if (vm_map_findspace(map, start, length, align, 0, addr)) {
1425                         if (object)
1426                                 vm_object_drop(object);
1427                         vm_map_unlock(map);
1428                         vm_map_entry_release(count);
1429                         return (KERN_NO_SPACE);
1430                 }
1431                 start = *addr;
1432         }
1433         result = vm_map_insert(map, &count, map_object, map_aux,
1434                                offset, start, start + length,
1435                                maptype, id, prot, max, cow);
1436         if (object)
1437                 vm_object_drop(object);
1438         vm_map_unlock(map);
1439         vm_map_entry_release(count);
1440
1441         return (result);
1442 }
1443
1444 /*
1445  * Simplify the given map entry by merging with either neighbor.  This
1446  * routine also has the ability to merge with both neighbors.
1447  *
1448  * This routine guarentees that the passed entry remains valid (though
1449  * possibly extended).  When merging, this routine may delete one or
1450  * both neighbors.  No action is taken on entries which have their
1451  * in-transition flag set.
1452  *
1453  * The map must be exclusively locked.
1454  */
1455 void
1456 vm_map_simplify_entry(vm_map_t map, vm_map_entry_t entry, int *countp)
1457 {
1458         vm_map_entry_t next, prev;
1459         vm_size_t prevsize, esize;
1460
1461         if (entry->eflags & MAP_ENTRY_IN_TRANSITION) {
1462                 ++mycpu->gd_cnt.v_intrans_coll;
1463                 return;
1464         }
1465
1466         if (entry->maptype == VM_MAPTYPE_SUBMAP)
1467                 return;
1468         if (entry->maptype == VM_MAPTYPE_UKSMAP)
1469                 return;
1470
1471         prev = entry->prev;
1472         if (prev != &map->header) {
1473                 prevsize = prev->end - prev->start;
1474                 if ( (prev->end == entry->start) &&
1475                      (prev->maptype == entry->maptype) &&
1476                      (prev->object.vm_object == entry->object.vm_object) &&
1477                      (!prev->object.vm_object ||
1478                         (prev->offset + prevsize == entry->offset)) &&
1479                      (prev->eflags == entry->eflags) &&
1480                      (prev->protection == entry->protection) &&
1481                      (prev->max_protection == entry->max_protection) &&
1482                      (prev->inheritance == entry->inheritance) &&
1483                      (prev->id == entry->id) &&
1484                      (prev->wired_count == entry->wired_count)) {
1485                         vm_map_entry_unlink(map, prev);
1486                         entry->start = prev->start;
1487                         entry->offset = prev->offset;
1488                         if (prev->object.vm_object)
1489                                 vm_object_deallocate(prev->object.vm_object);
1490                         vm_map_entry_dispose(map, prev, countp);
1491                 }
1492         }
1493
1494         next = entry->next;
1495         if (next != &map->header) {
1496                 esize = entry->end - entry->start;
1497                 if ((entry->end == next->start) &&
1498                     (next->maptype == entry->maptype) &&
1499                     (next->object.vm_object == entry->object.vm_object) &&
1500                      (!entry->object.vm_object ||
1501                         (entry->offset + esize == next->offset)) &&
1502                     (next->eflags == entry->eflags) &&
1503                     (next->protection == entry->protection) &&
1504                     (next->max_protection == entry->max_protection) &&
1505                     (next->inheritance == entry->inheritance) &&
1506                     (next->id == entry->id) &&
1507                     (next->wired_count == entry->wired_count)) {
1508                         vm_map_entry_unlink(map, next);
1509                         entry->end = next->end;
1510                         if (next->object.vm_object)
1511                                 vm_object_deallocate(next->object.vm_object);
1512                         vm_map_entry_dispose(map, next, countp);
1513                 }
1514         }
1515 }
1516
1517 /*
1518  * Asserts that the given entry begins at or after the specified address.
1519  * If necessary, it splits the entry into two.
1520  */
1521 #define vm_map_clip_start(map, entry, startaddr, countp)                \
1522 {                                                                       \
1523         if (startaddr > entry->start)                                   \
1524                 _vm_map_clip_start(map, entry, startaddr, countp);      \
1525 }
1526
1527 /*
1528  * This routine is called only when it is known that the entry must be split.
1529  *
1530  * The map must be exclusively locked.
1531  */
1532 static void
1533 _vm_map_clip_start(vm_map_t map, vm_map_entry_t entry, vm_offset_t start,
1534                    int *countp)
1535 {
1536         vm_map_entry_t new_entry;
1537
1538         /*
1539          * Split off the front portion -- note that we must insert the new
1540          * entry BEFORE this one, so that this entry has the specified
1541          * starting address.
1542          */
1543
1544         vm_map_simplify_entry(map, entry, countp);
1545
1546         /*
1547          * If there is no object backing this entry, we might as well create
1548          * one now.  If we defer it, an object can get created after the map
1549          * is clipped, and individual objects will be created for the split-up
1550          * map.  This is a bit of a hack, but is also about the best place to
1551          * put this improvement.
1552          */
1553         if (entry->object.vm_object == NULL && !map->system_map &&
1554             VM_MAP_ENTRY_WITHIN_PARTITION(entry)) {
1555                 vm_map_entry_allocate_object(entry);
1556         }
1557
1558         new_entry = vm_map_entry_create(map, countp);
1559         *new_entry = *entry;
1560
1561         new_entry->end = start;
1562         entry->offset += (start - entry->start);
1563         entry->start = start;
1564
1565         vm_map_entry_link(map, entry->prev, new_entry);
1566
1567         switch(entry->maptype) {
1568         case VM_MAPTYPE_NORMAL:
1569         case VM_MAPTYPE_VPAGETABLE:
1570                 if (new_entry->object.vm_object) {
1571                         vm_object_hold(new_entry->object.vm_object);
1572                         vm_object_chain_wait(new_entry->object.vm_object, 0);
1573                         vm_object_reference_locked(new_entry->object.vm_object);
1574                         vm_object_drop(new_entry->object.vm_object);
1575                 }
1576                 break;
1577         default:
1578                 break;
1579         }
1580 }
1581
1582 /*
1583  * Asserts that the given entry ends at or before the specified address.
1584  * If necessary, it splits the entry into two.
1585  *
1586  * The map must be exclusively locked.
1587  */
1588 #define vm_map_clip_end(map, entry, endaddr, countp)            \
1589 {                                                               \
1590         if (endaddr < entry->end)                               \
1591                 _vm_map_clip_end(map, entry, endaddr, countp);  \
1592 }
1593
1594 /*
1595  * This routine is called only when it is known that the entry must be split.
1596  *
1597  * The map must be exclusively locked.
1598  */
1599 static void
1600 _vm_map_clip_end(vm_map_t map, vm_map_entry_t entry, vm_offset_t end,
1601                  int *countp)
1602 {
1603         vm_map_entry_t new_entry;
1604
1605         /*
1606          * If there is no object backing this entry, we might as well create
1607          * one now.  If we defer it, an object can get created after the map
1608          * is clipped, and individual objects will be created for the split-up
1609          * map.  This is a bit of a hack, but is also about the best place to
1610          * put this improvement.
1611          */
1612
1613         if (entry->object.vm_object == NULL && !map->system_map &&
1614             VM_MAP_ENTRY_WITHIN_PARTITION(entry)) {
1615                 vm_map_entry_allocate_object(entry);
1616         }
1617
1618         /*
1619          * Create a new entry and insert it AFTER the specified entry
1620          */
1621         new_entry = vm_map_entry_create(map, countp);
1622         *new_entry = *entry;
1623
1624         new_entry->start = entry->end = end;
1625         new_entry->offset += (end - entry->start);
1626
1627         vm_map_entry_link(map, entry, new_entry);
1628
1629         switch(entry->maptype) {
1630         case VM_MAPTYPE_NORMAL:
1631         case VM_MAPTYPE_VPAGETABLE:
1632                 if (new_entry->object.vm_object) {
1633                         vm_object_hold(new_entry->object.vm_object);
1634                         vm_object_chain_wait(new_entry->object.vm_object, 0);
1635                         vm_object_reference_locked(new_entry->object.vm_object);
1636                         vm_object_drop(new_entry->object.vm_object);
1637                 }
1638                 break;
1639         default:
1640                 break;
1641         }
1642 }
1643
1644 /*
1645  * Asserts that the starting and ending region addresses fall within the
1646  * valid range for the map.
1647  */
1648 #define VM_MAP_RANGE_CHECK(map, start, end)     \
1649 {                                               \
1650         if (start < vm_map_min(map))            \
1651                 start = vm_map_min(map);        \
1652         if (end > vm_map_max(map))              \
1653                 end = vm_map_max(map);          \
1654         if (start > end)                        \
1655                 start = end;                    \
1656 }
1657
1658 /*
1659  * Used to block when an in-transition collison occurs.  The map
1660  * is unlocked for the sleep and relocked before the return.
1661  */
1662 void
1663 vm_map_transition_wait(vm_map_t map, int relock)
1664 {
1665         tsleep_interlock(map, 0);
1666         vm_map_unlock(map);
1667         tsleep(map, PINTERLOCKED, "vment", 0);
1668         if (relock)
1669                 vm_map_lock(map);
1670 }
1671
1672 /*
1673  * When we do blocking operations with the map lock held it is
1674  * possible that a clip might have occured on our in-transit entry,
1675  * requiring an adjustment to the entry in our loop.  These macros
1676  * help the pageable and clip_range code deal with the case.  The
1677  * conditional costs virtually nothing if no clipping has occured.
1678  */
1679
1680 #define CLIP_CHECK_BACK(entry, save_start)              \
1681     do {                                                \
1682             while (entry->start != save_start) {        \
1683                     entry = entry->prev;                \
1684                     KASSERT(entry != &map->header, ("bad entry clip")); \
1685             }                                           \
1686     } while(0)
1687
1688 #define CLIP_CHECK_FWD(entry, save_end)                 \
1689     do {                                                \
1690             while (entry->end != save_end) {            \
1691                     entry = entry->next;                \
1692                     KASSERT(entry != &map->header, ("bad entry clip")); \
1693             }                                           \
1694     } while(0)
1695
1696
1697 /*
1698  * Clip the specified range and return the base entry.  The
1699  * range may cover several entries starting at the returned base
1700  * and the first and last entry in the covering sequence will be
1701  * properly clipped to the requested start and end address.
1702  *
1703  * If no holes are allowed you should pass the MAP_CLIP_NO_HOLES
1704  * flag.
1705  *
1706  * The MAP_ENTRY_IN_TRANSITION flag will be set for the entries
1707  * covered by the requested range.
1708  *
1709  * The map must be exclusively locked on entry and will remain locked
1710  * on return. If no range exists or the range contains holes and you
1711  * specified that no holes were allowed, NULL will be returned.  This
1712  * routine may temporarily unlock the map in order avoid a deadlock when
1713  * sleeping.
1714  */
1715 static
1716 vm_map_entry_t
1717 vm_map_clip_range(vm_map_t map, vm_offset_t start, vm_offset_t end,
1718                   int *countp, int flags)
1719 {
1720         vm_map_entry_t start_entry;
1721         vm_map_entry_t entry;
1722
1723         /*
1724          * Locate the entry and effect initial clipping.  The in-transition
1725          * case does not occur very often so do not try to optimize it.
1726          */
1727 again:
1728         if (vm_map_lookup_entry(map, start, &start_entry) == FALSE)
1729                 return (NULL);
1730         entry = start_entry;
1731         if (entry->eflags & MAP_ENTRY_IN_TRANSITION) {
1732                 entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
1733                 ++mycpu->gd_cnt.v_intrans_coll;
1734                 ++mycpu->gd_cnt.v_intrans_wait;
1735                 vm_map_transition_wait(map, 1);
1736                 /*
1737                  * entry and/or start_entry may have been clipped while
1738                  * we slept, or may have gone away entirely.  We have
1739                  * to restart from the lookup.
1740                  */
1741                 goto again;
1742         }
1743
1744         /*
1745          * Since we hold an exclusive map lock we do not have to restart
1746          * after clipping, even though clipping may block in zalloc.
1747          */
1748         vm_map_clip_start(map, entry, start, countp);
1749         vm_map_clip_end(map, entry, end, countp);
1750         entry->eflags |= MAP_ENTRY_IN_TRANSITION;
1751
1752         /*
1753          * Scan entries covered by the range.  When working on the next
1754          * entry a restart need only re-loop on the current entry which
1755          * we have already locked, since 'next' may have changed.  Also,
1756          * even though entry is safe, it may have been clipped so we
1757          * have to iterate forwards through the clip after sleeping.
1758          */
1759         while (entry->next != &map->header && entry->next->start < end) {
1760                 vm_map_entry_t next = entry->next;
1761
1762                 if (flags & MAP_CLIP_NO_HOLES) {
1763                         if (next->start > entry->end) {
1764                                 vm_map_unclip_range(map, start_entry,
1765                                         start, entry->end, countp, flags);
1766                                 return(NULL);
1767                         }
1768                 }
1769
1770                 if (next->eflags & MAP_ENTRY_IN_TRANSITION) {
1771                         vm_offset_t save_end = entry->end;
1772                         next->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
1773                         ++mycpu->gd_cnt.v_intrans_coll;
1774                         ++mycpu->gd_cnt.v_intrans_wait;
1775                         vm_map_transition_wait(map, 1);
1776
1777                         /*
1778                          * clips might have occured while we blocked.
1779                          */
1780                         CLIP_CHECK_FWD(entry, save_end);
1781                         CLIP_CHECK_BACK(start_entry, start);
1782                         continue;
1783                 }
1784
1785                 /*
1786                  * No restart necessary even though clip_end may block, we
1787                  * are holding the map lock.
1788                  */
1789                 vm_map_clip_end(map, next, end, countp);
1790                 next->eflags |= MAP_ENTRY_IN_TRANSITION;
1791                 entry = next;
1792         }
1793         if (flags & MAP_CLIP_NO_HOLES) {
1794                 if (entry->end != end) {
1795                         vm_map_unclip_range(map, start_entry,
1796                                 start, entry->end, countp, flags);
1797                         return(NULL);
1798                 }
1799         }
1800         return(start_entry);
1801 }
1802
1803 /*
1804  * Undo the effect of vm_map_clip_range().  You should pass the same
1805  * flags and the same range that you passed to vm_map_clip_range().
1806  * This code will clear the in-transition flag on the entries and
1807  * wake up anyone waiting.  This code will also simplify the sequence
1808  * and attempt to merge it with entries before and after the sequence.
1809  *
1810  * The map must be locked on entry and will remain locked on return.
1811  *
1812  * Note that you should also pass the start_entry returned by
1813  * vm_map_clip_range().  However, if you block between the two calls
1814  * with the map unlocked please be aware that the start_entry may
1815  * have been clipped and you may need to scan it backwards to find
1816  * the entry corresponding with the original start address.  You are
1817  * responsible for this, vm_map_unclip_range() expects the correct
1818  * start_entry to be passed to it and will KASSERT otherwise.
1819  */
1820 static
1821 void
1822 vm_map_unclip_range(vm_map_t map, vm_map_entry_t start_entry,
1823                     vm_offset_t start, vm_offset_t end,
1824                     int *countp, int flags)
1825 {
1826         vm_map_entry_t entry;
1827
1828         entry = start_entry;
1829
1830         KASSERT(entry->start == start, ("unclip_range: illegal base entry"));
1831         while (entry != &map->header && entry->start < end) {
1832                 KASSERT(entry->eflags & MAP_ENTRY_IN_TRANSITION,
1833                         ("in-transition flag not set during unclip on: %p",
1834                         entry));
1835                 KASSERT(entry->end <= end,
1836                         ("unclip_range: tail wasn't clipped"));
1837                 entry->eflags &= ~MAP_ENTRY_IN_TRANSITION;
1838                 if (entry->eflags & MAP_ENTRY_NEEDS_WAKEUP) {
1839                         entry->eflags &= ~MAP_ENTRY_NEEDS_WAKEUP;
1840                         wakeup(map);
1841                 }
1842                 entry = entry->next;
1843         }
1844
1845         /*
1846          * Simplification does not block so there is no restart case.
1847          */
1848         entry = start_entry;
1849         while (entry != &map->header && entry->start < end) {
1850                 vm_map_simplify_entry(map, entry, countp);
1851                 entry = entry->next;
1852         }
1853 }
1854
1855 /*
1856  * Mark the given range as handled by a subordinate map.
1857  *
1858  * This range must have been created with vm_map_find(), and no other
1859  * operations may have been performed on this range prior to calling
1860  * vm_map_submap().
1861  *
1862  * Submappings cannot be removed.
1863  *
1864  * No requirements.
1865  */
1866 int
1867 vm_map_submap(vm_map_t map, vm_offset_t start, vm_offset_t end, vm_map_t submap)
1868 {
1869         vm_map_entry_t entry;
1870         int result = KERN_INVALID_ARGUMENT;
1871         int count;
1872
1873         count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
1874         vm_map_lock(map);
1875
1876         VM_MAP_RANGE_CHECK(map, start, end);
1877
1878         if (vm_map_lookup_entry(map, start, &entry)) {
1879                 vm_map_clip_start(map, entry, start, &count);
1880         } else {
1881                 entry = entry->next;
1882         }
1883
1884         vm_map_clip_end(map, entry, end, &count);
1885
1886         if ((entry->start == start) && (entry->end == end) &&
1887             ((entry->eflags & MAP_ENTRY_COW) == 0) &&
1888             (entry->object.vm_object == NULL)) {
1889                 entry->object.sub_map = submap;
1890                 entry->maptype = VM_MAPTYPE_SUBMAP;
1891                 result = KERN_SUCCESS;
1892         }
1893         vm_map_unlock(map);
1894         vm_map_entry_release(count);
1895
1896         return (result);
1897 }
1898
1899 /*
1900  * Sets the protection of the specified address region in the target map.
1901  * If "set_max" is specified, the maximum protection is to be set;
1902  * otherwise, only the current protection is affected.
1903  *
1904  * The protection is not applicable to submaps, but is applicable to normal
1905  * maps and maps governed by virtual page tables.  For example, when operating
1906  * on a virtual page table our protection basically controls how COW occurs
1907  * on the backing object, whereas the virtual page table abstraction itself
1908  * is an abstraction for userland.
1909  *
1910  * No requirements.
1911  */
1912 int
1913 vm_map_protect(vm_map_t map, vm_offset_t start, vm_offset_t end,
1914                vm_prot_t new_prot, boolean_t set_max)
1915 {
1916         vm_map_entry_t current;
1917         vm_map_entry_t entry;
1918         int count;
1919
1920         count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
1921         vm_map_lock(map);
1922
1923         VM_MAP_RANGE_CHECK(map, start, end);
1924
1925         if (vm_map_lookup_entry(map, start, &entry)) {
1926                 vm_map_clip_start(map, entry, start, &count);
1927         } else {
1928                 entry = entry->next;
1929         }
1930
1931         /*
1932          * Make a first pass to check for protection violations.
1933          */
1934         current = entry;
1935         while ((current != &map->header) && (current->start < end)) {
1936                 if (current->maptype == VM_MAPTYPE_SUBMAP) {
1937                         vm_map_unlock(map);
1938                         vm_map_entry_release(count);
1939                         return (KERN_INVALID_ARGUMENT);
1940                 }
1941                 if ((new_prot & current->max_protection) != new_prot) {
1942                         vm_map_unlock(map);
1943                         vm_map_entry_release(count);
1944                         return (KERN_PROTECTION_FAILURE);
1945                 }
1946                 current = current->next;
1947         }
1948
1949         /*
1950          * Go back and fix up protections. [Note that clipping is not
1951          * necessary the second time.]
1952          */
1953         current = entry;
1954
1955         while ((current != &map->header) && (current->start < end)) {
1956                 vm_prot_t old_prot;
1957
1958                 vm_map_clip_end(map, current, end, &count);
1959
1960                 old_prot = current->protection;
1961                 if (set_max) {
1962                         current->max_protection = new_prot;
1963                         current->protection = new_prot & old_prot;
1964                 } else {
1965                         current->protection = new_prot;
1966                 }
1967
1968                 /*
1969                  * Update physical map if necessary. Worry about copy-on-write
1970                  * here -- CHECK THIS XXX
1971                  */
1972                 if (current->protection != old_prot) {
1973 #define MASK(entry)     (((entry)->eflags & MAP_ENTRY_COW) ? ~VM_PROT_WRITE : \
1974                                                         VM_PROT_ALL)
1975
1976                         pmap_protect(map->pmap, current->start,
1977                             current->end,
1978                             current->protection & MASK(current));
1979 #undef  MASK
1980                 }
1981
1982                 vm_map_simplify_entry(map, current, &count);
1983
1984                 current = current->next;
1985         }
1986         vm_map_unlock(map);
1987         vm_map_entry_release(count);
1988         return (KERN_SUCCESS);
1989 }
1990
1991 /*
1992  * This routine traverses a processes map handling the madvise
1993  * system call.  Advisories are classified as either those effecting
1994  * the vm_map_entry structure, or those effecting the underlying
1995  * objects.
1996  *
1997  * The <value> argument is used for extended madvise calls.
1998  *
1999  * No requirements.
2000  */
2001 int
2002 vm_map_madvise(vm_map_t map, vm_offset_t start, vm_offset_t end,
2003                int behav, off_t value)
2004 {
2005         vm_map_entry_t current, entry;
2006         int modify_map = 0;
2007         int error = 0;
2008         int count;
2009
2010         /*
2011          * Some madvise calls directly modify the vm_map_entry, in which case
2012          * we need to use an exclusive lock on the map and we need to perform
2013          * various clipping operations.  Otherwise we only need a read-lock
2014          * on the map.
2015          */
2016         count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
2017
2018         switch(behav) {
2019         case MADV_NORMAL:
2020         case MADV_SEQUENTIAL:
2021         case MADV_RANDOM:
2022         case MADV_NOSYNC:
2023         case MADV_AUTOSYNC:
2024         case MADV_NOCORE:
2025         case MADV_CORE:
2026         case MADV_SETMAP:
2027                 modify_map = 1;
2028                 vm_map_lock(map);
2029                 break;
2030         case MADV_INVAL:
2031         case MADV_WILLNEED:
2032         case MADV_DONTNEED:
2033         case MADV_FREE:
2034                 vm_map_lock_read(map);
2035                 break;
2036         default:
2037                 vm_map_entry_release(count);
2038                 return (EINVAL);
2039         }
2040
2041         /*
2042          * Locate starting entry and clip if necessary.
2043          */
2044
2045         VM_MAP_RANGE_CHECK(map, start, end);
2046
2047         if (vm_map_lookup_entry(map, start, &entry)) {
2048                 if (modify_map)
2049                         vm_map_clip_start(map, entry, start, &count);
2050         } else {
2051                 entry = entry->next;
2052         }
2053
2054         if (modify_map) {
2055                 /*
2056                  * madvise behaviors that are implemented in the vm_map_entry.
2057                  *
2058                  * We clip the vm_map_entry so that behavioral changes are
2059                  * limited to the specified address range.
2060                  */
2061                 for (current = entry;
2062                      (current != &map->header) && (current->start < end);
2063                      current = current->next
2064                 ) {
2065                         if (current->maptype == VM_MAPTYPE_SUBMAP)
2066                                 continue;
2067
2068                         vm_map_clip_end(map, current, end, &count);
2069
2070                         switch (behav) {
2071                         case MADV_NORMAL:
2072                                 vm_map_entry_set_behavior(current, MAP_ENTRY_BEHAV_NORMAL);
2073                                 break;
2074                         case MADV_SEQUENTIAL:
2075                                 vm_map_entry_set_behavior(current, MAP_ENTRY_BEHAV_SEQUENTIAL);
2076                                 break;
2077                         case MADV_RANDOM:
2078                                 vm_map_entry_set_behavior(current, MAP_ENTRY_BEHAV_RANDOM);
2079                                 break;
2080                         case MADV_NOSYNC:
2081                                 current->eflags |= MAP_ENTRY_NOSYNC;
2082                                 break;
2083                         case MADV_AUTOSYNC:
2084                                 current->eflags &= ~MAP_ENTRY_NOSYNC;
2085                                 break;
2086                         case MADV_NOCORE:
2087                                 current->eflags |= MAP_ENTRY_NOCOREDUMP;
2088                                 break;
2089                         case MADV_CORE:
2090                                 current->eflags &= ~MAP_ENTRY_NOCOREDUMP;
2091                                 break;
2092                         case MADV_SETMAP:
2093                                 /*
2094                                  * Set the page directory page for a map
2095                                  * governed by a virtual page table.  Mark
2096                                  * the entry as being governed by a virtual
2097                                  * page table if it is not.
2098                                  *
2099                                  * XXX the page directory page is stored
2100                                  * in the avail_ssize field if the map_entry.
2101                                  *
2102                                  * XXX the map simplification code does not
2103                                  * compare this field so weird things may
2104                                  * happen if you do not apply this function
2105                                  * to the entire mapping governed by the
2106                                  * virtual page table.
2107                                  */
2108                                 if (current->maptype != VM_MAPTYPE_VPAGETABLE) {
2109                                         error = EINVAL;
2110                                         break;
2111                                 }
2112                                 current->aux.master_pde = value;
2113                                 pmap_remove(map->pmap,
2114                                             current->start, current->end);
2115                                 break;
2116                         case MADV_INVAL:
2117                                 /*
2118                                  * Invalidate the related pmap entries, used
2119                                  * to flush portions of the real kernel's
2120                                  * pmap when the caller has removed or
2121                                  * modified existing mappings in a virtual
2122                                  * page table.
2123                                  *
2124                                  * (exclusive locked map version does not
2125                                  * need the range interlock).
2126                                  */
2127                                 pmap_remove(map->pmap,
2128                                             current->start, current->end);
2129                                 break;
2130                         default:
2131                                 error = EINVAL;
2132                                 break;
2133                         }
2134                         vm_map_simplify_entry(map, current, &count);
2135                 }
2136                 vm_map_unlock(map);
2137         } else {
2138                 vm_pindex_t pindex;
2139                 vm_pindex_t delta;
2140
2141                 /*
2142                  * madvise behaviors that are implemented in the underlying
2143                  * vm_object.
2144                  *
2145                  * Since we don't clip the vm_map_entry, we have to clip
2146                  * the vm_object pindex and count.
2147                  *
2148                  * NOTE!  These functions are only supported on normal maps,
2149                  *        except MADV_INVAL which is also supported on
2150                  *        virtual page tables.
2151                  */
2152                 for (current = entry;
2153                      (current != &map->header) && (current->start < end);
2154                      current = current->next
2155                 ) {
2156                         vm_offset_t useStart;
2157
2158                         if (current->maptype != VM_MAPTYPE_NORMAL &&
2159                             (current->maptype != VM_MAPTYPE_VPAGETABLE ||
2160                              behav != MADV_INVAL)) {
2161                                 continue;
2162                         }
2163
2164                         pindex = OFF_TO_IDX(current->offset);
2165                         delta = atop(current->end - current->start);
2166                         useStart = current->start;
2167
2168                         if (current->start < start) {
2169                                 pindex += atop(start - current->start);
2170                                 delta -= atop(start - current->start);
2171                                 useStart = start;
2172                         }
2173                         if (current->end > end)
2174                                 delta -= atop(current->end - end);
2175
2176                         if ((vm_spindex_t)delta <= 0)
2177                                 continue;
2178
2179                         if (behav == MADV_INVAL) {
2180                                 /*
2181                                  * Invalidate the related pmap entries, used
2182                                  * to flush portions of the real kernel's
2183                                  * pmap when the caller has removed or
2184                                  * modified existing mappings in a virtual
2185                                  * page table.
2186                                  *
2187                                  * (shared locked map version needs the
2188                                  * interlock, see vm_fault()).
2189                                  */
2190                                 struct vm_map_ilock ilock;
2191
2192                                 KASSERT(useStart >= VM_MIN_USER_ADDRESS &&
2193                                             useStart + ptoa(delta) <=
2194                                             VM_MAX_USER_ADDRESS,
2195                                          ("Bad range %016jx-%016jx (%016jx)",
2196                                          useStart, useStart + ptoa(delta),
2197                                          delta));
2198                                 vm_map_interlock(map, &ilock,
2199                                                  useStart,
2200                                                  useStart + ptoa(delta));
2201                                 pmap_remove(map->pmap,
2202                                             useStart,
2203                                             useStart + ptoa(delta));
2204                                 vm_map_deinterlock(map, &ilock);
2205                         } else {
2206                                 vm_object_madvise(current->object.vm_object,
2207                                                   pindex, delta, behav);
2208                         }
2209
2210                         /*
2211                          * Try to populate the page table.  Mappings governed
2212                          * by virtual page tables cannot be pre-populated
2213                          * without a lot of work so don't try.
2214                          */
2215                         if (behav == MADV_WILLNEED &&
2216                             current->maptype != VM_MAPTYPE_VPAGETABLE) {
2217                                 pmap_object_init_pt(
2218                                     map->pmap,
2219                                     useStart,
2220                                     current->protection,
2221                                     current->object.vm_object,
2222                                     pindex,
2223                                     (count << PAGE_SHIFT),
2224                                     MAP_PREFAULT_MADVISE
2225                                 );
2226                         }
2227                 }
2228                 vm_map_unlock_read(map);
2229         }
2230         vm_map_entry_release(count);
2231         return(error);
2232 }
2233
2234
2235 /*
2236  * Sets the inheritance of the specified address range in the target map.
2237  * Inheritance affects how the map will be shared with child maps at the
2238  * time of vm_map_fork.
2239  */
2240 int
2241 vm_map_inherit(vm_map_t map, vm_offset_t start, vm_offset_t end,
2242                vm_inherit_t new_inheritance)
2243 {
2244         vm_map_entry_t entry;
2245         vm_map_entry_t temp_entry;
2246         int count;
2247
2248         switch (new_inheritance) {
2249         case VM_INHERIT_NONE:
2250         case VM_INHERIT_COPY:
2251         case VM_INHERIT_SHARE:
2252                 break;
2253         default:
2254                 return (KERN_INVALID_ARGUMENT);
2255         }
2256
2257         count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
2258         vm_map_lock(map);
2259
2260         VM_MAP_RANGE_CHECK(map, start, end);
2261
2262         if (vm_map_lookup_entry(map, start, &temp_entry)) {
2263                 entry = temp_entry;
2264                 vm_map_clip_start(map, entry, start, &count);
2265         } else
2266                 entry = temp_entry->next;
2267
2268         while ((entry != &map->header) && (entry->start < end)) {
2269                 vm_map_clip_end(map, entry, end, &count);
2270
2271                 entry->inheritance = new_inheritance;
2272
2273                 vm_map_simplify_entry(map, entry, &count);
2274
2275                 entry = entry->next;
2276         }
2277         vm_map_unlock(map);
2278         vm_map_entry_release(count);
2279         return (KERN_SUCCESS);
2280 }
2281
2282 /*
2283  * Implement the semantics of mlock
2284  */
2285 int
2286 vm_map_unwire(vm_map_t map, vm_offset_t start, vm_offset_t real_end,
2287               boolean_t new_pageable)
2288 {
2289         vm_map_entry_t entry;
2290         vm_map_entry_t start_entry;
2291         vm_offset_t end;
2292         int rv = KERN_SUCCESS;
2293         int count;
2294
2295         count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
2296         vm_map_lock(map);
2297         VM_MAP_RANGE_CHECK(map, start, real_end);
2298         end = real_end;
2299
2300         start_entry = vm_map_clip_range(map, start, end, &count,
2301                                         MAP_CLIP_NO_HOLES);
2302         if (start_entry == NULL) {
2303                 vm_map_unlock(map);
2304                 vm_map_entry_release(count);
2305                 return (KERN_INVALID_ADDRESS);
2306         }
2307
2308         if (new_pageable == 0) {
2309                 entry = start_entry;
2310                 while ((entry != &map->header) && (entry->start < end)) {
2311                         vm_offset_t save_start;
2312                         vm_offset_t save_end;
2313
2314                         /*
2315                          * Already user wired or hard wired (trivial cases)
2316                          */
2317                         if (entry->eflags & MAP_ENTRY_USER_WIRED) {
2318                                 entry = entry->next;
2319                                 continue;
2320                         }
2321                         if (entry->wired_count != 0) {
2322                                 entry->wired_count++;
2323                                 entry->eflags |= MAP_ENTRY_USER_WIRED;
2324                                 entry = entry->next;
2325                                 continue;
2326                         }
2327
2328                         /*
2329                          * A new wiring requires instantiation of appropriate
2330                          * management structures and the faulting in of the
2331                          * page.
2332                          */
2333                         if (entry->maptype == VM_MAPTYPE_NORMAL ||
2334                             entry->maptype == VM_MAPTYPE_VPAGETABLE) {
2335                                 int copyflag = entry->eflags &
2336                                                MAP_ENTRY_NEEDS_COPY;
2337                                 if (copyflag && ((entry->protection &
2338                                                   VM_PROT_WRITE) != 0)) {
2339                                         vm_map_entry_shadow(entry, 0);
2340                                 } else if (entry->object.vm_object == NULL &&
2341                                            !map->system_map) {
2342                                         vm_map_entry_allocate_object(entry);
2343                                 }
2344                         }
2345                         entry->wired_count++;
2346                         entry->eflags |= MAP_ENTRY_USER_WIRED;
2347
2348                         /*
2349                          * Now fault in the area.  Note that vm_fault_wire()
2350                          * may release the map lock temporarily, it will be
2351                          * relocked on return.  The in-transition
2352                          * flag protects the entries.
2353                          */
2354                         save_start = entry->start;
2355                         save_end = entry->end;
2356                         rv = vm_fault_wire(map, entry, TRUE, 0);
2357                         if (rv) {
2358                                 CLIP_CHECK_BACK(entry, save_start);
2359                                 for (;;) {
2360                                         KASSERT(entry->wired_count == 1, ("bad wired_count on entry"));
2361                                         entry->eflags &= ~MAP_ENTRY_USER_WIRED;
2362                                         entry->wired_count = 0;
2363                                         if (entry->end == save_end)
2364                                                 break;
2365                                         entry = entry->next;
2366                                         KASSERT(entry != &map->header, ("bad entry clip during backout"));
2367                                 }
2368                                 end = save_start;       /* unwire the rest */
2369                                 break;
2370                         }
2371                         /*
2372                          * note that even though the entry might have been
2373                          * clipped, the USER_WIRED flag we set prevents
2374                          * duplication so we do not have to do a
2375                          * clip check.
2376                          */
2377                         entry = entry->next;
2378                 }
2379
2380                 /*
2381                  * If we failed fall through to the unwiring section to
2382                  * unwire what we had wired so far.  'end' has already
2383                  * been adjusted.
2384                  */
2385                 if (rv)
2386                         new_pageable = 1;
2387
2388                 /*
2389                  * start_entry might have been clipped if we unlocked the
2390                  * map and blocked.  No matter how clipped it has gotten
2391                  * there should be a fragment that is on our start boundary.
2392                  */
2393                 CLIP_CHECK_BACK(start_entry, start);
2394         }
2395
2396         /*
2397          * Deal with the unwiring case.
2398          */
2399         if (new_pageable) {
2400                 /*
2401                  * This is the unwiring case.  We must first ensure that the
2402                  * range to be unwired is really wired down.  We know there
2403                  * are no holes.
2404                  */
2405                 entry = start_entry;
2406                 while ((entry != &map->header) && (entry->start < end)) {
2407                         if ((entry->eflags & MAP_ENTRY_USER_WIRED) == 0) {
2408                                 rv = KERN_INVALID_ARGUMENT;
2409                                 goto done;
2410                         }
2411                         KASSERT(entry->wired_count != 0, ("wired count was 0 with USER_WIRED set! %p", entry));
2412                         entry = entry->next;
2413                 }
2414
2415                 /*
2416                  * Now decrement the wiring count for each region. If a region
2417                  * becomes completely unwired, unwire its physical pages and
2418                  * mappings.
2419                  */
2420                 /*
2421                  * The map entries are processed in a loop, checking to
2422                  * make sure the entry is wired and asserting it has a wired
2423                  * count. However, another loop was inserted more-or-less in
2424                  * the middle of the unwiring path. This loop picks up the
2425                  * "entry" loop variable from the first loop without first
2426                  * setting it to start_entry. Naturally, the secound loop
2427                  * is never entered and the pages backing the entries are
2428                  * never unwired. This can lead to a leak of wired pages.
2429                  */
2430                 entry = start_entry;
2431                 while ((entry != &map->header) && (entry->start < end)) {
2432                         KASSERT(entry->eflags & MAP_ENTRY_USER_WIRED,
2433                                 ("expected USER_WIRED on entry %p", entry));
2434                         entry->eflags &= ~MAP_ENTRY_USER_WIRED;
2435                         entry->wired_count--;
2436                         if (entry->wired_count == 0)
2437                                 vm_fault_unwire(map, entry);
2438                         entry = entry->next;
2439                 }
2440         }
2441 done:
2442         vm_map_unclip_range(map, start_entry, start, real_end, &count,
2443                 MAP_CLIP_NO_HOLES);
2444         vm_map_unlock(map);
2445         vm_map_entry_release(count);
2446
2447         return (rv);
2448 }
2449
2450 /*
2451  * Sets the pageability of the specified address range in the target map.
2452  * Regions specified as not pageable require locked-down physical
2453  * memory and physical page maps.
2454  *
2455  * The map must not be locked, but a reference must remain to the map
2456  * throughout the call.
2457  *
2458  * This function may be called via the zalloc path and must properly
2459  * reserve map entries for kernel_map.
2460  *
2461  * No requirements.
2462  */
2463 int
2464 vm_map_wire(vm_map_t map, vm_offset_t start, vm_offset_t real_end, int kmflags)
2465 {
2466         vm_map_entry_t entry;
2467         vm_map_entry_t start_entry;
2468         vm_offset_t end;
2469         int rv = KERN_SUCCESS;
2470         int count;
2471
2472         if (kmflags & KM_KRESERVE)
2473                 count = vm_map_entry_kreserve(MAP_RESERVE_COUNT);
2474         else
2475                 count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
2476         vm_map_lock(map);
2477         VM_MAP_RANGE_CHECK(map, start, real_end);
2478         end = real_end;
2479
2480         start_entry = vm_map_clip_range(map, start, end, &count,
2481                                         MAP_CLIP_NO_HOLES);
2482         if (start_entry == NULL) {
2483                 vm_map_unlock(map);
2484                 rv = KERN_INVALID_ADDRESS;
2485                 goto failure;
2486         }
2487         if ((kmflags & KM_PAGEABLE) == 0) {
2488                 /*
2489                  * Wiring.
2490                  *
2491                  * 1.  Holding the write lock, we create any shadow or zero-fill
2492                  * objects that need to be created. Then we clip each map
2493                  * entry to the region to be wired and increment its wiring
2494                  * count.  We create objects before clipping the map entries
2495                  * to avoid object proliferation.
2496                  *
2497                  * 2.  We downgrade to a read lock, and call vm_fault_wire to
2498                  * fault in the pages for any newly wired area (wired_count is
2499                  * 1).
2500                  *
2501                  * Downgrading to a read lock for vm_fault_wire avoids a
2502                  * possible deadlock with another process that may have faulted
2503                  * on one of the pages to be wired (it would mark the page busy,
2504                  * blocking us, then in turn block on the map lock that we
2505                  * hold).  Because of problems in the recursive lock package,
2506                  * we cannot upgrade to a write lock in vm_map_lookup.  Thus,
2507                  * any actions that require the write lock must be done
2508                  * beforehand.  Because we keep the read lock on the map, the
2509                  * copy-on-write status of the entries we modify here cannot
2510                  * change.
2511                  */
2512                 entry = start_entry;
2513                 while ((entry != &map->header) && (entry->start < end)) {
2514                         /*
2515                          * Trivial case if the entry is already wired
2516                          */
2517                         if (entry->wired_count) {
2518                                 entry->wired_count++;
2519                                 entry = entry->next;
2520                                 continue;
2521                         }
2522
2523                         /*
2524                          * The entry is being newly wired, we have to setup
2525                          * appropriate management structures.  A shadow
2526                          * object is required for a copy-on-write region,
2527                          * or a normal object for a zero-fill region.  We
2528                          * do not have to do this for entries that point to sub
2529                          * maps because we won't hold the lock on the sub map.
2530                          */
2531                         if (entry->maptype == VM_MAPTYPE_NORMAL ||
2532                             entry->maptype == VM_MAPTYPE_VPAGETABLE) {
2533                                 int copyflag = entry->eflags &
2534                                                MAP_ENTRY_NEEDS_COPY;
2535                                 if (copyflag && ((entry->protection &
2536                                                   VM_PROT_WRITE) != 0)) {
2537                                         vm_map_entry_shadow(entry, 0);
2538                                 } else if (entry->object.vm_object == NULL &&
2539                                            !map->system_map) {
2540                                         vm_map_entry_allocate_object(entry);
2541                                 }
2542                         }
2543
2544                         entry->wired_count++;
2545                         entry = entry->next;
2546                 }
2547
2548                 /*
2549                  * Pass 2.
2550                  */
2551
2552                 /*
2553                  * HACK HACK HACK HACK
2554                  *
2555                  * vm_fault_wire() temporarily unlocks the map to avoid
2556                  * deadlocks.  The in-transition flag from vm_map_clip_range
2557                  * call should protect us from changes while the map is
2558                  * unlocked.  T
2559                  *
2560                  * NOTE: Previously this comment stated that clipping might
2561                  *       still occur while the entry is unlocked, but from
2562                  *       what I can tell it actually cannot.
2563                  *
2564                  *       It is unclear whether the CLIP_CHECK_*() calls
2565                  *       are still needed but we keep them in anyway.
2566                  *
2567                  * HACK HACK HACK HACK
2568                  */
2569
2570                 entry = start_entry;
2571                 while (entry != &map->header && entry->start < end) {
2572                         /*
2573                          * If vm_fault_wire fails for any page we need to undo
2574                          * what has been done.  We decrement the wiring count
2575                          * for those pages which have not yet been wired (now)
2576                          * and unwire those that have (later).
2577                          */
2578                         vm_offset_t save_start = entry->start;
2579                         vm_offset_t save_end = entry->end;
2580
2581                         if (entry->wired_count == 1)
2582                                 rv = vm_fault_wire(map, entry, FALSE, kmflags);
2583                         if (rv) {
2584                                 CLIP_CHECK_BACK(entry, save_start);
2585                                 for (;;) {
2586                                         KASSERT(entry->wired_count == 1, ("wired_count changed unexpectedly"));
2587                                         entry->wired_count = 0;
2588                                         if (entry->end == save_end)
2589                                                 break;
2590                                         entry = entry->next;
2591                                         KASSERT(entry != &map->header, ("bad entry clip during backout"));
2592                                 }
2593                                 end = save_start;
2594                                 break;
2595                         }
2596                         CLIP_CHECK_FWD(entry, save_end);
2597                         entry = entry->next;
2598                 }
2599
2600                 /*
2601                  * If a failure occured undo everything by falling through
2602                  * to the unwiring code.  'end' has already been adjusted
2603                  * appropriately.
2604                  */
2605                 if (rv)
2606                         kmflags |= KM_PAGEABLE;
2607
2608                 /*
2609                  * start_entry is still IN_TRANSITION but may have been
2610                  * clipped since vm_fault_wire() unlocks and relocks the
2611                  * map.  No matter how clipped it has gotten there should
2612                  * be a fragment that is on our start boundary.
2613                  */
2614                 CLIP_CHECK_BACK(start_entry, start);
2615         }
2616
2617         if (kmflags & KM_PAGEABLE) {
2618                 /*
2619                  * This is the unwiring case.  We must first ensure that the
2620                  * range to be unwired is really wired down.  We know there
2621                  * are no holes.
2622                  */
2623                 entry = start_entry;
2624                 while ((entry != &map->header) && (entry->start < end)) {
2625                         if (entry->wired_count == 0) {
2626                                 rv = KERN_INVALID_ARGUMENT;
2627                                 goto done;
2628                         }
2629                         entry = entry->next;
2630                 }
2631
2632                 /*
2633                  * Now decrement the wiring count for each region. If a region
2634                  * becomes completely unwired, unwire its physical pages and
2635                  * mappings.
2636                  */
2637                 entry = start_entry;
2638                 while ((entry != &map->header) && (entry->start < end)) {
2639                         entry->wired_count--;
2640                         if (entry->wired_count == 0)
2641                                 vm_fault_unwire(map, entry);
2642                         entry = entry->next;
2643                 }
2644         }
2645 done:
2646         vm_map_unclip_range(map, start_entry, start, real_end,
2647                             &count, MAP_CLIP_NO_HOLES);
2648         vm_map_unlock(map);
2649 failure:
2650         if (kmflags & KM_KRESERVE)
2651                 vm_map_entry_krelease(count);
2652         else
2653                 vm_map_entry_release(count);
2654         return (rv);
2655 }
2656
2657 /*
2658  * Mark a newly allocated address range as wired but do not fault in
2659  * the pages.  The caller is expected to load the pages into the object.
2660  *
2661  * The map must be locked on entry and will remain locked on return.
2662  * No other requirements.
2663  */
2664 void
2665 vm_map_set_wired_quick(vm_map_t map, vm_offset_t addr, vm_size_t size,
2666                        int *countp)
2667 {
2668         vm_map_entry_t scan;
2669         vm_map_entry_t entry;
2670
2671         entry = vm_map_clip_range(map, addr, addr + size,
2672                                   countp, MAP_CLIP_NO_HOLES);
2673         for (scan = entry;
2674              scan != &map->header && scan->start < addr + size;
2675              scan = scan->next) {
2676             KKASSERT(scan->wired_count == 0);
2677             scan->wired_count = 1;
2678         }
2679         vm_map_unclip_range(map, entry, addr, addr + size,
2680                             countp, MAP_CLIP_NO_HOLES);
2681 }
2682
2683 /*
2684  * Push any dirty cached pages in the address range to their pager.
2685  * If syncio is TRUE, dirty pages are written synchronously.
2686  * If invalidate is TRUE, any cached pages are freed as well.
2687  *
2688  * This routine is called by sys_msync()
2689  *
2690  * Returns an error if any part of the specified range is not mapped.
2691  *
2692  * No requirements.
2693  */
2694 int
2695 vm_map_clean(vm_map_t map, vm_offset_t start, vm_offset_t end,
2696              boolean_t syncio, boolean_t invalidate)
2697 {
2698         vm_map_entry_t current;
2699         vm_map_entry_t entry;
2700         vm_size_t size;
2701         vm_object_t object;
2702         vm_object_t tobj;
2703         vm_ooffset_t offset;
2704
2705         vm_map_lock_read(map);
2706         VM_MAP_RANGE_CHECK(map, start, end);
2707         if (!vm_map_lookup_entry(map, start, &entry)) {
2708                 vm_map_unlock_read(map);
2709                 return (KERN_INVALID_ADDRESS);
2710         }
2711         lwkt_gettoken(&map->token);
2712
2713         /*
2714          * Make a first pass to check for holes.
2715          */
2716         for (current = entry; current->start < end; current = current->next) {
2717                 if (current->maptype == VM_MAPTYPE_SUBMAP) {
2718                         lwkt_reltoken(&map->token);
2719                         vm_map_unlock_read(map);
2720                         return (KERN_INVALID_ARGUMENT);
2721                 }
2722                 if (end > current->end &&
2723                     (current->next == &map->header ||
2724                         current->end != current->next->start)) {
2725                         lwkt_reltoken(&map->token);
2726                         vm_map_unlock_read(map);
2727                         return (KERN_INVALID_ADDRESS);
2728                 }
2729         }
2730
2731         if (invalidate)
2732                 pmap_remove(vm_map_pmap(map), start, end);
2733
2734         /*
2735          * Make a second pass, cleaning/uncaching pages from the indicated
2736          * objects as we go.
2737          */
2738         for (current = entry; current->start < end; current = current->next) {
2739                 offset = current->offset + (start - current->start);
2740                 size = (end <= current->end ? end : current->end) - start;
2741
2742                 switch(current->maptype) {
2743                 case VM_MAPTYPE_SUBMAP:
2744                 {
2745                         vm_map_t smap;
2746                         vm_map_entry_t tentry;
2747                         vm_size_t tsize;
2748
2749                         smap = current->object.sub_map;
2750                         vm_map_lock_read(smap);
2751                         vm_map_lookup_entry(smap, offset, &tentry);
2752                         tsize = tentry->end - offset;
2753                         if (tsize < size)
2754                                 size = tsize;
2755                         object = tentry->object.vm_object;
2756                         offset = tentry->offset + (offset - tentry->start);
2757                         vm_map_unlock_read(smap);
2758                         break;
2759                 }
2760                 case VM_MAPTYPE_NORMAL:
2761                 case VM_MAPTYPE_VPAGETABLE:
2762                         object = current->object.vm_object;
2763                         break;
2764                 default:
2765                         object = NULL;
2766                         break;
2767                 }
2768
2769                 if (object)
2770                         vm_object_hold(object);
2771
2772                 /*
2773                  * Note that there is absolutely no sense in writing out
2774                  * anonymous objects, so we track down the vnode object
2775                  * to write out.
2776                  * We invalidate (remove) all pages from the address space
2777                  * anyway, for semantic correctness.
2778                  *
2779                  * note: certain anonymous maps, such as MAP_NOSYNC maps,
2780                  * may start out with a NULL object.
2781                  */
2782                 while (object && (tobj = object->backing_object) != NULL) {
2783                         vm_object_hold(tobj);
2784                         if (tobj == object->backing_object) {
2785                                 vm_object_lock_swap();
2786                                 offset += object->backing_object_offset;
2787                                 vm_object_drop(object);
2788                                 object = tobj;
2789                                 if (object->size < OFF_TO_IDX(offset + size))
2790                                         size = IDX_TO_OFF(object->size) -
2791                                                offset;
2792                                 break;
2793                         }
2794                         vm_object_drop(tobj);
2795                 }
2796                 if (object && (object->type == OBJT_VNODE) &&
2797                     (current->protection & VM_PROT_WRITE) &&
2798                     (object->flags & OBJ_NOMSYNC) == 0) {
2799                         /*
2800                          * Flush pages if writing is allowed, invalidate them
2801                          * if invalidation requested.  Pages undergoing I/O
2802                          * will be ignored by vm_object_page_remove().
2803                          *
2804                          * We cannot lock the vnode and then wait for paging
2805                          * to complete without deadlocking against vm_fault.
2806                          * Instead we simply call vm_object_page_remove() and
2807                          * allow it to block internally on a page-by-page
2808                          * basis when it encounters pages undergoing async
2809                          * I/O.
2810                          */
2811                         int flags;
2812
2813                         /* no chain wait needed for vnode objects */
2814                         vm_object_reference_locked(object);
2815                         vn_lock(object->handle, LK_EXCLUSIVE | LK_RETRY);
2816                         flags = (syncio || invalidate) ? OBJPC_SYNC : 0;
2817                         flags |= invalidate ? OBJPC_INVAL : 0;
2818
2819                         /*
2820                          * When operating on a virtual page table just
2821                          * flush the whole object.  XXX we probably ought
2822                          * to
2823                          */
2824                         switch(current->maptype) {
2825                         case VM_MAPTYPE_NORMAL:
2826                                 vm_object_page_clean(object,
2827                                     OFF_TO_IDX(offset),
2828                                     OFF_TO_IDX(offset + size + PAGE_MASK),
2829                                     flags);
2830                                 break;
2831                         case VM_MAPTYPE_VPAGETABLE:
2832                                 vm_object_page_clean(object, 0, 0, flags);
2833                                 break;
2834                         }
2835                         vn_unlock(((struct vnode *)object->handle));
2836                         vm_object_deallocate_locked(object);
2837                 }
2838                 if (object && invalidate &&
2839                    ((object->type == OBJT_VNODE) ||
2840                     (object->type == OBJT_DEVICE) ||
2841                     (object->type == OBJT_MGTDEVICE))) {
2842                         int clean_only =
2843                                 ((object->type == OBJT_DEVICE) ||
2844                                 (object->type == OBJT_MGTDEVICE)) ? FALSE : TRUE;
2845                         /* no chain wait needed for vnode/device objects */
2846                         vm_object_reference_locked(object);
2847                         switch(current->maptype) {
2848                         case VM_MAPTYPE_NORMAL:
2849                                 vm_object_page_remove(object,
2850                                     OFF_TO_IDX(offset),
2851                                     OFF_TO_IDX(offset + size + PAGE_MASK),
2852                                     clean_only);
2853                                 break;
2854                         case VM_MAPTYPE_VPAGETABLE:
2855                                 vm_object_page_remove(object, 0, 0, clean_only);
2856                                 break;
2857                         }
2858                         vm_object_deallocate_locked(object);
2859                 }
2860                 start += size;
2861                 if (object)
2862                         vm_object_drop(object);
2863         }
2864
2865         lwkt_reltoken(&map->token);
2866         vm_map_unlock_read(map);
2867
2868         return (KERN_SUCCESS);
2869 }
2870
2871 /*
2872  * Make the region specified by this entry pageable.
2873  *
2874  * The vm_map must be exclusively locked.
2875  */
2876 static void
2877 vm_map_entry_unwire(vm_map_t map, vm_map_entry_t entry)
2878 {
2879         entry->eflags &= ~MAP_ENTRY_USER_WIRED;
2880         entry->wired_count = 0;
2881         vm_fault_unwire(map, entry);
2882 }
2883
2884 /*
2885  * Deallocate the given entry from the target map.
2886  *
2887  * The vm_map must be exclusively locked.
2888  */
2889 static void
2890 vm_map_entry_delete(vm_map_t map, vm_map_entry_t entry, int *countp)
2891 {
2892         vm_map_entry_unlink(map, entry);
2893         map->size -= entry->end - entry->start;
2894
2895         switch(entry->maptype) {
2896         case VM_MAPTYPE_NORMAL:
2897         case VM_MAPTYPE_VPAGETABLE:
2898         case VM_MAPTYPE_SUBMAP:
2899                 vm_object_deallocate(entry->object.vm_object);
2900                 break;
2901         case VM_MAPTYPE_UKSMAP:
2902                 /* XXX TODO */
2903                 break;
2904         default:
2905                 break;
2906         }
2907
2908         vm_map_entry_dispose(map, entry, countp);
2909 }
2910
2911 /*
2912  * Deallocates the given address range from the target map.
2913  *
2914  * The vm_map must be exclusively locked.
2915  */
2916 int
2917 vm_map_delete(vm_map_t map, vm_offset_t start, vm_offset_t end, int *countp)
2918 {
2919         vm_object_t object;
2920         vm_map_entry_t entry;
2921         vm_map_entry_t first_entry;
2922         vm_offset_t hole_start;
2923
2924         ASSERT_VM_MAP_LOCKED(map);
2925         lwkt_gettoken(&map->token);
2926 again:
2927         /*
2928          * Find the start of the region, and clip it.  Set entry to point
2929          * at the first record containing the requested address or, if no
2930          * such record exists, the next record with a greater address.  The
2931          * loop will run from this point until a record beyond the termination
2932          * address is encountered.
2933          *
2934          * Adjust freehint[] for either the clip case or the extension case.
2935          *
2936          * GGG see other GGG comment.
2937          */
2938         if (vm_map_lookup_entry(map, start, &first_entry)) {
2939                 entry = first_entry;
2940                 vm_map_clip_start(map, entry, start, countp);
2941                 hole_start = start;
2942         } else {
2943                 entry = first_entry->next;
2944                 if (entry == &map->header)
2945                         hole_start = first_entry->start;
2946                 else
2947                         hole_start = first_entry->end;
2948         }
2949
2950         /*
2951          * Step through all entries in this region
2952          */
2953         while ((entry != &map->header) && (entry->start < end)) {
2954                 vm_map_entry_t next;
2955                 vm_offset_t s, e;
2956                 vm_pindex_t offidxstart, offidxend, count;
2957
2958                 /*
2959                  * If we hit an in-transition entry we have to sleep and
2960                  * retry.  It's easier (and not really slower) to just retry
2961                  * since this case occurs so rarely and the hint is already
2962                  * pointing at the right place.  We have to reset the
2963                  * start offset so as not to accidently delete an entry
2964                  * another process just created in vacated space.
2965                  */
2966                 if (entry->eflags & MAP_ENTRY_IN_TRANSITION) {
2967                         entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
2968                         start = entry->start;
2969                         ++mycpu->gd_cnt.v_intrans_coll;
2970                         ++mycpu->gd_cnt.v_intrans_wait;
2971                         vm_map_transition_wait(map, 1);
2972                         goto again;
2973                 }
2974                 vm_map_clip_end(map, entry, end, countp);
2975
2976                 s = entry->start;
2977                 e = entry->end;
2978                 next = entry->next;
2979
2980                 offidxstart = OFF_TO_IDX(entry->offset);
2981                 count = OFF_TO_IDX(e - s);
2982
2983                 switch(entry->maptype) {
2984                 case VM_MAPTYPE_NORMAL:
2985                 case VM_MAPTYPE_VPAGETABLE:
2986                 case VM_MAPTYPE_SUBMAP:
2987                         object = entry->object.vm_object;
2988                         break;
2989                 default:
2990                         object = NULL;
2991                         break;
2992                 }
2993
2994                 /*
2995                  * Unwire before removing addresses from the pmap; otherwise,
2996                  * unwiring will put the entries back in the pmap.
2997                  *
2998                  * Generally speaking, doing a bulk pmap_remove() before
2999                  * removing the pages from the VM object is better at
3000                  * reducing unnecessary IPIs.  The pmap code is now optimized
3001                  * to not blindly iterate the range when pt and pd pages
3002                  * are missing.
3003                  */
3004                 if (entry->wired_count != 0)
3005                         vm_map_entry_unwire(map, entry);
3006
3007                 offidxend = offidxstart + count;
3008
3009                 if (object == &kernel_object) {
3010                         pmap_remove(map->pmap, s, e);
3011                         vm_object_hold(object);
3012                         vm_object_page_remove(object, offidxstart,
3013                                               offidxend, FALSE);
3014                         vm_object_drop(object);
3015                 } else if (object && object->type != OBJT_DEFAULT &&
3016                            object->type != OBJT_SWAP) {
3017                         /*
3018                          * vnode object routines cannot be chain-locked,
3019                          * but since we aren't removing pages from the
3020                          * object here we can use a shared hold.
3021                          */
3022                         vm_object_hold_shared(object);
3023                         pmap_remove(map->pmap, s, e);
3024                         vm_object_drop(object);
3025                 } else if (object) {
3026                         vm_object_hold(object);
3027                         vm_object_chain_acquire(object, 0);
3028                         pmap_remove(map->pmap, s, e);
3029
3030                         if (object != NULL &&
3031                             object->ref_count != 1 &&
3032                             (object->flags & (OBJ_NOSPLIT|OBJ_ONEMAPPING)) ==
3033                              OBJ_ONEMAPPING &&
3034                             (object->type == OBJT_DEFAULT ||
3035                              object->type == OBJT_SWAP)) {
3036                                 /*
3037                                  * When ONEMAPPING is set we can destroy the
3038                                  * pages underlying the entry's range.
3039                                  */
3040                                 vm_object_collapse(object, NULL);
3041                                 vm_object_page_remove(object, offidxstart,
3042                                                       offidxend, FALSE);
3043                                 if (object->type == OBJT_SWAP) {
3044                                         swap_pager_freespace(object,
3045                                                              offidxstart,
3046                                                              count);
3047                                 }
3048                                 if (offidxend >= object->size &&
3049                                     offidxstart < object->size) {
3050                                         object->size = offidxstart;
3051                                 }
3052                         }
3053                         vm_object_chain_release(object);
3054                         vm_object_drop(object);
3055                 } else if (entry->maptype == VM_MAPTYPE_UKSMAP) {
3056                         pmap_remove(map->pmap, s, e);
3057                 }
3058
3059                 /*
3060                  * Delete the entry (which may delete the object) only after
3061                  * removing all pmap entries pointing to its pages.
3062                  * (Otherwise, its page frames may be reallocated, and any
3063                  * modify bits will be set in the wrong object!)
3064                  */
3065                 vm_map_entry_delete(map, entry, countp);
3066                 entry = next;
3067         }
3068         if (entry == &map->header)
3069                 vm_map_freehint_hole(map, hole_start, entry->end - hole_start);
3070         else
3071                 vm_map_freehint_hole(map, hole_start,
3072                                      entry->start - hole_start);
3073
3074         lwkt_reltoken(&map->token);
3075
3076         return (KERN_SUCCESS);
3077 }
3078
3079 /*
3080  * Remove the given address range from the target map.
3081  * This is the exported form of vm_map_delete.
3082  *
3083  * No requirements.
3084  */
3085 int
3086 vm_map_remove(vm_map_t map, vm_offset_t start, vm_offset_t end)
3087 {
3088         int result;
3089         int count;
3090
3091         count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
3092         vm_map_lock(map);
3093         VM_MAP_RANGE_CHECK(map, start, end);
3094         result = vm_map_delete(map, start, end, &count);
3095         vm_map_unlock(map);
3096         vm_map_entry_release(count);
3097
3098         return (result);
3099 }
3100
3101 /*
3102  * Assert that the target map allows the specified privilege on the
3103  * entire address region given.  The entire region must be allocated.
3104  *
3105  * The caller must specify whether the vm_map is already locked or not.
3106  */
3107 boolean_t
3108 vm_map_check_protection(vm_map_t map, vm_offset_t start, vm_offset_t end,
3109                         vm_prot_t protection, boolean_t have_lock)
3110 {
3111         vm_map_entry_t entry;
3112         vm_map_entry_t tmp_entry;
3113         boolean_t result;
3114
3115         if (have_lock == FALSE)
3116                 vm_map_lock_read(map);
3117
3118         if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
3119                 if (have_lock == FALSE)
3120                         vm_map_unlock_read(map);
3121                 return (FALSE);
3122         }
3123         entry = tmp_entry;
3124
3125         result = TRUE;
3126         while (start < end) {
3127                 if (entry == &map->header) {
3128                         result = FALSE;
3129                         break;
3130                 }
3131                 /*
3132                  * No holes allowed!
3133                  */
3134
3135                 if (start < entry->start) {
3136                         result = FALSE;
3137                         break;
3138                 }
3139                 /*
3140                  * Check protection associated with entry.
3141                  */
3142
3143                 if ((entry->protection & protection) != protection) {
3144                         result = FALSE;
3145                         break;
3146                 }
3147                 /* go to next entry */
3148
3149                 start = entry->end;
3150                 entry = entry->next;
3151         }
3152         if (have_lock == FALSE)
3153                 vm_map_unlock_read(map);
3154         return (result);
3155 }
3156
3157 /*
3158  * If appropriate this function shadows the original object with a new object
3159  * and moves the VM pages from the original object to the new object.
3160  * The original object will also be collapsed, if possible.
3161  *
3162  * Caller must supply entry->object.vm_object held and chain_acquired, and
3163  * should chain_release and drop the object upon return.
3164  *
3165  * We can only do this for normal memory objects with a single mapping, and
3166  * it only makes sense to do it if there are 2 or more refs on the original
3167  * object.  i.e. typically a memory object that has been extended into
3168  * multiple vm_map_entry's with non-overlapping ranges.
3169  *
3170  * This makes it easier to remove unused pages and keeps object inheritance
3171  * from being a negative impact on memory usage.
3172  *
3173  * On return the (possibly new) entry->object.vm_object will have an
3174  * additional ref on it for the caller to dispose of (usually by cloning
3175  * the vm_map_entry).  The additional ref had to be done in this routine
3176  * to avoid racing a collapse.  The object's ONEMAPPING flag will also be
3177  * cleared.
3178  *
3179  * The vm_map must be locked and its token held.
3180  */
3181 static void
3182 vm_map_split(vm_map_entry_t entry, vm_object_t oobject)
3183 {
3184         /* OPTIMIZED */
3185         vm_object_t nobject, bobject;
3186         vm_offset_t s, e;
3187         vm_page_t m;
3188         vm_pindex_t offidxstart, offidxend, idx;
3189         vm_size_t size;
3190         vm_ooffset_t offset;
3191         int useshadowlist;
3192
3193         /*
3194          * Optimize away object locks for vnode objects.  Important exit/exec
3195          * critical path.
3196          *
3197          * OBJ_ONEMAPPING doesn't apply to vnode objects but clear the flag
3198          * anyway.
3199          */
3200         if (oobject->type != OBJT_DEFAULT && oobject->type != OBJT_SWAP) {
3201                 vm_object_reference_quick(oobject);
3202                 vm_object_clear_flag(oobject, OBJ_ONEMAPPING);
3203                 return;
3204         }
3205
3206 #if 0
3207         /*
3208          * Original object cannot be split?
3209          */
3210         if (oobject->handle == NULL) {
3211                 vm_object_reference_locked_chain_held(oobject);
3212                 vm_object_clear_flag(oobject, OBJ_ONEMAPPING);
3213                 return;
3214         }
3215 #endif
3216
3217         /*
3218          * Collapse original object with its backing store as an
3219          * optimization to reduce chain lengths when possible.
3220          *
3221          * If ref_count <= 1 there aren't other non-overlapping vm_map_entry's
3222          * for oobject, so there's no point collapsing it.
3223          *
3224          * Then re-check whether the object can be split.
3225          */
3226         vm_object_collapse(oobject, NULL);
3227
3228         if (oobject->ref_count <= 1 ||
3229             (oobject->type != OBJT_DEFAULT && oobject->type != OBJT_SWAP) ||
3230             (oobject->flags & (OBJ_NOSPLIT|OBJ_ONEMAPPING)) != OBJ_ONEMAPPING) {
3231                 vm_object_reference_locked_chain_held(oobject);
3232                 vm_object_clear_flag(oobject, OBJ_ONEMAPPING);
3233                 return;
3234         }
3235
3236         /*
3237          * Acquire the chain lock on the backing object.
3238          *
3239          * Give bobject an additional ref count for when it will be shadowed
3240          * by nobject.
3241          */
3242         useshadowlist = 0;
3243         if ((bobject = oobject->backing_object) != NULL) {
3244                 if (bobject->type != OBJT_VNODE) {
3245                         useshadowlist = 1;
3246                         vm_object_hold(bobject);
3247                         vm_object_chain_wait(bobject, 0);
3248                         /* ref for shadowing below */
3249                         vm_object_reference_locked(bobject);
3250                         vm_object_chain_acquire(bobject, 0);
3251                         KKASSERT(oobject->backing_object == bobject);
3252                         KKASSERT((bobject->flags & OBJ_DEAD) == 0);
3253                 } else {
3254                         /*
3255                          * vnodes are not placed on the shadow list but
3256                          * they still get another ref for the backing_object
3257                          * reference.
3258                          */
3259                         vm_object_reference_quick(bobject);
3260                 }
3261         }
3262
3263         /*
3264          * Calculate the object page range and allocate the new object.
3265          */
3266         offset = entry->offset;
3267         s = entry->start;
3268         e = entry->end;
3269
3270         offidxstart = OFF_TO_IDX(offset);
3271         offidxend = offidxstart + OFF_TO_IDX(e - s);
3272         size = offidxend - offidxstart;
3273
3274         switch(oobject->type) {
3275         case OBJT_DEFAULT:
3276                 nobject = default_pager_alloc(NULL, IDX_TO_OFF(size),
3277                                               VM_PROT_ALL, 0);
3278                 break;
3279         case OBJT_SWAP:
3280                 nobject = swap_pager_alloc(NULL, IDX_TO_OFF(size),
3281                                            VM_PROT_ALL, 0);
3282                 break;
3283         default:
3284                 /* not reached */
3285                 nobject = NULL;
3286                 KKASSERT(0);
3287         }
3288
3289         /*
3290          * If we could not allocate nobject just clear ONEMAPPING on
3291          * oobject and return.
3292          */
3293         if (nobject == NULL) {
3294                 if (bobject) {
3295                         if (useshadowlist) {
3296                                 vm_object_chain_release(bobject);
3297                                 vm_object_deallocate(bobject);
3298                                 vm_object_drop(bobject);
3299                         } else {
3300                                 vm_object_deallocate(bobject);
3301                         }
3302                 }
3303                 vm_object_reference_locked_chain_held(oobject);
3304                 vm_object_clear_flag(oobject, OBJ_ONEMAPPING);
3305                 return;
3306         }
3307
3308         /*
3309          * The new object will replace entry->object.vm_object so it needs
3310          * a second reference (the caller expects an additional ref).
3311          */
3312         vm_object_hold(nobject);
3313         vm_object_reference_locked(nobject);
3314         vm_object_chain_acquire(nobject, 0);
3315
3316         /*
3317          * nobject shadows bobject (oobject already shadows bobject).
3318          *
3319          * Adding an object to bobject's shadow list requires refing bobject
3320          * which we did above in the useshadowlist case.
3321          *
3322          * XXX it is unclear if we need to clear ONEMAPPING on bobject here
3323          *     or not.
3324          */
3325         if (bobject) {
3326                 nobject->backing_object_offset =
3327                     oobject->backing_object_offset + IDX_TO_OFF(offidxstart);
3328                 nobject->backing_object = bobject;
3329                 if (useshadowlist) {
3330                         bobject->shadow_count++;
3331                         atomic_add_int(&bobject->generation, 1);
3332                         LIST_INSERT_HEAD(&bobject->shadow_head,
3333                                          nobject, shadow_list);
3334                         vm_object_clear_flag(bobject, OBJ_ONEMAPPING); /*XXX*/
3335                         vm_object_set_flag(nobject, OBJ_ONSHADOW);
3336                 }
3337         }
3338
3339         /*
3340          * Move the VM pages from oobject to nobject
3341          */
3342         for (idx = 0; idx < size; idx++) {
3343                 vm_page_t m;
3344
3345                 m = vm_page_lookup_busy_wait(oobject, offidxstart + idx,
3346                                              TRUE, "vmpg");
3347                 if (m == NULL)
3348                         continue;
3349
3350                 /*
3351                  * We must wait for pending I/O to complete before we can
3352                  * rename the page.
3353                  *
3354                  * We do not have to VM_PROT_NONE the page as mappings should
3355                  * not be changed by this operation.
3356                  *
3357                  * NOTE: The act of renaming a page updates chaingen for both
3358                  *       objects.
3359                  */
3360                 vm_page_rename(m, nobject, idx);
3361                 /* page automatically made dirty by rename and cache handled */
3362                 /* page remains busy */
3363         }
3364
3365         if (oobject->type == OBJT_SWAP) {
3366                 vm_object_pip_add(oobject, 1);
3367                 /*
3368                  * copy oobject pages into nobject and destroy unneeded
3369                  * pages in shadow object.
3370                  */
3371                 swap_pager_copy(oobject, nobject, offidxstart, 0);
3372                 vm_object_pip_wakeup(oobject);
3373         }
3374
3375         /*
3376          * Wakeup the pages we played with.  No spl protection is needed
3377          * for a simple wakeup.
3378          */
3379         for (idx = 0; idx < size; idx++) {
3380                 m = vm_page_lookup(nobject, idx);
3381                 if (m) {
3382                         KKASSERT(m->busy_count & PBUSY_LOCKED);
3383                         vm_page_wakeup(m);
3384                 }
3385         }
3386         entry->object.vm_object = nobject;
3387         entry->offset = 0LL;
3388
3389         /*
3390          * The map is being split and nobject is going to wind up on both
3391          * vm_map_entry's, so make sure OBJ_ONEMAPPING is cleared on
3392          * nobject.
3393          */
3394         vm_object_clear_flag(nobject, OBJ_ONEMAPPING);
3395
3396         /*
3397          * Cleanup
3398          *
3399          * NOTE: There is no need to remove OBJ_ONEMAPPING from oobject, the
3400          *       related pages were moved and are no longer applicable to the
3401          *       original object.
3402          *
3403          * NOTE: Deallocate oobject (due to its entry->object.vm_object being
3404          *       replaced by nobject).
3405          */
3406         vm_object_chain_release(nobject);
3407         vm_object_drop(nobject);
3408         if (bobject && useshadowlist) {
3409                 vm_object_chain_release(bobject);
3410                 vm_object_drop(bobject);
3411         }
3412
3413 #if 0
3414         if (oobject->resident_page_count) {
3415                 kprintf("oobject %p still contains %jd pages!\n",
3416                         oobject, (intmax_t)oobject->resident_page_count);
3417                 for (idx = 0; idx < size; idx++) {
3418                         vm_page_t m;
3419
3420                         m = vm_page_lookup_busy_wait(oobject, offidxstart + idx,
3421                                                      TRUE, "vmpg");
3422                         if (m) {
3423                                 kprintf("oobject %p idx %jd\n",
3424                                         oobject,
3425                                         offidxstart + idx);
3426                                 vm_page_wakeup(m);
3427                         }
3428                 }
3429         }
3430 #endif
3431         /*vm_object_clear_flag(oobject, OBJ_ONEMAPPING);*/
3432         vm_object_deallocate_locked(oobject);
3433 }
3434
3435 /*
3436  * Copies the contents of the source entry to the destination
3437  * entry.  The entries *must* be aligned properly.
3438  *
3439  * The vm_maps must be exclusively locked.
3440  * The vm_map's token must be held.
3441  *
3442  * Because the maps are locked no faults can be in progress during the
3443  * operation.
3444  */
3445 static void
3446 vm_map_copy_entry(vm_map_t src_map, vm_map_t dst_map,
3447                   vm_map_entry_t src_entry, vm_map_entry_t dst_entry)
3448 {
3449         vm_object_t src_object;
3450         vm_object_t oobject;
3451
3452         if (dst_entry->maptype == VM_MAPTYPE_SUBMAP ||
3453             dst_entry->maptype == VM_MAPTYPE_UKSMAP)
3454                 return;
3455         if (src_entry->maptype == VM_MAPTYPE_SUBMAP ||
3456             src_entry->maptype == VM_MAPTYPE_UKSMAP)
3457                 return;
3458
3459         if (src_entry->wired_count == 0) {
3460                 /*
3461                  * If the source entry is marked needs_copy, it is already
3462                  * write-protected.
3463                  *
3464                  * To avoid interacting with a vm_fault that might have
3465                  * released its vm_map, we must acquire the fronting
3466                  * object.
3467                  */
3468                 oobject = src_entry->object.vm_object;
3469                 if (oobject) {
3470                         vm_object_hold(oobject);
3471                         vm_object_chain_acquire(oobject, 0);
3472                 }
3473
3474                 if ((src_entry->eflags & MAP_ENTRY_NEEDS_COPY) == 0) {
3475                         pmap_protect(src_map->pmap,
3476                             src_entry->start,
3477                             src_entry->end,
3478                             src_entry->protection & ~VM_PROT_WRITE);
3479                 }
3480
3481                 /*
3482                  * Make a copy of the object.
3483                  *
3484                  * The object must be locked prior to checking the object type
3485                  * and for the call to vm_object_collapse() and vm_map_split().
3486                  * We cannot use *_hold() here because the split code will
3487                  * probably try to destroy the object.  The lock is a pool
3488                  * token and doesn't care.
3489                  *
3490                  * We must bump src_map->timestamp when setting
3491                  * MAP_ENTRY_NEEDS_COPY to force any concurrent fault
3492                  * to retry, otherwise the concurrent fault might improperly
3493                  * install a RW pte when its supposed to be a RO(COW) pte.
3494                  * This race can occur because a vnode-backed fault may have
3495                  * to temporarily release the map lock.  This was handled
3496                  * when the caller locked the map exclusively.
3497                  */
3498                 if (oobject) {
3499                         vm_map_split(src_entry, oobject);
3500
3501                         src_object = src_entry->object.vm_object;
3502                         dst_entry->object.vm_object = src_object;
3503                         src_entry->eflags |= (MAP_ENTRY_COW |
3504                                               MAP_ENTRY_NEEDS_COPY);
3505                         dst_entry->eflags |= (MAP_ENTRY_COW |
3506                                               MAP_ENTRY_NEEDS_COPY);
3507                         dst_entry->offset = src_entry->offset;
3508                 } else {
3509                         dst_entry->object.vm_object = NULL;
3510                         dst_entry->offset = 0;
3511                 }
3512                 pmap_copy(dst_map->pmap, src_map->pmap, dst_entry->start,
3513                           dst_entry->end - dst_entry->start,
3514                           src_entry->start);
3515                 if (oobject) {
3516                         vm_object_chain_release(oobject);
3517                         vm_object_drop(oobject);
3518                 }
3519         } else {
3520                 /*
3521                  * Of course, wired down pages can't be set copy-on-write.
3522                  * Cause wired pages to be copied into the new map by
3523                  * simulating faults (the new pages are pageable)
3524                  */
3525                 vm_fault_copy_entry(dst_map, src_map, dst_entry, src_entry);
3526         }
3527 }
3528
3529 /*
3530  * vmspace_fork:
3531  * Create a new process vmspace structure and vm_map
3532  * based on those of an existing process.  The new map
3533  * is based on the old map, according to the inheritance
3534  * values on the regions in that map.
3535  *
3536  * The source map must not be locked.
3537  * No requirements.
3538  */
3539 static void vmspace_fork_normal_entry(vm_map_t old_map, vm_map_t new_map,
3540                           vm_map_entry_t old_entry, int *countp);
3541 static void vmspace_fork_uksmap_entry(vm_map_t old_map, vm_map_t new_map,
3542                           vm_map_entry_t old_entry, int *countp);
3543
3544 struct vmspace *
3545 vmspace_fork(struct vmspace *vm1)
3546 {
3547         struct vmspace *vm2;
3548         vm_map_t old_map = &vm1->vm_map;
3549         vm_map_t new_map;
3550         vm_map_entry_t old_entry;
3551         int count;
3552
3553         lwkt_gettoken(&vm1->vm_map.token);
3554         vm_map_lock(old_map);
3555
3556         vm2 = vmspace_alloc(old_map->min_offset, old_map->max_offset);
3557         lwkt_gettoken(&vm2->vm_map.token);
3558
3559         /*
3560          * We must bump the timestamp to force any concurrent fault
3561          * to retry.
3562          */
3563         bcopy(&vm1->vm_startcopy, &vm2->vm_startcopy,
3564               (caddr_t)&vm1->vm_endcopy - (caddr_t)&vm1->vm_startcopy);
3565         new_map = &vm2->vm_map; /* XXX */
3566         new_map->timestamp = 1;
3567
3568         vm_map_lock(new_map);
3569
3570         count = 0;
3571         old_entry = old_map->header.next;
3572         while (old_entry != &old_map->header) {
3573                 ++count;
3574                 old_entry = old_entry->next;
3575         }
3576
3577         count = vm_map_entry_reserve(count + MAP_RESERVE_COUNT);
3578
3579         old_entry = old_map->header.next;
3580         while (old_entry != &old_map->header) {
3581                 switch(old_entry->maptype) {
3582                 case VM_MAPTYPE_SUBMAP:
3583                         panic("vm_map_fork: encountered a submap");
3584                         break;
3585                 case VM_MAPTYPE_UKSMAP:
3586                         vmspace_fork_uksmap_entry(old_map, new_map,
3587                                                   old_entry, &count);
3588                         break;
3589                 case VM_MAPTYPE_NORMAL:
3590                 case VM_MAPTYPE_VPAGETABLE:
3591                         vmspace_fork_normal_entry(old_map, new_map,
3592                                                   old_entry, &count);
3593                         break;
3594                 }
3595                 old_entry = old_entry->next;
3596         }
3597
3598         new_map->size = old_map->size;
3599         vm_map_unlock(old_map);
3600         vm_map_unlock(new_map);
3601         vm_map_entry_release(count);
3602
3603         lwkt_reltoken(&vm2->vm_map.token);
3604         lwkt_reltoken(&vm1->vm_map.token);
3605
3606         return (vm2);
3607 }
3608
3609 static
3610 void
3611 vmspace_fork_normal_entry(vm_map_t old_map, vm_map_t new_map,
3612                           vm_map_entry_t old_entry, int *countp)
3613 {
3614         vm_map_entry_t new_entry;
3615         vm_object_t object;
3616
3617         switch (old_entry->inheritance) {
3618         case VM_INHERIT_NONE:
3619                 break;
3620         case VM_INHERIT_SHARE:
3621                 /*
3622                  * Clone the entry, creating the shared object if
3623                  * necessary.
3624                  */
3625                 if (old_entry->object.vm_object == NULL)
3626                         vm_map_entry_allocate_object(old_entry);
3627
3628                 if (old_entry->eflags & MAP_ENTRY_NEEDS_COPY) {
3629                         /*
3630                          * Shadow a map_entry which needs a copy,
3631                          * replacing its object with a new object
3632                          * that points to the old one.  Ask the
3633                          * shadow code to automatically add an
3634                          * additional ref.  We can't do it afterwords
3635                          * because we might race a collapse.  The call
3636                          * to vm_map_entry_shadow() will also clear
3637                          * OBJ_ONEMAPPING.
3638                          */
3639                         vm_map_entry_shadow(old_entry, 1);
3640                 } else if (old_entry->object.vm_object) {
3641                         /*
3642                          * We will make a shared copy of the object,
3643                          * and must clear OBJ_ONEMAPPING.
3644                          *
3645                          * Optimize vnode objects.  OBJ_ONEMAPPING
3646                          * is non-applicable but clear it anyway,
3647                          * and its terminal so we don't have to deal
3648                          * with chains.  Reduces SMP conflicts.
3649                          *
3650                          * XXX assert that object.vm_object != NULL
3651                          *     since we allocate it above.
3652                          */
3653                         object = old_entry->object.vm_object;
3654                         if (object->type == OBJT_VNODE) {
3655                                 vm_object_reference_quick(object);
3656                                 vm_object_clear_flag(object,
3657                                                      OBJ_ONEMAPPING);
3658                         } else {
3659                                 vm_object_hold(object);
3660                                 vm_object_chain_wait(object, 0);
3661                                 vm_object_reference_locked(object);
3662                                 vm_object_clear_flag(object,
3663                                                      OBJ_ONEMAPPING);
3664                                 vm_object_drop(object);
3665                         }
3666                 }
3667
3668                 /*
3669                  * Clone the entry.  We've already bumped the ref on
3670                  * any vm_object.
3671                  */
3672                 new_entry = vm_map_entry_create(new_map, countp);
3673                 *new_entry = *old_entry;
3674                 new_entry->eflags &= ~MAP_ENTRY_USER_WIRED;
3675                 new_entry->wired_count = 0;
3676
3677                 /*
3678                  * Insert the entry into the new map -- we know we're
3679                  * inserting at the end of the new map.
3680                  */
3681                 vm_map_entry_link(new_map, new_map->header.prev,
3682                                   new_entry);
3683
3684                 /*
3685                  * Update the physical map
3686                  */
3687                 pmap_copy(new_map->pmap, old_map->pmap,
3688                           new_entry->start,
3689                           (old_entry->end - old_entry->start),
3690                           old_entry->start);
3691                 break;
3692         case VM_INHERIT_COPY:
3693                 /*
3694                  * Clone the entry and link into the map.
3695                  */
3696                 new_entry = vm_map_entry_create(new_map, countp);
3697                 *new_entry = *old_entry;
3698                 new_entry->eflags &= ~MAP_ENTRY_USER_WIRED;
3699                 new_entry->wired_count = 0;
3700                 new_entry->object.vm_object = NULL;
3701                 vm_map_entry_link(new_map, new_map->header.prev,
3702                                   new_entry);
3703                 vm_map_copy_entry(old_map, new_map, old_entry,
3704                                   new_entry);
3705                 break;
3706         }
3707 }
3708
3709 /*
3710  * When forking user-kernel shared maps, the map might change in the
3711  * child so do not try to copy the underlying pmap entries.
3712  */
3713 static
3714 void
3715 vmspace_fork_uksmap_entry(vm_map_t old_map, vm_map_t new_map,
3716                           vm_map_entry_t old_entry, int *countp)
3717 {
3718         vm_map_entry_t new_entry;
3719
3720         new_entry = vm_map_entry_create(new_map, countp);
3721         *new_entry = *old_entry;
3722         new_entry->eflags &= ~MAP_ENTRY_USER_WIRED;
3723         new_entry->wired_count = 0;
3724         vm_map_entry_link(new_map, new_map->header.prev,
3725                           new_entry);
3726 }
3727
3728 /*
3729  * Create an auto-grow stack entry
3730  *
3731  * No requirements.
3732  */
3733 int
3734 vm_map_stack (vm_map_t map, vm_offset_t addrbos, vm_size_t max_ssize,
3735               int flags, vm_prot_t prot, vm_prot_t max, int cow)
3736 {
3737         vm_map_entry_t  prev_entry;
3738         vm_map_entry_t  new_stack_entry;
3739         vm_size_t       init_ssize;
3740         int             rv;
3741         int             count;
3742         vm_offset_t     tmpaddr;
3743
3744         cow |= MAP_IS_STACK;
3745
3746         if (max_ssize < sgrowsiz)
3747                 init_ssize = max_ssize;
3748         else
3749                 init_ssize = sgrowsiz;
3750
3751         count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
3752         vm_map_lock(map);
3753
3754         /*
3755          * Find space for the mapping
3756          */
3757         if ((flags & (MAP_FIXED | MAP_TRYFIXED)) == 0) {
3758                 if (vm_map_findspace(map, addrbos, max_ssize, 1,
3759                                      flags, &tmpaddr)) {
3760                         vm_map_unlock(map);
3761                         vm_map_entry_release(count);
3762                         return (KERN_NO_SPACE);
3763                 }
3764                 addrbos = tmpaddr;
3765         }
3766
3767         /* If addr is already mapped, no go */
3768         if (vm_map_lookup_entry(map, addrbos, &prev_entry)) {
3769                 vm_map_unlock(map);
3770                 vm_map_entry_release(count);
3771                 return (KERN_NO_SPACE);
3772         }
3773
3774 #if 0
3775         /* XXX already handled by kern_mmap() */
3776         /* If we would blow our VMEM resource limit, no go */
3777         if (map->size + init_ssize >
3778             curproc->p_rlimit[RLIMIT_VMEM].rlim_cur) {
3779                 vm_map_unlock(map);
3780                 vm_map_entry_release(count);
3781                 return (KERN_NO_SPACE);
3782         }
3783 #endif
3784
3785         /*
3786          * If we can't accomodate max_ssize in the current mapping,
3787          * no go.  However, we need to be aware that subsequent user
3788          * mappings might map into the space we have reserved for
3789          * stack, and currently this space is not protected.
3790          *
3791          * Hopefully we will at least detect this condition
3792          * when we try to grow the stack.
3793          */
3794         if ((prev_entry->next != &map->header) &&
3795             (prev_entry->next->start < addrbos + max_ssize)) {
3796                 vm_map_unlock(map);
3797                 vm_map_entry_release(count);
3798                 return (KERN_NO_SPACE);
3799         }
3800
3801         /*
3802          * We initially map a stack of only init_ssize.  We will
3803          * grow as needed later.  Since this is to be a grow
3804          * down stack, we map at the top of the range.
3805          *
3806          * Note: we would normally expect prot and max to be
3807          * VM_PROT_ALL, and cow to be 0.  Possibly we should
3808          * eliminate these as input parameters, and just
3809          * pass these values here in the insert call.
3810          */
3811         rv = vm_map_insert(map, &count, NULL, NULL,
3812                            0, addrbos + max_ssize - init_ssize,
3813                            addrbos + max_ssize,
3814                            VM_MAPTYPE_NORMAL,
3815                            VM_SUBSYS_STACK, prot, max, cow);
3816
3817         /* Now set the avail_ssize amount */
3818         if (rv == KERN_SUCCESS) {
3819                 if (prev_entry != &map->header)
3820                         vm_map_clip_end(map, prev_entry, addrbos + max_ssize - init_ssize, &count);
3821                 new_stack_entry = prev_entry->next;
3822                 if (new_stack_entry->end   != addrbos + max_ssize ||
3823                     new_stack_entry->start != addrbos + max_ssize - init_ssize)
3824                         panic ("Bad entry start/end for new stack entry");
3825                 else
3826                         new_stack_entry->aux.avail_ssize = max_ssize - init_ssize;
3827         }
3828
3829         vm_map_unlock(map);
3830         vm_map_entry_release(count);
3831         return (rv);
3832 }
3833
3834 /*
3835  * Attempts to grow a vm stack entry.  Returns KERN_SUCCESS if the
3836  * desired address is already mapped, or if we successfully grow
3837  * the stack.  Also returns KERN_SUCCESS if addr is outside the
3838  * stack range (this is strange, but preserves compatibility with
3839  * the grow function in vm_machdep.c).
3840  *
3841  * No requirements.
3842  */
3843 int
3844 vm_map_growstack (vm_map_t map, vm_offset_t addr)
3845 {
3846         vm_map_entry_t prev_entry;
3847         vm_map_entry_t stack_entry;
3848         vm_map_entry_t new_stack_entry;
3849         struct vmspace *vm;
3850         struct lwp *lp;
3851         struct proc *p;
3852         vm_offset_t    end;
3853         int grow_amount;
3854         int rv = KERN_SUCCESS;
3855         int is_procstack;
3856         int use_read_lock = 1;
3857         int count;
3858
3859         /*
3860          * Find the vm
3861          */
3862         lp = curthread->td_lwp;
3863         p = curthread->td_proc;
3864         KKASSERT(lp != NULL);
3865         vm = lp->lwp_vmspace;
3866
3867         /*
3868          * Growstack is only allowed on the current process.  We disallow
3869          * other use cases, e.g. trying to access memory via procfs that
3870          * the stack hasn't grown into.
3871          */
3872         if (map != &vm->vm_map) {
3873                 return KERN_FAILURE;
3874         }
3875
3876         count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
3877 Retry:
3878         if (use_read_lock)
3879                 vm_map_lock_read(map);
3880         else
3881                 vm_map_lock(map);
3882
3883         /* If addr is already in the entry range, no need to grow.*/
3884         if (vm_map_lookup_entry(map, addr, &prev_entry))
3885                 goto done;
3886
3887         if ((stack_entry = prev_entry->next) == &map->header)
3888                 goto done;
3889         if (prev_entry == &map->header)
3890                 end = stack_entry->start - stack_entry->aux.avail_ssize;
3891         else
3892                 end = prev_entry->end;
3893
3894         /*
3895          * This next test mimics the old grow function in vm_machdep.c.
3896          * It really doesn't quite make sense, but we do it anyway
3897          * for compatibility.
3898          *
3899          * If not growable stack, return success.  This signals the
3900          * caller to proceed as he would normally with normal vm.
3901          */
3902         if (stack_entry->aux.avail_ssize < 1 ||
3903             addr >= stack_entry->start ||
3904             addr <  stack_entry->start - stack_entry->aux.avail_ssize) {
3905                 goto done;
3906         }
3907
3908         /* Find the minimum grow amount */
3909         grow_amount = roundup (stack_entry->start - addr, PAGE_SIZE);
3910         if (grow_amount > stack_entry->aux.avail_ssize) {
3911                 rv = KERN_NO_SPACE;
3912                 goto done;
3913         }
3914
3915         /*
3916          * If there is no longer enough space between the entries
3917          * nogo, and adjust the available space.  Note: this
3918          * should only happen if the user has mapped into the
3919          * stack area after the stack was created, and is
3920          * probably an error.
3921          *
3922          * This also effectively destroys any guard page the user
3923          * might have intended by limiting the stack size.
3924          */
3925         if (grow_amount > stack_entry->start - end) {
3926                 if (use_read_lock && vm_map_lock_upgrade(map)) {
3927                         /* lost lock */
3928                         use_read_lock = 0;
3929                         goto Retry;
3930                 }
3931                 use_read_lock = 0;
3932                 stack_entry->aux.avail_ssize = stack_entry->start - end;
3933                 rv = KERN_NO_SPACE;
3934                 goto done;
3935         }
3936
3937         is_procstack = addr >= (vm_offset_t)vm->vm_maxsaddr;
3938
3939         /* If this is the main process stack, see if we're over the
3940          * stack limit.
3941          */
3942         if (is_procstack && (ctob(vm->vm_ssize) + grow_amount >
3943                              p->p_rlimit[RLIMIT_STACK].rlim_cur)) {
3944                 rv = KERN_NO_SPACE;
3945                 goto done;
3946         }
3947
3948         /* Round up the grow amount modulo SGROWSIZ */
3949         grow_amount = roundup (grow_amount, sgrowsiz);
3950         if (grow_amount > stack_entry->aux.avail_ssize) {
3951                 grow_amount = stack_entry->aux.avail_ssize;
3952         }
3953         if (is_procstack && (ctob(vm->vm_ssize) + grow_amount >
3954                              p->p_rlimit[RLIMIT_STACK].rlim_cur)) {
3955                 grow_amount = p->p_rlimit[RLIMIT_STACK].rlim_cur -
3956                               ctob(vm->vm_ssize);
3957         }
3958
3959         /* If we would blow our VMEM resource limit, no go */
3960         if (map->size + grow_amount > p->p_rlimit[RLIMIT_VMEM].rlim_cur) {
3961                 rv = KERN_NO_SPACE;
3962                 goto done;
3963         }
3964
3965         if (use_read_lock && vm_map_lock_upgrade(map)) {
3966                 /* lost lock */
3967                 use_read_lock = 0;
3968                 goto Retry;
3969         }
3970         use_read_lock = 0;
3971
3972         /* Get the preliminary new entry start value */
3973         addr = stack_entry->start - grow_amount;
3974
3975         /* If this puts us into the previous entry, cut back our growth
3976          * to the available space.  Also, see the note above.
3977          */
3978         if (addr < end) {
3979                 stack_entry->aux.avail_ssize = stack_entry->start - end;
3980                 addr = end;
3981         }
3982
3983         rv = vm_map_insert(map, &count, NULL, NULL,
3984                            0, addr, stack_entry->start,
3985                            VM_MAPTYPE_NORMAL,
3986                            VM_SUBSYS_STACK, VM_PROT_ALL, VM_PROT_ALL, 0);
3987
3988         /* Adjust the available stack space by the amount we grew. */
3989         if (rv == KERN_SUCCESS) {
3990                 if (prev_entry != &map->header)
3991                         vm_map_clip_end(map, prev_entry, addr, &count);
3992                 new_stack_entry = prev_entry->next;
3993                 if (new_stack_entry->end   != stack_entry->start  ||
3994                     new_stack_entry->start != addr)
3995                         panic ("Bad stack grow start/end in new stack entry");
3996                 else {
3997                         new_stack_entry->aux.avail_ssize =
3998                                 stack_entry->aux.avail_ssize -
3999                                 (new_stack_entry->end - new_stack_entry->start);
4000                         if (is_procstack)
4001                                 vm->vm_ssize += btoc(new_stack_entry->end -
4002                                                      new_stack_entry->start);
4003                 }
4004
4005                 if (map->flags & MAP_WIREFUTURE)
4006                         vm_map_unwire(map, new_stack_entry->start,
4007                                       new_stack_entry->end, FALSE);
4008         }
4009
4010 done:
4011         if (use_read_lock)
4012                 vm_map_unlock_read(map);
4013         else
4014                 vm_map_unlock(map);
4015         vm_map_entry_release(count);
4016         return (rv);
4017 }
4018
4019 /*
4020  * Unshare the specified VM space for exec.  If other processes are
4021  * mapped to it, then create a new one.  The new vmspace is null.
4022  *
4023  * No requirements.
4024  */
4025 void
4026 vmspace_exec(struct proc *p, struct vmspace *vmcopy)
4027 {
4028         struct vmspace *oldvmspace = p->p_vmspace;
4029         struct vmspace *newvmspace;
4030         vm_map_t map = &p->p_vmspace->vm_map;
4031
4032         /*
4033          * If we are execing a resident vmspace we fork it, otherwise
4034          * we create a new vmspace.  Note that exitingcnt is not
4035          * copied to the new vmspace.
4036          */
4037         lwkt_gettoken(&oldvmspace->vm_map.token);
4038         if (vmcopy)  {
4039                 newvmspace = vmspace_fork(vmcopy);
4040                 lwkt_gettoken(&newvmspace->vm_map.token);
4041         } else {
4042                 newvmspace = vmspace_alloc(map->min_offset, map->max_offset);
4043                 lwkt_gettoken(&newvmspace->vm_map.token);
4044                 bcopy(&oldvmspace->vm_startcopy, &newvmspace->vm_startcopy,
4045                       (caddr_t)&oldvmspace->vm_endcopy -
4046                        (caddr_t)&oldvmspace->vm_startcopy);
4047         }
4048
4049         /*
4050          * Finish initializing the vmspace before assigning it
4051          * to the process.  The vmspace will become the current vmspace
4052          * if p == curproc.
4053          */
4054         pmap_pinit2(vmspace_pmap(newvmspace));
4055         pmap_replacevm(p, newvmspace, 0);
4056         lwkt_reltoken(&newvmspace->vm_map.token);
4057         lwkt_reltoken(&oldvmspace->vm_map.token);
4058         vmspace_rel(oldvmspace);
4059 }
4060
4061 /*
4062  * Unshare the specified VM space for forcing COW.  This
4063  * is called by rfork, for the (RFMEM|RFPROC) == 0 case.
4064  */
4065 void
4066 vmspace_unshare(struct proc *p)
4067 {
4068         struct vmspace *oldvmspace = p->p_vmspace;
4069         struct vmspace *newvmspace;
4070
4071         lwkt_gettoken(&oldvmspace->vm_map.token);
4072         if (vmspace_getrefs(oldvmspace) == 1) {
4073                 lwkt_reltoken(&oldvmspace->vm_map.token);
4074                 return;
4075         }
4076         newvmspace = vmspace_fork(oldvmspace);
4077         lwkt_gettoken(&newvmspace->vm_map.token);
4078         pmap_pinit2(vmspace_pmap(newvmspace));
4079         pmap_replacevm(p, newvmspace, 0);
4080         lwkt_reltoken(&newvmspace->vm_map.token);
4081         lwkt_reltoken(&oldvmspace->vm_map.token);
4082         vmspace_rel(oldvmspace);
4083 }
4084
4085 /*
4086  * vm_map_hint: return the beginning of the best area suitable for
4087  * creating a new mapping with "prot" protection.
4088  *
4089  * No requirements.
4090  */
4091 vm_offset_t
4092 vm_map_hint(struct proc *p, vm_offset_t addr, vm_prot_t prot)
4093 {
4094         struct vmspace *vms = p->p_vmspace;
4095
4096         if (!randomize_mmap || addr != 0) {
4097                 /*
4098                  * Set a reasonable start point for the hint if it was
4099                  * not specified or if it falls within the heap space.
4100                  * Hinted mmap()s do not allocate out of the heap space.
4101                  */
4102                 if (addr == 0 ||
4103                     (addr >= round_page((vm_offset_t)vms->vm_taddr) &&
4104                      addr < round_page((vm_offset_t)vms->vm_daddr + maxdsiz))) {
4105                         addr = round_page((vm_offset_t)vms->vm_daddr + maxdsiz);
4106                 }
4107
4108                 return addr;
4109         }
4110         addr = (vm_offset_t)vms->vm_daddr + MAXDSIZ;
4111         addr += karc4random() & (MIN((256 * 1024 * 1024), MAXDSIZ) - 1);
4112
4113         return (round_page(addr));
4114 }
4115
4116 /*
4117  * Finds the VM object, offset, and protection for a given virtual address
4118  * in the specified map, assuming a page fault of the type specified.
4119  *
4120  * Leaves the map in question locked for read; return values are guaranteed
4121  * until a vm_map_lookup_done call is performed.  Note that the map argument
4122  * is in/out; the returned map must be used in the call to vm_map_lookup_done.
4123  *
4124  * A handle (out_entry) is returned for use in vm_map_lookup_done, to make
4125  * that fast.
4126  *
4127  * If a lookup is requested with "write protection" specified, the map may
4128  * be changed to perform virtual copying operations, although the data
4129  * referenced will remain the same.
4130  *
4131  * No requirements.
4132  */
4133 int
4134 vm_map_lookup(vm_map_t *var_map,                /* IN/OUT */
4135               vm_offset_t vaddr,
4136               vm_prot_t fault_typea,
4137               vm_map_entry_t *out_entry,        /* OUT */
4138               vm_object_t *object,              /* OUT */
4139               vm_pindex_t *pindex,              /* OUT */
4140               vm_prot_t *out_prot,              /* OUT */
4141               int *wflags)                      /* OUT */
4142 {
4143         vm_map_entry_t entry;
4144         vm_map_t map = *var_map;
4145         vm_prot_t prot;
4146         vm_prot_t fault_type = fault_typea;
4147         int use_read_lock = 1;
4148         int rv = KERN_SUCCESS;
4149         int count;
4150         thread_t td = curthread;
4151
4152         /*
4153          * vm_map_entry_reserve() implements an important mitigation
4154          * against mmap() span running the kernel out of vm_map_entry
4155          * structures, but it can also cause an infinite call recursion.
4156          * Use td_nest_count to prevent an infinite recursion (allows
4157          * the vm_map code to dig into the pcpu vm_map_entry reserve).
4158          */
4159         count = 0;
4160         if (td->td_nest_count == 0) {
4161                 ++td->td_nest_count;
4162                 count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
4163                 --td->td_nest_count;
4164         }
4165 RetryLookup:
4166         if (use_read_lock)
4167                 vm_map_lock_read(map);
4168         else
4169                 vm_map_lock(map);
4170
4171         /*
4172          * Always do a full lookup.  The hint doesn't get us much anymore
4173          * now that the map is RB'd.
4174          */
4175         cpu_ccfence();
4176         *out_entry = &map->header;
4177         *object = NULL;
4178
4179         {
4180                 vm_map_entry_t tmp_entry;
4181
4182                 if (!vm_map_lookup_entry(map, vaddr, &tmp_entry)) {
4183                         rv = KERN_INVALID_ADDRESS;
4184                         goto done;
4185                 }
4186                 entry = tmp_entry;
4187                 *out_entry = entry;
4188         }
4189
4190         /*
4191          * Handle submaps.
4192          */
4193         if (entry->maptype == VM_MAPTYPE_SUBMAP) {
4194                 vm_map_t old_map = map;
4195
4196                 *var_map = map = entry->object.sub_map;
4197                 if (use_read_lock)
4198                         vm_map_unlock_read(old_map);
4199                 else
4200                         vm_map_unlock(old_map);
4201                 use_read_lock = 1;
4202                 goto RetryLookup;
4203         }
4204
4205         /*
4206          * Check whether this task is allowed to have this page.
4207          * Note the special case for MAP_ENTRY_COW pages with an override.
4208          * This is to implement a forced COW for debuggers.
4209          */
4210         if (fault_type & VM_PROT_OVERRIDE_WRITE)
4211                 prot = entry->max_protection;
4212         else
4213                 prot = entry->protection;
4214
4215         fault_type &= (VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE);
4216         if ((fault_type & prot) != fault_type) {
4217                 rv = KERN_PROTECTION_FAILURE;
4218                 goto done;
4219         }
4220
4221         if ((entry->eflags & MAP_ENTRY_USER_WIRED) &&
4222             (entry->eflags & MAP_ENTRY_COW) &&
4223             (fault_type & VM_PROT_WRITE) &&
4224             (fault_typea & VM_PROT_OVERRIDE_WRITE) == 0) {
4225                 rv = KERN_PROTECTION_FAILURE;
4226                 goto done;
4227         }
4228
4229         /*
4230          * If this page is not pageable, we have to get it for all possible
4231          * accesses.
4232          */
4233         *wflags = 0;
4234         if (entry->wired_count) {
4235                 *wflags |= FW_WIRED;
4236                 prot = fault_type = entry->protection;
4237         }
4238
4239         /*
4240          * Virtual page tables may need to update the accessed (A) bit
4241          * in a page table entry.  Upgrade the fault to a write fault for
4242          * that case if the map will support it.  If the map does not support
4243          * it the page table entry simply will not be updated.
4244          */
4245         if (entry->maptype == VM_MAPTYPE_VPAGETABLE) {
4246                 if (prot & VM_PROT_WRITE)
4247                         fault_type |= VM_PROT_WRITE;
4248         }
4249
4250         if (curthread->td_lwp && curthread->td_lwp->lwp_vmspace &&
4251             pmap_emulate_ad_bits(&curthread->td_lwp->lwp_vmspace->vm_pmap)) {
4252                 if ((prot & VM_PROT_WRITE) == 0)
4253                         fault_type |= VM_PROT_WRITE;
4254         }
4255
4256         /*
4257          * Only NORMAL and VPAGETABLE maps are object-based.  UKSMAPs are not.
4258          */
4259         if (entry->maptype != VM_MAPTYPE_NORMAL &&
4260             entry->maptype != VM_MAPTYPE_VPAGETABLE) {
4261                 *object = NULL;
4262                 goto skip;
4263         }
4264
4265         /*
4266          * If the entry was copy-on-write, we either ...
4267          */
4268         if (entry->eflags & MAP_ENTRY_NEEDS_COPY) {
4269                 /*
4270                  * If we want to write the page, we may as well handle that
4271                  * now since we've got the map locked.
4272                  *
4273                  * If we don't need to write the page, we just demote the
4274                  * permissions allowed.
4275                  */
4276                 if (fault_type & VM_PROT_WRITE) {
4277                         /*
4278                          * Not allowed if TDF_NOFAULT is set as the shadowing
4279                          * operation can deadlock against the faulting
4280                          * function due to the copy-on-write.
4281                          */
4282                         if (curthread->td_flags & TDF_NOFAULT) {
4283                                 rv = KERN_FAILURE_NOFAULT;
4284                                 goto done;
4285                         }
4286
4287                         /*
4288                          * Make a new object, and place it in the object
4289                          * chain.  Note that no new references have appeared
4290                          * -- one just moved from the map to the new
4291                          * object.
4292                          */
4293                         if (use_read_lock && vm_map_lock_upgrade(map)) {
4294                                 /* lost lock */
4295                                 use_read_lock = 0;
4296                                 goto RetryLookup;
4297                         }
4298                         use_read_lock = 0;
4299                         vm_map_entry_shadow(entry, 0);
4300                         *wflags |= FW_DIDCOW;
4301                 } else {
4302                         /*
4303                          * We're attempting to read a copy-on-write page --
4304                          * don't allow writes.
4305                          */
4306                         prot &= ~VM_PROT_WRITE;
4307                 }
4308         }
4309
4310         /*
4311          * Create an object if necessary.  This code also handles
4312          * partitioning large entries to improve vm_fault performance.
4313          */
4314         if (entry->object.vm_object == NULL && !map->system_map) {
4315                 if (use_read_lock && vm_map_lock_upgrade(map))  {
4316                         /* lost lock */
4317                         use_read_lock = 0;
4318                         goto RetryLookup;
4319                 }
4320                 use_read_lock = 0;
4321
4322                 /*
4323                  * Partition large entries, giving each its own VM object,
4324                  * to improve concurrent fault performance.  This is only
4325                  * applicable to userspace.
4326                  */
4327                 if (map != &kernel_map &&
4328                     entry->maptype == VM_MAPTYPE_NORMAL &&
4329                     ((entry->start ^ entry->end) & ~MAP_ENTRY_PARTITION_MASK) &&
4330                     vm_map_partition_enable) {
4331                         if (entry->eflags & MAP_ENTRY_IN_TRANSITION) {
4332                                 entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
4333                                 ++mycpu->gd_cnt.v_intrans_coll;
4334                                 ++mycpu->gd_cnt.v_intrans_wait;
4335                                 vm_map_transition_wait(map, 0);
4336                                 goto RetryLookup;
4337                         }
4338                         vm_map_entry_partition(map, entry, vaddr, &count);
4339                 }
4340                 vm_map_entry_allocate_object(entry);
4341         }
4342
4343         /*
4344          * Return the object/offset from this entry.  If the entry was
4345          * copy-on-write or empty, it has been fixed up.
4346          */
4347         *object = entry->object.vm_object;
4348
4349 skip:
4350         *pindex = OFF_TO_IDX((vaddr - entry->start) + entry->offset);
4351
4352         /*
4353          * Return whether this is the only map sharing this data.  On
4354          * success we return with a read lock held on the map.  On failure
4355          * we return with the map unlocked.
4356          */
4357         *out_prot = prot;
4358 done:
4359         if (rv == KERN_SUCCESS) {
4360                 if (use_read_lock == 0)
4361                         vm_map_lock_downgrade(map);
4362         } else if (use_read_lock) {
4363                 vm_map_unlock_read(map);
4364         } else {
4365                 vm_map_unlock(map);
4366         }
4367         if (count > 0)
4368                 vm_map_entry_release(count);
4369
4370         return (rv);
4371 }
4372
4373 /*
4374  * Releases locks acquired by a vm_map_lookup()
4375  * (according to the handle returned by that lookup).
4376  *
4377  * No other requirements.
4378  */
4379 void
4380 vm_map_lookup_done(vm_map_t map, vm_map_entry_t entry, int count)
4381 {
4382         /*
4383          * Unlock the main-level map
4384          */
4385         vm_map_unlock_read(map);
4386         if (count)
4387                 vm_map_entry_release(count);
4388 }
4389
4390 static void
4391 vm_map_entry_partition(vm_map_t map, vm_map_entry_t entry,
4392                        vm_offset_t vaddr, int *countp)
4393 {
4394         vaddr &= ~MAP_ENTRY_PARTITION_MASK;
4395         vm_map_clip_start(map, entry, vaddr, countp);
4396         vaddr += MAP_ENTRY_PARTITION_SIZE;
4397         vm_map_clip_end(map, entry, vaddr, countp);
4398 }
4399
4400 /*
4401  * Quick hack, needs some help to make it more SMP friendly.
4402  */
4403 void
4404 vm_map_interlock(vm_map_t map, struct vm_map_ilock *ilock,
4405                  vm_offset_t ran_beg, vm_offset_t ran_end)
4406 {
4407         struct vm_map_ilock *scan;
4408
4409         ilock->ran_beg = ran_beg;
4410         ilock->ran_end = ran_end;
4411         ilock->flags = 0;
4412
4413         spin_lock(&map->ilock_spin);
4414 restart:
4415         for (scan = map->ilock_base; scan; scan = scan->next) {
4416                 if (ran_end > scan->ran_beg && ran_beg < scan->ran_end) {
4417                         scan->flags |= ILOCK_WAITING;
4418                         ssleep(scan, &map->ilock_spin, 0, "ilock", 0);
4419                         goto restart;
4420                 }
4421         }
4422         ilock->next = map->ilock_base;
4423         map->ilock_base = ilock;
4424         spin_unlock(&map->ilock_spin);
4425 }
4426
4427 void
4428 vm_map_deinterlock(vm_map_t map, struct  vm_map_ilock *ilock)
4429 {
4430         struct vm_map_ilock *scan;
4431         struct vm_map_ilock **scanp;
4432
4433         spin_lock(&map->ilock_spin);
4434         scanp = &map->ilock_base;
4435         while ((scan = *scanp) != NULL) {
4436                 if (scan == ilock) {
4437                         *scanp = ilock->next;
4438                         spin_unlock(&map->ilock_spin);
4439                         if (ilock->flags & ILOCK_WAITING)
4440                                 wakeup(ilock);
4441                         return;
4442                 }
4443                 scanp = &scan->next;
4444         }
4445         spin_unlock(&map->ilock_spin);
4446         panic("vm_map_deinterlock: missing ilock!");
4447 }
4448
4449 #include "opt_ddb.h"
4450 #ifdef DDB
4451 #include <ddb/ddb.h>
4452
4453 /*
4454  * Debugging only
4455  */
4456 DB_SHOW_COMMAND(map, vm_map_print)
4457 {
4458         static int nlines;
4459         /* XXX convert args. */
4460         vm_map_t map = (vm_map_t)addr;
4461         boolean_t full = have_addr;
4462
4463         vm_map_entry_t entry;
4464
4465         db_iprintf("Task map %p: pmap=%p, nentries=%d, version=%u\n",
4466             (void *)map,
4467             (void *)map->pmap, map->nentries, map->timestamp);
4468         nlines++;
4469
4470         if (!full && db_indent)
4471                 return;
4472
4473         db_indent += 2;
4474         for (entry = map->header.next; entry != &map->header;
4475             entry = entry->next) {
4476                 db_iprintf("map entry %p: start=%p, end=%p\n",
4477                     (void *)entry, (void *)entry->start, (void *)entry->end);
4478                 nlines++;
4479                 {
4480                         static char *inheritance_name[4] =
4481                         {"share", "copy", "none", "donate_copy"};
4482
4483                         db_iprintf(" prot=%x/%x/%s",
4484                             entry->protection,
4485                             entry->max_protection,
4486                             inheritance_name[(int)(unsigned char)
4487                                                 entry->inheritance]);
4488                         if (entry->wired_count != 0)
4489                                 db_printf(", wired");
4490                 }
4491                 switch(entry->maptype) {
4492                 case VM_MAPTYPE_SUBMAP:
4493                         /* XXX no %qd in kernel.  Truncate entry->offset. */
4494                         db_printf(", share=%p, offset=0x%lx\n",
4495                             (void *)entry->object.sub_map,
4496                             (long)entry->offset);
4497                         nlines++;
4498                         if ((entry->prev == &map->header) ||
4499                             (entry->prev->object.sub_map !=
4500                                 entry->object.sub_map)) {
4501                                 db_indent += 2;
4502                                 vm_map_print((db_expr_t)(intptr_t)
4503                                              entry->object.sub_map,
4504                                              full, 0, NULL);
4505                                 db_indent -= 2;
4506                         }
4507                         break;
4508                 case VM_MAPTYPE_NORMAL:
4509                 case VM_MAPTYPE_VPAGETABLE:
4510                         /* XXX no %qd in kernel.  Truncate entry->offset. */
4511                         db_printf(", object=%p, offset=0x%lx",
4512                             (void *)entry->object.vm_object,
4513                             (long)entry->offset);
4514                         if (entry->eflags & MAP_ENTRY_COW)
4515                                 db_printf(", copy (%s)",
4516                                     (entry->eflags & MAP_ENTRY_NEEDS_COPY) ? "needed" : "done");
4517                         db_printf("\n");
4518                         nlines++;
4519
4520                         if ((entry->prev == &map->header) ||
4521                             (entry->prev->object.vm_object !=
4522                                 entry->object.vm_object)) {
4523                                 db_indent += 2;
4524                                 vm_object_print((db_expr_t)(intptr_t)
4525                                                 entry->object.vm_object,
4526                                                 full, 0, NULL);
4527                                 nlines += 4;
4528                                 db_indent -= 2;
4529                         }
4530                         break;
4531                 case VM_MAPTYPE_UKSMAP:
4532                         db_printf(", uksmap=%p, offset=0x%lx",
4533                             (void *)entry->object.uksmap,
4534                             (long)entry->offset);
4535                         if (entry->eflags & MAP_ENTRY_COW)
4536                                 db_printf(", copy (%s)",
4537                                     (entry->eflags & MAP_ENTRY_NEEDS_COPY) ? "needed" : "done");
4538                         db_printf("\n");
4539                         nlines++;
4540                         break;
4541                 default:
4542                         break;
4543                 }
4544         }
4545         db_indent -= 2;
4546         if (db_indent == 0)
4547                 nlines = 0;
4548 }
4549
4550 /*
4551  * Debugging only
4552  */
4553 DB_SHOW_COMMAND(procvm, procvm)
4554 {
4555         struct proc *p;
4556
4557         if (have_addr) {
4558                 p = (struct proc *) addr;
4559         } else {
4560                 p = curproc;
4561         }
4562
4563         db_printf("p = %p, vmspace = %p, map = %p, pmap = %p\n",
4564             (void *)p, (void *)p->p_vmspace, (void *)&p->p_vmspace->vm_map,
4565             (void *)vmspace_pmap(p->p_vmspace));
4566
4567         vm_map_print((db_expr_t)(intptr_t)&p->p_vmspace->vm_map, 1, 0, NULL);
4568 }
4569
4570 #endif /* DDB */