sys/vm/vm_map.c

   1 /*
   2  * Copyright (c) 1991, 1993
   3  *      The Regents of the University of California.  All rights reserved.
   4  * Copyright (c) 2003-2017 The DragonFly Project.  All rights reserved.
   5  *
   6  * This code is derived from software contributed to Berkeley by
   7  * The Mach Operating System project at Carnegie-Mellon University.
   8  *
   9  * This code is derived from software contributed to The DragonFly Project
  10  * by Matthew Dillon <dillon@backplane.com>
  11  *
  12  * Redistribution and use in source and binary forms, with or without
  13  * modification, are permitted provided that the following conditions
  14  * are met:
  15  * 1. Redistributions of source code must retain the above copyright
  16  *    notice, this list of conditions and the following disclaimer.
  17  * 2. Redistributions in binary form must reproduce the above copyright
  18  *    notice, this list of conditions and the following disclaimer in the
  19  *    documentation and/or other materials provided with the distribution.
  20  * 3. Neither the name of the University nor the names of its contributors
  21  *    may be used to endorse or promote products derived from this software
  22  *    without specific prior written permission.
  23  *
  24  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  25  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  26  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  27  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  28  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  29  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  30  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  31  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  32  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  33  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  34  * SUCH DAMAGE.
  35  *
  36  *      from: @(#)vm_map.c      8.3 (Berkeley) 1/12/94
  37  *
  38  *
  39  * Copyright (c) 1987, 1990 Carnegie-Mellon University.
  40  * All rights reserved.
  41  *
  42  * Authors: Avadis Tevanian, Jr., Michael Wayne Young
  43  *
  44  * Permission to use, copy, modify and distribute this software and
  45  * its documentation is hereby granted, provided that both the copyright
  46  * notice and this permission notice appear in all copies of the
  47  * software, derivative works or modified versions, and any portions
  48  * thereof, and that both notices appear in supporting documentation.
  49  *
  50  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
  51  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
  52  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
  53  *
  54  * Carnegie Mellon requests users of this software to return to
  55  *
  56  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
  57  *  School of Computer Science
  58  *  Carnegie Mellon University
  59  *  Pittsburgh PA 15213-3890
  60  *
  61  * any improvements or extensions that they make and grant Carnegie the
  62  * rights to redistribute these changes.
  63  *
  64  * $FreeBSD: src/sys/vm/vm_map.c,v 1.187.2.19 2003/05/27 00:47:02 alc Exp $
  65  */
  66
  67 /*
  68  *      Virtual memory mapping module.
  69  */
  70
  71 #include <sys/param.h>
  72 #include <sys/systm.h>
  73 #include <sys/kernel.h>
  74 #include <sys/proc.h>
  75 #include <sys/serialize.h>
  76 #include <sys/lock.h>
  77 #include <sys/vmmeter.h>
  78 #include <sys/mman.h>
  79 #include <sys/vnode.h>
  80 #include <sys/resourcevar.h>
  81 #include <sys/shm.h>
  82 #include <sys/tree.h>
  83 #include <sys/malloc.h>
  84 #include <sys/objcache.h>
  85
  86 #include <vm/vm.h>
  87 #include <vm/vm_param.h>
  88 #include <vm/pmap.h>
  89 #include <vm/vm_map.h>
  90 #include <vm/vm_page.h>
  91 #include <vm/vm_object.h>
  92 #include <vm/vm_pager.h>
  93 #include <vm/vm_kern.h>
  94 #include <vm/vm_extern.h>
  95 #include <vm/swap_pager.h>
  96 #include <vm/vm_zone.h>
  97
  98 #include <sys/random.h>
  99 #include <sys/sysctl.h>
 100 #include <sys/spinlock.h>
 101
 102 #include <sys/thread2.h>
 103 #include <sys/spinlock2.h>
 104
 105 /*
 106  * Virtual memory maps provide for the mapping, protection, and sharing
 107  * of virtual memory objects.  In addition, this module provides for an
 108  * efficient virtual copy of memory from one map to another.
 109  *
 110  * Synchronization is required prior to most operations.
 111  *
 112  * Maps consist of an ordered doubly-linked list of simple entries.
 113  * A hint and a RB tree is used to speed-up lookups.
 114  *
 115  * Callers looking to modify maps specify start/end addresses which cause
 116  * the related map entry to be clipped if necessary, and then later
 117  * recombined if the pieces remained compatible.
 118  *
 119  * Virtual copy operations are performed by copying VM object references
 120  * from one map to another, and then marking both regions as copy-on-write.
 121  */
 122 static boolean_t vmspace_ctor(void *obj, void *privdata, int ocflags);
 123 static void vmspace_dtor(void *obj, void *privdata);
 124 static void vmspace_terminate(struct vmspace *vm, int final);
 125
 126 MALLOC_DEFINE(M_VMSPACE, "vmspace", "vmspace objcache backingstore");
 127 static struct objcache *vmspace_cache;
 128
 129 /*
 130  * per-cpu page table cross mappings are initialized in early boot
 131  * and might require a considerable number of vm_map_entry structures.
 132  */
 133 #define MAPENTRYBSP_CACHE       (MAXCPU+1)
 134 #define MAPENTRYAP_CACHE        8
 135
 136 /*
 137  * Partioning threaded programs with large anonymous memory areas can
 138  * improve concurrent fault performance.
 139  */
 140 #define MAP_ENTRY_PARTITION_SIZE        ((vm_offset_t)(32 * 1024 * 1024))
 141 #define MAP_ENTRY_PARTITION_MASK        (MAP_ENTRY_PARTITION_SIZE - 1)
 142
 143 #define VM_MAP_ENTRY_WITHIN_PARTITION(entry)    \
 144         ((((entry)->start ^ (entry)->end) & ~MAP_ENTRY_PARTITION_MASK) == 0)
 145
 146 static struct vm_zone mapentzone_store;
 147 static vm_zone_t mapentzone;
 148
 149 static struct vm_map_entry map_entry_init[MAX_MAPENT];
 150 static struct vm_map_entry cpu_map_entry_init_bsp[MAPENTRYBSP_CACHE];
 151 static struct vm_map_entry cpu_map_entry_init_ap[MAXCPU][MAPENTRYAP_CACHE];
 152
 153 static int randomize_mmap;
 154 SYSCTL_INT(_vm, OID_AUTO, randomize_mmap, CTLFLAG_RW, &randomize_mmap, 0,
 155     "Randomize mmap offsets");
 156 static int vm_map_relock_enable = 1;
 157 SYSCTL_INT(_vm, OID_AUTO, map_relock_enable, CTLFLAG_RW,
 158            &vm_map_relock_enable, 0, "insert pop pgtable optimization");
 159 static int vm_map_partition_enable = 1;
 160 SYSCTL_INT(_vm, OID_AUTO, map_partition_enable, CTLFLAG_RW,
 161            &vm_map_partition_enable, 0, "Break up larger vm_map_entry's");
 162
 163 static void vmspace_drop_notoken(struct vmspace *vm);
 164 static void vm_map_entry_shadow(vm_map_entry_t entry, int addref);
 165 static vm_map_entry_t vm_map_entry_create(vm_map_t map, int *);
 166 static void vm_map_entry_dispose (vm_map_t map, vm_map_entry_t entry, int *);
 167 static void _vm_map_clip_end (vm_map_t, vm_map_entry_t, vm_offset_t, int *);
 168 static void _vm_map_clip_start (vm_map_t, vm_map_entry_t, vm_offset_t, int *);
 169 static void vm_map_entry_delete (vm_map_t, vm_map_entry_t, int *);
 170 static void vm_map_entry_unwire (vm_map_t, vm_map_entry_t);
 171 static void vm_map_copy_entry (vm_map_t, vm_map_t, vm_map_entry_t,
 172                 vm_map_entry_t);
 173 static void vm_map_unclip_range (vm_map_t map, vm_map_entry_t start_entry,
 174                 vm_offset_t start, vm_offset_t end, int *countp, int flags);
 175 static void vm_map_entry_partition(vm_map_t map, vm_map_entry_t entry,
 176                 vm_offset_t vaddr, int *countp);
 177
 178 /*
 179  * Initialize the vm_map module.  Must be called before any other vm_map
 180  * routines.
 181  *
 182  * Map and entry structures are allocated from the general purpose
 183  * memory pool with some exceptions:
 184  *
 185  *      - The kernel map is allocated statically.
 186  *      - Initial kernel map entries are allocated out of a static pool.
 187  *      - We must set ZONE_SPECIAL here or the early boot code can get
 188  *        stuck if there are >63 cores.
 189  *
 190  *      These restrictions are necessary since malloc() uses the
 191  *      maps and requires map entries.
 192  *
 193  * Called from the low level boot code only.
 194  */
 195 void
 196 vm_map_startup(void)
 197 {
 198         mapentzone = &mapentzone_store;
 199         zbootinit(mapentzone, "MAP ENTRY", sizeof (struct vm_map_entry),
 200                   map_entry_init, MAX_MAPENT);
 201         mapentzone_store.zflags |= ZONE_SPECIAL;
 202 }
 203
 204 /*
 205  * Called prior to any vmspace allocations.
 206  *
 207  * Called from the low level boot code only.
 208  */
 209 void
 210 vm_init2(void)
 211 {
 212         vmspace_cache = objcache_create_mbacked(M_VMSPACE,
 213                                                 sizeof(struct vmspace),
 214                                                 0, ncpus * 4,
 215                                                 vmspace_ctor, vmspace_dtor,
 216                                                 NULL);
 217         zinitna(mapentzone, NULL, 0, 0, ZONE_USE_RESERVE | ZONE_SPECIAL);
 218         pmap_init2();
 219         vm_object_init2();
 220 }
 221
 222 /*
 223  * objcache support.  We leave the pmap root cached as long as possible
 224  * for performance reasons.
 225  */
 226 static
 227 boolean_t
 228 vmspace_ctor(void *obj, void *privdata, int ocflags)
 229 {
 230         struct vmspace *vm = obj;
 231
 232         bzero(vm, sizeof(*vm));
 233         vm->vm_refcnt = VM_REF_DELETED;
 234
 235         return 1;
 236 }
 237
 238 static
 239 void
 240 vmspace_dtor(void *obj, void *privdata)
 241 {
 242         struct vmspace *vm = obj;
 243
 244         KKASSERT(vm->vm_refcnt == VM_REF_DELETED);
 245         pmap_puninit(vmspace_pmap(vm));
 246 }
 247
 248 /*
 249  * Red black tree functions
 250  *
 251  * The caller must hold the related map lock.
 252  */
 253 static int rb_vm_map_compare(vm_map_entry_t a, vm_map_entry_t b);
 254 RB_GENERATE(vm_map_rb_tree, vm_map_entry, rb_entry, rb_vm_map_compare);
 255
 256 /* a->start is address, and the only field has to be initialized */
 257 static int
 258 rb_vm_map_compare(vm_map_entry_t a, vm_map_entry_t b)
 259 {
 260         if (a->start < b->start)
 261                 return(-1);
 262         else if (a->start > b->start)
 263                 return(1);
 264         return(0);
 265 }
 266
 267 /*
 268  * Initialize vmspace ref/hold counts vmspace0.  There is a holdcnt for
 269  * every refcnt.
 270  */
 271 void
 272 vmspace_initrefs(struct vmspace *vm)
 273 {
 274         vm->vm_refcnt = 1;
 275         vm->vm_holdcnt = 1;
 276 }
 277
 278 /*
 279  * Allocate a vmspace structure, including a vm_map and pmap.
 280  * Initialize numerous fields.  While the initial allocation is zerod,
 281  * subsequence reuse from the objcache leaves elements of the structure
 282  * intact (particularly the pmap), so portions must be zerod.
 283  *
 284  * Returns a referenced vmspace.
 285  *
 286  * No requirements.
 287  */
 288 struct vmspace *
 289 vmspace_alloc(vm_offset_t min, vm_offset_t max)
 290 {
 291         struct vmspace *vm;
 292
 293         vm = objcache_get(vmspace_cache, M_WAITOK);
 294
 295         bzero(&vm->vm_startcopy,
 296               (char *)&vm->vm_endcopy - (char *)&vm->vm_startcopy);
 297         vm_map_init(&vm->vm_map, min, max, NULL);       /* initializes token */
 298
 299         /*
 300          * NOTE: hold to acquires token for safety.
 301          *
 302          * On return vmspace is referenced (refs=1, hold=1).  That is,
 303          * each refcnt also has a holdcnt.  There can be additional holds
 304          * (holdcnt) above and beyond the refcnt.  Finalization is handled in
 305          * two stages, one on refs 1->0, and the the second on hold 1->0.
 306          */
 307         KKASSERT(vm->vm_holdcnt == 0);
 308         KKASSERT(vm->vm_refcnt == VM_REF_DELETED);
 309         vmspace_initrefs(vm);
 310         vmspace_hold(vm);
 311         pmap_pinit(vmspace_pmap(vm));           /* (some fields reused) */
 312         vm->vm_map.pmap = vmspace_pmap(vm);     /* XXX */
 313         vm->vm_shm = NULL;
 314         vm->vm_flags = 0;
 315         cpu_vmspace_alloc(vm);
 316         vmspace_drop(vm);
 317
 318         return (vm);
 319 }
 320
 321 /*
 322  * NOTE: Can return 0 if the vmspace is exiting.
 323  */
 324 int
 325 vmspace_getrefs(struct vmspace *vm)
 326 {
 327         int32_t n;
 328
 329         n = vm->vm_refcnt;
 330         cpu_ccfence();
 331         if (n & VM_REF_DELETED)
 332                 n = -1;
 333         return n;
 334 }
 335
 336 void
 337 vmspace_hold(struct vmspace *vm)
 338 {
 339         atomic_add_int(&vm->vm_holdcnt, 1);
 340         lwkt_gettoken(&vm->vm_map.token);
 341 }
 342
 343 /*
 344  * Drop with final termination interlock.
 345  */
 346 void
 347 vmspace_drop(struct vmspace *vm)
 348 {
 349         lwkt_reltoken(&vm->vm_map.token);
 350         vmspace_drop_notoken(vm);
 351 }
 352
 353 static void
 354 vmspace_drop_notoken(struct vmspace *vm)
 355 {
 356         if (atomic_fetchadd_int(&vm->vm_holdcnt, -1) == 1) {
 357                 if (vm->vm_refcnt & VM_REF_DELETED)
 358                         vmspace_terminate(vm, 1);
 359         }
 360 }
 361
 362 /*
 363  * A vmspace object must not be in a terminated state to be able to obtain
 364  * additional refs on it.
 365  *
 366  * These are official references to the vmspace, the count is used to check
 367  * for vmspace sharing.  Foreign accessors should use 'hold' and not 'ref'.
 368  *
 369  * XXX we need to combine hold & ref together into one 64-bit field to allow
 370  * holds to prevent stage-1 termination.
 371  */
 372 void
 373 vmspace_ref(struct vmspace *vm)
 374 {
 375         uint32_t n;
 376
 377         atomic_add_int(&vm->vm_holdcnt, 1);
 378         n = atomic_fetchadd_int(&vm->vm_refcnt, 1);
 379         KKASSERT((n & VM_REF_DELETED) == 0);
 380 }
 381
 382 /*
 383  * Release a ref on the vmspace.  On the 1->0 transition we do stage-1
 384  * termination of the vmspace.  Then, on the final drop of the hold we
 385  * will do stage-2 final termination.
 386  */
 387 void
 388 vmspace_rel(struct vmspace *vm)
 389 {
 390         uint32_t n;
 391
 392         /*
 393          * Drop refs.  Each ref also has a hold which is also dropped.
 394          *
 395          * When refs hits 0 compete to get the VM_REF_DELETED flag (hold
 396          * prevent finalization) to start termination processing.
 397          * Finalization occurs when the last hold count drops to 0.
 398          */
 399         n = atomic_fetchadd_int(&vm->vm_refcnt, -1) - 1;
 400         while (n == 0) {
 401                 if (atomic_cmpset_int(&vm->vm_refcnt, 0, VM_REF_DELETED)) {
 402                         vmspace_terminate(vm, 0);
 403                         break;
 404                 }
 405                 n = vm->vm_refcnt;
 406                 cpu_ccfence();
 407         }
 408         vmspace_drop_notoken(vm);
 409 }
 410
 411 /*
 412  * This is called during exit indicating that the vmspace is no
 413  * longer in used by an exiting process, but the process has not yet
 414  * been reaped.
 415  *
 416  * We drop refs, allowing for stage-1 termination, but maintain a holdcnt
 417  * to prevent stage-2 until the process is reaped.  Note hte order of
 418  * operation, we must hold first.
 419  *
 420  * No requirements.
 421  */
 422 void
 423 vmspace_relexit(struct vmspace *vm)
 424 {
 425         atomic_add_int(&vm->vm_holdcnt, 1);
 426         vmspace_rel(vm);
 427 }
 428
 429 /*
 430  * Called during reap to disconnect the remainder of the vmspace from
 431  * the process.  On the hold drop the vmspace termination is finalized.
 432  *
 433  * No requirements.
 434  */
 435 void
 436 vmspace_exitfree(struct proc *p)
 437 {
 438         struct vmspace *vm;
 439
 440         vm = p->p_vmspace;
 441         p->p_vmspace = NULL;
 442         vmspace_drop_notoken(vm);
 443 }
 444
 445 /*
 446  * Called in two cases:
 447  *
 448  * (1) When the last refcnt is dropped and the vmspace becomes inactive,
 449  *     called with final == 0.  refcnt will be (u_int)-1 at this point,
 450  *     and holdcnt will still be non-zero.
 451  *
 452  * (2) When holdcnt becomes 0, called with final == 1.  There should no
 453  *     longer be anyone with access to the vmspace.
 454  *
 455  * VMSPACE_EXIT1 flags the primary deactivation
 456  * VMSPACE_EXIT2 flags the last reap
 457  */
 458 static void
 459 vmspace_terminate(struct vmspace *vm, int final)
 460 {
 461         int count;
 462
 463         lwkt_gettoken(&vm->vm_map.token);
 464         if (final == 0) {
 465                 KKASSERT((vm->vm_flags & VMSPACE_EXIT1) == 0);
 466                 vm->vm_flags |= VMSPACE_EXIT1;
 467
 468                 /*
 469                  * Get rid of most of the resources.  Leave the kernel pmap
 470                  * intact.
 471                  *
 472                  * If the pmap does not contain wired pages we can bulk-delete
 473                  * the pmap as a performance optimization before removing the
 474                  * related mappings.
 475                  *
 476                  * If the pmap contains wired pages we cannot do this
 477                  * pre-optimization because currently vm_fault_unwire()
 478                  * expects the pmap pages to exist and will not decrement
 479                  * p->wire_count if they do not.
 480                  */
 481                 shmexit(vm);
 482                 if (vmspace_pmap(vm)->pm_stats.wired_count) {
 483                         vm_map_remove(&vm->vm_map, VM_MIN_USER_ADDRESS,
 484                                       VM_MAX_USER_ADDRESS);
 485                         pmap_remove_pages(vmspace_pmap(vm), VM_MIN_USER_ADDRESS,
 486                                           VM_MAX_USER_ADDRESS);
 487                 } else {
 488                         pmap_remove_pages(vmspace_pmap(vm), VM_MIN_USER_ADDRESS,
 489                                           VM_MAX_USER_ADDRESS);
 490                         vm_map_remove(&vm->vm_map, VM_MIN_USER_ADDRESS,
 491                                       VM_MAX_USER_ADDRESS);
 492                 }
 493                 lwkt_reltoken(&vm->vm_map.token);
 494         } else {
 495                 KKASSERT((vm->vm_flags & VMSPACE_EXIT1) != 0);
 496                 KKASSERT((vm->vm_flags & VMSPACE_EXIT2) == 0);
 497
 498                 /*
 499                  * Get rid of remaining basic resources.
 500                  */
 501                 vm->vm_flags |= VMSPACE_EXIT2;
 502                 shmexit(vm);
 503
 504                 count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
 505                 vm_map_lock(&vm->vm_map);
 506                 cpu_vmspace_free(vm);
 507
 508                 /*
 509                  * Lock the map, to wait out all other references to it.
 510                  * Delete all of the mappings and pages they hold, then call
 511                  * the pmap module to reclaim anything left.
 512                  */
 513                 vm_map_delete(&vm->vm_map, vm->vm_map.min_offset,
 514                               vm->vm_map.max_offset, &count);
 515                 vm_map_unlock(&vm->vm_map);
 516                 vm_map_entry_release(count);
 517
 518                 pmap_release(vmspace_pmap(vm));
 519                 lwkt_reltoken(&vm->vm_map.token);
 520                 objcache_put(vmspace_cache, vm);
 521         }
 522 }
 523
 524 /*
 525  * Swap useage is determined by taking the proportional swap used by
 526  * VM objects backing the VM map.  To make up for fractional losses,
 527  * if the VM object has any swap use at all the associated map entries
 528  * count for at least 1 swap page.
 529  *
 530  * No requirements.
 531  */
 532 vm_offset_t
 533 vmspace_swap_count(struct vmspace *vm)
 534 {
 535         vm_map_t map = &vm->vm_map;
 536         vm_map_entry_t cur;
 537         vm_object_t object;
 538         vm_offset_t count = 0;
 539         vm_offset_t n;
 540
 541         vmspace_hold(vm);
 542         for (cur = map->header.next; cur != &map->header; cur = cur->next) {
 543                 switch(cur->maptype) {
 544                 case VM_MAPTYPE_NORMAL:
 545                 case VM_MAPTYPE_VPAGETABLE:
 546                         if ((object = cur->object.vm_object) == NULL)
 547                                 break;
 548                         if (object->swblock_count) {
 549                                 n = (cur->end - cur->start) / PAGE_SIZE;
 550                                 count += object->swblock_count *
 551                                     SWAP_META_PAGES * n / object->size + 1;
 552                         }
 553                         break;
 554                 default:
 555                         break;
 556                 }
 557         }
 558         vmspace_drop(vm);
 559
 560         return(count);
 561 }
 562
 563 /*
 564  * Calculate the approximate number of anonymous pages in use by
 565  * this vmspace.  To make up for fractional losses, we count each
 566  * VM object as having at least 1 anonymous page.
 567  *
 568  * No requirements.
 569  */
 570 vm_offset_t
 571 vmspace_anonymous_count(struct vmspace *vm)
 572 {
 573         vm_map_t map = &vm->vm_map;
 574         vm_map_entry_t cur;
 575         vm_object_t object;
 576         vm_offset_t count = 0;
 577
 578         vmspace_hold(vm);
 579         for (cur = map->header.next; cur != &map->header; cur = cur->next) {
 580                 switch(cur->maptype) {
 581                 case VM_MAPTYPE_NORMAL:
 582                 case VM_MAPTYPE_VPAGETABLE:
 583                         if ((object = cur->object.vm_object) == NULL)
 584                                 break;
 585                         if (object->type != OBJT_DEFAULT &&
 586                             object->type != OBJT_SWAP) {
 587                                 break;
 588                         }
 589                         count += object->resident_page_count;
 590                         break;
 591                 default:
 592                         break;
 593                 }
 594         }
 595         vmspace_drop(vm);
 596
 597         return(count);
 598 }
 599
 600 /*
 601  * Initialize an existing vm_map structure such as that in the vmspace
 602  * structure.  The pmap is initialized elsewhere.
 603  *
 604  * No requirements.
 605  */
 606 void
 607 vm_map_init(struct vm_map *map, vm_offset_t min, vm_offset_t max, pmap_t pmap)
 608 {
 609         map->header.next = map->header.prev = &map->header;
 610         RB_INIT(&map->rb_root);
 611         spin_init(&map->ilock_spin, "ilock");
 612         map->ilock_base = NULL;
 613         map->nentries = 0;
 614         map->size = 0;
 615         map->system_map = 0;
 616         map->min_offset = min;
 617         map->max_offset = max;
 618         map->pmap = pmap;
 619         map->timestamp = 0;
 620         map->flags = 0;
 621         bzero(&map->freehint, sizeof(map->freehint));
 622         lwkt_token_init(&map->token, "vm_map");
 623         lockinit(&map->lock, "vm_maplk", (hz + 9) / 10, 0);
 624 }
 625
 626 /*
 627  * Find the first possible free address for the specified request length.
 628  * Returns 0 if we don't have one cached.
 629  */
 630 static
 631 vm_offset_t
 632 vm_map_freehint_find(vm_map_t map, vm_size_t length, vm_size_t align)
 633 {
 634         vm_map_freehint_t *scan;
 635
 636         scan = &map->freehint[0];
 637         while (scan < &map->freehint[VM_MAP_FFCOUNT]) {
 638                 if (scan->length == length && scan->align == align)
 639                         return(scan->start);
 640                 ++scan;
 641         }
 642         return 0;
 643 }
 644
 645 /*
 646  * Unconditionally set the freehint.  Called by vm_map_findspace() after
 647  * it finds an address.  This will help us iterate optimally on the next
 648  * similar findspace.
 649  */
 650 static
 651 void
 652 vm_map_freehint_update(vm_map_t map, vm_offset_t start,
 653                        vm_size_t length, vm_size_t align)
 654 {
 655         vm_map_freehint_t *scan;
 656
 657         scan = &map->freehint[0];
 658         while (scan < &map->freehint[VM_MAP_FFCOUNT]) {
 659                 if (scan->length == length && scan->align == align) {
 660                         scan->start = start;
 661                         return;
 662                 }
 663                 ++scan;
 664         }
 665         scan = &map->freehint[map->freehint_newindex & VM_MAP_FFMASK];
 666         scan->start = start;
 667         scan->align = align;
 668         scan->length = length;
 669         ++map->freehint_newindex;
 670 }
 671
 672 /*
 673  * Update any existing freehints (for any alignment), for the hole we just
 674  * added.
 675  */
 676 static
 677 void
 678 vm_map_freehint_hole(vm_map_t map, vm_offset_t start, vm_size_t length)
 679 {
 680         vm_map_freehint_t *scan;
 681
 682         scan = &map->freehint[0];
 683         while (scan < &map->freehint[VM_MAP_FFCOUNT]) {
 684                 if (scan->length <= length && scan->start > start)
 685                         scan->start = start;
 686                 ++scan;
 687         }
 688 }
 689
 690 /*
 691  * Shadow the vm_map_entry's object.  This typically needs to be done when
 692  * a write fault is taken on an entry which had previously been cloned by
 693  * fork().  The shared object (which might be NULL) must become private so
 694  * we add a shadow layer above it.
 695  *
 696  * Object allocation for anonymous mappings is defered as long as possible.
 697  * When creating a shadow, however, the underlying object must be instantiated
 698  * so it can be shared.
 699  *
 700  * If the map segment is governed by a virtual page table then it is
 701  * possible to address offsets beyond the mapped area.  Just allocate
 702  * a maximally sized object for this case.
 703  *
 704  * If addref is non-zero an additional reference is added to the returned
 705  * entry.  This mechanic exists because the additional reference might have
 706  * to be added atomically and not after return to prevent a premature
 707  * collapse.
 708  *
 709  * The vm_map must be exclusively locked.
 710  * No other requirements.
 711  */
 712 static
 713 void
 714 vm_map_entry_shadow(vm_map_entry_t entry, int addref)
 715 {
 716         if (entry->maptype == VM_MAPTYPE_VPAGETABLE) {
 717                 vm_object_shadow(&entry->object.vm_object, &entry->offset,
 718                                  0x7FFFFFFF, addref);   /* XXX */
 719         } else {
 720                 vm_object_shadow(&entry->object.vm_object, &entry->offset,
 721                                  atop(entry->end - entry->start), addref);
 722         }
 723         entry->eflags &= ~MAP_ENTRY_NEEDS_COPY;
 724 }
 725
 726 /*
 727  * Allocate an object for a vm_map_entry.
 728  *
 729  * Object allocation for anonymous mappings is defered as long as possible.
 730  * This function is called when we can defer no longer, generally when a map
 731  * entry might be split or forked or takes a page fault.
 732  *
 733  * If the map segment is governed by a virtual page table then it is
 734  * possible to address offsets beyond the mapped area.  Just allocate
 735  * a maximally sized object for this case.
 736  *
 737  * The vm_map must be exclusively locked.
 738  * No other requirements.
 739  */
 740 void
 741 vm_map_entry_allocate_object(vm_map_entry_t entry)
 742 {
 743         vm_object_t obj;
 744
 745         if (entry->maptype == VM_MAPTYPE_VPAGETABLE) {
 746                 obj = vm_object_allocate(OBJT_DEFAULT, 0x7FFFFFFF); /* XXX */
 747         } else {
 748                 obj = vm_object_allocate(OBJT_DEFAULT,
 749                                          atop(entry->end - entry->start));
 750         }
 751         entry->object.vm_object = obj;
 752         entry->offset = 0;
 753 }
 754
 755 /*
 756  * Set an initial negative count so the first attempt to reserve
 757  * space preloads a bunch of vm_map_entry's for this cpu.  Also
 758  * pre-allocate 2 vm_map_entries which will be needed by zalloc() to
 759  * map a new page for vm_map_entry structures.  SMP systems are
 760  * particularly sensitive.
 761  *
 762  * This routine is called in early boot so we cannot just call
 763  * vm_map_entry_reserve().
 764  *
 765  * Called from the low level boot code only (for each cpu)
 766  *
 767  * WARNING! Take care not to have too-big a static/BSS structure here
 768  *          as MAXCPU can be 256+, otherwise the loader's 64MB heap
 769  *          can get blown out by the kernel plus the initrd image.
 770  */
 771 void
 772 vm_map_entry_reserve_cpu_init(globaldata_t gd)
 773 {
 774         vm_map_entry_t entry;
 775         int count;
 776         int i;
 777
 778         atomic_add_int(&gd->gd_vme_avail, -MAP_RESERVE_COUNT * 2);
 779         if (gd->gd_cpuid == 0) {
 780                 entry = &cpu_map_entry_init_bsp[0];
 781                 count = MAPENTRYBSP_CACHE;
 782         } else {
 783                 entry = &cpu_map_entry_init_ap[gd->gd_cpuid][0];
 784                 count = MAPENTRYAP_CACHE;
 785         }
 786         for (i = 0; i < count; ++i, ++entry) {
 787                 entry->next = gd->gd_vme_base;
 788                 gd->gd_vme_base = entry;
 789         }
 790 }
 791
 792 /*
 793  * Reserves vm_map_entry structures so code later-on can manipulate
 794  * map_entry structures within a locked map without blocking trying
 795  * to allocate a new vm_map_entry.
 796  *
 797  * No requirements.
 798  *
 799  * WARNING!  We must not decrement gd_vme_avail until after we have
 800  *           ensured that sufficient entries exist, otherwise we can
 801  *           get into an endless call recursion in the zalloc code
 802  *           itself.
 803  */
 804 int
 805 vm_map_entry_reserve(int count)
 806 {
 807         struct globaldata *gd = mycpu;
 808         vm_map_entry_t entry;
 809
 810         /*
 811          * Make sure we have enough structures in gd_vme_base to handle
 812          * the reservation request.
 813          *
 814          * Use a critical section to protect against VM faults.  It might
 815          * not be needed, but we have to be careful here.
 816          */
 817         if (gd->gd_vme_avail < count) {
 818                 crit_enter();
 819                 while (gd->gd_vme_avail < count) {
 820                         entry = zalloc(mapentzone);
 821                         entry->next = gd->gd_vme_base;
 822                         gd->gd_vme_base = entry;
 823                         atomic_add_int(&gd->gd_vme_avail, 1);
 824                 }
 825                 crit_exit();
 826         }
 827         atomic_add_int(&gd->gd_vme_avail, -count);
 828
 829         return(count);
 830 }
 831
 832 /*
 833  * Releases previously reserved vm_map_entry structures that were not
 834  * used.  If we have too much junk in our per-cpu cache clean some of
 835  * it out.
 836  *
 837  * No requirements.
 838  */
 839 void
 840 vm_map_entry_release(int count)
 841 {
 842         struct globaldata *gd = mycpu;
 843         vm_map_entry_t entry;
 844         vm_map_entry_t efree;
 845
 846         count = atomic_fetchadd_int(&gd->gd_vme_avail, count) + count;
 847         if (gd->gd_vme_avail > MAP_RESERVE_SLOP) {
 848                 efree = NULL;
 849                 crit_enter();
 850                 while (gd->gd_vme_avail > MAP_RESERVE_HYST) {
 851                         entry = gd->gd_vme_base;
 852                         KKASSERT(entry != NULL);
 853                         gd->gd_vme_base = entry->next;
 854                         atomic_add_int(&gd->gd_vme_avail, -1);
 855                         entry->next = efree;
 856                         efree = entry;
 857                 }
 858                 crit_exit();
 859                 while ((entry = efree) != NULL) {
 860                         efree = efree->next;
 861                         zfree(mapentzone, entry);
 862                 }
 863         }
 864 }
 865
 866 /*
 867  * Reserve map entry structures for use in kernel_map itself.  These
 868  * entries have *ALREADY* been reserved on a per-cpu basis when the map
 869  * was inited.  This function is used by zalloc() to avoid a recursion
 870  * when zalloc() itself needs to allocate additional kernel memory.
 871  *
 872  * This function works like the normal reserve but does not load the
 873  * vm_map_entry cache (because that would result in an infinite
 874  * recursion).  Note that gd_vme_avail may go negative.  This is expected.
 875  *
 876  * Any caller of this function must be sure to renormalize after
 877  * potentially eating entries to ensure that the reserve supply
 878  * remains intact.
 879  *
 880  * No requirements.
 881  */
 882 int
 883 vm_map_entry_kreserve(int count)
 884 {
 885         struct globaldata *gd = mycpu;
 886
 887         atomic_add_int(&gd->gd_vme_avail, -count);
 888         KASSERT(gd->gd_vme_base != NULL,
 889                 ("no reserved entries left, gd_vme_avail = %d",
 890                 gd->gd_vme_avail));
 891         return(count);
 892 }
 893
 894 /*
 895  * Release previously reserved map entries for kernel_map.  We do not
 896  * attempt to clean up like the normal release function as this would
 897  * cause an unnecessary (but probably not fatal) deep procedure call.
 898  *
 899  * No requirements.
 900  */
 901 void
 902 vm_map_entry_krelease(int count)
 903 {
 904         struct globaldata *gd = mycpu;
 905
 906         atomic_add_int(&gd->gd_vme_avail, count);
 907 }
 908
 909 /*
 910  * Allocates a VM map entry for insertion.  No entry fields are filled in.
 911  *
 912  * The entries should have previously been reserved.  The reservation count
 913  * is tracked in (*countp).
 914  *
 915  * No requirements.
 916  */
 917 static vm_map_entry_t
 918 vm_map_entry_create(vm_map_t map, int *countp)
 919 {
 920         struct globaldata *gd = mycpu;
 921         vm_map_entry_t entry;
 922
 923         KKASSERT(*countp > 0);
 924         --*countp;
 925         crit_enter();
 926         entry = gd->gd_vme_base;
 927         KASSERT(entry != NULL, ("gd_vme_base NULL! count %d", *countp));
 928         gd->gd_vme_base = entry->next;
 929         crit_exit();
 930
 931         return(entry);
 932 }
 933
 934 /*
 935  * Dispose of a vm_map_entry that is no longer being referenced.
 936  *
 937  * No requirements.
 938  */
 939 static void
 940 vm_map_entry_dispose(vm_map_t map, vm_map_entry_t entry, int *countp)
 941 {
 942         struct globaldata *gd = mycpu;
 943
 944         ++*countp;
 945         crit_enter();
 946         entry->next = gd->gd_vme_base;
 947         gd->gd_vme_base = entry;
 948         crit_exit();
 949 }
 950
 951
 952 /*
 953  * Insert/remove entries from maps.
 954  *
 955  * The related map must be exclusively locked.
 956  * The caller must hold map->token
 957  * No other requirements.
 958  */
 959 static __inline void
 960 vm_map_entry_link(vm_map_t map,
 961                   vm_map_entry_t after_where,
 962                   vm_map_entry_t entry)
 963 {
 964         ASSERT_VM_MAP_LOCKED(map);
 965
 966         map->nentries++;
 967         entry->prev = after_where;
 968         entry->next = after_where->next;
 969         entry->next->prev = entry;
 970         after_where->next = entry;
 971         if (vm_map_rb_tree_RB_INSERT(&map->rb_root, entry))
 972                 panic("vm_map_entry_link: dup addr map %p ent %p", map, entry);
 973 }
 974
 975 static __inline void
 976 vm_map_entry_unlink(vm_map_t map,
 977                     vm_map_entry_t entry)
 978 {
 979         vm_map_entry_t prev;
 980         vm_map_entry_t next;
 981
 982         ASSERT_VM_MAP_LOCKED(map);
 983
 984         if (entry->eflags & MAP_ENTRY_IN_TRANSITION) {
 985                 panic("vm_map_entry_unlink: attempt to mess with "
 986                       "locked entry! %p", entry);
 987         }
 988         prev = entry->prev;
 989         next = entry->next;
 990         next->prev = prev;
 991         prev->next = next;
 992         vm_map_rb_tree_RB_REMOVE(&map->rb_root, entry);
 993         map->nentries--;
 994 }
 995
 996 /*
 997  * Finds the map entry containing (or immediately preceding) the specified
 998  * address in the given map.  The entry is returned in (*entry).
 999  *
1000  * The boolean result indicates whether the address is actually contained
1001  * in the map.
1002  *
1003  * The related map must be locked.
1004  * No other requirements.
1005  */
1006 boolean_t
1007 vm_map_lookup_entry(vm_map_t map, vm_offset_t address, vm_map_entry_t *entry)
1008 {
1009         vm_map_entry_t tmp;
1010         vm_map_entry_t last;
1011
1012         ASSERT_VM_MAP_LOCKED(map);
1013
1014         /*
1015          * Locate the record from the top of the tree.  'last' tracks the
1016          * closest prior record and is returned if no match is found, which
1017          * in binary tree terms means tracking the most recent right-branch
1018          * taken.  If there is no prior record, &map->header is returned.
1019          */
1020         last = &map->header;
1021         tmp = RB_ROOT(&map->rb_root);
1022
1023         while (tmp) {
1024                 if (address >= tmp->start) {
1025                         if (address < tmp->end) {
1026                                 *entry = tmp;
1027                                 return(TRUE);
1028                         }
1029                         last = tmp;
1030                         tmp = RB_RIGHT(tmp, rb_entry);
1031                 } else {
1032                         tmp = RB_LEFT(tmp, rb_entry);
1033                 }
1034         }
1035         *entry = last;
1036         return (FALSE);
1037 }
1038
1039 /*
1040  * Inserts the given whole VM object into the target map at the specified
1041  * address range.  The object's size should match that of the address range.
1042  *
1043  * The map must be exclusively locked.
1044  * The object must be held.
1045  * The caller must have reserved sufficient vm_map_entry structures.
1046  *
1047  * If object is non-NULL, ref count must be bumped by caller prior to
1048  * making call to account for the new entry.
1049  */
1050 int
1051 vm_map_insert(vm_map_t map, int *countp, void *map_object, void *map_aux,
1052               vm_ooffset_t offset, vm_offset_t start, vm_offset_t end,
1053               vm_maptype_t maptype, vm_subsys_t id,
1054               vm_prot_t prot, vm_prot_t max, int cow)
1055 {
1056         vm_map_entry_t new_entry;
1057         vm_map_entry_t prev_entry;
1058         vm_map_entry_t temp_entry;
1059         vm_eflags_t protoeflags;
1060         int must_drop = 0;
1061         vm_object_t object;
1062
1063         if (maptype == VM_MAPTYPE_UKSMAP)
1064                 object = NULL;
1065         else
1066                 object = map_object;
1067
1068         ASSERT_VM_MAP_LOCKED(map);
1069         if (object)
1070                 ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
1071
1072         /*
1073          * Check that the start and end points are not bogus.
1074          */
1075         if ((start < map->min_offset) || (end > map->max_offset) ||
1076             (start >= end))
1077                 return (KERN_INVALID_ADDRESS);
1078
1079         /*
1080          * Find the entry prior to the proposed starting address; if it's part
1081          * of an existing entry, this range is bogus.
1082          */
1083         if (vm_map_lookup_entry(map, start, &temp_entry))
1084                 return (KERN_NO_SPACE);
1085
1086         prev_entry = temp_entry;
1087
1088         /*
1089          * Assert that the next entry doesn't overlap the end point.
1090          */
1091
1092         if ((prev_entry->next != &map->header) &&
1093             (prev_entry->next->start < end))
1094                 return (KERN_NO_SPACE);
1095
1096         protoeflags = 0;
1097
1098         if (cow & MAP_COPY_ON_WRITE)
1099                 protoeflags |= MAP_ENTRY_COW|MAP_ENTRY_NEEDS_COPY;
1100
1101         if (cow & MAP_NOFAULT) {
1102                 protoeflags |= MAP_ENTRY_NOFAULT;
1103
1104                 KASSERT(object == NULL,
1105                         ("vm_map_insert: paradoxical MAP_NOFAULT request"));
1106         }
1107         if (cow & MAP_DISABLE_SYNCER)
1108                 protoeflags |= MAP_ENTRY_NOSYNC;
1109         if (cow & MAP_DISABLE_COREDUMP)
1110                 protoeflags |= MAP_ENTRY_NOCOREDUMP;
1111         if (cow & MAP_IS_STACK)
1112                 protoeflags |= MAP_ENTRY_STACK;
1113         if (cow & MAP_IS_KSTACK)
1114                 protoeflags |= MAP_ENTRY_KSTACK;
1115
1116         lwkt_gettoken(&map->token);
1117
1118         if (object) {
1119                 /*
1120                  * When object is non-NULL, it could be shared with another
1121                  * process.  We have to set or clear OBJ_ONEMAPPING
1122                  * appropriately.
1123                  *
1124                  * NOTE: This flag is only applicable to DEFAULT and SWAP
1125                  *       objects and will already be clear in other types
1126                  *       of objects, so a shared object lock is ok for
1127                  *       VNODE objects.
1128                  */
1129                 if ((object->ref_count > 1) || (object->shadow_count != 0)) {
1130                         vm_object_clear_flag(object, OBJ_ONEMAPPING);
1131                 }
1132         }
1133         else if ((prev_entry != &map->header) &&
1134                  (prev_entry->eflags == protoeflags) &&
1135                  (prev_entry->end == start) &&
1136                  (prev_entry->wired_count == 0) &&
1137                  (prev_entry->id == id) &&
1138                  prev_entry->maptype == maptype &&
1139                  maptype == VM_MAPTYPE_NORMAL &&
1140                  ((prev_entry->object.vm_object == NULL) ||
1141                   vm_object_coalesce(prev_entry->object.vm_object,
1142                                      OFF_TO_IDX(prev_entry->offset),
1143                                      (vm_size_t)(prev_entry->end - prev_entry->start),
1144                                      (vm_size_t)(end - prev_entry->end)))) {
1145                 /*
1146                  * We were able to extend the object.  Determine if we
1147                  * can extend the previous map entry to include the
1148                  * new range as well.
1149                  */
1150                 if ((prev_entry->inheritance == VM_INHERIT_DEFAULT) &&
1151                     (prev_entry->protection == prot) &&
1152                     (prev_entry->max_protection == max)) {
1153                         map->size += (end - prev_entry->end);
1154                         prev_entry->end = end;
1155                         vm_map_simplify_entry(map, prev_entry, countp);
1156                         lwkt_reltoken(&map->token);
1157                         return (KERN_SUCCESS);
1158                 }
1159
1160                 /*
1161                  * If we can extend the object but cannot extend the
1162                  * map entry, we have to create a new map entry.  We
1163                  * must bump the ref count on the extended object to
1164                  * account for it.  object may be NULL.
1165                  *
1166                  * XXX if object is NULL should we set offset to 0 here ?
1167                  */
1168                 object = prev_entry->object.vm_object;
1169                 offset = prev_entry->offset +
1170                         (prev_entry->end - prev_entry->start);
1171                 if (object) {
1172                         vm_object_hold(object);
1173                         vm_object_chain_wait(object, 0);
1174                         vm_object_reference_locked(object);
1175                         must_drop = 1;
1176                         map_object = object;
1177                 }
1178         }
1179
1180         /*
1181          * NOTE: if conditionals fail, object can be NULL here.  This occurs
1182          * in things like the buffer map where we manage kva but do not manage
1183          * backing objects.
1184          */
1185
1186         /*
1187          * Create a new entry
1188          */
1189
1190         new_entry = vm_map_entry_create(map, countp);
1191         new_entry->start = start;
1192         new_entry->end = end;
1193         new_entry->id = id;
1194
1195         new_entry->maptype = maptype;
1196         new_entry->eflags = protoeflags;
1197         new_entry->object.map_object = map_object;
1198         new_entry->aux.master_pde = 0;          /* in case size is different */
1199         new_entry->aux.map_aux = map_aux;
1200         new_entry->offset = offset;
1201
1202         new_entry->inheritance = VM_INHERIT_DEFAULT;
1203         new_entry->protection = prot;
1204         new_entry->max_protection = max;
1205         new_entry->wired_count = 0;
1206
1207         /*
1208          * Insert the new entry into the list
1209          */
1210
1211         vm_map_entry_link(map, prev_entry, new_entry);
1212         map->size += new_entry->end - new_entry->start;
1213
1214         /*
1215          * Don't worry about updating freehint[] when inserting, allow
1216          * addresses to be lower than the actual first free spot.
1217          */
1218 #if 0
1219         /*
1220          * Temporarily removed to avoid MAP_STACK panic, due to
1221          * MAP_STACK being a huge hack.  Will be added back in
1222          * when MAP_STACK (and the user stack mapping) is fixed.
1223          */
1224         /*
1225          * It may be possible to simplify the entry
1226          */
1227         vm_map_simplify_entry(map, new_entry, countp);
1228 #endif
1229
1230         /*
1231          * Try to pre-populate the page table.  Mappings governed by virtual
1232          * page tables cannot be prepopulated without a lot of work, so
1233          * don't try.
1234          */
1235         if ((cow & (MAP_PREFAULT|MAP_PREFAULT_PARTIAL)) &&
1236             maptype != VM_MAPTYPE_VPAGETABLE &&
1237             maptype != VM_MAPTYPE_UKSMAP) {
1238                 int dorelock = 0;
1239                 if (vm_map_relock_enable && (cow & MAP_PREFAULT_RELOCK)) {
1240                         dorelock = 1;
1241                         vm_object_lock_swap();
1242                         vm_object_drop(object);
1243                 }
1244                 pmap_object_init_pt(map->pmap, start, prot,
1245                                     object, OFF_TO_IDX(offset), end - start,
1246                                     cow & MAP_PREFAULT_PARTIAL);
1247                 if (dorelock) {
1248                         vm_object_hold(object);
1249                         vm_object_lock_swap();
1250                 }
1251         }
1252         if (must_drop)
1253                 vm_object_drop(object);
1254
1255         lwkt_reltoken(&map->token);
1256         return (KERN_SUCCESS);
1257 }
1258
1259 /*
1260  * Find sufficient space for `length' bytes in the given map, starting at
1261  * `start'.  Returns 0 on success, 1 on no space.
1262  *
1263  * This function will returned an arbitrarily aligned pointer.  If no
1264  * particular alignment is required you should pass align as 1.  Note that
1265  * the map may return PAGE_SIZE aligned pointers if all the lengths used in
1266  * the map are a multiple of PAGE_SIZE, even if you pass a smaller align
1267  * argument.
1268  *
1269  * 'align' should be a power of 2 but is not required to be.
1270  *
1271  * The map must be exclusively locked.
1272  * No other requirements.
1273  */
1274 int
1275 vm_map_findspace(vm_map_t map, vm_offset_t start, vm_size_t length,
1276                  vm_size_t align, int flags, vm_offset_t *addr)
1277 {
1278         vm_map_entry_t entry, next;
1279         vm_map_entry_t tmp;
1280         vm_offset_t hole_start;
1281         vm_offset_t end;
1282         vm_offset_t align_mask;
1283
1284         if (start < map->min_offset)
1285                 start = map->min_offset;
1286         if (start > map->max_offset)
1287                 return (1);
1288
1289         /*
1290          * If the alignment is not a power of 2 we will have to use
1291          * a mod/division, set align_mask to a special value.
1292          */
1293         if ((align | (align - 1)) + 1 != (align << 1))
1294                 align_mask = (vm_offset_t)-1;
1295         else
1296                 align_mask = align - 1;
1297
1298         /*
1299          * Use freehint to adjust the start point, hopefully reducing
1300          * the iteration to O(1).
1301          */
1302         hole_start = vm_map_freehint_find(map, length, align);
1303         if (start < hole_start)
1304                 start = hole_start;
1305         if (vm_map_lookup_entry(map, start, &tmp))
1306                 start = tmp->end;
1307         entry = tmp;
1308
1309         /*
1310          * Look through the rest of the map, trying to fit a new region in the
1311          * gap between existing regions, or after the very last region.
1312          */
1313         for (;; start = (entry = next)->end) {
1314                 /*
1315                  * Adjust the proposed start by the requested alignment,
1316                  * be sure that we didn't wrap the address.
1317                  */
1318                 if (align_mask == (vm_offset_t)-1)
1319                         end = roundup(start, align);
1320                 else
1321                         end = (start + align_mask) & ~align_mask;
1322                 if (end < start)
1323                         return (1);
1324                 start = end;
1325
1326                 /*
1327                  * Find the end of the proposed new region.  Be sure we didn't
1328                  * go beyond the end of the map, or wrap around the address.
1329                  * Then check to see if this is the last entry or if the
1330                  * proposed end fits in the gap between this and the next
1331                  * entry.
1332                  */
1333                 end = start + length;
1334                 if (end > map->max_offset || end < start)
1335                         return (1);
1336                 next = entry->next;
1337
1338                 /*
1339                  * If the next entry's start address is beyond the desired
1340                  * end address we may have found a good entry.
1341                  *
1342                  * If the next entry is a stack mapping we do not map into
1343                  * the stack's reserved space.
1344                  *
1345                  * XXX continue to allow mapping into the stack's reserved
1346                  * space if doing a MAP_STACK mapping inside a MAP_STACK
1347                  * mapping, for backwards compatibility.  But the caller
1348                  * really should use MAP_STACK | MAP_TRYFIXED if they
1349                  * want to do that.
1350                  */
1351                 if (next == &map->header)
1352                         break;
1353                 if (next->start >= end) {
1354                         if ((next->eflags & MAP_ENTRY_STACK) == 0)
1355                                 break;
1356                         if (flags & MAP_STACK)
1357                                 break;
1358                         if (next->start - next->aux.avail_ssize >= end)
1359                                 break;
1360                 }
1361         }
1362
1363         /*
1364          * Update the freehint
1365          */
1366         vm_map_freehint_update(map, start, length, align);
1367
1368         /*
1369          * Grow the kernel_map if necessary.  pmap_growkernel() will panic
1370          * if it fails.  The kernel_map is locked and nothing can steal
1371          * our address space if pmap_growkernel() blocks.
1372          *
1373          * NOTE: This may be unconditionally called for kldload areas on
1374          *       x86_64 because these do not bump kernel_vm_end (which would
1375          *       fill 128G worth of page tables!).  Therefore we must not
1376          *       retry.
1377          */
1378         if (map == &kernel_map) {
1379                 vm_offset_t kstop;
1380
1381                 kstop = round_page(start + length);
1382                 if (kstop > kernel_vm_end)
1383                         pmap_growkernel(start, kstop);
1384         }
1385         *addr = start;
1386         return (0);
1387 }
1388
1389 /*
1390  * vm_map_find finds an unallocated region in the target address map with
1391  * the given length and allocates it.  The search is defined to be first-fit
1392  * from the specified address; the region found is returned in the same
1393  * parameter.
1394  *
1395  * If object is non-NULL, ref count must be bumped by caller
1396  * prior to making call to account for the new entry.
1397  *
1398  * No requirements.  This function will lock the map temporarily.
1399  */
1400 int
1401 vm_map_find(vm_map_t map, void *map_object, void *map_aux,
1402             vm_ooffset_t offset, vm_offset_t *addr,
1403             vm_size_t length, vm_size_t align, boolean_t fitit,
1404             vm_maptype_t maptype, vm_subsys_t id,
1405             vm_prot_t prot, vm_prot_t max, int cow)
1406 {
1407         vm_offset_t start;
1408         vm_object_t object;
1409         int result;
1410         int count;
1411
1412         if (maptype == VM_MAPTYPE_UKSMAP)
1413                 object = NULL;
1414         else
1415                 object = map_object;
1416
1417         start = *addr;
1418
1419         count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
1420         vm_map_lock(map);
1421         if (object)
1422                 vm_object_hold_shared(object);
1423         if (fitit) {
1424                 if (vm_map_findspace(map, start, length, align, 0, addr)) {
1425                         if (object)
1426                                 vm_object_drop(object);
1427                         vm_map_unlock(map);
1428                         vm_map_entry_release(count);
1429                         return (KERN_NO_SPACE);
1430                 }
1431                 start = *addr;
1432         }
1433         result = vm_map_insert(map, &count, map_object, map_aux,
1434                                offset, start, start + length,
1435                                maptype, id, prot, max, cow);
1436         if (object)
1437                 vm_object_drop(object);
1438         vm_map_unlock(map);
1439         vm_map_entry_release(count);
1440
1441         return (result);
1442 }
1443
1444 /*
1445  * Simplify the given map entry by merging with either neighbor.  This
1446  * routine also has the ability to merge with both neighbors.
1447  *
1448  * This routine guarentees that the passed entry remains valid (though
1449  * possibly extended).  When merging, this routine may delete one or
1450  * both neighbors.  No action is taken on entries which have their
1451  * in-transition flag set.
1452  *
1453  * The map must be exclusively locked.
1454  */
1455 void
1456 vm_map_simplify_entry(vm_map_t map, vm_map_entry_t entry, int *countp)
1457 {
1458         vm_map_entry_t next, prev;
1459         vm_size_t prevsize, esize;
1460
1461         if (entry->eflags & MAP_ENTRY_IN_TRANSITION) {
1462                 ++mycpu->gd_cnt.v_intrans_coll;
1463                 return;
1464         }
1465
1466         if (entry->maptype == VM_MAPTYPE_SUBMAP)
1467                 return;
1468         if (entry->maptype == VM_MAPTYPE_UKSMAP)
1469                 return;
1470
1471         prev = entry->prev;
1472         if (prev != &map->header) {
1473                 prevsize = prev->end - prev->start;
1474                 if ( (prev->end == entry->start) &&
1475                      (prev->maptype == entry->maptype) &&
1476                      (prev->object.vm_object == entry->object.vm_object) &&
1477                      (!prev->object.vm_object ||
1478                         (prev->offset + prevsize == entry->offset)) &&
1479                      (prev->eflags == entry->eflags) &&
1480                      (prev->protection == entry->protection) &&
1481                      (prev->max_protection == entry->max_protection) &&
1482                      (prev->inheritance == entry->inheritance) &&
1483                      (prev->id == entry->id) &&
1484                      (prev->wired_count == entry->wired_count)) {
1485                         vm_map_entry_unlink(map, prev);
1486                         entry->start = prev->start;
1487                         entry->offset = prev->offset;
1488                         if (prev->object.vm_object)
1489                                 vm_object_deallocate(prev->object.vm_object);
1490                         vm_map_entry_dispose(map, prev, countp);
1491                 }
1492         }
1493
1494         next = entry->next;
1495         if (next != &map->header) {
1496                 esize = entry->end - entry->start;
1497                 if ((entry->end == next->start) &&
1498                     (next->maptype == entry->maptype) &&
1499                     (next->object.vm_object == entry->object.vm_object) &&
1500                      (!entry->object.vm_object ||
1501                         (entry->offset + esize == next->offset)) &&
1502                     (next->eflags == entry->eflags) &&
1503                     (next->protection == entry->protection) &&
1504                     (next->max_protection == entry->max_protection) &&
1505                     (next->inheritance == entry->inheritance) &&
1506                     (next->id == entry->id) &&
1507                     (next->wired_count == entry->wired_count)) {
1508                         vm_map_entry_unlink(map, next);
1509                         entry->end = next->end;
1510                         if (next->object.vm_object)
1511                                 vm_object_deallocate(next->object.vm_object);
1512                         vm_map_entry_dispose(map, next, countp);
1513                 }
1514         }
1515 }
1516
1517 /*
1518  * Asserts that the given entry begins at or after the specified address.
1519  * If necessary, it splits the entry into two.
1520  */
1521 #define vm_map_clip_start(map, entry, startaddr, countp)                \
1522 {                                                                       \
1523         if (startaddr > entry->start)                                   \
1524                 _vm_map_clip_start(map, entry, startaddr, countp);      \
1525 }
1526
1527 /*
1528  * This routine is called only when it is known that the entry must be split.
1529  *
1530  * The map must be exclusively locked.
1531  */
1532 static void
1533 _vm_map_clip_start(vm_map_t map, vm_map_entry_t entry, vm_offset_t start,
1534                    int *countp)
1535 {
1536         vm_map_entry_t new_entry;
1537
1538         /*
1539          * Split off the front portion -- note that we must insert the new
1540          * entry BEFORE this one, so that this entry has the specified
1541          * starting address.
1542          */
1543
1544         vm_map_simplify_entry(map, entry, countp);
1545
1546         /*
1547          * If there is no object backing this entry, we might as well create
1548          * one now.  If we defer it, an object can get created after the map
1549          * is clipped, and individual objects will be created for the split-up
1550          * map.  This is a bit of a hack, but is also about the best place to
1551          * put this improvement.
1552          */
1553         if (entry->object.vm_object == NULL && !map->system_map &&
1554             VM_MAP_ENTRY_WITHIN_PARTITION(entry)) {
1555                 vm_map_entry_allocate_object(entry);
1556         }
1557
1558         new_entry = vm_map_entry_create(map, countp);
1559         *new_entry = *entry;
1560
1561         new_entry->end = start;
1562         entry->offset += (start - entry->start);
1563         entry->start = start;
1564
1565         vm_map_entry_link(map, entry->prev, new_entry);
1566
1567         switch(entry->maptype) {
1568         case VM_MAPTYPE_NORMAL:
1569         case VM_MAPTYPE_VPAGETABLE:
1570                 if (new_entry->object.vm_object) {
1571                         vm_object_hold(new_entry->object.vm_object);
1572                         vm_object_chain_wait(new_entry->object.vm_object, 0);
1573                         vm_object_reference_locked(new_entry->object.vm_object);
1574                         vm_object_drop(new_entry->object.vm_object);
1575                 }
1576                 break;
1577         default:
1578                 break;
1579         }
1580 }
1581
1582 /*
1583  * Asserts that the given entry ends at or before the specified address.
1584  * If necessary, it splits the entry into two.
1585  *
1586  * The map must be exclusively locked.
1587  */
1588 #define vm_map_clip_end(map, entry, endaddr, countp)            \
1589 {                                                               \
1590         if (endaddr < entry->end)                               \
1591                 _vm_map_clip_end(map, entry, endaddr, countp);  \
1592 }
1593
1594 /*
1595  * This routine is called only when it is known that the entry must be split.
1596  *
1597  * The map must be exclusively locked.
1598  */
1599 static void
1600 _vm_map_clip_end(vm_map_t map, vm_map_entry_t entry, vm_offset_t end,
1601                  int *countp)
1602 {
1603         vm_map_entry_t new_entry;
1604
1605         /*
1606          * If there is no object backing this entry, we might as well create
1607          * one now.  If we defer it, an object can get created after the map
1608          * is clipped, and individual objects will be created for the split-up
1609          * map.  This is a bit of a hack, but is also about the best place to
1610          * put this improvement.
1611          */
1612
1613         if (entry->object.vm_object == NULL && !map->system_map &&
1614             VM_MAP_ENTRY_WITHIN_PARTITION(entry)) {
1615                 vm_map_entry_allocate_object(entry);
1616         }
1617
1618         /*
1619          * Create a new entry and insert it AFTER the specified entry
1620          */
1621         new_entry = vm_map_entry_create(map, countp);
1622         *new_entry = *entry;
1623
1624         new_entry->start = entry->end = end;
1625         new_entry->offset += (end - entry->start);
1626
1627         vm_map_entry_link(map, entry, new_entry);
1628
1629         switch(entry->maptype) {
1630         case VM_MAPTYPE_NORMAL:
1631         case VM_MAPTYPE_VPAGETABLE:
1632                 if (new_entry->object.vm_object) {
1633                         vm_object_hold(new_entry->object.vm_object);
1634                         vm_object_chain_wait(new_entry->object.vm_object, 0);
1635                         vm_object_reference_locked(new_entry->object.vm_object);
1636                         vm_object_drop(new_entry->object.vm_object);
1637                 }
1638                 break;
1639         default:
1640                 break;
1641         }
1642 }
1643
1644 /*
1645  * Asserts that the starting and ending region addresses fall within the
1646  * valid range for the map.
1647  */
1648 #define VM_MAP_RANGE_CHECK(map, start, end)     \
1649 {                                               \
1650         if (start < vm_map_min(map))            \
1651                 start = vm_map_min(map);        \
1652         if (end > vm_map_max(map))              \
1653                 end = vm_map_max(map);          \
1654         if (start > end)                        \
1655                 start = end;                    \
1656 }
1657
1658 /*
1659  * Used to block when an in-transition collison occurs.  The map
1660  * is unlocked for the sleep and relocked before the return.
1661  */
1662 void
1663 vm_map_transition_wait(vm_map_t map, int relock)
1664 {
1665         tsleep_interlock(map, 0);
1666         vm_map_unlock(map);
1667         tsleep(map, PINTERLOCKED, "vment", 0);
1668         if (relock)
1669                 vm_map_lock(map);
1670 }
1671
1672 /*
1673  * When we do blocking operations with the map lock held it is
1674  * possible that a clip might have occured on our in-transit entry,
1675  * requiring an adjustment to the entry in our loop.  These macros
1676  * help the pageable and clip_range code deal with the case.  The
1677  * conditional costs virtually nothing if no clipping has occured.
1678  */
1679
1680 #define CLIP_CHECK_BACK(entry, save_start)              \
1681     do {                                                \
1682             while (entry->start != save_start) {        \
1683                     entry = entry->prev;                \
1684                     KASSERT(entry != &map->header, ("bad entry clip")); \
1685             }                                           \
1686     } while(0)
1687
1688 #define CLIP_CHECK_FWD(entry, save_end)                 \
1689     do {                                                \
1690             while (entry->end != save_end) {            \
1691                     entry = entry->next;                \
1692                     KASSERT(entry != &map->header, ("bad entry clip")); \
1693             }                                           \
1694     } while(0)
1695
1696
1697 /*
1698  * Clip the specified range and return the base entry.  The
1699  * range may cover several entries starting at the returned base
1700  * and the first and last entry in the covering sequence will be
1701  * properly clipped to the requested start and end address.
1702  *
1703  * If no holes are allowed you should pass the MAP_CLIP_NO_HOLES
1704  * flag.
1705  *
1706  * The MAP_ENTRY_IN_TRANSITION flag will be set for the entries
1707  * covered by the requested range.
1708  *
1709  * The map must be exclusively locked on entry and will remain locked
1710  * on return. If no range exists or the range contains holes and you
1711  * specified that no holes were allowed, NULL will be returned.  This
1712  * routine may temporarily unlock the map in order avoid a deadlock when
1713  * sleeping.
1714  */
1715 static
1716 vm_map_entry_t
1717 vm_map_clip_range(vm_map_t map, vm_offset_t start, vm_offset_t end,
1718                   int *countp, int flags)
1719 {
1720         vm_map_entry_t start_entry;
1721         vm_map_entry_t entry;
1722
1723         /*
1724          * Locate the entry and effect initial clipping.  The in-transition
1725          * case does not occur very often so do not try to optimize it.
1726          */
1727 again:
1728         if (vm_map_lookup_entry(map, start, &start_entry) == FALSE)
1729                 return (NULL);
1730         entry = start_entry;
1731         if (entry->eflags & MAP_ENTRY_IN_TRANSITION) {
1732                 entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
1733                 ++mycpu->gd_cnt.v_intrans_coll;
1734                 ++mycpu->gd_cnt.v_intrans_wait;
1735                 vm_map_transition_wait(map, 1);
1736                 /*
1737                  * entry and/or start_entry may have been clipped while
1738                  * we slept, or may have gone away entirely.  We have
1739                  * to restart from the lookup.
1740                  */
1741                 goto again;
1742         }
1743
1744         /*
1745          * Since we hold an exclusive map lock we do not have to restart
1746          * after clipping, even though clipping may block in zalloc.
1747          */
1748         vm_map_clip_start(map, entry, start, countp);
1749         vm_map_clip_end(map, entry, end, countp);
1750         entry->eflags |= MAP_ENTRY_IN_TRANSITION;
1751
1752         /*
1753          * Scan entries covered by the range.  When working on the next
1754          * entry a restart need only re-loop on the current entry which
1755          * we have already locked, since 'next' may have changed.  Also,
1756          * even though entry is safe, it may have been clipped so we
1757          * have to iterate forwards through the clip after sleeping.
1758          */
1759         while (entry->next != &map->header && entry->next->start < end) {
1760                 vm_map_entry_t next = entry->next;
1761
1762                 if (flags & MAP_CLIP_NO_HOLES) {
1763                         if (next->start > entry->end) {
1764                                 vm_map_unclip_range(map, start_entry,
1765                                         start, entry->end, countp, flags);
1766                                 return(NULL);
1767                         }
1768                 }
1769
1770                 if (next->eflags & MAP_ENTRY_IN_TRANSITION) {
1771                         vm_offset_t save_end = entry->end;
1772                         next->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
1773                         ++mycpu->gd_cnt.v_intrans_coll;
1774                         ++mycpu->gd_cnt.v_intrans_wait;
1775                         vm_map_transition_wait(map, 1);
1776
1777                         /*
1778                          * clips might have occured while we blocked.
1779                          */
1780                         CLIP_CHECK_FWD(entry, save_end);
1781                         CLIP_CHECK_BACK(start_entry, start);
1782                         continue;
1783                 }
1784
1785                 /*
1786                  * No restart necessary even though clip_end may block, we
1787                  * are holding the map lock.
1788                  */
1789                 vm_map_clip_end(map, next, end, countp);
1790                 next->eflags |= MAP_ENTRY_IN_TRANSITION;
1791                 entry = next;
1792         }
1793         if (flags & MAP_CLIP_NO_HOLES) {
1794                 if (entry->end != end) {
1795                         vm_map_unclip_range(map, start_entry,
1796                                 start, entry->end, countp, flags);
1797                         return(NULL);
1798                 }
1799         }
1800         return(start_entry);
1801 }
1802
1803 /*
1804  * Undo the effect of vm_map_clip_range().  You should pass the same
1805  * flags and the same range that you passed to vm_map_clip_range().
1806  * This code will clear the in-transition flag on the entries and
1807  * wake up anyone waiting.  This code will also simplify the sequence
1808  * and attempt to merge it with entries before and after the sequence.
1809  *
1810  * The map must be locked on entry and will remain locked on return.
1811  *
1812  * Note that you should also pass the start_entry returned by
1813  * vm_map_clip_range().  However, if you block between the two calls
1814  * with the map unlocked please be aware that the start_entry may
1815  * have been clipped and you may need to scan it backwards to find
1816  * the entry corresponding with the original start address.  You are
1817  * responsible for this, vm_map_unclip_range() expects the correct
1818  * start_entry to be passed to it and will KASSERT otherwise.
1819  */
1820 static
1821 void
1822 vm_map_unclip_range(vm_map_t map, vm_map_entry_t start_entry,
1823                     vm_offset_t start, vm_offset_t end,
1824                     int *countp, int flags)
1825 {
1826         vm_map_entry_t entry;
1827
1828         entry = start_entry;
1829
1830         KASSERT(entry->start == start, ("unclip_range: illegal base entry"));
1831         while (entry != &map->header && entry->start < end) {
1832                 KASSERT(entry->eflags & MAP_ENTRY_IN_TRANSITION,
1833                         ("in-transition flag not set during unclip on: %p",
1834                         entry));
1835                 KASSERT(entry->end <= end,
1836                         ("unclip_range: tail wasn't clipped"));
1837                 entry->eflags &= ~MAP_ENTRY_IN_TRANSITION;
1838                 if (entry->eflags & MAP_ENTRY_NEEDS_WAKEUP) {
1839                         entry->eflags &= ~MAP_ENTRY_NEEDS_WAKEUP;
1840                         wakeup(map);
1841                 }
1842                 entry = entry->next;
1843         }
1844
1845         /*
1846          * Simplification does not block so there is no restart case.
1847          */
1848         entry = start_entry;
1849         while (entry != &map->header && entry->start < end) {
1850                 vm_map_simplify_entry(map, entry, countp);
1851                 entry = entry->next;
1852         }
1853 }
1854
1855 /*
1856  * Mark the given range as handled by a subordinate map.
1857  *
1858  * This range must have been created with vm_map_find(), and no other
1859  * operations may have been performed on this range prior to calling
1860  * vm_map_submap().
1861  *
1862  * Submappings cannot be removed.
1863  *
1864  * No requirements.
1865  */
1866 int
1867 vm_map_submap(vm_map_t map, vm_offset_t start, vm_offset_t end, vm_map_t submap)
1868 {
1869         vm_map_entry_t entry;
1870         int result = KERN_INVALID_ARGUMENT;
1871         int count;
1872
1873         count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
1874         vm_map_lock(map);
1875
1876         VM_MAP_RANGE_CHECK(map, start, end);
1877
1878         if (vm_map_lookup_entry(map, start, &entry)) {
1879                 vm_map_clip_start(map, entry, start, &count);
1880         } else {
1881                 entry = entry->next;
1882         }
1883
1884         vm_map_clip_end(map, entry, end, &count);
1885
1886         if ((entry->start == start) && (entry->end == end) &&
1887             ((entry->eflags & MAP_ENTRY_COW) == 0) &&
1888             (entry->object.vm_object == NULL)) {
1889                 entry->object.sub_map = submap;
1890                 entry->maptype = VM_MAPTYPE_SUBMAP;
1891                 result = KERN_SUCCESS;
1892         }
1893         vm_map_unlock(map);
1894         vm_map_entry_release(count);
1895
1896         return (result);
1897 }
1898
1899 /*
1900  * Sets the protection of the specified address region in the target map.
1901  * If "set_max" is specified, the maximum protection is to be set;
1902  * otherwise, only the current protection is affected.
1903  *
1904  * The protection is not applicable to submaps, but is applicable to normal
1905  * maps and maps governed by virtual page tables.  For example, when operating
1906  * on a virtual page table our protection basically controls how COW occurs
1907  * on the backing object, whereas the virtual page table abstraction itself
1908  * is an abstraction for userland.
1909  *
1910  * No requirements.
1911  */
1912 int
1913 vm_map_protect(vm_map_t map, vm_offset_t start, vm_offset_t end,
1914                vm_prot_t new_prot, boolean_t set_max)
1915 {
1916         vm_map_entry_t current;
1917         vm_map_entry_t entry;
1918         int count;
1919
1920         count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
1921         vm_map_lock(map);
1922
1923         VM_MAP_RANGE_CHECK(map, start, end);
1924
1925         if (vm_map_lookup_entry(map, start, &entry)) {
1926                 vm_map_clip_start(map, entry, start, &count);
1927         } else {
1928                 entry = entry->next;
1929         }
1930
1931         /*
1932          * Make a first pass to check for protection violations.
1933          */
1934         current = entry;
1935         while ((current != &map->header) && (current->start < end)) {
1936                 if (current->maptype == VM_MAPTYPE_SUBMAP) {
1937                         vm_map_unlock(map);
1938                         vm_map_entry_release(count);
1939                         return (KERN_INVALID_ARGUMENT);
1940                 }
1941                 if ((new_prot & current->max_protection) != new_prot) {
1942                         vm_map_unlock(map);
1943                         vm_map_entry_release(count);
1944                         return (KERN_PROTECTION_FAILURE);
1945                 }
1946
1947                 /*
1948                  * When making a SHARED+RW file mmap writable, update
1949                  * v_lastwrite_ts.
1950                  */
1951                 if (new_prot & PROT_WRITE &&
1952                     (current->eflags & MAP_ENTRY_NEEDS_COPY) == 0 &&
1953                     (current->maptype == VM_MAPTYPE_NORMAL ||
1954                      current->maptype == VM_MAPTYPE_VPAGETABLE) &&
1955                     current->object.vm_object &&
1956                     current->object.vm_object->type == OBJT_VNODE) {
1957                         struct vnode *vp;
1958
1959                         vp = current->object.vm_object->handle;
1960                         if (vp && vn_lock(vp, LK_EXCLUSIVE | LK_RETRY | LK_NOWAIT) == 0) {
1961                                 vfs_timestamp(&vp->v_lastwrite_ts);
1962                                 vsetflags(vp, VLASTWRITETS);
1963                                 vn_unlock(vp);
1964                         }
1965                 }
1966                 current = current->next;
1967         }
1968
1969         /*
1970          * Go back and fix up protections. [Note that clipping is not
1971          * necessary the second time.]
1972          */
1973         current = entry;
1974
1975         while ((current != &map->header) && (current->start < end)) {
1976                 vm_prot_t old_prot;
1977
1978                 vm_map_clip_end(map, current, end, &count);
1979
1980                 old_prot = current->protection;
1981                 if (set_max) {
1982                         current->max_protection = new_prot;
1983                         current->protection = new_prot & old_prot;
1984                 } else {
1985                         current->protection = new_prot;
1986                 }
1987
1988                 /*
1989                  * Update physical map if necessary. Worry about copy-on-write
1990                  * here -- CHECK THIS XXX
1991                  */
1992                 if (current->protection != old_prot) {
1993 #define MASK(entry)     (((entry)->eflags & MAP_ENTRY_COW) ? ~VM_PROT_WRITE : \
1994                                                         VM_PROT_ALL)
1995
1996                         pmap_protect(map->pmap, current->start,
1997                             current->end,
1998                             current->protection & MASK(current));
1999 #undef  MASK
2000                 }
2001
2002                 vm_map_simplify_entry(map, current, &count);
2003
2004                 current = current->next;
2005         }
2006         vm_map_unlock(map);
2007         vm_map_entry_release(count);
2008         return (KERN_SUCCESS);
2009 }
2010
2011 /*
2012  * This routine traverses a processes map handling the madvise
2013  * system call.  Advisories are classified as either those effecting
2014  * the vm_map_entry structure, or those effecting the underlying
2015  * objects.
2016  *
2017  * The <value> argument is used for extended madvise calls.
2018  *
2019  * No requirements.
2020  */
2021 int
2022 vm_map_madvise(vm_map_t map, vm_offset_t start, vm_offset_t end,
2023                int behav, off_t value)
2024 {
2025         vm_map_entry_t current, entry;
2026         int modify_map = 0;
2027         int error = 0;
2028         int count;
2029
2030         /*
2031          * Some madvise calls directly modify the vm_map_entry, in which case
2032          * we need to use an exclusive lock on the map and we need to perform
2033          * various clipping operations.  Otherwise we only need a read-lock
2034          * on the map.
2035          */
2036         count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
2037
2038         switch(behav) {
2039         case MADV_NORMAL:
2040         case MADV_SEQUENTIAL:
2041         case MADV_RANDOM:
2042         case MADV_NOSYNC:
2043         case MADV_AUTOSYNC:
2044         case MADV_NOCORE:
2045         case MADV_CORE:
2046         case MADV_SETMAP:
2047                 modify_map = 1;
2048                 vm_map_lock(map);
2049                 break;
2050         case MADV_INVAL:
2051         case MADV_WILLNEED:
2052         case MADV_DONTNEED:
2053         case MADV_FREE:
2054                 vm_map_lock_read(map);
2055                 break;
2056         default:
2057                 vm_map_entry_release(count);
2058                 return (EINVAL);
2059         }
2060
2061         /*
2062          * Locate starting entry and clip if necessary.
2063          */
2064
2065         VM_MAP_RANGE_CHECK(map, start, end);
2066
2067         if (vm_map_lookup_entry(map, start, &entry)) {
2068                 if (modify_map)
2069                         vm_map_clip_start(map, entry, start, &count);
2070         } else {
2071                 entry = entry->next;
2072         }
2073
2074         if (modify_map) {
2075                 /*
2076                  * madvise behaviors that are implemented in the vm_map_entry.
2077                  *
2078                  * We clip the vm_map_entry so that behavioral changes are
2079                  * limited to the specified address range.
2080                  */
2081                 for (current = entry;
2082                      (current != &map->header) && (current->start < end);
2083                      current = current->next
2084                 ) {
2085                         if (current->maptype == VM_MAPTYPE_SUBMAP)
2086                                 continue;
2087
2088                         vm_map_clip_end(map, current, end, &count);
2089
2090                         switch (behav) {
2091                         case MADV_NORMAL:
2092                                 vm_map_entry_set_behavior(current, MAP_ENTRY_BEHAV_NORMAL);
2093                                 break;
2094                         case MADV_SEQUENTIAL:
2095                                 vm_map_entry_set_behavior(current, MAP_ENTRY_BEHAV_SEQUENTIAL);
2096                                 break;
2097                         case MADV_RANDOM:
2098                                 vm_map_entry_set_behavior(current, MAP_ENTRY_BEHAV_RANDOM);
2099                                 break;
2100                         case MADV_NOSYNC:
2101                                 current->eflags |= MAP_ENTRY_NOSYNC;
2102                                 break;
2103                         case MADV_AUTOSYNC:
2104                                 current->eflags &= ~MAP_ENTRY_NOSYNC;
2105                                 break;
2106                         case MADV_NOCORE:
2107                                 current->eflags |= MAP_ENTRY_NOCOREDUMP;
2108                                 break;
2109                         case MADV_CORE:
2110                                 current->eflags &= ~MAP_ENTRY_NOCOREDUMP;
2111                                 break;
2112                         case MADV_SETMAP:
2113                                 /*
2114                                  * Set the page directory page for a map
2115                                  * governed by a virtual page table.  Mark
2116                                  * the entry as being governed by a virtual
2117                                  * page table if it is not.
2118                                  *
2119                                  * XXX the page directory page is stored
2120                                  * in the avail_ssize field if the map_entry.
2121                                  *
2122                                  * XXX the map simplification code does not
2123                                  * compare this field so weird things may
2124                                  * happen if you do not apply this function
2125                                  * to the entire mapping governed by the
2126                                  * virtual page table.
2127                                  */
2128                                 if (current->maptype != VM_MAPTYPE_VPAGETABLE) {
2129                                         error = EINVAL;
2130                                         break;
2131                                 }
2132                                 current->aux.master_pde = value;
2133                                 pmap_remove(map->pmap,
2134                                             current->start, current->end);
2135                                 break;
2136                         case MADV_INVAL:
2137                                 /*
2138                                  * Invalidate the related pmap entries, used
2139                                  * to flush portions of the real kernel's
2140                                  * pmap when the caller has removed or
2141                                  * modified existing mappings in a virtual
2142                                  * page table.
2143                                  *
2144                                  * (exclusive locked map version does not
2145                                  * need the range interlock).
2146                                  */
2147                                 pmap_remove(map->pmap,
2148                                             current->start, current->end);
2149                                 break;
2150                         default:
2151                                 error = EINVAL;
2152                                 break;
2153                         }
2154                         vm_map_simplify_entry(map, current, &count);
2155                 }
2156                 vm_map_unlock(map);
2157         } else {
2158                 vm_pindex_t pindex;
2159                 vm_pindex_t delta;
2160
2161                 /*
2162                  * madvise behaviors that are implemented in the underlying
2163                  * vm_object.
2164                  *
2165                  * Since we don't clip the vm_map_entry, we have to clip
2166                  * the vm_object pindex and count.
2167                  *
2168                  * NOTE!  These functions are only supported on normal maps,
2169                  *        except MADV_INVAL which is also supported on
2170                  *        virtual page tables.
2171                  */
2172                 for (current = entry;
2173                      (current != &map->header) && (current->start < end);
2174                      current = current->next
2175                 ) {
2176                         vm_offset_t useStart;
2177
2178                         if (current->maptype != VM_MAPTYPE_NORMAL &&
2179                             (current->maptype != VM_MAPTYPE_VPAGETABLE ||
2180                              behav != MADV_INVAL)) {
2181                                 continue;
2182                         }
2183
2184                         pindex = OFF_TO_IDX(current->offset);
2185                         delta = atop(current->end - current->start);
2186                         useStart = current->start;
2187
2188                         if (current->start < start) {
2189                                 pindex += atop(start - current->start);
2190                                 delta -= atop(start - current->start);
2191                                 useStart = start;
2192                         }
2193                         if (current->end > end)
2194                                 delta -= atop(current->end - end);
2195
2196                         if ((vm_spindex_t)delta <= 0)
2197                                 continue;
2198
2199                         if (behav == MADV_INVAL) {
2200                                 /*
2201                                  * Invalidate the related pmap entries, used
2202                                  * to flush portions of the real kernel's
2203                                  * pmap when the caller has removed or
2204                                  * modified existing mappings in a virtual
2205                                  * page table.
2206                                  *
2207                                  * (shared locked map version needs the
2208                                  * interlock, see vm_fault()).
2209                                  */
2210                                 struct vm_map_ilock ilock;
2211
2212                                 KASSERT(useStart >= VM_MIN_USER_ADDRESS &&
2213                                             useStart + ptoa(delta) <=
2214                                             VM_MAX_USER_ADDRESS,
2215                                          ("Bad range %016jx-%016jx (%016jx)",
2216                                          useStart, useStart + ptoa(delta),
2217                                          delta));
2218                                 vm_map_interlock(map, &ilock,
2219                                                  useStart,
2220                                                  useStart + ptoa(delta));
2221                                 pmap_remove(map->pmap,
2222                                             useStart,
2223                                             useStart + ptoa(delta));
2224                                 vm_map_deinterlock(map, &ilock);
2225                         } else {
2226                                 vm_object_madvise(current->object.vm_object,
2227                                                   pindex, delta, behav);
2228                         }
2229
2230                         /*
2231                          * Try to populate the page table.  Mappings governed
2232                          * by virtual page tables cannot be pre-populated
2233                          * without a lot of work so don't try.
2234                          */
2235                         if (behav == MADV_WILLNEED &&
2236                             current->maptype != VM_MAPTYPE_VPAGETABLE) {
2237                                 pmap_object_init_pt(
2238                                     map->pmap,
2239                                     useStart,
2240                                     current->protection,
2241                                     current->object.vm_object,
2242                                     pindex,
2243                                     (count << PAGE_SHIFT),
2244                                     MAP_PREFAULT_MADVISE
2245                                 );
2246                         }
2247                 }
2248                 vm_map_unlock_read(map);
2249         }
2250         vm_map_entry_release(count);
2251         return(error);
2252 }
2253
2254
2255 /*
2256  * Sets the inheritance of the specified address range in the target map.
2257  * Inheritance affects how the map will be shared with child maps at the
2258  * time of vm_map_fork.
2259  */
2260 int
2261 vm_map_inherit(vm_map_t map, vm_offset_t start, vm_offset_t end,
2262                vm_inherit_t new_inheritance)
2263 {
2264         vm_map_entry_t entry;
2265         vm_map_entry_t temp_entry;
2266         int count;
2267
2268         switch (new_inheritance) {
2269         case VM_INHERIT_NONE:
2270         case VM_INHERIT_COPY:
2271         case VM_INHERIT_SHARE:
2272                 break;
2273         default:
2274                 return (KERN_INVALID_ARGUMENT);
2275         }
2276
2277         count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
2278         vm_map_lock(map);
2279
2280         VM_MAP_RANGE_CHECK(map, start, end);
2281
2282         if (vm_map_lookup_entry(map, start, &temp_entry)) {
2283                 entry = temp_entry;
2284                 vm_map_clip_start(map, entry, start, &count);
2285         } else
2286                 entry = temp_entry->next;
2287
2288         while ((entry != &map->header) && (entry->start < end)) {
2289                 vm_map_clip_end(map, entry, end, &count);
2290
2291                 entry->inheritance = new_inheritance;
2292
2293                 vm_map_simplify_entry(map, entry, &count);
2294
2295                 entry = entry->next;
2296         }
2297         vm_map_unlock(map);
2298         vm_map_entry_release(count);
2299         return (KERN_SUCCESS);
2300 }
2301
2302 /*
2303  * Implement the semantics of mlock
2304  */
2305 int
2306 vm_map_unwire(vm_map_t map, vm_offset_t start, vm_offset_t real_end,
2307               boolean_t new_pageable)
2308 {
2309         vm_map_entry_t entry;
2310         vm_map_entry_t start_entry;
2311         vm_offset_t end;
2312         int rv = KERN_SUCCESS;
2313         int count;
2314
2315         count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
2316         vm_map_lock(map);
2317         VM_MAP_RANGE_CHECK(map, start, real_end);
2318         end = real_end;
2319
2320         start_entry = vm_map_clip_range(map, start, end, &count,
2321                                         MAP_CLIP_NO_HOLES);
2322         if (start_entry == NULL) {
2323                 vm_map_unlock(map);
2324                 vm_map_entry_release(count);
2325                 return (KERN_INVALID_ADDRESS);
2326         }
2327
2328         if (new_pageable == 0) {
2329                 entry = start_entry;
2330                 while ((entry != &map->header) && (entry->start < end)) {
2331                         vm_offset_t save_start;
2332                         vm_offset_t save_end;
2333
2334                         /*
2335                          * Already user wired or hard wired (trivial cases)
2336                          */
2337                         if (entry->eflags & MAP_ENTRY_USER_WIRED) {
2338                                 entry = entry->next;
2339                                 continue;
2340                         }
2341                         if (entry->wired_count != 0) {
2342                                 entry->wired_count++;
2343                                 entry->eflags |= MAP_ENTRY_USER_WIRED;
2344                                 entry = entry->next;
2345                                 continue;
2346                         }
2347
2348                         /*
2349                          * A new wiring requires instantiation of appropriate
2350                          * management structures and the faulting in of the
2351                          * page.
2352                          */
2353                         if (entry->maptype == VM_MAPTYPE_NORMAL ||
2354                             entry->maptype == VM_MAPTYPE_VPAGETABLE) {
2355                                 int copyflag = entry->eflags &
2356                                                MAP_ENTRY_NEEDS_COPY;
2357                                 if (copyflag && ((entry->protection &
2358                                                   VM_PROT_WRITE) != 0)) {
2359                                         vm_map_entry_shadow(entry, 0);
2360                                 } else if (entry->object.vm_object == NULL &&
2361                                            !map->system_map) {
2362                                         vm_map_entry_allocate_object(entry);
2363                                 }
2364                         }
2365                         entry->wired_count++;
2366                         entry->eflags |= MAP_ENTRY_USER_WIRED;
2367
2368                         /*
2369                          * Now fault in the area.  Note that vm_fault_wire()
2370                          * may release the map lock temporarily, it will be
2371                          * relocked on return.  The in-transition
2372                          * flag protects the entries.
2373                          */
2374                         save_start = entry->start;
2375                         save_end = entry->end;
2376                         rv = vm_fault_wire(map, entry, TRUE, 0);
2377                         if (rv) {
2378                                 CLIP_CHECK_BACK(entry, save_start);
2379                                 for (;;) {
2380                                         KASSERT(entry->wired_count == 1, ("bad wired_count on entry"));
2381                                         entry->eflags &= ~MAP_ENTRY_USER_WIRED;
2382                                         entry->wired_count = 0;
2383                                         if (entry->end == save_end)
2384                                                 break;
2385                                         entry = entry->next;
2386                                         KASSERT(entry != &map->header, ("bad entry clip during backout"));
2387                                 }
2388                                 end = save_start;       /* unwire the rest */
2389                                 break;
2390                         }
2391                         /*
2392                          * note that even though the entry might have been
2393                          * clipped, the USER_WIRED flag we set prevents
2394                          * duplication so we do not have to do a
2395                          * clip check.
2396                          */
2397                         entry = entry->next;
2398                 }
2399
2400                 /*
2401                  * If we failed fall through to the unwiring section to
2402                  * unwire what we had wired so far.  'end' has already
2403                  * been adjusted.
2404                  */
2405                 if (rv)
2406                         new_pageable = 1;
2407
2408                 /*
2409                  * start_entry might have been clipped if we unlocked the
2410                  * map and blocked.  No matter how clipped it has gotten
2411                  * there should be a fragment that is on our start boundary.
2412                  */
2413                 CLIP_CHECK_BACK(start_entry, start);
2414         }
2415
2416         /*
2417          * Deal with the unwiring case.
2418          */
2419         if (new_pageable) {
2420                 /*
2421                  * This is the unwiring case.  We must first ensure that the
2422                  * range to be unwired is really wired down.  We know there
2423                  * are no holes.
2424                  */
2425                 entry = start_entry;
2426                 while ((entry != &map->header) && (entry->start < end)) {
2427                         if ((entry->eflags & MAP_ENTRY_USER_WIRED) == 0) {
2428                                 rv = KERN_INVALID_ARGUMENT;
2429                                 goto done;
2430                         }
2431                         KASSERT(entry->wired_count != 0, ("wired count was 0 with USER_WIRED set! %p", entry));
2432                         entry = entry->next;
2433                 }
2434
2435                 /*
2436                  * Now decrement the wiring count for each region. If a region
2437                  * becomes completely unwired, unwire its physical pages and
2438                  * mappings.
2439                  */
2440                 /*
2441                  * The map entries are processed in a loop, checking to
2442                  * make sure the entry is wired and asserting it has a wired
2443                  * count. However, another loop was inserted more-or-less in
2444                  * the middle of the unwiring path. This loop picks up the
2445                  * "entry" loop variable from the first loop without first
2446                  * setting it to start_entry. Naturally, the secound loop
2447                  * is never entered and the pages backing the entries are
2448                  * never unwired. This can lead to a leak of wired pages.
2449                  */
2450                 entry = start_entry;
2451                 while ((entry != &map->header) && (entry->start < end)) {
2452                         KASSERT(entry->eflags & MAP_ENTRY_USER_WIRED,
2453                                 ("expected USER_WIRED on entry %p", entry));
2454                         entry->eflags &= ~MAP_ENTRY_USER_WIRED;
2455                         entry->wired_count--;
2456                         if (entry->wired_count == 0)
2457                                 vm_fault_unwire(map, entry);
2458                         entry = entry->next;
2459                 }
2460         }
2461 done:
2462         vm_map_unclip_range(map, start_entry, start, real_end, &count,
2463                 MAP_CLIP_NO_HOLES);
2464         vm_map_unlock(map);
2465         vm_map_entry_release(count);
2466
2467         return (rv);
2468 }
2469
2470 /*
2471  * Sets the pageability of the specified address range in the target map.
2472  * Regions specified as not pageable require locked-down physical
2473  * memory and physical page maps.
2474  *
2475  * The map must not be locked, but a reference must remain to the map
2476  * throughout the call.
2477  *
2478  * This function may be called via the zalloc path and must properly
2479  * reserve map entries for kernel_map.
2480  *
2481  * No requirements.
2482  */
2483 int
2484 vm_map_wire(vm_map_t map, vm_offset_t start, vm_offset_t real_end, int kmflags)
2485 {
2486         vm_map_entry_t entry;
2487         vm_map_entry_t start_entry;
2488         vm_offset_t end;
2489         int rv = KERN_SUCCESS;
2490         int count;
2491
2492         if (kmflags & KM_KRESERVE)
2493                 count = vm_map_entry_kreserve(MAP_RESERVE_COUNT);
2494         else
2495                 count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
2496         vm_map_lock(map);
2497         VM_MAP_RANGE_CHECK(map, start, real_end);
2498         end = real_end;
2499
2500         start_entry = vm_map_clip_range(map, start, end, &count,
2501                                         MAP_CLIP_NO_HOLES);
2502         if (start_entry == NULL) {
2503                 vm_map_unlock(map);
2504                 rv = KERN_INVALID_ADDRESS;
2505                 goto failure;
2506         }
2507         if ((kmflags & KM_PAGEABLE) == 0) {
2508                 /*
2509                  * Wiring.
2510                  *
2511                  * 1.  Holding the write lock, we create any shadow or zero-fill
2512                  * objects that need to be created. Then we clip each map
2513                  * entry to the region to be wired and increment its wiring
2514                  * count.  We create objects before clipping the map entries
2515                  * to avoid object proliferation.
2516                  *
2517                  * 2.  We downgrade to a read lock, and call vm_fault_wire to
2518                  * fault in the pages for any newly wired area (wired_count is
2519                  * 1).
2520                  *
2521                  * Downgrading to a read lock for vm_fault_wire avoids a
2522                  * possible deadlock with another process that may have faulted
2523                  * on one of the pages to be wired (it would mark the page busy,
2524                  * blocking us, then in turn block on the map lock that we
2525                  * hold).  Because of problems in the recursive lock package,
2526                  * we cannot upgrade to a write lock in vm_map_lookup.  Thus,
2527                  * any actions that require the write lock must be done
2528                  * beforehand.  Because we keep the read lock on the map, the
2529                  * copy-on-write status of the entries we modify here cannot
2530                  * change.
2531                  */
2532                 entry = start_entry;
2533                 while ((entry != &map->header) && (entry->start < end)) {
2534                         /*
2535                          * Trivial case if the entry is already wired
2536                          */
2537                         if (entry->wired_count) {
2538                                 entry->wired_count++;
2539                                 entry = entry->next;
2540                                 continue;
2541                         }
2542
2543                         /*
2544                          * The entry is being newly wired, we have to setup
2545                          * appropriate management structures.  A shadow
2546                          * object is required for a copy-on-write region,
2547                          * or a normal object for a zero-fill region.  We
2548                          * do not have to do this for entries that point to sub
2549                          * maps because we won't hold the lock on the sub map.
2550                          */
2551                         if (entry->maptype == VM_MAPTYPE_NORMAL ||
2552                             entry->maptype == VM_MAPTYPE_VPAGETABLE) {
2553                                 int copyflag = entry->eflags &
2554                                                MAP_ENTRY_NEEDS_COPY;
2555                                 if (copyflag && ((entry->protection &
2556                                                   VM_PROT_WRITE) != 0)) {
2557                                         vm_map_entry_shadow(entry, 0);
2558                                 } else if (entry->object.vm_object == NULL &&
2559                                            !map->system_map) {
2560                                         vm_map_entry_allocate_object(entry);
2561                                 }
2562                         }
2563
2564                         entry->wired_count++;
2565                         entry = entry->next;
2566                 }
2567
2568                 /*
2569                  * Pass 2.
2570                  */
2571
2572                 /*
2573                  * HACK HACK HACK HACK
2574                  *
2575                  * vm_fault_wire() temporarily unlocks the map to avoid
2576                  * deadlocks.  The in-transition flag from vm_map_clip_range
2577                  * call should protect us from changes while the map is
2578                  * unlocked.  T
2579                  *
2580                  * NOTE: Previously this comment stated that clipping might
2581                  *       still occur while the entry is unlocked, but from
2582                  *       what I can tell it actually cannot.
2583                  *
2584                  *       It is unclear whether the CLIP_CHECK_*() calls
2585                  *       are still needed but we keep them in anyway.
2586                  *
2587                  * HACK HACK HACK HACK
2588                  */
2589
2590                 entry = start_entry;
2591                 while (entry != &map->header && entry->start < end) {
2592                         /*
2593                          * If vm_fault_wire fails for any page we need to undo
2594                          * what has been done.  We decrement the wiring count
2595                          * for those pages which have not yet been wired (now)
2596                          * and unwire those that have (later).
2597                          */
2598                         vm_offset_t save_start = entry->start;
2599                         vm_offset_t save_end = entry->end;
2600
2601                         if (entry->wired_count == 1)
2602                                 rv = vm_fault_wire(map, entry, FALSE, kmflags);
2603                         if (rv) {
2604                                 CLIP_CHECK_BACK(entry, save_start);
2605                                 for (;;) {
2606                                         KASSERT(entry->wired_count == 1, ("wired_count changed unexpectedly"));
2607                                         entry->wired_count = 0;
2608                                         if (entry->end == save_end)
2609                                                 break;
2610                                         entry = entry->next;
2611                                         KASSERT(entry != &map->header, ("bad entry clip during backout"));
2612                                 }
2613                                 end = save_start;
2614                                 break;
2615                         }
2616                         CLIP_CHECK_FWD(entry, save_end);
2617                         entry = entry->next;
2618                 }
2619
2620                 /*
2621                  * If a failure occured undo everything by falling through
2622                  * to the unwiring code.  'end' has already been adjusted
2623                  * appropriately.
2624                  */
2625                 if (rv)
2626                         kmflags |= KM_PAGEABLE;
2627
2628                 /*
2629                  * start_entry is still IN_TRANSITION but may have been
2630                  * clipped since vm_fault_wire() unlocks and relocks the
2631                  * map.  No matter how clipped it has gotten there should
2632                  * be a fragment that is on our start boundary.
2633                  */
2634                 CLIP_CHECK_BACK(start_entry, start);
2635         }
2636
2637         if (kmflags & KM_PAGEABLE) {
2638                 /*
2639                  * This is the unwiring case.  We must first ensure that the
2640                  * range to be unwired is really wired down.  We know there
2641                  * are no holes.
2642                  */
2643                 entry = start_entry;
2644                 while ((entry != &map->header) && (entry->start < end)) {
2645                         if (entry->wired_count == 0) {
2646                                 rv = KERN_INVALID_ARGUMENT;
2647                                 goto done;
2648                         }
2649                         entry = entry->next;
2650                 }
2651
2652                 /*
2653                  * Now decrement the wiring count for each region. If a region
2654                  * becomes completely unwired, unwire its physical pages and
2655                  * mappings.
2656                  */
2657                 entry = start_entry;
2658                 while ((entry != &map->header) && (entry->start < end)) {
2659                         entry->wired_count--;
2660                         if (entry->wired_count == 0)
2661                                 vm_fault_unwire(map, entry);
2662                         entry = entry->next;
2663                 }
2664         }
2665 done:
2666         vm_map_unclip_range(map, start_entry, start, real_end,
2667                             &count, MAP_CLIP_NO_HOLES);
2668         vm_map_unlock(map);
2669 failure:
2670         if (kmflags & KM_KRESERVE)
2671                 vm_map_entry_krelease(count);
2672         else
2673                 vm_map_entry_release(count);
2674         return (rv);
2675 }
2676
2677 /*
2678  * Mark a newly allocated address range as wired but do not fault in
2679  * the pages.  The caller is expected to load the pages into the object.
2680  *
2681  * The map must be locked on entry and will remain locked on return.
2682  * No other requirements.
2683  */
2684 void
2685 vm_map_set_wired_quick(vm_map_t map, vm_offset_t addr, vm_size_t size,
2686                        int *countp)
2687 {
2688         vm_map_entry_t scan;
2689         vm_map_entry_t entry;
2690
2691         entry = vm_map_clip_range(map, addr, addr + size,
2692                                   countp, MAP_CLIP_NO_HOLES);
2693         for (scan = entry;
2694              scan != &map->header && scan->start < addr + size;
2695              scan = scan->next) {
2696             KKASSERT(scan->wired_count == 0);
2697             scan->wired_count = 1;
2698         }
2699         vm_map_unclip_range(map, entry, addr, addr + size,
2700                             countp, MAP_CLIP_NO_HOLES);
2701 }
2702
2703 /*
2704  * Push any dirty cached pages in the address range to their pager.
2705  * If syncio is TRUE, dirty pages are written synchronously.
2706  * If invalidate is TRUE, any cached pages are freed as well.
2707  *
2708  * This routine is called by sys_msync()
2709  *
2710  * Returns an error if any part of the specified range is not mapped.
2711  *
2712  * No requirements.
2713  */
2714 int
2715 vm_map_clean(vm_map_t map, vm_offset_t start, vm_offset_t end,
2716              boolean_t syncio, boolean_t invalidate)
2717 {
2718         vm_map_entry_t current;
2719         vm_map_entry_t entry;
2720         vm_size_t size;
2721         vm_object_t object;
2722         vm_object_t tobj;
2723         vm_ooffset_t offset;
2724
2725         vm_map_lock_read(map);
2726         VM_MAP_RANGE_CHECK(map, start, end);
2727         if (!vm_map_lookup_entry(map, start, &entry)) {
2728                 vm_map_unlock_read(map);
2729                 return (KERN_INVALID_ADDRESS);
2730         }
2731         lwkt_gettoken(&map->token);
2732
2733         /*
2734          * Make a first pass to check for holes.
2735          */
2736         for (current = entry; current->start < end; current = current->next) {
2737                 if (current->maptype == VM_MAPTYPE_SUBMAP) {
2738                         lwkt_reltoken(&map->token);
2739                         vm_map_unlock_read(map);
2740                         return (KERN_INVALID_ARGUMENT);
2741                 }
2742                 if (end > current->end &&
2743                     (current->next == &map->header ||
2744                         current->end != current->next->start)) {
2745                         lwkt_reltoken(&map->token);
2746                         vm_map_unlock_read(map);
2747                         return (KERN_INVALID_ADDRESS);
2748                 }
2749         }
2750
2751         if (invalidate)
2752                 pmap_remove(vm_map_pmap(map), start, end);
2753
2754         /*
2755          * Make a second pass, cleaning/uncaching pages from the indicated
2756          * objects as we go.
2757          */
2758         for (current = entry; current->start < end; current = current->next) {
2759                 offset = current->offset + (start - current->start);
2760                 size = (end <= current->end ? end : current->end) - start;
2761
2762                 switch(current->maptype) {
2763                 case VM_MAPTYPE_SUBMAP:
2764                 {
2765                         vm_map_t smap;
2766                         vm_map_entry_t tentry;
2767                         vm_size_t tsize;
2768
2769                         smap = current->object.sub_map;
2770                         vm_map_lock_read(smap);
2771                         vm_map_lookup_entry(smap, offset, &tentry);
2772                         tsize = tentry->end - offset;
2773                         if (tsize < size)
2774                                 size = tsize;
2775                         object = tentry->object.vm_object;
2776                         offset = tentry->offset + (offset - tentry->start);
2777                         vm_map_unlock_read(smap);
2778                         break;
2779                 }
2780                 case VM_MAPTYPE_NORMAL:
2781                 case VM_MAPTYPE_VPAGETABLE:
2782                         object = current->object.vm_object;
2783                         break;
2784                 default:
2785                         object = NULL;
2786                         break;
2787                 }
2788
2789                 if (object)
2790                         vm_object_hold(object);
2791
2792                 /*
2793                  * Note that there is absolutely no sense in writing out
2794                  * anonymous objects, so we track down the vnode object
2795                  * to write out.
2796                  * We invalidate (remove) all pages from the address space
2797                  * anyway, for semantic correctness.
2798                  *
2799                  * note: certain anonymous maps, such as MAP_NOSYNC maps,
2800                  * may start out with a NULL object.
2801                  */
2802                 while (object && (tobj = object->backing_object) != NULL) {
2803                         vm_object_hold(tobj);
2804                         if (tobj == object->backing_object) {
2805                                 vm_object_lock_swap();
2806                                 offset += object->backing_object_offset;
2807                                 vm_object_drop(object);
2808                                 object = tobj;
2809                                 if (object->size < OFF_TO_IDX(offset + size))
2810                                         size = IDX_TO_OFF(object->size) -
2811                                                offset;
2812                                 break;
2813                         }
2814                         vm_object_drop(tobj);
2815                 }
2816                 if (object && (object->type == OBJT_VNODE) &&
2817                     (current->protection & VM_PROT_WRITE) &&
2818                     (object->flags & OBJ_NOMSYNC) == 0) {
2819                         /*
2820                          * Flush pages if writing is allowed, invalidate them
2821                          * if invalidation requested.  Pages undergoing I/O
2822                          * will be ignored by vm_object_page_remove().
2823                          *
2824                          * We cannot lock the vnode and then wait for paging
2825                          * to complete without deadlocking against vm_fault.
2826                          * Instead we simply call vm_object_page_remove() and
2827                          * allow it to block internally on a page-by-page
2828                          * basis when it encounters pages undergoing async
2829                          * I/O.
2830                          */
2831                         int flags;
2832
2833                         /* no chain wait needed for vnode objects */
2834                         vm_object_reference_locked(object);
2835                         vn_lock(object->handle, LK_EXCLUSIVE | LK_RETRY);
2836                         flags = (syncio || invalidate) ? OBJPC_SYNC : 0;
2837                         flags |= invalidate ? OBJPC_INVAL : 0;
2838
2839                         /*
2840                          * When operating on a virtual page table just
2841                          * flush the whole object.  XXX we probably ought
2842                          * to
2843                          */
2844                         switch(current->maptype) {
2845                         case VM_MAPTYPE_NORMAL:
2846                                 vm_object_page_clean(object,
2847                                     OFF_TO_IDX(offset),
2848                                     OFF_TO_IDX(offset + size + PAGE_MASK),
2849                                     flags);
2850                                 break;
2851                         case VM_MAPTYPE_VPAGETABLE:
2852                                 vm_object_page_clean(object, 0, 0, flags);
2853                                 break;
2854                         }
2855                         vn_unlock(((struct vnode *)object->handle));
2856                         vm_object_deallocate_locked(object);
2857                 }
2858                 if (object && invalidate &&
2859                    ((object->type == OBJT_VNODE) ||
2860                     (object->type == OBJT_DEVICE) ||
2861                     (object->type == OBJT_MGTDEVICE))) {
2862                         int clean_only =
2863                                 ((object->type == OBJT_DEVICE) ||
2864                                 (object->type == OBJT_MGTDEVICE)) ? FALSE : TRUE;
2865                         /* no chain wait needed for vnode/device objects */
2866                         vm_object_reference_locked(object);
2867                         switch(current->maptype) {
2868                         case VM_MAPTYPE_NORMAL:
2869                                 vm_object_page_remove(object,
2870                                     OFF_TO_IDX(offset),
2871                                     OFF_TO_IDX(offset + size + PAGE_MASK),
2872                                     clean_only);
2873                                 break;
2874                         case VM_MAPTYPE_VPAGETABLE:
2875                                 vm_object_page_remove(object, 0, 0, clean_only);
2876                                 break;
2877                         }
2878                         vm_object_deallocate_locked(object);
2879                 }
2880                 start += size;
2881                 if (object)
2882                         vm_object_drop(object);
2883         }
2884
2885         lwkt_reltoken(&map->token);
2886         vm_map_unlock_read(map);
2887
2888         return (KERN_SUCCESS);
2889 }
2890
2891 /*
2892  * Make the region specified by this entry pageable.
2893  *
2894  * The vm_map must be exclusively locked.
2895  */
2896 static void
2897 vm_map_entry_unwire(vm_map_t map, vm_map_entry_t entry)
2898 {
2899         entry->eflags &= ~MAP_ENTRY_USER_WIRED;
2900         entry->wired_count = 0;
2901         vm_fault_unwire(map, entry);
2902 }
2903
2904 /*
2905  * Deallocate the given entry from the target map.
2906  *
2907  * The vm_map must be exclusively locked.
2908  */
2909 static void
2910 vm_map_entry_delete(vm_map_t map, vm_map_entry_t entry, int *countp)
2911 {
2912         vm_map_entry_unlink(map, entry);
2913         map->size -= entry->end - entry->start;
2914
2915         switch(entry->maptype) {
2916         case VM_MAPTYPE_NORMAL:
2917         case VM_MAPTYPE_VPAGETABLE:
2918         case VM_MAPTYPE_SUBMAP:
2919                 vm_object_deallocate(entry->object.vm_object);
2920                 break;
2921         case VM_MAPTYPE_UKSMAP:
2922                 /* XXX TODO */
2923                 break;
2924         default:
2925                 break;
2926         }
2927
2928         vm_map_entry_dispose(map, entry, countp);
2929 }
2930
2931 /*
2932  * Deallocates the given address range from the target map.
2933  *
2934  * The vm_map must be exclusively locked.
2935  */
2936 int
2937 vm_map_delete(vm_map_t map, vm_offset_t start, vm_offset_t end, int *countp)
2938 {
2939         vm_object_t object;
2940         vm_map_entry_t entry;
2941         vm_map_entry_t first_entry;
2942         vm_offset_t hole_start;
2943
2944         ASSERT_VM_MAP_LOCKED(map);
2945         lwkt_gettoken(&map->token);
2946 again:
2947         /*
2948          * Find the start of the region, and clip it.  Set entry to point
2949          * at the first record containing the requested address or, if no
2950          * such record exists, the next record with a greater address.  The
2951          * loop will run from this point until a record beyond the termination
2952          * address is encountered.
2953          *
2954          * Adjust freehint[] for either the clip case or the extension case.
2955          *
2956          * GGG see other GGG comment.
2957          */
2958         if (vm_map_lookup_entry(map, start, &first_entry)) {
2959                 entry = first_entry;
2960                 vm_map_clip_start(map, entry, start, countp);
2961                 hole_start = start;
2962         } else {
2963                 entry = first_entry->next;
2964                 if (entry == &map->header)
2965                         hole_start = first_entry->start;
2966                 else
2967                         hole_start = first_entry->end;
2968         }
2969
2970         /*
2971          * Step through all entries in this region
2972          */
2973         while ((entry != &map->header) && (entry->start < end)) {
2974                 vm_map_entry_t next;
2975                 vm_offset_t s, e;
2976                 vm_pindex_t offidxstart, offidxend, count;
2977
2978                 /*
2979                  * If we hit an in-transition entry we have to sleep and
2980                  * retry.  It's easier (and not really slower) to just retry
2981                  * since this case occurs so rarely and the hint is already
2982                  * pointing at the right place.  We have to reset the
2983                  * start offset so as not to accidently delete an entry
2984                  * another process just created in vacated space.
2985                  */
2986                 if (entry->eflags & MAP_ENTRY_IN_TRANSITION) {
2987                         entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
2988                         start = entry->start;
2989                         ++mycpu->gd_cnt.v_intrans_coll;
2990                         ++mycpu->gd_cnt.v_intrans_wait;
2991                         vm_map_transition_wait(map, 1);
2992                         goto again;
2993                 }
2994                 vm_map_clip_end(map, entry, end, countp);
2995
2996                 s = entry->start;
2997                 e = entry->end;
2998                 next = entry->next;
2999
3000                 offidxstart = OFF_TO_IDX(entry->offset);
3001                 count = OFF_TO_IDX(e - s);
3002
3003                 switch(entry->maptype) {
3004                 case VM_MAPTYPE_NORMAL:
3005                 case VM_MAPTYPE_VPAGETABLE:
3006                 case VM_MAPTYPE_SUBMAP:
3007                         object = entry->object.vm_object;
3008                         break;
3009                 default:
3010                         object = NULL;
3011                         break;
3012                 }
3013
3014                 /*
3015                  * Unwire before removing addresses from the pmap; otherwise,
3016                  * unwiring will put the entries back in the pmap.
3017                  *
3018                  * Generally speaking, doing a bulk pmap_remove() before
3019                  * removing the pages from the VM object is better at
3020                  * reducing unnecessary IPIs.  The pmap code is now optimized
3021                  * to not blindly iterate the range when pt and pd pages
3022                  * are missing.
3023                  */
3024                 if (entry->wired_count != 0)
3025                         vm_map_entry_unwire(map, entry);
3026
3027                 offidxend = offidxstart + count;
3028
3029                 if (object == &kernel_object) {
3030                         pmap_remove(map->pmap, s, e);
3031                         vm_object_hold(object);
3032                         vm_object_page_remove(object, offidxstart,
3033                                               offidxend, FALSE);
3034                         vm_object_drop(object);
3035                 } else if (object && object->type != OBJT_DEFAULT &&
3036                            object->type != OBJT_SWAP) {
3037                         /*
3038                          * vnode object routines cannot be chain-locked,
3039                          * but since we aren't removing pages from the
3040                          * object here we can use a shared hold.
3041                          */
3042                         vm_object_hold_shared(object);
3043                         pmap_remove(map->pmap, s, e);
3044                         vm_object_drop(object);
3045                 } else if (object) {
3046                         vm_object_hold(object);
3047                         vm_object_chain_acquire(object, 0);
3048                         pmap_remove(map->pmap, s, e);
3049
3050                         if (object != NULL &&
3051                             object->ref_count != 1 &&
3052                             (object->flags & (OBJ_NOSPLIT|OBJ_ONEMAPPING)) ==
3053                              OBJ_ONEMAPPING &&
3054                             (object->type == OBJT_DEFAULT ||
3055                              object->type == OBJT_SWAP)) {
3056                                 /*
3057                                  * When ONEMAPPING is set we can destroy the
3058                                  * pages underlying the entry's range.
3059                                  */
3060                                 vm_object_collapse(object, NULL);
3061                                 vm_object_page_remove(object, offidxstart,
3062                                                       offidxend, FALSE);
3063                                 if (object->type == OBJT_SWAP) {
3064                                         swap_pager_freespace(object,
3065                                                              offidxstart,
3066                                                              count);
3067                                 }
3068                                 if (offidxend >= object->size &&
3069                                     offidxstart < object->size) {
3070                                         object->size = offidxstart;
3071                                 }
3072                         }
3073                         vm_object_chain_release(object);
3074                         vm_object_drop(object);
3075                 } else if (entry->maptype == VM_MAPTYPE_UKSMAP) {
3076                         pmap_remove(map->pmap, s, e);
3077                 }
3078
3079                 /*
3080                  * Delete the entry (which may delete the object) only after
3081                  * removing all pmap entries pointing to its pages.
3082                  * (Otherwise, its page frames may be reallocated, and any
3083                  * modify bits will be set in the wrong object!)
3084                  */
3085                 vm_map_entry_delete(map, entry, countp);
3086                 entry = next;
3087         }
3088         if (entry == &map->header)
3089                 vm_map_freehint_hole(map, hole_start, entry->end - hole_start);
3090         else
3091                 vm_map_freehint_hole(map, hole_start,
3092                                      entry->start - hole_start);
3093
3094         lwkt_reltoken(&map->token);
3095
3096         return (KERN_SUCCESS);
3097 }
3098
3099 /*
3100  * Remove the given address range from the target map.
3101  * This is the exported form of vm_map_delete.
3102  *
3103  * No requirements.
3104  */
3105 int
3106 vm_map_remove(vm_map_t map, vm_offset_t start, vm_offset_t end)
3107 {
3108         int result;
3109         int count;
3110
3111         count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
3112         vm_map_lock(map);
3113         VM_MAP_RANGE_CHECK(map, start, end);
3114         result = vm_map_delete(map, start, end, &count);
3115         vm_map_unlock(map);
3116         vm_map_entry_release(count);
3117
3118         return (result);
3119 }
3120
3121 /*
3122  * Assert that the target map allows the specified privilege on the
3123  * entire address region given.  The entire region must be allocated.
3124  *
3125  * The caller must specify whether the vm_map is already locked or not.
3126  */
3127 boolean_t
3128 vm_map_check_protection(vm_map_t map, vm_offset_t start, vm_offset_t end,
3129                         vm_prot_t protection, boolean_t have_lock)
3130 {
3131         vm_map_entry_t entry;
3132         vm_map_entry_t tmp_entry;
3133         boolean_t result;
3134
3135         if (have_lock == FALSE)
3136                 vm_map_lock_read(map);
3137
3138         if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
3139                 if (have_lock == FALSE)
3140                         vm_map_unlock_read(map);
3141                 return (FALSE);
3142         }
3143         entry = tmp_entry;
3144
3145         result = TRUE;
3146         while (start < end) {
3147                 if (entry == &map->header) {
3148                         result = FALSE;
3149                         break;
3150                 }
3151                 /*
3152                  * No holes allowed!
3153                  */
3154
3155                 if (start < entry->start) {
3156                         result = FALSE;
3157                         break;
3158                 }
3159                 /*
3160                  * Check protection associated with entry.
3161                  */
3162
3163                 if ((entry->protection & protection) != protection) {
3164                         result = FALSE;
3165                         break;
3166                 }
3167                 /* go to next entry */
3168
3169                 start = entry->end;
3170                 entry = entry->next;
3171         }
3172         if (have_lock == FALSE)
3173                 vm_map_unlock_read(map);
3174         return (result);
3175 }
3176
3177 /*
3178  * If appropriate this function shadows the original object with a new object
3179  * and moves the VM pages from the original object to the new object.
3180  * The original object will also be collapsed, if possible.
3181  *
3182  * Caller must supply entry->object.vm_object held and chain_acquired, and
3183  * should chain_release and drop the object upon return.
3184  *
3185  * We can only do this for normal memory objects with a single mapping, and
3186  * it only makes sense to do it if there are 2 or more refs on the original
3187  * object.  i.e. typically a memory object that has been extended into
3188  * multiple vm_map_entry's with non-overlapping ranges.
3189  *
3190  * This makes it easier to remove unused pages and keeps object inheritance
3191  * from being a negative impact on memory usage.
3192  *
3193  * On return the (possibly new) entry->object.vm_object will have an
3194  * additional ref on it for the caller to dispose of (usually by cloning
3195  * the vm_map_entry).  The additional ref had to be done in this routine
3196  * to avoid racing a collapse.  The object's ONEMAPPING flag will also be
3197  * cleared.
3198  *
3199  * The vm_map must be locked and its token held.
3200  */
3201 static void
3202 vm_map_split(vm_map_entry_t entry, vm_object_t oobject)
3203 {
3204         /* OPTIMIZED */
3205         vm_object_t nobject, bobject;
3206         vm_offset_t s, e;
3207         vm_page_t m;
3208         vm_pindex_t offidxstart, offidxend, idx;
3209         vm_size_t size;
3210         vm_ooffset_t offset;
3211         int useshadowlist;
3212
3213         /*
3214          * Optimize away object locks for vnode objects.  Important exit/exec
3215          * critical path.
3216          *
3217          * OBJ_ONEMAPPING doesn't apply to vnode objects but clear the flag
3218          * anyway.
3219          */
3220         if (oobject->type != OBJT_DEFAULT && oobject->type != OBJT_SWAP) {
3221                 vm_object_reference_quick(oobject);
3222                 vm_object_clear_flag(oobject, OBJ_ONEMAPPING);
3223                 return;
3224         }
3225
3226 #if 0
3227         /*
3228          * Original object cannot be split?
3229          */
3230         if (oobject->handle == NULL) {
3231                 vm_object_reference_locked_chain_held(oobject);
3232                 vm_object_clear_flag(oobject, OBJ_ONEMAPPING);
3233                 return;
3234         }
3235 #endif
3236
3237         /*
3238          * Collapse original object with its backing store as an
3239          * optimization to reduce chain lengths when possible.
3240          *
3241          * If ref_count <= 1 there aren't other non-overlapping vm_map_entry's
3242          * for oobject, so there's no point collapsing it.
3243          *
3244          * Then re-check whether the object can be split.
3245          */
3246         vm_object_collapse(oobject, NULL);
3247
3248         if (oobject->ref_count <= 1 ||
3249             (oobject->type != OBJT_DEFAULT && oobject->type != OBJT_SWAP) ||
3250             (oobject->flags & (OBJ_NOSPLIT|OBJ_ONEMAPPING)) != OBJ_ONEMAPPING) {
3251                 vm_object_reference_locked_chain_held(oobject);
3252                 vm_object_clear_flag(oobject, OBJ_ONEMAPPING);
3253                 return;
3254         }
3255
3256         /*
3257          * Acquire the chain lock on the backing object.
3258          *
3259          * Give bobject an additional ref count for when it will be shadowed
3260          * by nobject.
3261          */
3262         useshadowlist = 0;
3263         if ((bobject = oobject->backing_object) != NULL) {
3264                 if (bobject->type != OBJT_VNODE) {
3265                         useshadowlist = 1;
3266                         vm_object_hold(bobject);
3267                         vm_object_chain_wait(bobject, 0);
3268                         /* ref for shadowing below */
3269                         vm_object_reference_locked(bobject);
3270                         vm_object_chain_acquire(bobject, 0);
3271                         KKASSERT(oobject->backing_object == bobject);
3272                         KKASSERT((bobject->flags & OBJ_DEAD) == 0);
3273                 } else {
3274                         /*
3275                          * vnodes are not placed on the shadow list but
3276                          * they still get another ref for the backing_object
3277                          * reference.
3278                          */
3279                         vm_object_reference_quick(bobject);
3280                 }
3281         }
3282
3283         /*
3284          * Calculate the object page range and allocate the new object.
3285          */
3286         offset = entry->offset;
3287         s = entry->start;
3288         e = entry->end;
3289
3290         offidxstart = OFF_TO_IDX(offset);
3291         offidxend = offidxstart + OFF_TO_IDX(e - s);
3292         size = offidxend - offidxstart;
3293
3294         switch(oobject->type) {
3295         case OBJT_DEFAULT:
3296                 nobject = default_pager_alloc(NULL, IDX_TO_OFF(size),
3297                                               VM_PROT_ALL, 0);
3298                 break;
3299         case OBJT_SWAP:
3300                 nobject = swap_pager_alloc(NULL, IDX_TO_OFF(size),
3301                                            VM_PROT_ALL, 0);
3302                 break;
3303         default:
3304                 /* not reached */
3305                 nobject = NULL;
3306                 KKASSERT(0);
3307         }
3308
3309         /*
3310          * If we could not allocate nobject just clear ONEMAPPING on
3311          * oobject and return.
3312          */
3313         if (nobject == NULL) {
3314                 if (bobject) {
3315                         if (useshadowlist) {
3316                                 vm_object_chain_release(bobject);
3317                                 vm_object_deallocate(bobject);
3318                                 vm_object_drop(bobject);
3319                         } else {
3320                                 vm_object_deallocate(bobject);
3321                         }
3322                 }
3323                 vm_object_reference_locked_chain_held(oobject);
3324                 vm_object_clear_flag(oobject, OBJ_ONEMAPPING);
3325                 return;
3326         }
3327
3328         /*
3329          * The new object will replace entry->object.vm_object so it needs
3330          * a second reference (the caller expects an additional ref).
3331          */
3332         vm_object_hold(nobject);
3333         vm_object_reference_locked(nobject);
3334         vm_object_chain_acquire(nobject, 0);
3335
3336         /*
3337          * nobject shadows bobject (oobject already shadows bobject).
3338          *
3339          * Adding an object to bobject's shadow list requires refing bobject
3340          * which we did above in the useshadowlist case.
3341          *
3342          * XXX it is unclear if we need to clear ONEMAPPING on bobject here
3343          *     or not.
3344          */
3345         if (bobject) {
3346                 nobject->backing_object_offset =
3347                     oobject->backing_object_offset + IDX_TO_OFF(offidxstart);
3348                 nobject->backing_object = bobject;
3349                 if (useshadowlist) {
3350                         bobject->shadow_count++;
3351                         atomic_add_int(&bobject->generation, 1);
3352                         LIST_INSERT_HEAD(&bobject->shadow_head,
3353                                          nobject, shadow_list);
3354                         vm_object_clear_flag(bobject, OBJ_ONEMAPPING); /*XXX*/
3355                         vm_object_set_flag(nobject, OBJ_ONSHADOW);
3356                 }
3357         }
3358
3359         /*
3360          * Move the VM pages from oobject to nobject
3361          */
3362         for (idx = 0; idx < size; idx++) {
3363                 vm_page_t m;
3364
3365                 m = vm_page_lookup_busy_wait(oobject, offidxstart + idx,
3366                                              TRUE, "vmpg");
3367                 if (m == NULL)
3368                         continue;
3369
3370                 /*
3371                  * We must wait for pending I/O to complete before we can
3372                  * rename the page.
3373                  *
3374                  * We do not have to VM_PROT_NONE the page as mappings should
3375                  * not be changed by this operation.
3376                  *
3377                  * NOTE: The act of renaming a page updates chaingen for both
3378                  *       objects.
3379                  */
3380                 vm_page_rename(m, nobject, idx);
3381                 /* page automatically made dirty by rename and cache handled */
3382                 /* page remains busy */
3383         }
3384
3385         if (oobject->type == OBJT_SWAP) {
3386                 vm_object_pip_add(oobject, 1);
3387                 /*
3388                  * copy oobject pages into nobject and destroy unneeded
3389                  * pages in shadow object.
3390                  */
3391                 swap_pager_copy(oobject, nobject, offidxstart, 0);
3392                 vm_object_pip_wakeup(oobject);
3393         }
3394
3395         /*
3396          * Wakeup the pages we played with.  No spl protection is needed
3397          * for a simple wakeup.
3398          */
3399         for (idx = 0; idx < size; idx++) {
3400                 m = vm_page_lookup(nobject, idx);
3401                 if (m) {
3402                         KKASSERT(m->busy_count & PBUSY_LOCKED);
3403                         vm_page_wakeup(m);
3404                 }
3405         }
3406         entry->object.vm_object = nobject;
3407         entry->offset = 0LL;
3408
3409         /*
3410          * The map is being split and nobject is going to wind up on both
3411          * vm_map_entry's, so make sure OBJ_ONEMAPPING is cleared on
3412          * nobject.
3413          */
3414         vm_object_clear_flag(nobject, OBJ_ONEMAPPING);
3415
3416         /*
3417          * Cleanup
3418          *
3419          * NOTE: There is no need to remove OBJ_ONEMAPPING from oobject, the
3420          *       related pages were moved and are no longer applicable to the
3421          *       original object.
3422          *
3423          * NOTE: Deallocate oobject (due to its entry->object.vm_object being
3424          *       replaced by nobject).
3425          */
3426         vm_object_chain_release(nobject);
3427         vm_object_drop(nobject);
3428         if (bobject && useshadowlist) {
3429                 vm_object_chain_release(bobject);
3430                 vm_object_drop(bobject);
3431         }
3432
3433 #if 0
3434         if (oobject->resident_page_count) {
3435                 kprintf("oobject %p still contains %jd pages!\n",
3436                         oobject, (intmax_t)oobject->resident_page_count);
3437                 for (idx = 0; idx < size; idx++) {
3438                         vm_page_t m;
3439
3440                         m = vm_page_lookup_busy_wait(oobject, offidxstart + idx,
3441                                                      TRUE, "vmpg");
3442                         if (m) {
3443                                 kprintf("oobject %p idx %jd\n",
3444                                         oobject,
3445                                         offidxstart + idx);
3446                                 vm_page_wakeup(m);
3447                         }
3448                 }
3449         }
3450 #endif
3451         /*vm_object_clear_flag(oobject, OBJ_ONEMAPPING);*/
3452         vm_object_deallocate_locked(oobject);
3453 }
3454
3455 /*
3456  * Copies the contents of the source entry to the destination
3457  * entry.  The entries *must* be aligned properly.
3458  *
3459  * The vm_maps must be exclusively locked.
3460  * The vm_map's token must be held.
3461  *
3462  * Because the maps are locked no faults can be in progress during the
3463  * operation.
3464  */
3465 static void
3466 vm_map_copy_entry(vm_map_t src_map, vm_map_t dst_map,
3467                   vm_map_entry_t src_entry, vm_map_entry_t dst_entry)
3468 {
3469         vm_object_t src_object;
3470         vm_object_t oobject;
3471
3472         if (dst_entry->maptype == VM_MAPTYPE_SUBMAP ||
3473             dst_entry->maptype == VM_MAPTYPE_UKSMAP)
3474                 return;
3475         if (src_entry->maptype == VM_MAPTYPE_SUBMAP ||
3476             src_entry->maptype == VM_MAPTYPE_UKSMAP)
3477                 return;
3478
3479         if (src_entry->wired_count == 0) {
3480                 /*
3481                  * If the source entry is marked needs_copy, it is already
3482                  * write-protected.
3483                  *
3484                  * To avoid interacting with a vm_fault that might have
3485                  * released its vm_map, we must acquire the fronting
3486                  * object.
3487                  */
3488                 oobject = src_entry->object.vm_object;
3489                 if (oobject) {
3490                         vm_object_hold(oobject);
3491                         vm_object_chain_acquire(oobject, 0);
3492                 }
3493
3494                 if ((src_entry->eflags & MAP_ENTRY_NEEDS_COPY) == 0) {
3495                         pmap_protect(src_map->pmap,
3496                             src_entry->start,
3497                             src_entry->end,
3498                             src_entry->protection & ~VM_PROT_WRITE);
3499                 }
3500
3501                 /*
3502                  * Make a copy of the object.
3503                  *
3504                  * The object must be locked prior to checking the object type
3505                  * and for the call to vm_object_collapse() and vm_map_split().
3506                  * We cannot use *_hold() here because the split code will
3507                  * probably try to destroy the object.  The lock is a pool
3508                  * token and doesn't care.
3509                  *
3510                  * We must bump src_map->timestamp when setting
3511                  * MAP_ENTRY_NEEDS_COPY to force any concurrent fault
3512                  * to retry, otherwise the concurrent fault might improperly
3513                  * install a RW pte when its supposed to be a RO(COW) pte.
3514                  * This race can occur because a vnode-backed fault may have
3515                  * to temporarily release the map lock.  This was handled
3516                  * when the caller locked the map exclusively.
3517                  */
3518                 if (oobject) {
3519                         vm_map_split(src_entry, oobject);
3520
3521                         src_object = src_entry->object.vm_object;
3522                         dst_entry->object.vm_object = src_object;
3523                         src_entry->eflags |= (MAP_ENTRY_COW |
3524                                               MAP_ENTRY_NEEDS_COPY);
3525                         dst_entry->eflags |= (MAP_ENTRY_COW |
3526                                               MAP_ENTRY_NEEDS_COPY);
3527                         dst_entry->offset = src_entry->offset;
3528                 } else {
3529                         dst_entry->object.vm_object = NULL;
3530                         dst_entry->offset = 0;
3531                 }
3532                 pmap_copy(dst_map->pmap, src_map->pmap, dst_entry->start,
3533                           dst_entry->end - dst_entry->start,
3534                           src_entry->start);
3535                 if (oobject) {
3536                         vm_object_chain_release(oobject);
3537                         vm_object_drop(oobject);
3538                 }
3539         } else {
3540                 /*
3541                  * Of course, wired down pages can't be set copy-on-write.
3542                  * Cause wired pages to be copied into the new map by
3543                  * simulating faults (the new pages are pageable)
3544                  */
3545                 vm_fault_copy_entry(dst_map, src_map, dst_entry, src_entry);
3546         }
3547 }
3548
3549 /*
3550  * vmspace_fork:
3551  * Create a new process vmspace structure and vm_map
3552  * based on those of an existing process.  The new map
3553  * is based on the old map, according to the inheritance
3554  * values on the regions in that map.
3555  *
3556  * The source map must not be locked.
3557  * No requirements.
3558  */
3559 static void vmspace_fork_normal_entry(vm_map_t old_map, vm_map_t new_map,
3560                           vm_map_entry_t old_entry, int *countp);
3561 static void vmspace_fork_uksmap_entry(vm_map_t old_map, vm_map_t new_map,
3562                           vm_map_entry_t old_entry, int *countp);
3563
3564 struct vmspace *
3565 vmspace_fork(struct vmspace *vm1)
3566 {
3567         struct vmspace *vm2;
3568         vm_map_t old_map = &vm1->vm_map;
3569         vm_map_t new_map;
3570         vm_map_entry_t old_entry;
3571         int count;
3572
3573         lwkt_gettoken(&vm1->vm_map.token);
3574         vm_map_lock(old_map);
3575
3576         vm2 = vmspace_alloc(old_map->min_offset, old_map->max_offset);
3577         lwkt_gettoken(&vm2->vm_map.token);
3578
3579         /*
3580          * We must bump the timestamp to force any concurrent fault
3581          * to retry.
3582          */
3583         bcopy(&vm1->vm_startcopy, &vm2->vm_startcopy,
3584               (caddr_t)&vm1->vm_endcopy - (caddr_t)&vm1->vm_startcopy);
3585         new_map = &vm2->vm_map; /* XXX */
3586         new_map->timestamp = 1;
3587
3588         vm_map_lock(new_map);
3589
3590         count = 0;
3591         old_entry = old_map->header.next;
3592         while (old_entry != &old_map->header) {
3593                 ++count;
3594                 old_entry = old_entry->next;
3595         }
3596
3597         count = vm_map_entry_reserve(count + MAP_RESERVE_COUNT);
3598
3599         old_entry = old_map->header.next;
3600         while (old_entry != &old_map->header) {
3601                 switch(old_entry->maptype) {
3602                 case VM_MAPTYPE_SUBMAP:
3603                         panic("vm_map_fork: encountered a submap");
3604                         break;
3605                 case VM_MAPTYPE_UKSMAP:
3606                         vmspace_fork_uksmap_entry(old_map, new_map,
3607                                                   old_entry, &count);
3608                         break;
3609                 case VM_MAPTYPE_NORMAL:
3610                 case VM_MAPTYPE_VPAGETABLE:
3611                         vmspace_fork_normal_entry(old_map, new_map,
3612                                                   old_entry, &count);
3613                         break;
3614                 }
3615                 old_entry = old_entry->next;
3616         }
3617
3618         new_map->size = old_map->size;
3619         vm_map_unlock(old_map);
3620         vm_map_unlock(new_map);
3621         vm_map_entry_release(count);
3622
3623         lwkt_reltoken(&vm2->vm_map.token);
3624         lwkt_reltoken(&vm1->vm_map.token);
3625
3626         return (vm2);
3627 }
3628
3629 static
3630 void
3631 vmspace_fork_normal_entry(vm_map_t old_map, vm_map_t new_map,
3632                           vm_map_entry_t old_entry, int *countp)
3633 {
3634         vm_map_entry_t new_entry;
3635         vm_object_t object;
3636
3637         switch (old_entry->inheritance) {
3638         case VM_INHERIT_NONE:
3639                 break;
3640         case VM_INHERIT_SHARE:
3641                 /*
3642                  * Clone the entry, creating the shared object if
3643                  * necessary.
3644                  */
3645                 if (old_entry->object.vm_object == NULL)
3646                         vm_map_entry_allocate_object(old_entry);
3647
3648                 if (old_entry->eflags & MAP_ENTRY_NEEDS_COPY) {
3649                         /*
3650                          * Shadow a map_entry which needs a copy,
3651                          * replacing its object with a new object
3652                          * that points to the old one.  Ask the
3653                          * shadow code to automatically add an
3654                          * additional ref.  We can't do it afterwords
3655                          * because we might race a collapse.  The call
3656                          * to vm_map_entry_shadow() will also clear
3657                          * OBJ_ONEMAPPING.
3658                          */
3659                         vm_map_entry_shadow(old_entry, 1);
3660                 } else if (old_entry->object.vm_object) {
3661                         /*
3662                          * We will make a shared copy of the object,
3663                          * and must clear OBJ_ONEMAPPING.
3664                          *
3665                          * Optimize vnode objects.  OBJ_ONEMAPPING
3666                          * is non-applicable but clear it anyway,
3667                          * and its terminal so we don't have to deal
3668                          * with chains.  Reduces SMP conflicts.
3669                          *
3670                          * XXX assert that object.vm_object != NULL
3671                          *     since we allocate it above.
3672                          */
3673                         object = old_entry->object.vm_object;
3674                         if (object->type == OBJT_VNODE) {
3675                                 vm_object_reference_quick(object);
3676                                 vm_object_clear_flag(object,
3677                                                      OBJ_ONEMAPPING);
3678                         } else {
3679                                 vm_object_hold(object);
3680                                 vm_object_chain_wait(object, 0);
3681                                 vm_object_reference_locked(object);
3682                                 vm_object_clear_flag(object,
3683                                                      OBJ_ONEMAPPING);
3684                                 vm_object_drop(object);
3685                         }
3686                 }
3687
3688                 /*
3689                  * Clone the entry.  We've already bumped the ref on
3690                  * any vm_object.
3691                  */
3692                 new_entry = vm_map_entry_create(new_map, countp);
3693                 *new_entry = *old_entry;
3694                 new_entry->eflags &= ~MAP_ENTRY_USER_WIRED;
3695                 new_entry->wired_count = 0;
3696
3697                 /*
3698                  * Insert the entry into the new map -- we know we're
3699                  * inserting at the end of the new map.
3700                  */
3701                 vm_map_entry_link(new_map, new_map->header.prev,
3702                                   new_entry);
3703
3704                 /*
3705                  * Update the physical map
3706                  */
3707                 pmap_copy(new_map->pmap, old_map->pmap,
3708                           new_entry->start,
3709                           (old_entry->end - old_entry->start),
3710                           old_entry->start);
3711                 break;
3712         case VM_INHERIT_COPY:
3713                 /*
3714                  * Clone the entry and link into the map.
3715                  */
3716                 new_entry = vm_map_entry_create(new_map, countp);
3717                 *new_entry = *old_entry;
3718                 new_entry->eflags &= ~MAP_ENTRY_USER_WIRED;
3719                 new_entry->wired_count = 0;
3720                 new_entry->object.vm_object = NULL;
3721                 vm_map_entry_link(new_map, new_map->header.prev,
3722                                   new_entry);
3723                 vm_map_copy_entry(old_map, new_map, old_entry,
3724                                   new_entry);
3725                 break;
3726         }
3727 }
3728
3729 /*
3730  * When forking user-kernel shared maps, the map might change in the
3731  * child so do not try to copy the underlying pmap entries.
3732  */
3733 static
3734 void
3735 vmspace_fork_uksmap_entry(vm_map_t old_map, vm_map_t new_map,
3736                           vm_map_entry_t old_entry, int *countp)
3737 {
3738         vm_map_entry_t new_entry;
3739
3740         new_entry = vm_map_entry_create(new_map, countp);
3741         *new_entry = *old_entry;
3742         new_entry->eflags &= ~MAP_ENTRY_USER_WIRED;
3743         new_entry->wired_count = 0;
3744         vm_map_entry_link(new_map, new_map->header.prev,
3745                           new_entry);
3746 }
3747
3748 /*
3749  * Create an auto-grow stack entry
3750  *
3751  * No requirements.
3752  */
3753 int
3754 vm_map_stack (vm_map_t map, vm_offset_t addrbos, vm_size_t max_ssize,
3755               int flags, vm_prot_t prot, vm_prot_t max, int cow)
3756 {
3757         vm_map_entry_t  prev_entry;
3758         vm_map_entry_t  new_stack_entry;
3759         vm_size_t       init_ssize;
3760         int             rv;
3761         int             count;
3762         vm_offset_t     tmpaddr;
3763
3764         cow |= MAP_IS_STACK;
3765
3766         if (max_ssize < sgrowsiz)
3767                 init_ssize = max_ssize;
3768         else
3769                 init_ssize = sgrowsiz;
3770
3771         count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
3772         vm_map_lock(map);
3773
3774         /*
3775          * Find space for the mapping
3776          */
3777         if ((flags & (MAP_FIXED | MAP_TRYFIXED)) == 0) {
3778                 if (vm_map_findspace(map, addrbos, max_ssize, 1,
3779                                      flags, &tmpaddr)) {
3780                         vm_map_unlock(map);
3781                         vm_map_entry_release(count);
3782                         return (KERN_NO_SPACE);
3783                 }
3784                 addrbos = tmpaddr;
3785         }
3786
3787         /* If addr is already mapped, no go */
3788         if (vm_map_lookup_entry(map, addrbos, &prev_entry)) {
3789                 vm_map_unlock(map);
3790                 vm_map_entry_release(count);
3791                 return (KERN_NO_SPACE);
3792         }
3793
3794 #if 0
3795         /* XXX already handled by kern_mmap() */
3796         /* If we would blow our VMEM resource limit, no go */
3797         if (map->size + init_ssize >
3798             curproc->p_rlimit[RLIMIT_VMEM].rlim_cur) {
3799                 vm_map_unlock(map);
3800                 vm_map_entry_release(count);
3801                 return (KERN_NO_SPACE);
3802         }
3803 #endif
3804
3805         /*
3806          * If we can't accomodate max_ssize in the current mapping,
3807          * no go.  However, we need to be aware that subsequent user
3808          * mappings might map into the space we have reserved for
3809          * stack, and currently this space is not protected.
3810          *
3811          * Hopefully we will at least detect this condition
3812          * when we try to grow the stack.
3813          */
3814         if ((prev_entry->next != &map->header) &&
3815             (prev_entry->next->start < addrbos + max_ssize)) {
3816                 vm_map_unlock(map);
3817                 vm_map_entry_release(count);
3818                 return (KERN_NO_SPACE);
3819         }
3820
3821         /*
3822          * We initially map a stack of only init_ssize.  We will
3823          * grow as needed later.  Since this is to be a grow
3824          * down stack, we map at the top of the range.
3825          *
3826          * Note: we would normally expect prot and max to be
3827          * VM_PROT_ALL, and cow to be 0.  Possibly we should
3828          * eliminate these as input parameters, and just
3829          * pass these values here in the insert call.
3830          */
3831         rv = vm_map_insert(map, &count, NULL, NULL,
3832                            0, addrbos + max_ssize - init_ssize,
3833                            addrbos + max_ssize,
3834                            VM_MAPTYPE_NORMAL,
3835                            VM_SUBSYS_STACK, prot, max, cow);
3836
3837         /* Now set the avail_ssize amount */
3838         if (rv == KERN_SUCCESS) {
3839                 if (prev_entry != &map->header)
3840                         vm_map_clip_end(map, prev_entry, addrbos + max_ssize - init_ssize, &count);
3841                 new_stack_entry = prev_entry->next;
3842                 if (new_stack_entry->end   != addrbos + max_ssize ||
3843                     new_stack_entry->start != addrbos + max_ssize - init_ssize)
3844                         panic ("Bad entry start/end for new stack entry");
3845                 else
3846                         new_stack_entry->aux.avail_ssize = max_ssize - init_ssize;
3847         }
3848
3849         vm_map_unlock(map);
3850         vm_map_entry_release(count);
3851         return (rv);
3852 }
3853
3854 /*
3855  * Attempts to grow a vm stack entry.  Returns KERN_SUCCESS if the
3856  * desired address is already mapped, or if we successfully grow
3857  * the stack.  Also returns KERN_SUCCESS if addr is outside the
3858  * stack range (this is strange, but preserves compatibility with
3859  * the grow function in vm_machdep.c).
3860  *
3861  * No requirements.
3862  */
3863 int
3864 vm_map_growstack (vm_map_t map, vm_offset_t addr)
3865 {
3866         vm_map_entry_t prev_entry;
3867         vm_map_entry_t stack_entry;
3868         vm_map_entry_t new_stack_entry;
3869         struct vmspace *vm;
3870         struct lwp *lp;
3871         struct proc *p;
3872         vm_offset_t    end;
3873         int grow_amount;
3874         int rv = KERN_SUCCESS;
3875         int is_procstack;
3876         int use_read_lock = 1;
3877         int count;
3878
3879         /*
3880          * Find the vm
3881          */
3882         lp = curthread->td_lwp;
3883         p = curthread->td_proc;
3884         KKASSERT(lp != NULL);
3885         vm = lp->lwp_vmspace;
3886
3887         /*
3888          * Growstack is only allowed on the current process.  We disallow
3889          * other use cases, e.g. trying to access memory via procfs that
3890          * the stack hasn't grown into.
3891          */
3892         if (map != &vm->vm_map) {
3893                 return KERN_FAILURE;
3894         }
3895
3896         count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
3897 Retry:
3898         if (use_read_lock)
3899                 vm_map_lock_read(map);
3900         else
3901                 vm_map_lock(map);
3902
3903         /* If addr is already in the entry range, no need to grow.*/
3904         if (vm_map_lookup_entry(map, addr, &prev_entry))
3905                 goto done;
3906
3907         if ((stack_entry = prev_entry->next) == &map->header)
3908                 goto done;
3909         if (prev_entry == &map->header)
3910                 end = stack_entry->start - stack_entry->aux.avail_ssize;
3911         else
3912                 end = prev_entry->end;
3913
3914         /*
3915          * This next test mimics the old grow function in vm_machdep.c.
3916          * It really doesn't quite make sense, but we do it anyway
3917          * for compatibility.
3918          *
3919          * If not growable stack, return success.  This signals the
3920          * caller to proceed as he would normally with normal vm.
3921          */
3922         if (stack_entry->aux.avail_ssize < 1 ||
3923             addr >= stack_entry->start ||
3924             addr <  stack_entry->start - stack_entry->aux.avail_ssize) {
3925                 goto done;
3926         }
3927
3928         /* Find the minimum grow amount */
3929         grow_amount = roundup (stack_entry->start - addr, PAGE_SIZE);
3930         if (grow_amount > stack_entry->aux.avail_ssize) {
3931                 rv = KERN_NO_SPACE;
3932                 goto done;
3933         }
3934
3935         /*
3936          * If there is no longer enough space between the entries
3937          * nogo, and adjust the available space.  Note: this
3938          * should only happen if the user has mapped into the
3939          * stack area after the stack was created, and is
3940          * probably an error.
3941          *
3942          * This also effectively destroys any guard page the user
3943          * might have intended by limiting the stack size.
3944          */
3945         if (grow_amount > stack_entry->start - end) {
3946                 if (use_read_lock && vm_map_lock_upgrade(map)) {
3947                         /* lost lock */
3948                         use_read_lock = 0;
3949                         goto Retry;
3950                 }
3951                 use_read_lock = 0;
3952                 stack_entry->aux.avail_ssize = stack_entry->start - end;
3953                 rv = KERN_NO_SPACE;
3954                 goto done;
3955         }
3956
3957         is_procstack = addr >= (vm_offset_t)vm->vm_maxsaddr;
3958
3959         /* If this is the main process stack, see if we're over the
3960          * stack limit.
3961          */
3962         if (is_procstack && (ctob(vm->vm_ssize) + grow_amount >
3963                              p->p_rlimit[RLIMIT_STACK].rlim_cur)) {
3964                 rv = KERN_NO_SPACE;
3965                 goto done;
3966         }
3967
3968         /* Round up the grow amount modulo SGROWSIZ */
3969         grow_amount = roundup (grow_amount, sgrowsiz);
3970         if (grow_amount > stack_entry->aux.avail_ssize) {
3971                 grow_amount = stack_entry->aux.avail_ssize;
3972         }
3973         if (is_procstack && (ctob(vm->vm_ssize) + grow_amount >
3974                              p->p_rlimit[RLIMIT_STACK].rlim_cur)) {
3975                 grow_amount = p->p_rlimit[RLIMIT_STACK].rlim_cur -
3976                               ctob(vm->vm_ssize);
3977         }
3978
3979         /* If we would blow our VMEM resource limit, no go */
3980         if (map->size + grow_amount > p->p_rlimit[RLIMIT_VMEM].rlim_cur) {
3981                 rv = KERN_NO_SPACE;
3982                 goto done;
3983         }
3984
3985         if (use_read_lock && vm_map_lock_upgrade(map)) {
3986                 /* lost lock */
3987                 use_read_lock = 0;
3988                 goto Retry;
3989         }
3990         use_read_lock = 0;
3991
3992         /* Get the preliminary new entry start value */
3993         addr = stack_entry->start - grow_amount;
3994
3995         /* If this puts us into the previous entry, cut back our growth
3996          * to the available space.  Also, see the note above.
3997          */
3998         if (addr < end) {
3999                 stack_entry->aux.avail_ssize = stack_entry->start - end;
4000                 addr = end;
4001         }
4002
4003         rv = vm_map_insert(map, &count, NULL, NULL,
4004                            0, addr, stack_entry->start,
4005                            VM_MAPTYPE_NORMAL,
4006                            VM_SUBSYS_STACK, VM_PROT_ALL, VM_PROT_ALL, 0);
4007
4008         /* Adjust the available stack space by the amount we grew. */
4009         if (rv == KERN_SUCCESS) {
4010                 if (prev_entry != &map->header)
4011                         vm_map_clip_end(map, prev_entry, addr, &count);
4012                 new_stack_entry = prev_entry->next;
4013                 if (new_stack_entry->end   != stack_entry->start  ||
4014                     new_stack_entry->start != addr)
4015                         panic ("Bad stack grow start/end in new stack entry");
4016                 else {
4017                         new_stack_entry->aux.avail_ssize =
4018                                 stack_entry->aux.avail_ssize -
4019                                 (new_stack_entry->end - new_stack_entry->start);
4020                         if (is_procstack)
4021                                 vm->vm_ssize += btoc(new_stack_entry->end -
4022                                                      new_stack_entry->start);
4023                 }
4024
4025                 if (map->flags & MAP_WIREFUTURE)
4026                         vm_map_unwire(map, new_stack_entry->start,
4027                                       new_stack_entry->end, FALSE);
4028         }
4029
4030 done:
4031         if (use_read_lock)
4032                 vm_map_unlock_read(map);
4033         else
4034                 vm_map_unlock(map);
4035         vm_map_entry_release(count);
4036         return (rv);
4037 }
4038
4039 /*
4040  * Unshare the specified VM space for exec.  If other processes are
4041  * mapped to it, then create a new one.  The new vmspace is null.
4042  *
4043  * No requirements.
4044  */
4045 void
4046 vmspace_exec(struct proc *p, struct vmspace *vmcopy)
4047 {
4048         struct vmspace *oldvmspace = p->p_vmspace;
4049         struct vmspace *newvmspace;
4050         vm_map_t map = &p->p_vmspace->vm_map;
4051
4052         /*
4053          * If we are execing a resident vmspace we fork it, otherwise
4054          * we create a new vmspace.  Note that exitingcnt is not
4055          * copied to the new vmspace.
4056          */
4057         lwkt_gettoken(&oldvmspace->vm_map.token);
4058         if (vmcopy)  {
4059                 newvmspace = vmspace_fork(vmcopy);
4060                 lwkt_gettoken(&newvmspace->vm_map.token);
4061         } else {
4062                 newvmspace = vmspace_alloc(map->min_offset, map->max_offset);
4063                 lwkt_gettoken(&newvmspace->vm_map.token);
4064                 bcopy(&oldvmspace->vm_startcopy, &newvmspace->vm_startcopy,
4065                       (caddr_t)&oldvmspace->vm_endcopy -
4066                        (caddr_t)&oldvmspace->vm_startcopy);
4067         }
4068
4069         /*
4070          * Finish initializing the vmspace before assigning it
4071          * to the process.  The vmspace will become the current vmspace
4072          * if p == curproc.
4073          */
4074         pmap_pinit2(vmspace_pmap(newvmspace));
4075         pmap_replacevm(p, newvmspace, 0);
4076         lwkt_reltoken(&newvmspace->vm_map.token);
4077         lwkt_reltoken(&oldvmspace->vm_map.token);
4078         vmspace_rel(oldvmspace);
4079 }
4080
4081 /*
4082  * Unshare the specified VM space for forcing COW.  This
4083  * is called by rfork, for the (RFMEM|RFPROC) == 0 case.
4084  */
4085 void
4086 vmspace_unshare(struct proc *p)
4087 {
4088         struct vmspace *oldvmspace = p->p_vmspace;
4089         struct vmspace *newvmspace;
4090
4091         lwkt_gettoken(&oldvmspace->vm_map.token);
4092         if (vmspace_getrefs(oldvmspace) == 1) {
4093                 lwkt_reltoken(&oldvmspace->vm_map.token);
4094                 return;
4095         }
4096         newvmspace = vmspace_fork(oldvmspace);
4097         lwkt_gettoken(&newvmspace->vm_map.token);
4098         pmap_pinit2(vmspace_pmap(newvmspace));
4099         pmap_replacevm(p, newvmspace, 0);
4100         lwkt_reltoken(&newvmspace->vm_map.token);
4101         lwkt_reltoken(&oldvmspace->vm_map.token);
4102         vmspace_rel(oldvmspace);
4103 }
4104
4105 /*
4106  * vm_map_hint: return the beginning of the best area suitable for
4107  * creating a new mapping with "prot" protection.
4108  *
4109  * No requirements.
4110  */
4111 vm_offset_t
4112 vm_map_hint(struct proc *p, vm_offset_t addr, vm_prot_t prot)
4113 {
4114         struct vmspace *vms = p->p_vmspace;
4115
4116         if (!randomize_mmap || addr != 0) {
4117                 /*
4118                  * Set a reasonable start point for the hint if it was
4119                  * not specified or if it falls within the heap space.
4120                  * Hinted mmap()s do not allocate out of the heap space.
4121                  */
4122                 if (addr == 0 ||
4123                     (addr >= round_page((vm_offset_t)vms->vm_taddr) &&
4124                      addr < round_page((vm_offset_t)vms->vm_daddr + maxdsiz))) {
4125                         addr = round_page((vm_offset_t)vms->vm_daddr + maxdsiz);
4126                 }
4127
4128                 return addr;
4129         }
4130         addr = (vm_offset_t)vms->vm_daddr + MAXDSIZ;
4131         addr += karc4random() & (MIN((256 * 1024 * 1024), MAXDSIZ) - 1);
4132
4133         return (round_page(addr));
4134 }
4135
4136 /*
4137  * Finds the VM object, offset, and protection for a given virtual address
4138  * in the specified map, assuming a page fault of the type specified.
4139  *
4140  * Leaves the map in question locked for read; return values are guaranteed
4141  * until a vm_map_lookup_done call is performed.  Note that the map argument
4142  * is in/out; the returned map must be used in the call to vm_map_lookup_done.
4143  *
4144  * A handle (out_entry) is returned for use in vm_map_lookup_done, to make
4145  * that fast.
4146  *
4147  * If a lookup is requested with "write protection" specified, the map may
4148  * be changed to perform virtual copying operations, although the data
4149  * referenced will remain the same.
4150  *
4151  * No requirements.
4152  */
4153 int
4154 vm_map_lookup(vm_map_t *var_map,                /* IN/OUT */
4155               vm_offset_t vaddr,
4156               vm_prot_t fault_typea,
4157               vm_map_entry_t *out_entry,        /* OUT */
4158               vm_object_t *object,              /* OUT */
4159               vm_pindex_t *pindex,              /* OUT */
4160               vm_prot_t *out_prot,              /* OUT */
4161               int *wflags)                      /* OUT */
4162 {
4163         vm_map_entry_t entry;
4164         vm_map_t map = *var_map;
4165         vm_prot_t prot;
4166         vm_prot_t fault_type = fault_typea;
4167         int use_read_lock = 1;
4168         int rv = KERN_SUCCESS;
4169         int count;
4170         thread_t td = curthread;
4171
4172         /*
4173          * vm_map_entry_reserve() implements an important mitigation
4174          * against mmap() span running the kernel out of vm_map_entry
4175          * structures, but it can also cause an infinite call recursion.
4176          * Use td_nest_count to prevent an infinite recursion (allows
4177          * the vm_map code to dig into the pcpu vm_map_entry reserve).
4178          */
4179         count = 0;
4180         if (td->td_nest_count == 0) {
4181                 ++td->td_nest_count;
4182                 count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
4183                 --td->td_nest_count;
4184         }
4185 RetryLookup:
4186         if (use_read_lock)
4187                 vm_map_lock_read(map);
4188         else
4189                 vm_map_lock(map);
4190
4191         /*
4192          * Always do a full lookup.  The hint doesn't get us much anymore
4193          * now that the map is RB'd.
4194          */
4195         cpu_ccfence();
4196         *out_entry = &map->header;
4197         *object = NULL;
4198
4199         {
4200                 vm_map_entry_t tmp_entry;
4201
4202                 if (!vm_map_lookup_entry(map, vaddr, &tmp_entry)) {
4203                         rv = KERN_INVALID_ADDRESS;
4204                         goto done;
4205                 }
4206                 entry = tmp_entry;
4207                 *out_entry = entry;
4208         }
4209
4210         /*
4211          * Handle submaps.
4212          */
4213         if (entry->maptype == VM_MAPTYPE_SUBMAP) {
4214                 vm_map_t old_map = map;
4215
4216                 *var_map = map = entry->object.sub_map;
4217                 if (use_read_lock)
4218                         vm_map_unlock_read(old_map);
4219                 else
4220                         vm_map_unlock(old_map);
4221                 use_read_lock = 1;
4222                 goto RetryLookup;
4223         }
4224
4225         /*
4226          * Check whether this task is allowed to have this page.
4227          * Note the special case for MAP_ENTRY_COW pages with an override.
4228          * This is to implement a forced COW for debuggers.
4229          */
4230         if (fault_type & VM_PROT_OVERRIDE_WRITE)
4231                 prot = entry->max_protection;
4232         else
4233                 prot = entry->protection;
4234
4235         fault_type &= (VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE);
4236         if ((fault_type & prot) != fault_type) {
4237                 rv = KERN_PROTECTION_FAILURE;
4238                 goto done;
4239         }
4240
4241         if ((entry->eflags & MAP_ENTRY_USER_WIRED) &&
4242             (entry->eflags & MAP_ENTRY_COW) &&
4243             (fault_type & VM_PROT_WRITE) &&
4244             (fault_typea & VM_PROT_OVERRIDE_WRITE) == 0) {
4245                 rv = KERN_PROTECTION_FAILURE;
4246                 goto done;
4247         }
4248
4249         /*
4250          * If this page is not pageable, we have to get it for all possible
4251          * accesses.
4252          */
4253         *wflags = 0;
4254         if (entry->wired_count) {
4255                 *wflags |= FW_WIRED;
4256                 prot = fault_type = entry->protection;
4257         }
4258
4259         /*
4260          * Virtual page tables may need to update the accessed (A) bit
4261          * in a page table entry.  Upgrade the fault to a write fault for
4262          * that case if the map will support it.  If the map does not support
4263          * it the page table entry simply will not be updated.
4264          */
4265         if (entry->maptype == VM_MAPTYPE_VPAGETABLE) {
4266                 if (prot & VM_PROT_WRITE)
4267                         fault_type |= VM_PROT_WRITE;
4268         }
4269
4270         if (curthread->td_lwp && curthread->td_lwp->lwp_vmspace &&
4271             pmap_emulate_ad_bits(&curthread->td_lwp->lwp_vmspace->vm_pmap)) {
4272                 if ((prot & VM_PROT_WRITE) == 0)
4273                         fault_type |= VM_PROT_WRITE;
4274         }
4275
4276         /*
4277          * Only NORMAL and VPAGETABLE maps are object-based.  UKSMAPs are not.
4278          */
4279         if (entry->maptype != VM_MAPTYPE_NORMAL &&
4280             entry->maptype != VM_MAPTYPE_VPAGETABLE) {
4281                 *object = NULL;
4282                 goto skip;
4283         }
4284
4285         /*
4286          * If the entry was copy-on-write, we either ...
4287          */
4288         if (entry->eflags & MAP_ENTRY_NEEDS_COPY) {
4289                 /*
4290                  * If we want to write the page, we may as well handle that
4291                  * now since we've got the map locked.
4292                  *
4293                  * If we don't need to write the page, we just demote the
4294                  * permissions allowed.
4295                  */
4296                 if (fault_type & VM_PROT_WRITE) {
4297                         /*
4298                          * Not allowed if TDF_NOFAULT is set as the shadowing
4299                          * operation can deadlock against the faulting
4300                          * function due to the copy-on-write.
4301                          */
4302                         if (curthread->td_flags & TDF_NOFAULT) {
4303                                 rv = KERN_FAILURE_NOFAULT;
4304                                 goto done;
4305                         }
4306
4307                         /*
4308                          * Make a new object, and place it in the object
4309                          * chain.  Note that no new references have appeared
4310                          * -- one just moved from the map to the new
4311                          * object.
4312                          */
4313                         if (use_read_lock && vm_map_lock_upgrade(map)) {
4314                                 /* lost lock */
4315                                 use_read_lock = 0;
4316                                 goto RetryLookup;
4317                         }
4318                         use_read_lock = 0;
4319                         vm_map_entry_shadow(entry, 0);
4320                         *wflags |= FW_DIDCOW;
4321                 } else {
4322                         /*
4323                          * We're attempting to read a copy-on-write page --
4324                          * don't allow writes.
4325                          */
4326                         prot &= ~VM_PROT_WRITE;
4327                 }
4328         }
4329
4330         /*
4331          * Create an object if necessary.  This code also handles
4332          * partitioning large entries to improve vm_fault performance.
4333          */
4334         if (entry->object.vm_object == NULL && !map->system_map) {
4335                 if (use_read_lock && vm_map_lock_upgrade(map))  {
4336                         /* lost lock */
4337                         use_read_lock = 0;
4338                         goto RetryLookup;
4339                 }
4340                 use_read_lock = 0;
4341
4342                 /*
4343                  * Partition large entries, giving each its own VM object,
4344                  * to improve concurrent fault performance.  This is only
4345                  * applicable to userspace.
4346                  */
4347                 if (map != &kernel_map &&
4348                     entry->maptype == VM_MAPTYPE_NORMAL &&
4349                     ((entry->start ^ entry->end) & ~MAP_ENTRY_PARTITION_MASK) &&
4350                     vm_map_partition_enable) {
4351                         if (entry->eflags & MAP_ENTRY_IN_TRANSITION) {
4352                                 entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
4353                                 ++mycpu->gd_cnt.v_intrans_coll;
4354                                 ++mycpu->gd_cnt.v_intrans_wait;
4355                                 vm_map_transition_wait(map, 0);
4356                                 goto RetryLookup;
4357                         }
4358                         vm_map_entry_partition(map, entry, vaddr, &count);
4359                 }
4360                 vm_map_entry_allocate_object(entry);
4361         }
4362
4363         /*
4364          * Return the object/offset from this entry.  If the entry was
4365          * copy-on-write or empty, it has been fixed up.
4366          */
4367         *object = entry->object.vm_object;
4368
4369 skip:
4370         *pindex = OFF_TO_IDX((vaddr - entry->start) + entry->offset);
4371
4372         /*
4373          * Return whether this is the only map sharing this data.  On
4374          * success we return with a read lock held on the map.  On failure
4375          * we return with the map unlocked.
4376          */
4377         *out_prot = prot;
4378 done:
4379         if (rv == KERN_SUCCESS) {
4380                 if (use_read_lock == 0)
4381                         vm_map_lock_downgrade(map);
4382         } else if (use_read_lock) {
4383                 vm_map_unlock_read(map);
4384         } else {
4385                 vm_map_unlock(map);
4386         }
4387         if (count > 0)
4388                 vm_map_entry_release(count);
4389
4390         return (rv);
4391 }
4392
4393 /*
4394  * Releases locks acquired by a vm_map_lookup()
4395  * (according to the handle returned by that lookup).
4396  *
4397  * No other requirements.
4398  */
4399 void
4400 vm_map_lookup_done(vm_map_t map, vm_map_entry_t entry, int count)
4401 {
4402         /*
4403          * Unlock the main-level map
4404          */
4405         vm_map_unlock_read(map);
4406         if (count)
4407                 vm_map_entry_release(count);
4408 }
4409
4410 static void
4411 vm_map_entry_partition(vm_map_t map, vm_map_entry_t entry,
4412                        vm_offset_t vaddr, int *countp)
4413 {
4414         vaddr &= ~MAP_ENTRY_PARTITION_MASK;
4415         vm_map_clip_start(map, entry, vaddr, countp);
4416         vaddr += MAP_ENTRY_PARTITION_SIZE;
4417         vm_map_clip_end(map, entry, vaddr, countp);
4418 }
4419
4420 /*
4421  * Quick hack, needs some help to make it more SMP friendly.
4422  */
4423 void
4424 vm_map_interlock(vm_map_t map, struct vm_map_ilock *ilock,
4425                  vm_offset_t ran_beg, vm_offset_t ran_end)
4426 {
4427         struct vm_map_ilock *scan;
4428
4429         ilock->ran_beg = ran_beg;
4430         ilock->ran_end = ran_end;
4431         ilock->flags = 0;
4432
4433         spin_lock(&map->ilock_spin);
4434 restart:
4435         for (scan = map->ilock_base; scan; scan = scan->next) {
4436                 if (ran_end > scan->ran_beg && ran_beg < scan->ran_end) {
4437                         scan->flags |= ILOCK_WAITING;
4438                         ssleep(scan, &map->ilock_spin, 0, "ilock", 0);
4439                         goto restart;
4440                 }
4441         }
4442         ilock->next = map->ilock_base;
4443         map->ilock_base = ilock;
4444         spin_unlock(&map->ilock_spin);
4445 }
4446
4447 void
4448 vm_map_deinterlock(vm_map_t map, struct  vm_map_ilock *ilock)
4449 {
4450         struct vm_map_ilock *scan;
4451         struct vm_map_ilock **scanp;
4452
4453         spin_lock(&map->ilock_spin);
4454         scanp = &map->ilock_base;
4455         while ((scan = *scanp) != NULL) {
4456                 if (scan == ilock) {
4457                         *scanp = ilock->next;
4458                         spin_unlock(&map->ilock_spin);
4459                         if (ilock->flags & ILOCK_WAITING)
4460                                 wakeup(ilock);
4461                         return;
4462                 }
4463                 scanp = &scan->next;
4464         }
4465         spin_unlock(&map->ilock_spin);
4466         panic("vm_map_deinterlock: missing ilock!");
4467 }
4468
4469 #include "opt_ddb.h"
4470 #ifdef DDB
4471 #include <ddb/ddb.h>
4472
4473 /*
4474  * Debugging only
4475  */
4476 DB_SHOW_COMMAND(map, vm_map_print)
4477 {
4478         static int nlines;
4479         /* XXX convert args. */
4480         vm_map_t map = (vm_map_t)addr;
4481         boolean_t full = have_addr;
4482
4483         vm_map_entry_t entry;
4484
4485         db_iprintf("Task map %p: pmap=%p, nentries=%d, version=%u\n",
4486             (void *)map,
4487             (void *)map->pmap, map->nentries, map->timestamp);
4488         nlines++;
4489
4490         if (!full && db_indent)
4491                 return;
4492
4493         db_indent += 2;
4494         for (entry = map->header.next; entry != &map->header;
4495             entry = entry->next) {
4496                 db_iprintf("map entry %p: start=%p, end=%p\n",
4497                     (void *)entry, (void *)entry->start, (void *)entry->end);
4498                 nlines++;
4499                 {
4500                         static char *inheritance_name[4] =
4501                         {"share", "copy", "none", "donate_copy"};
4502
4503                         db_iprintf(" prot=%x/%x/%s",
4504                             entry->protection,
4505                             entry->max_protection,
4506                             inheritance_name[(int)(unsigned char)
4507                                                 entry->inheritance]);
4508                         if (entry->wired_count != 0)
4509                                 db_printf(", wired");
4510                 }
4511                 switch(entry->maptype) {
4512                 case VM_MAPTYPE_SUBMAP:
4513                         /* XXX no %qd in kernel.  Truncate entry->offset. */
4514                         db_printf(", share=%p, offset=0x%lx\n",
4515                             (void *)entry->object.sub_map,
4516                             (long)entry->offset);
4517                         nlines++;
4518                         if ((entry->prev == &map->header) ||
4519                             (entry->prev->object.sub_map !=
4520                                 entry->object.sub_map)) {
4521                                 db_indent += 2;
4522                                 vm_map_print((db_expr_t)(intptr_t)
4523                                              entry->object.sub_map,
4524                                              full, 0, NULL);
4525                                 db_indent -= 2;
4526                         }
4527                         break;
4528                 case VM_MAPTYPE_NORMAL:
4529                 case VM_MAPTYPE_VPAGETABLE:
4530                         /* XXX no %qd in kernel.  Truncate entry->offset. */
4531                         db_printf(", object=%p, offset=0x%lx",
4532                             (void *)entry->object.vm_object,
4533                             (long)entry->offset);
4534                         if (entry->eflags & MAP_ENTRY_COW)
4535                                 db_printf(", copy (%s)",
4536                                     (entry->eflags & MAP_ENTRY_NEEDS_COPY) ? "needed" : "done");
4537                         db_printf("\n");
4538                         nlines++;
4539
4540                         if ((entry->prev == &map->header) ||
4541                             (entry->prev->object.vm_object !=
4542                                 entry->object.vm_object)) {
4543                                 db_indent += 2;
4544                                 vm_object_print((db_expr_t)(intptr_t)
4545                                                 entry->object.vm_object,
4546                                                 full, 0, NULL);
4547                                 nlines += 4;
4548                                 db_indent -= 2;
4549                         }
4550                         break;
4551                 case VM_MAPTYPE_UKSMAP:
4552                         db_printf(", uksmap=%p, offset=0x%lx",
4553                             (void *)entry->object.uksmap,
4554                             (long)entry->offset);
4555                         if (entry->eflags & MAP_ENTRY_COW)
4556                                 db_printf(", copy (%s)",
4557                                     (entry->eflags & MAP_ENTRY_NEEDS_COPY) ? "needed" : "done");
4558                         db_printf("\n");
4559                         nlines++;
4560                         break;
4561                 default:
4562                         break;
4563                 }
4564         }
4565         db_indent -= 2;
4566         if (db_indent == 0)
4567                 nlines = 0;
4568 }
4569
4570 /*
4571  * Debugging only
4572  */
4573 DB_SHOW_COMMAND(procvm, procvm)
4574 {
4575         struct proc *p;
4576
4577         if (have_addr) {
4578                 p = (struct proc *) addr;
4579         } else {
4580                 p = curproc;
4581         }
4582
4583         db_printf("p = %p, vmspace = %p, map = %p, pmap = %p\n",
4584             (void *)p, (void *)p->p_vmspace, (void *)&p->p_vmspace->vm_map,
4585             (void *)vmspace_pmap(p->p_vmspace));
4586
4587         vm_map_print((db_expr_t)(intptr_t)&p->p_vmspace->vm_map, 1, 0, NULL);
4588 }
4589
4590 #endif /* DDB */