sys/vm/vm_map.c

   1 /*
   2  * (MPSAFE)
   3  *
   4  * Copyright (c) 1991, 1993
   5  *      The Regents of the University of California.  All rights reserved.
   6  *
   7  * This code is derived from software contributed to Berkeley by
   8  * The Mach Operating System project at Carnegie-Mellon University.
   9  *
  10  * Redistribution and use in source and binary forms, with or without
  11  * modification, are permitted provided that the following conditions
  12  * are met:
  13  * 1. Redistributions of source code must retain the above copyright
  14  *    notice, this list of conditions and the following disclaimer.
  15  * 2. Redistributions in binary form must reproduce the above copyright
  16  *    notice, this list of conditions and the following disclaimer in the
  17  *    documentation and/or other materials provided with the distribution.
  18  * 3. All advertising materials mentioning features or use of this software
  19  *    must display the following acknowledgement:
  20  *      This product includes software developed by the University of
  21  *      California, Berkeley and its contributors.
  22  * 4. Neither the name of the University nor the names of its contributors
  23  *    may be used to endorse or promote products derived from this software
  24  *    without specific prior written permission.
  25  *
  26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  36  * SUCH DAMAGE.
  37  *
  38  *      from: @(#)vm_map.c      8.3 (Berkeley) 1/12/94
  39  *
  40  *
  41  * Copyright (c) 1987, 1990 Carnegie-Mellon University.
  42  * All rights reserved.
  43  *
  44  * Authors: Avadis Tevanian, Jr., Michael Wayne Young
  45  *
  46  * Permission to use, copy, modify and distribute this software and
  47  * its documentation is hereby granted, provided that both the copyright
  48  * notice and this permission notice appear in all copies of the
  49  * software, derivative works or modified versions, and any portions
  50  * thereof, and that both notices appear in supporting documentation.
  51  *
  52  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
  53  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
  54  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
  55  *
  56  * Carnegie Mellon requests users of this software to return to
  57  *
  58  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
  59  *  School of Computer Science
  60  *  Carnegie Mellon University
  61  *  Pittsburgh PA 15213-3890
  62  *
  63  * any improvements or extensions that they make and grant Carnegie the
  64  * rights to redistribute these changes.
  65  *
  66  * $FreeBSD: src/sys/vm/vm_map.c,v 1.187.2.19 2003/05/27 00:47:02 alc Exp $
  67  * $DragonFly: src/sys/vm/vm_map.c,v 1.56 2007/04/29 18:25:41 dillon Exp $
  68  */
  69
  70 /*
  71  *      Virtual memory mapping module.
  72  */
  73
  74 #include <sys/param.h>
  75 #include <sys/systm.h>
  76 #include <sys/kernel.h>
  77 #include <sys/proc.h>
  78 #include <sys/lock.h>
  79 #include <sys/vmmeter.h>
  80 #include <sys/mman.h>
  81 #include <sys/vnode.h>
  82 #include <sys/resourcevar.h>
  83 #include <sys/shm.h>
  84 #include <sys/tree.h>
  85 #include <sys/malloc.h>
  86
  87 #include <vm/vm.h>
  88 #include <vm/vm_param.h>
  89 #include <vm/pmap.h>
  90 #include <vm/vm_map.h>
  91 #include <vm/vm_page.h>
  92 #include <vm/vm_object.h>
  93 #include <vm/vm_pager.h>
  94 #include <vm/vm_kern.h>
  95 #include <vm/vm_extern.h>
  96 #include <vm/swap_pager.h>
  97 #include <vm/vm_zone.h>
  98
  99 #include <sys/thread2.h>
 100 #include <sys/sysref2.h>
 101
 102 /*
 103  * Virtual memory maps provide for the mapping, protection, and sharing
 104  * of virtual memory objects.  In addition, this module provides for an
 105  * efficient virtual copy of memory from one map to another.
 106  *
 107  * Synchronization is required prior to most operations.
 108  *
 109  * Maps consist of an ordered doubly-linked list of simple entries.
  110  * A hint and an RB tree are used to speed up lookups.
 111  *
 112  * Callers looking to modify maps specify start/end addresses which cause
 113  * the related map entry to be clipped if necessary, and then later
 114  * recombined if the pieces remained compatible.
 115  *
 116  * Virtual copy operations are performed by copying VM object references
 117  * from one map to another, and then marking both regions as copy-on-write.
 118  */
 119 static void vmspace_terminate(struct vmspace *vm);
 120 static void vmspace_lock(struct vmspace *vm);
 121 static void vmspace_unlock(struct vmspace *vm);
 122 static void vmspace_dtor(void *obj, void *private);
 123
 124 MALLOC_DEFINE(M_VMSPACE, "vmspace", "vmspace objcache backingstore");
 125
 126 struct sysref_class vmspace_sysref_class = {
 127         .name =         "vmspace",
 128         .mtype =        M_VMSPACE,
 129         .proto =        SYSREF_PROTO_VMSPACE,
 130         .offset =       offsetof(struct vmspace, vm_sysref),
 131         .objsize =      sizeof(struct vmspace),
 132         .mag_capacity = 32,
 133         .flags = SRC_MANAGEDINIT,
 134         .dtor = vmspace_dtor,
 135         .ops = {
 136                 .terminate = (sysref_terminate_func_t)vmspace_terminate,
 137                 .lock = (sysref_lock_func_t)vmspace_lock,
 138                 .unlock = (sysref_lock_func_t)vmspace_unlock
 139         }
 140 };
 141
 142 #define VMEPERCPU       2
 143
 144 static struct vm_zone mapentzone_store, mapzone_store;
 145 static vm_zone_t mapentzone, mapzone;
 146 static struct vm_object mapentobj, mapobj;
 147
 148 static struct vm_map_entry map_entry_init[MAX_MAPENT];
 149 static struct vm_map_entry cpu_map_entry_init[MAXCPU][VMEPERCPU];
 150 static struct vm_map map_init[MAX_KMAP];
 151
 152 static void vm_map_entry_shadow(vm_map_entry_t entry);
 153 static vm_map_entry_t vm_map_entry_create(vm_map_t map, int *);
 154 static void vm_map_entry_dispose (vm_map_t map, vm_map_entry_t entry, int *);
 155 static void _vm_map_clip_end (vm_map_t, vm_map_entry_t, vm_offset_t, int *);
 156 static void _vm_map_clip_start (vm_map_t, vm_map_entry_t, vm_offset_t, int *);
 157 static void vm_map_entry_delete (vm_map_t, vm_map_entry_t, int *);
 158 static void vm_map_entry_unwire (vm_map_t, vm_map_entry_t);
 159 static void vm_map_copy_entry (vm_map_t, vm_map_t, vm_map_entry_t,
 160                 vm_map_entry_t);
 161 static void vm_map_split (vm_map_entry_t);
 162 static void vm_map_unclip_range (vm_map_t map, vm_map_entry_t start_entry, vm_offset_t start, vm_offset_t end, int *count, int flags);
 163
 164 /*
 165  * Initialize the vm_map module.  Must be called before any other vm_map
 166  * routines.
 167  *
 168  * Map and entry structures are allocated from the general purpose
 169  * memory pool with some exceptions:
 170  *
 171  *      - The kernel map is allocated statically.
 172  *      - Initial kernel map entries are allocated out of a static pool.
 173  *
 174  *      These restrictions are necessary since malloc() uses the
 175  *      maps and requires map entries.
 176  *
 177  * Called from the low level boot code only.
 178  */
 179 void
 180 vm_map_startup(void)
 181 {
 182         mapzone = &mapzone_store;
 183         zbootinit(mapzone, "MAP", sizeof (struct vm_map),
 184                 map_init, MAX_KMAP);
 185         mapentzone = &mapentzone_store;
 186         zbootinit(mapentzone, "MAP ENTRY", sizeof (struct vm_map_entry),
 187                 map_entry_init, MAX_MAPENT);
 188 }
 189
 190 /*
 191  * Called prior to any vmspace allocations.
 192  *
 193  * Called from the low level boot code only.
 194  */
 195 void
 196 vm_init2(void)
 197 {
 198         zinitna(mapentzone, &mapentobj, NULL, 0, 0,
 199                 ZONE_USE_RESERVE | ZONE_SPECIAL, 1);
 200         zinitna(mapzone, &mapobj, NULL, 0, 0, 0, 1);
 201         pmap_init2();
 202         vm_object_init2();
 203 }
 204
 205
 206 /*
 207  * Red black tree functions
 208  *
 209  * The caller must hold the related map lock.
 210  */
 211 static int rb_vm_map_compare(vm_map_entry_t a, vm_map_entry_t b);
 212 RB_GENERATE(vm_map_rb_tree, vm_map_entry, rb_entry, rb_vm_map_compare);
 213
 214 /* a->start is address, and the only field has to be initialized */
 215 static int
 216 rb_vm_map_compare(vm_map_entry_t a, vm_map_entry_t b)
 217 {
 218         if (a->start < b->start)
 219                 return(-1);
 220         else if (a->start > b->start)
 221                 return(1);
 222         return(0);
 223 }
 224
 225 /*
 226  * Allocate a vmspace structure, including a vm_map and pmap.
 227  * Initialize numerous fields.  While the initial allocation is zerod,
 228  * subsequence reuse from the objcache leaves elements of the structure
 229  * intact (particularly the pmap), so portions must be zerod.
 230  *
 231  * The structure is not considered activated until we call sysref_activate().
 232  *
 233  * No requirements.
 234  */
 235 struct vmspace *
 236 vmspace_alloc(vm_offset_t min, vm_offset_t max)
 237 {
 238         struct vmspace *vm;
 239
 240         lwkt_gettoken(&vmspace_token);
 241         vm = sysref_alloc(&vmspace_sysref_class);
 242         bzero(&vm->vm_startcopy,
 243               (char *)&vm->vm_endcopy - (char *)&vm->vm_startcopy);
 244         vm_map_init(&vm->vm_map, min, max, NULL);
 245         pmap_pinit(vmspace_pmap(vm));           /* (some fields reused) */
 246         vm->vm_map.pmap = vmspace_pmap(vm);             /* XXX */
 247         vm->vm_shm = NULL;
 248         vm->vm_exitingcnt = 0;
 249         cpu_vmspace_alloc(vm);
 250         sysref_activate(&vm->vm_sysref);
 251         lwkt_reltoken(&vmspace_token);
 252
 253         return (vm);
 254 }
 255
 256 /*
 257  * dtor function - Some elements of the pmap are retained in the
 258  * free-cached vmspaces to improve performance.  We have to clean them up
 259  * here before returning the vmspace to the memory pool.
 260  *
 261  * No requirements.
 262  */
 263 static void
 264 vmspace_dtor(void *obj, void *private)
 265 {
 266         struct vmspace *vm = obj;
 267
 268         pmap_puninit(vmspace_pmap(vm));
 269 }
 270
 271 /*
 272  * Called in two cases:
 273  *
 274  * (1) When the last sysref is dropped, but exitingcnt might still be
 275  *     non-zero.
 276  *
 277  * (2) When there are no sysrefs (i.e. refcnt is negative) left and the
 278  *     exitingcnt becomes zero
 279  *
 280  * sysref will not scrap the object until we call sysref_put() once more
 281  * after the last ref has been dropped.
 282  *
 283  * Interlocked by the sysref API.
 284  */
 285 static void
 286 vmspace_terminate(struct vmspace *vm)
 287 {
 288         int count;
 289
 290         /*
 291          * If exitingcnt is non-zero we can't get rid of the entire vmspace
 292          * yet, but we can scrap user memory.
 293          */
 294         lwkt_gettoken(&vmspace_token);
 295         if (vm->vm_exitingcnt) {
 296                 shmexit(vm);
 297                 pmap_remove_pages(vmspace_pmap(vm), VM_MIN_USER_ADDRESS,
 298                                   VM_MAX_USER_ADDRESS);
 299                 vm_map_remove(&vm->vm_map, VM_MIN_USER_ADDRESS,
 300                               VM_MAX_USER_ADDRESS);
 301                 lwkt_reltoken(&vmspace_token);
 302                 return;
 303         }
 304         cpu_vmspace_free(vm);
 305
 306         /*
 307          * Make sure any SysV shm is freed, it might not have in
 308          * exit1()
 309          */
 310         shmexit(vm);
 311
 312         KKASSERT(vm->vm_upcalls == NULL);
 313
 314         /*
 315          * Lock the map, to wait out all other references to it.
 316          * Delete all of the mappings and pages they hold, then call
 317          * the pmap module to reclaim anything left.
 318          */
 319         count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
 320         vm_map_lock(&vm->vm_map);
 321         vm_map_delete(&vm->vm_map, vm->vm_map.min_offset,
 322                 vm->vm_map.max_offset, &count);
 323         vm_map_unlock(&vm->vm_map);
 324         vm_map_entry_release(count);
 325
 326         pmap_release(vmspace_pmap(vm));
 327         sysref_put(&vm->vm_sysref);
 328         lwkt_reltoken(&vmspace_token);
 329 }
 330
 331 /*
 332  * vmspaces are not currently locked.
 333  */
 334 static void
 335 vmspace_lock(struct vmspace *vm __unused)
 336 {
 337 }
 338
 339 static void
 340 vmspace_unlock(struct vmspace *vm __unused)
 341 {
 342 }
 343
 344 /*
 345  * This is called during exit indicating that the vmspace is no
 346  * longer in used by an exiting process, but the process has not yet
 347  * been cleaned up.
 348  *
 349  * No requirements.
 350  */
 351 void
 352 vmspace_exitbump(struct vmspace *vm)
 353 {
 354         lwkt_gettoken(&vmspace_token);
 355         ++vm->vm_exitingcnt;
 356         lwkt_reltoken(&vmspace_token);
 357 }
 358
 359 /*
 360  * This is called in the wait*() handling code.  The vmspace can be terminated
 361  * after the last wait is finished using it.
 362  *
 363  * No requirements.
 364  */
 365 void
 366 vmspace_exitfree(struct proc *p)
 367 {
 368         struct vmspace *vm;
 369
 370         lwkt_gettoken(&vmspace_token);
 371         vm = p->p_vmspace;
 372         p->p_vmspace = NULL;
 373
 374         if (--vm->vm_exitingcnt == 0 && sysref_isinactive(&vm->vm_sysref))
 375                 vmspace_terminate(vm);
 376         lwkt_reltoken(&vmspace_token);
 377 }
 378
 379 /*
 380  * Swap useage is determined by taking the proportional swap used by
 381  * VM objects backing the VM map.  To make up for fractional losses,
 382  * if the VM object has any swap use at all the associated map entries
 383  * count for at least 1 swap page.
 384  *
 385  * No requirements.
 386  */
 387 int
 388 vmspace_swap_count(struct vmspace *vmspace)
 389 {
 390         vm_map_t map = &vmspace->vm_map;
 391         vm_map_entry_t cur;
 392         vm_object_t object;
 393         int count = 0;
 394         int n;
 395
 396         lwkt_gettoken(&vmspace_token);
 397         for (cur = map->header.next; cur != &map->header; cur = cur->next) {
 398                 switch(cur->maptype) {
 399                 case VM_MAPTYPE_NORMAL:
 400                 case VM_MAPTYPE_VPAGETABLE:
 401                         if ((object = cur->object.vm_object) == NULL)
 402                                 break;
 403                         if (object->swblock_count) {
 404                                 n = (cur->end - cur->start) / PAGE_SIZE;
 405                                 count += object->swblock_count *
 406                                     SWAP_META_PAGES * n / object->size + 1;
 407                         }
 408                         break;
 409                 default:
 410                         break;
 411                 }
 412         }
 413         lwkt_reltoken(&vmspace_token);
 414         return(count);
 415 }
 416
 417 /*
 418  * Calculate the approximate number of anonymous pages in use by
 419  * this vmspace.  To make up for fractional losses, we count each
 420  * VM object as having at least 1 anonymous page.
 421  *
 422  * No requirements.
 423  */
 424 int
 425 vmspace_anonymous_count(struct vmspace *vmspace)
 426 {
 427         vm_map_t map = &vmspace->vm_map;
 428         vm_map_entry_t cur;
 429         vm_object_t object;
 430         int count = 0;
 431
 432         lwkt_gettoken(&vmspace_token);
 433         for (cur = map->header.next; cur != &map->header; cur = cur->next) {
 434                 switch(cur->maptype) {
 435                 case VM_MAPTYPE_NORMAL:
 436                 case VM_MAPTYPE_VPAGETABLE:
 437                         if ((object = cur->object.vm_object) == NULL)
 438                                 break;
 439                         if (object->type != OBJT_DEFAULT &&
 440                             object->type != OBJT_SWAP) {
 441                                 break;
 442                         }
 443                         count += object->resident_page_count;
 444                         break;
 445                 default:
 446                         break;
 447                 }
 448         }
 449         lwkt_reltoken(&vmspace_token);
 450         return(count);
 451 }
 452
 453 /*
 454  * Creates and returns a new empty VM map with the given physical map
 455  * structure, and having the given lower and upper address bounds.
 456  *
 457  * No requirements.
 458  */
 459 vm_map_t
 460 vm_map_create(vm_map_t result, pmap_t pmap, vm_offset_t min, vm_offset_t max)
 461 {
 462         if (result == NULL)
 463                 result = zalloc(mapzone);
 464         vm_map_init(result, min, max, pmap);
 465         return (result);
 466 }
 467
 468 /*
 469  * Initialize an existing vm_map structure such as that in the vmspace
 470  * structure.  The pmap is initialized elsewhere.
 471  *
 472  * No requirements.
 473  */
 474 void
 475 vm_map_init(struct vm_map *map, vm_offset_t min, vm_offset_t max, pmap_t pmap)
 476 {
 477         map->header.next = map->header.prev = &map->header;
 478         RB_INIT(&map->rb_root);
 479         map->nentries = 0;
 480         map->size = 0;
 481         map->system_map = 0;
 482         map->infork = 0;
 483         map->min_offset = min;
 484         map->max_offset = max;
 485         map->pmap = pmap;
 486         map->first_free = &map->header;
 487         map->hint = &map->header;
 488         map->timestamp = 0;
 489         lockinit(&map->lock, "thrd_sleep", 0, 0);
 490 }
 491
 492 /*
 493  * Shadow the vm_map_entry's object.  This typically needs to be done when
 494  * a write fault is taken on an entry which had previously been cloned by
 495  * fork().  The shared object (which might be NULL) must become private so
 496  * we add a shadow layer above it.
 497  *
 498  * Object allocation for anonymous mappings is defered as long as possible.
 499  * When creating a shadow, however, the underlying object must be instantiated
 500  * so it can be shared.
 501  *
 502  * If the map segment is governed by a virtual page table then it is
 503  * possible to address offsets beyond the mapped area.  Just allocate
 504  * a maximally sized object for this case.
 505  *
 506  * The vm_map must be exclusively locked.
 507  * No other requirements.
 508  */
 509 static
 510 void
 511 vm_map_entry_shadow(vm_map_entry_t entry)
 512 {
 513         if (entry->maptype == VM_MAPTYPE_VPAGETABLE) {
 514                 vm_object_shadow(&entry->object.vm_object, &entry->offset,
 515                                  0x7FFFFFFF);   /* XXX */
 516         } else {
 517                 vm_object_shadow(&entry->object.vm_object, &entry->offset,
 518                                  atop(entry->end - entry->start));
 519         }
 520         entry->eflags &= ~MAP_ENTRY_NEEDS_COPY;
 521 }
 522
 523 /*
 524  * Allocate an object for a vm_map_entry.
 525  *
 526  * Object allocation for anonymous mappings is defered as long as possible.
 527  * This function is called when we can defer no longer, generally when a map
 528  * entry might be split or forked or takes a page fault.
 529  *
 530  * If the map segment is governed by a virtual page table then it is
 531  * possible to address offsets beyond the mapped area.  Just allocate
 532  * a maximally sized object for this case.
 533  *
 534  * The vm_map must be exclusively locked.
 535  * No other requirements.
 536  */
 537 void
 538 vm_map_entry_allocate_object(vm_map_entry_t entry)
 539 {
 540         vm_object_t obj;
 541
 542         if (entry->maptype == VM_MAPTYPE_VPAGETABLE) {
 543                 obj = vm_object_allocate(OBJT_DEFAULT, 0x7FFFFFFF); /* XXX */
 544         } else {
 545                 obj = vm_object_allocate(OBJT_DEFAULT,
 546                                          atop(entry->end - entry->start));
 547         }
 548         entry->object.vm_object = obj;
 549         entry->offset = 0;
 550 }
 551
 552 /*
 553  * Set an initial negative count so the first attempt to reserve
 554  * space preloads a bunch of vm_map_entry's for this cpu.  Also
 555  * pre-allocate 2 vm_map_entries which will be needed by zalloc() to
 556  * map a new page for vm_map_entry structures.  SMP systems are
 557  * particularly sensitive.
 558  *
 559  * This routine is called in early boot so we cannot just call
 560  * vm_map_entry_reserve().
 561  *
 562  * Called from the low level boot code only (for each cpu)
 563  */
 564 void
 565 vm_map_entry_reserve_cpu_init(globaldata_t gd)
 566 {
 567         vm_map_entry_t entry;
 568         int i;
 569
 570         gd->gd_vme_avail -= MAP_RESERVE_COUNT * 2;
 571         entry = &cpu_map_entry_init[gd->gd_cpuid][0];
 572         for (i = 0; i < VMEPERCPU; ++i, ++entry) {
 573                 entry->next = gd->gd_vme_base;
 574                 gd->gd_vme_base = entry;
 575         }
 576 }
 577
 578 /*
 579  * Reserves vm_map_entry structures so code later on can manipulate
 580  * map_entry structures within a locked map without blocking trying
 581  * to allocate a new vm_map_entry.
 582  *
 583  * No requirements.
 584  */
 585 int
 586 vm_map_entry_reserve(int count)
 587 {
 588         struct globaldata *gd = mycpu;
 589         vm_map_entry_t entry;
 590
 591         /*
 592          * Make sure we have enough structures in gd_vme_base to handle
 593          * the reservation request.
 594          */
 595         crit_enter();
 596         while (gd->gd_vme_avail < count) {
 597                 entry = zalloc(mapentzone);
 598                 entry->next = gd->gd_vme_base;
 599                 gd->gd_vme_base = entry;
 600                 ++gd->gd_vme_avail;
 601         }
 602         gd->gd_vme_avail -= count;
 603         crit_exit();
 604
 605         return(count);
 606 }
 607
 608 /*
 609  * Releases previously reserved vm_map_entry structures that were not
 610  * used.  If we have too much junk in our per-cpu cache clean some of
 611  * it out.
 612  *
 613  * No requirements.
 614  */
 615 void
 616 vm_map_entry_release(int count)
 617 {
 618         struct globaldata *gd = mycpu;
 619         vm_map_entry_t entry;
 620
 621         crit_enter();
 622         gd->gd_vme_avail += count;
 623         while (gd->gd_vme_avail > MAP_RESERVE_SLOP) {
 624                 entry = gd->gd_vme_base;
 625                 KKASSERT(entry != NULL);
 626                 gd->gd_vme_base = entry->next;
 627                 --gd->gd_vme_avail;
 628                 crit_exit();
 629                 zfree(mapentzone, entry);
 630                 crit_enter();
 631         }
 632         crit_exit();
 633 }
 634
 635 /*
 636  * Reserve map entry structures for use in kernel_map itself.  These
 637  * entries have *ALREADY* been reserved on a per-cpu basis when the map
 638  * was inited.  This function is used by zalloc() to avoid a recursion
 639  * when zalloc() itself needs to allocate additional kernel memory.
 640  *
 641  * This function works like the normal reserve but does not load the
 642  * vm_map_entry cache (because that would result in an infinite
 643  * recursion).  Note that gd_vme_avail may go negative.  This is expected.
 644  *
 645  * Any caller of this function must be sure to renormalize after
 646  * potentially eating entries to ensure that the reserve supply
 647  * remains intact.
 648  *
 649  * No requirements.
 650  */
 651 int
 652 vm_map_entry_kreserve(int count)
 653 {
 654         struct globaldata *gd = mycpu;
 655
 656         crit_enter();
 657         gd->gd_vme_avail -= count;
 658         crit_exit();
 659         KASSERT(gd->gd_vme_base != NULL,
 660                 ("no reserved entries left, gd_vme_avail = %d\n",
 661                 gd->gd_vme_avail));
 662         return(count);
 663 }
 664
 665 /*
 666  * Release previously reserved map entries for kernel_map.  We do not
 667  * attempt to clean up like the normal release function as this would
 668  * cause an unnecessary (but probably not fatal) deep procedure call.
 669  *
 670  * No requirements.
 671  */
 672 void
 673 vm_map_entry_krelease(int count)
 674 {
 675         struct globaldata *gd = mycpu;
 676
 677         crit_enter();
 678         gd->gd_vme_avail += count;
 679         crit_exit();
 680 }
 681
 682 /*
 683  * Allocates a VM map entry for insertion.  No entry fields are filled in.
 684  *
 685  * The entries should have previously been reserved.  The reservation count
 686  * is tracked in (*countp).
 687  *
 688  * No requirements.
 689  */
 690 static vm_map_entry_t
 691 vm_map_entry_create(vm_map_t map, int *countp)
 692 {
 693         struct globaldata *gd = mycpu;
 694         vm_map_entry_t entry;
 695
 696         KKASSERT(*countp > 0);
 697         --*countp;
 698         crit_enter();
 699         entry = gd->gd_vme_base;
 700         KASSERT(entry != NULL, ("gd_vme_base NULL! count %d", *countp));
 701         gd->gd_vme_base = entry->next;
 702         crit_exit();
 703
 704         return(entry);
 705 }
 706
 707 /*
 708  * Dispose of a vm_map_entry that is no longer being referenced.
 709  *
 710  * No requirements.
 711  */
 712 static void
 713 vm_map_entry_dispose(vm_map_t map, vm_map_entry_t entry, int *countp)
 714 {
 715         struct globaldata *gd = mycpu;
 716
 717         KKASSERT(map->hint != entry);
 718         KKASSERT(map->first_free != entry);
 719
 720         ++*countp;
 721         crit_enter();
 722         entry->next = gd->gd_vme_base;
 723         gd->gd_vme_base = entry;
 724         crit_exit();
 725 }
 726
 727
 728 /*
 729  * Insert/remove entries from maps.
 730  *
 731  * The related map must be exclusively locked.
 732  * No other requirements.
 733  *
 734  * NOTE! We currently acquire the vmspace_token only to avoid races
 735  *       against the pageout daemon's calls to vmspace_*_count(), which
 736  *       are unable to safely lock the vm_map without potentially
 737  *       deadlocking.
 738  */
 739 static __inline void
 740 vm_map_entry_link(vm_map_t map,
 741                   vm_map_entry_t after_where,
 742                   vm_map_entry_t entry)
 743 {
 744         ASSERT_VM_MAP_LOCKED(map);
 745
 746         lwkt_gettoken(&vmspace_token);
 747         map->nentries++;
 748         entry->prev = after_where;
 749         entry->next = after_where->next;
 750         entry->next->prev = entry;
 751         after_where->next = entry;
 752         if (vm_map_rb_tree_RB_INSERT(&map->rb_root, entry))
 753                 panic("vm_map_entry_link: dup addr map %p ent %p", map, entry);
 754         lwkt_reltoken(&vmspace_token);
 755 }
 756
 757 static __inline void
 758 vm_map_entry_unlink(vm_map_t map,
 759                     vm_map_entry_t entry)
 760 {
 761         vm_map_entry_t prev;
 762         vm_map_entry_t next;
 763
 764         ASSERT_VM_MAP_LOCKED(map);
 765
 766         if (entry->eflags & MAP_ENTRY_IN_TRANSITION) {
 767                 panic("vm_map_entry_unlink: attempt to mess with "
 768                       "locked entry! %p", entry);
 769         }
 770         lwkt_gettoken(&vmspace_token);
 771         prev = entry->prev;
 772         next = entry->next;
 773         next->prev = prev;
 774         prev->next = next;
 775         vm_map_rb_tree_RB_REMOVE(&map->rb_root, entry);
 776         map->nentries--;
 777         lwkt_reltoken(&vmspace_token);
 778 }
 779
 780 /*
 781  * Finds the map entry containing (or immediately preceding) the specified
 782  * address in the given map.  The entry is returned in (*entry).
 783  *
 784  * The boolean result indicates whether the address is actually contained
 785  * in the map.
 786  *
 787  * The related map must be locked.
 788  * No other requirements.
 789  */
 790 boolean_t
 791 vm_map_lookup_entry(vm_map_t map, vm_offset_t address, vm_map_entry_t *entry)
 792 {
 793         vm_map_entry_t tmp;
 794         vm_map_entry_t last;
 795
 796         ASSERT_VM_MAP_LOCKED(map);
 797 #if 0
 798         /*
 799          * XXX TEMPORARILY DISABLED.  For some reason our attempt to revive
 800          * the hint code with the red-black lookup meets with system crashes
 801          * and lockups.  We do not yet know why.
 802          *
 803          * It is possible that the problem is related to the setting
 804          * of the hint during map_entry deletion, in the code specified
 805          * at the GGG comment later on in this file.
 806          */
 807         /*
 808          * Quickly check the cached hint, there's a good chance of a match.
 809          */
 810         if (map->hint != &map->header) {
 811                 tmp = map->hint;
 812                 if (address >= tmp->start && address < tmp->end) {
 813                         *entry = tmp;
 814                         return(TRUE);
 815                 }
 816         }
 817 #endif
 818
 819         /*
 820          * Locate the record from the top of the tree.  'last' tracks the
 821          * closest prior record and is returned if no match is found, which
 822          * in binary tree terms means tracking the most recent right-branch
 823          * taken.  If there is no prior record, &map->header is returned.
 824          */
 825         last = &map->header;
 826         tmp = RB_ROOT(&map->rb_root);
 827
 828         while (tmp) {
 829                 if (address >= tmp->start) {
 830                         if (address < tmp->end) {
 831                                 *entry = tmp;
 832                                 map->hint = tmp;
 833                                 return(TRUE);
 834                         }
 835                         last = tmp;
 836                         tmp = RB_RIGHT(tmp, rb_entry);
 837                 } else {
 838                         tmp = RB_LEFT(tmp, rb_entry);
 839                 }
 840         }
 841         *entry = last;
 842         return (FALSE);
 843 }
 844
 845 /*
 846  * Inserts the given whole VM object into the target map at the specified
 847  * address range.  The object's size should match that of the address range.
 848  * Returns KERN_INVALID_ADDRESS if the range is bogus, KERN_NO_SPACE if it
 849  * overlaps an existing entry, and KERN_SUCCESS otherwise.
 850  *
 851  * The map must be exclusively locked.  The caller must have reserved
 852  * sufficient vm_map_entry structures.  If object is non-NULL, ref count must
 853  * be bumped by caller prior to making call to account for the new entry.
 854  */
 855 int
 856 vm_map_insert(vm_map_t map, int *countp,
 857               vm_object_t object, vm_ooffset_t offset,
 858               vm_offset_t start, vm_offset_t end,
 859               vm_maptype_t maptype,
 860               vm_prot_t prot, vm_prot_t max,
 861               int cow)
 862 {
 863         vm_map_entry_t new_entry;
 864         vm_map_entry_t prev_entry;
 865         vm_map_entry_t temp_entry;
 866         vm_eflags_t protoeflags;
 867
 868         ASSERT_VM_MAP_LOCKED(map);
 869
 870         /*
 871          * Check that the start and end points are not bogus.
 872          */
 873         if ((start < map->min_offset) || (end > map->max_offset) ||
 874             (start >= end))
 875                 return (KERN_INVALID_ADDRESS);
 876
 877         /*
 878          * Find the entry prior to the proposed starting address; if it's part
 879          * of an existing entry, this range is bogus.
 880          */
 881         if (vm_map_lookup_entry(map, start, &temp_entry))
 882                 return (KERN_NO_SPACE);
 883
 884         prev_entry = temp_entry;
 885
 886         /*
 887          * Fail if the next entry overlaps the end point.
 888          */
 889
 890         if ((prev_entry->next != &map->header) &&
 891             (prev_entry->next->start < end))
 892                 return (KERN_NO_SPACE);
 893
 894         protoeflags = 0;
 895
 896         if (cow & MAP_COPY_ON_WRITE)
 897                 protoeflags |= MAP_ENTRY_COW|MAP_ENTRY_NEEDS_COPY;
 898
 899         if (cow & MAP_NOFAULT) {
 900                 protoeflags |= MAP_ENTRY_NOFAULT;
 901
 902                 KASSERT(object == NULL,
 903                         ("vm_map_insert: paradoxical MAP_NOFAULT request"));
 904         }
 905         if (cow & MAP_DISABLE_SYNCER)
 906                 protoeflags |= MAP_ENTRY_NOSYNC;
 907         if (cow & MAP_DISABLE_COREDUMP)
 908                 protoeflags |= MAP_ENTRY_NOCOREDUMP;
 909         if (cow & MAP_IS_STACK)
 910                 protoeflags |= MAP_ENTRY_STACK;
 911
 912         if (object) {
 913                 /*
 914                  * When object is non-NULL, it could be shared with another
 915                  * process.  Clear OBJ_ONEMAPPING if it has more than one
 916                  * reference or is shadowed.
 917                  */
 918                 if ((object->ref_count > 1) || (object->shadow_count != 0)) {
 919                         vm_object_clear_flag(object, OBJ_ONEMAPPING);
 920                 }
 921         }
 922         else if ((prev_entry != &map->header) &&
 923                  (prev_entry->eflags == protoeflags) &&
 924                  (prev_entry->end == start) &&
 925                  (prev_entry->wired_count == 0) &&
 926                  prev_entry->maptype == maptype &&
 927                  ((prev_entry->object.vm_object == NULL) ||
 928                   vm_object_coalesce(prev_entry->object.vm_object,
 929                                      OFF_TO_IDX(prev_entry->offset),
 930                                      (vm_size_t)(prev_entry->end - prev_entry->start),
 931                                      (vm_size_t)(end - prev_entry->end)))) {
 932                 /*
 933                  * We were able to extend the object.  Determine if we
 934                  * can extend the previous map entry to include the
 935                  * new range as well.
 936                  */
 937                 if ((prev_entry->inheritance == VM_INHERIT_DEFAULT) &&
 938                     (prev_entry->protection == prot) &&
 939                     (prev_entry->max_protection == max)) {
 940                         map->size += (end - prev_entry->end);
 941                         prev_entry->end = end;
 942                         vm_map_simplify_entry(map, prev_entry, countp);
 943                         return (KERN_SUCCESS);
 944                 }
 945
 946                 /*
 947                  * If we can extend the object but cannot extend the
 948                  * map entry, we have to create a new map entry.  We
 949                  * must bump the ref count on the extended object to
 950                  * account for it.  object may be NULL.
 951                  */
 952                 object = prev_entry->object.vm_object;
 953                 offset = prev_entry->offset +
 954                         (prev_entry->end - prev_entry->start);
 955                 vm_object_reference(object);
 956         }
 957
 958         /*
 959          * NOTE: if conditionals fail, object can be NULL here.  This occurs
 960          * in things like the buffer map where we manage kva but do not manage
 961          * backing objects.
 962          */
 963
 964         /*
 965          * Create a new entry
 966          */
 967
 968         new_entry = vm_map_entry_create(map, countp);
 969         new_entry->start = start;
 970         new_entry->end = end;
 971
 972         new_entry->maptype = maptype;
 973         new_entry->eflags = protoeflags;
 974         new_entry->object.vm_object = object;
 975         new_entry->offset = offset;
 976         new_entry->aux.master_pde = 0;
 977
 978         new_entry->inheritance = VM_INHERIT_DEFAULT;
 979         new_entry->protection = prot;
 980         new_entry->max_protection = max;
 981         new_entry->wired_count = 0;
 982
 983         /*
 984          * Insert the new entry into the list
 985          */
 986
 987         vm_map_entry_link(map, prev_entry, new_entry);
 988         map->size += new_entry->end - new_entry->start;
 989
 990         /*
 991          * Update the free space hint.  Entries cannot overlap.
 992          * An exact comparison is needed to avoid matching
 993          * against the map->header.
 994          */
 995         if ((map->first_free == prev_entry) &&
 996             (prev_entry->end == new_entry->start)) {
 997                 map->first_free = new_entry;
 998         }
 999
1000 #if 0
1001         /*
1002          * Temporarily removed to avoid MAP_STACK panic, due to
1003          * MAP_STACK being a huge hack.  Will be added back in
1004          * when MAP_STACK (and the user stack mapping) is fixed.
1005          */
1006         /*
1007          * It may be possible to simplify the entry
1008          */
1009         vm_map_simplify_entry(map, new_entry, countp);
1010 #endif
1011
1012         /*
1013          * Try to pre-populate the page table.  Mappings governed by virtual
1014          * page tables cannot be prepopulated without a lot of work, so
1015          * don't try.
1016          */
1017         if ((cow & (MAP_PREFAULT|MAP_PREFAULT_PARTIAL)) &&
1018             maptype != VM_MAPTYPE_VPAGETABLE) {
1019                 pmap_object_init_pt(map->pmap, start, prot,
1020                                     object, OFF_TO_IDX(offset), end - start,
1021                                     cow & MAP_PREFAULT_PARTIAL);
1022         }
1023
1024         return (KERN_SUCCESS);
1025 }
1026
1027 /*
1028  * Find sufficient space for `length' bytes in the given map, starting at
1029  * `start'.  Returns 0 on success (with the address stored in *addr), 1 on
1030  * no space.
1031  * This function will return a pointer aligned to `align'.  If no
1032  * particular alignment is required you should pass align as 1 (an align
1033  * of 0 is treated the same way).  Note that the map may return PAGE_SIZE
1034  * aligned pointers if all the lengths used in the map are a multiple of
1035  * PAGE_SIZE, even if you pass a smaller align argument.
1036  *
1037  * 'align' should be a power of 2 but is not required to be.
1038  *
1039  * The map must be exclusively locked.  No other requirements.
1040  * map->hint is updated; for kernel_map pmap_growkernel() may be called.
1041  */
1042 int
1043 vm_map_findspace(vm_map_t map, vm_offset_t start, vm_size_t length,
1044                  vm_size_t align, int flags, vm_offset_t *addr)
1045 {
1046         vm_map_entry_t entry, next;
1047         vm_offset_t end;
1048         vm_offset_t align_mask;
1049
1050         if (start < map->min_offset)
1051                 start = map->min_offset;
1052         if (start > map->max_offset)
1053                 return (1);
1054
1055         /*
1056          * Non power-of-2 alignments need a division, flagged by the special
1057          * mask -1.  align 0 means "no alignment" (avoids a divide by zero).
1058          */
1059         if ((align | (align - 1)) + 1 != (align << 1))
1060                 align_mask = (vm_offset_t)-1;
1061         else
1062                 align_mask = align ? align - 1 : 0;
1063
1064 retry:
1065         /*
1066          * Look for the first possible address; if there's already something
1067          * at this address, we have to start after it.
1068          */
1069         if (start == map->min_offset) {
1070                 if ((entry = map->first_free) != &map->header)
1071                         start = entry->end;
1072         } else {
1073                 vm_map_entry_t tmp;
1074
1075                 if (vm_map_lookup_entry(map, start, &tmp))
1076                         start = tmp->end;
1077                 entry = tmp;
1078         }
1079
1080         /*
1081          * Look through the rest of the map, trying to fit a new region in the
1082          * gap between existing regions, or after the very last region.
1083          */
1084         for (;; start = (entry = next)->end) {
1085                 /*
1086                  * Adjust the proposed start by the requested alignment,
1087                  * be sure that we didn't wrap the address.
1088                  */
1089                 if (align_mask == (vm_offset_t)-1)
1090                         end = ((start + align - 1) / align) * align;
1091                 else
1092                         end = (start + align_mask) & ~align_mask;
1093                 if (end < start)
1094                         return (1);
1095                 start = end;
1096                 /*
1097                  * Find the end of the proposed new region.  Be sure we didn't
1098                  * go beyond the end of the map, or wrap around the address.
1099                  * Then check to see if this is the last entry or if the
1100                  * proposed end fits in the gap between this and the next
1101                  * entry.
1102                  */
1103                 end = start + length;
1104                 if (end > map->max_offset || end < start)
1105                         return (1);
1106                 next = entry->next;
1107
1108                 /*
1109                  * If the next entry's start address is beyond the desired
1110                  * end address we may have found a good entry.
1111                  *
1112                  * If the next entry is a stack mapping we do not map into
1113                  * the stack's reserved space.
1114                  *
1115                  * XXX continue to allow mapping into the stack's reserved
1116                  * space if doing a MAP_STACK mapping inside a MAP_STACK
1117                  * mapping, for backwards compatibility.  But the caller
1118                  * really should use MAP_STACK | MAP_TRYFIXED if they
1119                  * want to do that.
1120                  */
1121                 if (next == &map->header)
1122                         break;
1123                 if (next->start >= end) {
1124                         if ((next->eflags & MAP_ENTRY_STACK) == 0)
1125                                 break;
1126                         if (flags & MAP_STACK)
1127                                 break;
1128                         if (next->start - next->aux.avail_ssize >= end)
1129                                 break;
1130                 }
1131         }
1132         map->hint = entry;
1133         if (map == &kernel_map) {
1134                 vm_offset_t ksize;
1135                 if ((ksize = round_page(start + length)) > kernel_vm_end) {
1136                         pmap_growkernel(ksize);
1137                         goto retry;
1138                 }
1139         }
1140         *addr = start;
1141         return (0);
1142 }
1143
1144 /*
1145  * vm_map_find finds an unallocated region in the target address map with
1146  * the given length.  The search is defined to be first-fit from the
1147  * specified address; the region found is returned in the same parameter.
1148  * If fitit is FALSE no search is done and the range at *addr is inserted.
1149  * Returns KERN_NO_SPACE if no space is found, else vm_map_insert() result.
1150  *
1151  * If object is non-NULL, ref count must be bumped by caller prior to making
1152  * call to account for the new entry.  No requirements; locks the map itself.
1153  */
1154 int
1155 vm_map_find(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
1156             vm_offset_t *addr,  vm_size_t length, vm_size_t align,
1157             boolean_t fitit,
1158             vm_maptype_t maptype,
1159             vm_prot_t prot, vm_prot_t max,
1160             int cow)
1161 {
1162         vm_offset_t start;
1163         int result;
1164         int count;
1165
1166         start = *addr;
1167
1168         count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
1169         vm_map_lock(map);
1170         if (fitit) {
1171                 if (vm_map_findspace(map, start, length, align, 0, addr)) {
1172                         vm_map_unlock(map);
1173                         vm_map_entry_release(count);
1174                         return (KERN_NO_SPACE);
1175                 }
1176                 start = *addr;
1177         }
1178         result = vm_map_insert(map, &count, object, offset,
1179                                start, start + length,
1180                                maptype,
1181                                prot, max,
1182                                cow);
1183         vm_map_unlock(map);
1184         vm_map_entry_release(count);
1185
1186         return (result);
1187 }
1188
1189 /*
1190  * Simplify the given map entry by merging with either neighbor.  This
1191  * routine also has the ability to merge with both neighbors.  A neighbor
1192  * is merged only if it abuts the entry and all of its attributes match.
1193  * This routine guarantees that the passed entry remains valid (though
1194  * possibly extended).  When merging, this routine may delete one or
1195  * both neighbors.  No action is taken on entries which have their
1196  * in-transition flag set, nor on submap entries.
1197  *
1198  * The map must be exclusively locked.
1199  */
1200 void
1201 vm_map_simplify_entry(vm_map_t map, vm_map_entry_t entry, int *countp)
1202 {
1203         vm_map_entry_t next, prev;
1204         vm_size_t prevsize, esize;
1205
1206         if (entry->eflags & MAP_ENTRY_IN_TRANSITION) {
1207                 ++mycpu->gd_cnt.v_intrans_coll;
1208                 return;
1209         }
1210
1211         if (entry->maptype == VM_MAPTYPE_SUBMAP)
1212                 return;
1213
1214         prev = entry->prev;
1215         if (prev != &map->header) {
1216                 prevsize = prev->end - prev->start;
1217                 if ( (prev->end == entry->start) &&
1218                      (prev->maptype == entry->maptype) &&
1219                      (prev->object.vm_object == entry->object.vm_object) &&
1220                      (!prev->object.vm_object ||
1221                         (prev->offset + prevsize == entry->offset)) &&
1222                      (prev->eflags == entry->eflags) &&
1223                      (prev->protection == entry->protection) &&
1224                      (prev->max_protection == entry->max_protection) &&
1225                      (prev->inheritance == entry->inheritance) &&
1226                      (prev->wired_count == entry->wired_count)) {
1227                         if (map->first_free == prev)
1228                                 map->first_free = entry;
1229                         if (map->hint == prev)
1230                                 map->hint = entry;
1231                         vm_map_entry_unlink(map, prev);
1232                         entry->start = prev->start;
1233                         entry->offset = prev->offset;
1234                         if (prev->object.vm_object)
1235                                 vm_object_deallocate(prev->object.vm_object);
1236                         vm_map_entry_dispose(map, prev, countp);
1237                 }
1238         }
1239
1240         next = entry->next;
1241         if (next != &map->header) {
1242                 esize = entry->end - entry->start;
1243                 if ((entry->end == next->start) &&
1244                     (next->maptype == entry->maptype) &&
1245                     (next->object.vm_object == entry->object.vm_object) &&
1246                      (!entry->object.vm_object ||
1247                         (entry->offset + esize == next->offset)) &&
1248                     (next->eflags == entry->eflags) &&
1249                     (next->protection == entry->protection) &&
1250                     (next->max_protection == entry->max_protection) &&
1251                     (next->inheritance == entry->inheritance) &&
1252                     (next->wired_count == entry->wired_count)) {
1253                         if (map->first_free == next)
1254                                 map->first_free = entry;
1255                         if (map->hint == next)
1256                                 map->hint = entry;
1257                         vm_map_entry_unlink(map, next);
1258                         entry->end = next->end;
1259                         if (next->object.vm_object)
1260                                 vm_object_deallocate(next->object.vm_object);
1261                         vm_map_entry_dispose(map, next, countp);
1262                 }
1263         }
1264 }
1265
1266 /*
1267  * Ensures that the given entry begins at or after the specified address,
1268  * splitting it in two if necessary.  Arguments may be evaluated twice.
1269  */
1270 #define vm_map_clip_start(map, entry, startaddr, countp)                \
1271 {                                                                       \
1272         if (startaddr > entry->start)                                   \
1273                 _vm_map_clip_start(map, entry, startaddr, countp);      \
1274 }
1275
1276 /*
1277  * Split entry at start; called only when the entry is known to need it.
1278  * For normal/vpagetable entries an object reference is added for the new one.
1279  * The map must be exclusively locked.
1280  */
1281 static void
1282 _vm_map_clip_start(vm_map_t map, vm_map_entry_t entry, vm_offset_t start,
1283                    int *countp)
1284 {
1285         vm_map_entry_t new_entry;
1286
1287         /*
1288          * Split off the front portion -- note that we must insert the new
1289          * entry BEFORE this one, so that this entry has the specified
1290          * starting address.  First try to merge with the neighbors.
1291          */
1292
1293         vm_map_simplify_entry(map, entry, countp);
1294
1295         /*
1296          * If there is no object backing this entry, we might as well create
1297          * one now.  If we defer it, an object can get created after the map
1298          * is clipped, and individual objects will be created for the split-up
1299          * map.  This is a bit of a hack, but is also about the best place to
1300          * put this improvement.
1301          */
1302         if (entry->object.vm_object == NULL && !map->system_map) {
1303                 vm_map_entry_allocate_object(entry);
1304         }
1305
1306         new_entry = vm_map_entry_create(map, countp);
1307         *new_entry = *entry;
1308
1309         new_entry->end = start;
1310         entry->offset += (start - entry->start);
1311         entry->start = start;
1312
1313         vm_map_entry_link(map, entry->prev, new_entry);
1314
1315         switch(entry->maptype) {
1316         case VM_MAPTYPE_NORMAL:
1317         case VM_MAPTYPE_VPAGETABLE:
1318                 vm_object_reference(new_entry->object.vm_object);
1319                 break;
1320         default:
1321                 break;
1322         }
1323 }
1324
1325 /*
1326  * Ensures that the given entry ends at or before the specified address,
1327  * splitting it in two if necessary.  Arguments may be evaluated twice.
1328  *
1329  * The map must be exclusively locked.
1330  */
1331 #define vm_map_clip_end(map, entry, endaddr, countp)            \
1332 {                                                               \
1333         if (endaddr < entry->end)                               \
1334                 _vm_map_clip_end(map, entry, endaddr, countp);  \
1335 }
1336
1337 /*
1338  * Split entry at end; called only when the entry is known to need it.
1339  * For normal/vpagetable entries an object reference is added for the new one.
1340  * The map must be exclusively locked.
1341  */
1342 static void
1343 _vm_map_clip_end(vm_map_t map, vm_map_entry_t entry, vm_offset_t end,
1344                  int *countp)
1345 {
1346         vm_map_entry_t new_entry;
1347
1348         /*
1349          * If there is no object backing this entry, we might as well create
1350          * one now.  If we defer it, an object can get created after the map
1351          * is clipped, and individual objects will be created for the split-up
1352          * map.  This is a bit of a hack, but is also about the best place to
1353          * put this improvement.
1354          */
1355
1356         if (entry->object.vm_object == NULL && !map->system_map) {
1357                 vm_map_entry_allocate_object(entry);
1358         }
1359
1360         /*
1361          * Create a new entry for the tail and insert it AFTER the given entry
1362          */
1363
1364         new_entry = vm_map_entry_create(map, countp);
1365         *new_entry = *entry;
1366
1367         new_entry->start = entry->end = end;
1368         new_entry->offset += (end - entry->start);
1369
1370         vm_map_entry_link(map, entry, new_entry);
1371
1372         switch(entry->maptype) {
1373         case VM_MAPTYPE_NORMAL:
1374         case VM_MAPTYPE_VPAGETABLE:
1375                 vm_object_reference(new_entry->object.vm_object);
1376                 break;
1377         default:
1378                 break;
1379         }
1380 }
1381
1382 /*
1383  * Clamps the starting and ending region addresses to the valid range for
1384  * the map.  start and end are modified in place and must be lvalues.
1385  */
1386 #define VM_MAP_RANGE_CHECK(map, start, end)     \
1387 {                                               \
1388         if (start < vm_map_min(map))            \
1389                 start = vm_map_min(map);        \
1390         if (end > vm_map_max(map))              \
1391                 end = vm_map_max(map);          \
1392         if (start > end)                        \
1393                 start = end;                    \
1394 }
1395
1396 /*
1397  * Used to block when an in-transition collision occurs.  The map is unlocked
1398  * for the sleep on the map address (see wakeup(map)) and relocked on return.
1399  */
1400 static
1401 void
1402 vm_map_transition_wait(vm_map_t map)
1403 {
1404         vm_map_unlock(map);
1405         tsleep(map, 0, "vment", 0);
1406         vm_map_lock(map);
1407 }
1408
1409 /*
1410  * When we do blocking operations with the map lock held it is possible
1411  * that a clip might have occurred on our in-transit entry, requiring an
1412  * adjustment to the entry in our loop.  These macros help the pageable and
1413  * clip_range code deal with the case.  The conditional costs virtually nothing
1414  * if no clipping has occurred.  Both macros use `map' from the calling scope.
1415  */
1416
1417 #define CLIP_CHECK_BACK(entry, save_start)              \
1418     do {                                                \
1419             while (entry->start != save_start) {        \
1420                     entry = entry->prev;                \
1421                     KASSERT(entry != &map->header, ("bad entry clip")); \
1422             }                                           \
1423     } while(0)
1424
1425 #define CLIP_CHECK_FWD(entry, save_end)                 \
1426     do {                                                \
1427             while (entry->end != save_end) {            \
1428                     entry = entry->next;                \
1429                     KASSERT(entry != &map->header, ("bad entry clip")); \
1430             }                                           \
1431     } while(0)
1432
1433
1434 /*
1435  * Clip the specified range and return the base entry.  The
1436  * range may cover several entries starting at the returned base
1437  * and the first and last entry in the covering sequence will be
1438  * properly clipped to the requested start and end address.
1439  *
1440  * If no holes are allowed you should pass the MAP_CLIP_NO_HOLES
1441  * flag.
1442  *
1443  * The MAP_ENTRY_IN_TRANSITION flag will be set for the entries
1444  * covered by the requested range.
1445  *
1446  * The map must be exclusively locked on entry and will remain locked
1447  * on return. If no range exists or the range contains holes and you
1448  * specified that no holes were allowed, NULL will be returned.  This
1449  * routine may temporarily unlock the map in order to avoid a deadlock
1450  * when sleeping.
1451  */
1452 static
1453 vm_map_entry_t
1454 vm_map_clip_range(vm_map_t map, vm_offset_t start, vm_offset_t end,
1455                   int *countp, int flags)
1456 {
1457         vm_map_entry_t start_entry;
1458         vm_map_entry_t entry;
1459
1460         /*
1461          * Locate the entry and effect initial clipping.  The in-transition
1462          * case does not occur very often so do not try to optimize it.
1463          */
1464 again:
1465         if (vm_map_lookup_entry(map, start, &start_entry) == FALSE)
1466                 return (NULL);
1467         entry = start_entry;
1468         if (entry->eflags & MAP_ENTRY_IN_TRANSITION) {
1469                 entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
1470                 ++mycpu->gd_cnt.v_intrans_coll;
1471                 ++mycpu->gd_cnt.v_intrans_wait;
1472                 vm_map_transition_wait(map);
1473                 /*
1474                  * entry and/or start_entry may have been clipped while
1475                  * we slept, or may have gone away entirely.  We have
1476                  * to restart from the lookup.
1477                  */
1478                 goto again;
1479         }
1480
1481         /*
1482          * Since we hold an exclusive map lock we do not have to restart
1483          * after clipping, even though clipping may block in zalloc.
1484          */
1485         vm_map_clip_start(map, entry, start, countp);
1486         vm_map_clip_end(map, entry, end, countp);
1487         entry->eflags |= MAP_ENTRY_IN_TRANSITION;
1488
1489         /*
1490          * Scan entries covered by the range.  When working on the next
1491          * entry a restart need only re-loop on the current entry which
1492          * we have already locked, since 'next' may have changed.  Also,
1493          * even though entry is safe, it may have been clipped so we
1494          * have to iterate forwards through the clip after sleeping.
1495          */
1496         while (entry->next != &map->header && entry->next->start < end) {
1497                 vm_map_entry_t next = entry->next;
1498
1499                 if (flags & MAP_CLIP_NO_HOLES) {
1500                         if (next->start > entry->end) {
1501                                 vm_map_unclip_range(map, start_entry,
1502                                         start, entry->end, countp, flags);
1503                                 return(NULL);
1504                         }
1505                 }
1506
1507                 if (next->eflags & MAP_ENTRY_IN_TRANSITION) {
1508                         vm_offset_t save_end = entry->end;
1509                         next->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
1510                         ++mycpu->gd_cnt.v_intrans_coll;
1511                         ++mycpu->gd_cnt.v_intrans_wait;
1512                         vm_map_transition_wait(map);
1513
1514                         /*
1515                          * clips might have occurred while we blocked.
1516                          */
1517                         CLIP_CHECK_FWD(entry, save_end);
1518                         CLIP_CHECK_BACK(start_entry, start);
1519                         continue;
1520                 }
1521                 /*
1522                  * No restart necessary even though clip_end may block, we
1523                  * are holding the map lock.
1524                  */
1525                 vm_map_clip_end(map, next, end, countp);
1526                 next->eflags |= MAP_ENTRY_IN_TRANSITION;
1527                 entry = next;
1528         }
1529         if (flags & MAP_CLIP_NO_HOLES) {
1530                 if (entry->end != end) {
1531                         vm_map_unclip_range(map, start_entry,
1532                                 start, entry->end, countp, flags);
1533                         return(NULL);
1534                 }
1535         }
1536         return(start_entry);
1537 }
1538
1539 /*
1540  * Undo the effect of vm_map_clip_range().  You should pass the same flags
1541  * and the same range (or the clipped prefix of it) as to vm_map_clip_range().
1542  * This code will clear the in-transition flag on the entries and
1543  * wake up anyone waiting.  This code will also simplify the sequence
1544  * and attempt to merge it with entries before and after the sequence.
1545  *
1546  * The map must be locked on entry and will remain locked on return.
1547  * The flags argument is currently not examined.
1548  * Note that you should also pass the start_entry returned by
1549  * vm_map_clip_range().  However, if you block between the two calls
1550  * with the map unlocked please be aware that the start_entry may
1551  * have been clipped and you may need to scan it backwards to find
1552  * the entry corresponding with the original start address.  You are
1553  * responsible for this, vm_map_unclip_range() expects the correct
1554  * start_entry to be passed to it and will KASSERT otherwise.
1555  */
1556 static
1557 void
1558 vm_map_unclip_range(vm_map_t map, vm_map_entry_t start_entry,
1559                     vm_offset_t start, vm_offset_t end,
1560                     int *countp, int flags)
1561 {
1562         vm_map_entry_t entry;
1563
1564         entry = start_entry;
1565
1566         KASSERT(entry->start == start, ("unclip_range: illegal base entry"));
1567         while (entry != &map->header && entry->start < end) {
1568                 KASSERT(entry->eflags & MAP_ENTRY_IN_TRANSITION,
1569                         ("in-transition flag not set during unclip on: %p",
1570                         entry));
1571                 KASSERT(entry->end <= end,
1572                         ("unclip_range: tail wasn't clipped"));
1573                 entry->eflags &= ~MAP_ENTRY_IN_TRANSITION;
1574                 if (entry->eflags & MAP_ENTRY_NEEDS_WAKEUP) {
1575                         entry->eflags &= ~MAP_ENTRY_NEEDS_WAKEUP;
1576                         wakeup(map);
1577                 }
1578                 entry = entry->next;
1579         }
1580
1581         /*
1582          * Simplification does not block so there is no restart case.
1583          */
1584         entry = start_entry;
1585         while (entry != &map->header && entry->start < end) {
1586                 vm_map_simplify_entry(map, entry, countp);
1587                 entry = entry->next;
1588         }
1589 }
1590
1591 /*
1592  * Mark the given range as handled by a subordinate map.
1593  *
1594  * This range must have been created with vm_map_find(), and no other
1595  * operations may have been performed on this range prior to calling
1596  * vm_map_submap().  KERN_INVALID_ARGUMENT is returned unless one entry
1597  * exactly spans the range, is not COW and has no backing object.
1598  * Submappings cannot be removed.
1599  *
1600  * No requirements.
1601  */
1602 int
1603 vm_map_submap(vm_map_t map, vm_offset_t start, vm_offset_t end, vm_map_t submap)
1604 {
1605         vm_map_entry_t entry;
1606         int result = KERN_INVALID_ARGUMENT;
1607         int count;
1608
1609         count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
1610         vm_map_lock(map);
1611
1612         VM_MAP_RANGE_CHECK(map, start, end);
1613
1614         if (vm_map_lookup_entry(map, start, &entry)) {
1615                 vm_map_clip_start(map, entry, start, &count);
1616         } else {
1617                 entry = entry->next;
1618         }
1619
1620         vm_map_clip_end(map, entry, end, &count);
1621
1622         if ((entry->start == start) && (entry->end == end) &&
1623             ((entry->eflags & MAP_ENTRY_COW) == 0) &&
1624             (entry->object.vm_object == NULL)) {
1625                 entry->object.sub_map = submap;
1626                 entry->maptype = VM_MAPTYPE_SUBMAP;
1627                 result = KERN_SUCCESS;
1628         }
1629         vm_map_unlock(map);
1630         vm_map_entry_release(count);
1631
1632         return (result);
1633 }
1634
1635 /*
1636  * Sets the protection of the specified address region in the target map.
1637  * If "set_max" is specified, the maximum protection is to be set;
1638  * otherwise, only the current protection is affected.
1639  *
1640  * The protection is not applicable to submaps, but is applicable to normal
1641  * maps and maps governed by virtual page tables.  For example, when operating
1642  * on a virtual page table our protection basically controls how COW occurs
1643  * on the backing object, whereas the virtual page table abstraction itself
1644  * is an abstraction for userland.
1645  *
1646  * No requirements.
1647  */
1648 int
1649 vm_map_protect(vm_map_t map, vm_offset_t start, vm_offset_t end,
1650                vm_prot_t new_prot, boolean_t set_max)
1651 {
1652         vm_map_entry_t current;
1653         vm_map_entry_t entry;
1654         int count;
1655
1656         count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
1657         vm_map_lock(map);
1658
1659         VM_MAP_RANGE_CHECK(map, start, end);
1660
1661         if (vm_map_lookup_entry(map, start, &entry)) {
1662                 vm_map_clip_start(map, entry, start, &count);
1663         } else {
1664                 entry = entry->next;
1665         }
1666
1667         /*
1668          * Make a first pass to check for protection violations.
1669          */
1670         current = entry;
1671         while ((current != &map->header) && (current->start < end)) {
1672                 if (current->maptype == VM_MAPTYPE_SUBMAP) {
1673                         vm_map_unlock(map);
1674                         vm_map_entry_release(count);
1675                         return (KERN_INVALID_ARGUMENT);
1676                 }
1677                 if ((new_prot & current->max_protection) != new_prot) {
1678                         vm_map_unlock(map);
1679                         vm_map_entry_release(count);
1680                         return (KERN_PROTECTION_FAILURE);
1681                 }
1682                 current = current->next;
1683         }
1684
1685         /*
1686          * Go back and fix up protections. [Note that clipping is not
1687          * necessary the second time.]
1688          */
1689         current = entry;
1690
1691         while ((current != &map->header) && (current->start < end)) {
1692                 vm_prot_t old_prot;
1693
1694                 vm_map_clip_end(map, current, end, &count);
1695
1696                 old_prot = current->protection;
1697                 if (set_max) {
1698                         current->protection =
1699                             (current->max_protection = new_prot) &
1700                             old_prot;
1701                 } else {
1702                         current->protection = new_prot;
1703                 }
1704
1705                 /*
1706                  * Update physical map if necessary. Worry about copy-on-write
1707                  * here -- CHECK THIS XXX
1708                  */
1709
1710                 if (current->protection != old_prot) {
1711 #define MASK(entry)     (((entry)->eflags & MAP_ENTRY_COW) ? ~VM_PROT_WRITE : \
1712                                                         VM_PROT_ALL)
1713
1714                         pmap_protect(map->pmap, current->start,
1715                             current->end,
1716                             current->protection & MASK(current));
1717 #undef  MASK
1718                 }
1719
1720                 vm_map_simplify_entry(map, current, &count);
1721
1722                 current = current->next;
1723         }
1724
1725         vm_map_unlock(map);
1726         vm_map_entry_release(count);
1727         return (KERN_SUCCESS);
1728 }
1729
1730 /*
1731  * This routine traverses a processes map handling the madvise
1732  * system call.  Advisories are classified as either those effecting
1733  * the vm_map_entry structure, or those effecting the underlying
1734  * objects.
1735  *
1736  * The <value> argument is used for extended madvise calls.
1737  *
1738  * No requirements.
1739  */
1740 int
1741 vm_map_madvise(vm_map_t map, vm_offset_t start, vm_offset_t end,
1742                int behav, off_t value)
1743 {
1744         vm_map_entry_t current, entry;
1745         int modify_map = 0;
1746         int error = 0;
1747         int count;
1748
1749         /*
1750          * Some madvise calls directly modify the vm_map_entry, in which case
1751          * we need to use an exclusive lock on the map and we need to perform
1752          * various clipping operations.  Otherwise we only need a read-lock
1753          * on the map.
1754          */
1755
1756         count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
1757
1758         switch(behav) {
1759         case MADV_NORMAL:
1760         case MADV_SEQUENTIAL:
1761         case MADV_RANDOM:
1762         case MADV_NOSYNC:
1763         case MADV_AUTOSYNC:
1764         case MADV_NOCORE:
1765         case MADV_CORE:
1766         case MADV_SETMAP:
1767         case MADV_INVAL:
1768                 modify_map = 1;
1769                 vm_map_lock(map);
1770                 break;
1771         case MADV_WILLNEED:
1772         case MADV_DONTNEED:
1773         case MADV_FREE:
1774                 vm_map_lock_read(map);
1775                 break;
1776         default:
1777                 vm_map_entry_release(count);
1778                 return (EINVAL);
1779         }
1780
1781         /*
1782          * Locate starting entry and clip if necessary.
1783          */
1784
1785         VM_MAP_RANGE_CHECK(map, start, end);
1786
1787         if (vm_map_lookup_entry(map, start, &entry)) {
1788                 if (modify_map)
1789                         vm_map_clip_start(map, entry, start, &count);
1790         } else {
1791                 entry = entry->next;
1792         }
1793
1794         if (modify_map) {
1795                 /*
1796                  * madvise behaviors that are implemented in the vm_map_entry.
1797                  *
1798                  * We clip the vm_map_entry so that behavioral changes are
1799                  * limited to the specified address range.
1800                  */
1801                 for (current = entry;
1802                      (current != &map->header) && (current->start < end);
1803                      current = current->next
1804                 ) {
1805                         if (current->maptype == VM_MAPTYPE_SUBMAP)
1806                                 continue;
1807
1808                         vm_map_clip_end(map, current, end, &count);
1809
1810                         switch (behav) {
1811                         case MADV_NORMAL:
1812                                 vm_map_entry_set_behavior(current, MAP_ENTRY_BEHAV_NORMAL);
1813                                 break;
1814                         case MADV_SEQUENTIAL:
1815                                 vm_map_entry_set_behavior(current, MAP_ENTRY_BEHAV_SEQUENTIAL);
1816                                 break;
1817                         case MADV_RANDOM:
1818                                 vm_map_entry_set_behavior(current, MAP_ENTRY_BEHAV_RANDOM);
1819                                 break;
1820                         case MADV_NOSYNC:
1821                                 current->eflags |= MAP_ENTRY_NOSYNC;
1822                                 break;
1823                         case MADV_AUTOSYNC:
1824                                 current->eflags &= ~MAP_ENTRY_NOSYNC;
1825                                 break;
1826                         case MADV_NOCORE:
1827                                 current->eflags |= MAP_ENTRY_NOCOREDUMP;
1828                                 break;
1829                         case MADV_CORE:
1830                                 current->eflags &= ~MAP_ENTRY_NOCOREDUMP;
1831                                 break;
1832                         case MADV_INVAL:
1833                                 /*
1834                                  * Invalidate the related pmap entries, used
1835                                  * to flush portions of the real kernel's
1836                                  * pmap when the caller has removed or
1837                                  * modified existing mappings in a virtual
1838                                  * page table.
1839                                  */
1840                                 pmap_remove(map->pmap,
1841                                             current->start, current->end);
1842                                 break;
1843                         case MADV_SETMAP:
1844                                 /*
1845                                  * Set the page directory page for a map
1846                                  * governed by a virtual page table.  Mark
1847                                  * the entry as being governed by a virtual
1848                                  * page table if it is not.
1849                                  *
1850                                  * XXX the page directory page is stored
1851                                  * in the avail_ssize field if the map_entry.
1852                                  *
1853                                  * XXX the map simplification code does not
1854                                  * compare this field so weird things may
1855                                  * happen if you do not apply this function
1856                                  * to the entire mapping governed by the
1857                                  * virtual page table.
1858                                  */
1859                                 if (current->maptype != VM_MAPTYPE_VPAGETABLE) {
1860                                         error = EINVAL;
1861                                         break;
1862                                 }
1863                                 current->aux.master_pde = value;
1864                                 pmap_remove(map->pmap,
1865                                             current->start, current->end);
1866                                 break;
1867                         default:
1868                                 error = EINVAL;
1869                                 break;
1870                         }
1871                         vm_map_simplify_entry(map, current, &count);
1872                 }
1873                 vm_map_unlock(map);
1874         } else {
1875                 vm_pindex_t pindex;
1876                 int count;
1877
1878                 /*
1879                  * madvise behaviors that are implemented in the underlying
1880                  * vm_object.
1881                  *
1882                  * Since we don't clip the vm_map_entry, we have to clip
1883                  * the vm_object pindex and count.
1884                  *
1885                  * NOTE!  We currently do not support these functions on
1886                  * virtual page tables.
1887                  */
1888                 for (current = entry;
1889                      (current != &map->header) && (current->start < end);
1890                      current = current->next
1891                 ) {
1892                         vm_offset_t useStart;
1893
1894                         if (current->maptype != VM_MAPTYPE_NORMAL)
1895                                 continue;
1896
1897                         pindex = OFF_TO_IDX(current->offset);
1898                         count = atop(current->end - current->start);
1899                         useStart = current->start;
1900
1901                         if (current->start < start) {
1902                                 pindex += atop(start - current->start);
1903                                 count -= atop(start - current->start);
1904                                 useStart = start;
1905                         }
1906                         if (current->end > end)
1907                                 count -= atop(current->end - end);
1908
1909                         if (count <= 0)
1910                                 continue;
1911
1912                         vm_object_madvise(current->object.vm_object,
1913                                           pindex, count, behav);
1914
1915                         /*
1916                          * Try to populate the page table.  Mappings governed
1917                          * by virtual page tables cannot be pre-populated
1918                          * without a lot of work so don't try.
1919                          */
1920                         if (behav == MADV_WILLNEED &&
1921                             current->maptype != VM_MAPTYPE_VPAGETABLE) {
1922                                 pmap_object_init_pt(
1923                                     map->pmap,
1924                                     useStart,
1925                                     current->protection,
1926                                     current->object.vm_object,
1927                                     pindex,
1928                                     (count << PAGE_SHIFT),
1929                                     MAP_PREFAULT_MADVISE
1930                                 );
1931                         }
1932                 }
1933                 vm_map_unlock_read(map);
1934         }
1935         vm_map_entry_release(count);
1936         return(error);
1937 }
1938
1939
1940 /*
1941  * Sets the inheritance of the specified address range in the target map.
1942  * Inheritance affects how the map will be shared with child maps at the
1943  * time of vm_map_fork.
1944  */
1945 int
1946 vm_map_inherit(vm_map_t map, vm_offset_t start, vm_offset_t end,
1947                vm_inherit_t new_inheritance)
1948 {
1949         vm_map_entry_t entry;
1950         vm_map_entry_t temp_entry;
1951         int count;
1952
1953         switch (new_inheritance) {
1954         case VM_INHERIT_NONE:
1955         case VM_INHERIT_COPY:
1956         case VM_INHERIT_SHARE:
1957                 break;
1958         default:
1959                 return (KERN_INVALID_ARGUMENT);
1960         }
1961
1962         count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
1963         vm_map_lock(map);
1964
1965         VM_MAP_RANGE_CHECK(map, start, end);
1966
1967         if (vm_map_lookup_entry(map, start, &temp_entry)) {
1968                 entry = temp_entry;
1969                 vm_map_clip_start(map, entry, start, &count);
1970         } else
1971                 entry = temp_entry->next;
1972
1973         while ((entry != &map->header) && (entry->start < end)) {
1974                 vm_map_clip_end(map, entry, end, &count);
1975
1976                 entry->inheritance = new_inheritance;
1977
1978                 vm_map_simplify_entry(map, entry, &count);
1979
1980                 entry = entry->next;
1981         }
1982         vm_map_unlock(map);
1983         vm_map_entry_release(count);
1984         return (KERN_SUCCESS);
1985 }
1986
/*
 * Implement the semantics of mlock (user wiring) and its inverse on the
 * address range [start, real_end) of 'map'.
 *
 * new_pageable == 0:  user-wire the range.  Each entry not already marked
 *                     MAP_ENTRY_USER_WIRED gets the flag and a wired_count
 *                     reference; entries that were completely unwired are
 *                     also faulted in via vm_fault_wire().
 * new_pageable != 0:  remove the user wiring.  Every entry in the range
 *                     must carry MAP_ENTRY_USER_WIRED.
 *
 * Returns KERN_SUCCESS on success, KERN_INVALID_ADDRESS if
 * vm_map_clip_range() fails for the range (it is called with
 * MAP_CLIP_NO_HOLES), KERN_INVALID_ARGUMENT if an entry to be unwired is
 * not user wired, or the error returned by vm_fault_wire().
 */
int
vm_map_unwire(vm_map_t map, vm_offset_t start, vm_offset_t real_end,
              boolean_t new_pageable)
{
        vm_map_entry_t entry;
        vm_map_entry_t start_entry;
        vm_offset_t end;
        int rv = KERN_SUCCESS;
        int count;

        count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
        vm_map_lock(map);
        VM_MAP_RANGE_CHECK(map, start, real_end);
        /*
         * 'end' is the working limit and may be pulled back on failure;
         * real_end is kept for the final vm_map_unclip_range().
         */
        end = real_end;

        start_entry = vm_map_clip_range(map, start, end, &count,
                                        MAP_CLIP_NO_HOLES);
        if (start_entry == NULL) {
                vm_map_unlock(map);
                vm_map_entry_release(count);
                return (KERN_INVALID_ADDRESS);
        }

        if (new_pageable == 0) {
                entry = start_entry;
                while ((entry != &map->header) && (entry->start < end)) {
                        vm_offset_t save_start;
                        vm_offset_t save_end;

                        /*
                         * Already user wired or hard wired (trivial cases).
                         * A user-wired entry is skipped; an entry that is
                         * wired but not user wired just gains a reference
                         * and the USER_WIRED flag.
                         */
                        if (entry->eflags & MAP_ENTRY_USER_WIRED) {
                                entry = entry->next;
                                continue;
                        }
                        if (entry->wired_count != 0) {
                                entry->wired_count++;
                                entry->eflags |= MAP_ENTRY_USER_WIRED;
                                entry = entry->next;
                                continue;
                        }

                        /*
                         * A new wiring requires instantiation of appropriate
                         * management structures and the faulting in of the
                         * page.
                         */
                        if (entry->maptype != VM_MAPTYPE_SUBMAP) {
                                int copyflag = entry->eflags &
                                               MAP_ENTRY_NEEDS_COPY;
                                if (copyflag && ((entry->protection &
                                                  VM_PROT_WRITE) != 0)) {
                                        vm_map_entry_shadow(entry);
                                } else if (entry->object.vm_object == NULL &&
                                           !map->system_map) {
                                        vm_map_entry_allocate_object(entry);
                                }
                        }
                        entry->wired_count++;
                        entry->eflags |= MAP_ENTRY_USER_WIRED;

                        /*
                         * Now fault in the area.  Note that vm_fault_wire()
                         * may release the map lock temporarily, it will be
                         * relocked on return.  The in-transition
                         * flag protects the entries.
                         */
                        save_start = entry->start;
                        save_end = entry->end;
                        rv = vm_fault_wire(map, entry, TRUE);
                        if (rv) {
                                /*
                                 * Fault failed.  Back out the wiring on
                                 * every fragment covering the saved
                                 * [save_start, save_end) range, then pull
                                 * 'end' back so the code below only
                                 * unwires what precedes the failure.
                                 */
                                CLIP_CHECK_BACK(entry, save_start);
                                for (;;) {
                                        KASSERT(entry->wired_count == 1, ("bad wired_count on entry"));
                                        entry->eflags &= ~MAP_ENTRY_USER_WIRED;
                                        entry->wired_count = 0;
                                        if (entry->end == save_end)
                                                break;
                                        entry = entry->next;
                                        KASSERT(entry != &map->header, ("bad entry clip during backout"));
                                }
                                end = save_start;       /* unwire the rest */
                                break;
                        }
                        /*
                         * note that even though the entry might have been
                         * clipped, the USER_WIRED flag we set prevents
                         * duplication so we do not have to do a
                         * clip check.
                         */
                        entry = entry->next;
                }

                /*
                 * If we failed fall through to the unwiring section to
                 * unwire what we had wired so far.  'end' has already
                 * been adjusted.
                 *
                 * NOTE(review): the unwiring section does not distinguish
                 * entries that were already USER_WIRED before this call
                 * (skipped above) from those wired by it -- confirm this
                 * is intended.
                 */
                if (rv)
                        new_pageable = 1;

                /*
                 * start_entry might have been clipped if we unlocked the
                 * map and blocked.  No matter how clipped it has gotten
                 * there should be a fragment that is on our start boundary.
                 */
                CLIP_CHECK_BACK(start_entry, start);
        }

        /*
         * Deal with the unwiring case.
         */
        if (new_pageable) {
                /*
                 * This is the unwiring case.  We must first ensure that the
                 * range to be unwired is really wired down.  We know there
                 * are no holes.
                 */
                entry = start_entry;
                while ((entry != &map->header) && (entry->start < end)) {
                        if ((entry->eflags & MAP_ENTRY_USER_WIRED) == 0) {
                                rv = KERN_INVALID_ARGUMENT;
                                goto done;
                        }
                        KASSERT(entry->wired_count != 0, ("wired count was 0 with USER_WIRED set! %p", entry));
                        entry = entry->next;
                }

                /*
                 * Now decrement the wiring count for each region. If a region
                 * becomes completely unwired, unwire its physical pages and
                 * mappings.
                 */
                /*
                 * The validation loop above leaves 'entry' at the end of
                 * the range, so it must be reset to start_entry here.
                 * Without the reset this second loop would never be
                 * entered, the pages backing the entries would never be
                 * unwired, and wired pages would leak.
                 */
                entry = start_entry;
                while ((entry != &map->header) && (entry->start < end)) {
                        KASSERT(entry->eflags & MAP_ENTRY_USER_WIRED,
                                ("expected USER_WIRED on entry %p", entry));
                        entry->eflags &= ~MAP_ENTRY_USER_WIRED;
                        entry->wired_count--;
                        if (entry->wired_count == 0)
                                vm_fault_unwire(map, entry);
                        entry = entry->next;
                }
        }
done:
        /*
         * Undo the clip over the full original range (real_end, not the
         * possibly shortened 'end') and release the map.
         */
        vm_map_unclip_range(map, start_entry, start, real_end, &count,
                MAP_CLIP_NO_HOLES);
        map->timestamp++;
        vm_map_unlock(map);
        vm_map_entry_release(count);
        return (rv);
}
2153
/*
 * Sets the pageability of the specified address range in the target map.
 * Regions specified as not pageable require locked-down physical
 * memory and physical page maps.
 *
 * kmflags:
 *      KM_PAGEABLE     - unwire the range instead of wiring it
 *      KM_KRESERVE     - reserve/release map entries with the
 *                        vm_map_entry_kreserve()/krelease() variants
 *
 * Returns KERN_SUCCESS on success, KERN_INVALID_ADDRESS if
 * vm_map_clip_range() fails for the range (it is called with
 * MAP_CLIP_NO_HOLES), KERN_INVALID_ARGUMENT if an entry to be unwired has
 * a zero wired_count, or the error returned by vm_fault_wire().
 *
 * The map must not be locked, but a reference must remain to the map
 * throughout the call.
 *
 * This function may be called via the zalloc path and must properly
 * reserve map entries for kernel_map.
 *
 * No requirements.
 */
int
vm_map_wire(vm_map_t map, vm_offset_t start, vm_offset_t real_end, int kmflags)
{
        vm_map_entry_t entry;
        vm_map_entry_t start_entry;
        vm_offset_t end;
        int rv = KERN_SUCCESS;
        int count;

        if (kmflags & KM_KRESERVE)
                count = vm_map_entry_kreserve(MAP_RESERVE_COUNT);
        else
                count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
        vm_map_lock(map);
        VM_MAP_RANGE_CHECK(map, start, real_end);
        /*
         * 'end' is the working limit and may be pulled back on failure;
         * real_end is kept for the final vm_map_unclip_range().
         */
        end = real_end;

        start_entry = vm_map_clip_range(map, start, end, &count,
                                        MAP_CLIP_NO_HOLES);
        if (start_entry == NULL) {
                vm_map_unlock(map);
                rv = KERN_INVALID_ADDRESS;
                goto failure;
        }
        if ((kmflags & KM_PAGEABLE) == 0) {
                /*
                 * Wiring.
                 *
                 * 1.  Holding the write lock, we create any shadow or zero-fill
                 * objects that need to be created. Then we clip each map
                 * entry to the region to be wired and increment its wiring
                 * count.  We create objects before clipping the map entries
                 * to avoid object proliferation.
                 *
                 * 2.  We downgrade to a read lock, and call vm_fault_wire to
                 * fault in the pages for any newly wired area (wired_count is
                 * 1).
                 *
                 * Downgrading to a read lock for vm_fault_wire avoids a
                 * possible deadlock with another process that may have faulted
                 * on one of the pages to be wired (it would mark the page busy,
                 * blocking us, then in turn block on the map lock that we
                 * hold).  Because of problems in the recursive lock package,
                 * we cannot upgrade to a write lock in vm_map_lookup.  Thus,
                 * any actions that require the write lock must be done
                 * beforehand.  Because we keep the read lock on the map, the
                 * copy-on-write status of the entries we modify here cannot
                 * change.
                 */
                entry = start_entry;
                while ((entry != &map->header) && (entry->start < end)) {
                        /*
                         * Trivial case if the entry is already wired
                         */
                        if (entry->wired_count) {
                                entry->wired_count++;
                                entry = entry->next;
                                continue;
                        }

                        /*
                         * The entry is being newly wired, we have to setup
                         * appropriate management structures.  A shadow
                         * object is required for a copy-on-write region,
                         * or a normal object for a zero-fill region.  We
                         * do not have to do this for entries that point to sub
                         * maps because we won't hold the lock on the sub map.
                         */
                        if (entry->maptype != VM_MAPTYPE_SUBMAP) {
                                int copyflag = entry->eflags &
                                               MAP_ENTRY_NEEDS_COPY;
                                if (copyflag && ((entry->protection &
                                                  VM_PROT_WRITE) != 0)) {
                                        vm_map_entry_shadow(entry);
                                } else if (entry->object.vm_object == NULL &&
                                           !map->system_map) {
                                        vm_map_entry_allocate_object(entry);
                                }
                        }

                        entry->wired_count++;
                        entry = entry->next;
                }

                /*
                 * Pass 2.
                 */

                /*
                 * HACK HACK HACK HACK
                 *
                 * vm_fault_wire() temporarily unlocks the map to avoid
                 * deadlocks.  The in-transition flag from vm_map_clip_range
                 * call should protect us from changes while the map is
                 * unlocked.
                 *
                 * NOTE: Previously this comment stated that clipping might
                 *       still occur while the entry is unlocked, but from
                 *       what I can tell it actually cannot.
                 *
                 *       It is unclear whether the CLIP_CHECK_*() calls
                 *       are still needed but we keep them in anyway.
                 *
                 * HACK HACK HACK HACK
                 */

                entry = start_entry;
                while (entry != &map->header && entry->start < end) {
                        /*
                         * If vm_fault_wire fails for any page we need to undo
                         * what has been done.  We decrement the wiring count
                         * for those pages which have not yet been wired (now)
                         * and unwire those that have (later).
                         */
                        vm_offset_t save_start = entry->start;
                        vm_offset_t save_end = entry->end;

                        /* Only entries newly wired by pass 1 are faulted. */
                        if (entry->wired_count == 1)
                                rv = vm_fault_wire(map, entry, FALSE);
                        if (rv) {
                                /*
                                 * Reset the wiring on every fragment
                                 * covering [save_start, save_end) and pull
                                 * 'end' back so the unwiring code below
                                 * handles only [start, save_start).
                                 *
                                 * NOTE(review): entries beyond save_end
                                 * were incremented in pass 1 but do not
                                 * appear to be decremented on this path --
                                 * confirm against the intent stated above.
                                 */
                                CLIP_CHECK_BACK(entry, save_start);
                                for (;;) {
                                        KASSERT(entry->wired_count == 1, ("wired_count changed unexpectedly"));
                                        entry->wired_count = 0;
                                        if (entry->end == save_end)
                                                break;
                                        entry = entry->next;
                                        KASSERT(entry != &map->header, ("bad entry clip during backout"));
                                }
                                end = save_start;
                                break;
                        }
                        CLIP_CHECK_FWD(entry, save_end);
                        entry = entry->next;
                }

                /*
                 * If a failure occurred undo everything by falling through
                 * to the unwiring code.  'end' has already been adjusted
                 * appropriately.
                 */
                if (rv)
                        kmflags |= KM_PAGEABLE;

                /*
                 * start_entry is still IN_TRANSITION but may have been
                 * clipped since vm_fault_wire() unlocks and relocks the
                 * map.  No matter how clipped it has gotten there should
                 * be a fragment that is on our start boundary.
                 */
                CLIP_CHECK_BACK(start_entry, start);
        }

        if (kmflags & KM_PAGEABLE) {
                /*
                 * This is the unwiring case.  We must first ensure that the
                 * range to be unwired is really wired down.  We know there
                 * are no holes.
                 */
                entry = start_entry;
                while ((entry != &map->header) && (entry->start < end)) {
                        if (entry->wired_count == 0) {
                                rv = KERN_INVALID_ARGUMENT;
                                goto done;
                        }
                        entry = entry->next;
                }

                /*
                 * Now decrement the wiring count for each region. If a region
                 * becomes completely unwired, unwire its physical pages and
                 * mappings.
                 */
                entry = start_entry;
                while ((entry != &map->header) && (entry->start < end)) {
                        entry->wired_count--;
                        if (entry->wired_count == 0)
                                vm_fault_unwire(map, entry);
                        entry = entry->next;
                }
        }
done:
        /*
         * Undo the clip over the full original range (real_end, not the
         * possibly shortened 'end') and release the map.
         */
        vm_map_unclip_range(map, start_entry, start, real_end,
                            &count, MAP_CLIP_NO_HOLES);
        map->timestamp++;
        vm_map_unlock(map);
failure:
        /* Release with the variant matching the reservation made above. */
        if (kmflags & KM_KRESERVE)
                vm_map_entry_krelease(count);
        else
                vm_map_entry_release(count);
        return (rv);
}
2360
2361 /*
2362  * Mark a newly allocated address range as wired but do not fault in
2363  * the pages.  The caller is expected to load the pages into the object.
2364  *
2365  * The map must be locked on entry and will remain locked on return.
2366  * No other requirements.
2367  */
2368 void
2369 vm_map_set_wired_quick(vm_map_t map, vm_offset_t addr, vm_size_t size,
2370                        int *countp)
2371 {
2372         vm_map_entry_t scan;
2373         vm_map_entry_t entry;
2374
2375         entry = vm_map_clip_range(map, addr, addr + size,
2376                                   countp, MAP_CLIP_NO_HOLES);
2377         for (scan = entry;
2378              scan != &map->header && scan->start < addr + size;
2379              scan = scan->next) {
2380             KKASSERT(entry->wired_count == 0);
2381             entry->wired_count = 1;
2382         }
2383         vm_map_unclip_range(map, entry, addr, addr + size,
2384                             countp, MAP_CLIP_NO_HOLES);
2385 }
2386
/*
 * Push any dirty cached pages in the address range to their pager.
 * If syncio is TRUE, dirty pages are written synchronously.
 * If invalidate is TRUE, any cached pages are freed as well (and the
 * flush is also performed synchronously).
 *
 * This routine is called by sys_msync()
 *
 * map        - map to operate on; read-locked for the duration
 * start, end - address range, adjusted by VM_MAP_RANGE_CHECK()
 * syncio     - request OBJPC_SYNC when cleaning
 * invalidate - remove the pmap mappings for the range and request
 *              OBJPC_INVAL / page removal on the backing objects
 *
 * Returns KERN_INVALID_ADDRESS if 'start' is not mapped or if there is
 * a hole anywhere in the range, KERN_INVALID_ARGUMENT if the range
 * contains a submap entry, and KERN_SUCCESS otherwise.
 *
 * No requirements.
 */
int
vm_map_clean(vm_map_t map, vm_offset_t start, vm_offset_t end,
             boolean_t syncio, boolean_t invalidate)
{
        vm_map_entry_t current;
        vm_map_entry_t entry;
        vm_size_t size;
        vm_object_t object;
        vm_ooffset_t offset;

        vm_map_lock_read(map);
        VM_MAP_RANGE_CHECK(map, start, end);
        if (!vm_map_lookup_entry(map, start, &entry)) {
                vm_map_unlock_read(map);
                return (KERN_INVALID_ADDRESS);
        }
        /*
         * Make a first pass to check for holes.
         *
         * The loop never steps onto &map->header: whenever the range
         * extends past the current entry and the next entry is the
         * header (or is not contiguous) we return from inside the loop.
         */
        for (current = entry; current->start < end; current = current->next) {
                if (current->maptype == VM_MAPTYPE_SUBMAP) {
                        vm_map_unlock_read(map);
                        return (KERN_INVALID_ARGUMENT);
                }
                if (end > current->end &&
                    (current->next == &map->header ||
                        current->end != current->next->start)) {
                        vm_map_unlock_read(map);
                        return (KERN_INVALID_ADDRESS);
                }
        }

        if (invalidate)
                pmap_remove(vm_map_pmap(map), start, end);

        /*
         * Make a second pass, cleaning/uncaching pages from the indicated
         * objects as we go.
         *
         * Hold vm_token to avoid blocking in vm_object_reference()
         */
        lwkt_gettoken(&vm_token);
        for (current = entry; current->start < end; current = current->next) {
                /*
                 * 'offset' is the object offset corresponding to 'start'
                 * within this entry; 'size' is the number of bytes from
                 * 'start' to the end of this entry or of the range,
                 * whichever comes first.  'start' is advanced by 'size'
                 * at the bottom of the loop.
                 */
                offset = current->offset + (start - current->start);
                size = (end <= current->end ? end : current->end) - start;
                if (current->maptype == VM_MAPTYPE_SUBMAP) {
                        /*
                         * NOTE(review): the first pass returns an error
                         * for any submap entry in the range, so this
                         * branch appears to be unreachable - confirm.
                         */
                        vm_map_t smap;
                        vm_map_entry_t tentry;
                        vm_size_t tsize;

                        smap = current->object.sub_map;
                        vm_map_lock_read(smap);
                        vm_map_lookup_entry(smap, offset, &tentry);
                        tsize = tentry->end - offset;
                        if (tsize < size)
                                size = tsize;
                        object = tentry->object.vm_object;
                        offset = tentry->offset + (offset - tentry->start);
                        vm_map_unlock_read(smap);
                } else {
                        object = current->object.vm_object;
                }
                /*
                 * Note that there is absolutely no sense in writing out
                 * anonymous objects, so we track down the vnode object
                 * to write out.
                 * We invalidate (remove) all pages from the address space
                 * anyway, for semantic correctness.
                 *
                 * note: certain anonymous maps, such as MAP_NOSYNC maps,
                 * may start out with a NULL object.
                 *
                 * The loop below follows the backing_object chain to its
                 * end, accumulating backing_object_offset into 'offset'
                 * and clamping 'size' when a backing object is smaller
                 * than offset + size.
                 */
                while (object && object->backing_object) {
                        offset += object->backing_object_offset;
                        object = object->backing_object;
                        if (object->size < OFF_TO_IDX( offset + size))
                                size = IDX_TO_OFF(object->size) - offset;
                }
                if (object && (object->type == OBJT_VNODE) &&
                    (current->protection & VM_PROT_WRITE) &&
                    (object->flags & OBJ_NOMSYNC) == 0) {
                        /*
                         * Flush pages if writing is allowed, invalidate them
                         * if invalidation requested.  Pages undergoing I/O
                         * will be ignored by vm_object_page_remove().
                         *
                         * We cannot lock the vnode and then wait for paging
                         * to complete without deadlocking against vm_fault.
                         * Instead we simply call vm_object_page_remove() and
                         * allow it to block internally on a page-by-page
                         * basis when it encounters pages undergoing async
                         * I/O.
                         */
                        int flags;

                        /*
                         * Hold a reference on the object across the vnode
                         * lock and the clean; it is dropped again below.
                         */
                        vm_object_reference(object);
                        vn_lock(object->handle, LK_EXCLUSIVE | LK_RETRY);
                        flags = (syncio || invalidate) ? OBJPC_SYNC : 0;
                        flags |= invalidate ? OBJPC_INVAL : 0;

                        /*
                         * When operating on a virtual page table just
                         * flush the whole object.  XXX we probably ought
                         * to
                         *
                         * (Other map types fall through the switch without
                         * cleaning anything.)
                         */
                        switch(current->maptype) {
                        case VM_MAPTYPE_NORMAL:
                                vm_object_page_clean(object,
                                    OFF_TO_IDX(offset),
                                    OFF_TO_IDX(offset + size + PAGE_MASK),
                                    flags);
                                break;
                        case VM_MAPTYPE_VPAGETABLE:
                                vm_object_page_clean(object, 0, 0, flags);
                                break;
                        }
                        vn_unlock(((struct vnode *)object->handle));
                        vm_object_deallocate(object);
                }
                /*
                 * On invalidate, remove the pages from vnode and device
                 * objects.  clean_only is FALSE for device objects and
                 * TRUE for vnode objects.
                 */
                if (object && invalidate &&
                   ((object->type == OBJT_VNODE) ||
                    (object->type == OBJT_DEVICE))) {
                        int clean_only =
                                (object->type == OBJT_DEVICE) ? FALSE : TRUE;
                        vm_object_reference(object);
                        switch(current->maptype) {
                        case VM_MAPTYPE_NORMAL:
                                vm_object_page_remove(object,
                                    OFF_TO_IDX(offset),
                                    OFF_TO_IDX(offset + size + PAGE_MASK),
                                    clean_only);
                                break;
                        case VM_MAPTYPE_VPAGETABLE:
                                vm_object_page_remove(object, 0, 0, clean_only);
                                break;
                        }
                        vm_object_deallocate(object);
                }
                start += size;
        }
        vm_map_unlock_read(map);
        lwkt_reltoken(&vm_token);

        return (KERN_SUCCESS);
}
2543
2544 /*
2545  * Make the region specified by this entry pageable.
2546  *
2547  * The vm_map must be exclusively locked.
2548  */
2549 static void
2550 vm_map_entry_unwire(vm_map_t map, vm_map_entry_t entry)
2551 {
2552         entry->eflags &= ~MAP_ENTRY_USER_WIRED;
2553         entry->wired_count = 0;
2554         vm_fault_unwire(map, entry);
2555 }
2556
2557 /*
2558  * Deallocate the given entry from the target map.
2559  *
2560  * The vm_map must be exclusively locked.
2561  */
2562 static void
2563 vm_map_entry_delete(vm_map_t map, vm_map_entry_t entry, int *countp)
2564 {
2565         vm_map_entry_unlink(map, entry);
2566         map->size -= entry->end - entry->start;
2567
2568         switch(entry->maptype) {
2569         case VM_MAPTYPE_NORMAL:
2570         case VM_MAPTYPE_VPAGETABLE:
2571                 vm_object_deallocate(entry->object.vm_object);
2572                 break;
2573         default:
2574                 break;
2575         }
2576
2577         vm_map_entry_dispose(map, entry, countp);
2578 }
2579
/*
 * Deallocates the given address range from the target map.
 *
 * map        - map to remove the range from
 * start, end - address range to remove
 * countp     - entry reservation count, passed through to the clip and
 *              entry-delete helpers
 *
 * Entries straddling 'start' or 'end' are clipped so that only the part
 * inside the range is removed.  If an in-transition entry is found the
 * routine sleeps in vm_map_transition_wait() and restarts from that
 * entry.
 *
 * Always returns KERN_SUCCESS.
 *
 * The vm_map must be exclusively locked.
 */
int
vm_map_delete(vm_map_t map, vm_offset_t start, vm_offset_t end, int *countp)
{
        vm_object_t object;
        vm_map_entry_t entry;
        vm_map_entry_t first_entry;

        ASSERT_VM_MAP_LOCKED(map);
again:
        /*
         * Find the start of the region, and clip it.  Set entry to point
         * at the first record containing the requested address or, if no
         * such record exists, the next record with a greater address.  The
         * loop will run from this point until a record beyond the termination
         * address is encountered.
         *
         * map->hint must be adjusted to not point to anything we delete,
         * so set it to the entry prior to the one being deleted.
         *
         * GGG see other GGG comment.
         */
        if (vm_map_lookup_entry(map, start, &first_entry)) {
                entry = first_entry;
                vm_map_clip_start(map, entry, start, countp);
                map->hint = entry->prev;        /* possible problem XXX */
        } else {
                map->hint = first_entry;        /* possible problem XXX */
                entry = first_entry->next;
        }

        /*
         * If a hole opens up prior to the current first_free then
         * adjust first_free.  As with map->hint, map->first_free
         * cannot be left set to anything we might delete.
         */
        if (entry == &map->header) {
                map->first_free = &map->header;
        } else if (map->first_free->start >= start) {
                map->first_free = entry->prev;
        }

        /*
         * Step through all entries in this region
         */
        while ((entry != &map->header) && (entry->start < end)) {
                vm_map_entry_t next;
                vm_offset_t s, e;
                vm_pindex_t offidxstart, offidxend, count;

                /*
                 * If we hit an in-transition entry we have to sleep and
                 * retry.  It's easier (and not really slower) to just retry
                 * since this case occurs so rarely and the hint is already
                 * pointing at the right place.  We have to reset the
                 * start offset so as not to accidentally delete an entry
                 * another process just created in vacated space.
                 */
                if (entry->eflags & MAP_ENTRY_IN_TRANSITION) {
                        entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
                        start = entry->start;
                        ++mycpu->gd_cnt.v_intrans_coll;
                        ++mycpu->gd_cnt.v_intrans_wait;
                        vm_map_transition_wait(map);
                        goto again;
                }
                vm_map_clip_end(map, entry, end, countp);

                /*
                 * Snapshot everything we need from the entry now; the
                 * entry itself is deleted at the bottom of the loop.
                 */
                s = entry->start;
                e = entry->end;
                next = entry->next;

                /*
                 * Page-index range [offidxstart, offidxend) within the
                 * backing object that corresponds to [s, e).
                 */
                offidxstart = OFF_TO_IDX(entry->offset);
                count = OFF_TO_IDX(e - s);
                object = entry->object.vm_object;

                /*
                 * Unwire before removing addresses from the pmap; otherwise,
                 * unwiring will put the entries back in the pmap.
                 */
                if (entry->wired_count != 0)
                        vm_map_entry_unwire(map, entry);

                offidxend = offidxstart + count;

                /*
                 * Hold vm_token when manipulating vm_objects.
                 */
                lwkt_gettoken(&vm_token);
                if (object == &kernel_object) {
                        /*
                         * Kernel object: remove the pages from the object;
                         * no pmap_remove() is issued on this path.
                         */
                        vm_object_page_remove(object, offidxstart,
                                              offidxend, FALSE);
                } else {
                        pmap_remove(map->pmap, s, e);
                        /*
                         * For a default/swap object that has other
                         * references, has OBJ_ONEMAPPING set and
                         * OBJ_NOSPLIT clear, release the pages and swap
                         * space covering the removed range, and shrink
                         * the object if the range reached its end.
                         */
                        if (object != NULL &&
                            object->ref_count != 1 &&
                            (object->flags & (OBJ_NOSPLIT|OBJ_ONEMAPPING)) ==
                             OBJ_ONEMAPPING &&
                            (object->type == OBJT_DEFAULT ||
                             object->type == OBJT_SWAP)) {
                                vm_object_collapse(object);
                                vm_object_page_remove(object, offidxstart,
                                                      offidxend, FALSE);
                                if (object->type == OBJT_SWAP) {
                                        swap_pager_freespace(object,
                                                             offidxstart,
                                                             count);
                                }
                                if (offidxend >= object->size &&
                                    offidxstart < object->size) {
                                        object->size = offidxstart;
                                }
                        }
                }
                lwkt_reltoken(&vm_token);

                /*
                 * Delete the entry (which may delete the object) only after
                 * removing all pmap entries pointing to its pages.
                 * (Otherwise, its page frames may be reallocated, and any
                 * modify bits will be set in the wrong object!)
                 */
                vm_map_entry_delete(map, entry, countp);
                entry = next;
        }
        return (KERN_SUCCESS);
}
2711
2712 /*
2713  * Remove the given address range from the target map.
2714  * This is the exported form of vm_map_delete.
2715  *
2716  * No requirements.
2717  */
2718 int
2719 vm_map_remove(vm_map_t map, vm_offset_t start, vm_offset_t end)
2720 {
2721         int result;
2722         int count;
2723
2724         count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
2725         vm_map_lock(map);
2726         VM_MAP_RANGE_CHECK(map, start, end);
2727         result = vm_map_delete(map, start, end, &count);
2728         vm_map_unlock(map);
2729         vm_map_entry_release(count);
2730
2731         return (result);
2732 }
2733
2734 /*
2735  * Assert that the target map allows the specified privilege on the
2736  * entire address region given.  The entire region must be allocated.
2737  *
2738  * The caller must specify whether the vm_map is already locked or not.
2739  */
2740 boolean_t
2741 vm_map_check_protection(vm_map_t map, vm_offset_t start, vm_offset_t end,
2742                         vm_prot_t protection, boolean_t have_lock)
2743 {
2744         vm_map_entry_t entry;
2745         vm_map_entry_t tmp_entry;
2746         boolean_t result;
2747
2748         if (have_lock == FALSE)
2749                 vm_map_lock_read(map);
2750
2751         if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
2752                 if (have_lock == FALSE)
2753                         vm_map_unlock_read(map);
2754                 return (FALSE);
2755         }
2756         entry = tmp_entry;
2757
2758         result = TRUE;
2759         while (start < end) {
2760                 if (entry == &map->header) {
2761                         result = FALSE;
2762                         break;
2763                 }
2764                 /*
2765                  * No holes allowed!
2766                  */
2767
2768                 if (start < entry->start) {
2769                         result = FALSE;
2770                         break;
2771                 }
2772                 /*
2773                  * Check protection associated with entry.
2774                  */
2775
2776                 if ((entry->protection & protection) != protection) {
2777                         result = FALSE;
2778                         break;
2779                 }
2780                 /* go to next entry */
2781
2782                 start = entry->end;
2783                 entry = entry->next;
2784         }
2785         if (have_lock == FALSE)
2786                 vm_map_unlock_read(map);
2787         return (result);
2788 }
2789
2790 /*
2791  * Split the pages in a map entry into a new object.  This affords
2792  * easier removal of unused pages, and keeps object inheritance from
2793  * being a negative impact on memory usage.
2794  *
2795  * The vm_map must be exclusively locked.
2796  */
2797 static void
2798 vm_map_split(vm_map_entry_t entry)
2799 {
2800         vm_page_t m;
2801         vm_object_t orig_object, new_object, source;
2802         vm_offset_t s, e;
2803         vm_pindex_t offidxstart, offidxend, idx;
2804         vm_size_t size;
2805         vm_ooffset_t offset;
2806
2807         orig_object = entry->object.vm_object;
2808         if (orig_object->type != OBJT_DEFAULT && orig_object->type != OBJT_SWAP)
2809                 return;
2810         if (orig_object->ref_count <= 1)
2811                 return;
2812
2813         offset = entry->offset;
2814         s = entry->start;
2815         e = entry->end;
2816
2817         offidxstart = OFF_TO_IDX(offset);
2818         offidxend = offidxstart + OFF_TO_IDX(e - s);
2819         size = offidxend - offidxstart;
2820
2821         switch(orig_object->type) {
2822         case OBJT_DEFAULT:
2823                 new_object = default_pager_alloc(NULL, IDX_TO_OFF(size),
2824                                                  VM_PROT_ALL, 0);
2825                 break;
2826         case OBJT_SWAP:
2827                 new_object = swap_pager_alloc(NULL, IDX_TO_OFF(size),
2828                                               VM_PROT_ALL, 0);
2829                 break;
2830         default:
2831                 /* not reached */
2832                 new_object = NULL;
2833                 KKASSERT(0);
2834         }
2835         if (new_object == NULL)
2836                 return;
2837
2838         /*
2839          * vm_token required when manipulating vm_objects.
2840          */
2841         lwkt_gettoken(&vm_token);
2842
2843         source = orig_object->backing_object;
2844         if (source != NULL) {
2845                 vm_object_reference(source);    /* Referenced by new_object */
2846                 LIST_INSERT_HEAD(&source->shadow_head,
2847                                   new_object, shadow_list);
2848                 vm_object_clear_flag(source, OBJ_ONEMAPPING);
2849                 new_object->backing_object_offset =
2850                         orig_object->backing_object_offset + IDX_TO_OFF(offidxstart);
2851                 new_object->backing_object = source;
2852                 source->shadow_count++;
2853                 source->generation++;
2854         }
2855
2856         for (idx = 0; idx < size; idx++) {
2857                 vm_page_t m;
2858
2859                 crit_enter();
2860         retry:
2861                 m = vm_page_lookup(orig_object, offidxstart + idx);
2862                 if (m == NULL) {
2863                         crit_exit();
2864                         continue;
2865                 }
2866
2867                 /*
2868                  * We must wait for pending I/O to complete before we can
2869                  * rename the page.
2870                  *
2871                  * We do not have to VM_PROT_NONE the page as mappings should
2872                  * not be changed by this operation.
2873                  */
2874                 if (vm_page_sleep_busy(m, TRUE, "spltwt"))
2875                         goto retry;
2876                 vm_page_busy(m);
2877                 vm_page_rename(m, new_object, idx);
2878                 /* page automatically made dirty by rename and cache handled */
2879                 vm_page_busy(m);
2880                 crit_exit();
2881         }
2882
2883         if (orig_object->type == OBJT_SWAP) {
2884                 vm_object_pip_add(orig_object, 1);
2885                 /*
2886                  * copy orig_object pages into new_object
2887                  * and destroy unneeded pages in
2888                  * shadow object.
2889                  */
2890                 swap_pager_copy(orig_object, new_object, offidxstart, 0);
2891                 vm_object_pip_wakeup(orig_object);
2892         }
2893
2894         /*
2895          * Wakeup the pages we played with.  No spl protection is needed
2896          * for a simple wakeup.
2897          */
2898         for (idx = 0; idx < size; idx++) {
2899                 m = vm_page_lookup(new_object, idx);
2900                 if (m)
2901                         vm_page_wakeup(m);
2902         }
2903
2904         entry->object.vm_object = new_object;
2905         entry->offset = 0LL;
2906         vm_object_deallocate(orig_object);
2907         lwkt_reltoken(&vm_token);
2908 }
2909
2910 /*
2911  * Copies the contents of the source entry to the destination
2912  * entry.  The entries *must* be aligned properly.
2913  *
2914  * The vm_map must be exclusively locked.
2915  */
2916 static void
2917 vm_map_copy_entry(vm_map_t src_map, vm_map_t dst_map,
2918         vm_map_entry_t src_entry, vm_map_entry_t dst_entry)
2919 {
2920         vm_object_t src_object;
2921
2922         if (dst_entry->maptype == VM_MAPTYPE_SUBMAP)
2923                 return;
2924         if (src_entry->maptype == VM_MAPTYPE_SUBMAP)
2925                 return;
2926
2927         lwkt_gettoken(&vm_token);
2928         if (src_entry->wired_count == 0) {
2929                 /*
2930                  * If the source entry is marked needs_copy, it is already
2931                  * write-protected.
2932                  */
2933                 if ((src_entry->eflags & MAP_ENTRY_NEEDS_COPY) == 0) {
2934                         pmap_protect(src_map->pmap,
2935                             src_entry->start,
2936                             src_entry->end,
2937                             src_entry->protection & ~VM_PROT_WRITE);
2938                 }
2939
2940                 /*
2941                  * Make a copy of the object.
2942                  */
2943                 if ((src_object = src_entry->object.vm_object) != NULL) {
2944                         if ((src_object->handle == NULL) &&
2945                                 (src_object->type == OBJT_DEFAULT ||
2946                                  src_object->type == OBJT_SWAP)) {
2947                                 vm_object_collapse(src_object);
2948                                 if ((src_object->flags & (OBJ_NOSPLIT|OBJ_ONEMAPPING)) == OBJ_ONEMAPPING) {
2949                                         vm_map_split(src_entry);
2950                                         src_object = src_entry->object.vm_object;
2951                                 }
2952                         }
2953
2954                         vm_object_reference(src_object);
2955                         vm_object_clear_flag(src_object, OBJ_ONEMAPPING);
2956                         dst_entry->object.vm_object = src_object;
2957                         src_entry->eflags |= (MAP_ENTRY_COW|MAP_ENTRY_NEEDS_COPY);
2958                         dst_entry->eflags |= (MAP_ENTRY_COW|MAP_ENTRY_NEEDS_COPY);
2959                         dst_entry->offset = src_entry->offset;
2960                 } else {
2961                         dst_entry->object.vm_object = NULL;
2962                         dst_entry->offset = 0;
2963                 }
2964
2965                 pmap_copy(dst_map->pmap, src_map->pmap, dst_entry->start,
2966                     dst_entry->end - dst_entry->start, src_entry->start);
2967         } else {
2968                 /*
2969                  * Of course, wired down pages can't be set copy-on-write.
2970                  * Cause wired pages to be copied into the new map by
2971                  * simulating faults (the new pages are pageable)
2972                  */
2973                 vm_fault_copy_entry(dst_map, src_map, dst_entry, src_entry);
2974         }
2975         lwkt_reltoken(&vm_token);
2976 }
2977
/*
 * vmspace_fork:
 * Create a new process vmspace structure and vm_map
 * based on those of an existing process.  The new map
 * is based on the old map, according to the inheritance
 * values on the regions in that map.
 *
 * vm1 - vmspace to duplicate
 *
 * Returns the newly allocated vmspace.  Both maps are unlocked again
 * before returning.
 *
 * The source map must not be locked.
 * No requirements.
 */
struct vmspace *
vmspace_fork(struct vmspace *vm1)
{
        struct vmspace *vm2;
        vm_map_t old_map = &vm1->vm_map;
        vm_map_t new_map;
        vm_map_entry_t old_entry;
        vm_map_entry_t new_entry;
        vm_object_t object;
        int count;

        lwkt_gettoken(&vm_token);
        lwkt_gettoken(&vmspace_token);
        vm_map_lock(old_map);
        /* Flag the source map as being forked; cleared again below. */
        old_map->infork = 1;

        /*
         * XXX Note: upcalls are not copied.
         *
         * Allocate the new vmspace over the same address range and copy
         * the vm_startcopy..vm_endcopy portion of the old vmspace.
         */
        vm2 = vmspace_alloc(old_map->min_offset, old_map->max_offset);
        bcopy(&vm1->vm_startcopy, &vm2->vm_startcopy,
            (caddr_t)&vm1->vm_endcopy - (caddr_t)&vm1->vm_startcopy);
        new_map = &vm2->vm_map; /* XXX */
        new_map->timestamp = 1;

        vm_map_lock(new_map);

        /*
         * Count the entries in the old map so that enough map entries
         * can be reserved up front for the copy loop below.
         */
        count = 0;
        old_entry = old_map->header.next;
        while (old_entry != &old_map->header) {
                ++count;
                old_entry = old_entry->next;
        }

        count = vm_map_entry_reserve(count + MAP_RESERVE_COUNT);

        /*
         * Walk the old map and replicate each entry according to its
         * inheritance setting.  Entries with an inheritance value not
         * listed in the switch are not copied.
         */
        old_entry = old_map->header.next;
        while (old_entry != &old_map->header) {
                if (old_entry->maptype == VM_MAPTYPE_SUBMAP)
                        panic("vm_map_fork: encountered a submap");

                switch (old_entry->inheritance) {
                case VM_INHERIT_NONE:
                        /* Not inherited: nothing appears in the new map. */
                        break;
                case VM_INHERIT_SHARE:
                        /*
                         * Clone the entry, creating the shared object if
                         * necessary.
                         */
                        object = old_entry->object.vm_object;
                        if (object == NULL) {
                                vm_map_entry_allocate_object(old_entry);
                                object = old_entry->object.vm_object;
                        }

                        /*
                         * Add the reference before calling vm_map_entry_shadow
                         * to insure that a shadow object is created.
                         */
                        vm_object_reference(object);
                        if (old_entry->eflags & MAP_ENTRY_NEEDS_COPY) {
                                vm_map_entry_shadow(old_entry);
                                /* Transfer the second reference too. */
                                vm_object_reference(
                                    old_entry->object.vm_object);
                                vm_object_deallocate(object);
                                object = old_entry->object.vm_object;
                        }
                        vm_object_clear_flag(object, OBJ_ONEMAPPING);

                        /*
                         * Clone the entry, referencing the shared object.
                         * Wiring is not inherited by the new entry.
                         */
                        new_entry = vm_map_entry_create(new_map, &count);
                        *new_entry = *old_entry;
                        new_entry->eflags &= ~MAP_ENTRY_USER_WIRED;
                        new_entry->wired_count = 0;

                        /*
                         * Insert the entry into the new map -- we know we're
                         * inserting at the end of the new map.
                         */

                        vm_map_entry_link(new_map, new_map->header.prev,
                                          new_entry);

                        /*
                         * Update the physical map
                         */
                        pmap_copy(new_map->pmap, old_map->pmap,
                            new_entry->start,
                            (old_entry->end - old_entry->start),
                            old_entry->start);
                        break;
                case VM_INHERIT_COPY:
                        /*
                         * Clone the entry and link into the map.  The
                         * object pointer is cleared here and filled in
                         * by vm_map_copy_entry().  Wiring is not
                         * inherited by the new entry.
                         */
                        new_entry = vm_map_entry_create(new_map, &count);
                        *new_entry = *old_entry;
                        new_entry->eflags &= ~MAP_ENTRY_USER_WIRED;
                        new_entry->wired_count = 0;
                        new_entry->object.vm_object = NULL;
                        vm_map_entry_link(new_map, new_map->header.prev,
                                          new_entry);
                        vm_map_copy_entry(old_map, new_map, old_entry,
                                          new_entry);
                        break;
                }
                old_entry = old_entry->next;
        }

        /*
         * The new map's size is copied from the old map regardless of
         * which entries were actually inherited.
         */
        new_map->size = old_map->size;
        old_map->infork = 0;
        vm_map_unlock(old_map);
        vm_map_unlock(new_map);
        vm_map_entry_release(count);
        lwkt_reltoken(&vmspace_token);
        lwkt_reltoken(&vm_token);

        return (vm2);
}
3110
3111 /*
3112  * Create an auto-grow stack entry
3113  *
3114  * No requirements.
3115  */
3116 int
3117 vm_map_stack (vm_map_t map, vm_offset_t addrbos, vm_size_t max_ssize,
3118               int flags, vm_prot_t prot, vm_prot_t max, int cow)
3119 {
3120         vm_map_entry_t  prev_entry;
3121         vm_map_entry_t  new_stack_entry;
3122         vm_size_t       init_ssize;
3123         int             rv;
3124         int             count;
3125         vm_offset_t     tmpaddr;
3126
3127         cow |= MAP_IS_STACK;
3128
3129         if (max_ssize < sgrowsiz)
3130                 init_ssize = max_ssize;
3131         else
3132                 init_ssize = sgrowsiz;
3133
3134         count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
3135         vm_map_lock(map);
3136
3137         /*
3138          * Find space for the mapping
3139          */
3140         if ((flags & (MAP_FIXED | MAP_TRYFIXED)) == 0) {
3141                 if (vm_map_findspace(map, addrbos, max_ssize, 1,
3142                                      flags, &tmpaddr)) {
3143                         vm_map_unlock(map);
3144                         vm_map_entry_release(count);
3145                         return (KERN_NO_SPACE);
3146                 }
3147                 addrbos = tmpaddr;
3148         }
3149
3150         /* If addr is already mapped, no go */
3151         if (vm_map_lookup_entry(map, addrbos, &prev_entry)) {
3152                 vm_map_unlock(map);
3153                 vm_map_entry_release(count);
3154                 return (KERN_NO_SPACE);
3155         }
3156
3157 #if 0
3158         /* XXX already handled by kern_mmap() */
3159         /* If we would blow our VMEM resource limit, no go */
3160         if (map->size + init_ssize >
3161             curproc->p_rlimit[RLIMIT_VMEM].rlim_cur) {
3162                 vm_map_unlock(map);
3163                 vm_map_entry_release(count);
3164                 return (KERN_NO_SPACE);
3165         }
3166 #endif
3167
3168         /*
3169          * If we can't accomodate max_ssize in the current mapping,
3170          * no go.  However, we need to be aware that subsequent user
3171          * mappings might map into the space we have reserved for
3172          * stack, and currently this space is not protected.
3173          *
3174          * Hopefully we will at least detect this condition
3175          * when we try to grow the stack.
3176          */
3177         if ((prev_entry->next != &map->header) &&
3178             (prev_entry->next->start < addrbos + max_ssize)) {
3179                 vm_map_unlock(map);
3180                 vm_map_entry_release(count);
3181                 return (KERN_NO_SPACE);
3182         }
3183
3184         /*
3185          * We initially map a stack of only init_ssize.  We will
3186          * grow as needed later.  Since this is to be a grow
3187          * down stack, we map at the top of the range.
3188          *
3189          * Note: we would normally expect prot and max to be
3190          * VM_PROT_ALL, and cow to be 0.  Possibly we should
3191          * eliminate these as input parameters, and just
3192          * pass these values here in the insert call.
3193          */
3194         rv = vm_map_insert(map, &count,
3195                            NULL, 0, addrbos + max_ssize - init_ssize,
3196                            addrbos + max_ssize,
3197                            VM_MAPTYPE_NORMAL,
3198                            prot, max,
3199                            cow);
3200
3201         /* Now set the avail_ssize amount */
3202         if (rv == KERN_SUCCESS) {
3203                 if (prev_entry != &map->header)
3204                         vm_map_clip_end(map, prev_entry, addrbos + max_ssize - init_ssize, &count);
3205                 new_stack_entry = prev_entry->next;
3206                 if (new_stack_entry->end   != addrbos + max_ssize ||
3207                     new_stack_entry->start != addrbos + max_ssize - init_ssize)
3208                         panic ("Bad entry start/end for new stack entry");
3209                 else
3210                         new_stack_entry->aux.avail_ssize = max_ssize - init_ssize;
3211         }
3212
3213         vm_map_unlock(map);
3214         vm_map_entry_release(count);
3215         return (rv);
3216 }
3217
3218 /*
3219  * Attempts to grow a vm stack entry.  Returns KERN_SUCCESS if the
3220  * desired address is already mapped, or if we successfully grow
3221  * the stack.  Also returns KERN_SUCCESS if addr is outside the
3222  * stack range (this is strange, but preserves compatibility with
3223  * the grow function in vm_machdep.c).
3224  *
3225  * No requirements.
3226  */
3227 int
3228 vm_map_growstack (struct proc *p, vm_offset_t addr)
3229 {
3230         vm_map_entry_t prev_entry;
3231         vm_map_entry_t stack_entry;
3232         vm_map_entry_t new_stack_entry;
3233         struct vmspace *vm = p->p_vmspace;
3234         vm_map_t map = &vm->vm_map;
3235         vm_offset_t    end;
3236         int grow_amount;
3237         int rv = KERN_SUCCESS;
3238         int is_procstack;
3239         int use_read_lock = 1;
3240         int count;
3241
3242         count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
3243 Retry:
3244         if (use_read_lock)
3245                 vm_map_lock_read(map);
3246         else
3247                 vm_map_lock(map);
3248
3249         /* If addr is already in the entry range, no need to grow.*/
3250         if (vm_map_lookup_entry(map, addr, &prev_entry))
3251                 goto done;
3252
3253         if ((stack_entry = prev_entry->next) == &map->header)
3254                 goto done;
3255         if (prev_entry == &map->header)
3256                 end = stack_entry->start - stack_entry->aux.avail_ssize;
3257         else
3258                 end = prev_entry->end;
3259
3260         /*
3261          * This next test mimics the old grow function in vm_machdep.c.
3262          * It really doesn't quite make sense, but we do it anyway
3263          * for compatibility.
3264          *
3265          * If not growable stack, return success.  This signals the
3266          * caller to proceed as he would normally with normal vm.
3267          */
3268         if (stack_entry->aux.avail_ssize < 1 ||
3269             addr >= stack_entry->start ||
3270             addr <  stack_entry->start - stack_entry->aux.avail_ssize) {
3271                 goto done;
3272         }
3273
3274         /* Find the minimum grow amount */
3275         grow_amount = roundup (stack_entry->start - addr, PAGE_SIZE);
3276         if (grow_amount > stack_entry->aux.avail_ssize) {
3277                 rv = KERN_NO_SPACE;
3278                 goto done;
3279         }
3280
3281         /*
3282          * If there is no longer enough space between the entries
3283          * nogo, and adjust the available space.  Note: this
3284          * should only happen if the user has mapped into the
3285          * stack area after the stack was created, and is
3286          * probably an error.
3287          *
3288          * This also effectively destroys any guard page the user
3289          * might have intended by limiting the stack size.
3290          */
3291         if (grow_amount > stack_entry->start - end) {
3292                 if (use_read_lock && vm_map_lock_upgrade(map)) {
3293                         use_read_lock = 0;
3294                         goto Retry;
3295                 }
3296                 use_read_lock = 0;
3297                 stack_entry->aux.avail_ssize = stack_entry->start - end;
3298                 rv = KERN_NO_SPACE;
3299                 goto done;
3300         }
3301
3302         is_procstack = addr >= (vm_offset_t)vm->vm_maxsaddr;
3303
3304         /* If this is the main process stack, see if we're over the
3305          * stack limit.
3306          */
3307         if (is_procstack && (ctob(vm->vm_ssize) + grow_amount >
3308                              p->p_rlimit[RLIMIT_STACK].rlim_cur)) {
3309                 rv = KERN_NO_SPACE;
3310                 goto done;
3311         }
3312
3313         /* Round up the grow amount modulo SGROWSIZ */
3314         grow_amount = roundup (grow_amount, sgrowsiz);
3315         if (grow_amount > stack_entry->aux.avail_ssize) {
3316                 grow_amount = stack_entry->aux.avail_ssize;
3317         }
3318         if (is_procstack && (ctob(vm->vm_ssize) + grow_amount >
3319                              p->p_rlimit[RLIMIT_STACK].rlim_cur)) {
3320                 grow_amount = p->p_rlimit[RLIMIT_STACK].rlim_cur -
3321                               ctob(vm->vm_ssize);
3322         }
3323
3324         /* If we would blow our VMEM resource limit, no go */
3325         if (map->size + grow_amount > p->p_rlimit[RLIMIT_VMEM].rlim_cur) {
3326                 rv = KERN_NO_SPACE;
3327                 goto done;
3328         }
3329
3330         if (use_read_lock && vm_map_lock_upgrade(map)) {
3331                 use_read_lock = 0;
3332                 goto Retry;
3333         }
3334         use_read_lock = 0;
3335
3336         /* Get the preliminary new entry start value */
3337         addr = stack_entry->start - grow_amount;
3338
3339         /* If this puts us into the previous entry, cut back our growth
3340          * to the available space.  Also, see the note above.
3341          */
3342         if (addr < end) {
3343                 stack_entry->aux.avail_ssize = stack_entry->start - end;
3344                 addr = end;
3345         }
3346
3347         rv = vm_map_insert(map, &count,
3348                            NULL, 0, addr, stack_entry->start,
3349                            VM_MAPTYPE_NORMAL,
3350                            VM_PROT_ALL, VM_PROT_ALL,
3351                            0);
3352
3353         /* Adjust the available stack space by the amount we grew. */
3354         if (rv == KERN_SUCCESS) {
3355                 if (prev_entry != &map->header)
3356                         vm_map_clip_end(map, prev_entry, addr, &count);
3357                 new_stack_entry = prev_entry->next;
3358                 if (new_stack_entry->end   != stack_entry->start  ||
3359                     new_stack_entry->start != addr)
3360                         panic ("Bad stack grow start/end in new stack entry");
3361                 else {
3362                         new_stack_entry->aux.avail_ssize =
3363                                 stack_entry->aux.avail_ssize -
3364                                 (new_stack_entry->end - new_stack_entry->start);
3365                         if (is_procstack)
3366                                 vm->vm_ssize += btoc(new_stack_entry->end -
3367                                                      new_stack_entry->start);
3368                 }
3369         }
3370
3371 done:
3372         if (use_read_lock)
3373                 vm_map_unlock_read(map);
3374         else
3375                 vm_map_unlock(map);
3376         vm_map_entry_release(count);
3377         return (rv);
3378 }
3379
3380 /*
3381  * Unshare the specified VM space for exec.  If other processes are
3382  * mapped to it, then create a new one.  The new vmspace is null.
3383  *
3384  * No requirements.
3385  */
3386 void
3387 vmspace_exec(struct proc *p, struct vmspace *vmcopy)
3388 {
3389         struct vmspace *oldvmspace = p->p_vmspace;
3390         struct vmspace *newvmspace;
3391         vm_map_t map = &p->p_vmspace->vm_map;
3392
3393         /*
3394          * If we are execing a resident vmspace we fork it, otherwise
3395          * we create a new vmspace.  Note that exitingcnt and upcalls
3396          * are not copied to the new vmspace.
3397          */
3398         lwkt_gettoken(&vmspace_token);
3399         if (vmcopy)  {
3400                 newvmspace = vmspace_fork(vmcopy);
3401         } else {
3402                 newvmspace = vmspace_alloc(map->min_offset, map->max_offset);
3403                 bcopy(&oldvmspace->vm_startcopy, &newvmspace->vm_startcopy,
3404                       (caddr_t)&oldvmspace->vm_endcopy -
3405                        (caddr_t)&oldvmspace->vm_startcopy);
3406         }
3407
3408         /*
3409          * Finish initializing the vmspace before assigning it
3410          * to the process.  The vmspace will become the current vmspace
3411          * if p == curproc.
3412          */
3413         pmap_pinit2(vmspace_pmap(newvmspace));
3414         pmap_replacevm(p, newvmspace, 0);
3415         sysref_put(&oldvmspace->vm_sysref);
3416         lwkt_reltoken(&vmspace_token);
3417 }
3418
3419 /*
3420  * Unshare the specified VM space for forcing COW.  This
3421  * is called by rfork, for the (RFMEM|RFPROC) == 0 case.
3422  *
3423  * The exitingcnt test is not strictly necessary but has been
3424  * included for code sanity (to make the code a bit more deterministic).
3425  */
3426 void
3427 vmspace_unshare(struct proc *p)
3428 {
3429         struct vmspace *oldvmspace = p->p_vmspace;
3430         struct vmspace *newvmspace;
3431
3432         lwkt_gettoken(&vmspace_token);
3433         if (oldvmspace->vm_sysref.refcnt == 1 && oldvmspace->vm_exitingcnt == 0)
3434                 return;
3435         newvmspace = vmspace_fork(oldvmspace);
3436         pmap_pinit2(vmspace_pmap(newvmspace));
3437         pmap_replacevm(p, newvmspace, 0);
3438         sysref_put(&oldvmspace->vm_sysref);
3439         lwkt_reltoken(&vmspace_token);
3440 }
3441
3442 /*
3443  * Finds the VM object, offset, and protection for a given virtual address
3444  * in the specified map, assuming a page fault of the type specified.
3445  *
3446  * Leaves the map in question locked for read; return values are guaranteed
3447  * until a vm_map_lookup_done call is performed.  Note that the map argument
3448  * is in/out; the returned map must be used in the call to vm_map_lookup_done.
3449  *
3450  * A handle (out_entry) is returned for use in vm_map_lookup_done, to make
3451  * that fast.
3452  *
3453  * If a lookup is requested with "write protection" specified, the map may
3454  * be changed to perform virtual copying operations, although the data
3455  * referenced will remain the same.
3456  *
3457  * No requirements.
3458  */
3459 int
3460 vm_map_lookup(vm_map_t *var_map,                /* IN/OUT */
3461               vm_offset_t vaddr,
3462               vm_prot_t fault_typea,
3463               vm_map_entry_t *out_entry,        /* OUT */
3464               vm_object_t *object,              /* OUT */
3465               vm_pindex_t *pindex,              /* OUT */
3466               vm_prot_t *out_prot,              /* OUT */
3467               boolean_t *wired)                 /* OUT */
3468 {
3469         vm_map_entry_t entry;
3470         vm_map_t map = *var_map;
3471         vm_prot_t prot;
3472         vm_prot_t fault_type = fault_typea;
3473         int use_read_lock = 1;
3474         int rv = KERN_SUCCESS;
3475
3476 RetryLookup:
3477         if (use_read_lock)
3478                 vm_map_lock_read(map);
3479         else
3480                 vm_map_lock(map);
3481
3482         /*
3483          * If the map has an interesting hint, try it before calling full
3484          * blown lookup routine.
3485          */
3486         entry = map->hint;
3487         *out_entry = entry;
3488
3489         if ((entry == &map->header) ||
3490             (vaddr < entry->start) || (vaddr >= entry->end)) {
3491                 vm_map_entry_t tmp_entry;
3492
3493                 /*
3494                  * Entry was either not a valid hint, or the vaddr was not
3495                  * contained in the entry, so do a full lookup.
3496                  */
3497                 if (!vm_map_lookup_entry(map, vaddr, &tmp_entry)) {
3498                         rv = KERN_INVALID_ADDRESS;
3499                         goto done;
3500                 }
3501
3502                 entry = tmp_entry;
3503                 *out_entry = entry;
3504         }
3505
3506         /*
3507          * Handle submaps.
3508          */
3509         if (entry->maptype == VM_MAPTYPE_SUBMAP) {
3510                 vm_map_t old_map = map;
3511
3512                 *var_map = map = entry->object.sub_map;
3513                 if (use_read_lock)
3514                         vm_map_unlock_read(old_map);
3515                 else
3516                         vm_map_unlock(old_map);
3517                 use_read_lock = 1;
3518                 goto RetryLookup;
3519         }
3520
3521         /*
3522          * Check whether this task is allowed to have this page.
3523          * Note the special case for MAP_ENTRY_COW
3524          * pages with an override.  This is to implement a forced
3525          * COW for debuggers.
3526          */
3527
3528         if (fault_type & VM_PROT_OVERRIDE_WRITE)
3529                 prot = entry->max_protection;
3530         else
3531                 prot = entry->protection;
3532
3533         fault_type &= (VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE);
3534         if ((fault_type & prot) != fault_type) {
3535                 rv = KERN_PROTECTION_FAILURE;
3536                 goto done;
3537         }
3538
3539         if ((entry->eflags & MAP_ENTRY_USER_WIRED) &&
3540             (entry->eflags & MAP_ENTRY_COW) &&
3541             (fault_type & VM_PROT_WRITE) &&
3542             (fault_typea & VM_PROT_OVERRIDE_WRITE) == 0) {
3543                 rv = KERN_PROTECTION_FAILURE;
3544                 goto done;
3545         }
3546
3547         /*
3548          * If this page is not pageable, we have to get it for all possible
3549          * accesses.
3550          */
3551         *wired = (entry->wired_count != 0);
3552         if (*wired)
3553                 prot = fault_type = entry->protection;
3554
3555         /*
3556          * Virtual page tables may need to update the accessed (A) bit
3557          * in a page table entry.  Upgrade the fault to a write fault for
3558          * that case if the map will support it.  If the map does not support
3559          * it the page table entry simply will not be updated.
3560          */
3561         if (entry->maptype == VM_MAPTYPE_VPAGETABLE) {
3562                 if (prot & VM_PROT_WRITE)
3563                         fault_type |= VM_PROT_WRITE;
3564         }
3565
3566         /*
3567          * If the entry was copy-on-write, we either ...
3568          */
3569         if (entry->eflags & MAP_ENTRY_NEEDS_COPY) {
3570                 /*
3571                  * If we want to write the page, we may as well handle that
3572                  * now since we've got the map locked.
3573                  *
3574                  * If we don't need to write the page, we just demote the
3575                  * permissions allowed.
3576                  */
3577
3578                 if (fault_type & VM_PROT_WRITE) {
3579                         /*
3580                          * Make a new object, and place it in the object
3581                          * chain.  Note that no new references have appeared
3582                          * -- one just moved from the map to the new
3583                          * object.
3584                          */
3585
3586                         if (use_read_lock && vm_map_lock_upgrade(map)) {
3587                                 use_read_lock = 0;
3588                                 goto RetryLookup;
3589                         }
3590                         use_read_lock = 0;
3591
3592                         vm_map_entry_shadow(entry);
3593                 } else {
3594                         /*
3595                          * We're attempting to read a copy-on-write page --
3596                          * don't allow writes.
3597                          */
3598
3599                         prot &= ~VM_PROT_WRITE;
3600                 }
3601         }
3602
3603         /*
3604          * Create an object if necessary.
3605          */
3606         if (entry->object.vm_object == NULL &&
3607             !map->system_map) {
3608                 if (use_read_lock && vm_map_lock_upgrade(map))  {
3609                         use_read_lock = 0;
3610                         goto RetryLookup;
3611                 }
3612                 use_read_lock = 0;
3613                 vm_map_entry_allocate_object(entry);
3614         }
3615
3616         /*
3617          * Return the object/offset from this entry.  If the entry was
3618          * copy-on-write or empty, it has been fixed up.
3619          */
3620
3621         *pindex = OFF_TO_IDX((vaddr - entry->start) + entry->offset);
3622         *object = entry->object.vm_object;
3623
3624         /*
3625          * Return whether this is the only map sharing this data.  On
3626          * success we return with a read lock held on the map.  On failure
3627          * we return with the map unlocked.
3628          */
3629         *out_prot = prot;
3630 done:
3631         if (rv == KERN_SUCCESS) {
3632                 if (use_read_lock == 0)
3633                         vm_map_lock_downgrade(map);
3634         } else if (use_read_lock) {
3635                 vm_map_unlock_read(map);
3636         } else {
3637                 vm_map_unlock(map);
3638         }
3639         return (rv);
3640 }
3641
3642 /*
3643  * Releases locks acquired by a vm_map_lookup()
3644  * (according to the handle returned by that lookup).
3645  *
3646  * No other requirements.
3647  */
3648 void
3649 vm_map_lookup_done(vm_map_t map, vm_map_entry_t entry, int count)
3650 {
3651         /*
3652          * Unlock the main-level map
3653          */
3654         vm_map_unlock_read(map);
3655         if (count)
3656                 vm_map_entry_release(count);
3657 }
3658
3659 #include "opt_ddb.h"
3660 #ifdef DDB
3661 #include <sys/kernel.h>
3662
3663 #include <ddb/ddb.h>
3664
3665 /*
3666  * Debugging only
3667  */
3668 DB_SHOW_COMMAND(map, vm_map_print)
3669 {
3670         static int nlines;
3671         /* XXX convert args. */
3672         vm_map_t map = (vm_map_t)addr;
3673         boolean_t full = have_addr;
3674
3675         vm_map_entry_t entry;
3676
3677         db_iprintf("Task map %p: pmap=%p, nentries=%d, version=%u\n",
3678             (void *)map,
3679             (void *)map->pmap, map->nentries, map->timestamp);
3680         nlines++;
3681
3682         if (!full && db_indent)
3683                 return;
3684
3685         db_indent += 2;
3686         for (entry = map->header.next; entry != &map->header;
3687             entry = entry->next) {
3688                 db_iprintf("map entry %p: start=%p, end=%p\n",
3689                     (void *)entry, (void *)entry->start, (void *)entry->end);
3690                 nlines++;
3691                 {
3692                         static char *inheritance_name[4] =
3693                         {"share", "copy", "none", "donate_copy"};
3694
3695                         db_iprintf(" prot=%x/%x/%s",
3696                             entry->protection,
3697                             entry->max_protection,
3698                             inheritance_name[(int)(unsigned char)entry->inheritance]);
3699                         if (entry->wired_count != 0)
3700                                 db_printf(", wired");
3701                 }
3702                 if (entry->maptype == VM_MAPTYPE_SUBMAP) {
3703                         /* XXX no %qd in kernel.  Truncate entry->offset. */
3704                         db_printf(", share=%p, offset=0x%lx\n",
3705                             (void *)entry->object.sub_map,
3706                             (long)entry->offset);
3707                         nlines++;
3708                         if ((entry->prev == &map->header) ||
3709                             (entry->prev->object.sub_map !=
3710                                 entry->object.sub_map)) {
3711                                 db_indent += 2;
3712                                 vm_map_print((db_expr_t)(intptr_t)
3713                                              entry->object.sub_map,
3714                                              full, 0, NULL);
3715                                 db_indent -= 2;
3716                         }
3717                 } else {
3718                         /* XXX no %qd in kernel.  Truncate entry->offset. */
3719                         db_printf(", object=%p, offset=0x%lx",
3720                             (void *)entry->object.vm_object,
3721                             (long)entry->offset);
3722                         if (entry->eflags & MAP_ENTRY_COW)
3723                                 db_printf(", copy (%s)",
3724                                     (entry->eflags & MAP_ENTRY_NEEDS_COPY) ? "needed" : "done");
3725                         db_printf("\n");
3726                         nlines++;
3727
3728                         if ((entry->prev == &map->header) ||
3729                             (entry->prev->object.vm_object !=
3730                                 entry->object.vm_object)) {
3731                                 db_indent += 2;
3732                                 vm_object_print((db_expr_t)(intptr_t)
3733                                                 entry->object.vm_object,
3734                                                 full, 0, NULL);
3735                                 nlines += 4;
3736                                 db_indent -= 2;
3737                         }
3738                 }
3739         }
3740         db_indent -= 2;
3741         if (db_indent == 0)
3742                 nlines = 0;
3743 }
3744
3745 /*
3746  * Debugging only
3747  */
3748 DB_SHOW_COMMAND(procvm, procvm)
3749 {
3750         struct proc *p;
3751
3752         if (have_addr) {
3753                 p = (struct proc *) addr;
3754         } else {
3755                 p = curproc;
3756         }
3757
3758         db_printf("p = %p, vmspace = %p, map = %p, pmap = %p\n",
3759             (void *)p, (void *)p->p_vmspace, (void *)&p->p_vmspace->vm_map,
3760             (void *)vmspace_pmap(p->p_vmspace));
3761
3762         vm_map_print((db_expr_t)(intptr_t)&p->p_vmspace->vm_map, 1, 0, NULL);
3763 }
3764
3765 #endif /* DDB */