kernel/vm/seg_kmem.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 1998, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright 2016 Joyent, Inc.
  24  */
  25
  26 #include <sys/types.h>
  27 #include <sys/t_lock.h>
  28 #include <sys/param.h>
  29 #include <sys/sysmacros.h>
  30 #include <sys/tuneable.h>
  31 #include <sys/systm.h>
  32 #include <sys/vm.h>
  33 #include <sys/kmem.h>
  34 #include <sys/vmem.h>
  35 #include <sys/mman.h>
  36 #include <sys/cmn_err.h>
  37 #include <sys/debug.h>
  38 #include <sys/dumphdr.h>
  39 #include <sys/bootconf.h>
  40 #include <sys/lgrp.h>
  41 #include <vm/seg_kmem.h>
  42 #include <vm/hat.h>
  43 #include <vm/page.h>
  44 #include <vm/vm_dep.h>
  45 #include <vm/faultcode.h>
  46 #include <sys/promif.h>
  47 #include <vm/seg_kp.h>
  48 #include <sys/bitmap.h>
  49
  50
  51 /*
  52  * seg_kmem is the primary kernel memory segment driver.  It
  53  * maps the kernel heap [kernelheap, ekernelheap), module text,
  54  * and all memory which was allocated before the VM was initialized
  55  * into kas.
  56  *
  57  * Pages which belong to seg_kmem are hashed into &kvp vnode at
  58  * an offset equal to (uoff_t)virt_addr, and have p_lckcnt >= 1.
  59  * They must never be paged out since segkmem_fault() is a no-op to
  60  * prevent recursive faults.
  61  *
  62  * Currently, seg_kmem pages are sharelocked (p_sharelock == 1) on
  63  * __x86 and are unlocked (p_sharelock == 0) on __sparc.  Once __x86
  64  * supports relocation the #ifdef kludges can be removed.
  65  *
  66  * seg_kmem pages may be subject to relocation by page_relocate(),
  67  * provided that the HAT supports it; if this is so, segkmem_reloc
  68  * will be set to a nonzero value. All boot time allocated memory as
  69  * well as static memory is considered off limits to relocation.
  70  * Pages are "relocatable" if p_state does not have P_NORELOC set, so
  71  * we request P_NORELOC pages for memory that isn't safe to relocate.
  72  *
  73  * The kernel heap is logically divided up into four pieces:
  74  *
  75  *   heap32_arena is for allocations that require 32-bit absolute
  76  *   virtual addresses (e.g. code that uses 32-bit pointers/offsets).
  77  *
  78  *   heap_core is for allocations that require 2GB *relative*
  79  *   offsets; in other words all memory from heap_core is within
  80  *   2GB of all other memory from the same arena. This is a requirement
  81  *   of the addressing modes of some processors in supervisor code.
  82  *
  83  *   heap_arena is the general heap arena.
  84  *
  85  *   static_arena is the static memory arena.  Allocations from it
  86  *   are not subject to relocation so it is safe to use the memory
  87  *   physical address as well as the virtual address (e.g. the VA to
  88  *   PA translations are static).  Caches may import from static_arena;
  89  *   all other static memory allocations should use static_alloc_arena.
  90  *
  91  * On some platforms which have limited virtual address space, seg_kmem
  92  * may share [kernelheap, ekernelheap) with seg_kp; if this is so,
  93  * segkp_bitmap is non-NULL, and each bit represents a page of virtual
  94  * address space which is actually seg_kp mapped.
  95  */
  96
  97 extern ulong_t *segkp_bitmap;   /* Is set if segkp is from the kernel heap */
  98
  99 char *kernelheap;               /* start of primary kernel heap */
 100 char *ekernelheap;              /* end of primary kernel heap */
 101 struct seg kvseg;               /* primary kernel heap segment */
 102 struct seg kvseg_core;          /* "core" kernel heap segment */
 103 struct seg kzioseg;             /* Segment for zio mappings */
 104 vmem_t *heap_arena;             /* primary kernel heap arena */
 105 vmem_t *heap_core_arena;        /* core kernel heap arena */
 106 char *heap_core_base;           /* start of core kernel heap arena */
 107 char *heap_lp_base;             /* start of kernel large page heap arena */
 108 char *heap_lp_end;              /* end of kernel large page heap arena */
 109 vmem_t *hat_memload_arena;      /* HAT translation data */
 110 struct seg kvseg32;             /* 32-bit kernel heap segment */
 111 vmem_t *heap32_arena;           /* 32-bit kernel heap arena */
 112 vmem_t *heaptext_arena;         /* heaptext arena */
 113 struct as kas;                  /* kernel address space */
 114 int segkmem_reloc;              /* enable/disable relocatable segkmem pages */
 115 vmem_t *static_arena;           /* arena for caches to import static memory */
 116 vmem_t *static_alloc_arena;     /* arena for allocating static memory */
 117 vmem_t *zio_arena = NULL;       /* arena for allocating zio memory */
 118 vmem_t *zio_alloc_arena = NULL; /* arena for allocating zio memory */
 119
 120 /*
 121  * seg_kmem driver can map part of the kernel heap with large pages.
 122  * Currently this functionality is implemented for sparc platforms only.
 123  *
 124  * The large page size "segkmem_lpsize" for kernel heap is selected in the
 125  * platform specific code. It can also be modified via /etc/system file.
 126  * Setting segkmem_lpsize to PAGESIZE in /etc/system disables usage of large
 127  * pages for kernel heap. "segkmem_lpshift" is adjusted appropriately to
 128  * match segkmem_lpsize.
 129  *
 130  * At boot time we carve from kernel heap arena a range of virtual addresses
 131  * that will be used for large page mappings. This range [heap_lp_base,
 132  * heap_lp_end) is set up as a separate vmem arena - "heap_lp_arena". We also
 133  * create "kmem_lp_arena" that caches memory already backed up by large
 134  * pages. kmem_lp_arena imports virtual segments from heap_lp_arena.
 135  */
 136
 137 size_t  segkmem_lpsize;
 138 static  uint_t  segkmem_lpshift = PAGESHIFT;
 139 int     segkmem_lpszc = 0;
 140
 141 size_t  segkmem_kmemlp_quantum = 0x400000;      /* 4MB */
 142 size_t  segkmem_heaplp_quantum;
 143 vmem_t *heap_lp_arena;
 144 static  vmem_t *kmem_lp_arena;
 145 static  vmem_t *segkmem_ppa_arena;
 146 static  segkmem_lpcb_t segkmem_lpcb;
 147
 148 /*
 149  * We use "segkmem_kmemlp_max" to limit the total amount of physical memory
 150  * consumed by the large page heap. By default this parameter is set to 1/8 of
 151  * physmem but can be adjusted through /etc/system either directly or
 152  * indirectly by setting "segkmem_kmemlp_pcnt" to the percent of physmem
 153  * we allow for large page heap.
 154  */
 155 size_t  segkmem_kmemlp_max;
 156 static  uint_t  segkmem_kmemlp_pcnt;
 157
 158 /*
 159  * Getting large pages for kernel heap could be problematic due to
 160  * physical memory fragmentation. That's why we allow to preallocate
 161  * "segkmem_kmemlp_min" bytes at boot time.
 162  */
 163 static  size_t  segkmem_kmemlp_min;
 164
 165 /*
 166  * Throttling is used to avoid expensive tries to allocate large pages
 167  * for kernel heap when a lot of succesive attempts to do so fail.
 168  */
 169 static  ulong_t segkmem_lpthrottle_max = 0x400000;
 170 static  ulong_t segkmem_lpthrottle_start = 0x40;
 171 static  ulong_t segkmem_use_lpthrottle = 1;
 172
 173 /*
 174  * Freed pages accumulate on a garbage list until segkmem is ready,
 175  * at which point we call segkmem_gc() to free it all.
 176  */
 177 typedef struct segkmem_gc_list {
 178         struct segkmem_gc_list  *gc_next;
 179         vmem_t                  *gc_arena;
 180         size_t                  gc_size;
 181 } segkmem_gc_list_t;
 182
 183 static segkmem_gc_list_t *segkmem_gc_list;
 184
 185 /*
 186  * Allocations from the hat_memload arena add VM_MEMLOAD to their
 187  * vmflags so that segkmem_xalloc() can inform the hat layer that it needs
 188  * to take steps to prevent infinite recursion.  HAT allocations also
 189  * must be non-relocatable to prevent recursive page faults.
 190  */
 191 static void *
 192 hat_memload_alloc(vmem_t *vmp, size_t size, int flags)
 193 {
 194         flags |= (VM_MEMLOAD | VM_NORELOC);
 195         return (segkmem_alloc(vmp, size, flags));
 196 }
 197
 198 /*
 199  * Allocations from static_arena arena (or any other arena that uses
 200  * segkmem_alloc_permanent()) require non-relocatable (permanently
 201  * wired) memory pages, since these pages are referenced by physical
 202  * as well as virtual address.
 203  */
 204 void *
 205 segkmem_alloc_permanent(vmem_t *vmp, size_t size, int flags)
 206 {
 207         return (segkmem_alloc(vmp, size, flags | VM_NORELOC));
 208 }
 209
 210 /*
 211  * Initialize kernel heap boundaries.
 212  */
 213 void
 214 kernelheap_init(
 215         void *heap_start,
 216         void *heap_end,
 217         char *first_avail,
 218         void *core_start,
 219         void *core_end)
 220 {
 221         uintptr_t textbase;
 222         size_t core_size;
 223         size_t heap_size;
 224         vmem_t *heaptext_parent;
 225         size_t  heap_lp_size = 0;
 226
 227         kernelheap = heap_start;
 228         ekernelheap = heap_end;
 229
 230
 231         /*
 232          * If this platform has a 'core' heap area, then the space for
 233          * overflow module text should be carved out of the end of that
 234          * heap.  Otherwise, it gets carved out of the general purpose
 235          * heap.
 236          */
 237         core_size = (uintptr_t)core_end - (uintptr_t)core_start;
 238         if (core_size > 0) {
 239                 ASSERT(core_size >= HEAPTEXT_SIZE);
 240                 textbase = (uintptr_t)core_end - HEAPTEXT_SIZE;
 241                 core_size -= HEAPTEXT_SIZE;
 242         }
 243         else {
 244                 ekernelheap -= HEAPTEXT_SIZE;
 245                 textbase = (uintptr_t)ekernelheap;
 246         }
 247
 248         heap_size = (uintptr_t)ekernelheap - (uintptr_t)kernelheap;
 249         heap_arena = vmem_init("heap", kernelheap, heap_size, PAGESIZE,
 250             segkmem_alloc, segkmem_free);
 251
 252         if (core_size > 0) {
 253                 heap_core_arena = vmem_create("heap_core", core_start,
 254                     core_size, PAGESIZE, NULL, NULL, NULL, 0, VM_SLEEP);
 255                 heap_core_base = core_start;
 256         } else {
 257                 heap_core_arena = heap_arena;
 258                 heap_core_base = kernelheap;
 259         }
 260
 261         /*
 262          * reserve space for the large page heap. If large pages for kernel
 263          * heap is enabled large page heap arean will be created later in the
 264          * boot sequence in segkmem_heap_lp_init(). Otherwise the allocated
 265          * range will be returned back to the heap_arena.
 266          */
 267         if (heap_lp_size) {
 268                 (void) vmem_xalloc(heap_arena, heap_lp_size, PAGESIZE, 0, 0,
 269                     heap_lp_base, heap_lp_end,
 270                     VM_NOSLEEP | VM_BESTFIT | VM_PANIC);
 271         }
 272
 273         /*
 274          * Remove the already-spoken-for memory range [kernelheap, first_avail).
 275          */
 276         (void) vmem_xalloc(heap_arena, first_avail - kernelheap, PAGESIZE,
 277             0, 0, kernelheap, first_avail, VM_NOSLEEP | VM_BESTFIT | VM_PANIC);
 278
 279         heap32_arena = heap_core_arena;
 280         heaptext_parent = heap_core_arena;
 281
 282         heaptext_arena = vmem_create("heaptext", (void *)textbase,
 283             HEAPTEXT_SIZE, PAGESIZE, NULL, NULL, heaptext_parent, 0, VM_SLEEP);
 284
 285         /*
 286          * Create a set of arenas for memory with static translations
 287          * (e.g. VA -> PA translations cannot change).  Since using
 288          * kernel pages by physical address implies it isn't safe to
 289          * walk across page boundaries, the static_arena quantum must
 290          * be PAGESIZE.  Any kmem caches that require static memory
 291          * should source from static_arena, while direct allocations
 292          * should only use static_alloc_arena.
 293          */
 294         static_arena = vmem_create("static", NULL, 0, PAGESIZE,
 295             segkmem_alloc_permanent, segkmem_free, heap_arena, 0, VM_SLEEP);
 296         static_alloc_arena = vmem_create("static_alloc", NULL, 0,
 297             sizeof (uint64_t), vmem_alloc, vmem_free, static_arena,
 298             0, VM_SLEEP);
 299
 300         /*
 301          * Create an arena for translation data (ptes, hmes, or hblks).
 302          * We need an arena for this because hat_memload() is essential
 303          * to vmem_populate() (see comments in kernel/os/vmem.c).
 304          *
 305          * Note: any kmem cache that allocates from hat_memload_arena
 306          * must be created as a KMC_NOHASH cache (i.e. no external slab
 307          * and bufctl structures to allocate) so that slab creation doesn't
 308          * require anything more than a single vmem_alloc().
 309          */
 310         hat_memload_arena = vmem_create("hat_memload", NULL, 0, PAGESIZE,
 311             hat_memload_alloc, segkmem_free, heap_arena, 0,
 312             VM_SLEEP | VMC_POPULATOR | VMC_DUMPSAFE);
 313 }
 314
 315 void
 316 boot_mapin(caddr_t addr, size_t size)
 317 {
 318         caddr_t  eaddr;
 319         page_t  *pp;
 320         pfn_t    pfnum;
 321
 322         if (page_resv(btop(size), KM_NOSLEEP) == 0)
 323                 panic("boot_mapin: page_resv failed");
 324
 325         for (eaddr = addr + size; addr < eaddr; addr += PAGESIZE) {
 326                 pfnum = va_to_pfn(addr);
 327                 if (pfnum == PFN_INVALID)
 328                         continue;
 329                 if ((pp = page_numtopp_nolock(pfnum)) == NULL)
 330                         panic("boot_mapin(): No pp for pfnum = %lx", pfnum);
 331
 332                 /*
 333                  * must break up any large pages that may have constituent
 334                  * pages being utilized for BOP_ALLOC()'s before calling
 335                  * page_numtopp().The locking code (ie. page_reclaim())
 336                  * can't handle them
 337                  */
 338                 if (pp->p_szc != 0)
 339                         page_boot_demote(pp);
 340
 341                 pp = page_numtopp(pfnum, SE_EXCL);
 342                 if (pp == NULL || PP_ISFREE(pp))
 343                         panic("boot_alloc: pp is NULL or free");
 344
 345                 (void) page_hashin(pp, &kvp.v_object, (uoff_t)(uintptr_t)addr,
 346                                    false);
 347                 pp->p_lckcnt = 1;
 348 #if defined(__x86)
 349                 page_downgrade(pp);
 350 #else
 351                 page_unlock(pp);
 352 #endif
 353         }
 354 }
 355
 356 /*
 357  * Get pages from boot and hash them into the kernel's vp.
 358  * Used after page structs have been allocated, but before segkmem is ready.
 359  */
 360 void *
 361 boot_alloc(void *inaddr, size_t size, uint_t align)
 362 {
 363         caddr_t addr = inaddr;
 364
 365         if (bootops == NULL)
 366                 prom_panic("boot_alloc: attempt to allocate memory after "
 367                     "BOP_GONE");
 368
 369         size = ptob(btopr(size));
 370         if (BOP_ALLOC(bootops, addr, size, align) != addr)
 371                 panic("boot_alloc: BOP_ALLOC failed");
 372         boot_mapin((caddr_t)addr, size);
 373         return (addr);
 374 }
 375
 376 static void
 377 segkmem_badop()
 378 {
 379         panic("segkmem_badop");
 380 }
 381
 382 #define SEGKMEM_BADOP(t)        (t(*)())segkmem_badop
 383
 384 /*ARGSUSED*/
 385 static faultcode_t
 386 segkmem_fault(struct hat *hat, struct seg *seg, caddr_t addr, size_t size,
 387         enum fault_type type, enum seg_rw rw)
 388 {
 389         pgcnt_t npages;
 390         spgcnt_t pg;
 391         page_t *pp;
 392         struct vnode *vp = seg->s_data;
 393
 394         ASSERT(RW_READ_HELD(&seg->s_as->a_lock));
 395
 396         if (seg->s_as != &kas || size > seg->s_size ||
 397             addr < seg->s_base || addr + size > seg->s_base + seg->s_size)
 398                 panic("segkmem_fault: bad args");
 399
 400         /*
 401          * If it is one of segkp pages, call segkp_fault.
 402          */
 403         if (segkp_bitmap && seg == &kvseg &&
 404             BT_TEST(segkp_bitmap, btop((uintptr_t)(addr - seg->s_base))))
 405                 return (segop_fault(hat, segkp, addr, size, type, rw));
 406
 407         if (rw != S_READ && rw != S_WRITE && rw != S_OTHER)
 408                 return (FC_NOSUPPORT);
 409
 410         npages = btopr(size);
 411
 412         switch (type) {
 413         case F_SOFTLOCK:        /* lock down already-loaded translations */
 414                 for (pg = 0; pg < npages; pg++) {
 415                         pp = page_lookup(&vp->v_object, (uoff_t)(uintptr_t)addr,
 416                             SE_SHARED);
 417                         if (pp == NULL) {
 418                                 /*
 419                                  * Hmm, no page. Does a kernel mapping
 420                                  * exist for it?
 421                                  */
 422                                 if (!hat_probe(kas.a_hat, addr)) {
 423                                         addr -= PAGESIZE;
 424                                         while (--pg >= 0) {
 425                                                 pp = page_find(&vp->v_object,
 426                                                     (uoff_t)(uintptr_t)addr);
 427                                                 if (pp)
 428                                                         page_unlock(pp);
 429                                                 addr -= PAGESIZE;
 430                                         }
 431                                         return (FC_NOMAP);
 432                                 }
 433                         }
 434                         addr += PAGESIZE;
 435                 }
 436                 if (rw == S_OTHER)
 437                         hat_reserve(seg->s_as, addr, size);
 438                 return (0);
 439         case F_SOFTUNLOCK:
 440                 while (npages--) {
 441                         pp = page_find(&vp->v_object, (uoff_t)(uintptr_t)addr);
 442                         if (pp)
 443                                 page_unlock(pp);
 444                         addr += PAGESIZE;
 445                 }
 446                 return (0);
 447         default:
 448                 return (FC_NOSUPPORT);
 449         }
 450         /*NOTREACHED*/
 451 }
 452
 453 static int
 454 segkmem_setprot(struct seg *seg, caddr_t addr, size_t size, uint_t prot)
 455 {
 456         ASSERT(RW_LOCK_HELD(&seg->s_as->a_lock));
 457
 458         if (seg->s_as != &kas || size > seg->s_size ||
 459             addr < seg->s_base || addr + size > seg->s_base + seg->s_size)
 460                 panic("segkmem_setprot: bad args");
 461
 462         /*
 463          * If it is one of segkp pages, call segkp.
 464          */
 465         if (segkp_bitmap && seg == &kvseg &&
 466             BT_TEST(segkp_bitmap, btop((uintptr_t)(addr - seg->s_base))))
 467                 return (segop_setprot(segkp, addr, size, prot));
 468
 469         if (prot == 0)
 470                 hat_unload(kas.a_hat, addr, size, HAT_UNLOAD);
 471         else
 472                 hat_chgprot(kas.a_hat, addr, size, prot);
 473         return (0);
 474 }
 475
 476 /*
 477  * This is a dummy segkmem function overloaded to call segkp
 478  * when segkp is under the heap.
 479  */
 480 /* ARGSUSED */
 481 static int
 482 segkmem_checkprot(struct seg *seg, caddr_t addr, size_t size, uint_t prot)
 483 {
 484         ASSERT(RW_LOCK_HELD(&seg->s_as->a_lock));
 485
 486         if (seg->s_as != &kas)
 487                 segkmem_badop();
 488
 489         /*
 490          * If it is one of segkp pages, call into segkp.
 491          */
 492         if (segkp_bitmap && seg == &kvseg &&
 493             BT_TEST(segkp_bitmap, btop((uintptr_t)(addr - seg->s_base))))
 494                 return (segop_checkprot(segkp, addr, size, prot));
 495
 496         segkmem_badop();
 497         return (0);
 498 }
 499
 500 /*
 501  * This is a dummy segkmem function overloaded to call segkp
 502  * when segkp is under the heap.
 503  */
 504 /* ARGSUSED */
 505 static int
 506 segkmem_kluster(struct seg *seg, caddr_t addr, ssize_t delta)
 507 {
 508         ASSERT(RW_LOCK_HELD(&seg->s_as->a_lock));
 509
 510         if (seg->s_as != &kas)
 511                 segkmem_badop();
 512
 513         /*
 514          * If it is one of segkp pages, call into segkp.
 515          */
 516         if (segkp_bitmap && seg == &kvseg &&
 517             BT_TEST(segkp_bitmap, btop((uintptr_t)(addr - seg->s_base))))
 518                 return (segop_kluster(segkp, addr, delta));
 519
 520         segkmem_badop();
 521         return (0);
 522 }
 523
 524 static void
 525 segkmem_xdump_range(void *arg, void *start, size_t size)
 526 {
 527         struct as *as = arg;
 528         caddr_t addr = start;
 529         caddr_t addr_end = addr + size;
 530
 531         while (addr < addr_end) {
 532                 pfn_t pfn = hat_getpfnum(kas.a_hat, addr);
 533                 if (pfn != PFN_INVALID && pfn <= physmax && pf_is_memory(pfn))
 534                         dump_addpage(as, addr, pfn);
 535                 addr += PAGESIZE;
 536                 dump_timeleft = dump_timeout;
 537         }
 538 }
 539
 540 static void
 541 segkmem_dump_range(void *arg, void *start, size_t size)
 542 {
 543         caddr_t addr = start;
 544         caddr_t addr_end = addr + size;
 545
 546         /*
 547          * If we are about to start dumping the range of addresses we
 548          * carved out of the kernel heap for the large page heap walk
 549          * heap_lp_arena to find what segments are actually populated
 550          */
 551         if (SEGKMEM_USE_LARGEPAGES &&
 552             addr == heap_lp_base && addr_end == heap_lp_end &&
 553             vmem_size(heap_lp_arena, VMEM_ALLOC) < size) {
 554                 vmem_walk(heap_lp_arena, VMEM_ALLOC | VMEM_REENTRANT,
 555                     segkmem_xdump_range, arg);
 556         } else {
 557                 segkmem_xdump_range(arg, start, size);
 558         }
 559 }
 560
 561 static void
 562 segkmem_dump(struct seg *seg)
 563 {
 564         /*
 565          * The kernel's heap_arena (represented by kvseg) is a very large
 566          * VA space, most of which is typically unused.  To speed up dumping
 567          * we use vmem_walk() to quickly find the pieces of heap_arena that
 568          * are actually in use.  We do the same for heap32_arena and
 569          * heap_core.
 570          *
 571          * We specify VMEM_REENTRANT to vmem_walk() because dump_addpage()
 572          * may ultimately need to allocate memory.  Reentrant walks are
 573          * necessarily imperfect snapshots.  The kernel heap continues
 574          * to change during a live crash dump, for example.  For a normal
 575          * crash dump, however, we know that there won't be any other threads
 576          * messing with the heap.  Therefore, at worst, we may fail to dump
 577          * the pages that get allocated by the act of dumping; but we will
 578          * always dump every page that was allocated when the walk began.
 579          *
 580          * The other segkmem segments are dense (fully populated), so there's
 581          * no need to use this technique when dumping them.
 582          *
 583          * Note: when adding special dump handling for any new sparsely-
 584          * populated segments, be sure to add similar handling to the ::kgrep
 585          * code in mdb.
 586          */
 587         if (seg == &kvseg) {
 588                 vmem_walk(heap_arena, VMEM_ALLOC | VMEM_REENTRANT,
 589                     segkmem_dump_range, seg->s_as);
 590                 vmem_walk(heaptext_arena, VMEM_ALLOC | VMEM_REENTRANT,
 591                     segkmem_dump_range, seg->s_as);
 592         } else if (seg == &kvseg_core) {
 593                 vmem_walk(heap_core_arena, VMEM_ALLOC | VMEM_REENTRANT,
 594                     segkmem_dump_range, seg->s_as);
 595         } else if (seg == &kvseg32) {
 596                 vmem_walk(heap32_arena, VMEM_ALLOC | VMEM_REENTRANT,
 597                     segkmem_dump_range, seg->s_as);
 598                 vmem_walk(heaptext_arena, VMEM_ALLOC | VMEM_REENTRANT,
 599                     segkmem_dump_range, seg->s_as);
 600         } else if (seg == &kzioseg) {
 601                 /*
 602                  * We don't want to dump pages attached to kzioseg since they
 603                  * contain file data from ZFS.  If this page's segment is
 604                  * kzioseg return instead of writing it to the dump device.
 605                  */
 606                 return;
 607         } else {
 608                 segkmem_dump_range(seg->s_as, seg->s_base, seg->s_size);
 609         }
 610 }
 611
 612 /*
 613  * lock/unlock kmem pages over a given range [addr, addr+len).
 614  * Returns a shadow list of pages in ppp. If there are holes
 615  * in the range (e.g. some of the kernel mappings do not have
 616  * underlying page_ts) returns ENOTSUP so that as_pagelock()
 617  * will handle the range via as_fault(F_SOFTLOCK).
 618  */
 619 /*ARGSUSED*/
 620 static int
 621 segkmem_pagelock(struct seg *seg, caddr_t addr, size_t len,
 622         page_t ***ppp, enum lock_type type, enum seg_rw rw)
 623 {
 624         page_t **pplist, *pp;
 625         pgcnt_t npages;
 626         spgcnt_t pg;
 627         size_t nb;
 628         struct vnode *vp = seg->s_data;
 629
 630         ASSERT(ppp != NULL);
 631
 632         /*
 633          * If it is one of segkp pages, call into segkp.
 634          */
 635         if (segkp_bitmap && seg == &kvseg &&
 636             BT_TEST(segkp_bitmap, btop((uintptr_t)(addr - seg->s_base))))
 637                 return (segop_pagelock(segkp, addr, len, ppp, type, rw));
 638
 639         npages = btopr(len);
 640         nb = sizeof (page_t *) * npages;
 641
 642         if (type == L_PAGEUNLOCK) {
 643                 pplist = *ppp;
 644                 ASSERT(pplist != NULL);
 645
 646                 for (pg = 0; pg < npages; pg++) {
 647                         pp = pplist[pg];
 648                         page_unlock(pp);
 649                 }
 650                 kmem_free(pplist, nb);
 651                 return (0);
 652         }
 653
 654         ASSERT(type == L_PAGELOCK);
 655
 656         pplist = kmem_alloc(nb, KM_NOSLEEP);
 657         if (pplist == NULL) {
 658                 *ppp = NULL;
 659                 return (ENOTSUP);       /* take the slow path */
 660         }
 661
 662         for (pg = 0; pg < npages; pg++) {
 663                 pp = page_lookup(&vp->v_object, (uoff_t)(uintptr_t)addr,
 664                                  SE_SHARED);
 665                 if (pp == NULL) {
 666                         while (--pg >= 0)
 667                                 page_unlock(pplist[pg]);
 668                         kmem_free(pplist, nb);
 669                         *ppp = NULL;
 670                         return (ENOTSUP);
 671                 }
 672                 pplist[pg] = pp;
 673                 addr += PAGESIZE;
 674         }
 675
 676         *ppp = pplist;
 677         return (0);
 678 }
 679
 680 /*
 681  * This is a dummy segkmem function overloaded to call segkp
 682  * when segkp is under the heap.
 683  */
 684 /* ARGSUSED */
 685 static int
 686 segkmem_getmemid(struct seg *seg, caddr_t addr, memid_t *memidp)
 687 {
 688         ASSERT(RW_LOCK_HELD(&seg->s_as->a_lock));
 689
 690         if (seg->s_as != &kas)
 691                 segkmem_badop();
 692
 693         /*
 694          * If it is one of segkp pages, call into segkp.
 695          */
 696         if (segkp_bitmap && seg == &kvseg &&
 697             BT_TEST(segkp_bitmap, btop((uintptr_t)(addr - seg->s_base))))
 698                 return (segop_getmemid(segkp, addr, memidp));
 699
 700         segkmem_badop();
 701         return (0);
 702 }
 703
 704 /*ARGSUSED*/
 705 static int
 706 segkmem_capable(struct seg *seg, segcapability_t capability)
 707 {
 708         if (capability == S_CAPABILITY_NOMINFLT)
 709                 return (1);
 710         return (0);
 711 }
 712
 713 const struct seg_ops segkmem_ops = {
 714         .dup            = SEGKMEM_BADOP(int),
 715         .unmap          = SEGKMEM_BADOP(int),
 716         .free           = SEGKMEM_BADOP(void),
 717         .fault          = segkmem_fault,
 718         .faulta         = SEGKMEM_BADOP(faultcode_t),
 719         .setprot        = segkmem_setprot,
 720         .checkprot      = segkmem_checkprot,
 721         .kluster        = segkmem_kluster,
 722         .sync           = SEGKMEM_BADOP(int),
 723         .incore         = SEGKMEM_BADOP(size_t),
 724         .lockop         = SEGKMEM_BADOP(int),
 725         .getprot        = SEGKMEM_BADOP(int),
 726         .getoffset      = SEGKMEM_BADOP(uoff_t),
 727         .gettype        = SEGKMEM_BADOP(int),
 728         .getvp          = SEGKMEM_BADOP(int),
 729         .advise         = SEGKMEM_BADOP(int),
 730         .dump           = segkmem_dump,
 731         .pagelock       = segkmem_pagelock,
 732         .setpagesize    = SEGKMEM_BADOP(int),
 733         .getmemid       = segkmem_getmemid,
 734         .capable        = segkmem_capable,
 735 };
 736
 737 int
 738 segkmem_zio_create(struct seg *seg)
 739 {
 740         ASSERT(seg->s_as == &kas && RW_WRITE_HELD(&kas.a_lock));
 741         seg->s_ops = &segkmem_ops;
 742         seg->s_data = &zvp;
 743         kas.a_size += seg->s_size;
 744         return (0);
 745 }
 746
 747 int
 748 segkmem_create(struct seg *seg)
 749 {
 750         ASSERT(seg->s_as == &kas && RW_WRITE_HELD(&kas.a_lock));
 751         seg->s_ops = &segkmem_ops;
 752         seg->s_data = &kvp;
 753         kas.a_size += seg->s_size;
 754         return (0);
 755 }
 756
 757 /*ARGSUSED*/
 758 page_t *
 759 segkmem_page_create(void *addr, size_t size, int vmflag, void *arg)
 760 {
 761         struct seg kseg;
 762         int pgflags;
 763         struct vnode *vp = arg;
 764
 765         if (vp == NULL)
 766                 vp = &kvp;
 767
 768         kseg.s_as = &kas;
 769         pgflags = PG_EXCL;
 770
 771         if (segkmem_reloc == 0 || (vmflag & VM_NORELOC))
 772                 pgflags |= PG_NORELOC;
 773         if ((vmflag & VM_NOSLEEP) == 0)
 774                 pgflags |= PG_WAIT;
 775         if (vmflag & VM_PANIC)
 776                 pgflags |= PG_PANIC;
 777         if (vmflag & VM_PUSHPAGE)
 778                 pgflags |= PG_PUSHPAGE;
 779         if (vmflag & VM_NORMALPRI) {
 780                 ASSERT(vmflag & VM_NOSLEEP);
 781                 pgflags |= PG_NORMALPRI;
 782         }
 783
 784         return (page_create_va(&vp->v_object, (uoff_t)(uintptr_t)addr, size,
 785             pgflags, &kseg, addr));
 786 }
 787
 788 /*
 789  * Allocate pages to back the virtual address range [addr, addr + size).
 790  * If addr is NULL, allocate the virtual address space as well.
 791  */
 792 void *
 793 segkmem_xalloc(vmem_t *vmp, void *inaddr, size_t size, int vmflag, uint_t attr,
 794         page_t *(*page_create_func)(void *, size_t, int, void *), void *pcarg)
 795 {
 796         page_t *ppl;
 797         caddr_t addr = inaddr;
 798         pgcnt_t npages = btopr(size);
 799         int allocflag;
 800
 801         if (inaddr == NULL && (addr = vmem_alloc(vmp, size, vmflag)) == NULL)
 802                 return (NULL);
 803
 804         ASSERT(((uintptr_t)addr & PAGEOFFSET) == 0);
 805
 806         if (page_resv(npages, vmflag & VM_KMFLAGS) == 0) {
 807                 if (inaddr == NULL)
 808                         vmem_free(vmp, addr, size);
 809                 return (NULL);
 810         }
 811
 812         ppl = page_create_func(addr, size, vmflag, pcarg);
 813         if (ppl == NULL) {
 814                 if (inaddr == NULL)
 815                         vmem_free(vmp, addr, size);
 816                 page_unresv(npages);
 817                 return (NULL);
 818         }
 819
 820         /*
 821          * Under certain conditions, we need to let the HAT layer know
 822          * that it cannot safely allocate memory.  Allocations from
 823          * the hat_memload vmem arena always need this, to prevent
 824          * infinite recursion.
 825          *
 826          * In addition, the x86 hat cannot safely do memory
 827          * allocations while in vmem_populate(), because there
 828          * is no simple bound on its usage.
 829          */
 830         if (vmflag & VM_MEMLOAD)
 831                 allocflag = HAT_NO_KALLOC;
 832 #if defined(__x86)
 833         else if (vmem_is_populator())
 834                 allocflag = HAT_NO_KALLOC;
 835 #endif
 836         else
 837                 allocflag = 0;
 838
 839         while (ppl != NULL) {
 840                 page_t *pp = ppl;
 841                 page_sub(&ppl, pp);
 842                 ASSERT(page_iolock_assert(pp));
 843                 ASSERT(PAGE_EXCL(pp));
 844                 page_io_unlock(pp);
 845                 hat_memload(kas.a_hat, (caddr_t)(uintptr_t)pp->p_offset, pp,
 846                     (PROT_ALL & ~PROT_USER) | HAT_NOSYNC | attr,
 847                     HAT_LOAD_LOCK | allocflag);
 848                 pp->p_lckcnt = 1;
 849 #if defined(__x86)
 850                 page_downgrade(pp);
 851 #else
 852                 if (vmflag & SEGKMEM_SHARELOCKED)
 853                         page_downgrade(pp);
 854                 else
 855                         page_unlock(pp);
 856 #endif
 857         }
 858
 859         return (addr);
 860 }
 861
 862 static void *
 863 segkmem_alloc_vn(vmem_t *vmp, size_t size, int vmflag, struct vnode *vp)
 864 {
 865         void *addr;
 866         segkmem_gc_list_t *gcp, **prev_gcpp;
 867
 868         ASSERT(vp != NULL);
 869
 870         if (kvseg.s_base == NULL) {
 871                 if (bootops->bsys_alloc == NULL)
 872                         halt("Memory allocation between bop_alloc() and "
 873                             "kmem_alloc().\n");
 874
 875                 /*
 876                  * There's not a lot of memory to go around during boot,
 877                  * so recycle it if we can.
 878                  */
 879                 for (prev_gcpp = &segkmem_gc_list; (gcp = *prev_gcpp) != NULL;
 880                     prev_gcpp = &gcp->gc_next) {
 881                         if (gcp->gc_arena == vmp && gcp->gc_size == size) {
 882                                 *prev_gcpp = gcp->gc_next;
 883                                 return (gcp);
 884                         }
 885                 }
 886
 887                 addr = vmem_alloc(vmp, size, vmflag | VM_PANIC);
 888                 if (boot_alloc(addr, size, BO_NO_ALIGN) != addr)
 889                         panic("segkmem_alloc: boot_alloc failed");
 890                 return (addr);
 891         }
 892         return (segkmem_xalloc(vmp, NULL, size, vmflag, 0,
 893             segkmem_page_create, vp));
 894 }
 895
 896 void *
 897 segkmem_alloc(vmem_t *vmp, size_t size, int vmflag)
 898 {
 899         return (segkmem_alloc_vn(vmp, size, vmflag, &kvp));
 900 }
 901
 902 void *
 903 segkmem_zio_alloc(vmem_t *vmp, size_t size, int vmflag)
 904 {
 905         return (segkmem_alloc_vn(vmp, size, vmflag, &zvp));
 906 }
 907
 908 /*
 909  * Any changes to this routine must also be carried over to
 910  * devmap_free_pages() in the seg_dev driver. This is because
 911  * we currently don't have a special kernel segment for non-paged
 912  * kernel memory that is exported by drivers to user space.
 913  */
 914 static void
 915 segkmem_free_vn(vmem_t *vmp, void *inaddr, size_t size, struct vnode *vp,
 916     void (*func)(page_t *))
 917 {
 918         page_t *pp;
 919         caddr_t addr = inaddr;
 920         caddr_t eaddr;
 921         pgcnt_t npages = btopr(size);
 922
 923         ASSERT(((uintptr_t)addr & PAGEOFFSET) == 0);
 924         ASSERT(vp != NULL);
 925
 926         if (kvseg.s_base == NULL) {
 927                 segkmem_gc_list_t *gc = inaddr;
 928                 gc->gc_arena = vmp;
 929                 gc->gc_size = size;
 930                 gc->gc_next = segkmem_gc_list;
 931                 segkmem_gc_list = gc;
 932                 return;
 933         }
 934
 935         hat_unload(kas.a_hat, addr, size, HAT_UNLOAD_UNLOCK);
 936
 937         for (eaddr = addr + size; addr < eaddr; addr += PAGESIZE) {
 938 #if defined(__x86)
 939                 pp = page_find(&vp->v_object, (uoff_t)(uintptr_t)addr);
 940                 if (pp == NULL)
 941                         panic("segkmem_free: page not found");
 942                 if (!page_tryupgrade(pp)) {
 943                         /*
 944                          * Some other thread has a sharelock. Wait for
 945                          * it to drop the lock so we can free this page.
 946                          */
 947                         page_unlock(pp);
 948                         pp = page_lookup(&vp->v_object, (uoff_t)(uintptr_t)addr,
 949                             SE_EXCL);
 950                 }
 951 #else
 952                 pp = page_lookup(&vp->v_object, (uoff_t)(uintptr_t)addr,
 953                                  SE_EXCL);
 954 #endif
 955                 if (pp == NULL)
 956                         panic("segkmem_free: page not found");
 957                 /* Clear p_lckcnt so page_destroy() doesn't update availrmem */
 958                 pp->p_lckcnt = 0;
 959                 if (func)
 960                         func(pp);
 961                 else
 962                         page_destroy(pp, 0);
 963         }
 964         if (func == NULL)
 965                 page_unresv(npages);
 966
 967         if (vmp != NULL)
 968                 vmem_free(vmp, inaddr, size);
 969
 970 }
 971
 972 void
 973 segkmem_xfree(vmem_t *vmp, void *inaddr, size_t size, void (*func)(page_t *))
 974 {
 975         segkmem_free_vn(vmp, inaddr, size, &kvp, func);
 976 }
 977
 978 void
 979 segkmem_free(vmem_t *vmp, void *inaddr, size_t size)
 980 {
 981         segkmem_free_vn(vmp, inaddr, size, &kvp, NULL);
 982 }
 983
 984 void
 985 segkmem_zio_free(vmem_t *vmp, void *inaddr, size_t size)
 986 {
 987         segkmem_free_vn(vmp, inaddr, size, &zvp, NULL);
 988 }
 989
 990 void
 991 segkmem_gc(void)
 992 {
 993         ASSERT(kvseg.s_base != NULL);
 994         while (segkmem_gc_list != NULL) {
 995                 segkmem_gc_list_t *gc = segkmem_gc_list;
 996                 segkmem_gc_list = gc->gc_next;
 997                 segkmem_free(gc->gc_arena, gc, gc->gc_size);
 998         }
 999 }
1000
1001 /*
1002  * Legacy entry points from here to end of file.
1003  */
1004 void
1005 segkmem_mapin(struct seg *seg, void *addr, size_t size, uint_t vprot,
1006     pfn_t pfn, uint_t flags)
1007 {
1008         hat_unload(seg->s_as->a_hat, addr, size, HAT_UNLOAD_UNLOCK);
1009         hat_devload(seg->s_as->a_hat, addr, size, pfn, vprot,
1010             flags | HAT_LOAD_LOCK);
1011 }
1012
1013 void
1014 segkmem_mapout(struct seg *seg, void *addr, size_t size)
1015 {
1016         hat_unload(seg->s_as->a_hat, addr, size, HAT_UNLOAD_UNLOCK);
1017 }
1018
1019 void *
1020 kmem_getpages(pgcnt_t npages, int kmflag)
1021 {
1022         return (kmem_alloc(ptob(npages), kmflag));
1023 }
1024
1025 void
1026 kmem_freepages(void *addr, pgcnt_t npages)
1027 {
1028         kmem_free(addr, ptob(npages));
1029 }
1030
1031 /*
1032  * segkmem_page_create_large() allocates a large page to be used for the kmem
1033  * caches. If kpr is enabled we ask for a relocatable page unless requested
1034  * otherwise. If kpr is disabled we have to ask for a non-reloc page
1035  */
1036 static page_t *
1037 segkmem_page_create_large(void *addr, size_t size, int vmflag, void *arg)
1038 {
1039         int pgflags;
1040
1041         pgflags = PG_EXCL;
1042
1043         if (segkmem_reloc == 0 || (vmflag & VM_NORELOC))
1044                 pgflags |= PG_NORELOC;
1045         if (!(vmflag & VM_NOSLEEP))
1046                 pgflags |= PG_WAIT;
1047         if (vmflag & VM_PUSHPAGE)
1048                 pgflags |= PG_PUSHPAGE;
1049         if (vmflag & VM_NORMALPRI)
1050                 pgflags |= PG_NORMALPRI;
1051
1052         return (page_create_va_large(&kvp.v_object, (uoff_t)(uintptr_t)addr,
1053                                      size, pgflags, &kvseg, addr, arg));
1054 }
1055
1056 /*
1057  * Allocate a large page to back the virtual address range
1058  * [addr, addr + size).  If addr is NULL, allocate the virtual address
1059  * space as well.
1060  */
1061 static void *
1062 segkmem_xalloc_lp(vmem_t *vmp, void *inaddr, size_t size, int vmflag,
1063     uint_t attr, page_t *(*page_create_func)(void *, size_t, int, void *),
1064     void *pcarg)
1065 {
1066         caddr_t addr = inaddr, pa;
1067         size_t  lpsize = segkmem_lpsize;
1068         pgcnt_t npages = btopr(size);
1069         pgcnt_t nbpages = btop(lpsize);
1070         pgcnt_t nlpages = size >> segkmem_lpshift;
1071         size_t  ppasize = nbpages * sizeof (page_t *);
1072         page_t *pp, *rootpp, **ppa, *pplist = NULL;
1073         int i;
1074
1075         vmflag |= VM_NOSLEEP;
1076
1077         if (page_resv(npages, vmflag & VM_KMFLAGS) == 0) {
1078                 return (NULL);
1079         }
1080
1081         /*
1082          * allocate an array we need for hat_memload_array.
1083          * we use a separate arena to avoid recursion.
1084          * we will not need this array when hat_memload_array learns pp++
1085          */
1086         if ((ppa = vmem_alloc(segkmem_ppa_arena, ppasize, vmflag)) == NULL) {
1087                 goto fail_array_alloc;
1088         }
1089
1090         if (inaddr == NULL && (addr = vmem_alloc(vmp, size, vmflag)) == NULL)
1091                 goto fail_vmem_alloc;
1092
1093         ASSERT(((uintptr_t)addr & (lpsize - 1)) == 0);
1094
1095         /* create all the pages */
1096         for (pa = addr, i = 0; i < nlpages; i++, pa += lpsize) {
1097                 if ((pp = page_create_func(pa, lpsize, vmflag, pcarg)) == NULL)
1098                         goto fail_page_create;
1099                 page_list_concat(&pplist, &pp);
1100         }
1101
1102         /* at this point we have all the resource to complete the request */
1103         while ((rootpp = pplist) != NULL) {
1104                 for (i = 0; i < nbpages; i++) {
1105                         ASSERT(pplist != NULL);
1106                         pp = pplist;
1107                         page_sub(&pplist, pp);
1108                         ASSERT(page_iolock_assert(pp));
1109                         page_io_unlock(pp);
1110                         ppa[i] = pp;
1111                 }
1112                 /*
1113                  * Load the locked entry. It's OK to preload the entry into the
1114                  * TSB since we now support large mappings in the kernel TSB.
1115                  */
1116                 hat_memload_array(kas.a_hat,
1117                     (caddr_t)(uintptr_t)rootpp->p_offset, lpsize,
1118                     ppa, (PROT_ALL & ~PROT_USER) | HAT_NOSYNC | attr,
1119                     HAT_LOAD_LOCK);
1120
1121                 for (--i; i >= 0; --i) {
1122                         ppa[i]->p_lckcnt = 1;
1123                         page_unlock(ppa[i]);
1124                 }
1125         }
1126
1127         vmem_free(segkmem_ppa_arena, ppa, ppasize);
1128         return (addr);
1129
1130 fail_page_create:
1131         while ((rootpp = pplist) != NULL) {
1132                 for (i = 0, pp = pplist; i < nbpages; i++, pp = pplist) {
1133                         ASSERT(pp != NULL);
1134                         page_sub(&pplist, pp);
1135                         ASSERT(page_iolock_assert(pp));
1136                         page_io_unlock(pp);
1137                 }
1138                 page_destroy_pages(rootpp);
1139         }
1140
1141         if (inaddr == NULL)
1142                 vmem_free(vmp, addr, size);
1143
1144 fail_vmem_alloc:
1145         vmem_free(segkmem_ppa_arena, ppa, ppasize);
1146
1147 fail_array_alloc:
1148         page_unresv(npages);
1149
1150         return (NULL);
1151 }
1152
1153 static void
1154 segkmem_free_one_lp(caddr_t addr, size_t size)
1155 {
1156         page_t          *pp, *rootpp = NULL;
1157         pgcnt_t         pgs_left = btopr(size);
1158
1159         ASSERT(size == segkmem_lpsize);
1160
1161         hat_unload(kas.a_hat, addr, size, HAT_UNLOAD_UNLOCK);
1162
1163         for (; pgs_left > 0; addr += PAGESIZE, pgs_left--) {
1164                 pp = page_lookup(&kvp.v_object, (uoff_t)(uintptr_t)addr, SE_EXCL);
1165                 if (pp == NULL)
1166                         panic("segkmem_free_one_lp: page not found");
1167                 ASSERT(PAGE_EXCL(pp));
1168                 pp->p_lckcnt = 0;
1169                 if (rootpp == NULL)
1170                         rootpp = pp;
1171         }
1172         ASSERT(rootpp != NULL);
1173         page_destroy_pages(rootpp);
1174
1175         /* page_unresv() is done by the caller */
1176 }
1177
1178 /*
1179  * This function is called to import new spans into the vmem arenas like
1180  * kmem_default_arena and kmem_oversize_arena. It first tries to import
1181  * spans from large page arena - kmem_lp_arena. In order to do this it might
1182  * have to "upgrade the requested size" to kmem_lp_arena quantum. If
1183  * it was not able to satisfy the upgraded request it then calls regular
1184  * segkmem_alloc() that satisfies the request by importing from "*vmp" arena
1185  */
1186 /*ARGSUSED*/
1187 void *
1188 segkmem_alloc_lp(vmem_t *vmp, size_t *sizep, size_t align, int vmflag)
1189 {
1190         size_t size;
1191         kthread_t *t = curthread;
1192         segkmem_lpcb_t *lpcb = &segkmem_lpcb;
1193
1194         ASSERT(sizep != NULL);
1195
1196         size = *sizep;
1197
1198         if (lpcb->lp_uselp && !(t->t_flag & T_PANIC) &&
1199             !(vmflag & SEGKMEM_SHARELOCKED)) {
1200
1201                 size_t kmemlp_qnt = segkmem_kmemlp_quantum;
1202                 size_t asize = P2ROUNDUP(size, kmemlp_qnt);
1203                 void  *addr = NULL;
1204                 ulong_t *lpthrtp = &lpcb->lp_throttle;
1205                 ulong_t lpthrt = *lpthrtp;
1206                 int     dowakeup = 0;
1207                 int     doalloc = 1;
1208
1209                 ASSERT(kmem_lp_arena != NULL);
1210                 ASSERT(asize >= size);
1211
1212                 if (lpthrt != 0) {
1213                         /* try to update the throttle value */
1214                         lpthrt = atomic_inc_ulong_nv(lpthrtp);
1215                         if (lpthrt >= segkmem_lpthrottle_max) {
1216                                 lpthrt = atomic_cas_ulong(lpthrtp, lpthrt,
1217                                     segkmem_lpthrottle_max / 4);
1218                         }
1219
1220                         /*
1221                          * when we get above throttle start do an exponential
1222                          * backoff at trying large pages and reaping
1223                          */
1224                         if (lpthrt > segkmem_lpthrottle_start &&
1225                             !ISP2(lpthrt)) {
1226                                 lpcb->allocs_throttled++;
1227                                 lpthrt--;
1228                                 if (ISP2(lpthrt))
1229                                         kmem_reap();
1230                                 return (segkmem_alloc(vmp, size, vmflag));
1231                         }
1232                 }
1233
1234                 if (!(vmflag & VM_NOSLEEP) &&
1235                     segkmem_heaplp_quantum >= (8 * kmemlp_qnt) &&
1236                     vmem_size(kmem_lp_arena, VMEM_FREE) <= kmemlp_qnt &&
1237                     asize < (segkmem_heaplp_quantum - kmemlp_qnt)) {
1238
1239                         /*
1240                          * we are low on free memory in kmem_lp_arena
1241                          * we let only one guy to allocate heap_lp
1242                          * quantum size chunk that everybody is going to
1243                          * share
1244                          */
1245                         mutex_enter(&lpcb->lp_lock);
1246
1247                         if (lpcb->lp_wait) {
1248
1249                                 /* we are not the first one - wait */
1250                                 cv_wait(&lpcb->lp_cv, &lpcb->lp_lock);
1251                                 if (vmem_size(kmem_lp_arena, VMEM_FREE) <
1252                                     kmemlp_qnt)  {
1253                                         doalloc = 0;
1254                                 }
1255                         } else if (vmem_size(kmem_lp_arena, VMEM_FREE) <=
1256                             kmemlp_qnt) {
1257
1258                                 /*
1259                                  * we are the first one, make sure we import
1260                                  * a large page
1261                                  */
1262                                 if (asize == kmemlp_qnt)
1263                                         asize += kmemlp_qnt;
1264                                 dowakeup = 1;
1265                                 lpcb->lp_wait = 1;
1266                         }
1267
1268                         mutex_exit(&lpcb->lp_lock);
1269                 }
1270
1271                 /*
1272                  * VM_ABORT flag prevents sleeps in vmem_xalloc when
1273                  * large pages are not available. In that case this allocation
1274                  * attempt will fail and we will retry allocation with small
1275                  * pages. We also do not want to panic if this allocation fails
1276                  * because we are going to retry.
1277                  */
1278                 if (doalloc) {
1279                         addr = vmem_alloc(kmem_lp_arena, asize,
1280                             (vmflag | VM_ABORT) & ~VM_PANIC);
1281
1282                         if (dowakeup) {
1283                                 mutex_enter(&lpcb->lp_lock);
1284                                 ASSERT(lpcb->lp_wait != 0);
1285                                 lpcb->lp_wait = 0;
1286                                 cv_broadcast(&lpcb->lp_cv);
1287                                 mutex_exit(&lpcb->lp_lock);
1288                         }
1289                 }
1290
1291                 if (addr != NULL) {
1292                         *sizep = asize;
1293                         *lpthrtp = 0;
1294                         return (addr);
1295                 }
1296
1297                 if (vmflag & VM_NOSLEEP)
1298                         lpcb->nosleep_allocs_failed++;
1299                 else
1300                         lpcb->sleep_allocs_failed++;
1301                 lpcb->alloc_bytes_failed += size;
1302
1303                 /* if large page throttling is not started yet do it */
1304                 if (segkmem_use_lpthrottle && lpthrt == 0) {
1305                         lpthrt = atomic_cas_ulong(lpthrtp, lpthrt, 1);
1306                 }
1307         }
1308         return (segkmem_alloc(vmp, size, vmflag));
1309 }
1310
1311 void
1312 segkmem_free_lp(vmem_t *vmp, void *inaddr, size_t size)
1313 {
1314         if (kmem_lp_arena == NULL || !IS_KMEM_VA_LARGEPAGE((caddr_t)inaddr)) {
1315                 segkmem_free(vmp, inaddr, size);
1316         } else {
1317                 vmem_free(kmem_lp_arena, inaddr, size);
1318         }
1319 }
1320
1321 /*
1322  * segkmem_alloc_lpi() imports virtual memory from large page heap arena
1323  * into kmem_lp arena. In the process it maps the imported segment with
1324  * large pages
1325  */
1326 static void *
1327 segkmem_alloc_lpi(vmem_t *vmp, size_t size, int vmflag)
1328 {
1329         segkmem_lpcb_t *lpcb = &segkmem_lpcb;
1330         void  *addr;
1331
1332         ASSERT(size != 0);
1333         ASSERT(vmp == heap_lp_arena);
1334
1335         /* do not allow large page heap grow beyound limits */
1336         if (vmem_size(vmp, VMEM_ALLOC) >= segkmem_kmemlp_max) {
1337                 lpcb->allocs_limited++;
1338                 return (NULL);
1339         }
1340
1341         addr = segkmem_xalloc_lp(vmp, NULL, size, vmflag, 0,
1342             segkmem_page_create_large, NULL);
1343         return (addr);
1344 }
1345
1346 /*
1347  * segkmem_free_lpi() returns virtual memory back into large page heap arena
1348  * from kmem_lp arena. Beore doing this it unmaps the segment and frees
1349  * large pages used to map it.
1350  */
1351 static void
1352 segkmem_free_lpi(vmem_t *vmp, void *inaddr, size_t size)
1353 {
1354         pgcnt_t         nlpages = size >> segkmem_lpshift;
1355         size_t          lpsize = segkmem_lpsize;
1356         caddr_t         addr = inaddr;
1357         pgcnt_t         npages = btopr(size);
1358         int             i;
1359
1360         ASSERT(vmp == heap_lp_arena);
1361         ASSERT(IS_KMEM_VA_LARGEPAGE(addr));
1362         ASSERT(((uintptr_t)inaddr & (lpsize - 1)) == 0);
1363
1364         for (i = 0; i < nlpages; i++) {
1365                 segkmem_free_one_lp(addr, lpsize);
1366                 addr += lpsize;
1367         }
1368
1369         page_unresv(npages);
1370
1371         vmem_free(vmp, inaddr, size);
1372 }
1373
1374 /*
1375  * This function is called at system boot time by kmem_init right after
1376  * /etc/system file has been read. It checks based on hardware configuration
1377  * and /etc/system settings if system is going to use large pages. The
1378  * initialiazation necessary to actually start using large pages
1379  * happens later in the process after segkmem_heap_lp_init() is called.
1380  */
1381 int
1382 segkmem_lpsetup()
1383 {
1384         int use_large_pages = 0;
1385
1386         return (use_large_pages);
1387 }
1388
1389 void
1390 segkmem_zio_init(void *zio_mem_base, size_t zio_mem_size)
1391 {
1392         ASSERT(zio_mem_base != NULL);
1393         ASSERT(zio_mem_size != 0);
1394
1395         /*
1396          * To reduce VA space fragmentation, we set up quantum caches for the
1397          * smaller sizes;  we chose 32k because that translates to 128k VA
1398          * slabs, which matches nicely with the common 128k zio_data bufs.
1399          */
1400         zio_arena = vmem_create("zfs_file_data", zio_mem_base, zio_mem_size,
1401             PAGESIZE, NULL, NULL, NULL, 32 * 1024, VM_SLEEP);
1402
1403         zio_alloc_arena = vmem_create("zfs_file_data_buf", NULL, 0, PAGESIZE,
1404             segkmem_zio_alloc, segkmem_zio_free, zio_arena, 0, VM_SLEEP);
1405
1406         ASSERT(zio_arena != NULL);
1407         ASSERT(zio_alloc_arena != NULL);
1408 }
1409