usr/src/uts/i86pc/vm/vm_machdep.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved.
  23  */
  24 /*
  25  * Copyright (c) 2010, Intel Corporation.
  26  * All rights reserved.
  27  * Copyright 2019, Joyent, Inc.
  28  */
  29
  30 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
  31 /*      All Rights Reserved   */
  32
  33 /*
  34  * Portions of this source code were derived from Berkeley 4.3 BSD
  35  * under license from the Regents of the University of California.
  36  */
  37
  38 /*
  39  * UNIX machine dependent virtual memory support.
  40  */
  41
  42 #include <sys/types.h>
  43 #include <sys/param.h>
  44 #include <sys/systm.h>
  45 #include <sys/user.h>
  46 #include <sys/proc.h>
  47 #include <sys/kmem.h>
  48 #include <sys/vmem.h>
  49 #include <sys/buf.h>
  50 #include <sys/cpuvar.h>
  51 #include <sys/lgrp.h>
  52 #include <sys/disp.h>
  53 #include <sys/vm.h>
  54 #include <sys/mman.h>
  55 #include <sys/vnode.h>
  56 #include <sys/cred.h>
  57 #include <sys/exec.h>
  58 #include <sys/exechdr.h>
  59 #include <sys/debug.h>
  60 #include <sys/vmsystm.h>
  61 #include <sys/swap.h>
  62 #include <sys/dumphdr.h>
  63 #include <sys/random.h>
  64
  65 #include <vm/hat.h>
  66 #include <vm/as.h>
  67 #include <vm/seg.h>
  68 #include <vm/seg_kp.h>
  69 #include <vm/seg_vn.h>
  70 #include <vm/page.h>
  71 #include <vm/seg_kmem.h>
  72 #include <vm/seg_kpm.h>
  73 #include <vm/vm_dep.h>
  74
  75 #include <sys/cpu.h>
  76 #include <sys/vm_machparam.h>
  77 #include <sys/memlist.h>
  78 #include <sys/bootconf.h> /* XXX the memlist stuff belongs in memlist_plat.h */
  79 #include <vm/hat_i86.h>
  80 #include <sys/x86_archext.h>
  81 #include <sys/elf_386.h>
  82 #include <sys/cmn_err.h>
  83 #include <sys/archsystm.h>
  84 #include <sys/machsystm.h>
  85 #include <sys/secflags.h>
  86
  87 #include <sys/vtrace.h>
  88 #include <sys/ddidmareq.h>
  89 #include <sys/promif.h>
  90 #include <sys/memnode.h>
  91 #include <sys/stack.h>
  92 #include <util/qsort.h>
  93 #include <sys/taskq.h>
  94
  95 #ifdef __xpv
  96
  97 #include <sys/hypervisor.h>
  98 #include <sys/xen_mmu.h>
  99 #include <sys/balloon_impl.h>
 100
 101 /*
 102  * domain 0 pages usable for DMA are kept pre-allocated and kept in
 103  * distinct lists, ordered by increasing mfn.
 104  */
 105 static kmutex_t io_pool_lock;
 106 static kmutex_t contig_list_lock;
 107 static page_t *io_pool_4g;      /* pool for 32 bit dma limited devices */
 108 static page_t *io_pool_16m;     /* pool for 24 bit dma limited legacy devices */
 109 static long io_pool_cnt;
 110 static long io_pool_cnt_max = 0;
 111 #define DEFAULT_IO_POOL_MIN     128
 112 static long io_pool_cnt_min = DEFAULT_IO_POOL_MIN;
 113 static long io_pool_cnt_lowater = 0;
 114 static long io_pool_shrink_attempts; /* how many times did we try to shrink */
 115 static long io_pool_shrinks;    /* how many times did we really shrink */
 116 static long io_pool_grows;      /* how many times did we grow */
 117 static mfn_t start_mfn = 1;
 118 static caddr_t io_pool_kva;     /* use to alloc pages when needed */
 119
 120 static int create_contig_pfnlist(uint_t);
 121
 122 /*
 123  * percentage of phys mem to hold in the i/o pool
 124  */
 125 #define DEFAULT_IO_POOL_PCT     2
 126 static long io_pool_physmem_pct = DEFAULT_IO_POOL_PCT;
 127 static void page_io_pool_sub(page_t **, page_t *, page_t *);
 128 int ioalloc_dbg = 0;
 129
 130 #endif /* __xpv */
 131
 132 uint_t vac_colors = 1;
 133
 134 int largepagesupport = 0;
 135 extern uint_t page_create_new;
 136 extern uint_t page_create_exists;
 137 extern uint_t page_create_putbacks;
 138 /*
 139  * Allow users to disable the kernel's use of SSE.
 140  */
 141 extern int use_sse_pagecopy, use_sse_pagezero;
 142
  143 /*
  144  * combined memory ranges from mnode and memranges[] to manage single
  145  * mnode/mtype dimension in the page lists.
  146  *
 * One mnoderange_t describes one slot of the mnoderanges[] array; the
 * slot index is what the rest of this file calls an "mtype" (see
 * MTYPE_FREEMEM(), which indexes mnoderanges[] by mtype).
  146  */
  147 typedef struct {
  148         pfn_t   mnr_pfnlo;              /* presumably lowest pfn covered; confirm */
  149         pfn_t   mnr_pfnhi;              /* presumably highest pfn covered; confirm */
  150         int     mnr_mnode;              /* presumably owning memory node; confirm */
  151         int     mnr_memrange;           /* index into memranges[] */
  152         int     mnr_next;               /* next lower PA mnoderange */
  153         int     mnr_exists;             /* NOTE(review): looks like an in-use flag; confirm */
  154         /* maintain page list stats */
  155         pgcnt_t mnr_mt_clpgcnt;         /* cache list cnt */
  156         pgcnt_t mnr_mt_flpgcnt[MMU_PAGE_SIZES]; /* free list cnt per szc */
  157         pgcnt_t mnr_mt_totcnt;          /* sum of cache and free lists; read by MTYPE_FREEMEM() */
  158 #ifdef DEBUG
        /* The per-size-code statistics below exist only in DEBUG builds. */
  159         struct mnr_mts {                /* mnode/mtype szc stats */
  160                 pgcnt_t mnr_mts_pgcnt;
  161                 int     mnr_mts_colors;
  162                 pgcnt_t *mnr_mtsc_pgcnt;
  163         }       *mnr_mts;
  164 #endif
  165 } mnoderange_t;
 166
 167 #define MEMRANGEHI(mtype)                                               \
 168         ((mtype > 0) ? memranges[mtype - 1] - 1: physmax)
 169 #define MEMRANGELO(mtype)       (memranges[mtype])
 170
 171 #define MTYPE_FREEMEM(mt)       (mnoderanges[mt].mnr_mt_totcnt)
 172
 173 /*
  174  * As the PC architecture evolved, memory was clumped into several
 175  * ranges for various historical I/O devices to do DMA.
 176  * < 16Meg - ISA bus
 177  * < 2Gig - ???
 178  * < 4Gig - PCI bus or drivers that don't understand PAE mode
 179  *
 180  * These are listed in reverse order, so that we can skip over unused
 181  * ranges on machines with small memories.
 182  *
 183  * For now under the Hypervisor, we'll only ever have one memrange.
 184  */
 185 #define PFN_4GIG        0x100000
 186 #define PFN_16MEG       0x1000
 187 /* Indices into the memory range (arch_memranges) array. */
 188 #define MRI_4G          0
 189 #define MRI_2G          1
 190 #define MRI_16M         2
 191 #define MRI_0           3
 192 static pfn_t arch_memranges[NUM_MEM_RANGES] = {
 193     PFN_4GIG,   /* pfn range for 4G and above */
 194     0x80000,    /* pfn range for 2G-4G */
 195     PFN_16MEG,  /* pfn range for 16M-2G */
 196     0x00000,    /* pfn range for 0-16M */
 197 };
 198 pfn_t *memranges = &arch_memranges[0];
 199 int nranges = NUM_MEM_RANGES;
 200
 201 /*
 202  * This combines mem_node_config and memranges into one data
 203  * structure to be used for page list management.
 204  */
 205 static mnoderange_t *mnoderanges;
 206 static int mnoderangecnt;
 207 static int mtype4g;
 208 static int mtype16m;
 209 static int mtypetop;
 210
 211 /*
 212  * 4g memory management variables for systems with more than 4g of memory:
 213  *
 214  * physical memory below 4g is required for 32bit dma devices and, currently,
 215  * for kmem memory. On systems with more than 4g of memory, the pool of memory
 216  * below 4g can be depleted without any paging activity given that there is
 217  * likely to be sufficient memory above 4g.
 218  *
 219  * physmax4g is set true if the largest pfn is over 4g. The rest of the
 220  * 4g memory management code is enabled only when physmax4g is true.
 221  *
 222  * maxmem4g is the count of the maximum number of pages on the page lists
  223  * with physical addresses below 4g. It can be a lot less than 4g given that
 224  * BIOS may reserve large chunks of space below 4g for hot plug pci devices,
 225  * agp aperture etc.
 226  *
 227  * freemem4g maintains the count of the number of available pages on the
 228  * page lists with physical addresses below 4g.
 229  *
 230  * DESFREE4G specifies the desired amount of below 4g memory. It defaults to
 231  * 6% (desfree4gshift = 4) of maxmem4g.
 232  *
 233  * RESTRICT4G_ALLOC returns true if freemem4g falls below DESFREE4G
 234  * and the amount of physical memory above 4g is greater than freemem4g.
 235  * In this case, page_get_* routines will restrict below 4g allocations
 236  * for requests that don't specifically require it.
 237  */
 238
 239 #define DESFREE4G       (maxmem4g >> desfree4gshift)
 240
 241 #define RESTRICT4G_ALLOC                                        \
 242         (physmax4g && (freemem4g < DESFREE4G) && ((freemem4g << 1) < freemem))
 243
 244 static pgcnt_t  maxmem4g;
 245 static pgcnt_t  freemem4g;
 246 static int      physmax4g;
 247 static int      desfree4gshift = 4;     /* maxmem4g shift to derive DESFREE4G */
 248
 249 /*
 250  * 16m memory management:
 251  *
 252  * reserve some amount of physical memory below 16m for legacy devices.
 253  *
  254  * RESTRICT16M_ALLOC returns true if there are sufficient free pages above
 255  * 16m or if the 16m pool drops below DESFREE16M.
 256  *
 257  * In this case, general page allocations via page_get_{free,cache}list
 258  * routines will be restricted from allocating from the 16m pool. Allocations
 259  * that require specific pfn ranges (page_get_anylist) and PG_PANIC allocations
 260  * are not restricted.
 261  */
 262
 263 #define FREEMEM16M      MTYPE_FREEMEM(mtype16m)
 264 #define DESFREE16M      desfree16m
 265 #define RESTRICT16M_ALLOC(freemem, pgcnt, flags) \
 266         (mtype16m != -1 && (freemem != 0) && ((flags & PG_PANIC) == 0) && \
 267             ((freemem >= (FREEMEM16M)) || \
 268             (FREEMEM16M  < (DESFREE16M + pgcnt))))
 269
 270 static pgcnt_t  desfree16m = 0x380;
 271
 272 /*
 273  * This can be patched via /etc/system to allow old non-PAE aware device
 274  * drivers to use kmem_alloc'd memory on 32 bit systems with > 4Gig RAM.
 275  */
 276 int restricted_kmemalloc = 0;
 277
 278 #ifdef VM_STATS
 279 struct {
 280         ulong_t pga_alloc;
 281         ulong_t pga_notfullrange;
 282         ulong_t pga_nulldmaattr;
 283         ulong_t pga_allocok;
 284         ulong_t pga_allocfailed;
 285         ulong_t pgma_alloc;
 286         ulong_t pgma_allocok;
 287         ulong_t pgma_allocfailed;
 288         ulong_t pgma_allocempty;
 289 } pga_vmstats;
 290 #endif
 291
 292 uint_t mmu_page_sizes;
 293
 294 /* How many page sizes the users can see */
 295 uint_t mmu_exported_page_sizes;
 296
 297 /* page sizes that legacy applications can see */
 298 uint_t mmu_legacy_page_sizes;
 299
 300 /*
 301  * Number of pages in 1 GB.  Don't enable automatic large pages if we have
 302  * fewer than this many pages.
 303  */
 304 pgcnt_t shm_lpg_min_physmem = 1 << (30 - MMU_PAGESHIFT);
 305 pgcnt_t privm_lpg_min_physmem = 1 << (30 - MMU_PAGESHIFT);
 306
 307 /*
 308  * Maximum and default segment size tunables for user private
 309  * and shared anon memory, and user text and initialized data.
 310  * These can be patched via /etc/system to allow large pages
 311  * to be used for mapping application private and shared anon memory.
 312  */
 313 size_t mcntl0_lpsize = MMU_PAGESIZE;
 314 size_t max_uheap_lpsize = MMU_PAGESIZE;
 315 size_t default_uheap_lpsize = MMU_PAGESIZE;
 316 size_t max_ustack_lpsize = MMU_PAGESIZE;
 317 size_t default_ustack_lpsize = MMU_PAGESIZE;
 318 size_t max_privmap_lpsize = MMU_PAGESIZE;
 319 size_t max_uidata_lpsize = MMU_PAGESIZE;
 320 size_t max_utext_lpsize = MMU_PAGESIZE;
 321 size_t max_shm_lpsize = MMU_PAGESIZE;
 322
 323
 324 /*
 325  * initialized by page_coloring_init().
 326  */
 327 uint_t  page_colors;
 328 uint_t  page_colors_mask;
 329 uint_t  page_coloring_shift;
 330 int     cpu_page_colors;
 331 static uint_t   l2_colors;
 332
 333 /*
 334  * Page freelists and cachelists are dynamically allocated once mnoderangecnt
 335  * and page_colors are calculated from the l2 cache n-way set size.  Within a
 336  * mnode range, the page freelist and cachelist are hashed into bins based on
 337  * color. This makes it easier to search for a page within a specific memory
 338  * range.
 339  */
 340 #define PAGE_COLORS_MIN 16
 341
 342 page_t ****page_freelists;
 343 page_t ***page_cachelists;
 344
 345
 346 /*
 347  * Used by page layer to know about page sizes
 348  */
 349 hw_pagesize_t hw_page_array[MAX_NUM_LEVEL + 1];
 350
 351 kmutex_t        *fpc_mutex[NPC_MUTEX];
 352 kmutex_t        *cpc_mutex[NPC_MUTEX];
 353
 354 /* Lock to protect mnoderanges array for memory DR operations. */
 355 static kmutex_t mnoderange_lock;
 356
  357 /*
  358  * Only let one thread at a time try to coalesce large pages, to
  359  * prevent them from working against each other.
  360  */
  361 static kmutex_t contig_lock;
/*
 * CONTIG_LOCK()/CONTIG_UNLOCK() enter and exit contig_lock.
 *
 * NOTE(review): each expansion already ends in ';'.  Written in the usual
 * "CONTIG_LOCK();" form this yields an extra empty statement, so the macros
 * are not safe as the sole body of an unbraced if/else.  Callers are outside
 * this view; confirm their usage before dropping the trailing semicolons.
 */
  362 #define CONTIG_LOCK()   mutex_enter(&contig_lock);
  363 #define CONTIG_UNLOCK() mutex_exit(&contig_lock);
 364
 365 #define PFN_16M         (mmu_btop((uint64_t)0x1000000))
 366
 367 caddr_t
 368 i86devmap(pfn_t pf, pgcnt_t pgcnt, uint_t prot)
 369 {
 370         caddr_t addr;
 371         caddr_t addr1;
 372         page_t *pp;
 373
 374         addr1 = addr = vmem_alloc(heap_arena, mmu_ptob(pgcnt), VM_SLEEP);
 375
 376         for (; pgcnt != 0; addr += MMU_PAGESIZE, ++pf, --pgcnt) {
 377                 pp = page_numtopp_nolock(pf);
 378                 if (pp == NULL) {
 379                         hat_devload(kas.a_hat, addr, MMU_PAGESIZE, pf,
 380                             prot | HAT_NOSYNC, HAT_LOAD_LOCK);
 381                 } else {
 382                         hat_memload(kas.a_hat, addr, pp,
 383                             prot | HAT_NOSYNC, HAT_LOAD_LOCK);
 384                 }
 385         }
 386
 387         return (addr1);
 388 }
 389
 390 /*
 391  * This routine is like page_numtopp, but accepts only free pages, which
 392  * it allocates (unfrees) and returns with the exclusive lock held.
 393  * It is used by machdep.c/dma_init() to find contiguous free pages.
 394  */
 395 page_t *
 396 page_numtopp_alloc(pfn_t pfnum)
 397 {
 398         page_t *pp;
 399
 400 retry:
 401         pp = page_numtopp_nolock(pfnum);
 402         if (pp == NULL) {
 403                 return (NULL);
 404         }
 405
 406         if (!page_trylock(pp, SE_EXCL)) {
 407                 return (NULL);
 408         }
 409
 410         if (page_pptonum(pp) != pfnum) {
 411                 page_unlock(pp);
 412                 goto retry;
 413         }
 414
 415         if (!PP_ISFREE(pp)) {
 416                 page_unlock(pp);
 417                 return (NULL);
 418         }
 419         if (pp->p_szc) {
 420                 page_demote_free_pages(pp);
 421                 page_unlock(pp);
 422                 goto retry;
 423         }
 424
 425         /* If associated with a vnode, destroy mappings */
 426
 427         if (pp->p_vnode) {
 428
 429                 page_destroy_free(pp);
 430
 431                 if (!page_lock(pp, SE_EXCL, (kmutex_t *)NULL, P_NO_RECLAIM)) {
 432                         return (NULL);
 433                 }
 434
 435                 if (page_pptonum(pp) != pfnum) {
 436                         page_unlock(pp);
 437                         goto retry;
 438                 }
 439         }
 440
 441         if (!PP_ISFREE(pp)) {
 442                 page_unlock(pp);
 443                 return (NULL);
 444         }
 445
 446         if (!page_reclaim(pp, (kmutex_t *)NULL))
 447                 return (NULL);
 448
 449         return (pp);
 450 }
 451
 452 /*
 453  * Return the optimum page size for a given mapping
 454  */
 455 /*ARGSUSED*/
 456 size_t
 457 map_pgsz(int maptype, struct proc *p, caddr_t addr, size_t len, int memcntl)
 458 {
 459         level_t l = 0;
 460         size_t pgsz = MMU_PAGESIZE;
 461         size_t max_lpsize;
 462         uint_t mszc;
 463
 464         ASSERT(maptype != MAPPGSZ_VA);
 465
 466         if (maptype != MAPPGSZ_ISM && physmem < privm_lpg_min_physmem) {
 467                 return (MMU_PAGESIZE);
 468         }
 469
 470         switch (maptype) {
 471         case MAPPGSZ_HEAP:
 472         case MAPPGSZ_STK:
 473                 max_lpsize = memcntl ? mcntl0_lpsize : (maptype ==
 474                     MAPPGSZ_HEAP ? max_uheap_lpsize : max_ustack_lpsize);
 475                 if (max_lpsize == MMU_PAGESIZE) {
 476                         return (MMU_PAGESIZE);
 477                 }
 478                 if (len == 0) {
 479                         len = (maptype == MAPPGSZ_HEAP) ? p->p_brkbase +
 480                             p->p_brksize - p->p_bssbase : p->p_stksize;
 481                 }
 482                 len = (maptype == MAPPGSZ_HEAP) ? MAX(len,
 483                     default_uheap_lpsize) : MAX(len, default_ustack_lpsize);
 484
 485                 /*
 486                  * use the pages size that best fits len
 487                  */
 488                 for (l = mmu.umax_page_level; l > 0; --l) {
 489                         if (LEVEL_SIZE(l) > max_lpsize || len < LEVEL_SIZE(l)) {
 490                                 continue;
 491                         } else {
 492                                 pgsz = LEVEL_SIZE(l);
 493                         }
 494                         break;
 495                 }
 496
 497                 mszc = (maptype == MAPPGSZ_HEAP ? p->p_brkpageszc :
 498                     p->p_stkpageszc);
 499                 if (addr == 0 && (pgsz < hw_page_array[mszc].hp_size)) {
 500                         pgsz = hw_page_array[mszc].hp_size;
 501                 }
 502                 return (pgsz);
 503
 504         case MAPPGSZ_ISM:
 505                 for (l = mmu.umax_page_level; l > 0; --l) {
 506                         if (len >= LEVEL_SIZE(l))
 507                                 return (LEVEL_SIZE(l));
 508                 }
 509                 return (LEVEL_SIZE(0));
 510         }
 511         return (pgsz);
 512 }
 513
 514 static uint_t
 515 map_szcvec(caddr_t addr, size_t size, uintptr_t off, size_t max_lpsize,
 516     size_t min_physmem)
 517 {
 518         caddr_t eaddr = addr + size;
 519         uint_t szcvec = 0;
 520         caddr_t raddr;
 521         caddr_t readdr;
 522         size_t  pgsz;
 523         int i;
 524
 525         if (physmem < min_physmem || max_lpsize <= MMU_PAGESIZE) {
 526                 return (0);
 527         }
 528
 529         for (i = mmu_exported_page_sizes - 1; i > 0; i--) {
 530                 pgsz = page_get_pagesize(i);
 531                 if (pgsz > max_lpsize) {
 532                         continue;
 533                 }
 534                 raddr = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
 535                 readdr = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);
 536                 if (raddr < addr || raddr >= readdr) {
 537                         continue;
 538                 }
 539                 if (P2PHASE((uintptr_t)addr ^ off, pgsz)) {
 540                         continue;
 541                 }
 542                 /*
 543                  * Set szcvec to the remaining page sizes.
 544                  */
 545                 szcvec = ((1 << (i + 1)) - 1) & ~1;
 546                 break;
 547         }
 548         return (szcvec);
 549 }
 550
 551 /*
 552  * Return a bit vector of large page size codes that
 553  * can be used to map [addr, addr + len) region.
 554  */
 555 /*ARGSUSED*/
 556 uint_t
 557 map_pgszcvec(caddr_t addr, size_t size, uintptr_t off, int flags, int type,
 558     int memcntl)
 559 {
 560         size_t max_lpsize = mcntl0_lpsize;
 561
 562         if (mmu.max_page_level == 0)
 563                 return (0);
 564
 565         if (flags & MAP_TEXT) {
 566                 if (!memcntl)
 567                         max_lpsize = max_utext_lpsize;
 568                 return (map_szcvec(addr, size, off, max_lpsize,
 569                     shm_lpg_min_physmem));
 570
 571         } else if (flags & MAP_INITDATA) {
 572                 if (!memcntl)
 573                         max_lpsize = max_uidata_lpsize;
 574                 return (map_szcvec(addr, size, off, max_lpsize,
 575                     privm_lpg_min_physmem));
 576
 577         } else if (type == MAPPGSZC_SHM) {
 578                 if (!memcntl)
 579                         max_lpsize = max_shm_lpsize;
 580                 return (map_szcvec(addr, size, off, max_lpsize,
 581                     shm_lpg_min_physmem));
 582
 583         } else if (type == MAPPGSZC_HEAP) {
 584                 if (!memcntl)
 585                         max_lpsize = max_uheap_lpsize;
 586                 return (map_szcvec(addr, size, off, max_lpsize,
 587                     privm_lpg_min_physmem));
 588
 589         } else if (type == MAPPGSZC_STACK) {
 590                 if (!memcntl)
 591                         max_lpsize = max_ustack_lpsize;
 592                 return (map_szcvec(addr, size, off, max_lpsize,
 593                     privm_lpg_min_physmem));
 594
 595         } else {
 596                 if (!memcntl)
 597                         max_lpsize = max_privmap_lpsize;
 598                 return (map_szcvec(addr, size, off, max_lpsize,
 599                     privm_lpg_min_physmem));
 600         }
 601 }
 602
 603 /*
 604  * Handle a pagefault.
 605  */
 606 faultcode_t
 607 pagefault(
 608         caddr_t addr,
 609         enum fault_type type,
 610         enum seg_rw rw,
 611         int iskernel)
 612 {
 613         struct as *as;
 614         struct hat *hat;
 615         struct proc *p;
 616         kthread_t *t;
 617         faultcode_t res;
 618         caddr_t base;
 619         size_t len;
 620         int err;
 621         int mapped_red;
 622         uintptr_t ea;
 623
 624         ASSERT_STACK_ALIGNED();
 625
 626         if (INVALID_VADDR(addr))
 627                 return (FC_NOMAP);
 628
 629         mapped_red = segkp_map_red();
 630
 631         if (iskernel) {
 632                 as = &kas;
 633                 hat = as->a_hat;
 634         } else {
 635                 t = curthread;
 636                 p = ttoproc(t);
 637                 as = p->p_as;
 638                 hat = as->a_hat;
 639         }
 640
 641         /*
 642          * Dispatch pagefault.
 643          */
 644         res = as_fault(hat, as, addr, 1, type, rw);
 645
 646         /*
 647          * If this isn't a potential unmapped hole in the user's
 648          * UNIX data or stack segments, just return status info.
 649          */
 650         if (res != FC_NOMAP || iskernel)
 651                 goto out;
 652
 653         /*
 654          * Check to see if we happened to faulted on a currently unmapped
 655          * part of the UNIX data or stack segments.  If so, create a zfod
 656          * mapping there and then try calling the fault routine again.
 657          */
 658         base = p->p_brkbase;
 659         len = p->p_brksize;
 660
 661         if (addr < base || addr >= base + len) {                /* data seg? */
 662                 base = (caddr_t)p->p_usrstack - p->p_stksize;
 663                 len = p->p_stksize;
 664                 if (addr < base || addr >= p->p_usrstack) {     /* stack seg? */
 665                         /* not in either UNIX data or stack segments */
 666                         res = FC_NOMAP;
 667                         goto out;
 668                 }
 669         }
 670
 671         /*
 672          * the rest of this function implements a 3.X 4.X 5.X compatibility
 673          * This code is probably not needed anymore
 674          */
 675         if (p->p_model == DATAMODEL_ILP32) {
 676
 677                 /* expand the gap to the page boundaries on each side */
 678                 ea = P2ROUNDUP((uintptr_t)base + len, MMU_PAGESIZE);
 679                 base = (caddr_t)P2ALIGN((uintptr_t)base, MMU_PAGESIZE);
 680                 len = ea - (uintptr_t)base;
 681
 682                 as_rangelock(as);
 683                 if (as_gap(as, MMU_PAGESIZE, &base, &len, AH_CONTAIN, addr) ==
 684                     0) {
 685                         err = as_map(as, base, len, segvn_create, zfod_argsp);
 686                         as_rangeunlock(as);
 687                         if (err) {
 688                                 res = FC_MAKE_ERR(err);
 689                                 goto out;
 690                         }
 691                 } else {
 692                         /*
 693                          * This page is already mapped by another thread after
 694                          * we returned from as_fault() above.  We just fall
 695                          * through as_fault() below.
 696                          */
 697                         as_rangeunlock(as);
 698                 }
 699
 700                 res = as_fault(hat, as, addr, 1, F_INVAL, rw);
 701         }
 702
 703 out:
 704         if (mapped_red)
 705                 segkp_unmap_red();
 706
 707         return (res);
 708 }
 709
 710 void
 711 map_addr(caddr_t *addrp, size_t len, offset_t off, int vacalign, uint_t flags)
 712 {
 713         struct proc *p = curproc;
 714         caddr_t userlimit = (flags & _MAP_LOW32) ?
 715             (caddr_t)_userlimit32 : p->p_as->a_userlimit;
 716
 717         map_addr_proc(addrp, len, off, vacalign, userlimit, curproc, flags);
 718 }
 719
 720 /*ARGSUSED*/
 721 int
 722 map_addr_vacalign_check(caddr_t addr, u_offset_t off)
 723 {
 724         return (0);
 725 }
 726
 727 /*
 728  * The maximum amount a randomized mapping will be slewed.  We should perhaps
 729  * arrange things so these tunables can be separate for mmap, mmapobj, and
 730  * ld.so
 731  */
 732 size_t aslr_max_map_skew = 256 * 1024 * 1024; /* 256MB */
 733
 734 /*
 735  * map_addr_proc() is the routine called when the system is to
 736  * choose an address for the user.  We will pick an address
 737  * range which is the highest available below userlimit.
 738  *
 739  * Every mapping will have a redzone of a single page on either side of
 740  * the request. This is done to leave one page unmapped between segments.
 741  * This is not required, but it's useful for the user because if their
 742  * program strays across a segment boundary, it will catch a fault
 743  * immediately making debugging a little easier.  Currently the redzone
 744  * is mandatory.
 745  *
 746  * addrp is a value/result parameter.
 747  *      On input it is a hint from the user to be used in a completely
 748  *      machine dependent fashion.  We decide to completely ignore this hint.
 749  *      If MAP_ALIGN was specified, addrp contains the minimal alignment, which
 750  *      must be some "power of two" multiple of pagesize.
 751  *
 752  *      On output it is NULL if no address can be found in the current
 753  *      processes address space or else an address that is currently
 754  *      not mapped for len bytes with a page of red zone on either side.
 755  *
 756  *      vacalign is not needed on x86 (it's for viturally addressed caches)
 757  */
 758 /*ARGSUSED*/
 759 void
 760 map_addr_proc(
 761         caddr_t *addrp,
 762         size_t len,
 763         offset_t off,
 764         int vacalign,
 765         caddr_t userlimit,
 766         struct proc *p,
 767         uint_t flags)
 768 {
 769         struct as *as = p->p_as;
 770         caddr_t addr;
 771         caddr_t base;
 772         size_t slen;
 773         size_t align_amount;
 774
 775         ASSERT32(userlimit == as->a_userlimit);
 776
 777         base = p->p_brkbase;
 778         if (p->p_model == DATAMODEL_NATIVE) {
 779                 if (userlimit < as->a_userlimit) {
 780                         /*
 781                          * This happens when a program wants to map
 782                          * something in a range that's accessible to a
 783                          * program in a smaller address space.  For example,
 784                          * a 64-bit program calling mmap32(2) to guarantee
 785                          * that the returned address is below 4Gbytes.
 786                          */
 787                         ASSERT((uintptr_t)userlimit < ADDRESS_C(0xffffffff));
 788
 789                         if (userlimit > base)
 790                                 slen = userlimit - base;
 791                         else {
 792                                 *addrp = NULL;
 793                                 return;
 794                         }
 795                 } else {
 796                         /*
 797                          * With the stack positioned at a higher address than
 798                          * the heap for 64-bit processes, it is necessary to be
 799                          * mindful of its location and potential size.
 800                          *
 801                          * Unallocated space above the top of the stack (that
 802                          * is, at a lower address) but still within the bounds
 803                          * of the stack limit should be considered unavailable.
 804                          *
 805                          * As the 64-bit stack guard is mapped in immediately
 806                          * adjacent to the stack limit boundary, this prevents
 807                          * new mappings from having accidentally dangerous
 808                          * proximity to the stack.
 809                          */
 810                         slen = p->p_usrstack - base -
 811                             ((p->p_stk_ctl + PAGEOFFSET) & PAGEMASK);
 812                 }
 813         } else {
 814                 slen = userlimit - base;
 815         }
 816
 817         /* Make len be a multiple of PAGESIZE */
 818         len = (len + PAGEOFFSET) & PAGEMASK;
 819
 820         /*
 821          * figure out what the alignment should be
 822          *
 823          * XX64 -- is there an ELF_AMD64_MAXPGSZ or is it the same????
 824          */
 825         if (len <= ELF_386_MAXPGSZ) {
 826                 /*
 827                  * Align virtual addresses to ensure that ELF shared libraries
 828                  * are mapped with the appropriate alignment constraints by
 829                  * the run-time linker.
 830                  */
 831                 align_amount = ELF_386_MAXPGSZ;
 832         } else {
 833                 /*
 834                  * For 32-bit processes, only those which have specified
 835                  * MAP_ALIGN and an addr will be aligned on a larger page size.
 836                  * Not doing so can potentially waste up to 1G of process
 837                  * address space.
 838                  */
 839                 int lvl = (p->p_model == DATAMODEL_ILP32) ? 1 :
 840                     mmu.umax_page_level;
 841
 842                 while (lvl && len < LEVEL_SIZE(lvl))
 843                         --lvl;
 844
 845                 align_amount = LEVEL_SIZE(lvl);
 846         }
 847         if ((flags & MAP_ALIGN) && ((uintptr_t)*addrp > align_amount))
 848                 align_amount = (uintptr_t)*addrp;
 849
 850         ASSERT(ISP2(align_amount));
 851         ASSERT(align_amount == 0 || align_amount >= PAGESIZE);
 852
 853         off = off & (align_amount - 1);
 854
 855         /*
 856          * Look for a large enough hole starting below userlimit.
 857          * After finding it, use the upper part.
 858          */
 859         if (as_gap_aligned(as, len, &base, &slen, AH_HI, NULL, align_amount,
 860             PAGESIZE, off) == 0) {
 861                 caddr_t as_addr;
 862
 863                 /*
 864                  * addr is the highest possible address to use since we have
 865                  * a PAGESIZE redzone at the beginning and end.
 866                  */
 867                 addr = base + slen - (PAGESIZE + len);
 868                 as_addr = addr;
 869                 /*
 870                  * Round address DOWN to the alignment amount and
 871                  * add the offset in.
 872                  * If addr is greater than as_addr, len would not be large
 873                  * enough to include the redzone, so we must adjust down
 874                  * by the alignment amount.
 875                  */
 876                 addr = (caddr_t)((uintptr_t)addr & (~(align_amount - 1)));
 877                 addr += (uintptr_t)off;
 878                 if (addr > as_addr) {
 879                         addr -= align_amount;
 880                 }
 881
 882                 /*
 883                  * If randomization is requested, slew the allocation
 884                  * backwards, within the same gap, by a random amount.
 885                  */
 886                 if (flags & _MAP_RANDOMIZE) {
 887                         uint32_t slew;
 888
 889                         (void) random_get_pseudo_bytes((uint8_t *)&slew,
 890                             sizeof (slew));
 891
 892                         slew = slew % MIN(aslr_max_map_skew, (addr - base));
 893                         addr -= P2ALIGN(slew, align_amount);
 894                 }
 895
 896                 ASSERT(addr > base);
 897                 ASSERT(addr + len < base + slen);
 898                 ASSERT(((uintptr_t)addr & (align_amount - 1)) ==
 899                     ((uintptr_t)(off)));
 900                 *addrp = addr;
 901         } else {
 902                 *addrp = NULL;  /* no more virtual space */
 903         }
 904 }
 905
/*
 * Diagnostic counter: incremented by valid_va_range_aligned() each time the
 * end of the range it is given (base + length) wraps past the top of the
 * address space and the length has to be cut back.
 */
int valid_va_range_aligned_wraparound;
 907
 908 /*
 909  * Determine whether [*basep, *basep + *lenp) contains a mappable range of
 910  * addresses at least "minlen" long, where the base of the range is at "off"
 911  * phase from an "align" boundary and there is space for a "redzone"-sized
 912  * redzone on either side of the range.  On success, 1 is returned and *basep
 913  * and *lenp are adjusted to describe the acceptable range (including
 914  * the redzone).  On failure, 0 is returned.
 915  */
 916 /*ARGSUSED3*/
 917 int
 918 valid_va_range_aligned(caddr_t *basep, size_t *lenp, size_t minlen, int dir,
 919     size_t align, size_t redzone, size_t off)
 920 {
 921         uintptr_t hi, lo;
 922         size_t tot_len;
 923
 924         ASSERT(align == 0 ? off == 0 : off < align);
 925         ASSERT(ISP2(align));
 926         ASSERT(align == 0 || align >= PAGESIZE);
 927
 928         lo = (uintptr_t)*basep;
 929         hi = lo + *lenp;
 930         tot_len = minlen + 2 * redzone; /* need at least this much space */
 931
 932         /*
 933          * If hi rolled over the top, try cutting back.
 934          */
 935         if (hi < lo) {
 936                 *lenp = 0UL - lo - 1UL;
 937                 /* See if this really happens. If so, then we figure out why */
 938                 valid_va_range_aligned_wraparound++;
 939                 hi = lo + *lenp;
 940         }
 941         if (*lenp < tot_len) {
 942                 return (0);
 943         }
 944
 945         /*
 946          * Deal with a possible hole in the address range between
 947          * hole_start and hole_end that should never be mapped.
 948          */
 949         if (lo < hole_start) {
 950                 if (hi > hole_start) {
 951                         if (hi < hole_end) {
 952                                 hi = hole_start;
 953                         } else {
 954                                 /* lo < hole_start && hi >= hole_end */
 955                                 if (dir == AH_LO) {
 956                                         /*
 957                                          * prefer lowest range
 958                                          */
 959                                         if (hole_start - lo >= tot_len)
 960                                                 hi = hole_start;
 961                                         else if (hi - hole_end >= tot_len)
 962                                                 lo = hole_end;
 963                                         else
 964                                                 return (0);
 965                                 } else {
 966                                         /*
 967                                          * prefer highest range
 968                                          */
 969                                         if (hi - hole_end >= tot_len)
 970                                                 lo = hole_end;
 971                                         else if (hole_start - lo >= tot_len)
 972                                                 hi = hole_start;
 973                                         else
 974                                                 return (0);
 975                                 }
 976                         }
 977                 }
 978         } else {
 979                 /* lo >= hole_start */
 980                 if (hi < hole_end)
 981                         return (0);
 982                 if (lo < hole_end)
 983                         lo = hole_end;
 984         }
 985
 986         if (hi - lo < tot_len)
 987                 return (0);
 988
 989         if (align > 1) {
 990                 uintptr_t tlo = lo + redzone;
 991                 uintptr_t thi = hi - redzone;
 992                 tlo = (uintptr_t)P2PHASEUP(tlo, align, off);
 993                 if (tlo < lo + redzone) {
 994                         return (0);
 995                 }
 996                 if (thi < tlo || thi - tlo < minlen) {
 997                         return (0);
 998                 }
 999         }
1000
1001         *basep = (caddr_t)lo;
1002         *lenp = hi - lo;
1003         return (1);
1004 }
1005
1006 /*
1007  * Determine whether [*basep, *basep + *lenp) contains a mappable range of
1008  * addresses at least "minlen" long.  On success, 1 is returned and *basep
1009  * and *lenp are adjusted to describe the acceptable range.  On failure, 0
1010  * is returned.
1011  */
1012 int
1013 valid_va_range(caddr_t *basep, size_t *lenp, size_t minlen, int dir)
1014 {
1015         return (valid_va_range_aligned(basep, lenp, minlen, dir, 0, 0, 0));
1016 }
1017
/*
 * Default to forbidding the first 64k of address space.  This protects most
 * reasonably sized structures from dereferences through NULL:
 *     ((foo_t *)0)->bar
 *
 * valid_usr_range() compares the start address of a range against this
 * value, and rejects the range only when the owning process has the
 * PROC_SEC_FORBIDNULLMAP security flag enabled.
 */
uintptr_t forbidden_null_mapping_sz = 0x10000;
1024
1025 /*
1026  * Determine whether [addr, addr+len] are valid user addresses.
1027  */
1028 /*ARGSUSED*/
1029 int
1030 valid_usr_range(caddr_t addr, size_t len, uint_t prot, struct as *as,
1031     caddr_t userlimit)
1032 {
1033         caddr_t eaddr = addr + len;
1034
1035         if (eaddr <= addr || addr >= userlimit || eaddr > userlimit)
1036                 return (RANGE_BADADDR);
1037
1038         if ((addr <= (caddr_t)forbidden_null_mapping_sz) &&
1039             as->a_proc != NULL &&
1040             secflag_enabled(as->a_proc, PROC_SEC_FORBIDNULLMAP))
1041                 return (RANGE_BADADDR);
1042
1043         /*
1044          * Check for the VA hole
1045          */
1046         if (eaddr > (caddr_t)hole_start && addr < (caddr_t)hole_end)
1047                 return (RANGE_BADADDR);
1048
1049         return (RANGE_OKAY);
1050 }
1051
1052 /*
1053  * Return 1 if the page frame is onboard memory, else 0.
1054  */
1055 int
1056 pf_is_memory(pfn_t pf)
1057 {
1058         if (pfn_is_foreign(pf))
1059                 return (0);
1060         return (address_in_memlist(phys_install, pfn_to_pa(pf), 1));
1061 }
1062
1063 /*
1064  * return the memrange containing pfn
1065  */
1066 int
1067 memrange_num(pfn_t pfn)
1068 {
1069         int n;
1070
1071         for (n = 0; n < nranges - 1; ++n) {
1072                 if (pfn >= memranges[n])
1073                         break;
1074         }
1075         return (n);
1076 }
1077
1078 /*
1079  * return the mnoderange containing pfn
1080  */
1081 /*ARGSUSED*/
1082 int
1083 pfn_2_mtype(pfn_t pfn)
1084 {
1085 #if defined(__xpv)
1086         return (0);
1087 #else
1088         int     n;
1089
1090         /* Always start from highest pfn and work our way down */
1091         for (n = mtypetop; n != -1; n = mnoderanges[n].mnr_next) {
1092                 if (pfn >= mnoderanges[n].mnr_pfnlo) {
1093                         break;
1094                 }
1095         }
1096         return (n);
1097 #endif
1098 }
1099
#if !defined(__xpv)
/*
 * is_contigpage_free:
 *      returns a page list of contiguous pages. It minimally has to return
 *      minctg pages. Caller determines minctg based on the scatter-gather
 *      list length.
 *
 *      pfnp is set to the next page frame to search on return.
 *
 *      Arguments:
 *        pfnp    in/out: first pfn to examine; advanced past the pfns that
 *                were examined, or up to the next segment boundary.
 *        pgcnt   in/out: number of pages still wanted.  Reduced by the
 *                number of pages returned on success; unchanged on failure.
 *        minctg  smallest acceptable run length, in pages.
 *        pfnseg  used as a mask of the pfn bits within a segment
 *                (pfnseg + 1 is used as the segment size when rounding up).
 *        iolock  when non-zero, page_io_lock() is taken on every page
 *                gathered.
 *
 *      Returns the gathered pages (each locked SE_EXCL and removed from the
 *      free or cache list), or NULL when fewer than minctg consecutive free
 *      pages could be taken starting at *pfnp.  On failure every page that
 *      was taken is put back on the free list and unlocked.
 */
static page_t *
is_contigpage_free(
        pfn_t *pfnp,
        pgcnt_t *pgcnt,
        pgcnt_t minctg,
        uint64_t pfnseg,
        int iolock)
{
        int     i = 0;
        pfn_t   pfn = *pfnp;
        page_t  *pp;
        page_t  *plist = NULL;

        /*
         * fail if pfn + minctg crosses a segment boundary.
         * Adjust for next starting pfn to begin at segment boundary.
         */

        if (((*pfnp + minctg - 1) & pfnseg) < (*pfnp & pfnseg)) {
                *pfnp = roundup(*pfnp, pfnseg + 1);
                return (NULL);
        }

        do {
retry:
                /*
                 * No page structure, a dump page, or a page whose lock is
                 * not immediately available ends the run.  The extra
                 * increment of *pfnp, together with the "*pfnp += i" after
                 * the loop, makes the next search start just past this pfn.
                 */
                pp = page_numtopp_nolock(pfn + i);
                if ((pp == NULL) || IS_DUMP_PAGE(pp) ||
                    (page_trylock(pp, SE_EXCL) == 0)) {
                        (*pfnp)++;
                        break;
                }
                /*
                 * The lookup was done without a lock; now that the page is
                 * locked, make sure it still represents pfn + i and look it
                 * up again if it does not.
                 */
                if (page_pptonum(pp) != pfn + i) {
                        page_unlock(pp);
                        goto retry;
                }

                /* a page that is in use ends the run as well */
                if (!(PP_ISFREE(pp))) {
                        page_unlock(pp);
                        (*pfnp)++;
                        break;
                }

                /*
                 * Take the page off whichever list it is on: a free page
                 * that is not aged comes off the cache list and is hashed
                 * out, an aged one comes off the free list.
                 */
                if (!PP_ISAGED(pp)) {
                        page_list_sub(pp, PG_CACHE_LIST);
                        page_hashout(pp, (kmutex_t *)NULL);
                } else {
                        page_list_sub(pp, PG_FREE_LIST);
                }

                if (iolock)
                        page_io_lock(pp);
                page_list_concat(&plist, &pp);

                /*
                 * exit loop when pgcnt satisfied or segment boundary reached.
                 */

        } while ((++i < *pgcnt) && ((pfn + i) & pfnseg));

        *pfnp += i;             /* set to next pfn to search */

        /* success: the run is long enough, hand it to the caller */
        if (i >= minctg) {
                *pgcnt -= i;
                return (plist);
        }

        /*
         * failure: minctg not satisfied.
         *
         * if next request crosses segment boundary, set next pfn
         * to search from the segment boundary.
         */
        if (((*pfnp + minctg - 1) & pfnseg) < (*pfnp & pfnseg))
                *pfnp = roundup(*pfnp, pfnseg + 1);

        /* clean up any pages already allocated */

        while (plist) {
                /* back onto the tail of the free list, then drop the locks */
                pp = plist;
                page_sub(&plist, pp);
                page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL);
                if (iolock)
                        page_io_unlock(pp);
                page_unlock(pp);
        }

        return (NULL);
}
#endif  /* !__xpv */
1198
/*
 * DEBUG-only sanity check on pages handed back by the allocator: walk the
 * first "cnt" pages of the list headed by "pp" (following p_next) and panic
 * if the machine address of any of them lies below dma_attr_addr_lo or at or
 * above dma_attr_addr_hi.  Nothing is checked when dma_attr is NULL.
 * In non-DEBUG kernels check_dma() compiles away to a no-op.
 */
#ifndef DEBUG
#define check_dma(a, b, c) (void)(0)
#else
static void
check_dma(ddi_dma_attr_t *dma_attr, page_t *pp, int cnt)
{
        if (dma_attr == NULL)
                return;

        for (; cnt > 0; cnt--, pp = pp->p_next) {
                if (pa_to_ma(pfn_to_pa(pp->p_pagenum)) <
                    dma_attr->dma_attr_addr_lo) {
                        panic("PFN (pp=%p) below dma_attr_addr_lo", (void *)pp);
                }
                if (pa_to_ma(pfn_to_pa(pp->p_pagenum)) >=
                    dma_attr->dma_attr_addr_hi) {
                        panic("PFN (pp=%p) above dma_attr_addr_hi", (void *)pp);
                }
        }
}
#endif
1222
#if !defined(__xpv)
/*
 * page_get_contigpage:
 *      Gather free pages for a request of *pgcnt pages as a limited number
 *      of physically contiguous runs, each obtained from
 *      is_contigpage_free().
 *
 *      When mattr is supplied, the pfn range searched, the number of runs
 *      allowed (dma_attr_sgllen), the segment mask (dma_attr_seg) and the
 *      starting alignment of each run are derived from it.  With a NULL
 *      mattr a single run of *pgcnt pages anywhere below physmax is
 *      required.
 *
 *      The search resumes from where the previous call stopped (static
 *      startpfn) unless this request needs a smaller minimum run than the
 *      previous one (or, with mattr, startpfn lies outside [lo, hi]), and
 *      it avoids starting below PFN_16M when the range extends above it.
 *      If the first pass from startpfn upwards fails, a second pass
 *      searches from lo up to startpfn.
 *
 *      The whole function runs between CONTIG_LOCK() and CONTIG_UNLOCK();
 *      the static search state is only touched inside that window.
 *
 *      Returns the list of pages gathered, with *pgcnt reduced to the
 *      number of pages still outstanding, or NULL on failure.  iolock is
 *      passed through to is_contigpage_free().
 */
static page_t *
page_get_contigpage(pgcnt_t *pgcnt, ddi_dma_attr_t *mattr, int iolock)
{
        pfn_t           pfn;
        int             sgllen;
        uint64_t        pfnseg;
        pgcnt_t         minctg;
        page_t          *pplist = NULL, *plist;
        uint64_t        lo, hi;
        pgcnt_t         pfnalign = 0;
        /* search state carried over from one call to the next */
        static pfn_t    startpfn;
        static pgcnt_t  lastctgcnt;
        uintptr_t       align;

        CONTIG_LOCK();

        if (mattr) {
                /* low bound rounded up to a whole page, high bound clamped */
                lo = mmu_btop((mattr->dma_attr_addr_lo + MMU_PAGEOFFSET));
                hi = mmu_btop(mattr->dma_attr_addr_hi);
                if (hi >= physmax)
                        hi = physmax - 1;
                sgllen = mattr->dma_attr_sgllen;
                pfnseg = mmu_btop(mattr->dma_attr_seg);

                /* pfnalign stays 0 unless more than page alignment is asked */
                align = maxbit(mattr->dma_attr_align, mattr->dma_attr_minxfer);
                if (align > MMU_PAGESIZE)
                        pfnalign = mmu_btop(align);

                /*
                 * in order to satisfy the request, must minimally
                 * acquire minctg contiguous pages
                 */
                minctg = howmany(*pgcnt, sgllen);

                ASSERT(hi >= lo);

                /*
                 * start from where last searched if the minctg >= lastctgcnt
                 */
                if (minctg < lastctgcnt || startpfn < lo || startpfn > hi)
                        startpfn = lo;
        } else {
                /* no attributes: one run, anywhere in physical memory */
                hi = physmax - 1;
                lo = 0;
                sgllen = 1;
                pfnseg = mmu.highest_pfn;
                minctg = *pgcnt;

                if (minctg < lastctgcnt)
                        startpfn = lo;
        }
        lastctgcnt = minctg;

        ASSERT(pfnseg + 1 >= (uint64_t)minctg);

        /* conserve 16m memory - start search above 16m when possible */
        if (hi > PFN_16M && startpfn < PFN_16M)
                startpfn = PFN_16M;

        pfn = startpfn;
        if (pfnalign)
                pfn = P2ROUNDUP(pfn, pfnalign);

        /* first pass: from startpfn up to hi */
        while (pfn + minctg - 1 <= hi) {

                /* advances pfn whether or not a run was obtained */
                plist = is_contigpage_free(&pfn, pgcnt, minctg, pfnseg, iolock);
                if (plist) {
                        page_list_concat(&pplist, &plist);
                        sgllen--;
                        /*
                         * return when contig pages no longer needed
                         */
                        if (!*pgcnt || ((*pgcnt <= sgllen) && !pfnalign)) {
                                startpfn = pfn;
                                CONTIG_UNLOCK();
                                /*
                                 * NOTE(review): *pgcnt is the number of pages
                                 * still outstanding at this point, so only
                                 * that many pages of pplist are checked --
                                 * confirm this is what is intended.
                                 */
                                check_dma(mattr, pplist, *pgcnt);
                                return (pplist);
                        }
                        /* spread what is left over the remaining entries */
                        minctg = howmany(*pgcnt, sgllen);
                }
                if (pfnalign)
                        pfn = P2ROUNDUP(pfn, pfnalign);
        }

        /*
         * NOTE(review): pages gathered on pplist by earlier iterations are
         * not released on the NULL returns below -- confirm whether that
         * situation can arise.
         */
        /* cannot find contig pages in specified range */
        if (startpfn == lo) {
                CONTIG_UNLOCK();
                return (NULL);
        }

        /* did not start with lo previously */
        pfn = lo;
        if (pfnalign)
                pfn = P2ROUNDUP(pfn, pfnalign);

        /* allow search to go above startpfn */
        while (pfn < startpfn) {

                plist = is_contigpage_free(&pfn, pgcnt, minctg, pfnseg, iolock);
                if (plist != NULL) {

                        page_list_concat(&pplist, &plist);
                        sgllen--;

                        /*
                         * return when contig pages no longer needed
                         */
                        if (!*pgcnt || ((*pgcnt <= sgllen) && !pfnalign)) {
                                startpfn = pfn;
                                CONTIG_UNLOCK();
                                check_dma(mattr, pplist, *pgcnt);
                                return (pplist);
                        }
                        minctg = howmany(*pgcnt, sgllen);
                }
                if (pfnalign)
                        pfn = P2ROUNDUP(pfn, pfnalign);
        }
        CONTIG_UNLOCK();
        return (NULL);
}
#endif  /* !__xpv */
1346
1347 /*
1348  * mnode_range_cnt() calculates the number of memory ranges for mnode and
1349  * memranges[]. Used to determine the size of page lists and mnoderanges.
1350  */
1351 int
1352 mnode_range_cnt(int mnode)
1353 {
1354 #if defined(__xpv)
1355         ASSERT(mnode == 0);
1356         return (1);
1357 #else   /* __xpv */
1358         int     mri;
1359         int     mnrcnt = 0;
1360
1361         if (mem_node_config[mnode].exists != 0) {
1362                 mri = nranges - 1;
1363
1364                 /* find the memranges index below contained in mnode range */
1365
1366                 while (MEMRANGEHI(mri) < mem_node_config[mnode].physbase)
1367                         mri--;
1368
1369                 /*
1370                  * increment mnode range counter when memranges or mnode
1371                  * boundary is reached.
1372                  */
1373                 while (mri >= 0 &&
1374                     mem_node_config[mnode].physmax >= MEMRANGELO(mri)) {
1375                         mnrcnt++;
1376                         if (mem_node_config[mnode].physmax > MEMRANGEHI(mri))
1377                                 mri--;
1378                         else
1379                                 break;
1380                 }
1381         }
1382         ASSERT(mnrcnt <= MAX_MNODE_MRANGES);
1383         return (mnrcnt);
1384 #endif  /* __xpv */
1385 }
1386
1387 static int
1388 mnoderange_cmp(const void *v1, const void *v2)
1389 {
1390         const mnoderange_t *m1 = v1;
1391         const mnoderange_t *m2 = v2;
1392
1393         if (m1->mnr_pfnlo < m2->mnr_pfnlo)
1394                 return (-1);
1395         return (m1->mnr_pfnlo > m2->mnr_pfnlo);
1396 }
1397
1398 void
1399 mnode_range_setup(mnoderange_t *mnoderanges)
1400 {
1401         mnoderange_t *mp;
1402         ssize_t nr_ranges;
1403         size_t mnode;
1404
1405         for (mnode = 0, nr_ranges = 0, mp = mnoderanges;
1406             mnode < max_mem_nodes; mnode++) {
1407                 ssize_t mri = nranges - 1;
1408
1409                 if (mem_node_config[mnode].exists == 0)
1410                         continue;
1411
1412                 while (MEMRANGEHI(mri) < mem_node_config[mnode].physbase)
1413                         mri--;
1414
1415                 while (mri >= 0 && mem_node_config[mnode].physmax >=
1416                     MEMRANGELO(mri)) {
1417                         mp->mnr_pfnlo = MAX(MEMRANGELO(mri),
1418                             mem_node_config[mnode].physbase);
1419                         mp->mnr_pfnhi = MIN(MEMRANGEHI(mri),
1420                             mem_node_config[mnode].physmax);
1421                         mp->mnr_mnode = mnode;
1422                         mp->mnr_memrange = mri;
1423                         mp->mnr_next = -1;
1424                         mp->mnr_exists = 1;
1425                         mp++;
1426                         nr_ranges++;
1427                         if (mem_node_config[mnode].physmax > MEMRANGEHI(mri))
1428                                 mri--;
1429                         else
1430                                 break;
1431                 }
1432         }
1433
1434         /*
1435          * mnoderangecnt can be larger than nr_ranges when memory DR is
1436          * supposedly supported.
1437          */
1438         VERIFY3U(nr_ranges, <=, mnoderangecnt);
1439
1440         qsort(mnoderanges, nr_ranges, sizeof (mnoderange_t), mnoderange_cmp);
1441
1442         /*
1443          * If some intrepid soul takes the axe to the memory DR code, we can
1444          * remove ->mnr_next altogether, as we just sorted by ->mnr_pfnlo order.
1445          *
1446          * The VERIFY3U() above can be "==" then too.
1447          */
1448         for (size_t i = 1; i < nr_ranges; i++)
1449                 mnoderanges[i].mnr_next = i - 1;
1450
1451         mtypetop = nr_ranges - 1;
1452         mtype16m = pfn_2_mtype(PFN_16MEG - 1); /* Can be -1 ... */
1453         if (physmax4g)
1454                 mtype4g = pfn_2_mtype(0xfffff);
1455 }
1456
1457 #ifndef __xpv
/*
 * Update mnoderanges for memory hot-add DR operations.
 *
 * For every memrange that overlaps the pfn span [physbase, physmax] of
 * memory node "mnode", either refresh the pfn bounds of the existing
 * mnoderanges[] entry for that (mnode, memrange) pair or claim an unused
 * entry, fill it in and link it into the mnr_next chain that starts at
 * mtypetop.  All of the work is done while holding mnoderange_lock.
 */
static void
mnode_range_add(int mnode)
{
        int     *prev;
        int     n, mri;
        pfn_t   start, end;
        extern  void membar_sync(void);

        ASSERT(0 <= mnode && mnode < max_mem_nodes);
        ASSERT(mem_node_config[mnode].exists);
        start = mem_node_config[mnode].physbase;
        end = mem_node_config[mnode].physmax;
        ASSERT(start <= end);
        mutex_enter(&mnoderange_lock);

#ifdef  DEBUG
        /* Check whether it interleaves with other memory nodes. */
        for (n = mtypetop; n != -1; n = mnoderanges[n].mnr_next) {
                ASSERT(mnoderanges[n].mnr_exists);
                if (mnoderanges[n].mnr_mnode == mnode)
                        continue;
                ASSERT(start > mnoderanges[n].mnr_pfnhi ||
                    end < mnoderanges[n].mnr_pfnlo);
        }
#endif  /* DEBUG */

        /* skip the memranges that lie wholly above the node's base */
        mri = nranges - 1;
        while (MEMRANGEHI(mri) < mem_node_config[mnode].physbase)
                mri--;
        /* visit each memrange the node reaches into, highest first */
        while (mri >= 0 && mem_node_config[mnode].physmax >= MEMRANGELO(mri)) {
                /* Check whether mtype already exists. */
                for (n = mtypetop; n != -1; n = mnoderanges[n].mnr_next) {
                        if (mnoderanges[n].mnr_mnode == mnode &&
                            mnoderanges[n].mnr_memrange == mri) {
                                /* found: just refresh its pfn bounds */
                                mnoderanges[n].mnr_pfnlo = MAX(MEMRANGELO(mri),
                                    start);
                                mnoderanges[n].mnr_pfnhi = MIN(MEMRANGEHI(mri),
                                    end);
                                break;
                        }
                }

                /* Add a new entry if it doesn't exist yet. */
                if (n == -1) {
                        /* Try to find an unused entry in mnoderanges array. */
                        for (n = 0; n < mnoderangecnt; n++) {
                                if (mnoderanges[n].mnr_exists == 0)
                                        break;
                        }
                        ASSERT(n < mnoderangecnt);
                        mnoderanges[n].mnr_pfnlo = MAX(MEMRANGELO(mri), start);
                        mnoderanges[n].mnr_pfnhi = MIN(MEMRANGEHI(mri), end);
                        mnoderanges[n].mnr_mnode = mnode;
                        mnoderanges[n].mnr_memrange = mri;
                        mnoderanges[n].mnr_exists = 1;
                        /*
                         * Walk down the chain to the first entry whose
                         * mnr_pfnlo does not exceed start; prev ends up
                         * pointing at the link that has to be redirected to
                         * the new entry.
                         */
                        /* Page 0 should always be present. */
                        for (prev = &mtypetop;
                            mnoderanges[*prev].mnr_pfnlo > start;
                            prev = &mnoderanges[*prev].mnr_next) {
                                ASSERT(mnoderanges[*prev].mnr_next >= 0);
                                ASSERT(mnoderanges[*prev].mnr_pfnlo > end);
                        }
                        /*
                         * The new entry is completely filled in, including
                         * its own mnr_next, and membar_sync() is issued
                         * before the entry is linked into the chain.
                         */
                        mnoderanges[n].mnr_next = *prev;
                        membar_sync();
                        *prev = n;
                }

                /* move to the next lower memrange unless the node ends here */
                if (mem_node_config[mnode].physmax > MEMRANGEHI(mri))
                        mri--;
                else
                        break;
        }

        mutex_exit(&mnoderange_lock);
}
1536
1537 /*
1538  * Update mnoderanges for memory hot-removal DR operations.
1539  */
1540 static void
1541 mnode_range_del(int mnode)
1542 {
1543         _NOTE(ARGUNUSED(mnode));
1544         ASSERT(0 <= mnode && mnode < max_mem_nodes);
1545         /* TODO: support deletion operation. */
1546         ASSERT(0);
1547 }
1548
1549 void
1550 plat_slice_add(pfn_t start, pfn_t end)
1551 {
1552         mem_node_add_slice(start, end);
1553         if (plat_dr_enabled()) {
1554                 mnode_range_add(PFN_2_MEM_NODE(start));
1555         }
1556 }
1557
1558 void
1559 plat_slice_del(pfn_t start, pfn_t end)
1560 {
1561         ASSERT(PFN_2_MEM_NODE(start) == PFN_2_MEM_NODE(end));
1562         ASSERT(plat_dr_enabled());
1563         mnode_range_del(PFN_2_MEM_NODE(start));
1564         mem_node_del_slice(start, end);
1565 }
1566 #endif  /* __xpv */
1567
1568 /*ARGSUSED*/
1569 int
1570 mtype_init(vnode_t *vp, caddr_t vaddr, uint_t *flags, size_t pgsz)
1571 {
1572         int mtype = mtypetop;
1573
1574 #if !defined(__xpv)
1575         if (RESTRICT4G_ALLOC) {
1576                 VM_STAT_ADD(vmm_vmstats.restrict4gcnt);
1577                 /* here only for > 4g systems */
1578                 *flags |= PGI_MT_RANGE4G;
1579         } else if (RESTRICT16M_ALLOC(freemem, btop(pgsz), *flags)) {
1580                 *flags |= PGI_MT_RANGE16M;
1581         } else {
1582                 VM_STAT_ADD(vmm_vmstats.unrestrict16mcnt);
1583                 VM_STAT_COND_ADD((*flags & PG_PANIC), vmm_vmstats.pgpanicalloc);
1584                 *flags |= PGI_MT_RANGE0;
1585         }
1586 #endif /* !__xpv */
1587         return (mtype);
1588 }
1589
1590
1591 /* mtype init for page_get_replacement_page */
1592 /*ARGSUSED*/
1593 int
1594 mtype_pgr_init(int *flags, page_t *pp, pgcnt_t pgcnt)
1595 {
1596         int mtype = mtypetop;
1597 #if !defined(__xpv)
1598         if (RESTRICT16M_ALLOC(freemem, pgcnt, *flags)) {
1599                 *flags |= PGI_MT_RANGE16M;
1600         } else {
1601                 VM_STAT_ADD(vmm_vmstats.unrestrict16mcnt);
1602                 *flags |= PGI_MT_RANGE0;
1603         }
1604 #endif
1605         return (mtype);
1606 }
1607
1608 /*
1609  * Determine if the mnode range specified in mtype contains memory belonging
1610  * to memory node mnode.  If flags & PGI_MT_RANGE is set then mtype contains
1611  * the range from high pfn to 0, 16m or 4g.
1612  *
1613  * Return first mnode range type index found otherwise return -1 if none found.
1614  */
1615 int
1616 mtype_func(int mnode, int mtype, uint_t flags)
1617 {
1618         if (flags & PGI_MT_RANGE) {
1619                 int     mnr_lim = MRI_0;
1620
1621                 if (flags & PGI_MT_NEXT) {
1622                         mtype = mnoderanges[mtype].mnr_next;
1623                 }
1624                 if (flags & PGI_MT_RANGE4G)
1625                         mnr_lim = MRI_4G;       /* exclude 0-4g range */
1626                 else if (flags & PGI_MT_RANGE16M)
1627                         mnr_lim = MRI_16M;      /* exclude 0-16m range */
1628                 while (mtype != -1 &&
1629                     mnoderanges[mtype].mnr_memrange <= mnr_lim) {
1630                         if (mnoderanges[mtype].mnr_mnode == mnode)
1631                                 return (mtype);
1632                         mtype = mnoderanges[mtype].mnr_next;
1633                 }
1634         } else if (mnoderanges[mtype].mnr_mnode == mnode) {
1635                 return (mtype);
1636         }
1637         return (-1);
1638 }
1639
1640 /*
1641  * Update the page list max counts with the pfn range specified by the
1642  * input parameters.
1643  */
1644 void
1645 mtype_modify_max(pfn_t startpfn, long cnt)
1646 {
1647         int             mtype;
1648         pgcnt_t         inc;
1649         spgcnt_t        scnt = (spgcnt_t)(cnt);
1650         pgcnt_t         acnt = ABS(scnt);
1651         pfn_t           endpfn = startpfn + acnt;
1652         pfn_t           pfn, lo;
1653
1654         if (!physmax4g)
1655                 return;
1656
1657         mtype = mtypetop;
1658         for (pfn = endpfn; pfn > startpfn; ) {
1659                 ASSERT(mtype != -1);
1660                 lo = mnoderanges[mtype].mnr_pfnlo;
1661                 if (pfn > lo) {
1662                         if (startpfn >= lo) {
1663                                 inc = pfn - startpfn;
1664                         } else {
1665                                 inc = pfn - lo;
1666                         }
1667                         if (mnoderanges[mtype].mnr_memrange != MRI_4G) {
1668                                 if (scnt > 0)
1669                                         maxmem4g += inc;
1670                                 else
1671                                         maxmem4g -= inc;
1672                         }
1673                         pfn -= inc;
1674                 }
1675                 mtype = mnoderanges[mtype].mnr_next;
1676         }
1677 }
1678
1679 int
1680 mtype_2_mrange(int mtype)
1681 {
1682         return (mnoderanges[mtype].mnr_memrange);
1683 }
1684
1685 void
1686 mnodetype_2_pfn(int mnode, int mtype, pfn_t *pfnlo, pfn_t *pfnhi)
1687 {
1688         _NOTE(ARGUNUSED(mnode));
1689         ASSERT(mnoderanges[mtype].mnr_mnode == mnode);
1690         *pfnlo = mnoderanges[mtype].mnr_pfnlo;
1691         *pfnhi = mnoderanges[mtype].mnr_pfnhi;
1692 }
1693
/*
 * Return ctrs_sz increased by the space needed for the DEBUG-only page list
 * counters.  For every mnode range that is one struct mnr_mts per page size
 * plus one pgcnt_t per page color of each page size.  Non-DEBUG kernels
 * return ctrs_sz unchanged.
 */
size_t
plcnt_sz(size_t ctrs_sz)
{
#ifdef DEBUG
        int     szc;

        /* the per-page-size structures */
        ctrs_sz += mnoderangecnt * sizeof (struct mnr_mts) * mmu_page_sizes;

        /* the per-color counters of each page size */
        for (szc = 0; szc < mmu_page_sizes; szc++) {
                int     ncolors = page_get_pagecolors(szc);

                ctrs_sz += mnoderangecnt * sizeof (pgcnt_t) * ncolors;
        }
#endif
        return (ctrs_sz);
}
1708
1709 caddr_t
1710 plcnt_init(caddr_t addr)
1711 {
1712 #ifdef DEBUG
1713         int     mt, szc, colors;
1714
1715         for (mt = 0; mt < mnoderangecnt; mt++) {
1716                 mnoderanges[mt].mnr_mts = (struct mnr_mts *)addr;
1717                 addr += (sizeof (struct mnr_mts) * mmu_page_sizes);
1718                 for (szc = 0; szc < mmu_page_sizes; szc++) {
1719                         colors = page_get_pagecolors(szc);
1720                         mnoderanges[mt].mnr_mts[szc].mnr_mts_colors = colors;
1721                         mnoderanges[mt].mnr_mts[szc].mnr_mtsc_pgcnt =
1722                             (pgcnt_t *)addr;
1723                         addr += (sizeof (pgcnt_t) * colors);
1724                 }
1725         }
1726 #endif
1727         return (addr);
1728 }
1729
/*
 * Apply the signed delta cnt to the page counters kept for mtype.
 *
 *   pp     page being accounted for; read by the DEBUG per-color counter
 *          (via PP_2_BIN) and by the mtype consistency ASSERT
 *   mtype  index into mnoderanges[]
 *   szc    page size code selecting the per-size counters
 *   cnt    amount added to each affected counter
 *   flags  when PG_CACHE_LIST is set the cache list counter is adjusted,
 *          otherwise the free list counter for szc
 */
void
plcnt_inc_dec(page_t *pp, int mtype, int szc, long cnt, int flags)
{
        _NOTE(ARGUNUSED(pp));
#ifdef DEBUG
        int     bin = PP_2_BIN(pp);

        /* DEBUG only: per page size and per color bin counters */
        atomic_add_long(&mnoderanges[mtype].mnr_mts[szc].mnr_mts_pgcnt, cnt);
        atomic_add_long(&mnoderanges[mtype].mnr_mts[szc].mnr_mtsc_pgcnt[bin],
            cnt);
#endif
        ASSERT(mtype == PP_2_MTYPE(pp));
        /*
         * freemem4g is only adjusted when physmax4g is set and this mtype
         * is not in the MRI_4G memrange.
         */
        if (physmax4g && mnoderanges[mtype].mnr_memrange != MRI_4G)
                atomic_add_long(&freemem4g, cnt);
        /* exactly one of the cache list / free list counters is adjusted */
        if (flags & PG_CACHE_LIST)
                atomic_add_long(&mnoderanges[mtype].mnr_mt_clpgcnt, cnt);
        else
                atomic_add_long(&mnoderanges[mtype].mnr_mt_flpgcnt[szc], cnt);
        /* the per-mtype total is adjusted in both cases */
        atomic_add_long(&mnoderanges[mtype].mnr_mt_totcnt, cnt);
}
1750
1751 /*
1752  * Returns the free page count for mnode
1753  */
1754 int
1755 mnode_pgcnt(int mnode)
1756 {
1757         int     mtype = mtypetop;
1758         int     flags = PGI_MT_RANGE0;
1759         pgcnt_t pgcnt = 0;
1760
1761         mtype = mtype_func(mnode, mtype, flags);
1762
1763         while (mtype != -1) {
1764                 pgcnt += MTYPE_FREEMEM(mtype);
1765                 mtype = mtype_func(mnode, mtype, flags | PGI_MT_NEXT);
1766         }
1767         return (pgcnt);
1768 }
1769
/*
 * Initialize page coloring variables based on the l2 cache parameters.
 * Calculate and return memory needed for page coloring data structures.
 *
 *   l2_sz      l2 cache size (presumably in bytes; it is divided by
 *              l2_assoc * MMU_PAGESIZE to get the color count)
 *   l2_linesz  l2 line size; only checked by ASSERT(ISP2())
 *   l2_assoc   l2 associativity, 0 meaning fully associative
 *
 * Besides computing the size, this trims memranges/nranges, may set
 * physmax4g, and initializes l2_colors, page_colors, page_colors_mask,
 * cpu_page_colors, page_coloring_shift, hw_page_array[], colorequivszc[]
 * and mnoderangecnt.  The returned size covers the mnoderanges array, the
 * fpc/cpc mutex arrays and the page free/cache list pointer arrays, i.e.
 * the pieces page_coloring_setup() carves out of its buffer.
 */
size_t
page_coloring_init(uint_t l2_sz, int l2_linesz, int l2_assoc)
{
        _NOTE(ARGUNUSED(l2_linesz));
        size_t  colorsz = 0;
        int     i;
        int     colors;

#if defined(__xpv)
        /*
         * Hypervisor domains currently don't have any concept of NUMA.
         * Hence we'll act like there is only 1 memrange.
         */
        i = memrange_num(1);
#else /* !__xpv */
        /*
         * Reduce the memory ranges lists if we don't have large amounts
         * of memory. This avoids searching known empty free lists.
         * To support memory DR operations, we need to keep memory ranges
         * for possible memory hot-add operations.
         */
        if (plat_dr_physmax > physmax)
                i = memrange_num(plat_dr_physmax);
        else
                i = memrange_num(physmax);
        /* physmax greater than 4g */
        if (i == MRI_4G)
                physmax4g = 1;
#endif /* !__xpv */
        /* drop the first i memranges, which are not needed */
        memranges += i;
        nranges -= i;

        ASSERT(mmu_page_sizes <= MMU_PAGE_SIZES);

        ASSERT(ISP2(l2_linesz));
        ASSERT(l2_sz > MMU_PAGESIZE);

        /* l2_assoc is 0 for fully associative l2 cache */
        if (l2_assoc)
                l2_colors = MAX(1, l2_sz / (l2_assoc * MMU_PAGESIZE));
        else
                l2_colors = 1;

        ASSERT(ISP2(l2_colors));

        /* for scalability, configure at least PAGE_COLORS_MIN color bins */
        page_colors = MAX(l2_colors, PAGE_COLORS_MIN);

        /*
         * cpu_page_colors is non-zero when a page color may be spread across
         * multiple bins.
         */
        if (l2_colors < page_colors)
                cpu_page_colors = l2_colors;

        ASSERT(ISP2(page_colors));

        page_colors_mask = page_colors - 1;

        ASSERT(ISP2(CPUSETSIZE()));
        page_coloring_shift = lowbit(CPUSETSIZE());

        /* initialize number of colors per page size */
        for (i = 0; i <= mmu.max_page_level; i++) {
                hw_page_array[i].hp_size = LEVEL_SIZE(i);
                hw_page_array[i].hp_shift = LEVEL_SHIFT(i);
                hw_page_array[i].hp_pgcnt = LEVEL_SIZE(i) >> LEVEL_SHIFT(0);
                hw_page_array[i].hp_colors = (page_colors_mask >>
                    (hw_page_array[i].hp_shift - hw_page_array[0].hp_shift))
                    + 1;
                colorequivszc[i] = 0;
        }

        /*
         * The value of cpu_page_colors determines if additional color bins
         * need to be checked for a particular color in the page_get routines.
         */
        if (cpu_page_colors != 0) {

                int a = lowbit(page_colors) - lowbit(cpu_page_colors);
                ASSERT(a > 0);
                ASSERT(a < 16);

                /*
                 * a is not reset between page sizes: once it has been
                 * lowered for one size the lower value carries over to
                 * the following sizes.
                 */
                for (i = 0; i <= mmu.max_page_level; i++) {
                        if ((colors = hw_page_array[i].hp_colors) <= 1) {
                                colorequivszc[i] = 0;
                                continue;
                        }
                        while ((colors >> a) == 0)
                                a--;
                        ASSERT(a >= 0);

                        /* higher 4 bits encodes color equiv mask */
                        colorequivszc[i] = (a << 4);
                }
        }

        /* factor in colorequiv to check additional 'equivalent' bins. */
        if (colorequiv > 1) {

                int a = lowbit(colorequiv) - 1;
                if (a > 15)
                        a = 15;

                for (i = 0; i <= mmu.max_page_level; i++) {
                        if ((colors = hw_page_array[i].hp_colors) <= 1) {
                                continue;
                        }
                        while ((colors >> a) == 0)
                                a--;
                        /* only ever raise the value computed above */
                        if ((a << 4) > colorequivszc[i]) {
                                colorequivszc[i] = (a << 4);
                        }
                }
        }

        /* size for mnoderanges */
        for (mnoderangecnt = 0, i = 0; i < max_mem_nodes; i++)
                mnoderangecnt += mnode_range_cnt(i);
        if (plat_dr_support_memory()) {
                /*
                 * Reserve enough space for memory DR operations.
                 * Two extra mnoderanges for possible fragmentations,
                 * one for the 2G boundary and the other for the 4G boundary.
                 * We don't expect a memory board crossing the 16M boundary
                 * for memory hot-add operations on x86 platforms.
                 */
                mnoderangecnt += 2 + max_mem_nodes - lgrp_plat_node_cnt;
        }
        colorsz = mnoderangecnt * sizeof (mnoderange_t);

        /* size for fpc_mutex and cpc_mutex */
        colorsz += (2 * max_mem_nodes * sizeof (kmutex_t) * NPC_MUTEX);

        /* size of page_freelists */
        colorsz += mnoderangecnt * sizeof (page_t ***);
        colorsz += mnoderangecnt * mmu_page_sizes * sizeof (page_t **);

        for (i = 0; i < mmu_page_sizes; i++) {
                colors = page_get_pagecolors(i);
                colorsz += mnoderangecnt * colors * sizeof (page_t *);
        }

        /* size of page_cachelists */
        colorsz += mnoderangecnt * sizeof (page_t **);
        colorsz += mnoderangecnt * page_colors * sizeof (page_t *);

        return (colorsz);
}
1923
1924 /*
1925  * Called once at startup to configure page_coloring data structures and
1926  * does the 1st page_free()/page_freelist_add().
1927  */
1928 void
1929 page_coloring_setup(caddr_t pcmemaddr)
1930 {
1931         int     i;
1932         int     j;
1933         int     k;
1934         caddr_t addr;
1935         int     colors;
1936
1937         /*
1938          * do page coloring setup
1939          */
1940         addr = pcmemaddr;
1941
1942         mnoderanges = (mnoderange_t *)addr;
1943         addr += (mnoderangecnt * sizeof (mnoderange_t));
1944
1945         mnode_range_setup(mnoderanges);
1946
1947         for (k = 0; k < NPC_MUTEX; k++) {
1948                 fpc_mutex[k] = (kmutex_t *)addr;
1949                 addr += (max_mem_nodes * sizeof (kmutex_t));
1950         }
1951         for (k = 0; k < NPC_MUTEX; k++) {
1952                 cpc_mutex[k] = (kmutex_t *)addr;
1953                 addr += (max_mem_nodes * sizeof (kmutex_t));
1954         }
1955         page_freelists = (page_t ****)addr;
1956         addr += (mnoderangecnt * sizeof (page_t ***));
1957
1958         page_cachelists = (page_t ***)addr;
1959         addr += (mnoderangecnt * sizeof (page_t **));
1960
1961         for (i = 0; i < mnoderangecnt; i++) {
1962                 page_freelists[i] = (page_t ***)addr;
1963                 addr += (mmu_page_sizes * sizeof (page_t **));
1964
1965                 for (j = 0; j < mmu_page_sizes; j++) {
1966                         colors = page_get_pagecolors(j);
1967                         page_freelists[i][j] = (page_t **)addr;
1968                         addr += (colors * sizeof (page_t *));
1969                 }
1970                 page_cachelists[i] = (page_t **)addr;
1971                 addr += (page_colors * sizeof (page_t *));
1972         }
1973 }
1974
1975 #if defined(__xpv)
1976 /*
1977  * Give back 10% of the io_pool pages to the free list.
1978  * Don't shrink the pool below some absolute minimum.
1979  */
1980 static void
1981 page_io_pool_shrink()
1982 {
1983         int retcnt;
1984         page_t *pp, *pp_first, *pp_last, **curpool;
1985         mfn_t mfn;
1986         int bothpools = 0;
1987
1988         mutex_enter(&io_pool_lock);
1989         io_pool_shrink_attempts++;      /* should be a kstat? */
1990         retcnt = io_pool_cnt / 10;
1991         if (io_pool_cnt - retcnt < io_pool_cnt_min)
1992                 retcnt = io_pool_cnt - io_pool_cnt_min;
1993         if (retcnt <= 0)
1994                 goto done;
1995         io_pool_shrinks++;      /* should be a kstat? */
1996         curpool = &io_pool_4g;
1997 domore:
1998         /*
1999          * Loop through taking pages from the end of the list
2000          * (highest mfns) till amount to return reached.
2001          */
2002         for (pp = *curpool; pp && retcnt > 0; ) {
2003                 pp_first = pp_last = pp->p_prev;
2004                 if (pp_first == *curpool)
2005                         break;
2006                 retcnt--;
2007                 io_pool_cnt--;
2008                 page_io_pool_sub(curpool, pp_first, pp_last);
2009                 if ((mfn = pfn_to_mfn(pp->p_pagenum)) < start_mfn)
2010                         start_mfn = mfn;
2011                 page_free(pp_first, 1);
2012                 pp = *curpool;
2013         }
2014         if (retcnt != 0 && !bothpools) {
2015                 /*
2016                  * If not enough found in less constrained pool try the
2017                  * more constrained one.
2018                  */
2019                 curpool = &io_pool_16m;
2020                 bothpools = 1;
2021                 goto domore;
2022         }
2023 done:
2024         mutex_exit(&io_pool_lock);
2025 }
2026
2027 #endif  /* __xpv */
2028
2029 uint_t
2030 page_create_update_flags_x86(uint_t flags)
2031 {
2032 #if defined(__xpv)
2033         /*
2034          * Check this is an urgent allocation and free pages are depleted.
2035          */
2036         if (!(flags & PG_WAIT) && freemem < desfree)
2037                 page_io_pool_shrink();
2038 #else /* !__xpv */
2039         /*
2040          * page_create_get_something may call this because 4g memory may be
2041          * depleted. Set flags to allow for relocation of base page below
2042          * 4g if necessary.
2043          */
2044         if (physmax4g)
2045                 flags |= (PGI_PGCPSZC0 | PGI_PGCPHIPRI);
2046 #endif /* __xpv */
2047         return (flags);
2048 }
2049
/*
 * Report the color for a buf.  This implementation ignores bp and always
 * reports color 0.
 */
/*ARGSUSED*/
int
bp_color(struct buf *bp)
{
        const int       color = 0;

        return (color);
}
2056
2057 #if defined(__xpv)
2058
2059 /*
2060  * Take pages out of an io_pool
2061  */
2062 static void
2063 page_io_pool_sub(page_t **poolp, page_t *pp_first, page_t *pp_last)
2064 {
2065         if (*poolp == pp_first) {
2066                 *poolp = pp_last->p_next;
2067                 if (*poolp == pp_first)
2068                         *poolp = NULL;
2069         }
2070         pp_first->p_prev->p_next = pp_last->p_next;
2071         pp_last->p_next->p_prev = pp_first->p_prev;
2072         pp_first->p_prev = pp_last;
2073         pp_last->p_next = pp_first;
2074 }
2075
2076 /*
2077  * Put a page on the io_pool list. The list is ordered by increasing MFN.
2078  */
2079 static void
2080 page_io_pool_add(page_t **poolp, page_t *pp)
2081 {
2082         page_t  *look;
2083         mfn_t   mfn = mfn_list[pp->p_pagenum];
2084
2085         if (*poolp == NULL) {
2086                 *poolp = pp;
2087                 pp->p_next = pp;
2088                 pp->p_prev = pp;
2089                 return;
2090         }
2091
2092         /*
2093          * Since we try to take pages from the high end of the pool
2094          * chances are good that the pages to be put on the list will
2095          * go at or near the end of the list. so start at the end and
2096          * work backwards.
2097          */
2098         look = (*poolp)->p_prev;
2099         while (mfn < mfn_list[look->p_pagenum]) {
2100                 look = look->p_prev;
2101                 if (look == (*poolp)->p_prev)
2102                         break; /* backed all the way to front of list */
2103         }
2104
2105         /* insert after look */
2106         pp->p_prev = look;
2107         pp->p_next = look->p_next;
2108         pp->p_next->p_prev = pp;
2109         look->p_next = pp;
2110         if (mfn < mfn_list[(*poolp)->p_pagenum]) {
2111                 /*
2112                  * we inserted a new first list element
2113                  * adjust pool pointer to newly inserted element
2114                  */
2115                 *poolp = pp;
2116         }
2117 }
2118
2119 /*
2120  * Add a page to the io_pool.  Setting the force flag will force the page
2121  * into the io_pool no matter what.
2122  */
2123 static void
2124 add_page_to_pool(page_t *pp, int force)
2125 {
2126         page_t *highest;
2127         page_t *freep = NULL;
2128
2129         mutex_enter(&io_pool_lock);
2130         /*
2131          * Always keep the scarce low memory pages
2132          */
2133         if (mfn_list[pp->p_pagenum] < PFN_16MEG) {
2134                 ++io_pool_cnt;
2135                 page_io_pool_add(&io_pool_16m, pp);
2136                 goto done;
2137         }
2138         if (io_pool_cnt < io_pool_cnt_max || force || io_pool_4g == NULL) {
2139                 ++io_pool_cnt;
2140                 page_io_pool_add(&io_pool_4g, pp);
2141         } else {
2142                 highest = io_pool_4g->p_prev;
2143                 if (mfn_list[pp->p_pagenum] < mfn_list[highest->p_pagenum]) {
2144                         page_io_pool_sub(&io_pool_4g, highest, highest);
2145                         page_io_pool_add(&io_pool_4g, pp);
2146                         freep = highest;
2147                 } else {
2148                         freep = pp;
2149                 }
2150         }
2151 done:
2152         mutex_exit(&io_pool_lock);
2153         if (freep)
2154                 page_free(freep, 1);
2155 }
2156
2157
/*
 * State of the contiguous pfn list.  The functions nearby that read or
 * modify the list (create, compact, clear, update, search) do so while
 * holding contig_list_lock.
 */
int contig_pfn_cnt;     /* no of pfns in the contig pfn list */
int contig_pfn_max;     /* capacity of the contig pfn list */
int next_alloc_pfn;     /* next position in list to start a contig search */
int contig_pfnlist_updates;     /* pfn list update count */
int contig_pfnlist_builds;      /* how many times have we (re)built list */
int contig_pfnlist_buildfailed; /* how many times has list build failed */
int create_contig_pending;      /* nonzero means taskq creating contig list */
pfn_t *contig_pfn_list = NULL;  /* list of contig pfns in ascending mfn order */
2166
2167 /*
2168  * Function to use in sorting a list of pfns by their underlying mfns.
2169  */
2170 static int
2171 mfn_compare(const void *pfnp1, const void *pfnp2)
2172 {
2173         mfn_t mfn1 = mfn_list[*(pfn_t *)pfnp1];
2174         mfn_t mfn2 = mfn_list[*(pfn_t *)pfnp2];
2175
2176         if (mfn1 > mfn2)
2177                 return (1);
2178         if (mfn1 < mfn2)
2179                 return (-1);
2180         return (0);
2181 }
2182
2183 /*
2184  * Compact the contig_pfn_list by tossing all the non-contiguous
2185  * elements from the list.
2186  */
2187 static void
2188 compact_contig_pfn_list(void)
2189 {
2190         pfn_t pfn, lapfn, prev_lapfn;
2191         mfn_t mfn;
2192         int i, newcnt = 0;
2193
2194         prev_lapfn = 0;
2195         for (i = 0; i < contig_pfn_cnt - 1; i++) {
2196                 pfn = contig_pfn_list[i];
2197                 lapfn = contig_pfn_list[i + 1];
2198                 mfn = mfn_list[pfn];
2199                 /*
2200                  * See if next pfn is for a contig mfn
2201                  */
2202                 if (mfn_list[lapfn] != mfn + 1)
2203                         continue;
2204                 /*
2205                  * pfn and lookahead are both put in list
2206                  * unless pfn is the previous lookahead.
2207                  */
2208                 if (pfn != prev_lapfn)
2209                         contig_pfn_list[newcnt++] = pfn;
2210                 contig_pfn_list[newcnt++] = lapfn;
2211                 prev_lapfn = lapfn;
2212         }
2213         for (i = newcnt; i < contig_pfn_cnt; i++)
2214                 contig_pfn_list[i] = 0;
2215         contig_pfn_cnt = newcnt;
2216 }
2217
/*
 * Taskq callback (dispatched by create_contig_pfnlist() when it could not
 * sleep for memory): retry building the contig pfn list with PG_WAIT.
 * arg is unused and the result of the build is ignored.
 */
/*ARGSUSED*/
static void
call_create_contiglist(void *arg)
{
        (void) create_contig_pfnlist(PG_WAIT);
}
2224
2225 /*
2226  * Create list of freelist pfns that have underlying
2227  * contiguous mfns.  The list is kept in ascending mfn order.
2228  * returns 1 if list created else 0.
2229  */
2230 static int
2231 create_contig_pfnlist(uint_t flags)
2232 {
2233         pfn_t pfn;
2234         page_t *pp;
2235         int ret = 1;
2236
2237         mutex_enter(&contig_list_lock);
2238         if (contig_pfn_list != NULL)
2239                 goto out;
2240         contig_pfn_max = freemem + (freemem / 10);
2241         contig_pfn_list = kmem_zalloc(contig_pfn_max * sizeof (pfn_t),
2242             (flags & PG_WAIT) ? KM_SLEEP : KM_NOSLEEP);
2243         if (contig_pfn_list == NULL) {
2244                 /*
2245                  * If we could not create the contig list (because
2246                  * we could not sleep for memory).  Dispatch a taskq that can
2247                  * sleep to get the memory.
2248                  */
2249                 if (!create_contig_pending) {
2250                         if (taskq_dispatch(system_taskq, call_create_contiglist,
2251                             NULL, TQ_NOSLEEP) != TASKQID_INVALID)
2252                                 create_contig_pending = 1;
2253                 }
2254                 contig_pfnlist_buildfailed++;   /* count list build failures */
2255                 ret = 0;
2256                 goto out;
2257         }
2258         create_contig_pending = 0;
2259         ASSERT(contig_pfn_cnt == 0);
2260         for (pfn = 0; pfn < mfn_count; pfn++) {
2261                 pp = page_numtopp_nolock(pfn);
2262                 if (pp == NULL || !PP_ISFREE(pp))
2263                         continue;
2264                 contig_pfn_list[contig_pfn_cnt] = pfn;
2265                 if (++contig_pfn_cnt == contig_pfn_max)
2266                         break;
2267         }
2268         /*
2269          * Sanity check the new list.
2270          */
2271         if (contig_pfn_cnt < 2) { /* no contig pfns */
2272                 contig_pfn_cnt = 0;
2273                 contig_pfnlist_buildfailed++;
2274                 kmem_free(contig_pfn_list, contig_pfn_max * sizeof (pfn_t));
2275                 contig_pfn_list = NULL;
2276                 contig_pfn_max = 0;
2277                 ret = 0;
2278                 goto out;
2279         }
2280         qsort(contig_pfn_list, contig_pfn_cnt, sizeof (pfn_t), mfn_compare);
2281         compact_contig_pfn_list();
2282         /*
2283          * Make sure next search of the newly created contiguous pfn
2284          * list starts at the beginning of the list.
2285          */
2286         next_alloc_pfn = 0;
2287         contig_pfnlist_builds++;        /* count list builds */
2288 out:
2289         mutex_exit(&contig_list_lock);
2290         return (ret);
2291 }
2292
2293
2294 /*
2295  * Toss the current contig pfnlist.  Someone is about to do a massive
2296  * update to pfn<->mfn mappings.  So we have them destroy the list and lock
2297  * it till they are done with their update.
2298  */
2299 void
2300 clear_and_lock_contig_pfnlist()
2301 {
2302         pfn_t *listp = NULL;
2303         size_t listsize;
2304
2305         mutex_enter(&contig_list_lock);
2306         if (contig_pfn_list != NULL) {
2307                 listp = contig_pfn_list;
2308                 listsize = contig_pfn_max * sizeof (pfn_t);
2309                 contig_pfn_list = NULL;
2310                 contig_pfn_max = contig_pfn_cnt = 0;
2311         }
2312         if (listp != NULL)
2313                 kmem_free(listp, listsize);
2314 }
2315
/*
 * Unlock the contig_pfn_list.  The next attempted use of it will cause
 * it to be re-created.
 *
 * Only drops contig_list_lock; the caller is expected to hold it, as it
 * does after clear_and_lock_contig_pfnlist().
 */
void
unlock_contig_pfnlist()
{
        mutex_exit(&contig_list_lock);
}
2325
2326 /*
2327  * Update the contiguous pfn list in response to a pfn <-> mfn reassignment
2328  */
2329 void
2330 update_contig_pfnlist(pfn_t pfn, mfn_t oldmfn, mfn_t newmfn)
2331 {
2332         int probe_hi, probe_lo, probe_pos, insert_after, insert_point;
2333         pfn_t probe_pfn;
2334         mfn_t probe_mfn;
2335         int drop_lock = 0;
2336
2337         if (mutex_owner(&contig_list_lock) != curthread) {
2338                 drop_lock = 1;
2339                 mutex_enter(&contig_list_lock);
2340         }
2341         if (contig_pfn_list == NULL)
2342                 goto done;
2343         contig_pfnlist_updates++;
2344         /*
2345          * Find the pfn in the current list.  Use a binary chop to locate it.
2346          */
2347         probe_hi = contig_pfn_cnt - 1;
2348         probe_lo = 0;
2349         probe_pos = (probe_hi + probe_lo) / 2;
2350         while ((probe_pfn = contig_pfn_list[probe_pos]) != pfn) {
2351                 if (probe_pos == probe_lo) { /* pfn not in list */
2352                         probe_pos = -1;
2353                         break;
2354                 }
2355                 if (pfn_to_mfn(probe_pfn) <= oldmfn)
2356                         probe_lo = probe_pos;
2357                 else
2358                         probe_hi = probe_pos;
2359                 probe_pos = (probe_hi + probe_lo) / 2;
2360         }
2361         if (probe_pos >= 0) {
2362                 /*
2363                  * Remove pfn from list and ensure next alloc
2364                  * position stays in bounds.
2365                  */
2366                 if (--contig_pfn_cnt <= next_alloc_pfn)
2367                         next_alloc_pfn = 0;
2368                 if (contig_pfn_cnt < 2) { /* no contig pfns */
2369                         contig_pfn_cnt = 0;
2370                         kmem_free(contig_pfn_list,
2371                             contig_pfn_max * sizeof (pfn_t));
2372                         contig_pfn_list = NULL;
2373                         contig_pfn_max = 0;
2374                         goto done;
2375                 }
2376                 ovbcopy(&contig_pfn_list[probe_pos + 1],
2377                     &contig_pfn_list[probe_pos],
2378                     (contig_pfn_cnt - probe_pos) * sizeof (pfn_t));
2379         }
2380         if (newmfn == MFN_INVALID)
2381                 goto done;
2382         /*
2383          * Check if new mfn has adjacent mfns in the list
2384          */
2385         probe_hi = contig_pfn_cnt - 1;
2386         probe_lo = 0;
2387         insert_after = -2;
2388         do {
2389                 probe_pos = (probe_hi + probe_lo) / 2;
2390                 probe_mfn = pfn_to_mfn(contig_pfn_list[probe_pos]);
2391                 if (newmfn == probe_mfn + 1)
2392                         insert_after = probe_pos;
2393                 else if (newmfn == probe_mfn - 1)
2394                         insert_after = probe_pos - 1;
2395                 if (probe_pos == probe_lo)
2396                         break;
2397                 if (probe_mfn <= newmfn)
2398                         probe_lo = probe_pos;
2399                 else
2400                         probe_hi = probe_pos;
2401         } while (insert_after == -2);
2402         /*
2403          * If there is space in the list and there are adjacent mfns
2404          * insert the pfn in to its proper place in the list.
2405          */
2406         if (insert_after != -2 && contig_pfn_cnt + 1 <= contig_pfn_max) {
2407                 insert_point = insert_after + 1;
2408                 ovbcopy(&contig_pfn_list[insert_point],
2409                     &contig_pfn_list[insert_point + 1],
2410                     (contig_pfn_cnt - insert_point) * sizeof (pfn_t));
2411                 contig_pfn_list[insert_point] = pfn;
2412                 contig_pfn_cnt++;
2413         }
2414 done:
2415         if (drop_lock)
2416                 mutex_exit(&contig_list_lock);
2417 }
2418
/*
 * Called to (re-)populate the io_pool from the free page lists.
 *
 * Scans mfns upward from start_mfn, below MIN(cached_max_mfn, PFN_4GIG),
 * and forces every page that can be allocated from the free lists into
 * the pool until io_pool_cnt reaches io_pool_cnt_max.  start_mfn is
 * advanced as the scan proceeds.  Returns the resulting io_pool_cnt.
 */
long
populate_io_pool(void)
{
        pfn_t pfn;
        mfn_t mfn, max_mfn;
        page_t *pp;

        /*
         * Figure out the bounds of the pool on first invocation.
         * We use a percentage of memory for the io pool size.
         * we allow that to shrink, but not to less than a fixed minimum
         */
        if (io_pool_cnt_max == 0) {
                /*
                 * NOTE(review): integer division; this divides by zero if
                 * io_pool_physmem_pct is 0 or above 100 -- confirm the
                 * tunable is constrained elsewhere.
                 */
                io_pool_cnt_max = physmem / (100 / io_pool_physmem_pct);
                io_pool_cnt_lowater = io_pool_cnt_max;
                /*
                 * This is the first time in populate_io_pool, grab a va to use
                 * when we need to allocate pages.
                 */
                io_pool_kva = vmem_alloc(heap_arena, PAGESIZE, VM_SLEEP);
        }
        /*
         * If we are out of pages in the pool, then grow the size of the pool
         */
        if (io_pool_cnt == 0) {
                /*
                 * Grow the max size of the io pool by 5%, but never more than
                 * 25% of physical memory.
                 */
                if (io_pool_cnt_max < physmem / 4)
                        io_pool_cnt_max += io_pool_cnt_max / 20;
        }
        io_pool_grows++;        /* should be a kstat? */

        /*
         * Get highest mfn on this platform, but limit to the 32 bit DMA max.
         * NOTE(review): the mfn_to_pfn() result is discarded; presumably the
         * call is made for a side effect that brings cached_max_mfn up to
         * date -- confirm.
         */
        (void) mfn_to_pfn(start_mfn);
        max_mfn = MIN(cached_max_mfn, PFN_4GIG);
        /*
         * start_mfn is advanced together with mfn on every completed or
         * skipped iteration, but not when the loop is left via break.
         */
        for (mfn = start_mfn; mfn < max_mfn; start_mfn = ++mfn) {
                pfn = mfn_to_pfn(mfn);
                if (pfn & PFN_IS_FOREIGN_MFN)
                        continue;
                /*
                 * try to allocate it from free pages
                 */
                pp = page_numtopp_alloc(pfn);
                if (pp == NULL)
                        continue;
                PP_CLRFREE(pp);
                /* force: the page goes into the pool regardless of limits */
                add_page_to_pool(pp, 1);
                if (io_pool_cnt >= io_pool_cnt_max)
                        break;
        }

        return (io_pool_cnt);
}
2479
2480 /*
2481  * Destroy a page that was being used for DMA I/O. It may or
2482  * may not actually go back to the io_pool.
2483  */
2484 void
2485 page_destroy_io(page_t *pp)
2486 {
2487         mfn_t mfn = mfn_list[pp->p_pagenum];
2488
2489         /*
2490          * When the page was alloc'd a reservation was made, release it now
2491          */
2492         page_unresv(1);
2493         /*
2494          * Unload translations, if any, then hash out the
2495          * page to erase its identity.
2496          */
2497         (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
2498         page_hashout(pp, NULL);
2499
2500         /*
2501          * If the page came from the free lists, just put it back to them.
2502          * DomU pages always go on the free lists as well.
2503          */
2504         if (!DOMAIN_IS_INITDOMAIN(xen_info) || mfn >= PFN_4GIG) {
2505                 page_free(pp, 1);
2506                 return;
2507         }
2508
2509         add_page_to_pool(pp, 0);
2510 }
2511
2512
/*
 * Statistics maintained by find_contig_free().
 */
long contig_searches;           /* count of times contig pages requested */
long contig_search_restarts;    /* count of candidate runs given up on */
long contig_search_failed;      /* count of contig alloc failures */
2516
2517 /*
2518  * Free partial page list
2519  */
2520 static void
2521 free_partial_list(page_t **pplist)
2522 {
2523         page_t *pp;
2524
2525         while (*pplist != NULL) {
2526                 pp = *pplist;
2527                 page_io_pool_sub(pplist, pp, pp);
2528                 page_free(pp, 1);
2529         }
2530 }
2531
2532 /*
2533  * Look thru the contiguous pfns that are not part of the io_pool for
2534  * contiguous free pages.  Return a list of the found pages or NULL.
2535  */
2536 page_t *
2537 find_contig_free(uint_t npages, uint_t flags, uint64_t pfnseg,
2538     pgcnt_t pfnalign)
2539 {
2540         page_t *pp, *plist = NULL;
2541         mfn_t mfn, prev_mfn, start_mfn;
2542         pfn_t pfn;
2543         int pages_needed, pages_requested;
2544         int search_start;
2545
2546         /*
2547          * create the contig pfn list if not already done
2548          */
2549 retry:
2550         mutex_enter(&contig_list_lock);
2551         if (contig_pfn_list == NULL) {
2552                 mutex_exit(&contig_list_lock);
2553                 if (!create_contig_pfnlist(flags)) {
2554                         return (NULL);
2555                 }
2556                 goto retry;
2557         }
2558         contig_searches++;
2559         /*
2560          * Search contiguous pfn list for physically contiguous pages not in
2561          * the io_pool.  Start the search where the last search left off.
2562          */
2563         pages_requested = pages_needed = npages;
2564         search_start = next_alloc_pfn;
2565         start_mfn = prev_mfn = 0;
2566         while (pages_needed) {
2567                 pfn = contig_pfn_list[next_alloc_pfn];
2568                 mfn = pfn_to_mfn(pfn);
2569                 /*
2570                  * Check if mfn is first one or contig to previous one and
2571                  * if page corresponding to mfn is free and that mfn
2572                  * range is not crossing a segment boundary.
2573                  */
2574                 if ((prev_mfn == 0 || mfn == prev_mfn + 1) &&
2575                     (pp = page_numtopp_alloc(pfn)) != NULL &&
2576                     !((mfn & pfnseg) < (start_mfn & pfnseg))) {
2577                         PP_CLRFREE(pp);
2578                         page_io_pool_add(&plist, pp);
2579                         pages_needed--;
2580                         if (prev_mfn == 0) {
2581                                 if (pfnalign &&
2582                                     mfn != P2ROUNDUP(mfn, pfnalign)) {
2583                                         /*
2584                                          * not properly aligned
2585                                          */
2586                                         contig_search_restarts++;
2587                                         free_partial_list(&plist);
2588                                         pages_needed = pages_requested;
2589                                         start_mfn = prev_mfn = 0;
2590                                         goto skip;
2591                                 }
2592                                 start_mfn = mfn;
2593                         }
2594                         prev_mfn = mfn;
2595                 } else {
2596                         contig_search_restarts++;
2597                         free_partial_list(&plist);
2598                         pages_needed = pages_requested;
2599                         start_mfn = prev_mfn = 0;
2600                 }
2601 skip:
2602                 if (++next_alloc_pfn == contig_pfn_cnt)
2603                         next_alloc_pfn = 0;
2604                 if (next_alloc_pfn == search_start)
2605                         break; /* all pfns searched */
2606         }
2607         mutex_exit(&contig_list_lock);
2608         if (pages_needed) {
2609                 contig_search_failed++;
2610                 /*
2611                  * Failed to find enough contig pages.
2612                  * free partial page list
2613                  */
2614                 free_partial_list(&plist);
2615         }
2616         return (plist);
2617 }
2618
2619 /*
2620  * Search the reserved io pool pages for a page range with the
2621  * desired characteristics.
2622  */
2623 page_t *
2624 page_io_pool_alloc(ddi_dma_attr_t *mattr, int contig, pgcnt_t minctg)
2625 {
2626         page_t *pp_first, *pp_last;
2627         page_t *pp, **poolp;
2628         pgcnt_t nwanted, pfnalign;
2629         uint64_t pfnseg;
2630         mfn_t mfn, tmfn, hi_mfn, lo_mfn;
2631         int align, attempt = 0;
2632
2633         if (minctg == 1)
2634                 contig = 0;
2635         lo_mfn = mmu_btop(mattr->dma_attr_addr_lo);
2636         hi_mfn = mmu_btop(mattr->dma_attr_addr_hi);
2637         pfnseg = mmu_btop(mattr->dma_attr_seg);
2638         align = maxbit(mattr->dma_attr_align, mattr->dma_attr_minxfer);
2639         if (align > MMU_PAGESIZE)
2640                 pfnalign = mmu_btop(align);
2641         else
2642                 pfnalign = 0;
2643
2644 try_again:
2645         /*
2646          * See if we want pages for a legacy device
2647          */
2648         if (hi_mfn < PFN_16MEG)
2649                 poolp = &io_pool_16m;
2650         else
2651                 poolp = &io_pool_4g;
2652 try_smaller:
2653         /*
2654          * Take pages from I/O pool. We'll use pages from the highest
2655          * MFN range possible.
2656          */
2657         pp_first = pp_last = NULL;
2658         mutex_enter(&io_pool_lock);
2659         nwanted = minctg;
2660         for (pp = *poolp; pp && nwanted > 0; ) {
2661                 pp = pp->p_prev;
2662
2663                 /*
2664                  * skip pages above allowable range
2665                  */
2666                 mfn = mfn_list[pp->p_pagenum];
2667                 if (hi_mfn < mfn)
2668                         goto skip;
2669
2670                 /*
2671                  * stop at pages below allowable range
2672                  */
2673                 if (lo_mfn > mfn)
2674                         break;
2675 restart:
2676                 if (pp_last == NULL) {
2677                         /*
2678                          * Check alignment
2679                          */
2680                         tmfn = mfn - (minctg - 1);
2681                         if (pfnalign && tmfn != P2ROUNDUP(tmfn, pfnalign))
2682                                 goto skip; /* not properly aligned */
2683                         /*
2684                          * Check segment
2685                          */
2686                         if ((mfn & pfnseg) < (tmfn & pfnseg))
2687                                 goto skip; /* crosses seg boundary */
2688                         /*
2689                          * Start building page list
2690                          */
2691                         pp_first = pp_last = pp;
2692                         nwanted--;
2693                 } else {
2694                         /*
2695                          * check physical contiguity if required
2696                          */
2697                         if (contig &&
2698                             mfn_list[pp_first->p_pagenum] != mfn + 1) {
2699                                 /*
2700                                  * not a contiguous page, restart list.
2701                                  */
2702                                 pp_last = NULL;
2703                                 nwanted = minctg;
2704                                 goto restart;
2705                         } else { /* add page to list */
2706                                 pp_first = pp;
2707                                 nwanted--;
2708                         }
2709                 }
2710 skip:
2711                 if (pp == *poolp)
2712                         break;
2713         }
2714
2715         /*
2716          * If we didn't find memory. Try the more constrained pool, then
2717          * sweep free pages into the DMA pool and try again.
2718          */
2719         if (nwanted != 0) {
2720                 mutex_exit(&io_pool_lock);
2721                 /*
2722                  * If we were looking in the less constrained pool and
2723                  * didn't find pages, try the more constrained pool.
2724                  */
2725                 if (poolp == &io_pool_4g) {
2726                         poolp = &io_pool_16m;
2727                         goto try_smaller;
2728                 }
2729                 kmem_reap();
2730                 if (++attempt < 4) {
2731                         /*
2732                          * Grab some more io_pool pages
2733                          */
2734                         (void) populate_io_pool();
2735                         goto try_again; /* go around and retry */
2736                 }
2737                 return (NULL);
2738         }
2739         /*
2740          * Found the pages, now snip them from the list
2741          */
2742         page_io_pool_sub(poolp, pp_first, pp_last);
2743         io_pool_cnt -= minctg;
2744         /*
2745          * reset low water mark
2746          */
2747         if (io_pool_cnt < io_pool_cnt_lowater)
2748                 io_pool_cnt_lowater = io_pool_cnt;
2749         mutex_exit(&io_pool_lock);
2750         return (pp_first);
2751 }
2752
2753 page_t *
2754 page_swap_with_hypervisor(struct vnode *vp, u_offset_t off, caddr_t vaddr,
2755     ddi_dma_attr_t *mattr, uint_t flags, pgcnt_t minctg)
2756 {
2757         uint_t kflags;
2758         int order, extra, extpages, i, contig, nbits, extents;
2759         page_t *pp, *expp, *pp_first, **pplist = NULL;
2760         mfn_t *mfnlist = NULL;
2761
2762         extra = 0;
2763         contig = flags & PG_PHYSCONTIG;
2764         if (minctg == 1)
2765                 contig = 0;
2766         flags &= ~PG_PHYSCONTIG;
2767         kflags = flags & PG_WAIT ? KM_SLEEP : KM_NOSLEEP;
2768         /*
2769          * Hypervisor will allocate extents, if we want contig
2770          * pages extent must be >= minctg
2771          */
2772         if (contig) {
2773                 order = highbit(minctg) - 1;
2774                 if (minctg & ((1 << order) - 1))
2775                         order++;
2776                 extpages = 1 << order;
2777         } else {
2778                 order = 0;
2779                 extpages = minctg;
2780         }
2781         if (extpages > minctg) {
2782                 extra = extpages - minctg;
2783                 if (!page_resv(extra, kflags))
2784                         return (NULL);
2785         }
2786         pp_first = NULL;
2787         pplist = kmem_alloc(extpages * sizeof (page_t *), kflags);
2788         if (pplist == NULL)
2789                 goto balloon_fail;
2790         mfnlist = kmem_alloc(extpages * sizeof (mfn_t), kflags);
2791         if (mfnlist == NULL)
2792                 goto balloon_fail;
2793         pp = page_create_va(vp, off, minctg * PAGESIZE, flags, &kvseg, vaddr);
2794         if (pp == NULL)
2795                 goto balloon_fail;
2796         pp_first = pp;
2797         if (extpages > minctg) {
2798                 /*
2799                  * fill out the rest of extent pages to swap
2800                  * with the hypervisor
2801                  */
2802                 for (i = 0; i < extra; i++) {
2803                         expp = page_create_va(vp,
2804                             (u_offset_t)(uintptr_t)io_pool_kva,
2805                             PAGESIZE, flags, &kvseg, io_pool_kva);
2806                         if (expp == NULL)
2807                                 goto balloon_fail;
2808                         (void) hat_pageunload(expp, HAT_FORCE_PGUNLOAD);
2809                         page_io_unlock(expp);
2810                         page_hashout(expp, NULL);
2811                         page_io_lock(expp);
2812                         /*
2813                          * add page to end of list
2814                          */
2815                         expp->p_prev = pp_first->p_prev;
2816                         expp->p_next = pp_first;
2817                         expp->p_prev->p_next = expp;
2818                         pp_first->p_prev = expp;
2819                 }
2820
2821         }
2822         for (i = 0; i < extpages; i++) {
2823                 pplist[i] = pp;
2824                 pp = pp->p_next;
2825         }
2826         nbits = highbit(mattr->dma_attr_addr_hi);
2827         extents = contig ? 1 : minctg;
2828         if (balloon_replace_pages(extents, pplist, nbits, order,
2829             mfnlist) != extents) {
2830                 if (ioalloc_dbg)
2831                         cmn_err(CE_NOTE, "request to hypervisor"
2832                             " for %d pages, maxaddr %" PRIx64 " failed",
2833                             extpages, mattr->dma_attr_addr_hi);
2834                 goto balloon_fail;
2835         }
2836
2837         kmem_free(pplist, extpages * sizeof (page_t *));
2838         kmem_free(mfnlist, extpages * sizeof (mfn_t));
2839         /*
2840          * Return any excess pages to free list
2841          */
2842         if (extpages > minctg) {
2843                 for (i = 0; i < extra; i++) {
2844                         pp = pp_first->p_prev;
2845                         page_sub(&pp_first, pp);
2846                         page_io_unlock(pp);
2847                         page_unresv(1);
2848                         page_free(pp, 1);
2849                 }
2850         }
2851         return (pp_first);
2852 balloon_fail:
2853         /*
2854          * Return pages to free list and return failure
2855          */
2856         while (pp_first != NULL) {
2857                 pp = pp_first;
2858                 page_sub(&pp_first, pp);
2859                 page_io_unlock(pp);
2860                 if (pp->p_vnode != NULL)
2861                         page_hashout(pp, NULL);
2862                 page_free(pp, 1);
2863         }
2864         if (pplist)
2865                 kmem_free(pplist, extpages * sizeof (page_t *));
2866         if (mfnlist)
2867                 kmem_free(mfnlist, extpages * sizeof (mfn_t));
2868         page_unresv(extpages - minctg);
2869         return (NULL);
2870 }
2871
2872 static void
2873 return_partial_alloc(page_t *plist)
2874 {
2875         page_t *pp;
2876
2877         while (plist != NULL) {
2878                 pp = plist;
2879                 page_sub(&plist, pp);
2880                 page_io_unlock(pp);
2881                 page_destroy_io(pp);
2882         }
2883 }
2884
2885 static page_t *
2886 page_get_contigpages(
2887         struct vnode    *vp,
2888         u_offset_t      off,
2889         int             *npagesp,
2890         uint_t          flags,
2891         caddr_t         vaddr,
2892         ddi_dma_attr_t  *mattr)
2893 {
2894         mfn_t   max_mfn = HYPERVISOR_memory_op(XENMEM_maximum_ram_page, NULL);
2895         page_t  *plist; /* list to return */
2896         page_t  *pp, *mcpl;
2897         int     contig, anyaddr, npages, getone = 0;
2898         mfn_t   lo_mfn;
2899         mfn_t   hi_mfn;
2900         pgcnt_t pfnalign = 0;
2901         int     align, sgllen;
2902         uint64_t pfnseg;
2903         pgcnt_t minctg;
2904
2905         npages = *npagesp;
2906         ASSERT(mattr != NULL);
2907         lo_mfn = mmu_btop(mattr->dma_attr_addr_lo);
2908         hi_mfn = mmu_btop(mattr->dma_attr_addr_hi);
2909         sgllen = mattr->dma_attr_sgllen;
2910         pfnseg = mmu_btop(mattr->dma_attr_seg);
2911         align = maxbit(mattr->dma_attr_align, mattr->dma_attr_minxfer);
2912         if (align > MMU_PAGESIZE)
2913                 pfnalign = mmu_btop(align);
2914
2915         contig = flags & PG_PHYSCONTIG;
2916         if (npages == -1) {
2917                 npages = 1;
2918                 pfnalign = 0;
2919         }
2920         /*
2921          * Clear the contig flag if only one page is needed.
2922          */
2923         if (npages == 1) {
2924                 getone = 1;
2925                 contig = 0;
2926         }
2927
2928         /*
2929          * Check if any page in the system is fine.
2930          */
2931         anyaddr = lo_mfn == 0 && hi_mfn >= max_mfn;
2932         if (!contig && anyaddr && !pfnalign) {
2933                 flags &= ~PG_PHYSCONTIG;
2934                 plist = page_create_va(vp, off, npages * MMU_PAGESIZE,
2935                     flags, &kvseg, vaddr);
2936                 if (plist != NULL) {
2937                         *npagesp = 0;
2938                         return (plist);
2939                 }
2940         }
2941         plist = NULL;
2942         minctg = howmany(npages, sgllen);
2943         while (npages > sgllen || getone) {
2944                 if (minctg > npages)
2945                         minctg = npages;
2946                 mcpl = NULL;
2947                 /*
2948                  * We could want contig pages with no address range limits.
2949                  */
2950                 if (anyaddr && contig) {
2951                         /*
2952                          * Look for free contig pages to satisfy the request.
2953                          */
2954                         mcpl = find_contig_free(minctg, flags, pfnseg,
2955                             pfnalign);
2956                 }
2957                 /*
2958                  * Try the reserved io pools next
2959                  */
2960                 if (mcpl == NULL)
2961                         mcpl = page_io_pool_alloc(mattr, contig, minctg);
2962                 if (mcpl != NULL) {
2963                         pp = mcpl;
2964                         do {
2965                                 if (!page_hashin(pp, vp, off, NULL)) {
2966                                         panic("page_get_contigpages:"
2967                                             " hashin failed"
2968                                             " pp %p, vp %p, off %llx",
2969                                             (void *)pp, (void *)vp, off);
2970                                 }
2971                                 off += MMU_PAGESIZE;
2972                                 PP_CLRFREE(pp);
2973                                 PP_CLRAGED(pp);
2974                                 page_set_props(pp, P_REF);
2975                                 page_io_lock(pp);
2976                                 pp = pp->p_next;
2977                         } while (pp != mcpl);
2978                 } else {
2979                         /*
2980                          * Hypervisor exchange doesn't handle segment or
2981                          * alignment constraints
2982                          */
2983                         if (mattr->dma_attr_seg < mattr->dma_attr_addr_hi ||
2984                             pfnalign)
2985                                 goto fail;
2986                         /*
2987                          * Try exchanging pages with the hypervisor
2988                          */
2989                         mcpl = page_swap_with_hypervisor(vp, off, vaddr, mattr,
2990                             flags, minctg);
2991                         if (mcpl == NULL)
2992                                 goto fail;
2993                         off += minctg * MMU_PAGESIZE;
2994                 }
2995                 check_dma(mattr, mcpl, minctg);
2996                 /*
2997                  * Here with a minctg run of contiguous pages, add them to the
2998                  * list we will return for this request.
2999                  */
3000                 page_list_concat(&plist, &mcpl);
3001                 npages -= minctg;
3002                 *npagesp = npages;
3003                 sgllen--;
3004                 if (getone)
3005                         break;
3006         }
3007         return (plist);
3008 fail:
3009         return_partial_alloc(plist);
3010         return (NULL);
3011 }
3012
3013 /*
3014  * Allocator for domain 0 I/O pages. We match the required
3015  * DMA attributes and contiguity constraints.
3016  */
3017 /*ARGSUSED*/
3018 page_t *
3019 page_create_io(
3020         struct vnode    *vp,
3021         u_offset_t      off,
3022         uint_t          bytes,
3023         uint_t          flags,
3024         struct as       *as,
3025         caddr_t         vaddr,
3026         ddi_dma_attr_t  *mattr)
3027 {
3028         page_t  *plist = NULL, *pp;
3029         int     npages = 0, contig, anyaddr, pages_req;
3030         mfn_t   lo_mfn;
3031         mfn_t   hi_mfn;
3032         pgcnt_t pfnalign = 0;
3033         int     align;
3034         int     is_domu = 0;
3035         int     dummy, bytes_got;
3036         mfn_t   max_mfn = HYPERVISOR_memory_op(XENMEM_maximum_ram_page, NULL);
3037
3038         ASSERT(mattr != NULL);
3039         lo_mfn = mmu_btop(mattr->dma_attr_addr_lo);
3040         hi_mfn = mmu_btop(mattr->dma_attr_addr_hi);
3041         align = maxbit(mattr->dma_attr_align, mattr->dma_attr_minxfer);
3042         if (align > MMU_PAGESIZE)
3043                 pfnalign = mmu_btop(align);
3044
3045         /*
3046          * Clear the contig flag if only one page is needed or the scatter
3047          * gather list length is >= npages.
3048          */
3049         pages_req = npages = mmu_btopr(bytes);
3050         contig = (flags & PG_PHYSCONTIG);
3051         bytes = P2ROUNDUP(bytes, MMU_PAGESIZE);
3052         if (bytes == MMU_PAGESIZE || mattr->dma_attr_sgllen >= npages)
3053                 contig = 0;
3054
3055         /*
3056          * Check if any old page in the system is fine.
3057          * DomU should always go down this path.
3058          */
3059         is_domu = !DOMAIN_IS_INITDOMAIN(xen_info);
3060         anyaddr = lo_mfn == 0 && hi_mfn >= max_mfn && !pfnalign;
3061         if ((!contig && anyaddr) || is_domu) {
3062                 flags &= ~PG_PHYSCONTIG;
3063                 plist = page_create_va(vp, off, bytes, flags, &kvseg, vaddr);
3064                 if (plist != NULL)
3065                         return (plist);
3066                 else if (is_domu)
3067                         return (NULL); /* no memory available */
3068         }
3069         /*
3070          * DomU should never reach here
3071          */
3072         if (contig) {
3073                 plist = page_get_contigpages(vp, off, &npages, flags, vaddr,
3074                     mattr);
3075                 if (plist == NULL)
3076                         goto fail;
3077                 bytes_got = (pages_req - npages) << MMU_PAGESHIFT;
3078                 vaddr += bytes_got;
3079                 off += bytes_got;
3080                 /*
3081                  * We now have all the contiguous pages we need, but
3082                  * we may still need additional non-contiguous pages.
3083                  */
3084         }
3085         /*
3086          * now loop collecting the requested number of pages, these do
3087          * not have to be contiguous pages but we will use the contig
3088          * page alloc code to get the pages since it will honor any
3089          * other constraints the pages may have.
3090          */
3091         while (npages--) {
3092                 dummy = -1;
3093                 pp = page_get_contigpages(vp, off, &dummy, flags, vaddr, mattr);
3094                 if (pp == NULL)
3095                         goto fail;
3096                 page_add(&plist, pp);
3097                 vaddr += MMU_PAGESIZE;
3098                 off += MMU_PAGESIZE;
3099         }
3100         return (plist);
3101 fail:
3102         /*
3103          * Failed to get enough pages, return ones we did get
3104          */
3105         return_partial_alloc(plist);
3106         return (NULL);
3107 }
3108
3109 /*
3110  * Lock and return the page with the highest mfn that we can find.  last_mfn
3111  * holds the last one found, so the next search can start from there.  We
3112  * also keep a counter so that we don't loop forever if the machine has no
3113  * free pages.
3114  *
3115  * This is called from the balloon thread to find pages to give away.  new_high
3116  * is used when new mfn's have been added to the system - we will reset our
3117  * search if the new mfn's are higher than our current search position.
3118  */
3119 page_t *
3120 page_get_high_mfn(mfn_t new_high)
3121 {
3122         static mfn_t last_mfn = 0;
3123         pfn_t pfn;
3124         page_t *pp;
3125         ulong_t loop_count = 0;
3126
3127         if (new_high > last_mfn)
3128                 last_mfn = new_high;
3129
3130         for (; loop_count < mfn_count; loop_count++, last_mfn--) {
3131                 if (last_mfn == 0) {
3132                         last_mfn = cached_max_mfn;
3133                 }
3134
3135                 pfn = mfn_to_pfn(last_mfn);
3136                 if (pfn & PFN_IS_FOREIGN_MFN)
3137                         continue;
3138
3139                 /* See if the page is free.  If so, lock it. */
3140                 pp = page_numtopp_alloc(pfn);
3141                 if (pp == NULL)
3142                         continue;
3143                 PP_CLRFREE(pp);
3144
3145                 ASSERT(PAGE_EXCL(pp));
3146                 ASSERT(pp->p_vnode == NULL);
3147                 ASSERT(!hat_page_is_mapped(pp));
3148                 last_mfn--;
3149                 return (pp);
3150         }
3151         return (NULL);
3152 }
3153
3154 #else /* !__xpv */
3155
3156 /*
3157  * get a page from any list with the given mnode
3158  */
3159 static page_t *
3160 page_get_mnode_anylist(ulong_t origbin, uchar_t szc, uint_t flags,
3161     int mnode, int mtype, ddi_dma_attr_t *dma_attr)
3162 {
3163         kmutex_t                *pcm;
3164         int                     i;
3165         page_t                  *pp;
3166         page_t                  *first_pp;
3167         uint64_t                pgaddr;
3168         ulong_t                 bin;
3169         int                     mtypestart;
3170         int                     plw_initialized;
3171         page_list_walker_t      plw;
3172
3173         VM_STAT_ADD(pga_vmstats.pgma_alloc);
3174
3175         ASSERT((flags & PG_MATCH_COLOR) == 0);
3176         ASSERT(szc == 0);
3177         ASSERT(dma_attr != NULL);
3178
3179         MTYPE_START(mnode, mtype, flags);
3180         if (mtype < 0) {
3181                 VM_STAT_ADD(pga_vmstats.pgma_allocempty);
3182                 return (NULL);
3183         }
3184
3185         mtypestart = mtype;
3186
3187         bin = origbin;
3188
3189         /*
3190          * check up to page_colors + 1 bins - origbin may be checked twice
3191          * because of BIN_STEP skip
3192          */
3193         do {
3194                 plw_initialized = 0;
3195
3196                 for (plw.plw_count = 0;
3197                     plw.plw_count < page_colors; plw.plw_count++) {
3198
3199                         if (PAGE_FREELISTS(mnode, szc, bin, mtype) == NULL)
3200                                 goto nextfreebin;
3201
3202                         pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST);
3203                         mutex_enter(pcm);
3204                         pp = PAGE_FREELISTS(mnode, szc, bin, mtype);
3205                         first_pp = pp;
3206                         while (pp != NULL) {
3207                                 if (IS_DUMP_PAGE(pp) || page_trylock(pp,
3208                                     SE_EXCL) == 0) {
3209                                         pp = pp->p_next;
3210                                         if (pp == first_pp) {
3211                                                 pp = NULL;
3212                                         }
3213                                         continue;
3214                                 }
3215
3216                                 ASSERT(PP_ISFREE(pp));
3217                                 ASSERT(PP_ISAGED(pp));
3218                                 ASSERT(pp->p_vnode == NULL);
3219                                 ASSERT(pp->p_hash == NULL);
3220                                 ASSERT(pp->p_offset == (u_offset_t)-1);
3221                                 ASSERT(pp->p_szc == szc);
3222                                 ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);
3223                                 /* check if page within DMA attributes */
3224                                 pgaddr = pa_to_ma(pfn_to_pa(pp->p_pagenum));
3225                                 if ((pgaddr >= dma_attr->dma_attr_addr_lo) &&
3226                                     (pgaddr + MMU_PAGESIZE - 1 <=
3227                                     dma_attr->dma_attr_addr_hi)) {
3228                                         break;
3229                                 }
3230
3231                                 /* continue looking */
3232                                 page_unlock(pp);
3233                                 pp = pp->p_next;
3234                                 if (pp == first_pp)
3235                                         pp = NULL;
3236
3237                         }
3238                         if (pp != NULL) {
3239                                 ASSERT(mtype == PP_2_MTYPE(pp));
3240                                 ASSERT(pp->p_szc == 0);
3241
3242                                 /* found a page with specified DMA attributes */
3243                                 page_sub(&PAGE_FREELISTS(mnode, szc, bin,
3244                                     mtype), pp);
3245                                 page_ctr_sub(mnode, mtype, pp, PG_FREE_LIST);
3246
3247                                 if ((PP_ISFREE(pp) == 0) ||
3248                                     (PP_ISAGED(pp) == 0)) {
3249                                         cmn_err(CE_PANIC, "page %p is not free",
3250                                             (void *)pp);
3251                                 }
3252
3253                                 mutex_exit(pcm);
3254                                 check_dma(dma_attr, pp, 1);
3255                                 VM_STAT_ADD(pga_vmstats.pgma_allocok);
3256                                 return (pp);
3257                         }
3258                         mutex_exit(pcm);
3259 nextfreebin:
3260                         if (plw_initialized == 0) {
3261                                 page_list_walk_init(szc, 0, bin, 1, 0, &plw);
3262                                 ASSERT(plw.plw_ceq_dif == page_colors);
3263                                 plw_initialized = 1;
3264                         }
3265
3266                         if (plw.plw_do_split) {
3267                                 pp = page_freelist_split(szc, bin, mnode,
3268                                     mtype,
3269                                     mmu_btop(dma_attr->dma_attr_addr_lo),
3270                                     mmu_btop(dma_attr->dma_attr_addr_hi + 1),
3271                                     &plw);
3272                                 if (pp != NULL) {
3273                                         check_dma(dma_attr, pp, 1);
3274                                         return (pp);
3275                                 }
3276                         }
3277
3278                         bin = page_list_walk_next_bin(szc, bin, &plw);
3279                 }
3280
3281                 MTYPE_NEXT(mnode, mtype, flags);
3282         } while (mtype >= 0);
3283
3284         /* failed to find a page in the freelist; try it in the cachelist */
3285
3286         /* reset mtype start for cachelist search */
3287         mtype = mtypestart;
3288         ASSERT(mtype >= 0);
3289
3290         /* start with the bin of matching color */
3291         bin = origbin;
3292
3293         do {
3294                 for (i = 0; i <= page_colors; i++) {
3295                         if (PAGE_CACHELISTS(mnode, bin, mtype) == NULL)
3296                                 goto nextcachebin;
3297                         pcm = PC_BIN_MUTEX(mnode, bin, PG_CACHE_LIST);
3298                         mutex_enter(pcm);
3299                         pp = PAGE_CACHELISTS(mnode, bin, mtype);
3300                         first_pp = pp;
3301                         while (pp != NULL) {
3302                                 if (IS_DUMP_PAGE(pp) || page_trylock(pp,
3303                                     SE_EXCL) == 0) {
3304                                         pp = pp->p_next;
3305                                         if (pp == first_pp)
3306                                                 pp = NULL;
3307                                         continue;
3308                                 }
3309                                 ASSERT(pp->p_vnode);
3310                                 ASSERT(PP_ISAGED(pp) == 0);
3311                                 ASSERT(pp->p_szc == 0);
3312                                 ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);
3313
3314                                 /* check if page within DMA attributes */
3315
3316                                 pgaddr = pa_to_ma(pfn_to_pa(pp->p_pagenum));
3317                                 if ((pgaddr >= dma_attr->dma_attr_addr_lo) &&
3318                                     (pgaddr + MMU_PAGESIZE - 1 <=
3319                                     dma_attr->dma_attr_addr_hi)) {
3320                                         break;
3321                                 }
3322
3323                                 /* continue looking */
3324                                 page_unlock(pp);
3325                                 pp = pp->p_next;
3326                                 if (pp == first_pp)
3327                                         pp = NULL;
3328                         }
3329
3330                         if (pp != NULL) {
3331                                 ASSERT(mtype == PP_2_MTYPE(pp));
3332                                 ASSERT(pp->p_szc == 0);
3333
3334                                 /* found a page with specified DMA attributes */
3335                                 page_sub(&PAGE_CACHELISTS(mnode, bin,
3336                                     mtype), pp);
3337                                 page_ctr_sub(mnode, mtype, pp, PG_CACHE_LIST);
3338
3339                                 mutex_exit(pcm);
3340                                 ASSERT(pp->p_vnode);
3341                                 ASSERT(PP_ISAGED(pp) == 0);
3342                                 check_dma(dma_attr, pp, 1);
3343                                 VM_STAT_ADD(pga_vmstats.pgma_allocok);
3344                                 return (pp);
3345                         }
3346                         mutex_exit(pcm);
3347 nextcachebin:
3348                         bin += (i == 0) ? BIN_STEP : 1;
3349                         bin &= page_colors_mask;
3350                 }
3351                 MTYPE_NEXT(mnode, mtype, flags);
3352         } while (mtype >= 0);
3353
3354         VM_STAT_ADD(pga_vmstats.pgma_allocfailed);
3355         return (NULL);
3356 }
3357
3358 /*
3359  * This function is similar to page_get_freelist()/page_get_cachelist()
3360  * but it searches both the lists to find a page with the specified
3361  * color (or no color) and DMA attributes. The search is done in the
3362  * freelist first and then in the cache list within the highest memory
3363  * range (based on DMA attributes) before searching in the lower
3364  * memory ranges.
3365  *
3366  * Note: This function is called only by page_create_io().
3367  */
3368 /*ARGSUSED*/
3369 static page_t *
3370 page_get_anylist(struct vnode *vp, u_offset_t off, struct as *as, caddr_t vaddr,
3371     size_t size, uint_t flags, ddi_dma_attr_t *dma_attr, lgrp_t *lgrp)
3372 {
3373         uint_t          bin;
3374         int             mtype;
3375         page_t          *pp;
3376         int             n;
3377         int             m;
3378         int             szc;
3379         int             fullrange;
3380         int             mnode;
3381         int             local_failed_stat = 0;
3382         lgrp_mnode_cookie_t     lgrp_cookie;
3383
3384         VM_STAT_ADD(pga_vmstats.pga_alloc);
3385
3386         /* only base pagesize currently supported */
3387         if (size != MMU_PAGESIZE)
3388                 return (NULL);
3389
3390         /*
3391          * If we're passed a specific lgroup, we use it.  Otherwise,
3392          * assume first-touch placement is desired.
3393          */
3394         if (!LGRP_EXISTS(lgrp))
3395                 lgrp = lgrp_home_lgrp();
3396
3397         /* LINTED */
3398         AS_2_BIN(as, seg, vp, vaddr, bin, 0);
3399
3400         /*
3401          * Only hold one freelist or cachelist lock at a time, that way we
3402          * can start anywhere and not have to worry about lock
3403          * ordering.
3404          */
3405         if (dma_attr == NULL) {
3406                 n = mtype16m;
3407                 m = mtypetop;
3408                 fullrange = 1;
3409                 VM_STAT_ADD(pga_vmstats.pga_nulldmaattr);
3410         } else {
3411                 pfn_t pfnlo = mmu_btop(dma_attr->dma_attr_addr_lo);
3412                 pfn_t pfnhi = mmu_btop(dma_attr->dma_attr_addr_hi);
3413
3414                 /*
3415                  * We can guarantee alignment only for page boundary.
3416                  */
3417                 if (dma_attr->dma_attr_align > MMU_PAGESIZE)
3418                         return (NULL);
3419
3420                 /* Sanity check the dma_attr */
3421                 if (pfnlo > pfnhi)
3422                         return (NULL);
3423
3424                 n = pfn_2_mtype(pfnlo);
3425                 m = pfn_2_mtype(pfnhi);
3426
3427                 fullrange = ((pfnlo == mnoderanges[n].mnr_pfnlo) &&
3428                     (pfnhi >= mnoderanges[m].mnr_pfnhi));
3429         }
3430         VM_STAT_COND_ADD(fullrange == 0, pga_vmstats.pga_notfullrange);
3431
3432         szc = 0;
3433
3434         /* cylcing thru mtype handled by RANGE0 if n == mtype16m */
3435         if (n == mtype16m) {
3436                 flags |= PGI_MT_RANGE0;
3437                 n = m;
3438         }
3439
3440         /*
3441          * Try local memory node first, but try remote if we can't
3442          * get a page of the right color.
3443          */
3444         LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, LGRP_SRCH_HIER);
3445         while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
3446                 /*
3447                  * allocate pages from high pfn to low.
3448                  */
3449                 mtype = m;
3450                 do {
3451                         if (fullrange != 0) {
3452                                 pp = page_get_mnode_freelist(mnode,
3453                                     bin, mtype, szc, flags);
3454                                 if (pp == NULL) {
3455                                         pp = page_get_mnode_cachelist(
3456                                             bin, flags, mnode, mtype);
3457                                 }
3458                         } else {
3459                                 pp = page_get_mnode_anylist(bin, szc,
3460                                     flags, mnode, mtype, dma_attr);
3461                         }
3462                         if (pp != NULL) {
3463                                 VM_STAT_ADD(pga_vmstats.pga_allocok);
3464                                 check_dma(dma_attr, pp, 1);
3465                                 return (pp);
3466                         }
3467                 } while (mtype != n &&
3468                     (mtype = mnoderanges[mtype].mnr_next) != -1);
3469                 if (!local_failed_stat) {
3470                         lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1);
3471                         local_failed_stat = 1;
3472                 }
3473         }
3474         VM_STAT_ADD(pga_vmstats.pga_allocfailed);
3475
3476         return (NULL);
3477 }
3478
3479 /*
3480  * page_create_io()
3481  *
 * This function is a copy of page_create_va() with an additional
 * argument 'mattr' that specifies DMA memory requirements to
 * the page list functions. This function is used by the segkmem
 * allocator, so it is only used to create new pages (i.e. PG_EXCL
 * is set).
3487  *
3488  * Note: This interface is currently used by x86 PSM only and is
3489  *       not fully specified so the commitment level is only for
3490  *       private interface specific to x86. This interface uses PSM
3491  *       specific page_get_anylist() interface.
3492  */
3493
/*
 * Walk the hash chain page_hash[index] and leave (pp) pointing at the
 * first page whose p_vnode/p_offset equal (vp)/(off), or NULL when the
 * chain holds no such page.
 *
 * The body is wrapped in do { } while (0) so that the macro expands to a
 * single statement and can be followed by a semicolon anywhere a
 * statement is allowed (including an unbraced if/else arm).  Arguments
 * are evaluated more than once, so they must be free of side effects.
 */
#define PAGE_HASH_SEARCH(index, pp, vp, off) do { \
        for ((pp) = page_hash[(index)]; (pp) != NULL; (pp) = (pp)->p_hash) { \
                if ((pp)->p_vnode == (vp) && (pp)->p_offset == (off)) \
                        break; \
        } \
} while (0)
3500
3501
3502 page_t *
3503 page_create_io(
3504         struct vnode    *vp,
3505         u_offset_t      off,
3506         uint_t          bytes,
3507         uint_t          flags,
3508         struct as       *as,
3509         caddr_t         vaddr,
3510         ddi_dma_attr_t  *mattr) /* DMA memory attributes if any */
3511 {
3512         page_t          *plist = NULL;
3513         uint_t          plist_len = 0;
3514         pgcnt_t         npages;
3515         page_t          *npp = NULL;
3516         uint_t          pages_req;
3517         page_t          *pp;
3518         kmutex_t        *phm = NULL;
3519         uint_t          index;
3520
3521         TRACE_4(TR_FAC_VM, TR_PAGE_CREATE_START,
3522             "page_create_start:vp %p off %llx bytes %u flags %x",
3523             vp, off, bytes, flags);
3524
3525         ASSERT((flags & ~(PG_EXCL | PG_WAIT | PG_PHYSCONTIG)) == 0);
3526
3527         pages_req = npages = mmu_btopr(bytes);
3528
3529         /*
3530          * Do the freemem and pcf accounting.
3531          */
3532         if (!page_create_wait(npages, flags)) {
3533                 return (NULL);
3534         }
3535
3536         TRACE_2(TR_FAC_VM, TR_PAGE_CREATE_SUCCESS,
3537             "page_create_success:vp %p off %llx", vp, off);
3538
3539         /*
3540          * If satisfying this request has left us with too little
3541          * memory, start the wheels turning to get some back.  The
3542          * first clause of the test prevents waking up the pageout
3543          * daemon in situations where it would decide that there's
3544          * nothing to do.
3545          */
3546         if (nscan < desscan && freemem < minfree) {
3547                 TRACE_1(TR_FAC_VM, TR_PAGEOUT_CV_SIGNAL,
3548                     "pageout_cv_signal:freemem %ld", freemem);
3549                 WAKE_PAGEOUT_SCANNER(page__create__io);
3550         }
3551
3552         if (flags & PG_PHYSCONTIG) {
3553
3554                 plist = page_get_contigpage(&npages, mattr, 1);
3555                 if (plist == NULL) {
3556                         page_create_putback(npages);
3557                         return (NULL);
3558                 }
3559
3560                 pp = plist;
3561
3562                 do {
3563                         if (!page_hashin(pp, vp, off, NULL)) {
3564                                 panic("pg_creat_io: hashin failed %p %p %llx",
3565                                     (void *)pp, (void *)vp, off);
3566                         }
3567                         VM_STAT_ADD(page_create_new);
3568                         off += MMU_PAGESIZE;
3569                         PP_CLRFREE(pp);
3570                         PP_CLRAGED(pp);
3571                         page_set_props(pp, P_REF);
3572                         pp = pp->p_next;
3573                 } while (pp != plist);
3574
3575                 if (!npages) {
3576                         check_dma(mattr, plist, pages_req);
3577                         return (plist);
3578                 } else {
3579                         vaddr += (pages_req - npages) << MMU_PAGESHIFT;
3580                 }
3581
3582                 /*
3583                  * fall-thru:
3584                  *
3585                  * page_get_contigpage returns when npages <= sgllen.
3586                  * Grab the rest of the non-contig pages below from anylist.
3587                  */
3588         }
3589
3590         /*
3591          * Loop around collecting the requested number of pages.
3592          * Most of the time, we have to `create' a new page. With
3593          * this in mind, pull the page off the free list before
3594          * getting the hash lock.  This will minimize the hash
3595          * lock hold time, nesting, and the like.  If it turns
3596          * out we don't need the page, we put it back at the end.
3597          */
3598         while (npages--) {
3599                 phm = NULL;
3600
3601                 index = PAGE_HASH_FUNC(vp, off);
3602 top:
3603                 ASSERT(phm == NULL);
3604                 ASSERT(index == PAGE_HASH_FUNC(vp, off));
3605                 ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp)));
3606
3607                 if (npp == NULL) {
3608                         /*
3609                          * Try to get the page of any color either from
3610                          * the freelist or from the cache list.
3611                          */
3612                         npp = page_get_anylist(vp, off, as, vaddr, MMU_PAGESIZE,
3613                             flags & ~PG_MATCH_COLOR, mattr, NULL);
3614                         if (npp == NULL) {
3615                                 if (mattr == NULL) {
3616                                         /*
3617                                          * Not looking for a special page;
3618                                          * panic!
3619                                          */
3620                                         panic("no page found %d", (int)npages);
3621                                 }
3622                                 /*
3623                                  * No page found! This can happen
3624                                  * if we are looking for a page
3625                                  * within a specific memory range
3626                                  * for DMA purposes. If PG_WAIT is
3627                                  * specified then we wait for a
3628                                  * while and then try again. The
3629                                  * wait could be forever if we
3630                                  * don't get the page(s) we need.
3631                                  *
3632                                  * Note: XXX We really need a mechanism
3633                                  * to wait for pages in the desired
3634                                  * range. For now, we wait for any
3635                                  * pages and see if we can use it.
3636                                  */
3637
3638                                 if ((mattr != NULL) && (flags & PG_WAIT)) {
3639                                         delay(10);
3640                                         goto top;
3641                                 }
3642                                 goto fail; /* undo accounting stuff */
3643                         }
3644
3645                         if (PP_ISAGED(npp) == 0) {
3646                                 /*
3647                                  * Since this page came from the
3648                                  * cachelist, we must destroy the
3649                                  * old vnode association.
3650                                  */
3651                                 page_hashout(npp, (kmutex_t *)NULL);
3652                         }
3653                 }
3654
3655                 /*
3656                  * We own this page!
3657                  */
3658                 ASSERT(PAGE_EXCL(npp));
3659                 ASSERT(npp->p_vnode == NULL);
3660                 ASSERT(!hat_page_is_mapped(npp));
3661                 PP_CLRFREE(npp);
3662                 PP_CLRAGED(npp);
3663
3664                 /*
3665                  * Here we have a page in our hot little mits and are
3666                  * just waiting to stuff it on the appropriate lists.
3667                  * Get the mutex and check to see if it really does
3668                  * not exist.
3669                  */
3670                 phm = PAGE_HASH_MUTEX(index);
3671                 mutex_enter(phm);
3672                 PAGE_HASH_SEARCH(index, pp, vp, off);
3673                 if (pp == NULL) {
3674                         VM_STAT_ADD(page_create_new);
3675                         pp = npp;
3676                         npp = NULL;
3677                         if (!page_hashin(pp, vp, off, phm)) {
3678                                 /*
3679                                  * Since we hold the page hash mutex and
3680                                  * just searched for this page, page_hashin
3681                                  * had better not fail.  If it does, that
3682                                  * means somethread did not follow the
3683                                  * page hash mutex rules.  Panic now and
3684                                  * get it over with.  As usual, go down
3685                                  * holding all the locks.
3686                                  */
3687                                 ASSERT(MUTEX_HELD(phm));
3688                                 panic("page_create: hashin fail %p %p %llx %p",
3689                                     (void *)pp, (void *)vp, off, (void *)phm);
3690
3691                         }
3692                         ASSERT(MUTEX_HELD(phm));
3693                         mutex_exit(phm);
3694                         phm = NULL;
3695
3696                         /*
3697                          * Hat layer locking need not be done to set
3698                          * the following bits since the page is not hashed
3699                          * and was on the free list (i.e., had no mappings).
3700                          *
3701                          * Set the reference bit to protect
3702                          * against immediate pageout
3703                          *
3704                          * XXXmh modify freelist code to set reference
3705                          * bit so we don't have to do it here.
3706                          */
3707                         page_set_props(pp, P_REF);
3708                 } else {
3709                         ASSERT(MUTEX_HELD(phm));
3710                         mutex_exit(phm);
3711                         phm = NULL;
3712                         /*
3713                          * NOTE: This should not happen for pages associated
3714                          *       with kernel vnode 'kvp'.
3715                          */
3716                         /* XX64 - to debug why this happens! */
3717                         ASSERT(!VN_ISKAS(vp));
3718                         if (VN_ISKAS(vp))
3719                                 cmn_err(CE_NOTE,
3720                                     "page_create: page not expected "
3721                                     "in hash list for kernel vnode - pp 0x%p",
3722                                     (void *)pp);
3723                         VM_STAT_ADD(page_create_exists);
3724                         goto fail;
3725                 }
3726
3727                 /*
3728                  * Got a page!  It is locked.  Acquire the i/o
3729                  * lock since we are going to use the p_next and
3730                  * p_prev fields to link the requested pages together.
3731                  */
3732                 page_io_lock(pp);
3733                 page_add(&plist, pp);
3734                 plist = plist->p_next;
3735                 off += MMU_PAGESIZE;
3736                 vaddr += MMU_PAGESIZE;
3737         }
3738
3739         check_dma(mattr, plist, pages_req);
3740         return (plist);
3741
3742 fail:
3743         if (npp != NULL) {
3744                 /*
3745                  * Did not need this page after all.
3746                  * Put it back on the free list.
3747                  */
3748                 VM_STAT_ADD(page_create_putbacks);
3749                 PP_SETFREE(npp);
3750                 PP_SETAGED(npp);
3751                 npp->p_offset = (u_offset_t)-1;
3752                 page_list_add(npp, PG_FREE_LIST | PG_LIST_TAIL);
3753                 page_unlock(npp);
3754         }
3755
3756         /*
3757          * Give up the pages we already got.
3758          */
3759         while (plist != NULL) {
3760                 pp = plist;
3761                 page_sub(&plist, pp);
3762                 page_io_unlock(pp);
3763                 plist_len++;
3764                 /*LINTED: constant in conditional ctx*/
3765                 VN_DISPOSE(pp, B_INVAL, 0, kcred);
3766         }
3767
3768         /*
3769          * VN_DISPOSE does freemem accounting for the pages in plist
3770          * by calling page_free. So, we need to undo the pcf accounting
3771          * for only the remaining pages.
3772          */
3773         VM_STAT_ADD(page_create_putbacks);
3774         page_create_putback(pages_req - plist_len);
3775
3776         return (NULL);
3777 }
3778 #endif /* !__xpv */
3779
3780
3781 /*
3782  * Copy the data from the physical page represented by "frompp" to
3783  * that represented by "topp". ppcopy uses CPU->cpu_caddr1 and
3784  * CPU->cpu_caddr2.  It assumes that no one uses either map at interrupt
3785  * level and no one sleeps with an active mapping there.
3786  *
3787  * Note that the ref/mod bits in the page_t's are not affected by
3788  * this operation, hence it is up to the caller to update them appropriately.
3789  */
3790 int
3791 ppcopy(page_t *frompp, page_t *topp)
3792 {
3793         caddr_t         pp_addr1;
3794         caddr_t         pp_addr2;
3795         hat_mempte_t    pte1;
3796         hat_mempte_t    pte2;
3797         label_t         ljb;
3798         int             ret;
3799
3800         ASSERT_STACK_ALIGNED();
3801         ASSERT(PAGE_LOCKED(frompp));
3802         ASSERT(PAGE_LOCKED(topp));
3803
3804         if (kpm_enable) {
3805                 pp_addr1 = hat_kpm_page2va(frompp, 0);
3806                 pp_addr2 = hat_kpm_page2va(topp, 0);
3807                 kpreempt_disable();
3808         } else {
3809                 /*
3810                  * disable pre-emption so that CPU can't change
3811                  */
3812                 kpreempt_disable();
3813
3814                 pp_addr1 = CPU->cpu_caddr1;
3815                 pp_addr2 = CPU->cpu_caddr2;
3816                 pte1 = CPU->cpu_caddr1pte;
3817                 pte2 = CPU->cpu_caddr2pte;
3818
3819                 mutex_enter(&CPU->cpu_ppaddr_mutex);
3820
3821                 hat_mempte_remap(page_pptonum(frompp), pp_addr1, pte1,
3822                     PROT_READ | HAT_STORECACHING_OK, HAT_LOAD_NOCONSIST);
3823                 hat_mempte_remap(page_pptonum(topp), pp_addr2, pte2,
3824                     PROT_READ | PROT_WRITE | HAT_STORECACHING_OK,
3825                     HAT_LOAD_NOCONSIST);
3826         }
3827
3828         if (on_fault(&ljb)) {
3829                 ret = 0;
3830                 goto faulted;
3831         } else {
3832                 ret = 1;
3833         }
3834         if (use_sse_pagecopy)
3835 #ifdef __xpv
3836                 page_copy_no_xmm(pp_addr2, pp_addr1);
3837 #else
3838                 hwblkpagecopy(pp_addr1, pp_addr2);
3839 #endif
3840         else
3841                 bcopy(pp_addr1, pp_addr2, PAGESIZE);
3842
3843         no_fault();
3844 faulted:
3845         if (!kpm_enable) {
3846 #ifdef __xpv
3847                 /*
3848                  * We can't leave unused mappings laying about under the
3849                  * hypervisor, so blow them away.
3850                  */
3851                 if (HYPERVISOR_update_va_mapping((uintptr_t)pp_addr1, 0,
3852                     UVMF_INVLPG | UVMF_LOCAL) < 0)
3853                         panic("HYPERVISOR_update_va_mapping() failed");
3854                 if (HYPERVISOR_update_va_mapping((uintptr_t)pp_addr2, 0,
3855                     UVMF_INVLPG | UVMF_LOCAL) < 0)
3856                         panic("HYPERVISOR_update_va_mapping() failed");
3857 #endif
3858                 mutex_exit(&CPU->cpu_ppaddr_mutex);
3859         }
3860         kpreempt_enable();
3861         return (ret);
3862 }
3863
3864 void
3865 pagezero(page_t *pp, uint_t off, uint_t len)
3866 {
3867         ASSERT(PAGE_LOCKED(pp));
3868         pfnzero(page_pptonum(pp), off, len);
3869 }
3870
3871 /*
3872  * Zero the physical page from off to off + len given by pfn
3873  * without changing the reference and modified bits of page.
3874  *
3875  * We use this using CPU private page address #2, see ppcopy() for more info.
3876  * pfnzero() must not be called at interrupt level.
3877  */
3878 void
3879 pfnzero(pfn_t pfn, uint_t off, uint_t len)
3880 {
3881         caddr_t         pp_addr2;
3882         hat_mempte_t    pte2;
3883         kmutex_t        *ppaddr_mutex = NULL;
3884
3885         ASSERT_STACK_ALIGNED();
3886         ASSERT(len <= MMU_PAGESIZE);
3887         ASSERT(off <= MMU_PAGESIZE);
3888         ASSERT(off + len <= MMU_PAGESIZE);
3889
3890         if (kpm_enable && !pfn_is_foreign(pfn)) {
3891                 pp_addr2 = hat_kpm_pfn2va(pfn);
3892                 kpreempt_disable();
3893         } else {
3894                 kpreempt_disable();
3895
3896                 pp_addr2 = CPU->cpu_caddr2;
3897                 pte2 = CPU->cpu_caddr2pte;
3898
3899                 ppaddr_mutex = &CPU->cpu_ppaddr_mutex;
3900                 mutex_enter(ppaddr_mutex);
3901
3902                 hat_mempte_remap(pfn, pp_addr2, pte2,
3903                     PROT_READ | PROT_WRITE | HAT_STORECACHING_OK,
3904                     HAT_LOAD_NOCONSIST);
3905         }
3906
3907         if (use_sse_pagezero) {
3908 #ifdef __xpv
3909                 uint_t rem;
3910
3911                 /*
3912                  * zero a byte at a time until properly aligned for
3913                  * block_zero_no_xmm().
3914                  */
3915                 while (!P2NPHASE(off, ((uint_t)BLOCKZEROALIGN)) && len-- > 0)
3916                         pp_addr2[off++] = 0;
3917
3918                 /*
3919                  * Now use faster block_zero_no_xmm() for any range
3920                  * that is properly aligned and sized.
3921                  */
3922                 rem = P2PHASE(len, ((uint_t)BLOCKZEROALIGN));
3923                 len -= rem;
3924                 if (len != 0) {
3925                         block_zero_no_xmm(pp_addr2 + off, len);
3926                         off += len;
3927                 }
3928
3929                 /*
3930                  * zero remainder with byte stores.
3931                  */
3932                 while (rem-- > 0)
3933                         pp_addr2[off++] = 0;
3934 #else
3935                 hwblkclr(pp_addr2 + off, len);
3936 #endif
3937         } else {
3938                 bzero(pp_addr2 + off, len);
3939         }
3940
3941         if (!kpm_enable || pfn_is_foreign(pfn)) {
3942 #ifdef __xpv
3943                 /*
3944                  * On the hypervisor this page might get used for a page
3945                  * table before any intervening change to this mapping,
3946                  * so blow it away.
3947                  */
3948                 if (HYPERVISOR_update_va_mapping((uintptr_t)pp_addr2, 0,
3949                     UVMF_INVLPG) < 0)
3950                         panic("HYPERVISOR_update_va_mapping() failed");
3951 #endif
3952                 mutex_exit(ppaddr_mutex);
3953         }
3954
3955         kpreempt_enable();
3956 }
3957
3958 /*
3959  * Platform-dependent page scrub call.
3960  */
3961 void
3962 pagescrub(page_t *pp, uint_t off, uint_t len)
3963 {
3964         /*
3965          * For now, we rely on the fact that pagezero() will
3966          * always clear UEs.
3967          */
3968         pagezero(pp, off, len);
3969 }
3970
3971 /*
3972  * set up two private addresses for use on a given CPU for use in ppcopy()
3973  */
3974 void
3975 setup_vaddr_for_ppcopy(struct cpu *cpup)
3976 {
3977         void *addr;
3978         hat_mempte_t pte_pa;
3979
3980         addr = vmem_alloc(heap_arena, mmu_ptob(1), VM_SLEEP);
3981         pte_pa = hat_mempte_setup(addr);
3982         cpup->cpu_caddr1 = addr;
3983         cpup->cpu_caddr1pte = pte_pa;
3984
3985         addr = vmem_alloc(heap_arena, mmu_ptob(1), VM_SLEEP);
3986         pte_pa = hat_mempte_setup(addr);
3987         cpup->cpu_caddr2 = addr;
3988         cpup->cpu_caddr2pte = pte_pa;
3989
3990         mutex_init(&cpup->cpu_ppaddr_mutex, NULL, MUTEX_DEFAULT, NULL);
3991 }
3992
3993 /*
3994  * Undo setup_vaddr_for_ppcopy
3995  */
3996 void
3997 teardown_vaddr_for_ppcopy(struct cpu *cpup)
3998 {
3999         mutex_destroy(&cpup->cpu_ppaddr_mutex);
4000
4001         hat_mempte_release(cpup->cpu_caddr2, cpup->cpu_caddr2pte);
4002         cpup->cpu_caddr2pte = 0;
4003         vmem_free(heap_arena, cpup->cpu_caddr2, mmu_ptob(1));
4004         cpup->cpu_caddr2 = 0;
4005
4006         hat_mempte_release(cpup->cpu_caddr1, cpup->cpu_caddr1pte);
4007         cpup->cpu_caddr1pte = 0;
4008         vmem_free(heap_arena, cpup->cpu_caddr1, mmu_ptob(1));
4009         cpup->cpu_caddr1 = 0;
4010 }
4011
/*
 * Function for flushing D-cache when performing module relocations
 * to an alternate mapping.  Unnecessary on Intel / AMD platforms, so
 * this is intentionally a no-op.
 */
void
dcache_flushall(void)
{
}
4019
4020 /*
4021  * Allocate a memory page.  The argument 'seed' can be any pseudo-random
4022  * number to vary where the pages come from.  This is quite a hacked up
4023  * method -- it works for now, but really needs to be fixed up a bit.
4024  *
4025  * We currently use page_create_va() on the kvp with fake offsets,
4026  * segments and virt address.  This is pretty bogus, but was copied from the
4027  * old hat_i86.c code.  A better approach would be to specify either mnode
4028  * random or mnode local and takes a page from whatever color has the MOST
4029  * available - this would have a minimal impact on page coloring.
4030  */
4031 page_t *
4032 page_get_physical(uintptr_t seed)
4033 {
4034         page_t *pp;
4035         u_offset_t offset;
4036         static struct seg tmpseg;
4037         static uintptr_t ctr = 0;
4038
4039         /*
4040          * This code is gross, we really need a simpler page allocator.
4041          *
4042          * We need to assign an offset for the page to call page_create_va()
4043          * To avoid conflicts with other pages, we get creative with the offset.
4044          * For 32 bits, we need an offset > 4Gig
4045          * For 64 bits, need an offset somewhere in the VA hole.
4046          */
4047         offset = seed;
4048         if (offset > kernelbase)
4049                 offset -= kernelbase;
4050         offset <<= MMU_PAGESHIFT;
4051         offset += mmu.hole_start;       /* something in VA hole */
4052
4053         if (page_resv(1, KM_NOSLEEP) == 0)
4054                 return (NULL);
4055
4056 #ifdef  DEBUG
4057         pp = page_exists(&kvp, offset);
4058         if (pp != NULL)
4059                 panic("page already exists %p", (void *)pp);
4060 #endif
4061
4062         pp = page_create_va(&kvp, offset, MMU_PAGESIZE, PG_EXCL,
4063             &tmpseg, (caddr_t)(ctr += MMU_PAGESIZE));   /* changing VA usage */
4064         if (pp != NULL) {
4065                 page_io_unlock(pp);
4066                 page_downgrade(pp);
4067         }
4068         return (pp);
4069 }