usr/src/uts/common/os/mem_config.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  23  * Use is subject to license terms.
  24  */
  25
  26 #include <sys/types.h>
  27 #include <sys/cmn_err.h>
  28 #include <sys/vmem.h>
  29 #include <sys/kmem.h>
  30 #include <sys/systm.h>
  31 #include <sys/machsystm.h>      /* for page_freelist_coalesce() */
  32 #include <sys/errno.h>
  33 #include <sys/memnode.h>
  34 #include <sys/memlist.h>
  35 #include <sys/memlist_impl.h>
  36 #include <sys/tuneable.h>
  37 #include <sys/proc.h>
  38 #include <sys/disp.h>
  39 #include <sys/debug.h>
  40 #include <sys/vm.h>
  41 #include <sys/callb.h>
  42 #include <sys/memlist_plat.h>   /* for installed_top_size() */
  43 #include <sys/condvar_impl.h>   /* for CV_HAS_WAITERS() */
  44 #include <sys/dumphdr.h>        /* for dump_resize() */
  45 #include <sys/atomic.h>         /* for use in stats collection */
  46 #include <sys/rwlock.h>
  47 #include <sys/cpuvar.h>
  48 #include <vm/seg_kmem.h>
  49 #include <vm/seg_kpm.h>
  50 #include <vm/page.h>
  51 #include <vm/vm_dep.h>
  52 #define SUNDDI_IMPL             /* so sunddi.h will not redefine splx() et al */
  53 #include <sys/sunddi.h>
  54 #include <sys/mem_config.h>
  55 #include <sys/mem_cage.h>
  56 #include <sys/lgrp.h>
  57 #include <sys/ddi.h>
  58 #include <sys/modctl.h>
  59
  60 extern struct memlist *phys_avail;
  61
  62 extern uint_t page_ctrs_adjust(int);
  63 void page_ctrs_cleanup(void);
  64 static void kphysm_setup_post_add(pgcnt_t);
  65 static int kphysm_setup_pre_del(pgcnt_t);
  66 static void kphysm_setup_post_del(pgcnt_t, int);
  67
  68 static int kphysm_split_memseg(pfn_t base, pgcnt_t npgs);
  69
  70 static int delspan_reserve(pfn_t, pgcnt_t);
  71 static void delspan_unreserve(pfn_t, pgcnt_t);
  72
  73 kmutex_t memseg_lists_lock;     /* held by memseg_reuse() while it walks/edits memseg_va_avail */
  74 struct memseg *memseg_va_avail; /* memsegs available for re-use, chained through lnext */
  75 struct memseg *memseg_alloc(void);
  76 static struct memseg *memseg_delete_junk;       /* NOTE(review): not used in the visible part of the file */
  77 static struct memseg *memseg_edit_junk;         /* NOTE(review): not used in the visible part of the file */
  78 void memseg_remap_init(void);
  79 static void memseg_remap_to_dummy(struct memseg *);
  80 static void kphysm_addmem_error_undospan(pfn_t, pgcnt_t);
  81 static struct memseg *memseg_reuse(pgcnt_t);
  82
  83 static struct kmem_cache *memseg_cache; /* presumably backs memseg_alloc() -- TODO confirm */
  84
  85 /*
  86  * Interfaces to manage externally allocated
  87  * page_t memory (metadata) for a memseg.
  88  */
  89 #pragma weak    memseg_alloc_meta
  90 #pragma weak    memseg_free_meta
  91 #pragma weak    memseg_get_metapfn
  92 #pragma weak    memseg_remap_meta
  93
  94 extern int ppvm_enable;
  95 extern page_t *ppvm_base;
  96 extern int memseg_alloc_meta(pfn_t, pgcnt_t, void **, pgcnt_t *);
  97 extern void memseg_free_meta(void *, pgcnt_t);
  98 extern pfn_t memseg_get_metapfn(void *, pgcnt_t);
  99 extern void memseg_remap_meta(struct memseg *);
 100 static int memseg_is_dynamic(struct memseg *);
 101 static int memseg_includes_meta(struct memseg *);
 102 pfn_t memseg_get_start(struct memseg *);
 103 static void memseg_cpu_vm_flush(void);
 104
 105 int meta_alloc_enable;  /* non-zero: kphysm_add_memory_dynamic() tries memseg_alloc_meta() first */
 106
     /*
      * MEMSEG_DEBUG() forwards its arguments to printf() when memseg_debug
      * is non-zero on DEBUG builds, and expands to nothing otherwise.
      * NOTE(review): the DEBUG expansion is a bare `if' statement, so using
      * it as the unbraced body of an if/else would capture the else.
      */
 107 #ifdef  DEBUG
 108 static int memseg_debug;
 109 #define MEMSEG_DEBUG(args...) if (memseg_debug) printf(args)
 110 #else
 111 #define MEMSEG_DEBUG(...)
 112 #endif
 113
 114 /*
 115  * Add a chunk of memory to the system.
 116  * base: starting PAGESIZE page of new memory.
 117  * npgs: length in PAGESIZE pages.
 118  *
 119  * Adding mem this way doesn't increase the size of the hash tables;
 120  * growing them would be too hard.  This should be OK, but adding memory
 121  * dynamically most likely means more hash misses, since the tables will
 122  * be smaller than they otherwise would be.
 123  */
 124 int
 125 kphysm_add_memory_dynamic(pfn_t base, pgcnt_t npgs)
 126 {
 127         page_t *pp;
 128         page_t          *opp, *oepp, *segpp;
 129         struct memseg   *seg;
 130         uint64_t        avmem;
 131         pfn_t           pfn;
 132         pfn_t           pt_base = base;
 133         pgcnt_t         tpgs = npgs;
 134         pgcnt_t         metapgs = 0;
 135         int             exhausted;
 136         pfn_t           pnum;
 137         int             mnode;
 138         caddr_t         vaddr;
 139         int             reuse;
 140         int             mlret;
 141         int             rv;
 142         int             flags;
 143         int             meta_alloc = 0;
 144         void            *mapva;
 145         void            *metabase = (void *)base;
 146         pgcnt_t         nkpmpgs = 0;
 147         offset_t        kpm_pages_off;
 148
 149         cmn_err(CE_CONT,
 150             "?kphysm_add_memory_dynamic: adding %ldK at 0x%" PRIx64 "\n",
 151             npgs << (PAGESHIFT - 10), (uint64_t)base << PAGESHIFT);
 152
 153         /*
 154          * Add this span in the delete list to prevent interactions.
 155          */
 156         if (!delspan_reserve(base, npgs)) {
 157                 return (KPHYSM_ESPAN);
 158         }
 159         /*
 160          * Check to see if any of the memory span has been added
 161          * by trying an add to the installed memory list. This
 162          * forms the interlocking process for add.
 163          */
 164
 165         memlist_write_lock();
 166
 167         mlret = memlist_add_span((uint64_t)(pt_base) << PAGESHIFT,
 168             (uint64_t)(tpgs) << PAGESHIFT, &phys_install);
 169
 170         if (mlret == MEML_SPANOP_OK)
 171                 installed_top_size(phys_install, &physmax, &physinstalled);
 172
 173         memlist_write_unlock();
 174
 175         if (mlret != MEML_SPANOP_OK) {
 176                 if (mlret == MEML_SPANOP_EALLOC) {
 177                         delspan_unreserve(pt_base, tpgs);
 178                         return (KPHYSM_ERESOURCE);
 179                 } else if (mlret == MEML_SPANOP_ESPAN) {
 180                         delspan_unreserve(pt_base, tpgs);
 181                         return (KPHYSM_ESPAN);
 182                 } else {
 183                         delspan_unreserve(pt_base, tpgs);
 184                         return (KPHYSM_ERESOURCE);
 185                 }
 186         }
 187
 188         if (meta_alloc_enable) {
 189                 /*
 190                  * Allocate the page_t's from existing memory;
 191                  * if that fails, allocate from the incoming memory.
 192                  */
 193                 rv = memseg_alloc_meta(base, npgs, &metabase, &metapgs);
 194                 if (rv == KPHYSM_OK) {
 195                         ASSERT(metapgs);
 196                         ASSERT(btopr(npgs * sizeof (page_t)) <= metapgs);
 197                         meta_alloc = 1;
 198                         goto mapalloc;
 199                 }
 200         }
 201
 202         /*
 203          * We store the page_t's for this new memory in the first
 204          * few pages of the chunk. Here, we go and get'em ...
 205          */
 206
 207         /*
 208          * The expression after the '-' gives the number of pages
 209          * that will fit in the new memory based on a requirement
 210          * of (PAGESIZE + sizeof (page_t)) bytes per page.
 211          */
 212         metapgs = npgs - (((uint64_t)(npgs) << PAGESHIFT) /
 213             (PAGESIZE + sizeof (page_t)));
 214
 215         npgs -= metapgs;
 216         base += metapgs;
 217
 218         ASSERT(btopr(npgs * sizeof (page_t)) <= metapgs);
 219
 220         exhausted = (metapgs == 0 || npgs == 0);
 221
 222         if (kpm_enable && !exhausted) {
 223                 pgcnt_t start, end, nkpmpgs_prelim;
 224                 size_t  ptsz;
 225
 226                 /*
 227                  * A viable kpm large page mapping must not overlap two
 228                  * dynamic memsegs. Therefore the total size is checked
 229                  * to be at least kpm_pgsz and also whether start and end
 230                  * points are at least kpm_pgsz aligned.
 231                  */
 232                 if (ptokpmp(tpgs) < 1 || pmodkpmp(pt_base) ||
 233                     pmodkpmp(base + npgs)) {
 234
 235                         kphysm_addmem_error_undospan(pt_base, tpgs);
 236
 237                         /*
 238                          * There is no specific error code for violating
 239                          * kpm granularity constraints.
 240                          */
 241                         return (KPHYSM_ENOTVIABLE);
 242                 }
 243
 244                 start = kpmptop(ptokpmp(base));
 245                 end = kpmptop(ptokpmp(base + npgs));
 246                 nkpmpgs_prelim = ptokpmp(end - start);
 247                 ptsz = npgs * sizeof (page_t);
 248                 metapgs = btopr(ptsz + nkpmpgs_prelim * KPMPAGE_T_SZ);
 249                 exhausted = (tpgs <= metapgs);
 250                 if (!exhausted) {
 251                         npgs = tpgs - metapgs;
 252                         base = pt_base + metapgs;
 253
 254                         /* final nkpmpgs */
 255                         start = kpmptop(ptokpmp(base));
 256                         nkpmpgs = ptokpmp(end - start);
 257                         kpm_pages_off = ptsz +
 258                             (nkpmpgs_prelim - nkpmpgs) * KPMPAGE_T_SZ;
 259                 }
 260         }
 261
 262         /*
 263          * Is memory area supplied too small?
 264          */
 265         if (exhausted) {
 266                 kphysm_addmem_error_undospan(pt_base, tpgs);
 267                 /*
 268                  * There is no specific error code for 'too small'.
 269                  */
 270                 return (KPHYSM_ERESOURCE);
 271         }
 272
 273 mapalloc:
 274         /*
 275          * We may re-use a previously allocated VA space for the page_ts
 276          * eventually, but we need to initialize and lock the pages first.
 277          */
 278
 279         /*
 280          * Get an address in the kernel address map, map
 281          * the page_t pages and see if we can touch them.
 282          */
 283
 284         mapva = vmem_alloc(heap_arena, ptob(metapgs), VM_NOSLEEP);
 285         if (mapva == NULL) {
 286                 cmn_err(CE_WARN, "kphysm_add_memory_dynamic:"
 287                     " Can't allocate VA for page_ts");
 288
 289                 if (meta_alloc)
 290                         memseg_free_meta(metabase, metapgs);
 291                 kphysm_addmem_error_undospan(pt_base, tpgs);
 292
 293                 return (KPHYSM_ERESOURCE);
 294         }
 295         pp = mapva;
 296
 297         if (physmax < (pt_base + tpgs))
 298                 physmax = (pt_base + tpgs);
 299
 300         /*
 301          * In the remapping code we map one page at a time so we must do
 302          * the same here to match mapping sizes.
 303          */
 304         pfn = pt_base;
 305         vaddr = (caddr_t)pp;
 306         for (pnum = 0; pnum < metapgs; pnum++) {
 307                 if (meta_alloc)
 308                         pfn = memseg_get_metapfn(metabase, (pgcnt_t)pnum);
 309                 hat_devload(kas.a_hat, vaddr, ptob(1), pfn,
 310                     PROT_READ | PROT_WRITE,
 311                     HAT_LOAD | HAT_LOAD_LOCK | HAT_LOAD_NOCONSIST);
 312                 pfn++;
 313                 vaddr += ptob(1);
 314         }
 315
 316         if (ddi_peek32((dev_info_t *)NULL,
 317             (int32_t *)pp, (int32_t *)0) == DDI_FAILURE) {
 318
 319                 cmn_err(CE_WARN, "kphysm_add_memory_dynamic:"
 320                     " Can't access pp array at 0x%p [phys 0x%lx]",
 321                     (void *)pp, pt_base);
 322
 323                 hat_unload(kas.a_hat, (caddr_t)pp, ptob(metapgs),
 324                     HAT_UNLOAD_UNMAP|HAT_UNLOAD_UNLOCK);
 325
 326                 vmem_free(heap_arena, mapva, ptob(metapgs));
 327                 if (meta_alloc)
 328                         memseg_free_meta(metabase, metapgs);
 329                 kphysm_addmem_error_undospan(pt_base, tpgs);
 330
 331                 return (KPHYSM_EFAULT);
 332         }
 333
 334         /*
 335          * Add this memory slice to its memory node translation.
 336          *
 337          * Note that right now, each node may have only one slice;
 338          * this may change with COD or in larger SSM systems with
 339          * nested latency groups, so we must not assume that the
 340          * node does not yet exist.
 341          *
 342          * Note that there may be multiple memory nodes associated with
 343          * a single lgrp node on x86 systems.
 344          */
 345         pnum = pt_base + tpgs - 1;
 346         mem_node_add_range(pt_base, pnum);
 347
 348         /*
 349          * Allocate or resize page counters as necessary to accommodate
 350          * the increase in memory pages.
 351          */
 352         mnode = PFN_2_MEM_NODE(pnum);
 353         PAGE_CTRS_ADJUST(base, npgs, rv);
 354         if (rv) {
 355
 356                 mem_node_del_range(pt_base, pnum);
 357
 358                 /* cleanup the  page counters */
 359                 page_ctrs_cleanup();
 360
 361                 hat_unload(kas.a_hat, (caddr_t)pp, ptob(metapgs),
 362                     HAT_UNLOAD_UNMAP|HAT_UNLOAD_UNLOCK);
 363
 364                 vmem_free(heap_arena, mapva, ptob(metapgs));
 365                 if (meta_alloc)
 366                         memseg_free_meta(metabase, metapgs);
 367                 kphysm_addmem_error_undospan(pt_base, tpgs);
 368
 369                 return (KPHYSM_ERESOURCE);
 370         }
 371
 372         /*
 373          * Update the phys_avail memory list.
 374          * The phys_install list was done at the start.
 375          */
 376
 377         memlist_write_lock();
 378
 379         mlret = memlist_add_span((uint64_t)(base) << PAGESHIFT,
 380             (uint64_t)(npgs) << PAGESHIFT, &phys_avail);
 381         ASSERT(mlret == MEML_SPANOP_OK);
 382
 383         memlist_write_unlock();
 384
 385         /* See if we can find a memseg to re-use. */
 386         if (meta_alloc) {
 387                 seg = memseg_reuse(0);
 388                 reuse = 1;      /* force unmapping of temp mapva */
 389                 flags = MEMSEG_DYNAMIC | MEMSEG_META_ALLOC;
 390                 /*
 391                  * There is a 1:1 fixed relationship between a pfn
 392                  * and a page_t VA.  The pfn is used as an index into
 393                  * the ppvm_base page_t table in order to calculate
 394                  * the page_t base address for a given pfn range.
 395                  */
 396                 segpp = ppvm_base + base;
 397         } else {
 398                 seg = memseg_reuse(metapgs);
 399                 reuse = (seg != NULL);
 400                 flags = MEMSEG_DYNAMIC | MEMSEG_META_INCL;
 401                 segpp = pp;
 402         }
 403
 404         /*
 405          * Initialize the memseg structure representing this memory
 406          * and add it to the existing list of memsegs. Do some basic
 407          * initialization and add the memory to the system.
 408          * In order to prevent lock deadlocks, the add_physmem()
 409          * code is repeated here, but split into several stages.
 410          *
 411          * If a memseg is reused, invalidate memseg pointers in
 412          * all cpu vm caches.  We need to do this this since the check
 413          *      pp >= seg->pages && pp < seg->epages
 414          * used in various places is not atomic and so the first compare
 415          * can happen before reuse and the second compare after reuse.
 416          * The invalidation ensures that a memseg is not deferenced while
 417          * it's page/pfn pointers are changing.
 418          */
 419         if (seg == NULL) {
 420                 seg = memseg_alloc();
 421                 ASSERT(seg != NULL);
 422                 seg->msegflags = flags;
 423                 MEMSEG_DEBUG("memseg_get: alloc seg=0x%p, pages=0x%p",
 424                     (void *)seg, (void *)(seg->pages));
 425                 seg->pages = segpp;
 426         } else {
 427                 ASSERT(seg->msegflags == flags);
 428                 ASSERT(seg->pages_base == seg->pages_end);
 429                 MEMSEG_DEBUG("memseg_get: reuse seg=0x%p, pages=0x%p",
 430                     (void *)seg, (void *)(seg->pages));
 431                 if (meta_alloc) {
 432                         memseg_cpu_vm_flush();
 433                         seg->pages = segpp;
 434                 }
 435         }
 436
 437         seg->epages = seg->pages + npgs;
 438         seg->pages_base = base;
 439         seg->pages_end = base + npgs;
 440
 441         /*
 442          * Initialize metadata. The page_ts are set to locked state
 443          * ready to be freed.
 444          */
 445         bzero((caddr_t)pp, ptob(metapgs));
 446
 447         pfn = seg->pages_base;
 448         /* Save the original pp base in case we reuse a memseg. */
 449         opp = pp;
 450         oepp = opp + npgs;
 451         for (pp = opp; pp < oepp; pp++) {
 452                 pp->p_pagenum = pfn;
 453                 pfn++;
 454                 page_iolock_init(pp);
 455                 while (!page_lock(pp, SE_EXCL, (kmutex_t *)NULL, P_RECLAIM))
 456                         continue;
 457                 pp->p_offset = (u_offset_t)-1;
 458         }
 459
 460         if (reuse) {
 461                 /* Remap our page_ts to the re-used memseg VA space. */
 462                 pfn = pt_base;
 463                 vaddr = (caddr_t)seg->pages;
 464                 for (pnum = 0; pnum < metapgs; pnum++) {
 465                         if (meta_alloc)
 466                                 pfn = memseg_get_metapfn(metabase,
 467                                     (pgcnt_t)pnum);
 468                         hat_devload(kas.a_hat, vaddr, ptob(1), pfn,
 469                             PROT_READ | PROT_WRITE,
 470                             HAT_LOAD_REMAP | HAT_LOAD | HAT_LOAD_NOCONSIST);
 471                         pfn++;
 472                         vaddr += ptob(1);
 473                 }
 474
 475                 hat_unload(kas.a_hat, (caddr_t)opp, ptob(metapgs),
 476                     HAT_UNLOAD_UNMAP|HAT_UNLOAD_UNLOCK);
 477
 478                 vmem_free(heap_arena, mapva, ptob(metapgs));
 479         }
 480
 481         hat_kpm_addmem_mseg_update(seg, nkpmpgs, kpm_pages_off);
 482
 483         memsegs_lock(1);
 484
 485         /*
 486          * The new memseg is inserted at the beginning of the list.
 487          * Not only does this save searching for the tail, but in the
 488          * case of a re-used memseg, it solves the problem of what
 489          * happens if some process has still got a pointer to the
 490          * memseg and follows the next pointer to continue traversing
 491          * the memsegs list.
 492          */
 493
 494         hat_kpm_addmem_mseg_insert(seg);
 495
 496         seg->next = memsegs;
 497         membar_producer();
 498
 499         hat_kpm_addmem_memsegs_update(seg);
 500
 501         memsegs = seg;
 502
 503         build_pfn_hash();
 504
 505         total_pages += npgs;
 506
 507         /*
 508          * Recalculate the paging parameters now total_pages has changed.
 509          * This will also cause the clock hands to be reset before next use.
 510          */
 511         setupclock(1);
 512
 513         memsegs_unlock(1);
 514
 515         PLCNT_MODIFY_MAX(seg->pages_base, (long)npgs);
 516
 517         /*
 518          * Free the pages outside the lock to avoid locking loops.
 519          */
 520         for (pp = seg->pages; pp < seg->epages; pp++) {
 521                 page_free(pp, 1);
 522         }
 523
 524         /*
 525          * Now that we've updated the appropriate memory lists we
 526          * need to reset a number of globals, since we've increased memory.
 527          * Several have already been updated for us as noted above. The
 528          * globals we're interested in at this point are:
 529          *   physmax - highest page frame number.
 530          *   physinstalled - number of pages currently installed (done earlier)
 531          *   maxmem - max free pages in the system
 532          *   physmem - physical memory pages available
 533          *   availrmem - real memory available
 534          */
 535
 536         mutex_enter(&freemem_lock);
 537         maxmem += npgs;
 538         physmem += npgs;
 539         availrmem += npgs;
 540         availrmem_initial += npgs;
 541
 542         mutex_exit(&freemem_lock);
 543
 544         dump_resize();
 545
 546         page_freelist_coalesce_all(mnode);
 547
 548         kphysm_setup_post_add(npgs);
 549
 550         cmn_err(CE_CONT, "?kphysm_add_memory_dynamic: mem = %ldK "
 551             "(0x%" PRIx64 ")\n",
 552             physinstalled << (PAGESHIFT - 10),
 553             (uint64_t)physinstalled << PAGESHIFT);
 554
 555         avmem = (uint64_t)freemem << PAGESHIFT;
 556         cmn_err(CE_CONT, "?kphysm_add_memory_dynamic: "
 557             "avail mem = %" PRId64 "\n", avmem);
 558
 559         /*
 560          * Update lgroup generation number on single lgroup systems
 561          */
 562         if (nlgrps == 1)
 563                 lgrp_config(LGRP_CONFIG_GEN_UPDATE, 0, 0);
 564
 565         /*
 566          * Inform DDI of update
 567          */
 568         ddi_mem_update((uint64_t)(pt_base) << PAGESHIFT,
 569             (uint64_t)(tpgs) << PAGESHIFT);
 570
 571         delspan_unreserve(pt_base, tpgs);
 572
 573         return (KPHYSM_OK);             /* Successfully added system memory */
 574 }
 575
 576 /*
 577  * There are various error conditions in kphysm_add_memory_dynamic()
 578  * which require a rollback of already changed global state.
 579  */
 580 static void
 581 kphysm_addmem_error_undospan(pfn_t pt_base, pgcnt_t tpgs)
 582 {
 583         int mlret;
 584
 585         /* Unreserve memory span. */
 586         memlist_write_lock();
 587
 588         mlret = memlist_delete_span(
 589             (uint64_t)(pt_base) << PAGESHIFT,
 590             (uint64_t)(tpgs) << PAGESHIFT, &phys_install);
 591
 592         ASSERT(mlret == MEML_SPANOP_OK);
 593         phys_install_has_changed();
 594         installed_top_size(phys_install, &physmax, &physinstalled);
 595
 596         memlist_write_unlock();
 597         delspan_unreserve(pt_base, tpgs);
 598 }
 599
 600 /*
 601  * Only return an available memseg of exactly the right size
 602  * if size is required.
 603  * When the meta data area has it's own virtual address space
 604  * we will need to manage this more carefully and do best fit
 605  * allocations, possibly splitting an available area.
 606  */
 607 struct memseg *
 608 memseg_reuse(pgcnt_t metapgs)
 609 {
 610         int type;
 611         struct memseg **segpp, *seg;
 612
 613         mutex_enter(&memseg_lists_lock);
 614
 615         segpp = &memseg_va_avail;
 616         for (; (seg = *segpp) != NULL; segpp = &seg->lnext) {
 617                 caddr_t end;
 618
 619                 /*
 620                  * Make sure we are reusing the right segment type.
 621                  */
 622                 type = metapgs ? MEMSEG_META_INCL : MEMSEG_META_ALLOC;
 623
 624                 if ((seg->msegflags & (MEMSEG_META_INCL | MEMSEG_META_ALLOC))
 625                     != type)
 626                         continue;
 627
 628                 if (kpm_enable)
 629                         end = hat_kpm_mseg_reuse(seg);
 630                 else
 631                         end = (caddr_t)seg->epages;
 632
 633                 /*
 634                  * Check for the right size if it is provided.
 635                  */
 636                 if (!metapgs || btopr(end - (caddr_t)seg->pages) == metapgs) {
 637                         *segpp = seg->lnext;
 638                         seg->lnext = NULL;
 639                         break;
 640                 }
 641         }
 642         mutex_exit(&memseg_lists_lock);
 643
 644         return (seg);
 645 }
 646
 647 static uint_t handle_gen;       /* pre-incremented under mem_handle_list_mutex to form each new mh_exthandle */
 648
     /*
      * One span of page frames: mds_npgs pages starting at pfn mds_base,
      * chained to further spans through mds_next.  Both bitmap members
      * point at arrays of uint_t words.
      * NOTE(review): presumably each bitmap carries one bit per page and is
      * MDS_BITMAPBYTES() bytes long -- confirm against the allocation site.
      */
 649 struct memdelspan {
 650         struct memdelspan *mds_next;
 651         pfn_t           mds_base;
 652         pgcnt_t         mds_npgs;
 653         uint_t          *mds_bitmap;
 654         uint_t          *mds_bitmap_retired;
 655 };
 656
     /* Bits in one uint_t bitmap word (assuming NBBY is bits per byte). */
 657 #define NBPBMW          (sizeof (uint_t) * NBBY)
     /*
      * Size in bytes of a bitmap with one bit for each of the span's
      * mds_npgs pages, rounded up to a whole number of uint_t words.
      */
 658 #define MDS_BITMAPBYTES(MDSP) \
 659         ((((MDSP)->mds_npgs + NBPBMW - 1) / NBPBMW) * sizeof (uint_t))
 660
     /*
      * List node carrying a chain of memdelspans (trl_spans); nodes link
      * through trl_next.  struct mem_handle embeds one as mh_transit.
      * NOTE(review): the meaning of trl_collect is not visible here.
      */
 661 struct transit_list {
 662         struct transit_list     *trl_next;
 663         struct memdelspan       *trl_spans;
 664         int                     trl_collect;
 665 };
 666
     /* Head of the transit_list chain; trh_lock presumably guards trh_head. */
 667 struct transit_list_head {
 668         kmutex_t                trh_lock;
 669         struct transit_list     *trh_head;
 670 };
 671
 672 static struct transit_list_head transit_list_head;
 673
 674 struct mem_handle;
 675 static void transit_list_collect(struct mem_handle *, int);
 676 static void transit_list_insert(struct transit_list *);
 677 static void transit_list_remove(struct transit_list *);
 678
 679 #ifdef DEBUG
 680 #define MEM_DEL_STATS
 681 #endif /* DEBUG */
 682
 683 #ifdef MEM_DEL_STATS
 684 static int mem_del_stat_print = 0;
     /*
      * Counters kept per mem_handle (embedded as mh_delstat) when
      * MEM_DEL_STATS is defined.  They are updated only through the
      * MDSTAT_* macros below; the two nticks_* members accumulate tick
      * counts, the rest are simple event counts.
      */
 685 struct mem_del_stat {
 686         uint_t  nloop;
 687         uint_t  need_free;
 688         uint_t  free_loop;
 689         uint_t  free_low;
 690         uint_t  free_failed;
 691         uint_t  ncheck;
 692         uint_t  nopaget;
 693         uint_t  lockfail;
 694         uint_t  nfree;
 695         uint_t  nreloc;
 696         uint_t  nrelocfail;
 697         uint_t  already_done;
 698         uint_t  first_notfree;
 699         uint_t  npplocked;
 700         uint_t  nlockreloc;
 701         uint_t  nnorepl;
 702         uint_t  nmodreloc;
 703         uint_t  ndestroy;
 704         uint_t  nputpage;
 705         uint_t  nnoreclaim;
 706         uint_t  ndelay;
 707         uint_t  demotefail;
 708         uint64_t nticks_total;
 709         uint64_t nticks_pgrp;
 710         uint_t  retired;
 711         uint_t  toxic;
 712         uint_t  failing;
 713         uint_t  modtoxic;
 714         uint_t  npplkdtoxic;
 715         uint_t  gptlmodfail;
 716         uint_t  gptllckfail;
 717 };
 718 /*
 719  * The stat values are only incremented in the delete thread
 720  * so no locking or atomic required.
 721  */
 722 #define MDSTAT_INCR(MHP, FLD)   (MHP)->mh_delstat.FLD++
 723 #define MDSTAT_TOTAL(MHP, ntck) ((MHP)->mh_delstat.nticks_total += (ntck))
 724 #define MDSTAT_PGRP(MHP, ntck)  ((MHP)->mh_delstat.nticks_pgrp += (ntck))
 725 static void mem_del_stat_print_func(struct mem_handle *);
 726 #define MDSTAT_PRINT(MHP)       mem_del_stat_print_func((MHP))
 727 #else /* MEM_DEL_STATS */
     /* Statistics disabled: every MDSTAT_* macro expands to nothing. */
 728 #define MDSTAT_INCR(MHP, FLD)
 729 #define MDSTAT_TOTAL(MHP, ntck)
 730 #define MDSTAT_PGRP(MHP, ntck)
 731 #define MDSTAT_PRINT(MHP)
 732 #endif /* MEM_DEL_STATS */
 733
 734 typedef enum mhnd_state {MHND_FREE = 0, MHND_INIT, MHND_STARTING,
 735         MHND_RUNNING, MHND_DONE, MHND_RELEASE} mhnd_state_t;
     /*
      * MHND_FREE is deliberately 0: a handle fresh from kmem_zalloc() is in
      * that state until kphysm_del_gethandle() moves it to MHND_INIT, and
      * kphysm_lookup_mem_handle() never returns a handle that is MHND_FREE.
      */
 736
 737 /*
 738  * mh_mutex must be taken to examine or change mh_exthandle and mh_state.
 739  * The mutex may not be required for other fields, dependent on mh_state.
 740  */
 741 struct mem_handle {
 742         kmutex_t        mh_mutex;
 743         struct mem_handle *mh_next;     /* next handle on the mem_handle_head list */
 744         memhandle_t     mh_exthandle;   /* value handed out to callers; key for lookup */
 745         mhnd_state_t    mh_state;
 746         struct transit_list mh_transit;
 747         pgcnt_t         mh_phys_pages;
 748         pgcnt_t         mh_vm_pages;
 749         pgcnt_t         mh_hold_todo;
 750         void            (*mh_delete_complete)(void *, int error);       /* callback; presumably run when the delete finishes */
 751         void            *mh_delete_complete_arg;        /* presumably the first argument passed to mh_delete_complete */
 752         volatile uint_t mh_cancel;
 753         volatile uint_t mh_dr_aio_cleanup_cancel;
 754         volatile uint_t mh_aio_cleanup_done;
 755         kcondvar_t      mh_cv;
 756         kthread_id_t    mh_thread_id;
 757         page_t          *mh_deleted;    /* link through p_next */
 758 #ifdef MEM_DEL_STATS
 759         struct mem_del_stat mh_delstat; /* updated via the MDSTAT_* macros */
 760 #endif /* MEM_DEL_STATS */
 761 };
 762
 763 static struct mem_handle *mem_handle_head;      /* singly linked list of handles; new ones are pushed at the head */
 764 static kmutex_t mem_handle_list_mutex;          /* protects mem_handle_head, the mh_next links and handle_gen */
 765
 766 static struct mem_handle *
 767 kphysm_allocate_mem_handle()
 768 {
 769         struct mem_handle *mhp;
 770
 771         mhp = kmem_zalloc(sizeof (struct mem_handle), KM_SLEEP);
 772         mutex_init(&mhp->mh_mutex, NULL, MUTEX_DEFAULT, NULL);
 773         mutex_enter(&mem_handle_list_mutex);
 774         mutex_enter(&mhp->mh_mutex);
 775         /* handle_gen is protected by list mutex. */
 776         mhp->mh_exthandle = (memhandle_t)(uintptr_t)(++handle_gen);
 777         mhp->mh_next = mem_handle_head;
 778         mem_handle_head = mhp;
 779         mutex_exit(&mem_handle_list_mutex);
 780
 781         return (mhp);
 782 }
 783
 784 static void
 785 kphysm_free_mem_handle(struct mem_handle *mhp)
 786 {
 787         struct mem_handle **mhpp;
 788
 789         ASSERT(mutex_owned(&mhp->mh_mutex));
 790         ASSERT(mhp->mh_state == MHND_FREE);
 791         /*
 792          * Exit the mutex to preserve locking order. This is OK
 793          * here as once in the FREE state, the handle cannot
 794          * be found by a lookup.
 795          */
 796         mutex_exit(&mhp->mh_mutex);
 797
 798         mutex_enter(&mem_handle_list_mutex);
 799         mhpp = &mem_handle_head;
 800         while (*mhpp != NULL && *mhpp != mhp)
 801                 mhpp = &(*mhpp)->mh_next;
 802         ASSERT(*mhpp == mhp);
 803         /*
 804          * No need to lock the handle (mh_mutex) as only
 805          * mh_next changing and this is the only thread that
 806          * can be referncing mhp.
 807          */
 808         *mhpp = mhp->mh_next;
 809         mutex_exit(&mem_handle_list_mutex);
 810
 811         mutex_destroy(&mhp->mh_mutex);
 812         kmem_free(mhp, sizeof (struct mem_handle));
 813 }
 814
 815 /*
 816  * This function finds the internal mem_handle corresponding to an
 817  * external handle and returns it with the mh_mutex held.
 818  */
 819 static struct mem_handle *
 820 kphysm_lookup_mem_handle(memhandle_t handle)
 821 {
 822         struct mem_handle *mhp;
 823
 824         mutex_enter(&mem_handle_list_mutex);
 825         for (mhp = mem_handle_head; mhp != NULL; mhp = mhp->mh_next) {
 826                 if (mhp->mh_exthandle == handle) {
 827                         mutex_enter(&mhp->mh_mutex);
 828                         /*
 829                          * The state of the handle could have been changed
 830                          * by kphysm_del_release() while waiting for mh_mutex.
 831                          */
 832                         if (mhp->mh_state == MHND_FREE) {
 833                                 mutex_exit(&mhp->mh_mutex);
 834                                 continue;
 835                         }
 836                         break;
 837                 }
 838         }
 839         mutex_exit(&mem_handle_list_mutex);
 840         return (mhp);
 841 }
 842
 843 int
 844 kphysm_del_gethandle(memhandle_t *xmhp)
 845 {
 846         struct mem_handle *mhp;
 847
 848         mhp = kphysm_allocate_mem_handle();
 849         /*
 850          * The handle is allocated using KM_SLEEP, so cannot fail.
 851          * If the implementation is changed, the correct error to return
 852          * here would be KPHYSM_ENOHANDLES.
 853          */
 854         ASSERT(mhp->mh_state == MHND_FREE);
 855         mhp->mh_state = MHND_INIT;
 856         *xmhp = mhp->mh_exthandle;
 857         mutex_exit(&mhp->mh_mutex);
 858         return (KPHYSM_OK);
 859 }
 860
 861 static int
 862 overlapping(pfn_t b1, pgcnt_t l1, pfn_t b2, pgcnt_t l2)
 863 {
 864         pfn_t e1, e2;
 865
 866         e1 = b1 + l1;
 867         e2 = b2 + l2;
 868
 869         return (!(b2 >= e1 || b1 >= e2));
 870 }
 871
 872 static int can_remove_pgs(pgcnt_t);
 873
 874 static struct memdelspan *
 875 span_to_install(pfn_t base, pgcnt_t npgs)
 876 {
 877         struct memdelspan *mdsp;
 878         struct memdelspan *mdsp_new;
 879         uint64_t address, size, thislen;
 880         struct memlist *mlp;
 881
 882         mdsp_new = NULL;
 883
 884         address = (uint64_t)base << PAGESHIFT;
 885         size = (uint64_t)npgs << PAGESHIFT;
 886         while (size != 0) {
 887                 memlist_read_lock();
 888                 for (mlp = phys_install; mlp != NULL; mlp = mlp->ml_next) {
 889                         if (address >= (mlp->ml_address + mlp->ml_size))
 890                                 continue;
 891                         if ((address + size) > mlp->ml_address)
 892                                 break;
 893                 }
 894                 if (mlp == NULL) {
 895                         address += size;
 896                         size = 0;
 897                         thislen = 0;
 898                 } else {
 899                         if (address < mlp->ml_address) {
 900                                 size -= (mlp->ml_address - address);
 901                                 address = mlp->ml_address;
 902                         }
 903                         ASSERT(address >= mlp->ml_address);
 904                         if ((address + size) >
 905                             (mlp->ml_address + mlp->ml_size)) {
 906                                 thislen =
 907                                     mlp->ml_size - (address - mlp->ml_address);
 908                         } else {
 909                                 thislen = size;
 910                         }
 911                 }
 912                 memlist_read_unlock();
 913                 /* TODO: phys_install could change now */
 914                 if (thislen == 0)
 915                         continue;
 916                 mdsp = kmem_zalloc(sizeof (struct memdelspan), KM_SLEEP);
 917                 mdsp->mds_base = btop(address);
 918                 mdsp->mds_npgs = btop(thislen);
 919                 mdsp->mds_next = mdsp_new;
 920                 mdsp_new = mdsp;
 921                 address += thislen;
 922                 size -= thislen;
 923         }
 924         return (mdsp_new);
 925 }
 926
 927 static void
 928 free_delspans(struct memdelspan *mdsp)
 929 {
 930         struct memdelspan *amdsp;
 931
 932         while ((amdsp = mdsp) != NULL) {
 933                 mdsp = amdsp->mds_next;
 934                 kmem_free(amdsp, sizeof (struct memdelspan));
 935         }
 936 }
 937
 938 /*
 939  * Concatenate lists. No list ordering is required.
 940  */
 941
 942 static void
 943 delspan_concat(struct memdelspan **mdspp, struct memdelspan *mdsp)
 944 {
 945         while (*mdspp != NULL)
 946                 mdspp = &(*mdspp)->mds_next;
 947
 948         *mdspp = mdsp;
 949 }
 950
 951 /*
 952  * Given a new list of delspans, check there is no overlap with
 953  * all existing span activity (add or delete) and then concatenate
 954  * the new spans to the given list.
 955  * Return 1 for OK, 0 if overlapping.
 956  */
 957 static int
 958 delspan_insert(
 959         struct transit_list *my_tlp,
 960         struct memdelspan *mdsp_new)
 961 {
 962         struct transit_list_head *trh;
 963         struct transit_list *tlp;
 964         int ret;
 965
 966         trh = &transit_list_head;
 967
 968         ASSERT(my_tlp != NULL);
 969         ASSERT(mdsp_new != NULL);
 970
 971         ret = 1;
 972         mutex_enter(&trh->trh_lock);
 973         /* ASSERT(my_tlp->trl_spans == NULL || tlp_in_list(trh, my_tlp)); */
 974         for (tlp = trh->trh_head; tlp != NULL; tlp = tlp->trl_next) {
 975                 struct memdelspan *mdsp;
 976
 977                 for (mdsp = tlp->trl_spans; mdsp != NULL;
 978                     mdsp = mdsp->mds_next) {
 979                         struct memdelspan *nmdsp;
 980
 981                         for (nmdsp = mdsp_new; nmdsp != NULL;
 982                             nmdsp = nmdsp->mds_next) {
 983                                 if (overlapping(mdsp->mds_base, mdsp->mds_npgs,
 984                                     nmdsp->mds_base, nmdsp->mds_npgs)) {
 985                                         ret = 0;
 986                                         goto done;
 987                                 }
 988                         }
 989                 }
 990         }
 991 done:
 992         if (ret != 0) {
 993                 if (my_tlp->trl_spans == NULL)
 994                         transit_list_insert(my_tlp);
 995                 delspan_concat(&my_tlp->trl_spans, mdsp_new);
 996         }
 997         mutex_exit(&trh->trh_lock);
 998         return (ret);
 999 }
1000
1001 static void
1002 delspan_remove(
1003         struct transit_list *my_tlp,
1004         pfn_t base,
1005         pgcnt_t npgs)
1006 {
1007         struct transit_list_head *trh;
1008         struct memdelspan *mdsp;
1009
1010         trh = &transit_list_head;
1011
1012         ASSERT(my_tlp != NULL);
1013
1014         mutex_enter(&trh->trh_lock);
1015         if ((mdsp = my_tlp->trl_spans) != NULL) {
1016                 if (npgs == 0) {
1017                         my_tlp->trl_spans = NULL;
1018                         free_delspans(mdsp);
1019                         transit_list_remove(my_tlp);
1020                 } else {
1021                         struct memdelspan **prv;
1022
1023                         prv = &my_tlp->trl_spans;
1024                         while (mdsp != NULL) {
1025                                 pfn_t p_end;
1026
1027                                 p_end = mdsp->mds_base + mdsp->mds_npgs;
1028                                 if (mdsp->mds_base >= base &&
1029                                     p_end <= (base + npgs)) {
1030                                         *prv = mdsp->mds_next;
1031                                         mdsp->mds_next = NULL;
1032                                         free_delspans(mdsp);
1033                                 } else {
1034                                         prv = &mdsp->mds_next;
1035                                 }
1036                                 mdsp = *prv;
1037                         }
1038                         if (my_tlp->trl_spans == NULL)
1039                                 transit_list_remove(my_tlp);
1040                 }
1041         }
1042         mutex_exit(&trh->trh_lock);
1043 }
1044
/*
 * Reserve interface, used by memory add to stop a delete of the same
 * range before the add has finished.
 * This list is only accessed through the delspan_insert/remove
 * functions and so is fully protected by the trh_lock mutex in
 * struct transit_list_head.
 */
1050
1051 static struct transit_list reserve_transit;
1052
1053 static int
1054 delspan_reserve(pfn_t base, pgcnt_t npgs)
1055 {
1056         struct memdelspan *mdsp;
1057         int ret;
1058
1059         mdsp = kmem_zalloc(sizeof (struct memdelspan), KM_SLEEP);
1060         mdsp->mds_base = base;
1061         mdsp->mds_npgs = npgs;
1062         if ((ret = delspan_insert(&reserve_transit, mdsp)) == 0) {
1063                 free_delspans(mdsp);
1064         }
1065         return (ret);
1066 }
1067
/*
 * Undo delspan_reserve(): remove from reserve_transit the spans that
 * delspan_remove() selects for (base, npgs).  Note that delspan_remove()
 * treats npgs == 0 as "remove every span on the list".
 */
static void
delspan_unreserve(pfn_t base, pgcnt_t npgs)
{
        delspan_remove(&reserve_transit, base, npgs);
}
1073
/*
 * Return whether memseg was created by kphysm_add_memory_dynamic().
 *
 * The result is the MEMSEG_DYNAMIC bit masked out of seg->msegflags,
 * so it is zero or non-zero rather than strictly 0 or 1.
 */
static int
memseg_is_dynamic(struct memseg *seg)
{
        return (seg->msegflags & MEMSEG_DYNAMIC);
}
1082
/*
 * Add the page range [base, base + npgs) to the delete request
 * identified by handle.
 *
 * The range is intersected with phys_install, the resulting spans are
 * put on the handle's transit list (which fails if they overlap any span
 * already in transit), and every memseg under the new spans is then
 * checked for suitability.  Memsegs that straddle a span boundary are
 * split with kphysm_split_memseg() when they do not include metadata.
 *
 * Returns:
 *      KPHYSM_OK         the spans were accepted, or no installed memory
 *                        lies in the range; on acceptance mh_phys_pages
 *                        and mh_vm_pages are increased
 *      KPHYSM_EHANDLE    handle lookup failed
 *      KPHYSM_ESEQUENCE  the handle is not in the MHND_INIT state
 *      KPHYSM_EDUP       the range overlaps a span already on this handle
 *      KPHYSM_EBUSY      the range overlaps a span in transit elsewhere, or
 *                        a memseg that includes metadata is not wholly
 *                        inside a span
 *      KPHYSM_ERESOURCE  a required memseg split failed
 *      KPHYSM_ENONRELOC  a page is PP_ISNORELOC, or the memsegs found do
 *                        not account for every page of a span
 *
 * On any error detected after insertion, the spans lying inside
 * (base, npgs) are removed from the handle again.
 */
int
kphysm_del_span(
        memhandle_t handle,
        pfn_t base,
        pgcnt_t npgs)
{
        struct mem_handle *mhp;
        struct memseg *seg;
        struct memdelspan *mdsp;
        struct memdelspan *mdsp_new;
        pgcnt_t phys_pages, vm_pages;
        pfn_t p_end;
        page_t *pp;
        int ret;

        /* On success the lookup returns with mhp->mh_mutex held. */
        mhp = kphysm_lookup_mem_handle(handle);
        if (mhp == NULL) {
                return (KPHYSM_EHANDLE);
        }
        if (mhp->mh_state != MHND_INIT) {
                mutex_exit(&mhp->mh_mutex);
                return (KPHYSM_ESEQUENCE);
        }

        /*
         * Intersect the span with the installed memory list (phys_install).
         */
        mdsp_new = span_to_install(base, npgs);
        if (mdsp_new == NULL) {
                /*
                 * No physical memory in this range. Is this an
                 * error? If an attempt to start the delete is made
                 * for OK returns from del_span such as this, start will
                 * return an error.
                 * Could return KPHYSM_ENOWORK.
                 */
                /*
                 * It is assumed that there are no error returns
                 * from span_to_install() due to kmem_alloc failure.
                 */
                mutex_exit(&mhp->mh_mutex);
                return (KPHYSM_OK);
        }
        /*
         * Does this span overlap an existing span?
         */
        if (delspan_insert(&mhp->mh_transit, mdsp_new) == 0) {
                /*
                 * Differentiate between already on list for this handle
                 * (KPHYSM_EDUP) and busy elsewhere (KPHYSM_EBUSY).
                 */
                ret = KPHYSM_EBUSY;
                for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
                    mdsp = mdsp->mds_next) {
                        if (overlapping(mdsp->mds_base, mdsp->mds_npgs,
                            base, npgs)) {
                                ret = KPHYSM_EDUP;
                                break;
                        }
                }
                mutex_exit(&mhp->mh_mutex);
                /* The insert failed, so the new spans are still ours. */
                free_delspans(mdsp_new);
                return (ret);
        }
        /*
         * At this point the spans in mdsp_new have been inserted into the
         * list of spans for this handle and thereby to the global list of
         * spans being processed. Each of these spans must now be checked
         * for relocatability. As a side-effect segments in the memseg list
         * may be split.
         *
         * Note that mdsp_new can no longer be used as it is now part of
         * a larger list. Select elements of this larger list based
         * on base and npgs.
         */
restart:
        /*
         * The counts are rebuilt from scratch on every pass, so a
         * restart after a memseg split does not count pages twice.
         */
        phys_pages = 0;
        vm_pages = 0;
        ret = KPHYSM_OK;
        for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
            mdsp = mdsp->mds_next) {
                pgcnt_t pages_checked;

                /* Only the spans belonging to this call are examined. */
                if (!overlapping(mdsp->mds_base, mdsp->mds_npgs, base, npgs)) {
                        continue;
                }
                p_end = mdsp->mds_base + mdsp->mds_npgs;
                /*
                 * The pages_checked count is a hack. All pages should be
                 * checked for relocatability. Those not covered by memsegs
                 * should be tested with arch_kphysm_del_span_ok().
                 */
                pages_checked = 0;
                for (seg = memsegs; seg; seg = seg->next) {
                        pfn_t mseg_start;

                        if (seg->pages_base >= p_end ||
                            seg->pages_end <= mdsp->mds_base) {
                                /* Span and memseg don't overlap. */
                                continue;
                        }
                        mseg_start = memseg_get_start(seg);
                        /* Check that segment is suitable for delete. */
                        if (memseg_includes_meta(seg)) {
                                /*
                                 * Check that this segment is completely
                                 * within the span.  Such a segment is
                                 * never split; a partial overlap is
                                 * reported as busy.
                                 */
                                if (mseg_start < mdsp->mds_base ||
                                    seg->pages_end > p_end) {
                                        ret = KPHYSM_EBUSY;
                                        break;
                                }
                                pages_checked += seg->pages_end - mseg_start;
                        } else {
                                /*
                                 * If this segment is larger than the span,
                                 * try to split it. After the split, it
                                 * is necessary to restart.
                                 */
                                if (seg->pages_base < mdsp->mds_base ||
                                    seg->pages_end > p_end) {
                                        pfn_t abase;
                                        pgcnt_t anpgs;
                                        int s_ret;

                                        /*
                                         * Split required.  abase/anpgs
                                         * describe the part of the memseg
                                         * that lies inside the span.
                                         */
                                        if (mdsp->mds_base < seg->pages_base)
                                                abase = seg->pages_base;
                                        else
                                                abase = mdsp->mds_base;
                                        if (p_end > seg->pages_end)
                                                anpgs = seg->pages_end - abase;
                                        else
                                                anpgs = p_end - abase;
                                        s_ret = kphysm_split_memseg(abase,
                                            anpgs);
                                        if (s_ret == 0) {
                                                /* Split failed. */
                                                ret = KPHYSM_ERESOURCE;
                                                break;
                                        }
                                        goto restart;
                                }
                                pages_checked +=
                                    seg->pages_end - seg->pages_base;
                        }
                        /*
                         * The memseg is wholly within the delete span.
                         * The individual pages can now be checked.
                         */
                        /* Cage test. */
                        for (pp = seg->pages; pp < seg->epages; pp++) {
                                if (PP_ISNORELOC(pp)) {
                                        ret = KPHYSM_ENONRELOC;
                                        break;
                                }
                        }
                        if (ret != KPHYSM_OK) {
                                break;
                        }
                        /*
                         * phys_pages is counted from mseg_start, while
                         * vm_pages uses MSEG_NPAGES() for the memseg.
                         */
                        phys_pages += (seg->pages_end - mseg_start);
                        vm_pages += MSEG_NPAGES(seg);
                }
                if (ret != KPHYSM_OK)
                        break;
                /* Part of the span was not covered by any memseg. */
                if (pages_checked != mdsp->mds_npgs) {
                        ret = KPHYSM_ENONRELOC;
                        break;
                }
        }

        if (ret == KPHYSM_OK) {
                mhp->mh_phys_pages += phys_pages;
                mhp->mh_vm_pages += vm_pages;
        } else {
                /*
                 * Keep holding the mh_mutex to prevent it going away.
                 */
                delspan_remove(&mhp->mh_transit, base, npgs);
        }
        mutex_exit(&mhp->mh_mutex);
        return (ret);
}
1267
/*
 * Report on the page range [base, base + npgs) without needing a delete
 * handle.  The results are written to *mqp:
 *
 *      phys_pages              pages of the range present in phys_install
 *      managed                 pages examined through a memseg page_t
 *      nonrelocatable          pages found non-relocatable, either by
 *                              PP_ISNORELOC() or, outside memsegs, by
 *                              arch_kphysm_del_span_ok()
 *      first_nonrelocatable    pfn of the first such page found
 *      last_nonrelocatable     pfn of the last such page found
 *
 * All fields are zeroed first, so the first/last values are 0 when no
 * non-relocatable page was found.  The temporary span list is freed
 * before returning.  Always returns KPHYSM_OK.
 */
int
kphysm_del_span_query(
        pfn_t base,
        pgcnt_t npgs,
        memquery_t *mqp)
{
        struct memdelspan *mdsp;
        struct memdelspan *mdsp_new;
        int done_first_nonreloc;

        mqp->phys_pages = 0;
        mqp->managed = 0;
        mqp->nonrelocatable = 0;
        mqp->first_nonrelocatable = 0;
        mqp->last_nonrelocatable = 0;

        mdsp_new = span_to_install(base, npgs);
        /*
         * It is OK to proceed here if mdsp_new == NULL.
         */
        done_first_nonreloc = 0;
        for (mdsp = mdsp_new; mdsp != NULL; mdsp = mdsp->mds_next) {
                pfn_t sbase;
                pgcnt_t snpgs;

                mqp->phys_pages += mdsp->mds_npgs;
                /*
                 * sbase/snpgs form a cursor over the part of this span
                 * that has not been accounted for yet.
                 */
                sbase = mdsp->mds_base;
                snpgs = mdsp->mds_npgs;
                while (snpgs != 0) {
                        struct memseg *lseg, *seg;
                        pfn_t p_end;
                        page_t *pp;
                        pfn_t mseg_start;

                        p_end = sbase + snpgs;
                        /*
                         * Find the lowest addressed memseg that starts
                         * after sbase and account for it.
                         * This is to catch dynamic memsegs whose start
                         * is hidden.
                         */
                        seg = NULL;
                        for (lseg = memsegs; lseg != NULL; lseg = lseg->next) {
                                if ((lseg->pages_base >= sbase) ||
                                    (lseg->pages_base < p_end &&
                                    lseg->pages_end > sbase)) {
                                        if (seg == NULL ||
                                            seg->pages_base > lseg->pages_base)
                                                seg = lseg;
                                }
                        }
                        if (seg != NULL) {
                                mseg_start = memseg_get_start(seg);
                                /*
                                 * Now have the full extent of the memseg so
                                 * do the range check.
                                 */
                                if (mseg_start >= p_end ||
                                    seg->pages_end <= sbase) {
                                        /* Span does not overlap memseg. */
                                        seg = NULL;
                                }
                        }
                        /*
                         * Account for gap either before the segment if
                         * there is one or to the end of the span.
                         * (mseg_start is only read when seg != NULL.)
                         */
                        if (seg == NULL || mseg_start > sbase) {
                                pfn_t a_end;

                                a_end = (seg == NULL) ? p_end : mseg_start;
                                /*
                                 * Check with arch layer for relocatability.
                                 */
                                if (arch_kphysm_del_span_ok(sbase,
                                    (a_end - sbase))) {
                                        /*
                                         * No non-relocatable pages in this
                                         * area, avoid the fine-grained
                                         * test.
                                         */
                                        snpgs -= (a_end - sbase);
                                        sbase = a_end;
                                }
                                /*
                                 * Page-by-page test of the gap; skipped
                                 * when the whole gap was accepted above,
                                 * since sbase then equals a_end.
                                 */
                                while (sbase < a_end) {
                                        if (!arch_kphysm_del_span_ok(sbase,
                                            1)) {
                                                mqp->nonrelocatable++;
                                                if (!done_first_nonreloc) {
                                                        mqp->
                                                            first_nonrelocatable
                                                            = sbase;
                                                        done_first_nonreloc = 1;
                                                }
                                                mqp->last_nonrelocatable =
                                                    sbase;
                                        }
                                        sbase++;
                                        snpgs--;
                                }
                        }
                        if (seg != NULL) {
                                ASSERT(mseg_start <= sbase);
                                if (seg->pages_base != mseg_start &&
                                    seg->pages_base > sbase) {
                                        pgcnt_t skip_pgs;

                                        /*
                                         * Skip the page_t area of a
                                         * dynamic memseg.
                                         */
                                        skip_pgs = seg->pages_base - sbase;
                                        if (snpgs <= skip_pgs) {
                                                /*
                                                 * The rest of the span lies
                                                 * within the skipped area.
                                                 */
                                                sbase += snpgs;
                                                snpgs = 0;
                                                continue;
                                        }
                                        snpgs -= skip_pgs;
                                        sbase += skip_pgs;
                                }
                                ASSERT(snpgs != 0);
                                ASSERT(seg->pages_base <= sbase);
                                /*
                                 * The individual pages can now be checked.
                                 * The walk stops at the end of the span or
                                 * the end of the memseg's page_t array,
                                 * whichever comes first.
                                 */
                                for (pp = seg->pages +
                                    (sbase - seg->pages_base);
                                    snpgs != 0 && pp < seg->epages; pp++) {
                                        mqp->managed++;
                                        if (PP_ISNORELOC(pp)) {
                                                mqp->nonrelocatable++;
                                                if (!done_first_nonreloc) {
                                                        mqp->
                                                            first_nonrelocatable
                                                            = sbase;
                                                        done_first_nonreloc = 1;
                                                }
                                                mqp->last_nonrelocatable =
                                                    sbase;
                                        }
                                        sbase++;
                                        snpgs--;
                                }
                        }
                }
        }

        free_delspans(mdsp_new);

        return (KPHYSM_OK);
}
1419
1420 /*
1421  * This release function can be called at any stage as follows:
1422  *      _gethandle only called
1423  *      _span(s) only called
1424  *      _start called but failed
1425  *      delete thread exited
1426  */
1427 int
1428 kphysm_del_release(memhandle_t handle)
1429 {
1430         struct mem_handle *mhp;
1431
1432         mhp = kphysm_lookup_mem_handle(handle);
1433         if (mhp == NULL) {
1434                 return (KPHYSM_EHANDLE);
1435         }
1436         switch (mhp->mh_state) {
1437         case MHND_STARTING:
1438         case MHND_RUNNING:
1439                 mutex_exit(&mhp->mh_mutex);
1440                 return (KPHYSM_ENOTFINISHED);
1441         case MHND_FREE:
1442                 ASSERT(mhp->mh_state != MHND_FREE);
1443                 mutex_exit(&mhp->mh_mutex);
1444                 return (KPHYSM_EHANDLE);
1445         case MHND_INIT:
1446                 break;
1447         case MHND_DONE:
1448                 break;
1449         case MHND_RELEASE:
1450                 mutex_exit(&mhp->mh_mutex);
1451                 return (KPHYSM_ESEQUENCE);
1452         default:
1453 #ifdef DEBUG
1454                 cmn_err(CE_WARN, "kphysm_del_release(0x%p) state corrupt %d",
1455                     (void *)mhp, mhp->mh_state);
1456 #endif /* DEBUG */
1457                 mutex_exit(&mhp->mh_mutex);
1458                 return (KPHYSM_EHANDLE);
1459         }
1460         /*
1461          * Set state so that we can wait if necessary.
1462          * Also this means that we have read/write access to all
1463          * fields except mh_exthandle and mh_state.
1464          */
1465         mhp->mh_state = MHND_RELEASE;
1466         /*
1467          * The mem_handle cannot be de-allocated by any other operation
1468          * now, so no need to hold mh_mutex.
1469          */
1470         mutex_exit(&mhp->mh_mutex);
1471
1472         delspan_remove(&mhp->mh_transit, 0, 0);
1473         mhp->mh_phys_pages = 0;
1474         mhp->mh_vm_pages = 0;
1475         mhp->mh_hold_todo = 0;
1476         mhp->mh_delete_complete = NULL;
1477         mhp->mh_delete_complete_arg = NULL;
1478         mhp->mh_cancel = 0;
1479
1480         mutex_enter(&mhp->mh_mutex);
1481         ASSERT(mhp->mh_state == MHND_RELEASE);
1482         mhp->mh_state = MHND_FREE;
1483
1484         kphysm_free_mem_handle(mhp);
1485
1486         return (KPHYSM_OK);
1487 }
1488
1489 /*
1490  * This cancel function can only be called with the thread running.
1491  */
1492 int
1493 kphysm_del_cancel(memhandle_t handle)
1494 {
1495         struct mem_handle *mhp;
1496
1497         mhp = kphysm_lookup_mem_handle(handle);
1498         if (mhp == NULL) {
1499                 return (KPHYSM_EHANDLE);
1500         }
1501         if (mhp->mh_state != MHND_STARTING && mhp->mh_state != MHND_RUNNING) {
1502                 mutex_exit(&mhp->mh_mutex);
1503                 return (KPHYSM_ENOTRUNNING);
1504         }
1505         /*
1506          * Set the cancel flag and wake the delete thread up.
1507          * The thread may be waiting on I/O, so the effect of the cancel
1508          * may be delayed.
1509          */
1510         if (mhp->mh_cancel == 0) {
1511                 mhp->mh_cancel = KPHYSM_ECANCELLED;
1512                 cv_signal(&mhp->mh_cv);
1513         }
1514         mutex_exit(&mhp->mh_mutex);
1515         return (KPHYSM_OK);
1516 }
1517
1518 int
1519 kphysm_del_status(
1520         memhandle_t handle,
1521         memdelstat_t *mdstp)
1522 {
1523         struct mem_handle *mhp;
1524
1525         mhp = kphysm_lookup_mem_handle(handle);
1526         if (mhp == NULL) {
1527                 return (KPHYSM_EHANDLE);
1528         }
1529         /*
1530          * Calling kphysm_del_status() is allowed before the delete
1531          * is started to allow for status display.
1532          */
1533         if (mhp->mh_state != MHND_INIT && mhp->mh_state != MHND_STARTING &&
1534             mhp->mh_state != MHND_RUNNING) {
1535                 mutex_exit(&mhp->mh_mutex);
1536                 return (KPHYSM_ENOTRUNNING);
1537         }
1538         mdstp->phys_pages = mhp->mh_phys_pages;
1539         mdstp->managed = mhp->mh_vm_pages;
1540         mdstp->collected = mhp->mh_vm_pages - mhp->mh_hold_todo;
1541         mutex_exit(&mhp->mh_mutex);
1542         return (KPHYSM_OK);
1543 }
1544
1545 static int mem_delete_additional_pages = 100;
1546
1547 static int
1548 can_remove_pgs(pgcnt_t npgs)
1549 {
1550         /*
1551          * If all pageable pages were paged out, freemem would
1552          * equal availrmem.  There is a minimum requirement for
1553          * availrmem.
1554          */
1555         if ((availrmem - (tune.t_minarmem + mem_delete_additional_pages))
1556             < npgs)
1557                 return (0);
1558         /* TODO: check swap space, etc. */
1559         return (1);
1560 }
1561
1562 static int
1563 get_availrmem(pgcnt_t npgs)
1564 {
1565         int ret;
1566
1567         mutex_enter(&freemem_lock);
1568         ret = can_remove_pgs(npgs);
1569         if (ret != 0)
1570                 availrmem -= npgs;
1571         mutex_exit(&freemem_lock);
1572         return (ret);
1573 }
1574
/*
 * Give npgs pages back to availrmem under freemem_lock; the inverse of
 * a successful get_availrmem().
 */
static void
put_availrmem(pgcnt_t npgs)
{
        mutex_enter(&freemem_lock);
        availrmem += npgs;
        mutex_exit(&freemem_lock);
}
1582
/*
 * freemem_incr caps the number of pages delthr_get_freemem() asks for
 * in one call; FREEMEM_INCR is its initial value.
 */
#define FREEMEM_INCR    100
static pgcnt_t freemem_incr = FREEMEM_INCR;
/*
 * Length of the timed wait, in ticks, between attempts in
 * delthr_get_freemem(): hz / DEL_FREE_WAIT_FRAC, rounded up.
 */
#define DEL_FREE_WAIT_FRAC      4
#define DEL_FREE_WAIT_TICKS     ((hz+DEL_FREE_WAIT_FRAC-1)/DEL_FREE_WAIT_FRAC)

/*
 * hz / DEL_BUSY_WAIT_FRAC, rounded up; the name suggests a wait used
 * when pages are busy, but the user is not within this part of the file.
 */
#define DEL_BUSY_WAIT_FRAC      20
#define DEL_BUSY_WAIT_TICKS     ((hz+DEL_BUSY_WAIT_FRAC-1)/DEL_BUSY_WAIT_FRAC)

/* Forward declarations for functions defined later in the file. */
static void kphysm_del_cleanup(struct mem_handle *);

static void page_delete_collect(page_t *, struct mem_handle *);
1594
1595 /*
1596  * Take up to freemem_incr pages (but no more than mh_hold_todo) away from
1597  * freemem for the delete thread, waiting and prodding pageout as needed.
1598  * Entered and left with mh_mutex held, though it is dropped in between.
1599  * Returns the number of pages taken, or 0 once mh_cancel is set.
1600  */
1601 static pgcnt_t
1602 delthr_get_freemem(struct mem_handle *mhp)
1603 {
1604         pgcnt_t want;
1605         int got;
1606
1607         ASSERT(MUTEX_HELD(&mhp->mh_mutex));
1608         MDSTAT_INCR(mhp, need_free);
1609
1610         want = (freemem_incr > mhp->mh_hold_todo) ?
1611             mhp->mh_hold_todo : freemem_incr;
1612
1613         while (mhp->mh_cancel == 0) {
1614                 mutex_exit(&mhp->mh_mutex);
1615                 MDSTAT_INCR(mhp, free_loop);
1616
1617                 /*
1618                  * Same test as page_create_throttle(), except that it
1619                  * is not overridden by !PG_WAIT.
1620                  */
1621                 got = 0;
1622                 if (freemem >= (want + throttlefree)) {
1623                         got = page_create_wait(want, 0);
1624                         if (got == 0) {
1625                                 /* EMPTY */
1626                                 MDSTAT_INCR(mhp, free_failed);
1627                         }
1628                 } else {
1629                         /* EMPTY */
1630                         MDSTAT_INCR(mhp, free_low);
1631                 }
1632                 if (got != 0) {
1633                         mutex_enter(&mhp->mh_mutex);
1634                         return (want);
1635                 }
1636
1637                 /*
1638                  * Nothing obtained: put pressure on pageout, wait on
1639                  * mh_cv for a while, then undo the page_needfree() request.
1640                  */
1641                 page_needfree(want);
1642                 cv_signal(&proc_pageout->p_cv);
1643
1644                 mutex_enter(&mhp->mh_mutex);
1645                 (void) cv_reltimedwait(&mhp->mh_cv, &mhp->mh_mutex,
1646                     DEL_FREE_WAIT_TICKS, TR_CLOCK_TICK);
1647                 mutex_exit(&mhp->mh_mutex);
1648                 page_needfree(-(spgcnt_t)want);
1649                 mutex_enter(&mhp->mh_mutex);
1650         }
1651         return (0);
1652 }
1653
1654 #define DR_AIO_CLEANUP_DELAY    25000   /* 0.025secs, in usec */
1655 #define DR_AIO_CLEANUP_MAXLOOPS_NODELAY 100
1656 /*
1657  * Helper thread started by delete_memory_thread.  It keeps forcing kaio
1658  * cleanup in each process so that pages used in kaio are unlocked and can
1659  * subsequently be relocated by delete_memory_thread.  amhp is the address
1660  * of the delete thread's mem_handle; mh_aio_cleanup_done in it is set
1661  * before every thread_exit(), and a nonzero mh_dr_aio_cleanup_cancel ends
1662  * the thread.
1663  */
1664 static void
1665 dr_aio_cleanup_thread(caddr_t amhp)
1666 {
1667         struct mem_handle *mhp = (struct mem_handle *)amhp;
1668         int (*kaio_cleanup)(proc_t *);
1669         volatile uint_t *cancelp;
1670         proc_t *p;
1671         int ncleaned;
1672         int busy_passes = 0;
1673
1674         ASSERT(mhp != NULL);
1675         cancelp = &mhp->mh_dr_aio_cleanup_cancel;
1676         if (modload("sys", "kaio") == -1) {
1677                 mhp->mh_aio_cleanup_done = 1;
1678                 cmn_err(CE_WARN, "dr_aio_cleanup_thread: cannot load kaio");
1679                 thread_exit();
1680         }
1681         kaio_cleanup = (int (*)(proc_t *))
1682             modgetsymvalue("aio_cleanup_dr_delete_memory", 0);
1683         if (kaio_cleanup == NULL) {
1684                 mhp->mh_aio_cleanup_done = 1;
1685                 cmn_err(CE_WARN,
1686                     "aio_cleanup_dr_delete_memory not found in kaio");
1687                 thread_exit();
1688         }
1689
1690         do {
1691                 /* One pass over the active process list, under pidlock. */
1692                 ncleaned = 0;
1693                 mutex_enter(&pidlock);
1694                 p = practive;
1695                 while (*cancelp == 0 && p != NULL) {
1696                         mutex_enter(&p->p_lock);
1697                         if (p->p_aio != NULL)
1698                                 ncleaned += kaio_cleanup(p);
1699                         mutex_exit(&p->p_lock);
1700                         p = p->p_next;
1701                 }
1702                 mutex_exit(&pidlock);
1703
1704                 if (*cancelp == 0 && (ncleaned == 0 ||
1705                     ++busy_passes == DR_AIO_CLEANUP_MAXLOOPS_NODELAY)) {
1706                         /* delay a bit before retrying all procs again */
1707                         delay(drv_usectohz(DR_AIO_CLEANUP_DELAY));
1708                         busy_passes = 0;
1709                 }
1710         } while (*cancelp == 0);
1711         mhp->mh_aio_cleanup_done = 1;
1712         thread_exit();
1713 }
1714
1715 static void
1716 delete_memory_thread(caddr_t amhp)
1717 {
1718         struct mem_handle *mhp;
1719         struct memdelspan *mdsp;
1720         callb_cpr_t cprinfo;
1721         page_t *pp_targ;
1722         spgcnt_t freemem_left;
1723         void (*del_complete_funcp)(void *, int error);
1724         void *del_complete_arg;
1725         int comp_code;
1726         int ret;
1727         int first_scan;
1728         uint_t szc;
1729 #ifdef MEM_DEL_STATS
1730         uint64_t start_total, ntick_total;
1731         uint64_t start_pgrp, ntick_pgrp;
1732 #endif /* MEM_DEL_STATS */
1733
1734         mhp = (struct mem_handle *)amhp;
1735
1736 #ifdef MEM_DEL_STATS
1737         start_total = ddi_get_lbolt();
1738 #endif /* MEM_DEL_STATS */
1739
1740         CALLB_CPR_INIT(&cprinfo, &mhp->mh_mutex,
1741             callb_generic_cpr, "memdel");
1742
1743         mutex_enter(&mhp->mh_mutex);
1744         ASSERT(mhp->mh_state == MHND_STARTING);
1745
1746         mhp->mh_state = MHND_RUNNING;
1747         mhp->mh_thread_id = curthread;
1748
1749         mhp->mh_hold_todo = mhp->mh_vm_pages;
1750         mutex_exit(&mhp->mh_mutex);
1751
1752         /* Allocate the remap pages now, if necessary. */
1753         memseg_remap_init();
1754
1755         /*
1756          * Subtract from availrmem now if possible as availrmem
1757          * may not be available by the end of the delete.
1758          */
1759         if (!get_availrmem(mhp->mh_vm_pages)) {
1760                 comp_code = KPHYSM_ENOTVIABLE;
1761                 mutex_enter(&mhp->mh_mutex);
1762                 goto early_exit;
1763         }
1764
1765         ret = kphysm_setup_pre_del(mhp->mh_vm_pages);
1766
1767         mutex_enter(&mhp->mh_mutex);
1768
1769         if (ret != 0) {
1770                 mhp->mh_cancel = KPHYSM_EREFUSED;
1771                 goto refused;
1772         }
1773
1774         transit_list_collect(mhp, 1);
1775
1776         for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
1777             mdsp = mdsp->mds_next) {
1778                 ASSERT(mdsp->mds_bitmap == NULL);
1779                 mdsp->mds_bitmap = kmem_zalloc(MDS_BITMAPBYTES(mdsp), KM_SLEEP);
1780                 mdsp->mds_bitmap_retired = kmem_zalloc(MDS_BITMAPBYTES(mdsp),
1781                     KM_SLEEP);
1782         }
1783
1784         first_scan = 1;
1785         freemem_left = 0;
1786         /*
1787          * Start dr_aio_cleanup_thread, which periodically iterates
1788          * through the process list and invokes aio cleanup.  This
1789          * is needed in order to avoid a deadly embrace between the
1790          * delete_memory_thread (waiting on writer lock for page, with the
1791          * exclusive-wanted bit set), kaio read request threads (waiting for a
1792          * reader lock on the same page that is wanted by the
1793          * delete_memory_thread), and threads waiting for kaio completion
1794          * (blocked on spt_amp->lock).
1795          */
1796         mhp->mh_dr_aio_cleanup_cancel = 0;
1797         mhp->mh_aio_cleanup_done = 0;
1798         (void) thread_create(NULL, 0, dr_aio_cleanup_thread,
1799             (caddr_t)mhp, 0, &p0, TS_RUN, maxclsyspri - 1);
1800         while ((mhp->mh_hold_todo != 0) && (mhp->mh_cancel == 0)) {
1801                 pgcnt_t collected;
1802
1803                 MDSTAT_INCR(mhp, nloop);
1804                 collected = 0;
1805                 for (mdsp = mhp->mh_transit.trl_spans; (mdsp != NULL) &&
1806                     (mhp->mh_cancel == 0); mdsp = mdsp->mds_next) {
1807                         pfn_t pfn, p_end;
1808
1809                         p_end = mdsp->mds_base + mdsp->mds_npgs;
1810                         for (pfn = mdsp->mds_base; (pfn < p_end) &&
1811                             (mhp->mh_cancel == 0); pfn++) {
1812                                 page_t *pp, *tpp, *tpp_targ;
1813                                 pgcnt_t bit;
1814                                 struct vnode *vp;
1815                                 u_offset_t offset;
1816                                 int mod, result;
1817                                 spgcnt_t pgcnt;
1818
1819                                 bit = pfn - mdsp->mds_base;
1820                                 if ((mdsp->mds_bitmap[bit / NBPBMW] &
1821                                     (1 << (bit % NBPBMW))) != 0) {
1822                                         MDSTAT_INCR(mhp, already_done);
1823                                         continue;
1824                                 }
1825                                 if (freemem_left == 0) {
1826                                         freemem_left += delthr_get_freemem(mhp);
1827                                         if (freemem_left == 0)
1828                                                 break;
1829                                 }
1830
1831                                 /*
1832                                  * Release mh_mutex - some of this
1833                                  * stuff takes some time (eg PUTPAGE).
1834                                  */
1835
1836                                 mutex_exit(&mhp->mh_mutex);
1837                                 MDSTAT_INCR(mhp, ncheck);
1838
1839                                 pp = page_numtopp_nolock(pfn);
1840                                 if (pp == NULL) {
1841                                         /*
1842                                          * Not covered by a page_t - will
1843                                          * be dealt with elsewhere.
1844                                          */
1845                                         MDSTAT_INCR(mhp, nopaget);
1846                                         mutex_enter(&mhp->mh_mutex);
1847                                         mdsp->mds_bitmap[bit / NBPBMW] |=
1848                                             (1 << (bit % NBPBMW));
1849                                         continue;
1850                                 }
1851
1852                                 if (!page_try_reclaim_lock(pp, SE_EXCL,
1853                                     SE_EXCL_WANTED | SE_RETIRED)) {
1854                                         /*
1855                                          * Page in use elsewhere.  Skip it.
1856                                          */
1857                                         MDSTAT_INCR(mhp, lockfail);
1858                                         mutex_enter(&mhp->mh_mutex);
1859                                         continue;
1860                                 }
1861                                 /*
1862                                  * See if the cage expanded into the delete.
1863                                  * This can happen as we have to allow the
1864                                  * cage to expand.
1865                                  */
1866                                 if (PP_ISNORELOC(pp)) {
1867                                         page_unlock(pp);
1868                                         mutex_enter(&mhp->mh_mutex);
1869                                         mhp->mh_cancel = KPHYSM_ENONRELOC;
1870                                         break;
1871                                 }
1872                                 if (PP_RETIRED(pp)) {
1873                                         /*
1874                                          * Page has been retired and is
1875                                          * not part of the cage so we
1876                                          * can now do the accounting for
1877                                          * it.
1878                                          */
1879                                         MDSTAT_INCR(mhp, retired);
1880                                         mutex_enter(&mhp->mh_mutex);
1881                                         mdsp->mds_bitmap[bit / NBPBMW]
1882                                             |= (1 << (bit % NBPBMW));
1883                                         mdsp->mds_bitmap_retired[bit /
1884                                             NBPBMW] |=
1885                                             (1 << (bit % NBPBMW));
1886                                         mhp->mh_hold_todo--;
1887                                         continue;
1888                                 }
1889                                 ASSERT(freemem_left != 0);
1890                                 if (PP_ISFREE(pp)) {
1891                                         /*
1892                                          * Like page_reclaim() only 'freemem'
1893                                          * processing is already done.
1894                                          */
1895                                         MDSTAT_INCR(mhp, nfree);
1896                                 free_page_collect:
1897                                         if (PP_ISAGED(pp)) {
1898                                                 page_list_sub(pp,
1899                                                     PG_FREE_LIST);
1900                                         } else {
1901                                                 page_list_sub(pp,
1902                                                     PG_CACHE_LIST);
1903                                         }
1904                                         PP_CLRFREE(pp);
1905                                         PP_CLRAGED(pp);
1906                                         collected++;
1907                                         mutex_enter(&mhp->mh_mutex);
1908                                         page_delete_collect(pp, mhp);
1909                                         mdsp->mds_bitmap[bit / NBPBMW] |=
1910                                             (1 << (bit % NBPBMW));
1911                                         freemem_left--;
1912                                         continue;
1913                                 }
1914                                 ASSERT(pp->p_vnode != NULL);
1915                                 if (first_scan) {
1916                                         MDSTAT_INCR(mhp, first_notfree);
1917                                         page_unlock(pp);
1918                                         mutex_enter(&mhp->mh_mutex);
1919                                         continue;
1920                                 }
1921                                 /*
1922                                  * Keep stats on pages encountered that
1923                                  * are marked for retirement.
1924                                  */
1925                                 if (PP_TOXIC(pp)) {
1926                                         MDSTAT_INCR(mhp, toxic);
1927                                 } else if (PP_PR_REQ(pp)) {
1928                                         MDSTAT_INCR(mhp, failing);
1929                                 }
1930                                 /*
1931                                  * In certain cases below, special exceptions
1932                                  * are made for pages that are toxic.  This
1933                                  * is because the current meaning of toxic
1934                                  * is that an uncorrectable error has been
1935                                  * previously associated with the page.
1936                                  */
1937                                 if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
1938                                         if (!PP_TOXIC(pp)) {
1939                                                 /*
1940                                                  * Must relocate locked in
1941                                                  * memory pages.
1942                                                  */
1943 #ifdef MEM_DEL_STATS
1944                                                 start_pgrp = ddi_get_lbolt();
1945 #endif /* MEM_DEL_STATS */
1946                                                 /*
1947                                                  * Lock all constituent pages
1948                                                  * of a large page to ensure
1949                                                  * that p_szc won't change.
1950                                                  */
1951                                                 if (!group_page_trylock(pp,
1952                                                     SE_EXCL)) {
1953                                                         MDSTAT_INCR(mhp,
1954                                                             gptllckfail);
1955                                                         page_unlock(pp);
1956                                                         mutex_enter(
1957                                                             &mhp->mh_mutex);
1958                                                         continue;
1959                                                 }
1960                                                 MDSTAT_INCR(mhp, npplocked);
1961                                                 pp_targ =
1962                                                     page_get_replacement_page(
1963                                                     pp, NULL, 0);
1964                                                 if (pp_targ != NULL) {
1965 #ifdef MEM_DEL_STATS
1966                                                         ntick_pgrp =
1967                                                             (uint64_t)
1968                                                             ddi_get_lbolt() -
1969                                                             start_pgrp;
1970 #endif /* MEM_DEL_STATS */
1971                                                         MDSTAT_PGRP(mhp,
1972                                                             ntick_pgrp);
1973                                                         MDSTAT_INCR(mhp,
1974                                                             nlockreloc);
1975                                                         goto reloc;
1976                                                 }
1977                                                 group_page_unlock(pp);
1978                                                 page_unlock(pp);
1979 #ifdef MEM_DEL_STATS
1980                                                 ntick_pgrp =
1981                                                     (uint64_t)ddi_get_lbolt() -
1982                                                     start_pgrp;
1983 #endif /* MEM_DEL_STATS */
1984                                                 MDSTAT_PGRP(mhp, ntick_pgrp);
1985                                                 MDSTAT_INCR(mhp, nnorepl);
1986                                                 mutex_enter(&mhp->mh_mutex);
1987                                                 continue;
1988                                         } else {
1989                                                 /*
1990                                                  * Cannot do anything about
1991                                                  * this page because it is
1992                                                  * toxic.
1993                                                  */
1994                                                 MDSTAT_INCR(mhp, npplkdtoxic);
1995                                                 page_unlock(pp);
1996                                                 mutex_enter(&mhp->mh_mutex);
1997                                                 continue;
1998                                         }
1999                                 }
2000                                 /*
2001                                  * Unload the mappings and check if mod bit
2002                                  * is set.
2003                                  */
2004                                 ASSERT(!PP_ISKAS(pp));
2005                                 (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
2006                                 mod = hat_ismod(pp);
2007
2008 #ifdef MEM_DEL_STATS
2009                                 start_pgrp = ddi_get_lbolt();
2010 #endif /* MEM_DEL_STATS */
2011                                 if (mod && !PP_TOXIC(pp)) {
2012                                         /*
2013                                          * Lock all constituent pages
2014                                          * of a large page to ensure
2015                                          * that p_szc won't change.
2016                                          */
2017                                         if (!group_page_trylock(pp, SE_EXCL)) {
2018                                                 MDSTAT_INCR(mhp, gptlmodfail);
2019                                                 page_unlock(pp);
2020                                                 mutex_enter(&mhp->mh_mutex);
2021                                                 continue;
2022                                         }
2023                                         pp_targ = page_get_replacement_page(pp,
2024                                             NULL, 0);
2025                                         if (pp_targ != NULL) {
2026                                                 MDSTAT_INCR(mhp, nmodreloc);
2027 #ifdef MEM_DEL_STATS
2028                                                 ntick_pgrp =
2029                                                     (uint64_t)ddi_get_lbolt() -
2030                                                     start_pgrp;
2031 #endif /* MEM_DEL_STATS */
2032                                                 MDSTAT_PGRP(mhp, ntick_pgrp);
2033                                                 goto reloc;
2034                                         }
2035                                         group_page_unlock(pp);
2036                                 }
2037
2038                                 if (!page_try_demote_pages(pp)) {
2039                                         MDSTAT_INCR(mhp, demotefail);
2040                                         page_unlock(pp);
2041 #ifdef MEM_DEL_STATS
2042                                         ntick_pgrp = (uint64_t)ddi_get_lbolt() -
2043                                             start_pgrp;
2044 #endif /* MEM_DEL_STATS */
2045                                         MDSTAT_PGRP(mhp, ntick_pgrp);
2046                                         mutex_enter(&mhp->mh_mutex);
2047                                         continue;
2048                                 }
2049
2050                                 /*
2051                                  * Regular 'page-out'.
2052                                  */
2053                                 if (!mod) {
2054                                         MDSTAT_INCR(mhp, ndestroy);
2055                                         page_destroy(pp, 1);
2056                                         /*
2057                                          * page_destroy was called with
2058                                          * dontfree. As long as p_lckcnt
2059                                          * and p_cowcnt are both zero, the
2060                                          * only additional action of
2061                                          * page_destroy with !dontfree is to
2062                                          * call page_free, so we can collect
2063                                          * the page here.
2064                                          */
2065                                         collected++;
2066 #ifdef MEM_DEL_STATS
2067                                         ntick_pgrp = (uint64_t)ddi_get_lbolt() -
2068                                             start_pgrp;
2069 #endif /* MEM_DEL_STATS */
2070                                         MDSTAT_PGRP(mhp, ntick_pgrp);
2071                                         mutex_enter(&mhp->mh_mutex);
2072                                         page_delete_collect(pp, mhp);
2073                                         mdsp->mds_bitmap[bit / NBPBMW] |=
2074                                             (1 << (bit % NBPBMW));
2075                                         continue;
2076                                 }
2077                                 /*
2078                                  * The page is toxic and the mod bit is
2079                                  * set, we cannot do anything here to deal
2080                                  * with it.
2081                                  */
2082                                 if (PP_TOXIC(pp)) {
2083                                         page_unlock(pp);
2084 #ifdef MEM_DEL_STATS
2085                                         ntick_pgrp = (uint64_t)ddi_get_lbolt() -
2086                                             start_pgrp;
2087 #endif /* MEM_DEL_STATS */
2088                                         MDSTAT_PGRP(mhp, ntick_pgrp);
2089                                         MDSTAT_INCR(mhp, modtoxic);
2090                                         mutex_enter(&mhp->mh_mutex);
2091                                         continue;
2092                                 }
2093                                 MDSTAT_INCR(mhp, nputpage);
2094                                 vp = pp->p_vnode;
2095                                 offset = pp->p_offset;
2096                                 VN_HOLD(vp);
2097                                 page_unlock(pp);
2098                                 (void) VOP_PUTPAGE(vp, offset, PAGESIZE,
2099                                     B_INVAL|B_FORCE, kcred, NULL);
2100                                 VN_RELE(vp);
2101 #ifdef MEM_DEL_STATS
2102                                 ntick_pgrp = (uint64_t)ddi_get_lbolt() -
2103                                     start_pgrp;
2104 #endif /* MEM_DEL_STATS */
2105                                 MDSTAT_PGRP(mhp, ntick_pgrp);
2106                                 /*
2107                                  * Try to get the page back immediately
2108                                  * so that it can be collected.
2109                                  */
2110                                 pp = page_numtopp_nolock(pfn);
2111                                 if (pp == NULL) {
2112                                         MDSTAT_INCR(mhp, nnoreclaim);
2113                                         /*
2114                                          * This should not happen as this
2115                                          * thread is deleting the page.
2116                                          * If this code is generalized, this
2117                                          * becomes a reality.
2118                                          */
2119 #ifdef DEBUG
2120                                         cmn_err(CE_WARN,
2121                                             "delete_memory_thread(0x%p) "
2122                                             "pfn 0x%lx has no page_t",
2123                                             (void *)mhp, pfn);
2124 #endif /* DEBUG */
2125                                         mutex_enter(&mhp->mh_mutex);
2126                                         continue;
2127                                 }
2128                                 if (page_try_reclaim_lock(pp, SE_EXCL,
2129                                     SE_EXCL_WANTED | SE_RETIRED)) {
2130                                         if (PP_ISFREE(pp)) {
2131                                                 goto free_page_collect;
2132                                         }
2133                                         page_unlock(pp);
2134                                 }
2135                                 MDSTAT_INCR(mhp, nnoreclaim);
2136                                 mutex_enter(&mhp->mh_mutex);
2137                                 continue;
2138
2139                         reloc:
2140                                 /*
2141                                  * Got some freemem and a target
2142                                  * page, so move the data to avoid
2143                                  * I/O and lock problems.
2144                                  */
2145                                 ASSERT(!page_iolock_assert(pp));
2146                                 MDSTAT_INCR(mhp, nreloc);
2147                                 /*
2148                                  * page_relocate() will return pgcnt: the
2149                                  * number of consecutive pages relocated.
2150                                  * If it is successful, pp will be a
2151                                  * linked list of the page structs that
2152                                  * were relocated. If page_relocate() is
2153                                  * unsuccessful, pp will be unmodified.
2154                                  */
2155 #ifdef MEM_DEL_STATS
2156                                 start_pgrp = ddi_get_lbolt();
2157 #endif /* MEM_DEL_STATS */
2158                                 result = page_relocate(&pp, &pp_targ, 0, 0,
2159                                     &pgcnt, NULL);
2160 #ifdef MEM_DEL_STATS
2161                                 ntick_pgrp = (uint64_t)ddi_get_lbolt() -
2162                                     start_pgrp;
2163 #endif /* MEM_DEL_STATS */
2164                                 MDSTAT_PGRP(mhp, ntick_pgrp);
2165                                 if (result != 0) {
2166                                         MDSTAT_INCR(mhp, nrelocfail);
2167                                         /*
2168                                          * We did not succeed. We need
2169                                          * to give the pp_targ pages back.
2170                                          * page_free(pp_targ, 1) without
2171                                          * the freemem accounting.
2172                                          */
2173                                         group_page_unlock(pp);
2174                                         page_free_replacement_page(pp_targ);
2175                                         page_unlock(pp);
2176                                         mutex_enter(&mhp->mh_mutex);
2177                                         continue;
2178                                 }
2179
2180                                 /*
2181                                  * We will then collect pgcnt pages.
2182                                  */
2183                                 ASSERT(pgcnt > 0);
2184                                 mutex_enter(&mhp->mh_mutex);
2185                                 /*
2186                                  * We need to make sure freemem_left is
2187                                  * large enough.
2188                                  */
2189                                 while ((freemem_left < pgcnt) &&
2190                                     (!mhp->mh_cancel)) {
2191                                         freemem_left +=
2192                                             delthr_get_freemem(mhp);
2193                                 }
2194
2195                                 /*
2196                                  * Do not proceed if mh_cancel is set.
2197                                  */
2198                                 if (mhp->mh_cancel) {
2199                                         while (pp_targ != NULL) {
2200                                                 /*
2201                                                  * Unlink and unlock each page.
2202                                                  */
2203                                                 tpp_targ = pp_targ;
2204                                                 page_sub(&pp_targ, tpp_targ);
2205                                                 page_unlock(tpp_targ);
2206                                         }
2207                                         /*
2208                                          * We need to give the pp pages back.
2209                                          * page_free(pp, 1) without the
2210                                          * freemem accounting.
2211                                          */
2212                                         page_free_replacement_page(pp);
2213                                         break;
2214                                 }
2215
2216                                 /* Now remove pgcnt from freemem_left */
2217                                 freemem_left -= pgcnt;
2218                                 ASSERT(freemem_left >= 0);
2219                                 szc = pp->p_szc;
2220                                 while (pp != NULL) {
2221                                         /*
2222                                          * pp and pp_targ were passed back as
2223                                          * a linked list of pages.
2224                                          * Unlink and unlock each page.
2225                                          */
2226                                         tpp_targ = pp_targ;
2227                                         page_sub(&pp_targ, tpp_targ);
2228                                         page_unlock(tpp_targ);
2229                                         /*
2230                                          * The original page is now free
2231                                          * so remove it from the linked
2232                                          * list and collect it.
2233                                          */
2234                                         tpp = pp;
2235                                         page_sub(&pp, tpp);
2236                                         pfn = page_pptonum(tpp);
2237                                         collected++;
2238                                         ASSERT(PAGE_EXCL(tpp));
2239                                         ASSERT(tpp->p_vnode == NULL);
2240                                         ASSERT(!hat_page_is_mapped(tpp));
2241                                         ASSERT(tpp->p_szc == szc);
2242                                         tpp->p_szc = 0;
2243                                         page_delete_collect(tpp, mhp);
2244                                         bit = pfn - mdsp->mds_base;
2245                                         mdsp->mds_bitmap[bit / NBPBMW] |=
2246                                             (1 << (bit % NBPBMW));
2247                                 }
2248                                 ASSERT(pp_targ == NULL);
2249                         }
2250                 }
2251                 first_scan = 0;
2252                 if ((mhp->mh_cancel == 0) && (mhp->mh_hold_todo != 0) &&
2253                     (collected == 0)) {
2254                         /*
2255                          * This code is needed as we cannot wait
2256                          * for a page to be locked OR the delete to
2257                          * be cancelled.  Also, we must delay so
2258                          * that other threads get a chance to run
2259                          * on our cpu, otherwise page locks may be
2260                          * held indefinitely by those threads.
2261                          */
2262                         MDSTAT_INCR(mhp, ndelay);
2263                         CALLB_CPR_SAFE_BEGIN(&cprinfo);
2264                         (void) cv_reltimedwait(&mhp->mh_cv, &mhp->mh_mutex,
2265                             DEL_BUSY_WAIT_TICKS, TR_CLOCK_TICK);
2266                         CALLB_CPR_SAFE_END(&cprinfo, &mhp->mh_mutex);
2267                 }
2268         }
2269         /* stop the dr aio cleanup thread */
2270         mhp->mh_dr_aio_cleanup_cancel = 1;
2271         transit_list_collect(mhp, 0);
2272         if (freemem_left != 0) {
2273                 /* Return any surplus. */
2274                 page_create_putback(freemem_left);
2275                 freemem_left = 0;
2276         }
2277 #ifdef MEM_DEL_STATS
2278         ntick_total = (uint64_t)ddi_get_lbolt() - start_total;
2279 #endif /* MEM_DEL_STATS */
2280         MDSTAT_TOTAL(mhp, ntick_total);
2281         MDSTAT_PRINT(mhp);
2282
2283         /*
2284          * If the memory delete was cancelled, exclusive-wanted bits must
2285          * be cleared. If there are retired pages being deleted, they need
2286          * to be unretired.
2287          */
2288         for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
2289             mdsp = mdsp->mds_next) {
2290                 pfn_t pfn, p_end;
2291
2292                 p_end = mdsp->mds_base + mdsp->mds_npgs;
2293                 for (pfn = mdsp->mds_base; pfn < p_end; pfn++) {
2294                         page_t *pp;
2295                         pgcnt_t bit;
2296
2297                         bit = pfn - mdsp->mds_base;
2298                         if (mhp->mh_cancel) {
2299                                 pp = page_numtopp_nolock(pfn);
2300                                 if (pp != NULL) {
2301                                         if ((mdsp->mds_bitmap[bit / NBPBMW] &
2302                                             (1 << (bit % NBPBMW))) == 0) {
2303                                                 page_lock_clr_exclwanted(pp);
2304                                         }
2305                                 }
2306                         } else {
2307                                 pp = NULL;
2308                         }
2309                         if ((mdsp->mds_bitmap_retired[bit / NBPBMW] &
2310                             (1 << (bit % NBPBMW))) != 0) {
2311                                 /* do we already have pp? */
2312                                 if (pp == NULL) {
2313                                         pp = page_numtopp_nolock(pfn);
2314                                 }
2315                                 ASSERT(pp != NULL);
2316                                 ASSERT(PP_RETIRED(pp));
2317                                 if (mhp->mh_cancel != 0) {
2318                                         page_unlock(pp);
2319                                         /*
2320                                          * To satisfy ASSERT below in
2321                                          * cancel code.
2322                                          */
2323                                         mhp->mh_hold_todo++;
2324                                 } else {
2325                                         (void) page_unretire_pp(pp,
2326                                             PR_UNR_CLEAN);
2327                                 }
2328                         }
2329                 }
2330         }
2331         /*
2332          * Free retired page bitmap and collected page bitmap
2333          */
2334         for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
2335             mdsp = mdsp->mds_next) {
2336                 ASSERT(mdsp->mds_bitmap_retired != NULL);
2337                 kmem_free(mdsp->mds_bitmap_retired, MDS_BITMAPBYTES(mdsp));
2338                 mdsp->mds_bitmap_retired = NULL;        /* Paranoia. */
2339                 ASSERT(mdsp->mds_bitmap != NULL);
2340                 kmem_free(mdsp->mds_bitmap, MDS_BITMAPBYTES(mdsp));
2341                 mdsp->mds_bitmap = NULL;        /* Paranoia. */
2342         }
2343
2344         /* wait for our dr aio cancel thread to exit */
2345         while (!(mhp->mh_aio_cleanup_done)) {
2346                 CALLB_CPR_SAFE_BEGIN(&cprinfo);
2347                 delay(drv_usectohz(DR_AIO_CLEANUP_DELAY));
2348                 CALLB_CPR_SAFE_END(&cprinfo, &mhp->mh_mutex);
2349         }
2350 refused:
2351         if (mhp->mh_cancel != 0) {
2352                 page_t *pp;
2353
2354                 comp_code = mhp->mh_cancel;
2355                 /*
2356                  * Go through list of deleted pages (mh_deleted) freeing
2357                  * them.
2358                  */
2359                 while ((pp = mhp->mh_deleted) != NULL) {
2360                         mhp->mh_deleted = pp->p_next;
2361                         mhp->mh_hold_todo++;
2362                         mutex_exit(&mhp->mh_mutex);
2363                         /* Restore p_next. */
2364                         pp->p_next = pp->p_prev;
2365                         if (PP_ISFREE(pp)) {
2366                                 cmn_err(CE_PANIC,
2367                                     "page %p is free",
2368                                     (void *)pp);
2369                         }
2370                         page_free(pp, 1);
2371                         mutex_enter(&mhp->mh_mutex);
2372                 }
2373                 ASSERT(mhp->mh_hold_todo == mhp->mh_vm_pages);
2374
2375                 mutex_exit(&mhp->mh_mutex);
2376                 put_availrmem(mhp->mh_vm_pages);
2377                 mutex_enter(&mhp->mh_mutex);
2378
2379                 goto t_exit;
2380         }
2381
2382         /*
2383          * All the pages are no longer in use and are exclusively locked.
2384          */
2385
2386         mhp->mh_deleted = NULL;
2387
2388         kphysm_del_cleanup(mhp);
2389
2390         /*
2391          * mem_node_del_range needs to be after kphysm_del_cleanup so
2392          * that the mem_node_config[] will remain intact for the cleanup.
2393          */
2394         for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
2395             mdsp = mdsp->mds_next) {
2396                 mem_node_del_range(mdsp->mds_base,
2397                     mdsp->mds_base + mdsp->mds_npgs - 1);
2398         }
2399         /* cleanup the page counters */
2400         page_ctrs_cleanup();
2401
2402         comp_code = KPHYSM_OK;
2403
2404 t_exit:
2405         mutex_exit(&mhp->mh_mutex);
2406         kphysm_setup_post_del(mhp->mh_vm_pages,
2407             (comp_code == KPHYSM_OK) ? 0 : 1);
2408         mutex_enter(&mhp->mh_mutex);
2409
2410 early_exit:
2411         /* mhp->mh_mutex exited by CALLB_CPR_EXIT() */
2412         mhp->mh_state = MHND_DONE;
2413         del_complete_funcp = mhp->mh_delete_complete;
2414         del_complete_arg = mhp->mh_delete_complete_arg;
2415         CALLB_CPR_EXIT(&cprinfo);
2416         (*del_complete_funcp)(del_complete_arg, comp_code);
2417         thread_exit();
2418         /*NOTREACHED*/
2419 }
2420
2421 /*
2422  * Start the delete of the memory from the system.
2423  */
2424 int
2425 kphysm_del_start(
2426         memhandle_t handle,
2427         void (*complete)(void *, int),
2428         void *complete_arg)
2429 {
2430         struct mem_handle *mhp;
2431
2432         mhp = kphysm_lookup_mem_handle(handle);
2433         if (mhp == NULL) {
2434                 return (KPHYSM_EHANDLE);
2435         }
2436         switch (mhp->mh_state) {
2437         case MHND_FREE:
2438                 ASSERT(mhp->mh_state != MHND_FREE);
2439                 mutex_exit(&mhp->mh_mutex);
2440                 return (KPHYSM_EHANDLE);
2441         case MHND_INIT:
2442                 break;
2443         case MHND_STARTING:
2444         case MHND_RUNNING:
2445                 mutex_exit(&mhp->mh_mutex);
2446                 return (KPHYSM_ESEQUENCE);
2447         case MHND_DONE:
2448                 mutex_exit(&mhp->mh_mutex);
2449                 return (KPHYSM_ESEQUENCE);
2450         case MHND_RELEASE:
2451                 mutex_exit(&mhp->mh_mutex);
2452                 return (KPHYSM_ESEQUENCE);
2453         default:
2454 #ifdef DEBUG
2455                 cmn_err(CE_WARN, "kphysm_del_start(0x%p) state corrupt %d",
2456                     (void *)mhp, mhp->mh_state);
2457 #endif /* DEBUG */
2458                 mutex_exit(&mhp->mh_mutex);
2459                 return (KPHYSM_EHANDLE);
2460         }
2461
2462         if (mhp->mh_transit.trl_spans == NULL) {
2463                 mutex_exit(&mhp->mh_mutex);
2464                 return (KPHYSM_ENOWORK);
2465         }
2466
2467         ASSERT(complete != NULL);
2468         mhp->mh_delete_complete = complete;
2469         mhp->mh_delete_complete_arg = complete_arg;
2470         mhp->mh_state = MHND_STARTING;
2471         /*
2472          * Release the mutex in case thread_create sleeps.
2473          */
2474         mutex_exit(&mhp->mh_mutex);
2475
2476         /*
2477          * The "obvious" process for this thread is pageout (proc_pageout)
2478          * but this gives the thread too much power over freemem
2479          * which results in freemem starvation.
2480          */
2481         (void) thread_create(NULL, 0, delete_memory_thread, mhp, 0, &p0,
2482             TS_RUN, maxclsyspri - 1);
2483
2484         return (KPHYSM_OK);
2485 }
2486
/*
 * State describing the dummy pages that deleted page_t ranges are remapped
 * onto.  All of it is set up once by memseg_remap_init() and consumed by
 * remap_to_dummy().
 */
static kmutex_t pp_dummy_lock;          /* Protects init. of pp_dummy. */
static caddr_t pp_dummy;        /* VA of dummy pages; non-NULL once init is done. */
static pgcnt_t pp_dummy_npages; /* Number of dummy pages. */
static pfn_t *pp_dummy_pfn;     /* Array of dummy pfns. */
2491
2492 static void
2493 memseg_remap_init_pages(page_t *pages, page_t *epages)
2494 {
2495         page_t *pp;
2496
2497         for (pp = pages; pp < epages; pp++) {
2498                 pp->p_pagenum = PFN_INVALID;    /* XXXX */
2499                 pp->p_offset = (u_offset_t)-1;
2500                 page_iolock_init(pp);
2501                 while (!page_lock(pp, SE_EXCL, (kmutex_t *)NULL, P_RECLAIM))
2502                         continue;
2503                 page_lock_delete(pp);
2504         }
2505 }
2506
2507 void
2508 memseg_remap_init()
2509 {
2510         mutex_enter(&pp_dummy_lock);
2511         if (pp_dummy == NULL) {
2512                 uint_t dpages;
2513                 int i;
2514
2515                 /*
2516                  * dpages starts off as the size of the structure and
2517                  * ends up as the minimum number of pages that will
2518                  * hold a whole number of page_t structures.
2519                  */
2520                 dpages = sizeof (page_t);
2521                 ASSERT(dpages != 0);
2522                 ASSERT(dpages <= MMU_PAGESIZE);
2523
2524                 while ((dpages & 1) == 0)
2525                         dpages >>= 1;
2526
2527                 pp_dummy_npages = dpages;
2528                 /*
2529                  * Allocate pp_dummy pages directly from static_arena,
2530                  * since these are whole page allocations and are
2531                  * referenced by physical address.  This also has the
2532                  * nice fringe benefit of hiding the memory from
2533                  * ::findleaks since it doesn't deal well with allocated
2534                  * kernel heap memory that doesn't have any mappings.
2535                  */
2536                 pp_dummy = vmem_xalloc(static_arena, ptob(pp_dummy_npages),
2537                     PAGESIZE, 0, 0, NULL, NULL, VM_SLEEP);
2538                 bzero(pp_dummy, ptob(pp_dummy_npages));
2539                 ASSERT(((uintptr_t)pp_dummy & MMU_PAGEOFFSET) == 0);
2540                 pp_dummy_pfn = kmem_alloc(sizeof (*pp_dummy_pfn) *
2541                     pp_dummy_npages, KM_SLEEP);
2542                 for (i = 0; i < pp_dummy_npages; i++) {
2543                         pp_dummy_pfn[i] = hat_getpfnum(kas.a_hat,
2544                             &pp_dummy[MMU_PAGESIZE * i]);
2545                         ASSERT(pp_dummy_pfn[i] != PFN_INVALID);
2546                 }
2547                 /*
2548                  * Initialize the page_t's to a known 'deleted' state
2549                  * that matches the state of deleted pages.
2550                  */
2551                 memseg_remap_init_pages((page_t *)pp_dummy,
2552                     (page_t *)(pp_dummy + ptob(pp_dummy_npages)));
2553                 /* Remove kmem mappings for the pages for safety. */
2554                 hat_unload(kas.a_hat, pp_dummy, ptob(pp_dummy_npages),
2555                     HAT_UNLOAD_UNLOCK);
2556                 /* Leave pp_dummy pointer set as flag that init is done. */
2557         }
2558         mutex_exit(&pp_dummy_lock);
2559 }
2560
2561 /*
2562  * Remap a page-aglined range of page_t's to dummy pages.
2563  */
2564 void
2565 remap_to_dummy(caddr_t va, pgcnt_t metapgs)
2566 {
2567         int phase;
2568
2569         ASSERT(IS_P2ALIGNED((uint64_t)(uintptr_t)va, PAGESIZE));
2570
2571         /*
2572          * We may start remapping at a non-zero page offset
2573          * within the dummy pages since the low/high ends
2574          * of the outgoing pp's could be shared by other
2575          * memsegs (see memseg_remap_meta).
2576          */
2577         phase = btop((uint64_t)(uintptr_t)va) % pp_dummy_npages;
2578         /*CONSTCOND*/
2579         ASSERT(PAGESIZE % sizeof (page_t) || phase == 0);
2580
2581         while (metapgs != 0) {
2582                 pgcnt_t n;
2583                 int i, j;
2584
2585                 n = pp_dummy_npages;
2586                 if (n > metapgs)
2587                         n = metapgs;
2588                 for (i = 0; i < n; i++) {
2589                         j = (i + phase) % pp_dummy_npages;
2590                         hat_devload(kas.a_hat, va, ptob(1), pp_dummy_pfn[j],
2591                             PROT_READ,
2592                             HAT_LOAD | HAT_LOAD_NOCONSIST |
2593                             HAT_LOAD_REMAP);
2594                         va += ptob(1);
2595                 }
2596                 metapgs -= n;
2597         }
2598 }
2599
2600 static void
2601 memseg_remap_to_dummy(struct memseg *seg)
2602 {
2603         caddr_t pp;
2604         pgcnt_t metapgs;
2605
2606         ASSERT(memseg_is_dynamic(seg));
2607         ASSERT(pp_dummy != NULL);
2608
2609
2610         if (!memseg_includes_meta(seg)) {
2611                 memseg_remap_meta(seg);
2612                 return;
2613         }
2614
2615         pp = (caddr_t)seg->pages;
2616         metapgs = seg->pages_base - memseg_get_start(seg);
2617         ASSERT(metapgs != 0);
2618
2619         seg->pages_end = seg->pages_base;
2620
2621         remap_to_dummy(pp, metapgs);
2622 }
2623
2624 /*
2625  * Transition all the deleted pages to the deleted state so that
2626  * page_lock will not wait. The page_lock_delete call will
2627  * also wake up any waiters.
2628  */
2629 static void
2630 memseg_lock_delete_all(struct memseg *seg)
2631 {
2632         page_t *pp;
2633
2634         for (pp = seg->pages; pp < seg->epages; pp++) {
2635                 pp->p_pagenum = PFN_INVALID;    /* XXXX */
2636                 page_lock_delete(pp);
2637         }
2638 }
2639
2640 static void
2641 kphysm_del_cleanup(struct mem_handle *mhp)
2642 {
2643         struct memdelspan       *mdsp;
2644         struct memseg           *seg;
2645         struct memseg           **segpp;
2646         struct memseg           *seglist;
2647         pfn_t                   p_end;
2648         uint64_t                avmem;
2649         pgcnt_t                 avpgs;
2650         pgcnt_t                 npgs;
2651
2652         avpgs = mhp->mh_vm_pages;
2653
2654         memsegs_lock(1);
2655
2656         /*
2657          * remove from main segment list.
2658          */
2659         npgs = 0;
2660         seglist = NULL;
2661         for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
2662             mdsp = mdsp->mds_next) {
2663                 p_end = mdsp->mds_base + mdsp->mds_npgs;
2664                 for (segpp = &memsegs; (seg = *segpp) != NULL; ) {
2665                         if (seg->pages_base >= p_end ||
2666                             seg->pages_end <= mdsp->mds_base) {
2667                                 /* Span and memseg don't overlap. */
2668                                 segpp = &((*segpp)->next);
2669                                 continue;
2670                         }
2671                         ASSERT(seg->pages_base >= mdsp->mds_base);
2672                         ASSERT(seg->pages_end <= p_end);
2673
2674                         PLCNT_MODIFY_MAX(seg->pages_base,
2675                             seg->pages_base - seg->pages_end);
2676
2677                         /* Hide the memseg from future scans. */
2678                         hat_kpm_delmem_mseg_update(seg, segpp);
2679                         *segpp = seg->next;
2680                         membar_producer();      /* TODO: Needed? */
2681                         npgs += MSEG_NPAGES(seg);
2682
2683                         /*
2684                          * Leave the deleted segment's next pointer intact
2685                          * in case a memsegs scanning loop is walking this
2686                          * segment concurrently.
2687                          */
2688                         seg->lnext = seglist;
2689                         seglist = seg;
2690                 }
2691         }
2692
2693         build_pfn_hash();
2694
2695         ASSERT(npgs < total_pages);
2696         total_pages -= npgs;
2697
2698         /*
2699          * Recalculate the paging parameters now total_pages has changed.
2700          * This will also cause the clock hands to be reset before next use.
2701          */
2702         setupclock(1);
2703
2704         memsegs_unlock(1);
2705
2706         mutex_exit(&mhp->mh_mutex);
2707
2708         while ((seg = seglist) != NULL) {
2709                 pfn_t mseg_start;
2710                 pfn_t mseg_base, mseg_end;
2711                 pgcnt_t mseg_npgs;
2712                 int mlret;
2713
2714                 seglist = seg->lnext;
2715
2716                 /*
2717                  * Put the page_t's into the deleted state to stop
2718                  * cv_wait()s on the pages. When we remap, the dummy
2719                  * page_t's will be in the same state.
2720                  */
2721                 memseg_lock_delete_all(seg);
2722                 /*
2723                  * Collect up information based on pages_base and pages_end
2724                  * early so that we can flag early that the memseg has been
2725                  * deleted by setting pages_end == pages_base.
2726                  */
2727                 mseg_base = seg->pages_base;
2728                 mseg_end = seg->pages_end;
2729                 mseg_npgs = MSEG_NPAGES(seg);
2730                 mseg_start = memseg_get_start(seg);
2731
2732                 if (memseg_is_dynamic(seg)) {
2733                         /* Remap the meta data to our special dummy area. */
2734                         memseg_remap_to_dummy(seg);
2735
2736                         mutex_enter(&memseg_lists_lock);
2737                         seg->lnext = memseg_va_avail;
2738                         memseg_va_avail = seg;
2739                         mutex_exit(&memseg_lists_lock);
2740                 } else {
2741                         /*
2742                          * For memory whose page_ts were allocated
2743                          * at boot, we need to find a new use for
2744                          * the page_t memory.
2745                          * For the moment, just leak it.
2746                          * (It is held in the memseg_delete_junk list.)
2747                          */
2748                         seg->pages_end = seg->pages_base;
2749
2750                         mutex_enter(&memseg_lists_lock);
2751                         seg->lnext = memseg_delete_junk;
2752                         memseg_delete_junk = seg;
2753                         mutex_exit(&memseg_lists_lock);
2754                 }
2755
2756                 /* Must not use seg now as it could be re-used. */
2757
2758                 memlist_write_lock();
2759
2760                 mlret = memlist_delete_span(
2761                     (uint64_t)(mseg_base) << PAGESHIFT,
2762                     (uint64_t)(mseg_npgs) << PAGESHIFT,
2763                     &phys_avail);
2764                 ASSERT(mlret == MEML_SPANOP_OK);
2765
2766                 mlret = memlist_delete_span(
2767                     (uint64_t)(mseg_start) << PAGESHIFT,
2768                     (uint64_t)(mseg_end - mseg_start) <<
2769                     PAGESHIFT,
2770                     &phys_install);
2771                 ASSERT(mlret == MEML_SPANOP_OK);
2772                 phys_install_has_changed();
2773
2774                 memlist_write_unlock();
2775         }
2776
2777         memlist_read_lock();
2778         installed_top_size(phys_install, &physmax, &physinstalled);
2779         memlist_read_unlock();
2780
2781         mutex_enter(&freemem_lock);
2782         maxmem -= avpgs;
2783         physmem -= avpgs;
2784         /* availrmem is adjusted during the delete. */
2785         availrmem_initial -= avpgs;
2786
2787         mutex_exit(&freemem_lock);
2788
2789         dump_resize();
2790
2791         cmn_err(CE_CONT, "?kphysm_delete: mem = %ldK "
2792             "(0x%" PRIx64 ")\n",
2793             physinstalled << (PAGESHIFT - 10),
2794             (uint64_t)physinstalled << PAGESHIFT);
2795
2796         avmem = (uint64_t)freemem << PAGESHIFT;
2797         cmn_err(CE_CONT, "?kphysm_delete: "
2798             "avail mem = %" PRId64 "\n", avmem);
2799
2800         /*
2801          * Update lgroup generation number on single lgroup systems
2802          */
2803         if (nlgrps == 1)
2804                 lgrp_config(LGRP_CONFIG_GEN_UPDATE, 0, 0);
2805
2806         /* Successfully deleted system memory */
2807         mutex_enter(&mhp->mh_mutex);
2808 }
2809
2810 static uint_t mdel_nullvp_waiter;
2811
2812 static void
2813 page_delete_collect(
2814         page_t *pp,
2815         struct mem_handle *mhp)
2816 {
2817         if (pp->p_vnode) {
2818                 page_hashout(pp, (kmutex_t *)NULL);
2819                 /* do not do PP_SETAGED(pp); */
2820         } else {
2821                 kmutex_t *sep;
2822
2823                 sep = page_se_mutex(pp);
2824                 mutex_enter(sep);
2825                 if (CV_HAS_WAITERS(&pp->p_cv)) {
2826                         mdel_nullvp_waiter++;
2827                         cv_broadcast(&pp->p_cv);
2828                 }
2829                 mutex_exit(sep);
2830         }
2831         ASSERT(pp->p_next == pp->p_prev);
2832         ASSERT(pp->p_next == NULL || pp->p_next == pp);
2833         pp->p_next = mhp->mh_deleted;
2834         mhp->mh_deleted = pp;
2835         ASSERT(mhp->mh_hold_todo != 0);
2836         mhp->mh_hold_todo--;
2837 }
2838
2839 static void
2840 transit_list_collect(struct mem_handle *mhp, int v)
2841 {
2842         struct transit_list_head *trh;
2843
2844         trh = &transit_list_head;
2845         mutex_enter(&trh->trh_lock);
2846         mhp->mh_transit.trl_collect = v;
2847         mutex_exit(&trh->trh_lock);
2848 }
2849
2850 static void
2851 transit_list_insert(struct transit_list *tlp)
2852 {
2853         struct transit_list_head *trh;
2854
2855         trh = &transit_list_head;
2856         ASSERT(MUTEX_HELD(&trh->trh_lock));
2857         tlp->trl_next = trh->trh_head;
2858         trh->trh_head = tlp;
2859 }
2860
2861 static void
2862 transit_list_remove(struct transit_list *tlp)
2863 {
2864         struct transit_list_head *trh;
2865         struct transit_list **tlpp;
2866
2867         trh = &transit_list_head;
2868         tlpp = &trh->trh_head;
2869         ASSERT(MUTEX_HELD(&trh->trh_lock));
2870         while (*tlpp != NULL && *tlpp != tlp)
2871                 tlpp = &(*tlpp)->trl_next;
2872         ASSERT(*tlpp != NULL);
2873         if (*tlpp == tlp)
2874                 *tlpp = tlp->trl_next;
2875         tlp->trl_next = NULL;
2876 }
2877
2878 static struct transit_list *
2879 pfnum_to_transit_list(struct transit_list_head *trh, pfn_t pfnum)
2880 {
2881         struct transit_list *tlp;
2882
2883         for (tlp = trh->trh_head; tlp != NULL; tlp = tlp->trl_next) {
2884                 struct memdelspan *mdsp;
2885
2886                 for (mdsp = tlp->trl_spans; mdsp != NULL;
2887                     mdsp = mdsp->mds_next) {
2888                         if (pfnum >= mdsp->mds_base &&
2889                             pfnum < (mdsp->mds_base + mdsp->mds_npgs)) {
2890                                 return (tlp);
2891                         }
2892                 }
2893         }
2894         return (NULL);
2895 }
2896
2897 int
2898 pfn_is_being_deleted(pfn_t pfnum)
2899 {
2900         struct transit_list_head *trh;
2901         struct transit_list *tlp;
2902         int ret;
2903
2904         trh = &transit_list_head;
2905         if (trh->trh_head == NULL)
2906                 return (0);
2907
2908         mutex_enter(&trh->trh_lock);
2909         tlp = pfnum_to_transit_list(trh, pfnum);
2910         ret = (tlp != NULL && tlp->trl_collect);
2911         mutex_exit(&trh->trh_lock);
2912
2913         return (ret);
2914 }
2915
#ifdef MEM_DEL_STATS
extern int hz;

/*
 * Print the delete statistics held in mhp->mh_delstat.
 * Nothing is printed unless mem_del_stat_print is non-zero.
 */
static void
mem_del_stat_print_func(struct mem_handle *mhp)
{
        uint64_t secs;

        if (!mem_del_stat_print)
                return;

        /* Header: base/size of the first span, and whether cancelled. */
        printf("memory delete loop %x/%x, statistics%s\n",
            (uint_t)mhp->mh_transit.trl_spans->mds_base,
            (uint_t)mhp->mh_transit.trl_spans->mds_npgs,
            (mhp->mh_cancel ? " (cancelled)" : ""));

        /* Event counters. */
        printf("\t%8u nloop\n", mhp->mh_delstat.nloop);
        printf("\t%8u need_free\n", mhp->mh_delstat.need_free);
        printf("\t%8u free_loop\n", mhp->mh_delstat.free_loop);
        printf("\t%8u free_low\n", mhp->mh_delstat.free_low);
        printf("\t%8u free_failed\n", mhp->mh_delstat.free_failed);
        printf("\t%8u ncheck\n", mhp->mh_delstat.ncheck);
        printf("\t%8u nopaget\n", mhp->mh_delstat.nopaget);
        printf("\t%8u lockfail\n", mhp->mh_delstat.lockfail);
        printf("\t%8u nfree\n", mhp->mh_delstat.nfree);
        printf("\t%8u nreloc\n", mhp->mh_delstat.nreloc);
        printf("\t%8u nrelocfail\n", mhp->mh_delstat.nrelocfail);
        printf("\t%8u already_done\n", mhp->mh_delstat.already_done);
        printf("\t%8u first_notfree\n", mhp->mh_delstat.first_notfree);
        printf("\t%8u npplocked\n", mhp->mh_delstat.npplocked);
        printf("\t%8u nlockreloc\n", mhp->mh_delstat.nlockreloc);
        printf("\t%8u nnorepl\n", mhp->mh_delstat.nnorepl);
        printf("\t%8u nmodreloc\n", mhp->mh_delstat.nmodreloc);
        printf("\t%8u ndestroy\n", mhp->mh_delstat.ndestroy);
        printf("\t%8u nputpage\n", mhp->mh_delstat.nputpage);
        printf("\t%8u nnoreclaim\n", mhp->mh_delstat.nnoreclaim);
        printf("\t%8u ndelay\n", mhp->mh_delstat.ndelay);
        printf("\t%8u demotefail\n", mhp->mh_delstat.demotefail);
        printf("\t%8u retired\n", mhp->mh_delstat.retired);
        printf("\t%8u toxic\n", mhp->mh_delstat.toxic);
        printf("\t%8u failing\n", mhp->mh_delstat.failing);
        printf("\t%8u modtoxic\n", mhp->mh_delstat.modtoxic);
        printf("\t%8u npplkdtoxic\n", mhp->mh_delstat.npplkdtoxic);
        printf("\t%8u gptlmodfail\n", mhp->mh_delstat.gptlmodfail);
        printf("\t%8u gptllckfail\n", mhp->mh_delstat.gptllckfail);

        /* Tick totals, also shown as minutes and seconds. */
        secs = mhp->mh_delstat.nticks_total / hz;
        printf(
            "\t%"PRIu64" nticks_total - %"PRIu64" min %"PRIu64" sec\n",
            mhp->mh_delstat.nticks_total, secs / 60, secs % 60);

        secs = mhp->mh_delstat.nticks_pgrp / hz;
        printf(
            "\t%"PRIu64" nticks_pgrp - %"PRIu64" min %"PRIu64" sec\n",
            mhp->mh_delstat.nticks_pgrp, secs / 60, secs % 60);
}
#endif /* MEM_DEL_STATS */
2969
/*
 * One registered set of memory setup callbacks: the caller's vector and
 * the argument it was registered with.
 */
struct mem_callback {
        kphysm_setup_vector_t   *vec;
        void                    *arg;
};

/* Capacity of the mem_callbacks[] table. */
#define NMEMCALLBACKS           100

/*
 * Registration table.  nmemcallbacks is the number of slots handed out so
 * far; a slot below that mark with vec == NULL is reused by
 * kphysm_setup_func_register().  mem_callback_rwlock is taken as writer
 * while the table is modified.
 */
static struct mem_callback mem_callbacks[NMEMCALLBACKS];
static uint_t nmemcallbacks;
static krwlock_t mem_callback_rwlock;
2980
2981 int
2982 kphysm_setup_func_register(kphysm_setup_vector_t *vec, void *arg)
2983 {
2984         uint_t i, found;
2985
2986         /*
2987          * This test will become more complicated when the version must
2988          * change.
2989          */
2990         if (vec->version != KPHYSM_SETUP_VECTOR_VERSION)
2991                 return (EINVAL);
2992
2993         if (vec->post_add == NULL || vec->pre_del == NULL ||
2994             vec->post_del == NULL)
2995                 return (EINVAL);
2996
2997         rw_enter(&mem_callback_rwlock, RW_WRITER);
2998         for (i = 0, found = 0; i < nmemcallbacks; i++) {
2999                 if (mem_callbacks[i].vec == NULL && found == 0)
3000                         found = i + 1;
3001                 if (mem_callbacks[i].vec == vec &&
3002                     mem_callbacks[i].arg == arg) {
3003 #ifdef DEBUG
3004                         /* Catch this in DEBUG kernels. */
3005                         cmn_err(CE_WARN, "kphysm_setup_func_register"
3006                             "(0x%p, 0x%p) duplicate registration from 0x%p",
3007                             (void *)vec, arg, (void *)caller());
3008 #endif /* DEBUG */
3009                         rw_exit(&mem_callback_rwlock);
3010                         return (EEXIST);
3011                 }
3012         }
3013         if (found != 0) {
3014                 i = found - 1;
3015         } else {
3016                 ASSERT(nmemcallbacks < NMEMCALLBACKS);
3017                 if (nmemcallbacks == NMEMCALLBACKS) {
3018                         rw_exit(&mem_callback_rwlock);
3019                         return (ENOMEM);
3020                 }
3021                 i = nmemcallbacks++;
3022         }
3023         mem_callbacks[i].vec = vec;
3024         mem_callbacks[i].arg = arg;
3025         rw_exit(&mem_callback_rwlock);
3026         return (0);
3027 }
3028
3029 void
3030 kphysm_setup_func_unregister(kphysm_setup_vector_t *vec, void *arg)
3031 {
3032         uint_t i;
3033
3034         rw_enter(&mem_callback_rwlock, RW_WRITER);
3035         for (i = 0; i < nmemcallbacks; i++) {
3036                 if (mem_callbacks[i].vec == vec &&
3037                     mem_callbacks[i].arg == arg) {
3038                         mem_callbacks[i].vec = NULL;
3039                         mem_callbacks[i].arg = NULL;
3040                         if (i == (nmemcallbacks - 1))
3041                                 nmemcallbacks--;
3042                         break;
3043                 }
3044         }
3045         rw_exit(&mem_callback_rwlock);
3046 }
3047
3048 static void
3049 kphysm_setup_post_add(pgcnt_t delta_pages)
3050 {
3051         uint_t i;
3052
3053         rw_enter(&mem_callback_rwlock, RW_READER);
3054         for (i = 0; i < nmemcallbacks; i++) {
3055                 if (mem_callbacks[i].vec != NULL) {
3056                         (*mem_callbacks[i].vec->post_add)
3057                             (mem_callbacks[i].arg, delta_pages);
3058                 }
3059         }
3060         rw_exit(&mem_callback_rwlock);
3061 }
3062
3063 /*
3064  * Note the locking between pre_del and post_del: The reader lock is held
3065  * between the two calls to stop the set of functions from changing.
3066  */
3067
3068 static int
3069 kphysm_setup_pre_del(pgcnt_t delta_pages)
3070 {
3071         uint_t i;
3072         int ret;
3073         int aret;
3074
3075         ret = 0;
3076         rw_enter(&mem_callback_rwlock, RW_READER);
3077         for (i = 0; i < nmemcallbacks; i++) {
3078                 if (mem_callbacks[i].vec != NULL) {
3079                         aret = (*mem_callbacks[i].vec->pre_del)
3080                             (mem_callbacks[i].arg, delta_pages);
3081                         ret |= aret;
3082                 }
3083         }
3084
3085         return (ret);
3086 }
3087
3088 static void
3089 kphysm_setup_post_del(pgcnt_t delta_pages, int cancelled)
3090 {
3091         uint_t i;
3092
3093         for (i = 0; i < nmemcallbacks; i++) {
3094                 if (mem_callbacks[i].vec != NULL) {
3095                         (*mem_callbacks[i].vec->post_del)
3096                             (mem_callbacks[i].arg, delta_pages, cancelled);
3097                 }
3098         }
3099         rw_exit(&mem_callback_rwlock);
3100 }
3101
3102 static int
3103 kphysm_split_memseg(
3104         pfn_t base,
3105         pgcnt_t npgs)
3106 {
3107         struct memseg *seg;
3108         struct memseg **segpp;
3109         pgcnt_t size_low, size_high;
3110         struct memseg *seg_low, *seg_mid, *seg_high;
3111
3112         /*
3113          * Lock the memsegs list against other updates now
3114          */
3115         memsegs_lock(1);
3116
3117         /*
3118          * Find boot time memseg that wholly covers this area.
3119          */
3120
3121         /* First find the memseg with page 'base' in it. */
3122         for (segpp = &memsegs; (seg = *segpp) != NULL;
3123             segpp = &((*segpp)->next)) {
3124                 if (base >= seg->pages_base && base < seg->pages_end)
3125                         break;
3126         }
3127         if (seg == NULL) {
3128                 memsegs_unlock(1);
3129                 return (0);
3130         }
3131         if (memseg_includes_meta(seg)) {
3132                 memsegs_unlock(1);
3133                 return (0);
3134         }
3135         if ((base + npgs) > seg->pages_end) {
3136                 memsegs_unlock(1);
3137                 return (0);
3138         }
3139
3140         /*
3141          * Work out the size of the two segments that will
3142          * surround the new segment, one for low address
3143          * and one for high.
3144          */
3145         ASSERT(base >= seg->pages_base);
3146         size_low = base - seg->pages_base;
3147         ASSERT(seg->pages_end >= (base + npgs));
3148         size_high = seg->pages_end - (base + npgs);
3149
3150         /*
3151          * Sanity check.
3152          */
3153         if ((size_low + size_high) == 0) {
3154                 memsegs_unlock(1);
3155                 return (0);
3156         }
3157
3158         /*
3159          * Allocate the new structures. The old memseg will not be freed
3160          * as there may be a reference to it.
3161          */
3162         seg_low = NULL;
3163         seg_high = NULL;
3164
3165         if (size_low != 0)
3166                 seg_low = memseg_alloc();
3167
3168         seg_mid = memseg_alloc();
3169
3170         if (size_high != 0)
3171                 seg_high = memseg_alloc();
3172
3173         /*
3174          * All allocation done now.
3175          */
3176         if (size_low != 0) {
3177                 seg_low->pages = seg->pages;
3178                 seg_low->epages = seg_low->pages + size_low;
3179                 seg_low->pages_base = seg->pages_base;
3180                 seg_low->pages_end = seg_low->pages_base + size_low;
3181                 seg_low->next = seg_mid;
3182                 seg_low->msegflags = seg->msegflags;
3183         }
3184         if (size_high != 0) {
3185                 seg_high->pages = seg->epages - size_high;
3186                 seg_high->epages = seg_high->pages + size_high;
3187                 seg_high->pages_base = seg->pages_end - size_high;
3188                 seg_high->pages_end = seg_high->pages_base + size_high;
3189                 seg_high->next = seg->next;
3190                 seg_high->msegflags = seg->msegflags;
3191         }
3192
3193         seg_mid->pages = seg->pages + size_low;
3194         seg_mid->pages_base = seg->pages_base + size_low;
3195         seg_mid->epages = seg->epages - size_high;
3196         seg_mid->pages_end = seg->pages_end - size_high;
3197         seg_mid->next = (seg_high != NULL) ? seg_high : seg->next;
3198         seg_mid->msegflags = seg->msegflags;
3199
3200         /*
3201          * Update hat_kpm specific info of all involved memsegs and
3202          * allow hat_kpm specific global chain updates.
3203          */
3204         hat_kpm_split_mseg_update(seg, segpp, seg_low, seg_mid, seg_high);
3205
3206         /*
3207          * At this point we have two equivalent memseg sub-chains,
3208          * seg and seg_low/seg_mid/seg_high, which both chain on to
3209          * the same place in the global chain. By re-writing the pointer
3210          * in the previous element we switch atomically from using the old
3211          * (seg) to the new.
3212          */
3213         *segpp = (seg_low != NULL) ? seg_low : seg_mid;
3214
3215         membar_enter();
3216
3217         build_pfn_hash();
3218         memsegs_unlock(1);
3219
3220         /*
3221          * We leave the old segment, 'seg', intact as there may be
3222          * references to it. Also, as the value of total_pages has not
3223          * changed and the memsegs list is effectively the same when
3224          * accessed via the old or the new pointer, we do not have to
3225          * cause pageout_scanner() to re-evaluate its hand pointers.
3226          *
3227          * We currently do not re-use or reclaim the page_t memory.
3228          * If we do, then this may have to change.
3229          */
3230
3231         mutex_enter(&memseg_lists_lock);
3232         seg->lnext = memseg_edit_junk;
3233         memseg_edit_junk = seg;
3234         mutex_exit(&memseg_lists_lock);
3235
3236         return (1);
3237 }
3238
3239 /*
3240  * The sfmmu hat layer (e.g.) accesses some parts of the memseg
3241  * structure using physical addresses. Therefore a kmem_cache is
3242  * used with KMC_NOHASH to avoid page crossings within a memseg
3243  * structure. KMC_NOHASH requires that no external (outside of
3244  * slab) information is allowed. This, in turn, implies that the
3245  * cache's slabsize must be exactly a single page, since per-slab
3246  * information (e.g. the freelist for the slab) is kept at the
3247  * end of the slab, where it is easy to locate. Should be changed
3248  * when a more obvious kmem_cache interface/flag will become
3249  * available.
3250  */
3251 void
3252 mem_config_init()
3253 {
3254         memseg_cache = kmem_cache_create("memseg_cache", sizeof (struct memseg),
3255             0, NULL, NULL, NULL, NULL, static_arena, KMC_NOHASH);
3256 }
3257
3258 struct memseg *
3259 memseg_alloc()
3260 {
3261         struct memseg *seg;
3262
3263         seg = kmem_cache_alloc(memseg_cache, KM_SLEEP);
3264         bzero(seg, sizeof (struct memseg));
3265
3266         return (seg);
3267 }
3268
3269 /*
3270  * Return whether the page_t memory for this memseg
3271  * is included in the memseg itself.
3272  */
3273 static int
3274 memseg_includes_meta(struct memseg *seg)
3275 {
3276         return (seg->msegflags & MEMSEG_META_INCL);
3277 }
3278
3279 pfn_t
3280 memseg_get_start(struct memseg *seg)
3281 {
3282         pfn_t           pt_start;
3283
3284         if (memseg_includes_meta(seg)) {
3285                 pt_start = hat_getpfnum(kas.a_hat, (caddr_t)seg->pages);
3286
3287                 /* Meta data is required to be at the beginning */
3288                 ASSERT(pt_start < seg->pages_base);
3289         } else
3290                 pt_start = seg->pages_base;
3291
3292         return (pt_start);
3293 }
3294
3295 /*
3296  * Invalidate memseg pointers in cpu private vm data caches.
3297  */
3298 static void
3299 memseg_cpu_vm_flush()
3300 {
3301         cpu_t *cp;
3302         vm_cpu_data_t *vc;
3303
3304         mutex_enter(&cpu_lock);
3305         pause_cpus(NULL, NULL);
3306
3307         cp = cpu_list;
3308         do {
3309                 vc = cp->cpu_vm_data;
3310                 vc->vc_pnum_memseg = NULL;
3311                 vc->vc_pnext_memseg = NULL;
3312
3313         } while ((cp = cp->cpu_next) != cpu_list);
3314
3315         start_cpus();
3316         mutex_exit(&cpu_lock);
3317 }