usr/src/uts/common/os/mem_config.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  23  * Use is subject to license terms.
  24  * Copyright 2017 Joyent, Inc.
  25  */
  26
  27 #include <sys/types.h>
  28 #include <sys/cmn_err.h>
  29 #include <sys/vmem.h>
  30 #include <sys/kmem.h>
  31 #include <sys/systm.h>
  32 #include <sys/machsystm.h>      /* for page_freelist_coalesce() */
  33 #include <sys/errno.h>
  34 #include <sys/memnode.h>
  35 #include <sys/memlist.h>
  36 #include <sys/memlist_impl.h>
  37 #include <sys/tuneable.h>
  38 #include <sys/proc.h>
  39 #include <sys/disp.h>
  40 #include <sys/debug.h>
  41 #include <sys/vm.h>
  42 #include <sys/callb.h>
  43 #include <sys/memlist_plat.h>   /* for installed_top_size() */
  44 #include <sys/condvar_impl.h>   /* for CV_HAS_WAITERS() */
  45 #include <sys/dumphdr.h>        /* for dump_resize() */
  46 #include <sys/atomic.h>         /* for use in stats collection */
  47 #include <sys/rwlock.h>
  48 #include <sys/cpuvar.h>
  49 #include <vm/seg_kmem.h>
  50 #include <vm/seg_kpm.h>
  51 #include <vm/page.h>
  52 #include <vm/vm_dep.h>
  53 #define SUNDDI_IMPL             /* so sunddi.h will not redefine splx() et al */
  54 #include <sys/sunddi.h>
  55 #include <sys/mem_config.h>
  56 #include <sys/mem_cage.h>
  57 #include <sys/lgrp.h>
  58 #include <sys/ddi.h>
  59 #include <sys/modctl.h>
  60
  61 extern struct memlist *phys_avail;
  62
  63 extern uint_t page_ctrs_adjust(int);
  64 void page_ctrs_cleanup(void);
  65 static void kphysm_setup_post_add(pgcnt_t);
  66 static int kphysm_setup_pre_del(pgcnt_t);
  67 static void kphysm_setup_post_del(pgcnt_t, int);
  68
  69 static int kphysm_split_memseg(pfn_t base, pgcnt_t npgs);
  70
  71 static int delspan_reserve(pfn_t, pgcnt_t);
  72 static void delspan_unreserve(pfn_t, pgcnt_t);
  73
     /* Held by memseg_reuse() while it walks and edits memseg_va_avail. */
  74 kmutex_t memseg_lists_lock;
     /* Memsegs, linked through lnext, that memseg_reuse() may hand out again. */
  75 struct memseg *memseg_va_avail;
  76 struct memseg *memseg_alloc(void);
  77 static struct memseg *memseg_delete_junk;
  78 static struct memseg *memseg_edit_junk;
  79 void memseg_remap_init(void);
  80 static void memseg_remap_to_dummy(struct memseg *);
  81 static void kphysm_addmem_error_undospan(pfn_t, pgcnt_t);
  82 static struct memseg *memseg_reuse(pgcnt_t);
  83
  84 static struct kmem_cache *memseg_cache;
  85
  86 /*
  87  * Interfaces to manage externally allocated
  88  * page_t memory (metadata) for a memseg.
      *
      * The symbols are declared weak.  NOTE(review): presumably only
      * platforms that support external metadata define them; confirm
      * against the platform code.  In kphysm_add_memory_dynamic() they are
      * only called when meta_alloc_enable is set or after
      * memseg_alloc_meta() has succeeded.
  89  */
  90 #pragma weak    memseg_alloc_meta
  91 #pragma weak    memseg_free_meta
  92 #pragma weak    memseg_get_metapfn
  93 #pragma weak    memseg_remap_meta
  94
  95 extern int ppvm_enable;
  96 extern page_t *ppvm_base;
  97 extern int memseg_alloc_meta(pfn_t, pgcnt_t, void **, pgcnt_t *);
  98 extern void memseg_free_meta(void *, pgcnt_t);
  99 extern pfn_t memseg_get_metapfn(void *, pgcnt_t);
 100 extern void memseg_remap_meta(struct memseg *);
 101 static int memseg_is_dynamic(struct memseg *);
 102 static int memseg_includes_meta(struct memseg *);
 103 pfn_t memseg_get_start(struct memseg *);
 104 static void memseg_cpu_vm_flush(void);
 105
     /*
      * When non-zero, kphysm_add_memory_dynamic() first asks
      * memseg_alloc_meta() for page_t memory and only falls back to carving
      * the page_ts out of the incoming span if that fails.
      */
 106 int meta_alloc_enable;
 107
 108 #ifdef  DEBUG
 109 static int memseg_debug;
 110 #define MEMSEG_DEBUG(args...) if (memseg_debug) printf(args)
 111 #else
 112 #define MEMSEG_DEBUG(...)
 113 #endif
 114
 115 /*
 116  * Add a chunk of memory to the system.
 117  * base: starting PAGESIZE page of new memory.
 118  * npgs: length in PAGESIZE pages.
 119  *
 120  * Adding mem this way doesn't increase the size of the hash tables;
 121  * growing them would be too hard.  This should be OK, but adding memory
 122  * dynamically most likely means more hash misses, since the tables will
 123  * be smaller than they otherwise would be.
      *
      * Returns:
      *   KPHYSM_OK          the memory was added.
      *   KPHYSM_ESPAN       delspan_reserve() refused the span, or
      *                      memlist_add_span() reported MEML_SPANOP_ESPAN
      *                      for phys_install.
      *   KPHYSM_ERESOURCE   adding to phys_install failed for another reason,
      *                      the span is too small to hold its own metadata,
      *                      no kernel VA could be allocated for the page_ts,
      *                      or PAGE_CTRS_ADJUST() failed.
      *   KPHYSM_ENOTVIABLE  kpm is enabled and the span violates the kpm
      *                      page size/alignment constraints.
      *   KPHYSM_EFAULT      the freshly mapped page_t array could not be
      *                      accessed.
      *
      * On every failure return, the delete-list reservation and (once it
      * has been added) the phys_install entry for the span are undone.
 124  */
 125 int
 126 kphysm_add_memory_dynamic(pfn_t base, pgcnt_t npgs)
 127 {
 128         page_t *pp;
 129         page_t          *opp, *oepp, *segpp;
 130         struct memseg   *seg;
 131         uint64_t        avmem;
 132         pfn_t           pfn;
 133         pfn_t           pt_base = base;
 134         pgcnt_t         tpgs = npgs;
 135         pgcnt_t         metapgs = 0;
 136         int             exhausted;
 137         pfn_t           pnum;
 138         int             mnode;
 139         caddr_t         vaddr;
 140         int             reuse;
 141         int             mlret;
 142         int             rv;
 143         int             flags;
 144         int             meta_alloc = 0;
 145         void            *mapva;
 146         void            *metabase = (void *)base;
 147         pgcnt_t         nkpmpgs = 0;
 148         offset_t        kpm_pages_off = 0;
 149
 150         cmn_err(CE_CONT,
 151             "?kphysm_add_memory_dynamic: adding %ldK at 0x%" PRIx64 "\n",
 152             npgs << (PAGESHIFT - 10), (uint64_t)base << PAGESHIFT);
 153
 154         /*
 155          * Add this span in the delete list to prevent interactions.
 156          */
 157         if (!delspan_reserve(base, npgs)) {
 158                 return (KPHYSM_ESPAN);
 159         }
 160         /*
 161          * Check to see if any of the memory span has been added
 162          * by trying an add to the installed memory list. This
 163          * forms the interlocking process for add.
 164          */
 165
 166         memlist_write_lock();
 167
 168         mlret = memlist_add_span((uint64_t)(pt_base) << PAGESHIFT,
 169             (uint64_t)(tpgs) << PAGESHIFT, &phys_install);
 170
 171         if (mlret == MEML_SPANOP_OK)
 172                 installed_top_size(phys_install, &physmax, &physinstalled);
 173
 174         memlist_write_unlock();
 175
             /*
              * Every failure below releases the delete-list reservation;
              * the branches differ only in the error code returned.
              */
 176         if (mlret != MEML_SPANOP_OK) {
 177                 if (mlret == MEML_SPANOP_EALLOC) {
 178                         delspan_unreserve(pt_base, tpgs);
 179                         return (KPHYSM_ERESOURCE);
 180                 } else if (mlret == MEML_SPANOP_ESPAN) {
 181                         delspan_unreserve(pt_base, tpgs);
 182                         return (KPHYSM_ESPAN);
 183                 } else {
 184                         delspan_unreserve(pt_base, tpgs);
 185                         return (KPHYSM_ERESOURCE);
 186                 }
 187         }
 188
 189         if (meta_alloc_enable) {
 190                 /*
 191                  * Allocate the page_t's from existing memory;
 192                  * if that fails, allocate from the incoming memory.
 193                  */
 194                 rv = memseg_alloc_meta(base, npgs, &metabase, &metapgs);
 195                 if (rv == KPHYSM_OK) {
 196                         ASSERT(metapgs);
 197                         ASSERT(btopr(npgs * sizeof (page_t)) <= metapgs);
 198                         meta_alloc = 1;
                             /*
                              * Skip the in-span metadata sizing below; base
                              * and npgs keep covering the whole span.
                              */
 199                         goto mapalloc;
 200                 }
 201         }
 202
 203         /*
 204          * We store the page_t's for this new memory in the first
 205          * few pages of the chunk. Here, we go and get'em ...
 206          */
 207
 208         /*
 209          * The expression after the '-' gives the number of pages
 210          * that will fit in the new memory based on a requirement
 211          * of (PAGESIZE + sizeof (page_t)) bytes per page.
 212          */
 213         metapgs = npgs - (((uint64_t)(npgs) << PAGESHIFT) /
 214             (PAGESIZE + sizeof (page_t)));
 215
 216         npgs -= metapgs;
 217         base += metapgs;
 218
 219         ASSERT(btopr(npgs * sizeof (page_t)) <= metapgs);
 220
 221         exhausted = (metapgs == 0 || npgs == 0);
 222
 223         if (kpm_enable && !exhausted) {
 224                 pgcnt_t start, end, nkpmpgs_prelim;
 225                 size_t  ptsz;
 226
 227                 /*
 228                  * A viable kpm large page mapping must not overlap two
 229                  * dynamic memsegs. Therefore the total size is checked
 230                  * to be at least kpm_pgsz and also whether start and end
 231                  * points are at least kpm_pgsz aligned.
 232                  */
 233                 if (ptokpmp(tpgs) < 1 || pmodkpmp(pt_base) ||
 234                     pmodkpmp(base + npgs)) {
 235
 236                         kphysm_addmem_error_undospan(pt_base, tpgs);
 237
 238                         /*
 239                          * There is no specific error code for violating
 240                          * kpm granularity constraints.
 241                          */
 242                         return (KPHYSM_ENOTVIABLE);
 243                 }
 244
 245                 start = kpmptop(ptokpmp(base));
 246                 end = kpmptop(ptokpmp(base + npgs));
 247                 nkpmpgs_prelim = ptokpmp(end - start);
 248                 ptsz = npgs * sizeof (page_t);
 249                 metapgs = btopr(ptsz + nkpmpgs_prelim * KPMPAGE_T_SZ);
 250                 exhausted = (tpgs <= metapgs);
 251                 if (!exhausted) {
 252                         npgs = tpgs - metapgs;
 253                         base = pt_base + metapgs;
 254
 255                         /* final nkpmpgs */
 256                         start = kpmptop(ptokpmp(base));
 257                         nkpmpgs = ptokpmp(end - start);
 258                         kpm_pages_off = ptsz +
 259                             (nkpmpgs_prelim - nkpmpgs) * KPMPAGE_T_SZ;
 260                 }
 261         }
 262
 263         /*
 264          * Is memory area supplied too small?
 265          */
 266         if (exhausted) {
 267                 kphysm_addmem_error_undospan(pt_base, tpgs);
 268                 /*
 269                  * There is no specific error code for 'too small'.
 270                  */
 271                 return (KPHYSM_ERESOURCE);
 272         }
 273
 274 mapalloc:
 275         /*
 276          * We may re-use a previously allocated VA space for the page_ts
 277          * eventually, but we need to initialize and lock the pages first.
 278          */
 279
 280         /*
 281          * Get an address in the kernel address map, map
 282          * the page_t pages and see if we can touch them.
 283          */
 284
 285         mapva = vmem_alloc(heap_arena, ptob(metapgs), VM_NOSLEEP);
 286         if (mapva == NULL) {
 287                 cmn_err(CE_WARN, "kphysm_add_memory_dynamic:"
 288                     " Can't allocate VA for page_ts");
 289
 290                 if (meta_alloc)
 291                         memseg_free_meta(metabase, metapgs);
 292                 kphysm_addmem_error_undospan(pt_base, tpgs);
 293
 294                 return (KPHYSM_ERESOURCE);
 295         }
 296         pp = mapva;
 297
 298         if (physmax < (pt_base + tpgs))
 299                 physmax = (pt_base + tpgs);
 300
 301         /*
 302          * In the remapping code we map one page at a time so we must do
 303          * the same here to match mapping sizes.
 304          */
 305         pfn = pt_base;
 306         vaddr = (caddr_t)pp;
 307         for (pnum = 0; pnum < metapgs; pnum++) {
 308                 if (meta_alloc)
 309                         pfn = memseg_get_metapfn(metabase, (pgcnt_t)pnum);
 310                 hat_devload(kas.a_hat, vaddr, ptob(1), pfn,
 311                     PROT_READ | PROT_WRITE,
 312                     HAT_LOAD | HAT_LOAD_LOCK | HAT_LOAD_NOCONSIST);
 313                 pfn++;
 314                 vaddr += ptob(1);
 315         }
 316
 317         if (ddi_peek32((dev_info_t *)NULL,
 318             (int32_t *)pp, (int32_t *)0) == DDI_FAILURE) {
 319
 320                 cmn_err(CE_WARN, "kphysm_add_memory_dynamic:"
 321                     " Can't access pp array at 0x%p [phys 0x%lx]",
 322                     (void *)pp, pt_base);
 323
 324                 hat_unload(kas.a_hat, (caddr_t)pp, ptob(metapgs),
 325                     HAT_UNLOAD_UNMAP|HAT_UNLOAD_UNLOCK);
 326
 327                 vmem_free(heap_arena, mapva, ptob(metapgs));
 328                 if (meta_alloc)
 329                         memseg_free_meta(metabase, metapgs);
 330                 kphysm_addmem_error_undospan(pt_base, tpgs);
 331
 332                 return (KPHYSM_EFAULT);
 333         }
 334
 335         /*
 336          * Add this memory slice to its memory node translation.
 337          *
 338          * Note that right now, each node may have only one slice;
 339          * this may change with COD or in larger SSM systems with
 340          * nested latency groups, so we must not assume that the
 341          * node does not yet exist.
 342          *
 343          * Note that there may be multiple memory nodes associated with
 344          * a single lgrp node on x86 systems.
 345          */
 346         pnum = pt_base + tpgs - 1;
 347         mem_node_add_range(pt_base, pnum);
 348
 349         /*
 350          * Allocate or resize page counters as necessary to accommodate
 351          * the increase in memory pages.
 352          */
 353         mnode = PFN_2_MEM_NODE(pnum);
 354         PAGE_CTRS_ADJUST(base, npgs, rv);
 355         if (rv) {
 356
 357                 mem_node_del_range(pt_base, pnum);
 358
 359                 /* cleanup the  page counters */
 360                 page_ctrs_cleanup();
 361
 362                 hat_unload(kas.a_hat, (caddr_t)pp, ptob(metapgs),
 363                     HAT_UNLOAD_UNMAP|HAT_UNLOAD_UNLOCK);
 364
 365                 vmem_free(heap_arena, mapva, ptob(metapgs));
 366                 if (meta_alloc)
 367                         memseg_free_meta(metabase, metapgs);
 368                 kphysm_addmem_error_undospan(pt_base, tpgs);
 369
 370                 return (KPHYSM_ERESOURCE);
 371         }
 372
 373         /*
 374          * Update the phys_avail memory list.
 375          * The phys_install list was done at the start.
 376          */
 377
 378         memlist_write_lock();
 379
 380         mlret = memlist_add_span((uint64_t)(base) << PAGESHIFT,
 381             (uint64_t)(npgs) << PAGESHIFT, &phys_avail);
 382         ASSERT(mlret == MEML_SPANOP_OK);
 383
 384         memlist_write_unlock();
 385
 386         /* See if we can find a memseg to re-use. */
 387         if (meta_alloc) {
 388                 seg = memseg_reuse(0);
 389                 reuse = 1;      /* force unmapping of temp mapva */
 390                 flags = MEMSEG_DYNAMIC | MEMSEG_META_ALLOC;
 391                 /*
 392                  * There is a 1:1 fixed relationship between a pfn
 393                  * and a page_t VA.  The pfn is used as an index into
 394                  * the ppvm_base page_t table in order to calculate
 395                  * the page_t base address for a given pfn range.
 396                  */
 397                 segpp = ppvm_base + base;
 398         } else {
 399                 seg = memseg_reuse(metapgs);
 400                 reuse = (seg != NULL);
 401                 flags = MEMSEG_DYNAMIC | MEMSEG_META_INCL;
 402                 segpp = pp;
 403         }
 404
 405         /*
 406          * Initialize the memseg structure representing this memory
 407          * and add it to the existing list of memsegs. Do some basic
 408          * initialization and add the memory to the system.
 409          * In order to prevent lock deadlocks, the add_physmem()
 410          * code is repeated here, but split into several stages.
 411          *
 412          * If a memseg is reused, invalidate memseg pointers in
 413          * all cpu vm caches.  We need to do this since the check
 414          *      pp >= seg->pages && pp < seg->epages
 415          * used in various places is not atomic and so the first compare
 416          * can happen before reuse and the second compare after reuse.
 417          * The invalidation ensures that a memseg is not dereferenced while
 418          * its page/pfn pointers are changing.
 419          */
 420         if (seg == NULL) {
 421                 seg = memseg_alloc();
 422                 ASSERT(seg != NULL);
 423                 seg->msegflags = flags;
 424                 MEMSEG_DEBUG("memseg_get: alloc seg=0x%p, pages=0x%p",
 425                     (void *)seg, (void *)(seg->pages));
 426                 seg->pages = segpp;
 427         } else {
 428                 ASSERT(seg->msegflags == flags);
 429                 ASSERT(seg->pages_base == seg->pages_end);
 430                 MEMSEG_DEBUG("memseg_get: reuse seg=0x%p, pages=0x%p",
 431                     (void *)seg, (void *)(seg->pages));
 432                 if (meta_alloc) {
 433                         memseg_cpu_vm_flush();
 434                         seg->pages = segpp;
 435                 }
 436         }
 437
 438         seg->epages = seg->pages + npgs;
 439         seg->pages_base = base;
 440         seg->pages_end = base + npgs;
 441
 442         /*
 443          * Initialize metadata. The page_ts are set to locked state
 444          * ready to be freed.
 445          */
 446         bzero((caddr_t)pp, ptob(metapgs));
 447
 448         pfn = seg->pages_base;
 449         /* Save the original pp base in case we reuse a memseg. */
 450         opp = pp;
 451         oepp = opp + npgs;
 452         for (pp = opp; pp < oepp; pp++) {
 453                 pp->p_pagenum = pfn;
 454                 pfn++;
 455                 page_iolock_init(pp);
 456                 while (!page_lock(pp, SE_EXCL, (kmutex_t *)NULL, P_RECLAIM))
 457                         continue;
 458                 pp->p_offset = (u_offset_t)-1;
 459         }
 460
 461         if (reuse) {
 462                 /* Remap our page_ts to the re-used memseg VA space. */
 463                 pfn = pt_base;
 464                 vaddr = (caddr_t)seg->pages;
 465                 for (pnum = 0; pnum < metapgs; pnum++) {
 466                         if (meta_alloc)
 467                                 pfn = memseg_get_metapfn(metabase,
 468                                     (pgcnt_t)pnum);
 469                         hat_devload(kas.a_hat, vaddr, ptob(1), pfn,
 470                             PROT_READ | PROT_WRITE,
 471                             HAT_LOAD_REMAP | HAT_LOAD | HAT_LOAD_NOCONSIST);
 472                         pfn++;
 473                         vaddr += ptob(1);
 474                 }
 475
 476                 hat_unload(kas.a_hat, (caddr_t)opp, ptob(metapgs),
 477                     HAT_UNLOAD_UNMAP|HAT_UNLOAD_UNLOCK);
 478
 479                 vmem_free(heap_arena, mapva, ptob(metapgs));
 480         }
 481
 482         hat_kpm_addmem_mseg_update(seg, nkpmpgs, kpm_pages_off);
 483
 484         memsegs_lock(1);
 485
 486         /*
 487          * The new memseg is inserted at the beginning of the list.
 488          * Not only does this save searching for the tail, but in the
 489          * case of a re-used memseg, it solves the problem of what
 490          * happens if some process has still got a pointer to the
 491          * memseg and follows the next pointer to continue traversing
 492          * the memsegs list.
 493          */
 494
 495         hat_kpm_addmem_mseg_insert(seg);
 496
 497         seg->next = memsegs;
 498         membar_producer();
 499
 500         hat_kpm_addmem_memsegs_update(seg);
 501
 502         memsegs = seg;
 503
 504         build_pfn_hash();
 505
 506         total_pages += npgs;
 507
 508         /*
 509          * Recalculate the paging parameters now total_pages has changed.
 510          * This will also cause the clock hands to be reset before next use.
 511          */
 512         setupclock();
 513
 514         memsegs_unlock(1);
 515
 516         PLCNT_MODIFY_MAX(seg->pages_base, (long)npgs);
 517
 518         /*
 519          * Free the pages outside the lock to avoid locking loops.
 520          */
 521         for (pp = seg->pages; pp < seg->epages; pp++) {
 522                 page_free(pp, 1);
 523         }
 524
 525         /*
 526          * Now that we've updated the appropriate memory lists we
 527          * need to reset a number of globals, since we've increased memory.
 528          * Several have already been updated for us as noted above. The
 529          * globals we're interested in at this point are:
 530          *   physmax - highest page frame number.
 531          *   physinstalled - number of pages currently installed (done earlier)
 532          *   maxmem - max free pages in the system
 533          *   physmem - physical memory pages available
 534          *   availrmem - real memory available
 535          */
 536
 537         mutex_enter(&freemem_lock);
 538         maxmem += npgs;
 539         physmem += npgs;
 540         availrmem += npgs;
 541         availrmem_initial += npgs;
 542
 543         mutex_exit(&freemem_lock);
 544
 545         dump_resize();
 546
 547         page_freelist_coalesce_all(mnode);
 548
 549         kphysm_setup_post_add(npgs);
 550
 551         cmn_err(CE_CONT, "?kphysm_add_memory_dynamic: mem = %ldK "
 552             "(0x%" PRIx64 ")\n",
 553             physinstalled << (PAGESHIFT - 10),
 554             (uint64_t)physinstalled << PAGESHIFT);
 555
 556         avmem = (uint64_t)freemem << PAGESHIFT;
 557         cmn_err(CE_CONT, "?kphysm_add_memory_dynamic: "
 558             "avail mem = %" PRId64 "\n", avmem);
 559
 560         /*
 561          * Update lgroup generation number on single lgroup systems
 562          */
 563         if (nlgrps == 1)
 564                 lgrp_config(LGRP_CONFIG_GEN_UPDATE, 0, 0);
 565
 566         /*
 567          * Inform DDI of update
 568          */
 569         ddi_mem_update((uint64_t)(pt_base) << PAGESHIFT,
 570             (uint64_t)(tpgs) << PAGESHIFT);
 571
 572         delspan_unreserve(pt_base, tpgs);
 573
 574         return (KPHYSM_OK);             /* Successfully added system memory */
 575 }
 576
 577 /*
 578  * Roll back the global state changed by a failed
 579  * kphysm_add_memory_dynamic(): take the span out of phys_install again,
 580  * refresh physmax/physinstalled and drop the delete-list reservation.
 581  */
 582 static void
 583 kphysm_addmem_error_undospan(pfn_t pt_base, pgcnt_t tpgs)
 584 {
 585         uint64_t span_addr = (uint64_t)(pt_base) << PAGESHIFT;
 586         uint64_t span_len = (uint64_t)(tpgs) << PAGESHIFT;
 587         int status;
 588
 589         /* Remove the span from the installed memory list. */
 590         memlist_write_lock();
 591         status = memlist_delete_span(span_addr, span_len, &phys_install);
 592         ASSERT(status == MEML_SPANOP_OK);
 593
 594         phys_install_has_changed();
 595         installed_top_size(phys_install, &physmax, &physinstalled);
 596         memlist_write_unlock();
 597
 598         delspan_unreserve(pt_base, tpgs);
 599 }
 600
 601 /*
 602  * Unlink and return a memseg from the memseg_va_avail list for re-use,
 603  * or return NULL if no suitable memseg is on the list.
      *
      * metapgs: when non-zero, only a MEMSEG_META_INCL memseg whose page_t
      *      VA range rounds to exactly metapgs pages is accepted; when zero,
      *      the first MEMSEG_META_ALLOC memseg is taken regardless of size.
      *
 604  * When the meta data area has its own virtual address space
 605  * we will need to manage this more carefully and do best fit
 606  * allocations, possibly splitting an available area.
 607  */
 608 static struct memseg *
 609 memseg_reuse(pgcnt_t metapgs)
 610 {
 611         int type = metapgs ? MEMSEG_META_INCL : MEMSEG_META_ALLOC;
 612         struct memseg **segpp, *seg;
 613
 614         mutex_enter(&memseg_lists_lock);
 615
 616         segpp = &memseg_va_avail;
 617         for (; (seg = *segpp) != NULL; segpp = &seg->lnext) {
 618                 caddr_t end;
 619
 620                 /*
 621                  * Make sure we are reusing the right segment type.
 622                  */
 625                 if ((seg->msegflags & (MEMSEG_META_INCL | MEMSEG_META_ALLOC))
 626                     != type)
 627                         continue;
 628
 629                 if (kpm_enable)
 630                         end = hat_kpm_mseg_reuse(seg);
 631                 else
 632                         end = (caddr_t)seg->epages;
 633
 634                 /*
 635                  * Check for the right size if it is provided.
 636                  */
 637                 if (!metapgs || btopr(end - (caddr_t)seg->pages) == metapgs) {
                             /* Take the memseg off the available list. */
 638                         *segpp = seg->lnext;
 639                         seg->lnext = NULL;
 640                         break;
 641                 }
 642         }
 643         mutex_exit(&memseg_lists_lock);
 644
             /* seg is NULL here if the loop ran off the end of the list. */
 645         return (seg);
 646 }
 647
     /*
      * Counter from which external handle values are generated; it is
      * pre-incremented in kphysm_allocate_mem_handle() under
      * mem_handle_list_mutex.
      */
 648 static uint_t handle_gen;
 649
     /*
      * One span of page frames, linked through mds_next.
      * NOTE(review): the code using these fields is outside this chunk;
      * mds_base/mds_npgs look like the first pfn and page count of the span,
      * and the two bitmaps appear to be sized by MDS_BITMAPBYTES() - confirm.
      */
 650 struct memdelspan {
 651         struct memdelspan *mds_next;
 652         pfn_t           mds_base;
 653         pgcnt_t         mds_npgs;
 654         uint_t          *mds_bitmap;
 655         uint_t          *mds_bitmap_retired;
 656 };
 657
     /*
      * NBPBMW is the number of bits in one bitmap word (a uint_t).
      * MDS_BITMAPBYTES() is the size in bytes of a bitmap holding
      * mds_npgs bits, rounded up to a whole number of bitmap words.
      */
 658 #define NBPBMW          (sizeof (uint_t) * NBBY)
 659 #define MDS_BITMAPBYTES(MDSP) \
 660         ((((MDSP)->mds_npgs + NBPBMW - 1) / NBPBMW) * sizeof (uint_t))
 661
     /*
      * An entry on the transit list.  Each struct mem_handle embeds one
      * (mh_transit).  NOTE(review): trl_spans presumably points at the
      * handle's memdelspan chain; the users are outside this chunk.
      */
 662 struct transit_list {
 663         struct transit_list     *trl_next;
 664         struct memdelspan       *trl_spans;
 665         int                     trl_collect;
 666 };
 667
     /* Head of the transit list together with the lock declared for it. */
 668 struct transit_list_head {
 669         kmutex_t                trh_lock;
 670         struct transit_list     *trh_head;
 671 };
 672
 673 static struct transit_list_head transit_list_head;
 674
     /* struct mem_handle is defined further down; only pointers needed here. */
 675 struct mem_handle;
 676 static void transit_list_collect(struct mem_handle *, int);
 677 static void transit_list_insert(struct transit_list *);
 678 static void transit_list_remove(struct transit_list *);
 679
     /*
      * Memory delete statistics.  MEM_DEL_STATS is defined here only for
      * DEBUG builds; when it is not defined the MDSTAT_* macros below expand
      * to nothing and struct mem_handle carries no mh_delstat member.
      */
 680 #ifdef DEBUG
 681 #define MEM_DEL_STATS
 682 #endif /* DEBUG */
 683
 684 #ifdef MEM_DEL_STATS
 685 static int mem_del_stat_print = 0;
     /* Counters kept per handle in mh_delstat. */
 686 struct mem_del_stat {
 687         uint_t  nloop;
 688         uint_t  need_free;
 689         uint_t  free_loop;
 690         uint_t  free_low;
 691         uint_t  free_failed;
 692         uint_t  ncheck;
 693         uint_t  nopaget;
 694         uint_t  lockfail;
 695         uint_t  nfree;
 696         uint_t  nreloc;
 697         uint_t  nrelocfail;
 698         uint_t  already_done;
 699         uint_t  first_notfree;
 700         uint_t  npplocked;
 701         uint_t  nlockreloc;
 702         uint_t  nnorepl;
 703         uint_t  nmodreloc;
 704         uint_t  ndestroy;
 705         uint_t  nputpage;
 706         uint_t  nnoreclaim;
 707         uint_t  ndelay;
 708         uint_t  demotefail;
 709         uint64_t nticks_total;
 710         uint64_t nticks_pgrp;
 711         uint_t  retired;
 712         uint_t  toxic;
 713         uint_t  failing;
 714         uint_t  modtoxic;
 715         uint_t  npplkdtoxic;
 716         uint_t  gptlmodfail;
 717         uint_t  gptllckfail;
 718 };
 719 /*
 720  * The stat values are only incremented in the delete thread
 721  * so no locking or atomic required.
      *
      * MDSTAT_INCR bumps the named counter by one; MDSTAT_TOTAL and
      * MDSTAT_PGRP add ntck to nticks_total and nticks_pgrp respectively.
 722  */
 723 #define MDSTAT_INCR(MHP, FLD)   (MHP)->mh_delstat.FLD++
 724 #define MDSTAT_TOTAL(MHP, ntck) ((MHP)->mh_delstat.nticks_total += (ntck))
 725 #define MDSTAT_PGRP(MHP, ntck)  ((MHP)->mh_delstat.nticks_pgrp += (ntck))
 726 static void mem_del_stat_print_func(struct mem_handle *);
 727 #define MDSTAT_PRINT(MHP)       mem_del_stat_print_func((MHP))
 728 #else /* MEM_DEL_STATS */
 729 #define MDSTAT_INCR(MHP, FLD)
 730 #define MDSTAT_TOTAL(MHP, ntck)
 731 #define MDSTAT_PGRP(MHP, ntck)
 732 #define MDSTAT_PRINT(MHP)
 733 #endif /* MEM_DEL_STATS */
 734
     /*
      * Handle states.  MHND_FREE is zero, so a handle fresh from
      * kmem_zalloc() starts out FREE; kphysm_del_gethandle() moves it to
      * MHND_INIT.  kphysm_lookup_mem_handle() never returns a FREE handle.
      * The transitions through the remaining states are made by code
      * outside this chunk.
      */
 735 typedef enum mhnd_state {MHND_FREE = 0, MHND_INIT, MHND_STARTING,
 736         MHND_RUNNING, MHND_DONE, MHND_RELEASE} mhnd_state_t;
 737
 738 /*
 739  * mh_mutex must be taken to examine or change mh_exthandle and mh_state.
 740  * The mutex may not be required for other fields, dependent on mh_state.
      *
      * mh_next links the handle on the mem_handle_head list and is changed
      * only with mem_handle_list_mutex held.  mh_exthandle is the value
      * handed to callers by kphysm_del_gethandle().
 741  */
 742 struct mem_handle {
 743         kmutex_t        mh_mutex;
 744         struct mem_handle *mh_next;
 745         memhandle_t     mh_exthandle;
 746         mhnd_state_t    mh_state;
 747         struct transit_list mh_transit;
 748         pgcnt_t         mh_phys_pages;
 749         pgcnt_t         mh_vm_pages;
 750         pgcnt_t         mh_hold_todo;
 751         void            (*mh_delete_complete)(void *, int error);
 752         void            *mh_delete_complete_arg;
 753         volatile uint_t mh_cancel;
 754         volatile uint_t mh_dr_aio_cleanup_cancel;
 755         volatile uint_t mh_aio_cleanup_done;
 756         kcondvar_t      mh_cv;
 757         kthread_id_t    mh_thread_id;
 758         page_t          *mh_deleted;    /* link through p_next */
 759 #ifdef MEM_DEL_STATS
 760         struct mem_del_stat mh_delstat;
 761 #endif /* MEM_DEL_STATS */
 762 };
 763
     /*
      * List of all handles.  mem_handle_list_mutex protects the list and
      * handle_gen; where both locks are held it is taken before mh_mutex.
      */
 764 static struct mem_handle *mem_handle_head;
 765 static kmutex_t mem_handle_list_mutex;
 766
     /*
      * Allocate a zeroed mem_handle (which leaves it in the MHND_FREE
      * state), assign it the next external handle value and insert it at
      * the head of the global handle list.
      *
      * Returns the new handle with its mh_mutex held; the caller must
      * release it.  The allocation uses KM_SLEEP, so NULL is never returned.
      */
 767 static struct mem_handle *
 768 kphysm_allocate_mem_handle(void)
 769 {
 770         struct mem_handle *mhp;
 771
 772         mhp = kmem_zalloc(sizeof (struct mem_handle), KM_SLEEP);
 773         mutex_init(&mhp->mh_mutex, NULL, MUTEX_DEFAULT, NULL);
             /* Lock order: list mutex first, then the handle's own mutex. */
 774         mutex_enter(&mem_handle_list_mutex);
 775         mutex_enter(&mhp->mh_mutex);
 776         /* handle_gen is protected by list mutex. */
 777         mhp->mh_exthandle = (memhandle_t)(uintptr_t)(++handle_gen);
 778         mhp->mh_next = mem_handle_head;
 779         mem_handle_head = mhp;
 780         mutex_exit(&mem_handle_list_mutex);
 781
             /* mh_mutex is deliberately still held on return. */
 782         return (mhp);
 783 }
 784
     /*
      * Remove a handle from the global handle list and free it.
      * Entered with mhp->mh_mutex held and the handle already in the
      * MHND_FREE state; the mutex is released and destroyed here.
      */
 785 static void
 786 kphysm_free_mem_handle(struct mem_handle *mhp)
 787 {
 788         struct mem_handle **linkp;
 789
 790         ASSERT(mutex_owned(&mhp->mh_mutex));
 791         ASSERT(mhp->mh_state == MHND_FREE);
 792         /*
 793          * Drop mh_mutex before taking the list mutex to preserve the
 794          * locking order.  This is safe because a lookup never returns
 795          * a handle that is in the FREE state.
 796          */
 797         mutex_exit(&mhp->mh_mutex);
 798         mutex_enter(&mem_handle_list_mutex);
 799
 800         /* Walk the link pointers until the one referring to mhp is found. */
 801         for (linkp = &mem_handle_head; *linkp != NULL;
 802             linkp = &(*linkp)->mh_next) {
 803                 if (*linkp == mhp)
 804                         break;
 805         }
 806         ASSERT(*linkp == mhp);
 807
 808         /* Only this thread can reference mhp, so mh_mutex is not needed. */
 809         *linkp = mhp->mh_next;
 810         mutex_exit(&mem_handle_list_mutex);
 811
 812         mutex_destroy(&mhp->mh_mutex);
 813         kmem_free(mhp, sizeof (struct mem_handle));
 814 }
 815
 816 /*
 817  * Find the internal mem_handle for an external handle.  The handle is
 818  * returned with its mh_mutex held; NULL means there is no live match.
 819  */
 820 static struct mem_handle *
 821 kphysm_lookup_mem_handle(memhandle_t handle)
 822 {
 823         struct mem_handle *cur;
 824
 825         mutex_enter(&mem_handle_list_mutex);
 826         cur = mem_handle_head;
 827         while (cur != NULL) {
 828                 if (cur->mh_exthandle == handle) {
 829                         mutex_enter(&cur->mh_mutex);
 830                         /*
 831                          * kphysm_del_release() may have changed the state
 832                          * while we waited for mh_mutex; skip a FREE handle.
 833                          */
 834                         if (cur->mh_state != MHND_FREE)
 835                                 break;
 836                         mutex_exit(&cur->mh_mutex);
 837                 }
 838                 cur = cur->mh_next;
 839         }
 840         mutex_exit(&mem_handle_list_mutex);
 841         return (cur);
 842 }
 843
 844 int
 845 kphysm_del_gethandle(memhandle_t *xmhp)
 846 {
 847         struct mem_handle *mhp;
 848
 849         mhp = kphysm_allocate_mem_handle();
 850         /*
 851          * The handle is allocated using KM_SLEEP, so cannot fail.
 852          * If the implementation is changed, the correct error to return
 853          * here would be KPHYSM_ENOHANDLES.
 854          */
 855         ASSERT(mhp->mh_state == MHND_FREE);
 856         mhp->mh_state = MHND_INIT;
 857         *xmhp = mhp->mh_exthandle;
 858         mutex_exit(&mhp->mh_mutex);
 859         return (KPHYSM_OK);
 860 }
 861
 862 static int
 863 overlapping(pfn_t b1, pgcnt_t l1, pfn_t b2, pgcnt_t l2)
 864 {
 865         pfn_t e1, e2;
 866
 867         e1 = b1 + l1;
 868         e2 = b2 + l2;
 869
 870         return (!(b2 >= e1 || b1 >= e2));
 871 }
 872
/* Forward declaration; can_remove_pgs() is defined later in this file. */
static int can_remove_pgs(pgcnt_t);
 874
 875 static struct memdelspan *
 876 span_to_install(pfn_t base, pgcnt_t npgs)
 877 {
 878         struct memdelspan *mdsp;
 879         struct memdelspan *mdsp_new;
 880         uint64_t address, size, thislen;
 881         struct memlist *mlp;
 882
 883         mdsp_new = NULL;
 884
 885         address = (uint64_t)base << PAGESHIFT;
 886         size = (uint64_t)npgs << PAGESHIFT;
 887         while (size != 0) {
 888                 memlist_read_lock();
 889                 for (mlp = phys_install; mlp != NULL; mlp = mlp->ml_next) {
 890                         if (address >= (mlp->ml_address + mlp->ml_size))
 891                                 continue;
 892                         if ((address + size) > mlp->ml_address)
 893                                 break;
 894                 }
 895                 if (mlp == NULL) {
 896                         address += size;
 897                         size = 0;
 898                         thislen = 0;
 899                 } else {
 900                         if (address < mlp->ml_address) {
 901                                 size -= (mlp->ml_address - address);
 902                                 address = mlp->ml_address;
 903                         }
 904                         ASSERT(address >= mlp->ml_address);
 905                         if ((address + size) >
 906                             (mlp->ml_address + mlp->ml_size)) {
 907                                 thislen =
 908                                     mlp->ml_size - (address - mlp->ml_address);
 909                         } else {
 910                                 thislen = size;
 911                         }
 912                 }
 913                 memlist_read_unlock();
 914                 /* TODO: phys_install could change now */
 915                 if (thislen == 0)
 916                         continue;
 917                 mdsp = kmem_zalloc(sizeof (struct memdelspan), KM_SLEEP);
 918                 mdsp->mds_base = btop(address);
 919                 mdsp->mds_npgs = btop(thislen);
 920                 mdsp->mds_next = mdsp_new;
 921                 mdsp_new = mdsp;
 922                 address += thislen;
 923                 size -= thislen;
 924         }
 925         return (mdsp_new);
 926 }
 927
 928 static void
 929 free_delspans(struct memdelspan *mdsp)
 930 {
 931         struct memdelspan *amdsp;
 932
 933         while ((amdsp = mdsp) != NULL) {
 934                 mdsp = amdsp->mds_next;
 935                 kmem_free(amdsp, sizeof (struct memdelspan));
 936         }
 937 }
 938
 939 /*
 940  * Concatenate lists. No list ordering is required.
 941  */
 942
 943 static void
 944 delspan_concat(struct memdelspan **mdspp, struct memdelspan *mdsp)
 945 {
 946         while (*mdspp != NULL)
 947                 mdspp = &(*mdspp)->mds_next;
 948
 949         *mdspp = mdsp;
 950 }
 951
 952 /*
 953  * Given a new list of delspans, check there is no overlap with
 954  * all existing span activity (add or delete) and then concatenate
 955  * the new spans to the given list.
 956  * Return 1 for OK, 0 if overlapping.
 957  */
 958 static int
 959 delspan_insert(
 960         struct transit_list *my_tlp,
 961         struct memdelspan *mdsp_new)
 962 {
 963         struct transit_list_head *trh;
 964         struct transit_list *tlp;
 965         int ret;
 966
 967         trh = &transit_list_head;
 968
 969         ASSERT(my_tlp != NULL);
 970         ASSERT(mdsp_new != NULL);
 971
 972         ret = 1;
 973         mutex_enter(&trh->trh_lock);
 974         /* ASSERT(my_tlp->trl_spans == NULL || tlp_in_list(trh, my_tlp)); */
 975         for (tlp = trh->trh_head; tlp != NULL; tlp = tlp->trl_next) {
 976                 struct memdelspan *mdsp;
 977
 978                 for (mdsp = tlp->trl_spans; mdsp != NULL;
 979                     mdsp = mdsp->mds_next) {
 980                         struct memdelspan *nmdsp;
 981
 982                         for (nmdsp = mdsp_new; nmdsp != NULL;
 983                             nmdsp = nmdsp->mds_next) {
 984                                 if (overlapping(mdsp->mds_base, mdsp->mds_npgs,
 985                                     nmdsp->mds_base, nmdsp->mds_npgs)) {
 986                                         ret = 0;
 987                                         goto done;
 988                                 }
 989                         }
 990                 }
 991         }
 992 done:
 993         if (ret != 0) {
 994                 if (my_tlp->trl_spans == NULL)
 995                         transit_list_insert(my_tlp);
 996                 delspan_concat(&my_tlp->trl_spans, mdsp_new);
 997         }
 998         mutex_exit(&trh->trh_lock);
 999         return (ret);
1000 }
1001
1002 static void
1003 delspan_remove(
1004         struct transit_list *my_tlp,
1005         pfn_t base,
1006         pgcnt_t npgs)
1007 {
1008         struct transit_list_head *trh;
1009         struct memdelspan *mdsp;
1010
1011         trh = &transit_list_head;
1012
1013         ASSERT(my_tlp != NULL);
1014
1015         mutex_enter(&trh->trh_lock);
1016         if ((mdsp = my_tlp->trl_spans) != NULL) {
1017                 if (npgs == 0) {
1018                         my_tlp->trl_spans = NULL;
1019                         free_delspans(mdsp);
1020                         transit_list_remove(my_tlp);
1021                 } else {
1022                         struct memdelspan **prv;
1023
1024                         prv = &my_tlp->trl_spans;
1025                         while (mdsp != NULL) {
1026                                 pfn_t p_end;
1027
1028                                 p_end = mdsp->mds_base + mdsp->mds_npgs;
1029                                 if (mdsp->mds_base >= base &&
1030                                     p_end <= (base + npgs)) {
1031                                         *prv = mdsp->mds_next;
1032                                         mdsp->mds_next = NULL;
1033                                         free_delspans(mdsp);
1034                                 } else {
1035                                         prv = &mdsp->mds_next;
1036                                 }
1037                                 mdsp = *prv;
1038                         }
1039                         if (my_tlp->trl_spans == NULL)
1040                                 transit_list_remove(my_tlp);
1041                 }
1042         }
1043         mutex_exit(&trh->trh_lock);
1044 }
1045
/*
 * Reserve interface for add to stop delete before add finished.
 * This list is only accessed through the delspan_insert/remove
 * functions and so is fully protected by the mutex (trh_lock) in
 * transit_list_head, which both of those functions hold while they
 * examine or modify any transit list.
 */

static struct transit_list reserve_transit;
1053
1054 static int
1055 delspan_reserve(pfn_t base, pgcnt_t npgs)
1056 {
1057         struct memdelspan *mdsp;
1058         int ret;
1059
1060         mdsp = kmem_zalloc(sizeof (struct memdelspan), KM_SLEEP);
1061         mdsp->mds_base = base;
1062         mdsp->mds_npgs = npgs;
1063         if ((ret = delspan_insert(&reserve_transit, mdsp)) == 0) {
1064                 free_delspans(mdsp);
1065         }
1066         return (ret);
1067 }
1068
1069 static void
1070 delspan_unreserve(pfn_t base, pgcnt_t npgs)
1071 {
1072         delspan_remove(&reserve_transit, base, npgs);
1073 }
1074
1075 /*
1076  * Return whether memseg was created by kphysm_add_memory_dynamic().
1077  */
1078 static int
1079 memseg_is_dynamic(struct memseg *seg)
1080 {
1081         return (seg->msegflags & MEMSEG_DYNAMIC);
1082 }
1083
/*
 * Add the page range [base, base + npgs) to the set of spans to be
 * deleted under the given handle.
 *
 * The handle must be in the MHND_INIT state.  The range is intersected
 * with phys_install, the resulting spans are registered on the handle's
 * transit list (which fails if they overlap any span already in transit)
 * and every memseg overlapping one of those spans is then examined.
 *
 * Returns:
 *      KPHYSM_OK         the spans were accepted, or no installed memory
 *                        lies in the range
 *      KPHYSM_EHANDLE    no handle matches
 *      KPHYSM_ESEQUENCE  the handle is not in the MHND_INIT state
 *      KPHYSM_EDUP       the range overlaps a span already on this handle
 *      KPHYSM_EBUSY      the range overlaps a span in transit elsewhere,
 *                        or a memseg for which memseg_includes_meta() is
 *                        true is not wholly inside a span
 *      KPHYSM_ERESOURCE  kphysm_split_memseg() failed
 *      KPHYSM_ENONRELOC  a page in range is PP_ISNORELOC, or the memsegs
 *                        do not account for every page of a span
 *
 * On success mh_phys_pages and mh_vm_pages are increased by the totals
 * counted for the range.  On failure after registration, the spans lying
 * inside the range are removed from the handle's transit list again.
 */
int
kphysm_del_span(
        memhandle_t handle,
        pfn_t base,
        pgcnt_t npgs)
{
        struct mem_handle *mhp;
        struct memseg *seg;
        struct memdelspan *mdsp;
        struct memdelspan *mdsp_new;
        pgcnt_t phys_pages, vm_pages;
        pfn_t p_end;
        page_t *pp;
        int ret;

        /* On success the handle comes back with mh_mutex held. */
        mhp = kphysm_lookup_mem_handle(handle);
        if (mhp == NULL) {
                return (KPHYSM_EHANDLE);
        }
        if (mhp->mh_state != MHND_INIT) {
                mutex_exit(&mhp->mh_mutex);
                return (KPHYSM_ESEQUENCE);
        }

        /*
         * Intersect the span with the installed memory list (phys_install).
         */
        mdsp_new = span_to_install(base, npgs);
        if (mdsp_new == NULL) {
                /*
                 * No physical memory in this range. Is this an
                 * error? If an attempt to start the delete is made
                 * for OK returns from del_span such as this, start will
                 * return an error.
                 * Could return KPHYSM_ENOWORK.
                 */
                /*
                 * It is assumed that there are no error returns
                 * from span_to_install() due to kmem_alloc failure.
                 */
                mutex_exit(&mhp->mh_mutex);
                return (KPHYSM_OK);
        }
        /*
         * Does this span overlap an existing span?
         */
        if (delspan_insert(&mhp->mh_transit, mdsp_new) == 0) {
                /*
                 * Differentiate between already on list for this handle
                 * (KPHYSM_EDUP) and busy elsewhere (KPHYSM_EBUSY).
                 */
                ret = KPHYSM_EBUSY;
                for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
                    mdsp = mdsp->mds_next) {
                        if (overlapping(mdsp->mds_base, mdsp->mds_npgs,
                            base, npgs)) {
                                ret = KPHYSM_EDUP;
                                break;
                        }
                }
                mutex_exit(&mhp->mh_mutex);
                /* The insert was refused, so the new spans are still ours. */
                free_delspans(mdsp_new);
                return (ret);
        }
        /*
         * At this point the spans in mdsp_new have been inserted into the
         * list of spans for this handle and thereby to the global list of
         * spans being processed. Each of these spans must now be checked
         * for relocatability. As a side-effect segments in the memseg list
         * may be split.
         *
         * Note that mdsp_new can no longer be used as it is now part of
         * a larger list. Select elements of this larger list based
         * on base and npgs.
         */
restart:
        /* The totals are recomputed from scratch on every restart. */
        phys_pages = 0;
        vm_pages = 0;
        ret = KPHYSM_OK;
        for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
            mdsp = mdsp->mds_next) {
                pgcnt_t pages_checked;

                /* Only spans belonging to this request are examined. */
                if (!overlapping(mdsp->mds_base, mdsp->mds_npgs, base, npgs)) {
                        continue;
                }
                p_end = mdsp->mds_base + mdsp->mds_npgs;
                /*
                 * The pages_checked count is a hack. All pages should be
                 * checked for relocatability. Those not covered by memsegs
                 * should be tested with arch_kphysm_del_span_ok().
                 */
                pages_checked = 0;
                for (seg = memsegs; seg; seg = seg->next) {
                        pfn_t mseg_start;

                        if (seg->pages_base >= p_end ||
                            seg->pages_end <= mdsp->mds_base) {
                                /* Span and memseg don't overlap. */
                                continue;
                        }
                        mseg_start = memseg_get_start(seg);
                        /* Check that segment is suitable for delete. */
                        if (memseg_includes_meta(seg)) {
                                /*
                                 * Check that this segment is completely
                                 * within the span.
                                 */
                                if (mseg_start < mdsp->mds_base ||
                                    seg->pages_end > p_end) {
                                        ret = KPHYSM_EBUSY;
                                        break;
                                }
                                pages_checked += seg->pages_end - mseg_start;
                        } else {
                                /*
                                 * If this segment is larger than the span,
                                 * try to split it. After the split, it
                                 * is necessary to restart.
                                 */
                                if (seg->pages_base < mdsp->mds_base ||
                                    seg->pages_end > p_end) {
                                        pfn_t abase;
                                        pgcnt_t anpgs;
                                        int s_ret;

                                        /* Split required.  */
                                        /*
                                         * [abase, abase + anpgs) is the part
                                         * of the memseg inside the span.
                                         */
                                        if (mdsp->mds_base < seg->pages_base)
                                                abase = seg->pages_base;
                                        else
                                                abase = mdsp->mds_base;
                                        if (p_end > seg->pages_end)
                                                anpgs = seg->pages_end - abase;
                                        else
                                                anpgs = p_end - abase;
                                        s_ret = kphysm_split_memseg(abase,
                                            anpgs);
                                        if (s_ret == 0) {
                                                /* Split failed. */
                                                ret = KPHYSM_ERESOURCE;
                                                break;
                                        }
                                        goto restart;
                                }
                                pages_checked +=
                                    seg->pages_end - seg->pages_base;
                        }
                        /*
                         * The memseg is wholly within the delete span.
                         * The individual pages can now be checked.
                         */
                        /* Cage test. */
                        for (pp = seg->pages; pp < seg->epages; pp++) {
                                if (PP_ISNORELOC(pp)) {
                                        ret = KPHYSM_ENONRELOC;
                                        break;
                                }
                        }
                        if (ret != KPHYSM_OK) {
                                break;
                        }
                        phys_pages += (seg->pages_end - mseg_start);
                        vm_pages += MSEG_NPAGES(seg);
                }
                if (ret != KPHYSM_OK)
                        break;
                /* Part of the span is not covered by any memseg. */
                if (pages_checked != mdsp->mds_npgs) {
                        ret = KPHYSM_ENONRELOC;
                        break;
                }
        }

        if (ret == KPHYSM_OK) {
                mhp->mh_phys_pages += phys_pages;
                mhp->mh_vm_pages += vm_pages;
        } else {
                /*
                 * Keep holding the mh_mutex to prevent it going away.
                 */
                delspan_remove(&mhp->mh_transit, base, npgs);
        }
        mutex_exit(&mhp->mh_mutex);
        return (ret);
}
1268
/*
 * Report on the relocatability of the page range [base, base + npgs).
 * No handle is involved and no span is registered; the temporary span
 * list built here is freed before returning.
 *
 * On return *mqp holds:
 *      phys_pages              pages of the range found in phys_install
 *      managed                 pages of the range examined through a
 *                              memseg page_t
 *      nonrelocatable          pages counted as non-relocatable, either
 *                              PP_ISNORELOC or rejected by
 *                              arch_kphysm_del_span_ok()
 *      first_nonrelocatable    pfn of the first such page encountered
 *      last_nonrelocatable     pfn of the last such page encountered
 * The last two stay 0 when no non-relocatable page is encountered.
 *
 * Always returns KPHYSM_OK.
 */
int
kphysm_del_span_query(
        pfn_t base,
        pgcnt_t npgs,
        memquery_t *mqp)
{
        struct memdelspan *mdsp;
        struct memdelspan *mdsp_new;
        int done_first_nonreloc;

        mqp->phys_pages = 0;
        mqp->managed = 0;
        mqp->nonrelocatable = 0;
        mqp->first_nonrelocatable = 0;
        mqp->last_nonrelocatable = 0;

        mdsp_new = span_to_install(base, npgs);
        /*
         * It is OK to proceed here if mdsp_new == NULL.
         */
        done_first_nonreloc = 0;
        for (mdsp = mdsp_new; mdsp != NULL; mdsp = mdsp->mds_next) {
                pfn_t sbase;
                pgcnt_t snpgs;

                mqp->phys_pages += mdsp->mds_npgs;
                /* [sbase, sbase + snpgs) is what is left of this span. */
                sbase = mdsp->mds_base;
                snpgs = mdsp->mds_npgs;
                while (snpgs != 0) {
                        struct memseg *lseg, *seg;
                        pfn_t p_end;
                        page_t *pp;
                        pfn_t mseg_start;

                        p_end = sbase + snpgs;
                        /*
                         * Find the lowest addressed memseg that starts
                         * after sbase and account for it.
                         * This is to catch dynamic memsegs whose start
                         * is hidden.
                         */
                        seg = NULL;
                        for (lseg = memsegs; lseg != NULL; lseg = lseg->next) {
                                if ((lseg->pages_base >= sbase) ||
                                    (lseg->pages_base < p_end &&
                                    lseg->pages_end > sbase)) {
                                        if (seg == NULL ||
                                            seg->pages_base > lseg->pages_base)
                                                seg = lseg;
                                }
                        }
                        if (seg != NULL) {
                                mseg_start = memseg_get_start(seg);
                                /*
                                 * Now have the full extent of the memseg so
                                 * do the range check.
                                 */
                                if (mseg_start >= p_end ||
                                    seg->pages_end <= sbase) {
                                        /* Span does not overlap memseg. */
                                        seg = NULL;
                                }
                        }
                        /*
                         * Account for gap either before the segment if
                         * there is one or to the end of the span.
                         */
                        if (seg == NULL || mseg_start > sbase) {
                                pfn_t a_end;

                                a_end = (seg == NULL) ? p_end : mseg_start;
                                /*
                                 * Check with arch layer for relocatability.
                                 */
                                if (arch_kphysm_del_span_ok(sbase,
                                    (a_end - sbase))) {
                                        /*
                                         * No non-relocatable pages in this
                                         * area, avoid the fine-grained
                                         * test.
                                         */
                                        snpgs -= (a_end - sbase);
                                        sbase = a_end;
                                }
                                /*
                                 * Page-by-page test of the gap; skipped
                                 * when the whole gap was accepted above
                                 * because sbase then already equals a_end.
                                 */
                                while (sbase < a_end) {
                                        if (!arch_kphysm_del_span_ok(sbase,
                                            1)) {
                                                mqp->nonrelocatable++;
                                                if (!done_first_nonreloc) {
                                                        mqp->
                                                            first_nonrelocatable
                                                            = sbase;
                                                        done_first_nonreloc = 1;
                                                }
                                                mqp->last_nonrelocatable =
                                                    sbase;
                                        }
                                        sbase++;
                                        snpgs--;
                                }
                        }
                        if (seg != NULL) {
                                ASSERT(mseg_start <= sbase);
                                if (seg->pages_base != mseg_start &&
                                    seg->pages_base > sbase) {
                                        pgcnt_t skip_pgs;

                                        /*
                                         * Skip the page_t area of a
                                         * dynamic memseg.
                                         */
                                        skip_pgs = seg->pages_base - sbase;
                                        if (snpgs <= skip_pgs) {
                                                /* Span ends inside that area. */
                                                sbase += snpgs;
                                                snpgs = 0;
                                                continue;
                                        }
                                        snpgs -= skip_pgs;
                                        sbase += skip_pgs;
                                }
                                ASSERT(snpgs != 0);
                                ASSERT(seg->pages_base <= sbase);
                                /*
                                 * The individual pages can now be checked.
                                 */
                                for (pp = seg->pages +
                                    (sbase - seg->pages_base);
                                    snpgs != 0 && pp < seg->epages; pp++) {
                                        mqp->managed++;
                                        if (PP_ISNORELOC(pp)) {
                                                mqp->nonrelocatable++;
                                                if (!done_first_nonreloc) {
                                                        mqp->
                                                            first_nonrelocatable
                                                            = sbase;
                                                        done_first_nonreloc = 1;
                                                }
                                                mqp->last_nonrelocatable =
                                                    sbase;
                                        }
                                        sbase++;
                                        snpgs--;
                                }
                        }
                }
        }

        free_delspans(mdsp_new);

        return (KPHYSM_OK);
}
1420
1421 /*
1422  * This release function can be called at any stage as follows:
1423  *      _gethandle only called
1424  *      _span(s) only called
1425  *      _start called but failed
1426  *      delete thread exited
1427  */
1428 int
1429 kphysm_del_release(memhandle_t handle)
1430 {
1431         struct mem_handle *mhp;
1432
1433         mhp = kphysm_lookup_mem_handle(handle);
1434         if (mhp == NULL) {
1435                 return (KPHYSM_EHANDLE);
1436         }
1437         switch (mhp->mh_state) {
1438         case MHND_STARTING:
1439         case MHND_RUNNING:
1440                 mutex_exit(&mhp->mh_mutex);
1441                 return (KPHYSM_ENOTFINISHED);
1442         case MHND_FREE:
1443                 ASSERT(mhp->mh_state != MHND_FREE);
1444                 mutex_exit(&mhp->mh_mutex);
1445                 return (KPHYSM_EHANDLE);
1446         case MHND_INIT:
1447                 break;
1448         case MHND_DONE:
1449                 break;
1450         case MHND_RELEASE:
1451                 mutex_exit(&mhp->mh_mutex);
1452                 return (KPHYSM_ESEQUENCE);
1453         default:
1454 #ifdef DEBUG
1455                 cmn_err(CE_WARN, "kphysm_del_release(0x%p) state corrupt %d",
1456                     (void *)mhp, mhp->mh_state);
1457 #endif /* DEBUG */
1458                 mutex_exit(&mhp->mh_mutex);
1459                 return (KPHYSM_EHANDLE);
1460         }
1461         /*
1462          * Set state so that we can wait if necessary.
1463          * Also this means that we have read/write access to all
1464          * fields except mh_exthandle and mh_state.
1465          */
1466         mhp->mh_state = MHND_RELEASE;
1467         /*
1468          * The mem_handle cannot be de-allocated by any other operation
1469          * now, so no need to hold mh_mutex.
1470          */
1471         mutex_exit(&mhp->mh_mutex);
1472
1473         delspan_remove(&mhp->mh_transit, 0, 0);
1474         mhp->mh_phys_pages = 0;
1475         mhp->mh_vm_pages = 0;
1476         mhp->mh_hold_todo = 0;
1477         mhp->mh_delete_complete = NULL;
1478         mhp->mh_delete_complete_arg = NULL;
1479         mhp->mh_cancel = 0;
1480
1481         mutex_enter(&mhp->mh_mutex);
1482         ASSERT(mhp->mh_state == MHND_RELEASE);
1483         mhp->mh_state = MHND_FREE;
1484
1485         kphysm_free_mem_handle(mhp);
1486
1487         return (KPHYSM_OK);
1488 }
1489
1490 /*
1491  * This cancel function can only be called with the thread running.
1492  */
1493 int
1494 kphysm_del_cancel(memhandle_t handle)
1495 {
1496         struct mem_handle *mhp;
1497
1498         mhp = kphysm_lookup_mem_handle(handle);
1499         if (mhp == NULL) {
1500                 return (KPHYSM_EHANDLE);
1501         }
1502         if (mhp->mh_state != MHND_STARTING && mhp->mh_state != MHND_RUNNING) {
1503                 mutex_exit(&mhp->mh_mutex);
1504                 return (KPHYSM_ENOTRUNNING);
1505         }
1506         /*
1507          * Set the cancel flag and wake the delete thread up.
1508          * The thread may be waiting on I/O, so the effect of the cancel
1509          * may be delayed.
1510          */
1511         if (mhp->mh_cancel == 0) {
1512                 mhp->mh_cancel = KPHYSM_ECANCELLED;
1513                 cv_signal(&mhp->mh_cv);
1514         }
1515         mutex_exit(&mhp->mh_mutex);
1516         return (KPHYSM_OK);
1517 }
1518
1519 int
1520 kphysm_del_status(
1521         memhandle_t handle,
1522         memdelstat_t *mdstp)
1523 {
1524         struct mem_handle *mhp;
1525
1526         mhp = kphysm_lookup_mem_handle(handle);
1527         if (mhp == NULL) {
1528                 return (KPHYSM_EHANDLE);
1529         }
1530         /*
1531          * Calling kphysm_del_status() is allowed before the delete
1532          * is started to allow for status display.
1533          */
1534         if (mhp->mh_state != MHND_INIT && mhp->mh_state != MHND_STARTING &&
1535             mhp->mh_state != MHND_RUNNING) {
1536                 mutex_exit(&mhp->mh_mutex);
1537                 return (KPHYSM_ENOTRUNNING);
1538         }
1539         mdstp->phys_pages = mhp->mh_phys_pages;
1540         mdstp->managed = mhp->mh_vm_pages;
1541         mdstp->collected = mhp->mh_vm_pages - mhp->mh_hold_todo;
1542         mutex_exit(&mhp->mh_mutex);
1543         return (KPHYSM_OK);
1544 }
1545
1546 static int mem_delete_additional_pages = 100;
1547
1548 static int
1549 can_remove_pgs(pgcnt_t npgs)
1550 {
1551         /*
1552          * If all pageable pages were paged out, freemem would
1553          * equal availrmem.  There is a minimum requirement for
1554          * availrmem.
1555          */
1556         if ((availrmem - (tune.t_minarmem + mem_delete_additional_pages))
1557             < npgs)
1558                 return (0);
1559         /* TODO: check swap space, etc. */
1560         return (1);
1561 }
1562
1563 static int
1564 get_availrmem(pgcnt_t npgs)
1565 {
1566         int ret;
1567
1568         mutex_enter(&freemem_lock);
1569         ret = can_remove_pgs(npgs);
1570         if (ret != 0)
1571                 availrmem -= npgs;
1572         mutex_exit(&freemem_lock);
1573         return (ret);
1574 }
1575
1576 static void
1577 put_availrmem(pgcnt_t npgs)
1578 {
1579         mutex_enter(&freemem_lock);
1580         availrmem += npgs;
1581         mutex_exit(&freemem_lock);
1582 }
1583
/*
 * freemem_incr is the largest number of pages delthr_get_freemem() asks
 * for in one call; FREEMEM_INCR is its initial value.
 */
#define FREEMEM_INCR    100
static pgcnt_t freemem_incr = FREEMEM_INCR;
/*
 * Timeout, in clock ticks, used by delthr_get_freemem() between attempts
 * to obtain free pages: hz / DEL_FREE_WAIT_FRAC, rounded up.
 */
#define DEL_FREE_WAIT_FRAC      4
#define DEL_FREE_WAIT_TICKS     ((hz+DEL_FREE_WAIT_FRAC-1)/DEL_FREE_WAIT_FRAC)

/* hz / DEL_BUSY_WAIT_FRAC clock ticks, rounded up. */
#define DEL_BUSY_WAIT_FRAC      20
#define DEL_BUSY_WAIT_TICKS     ((hz+DEL_BUSY_WAIT_FRAC-1)/DEL_BUSY_WAIT_FRAC)

/* Forward declarations of functions defined further down this file. */
static void kphysm_del_cleanup(struct mem_handle *);

static void page_delete_collect(page_t *, struct mem_handle *);
1595
/*
 * Obtain free pages on behalf of a memory delete.
 *
 * The request is for freemem_incr pages, or for mh_hold_todo pages if
 * that is smaller.  The caller must hold mh_mutex; it is dropped while
 * pages are being obtained and is held again on return.
 *
 * Returns the number of pages obtained, or 0 if the delete was cancelled
 * (mh_cancel set) before any could be obtained.
 */
static pgcnt_t
delthr_get_freemem(struct mem_handle *mhp)
{
        pgcnt_t free_get;
        int ret;

        ASSERT(MUTEX_HELD(&mhp->mh_mutex));

        MDSTAT_INCR(mhp, need_free);
        /*
         * Get up to freemem_incr pages.
         */
        free_get = freemem_incr;
        if (free_get > mhp->mh_hold_todo)
                free_get = mhp->mh_hold_todo;
        /*
         * Take free_get pages away from freemem,
         * waiting if necessary.
         */

        while (!mhp->mh_cancel) {
                /* mh_mutex is not held across the page allocator calls. */
                mutex_exit(&mhp->mh_mutex);
                MDSTAT_INCR(mhp, free_loop);
                /*
                 * Duplicate test from page_create_throttle()
                 * but don't override with !PG_WAIT.
                 */
                if (freemem < (free_get + throttlefree)) {
                        MDSTAT_INCR(mhp, free_low);
                        ret = 0;
                } else {
                        ret = page_create_wait(free_get, 0);
                        if (ret == 0) {
                                /* EMPTY */
                                MDSTAT_INCR(mhp, free_failed);
                        }
                }
                if (ret != 0) {
                        /* Success: retake mh_mutex for the caller. */
                        mutex_enter(&mhp->mh_mutex);
                        return (free_get);
                }

                /*
                 * Put pressure on pageout.
                 */
                page_needfree(free_get);
                WAKE_PAGEOUT_SCANNER(delthr);

                /*
                 * Wait for up to DEL_FREE_WAIT_TICKS on mh_cv, which
                 * kphysm_del_cancel() signals, then withdraw the request
                 * posted with page_needfree() above.
                 */
                mutex_enter(&mhp->mh_mutex);
                (void) cv_reltimedwait(&mhp->mh_cv, &mhp->mh_mutex,
                    DEL_FREE_WAIT_TICKS, TR_CLOCK_TICK);
                mutex_exit(&mhp->mh_mutex);
                page_needfree(-(spgcnt_t)free_get);

                /* Retake mh_mutex before mh_cancel is tested again. */
                mutex_enter(&mhp->mh_mutex);
        }
        return (0);
}
1654
/*
 * Pacing for dr_aio_cleanup_thread(): the length of the pause it takes
 * between passes over the process list (in microseconds, converted with
 * drv_usectohz()), and the number of passes that cleaned something it
 * may make before it pauses regardless.
 */
#define DR_AIO_CLEANUP_DELAY            25000   /* 0.025secs, in usec */
#define DR_AIO_CLEANUP_MAXLOOPS_NODELAY 100
1657 /*
1658  * This function is run as a helper thread for delete_memory_thread.
1659  * It is needed in order to force kaio cleanup, so that pages used in kaio
1660  * will be unlocked and subsequently relocated by delete_memory_thread.
1661  * The address of the delete_memory_threads's mem_handle is passed in to
1662  * this thread function, and is used to set the mh_aio_cleanup_done member
1663  * prior to calling thread_exit().
1664  */
1665 static void
1666 dr_aio_cleanup_thread(caddr_t amhp)
1667 {
1668         proc_t *procp;
1669         int (*aio_cleanup_dr_delete_memory)(proc_t *);
1670         int cleaned;
1671         int n = 0;
1672         struct mem_handle *mhp;
1673         volatile uint_t *pcancel;
1674
1675         mhp = (struct mem_handle *)amhp;
1676         ASSERT(mhp != NULL);
1677         pcancel = &mhp->mh_dr_aio_cleanup_cancel;
1678         if (modload("sys", "kaio") == -1) {
1679                 mhp->mh_aio_cleanup_done = 1;
1680                 cmn_err(CE_WARN, "dr_aio_cleanup_thread: cannot load kaio");
1681                 thread_exit();
1682         }
1683         aio_cleanup_dr_delete_memory = (int (*)(proc_t *))
1684             modgetsymvalue("aio_cleanup_dr_delete_memory", 0);
1685         if (aio_cleanup_dr_delete_memory == NULL) {
1686                 mhp->mh_aio_cleanup_done = 1;
1687                 cmn_err(CE_WARN,
1688             "aio_cleanup_dr_delete_memory not found in kaio");
1689                 thread_exit();
1690         }
1691         do {
1692                 cleaned = 0;
1693                 mutex_enter(&pidlock);
1694                 for (procp = practive; (*pcancel == 0) && (procp != NULL);
1695                     procp = procp->p_next) {
1696                         mutex_enter(&procp->p_lock);
1697                         if (procp->p_aio != NULL) {
1698                                 /* cleanup proc's outstanding kaio */
1699                                 cleaned +=
1700                                     (*aio_cleanup_dr_delete_memory)(procp);
1701                         }
1702                         mutex_exit(&procp->p_lock);
1703                 }
1704                 mutex_exit(&pidlock);
1705                 if ((*pcancel == 0) &&
1706                     (!cleaned || (++n == DR_AIO_CLEANUP_MAXLOOPS_NODELAY))) {
1707                         /* delay a bit before retrying all procs again */
1708                         delay(drv_usectohz(DR_AIO_CLEANUP_DELAY));
1709                         n = 0;
1710                 }
1711         } while (*pcancel == 0);
1712         mhp->mh_aio_cleanup_done = 1;
1713         thread_exit();
1714 }
1715
1716 static void
1717 delete_memory_thread(caddr_t amhp)
1718 {
1719         struct mem_handle *mhp;
1720         struct memdelspan *mdsp;
1721         callb_cpr_t cprinfo;
1722         page_t *pp_targ;
1723         spgcnt_t freemem_left;
1724         void (*del_complete_funcp)(void *, int error);
1725         void *del_complete_arg;
1726         int comp_code;
1727         int ret;
1728         int first_scan;
1729         uint_t szc;
1730 #ifdef MEM_DEL_STATS
1731         uint64_t start_total, ntick_total;
1732         uint64_t start_pgrp, ntick_pgrp;
1733 #endif /* MEM_DEL_STATS */
1734
1735         mhp = (struct mem_handle *)amhp;
1736
1737 #ifdef MEM_DEL_STATS
1738         start_total = ddi_get_lbolt();
1739 #endif /* MEM_DEL_STATS */
1740
1741         CALLB_CPR_INIT(&cprinfo, &mhp->mh_mutex,
1742             callb_generic_cpr, "memdel");
1743
1744         mutex_enter(&mhp->mh_mutex);
1745         ASSERT(mhp->mh_state == MHND_STARTING);
1746
1747         mhp->mh_state = MHND_RUNNING;
1748         mhp->mh_thread_id = curthread;
1749
1750         mhp->mh_hold_todo = mhp->mh_vm_pages;
1751         mutex_exit(&mhp->mh_mutex);
1752
1753         /* Allocate the remap pages now, if necessary. */
1754         memseg_remap_init();
1755
1756         /*
1757          * Subtract from availrmem now if possible as availrmem
1758          * may not be available by the end of the delete.
1759          */
1760         if (!get_availrmem(mhp->mh_vm_pages)) {
1761                 comp_code = KPHYSM_ENOTVIABLE;
1762                 mutex_enter(&mhp->mh_mutex);
1763                 goto early_exit;
1764         }
1765
1766         ret = kphysm_setup_pre_del(mhp->mh_vm_pages);
1767
1768         mutex_enter(&mhp->mh_mutex);
1769
1770         if (ret != 0) {
1771                 mhp->mh_cancel = KPHYSM_EREFUSED;
1772                 goto refused;
1773         }
1774
1775         transit_list_collect(mhp, 1);
1776
1777         for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
1778             mdsp = mdsp->mds_next) {
1779                 ASSERT(mdsp->mds_bitmap == NULL);
1780                 mdsp->mds_bitmap = kmem_zalloc(MDS_BITMAPBYTES(mdsp), KM_SLEEP);
1781                 mdsp->mds_bitmap_retired = kmem_zalloc(MDS_BITMAPBYTES(mdsp),
1782                     KM_SLEEP);
1783         }
1784
1785         first_scan = 1;
1786         freemem_left = 0;
1787         /*
1788          * Start dr_aio_cleanup_thread, which periodically iterates
1789          * through the process list and invokes aio cleanup.  This
1790          * is needed in order to avoid a deadly embrace between the
1791          * delete_memory_thread (waiting on writer lock for page, with the
1792          * exclusive-wanted bit set), kaio read request threads (waiting for a
1793          * reader lock on the same page that is wanted by the
1794          * delete_memory_thread), and threads waiting for kaio completion
1795          * (blocked on spt_amp->lock).
1796          */
1797         mhp->mh_dr_aio_cleanup_cancel = 0;
1798         mhp->mh_aio_cleanup_done = 0;
1799         (void) thread_create(NULL, 0, dr_aio_cleanup_thread,
1800             (caddr_t)mhp, 0, &p0, TS_RUN, maxclsyspri - 1);
1801         while ((mhp->mh_hold_todo != 0) && (mhp->mh_cancel == 0)) {
1802                 pgcnt_t collected;
1803
1804                 MDSTAT_INCR(mhp, nloop);
1805                 collected = 0;
1806                 for (mdsp = mhp->mh_transit.trl_spans; (mdsp != NULL) &&
1807                     (mhp->mh_cancel == 0); mdsp = mdsp->mds_next) {
1808                         pfn_t pfn, p_end;
1809
1810                         p_end = mdsp->mds_base + mdsp->mds_npgs;
1811                         for (pfn = mdsp->mds_base; (pfn < p_end) &&
1812                             (mhp->mh_cancel == 0); pfn++) {
1813                                 page_t *pp, *tpp, *tpp_targ;
1814                                 pgcnt_t bit;
1815                                 struct vnode *vp;
1816                                 u_offset_t offset;
1817                                 int mod, result;
1818                                 spgcnt_t pgcnt;
1819
1820                                 bit = pfn - mdsp->mds_base;
1821                                 if ((mdsp->mds_bitmap[bit / NBPBMW] &
1822                                     (1 << (bit % NBPBMW))) != 0) {
1823                                         MDSTAT_INCR(mhp, already_done);
1824                                         continue;
1825                                 }
1826                                 if (freemem_left == 0) {
1827                                         freemem_left += delthr_get_freemem(mhp);
1828                                         if (freemem_left == 0)
1829                                                 break;
1830                                 }
1831
1832                                 /*
1833                                  * Release mh_mutex - some of this
1834                                  * stuff takes some time (eg PUTPAGE).
1835                                  */
1836
1837                                 mutex_exit(&mhp->mh_mutex);
1838                                 MDSTAT_INCR(mhp, ncheck);
1839
1840                                 pp = page_numtopp_nolock(pfn);
1841                                 if (pp == NULL) {
1842                                         /*
1843                                          * Not covered by a page_t - will
1844                                          * be dealt with elsewhere.
1845                                          */
1846                                         MDSTAT_INCR(mhp, nopaget);
1847                                         mutex_enter(&mhp->mh_mutex);
1848                                         mdsp->mds_bitmap[bit / NBPBMW] |=
1849                                             (1 << (bit % NBPBMW));
1850                                         continue;
1851                                 }
1852
1853                                 if (!page_try_reclaim_lock(pp, SE_EXCL,
1854                                     SE_EXCL_WANTED | SE_RETIRED)) {
1855                                         /*
1856                                          * Page in use elsewhere.  Skip it.
1857                                          */
1858                                         MDSTAT_INCR(mhp, lockfail);
1859                                         mutex_enter(&mhp->mh_mutex);
1860                                         continue;
1861                                 }
1862                                 /*
1863                                  * See if the cage expanded into the delete.
1864                                  * This can happen as we have to allow the
1865                                  * cage to expand.
1866                                  */
1867                                 if (PP_ISNORELOC(pp)) {
1868                                         page_unlock(pp);
1869                                         mutex_enter(&mhp->mh_mutex);
1870                                         mhp->mh_cancel = KPHYSM_ENONRELOC;
1871                                         break;
1872                                 }
1873                                 if (PP_RETIRED(pp)) {
1874                                         /*
1875                                          * Page has been retired and is
1876                                          * not part of the cage so we
1877                                          * can now do the accounting for
1878                                          * it.
1879                                          */
1880                                         MDSTAT_INCR(mhp, retired);
1881                                         mutex_enter(&mhp->mh_mutex);
1882                                         mdsp->mds_bitmap[bit / NBPBMW]
1883                                             |= (1 << (bit % NBPBMW));
1884                                         mdsp->mds_bitmap_retired[bit /
1885                                             NBPBMW] |=
1886                                             (1 << (bit % NBPBMW));
1887                                         mhp->mh_hold_todo--;
1888                                         continue;
1889                                 }
1890                                 ASSERT(freemem_left != 0);
1891                                 if (PP_ISFREE(pp)) {
1892                                         /*
1893                                          * Like page_reclaim() only 'freemem'
1894                                          * processing is already done.
1895                                          */
1896                                         MDSTAT_INCR(mhp, nfree);
1897                                 free_page_collect:
1898                                         if (PP_ISAGED(pp)) {
1899                                                 page_list_sub(pp,
1900                                                     PG_FREE_LIST);
1901                                         } else {
1902                                                 page_list_sub(pp,
1903                                                     PG_CACHE_LIST);
1904                                         }
1905                                         PP_CLRFREE(pp);
1906                                         PP_CLRAGED(pp);
1907                                         collected++;
1908                                         mutex_enter(&mhp->mh_mutex);
1909                                         page_delete_collect(pp, mhp);
1910                                         mdsp->mds_bitmap[bit / NBPBMW] |=
1911                                             (1 << (bit % NBPBMW));
1912                                         freemem_left--;
1913                                         continue;
1914                                 }
1915                                 ASSERT(pp->p_vnode != NULL);
1916                                 if (first_scan) {
1917                                         MDSTAT_INCR(mhp, first_notfree);
1918                                         page_unlock(pp);
1919                                         mutex_enter(&mhp->mh_mutex);
1920                                         continue;
1921                                 }
1922                                 /*
1923                                  * Keep stats on pages encountered that
1924                                  * are marked for retirement.
1925                                  */
1926                                 if (PP_TOXIC(pp)) {
1927                                         MDSTAT_INCR(mhp, toxic);
1928                                 } else if (PP_PR_REQ(pp)) {
1929                                         MDSTAT_INCR(mhp, failing);
1930                                 }
1931                                 /*
1932                                  * In certain cases below, special exceptions
1933                                  * are made for pages that are toxic.  This
1934                                  * is because the current meaning of toxic
1935                                  * is that an uncorrectable error has been
1936                                  * previously associated with the page.
1937                                  */
1938                                 if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
1939                                         if (!PP_TOXIC(pp)) {
1940                                                 /*
1941                                                  * Must relocate locked in
1942                                                  * memory pages.
1943                                                  */
1944 #ifdef MEM_DEL_STATS
1945                                                 start_pgrp = ddi_get_lbolt();
1946 #endif /* MEM_DEL_STATS */
1947                                                 /*
1948                                                  * Lock all constituent pages
1949                                                  * of a large page to ensure
1950                                                  * that p_szc won't change.
1951                                                  */
1952                                                 if (!group_page_trylock(pp,
1953                                                     SE_EXCL)) {
1954                                                         MDSTAT_INCR(mhp,
1955                                                             gptllckfail);
1956                                                         page_unlock(pp);
1957                                                         mutex_enter(
1958                                                             &mhp->mh_mutex);
1959                                                         continue;
1960                                                 }
1961                                                 MDSTAT_INCR(mhp, npplocked);
1962                                                 pp_targ =
1963                                                     page_get_replacement_page(
1964                                                     pp, NULL, 0);
1965                                                 if (pp_targ != NULL) {
1966 #ifdef MEM_DEL_STATS
1967                                                         ntick_pgrp =
1968                                                             (uint64_t)
1969                                                             ddi_get_lbolt() -
1970                                                             start_pgrp;
1971 #endif /* MEM_DEL_STATS */
1972                                                         MDSTAT_PGRP(mhp,
1973                                                             ntick_pgrp);
1974                                                         MDSTAT_INCR(mhp,
1975                                                             nlockreloc);
1976                                                         goto reloc;
1977                                                 }
1978                                                 group_page_unlock(pp);
1979                                                 page_unlock(pp);
1980 #ifdef MEM_DEL_STATS
1981                                                 ntick_pgrp =
1982                                                     (uint64_t)ddi_get_lbolt() -
1983                                                     start_pgrp;
1984 #endif /* MEM_DEL_STATS */
1985                                                 MDSTAT_PGRP(mhp, ntick_pgrp);
1986                                                 MDSTAT_INCR(mhp, nnorepl);
1987                                                 mutex_enter(&mhp->mh_mutex);
1988                                                 continue;
1989                                         } else {
1990                                                 /*
1991                                                  * Cannot do anything about
1992                                                  * this page because it is
1993                                                  * toxic.
1994                                                  */
1995                                                 MDSTAT_INCR(mhp, npplkdtoxic);
1996                                                 page_unlock(pp);
1997                                                 mutex_enter(&mhp->mh_mutex);
1998                                                 continue;
1999                                         }
2000                                 }
2001                                 /*
2002                                  * Unload the mappings and check if mod bit
2003                                  * is set.
2004                                  */
2005                                 ASSERT(!PP_ISKAS(pp));
2006                                 (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
2007                                 mod = hat_ismod(pp);
2008
2009 #ifdef MEM_DEL_STATS
2010                                 start_pgrp = ddi_get_lbolt();
2011 #endif /* MEM_DEL_STATS */
2012                                 if (mod && !PP_TOXIC(pp)) {
2013                                         /*
2014                                          * Lock all constituent pages
2015                                          * of a large page to ensure
2016                                          * that p_szc won't change.
2017                                          */
2018                                         if (!group_page_trylock(pp, SE_EXCL)) {
2019                                                 MDSTAT_INCR(mhp, gptlmodfail);
2020                                                 page_unlock(pp);
2021                                                 mutex_enter(&mhp->mh_mutex);
2022                                                 continue;
2023                                         }
2024                                         pp_targ = page_get_replacement_page(pp,
2025                                             NULL, 0);
2026                                         if (pp_targ != NULL) {
2027                                                 MDSTAT_INCR(mhp, nmodreloc);
2028 #ifdef MEM_DEL_STATS
2029                                                 ntick_pgrp =
2030                                                     (uint64_t)ddi_get_lbolt() -
2031                                                     start_pgrp;
2032 #endif /* MEM_DEL_STATS */
2033                                                 MDSTAT_PGRP(mhp, ntick_pgrp);
2034                                                 goto reloc;
2035                                         }
2036                                         group_page_unlock(pp);
2037                                 }
2038
2039                                 if (!page_try_demote_pages(pp)) {
2040                                         MDSTAT_INCR(mhp, demotefail);
2041                                         page_unlock(pp);
2042 #ifdef MEM_DEL_STATS
2043                                         ntick_pgrp = (uint64_t)ddi_get_lbolt() -
2044                                             start_pgrp;
2045 #endif /* MEM_DEL_STATS */
2046                                         MDSTAT_PGRP(mhp, ntick_pgrp);
2047                                         mutex_enter(&mhp->mh_mutex);
2048                                         continue;
2049                                 }
2050
2051                                 /*
2052                                  * Regular 'page-out'.
2053                                  */
2054                                 if (!mod) {
2055                                         MDSTAT_INCR(mhp, ndestroy);
2056                                         page_destroy(pp, 1);
2057                                         /*
2058                                          * page_destroy was called with
2059                                          * dontfree. As long as p_lckcnt
2060                                          * and p_cowcnt are both zero, the
2061                                          * only additional action of
2062                                          * page_destroy with !dontfree is to
2063                                          * call page_free, so we can collect
2064                                          * the page here.
2065                                          */
2066                                         collected++;
2067 #ifdef MEM_DEL_STATS
2068                                         ntick_pgrp = (uint64_t)ddi_get_lbolt() -
2069                                             start_pgrp;
2070 #endif /* MEM_DEL_STATS */
2071                                         MDSTAT_PGRP(mhp, ntick_pgrp);
2072                                         mutex_enter(&mhp->mh_mutex);
2073                                         page_delete_collect(pp, mhp);
2074                                         mdsp->mds_bitmap[bit / NBPBMW] |=
2075                                             (1 << (bit % NBPBMW));
2076                                         continue;
2077                                 }
2078                                 /*
2079                                  * The page is toxic and the mod bit is
2080                                  * set, we cannot do anything here to deal
2081                                  * with it.
2082                                  */
2083                                 if (PP_TOXIC(pp)) {
2084                                         page_unlock(pp);
2085 #ifdef MEM_DEL_STATS
2086                                         ntick_pgrp = (uint64_t)ddi_get_lbolt() -
2087                                             start_pgrp;
2088 #endif /* MEM_DEL_STATS */
2089                                         MDSTAT_PGRP(mhp, ntick_pgrp);
2090                                         MDSTAT_INCR(mhp, modtoxic);
2091                                         mutex_enter(&mhp->mh_mutex);
2092                                         continue;
2093                                 }
2094                                 MDSTAT_INCR(mhp, nputpage);
2095                                 vp = pp->p_vnode;
2096                                 offset = pp->p_offset;
2097                                 VN_HOLD(vp);
2098                                 page_unlock(pp);
2099                                 (void) VOP_PUTPAGE(vp, offset, PAGESIZE,
2100                                     B_INVAL|B_FORCE, kcred, NULL);
2101                                 VN_RELE(vp);
2102 #ifdef MEM_DEL_STATS
2103                                 ntick_pgrp = (uint64_t)ddi_get_lbolt() -
2104                                     start_pgrp;
2105 #endif /* MEM_DEL_STATS */
2106                                 MDSTAT_PGRP(mhp, ntick_pgrp);
2107                                 /*
2108                                  * Try to get the page back immediately
2109                                  * so that it can be collected.
2110                                  */
2111                                 pp = page_numtopp_nolock(pfn);
2112                                 if (pp == NULL) {
2113                                         MDSTAT_INCR(mhp, nnoreclaim);
2114                                         /*
2115                                          * This should not happen as this
2116                                          * thread is deleting the page.
2117                                          * If this code is generalized, this
2118                                          * becomes a reality.
2119                                          */
2120 #ifdef DEBUG
2121                                         cmn_err(CE_WARN,
2122                                             "delete_memory_thread(0x%p) "
2123                                             "pfn 0x%lx has no page_t",
2124                                             (void *)mhp, pfn);
2125 #endif /* DEBUG */
2126                                         mutex_enter(&mhp->mh_mutex);
2127                                         continue;
2128                                 }
2129                                 if (page_try_reclaim_lock(pp, SE_EXCL,
2130                                     SE_EXCL_WANTED | SE_RETIRED)) {
2131                                         if (PP_ISFREE(pp)) {
2132                                                 goto free_page_collect;
2133                                         }
2134                                         page_unlock(pp);
2135                                 }
2136                                 MDSTAT_INCR(mhp, nnoreclaim);
2137                                 mutex_enter(&mhp->mh_mutex);
2138                                 continue;
2139
2140                         reloc:
2141                                 /*
2142                                  * Got some freemem and a target
2143                                  * page, so move the data to avoid
2144                                  * I/O and lock problems.
2145                                  */
2146                                 ASSERT(!page_iolock_assert(pp));
2147                                 MDSTAT_INCR(mhp, nreloc);
2148                                 /*
2149                                  * page_relocate() will return pgcnt: the
2150                                  * number of consecutive pages relocated.
2151                                  * If it is successful, pp will be a
2152                                  * linked list of the page structs that
2153                                  * were relocated. If page_relocate() is
2154                                  * unsuccessful, pp will be unmodified.
2155                                  */
2156 #ifdef MEM_DEL_STATS
2157                                 start_pgrp = ddi_get_lbolt();
2158 #endif /* MEM_DEL_STATS */
2159                                 result = page_relocate(&pp, &pp_targ, 0, 0,
2160                                     &pgcnt, NULL);
2161 #ifdef MEM_DEL_STATS
2162                                 ntick_pgrp = (uint64_t)ddi_get_lbolt() -
2163                                     start_pgrp;
2164 #endif /* MEM_DEL_STATS */
2165                                 MDSTAT_PGRP(mhp, ntick_pgrp);
2166                                 if (result != 0) {
2167                                         MDSTAT_INCR(mhp, nrelocfail);
2168                                         /*
2169                                          * We did not succeed. We need
2170                                          * to give the pp_targ pages back.
2171                                          * page_free(pp_targ, 1) without
2172                                          * the freemem accounting.
2173                                          */
2174                                         group_page_unlock(pp);
2175                                         page_free_replacement_page(pp_targ);
2176                                         page_unlock(pp);
2177                                         mutex_enter(&mhp->mh_mutex);
2178                                         continue;
2179                                 }
2180
2181                                 /*
2182                                  * We will then collect pgcnt pages.
2183                                  */
2184                                 ASSERT(pgcnt > 0);
2185                                 mutex_enter(&mhp->mh_mutex);
2186                                 /*
2187                                  * We need to make sure freemem_left is
2188                                  * large enough.
2189                                  */
2190                                 while ((freemem_left < pgcnt) &&
2191                                     (!mhp->mh_cancel)) {
2192                                         freemem_left +=
2193                                             delthr_get_freemem(mhp);
2194                                 }
2195
2196                                 /*
2197                                  * Do not proceed if mh_cancel is set.
2198                                  */
2199                                 if (mhp->mh_cancel) {
2200                                         while (pp_targ != NULL) {
2201                                                 /*
2202                                                  * Unlink and unlock each page.
2203                                                  */
2204                                                 tpp_targ = pp_targ;
2205                                                 page_sub(&pp_targ, tpp_targ);
2206                                                 page_unlock(tpp_targ);
2207                                         }
2208                                         /*
2209                                          * We need to give the pp pages back.
2210                                          * page_free(pp, 1) without the
2211                                          * freemem accounting.
2212                                          */
2213                                         page_free_replacement_page(pp);
2214                                         break;
2215                                 }
2216
2217                                 /* Now remove pgcnt from freemem_left */
2218                                 freemem_left -= pgcnt;
2219                                 ASSERT(freemem_left >= 0);
2220                                 szc = pp->p_szc;
2221                                 while (pp != NULL) {
2222                                         /*
2223                                          * pp and pp_targ were passed back as
2224                                          * a linked list of pages.
2225                                          * Unlink and unlock each page.
2226                                          */
2227                                         tpp_targ = pp_targ;
2228                                         page_sub(&pp_targ, tpp_targ);
2229                                         page_unlock(tpp_targ);
2230                                         /*
2231                                          * The original page is now free
2232                                          * so remove it from the linked
2233                                          * list and collect it.
2234                                          */
2235                                         tpp = pp;
2236                                         page_sub(&pp, tpp);
2237                                         pfn = page_pptonum(tpp);
2238                                         collected++;
2239                                         ASSERT(PAGE_EXCL(tpp));
2240                                         ASSERT(tpp->p_vnode == NULL);
2241                                         ASSERT(!hat_page_is_mapped(tpp));
2242                                         ASSERT(tpp->p_szc == szc);
2243                                         tpp->p_szc = 0;
2244                                         page_delete_collect(tpp, mhp);
2245                                         bit = pfn - mdsp->mds_base;
2246                                         mdsp->mds_bitmap[bit / NBPBMW] |=
2247                                             (1 << (bit % NBPBMW));
2248                                 }
2249                                 ASSERT(pp_targ == NULL);
2250                         }
2251                 }
2252                 first_scan = 0;
2253                 if ((mhp->mh_cancel == 0) && (mhp->mh_hold_todo != 0) &&
2254                     (collected == 0)) {
2255                         /*
2256                          * This code is needed as we cannot wait
2257                          * for a page to be locked OR the delete to
2258                          * be cancelled.  Also, we must delay so
2259                          * that other threads get a chance to run
2260                          * on our cpu, otherwise page locks may be
2261                          * held indefinitely by those threads.
2262                          */
2263                         MDSTAT_INCR(mhp, ndelay);
2264                         CALLB_CPR_SAFE_BEGIN(&cprinfo);
2265                         (void) cv_reltimedwait(&mhp->mh_cv, &mhp->mh_mutex,
2266                             DEL_BUSY_WAIT_TICKS, TR_CLOCK_TICK);
2267                         CALLB_CPR_SAFE_END(&cprinfo, &mhp->mh_mutex);
2268                 }
2269         }
2270         /* stop the dr aio cleanup thread */
2271         mhp->mh_dr_aio_cleanup_cancel = 1;
2272         transit_list_collect(mhp, 0);
2273         if (freemem_left != 0) {
2274                 /* Return any surplus. */
2275                 page_create_putback(freemem_left);
2276                 freemem_left = 0;
2277         }
2278 #ifdef MEM_DEL_STATS
2279         ntick_total = (uint64_t)ddi_get_lbolt() - start_total;
2280 #endif /* MEM_DEL_STATS */
2281         MDSTAT_TOTAL(mhp, ntick_total);
2282         MDSTAT_PRINT(mhp);
2283
2284         /*
2285          * If the memory delete was cancelled, exclusive-wanted bits must
2286          * be cleared. If there are retired pages being deleted, they need
2287          * to be unretired.
2288          */
2289         for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
2290             mdsp = mdsp->mds_next) {
2291                 pfn_t pfn, p_end;
2292
2293                 p_end = mdsp->mds_base + mdsp->mds_npgs;
2294                 for (pfn = mdsp->mds_base; pfn < p_end; pfn++) {
2295                         page_t *pp;
2296                         pgcnt_t bit;
2297
2298                         bit = pfn - mdsp->mds_base;
2299                         if (mhp->mh_cancel) {
2300                                 pp = page_numtopp_nolock(pfn);
2301                                 if (pp != NULL) {
2302                                         if ((mdsp->mds_bitmap[bit / NBPBMW] &
2303                                             (1 << (bit % NBPBMW))) == 0) {
2304                                                 page_lock_clr_exclwanted(pp);
2305                                         }
2306                                 }
2307                         } else {
2308                                 pp = NULL;
2309                         }
2310                         if ((mdsp->mds_bitmap_retired[bit / NBPBMW] &
2311                             (1 << (bit % NBPBMW))) != 0) {
2312                                 /* do we already have pp? */
2313                                 if (pp == NULL) {
2314                                         pp = page_numtopp_nolock(pfn);
2315                                 }
2316                                 ASSERT(pp != NULL);
2317                                 ASSERT(PP_RETIRED(pp));
2318                                 if (mhp->mh_cancel != 0) {
2319                                         page_unlock(pp);
2320                                         /*
2321                                          * To satisfy ASSERT below in
2322                                          * cancel code.
2323                                          */
2324                                         mhp->mh_hold_todo++;
2325                                 } else {
2326                                         (void) page_unretire_pp(pp,
2327                                             PR_UNR_CLEAN);
2328                                 }
2329                         }
2330                 }
2331         }
2332         /*
2333          * Free retired page bitmap and collected page bitmap
2334          */
2335         for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
2336             mdsp = mdsp->mds_next) {
2337                 ASSERT(mdsp->mds_bitmap_retired != NULL);
2338                 kmem_free(mdsp->mds_bitmap_retired, MDS_BITMAPBYTES(mdsp));
2339                 mdsp->mds_bitmap_retired = NULL;        /* Paranoia. */
2340                 ASSERT(mdsp->mds_bitmap != NULL);
2341                 kmem_free(mdsp->mds_bitmap, MDS_BITMAPBYTES(mdsp));
2342                 mdsp->mds_bitmap = NULL;        /* Paranoia. */
2343         }
2344
2345         /* wait for our dr aio cancel thread to exit */
2346         while (!(mhp->mh_aio_cleanup_done)) {
2347                 CALLB_CPR_SAFE_BEGIN(&cprinfo);
2348                 delay(drv_usectohz(DR_AIO_CLEANUP_DELAY));
2349                 CALLB_CPR_SAFE_END(&cprinfo, &mhp->mh_mutex);
2350         }
2351 refused:
2352         if (mhp->mh_cancel != 0) {
2353                 page_t *pp;
2354
2355                 comp_code = mhp->mh_cancel;
2356                 /*
2357                  * Go through list of deleted pages (mh_deleted) freeing
2358                  * them.
2359                  */
2360                 while ((pp = mhp->mh_deleted) != NULL) {
2361                         mhp->mh_deleted = pp->p_next;
2362                         mhp->mh_hold_todo++;
2363                         mutex_exit(&mhp->mh_mutex);
2364                         /* Restore p_next. */
2365                         pp->p_next = pp->p_prev;
2366                         if (PP_ISFREE(pp)) {
2367                                 cmn_err(CE_PANIC,
2368                                     "page %p is free",
2369                                     (void *)pp);
2370                         }
2371                         page_free(pp, 1);
2372                         mutex_enter(&mhp->mh_mutex);
2373                 }
2374                 ASSERT(mhp->mh_hold_todo == mhp->mh_vm_pages);
2375
2376                 mutex_exit(&mhp->mh_mutex);
2377                 put_availrmem(mhp->mh_vm_pages);
2378                 mutex_enter(&mhp->mh_mutex);
2379
2380                 goto t_exit;
2381         }
2382
2383         /*
2384          * All the pages are no longer in use and are exclusively locked.
2385          */
2386
2387         mhp->mh_deleted = NULL;
2388
2389         kphysm_del_cleanup(mhp);
2390
2391         /*
2392          * mem_node_del_range needs to be after kphysm_del_cleanup so
2393          * that the mem_node_config[] will remain intact for the cleanup.
2394          */
2395         for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
2396             mdsp = mdsp->mds_next) {
2397                 mem_node_del_range(mdsp->mds_base,
2398                     mdsp->mds_base + mdsp->mds_npgs - 1);
2399         }
2400         /* cleanup the page counters */
2401         page_ctrs_cleanup();
2402
2403         comp_code = KPHYSM_OK;
2404
2405 t_exit:
2406         mutex_exit(&mhp->mh_mutex);
2407         kphysm_setup_post_del(mhp->mh_vm_pages,
2408             (comp_code == KPHYSM_OK) ? 0 : 1);
2409         mutex_enter(&mhp->mh_mutex);
2410
2411 early_exit:
2412         /* mhp->mh_mutex exited by CALLB_CPR_EXIT() */
2413         mhp->mh_state = MHND_DONE;
2414         del_complete_funcp = mhp->mh_delete_complete;
2415         del_complete_arg = mhp->mh_delete_complete_arg;
2416         CALLB_CPR_EXIT(&cprinfo);
2417         (*del_complete_funcp)(del_complete_arg, comp_code);
2418         thread_exit();
2419         /*NOTREACHED*/
2420 }
2421
2422 /*
2423  * Start the delete of the memory from the system.
2424  */
2425 int
2426 kphysm_del_start(
2427         memhandle_t handle,
2428         void (*complete)(void *, int),
2429         void *complete_arg)
2430 {
2431         struct mem_handle *mhp;
2432
2433         mhp = kphysm_lookup_mem_handle(handle);
2434         if (mhp == NULL) {
2435                 return (KPHYSM_EHANDLE);
2436         }
2437         switch (mhp->mh_state) {
2438         case MHND_FREE:
2439                 ASSERT(mhp->mh_state != MHND_FREE);
2440                 mutex_exit(&mhp->mh_mutex);
2441                 return (KPHYSM_EHANDLE);
2442         case MHND_INIT:
2443                 break;
2444         case MHND_STARTING:
2445         case MHND_RUNNING:
2446                 mutex_exit(&mhp->mh_mutex);
2447                 return (KPHYSM_ESEQUENCE);
2448         case MHND_DONE:
2449                 mutex_exit(&mhp->mh_mutex);
2450                 return (KPHYSM_ESEQUENCE);
2451         case MHND_RELEASE:
2452                 mutex_exit(&mhp->mh_mutex);
2453                 return (KPHYSM_ESEQUENCE);
2454         default:
2455 #ifdef DEBUG
2456                 cmn_err(CE_WARN, "kphysm_del_start(0x%p) state corrupt %d",
2457                     (void *)mhp, mhp->mh_state);
2458 #endif /* DEBUG */
2459                 mutex_exit(&mhp->mh_mutex);
2460                 return (KPHYSM_EHANDLE);
2461         }
2462
2463         if (mhp->mh_transit.trl_spans == NULL) {
2464                 mutex_exit(&mhp->mh_mutex);
2465                 return (KPHYSM_ENOWORK);
2466         }
2467
2468         ASSERT(complete != NULL);
2469         mhp->mh_delete_complete = complete;
2470         mhp->mh_delete_complete_arg = complete_arg;
2471         mhp->mh_state = MHND_STARTING;
2472         /*
2473          * Release the mutex in case thread_create sleeps.
2474          */
2475         mutex_exit(&mhp->mh_mutex);
2476
2477         /*
2478          * The "obvious" process for this thread is pageout (proc_pageout)
2479          * but this gives the thread too much power over freemem
2480          * which results in freemem starvation.
2481          */
2482         (void) thread_create(NULL, 0, delete_memory_thread, mhp, 0, &p0,
2483             TS_RUN, maxclsyspri - 1);
2484
2485         return (KPHYSM_OK);
2486 }
2487
/*
 * State describing the dummy page_t area.  It is built once by
 * memseg_remap_init() and read by remap_to_dummy().
 */
static kmutex_t pp_dummy_lock;          /* Protects init. of pp_dummy. */
static caddr_t pp_dummy;        /* Base VA; non-NULL once init is done. */
static pgcnt_t pp_dummy_npages; /* Number of dummy pages allocated. */
static pfn_t *pp_dummy_pfn;     /* Array of dummy pfns. */
2492
2493 static void
2494 memseg_remap_init_pages(page_t *pages, page_t *epages)
2495 {
2496         page_t *pp;
2497
2498         for (pp = pages; pp < epages; pp++) {
2499                 pp->p_pagenum = PFN_INVALID;    /* XXXX */
2500                 pp->p_offset = (u_offset_t)-1;
2501                 page_iolock_init(pp);
2502                 while (!page_lock(pp, SE_EXCL, (kmutex_t *)NULL, P_RECLAIM))
2503                         continue;
2504                 page_lock_delete(pp);
2505         }
2506 }
2507
2508 void
2509 memseg_remap_init()
2510 {
2511         mutex_enter(&pp_dummy_lock);
2512         if (pp_dummy == NULL) {
2513                 uint_t dpages;
2514                 int i;
2515
2516                 /*
2517                  * dpages starts off as the size of the structure and
2518                  * ends up as the minimum number of pages that will
2519                  * hold a whole number of page_t structures.
2520                  */
2521                 dpages = sizeof (page_t);
2522                 ASSERT(dpages != 0);
2523                 ASSERT(dpages <= MMU_PAGESIZE);
2524
2525                 while ((dpages & 1) == 0)
2526                         dpages >>= 1;
2527
2528                 pp_dummy_npages = dpages;
2529                 /*
2530                  * Allocate pp_dummy pages directly from static_arena,
2531                  * since these are whole page allocations and are
2532                  * referenced by physical address.  This also has the
2533                  * nice fringe benefit of hiding the memory from
2534                  * ::findleaks since it doesn't deal well with allocated
2535                  * kernel heap memory that doesn't have any mappings.
2536                  */
2537                 pp_dummy = vmem_xalloc(static_arena, ptob(pp_dummy_npages),
2538                     PAGESIZE, 0, 0, NULL, NULL, VM_SLEEP);
2539                 bzero(pp_dummy, ptob(pp_dummy_npages));
2540                 ASSERT(((uintptr_t)pp_dummy & MMU_PAGEOFFSET) == 0);
2541                 pp_dummy_pfn = kmem_alloc(sizeof (*pp_dummy_pfn) *
2542                     pp_dummy_npages, KM_SLEEP);
2543                 for (i = 0; i < pp_dummy_npages; i++) {
2544                         pp_dummy_pfn[i] = hat_getpfnum(kas.a_hat,
2545                             &pp_dummy[MMU_PAGESIZE * i]);
2546                         ASSERT(pp_dummy_pfn[i] != PFN_INVALID);
2547                 }
2548                 /*
2549                  * Initialize the page_t's to a known 'deleted' state
2550                  * that matches the state of deleted pages.
2551                  */
2552                 memseg_remap_init_pages((page_t *)pp_dummy,
2553                     (page_t *)(pp_dummy + ptob(pp_dummy_npages)));
2554                 /* Remove kmem mappings for the pages for safety. */
2555                 hat_unload(kas.a_hat, pp_dummy, ptob(pp_dummy_npages),
2556                     HAT_UNLOAD_UNLOCK);
2557                 /* Leave pp_dummy pointer set as flag that init is done. */
2558         }
2559         mutex_exit(&pp_dummy_lock);
2560 }
2561
2562 /*
2563  * Remap a page-aglined range of page_t's to dummy pages.
2564  */
2565 void
2566 remap_to_dummy(caddr_t va, pgcnt_t metapgs)
2567 {
2568         int phase;
2569
2570         ASSERT(IS_P2ALIGNED((uint64_t)(uintptr_t)va, PAGESIZE));
2571
2572         /*
2573          * We may start remapping at a non-zero page offset
2574          * within the dummy pages since the low/high ends
2575          * of the outgoing pp's could be shared by other
2576          * memsegs (see memseg_remap_meta).
2577          */
2578         phase = btop((uint64_t)(uintptr_t)va) % pp_dummy_npages;
2579         /*CONSTCOND*/
2580         ASSERT(PAGESIZE % sizeof (page_t) || phase == 0);
2581
2582         while (metapgs != 0) {
2583                 pgcnt_t n;
2584                 int i, j;
2585
2586                 n = pp_dummy_npages;
2587                 if (n > metapgs)
2588                         n = metapgs;
2589                 for (i = 0; i < n; i++) {
2590                         j = (i + phase) % pp_dummy_npages;
2591                         hat_devload(kas.a_hat, va, ptob(1), pp_dummy_pfn[j],
2592                             PROT_READ,
2593                             HAT_LOAD | HAT_LOAD_NOCONSIST |
2594                             HAT_LOAD_REMAP);
2595                         va += ptob(1);
2596                 }
2597                 metapgs -= n;
2598         }
2599 }
2600
2601 static void
2602 memseg_remap_to_dummy(struct memseg *seg)
2603 {
2604         caddr_t pp;
2605         pgcnt_t metapgs;
2606
2607         ASSERT(memseg_is_dynamic(seg));
2608         ASSERT(pp_dummy != NULL);
2609
2610
2611         if (!memseg_includes_meta(seg)) {
2612                 memseg_remap_meta(seg);
2613                 return;
2614         }
2615
2616         pp = (caddr_t)seg->pages;
2617         metapgs = seg->pages_base - memseg_get_start(seg);
2618         ASSERT(metapgs != 0);
2619
2620         seg->pages_end = seg->pages_base;
2621
2622         remap_to_dummy(pp, metapgs);
2623 }
2624
2625 /*
2626  * Transition all the deleted pages to the deleted state so that
2627  * page_lock will not wait. The page_lock_delete call will
2628  * also wake up any waiters.
2629  */
2630 static void
2631 memseg_lock_delete_all(struct memseg *seg)
2632 {
2633         page_t *pp;
2634
2635         for (pp = seg->pages; pp < seg->epages; pp++) {
2636                 pp->p_pagenum = PFN_INVALID;    /* XXXX */
2637                 page_lock_delete(pp);
2638         }
2639 }
2640
/*
 * Final bookkeeping for a memory delete, called once the pages have been
 * collected.
 *
 * Phase 1 (under memsegs_lock): unlink every memseg that overlaps a
 * delete span from the global memsegs list, rebuild the pfn hash, and
 * shrink total_pages.
 *
 * Phase 2 (mh_mutex dropped): for each unlinked memseg, put its page_t's
 * in the deleted state, dispose of the memseg (remap metadata to the
 * dummy pages, or park it on memseg_delete_junk), and remove its range
 * from the phys_avail and phys_install memlists.
 *
 * Phase 3: refresh physmax/physinstalled, subtract mh_vm_pages from
 * maxmem, physmem and availrmem_initial, resize the dump, and log the
 * new totals.
 *
 * mhp->mh_mutex is released part way through and re-taken just before
 * returning.
 */
static void
kphysm_del_cleanup(struct mem_handle *mhp)
{
        struct memdelspan       *mdsp;
        struct memseg           *seg;
        struct memseg           **segpp;
        struct memseg           *seglist;
        pfn_t                   p_end;
        uint64_t                avmem;
        pgcnt_t                 avpgs;
        pgcnt_t                 npgs;

        avpgs = mhp->mh_vm_pages;

        memsegs_lock(1);

        /*
         * remove from main segment list.
         */
        npgs = 0;
        seglist = NULL;
        for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
            mdsp = mdsp->mds_next) {
                p_end = mdsp->mds_base + mdsp->mds_npgs;
                /*
                 * segpp always points at the link that refers to seg,
                 * so a removal only needs to rewrite *segpp and must
                 * not advance it.
                 */
                for (segpp = &memsegs; (seg = *segpp) != NULL; ) {
                        if (seg->pages_base >= p_end ||
                            seg->pages_end <= mdsp->mds_base) {
                                /* Span and memseg don't overlap. */
                                segpp = &((*segpp)->next);
                                continue;
                        }
                        /* An overlapping memseg must lie wholly inside. */
                        ASSERT(seg->pages_base >= mdsp->mds_base);
                        ASSERT(seg->pages_end <= p_end);

                        /*
                         * The count passed is base minus end, i.e. the
                         * memseg's page count negated.
                         */
                        PLCNT_MODIFY_MAX(seg->pages_base,
                            seg->pages_base - seg->pages_end);

                        /* Hide the memseg from future scans. */
                        hat_kpm_delmem_mseg_update(seg, segpp);
                        *segpp = seg->next;
                        membar_producer();      /* TODO: Needed? */
                        npgs += MSEG_NPAGES(seg);

                        /*
                         * Leave the deleted segment's next pointer intact
                         * in case a memsegs scanning loop is walking this
                         * segment concurrently.
                         */
                        /* Chain removed memsegs privately through lnext. */
                        seg->lnext = seglist;
                        seglist = seg;
                }
        }

        build_pfn_hash();

        ASSERT(npgs < total_pages);
        total_pages -= npgs;

        /*
         * Recalculate the paging parameters now total_pages has changed.
         * This will also cause the clock hands to be reset before next use.
         */
        setupclock();

        memsegs_unlock(1);

        /* The handle mutex is not held for the remainder of the cleanup. */
        mutex_exit(&mhp->mh_mutex);

        /* Dispose of each memseg that was unlinked above. */
        while ((seg = seglist) != NULL) {
                pfn_t mseg_start;
                pfn_t mseg_base, mseg_end;
                pgcnt_t mseg_npgs;
                int mlret;

                seglist = seg->lnext;

                /*
                 * Put the page_t's into the deleted state to stop
                 * cv_wait()s on the pages. When we remap, the dummy
                 * page_t's will be in the same state.
                 */
                memseg_lock_delete_all(seg);
                /*
                 * Collect up information based on pages_base and pages_end
                 * early so that we can flag early that the memseg has been
                 * deleted by setting pages_end == pages_base.
                 */
                mseg_base = seg->pages_base;
                mseg_end = seg->pages_end;
                mseg_npgs = MSEG_NPAGES(seg);
                mseg_start = memseg_get_start(seg);

                if (memseg_is_dynamic(seg)) {
                        /* Remap the meta data to our special dummy area. */
                        memseg_remap_to_dummy(seg);

                        /* Push the memseg onto the memseg_va_avail list. */
                        mutex_enter(&memseg_lists_lock);
                        seg->lnext = memseg_va_avail;
                        memseg_va_avail = seg;
                        mutex_exit(&memseg_lists_lock);
                } else {
                        /*
                         * For memory whose page_ts were allocated
                         * at boot, we need to find a new use for
                         * the page_t memory.
                         * For the moment, just leak it.
                         * (It is held in the memseg_delete_junk list.)
                         */
                        seg->pages_end = seg->pages_base;

                        mutex_enter(&memseg_lists_lock);
                        seg->lnext = memseg_delete_junk;
                        memseg_delete_junk = seg;
                        mutex_exit(&memseg_lists_lock);
                }

                /* Must not use seg now as it could be re-used. */

                memlist_write_lock();

                /* phys_avail loses [pages_base, pages_end). */
                mlret = memlist_delete_span(
                    (uint64_t)(mseg_base) << PAGESHIFT,
                    (uint64_t)(mseg_npgs) << PAGESHIFT,
                    &phys_avail);
                ASSERT(mlret == MEML_SPANOP_OK);

                /*
                 * phys_install loses the range starting at
                 * memseg_get_start(), which may precede pages_base.
                 */
                mlret = memlist_delete_span(
                    (uint64_t)(mseg_start) << PAGESHIFT,
                    (uint64_t)(mseg_end - mseg_start) <<
                    PAGESHIFT,
                    &phys_install);
                ASSERT(mlret == MEML_SPANOP_OK);
                phys_install_has_changed();

                memlist_write_unlock();
        }

        /* Recompute physmax and physinstalled from the trimmed list. */
        memlist_read_lock();
        installed_top_size(phys_install, &physmax, &physinstalled);
        memlist_read_unlock();

        mutex_enter(&freemem_lock);
        maxmem -= avpgs;
        physmem -= avpgs;
        /* availrmem is adjusted during the delete. */
        availrmem_initial -= avpgs;

        mutex_exit(&freemem_lock);

        dump_resize();

        /* The leading '?' is part of the message format passed to cmn_err. */
        cmn_err(CE_CONT, "?kphysm_delete: mem = %ldK "
            "(0x%" PRIx64 ")\n",
            physinstalled << (PAGESHIFT - 10),
            (uint64_t)physinstalled << PAGESHIFT);

        avmem = (uint64_t)freemem << PAGESHIFT;
        cmn_err(CE_CONT, "?kphysm_delete: "
            "avail mem = %" PRId64 "\n", avmem);

        /*
         * Update lgroup generation number on single lgroup systems
         */
        if (nlgrps == 1)
                lgrp_config(LGRP_CONFIG_GEN_UPDATE, 0, 0);

        /* Successfully deleted system memory */
        /* Re-take the handle mutex that was dropped above. */
        mutex_enter(&mhp->mh_mutex);
}
2810
2811 static uint_t mdel_nullvp_waiter;
2812
2813 static void
2814 page_delete_collect(
2815         page_t *pp,
2816         struct mem_handle *mhp)
2817 {
2818         if (pp->p_vnode) {
2819                 page_hashout(pp, (kmutex_t *)NULL);
2820                 /* do not do PP_SETAGED(pp); */
2821         } else {
2822                 kmutex_t *sep;
2823
2824                 sep = page_se_mutex(pp);
2825                 mutex_enter(sep);
2826                 if (CV_HAS_WAITERS(&pp->p_cv)) {
2827                         mdel_nullvp_waiter++;
2828                         cv_broadcast(&pp->p_cv);
2829                 }
2830                 mutex_exit(sep);
2831         }
2832         ASSERT(pp->p_next == pp->p_prev);
2833         ASSERT(pp->p_next == NULL || pp->p_next == pp);
2834         pp->p_next = mhp->mh_deleted;
2835         mhp->mh_deleted = pp;
2836         ASSERT(mhp->mh_hold_todo != 0);
2837         mhp->mh_hold_todo--;
2838 }
2839
2840 static void
2841 transit_list_collect(struct mem_handle *mhp, int v)
2842 {
2843         struct transit_list_head *trh;
2844
2845         trh = &transit_list_head;
2846         mutex_enter(&trh->trh_lock);
2847         mhp->mh_transit.trl_collect = v;
2848         mutex_exit(&trh->trh_lock);
2849 }
2850
2851 static void
2852 transit_list_insert(struct transit_list *tlp)
2853 {
2854         struct transit_list_head *trh;
2855
2856         trh = &transit_list_head;
2857         ASSERT(MUTEX_HELD(&trh->trh_lock));
2858         tlp->trl_next = trh->trh_head;
2859         trh->trh_head = tlp;
2860 }
2861
2862 static void
2863 transit_list_remove(struct transit_list *tlp)
2864 {
2865         struct transit_list_head *trh;
2866         struct transit_list **tlpp;
2867
2868         trh = &transit_list_head;
2869         tlpp = &trh->trh_head;
2870         ASSERT(MUTEX_HELD(&trh->trh_lock));
2871         while (*tlpp != NULL && *tlpp != tlp)
2872                 tlpp = &(*tlpp)->trl_next;
2873         ASSERT(*tlpp != NULL);
2874         if (*tlpp == tlp)
2875                 *tlpp = tlp->trl_next;
2876         tlp->trl_next = NULL;
2877 }
2878
2879 static struct transit_list *
2880 pfnum_to_transit_list(struct transit_list_head *trh, pfn_t pfnum)
2881 {
2882         struct transit_list *tlp;
2883
2884         for (tlp = trh->trh_head; tlp != NULL; tlp = tlp->trl_next) {
2885                 struct memdelspan *mdsp;
2886
2887                 for (mdsp = tlp->trl_spans; mdsp != NULL;
2888                     mdsp = mdsp->mds_next) {
2889                         if (pfnum >= mdsp->mds_base &&
2890                             pfnum < (mdsp->mds_base + mdsp->mds_npgs)) {
2891                                 return (tlp);
2892                         }
2893                 }
2894         }
2895         return (NULL);
2896 }
2897
2898 int
2899 pfn_is_being_deleted(pfn_t pfnum)
2900 {
2901         struct transit_list_head *trh;
2902         struct transit_list *tlp;
2903         int ret;
2904
2905         trh = &transit_list_head;
2906         if (trh->trh_head == NULL)
2907                 return (0);
2908
2909         mutex_enter(&trh->trh_lock);
2910         tlp = pfnum_to_transit_list(trh, pfnum);
2911         ret = (tlp != NULL && tlp->trl_collect);
2912         mutex_exit(&trh->trh_lock);
2913
2914         return (ret);
2915 }
2916
#ifdef MEM_DEL_STATS
extern int hz;

/*
 * Print the delete statistics gathered in mhp->mh_delstat.  Nothing is
 * printed unless mem_del_stat_print is non-zero.  The header line
 * identifies the delete by the base and page count of the handle's
 * first span.
 */
static void
mem_del_stat_print_func(struct mem_handle *mhp)
{
        uint64_t secs;

        if (!mem_del_stat_print)
                return;

        printf("memory delete loop %x/%x, statistics%s\n",
            (uint_t)mhp->mh_transit.trl_spans->mds_base,
            (uint_t)mhp->mh_transit.trl_spans->mds_npgs,
            (mhp->mh_cancel ? " (cancelled)" : ""));

        /* Event counters. */
        printf("\t%8u nloop\n", mhp->mh_delstat.nloop);
        printf("\t%8u need_free\n", mhp->mh_delstat.need_free);
        printf("\t%8u free_loop\n", mhp->mh_delstat.free_loop);
        printf("\t%8u free_low\n", mhp->mh_delstat.free_low);
        printf("\t%8u free_failed\n", mhp->mh_delstat.free_failed);
        printf("\t%8u ncheck\n", mhp->mh_delstat.ncheck);
        printf("\t%8u nopaget\n", mhp->mh_delstat.nopaget);
        printf("\t%8u lockfail\n", mhp->mh_delstat.lockfail);
        printf("\t%8u nfree\n", mhp->mh_delstat.nfree);
        printf("\t%8u nreloc\n", mhp->mh_delstat.nreloc);
        printf("\t%8u nrelocfail\n", mhp->mh_delstat.nrelocfail);
        printf("\t%8u already_done\n", mhp->mh_delstat.already_done);
        printf("\t%8u first_notfree\n", mhp->mh_delstat.first_notfree);
        printf("\t%8u npplocked\n", mhp->mh_delstat.npplocked);
        printf("\t%8u nlockreloc\n", mhp->mh_delstat.nlockreloc);
        printf("\t%8u nnorepl\n", mhp->mh_delstat.nnorepl);
        printf("\t%8u nmodreloc\n", mhp->mh_delstat.nmodreloc);
        printf("\t%8u ndestroy\n", mhp->mh_delstat.ndestroy);
        printf("\t%8u nputpage\n", mhp->mh_delstat.nputpage);
        printf("\t%8u nnoreclaim\n", mhp->mh_delstat.nnoreclaim);
        printf("\t%8u ndelay\n", mhp->mh_delstat.ndelay);
        printf("\t%8u demotefail\n", mhp->mh_delstat.demotefail);
        printf("\t%8u retired\n", mhp->mh_delstat.retired);
        printf("\t%8u toxic\n", mhp->mh_delstat.toxic);
        printf("\t%8u failing\n", mhp->mh_delstat.failing);
        printf("\t%8u modtoxic\n", mhp->mh_delstat.modtoxic);
        printf("\t%8u npplkdtoxic\n", mhp->mh_delstat.npplkdtoxic);
        printf("\t%8u gptlmodfail\n", mhp->mh_delstat.gptlmodfail);
        printf("\t%8u gptllckfail\n", mhp->mh_delstat.gptllckfail);

        /* Tick totals, also shown as minutes and seconds. */
        secs = mhp->mh_delstat.nticks_total / hz;
        printf(
            "\t%"PRIu64" nticks_total - %"PRIu64" min %"PRIu64" sec\n",
            mhp->mh_delstat.nticks_total, secs / 60, secs % 60);

        secs = mhp->mh_delstat.nticks_pgrp / hz;
        printf(
            "\t%"PRIu64" nticks_pgrp - %"PRIu64" min %"PRIu64" sec\n",
            mhp->mh_delstat.nticks_pgrp, secs / 60, secs % 60);
}
#endif /* MEM_DEL_STATS */
2970
/*
 * One entry in the table of registered memory setup callbacks.
 * kphysm_setup_func_register() treats an entry whose vec is NULL as
 * free for reuse.
 */
struct mem_callback {
        kphysm_setup_vector_t   *vec;   /* callback vector, NULL if unused */
        void                    *arg;   /* argument registered with vec */
};

/* Capacity of the mem_callbacks[] table. */
#define NMEMCALLBACKS           100

static struct mem_callback mem_callbacks[NMEMCALLBACKS];
/* Number of leading mem_callbacks[] entries that are scanned. */
static uint_t nmemcallbacks;
/* Taken as writer by kphysm_setup_func_register() around table updates. */
static krwlock_t mem_callback_rwlock;
2981
2982 int
2983 kphysm_setup_func_register(kphysm_setup_vector_t *vec, void *arg)
2984 {
2985         uint_t i, found;
2986
2987         /*
2988          * This test will become more complicated when the version must
2989          * change.
2990          */
2991         if (vec->version != KPHYSM_SETUP_VECTOR_VERSION)
2992                 return (EINVAL);
2993
2994         if (vec->post_add == NULL || vec->pre_del == NULL ||
2995             vec->post_del == NULL)
2996                 return (EINVAL);
2997
2998         rw_enter(&mem_callback_rwlock, RW_WRITER);
2999         for (i = 0, found = 0; i < nmemcallbacks; i++) {
3000                 if (mem_callbacks[i].vec == NULL && found == 0)
3001                         found = i + 1;
3002                 if (mem_callbacks[i].vec == vec &&
3003                     mem_callbacks[i].arg == arg) {
3004 #ifdef DEBUG
3005                         /* Catch this in DEBUG kernels. */
3006                         cmn_err(CE_WARN, "kphysm_setup_func_register"
3007                             "(0x%p, 0x%p) duplicate registration from 0x%p",
3008                             (void *)vec, arg, (void *)caller());
3009 #endif /* DEBUG */
3010                         rw_exit(&mem_callback_rwlock);
3011                         return (EEXIST);
3012                 }
3013         }
3014         if (found != 0) {
3015                 i = found - 1;
3016         } else {
3017                 ASSERT(nmemcallbacks < NMEMCALLBACKS);
3018                 if (nmemcallbacks == NMEMCALLBACKS) {
3019                         rw_exit(&mem_callback_rwlock);
3020                         return (ENOMEM);
3021                 }
3022                 i = nmemcallbacks++;
3023         }
3024         mem_callbacks[i].vec = vec;
3025         mem_callbacks[i].arg = arg;
3026         rw_exit(&mem_callback_rwlock);
3027         return (0);
3028 }
3029
3030 void
3031 kphysm_setup_func_unregister(kphysm_setup_vector_t *vec, void *arg)
3032 {
3033         uint_t i;
3034
3035         rw_enter(&mem_callback_rwlock, RW_WRITER);
3036         for (i = 0; i < nmemcallbacks; i++) {
3037                 if (mem_callbacks[i].vec == vec &&
3038                     mem_callbacks[i].arg == arg) {
3039                         mem_callbacks[i].vec = NULL;
3040                         mem_callbacks[i].arg = NULL;
3041                         if (i == (nmemcallbacks - 1))
3042                                 nmemcallbacks--;
3043                         break;
3044                 }
3045         }
3046         rw_exit(&mem_callback_rwlock);
3047 }
3048
3049 static void
3050 kphysm_setup_post_add(pgcnt_t delta_pages)
3051 {
3052         uint_t i;
3053
3054         rw_enter(&mem_callback_rwlock, RW_READER);
3055         for (i = 0; i < nmemcallbacks; i++) {
3056                 if (mem_callbacks[i].vec != NULL) {
3057                         (*mem_callbacks[i].vec->post_add)
3058                             (mem_callbacks[i].arg, delta_pages);
3059                 }
3060         }
3061         rw_exit(&mem_callback_rwlock);
3062 }
3063
3064 /*
3065  * Note the locking between pre_del and post_del: The reader lock is held
3066  * between the two calls to stop the set of functions from changing.
3067  */
3068
3069 static int
3070 kphysm_setup_pre_del(pgcnt_t delta_pages)
3071 {
3072         uint_t i;
3073         int ret;
3074         int aret;
3075
3076         ret = 0;
3077         rw_enter(&mem_callback_rwlock, RW_READER);
3078         for (i = 0; i < nmemcallbacks; i++) {
3079                 if (mem_callbacks[i].vec != NULL) {
3080                         aret = (*mem_callbacks[i].vec->pre_del)
3081                             (mem_callbacks[i].arg, delta_pages);
3082                         ret |= aret;
3083                 }
3084         }
3085
3086         return (ret);
3087 }
3088
3089 static void
3090 kphysm_setup_post_del(pgcnt_t delta_pages, int cancelled)
3091 {
3092         uint_t i;
3093
3094         for (i = 0; i < nmemcallbacks; i++) {
3095                 if (mem_callbacks[i].vec != NULL) {
3096                         (*mem_callbacks[i].vec->post_del)
3097                             (mem_callbacks[i].arg, delta_pages, cancelled);
3098                 }
3099         }
3100         rw_exit(&mem_callback_rwlock);
3101 }
3102
3103 static int
3104 kphysm_split_memseg(
3105         pfn_t base,
3106         pgcnt_t npgs)
3107 {
3108         struct memseg *seg;
3109         struct memseg **segpp;
3110         pgcnt_t size_low, size_high;
3111         struct memseg *seg_low, *seg_mid, *seg_high;
3112
3113         /*
3114          * Lock the memsegs list against other updates now
3115          */
3116         memsegs_lock(1);
3117
3118         /*
3119          * Find boot time memseg that wholly covers this area.
3120          */
3121
3122         /* First find the memseg with page 'base' in it. */
3123         for (segpp = &memsegs; (seg = *segpp) != NULL;
3124             segpp = &((*segpp)->next)) {
3125                 if (base >= seg->pages_base && base < seg->pages_end)
3126                         break;
3127         }
3128         if (seg == NULL) {
3129                 memsegs_unlock(1);
3130                 return (0);
3131         }
3132         if (memseg_includes_meta(seg)) {
3133                 memsegs_unlock(1);
3134                 return (0);
3135         }
3136         if ((base + npgs) > seg->pages_end) {
3137                 memsegs_unlock(1);
3138                 return (0);
3139         }
3140
3141         /*
3142          * Work out the size of the two segments that will
3143          * surround the new segment, one for low address
3144          * and one for high.
3145          */
3146         ASSERT(base >= seg->pages_base);
3147         size_low = base - seg->pages_base;
3148         ASSERT(seg->pages_end >= (base + npgs));
3149         size_high = seg->pages_end - (base + npgs);
3150
3151         /*
3152          * Sanity check.
3153          */
3154         if ((size_low + size_high) == 0) {
3155                 memsegs_unlock(1);
3156                 return (0);
3157         }
3158
3159         /*
3160          * Allocate the new structures. The old memseg will not be freed
3161          * as there may be a reference to it.
3162          */
3163         seg_low = NULL;
3164         seg_high = NULL;
3165
3166         if (size_low != 0)
3167                 seg_low = memseg_alloc();
3168
3169         seg_mid = memseg_alloc();
3170
3171         if (size_high != 0)
3172                 seg_high = memseg_alloc();
3173
3174         /*
3175          * All allocation done now.
3176          */
3177         if (size_low != 0) {
3178                 seg_low->pages = seg->pages;
3179                 seg_low->epages = seg_low->pages + size_low;
3180                 seg_low->pages_base = seg->pages_base;
3181                 seg_low->pages_end = seg_low->pages_base + size_low;
3182                 seg_low->next = seg_mid;
3183                 seg_low->msegflags = seg->msegflags;
3184         }
3185         if (size_high != 0) {
3186                 seg_high->pages = seg->epages - size_high;
3187                 seg_high->epages = seg_high->pages + size_high;
3188                 seg_high->pages_base = seg->pages_end - size_high;
3189                 seg_high->pages_end = seg_high->pages_base + size_high;
3190                 seg_high->next = seg->next;
3191                 seg_high->msegflags = seg->msegflags;
3192         }
3193
3194         seg_mid->pages = seg->pages + size_low;
3195         seg_mid->pages_base = seg->pages_base + size_low;
3196         seg_mid->epages = seg->epages - size_high;
3197         seg_mid->pages_end = seg->pages_end - size_high;
3198         seg_mid->next = (seg_high != NULL) ? seg_high : seg->next;
3199         seg_mid->msegflags = seg->msegflags;
3200
3201         /*
3202          * Update hat_kpm specific info of all involved memsegs and
3203          * allow hat_kpm specific global chain updates.
3204          */
3205         hat_kpm_split_mseg_update(seg, segpp, seg_low, seg_mid, seg_high);
3206
3207         /*
3208          * At this point we have two equivalent memseg sub-chains,
3209          * seg and seg_low/seg_mid/seg_high, which both chain on to
3210          * the same place in the global chain. By re-writing the pointer
3211          * in the previous element we switch atomically from using the old
3212          * (seg) to the new.
3213          */
3214         *segpp = (seg_low != NULL) ? seg_low : seg_mid;
3215
3216         membar_enter();
3217
3218         build_pfn_hash();
3219         memsegs_unlock(1);
3220
3221         /*
3222          * We leave the old segment, 'seg', intact as there may be
3223          * references to it. Also, as the value of total_pages has not
3224          * changed and the memsegs list is effectively the same when
3225          * accessed via the old or the new pointer, we do not have to
3226          * cause pageout_scanner() to re-evaluate its hand pointers.
3227          *
3228          * We currently do not re-use or reclaim the page_t memory.
3229          * If we do, then this may have to change.
3230          */
3231
3232         mutex_enter(&memseg_lists_lock);
3233         seg->lnext = memseg_edit_junk;
3234         memseg_edit_junk = seg;
3235         mutex_exit(&memseg_lists_lock);
3236
3237         return (1);
3238 }
3239
3240 /*
3241  * The sfmmu hat layer (e.g.) accesses some parts of the memseg
3242  * structure using physical addresses. Therefore a kmem_cache is
3243  * used with KMC_NOHASH to avoid page crossings within a memseg
3244  * structure. KMC_NOHASH requires that no external (outside of
3245  * slab) information is allowed. This, in turn, implies that the
3246  * cache's slabsize must be exactly a single page, since per-slab
3247  * information (e.g. the freelist for the slab) is kept at the
3248  * end of the slab, where it is easy to locate. Should be changed
3249  * when a more obvious kmem_cache interface/flag will become
3250  * available.
3251  */
3252 void
3253 mem_config_init()
3254 {
3255         memseg_cache = kmem_cache_create("memseg_cache", sizeof (struct memseg),
3256             0, NULL, NULL, NULL, NULL, static_arena, KMC_NOHASH);
3257 }
3258
3259 struct memseg *
3260 memseg_alloc()
3261 {
3262         struct memseg *seg;
3263
3264         seg = kmem_cache_alloc(memseg_cache, KM_SLEEP);
3265         bzero(seg, sizeof (struct memseg));
3266
3267         return (seg);
3268 }
3269
3270 /*
3271  * Return whether the page_t memory for this memseg
3272  * is included in the memseg itself.
3273  */
3274 static int
3275 memseg_includes_meta(struct memseg *seg)
3276 {
3277         return (seg->msegflags & MEMSEG_META_INCL);
3278 }
3279
3280 pfn_t
3281 memseg_get_start(struct memseg *seg)
3282 {
3283         pfn_t           pt_start;
3284
3285         if (memseg_includes_meta(seg)) {
3286                 pt_start = hat_getpfnum(kas.a_hat, (caddr_t)seg->pages);
3287
3288                 /* Meta data is required to be at the beginning */
3289                 ASSERT(pt_start < seg->pages_base);
3290         } else
3291                 pt_start = seg->pages_base;
3292
3293         return (pt_start);
3294 }
3295
3296 /*
3297  * Invalidate memseg pointers in cpu private vm data caches.
3298  */
3299 static void
3300 memseg_cpu_vm_flush()
3301 {
3302         cpu_t *cp;
3303         vm_cpu_data_t *vc;
3304
3305         mutex_enter(&cpu_lock);
3306         pause_cpus(NULL, NULL);
3307
3308         cp = cpu_list;
3309         do {
3310                 vc = cp->cpu_vm_data;
3311                 vc->vc_pnum_memseg = NULL;
3312                 vc->vc_pnext_memseg = NULL;
3313
3314         } while ((cp = cp->cpu_next) != cpu_list);
3315
3316         start_cpus();
3317         mutex_exit(&cpu_lock);
3318 }