16164 Consistently wake all pageout daemon threads
[illumos-gate.git] / usr / src / uts / common / os / mem_config.c
blob9687f743a010f04523783caf53c1861767782149
1 /*
2 * CDDL HEADER START
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
19 * CDDL HEADER END
22 * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
24 * Copyright 2017 Joyent, Inc.
27 #include <sys/types.h>
28 #include <sys/cmn_err.h>
29 #include <sys/vmem.h>
30 #include <sys/kmem.h>
31 #include <sys/systm.h>
32 #include <sys/machsystm.h> /* for page_freelist_coalesce() */
33 #include <sys/errno.h>
34 #include <sys/memnode.h>
35 #include <sys/memlist.h>
36 #include <sys/memlist_impl.h>
37 #include <sys/tuneable.h>
38 #include <sys/proc.h>
39 #include <sys/disp.h>
40 #include <sys/debug.h>
41 #include <sys/vm.h>
42 #include <sys/callb.h>
43 #include <sys/memlist_plat.h> /* for installed_top_size() */
44 #include <sys/condvar_impl.h> /* for CV_HAS_WAITERS() */
45 #include <sys/dumphdr.h> /* for dump_resize() */
46 #include <sys/atomic.h> /* for use in stats collection */
47 #include <sys/rwlock.h>
48 #include <sys/cpuvar.h>
49 #include <vm/seg_kmem.h>
50 #include <vm/seg_kpm.h>
51 #include <vm/page.h>
52 #include <vm/vm_dep.h>
53 #define SUNDDI_IMPL /* so sunddi.h will not redefine splx() et al */
54 #include <sys/sunddi.h>
55 #include <sys/mem_config.h>
56 #include <sys/mem_cage.h>
57 #include <sys/lgrp.h>
58 #include <sys/ddi.h>
59 #include <sys/modctl.h>
61 extern struct memlist *phys_avail;
63 extern uint_t page_ctrs_adjust(int);
64 void page_ctrs_cleanup(void);
65 static void kphysm_setup_post_add(pgcnt_t);
66 static int kphysm_setup_pre_del(pgcnt_t);
67 static void kphysm_setup_post_del(pgcnt_t, int);
69 static int kphysm_split_memseg(pfn_t base, pgcnt_t npgs);
71 static int delspan_reserve(pfn_t, pgcnt_t);
72 static void delspan_unreserve(pfn_t, pgcnt_t);
74 kmutex_t memseg_lists_lock;
75 struct memseg *memseg_va_avail;
76 struct memseg *memseg_alloc(void);
77 static struct memseg *memseg_delete_junk;
78 static struct memseg *memseg_edit_junk;
79 void memseg_remap_init(void);
80 static void memseg_remap_to_dummy(struct memseg *);
81 static void kphysm_addmem_error_undospan(pfn_t, pgcnt_t);
82 static struct memseg *memseg_reuse(pgcnt_t);
84 static struct kmem_cache *memseg_cache;
87 * Interfaces to manage externally allocated
88 * page_t memory (metadata) for a memseg.
90 #pragma weak memseg_alloc_meta
91 #pragma weak memseg_free_meta
92 #pragma weak memseg_get_metapfn
93 #pragma weak memseg_remap_meta
95 extern int ppvm_enable;
96 extern page_t *ppvm_base;
97 extern int memseg_alloc_meta(pfn_t, pgcnt_t, void **, pgcnt_t *);
98 extern void memseg_free_meta(void *, pgcnt_t);
99 extern pfn_t memseg_get_metapfn(void *, pgcnt_t);
100 extern void memseg_remap_meta(struct memseg *);
101 static int memseg_is_dynamic(struct memseg *);
102 static int memseg_includes_meta(struct memseg *);
103 pfn_t memseg_get_start(struct memseg *);
104 static void memseg_cpu_vm_flush(void);
106 int meta_alloc_enable;
#ifdef DEBUG
static int memseg_debug;
/*
 * Debug tracing for the memseg code: prints only when memseg_debug is
 * non-zero.  The body is wrapped in do { } while (0) so that the macro
 * always expands to exactly one statement and cannot capture an "else"
 * that follows it at the call site.
 */
#define	MEMSEG_DEBUG(args...) \
	do { if (memseg_debug) printf(args); } while (0)
#else
/* Non-DEBUG kernels: tracing compiles away entirely. */
#define	MEMSEG_DEBUG(...)
#endif
116 * Add a chunk of memory to the system.
117 * base: starting PAGESIZE page of new memory.
118 * npgs: length in PAGESIZE pages.
120 * Adding mem this way doesn't increase the size of the hash tables;
121 * growing them would be too hard. This should be OK, but adding memory
122 * dynamically most likely means more hash misses, since the tables will
123 * be smaller than they otherwise would be.
126 kphysm_add_memory_dynamic(pfn_t base, pgcnt_t npgs)
128 page_t *pp;
129 page_t *opp, *oepp, *segpp;
130 struct memseg *seg;
131 uint64_t avmem;
132 pfn_t pfn;
133 pfn_t pt_base = base;
134 pgcnt_t tpgs = npgs;
135 pgcnt_t metapgs = 0;
136 int exhausted;
137 pfn_t pnum;
138 int mnode;
139 caddr_t vaddr;
140 int reuse;
141 int mlret;
142 int rv;
143 int flags;
144 int meta_alloc = 0;
145 void *mapva;
146 void *metabase = (void *)base;
147 pgcnt_t nkpmpgs = 0;
148 offset_t kpm_pages_off = 0;
150 cmn_err(CE_CONT,
151 "?kphysm_add_memory_dynamic: adding %ldK at 0x%" PRIx64 "\n",
152 npgs << (PAGESHIFT - 10), (uint64_t)base << PAGESHIFT);
155 * Add this span in the delete list to prevent interactions.
157 if (!delspan_reserve(base, npgs)) {
158 return (KPHYSM_ESPAN);
161 * Check to see if any of the memory span has been added
162 * by trying an add to the installed memory list. This
163 * forms the interlocking process for add.
166 memlist_write_lock();
168 mlret = memlist_add_span((uint64_t)(pt_base) << PAGESHIFT,
169 (uint64_t)(tpgs) << PAGESHIFT, &phys_install);
171 if (mlret == MEML_SPANOP_OK)
172 installed_top_size(phys_install, &physmax, &physinstalled);
174 memlist_write_unlock();
176 if (mlret != MEML_SPANOP_OK) {
177 if (mlret == MEML_SPANOP_EALLOC) {
178 delspan_unreserve(pt_base, tpgs);
179 return (KPHYSM_ERESOURCE);
180 } else if (mlret == MEML_SPANOP_ESPAN) {
181 delspan_unreserve(pt_base, tpgs);
182 return (KPHYSM_ESPAN);
183 } else {
184 delspan_unreserve(pt_base, tpgs);
185 return (KPHYSM_ERESOURCE);
189 if (meta_alloc_enable) {
191 * Allocate the page_t's from existing memory;
192 * if that fails, allocate from the incoming memory.
194 rv = memseg_alloc_meta(base, npgs, &metabase, &metapgs);
195 if (rv == KPHYSM_OK) {
196 ASSERT(metapgs);
197 ASSERT(btopr(npgs * sizeof (page_t)) <= metapgs);
198 meta_alloc = 1;
199 goto mapalloc;
204 * We store the page_t's for this new memory in the first
205 * few pages of the chunk. Here, we go and get'em ...
209 * The expression after the '-' gives the number of pages
210 * that will fit in the new memory based on a requirement
211 * of (PAGESIZE + sizeof (page_t)) bytes per page.
213 metapgs = npgs - (((uint64_t)(npgs) << PAGESHIFT) /
214 (PAGESIZE + sizeof (page_t)));
216 npgs -= metapgs;
217 base += metapgs;
219 ASSERT(btopr(npgs * sizeof (page_t)) <= metapgs);
221 exhausted = (metapgs == 0 || npgs == 0);
223 if (kpm_enable && !exhausted) {
224 pgcnt_t start, end, nkpmpgs_prelim;
225 size_t ptsz;
228 * A viable kpm large page mapping must not overlap two
229 * dynamic memsegs. Therefore the total size is checked
230 * to be at least kpm_pgsz and also whether start and end
231 * points are at least kpm_pgsz aligned.
233 if (ptokpmp(tpgs) < 1 || pmodkpmp(pt_base) ||
234 pmodkpmp(base + npgs)) {
236 kphysm_addmem_error_undospan(pt_base, tpgs);
239 * There is no specific error code for violating
240 * kpm granularity constraints.
242 return (KPHYSM_ENOTVIABLE);
245 start = kpmptop(ptokpmp(base));
246 end = kpmptop(ptokpmp(base + npgs));
247 nkpmpgs_prelim = ptokpmp(end - start);
248 ptsz = npgs * sizeof (page_t);
249 metapgs = btopr(ptsz + nkpmpgs_prelim * KPMPAGE_T_SZ);
250 exhausted = (tpgs <= metapgs);
251 if (!exhausted) {
252 npgs = tpgs - metapgs;
253 base = pt_base + metapgs;
255 /* final nkpmpgs */
256 start = kpmptop(ptokpmp(base));
257 nkpmpgs = ptokpmp(end - start);
258 kpm_pages_off = ptsz +
259 (nkpmpgs_prelim - nkpmpgs) * KPMPAGE_T_SZ;
264 * Is memory area supplied too small?
266 if (exhausted) {
267 kphysm_addmem_error_undospan(pt_base, tpgs);
269 * There is no specific error code for 'too small'.
271 return (KPHYSM_ERESOURCE);
274 mapalloc:
276 * We may re-use a previously allocated VA space for the page_ts
277 * eventually, but we need to initialize and lock the pages first.
281 * Get an address in the kernel address map, map
282 * the page_t pages and see if we can touch them.
285 mapva = vmem_alloc(heap_arena, ptob(metapgs), VM_NOSLEEP);
286 if (mapva == NULL) {
287 cmn_err(CE_WARN, "kphysm_add_memory_dynamic:"
288 " Can't allocate VA for page_ts");
290 if (meta_alloc)
291 memseg_free_meta(metabase, metapgs);
292 kphysm_addmem_error_undospan(pt_base, tpgs);
294 return (KPHYSM_ERESOURCE);
296 pp = mapva;
298 if (physmax < (pt_base + tpgs))
299 physmax = (pt_base + tpgs);
302 * In the remapping code we map one page at a time so we must do
303 * the same here to match mapping sizes.
305 pfn = pt_base;
306 vaddr = (caddr_t)pp;
307 for (pnum = 0; pnum < metapgs; pnum++) {
308 if (meta_alloc)
309 pfn = memseg_get_metapfn(metabase, (pgcnt_t)pnum);
310 hat_devload(kas.a_hat, vaddr, ptob(1), pfn,
311 PROT_READ | PROT_WRITE,
312 HAT_LOAD | HAT_LOAD_LOCK | HAT_LOAD_NOCONSIST);
313 pfn++;
314 vaddr += ptob(1);
317 if (ddi_peek32((dev_info_t *)NULL,
318 (int32_t *)pp, (int32_t *)0) == DDI_FAILURE) {
320 cmn_err(CE_WARN, "kphysm_add_memory_dynamic:"
321 " Can't access pp array at 0x%p [phys 0x%lx]",
322 (void *)pp, pt_base);
324 hat_unload(kas.a_hat, (caddr_t)pp, ptob(metapgs),
325 HAT_UNLOAD_UNMAP|HAT_UNLOAD_UNLOCK);
327 vmem_free(heap_arena, mapva, ptob(metapgs));
328 if (meta_alloc)
329 memseg_free_meta(metabase, metapgs);
330 kphysm_addmem_error_undospan(pt_base, tpgs);
332 return (KPHYSM_EFAULT);
336 * Add this memory slice to its memory node translation.
338 * Note that right now, each node may have only one slice;
339 * this may change with COD or in larger SSM systems with
340 * nested latency groups, so we must not assume that the
341 * node does not yet exist.
343 * Note that there may be multiple memory nodes associated with
344 * a single lgrp node on x86 systems.
346 pnum = pt_base + tpgs - 1;
347 mem_node_add_range(pt_base, pnum);
350 * Allocate or resize page counters as necessary to accommodate
351 * the increase in memory pages.
353 mnode = PFN_2_MEM_NODE(pnum);
354 PAGE_CTRS_ADJUST(base, npgs, rv);
355 if (rv) {
357 mem_node_del_range(pt_base, pnum);
359 /* cleanup the page counters */
360 page_ctrs_cleanup();
362 hat_unload(kas.a_hat, (caddr_t)pp, ptob(metapgs),
363 HAT_UNLOAD_UNMAP|HAT_UNLOAD_UNLOCK);
365 vmem_free(heap_arena, mapva, ptob(metapgs));
366 if (meta_alloc)
367 memseg_free_meta(metabase, metapgs);
368 kphysm_addmem_error_undospan(pt_base, tpgs);
370 return (KPHYSM_ERESOURCE);
374 * Update the phys_avail memory list.
375 * The phys_install list was done at the start.
378 memlist_write_lock();
380 mlret = memlist_add_span((uint64_t)(base) << PAGESHIFT,
381 (uint64_t)(npgs) << PAGESHIFT, &phys_avail);
382 ASSERT(mlret == MEML_SPANOP_OK);
384 memlist_write_unlock();
386 /* See if we can find a memseg to re-use. */
387 if (meta_alloc) {
388 seg = memseg_reuse(0);
389 reuse = 1; /* force unmapping of temp mapva */
390 flags = MEMSEG_DYNAMIC | MEMSEG_META_ALLOC;
392 * There is a 1:1 fixed relationship between a pfn
393 * and a page_t VA. The pfn is used as an index into
394 * the ppvm_base page_t table in order to calculate
395 * the page_t base address for a given pfn range.
397 segpp = ppvm_base + base;
398 } else {
399 seg = memseg_reuse(metapgs);
400 reuse = (seg != NULL);
401 flags = MEMSEG_DYNAMIC | MEMSEG_META_INCL;
402 segpp = pp;
406 * Initialize the memseg structure representing this memory
407 * and add it to the existing list of memsegs. Do some basic
408 * initialization and add the memory to the system.
409 * In order to prevent lock deadlocks, the add_physmem()
410 * code is repeated here, but split into several stages.
412 * If a memseg is reused, invalidate memseg pointers in
413 * all cpu vm caches. We need to do this this since the check
414 * pp >= seg->pages && pp < seg->epages
415 * used in various places is not atomic and so the first compare
416 * can happen before reuse and the second compare after reuse.
417 * The invalidation ensures that a memseg is not deferenced while
418 * it's page/pfn pointers are changing.
420 if (seg == NULL) {
421 seg = memseg_alloc();
422 ASSERT(seg != NULL);
423 seg->msegflags = flags;
424 MEMSEG_DEBUG("memseg_get: alloc seg=0x%p, pages=0x%p",
425 (void *)seg, (void *)(seg->pages));
426 seg->pages = segpp;
427 } else {
428 ASSERT(seg->msegflags == flags);
429 ASSERT(seg->pages_base == seg->pages_end);
430 MEMSEG_DEBUG("memseg_get: reuse seg=0x%p, pages=0x%p",
431 (void *)seg, (void *)(seg->pages));
432 if (meta_alloc) {
433 memseg_cpu_vm_flush();
434 seg->pages = segpp;
438 seg->epages = seg->pages + npgs;
439 seg->pages_base = base;
440 seg->pages_end = base + npgs;
443 * Initialize metadata. The page_ts are set to locked state
444 * ready to be freed.
446 bzero((caddr_t)pp, ptob(metapgs));
448 pfn = seg->pages_base;
449 /* Save the original pp base in case we reuse a memseg. */
450 opp = pp;
451 oepp = opp + npgs;
452 for (pp = opp; pp < oepp; pp++) {
453 pp->p_pagenum = pfn;
454 pfn++;
455 page_iolock_init(pp);
456 while (!page_lock(pp, SE_EXCL, (kmutex_t *)NULL, P_RECLAIM))
457 continue;
458 pp->p_offset = (u_offset_t)-1;
461 if (reuse) {
462 /* Remap our page_ts to the re-used memseg VA space. */
463 pfn = pt_base;
464 vaddr = (caddr_t)seg->pages;
465 for (pnum = 0; pnum < metapgs; pnum++) {
466 if (meta_alloc)
467 pfn = memseg_get_metapfn(metabase,
468 (pgcnt_t)pnum);
469 hat_devload(kas.a_hat, vaddr, ptob(1), pfn,
470 PROT_READ | PROT_WRITE,
471 HAT_LOAD_REMAP | HAT_LOAD | HAT_LOAD_NOCONSIST);
472 pfn++;
473 vaddr += ptob(1);
476 hat_unload(kas.a_hat, (caddr_t)opp, ptob(metapgs),
477 HAT_UNLOAD_UNMAP|HAT_UNLOAD_UNLOCK);
479 vmem_free(heap_arena, mapva, ptob(metapgs));
482 hat_kpm_addmem_mseg_update(seg, nkpmpgs, kpm_pages_off);
484 memsegs_lock(1);
487 * The new memseg is inserted at the beginning of the list.
488 * Not only does this save searching for the tail, but in the
489 * case of a re-used memseg, it solves the problem of what
490 * happens if some process has still got a pointer to the
491 * memseg and follows the next pointer to continue traversing
492 * the memsegs list.
495 hat_kpm_addmem_mseg_insert(seg);
497 seg->next = memsegs;
498 membar_producer();
500 hat_kpm_addmem_memsegs_update(seg);
502 memsegs = seg;
504 build_pfn_hash();
506 total_pages += npgs;
509 * Recalculate the paging parameters now total_pages has changed.
510 * This will also cause the clock hands to be reset before next use.
512 setupclock();
514 memsegs_unlock(1);
516 PLCNT_MODIFY_MAX(seg->pages_base, (long)npgs);
519 * Free the pages outside the lock to avoid locking loops.
521 for (pp = seg->pages; pp < seg->epages; pp++) {
522 page_free(pp, 1);
526 * Now that we've updated the appropriate memory lists we
527 * need to reset a number of globals, since we've increased memory.
528 * Several have already been updated for us as noted above. The
529 * globals we're interested in at this point are:
530 * physmax - highest page frame number.
531 * physinstalled - number of pages currently installed (done earlier)
532 * maxmem - max free pages in the system
533 * physmem - physical memory pages available
534 * availrmem - real memory available
537 mutex_enter(&freemem_lock);
538 maxmem += npgs;
539 physmem += npgs;
540 availrmem += npgs;
541 availrmem_initial += npgs;
543 mutex_exit(&freemem_lock);
545 dump_resize();
547 page_freelist_coalesce_all(mnode);
549 kphysm_setup_post_add(npgs);
551 cmn_err(CE_CONT, "?kphysm_add_memory_dynamic: mem = %ldK "
552 "(0x%" PRIx64 ")\n",
553 physinstalled << (PAGESHIFT - 10),
554 (uint64_t)physinstalled << PAGESHIFT);
556 avmem = (uint64_t)freemem << PAGESHIFT;
557 cmn_err(CE_CONT, "?kphysm_add_memory_dynamic: "
558 "avail mem = %" PRId64 "\n", avmem);
561 * Update lgroup generation number on single lgroup systems
563 if (nlgrps == 1)
564 lgrp_config(LGRP_CONFIG_GEN_UPDATE, 0, 0);
567 * Inform DDI of update
569 ddi_mem_update((uint64_t)(pt_base) << PAGESHIFT,
570 (uint64_t)(tpgs) << PAGESHIFT);
572 delspan_unreserve(pt_base, tpgs);
574 return (KPHYSM_OK); /* Successfully added system memory */
578 * There are various error conditions in kphysm_add_memory_dynamic()
579 * which require a rollback of already changed global state.
581 static void
582 kphysm_addmem_error_undospan(pfn_t pt_base, pgcnt_t tpgs)
584 int mlret;
586 /* Unreserve memory span. */
587 memlist_write_lock();
589 mlret = memlist_delete_span(
590 (uint64_t)(pt_base) << PAGESHIFT,
591 (uint64_t)(tpgs) << PAGESHIFT, &phys_install);
593 ASSERT(mlret == MEML_SPANOP_OK);
594 phys_install_has_changed();
595 installed_top_size(phys_install, &physmax, &physinstalled);
597 memlist_write_unlock();
598 delspan_unreserve(pt_base, tpgs);
602 * Only return an available memseg of exactly the right size
603 * if size is required.
604 * When the meta data area has it's own virtual address space
605 * we will need to manage this more carefully and do best fit
606 * allocations, possibly splitting an available area.
608 struct memseg *
609 memseg_reuse(pgcnt_t metapgs)
611 int type;
612 struct memseg **segpp, *seg;
614 mutex_enter(&memseg_lists_lock);
616 segpp = &memseg_va_avail;
617 for (; (seg = *segpp) != NULL; segpp = &seg->lnext) {
618 caddr_t end;
621 * Make sure we are reusing the right segment type.
623 type = metapgs ? MEMSEG_META_INCL : MEMSEG_META_ALLOC;
625 if ((seg->msegflags & (MEMSEG_META_INCL | MEMSEG_META_ALLOC))
626 != type)
627 continue;
629 if (kpm_enable)
630 end = hat_kpm_mseg_reuse(seg);
631 else
632 end = (caddr_t)seg->epages;
635 * Check for the right size if it is provided.
637 if (!metapgs || btopr(end - (caddr_t)seg->pages) == metapgs) {
638 *segpp = seg->lnext;
639 seg->lnext = NULL;
640 break;
643 mutex_exit(&memseg_lists_lock);
645 return (seg);
648 static uint_t handle_gen;
650 struct memdelspan {
651 struct memdelspan *mds_next;
652 pfn_t mds_base;
653 pgcnt_t mds_npgs;
654 uint_t *mds_bitmap;
655 uint_t *mds_bitmap_retired;
658 #define NBPBMW (sizeof (uint_t) * NBBY)
659 #define MDS_BITMAPBYTES(MDSP) \
660 ((((MDSP)->mds_npgs + NBPBMW - 1) / NBPBMW) * sizeof (uint_t))
/*
 * A set of spans belonging to one add/delete operation.  Lists that
 * currently own spans are linked onto transit_list_head.
 */
struct transit_list {
	struct transit_list	*trl_next;	/* next list in transit */
	struct memdelspan	*trl_spans;	/* spans held by this list */
	int			trl_collect;
};
668 struct transit_list_head {
669 kmutex_t trh_lock;
670 struct transit_list *trh_head;
673 static struct transit_list_head transit_list_head;
675 struct mem_handle;
676 static void transit_list_collect(struct mem_handle *, int);
677 static void transit_list_insert(struct transit_list *);
678 static void transit_list_remove(struct transit_list *);
680 #ifdef DEBUG
681 #define MEM_DEL_STATS
682 #endif /* DEBUG */
684 #ifdef MEM_DEL_STATS
685 static int mem_del_stat_print = 0;
686 struct mem_del_stat {
687 uint_t nloop;
688 uint_t need_free;
689 uint_t free_loop;
690 uint_t free_low;
691 uint_t free_failed;
692 uint_t ncheck;
693 uint_t nopaget;
694 uint_t lockfail;
695 uint_t nfree;
696 uint_t nreloc;
697 uint_t nrelocfail;
698 uint_t already_done;
699 uint_t first_notfree;
700 uint_t npplocked;
701 uint_t nlockreloc;
702 uint_t nnorepl;
703 uint_t nmodreloc;
704 uint_t ndestroy;
705 uint_t nputpage;
706 uint_t nnoreclaim;
707 uint_t ndelay;
708 uint_t demotefail;
709 uint64_t nticks_total;
710 uint64_t nticks_pgrp;
711 uint_t retired;
712 uint_t toxic;
713 uint_t failing;
714 uint_t modtoxic;
715 uint_t npplkdtoxic;
716 uint_t gptlmodfail;
717 uint_t gptllckfail;
720 * The stat values are only incremented in the delete thread
721 * so no locking or atomic required.
723 #define MDSTAT_INCR(MHP, FLD) (MHP)->mh_delstat.FLD++
724 #define MDSTAT_TOTAL(MHP, ntck) ((MHP)->mh_delstat.nticks_total += (ntck))
725 #define MDSTAT_PGRP(MHP, ntck) ((MHP)->mh_delstat.nticks_pgrp += (ntck))
726 static void mem_del_stat_print_func(struct mem_handle *);
727 #define MDSTAT_PRINT(MHP) mem_del_stat_print_func((MHP))
728 #else /* MEM_DEL_STATS */
729 #define MDSTAT_INCR(MHP, FLD)
730 #define MDSTAT_TOTAL(MHP, ntck)
731 #define MDSTAT_PGRP(MHP, ntck)
732 #define MDSTAT_PRINT(MHP)
733 #endif /* MEM_DEL_STATS */
/* Life-cycle states of a struct mem_handle (see mh_state). */
typedef enum mhnd_state {
	MHND_FREE = 0,
	MHND_INIT,
	MHND_STARTING,
	MHND_RUNNING,
	MHND_DONE,
	MHND_RELEASE
} mhnd_state_t;
739 * mh_mutex must be taken to examine or change mh_exthandle and mh_state.
740 * The mutex may not be required for other fields, dependent on mh_state.
742 struct mem_handle {
743 kmutex_t mh_mutex;
744 struct mem_handle *mh_next;
745 memhandle_t mh_exthandle;
746 mhnd_state_t mh_state;
747 struct transit_list mh_transit;
748 pgcnt_t mh_phys_pages;
749 pgcnt_t mh_vm_pages;
750 pgcnt_t mh_hold_todo;
751 void (*mh_delete_complete)(void *, int error);
752 void *mh_delete_complete_arg;
753 volatile uint_t mh_cancel;
754 volatile uint_t mh_dr_aio_cleanup_cancel;
755 volatile uint_t mh_aio_cleanup_done;
756 kcondvar_t mh_cv;
757 kthread_id_t mh_thread_id;
758 page_t *mh_deleted; /* link through p_next */
759 #ifdef MEM_DEL_STATS
760 struct mem_del_stat mh_delstat;
761 #endif /* MEM_DEL_STATS */
764 static struct mem_handle *mem_handle_head;
765 static kmutex_t mem_handle_list_mutex;
767 static struct mem_handle *
768 kphysm_allocate_mem_handle()
770 struct mem_handle *mhp;
772 mhp = kmem_zalloc(sizeof (struct mem_handle), KM_SLEEP);
773 mutex_init(&mhp->mh_mutex, NULL, MUTEX_DEFAULT, NULL);
774 mutex_enter(&mem_handle_list_mutex);
775 mutex_enter(&mhp->mh_mutex);
776 /* handle_gen is protected by list mutex. */
777 mhp->mh_exthandle = (memhandle_t)(uintptr_t)(++handle_gen);
778 mhp->mh_next = mem_handle_head;
779 mem_handle_head = mhp;
780 mutex_exit(&mem_handle_list_mutex);
782 return (mhp);
785 static void
786 kphysm_free_mem_handle(struct mem_handle *mhp)
788 struct mem_handle **mhpp;
790 ASSERT(mutex_owned(&mhp->mh_mutex));
791 ASSERT(mhp->mh_state == MHND_FREE);
793 * Exit the mutex to preserve locking order. This is OK
794 * here as once in the FREE state, the handle cannot
795 * be found by a lookup.
797 mutex_exit(&mhp->mh_mutex);
799 mutex_enter(&mem_handle_list_mutex);
800 mhpp = &mem_handle_head;
801 while (*mhpp != NULL && *mhpp != mhp)
802 mhpp = &(*mhpp)->mh_next;
803 ASSERT(*mhpp == mhp);
805 * No need to lock the handle (mh_mutex) as only
806 * mh_next changing and this is the only thread that
807 * can be referncing mhp.
809 *mhpp = mhp->mh_next;
810 mutex_exit(&mem_handle_list_mutex);
812 mutex_destroy(&mhp->mh_mutex);
813 kmem_free(mhp, sizeof (struct mem_handle));
817 * This function finds the internal mem_handle corresponding to an
818 * external handle and returns it with the mh_mutex held.
820 static struct mem_handle *
821 kphysm_lookup_mem_handle(memhandle_t handle)
823 struct mem_handle *mhp;
825 mutex_enter(&mem_handle_list_mutex);
826 for (mhp = mem_handle_head; mhp != NULL; mhp = mhp->mh_next) {
827 if (mhp->mh_exthandle == handle) {
828 mutex_enter(&mhp->mh_mutex);
830 * The state of the handle could have been changed
831 * by kphysm_del_release() while waiting for mh_mutex.
833 if (mhp->mh_state == MHND_FREE) {
834 mutex_exit(&mhp->mh_mutex);
835 continue;
837 break;
840 mutex_exit(&mem_handle_list_mutex);
841 return (mhp);
845 kphysm_del_gethandle(memhandle_t *xmhp)
847 struct mem_handle *mhp;
849 mhp = kphysm_allocate_mem_handle();
851 * The handle is allocated using KM_SLEEP, so cannot fail.
852 * If the implementation is changed, the correct error to return
853 * here would be KPHYSM_ENOHANDLES.
855 ASSERT(mhp->mh_state == MHND_FREE);
856 mhp->mh_state = MHND_INIT;
857 *xmhp = mhp->mh_exthandle;
858 mutex_exit(&mhp->mh_mutex);
859 return (KPHYSM_OK);
862 static int
863 overlapping(pfn_t b1, pgcnt_t l1, pfn_t b2, pgcnt_t l2)
865 pfn_t e1, e2;
867 e1 = b1 + l1;
868 e2 = b2 + l2;
870 return (!(b2 >= e1 || b1 >= e2));
873 static int can_remove_pgs(pgcnt_t);
875 static struct memdelspan *
876 span_to_install(pfn_t base, pgcnt_t npgs)
878 struct memdelspan *mdsp;
879 struct memdelspan *mdsp_new;
880 uint64_t address, size, thislen;
881 struct memlist *mlp;
883 mdsp_new = NULL;
885 address = (uint64_t)base << PAGESHIFT;
886 size = (uint64_t)npgs << PAGESHIFT;
887 while (size != 0) {
888 memlist_read_lock();
889 for (mlp = phys_install; mlp != NULL; mlp = mlp->ml_next) {
890 if (address >= (mlp->ml_address + mlp->ml_size))
891 continue;
892 if ((address + size) > mlp->ml_address)
893 break;
895 if (mlp == NULL) {
896 address += size;
897 size = 0;
898 thislen = 0;
899 } else {
900 if (address < mlp->ml_address) {
901 size -= (mlp->ml_address - address);
902 address = mlp->ml_address;
904 ASSERT(address >= mlp->ml_address);
905 if ((address + size) >
906 (mlp->ml_address + mlp->ml_size)) {
907 thislen =
908 mlp->ml_size - (address - mlp->ml_address);
909 } else {
910 thislen = size;
913 memlist_read_unlock();
914 /* TODO: phys_install could change now */
915 if (thislen == 0)
916 continue;
917 mdsp = kmem_zalloc(sizeof (struct memdelspan), KM_SLEEP);
918 mdsp->mds_base = btop(address);
919 mdsp->mds_npgs = btop(thislen);
920 mdsp->mds_next = mdsp_new;
921 mdsp_new = mdsp;
922 address += thislen;
923 size -= thislen;
925 return (mdsp_new);
928 static void
929 free_delspans(struct memdelspan *mdsp)
931 struct memdelspan *amdsp;
933 while ((amdsp = mdsp) != NULL) {
934 mdsp = amdsp->mds_next;
935 kmem_free(amdsp, sizeof (struct memdelspan));
940 * Concatenate lists. No list ordering is required.
943 static void
944 delspan_concat(struct memdelspan **mdspp, struct memdelspan *mdsp)
946 while (*mdspp != NULL)
947 mdspp = &(*mdspp)->mds_next;
949 *mdspp = mdsp;
953 * Given a new list of delspans, check there is no overlap with
954 * all existing span activity (add or delete) and then concatenate
955 * the new spans to the given list.
956 * Return 1 for OK, 0 if overlapping.
958 static int
959 delspan_insert(
960 struct transit_list *my_tlp,
961 struct memdelspan *mdsp_new)
963 struct transit_list_head *trh;
964 struct transit_list *tlp;
965 int ret;
967 trh = &transit_list_head;
969 ASSERT(my_tlp != NULL);
970 ASSERT(mdsp_new != NULL);
972 ret = 1;
973 mutex_enter(&trh->trh_lock);
974 /* ASSERT(my_tlp->trl_spans == NULL || tlp_in_list(trh, my_tlp)); */
975 for (tlp = trh->trh_head; tlp != NULL; tlp = tlp->trl_next) {
976 struct memdelspan *mdsp;
978 for (mdsp = tlp->trl_spans; mdsp != NULL;
979 mdsp = mdsp->mds_next) {
980 struct memdelspan *nmdsp;
982 for (nmdsp = mdsp_new; nmdsp != NULL;
983 nmdsp = nmdsp->mds_next) {
984 if (overlapping(mdsp->mds_base, mdsp->mds_npgs,
985 nmdsp->mds_base, nmdsp->mds_npgs)) {
986 ret = 0;
987 goto done;
992 done:
993 if (ret != 0) {
994 if (my_tlp->trl_spans == NULL)
995 transit_list_insert(my_tlp);
996 delspan_concat(&my_tlp->trl_spans, mdsp_new);
998 mutex_exit(&trh->trh_lock);
999 return (ret);
1002 static void
1003 delspan_remove(
1004 struct transit_list *my_tlp,
1005 pfn_t base,
1006 pgcnt_t npgs)
1008 struct transit_list_head *trh;
1009 struct memdelspan *mdsp;
1011 trh = &transit_list_head;
1013 ASSERT(my_tlp != NULL);
1015 mutex_enter(&trh->trh_lock);
1016 if ((mdsp = my_tlp->trl_spans) != NULL) {
1017 if (npgs == 0) {
1018 my_tlp->trl_spans = NULL;
1019 free_delspans(mdsp);
1020 transit_list_remove(my_tlp);
1021 } else {
1022 struct memdelspan **prv;
1024 prv = &my_tlp->trl_spans;
1025 while (mdsp != NULL) {
1026 pfn_t p_end;
1028 p_end = mdsp->mds_base + mdsp->mds_npgs;
1029 if (mdsp->mds_base >= base &&
1030 p_end <= (base + npgs)) {
1031 *prv = mdsp->mds_next;
1032 mdsp->mds_next = NULL;
1033 free_delspans(mdsp);
1034 } else {
1035 prv = &mdsp->mds_next;
1037 mdsp = *prv;
1039 if (my_tlp->trl_spans == NULL)
1040 transit_list_remove(my_tlp);
1043 mutex_exit(&trh->trh_lock);
1047 * Reserve interface for add to stop delete before add finished.
1048 * This list is only accessed through the delspan_insert/remove
1049 * functions and so is fully protected by the mutex in struct transit_list.
1052 static struct transit_list reserve_transit;
1054 static int
1055 delspan_reserve(pfn_t base, pgcnt_t npgs)
1057 struct memdelspan *mdsp;
1058 int ret;
1060 mdsp = kmem_zalloc(sizeof (struct memdelspan), KM_SLEEP);
1061 mdsp->mds_base = base;
1062 mdsp->mds_npgs = npgs;
1063 if ((ret = delspan_insert(&reserve_transit, mdsp)) == 0) {
1064 free_delspans(mdsp);
1066 return (ret);
1069 static void
1070 delspan_unreserve(pfn_t base, pgcnt_t npgs)
1072 delspan_remove(&reserve_transit, base, npgs);
1076 * Return whether memseg was created by kphysm_add_memory_dynamic().
1078 static int
1079 memseg_is_dynamic(struct memseg *seg)
1081 return (seg->msegflags & MEMSEG_DYNAMIC);
1085 kphysm_del_span(
1086 memhandle_t handle,
1087 pfn_t base,
1088 pgcnt_t npgs)
1090 struct mem_handle *mhp;
1091 struct memseg *seg;
1092 struct memdelspan *mdsp;
1093 struct memdelspan *mdsp_new;
1094 pgcnt_t phys_pages, vm_pages;
1095 pfn_t p_end;
1096 page_t *pp;
1097 int ret;
1099 mhp = kphysm_lookup_mem_handle(handle);
1100 if (mhp == NULL) {
1101 return (KPHYSM_EHANDLE);
1103 if (mhp->mh_state != MHND_INIT) {
1104 mutex_exit(&mhp->mh_mutex);
1105 return (KPHYSM_ESEQUENCE);
1109 * Intersect the span with the installed memory list (phys_install).
1111 mdsp_new = span_to_install(base, npgs);
1112 if (mdsp_new == NULL) {
1114 * No physical memory in this range. Is this an
1115 * error? If an attempt to start the delete is made
1116 * for OK returns from del_span such as this, start will
1117 * return an error.
1118 * Could return KPHYSM_ENOWORK.
1121 * It is assumed that there are no error returns
1122 * from span_to_install() due to kmem_alloc failure.
1124 mutex_exit(&mhp->mh_mutex);
1125 return (KPHYSM_OK);
1128 * Does this span overlap an existing span?
1130 if (delspan_insert(&mhp->mh_transit, mdsp_new) == 0) {
1132 * Differentiate between already on list for this handle
1133 * (KPHYSM_EDUP) and busy elsewhere (KPHYSM_EBUSY).
1135 ret = KPHYSM_EBUSY;
1136 for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
1137 mdsp = mdsp->mds_next) {
1138 if (overlapping(mdsp->mds_base, mdsp->mds_npgs,
1139 base, npgs)) {
1140 ret = KPHYSM_EDUP;
1141 break;
1144 mutex_exit(&mhp->mh_mutex);
1145 free_delspans(mdsp_new);
1146 return (ret);
1149 * At this point the spans in mdsp_new have been inserted into the
1150 * list of spans for this handle and thereby to the global list of
1151 * spans being processed. Each of these spans must now be checked
1152 * for relocatability. As a side-effect segments in the memseg list
1153 * may be split.
1155 * Note that mdsp_new can no longer be used as it is now part of
1156 * a larger list. Select elements of this larger list based
1157 * on base and npgs.
1159 restart:
1160 phys_pages = 0;
1161 vm_pages = 0;
1162 ret = KPHYSM_OK;
1163 for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
1164 mdsp = mdsp->mds_next) {
1165 pgcnt_t pages_checked;
1167 if (!overlapping(mdsp->mds_base, mdsp->mds_npgs, base, npgs)) {
1168 continue;
1170 p_end = mdsp->mds_base + mdsp->mds_npgs;
1172 * The pages_checked count is a hack. All pages should be
1173 * checked for relocatability. Those not covered by memsegs
1174 * should be tested with arch_kphysm_del_span_ok().
1176 pages_checked = 0;
1177 for (seg = memsegs; seg; seg = seg->next) {
1178 pfn_t mseg_start;
1180 if (seg->pages_base >= p_end ||
1181 seg->pages_end <= mdsp->mds_base) {
1182 /* Span and memseg don't overlap. */
1183 continue;
1185 mseg_start = memseg_get_start(seg);
1186 /* Check that segment is suitable for delete. */
1187 if (memseg_includes_meta(seg)) {
1189 * Check that this segment is completely
1190 * within the span.
1192 if (mseg_start < mdsp->mds_base ||
1193 seg->pages_end > p_end) {
1194 ret = KPHYSM_EBUSY;
1195 break;
1197 pages_checked += seg->pages_end - mseg_start;
1198 } else {
1200 * If this segment is larger than the span,
1201 * try to split it. After the split, it
1202 * is necessary to restart.
1204 if (seg->pages_base < mdsp->mds_base ||
1205 seg->pages_end > p_end) {
1206 pfn_t abase;
1207 pgcnt_t anpgs;
1208 int s_ret;
1210 /* Split required. */
1211 if (mdsp->mds_base < seg->pages_base)
1212 abase = seg->pages_base;
1213 else
1214 abase = mdsp->mds_base;
1215 if (p_end > seg->pages_end)
1216 anpgs = seg->pages_end - abase;
1217 else
1218 anpgs = p_end - abase;
1219 s_ret = kphysm_split_memseg(abase,
1220 anpgs);
1221 if (s_ret == 0) {
1222 /* Split failed. */
1223 ret = KPHYSM_ERESOURCE;
1224 break;
1226 goto restart;
1228 pages_checked +=
1229 seg->pages_end - seg->pages_base;
1232 * The memseg is wholly within the delete span.
1233 * The individual pages can now be checked.
1235 /* Cage test. */
1236 for (pp = seg->pages; pp < seg->epages; pp++) {
1237 if (PP_ISNORELOC(pp)) {
1238 ret = KPHYSM_ENONRELOC;
1239 break;
1242 if (ret != KPHYSM_OK) {
1243 break;
1245 phys_pages += (seg->pages_end - mseg_start);
1246 vm_pages += MSEG_NPAGES(seg);
1248 if (ret != KPHYSM_OK)
1249 break;
1250 if (pages_checked != mdsp->mds_npgs) {
1251 ret = KPHYSM_ENONRELOC;
1252 break;
1256 if (ret == KPHYSM_OK) {
1257 mhp->mh_phys_pages += phys_pages;
1258 mhp->mh_vm_pages += vm_pages;
1259 } else {
1261 * Keep holding the mh_mutex to prevent it going away.
1263 delspan_remove(&mhp->mh_transit, base, npgs);
1265 mutex_exit(&mhp->mh_mutex);
1266 return (ret);
/*
 * kphysm_del_span_query(base, npgs, mqp): report on the pages in the
 * physical range [base, base + npgs) without changing any delete state.
 *
 * All fields of *mqp are zeroed first. span_to_install() turns the range
 * into a private list of spans; for each span:
 *   - mqp->phys_pages grows by the span's page count;
 *   - pages that fall in a gap before (or without) a memseg are tested
 *     with arch_kphysm_del_span_ok(), first for the whole gap and then
 *     page by page if the whole-gap test fails;
 *   - pages that have a page_t in a memseg are counted in mqp->managed
 *     and tested with PP_ISNORELOC().
 * Every page that fails its test bumps mqp->nonrelocatable, and the first
 * and last such pfn are recorded in first_nonrelocatable and
 * last_nonrelocatable.
 *
 * The private span list is freed before returning. Every path visible
 * here returns KPHYSM_OK.
 */
1270 kphysm_del_span_query(
1271 pfn_t base,
1272 pgcnt_t npgs,
1273 memquery_t *mqp)
1275 struct memdelspan *mdsp;
1276 struct memdelspan *mdsp_new;
1277 int done_first_nonreloc;
1279 mqp->phys_pages = 0;
1280 mqp->managed = 0;
1281 mqp->nonrelocatable = 0;
1282 mqp->first_nonrelocatable = 0;
1283 mqp->last_nonrelocatable = 0;
1285 mdsp_new = span_to_install(base, npgs);
1287 * It is OK to proceed here if mdsp_new == NULL.
/* (With a NULL list the loop below simply does not execute.) */
1289 done_first_nonreloc = 0;
1290 for (mdsp = mdsp_new; mdsp != NULL; mdsp = mdsp->mds_next) {
1291 pfn_t sbase;
1292 pgcnt_t snpgs;
1294 mqp->phys_pages += mdsp->mds_npgs;
1295 sbase = mdsp->mds_base;
1296 snpgs = mdsp->mds_npgs;
/* sbase/snpgs track the part of this span still to be examined. */
1297 while (snpgs != 0) {
1298 struct memseg *lseg, *seg;
1299 pfn_t p_end;
1300 page_t *pp;
1301 pfn_t mseg_start;
1303 p_end = sbase + snpgs;
1305 * Find the lowest addressed memseg that starts
1306 * after sbase and account for it.
1307 * This is to catch dynamic memsegs whose start
1308 * is hidden.
1310 seg = NULL;
1311 for (lseg = memsegs; lseg != NULL; lseg = lseg->next) {
1312 if ((lseg->pages_base >= sbase) ||
1313 (lseg->pages_base < p_end &&
1314 lseg->pages_end > sbase)) {
1315 if (seg == NULL ||
1316 seg->pages_base > lseg->pages_base)
1317 seg = lseg;
1320 if (seg != NULL) {
1321 mseg_start = memseg_get_start(seg);
1323 * Now have the full extent of the memseg so
1324 * do the range check.
1326 if (mseg_start >= p_end ||
1327 seg->pages_end <= sbase) {
1328 /* Span does not overlap memseg. */
1329 seg = NULL;
1333 * Account for gap either before the segment if
1334 * there is one or to the end of the span.
1336 if (seg == NULL || mseg_start > sbase) {
1337 pfn_t a_end;
1339 a_end = (seg == NULL) ? p_end : mseg_start;
1341 * Check with arch layer for relocatability.
1343 if (arch_kphysm_del_span_ok(sbase,
1344 (a_end - sbase))) {
1346 * No non-relocatble pages in this
1347 * area, avoid the fine-grained
1348 * test.
1350 snpgs -= (a_end - sbase);
1351 sbase = a_end;
/* If the whole gap was accepted above, sbase == a_end and this loop is skipped. */
1353 while (sbase < a_end) {
1354 if (!arch_kphysm_del_span_ok(sbase,
1355 1)) {
1356 mqp->nonrelocatable++;
1357 if (!done_first_nonreloc) {
1358 mqp->
1359 first_nonrelocatable
1360 = sbase;
1361 done_first_nonreloc = 1;
1363 mqp->last_nonrelocatable =
1364 sbase;
1366 sbase++;
1367 snpgs--;
1370 if (seg != NULL) {
1371 ASSERT(mseg_start <= sbase);
1372 if (seg->pages_base != mseg_start &&
1373 seg->pages_base > sbase) {
1374 pgcnt_t skip_pgs;
1377 * Skip the page_t area of a
1378 * dynamic memseg.
1380 skip_pgs = seg->pages_base - sbase;
1381 if (snpgs <= skip_pgs) {
1382 sbase += snpgs;
1383 snpgs = 0;
1384 continue;
1386 snpgs -= skip_pgs;
1387 sbase += skip_pgs;
1389 ASSERT(snpgs != 0);
1390 ASSERT(seg->pages_base <= sbase);
1392 * The individual pages can now be checked.
/* Walk page_t entries from the one for sbase until the span or the memseg runs out. */
1394 for (pp = seg->pages +
1395 (sbase - seg->pages_base);
1396 snpgs != 0 && pp < seg->epages; pp++) {
1397 mqp->managed++;
1398 if (PP_ISNORELOC(pp)) {
1399 mqp->nonrelocatable++;
1400 if (!done_first_nonreloc) {
1401 mqp->
1402 first_nonrelocatable
1403 = sbase;
1404 done_first_nonreloc = 1;
1406 mqp->last_nonrelocatable =
1407 sbase;
1409 sbase++;
1410 snpgs--;
1416 free_delspans(mdsp_new);
1418 return (KPHYSM_OK);
1422 * This release function can be called at any stage as follows:
1423 * _gethandle only called
1424 * _span(s) only called
1425 * _start called but failed
1426 * delete thread exited
1429 kphysm_del_release(memhandle_t handle)
1431 struct mem_handle *mhp;
1433 mhp = kphysm_lookup_mem_handle(handle);
1434 if (mhp == NULL) {
1435 return (KPHYSM_EHANDLE);
1437 switch (mhp->mh_state) {
1438 case MHND_STARTING:
1439 case MHND_RUNNING:
1440 mutex_exit(&mhp->mh_mutex);
1441 return (KPHYSM_ENOTFINISHED);
1442 case MHND_FREE:
1443 ASSERT(mhp->mh_state != MHND_FREE);
1444 mutex_exit(&mhp->mh_mutex);
1445 return (KPHYSM_EHANDLE);
1446 case MHND_INIT:
1447 break;
1448 case MHND_DONE:
1449 break;
1450 case MHND_RELEASE:
1451 mutex_exit(&mhp->mh_mutex);
1452 return (KPHYSM_ESEQUENCE);
1453 default:
1454 #ifdef DEBUG
1455 cmn_err(CE_WARN, "kphysm_del_release(0x%p) state corrupt %d",
1456 (void *)mhp, mhp->mh_state);
1457 #endif /* DEBUG */
1458 mutex_exit(&mhp->mh_mutex);
1459 return (KPHYSM_EHANDLE);
1462 * Set state so that we can wait if necessary.
1463 * Also this means that we have read/write access to all
1464 * fields except mh_exthandle and mh_state.
1466 mhp->mh_state = MHND_RELEASE;
1468 * The mem_handle cannot be de-allocated by any other operation
1469 * now, so no need to hold mh_mutex.
1471 mutex_exit(&mhp->mh_mutex);
1473 delspan_remove(&mhp->mh_transit, 0, 0);
1474 mhp->mh_phys_pages = 0;
1475 mhp->mh_vm_pages = 0;
1476 mhp->mh_hold_todo = 0;
1477 mhp->mh_delete_complete = NULL;
1478 mhp->mh_delete_complete_arg = NULL;
1479 mhp->mh_cancel = 0;
1481 mutex_enter(&mhp->mh_mutex);
1482 ASSERT(mhp->mh_state == MHND_RELEASE);
1483 mhp->mh_state = MHND_FREE;
1485 kphysm_free_mem_handle(mhp);
1487 return (KPHYSM_OK);
1491 * This cancel function can only be called with the thread running.
1494 kphysm_del_cancel(memhandle_t handle)
1496 struct mem_handle *mhp;
1498 mhp = kphysm_lookup_mem_handle(handle);
1499 if (mhp == NULL) {
1500 return (KPHYSM_EHANDLE);
1502 if (mhp->mh_state != MHND_STARTING && mhp->mh_state != MHND_RUNNING) {
1503 mutex_exit(&mhp->mh_mutex);
1504 return (KPHYSM_ENOTRUNNING);
1507 * Set the cancel flag and wake the delete thread up.
1508 * The thread may be waiting on I/O, so the effect of the cancel
1509 * may be delayed.
1511 if (mhp->mh_cancel == 0) {
1512 mhp->mh_cancel = KPHYSM_ECANCELLED;
1513 cv_signal(&mhp->mh_cv);
1515 mutex_exit(&mhp->mh_mutex);
1516 return (KPHYSM_OK);
1520 kphysm_del_status(
1521 memhandle_t handle,
1522 memdelstat_t *mdstp)
1524 struct mem_handle *mhp;
1526 mhp = kphysm_lookup_mem_handle(handle);
1527 if (mhp == NULL) {
1528 return (KPHYSM_EHANDLE);
1531 * Calling kphysm_del_status() is allowed before the delete
1532 * is started to allow for status display.
1534 if (mhp->mh_state != MHND_INIT && mhp->mh_state != MHND_STARTING &&
1535 mhp->mh_state != MHND_RUNNING) {
1536 mutex_exit(&mhp->mh_mutex);
1537 return (KPHYSM_ENOTRUNNING);
1539 mdstp->phys_pages = mhp->mh_phys_pages;
1540 mdstp->managed = mhp->mh_vm_pages;
1541 mdstp->collected = mhp->mh_vm_pages - mhp->mh_hold_todo;
1542 mutex_exit(&mhp->mh_mutex);
1543 return (KPHYSM_OK);
1546 static int mem_delete_additional_pages = 100;
1548 static int
1549 can_remove_pgs(pgcnt_t npgs)
1552 * If all pageable pages were paged out, freemem would
1553 * equal availrmem. There is a minimum requirement for
1554 * availrmem.
1556 if ((availrmem - (tune.t_minarmem + mem_delete_additional_pages))
1557 < npgs)
1558 return (0);
1559 /* TODO: check swap space, etc. */
1560 return (1);
1563 static int
1564 get_availrmem(pgcnt_t npgs)
1566 int ret;
1568 mutex_enter(&freemem_lock);
1569 ret = can_remove_pgs(npgs);
1570 if (ret != 0)
1571 availrmem -= npgs;
1572 mutex_exit(&freemem_lock);
1573 return (ret);
1576 static void
1577 put_availrmem(pgcnt_t npgs)
1579 mutex_enter(&freemem_lock);
1580 availrmem += npgs;
1581 mutex_exit(&freemem_lock);
1584 #define FREEMEM_INCR 100
1585 static pgcnt_t freemem_incr = FREEMEM_INCR;
1586 #define DEL_FREE_WAIT_FRAC 4
1587 #define DEL_FREE_WAIT_TICKS ((hz+DEL_FREE_WAIT_FRAC-1)/DEL_FREE_WAIT_FRAC)
1589 #define DEL_BUSY_WAIT_FRAC 20
1590 #define DEL_BUSY_WAIT_TICKS ((hz+DEL_BUSY_WAIT_FRAC-1)/DEL_BUSY_WAIT_FRAC)
1592 static void kphysm_del_cleanup(struct mem_handle *);
1594 static void page_delete_collect(page_t *, struct mem_handle *);
1596 static pgcnt_t
1597 delthr_get_freemem(struct mem_handle *mhp)
1599 pgcnt_t free_get;
1600 int ret;
1602 ASSERT(MUTEX_HELD(&mhp->mh_mutex));
1604 MDSTAT_INCR(mhp, need_free);
1606 * Get up to freemem_incr pages.
1608 free_get = freemem_incr;
1609 if (free_get > mhp->mh_hold_todo)
1610 free_get = mhp->mh_hold_todo;
1612 * Take free_get pages away from freemem,
1613 * waiting if necessary.
1616 while (!mhp->mh_cancel) {
1617 mutex_exit(&mhp->mh_mutex);
1618 MDSTAT_INCR(mhp, free_loop);
1620 * Duplicate test from page_create_throttle()
1621 * but don't override with !PG_WAIT.
1623 if (freemem < (free_get + throttlefree)) {
1624 MDSTAT_INCR(mhp, free_low);
1625 ret = 0;
1626 } else {
1627 ret = page_create_wait(free_get, 0);
1628 if (ret == 0) {
1629 /* EMPTY */
1630 MDSTAT_INCR(mhp, free_failed);
1633 if (ret != 0) {
1634 mutex_enter(&mhp->mh_mutex);
1635 return (free_get);
1639 * Put pressure on pageout.
1641 page_needfree(free_get);
1642 WAKE_PAGEOUT_SCANNER(delthr);
1644 mutex_enter(&mhp->mh_mutex);
1645 (void) cv_reltimedwait(&mhp->mh_cv, &mhp->mh_mutex,
1646 DEL_FREE_WAIT_TICKS, TR_CLOCK_TICK);
1647 mutex_exit(&mhp->mh_mutex);
1648 page_needfree(-(spgcnt_t)free_get);
1650 mutex_enter(&mhp->mh_mutex);
1652 return (0);
1655 #define DR_AIO_CLEANUP_DELAY 25000 /* 0.025secs, in usec */
1656 #define DR_AIO_CLEANUP_MAXLOOPS_NODELAY 100
1658 * This function is run as a helper thread for delete_memory_thread.
1659 * It is needed in order to force kaio cleanup, so that pages used in kaio
1660 * will be unlocked and subsequently relocated by delete_memory_thread.
1661 * The address of the delete_memory_threads's mem_handle is passed in to
1662 * this thread function, and is used to set the mh_aio_cleanup_done member
1663 * prior to calling thread_exit().
1665 static void
1666 dr_aio_cleanup_thread(caddr_t amhp)
1668 proc_t *procp;
1669 int (*aio_cleanup_dr_delete_memory)(proc_t *);
1670 int cleaned;
1671 int n = 0;
1672 struct mem_handle *mhp;
1673 volatile uint_t *pcancel;
1675 mhp = (struct mem_handle *)amhp;
1676 ASSERT(mhp != NULL);
1677 pcancel = &mhp->mh_dr_aio_cleanup_cancel;
1678 if (modload("sys", "kaio") == -1) {
1679 mhp->mh_aio_cleanup_done = 1;
1680 cmn_err(CE_WARN, "dr_aio_cleanup_thread: cannot load kaio");
1681 thread_exit();
1683 aio_cleanup_dr_delete_memory = (int (*)(proc_t *))
1684 modgetsymvalue("aio_cleanup_dr_delete_memory", 0);
1685 if (aio_cleanup_dr_delete_memory == NULL) {
1686 mhp->mh_aio_cleanup_done = 1;
1687 cmn_err(CE_WARN,
1688 "aio_cleanup_dr_delete_memory not found in kaio");
1689 thread_exit();
1691 do {
1692 cleaned = 0;
1693 mutex_enter(&pidlock);
1694 for (procp = practive; (*pcancel == 0) && (procp != NULL);
1695 procp = procp->p_next) {
1696 mutex_enter(&procp->p_lock);
1697 if (procp->p_aio != NULL) {
1698 /* cleanup proc's outstanding kaio */
1699 cleaned +=
1700 (*aio_cleanup_dr_delete_memory)(procp);
1702 mutex_exit(&procp->p_lock);
1704 mutex_exit(&pidlock);
1705 if ((*pcancel == 0) &&
1706 (!cleaned || (++n == DR_AIO_CLEANUP_MAXLOOPS_NODELAY))) {
1707 /* delay a bit before retrying all procs again */
1708 delay(drv_usectohz(DR_AIO_CLEANUP_DELAY));
1709 n = 0;
1711 } while (*pcancel == 0);
1712 mhp->mh_aio_cleanup_done = 1;
1713 thread_exit();
/*
 * delete_memory_thread(amhp): body of the thread created by
 * kphysm_del_start() to carry out a memory delete. amhp is the
 * struct mem_handle for the delete, passed as a caddr_t.
 *
 * Outline, as visible in the code below:
 *  - Move the handle from MHND_STARTING to MHND_RUNNING and set
 *    mh_hold_todo to mh_vm_pages.
 *  - Reserve mh_vm_pages from availrmem with get_availrmem(); if that
 *    fails, finish with KPHYSM_ENOTVIABLE.
 *  - Call kphysm_setup_pre_del(); a non-zero return sets mh_cancel to
 *    KPHYSM_EREFUSED and skips the scan.
 *  - Allocate the two per-span bitmaps, start dr_aio_cleanup_thread, then
 *    scan the pfns of every span repeatedly while mh_hold_todo is
 *    non-zero and mh_cancel is clear. A page is taken straight off a
 *    free list, destroyed, pushed out with VOP_PUTPAGE() or moved with
 *    page_relocate(), and collected pages are passed to
 *    page_delete_collect() and marked in mds_bitmap.
 *  - After the scan: stop the aio helper, return surplus freemem, clear
 *    exclusive-wanted bits on cancel, deal with retired pages, free the
 *    bitmaps and wait for the helper to finish.
 *  - On cancel, page_free() every page on mh_deleted and return the
 *    availrmem reservation. Otherwise call kphysm_del_cleanup(),
 *    mem_node_del_range() for each span and page_ctrs_cleanup().
 *  - Set MHND_DONE, invoke the mh_delete_complete callback with the
 *    completion code and thread_exit().
 *
 * mh_mutex is dropped around most per-page work and re-acquired before
 * each 'continue'/'break' in the scan loop.
 */
1716 static void
1717 delete_memory_thread(caddr_t amhp)
1719 struct mem_handle *mhp;
1720 struct memdelspan *mdsp;
1721 callb_cpr_t cprinfo;
1722 page_t *pp_targ;
1723 spgcnt_t freemem_left;
1724 void (*del_complete_funcp)(void *, int error);
1725 void *del_complete_arg;
1726 int comp_code;
1727 int ret;
1728 int first_scan;
1729 uint_t szc;
1730 #ifdef MEM_DEL_STATS
1731 uint64_t start_total, ntick_total;
1732 uint64_t start_pgrp, ntick_pgrp;
1733 #endif /* MEM_DEL_STATS */
1735 mhp = (struct mem_handle *)amhp;
1737 #ifdef MEM_DEL_STATS
1738 start_total = ddi_get_lbolt();
1739 #endif /* MEM_DEL_STATS */
1741 CALLB_CPR_INIT(&cprinfo, &mhp->mh_mutex,
1742 callb_generic_cpr, "memdel");
1744 mutex_enter(&mhp->mh_mutex);
1745 ASSERT(mhp->mh_state == MHND_STARTING);
1747 mhp->mh_state = MHND_RUNNING;
1748 mhp->mh_thread_id = curthread;
1750 mhp->mh_hold_todo = mhp->mh_vm_pages;
1751 mutex_exit(&mhp->mh_mutex);
1753 /* Allocate the remap pages now, if necessary. */
1754 memseg_remap_init();
1757 * Subtract from availrmem now if possible as availrmem
1758 * may not be available by the end of the delete.
1760 if (!get_availrmem(mhp->mh_vm_pages)) {
1761 comp_code = KPHYSM_ENOTVIABLE;
1762 mutex_enter(&mhp->mh_mutex);
1763 goto early_exit;
1766 ret = kphysm_setup_pre_del(mhp->mh_vm_pages);
1768 mutex_enter(&mhp->mh_mutex);
1770 if (ret != 0) {
1771 mhp->mh_cancel = KPHYSM_EREFUSED;
1772 goto refused;
1775 transit_list_collect(mhp, 1);
/* Two bitmaps per span: pfns already dealt with, and those found retired. */
1777 for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
1778 mdsp = mdsp->mds_next) {
1779 ASSERT(mdsp->mds_bitmap == NULL);
1780 mdsp->mds_bitmap = kmem_zalloc(MDS_BITMAPBYTES(mdsp), KM_SLEEP);
1781 mdsp->mds_bitmap_retired = kmem_zalloc(MDS_BITMAPBYTES(mdsp),
1782 KM_SLEEP);
1785 first_scan = 1;
1786 freemem_left = 0;
1788 * Start dr_aio_cleanup_thread, which periodically iterates
1789 * through the process list and invokes aio cleanup. This
1790 * is needed in order to avoid a deadly embrace between the
1791 * delete_memory_thread (waiting on writer lock for page, with the
1792 * exclusive-wanted bit set), kaio read request threads (waiting for a
1793 * reader lock on the same page that is wanted by the
1794 * delete_memory_thread), and threads waiting for kaio completion
1795 * (blocked on spt_amp->lock).
1797 mhp->mh_dr_aio_cleanup_cancel = 0;
1798 mhp->mh_aio_cleanup_done = 0;
1799 (void) thread_create(NULL, 0, dr_aio_cleanup_thread,
1800 (caddr_t)mhp, 0, &p0, TS_RUN, maxclsyspri - 1);
/* Main scan: repeat passes over all spans until done or cancelled. */
1801 while ((mhp->mh_hold_todo != 0) && (mhp->mh_cancel == 0)) {
1802 pgcnt_t collected;
1804 MDSTAT_INCR(mhp, nloop);
1805 collected = 0;
1806 for (mdsp = mhp->mh_transit.trl_spans; (mdsp != NULL) &&
1807 (mhp->mh_cancel == 0); mdsp = mdsp->mds_next) {
1808 pfn_t pfn, p_end;
1810 p_end = mdsp->mds_base + mdsp->mds_npgs;
1811 for (pfn = mdsp->mds_base; (pfn < p_end) &&
1812 (mhp->mh_cancel == 0); pfn++) {
1813 page_t *pp, *tpp, *tpp_targ;
1814 pgcnt_t bit;
1815 struct vnode *vp;
1816 u_offset_t offset;
1817 int mod, result;
1818 spgcnt_t pgcnt;
1820 bit = pfn - mdsp->mds_base;
1821 if ((mdsp->mds_bitmap[bit / NBPBMW] &
1822 (1 << (bit % NBPBMW))) != 0) {
1823 MDSTAT_INCR(mhp, already_done);
1824 continue;
/* Make sure some freemem is reserved before touching the page. */
1826 if (freemem_left == 0) {
1827 freemem_left += delthr_get_freemem(mhp);
1828 if (freemem_left == 0)
1829 break;
1833 * Release mh_mutex - some of this
1834 * stuff takes some time (eg PUTPAGE).
1837 mutex_exit(&mhp->mh_mutex);
1838 MDSTAT_INCR(mhp, ncheck);
1840 pp = page_numtopp_nolock(pfn);
1841 if (pp == NULL) {
1843 * Not covered by a page_t - will
1844 * be dealt with elsewhere.
1846 MDSTAT_INCR(mhp, nopaget);
1847 mutex_enter(&mhp->mh_mutex);
1848 mdsp->mds_bitmap[bit / NBPBMW] |=
1849 (1 << (bit % NBPBMW));
1850 continue;
1853 if (!page_try_reclaim_lock(pp, SE_EXCL,
1854 SE_EXCL_WANTED | SE_RETIRED)) {
1856 * Page in use elsewhere. Skip it.
1858 MDSTAT_INCR(mhp, lockfail);
1859 mutex_enter(&mhp->mh_mutex);
1860 continue;
1863 * See if the cage expanded into the delete.
1864 * This can happen as we have to allow the
1865 * cage to expand.
1867 if (PP_ISNORELOC(pp)) {
1868 page_unlock(pp);
1869 mutex_enter(&mhp->mh_mutex);
1870 mhp->mh_cancel = KPHYSM_ENONRELOC;
1871 break;
1873 if (PP_RETIRED(pp)) {
1875 * Page has been retired and is
1876 * not part of the cage so we
1877 * can now do the accounting for
1878 * it.
1880 MDSTAT_INCR(mhp, retired);
1881 mutex_enter(&mhp->mh_mutex);
1882 mdsp->mds_bitmap[bit / NBPBMW]
1883 |= (1 << (bit % NBPBMW));
1884 mdsp->mds_bitmap_retired[bit /
1885 NBPBMW] |=
1886 (1 << (bit % NBPBMW));
1887 mhp->mh_hold_todo--;
1888 continue;
1890 ASSERT(freemem_left != 0);
1891 if (PP_ISFREE(pp)) {
1893 * Like page_reclaim() only 'freemem'
1894 * processing is already done.
1896 MDSTAT_INCR(mhp, nfree);
/* Also reached by goto after a successful VOP_PUTPAGE() further down. */
1897 free_page_collect:
1898 if (PP_ISAGED(pp)) {
1899 page_list_sub(pp,
1900 PG_FREE_LIST);
1901 } else {
1902 page_list_sub(pp,
1903 PG_CACHE_LIST);
1905 PP_CLRFREE(pp);
1906 PP_CLRAGED(pp);
1907 collected++;
1908 mutex_enter(&mhp->mh_mutex);
1909 page_delete_collect(pp, mhp);
1910 mdsp->mds_bitmap[bit / NBPBMW] |=
1911 (1 << (bit % NBPBMW));
1912 freemem_left--;
1913 continue;
1915 ASSERT(pp->p_vnode != NULL);
/* On the first pass only free and retired pages are handled; in-use pages wait for later passes. */
1916 if (first_scan) {
1917 MDSTAT_INCR(mhp, first_notfree);
1918 page_unlock(pp);
1919 mutex_enter(&mhp->mh_mutex);
1920 continue;
1923 * Keep stats on pages encountered that
1924 * are marked for retirement.
1926 if (PP_TOXIC(pp)) {
1927 MDSTAT_INCR(mhp, toxic);
1928 } else if (PP_PR_REQ(pp)) {
1929 MDSTAT_INCR(mhp, failing);
1932 * In certain cases below, special exceptions
1933 * are made for pages that are toxic. This
1934 * is because the current meaning of toxic
1935 * is that an uncorrectable error has been
1936 * previously associated with the page.
1938 if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
1939 if (!PP_TOXIC(pp)) {
1941 * Must relocate locked in
1942 * memory pages.
1944 #ifdef MEM_DEL_STATS
1945 start_pgrp = ddi_get_lbolt();
1946 #endif /* MEM_DEL_STATS */
1948 * Lock all constituent pages
1949 * of a large page to ensure
1950 * that p_szc won't change.
1952 if (!group_page_trylock(pp,
1953 SE_EXCL)) {
1954 MDSTAT_INCR(mhp,
1955 gptllckfail);
1956 page_unlock(pp);
1957 mutex_enter(
1958 &mhp->mh_mutex);
1959 continue;
1961 MDSTAT_INCR(mhp, npplocked);
1962 pp_targ =
1963 page_get_replacement_page(
1964 pp, NULL, 0);
1965 if (pp_targ != NULL) {
1966 #ifdef MEM_DEL_STATS
1967 ntick_pgrp =
1968 (uint64_t)
1969 ddi_get_lbolt() -
1970 start_pgrp;
1971 #endif /* MEM_DEL_STATS */
1972 MDSTAT_PGRP(mhp,
1973 ntick_pgrp);
1974 MDSTAT_INCR(mhp,
1975 nlockreloc);
1976 goto reloc;
1978 group_page_unlock(pp);
1979 page_unlock(pp);
1980 #ifdef MEM_DEL_STATS
1981 ntick_pgrp =
1982 (uint64_t)ddi_get_lbolt() -
1983 start_pgrp;
1984 #endif /* MEM_DEL_STATS */
1985 MDSTAT_PGRP(mhp, ntick_pgrp);
1986 MDSTAT_INCR(mhp, nnorepl);
1987 mutex_enter(&mhp->mh_mutex);
1988 continue;
1989 } else {
1991 * Cannot do anything about
1992 * this page because it is
1993 * toxic.
1995 MDSTAT_INCR(mhp, npplkdtoxic);
1996 page_unlock(pp);
1997 mutex_enter(&mhp->mh_mutex);
1998 continue;
2002 * Unload the mappings and check if mod bit
2003 * is set.
2005 ASSERT(!PP_ISKAS(pp));
2006 (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
2007 mod = hat_ismod(pp);
2009 #ifdef MEM_DEL_STATS
2010 start_pgrp = ddi_get_lbolt();
2011 #endif /* MEM_DEL_STATS */
2012 if (mod && !PP_TOXIC(pp)) {
2014 * Lock all constituent pages
2015 * of a large page to ensure
2016 * that p_szc won't change.
2018 if (!group_page_trylock(pp, SE_EXCL)) {
2019 MDSTAT_INCR(mhp, gptlmodfail);
2020 page_unlock(pp);
2021 mutex_enter(&mhp->mh_mutex);
2022 continue;
2024 pp_targ = page_get_replacement_page(pp,
2025 NULL, 0);
2026 if (pp_targ != NULL) {
2027 MDSTAT_INCR(mhp, nmodreloc);
2028 #ifdef MEM_DEL_STATS
2029 ntick_pgrp =
2030 (uint64_t)ddi_get_lbolt() -
2031 start_pgrp;
2032 #endif /* MEM_DEL_STATS */
2033 MDSTAT_PGRP(mhp, ntick_pgrp);
2034 goto reloc;
2036 group_page_unlock(pp);
2039 if (!page_try_demote_pages(pp)) {
2040 MDSTAT_INCR(mhp, demotefail);
2041 page_unlock(pp);
2042 #ifdef MEM_DEL_STATS
2043 ntick_pgrp = (uint64_t)ddi_get_lbolt() -
2044 start_pgrp;
2045 #endif /* MEM_DEL_STATS */
2046 MDSTAT_PGRP(mhp, ntick_pgrp);
2047 mutex_enter(&mhp->mh_mutex);
2048 continue;
2052 * Regular 'page-out'.
2054 if (!mod) {
2055 MDSTAT_INCR(mhp, ndestroy);
2056 page_destroy(pp, 1);
2058 * page_destroy was called with
2059 * dontfree. As long as p_lckcnt
2060 * and p_cowcnt are both zero, the
2061 * only additional action of
2062 * page_destroy with !dontfree is to
2063 * call page_free, so we can collect
2064 * the page here.
2066 collected++;
2067 #ifdef MEM_DEL_STATS
2068 ntick_pgrp = (uint64_t)ddi_get_lbolt() -
2069 start_pgrp;
2070 #endif /* MEM_DEL_STATS */
2071 MDSTAT_PGRP(mhp, ntick_pgrp);
2072 mutex_enter(&mhp->mh_mutex);
2073 page_delete_collect(pp, mhp);
2074 mdsp->mds_bitmap[bit / NBPBMW] |=
2075 (1 << (bit % NBPBMW));
2076 continue;
2079 * The page is toxic and the mod bit is
2080 * set, we cannot do anything here to deal
2081 * with it.
2083 if (PP_TOXIC(pp)) {
2084 page_unlock(pp);
2085 #ifdef MEM_DEL_STATS
2086 ntick_pgrp = (uint64_t)ddi_get_lbolt() -
2087 start_pgrp;
2088 #endif /* MEM_DEL_STATS */
2089 MDSTAT_PGRP(mhp, ntick_pgrp);
2090 MDSTAT_INCR(mhp, modtoxic);
2091 mutex_enter(&mhp->mh_mutex);
2092 continue;
/* Modified page with no replacement available: write it out through its vnode. */
2094 MDSTAT_INCR(mhp, nputpage);
2095 vp = pp->p_vnode;
2096 offset = pp->p_offset;
2097 VN_HOLD(vp);
2098 page_unlock(pp);
2099 (void) VOP_PUTPAGE(vp, offset, PAGESIZE,
2100 B_INVAL|B_FORCE, kcred, NULL);
2101 VN_RELE(vp);
2102 #ifdef MEM_DEL_STATS
2103 ntick_pgrp = (uint64_t)ddi_get_lbolt() -
2104 start_pgrp;
2105 #endif /* MEM_DEL_STATS */
2106 MDSTAT_PGRP(mhp, ntick_pgrp);
2108 * Try to get the page back immediately
2109 * so that it can be collected.
2111 pp = page_numtopp_nolock(pfn);
2112 if (pp == NULL) {
2113 MDSTAT_INCR(mhp, nnoreclaim);
2115 * This should not happen as this
2116 * thread is deleting the page.
2117 * If this code is generalized, this
2118 * becomes a reality.
2120 #ifdef DEBUG
2121 cmn_err(CE_WARN,
2122 "delete_memory_thread(0x%p) "
2123 "pfn 0x%lx has no page_t",
2124 (void *)mhp, pfn);
2125 #endif /* DEBUG */
2126 mutex_enter(&mhp->mh_mutex);
2127 continue;
2129 if (page_try_reclaim_lock(pp, SE_EXCL,
2130 SE_EXCL_WANTED | SE_RETIRED)) {
2131 if (PP_ISFREE(pp)) {
2132 goto free_page_collect;
2134 page_unlock(pp);
2136 MDSTAT_INCR(mhp, nnoreclaim);
2137 mutex_enter(&mhp->mh_mutex);
2138 continue;
2140 reloc:
2142 * Got some freemem and a target
2143 * page, so move the data to avoid
2144 * I/O and lock problems.
2146 ASSERT(!page_iolock_assert(pp));
2147 MDSTAT_INCR(mhp, nreloc);
2149 * page_relocate() will return pgcnt: the
2150 * number of consecutive pages relocated.
2151 * If it is successful, pp will be a
2152 * linked list of the page structs that
2153 * were relocated. If page_relocate() is
2154 * unsuccessful, pp will be unmodified.
2156 #ifdef MEM_DEL_STATS
2157 start_pgrp = ddi_get_lbolt();
2158 #endif /* MEM_DEL_STATS */
2159 result = page_relocate(&pp, &pp_targ, 0, 0,
2160 &pgcnt, NULL);
2161 #ifdef MEM_DEL_STATS
2162 ntick_pgrp = (uint64_t)ddi_get_lbolt() -
2163 start_pgrp;
2164 #endif /* MEM_DEL_STATS */
2165 MDSTAT_PGRP(mhp, ntick_pgrp);
2166 if (result != 0) {
2167 MDSTAT_INCR(mhp, nrelocfail);
2169 * We did not succeed. We need
2170 * to give the pp_targ pages back.
2171 * page_free(pp_targ, 1) without
2172 * the freemem accounting.
2174 group_page_unlock(pp);
2175 page_free_replacement_page(pp_targ);
2176 page_unlock(pp);
2177 mutex_enter(&mhp->mh_mutex);
2178 continue;
2182 * We will then collect pgcnt pages.
2184 ASSERT(pgcnt > 0);
2185 mutex_enter(&mhp->mh_mutex);
2187 * We need to make sure freemem_left is
2188 * large enough.
2190 while ((freemem_left < pgcnt) &&
2191 (!mhp->mh_cancel)) {
2192 freemem_left +=
2193 delthr_get_freemem(mhp);
2197 * Do not proceed if mh_cancel is set.
2199 if (mhp->mh_cancel) {
2200 while (pp_targ != NULL) {
2202 * Unlink and unlock each page.
2204 tpp_targ = pp_targ;
2205 page_sub(&pp_targ, tpp_targ);
2206 page_unlock(tpp_targ);
2209 * We need to give the pp pages back.
2210 * page_free(pp, 1) without the
2211 * freemem accounting.
2213 page_free_replacement_page(pp);
2214 break;
2217 /* Now remove pgcnt from freemem_left */
2218 freemem_left -= pgcnt;
2219 ASSERT(freemem_left >= 0);
2220 szc = pp->p_szc;
2221 while (pp != NULL) {
2223 * pp and pp_targ were passed back as
2224 * a linked list of pages.
2225 * Unlink and unlock each page.
2227 tpp_targ = pp_targ;
2228 page_sub(&pp_targ, tpp_targ);
2229 page_unlock(tpp_targ);
2231 * The original page is now free
2232 * so remove it from the linked
2233 * list and collect it.
2235 tpp = pp;
2236 page_sub(&pp, tpp);
2237 pfn = page_pptonum(tpp);
2238 collected++;
2239 ASSERT(PAGE_EXCL(tpp));
2240 ASSERT(tpp->p_vnode == NULL);
2241 ASSERT(!hat_page_is_mapped(tpp));
2242 ASSERT(tpp->p_szc == szc);
2243 tpp->p_szc = 0;
2244 page_delete_collect(tpp, mhp);
2245 bit = pfn - mdsp->mds_base;
2246 mdsp->mds_bitmap[bit / NBPBMW] |=
2247 (1 << (bit % NBPBMW));
2249 ASSERT(pp_targ == NULL);
2252 first_scan = 0;
2253 if ((mhp->mh_cancel == 0) && (mhp->mh_hold_todo != 0) &&
2254 (collected == 0)) {
2256 * This code is needed as we cannot wait
2257 * for a page to be locked OR the delete to
2258 * be cancelled. Also, we must delay so
2259 * that other threads get a chance to run
2260 * on our cpu, otherwise page locks may be
2261 * held indefinitely by those threads.
2263 MDSTAT_INCR(mhp, ndelay);
2264 CALLB_CPR_SAFE_BEGIN(&cprinfo);
2265 (void) cv_reltimedwait(&mhp->mh_cv, &mhp->mh_mutex,
2266 DEL_BUSY_WAIT_TICKS, TR_CLOCK_TICK);
2267 CALLB_CPR_SAFE_END(&cprinfo, &mhp->mh_mutex);
2270 /* stop the dr aio cleanup thread */
2271 mhp->mh_dr_aio_cleanup_cancel = 1;
2272 transit_list_collect(mhp, 0);
2273 if (freemem_left != 0) {
2274 /* Return any surplus. */
2275 page_create_putback(freemem_left);
2276 freemem_left = 0;
2278 #ifdef MEM_DEL_STATS
2279 ntick_total = (uint64_t)ddi_get_lbolt() - start_total;
2280 #endif /* MEM_DEL_STATS */
2281 MDSTAT_TOTAL(mhp, ntick_total);
2282 MDSTAT_PRINT(mhp);
2285 * If the memory delete was cancelled, exclusive-wanted bits must
2286 * be cleared. If there are retired pages being deleted, they need
2287 * to be unretired.
2289 for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
2290 mdsp = mdsp->mds_next) {
2291 pfn_t pfn, p_end;
2293 p_end = mdsp->mds_base + mdsp->mds_npgs;
2294 for (pfn = mdsp->mds_base; pfn < p_end; pfn++) {
2295 page_t *pp;
2296 pgcnt_t bit;
2298 bit = pfn - mdsp->mds_base;
2299 if (mhp->mh_cancel) {
2300 pp = page_numtopp_nolock(pfn);
2301 if (pp != NULL) {
2302 if ((mdsp->mds_bitmap[bit / NBPBMW] &
2303 (1 << (bit % NBPBMW))) == 0) {
2304 page_lock_clr_exclwanted(pp);
2307 } else {
2308 pp = NULL;
2310 if ((mdsp->mds_bitmap_retired[bit / NBPBMW] &
2311 (1 << (bit % NBPBMW))) != 0) {
2312 /* do we already have pp? */
2313 if (pp == NULL) {
2314 pp = page_numtopp_nolock(pfn);
2316 ASSERT(pp != NULL);
2317 ASSERT(PP_RETIRED(pp));
2318 if (mhp->mh_cancel != 0) {
2319 page_unlock(pp);
2321 * To satisfy ASSERT below in
2322 * cancel code.
2324 mhp->mh_hold_todo++;
2325 } else {
2326 (void) page_unretire_pp(pp,
2327 PR_UNR_CLEAN);
2333 * Free retired page bitmap and collected page bitmap
2335 for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
2336 mdsp = mdsp->mds_next) {
2337 ASSERT(mdsp->mds_bitmap_retired != NULL);
2338 kmem_free(mdsp->mds_bitmap_retired, MDS_BITMAPBYTES(mdsp));
2339 mdsp->mds_bitmap_retired = NULL; /* Paranoia. */
2340 ASSERT(mdsp->mds_bitmap != NULL);
2341 kmem_free(mdsp->mds_bitmap, MDS_BITMAPBYTES(mdsp));
2342 mdsp->mds_bitmap = NULL; /* Paranoia. */
2345 /* wait for our dr aio cancel thread to exit */
2346 while (!(mhp->mh_aio_cleanup_done)) {
2347 CALLB_CPR_SAFE_BEGIN(&cprinfo);
2348 delay(drv_usectohz(DR_AIO_CLEANUP_DELAY));
2349 CALLB_CPR_SAFE_END(&cprinfo, &mhp->mh_mutex);
/* Reached directly (by goto) when kphysm_setup_pre_del() refused the delete. */
2351 refused:
2352 if (mhp->mh_cancel != 0) {
2353 page_t *pp;
2355 comp_code = mhp->mh_cancel;
2357 * Go through list of deleted pages (mh_deleted) freeing
2358 * them.
2360 while ((pp = mhp->mh_deleted) != NULL) {
2361 mhp->mh_deleted = pp->p_next;
2362 mhp->mh_hold_todo++;
2363 mutex_exit(&mhp->mh_mutex);
2364 /* Restore p_next. */
2365 pp->p_next = pp->p_prev;
2366 if (PP_ISFREE(pp)) {
2367 cmn_err(CE_PANIC,
2368 "page %p is free",
2369 (void *)pp);
2371 page_free(pp, 1);
2372 mutex_enter(&mhp->mh_mutex);
2374 ASSERT(mhp->mh_hold_todo == mhp->mh_vm_pages);
/* Hand back the availrmem reservation made at the start of the thread. */
2376 mutex_exit(&mhp->mh_mutex);
2377 put_availrmem(mhp->mh_vm_pages);
2378 mutex_enter(&mhp->mh_mutex);
2380 goto t_exit;
2384 * All the pages are no longer in use and are exclusively locked.
2387 mhp->mh_deleted = NULL;
2389 kphysm_del_cleanup(mhp);
2392 * mem_node_del_range needs to be after kphysm_del_cleanup so
2393 * that the mem_node_config[] will remain intact for the cleanup.
2395 for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
2396 mdsp = mdsp->mds_next) {
2397 mem_node_del_range(mdsp->mds_base,
2398 mdsp->mds_base + mdsp->mds_npgs - 1);
2400 /* cleanup the page counters */
2401 page_ctrs_cleanup();
2403 comp_code = KPHYSM_OK;
2405 t_exit:
2406 mutex_exit(&mhp->mh_mutex);
2407 kphysm_setup_post_del(mhp->mh_vm_pages,
2408 (comp_code == KPHYSM_OK) ? 0 : 1);
2409 mutex_enter(&mhp->mh_mutex);
/* The KPHYSM_ENOTVIABLE path jumps here, skipping kphysm_setup_post_del(). */
2411 early_exit:
2412 /* mhp->mh_mutex exited by CALLB_CPR_EXIT() */
2413 mhp->mh_state = MHND_DONE;
2414 del_complete_funcp = mhp->mh_delete_complete;
2415 del_complete_arg = mhp->mh_delete_complete_arg;
2416 CALLB_CPR_EXIT(&cprinfo);
2417 (*del_complete_funcp)(del_complete_arg, comp_code);
2418 thread_exit();
2419 /*NOTREACHED*/
2423 * Start the delete of the memory from the system.
2426 kphysm_del_start(
2427 memhandle_t handle,
2428 void (*complete)(void *, int),
2429 void *complete_arg)
2431 struct mem_handle *mhp;
2433 mhp = kphysm_lookup_mem_handle(handle);
2434 if (mhp == NULL) {
2435 return (KPHYSM_EHANDLE);
2437 switch (mhp->mh_state) {
2438 case MHND_FREE:
2439 ASSERT(mhp->mh_state != MHND_FREE);
2440 mutex_exit(&mhp->mh_mutex);
2441 return (KPHYSM_EHANDLE);
2442 case MHND_INIT:
2443 break;
2444 case MHND_STARTING:
2445 case MHND_RUNNING:
2446 mutex_exit(&mhp->mh_mutex);
2447 return (KPHYSM_ESEQUENCE);
2448 case MHND_DONE:
2449 mutex_exit(&mhp->mh_mutex);
2450 return (KPHYSM_ESEQUENCE);
2451 case MHND_RELEASE:
2452 mutex_exit(&mhp->mh_mutex);
2453 return (KPHYSM_ESEQUENCE);
2454 default:
2455 #ifdef DEBUG
2456 cmn_err(CE_WARN, "kphysm_del_start(0x%p) state corrupt %d",
2457 (void *)mhp, mhp->mh_state);
2458 #endif /* DEBUG */
2459 mutex_exit(&mhp->mh_mutex);
2460 return (KPHYSM_EHANDLE);
2463 if (mhp->mh_transit.trl_spans == NULL) {
2464 mutex_exit(&mhp->mh_mutex);
2465 return (KPHYSM_ENOWORK);
2468 ASSERT(complete != NULL);
2469 mhp->mh_delete_complete = complete;
2470 mhp->mh_delete_complete_arg = complete_arg;
2471 mhp->mh_state = MHND_STARTING;
2473 * Release the mutex in case thread_create sleeps.
2475 mutex_exit(&mhp->mh_mutex);
2478 * The "obvious" process for this thread is pageout (proc_pageout)
2479 * but this gives the thread too much power over freemem
2480 * which results in freemem starvation.
2482 (void) thread_create(NULL, 0, delete_memory_thread, mhp, 0, &p0,
2483 TS_RUN, maxclsyspri - 1);
2485 return (KPHYSM_OK);
2488 static kmutex_t pp_dummy_lock; /* Protects init. of pp_dummy. */
2489 static caddr_t pp_dummy;
2490 static pgcnt_t pp_dummy_npages;
2491 static pfn_t *pp_dummy_pfn; /* Array of dummy pfns. */
2493 static void
2494 memseg_remap_init_pages(page_t *pages, page_t *epages)
2496 page_t *pp;
2498 for (pp = pages; pp < epages; pp++) {
2499 pp->p_pagenum = PFN_INVALID; /* XXXX */
2500 pp->p_offset = (u_offset_t)-1;
2501 page_iolock_init(pp);
2502 while (!page_lock(pp, SE_EXCL, (kmutex_t *)NULL, P_RECLAIM))
2503 continue;
2504 page_lock_delete(pp);
2508 void
2509 memseg_remap_init()
2511 mutex_enter(&pp_dummy_lock);
2512 if (pp_dummy == NULL) {
2513 uint_t dpages;
2514 int i;
2517 * dpages starts off as the size of the structure and
2518 * ends up as the minimum number of pages that will
2519 * hold a whole number of page_t structures.
2521 dpages = sizeof (page_t);
2522 ASSERT(dpages != 0);
2523 ASSERT(dpages <= MMU_PAGESIZE);
2525 while ((dpages & 1) == 0)
2526 dpages >>= 1;
2528 pp_dummy_npages = dpages;
2530 * Allocate pp_dummy pages directly from static_arena,
2531 * since these are whole page allocations and are
2532 * referenced by physical address. This also has the
2533 * nice fringe benefit of hiding the memory from
2534 * ::findleaks since it doesn't deal well with allocated
2535 * kernel heap memory that doesn't have any mappings.
2537 pp_dummy = vmem_xalloc(static_arena, ptob(pp_dummy_npages),
2538 PAGESIZE, 0, 0, NULL, NULL, VM_SLEEP);
2539 bzero(pp_dummy, ptob(pp_dummy_npages));
2540 ASSERT(((uintptr_t)pp_dummy & MMU_PAGEOFFSET) == 0);
2541 pp_dummy_pfn = kmem_alloc(sizeof (*pp_dummy_pfn) *
2542 pp_dummy_npages, KM_SLEEP);
2543 for (i = 0; i < pp_dummy_npages; i++) {
2544 pp_dummy_pfn[i] = hat_getpfnum(kas.a_hat,
2545 &pp_dummy[MMU_PAGESIZE * i]);
2546 ASSERT(pp_dummy_pfn[i] != PFN_INVALID);
2549 * Initialize the page_t's to a known 'deleted' state
2550 * that matches the state of deleted pages.
2552 memseg_remap_init_pages((page_t *)pp_dummy,
2553 (page_t *)(pp_dummy + ptob(pp_dummy_npages)));
2554 /* Remove kmem mappings for the pages for safety. */
2555 hat_unload(kas.a_hat, pp_dummy, ptob(pp_dummy_npages),
2556 HAT_UNLOAD_UNLOCK);
2557 /* Leave pp_dummy pointer set as flag that init is done. */
2559 mutex_exit(&pp_dummy_lock);
2563 * Remap a page-aglined range of page_t's to dummy pages.
2565 void
2566 remap_to_dummy(caddr_t va, pgcnt_t metapgs)
2568 int phase;
2570 ASSERT(IS_P2ALIGNED((uint64_t)(uintptr_t)va, PAGESIZE));
2573 * We may start remapping at a non-zero page offset
2574 * within the dummy pages since the low/high ends
2575 * of the outgoing pp's could be shared by other
2576 * memsegs (see memseg_remap_meta).
2578 phase = btop((uint64_t)(uintptr_t)va) % pp_dummy_npages;
2579 /*CONSTCOND*/
2580 ASSERT(PAGESIZE % sizeof (page_t) || phase == 0);
2582 while (metapgs != 0) {
2583 pgcnt_t n;
2584 int i, j;
2586 n = pp_dummy_npages;
2587 if (n > metapgs)
2588 n = metapgs;
2589 for (i = 0; i < n; i++) {
2590 j = (i + phase) % pp_dummy_npages;
2591 hat_devload(kas.a_hat, va, ptob(1), pp_dummy_pfn[j],
2592 PROT_READ,
2593 HAT_LOAD | HAT_LOAD_NOCONSIST |
2594 HAT_LOAD_REMAP);
2595 va += ptob(1);
2597 metapgs -= n;
2601 static void
2602 memseg_remap_to_dummy(struct memseg *seg)
2604 caddr_t pp;
2605 pgcnt_t metapgs;
2607 ASSERT(memseg_is_dynamic(seg));
2608 ASSERT(pp_dummy != NULL);
2611 if (!memseg_includes_meta(seg)) {
2612 memseg_remap_meta(seg);
2613 return;
2616 pp = (caddr_t)seg->pages;
2617 metapgs = seg->pages_base - memseg_get_start(seg);
2618 ASSERT(metapgs != 0);
2620 seg->pages_end = seg->pages_base;
2622 remap_to_dummy(pp, metapgs);
2626 * Transition all the deleted pages to the deleted state so that
2627 * page_lock will not wait. The page_lock_delete call will
2628 * also wake up any waiters.
2630 static void
2631 memseg_lock_delete_all(struct memseg *seg)
2633 page_t *pp;
2635 for (pp = seg->pages; pp < seg->epages; pp++) {
2636 pp->p_pagenum = PFN_INVALID; /* XXXX */
2637 page_lock_delete(pp);
2641 static void
2642 kphysm_del_cleanup(struct mem_handle *mhp)
2644 struct memdelspan *mdsp;
2645 struct memseg *seg;
2646 struct memseg **segpp;
2647 struct memseg *seglist;
2648 pfn_t p_end;
2649 uint64_t avmem;
2650 pgcnt_t avpgs;
2651 pgcnt_t npgs;
2653 avpgs = mhp->mh_vm_pages;
2655 memsegs_lock(1);
2658 * remove from main segment list.
2660 npgs = 0;
2661 seglist = NULL;
2662 for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
2663 mdsp = mdsp->mds_next) {
2664 p_end = mdsp->mds_base + mdsp->mds_npgs;
2665 for (segpp = &memsegs; (seg = *segpp) != NULL; ) {
2666 if (seg->pages_base >= p_end ||
2667 seg->pages_end <= mdsp->mds_base) {
2668 /* Span and memseg don't overlap. */
2669 segpp = &((*segpp)->next);
2670 continue;
2672 ASSERT(seg->pages_base >= mdsp->mds_base);
2673 ASSERT(seg->pages_end <= p_end);
2675 PLCNT_MODIFY_MAX(seg->pages_base,
2676 seg->pages_base - seg->pages_end);
2678 /* Hide the memseg from future scans. */
2679 hat_kpm_delmem_mseg_update(seg, segpp);
2680 *segpp = seg->next;
2681 membar_producer(); /* TODO: Needed? */
2682 npgs += MSEG_NPAGES(seg);
2685 * Leave the deleted segment's next pointer intact
2686 * in case a memsegs scanning loop is walking this
2687 * segment concurrently.
2689 seg->lnext = seglist;
2690 seglist = seg;
2694 build_pfn_hash();
2696 ASSERT(npgs < total_pages);
2697 total_pages -= npgs;
2700 * Recalculate the paging parameters now total_pages has changed.
2701 * This will also cause the clock hands to be reset before next use.
2703 setupclock();
2705 memsegs_unlock(1);
2707 mutex_exit(&mhp->mh_mutex);
2709 while ((seg = seglist) != NULL) {
2710 pfn_t mseg_start;
2711 pfn_t mseg_base, mseg_end;
2712 pgcnt_t mseg_npgs;
2713 int mlret;
2715 seglist = seg->lnext;
2718 * Put the page_t's into the deleted state to stop
2719 * cv_wait()s on the pages. When we remap, the dummy
2720 * page_t's will be in the same state.
2722 memseg_lock_delete_all(seg);
2724 * Collect up information based on pages_base and pages_end
2725 * early so that we can flag early that the memseg has been
2726 * deleted by setting pages_end == pages_base.
2728 mseg_base = seg->pages_base;
2729 mseg_end = seg->pages_end;
2730 mseg_npgs = MSEG_NPAGES(seg);
2731 mseg_start = memseg_get_start(seg);
2733 if (memseg_is_dynamic(seg)) {
2734 /* Remap the meta data to our special dummy area. */
2735 memseg_remap_to_dummy(seg);
2737 mutex_enter(&memseg_lists_lock);
2738 seg->lnext = memseg_va_avail;
2739 memseg_va_avail = seg;
2740 mutex_exit(&memseg_lists_lock);
2741 } else {
2743 * For memory whose page_ts were allocated
2744 * at boot, we need to find a new use for
2745 * the page_t memory.
2746 * For the moment, just leak it.
2747 * (It is held in the memseg_delete_junk list.)
2749 seg->pages_end = seg->pages_base;
2751 mutex_enter(&memseg_lists_lock);
2752 seg->lnext = memseg_delete_junk;
2753 memseg_delete_junk = seg;
2754 mutex_exit(&memseg_lists_lock);
2757 /* Must not use seg now as it could be re-used. */
2759 memlist_write_lock();
2761 mlret = memlist_delete_span(
2762 (uint64_t)(mseg_base) << PAGESHIFT,
2763 (uint64_t)(mseg_npgs) << PAGESHIFT,
2764 &phys_avail);
2765 ASSERT(mlret == MEML_SPANOP_OK);
2767 mlret = memlist_delete_span(
2768 (uint64_t)(mseg_start) << PAGESHIFT,
2769 (uint64_t)(mseg_end - mseg_start) <<
2770 PAGESHIFT,
2771 &phys_install);
2772 ASSERT(mlret == MEML_SPANOP_OK);
2773 phys_install_has_changed();
2775 memlist_write_unlock();
2778 memlist_read_lock();
2779 installed_top_size(phys_install, &physmax, &physinstalled);
2780 memlist_read_unlock();
2782 mutex_enter(&freemem_lock);
2783 maxmem -= avpgs;
2784 physmem -= avpgs;
2785 /* availrmem is adjusted during the delete. */
2786 availrmem_initial -= avpgs;
2788 mutex_exit(&freemem_lock);
2790 dump_resize();
2792 cmn_err(CE_CONT, "?kphysm_delete: mem = %ldK "
2793 "(0x%" PRIx64 ")\n",
2794 physinstalled << (PAGESHIFT - 10),
2795 (uint64_t)physinstalled << PAGESHIFT);
2797 avmem = (uint64_t)freemem << PAGESHIFT;
2798 cmn_err(CE_CONT, "?kphysm_delete: "
2799 "avail mem = %" PRId64 "\n", avmem);
2802 * Update lgroup generation number on single lgroup systems
2804 if (nlgrps == 1)
2805 lgrp_config(LGRP_CONFIG_GEN_UPDATE, 0, 0);
2807 /* Successfully deleted system memory */
2808 mutex_enter(&mhp->mh_mutex);
2811 static uint_t mdel_nullvp_waiter;
2813 static void
2814 page_delete_collect(
2815 page_t *pp,
2816 struct mem_handle *mhp)
2818 if (pp->p_vnode) {
2819 page_hashout(pp, (kmutex_t *)NULL);
2820 /* do not do PP_SETAGED(pp); */
2821 } else {
2822 kmutex_t *sep;
2824 sep = page_se_mutex(pp);
2825 mutex_enter(sep);
2826 if (CV_HAS_WAITERS(&pp->p_cv)) {
2827 mdel_nullvp_waiter++;
2828 cv_broadcast(&pp->p_cv);
2830 mutex_exit(sep);
2832 ASSERT(pp->p_next == pp->p_prev);
2833 ASSERT(pp->p_next == NULL || pp->p_next == pp);
2834 pp->p_next = mhp->mh_deleted;
2835 mhp->mh_deleted = pp;
2836 ASSERT(mhp->mh_hold_todo != 0);
2837 mhp->mh_hold_todo--;
2840 static void
2841 transit_list_collect(struct mem_handle *mhp, int v)
2843 struct transit_list_head *trh;
2845 trh = &transit_list_head;
2846 mutex_enter(&trh->trh_lock);
2847 mhp->mh_transit.trl_collect = v;
2848 mutex_exit(&trh->trh_lock);
2851 static void
2852 transit_list_insert(struct transit_list *tlp)
2854 struct transit_list_head *trh;
2856 trh = &transit_list_head;
2857 ASSERT(MUTEX_HELD(&trh->trh_lock));
2858 tlp->trl_next = trh->trh_head;
2859 trh->trh_head = tlp;
2862 static void
2863 transit_list_remove(struct transit_list *tlp)
2865 struct transit_list_head *trh;
2866 struct transit_list **tlpp;
2868 trh = &transit_list_head;
2869 tlpp = &trh->trh_head;
2870 ASSERT(MUTEX_HELD(&trh->trh_lock));
2871 while (*tlpp != NULL && *tlpp != tlp)
2872 tlpp = &(*tlpp)->trl_next;
2873 ASSERT(*tlpp != NULL);
2874 if (*tlpp == tlp)
2875 *tlpp = tlp->trl_next;
2876 tlp->trl_next = NULL;
2879 static struct transit_list *
2880 pfnum_to_transit_list(struct transit_list_head *trh, pfn_t pfnum)
2882 struct transit_list *tlp;
2884 for (tlp = trh->trh_head; tlp != NULL; tlp = tlp->trl_next) {
2885 struct memdelspan *mdsp;
2887 for (mdsp = tlp->trl_spans; mdsp != NULL;
2888 mdsp = mdsp->mds_next) {
2889 if (pfnum >= mdsp->mds_base &&
2890 pfnum < (mdsp->mds_base + mdsp->mds_npgs)) {
2891 return (tlp);
2895 return (NULL);
2899 pfn_is_being_deleted(pfn_t pfnum)
2901 struct transit_list_head *trh;
2902 struct transit_list *tlp;
2903 int ret;
2905 trh = &transit_list_head;
2906 if (trh->trh_head == NULL)
2907 return (0);
2909 mutex_enter(&trh->trh_lock);
2910 tlp = pfnum_to_transit_list(trh, pfnum);
2911 ret = (tlp != NULL && tlp->trl_collect);
2912 mutex_exit(&trh->trh_lock);
2914 return (ret);
#ifdef MEM_DEL_STATS
extern int hz;

/*
 * Print the delete statistics gathered in mhp->mh_delstat.
 * Nothing is printed unless mem_del_stat_print is non-zero.
 */
static void
mem_del_stat_print_func(struct mem_handle *mhp)
{
	uint64_t secs;

	if (!mem_del_stat_print)
		return;

	printf("memory delete loop %x/%x, statistics%s\n",
	    (uint_t)mhp->mh_transit.trl_spans->mds_base,
	    (uint_t)mhp->mh_transit.trl_spans->mds_npgs,
	    (mhp->mh_cancel ? " (cancelled)" : ""));
	printf("\t%8u nloop\n", mhp->mh_delstat.nloop);
	printf("\t%8u need_free\n", mhp->mh_delstat.need_free);
	printf("\t%8u free_loop\n", mhp->mh_delstat.free_loop);
	printf("\t%8u free_low\n", mhp->mh_delstat.free_low);
	printf("\t%8u free_failed\n", mhp->mh_delstat.free_failed);
	printf("\t%8u ncheck\n", mhp->mh_delstat.ncheck);
	printf("\t%8u nopaget\n", mhp->mh_delstat.nopaget);
	printf("\t%8u lockfail\n", mhp->mh_delstat.lockfail);
	printf("\t%8u nfree\n", mhp->mh_delstat.nfree);
	printf("\t%8u nreloc\n", mhp->mh_delstat.nreloc);
	printf("\t%8u nrelocfail\n", mhp->mh_delstat.nrelocfail);
	printf("\t%8u already_done\n", mhp->mh_delstat.already_done);
	printf("\t%8u first_notfree\n", mhp->mh_delstat.first_notfree);
	printf("\t%8u npplocked\n", mhp->mh_delstat.npplocked);
	printf("\t%8u nlockreloc\n", mhp->mh_delstat.nlockreloc);
	printf("\t%8u nnorepl\n", mhp->mh_delstat.nnorepl);
	printf("\t%8u nmodreloc\n", mhp->mh_delstat.nmodreloc);
	printf("\t%8u ndestroy\n", mhp->mh_delstat.ndestroy);
	printf("\t%8u nputpage\n", mhp->mh_delstat.nputpage);
	printf("\t%8u nnoreclaim\n", mhp->mh_delstat.nnoreclaim);
	printf("\t%8u ndelay\n", mhp->mh_delstat.ndelay);
	printf("\t%8u demotefail\n", mhp->mh_delstat.demotefail);
	printf("\t%8u retired\n", mhp->mh_delstat.retired);
	printf("\t%8u toxic\n", mhp->mh_delstat.toxic);
	printf("\t%8u failing\n", mhp->mh_delstat.failing);
	printf("\t%8u modtoxic\n", mhp->mh_delstat.modtoxic);
	printf("\t%8u npplkdtoxic\n", mhp->mh_delstat.npplkdtoxic);
	printf("\t%8u gptlmodfail\n", mhp->mh_delstat.gptlmodfail);
	printf("\t%8u gptllckfail\n", mhp->mh_delstat.gptllckfail);

	/* Tick totals, also shown as minutes and seconds. */
	secs = mhp->mh_delstat.nticks_total / hz;	/* seconds */
	printf(
	    "\t%"PRIu64" nticks_total - %"PRIu64" min %"PRIu64" sec\n",
	    mhp->mh_delstat.nticks_total, secs / 60, secs % 60);

	secs = mhp->mh_delstat.nticks_pgrp / hz;	/* seconds */
	printf(
	    "\t%"PRIu64" nticks_pgrp - %"PRIu64" min %"PRIu64" sec\n",
	    mhp->mh_delstat.nticks_pgrp, secs / 60, secs % 60);
}
#endif /* MEM_DEL_STATS */
2971 struct mem_callback {
2972 kphysm_setup_vector_t *vec;
2973 void *arg;
2976 #define NMEMCALLBACKS 100
2978 static struct mem_callback mem_callbacks[NMEMCALLBACKS];
2979 static uint_t nmemcallbacks;
2980 static krwlock_t mem_callback_rwlock;
2983 kphysm_setup_func_register(kphysm_setup_vector_t *vec, void *arg)
2985 uint_t i, found;
2988 * This test will become more complicated when the version must
2989 * change.
2991 if (vec->version != KPHYSM_SETUP_VECTOR_VERSION)
2992 return (EINVAL);
2994 if (vec->post_add == NULL || vec->pre_del == NULL ||
2995 vec->post_del == NULL)
2996 return (EINVAL);
2998 rw_enter(&mem_callback_rwlock, RW_WRITER);
2999 for (i = 0, found = 0; i < nmemcallbacks; i++) {
3000 if (mem_callbacks[i].vec == NULL && found == 0)
3001 found = i + 1;
3002 if (mem_callbacks[i].vec == vec &&
3003 mem_callbacks[i].arg == arg) {
3004 #ifdef DEBUG
3005 /* Catch this in DEBUG kernels. */
3006 cmn_err(CE_WARN, "kphysm_setup_func_register"
3007 "(0x%p, 0x%p) duplicate registration from 0x%p",
3008 (void *)vec, arg, (void *)caller());
3009 #endif /* DEBUG */
3010 rw_exit(&mem_callback_rwlock);
3011 return (EEXIST);
3014 if (found != 0) {
3015 i = found - 1;
3016 } else {
3017 ASSERT(nmemcallbacks < NMEMCALLBACKS);
3018 if (nmemcallbacks == NMEMCALLBACKS) {
3019 rw_exit(&mem_callback_rwlock);
3020 return (ENOMEM);
3022 i = nmemcallbacks++;
3024 mem_callbacks[i].vec = vec;
3025 mem_callbacks[i].arg = arg;
3026 rw_exit(&mem_callback_rwlock);
3027 return (0);
3030 void
3031 kphysm_setup_func_unregister(kphysm_setup_vector_t *vec, void *arg)
3033 uint_t i;
3035 rw_enter(&mem_callback_rwlock, RW_WRITER);
3036 for (i = 0; i < nmemcallbacks; i++) {
3037 if (mem_callbacks[i].vec == vec &&
3038 mem_callbacks[i].arg == arg) {
3039 mem_callbacks[i].vec = NULL;
3040 mem_callbacks[i].arg = NULL;
3041 if (i == (nmemcallbacks - 1))
3042 nmemcallbacks--;
3043 break;
3046 rw_exit(&mem_callback_rwlock);
3049 static void
3050 kphysm_setup_post_add(pgcnt_t delta_pages)
3052 uint_t i;
3054 rw_enter(&mem_callback_rwlock, RW_READER);
3055 for (i = 0; i < nmemcallbacks; i++) {
3056 if (mem_callbacks[i].vec != NULL) {
3057 (*mem_callbacks[i].vec->post_add)
3058 (mem_callbacks[i].arg, delta_pages);
3061 rw_exit(&mem_callback_rwlock);
3065 * Note the locking between pre_del and post_del: The reader lock is held
3066 * between the two calls to stop the set of functions from changing.
3069 static int
3070 kphysm_setup_pre_del(pgcnt_t delta_pages)
3072 uint_t i;
3073 int ret;
3074 int aret;
3076 ret = 0;
3077 rw_enter(&mem_callback_rwlock, RW_READER);
3078 for (i = 0; i < nmemcallbacks; i++) {
3079 if (mem_callbacks[i].vec != NULL) {
3080 aret = (*mem_callbacks[i].vec->pre_del)
3081 (mem_callbacks[i].arg, delta_pages);
3082 ret |= aret;
3086 return (ret);
3089 static void
3090 kphysm_setup_post_del(pgcnt_t delta_pages, int cancelled)
3092 uint_t i;
3094 for (i = 0; i < nmemcallbacks; i++) {
3095 if (mem_callbacks[i].vec != NULL) {
3096 (*mem_callbacks[i].vec->post_del)
3097 (mem_callbacks[i].arg, delta_pages, cancelled);
3100 rw_exit(&mem_callback_rwlock);
3103 static int
3104 kphysm_split_memseg(
3105 pfn_t base,
3106 pgcnt_t npgs)
3108 struct memseg *seg;
3109 struct memseg **segpp;
3110 pgcnt_t size_low, size_high;
3111 struct memseg *seg_low, *seg_mid, *seg_high;
3114 * Lock the memsegs list against other updates now
3116 memsegs_lock(1);
3119 * Find boot time memseg that wholly covers this area.
3122 /* First find the memseg with page 'base' in it. */
3123 for (segpp = &memsegs; (seg = *segpp) != NULL;
3124 segpp = &((*segpp)->next)) {
3125 if (base >= seg->pages_base && base < seg->pages_end)
3126 break;
3128 if (seg == NULL) {
3129 memsegs_unlock(1);
3130 return (0);
3132 if (memseg_includes_meta(seg)) {
3133 memsegs_unlock(1);
3134 return (0);
3136 if ((base + npgs) > seg->pages_end) {
3137 memsegs_unlock(1);
3138 return (0);
3142 * Work out the size of the two segments that will
3143 * surround the new segment, one for low address
3144 * and one for high.
3146 ASSERT(base >= seg->pages_base);
3147 size_low = base - seg->pages_base;
3148 ASSERT(seg->pages_end >= (base + npgs));
3149 size_high = seg->pages_end - (base + npgs);
3152 * Sanity check.
3154 if ((size_low + size_high) == 0) {
3155 memsegs_unlock(1);
3156 return (0);
3160 * Allocate the new structures. The old memseg will not be freed
3161 * as there may be a reference to it.
3163 seg_low = NULL;
3164 seg_high = NULL;
3166 if (size_low != 0)
3167 seg_low = memseg_alloc();
3169 seg_mid = memseg_alloc();
3171 if (size_high != 0)
3172 seg_high = memseg_alloc();
3175 * All allocation done now.
3177 if (size_low != 0) {
3178 seg_low->pages = seg->pages;
3179 seg_low->epages = seg_low->pages + size_low;
3180 seg_low->pages_base = seg->pages_base;
3181 seg_low->pages_end = seg_low->pages_base + size_low;
3182 seg_low->next = seg_mid;
3183 seg_low->msegflags = seg->msegflags;
3185 if (size_high != 0) {
3186 seg_high->pages = seg->epages - size_high;
3187 seg_high->epages = seg_high->pages + size_high;
3188 seg_high->pages_base = seg->pages_end - size_high;
3189 seg_high->pages_end = seg_high->pages_base + size_high;
3190 seg_high->next = seg->next;
3191 seg_high->msegflags = seg->msegflags;
3194 seg_mid->pages = seg->pages + size_low;
3195 seg_mid->pages_base = seg->pages_base + size_low;
3196 seg_mid->epages = seg->epages - size_high;
3197 seg_mid->pages_end = seg->pages_end - size_high;
3198 seg_mid->next = (seg_high != NULL) ? seg_high : seg->next;
3199 seg_mid->msegflags = seg->msegflags;
3202 * Update hat_kpm specific info of all involved memsegs and
3203 * allow hat_kpm specific global chain updates.
3205 hat_kpm_split_mseg_update(seg, segpp, seg_low, seg_mid, seg_high);
3208 * At this point we have two equivalent memseg sub-chains,
3209 * seg and seg_low/seg_mid/seg_high, which both chain on to
3210 * the same place in the global chain. By re-writing the pointer
3211 * in the previous element we switch atomically from using the old
3212 * (seg) to the new.
3214 *segpp = (seg_low != NULL) ? seg_low : seg_mid;
3216 membar_enter();
3218 build_pfn_hash();
3219 memsegs_unlock(1);
3222 * We leave the old segment, 'seg', intact as there may be
3223 * references to it. Also, as the value of total_pages has not
3224 * changed and the memsegs list is effectively the same when
3225 * accessed via the old or the new pointer, we do not have to
3226 * cause pageout_scanner() to re-evaluate its hand pointers.
3228 * We currently do not re-use or reclaim the page_t memory.
3229 * If we do, then this may have to change.
3232 mutex_enter(&memseg_lists_lock);
3233 seg->lnext = memseg_edit_junk;
3234 memseg_edit_junk = seg;
3235 mutex_exit(&memseg_lists_lock);
3237 return (1);
3241 * The sfmmu hat layer (e.g.) accesses some parts of the memseg
3242 * structure using physical addresses. Therefore a kmem_cache is
3243 * used with KMC_NOHASH to avoid page crossings within a memseg
3244 * structure. KMC_NOHASH requires that no external (outside of
3245 * slab) information is allowed. This, in turn, implies that the
3246 * cache's slabsize must be exactly a single page, since per-slab
3247 * information (e.g. the freelist for the slab) is kept at the
3248 * end of the slab, where it is easy to locate. Should be changed
3249 * when a more obvious kmem_cache interface/flag will become
3250 * available.
3252 void
3253 mem_config_init()
3255 memseg_cache = kmem_cache_create("memseg_cache", sizeof (struct memseg),
3256 0, NULL, NULL, NULL, NULL, static_arena, KMC_NOHASH);
3259 struct memseg *
3260 memseg_alloc()
3262 struct memseg *seg;
3264 seg = kmem_cache_alloc(memseg_cache, KM_SLEEP);
3265 bzero(seg, sizeof (struct memseg));
3267 return (seg);
3271 * Return whether the page_t memory for this memseg
3272 * is included in the memseg itself.
3274 static int
3275 memseg_includes_meta(struct memseg *seg)
3277 return (seg->msegflags & MEMSEG_META_INCL);
3280 pfn_t
3281 memseg_get_start(struct memseg *seg)
3283 pfn_t pt_start;
3285 if (memseg_includes_meta(seg)) {
3286 pt_start = hat_getpfnum(kas.a_hat, (caddr_t)seg->pages);
3288 /* Meta data is required to be at the beginning */
3289 ASSERT(pt_start < seg->pages_base);
3290 } else
3291 pt_start = seg->pages_base;
3293 return (pt_start);
3297 * Invalidate memseg pointers in cpu private vm data caches.
3299 static void
3300 memseg_cpu_vm_flush()
3302 cpu_t *cp;
3303 vm_cpu_data_t *vc;
3305 mutex_enter(&cpu_lock);
3306 pause_cpus(NULL, NULL);
3308 cp = cpu_list;
3309 do {
3310 vc = cp->cpu_vm_data;
3311 vc->vc_pnum_memseg = NULL;
3312 vc->vc_pnext_memseg = NULL;
3314 } while ((cp = cp->cpu_next) != cpu_list);
3316 start_cpus();
3317 mutex_exit(&cpu_lock);