/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 * Copyright 2017 Joyent, Inc.
 */
#include <sys/types.h>
#include <sys/cmn_err.h>
#include <sys/systm.h>
#include <sys/machsystm.h>	/* for page_freelist_coalesce() */
#include <sys/errno.h>
#include <sys/memnode.h>
#include <sys/memlist.h>
#include <sys/memlist_impl.h>
#include <sys/tuneable.h>
#include <sys/debug.h>
#include <sys/callb.h>
#include <sys/memlist_plat.h>	/* for installed_top_size() */
#include <sys/condvar_impl.h>	/* for CV_HAS_WAITERS() */
#include <sys/dumphdr.h>	/* for dump_resize() */
#include <sys/atomic.h>		/* for use in stats collection */
#include <sys/rwlock.h>
#include <sys/cpuvar.h>
#include <vm/seg_kmem.h>
#include <vm/seg_kpm.h>
#include <vm/vm_dep.h>
#define	SUNDDI_IMPL	/* so sunddi.h will not redefine splx() et al */
#include <sys/sunddi.h>
#include <sys/mem_config.h>
#include <sys/mem_cage.h>
#include <sys/modctl.h>
extern struct memlist *phys_avail;

extern uint_t page_ctrs_adjust(int);
void page_ctrs_cleanup(void);
static void kphysm_setup_post_add(pgcnt_t);
static int kphysm_setup_pre_del(pgcnt_t);
static void kphysm_setup_post_del(pgcnt_t, int);

static int kphysm_split_memseg(pfn_t base, pgcnt_t npgs);

static int delspan_reserve(pfn_t, pgcnt_t);
static void delspan_unreserve(pfn_t, pgcnt_t);

kmutex_t memseg_lists_lock;
struct memseg *memseg_va_avail;
struct memseg *memseg_alloc(void);
static struct memseg *memseg_delete_junk;
static struct memseg *memseg_edit_junk;
void memseg_remap_init(void);
static void memseg_remap_to_dummy(struct memseg *);
static void kphysm_addmem_error_undospan(pfn_t, pgcnt_t);
static struct memseg *memseg_reuse(pgcnt_t);

static struct kmem_cache *memseg_cache;

/*
 * Interfaces to manage externally allocated
 * page_t memory (metadata) for a memseg.
 */
#pragma weak	memseg_alloc_meta
#pragma weak	memseg_free_meta
#pragma weak	memseg_get_metapfn
#pragma weak	memseg_remap_meta

extern int ppvm_enable;
extern page_t *ppvm_base;
extern int memseg_alloc_meta(pfn_t, pgcnt_t, void **, pgcnt_t *);
extern void memseg_free_meta(void *, pgcnt_t);
extern pfn_t memseg_get_metapfn(void *, pgcnt_t);
extern void memseg_remap_meta(struct memseg *);
static int memseg_is_dynamic(struct memseg *);
static int memseg_includes_meta(struct memseg *);
pfn_t memseg_get_start(struct memseg *);
static void memseg_cpu_vm_flush(void);

int meta_alloc_enable;

#ifdef DEBUG
static int memseg_debug;
#define	MEMSEG_DEBUG(args...)	if (memseg_debug) printf(args)
#else
#define	MEMSEG_DEBUG(...)
#endif
/*
 * Add a chunk of memory to the system.
 * base: starting PAGESIZE page of new memory.
 * npgs: length in PAGESIZE pages.
 *
 * Adding mem this way doesn't increase the size of the hash tables;
 * growing them would be too hard.  This should be OK, but adding memory
 * dynamically most likely means more hash misses, since the tables will
 * be smaller than they otherwise would be.
 */
int
kphysm_add_memory_dynamic(pfn_t base, pgcnt_t npgs)
{
	page_t *pp;
	page_t *opp, *oepp, *segpp;
	struct memseg *seg;
	uint64_t avmem;
	pfn_t pfn;
	pfn_t pt_base = base;
	pgcnt_t tpgs = npgs;
	pgcnt_t metapgs = 0;
	pgcnt_t pnum;
	int exhausted;
	int mnode;
	caddr_t vaddr;
	int reuse;
	int mlret;
	int rv;
	int flags;
	int meta_alloc = 0;
	void *mapva;
	void *metabase = (void *)base;
	pgcnt_t nkpmpgs = 0;
	offset_t kpm_pages_off = 0;

	cmn_err(CE_CONT,
	    "?kphysm_add_memory_dynamic: adding %ldK at 0x%" PRIx64 "\n",
	    npgs << (PAGESHIFT - 10), (uint64_t)base << PAGESHIFT);
	/*
	 * Add this span in the delete list to prevent interactions.
	 */
	if (!delspan_reserve(base, npgs)) {
		return (KPHYSM_ESPAN);
	}

	/*
	 * Check to see if any of the memory span has been added
	 * by trying an add to the installed memory list. This
	 * forms the interlocking process for add.
	 */

	memlist_write_lock();

	mlret = memlist_add_span((uint64_t)(pt_base) << PAGESHIFT,
	    (uint64_t)(tpgs) << PAGESHIFT, &phys_install);

	if (mlret == MEML_SPANOP_OK)
		installed_top_size(phys_install, &physmax, &physinstalled);

	memlist_write_unlock();

	if (mlret != MEML_SPANOP_OK) {
		if (mlret == MEML_SPANOP_EALLOC) {
			delspan_unreserve(pt_base, tpgs);
			return (KPHYSM_ERESOURCE);
		} else if (mlret == MEML_SPANOP_ESPAN) {
			delspan_unreserve(pt_base, tpgs);
			return (KPHYSM_ESPAN);
		} else {
			delspan_unreserve(pt_base, tpgs);
			return (KPHYSM_ERESOURCE);
		}
	}
	if (meta_alloc_enable) {
		/*
		 * Allocate the page_t's from existing memory;
		 * if that fails, allocate from the incoming memory.
		 */
		rv = memseg_alloc_meta(base, npgs, &metabase, &metapgs);
		if (rv == KPHYSM_OK) {
			ASSERT(metapgs != 0);
			ASSERT(btopr(npgs * sizeof (page_t)) <= metapgs);
			meta_alloc = 1;
			goto mapalloc;
		}
	}

	/*
	 * We store the page_t's for this new memory in the first
	 * few pages of the chunk. Here, we go and get'em ...
	 */

	/*
	 * The expression after the '-' gives the number of pages
	 * that will fit in the new memory based on a requirement
	 * of (PAGESIZE + sizeof (page_t)) bytes per page.
	 */
	metapgs = npgs - (((uint64_t)(npgs) << PAGESHIFT) /
	    (PAGESIZE + sizeof (page_t)));
	npgs -= metapgs;
	base += metapgs;

	ASSERT(btopr(npgs * sizeof (page_t)) <= metapgs);

	exhausted = (metapgs == 0 || npgs == 0);
	if (kpm_enable && !exhausted) {
		pgcnt_t start, end, nkpmpgs_prelim;
		size_t ptsz;

		/*
		 * A viable kpm large page mapping must not overlap two
		 * dynamic memsegs. Therefore the total size is checked
		 * to be at least kpm_pgsz and also whether start and end
		 * points are at least kpm_pgsz aligned.
		 */
		if (ptokpmp(tpgs) < 1 || pmodkpmp(pt_base) ||
		    pmodkpmp(base + npgs)) {

			kphysm_addmem_error_undospan(pt_base, tpgs);

			/*
			 * There is no specific error code for violating
			 * kpm granularity constraints.
			 */
			return (KPHYSM_ENOTVIABLE);
		}

		start = kpmptop(ptokpmp(base));
		end = kpmptop(ptokpmp(base + npgs));
		nkpmpgs_prelim = ptokpmp(end - start);
		ptsz = npgs * sizeof (page_t);
		metapgs = btopr(ptsz + nkpmpgs_prelim * KPMPAGE_T_SZ);
		exhausted = (tpgs <= metapgs);
		if (!exhausted) {
			npgs = tpgs - metapgs;
			base = pt_base + metapgs;

			/* final nkpmpgs */
			start = kpmptop(ptokpmp(base));
			nkpmpgs = ptokpmp(end - start);
			kpm_pages_off = ptsz +
			    (nkpmpgs_prelim - nkpmpgs) * KPMPAGE_T_SZ;
		}
	}

	/*
	 * Is memory area supplied too small?
	 */
	if (exhausted) {
		kphysm_addmem_error_undospan(pt_base, tpgs);
		/*
		 * There is no specific error code for 'too small'.
		 */
		return (KPHYSM_ERESOURCE);
	}

mapalloc:
	/*
	 * We may re-use a previously allocated VA space for the page_ts
	 * eventually, but we need to initialize and lock the pages first.
	 */

	/*
	 * Get an address in the kernel address map, map
	 * the page_t pages and see if we can touch them.
	 */

	mapva = vmem_alloc(heap_arena, ptob(metapgs), VM_NOSLEEP);
	if (mapva == NULL) {
		cmn_err(CE_WARN, "kphysm_add_memory_dynamic:"
		    " Can't allocate VA for page_ts");

		if (meta_alloc)
			memseg_free_meta(metabase, metapgs);
		kphysm_addmem_error_undospan(pt_base, tpgs);

		return (KPHYSM_ERESOURCE);
	}
	pp = mapva;

	if (physmax < (pt_base + tpgs))
		physmax = (pt_base + tpgs);
	/*
	 * In the remapping code we map one page at a time so we must do
	 * the same here to match mapping sizes.
	 */
	pfn = pt_base;
	vaddr = (caddr_t)pp;
	for (pnum = 0; pnum < metapgs; pnum++) {
		if (meta_alloc)
			pfn = memseg_get_metapfn(metabase, (pgcnt_t)pnum);
		hat_devload(kas.a_hat, vaddr, ptob(1), pfn,
		    PROT_READ | PROT_WRITE,
		    HAT_LOAD | HAT_LOAD_LOCK | HAT_LOAD_NOCONSIST);
		pfn++;
		vaddr += ptob(1);
	}
	if (ddi_peek32((dev_info_t *)NULL,
	    (int32_t *)pp, (int32_t *)0) == DDI_FAILURE) {

		cmn_err(CE_WARN, "kphysm_add_memory_dynamic:"
		    " Can't access pp array at 0x%p [phys 0x%lx]",
		    (void *)pp, pt_base);

		hat_unload(kas.a_hat, (caddr_t)pp, ptob(metapgs),
		    HAT_UNLOAD_UNMAP|HAT_UNLOAD_UNLOCK);

		vmem_free(heap_arena, mapva, ptob(metapgs));
		if (meta_alloc)
			memseg_free_meta(metabase, metapgs);
		kphysm_addmem_error_undospan(pt_base, tpgs);

		return (KPHYSM_EFAULT);
	}
	/*
	 * Add this memory slice to its memory node translation.
	 *
	 * Note that right now, each node may have only one slice;
	 * this may change with COD or in larger SSM systems with
	 * nested latency groups, so we must not assume that the
	 * node does not yet exist.
	 *
	 * Note that there may be multiple memory nodes associated with
	 * a single lgrp node on x86 systems.
	 */
	pnum = pt_base + tpgs - 1;
	mem_node_add_range(pt_base, pnum);

	/*
	 * Allocate or resize page counters as necessary to accommodate
	 * the increase in memory pages.
	 */
	mnode = PFN_2_MEM_NODE(pnum);
	PAGE_CTRS_ADJUST(base, npgs, rv);
	if (rv) {

		mem_node_del_range(pt_base, pnum);

		/* cleanup the page counters */
		page_ctrs_cleanup();

		hat_unload(kas.a_hat, (caddr_t)pp, ptob(metapgs),
		    HAT_UNLOAD_UNMAP|HAT_UNLOAD_UNLOCK);

		vmem_free(heap_arena, mapva, ptob(metapgs));
		if (meta_alloc)
			memseg_free_meta(metabase, metapgs);
		kphysm_addmem_error_undospan(pt_base, tpgs);

		return (KPHYSM_ERESOURCE);
	}
	/*
	 * Update the phys_avail memory list.
	 * The phys_install list was done at the start.
	 */

	memlist_write_lock();

	mlret = memlist_add_span((uint64_t)(base) << PAGESHIFT,
	    (uint64_t)(npgs) << PAGESHIFT, &phys_avail);
	ASSERT(mlret == MEML_SPANOP_OK);

	memlist_write_unlock();
	/* See if we can find a memseg to re-use. */
	if (meta_alloc) {
		seg = memseg_reuse(0);
		reuse = 1;	/* force unmapping of temp mapva */
		flags = MEMSEG_DYNAMIC | MEMSEG_META_ALLOC;
		/*
		 * There is a 1:1 fixed relationship between a pfn
		 * and a page_t VA. The pfn is used as an index into
		 * the ppvm_base page_t table in order to calculate
		 * the page_t base address for a given pfn range.
		 */
		segpp = ppvm_base + base;
	} else {
		seg = memseg_reuse(metapgs);
		reuse = (seg != NULL);
		flags = MEMSEG_DYNAMIC | MEMSEG_META_INCL;
		segpp = pp;
	}
	/*
	 * Initialize the memseg structure representing this memory
	 * and add it to the existing list of memsegs. Do some basic
	 * initialization and add the memory to the system.
	 * In order to prevent lock deadlocks, the add_physmem()
	 * code is repeated here, but split into several stages.
	 *
	 * If a memseg is reused, invalidate memseg pointers in
	 * all cpu vm caches.  We need to do this since the check
	 *	pp >= seg->pages && pp < seg->epages
	 * used in various places is not atomic and so the first compare
	 * can happen before reuse and the second compare after reuse.
	 * The invalidation ensures that a memseg is not dereferenced while
	 * its page/pfn pointers are changing.
	 */
	if (seg == NULL) {
		seg = memseg_alloc();
		ASSERT(seg != NULL);
		seg->msegflags = flags;
		MEMSEG_DEBUG("memseg_get: alloc seg=0x%p, pages=0x%p",
		    (void *)seg, (void *)(seg->pages));
		seg->pages = segpp;
	} else {
		ASSERT(seg->msegflags == flags);
		ASSERT(seg->pages_base == seg->pages_end);
		MEMSEG_DEBUG("memseg_get: reuse seg=0x%p, pages=0x%p",
		    (void *)seg, (void *)(seg->pages));
		if (meta_alloc) {
			memseg_cpu_vm_flush();
			seg->pages = segpp;
		}
	}

	seg->epages = seg->pages + npgs;
	seg->pages_base = base;
	seg->pages_end = base + npgs;
	/*
	 * Initialize metadata. The page_ts are set to locked state
	 * ready for what may follow.
	 */
	bzero((caddr_t)pp, ptob(metapgs));

	pfn = seg->pages_base;
	/* Save the original pp base in case we reuse a memseg. */
	opp = pp;
	oepp = opp + npgs;
	for (pp = opp; pp < oepp; pp++) {
		pp->p_pagenum = pfn;
		pfn++;
		page_iolock_init(pp);
		while (!page_lock(pp, SE_EXCL, (kmutex_t *)NULL, P_RECLAIM))
			continue;
		pp->p_offset = (u_offset_t)-1;
	}

	if (reuse) {
		/* Remap our page_ts to the re-used memseg VA space. */
		pfn = pt_base;
		vaddr = (caddr_t)seg->pages;
		for (pnum = 0; pnum < metapgs; pnum++) {
			if (meta_alloc)
				pfn = memseg_get_metapfn(metabase,
				    (pgcnt_t)pnum);
			hat_devload(kas.a_hat, vaddr, ptob(1), pfn,
			    PROT_READ | PROT_WRITE,
			    HAT_LOAD_REMAP | HAT_LOAD | HAT_LOAD_NOCONSIST);
			pfn++;
			vaddr += ptob(1);
		}

		hat_unload(kas.a_hat, (caddr_t)opp, ptob(metapgs),
		    HAT_UNLOAD_UNMAP|HAT_UNLOAD_UNLOCK);

		vmem_free(heap_arena, mapva, ptob(metapgs));
	}

	hat_kpm_addmem_mseg_update(seg, nkpmpgs, kpm_pages_off);
	memsegs_lock(1);

	/*
	 * The new memseg is inserted at the beginning of the list.
	 * Not only does this save searching for the tail, but in the
	 * case of a re-used memseg, it solves the problem of what
	 * happens if some process has still got a pointer to the
	 * memseg and follows the next pointer to continue traversing
	 * the memsegs list.
	 */

	hat_kpm_addmem_mseg_insert(seg);

	seg->next = memsegs;
	membar_producer();

	hat_kpm_addmem_memsegs_update(seg);

	memsegs = seg;

	build_pfn_hash();

	total_pages += npgs;

	/*
	 * Recalculate the paging parameters now total_pages has changed.
	 * This will also cause the clock hands to be reset before next use.
	 */
	setupclock();

	memsegs_unlock(1);

	PLCNT_MODIFY_MAX(seg->pages_base, (long)npgs);

	/*
	 * Free the pages outside the lock to avoid locking loops.
	 */
	for (pp = seg->pages; pp < seg->epages; pp++) {
		page_free(pp, 1);
	}

	/*
	 * Now that we've updated the appropriate memory lists we
	 * need to reset a number of globals, since we've increased memory.
	 * Several have already been updated for us as noted above. The
	 * globals we're interested in at this point are:
	 *	physmax - highest page frame number.
	 *	physinstalled - number of pages currently installed (done earlier)
	 *	maxmem - max free pages in the system
	 *	physmem - physical memory pages available
	 *	availrmem - real memory available
	 */

	mutex_enter(&freemem_lock);
	maxmem += npgs;
	physmem += npgs;
	availrmem += npgs;
	availrmem_initial += npgs;

	mutex_exit(&freemem_lock);

	dump_resize();

	page_freelist_coalesce_all(mnode);

	kphysm_setup_post_add(npgs);

	cmn_err(CE_CONT, "?kphysm_add_memory_dynamic: mem = %ldK "
	    "(0x%" PRIx64 ")\n",
	    physinstalled << (PAGESHIFT - 10),
	    (uint64_t)physinstalled << PAGESHIFT);

	avmem = (uint64_t)freemem << PAGESHIFT;
	cmn_err(CE_CONT, "?kphysm_add_memory_dynamic: "
	    "avail mem = %" PRId64 "\n", avmem);

	/*
	 * Update lgroup generation number on single lgroup systems
	 */
	if (nlgrps == 1)
		lgrp_config(LGRP_CONFIG_GEN_UPDATE, 0, 0);

	/*
	 * Inform DDI of update
	 */
	ddi_mem_update((uint64_t)(pt_base) << PAGESHIFT,
	    (uint64_t)(tpgs) << PAGESHIFT);

	delspan_unreserve(pt_base, tpgs);

	return (KPHYSM_OK);		/* Successfully added system memory */
}
/*
 * There are various error conditions in kphysm_add_memory_dynamic()
 * which require a rollback of already changed global state.
 */
static void
kphysm_addmem_error_undospan(pfn_t pt_base, pgcnt_t tpgs)
{
	int mlret;

	/* Unreserve memory span. */
	memlist_write_lock();

	mlret = memlist_delete_span(
	    (uint64_t)(pt_base) << PAGESHIFT,
	    (uint64_t)(tpgs) << PAGESHIFT, &phys_install);

	ASSERT(mlret == MEML_SPANOP_OK);
	phys_install_has_changed();
	installed_top_size(phys_install, &physmax, &physinstalled);

	memlist_write_unlock();
	delspan_unreserve(pt_base, tpgs);
}
/*
 * Only return an available memseg of exactly the right size
 * if size is required.
 * When the meta data area has its own virtual address space
 * we will need to manage this more carefully and do best fit
 * allocations, possibly splitting an available area.
 */
struct memseg *
memseg_reuse(pgcnt_t metapgs)
{
	int type;
	struct memseg **segpp, *seg;

	mutex_enter(&memseg_lists_lock);

	segpp = &memseg_va_avail;
	for (; (seg = *segpp) != NULL; segpp = &seg->lnext) {
		caddr_t end;

		/*
		 * Make sure we are reusing the right segment type.
		 */
		type = metapgs ? MEMSEG_META_INCL : MEMSEG_META_ALLOC;

		if ((seg->msegflags & (MEMSEG_META_INCL | MEMSEG_META_ALLOC))
		    != type)
			continue;

		if (kpm_enable)
			end = hat_kpm_mseg_reuse(seg);
		else
			end = (caddr_t)seg->epages;

		/*
		 * Check for the right size if it is provided.
		 */
		if (!metapgs || btopr(end - (caddr_t)seg->pages) == metapgs) {
			*segpp = seg->lnext;
			seg->lnext = NULL;
			break;
		}
	}
	mutex_exit(&memseg_lists_lock);

	return (seg);
}
static uint_t handle_gen;

struct memdelspan {
	struct memdelspan *mds_next;
	pfn_t		mds_base;
	pgcnt_t		mds_npgs;
	uint_t		*mds_bitmap;
	uint_t		*mds_bitmap_retired;
};

#define	NBPBMW		(sizeof (uint_t) * NBBY)
#define	MDS_BITMAPBYTES(MDSP) \
	((((MDSP)->mds_npgs + NBPBMW - 1) / NBPBMW) * sizeof (uint_t))
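/*
 * Illustrative sizing (not from the original source): NBPBMW is the
 * number of pages tracked per bitmap word.  With 32-bit words
 * (NBPBMW == 32), a 100000-page delspan rounds up to
 * (100000 + 31) / 32 == 3125 words, i.e. 12500 bytes per bitmap.
 */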
struct transit_list {
	struct transit_list	*trl_next;
	struct memdelspan	*trl_spans;
	int			trl_collect;
};

struct transit_list_head {
	kmutex_t		trh_lock;
	struct transit_list	*trh_head;
};

static struct transit_list_head transit_list_head;
*, int);
677 static void transit_list_insert(struct transit_list
*);
678 static void transit_list_remove(struct transit_list
*);
681 #define MEM_DEL_STATS
685 static int mem_del_stat_print
= 0;
686 struct mem_del_stat
{
699 uint_t first_notfree
;
709 uint64_t nticks_total
;
710 uint64_t nticks_pgrp
;
720 * The stat values are only incremented in the delete thread
721 * so no locking or atomic required.
723 #define MDSTAT_INCR(MHP, FLD) (MHP)->mh_delstat.FLD++
724 #define MDSTAT_TOTAL(MHP, ntck) ((MHP)->mh_delstat.nticks_total += (ntck))
725 #define MDSTAT_PGRP(MHP, ntck) ((MHP)->mh_delstat.nticks_pgrp += (ntck))
726 static void mem_del_stat_print_func(struct mem_handle
*);
727 #define MDSTAT_PRINT(MHP) mem_del_stat_print_func((MHP))
728 #else /* MEM_DEL_STATS */
729 #define MDSTAT_INCR(MHP, FLD)
730 #define MDSTAT_TOTAL(MHP, ntck)
731 #define MDSTAT_PGRP(MHP, ntck)
732 #define MDSTAT_PRINT(MHP)
733 #endif /* MEM_DEL_STATS */
typedef enum mhnd_state {MHND_FREE = 0, MHND_INIT, MHND_STARTING,
	MHND_RUNNING, MHND_DONE, MHND_RELEASE} mhnd_state_t;
/*
 * mh_mutex must be taken to examine or change mh_exthandle and mh_state.
 * The mutex may not be required for other fields, dependent on mh_state.
 */
struct mem_handle {
	kmutex_t	mh_mutex;
	struct mem_handle *mh_next;
	memhandle_t	mh_exthandle;
	mhnd_state_t	mh_state;
	struct transit_list mh_transit;
	pgcnt_t		mh_phys_pages;
	pgcnt_t		mh_vm_pages;
	pgcnt_t		mh_hold_todo;
	void		(*mh_delete_complete)(void *, int error);
	void		*mh_delete_complete_arg;
	volatile uint_t	mh_cancel;
	volatile uint_t	mh_dr_aio_cleanup_cancel;
	volatile uint_t	mh_aio_cleanup_done;
	kcondvar_t	mh_cv;
	kthread_id_t	mh_thread_id;
	page_t		*mh_deleted;	/* link through p_next */
#ifdef MEM_DEL_STATS
	struct mem_del_stat mh_delstat;
#endif /* MEM_DEL_STATS */
};

static struct mem_handle *mem_handle_head;
static kmutex_t mem_handle_list_mutex;
static struct mem_handle *
kphysm_allocate_mem_handle()
{
	struct mem_handle *mhp;

	mhp = kmem_zalloc(sizeof (struct mem_handle), KM_SLEEP);
	mutex_init(&mhp->mh_mutex, NULL, MUTEX_DEFAULT, NULL);
	mutex_enter(&mem_handle_list_mutex);
	mutex_enter(&mhp->mh_mutex);
	/* handle_gen is protected by list mutex. */
	mhp->mh_exthandle = (memhandle_t)(uintptr_t)(++handle_gen);
	mhp->mh_next = mem_handle_head;
	mem_handle_head = mhp;
	mutex_exit(&mem_handle_list_mutex);

	return (mhp);
}
static void
kphysm_free_mem_handle(struct mem_handle *mhp)
{
	struct mem_handle **mhpp;

	ASSERT(mutex_owned(&mhp->mh_mutex));
	ASSERT(mhp->mh_state == MHND_FREE);
	/*
	 * Exit the mutex to preserve locking order. This is OK
	 * here as once in the FREE state, the handle cannot
	 * be found by a lookup.
	 */
	mutex_exit(&mhp->mh_mutex);

	mutex_enter(&mem_handle_list_mutex);
	mhpp = &mem_handle_head;
	while (*mhpp != NULL && *mhpp != mhp)
		mhpp = &(*mhpp)->mh_next;
	ASSERT(*mhpp == mhp);
	/*
	 * No need to lock the handle (mh_mutex) as only
	 * mh_next is changing and this is the only thread that
	 * can be referencing mhp.
	 */
	*mhpp = mhp->mh_next;
	mutex_exit(&mem_handle_list_mutex);

	mutex_destroy(&mhp->mh_mutex);
	kmem_free(mhp, sizeof (struct mem_handle));
}
/*
 * This function finds the internal mem_handle corresponding to an
 * external handle and returns it with the mh_mutex held.
 */
static struct mem_handle *
kphysm_lookup_mem_handle(memhandle_t handle)
{
	struct mem_handle *mhp;

	mutex_enter(&mem_handle_list_mutex);
	for (mhp = mem_handle_head; mhp != NULL; mhp = mhp->mh_next) {
		if (mhp->mh_exthandle == handle) {
			mutex_enter(&mhp->mh_mutex);
			/*
			 * The state of the handle could have been changed
			 * by kphysm_del_release() while waiting for mh_mutex.
			 */
			if (mhp->mh_state == MHND_FREE) {
				mutex_exit(&mhp->mh_mutex);
				continue;
			}
			break;
		}
	}
	mutex_exit(&mem_handle_list_mutex);
	return (mhp);
}
int
kphysm_del_gethandle(memhandle_t *xmhp)
{
	struct mem_handle *mhp;

	mhp = kphysm_allocate_mem_handle();
	/*
	 * The handle is allocated using KM_SLEEP, so cannot fail.
	 * If the implementation is changed, the correct error to return
	 * here would be KPHYSM_ENOHANDLES.
	 */
	ASSERT(mhp->mh_state == MHND_FREE);
	mhp->mh_state = MHND_INIT;
	*xmhp = mhp->mh_exthandle;
	mutex_exit(&mhp->mh_mutex);
	return (KPHYSM_OK);
}
static int
overlapping(pfn_t b1, pgcnt_t l1, pfn_t b2, pgcnt_t l2)
{
	pfn_t e1, e2;

	e1 = b1 + l1;
	e2 = b2 + l2;

	return (!(b2 >= e1 || b1 >= e2));
}
static int can_remove_pgs(pgcnt_t);
static struct memdelspan *
span_to_install(pfn_t base, pgcnt_t npgs)
{
	struct memdelspan *mdsp;
	struct memdelspan *mdsp_new;
	uint64_t address, size, thislen;
	struct memlist *mlp;

	mdsp_new = NULL;

	address = (uint64_t)base << PAGESHIFT;
	size = (uint64_t)npgs << PAGESHIFT;
	while (size != 0) {
		memlist_read_lock();
		for (mlp = phys_install; mlp != NULL; mlp = mlp->ml_next) {
			if (address >= (mlp->ml_address + mlp->ml_size))
				continue;
			if ((address + size) > mlp->ml_address)
				break;
		}
		if (mlp == NULL) {
			address += size;
			size = 0;
			thislen = 0;
		} else {
			if (address < mlp->ml_address) {
				size -= (mlp->ml_address - address);
				address = mlp->ml_address;
			}
			ASSERT(address >= mlp->ml_address);
			if ((address + size) >
			    (mlp->ml_address + mlp->ml_size)) {
				thislen =
				    mlp->ml_size - (address - mlp->ml_address);
			} else {
				thislen = size;
			}
		}
		memlist_read_unlock();
		/* TODO: phys_install could change now */
		if (thislen == 0)
			continue;
		mdsp = kmem_zalloc(sizeof (struct memdelspan), KM_SLEEP);
		mdsp->mds_base = btop(address);
		mdsp->mds_npgs = btop(thislen);
		mdsp->mds_next = mdsp_new;
		mdsp_new = mdsp;
		address += thislen;
		size -= thislen;
	}
	return (mdsp_new);
}
static void
free_delspans(struct memdelspan *mdsp)
{
	struct memdelspan *amdsp;

	while ((amdsp = mdsp) != NULL) {
		mdsp = amdsp->mds_next;
		kmem_free(amdsp, sizeof (struct memdelspan));
	}
}
/*
 * Concatenate lists. No list ordering is required.
 */
static void
delspan_concat(struct memdelspan **mdspp, struct memdelspan *mdsp)
{
	while (*mdspp != NULL)
		mdspp = &(*mdspp)->mds_next;

	*mdspp = mdsp;
}
/*
 * Given a new list of delspans, check there is no overlap with
 * all existing span activity (add or delete) and then concatenate
 * the new spans to the given list.
 * Return 1 for OK, 0 if overlapping.
 */
static int
delspan_insert(
	struct transit_list *my_tlp,
	struct memdelspan *mdsp_new)
{
	struct transit_list_head *trh;
	struct transit_list *tlp;
	int ret;

	trh = &transit_list_head;

	ASSERT(my_tlp != NULL);
	ASSERT(mdsp_new != NULL);

	ret = 1;
	mutex_enter(&trh->trh_lock);
	/* ASSERT(my_tlp->trl_spans == NULL || tlp_in_list(trh, my_tlp)); */
	for (tlp = trh->trh_head; tlp != NULL; tlp = tlp->trl_next) {
		struct memdelspan *mdsp;

		for (mdsp = tlp->trl_spans; mdsp != NULL;
		    mdsp = mdsp->mds_next) {
			struct memdelspan *nmdsp;

			for (nmdsp = mdsp_new; nmdsp != NULL;
			    nmdsp = nmdsp->mds_next) {
				if (overlapping(mdsp->mds_base, mdsp->mds_npgs,
				    nmdsp->mds_base, nmdsp->mds_npgs)) {
					ret = 0;
					goto done;
				}
			}
		}
	}
done:
	if (ret != 0) {
		if (my_tlp->trl_spans == NULL)
			transit_list_insert(my_tlp);
		delspan_concat(&my_tlp->trl_spans, mdsp_new);
	}
	mutex_exit(&trh->trh_lock);
	return (ret);
}
static void
delspan_remove(
	struct transit_list *my_tlp,
	pfn_t base,
	pgcnt_t npgs)
{
	struct transit_list_head *trh;
	struct memdelspan *mdsp;

	trh = &transit_list_head;

	ASSERT(my_tlp != NULL);

	mutex_enter(&trh->trh_lock);
	if ((mdsp = my_tlp->trl_spans) != NULL) {
		if (npgs == 0) {
			my_tlp->trl_spans = NULL;
			free_delspans(mdsp);
			transit_list_remove(my_tlp);
		} else {
			struct memdelspan **prv;

			prv = &my_tlp->trl_spans;
			while (mdsp != NULL) {
				pfn_t p_end;

				p_end = mdsp->mds_base + mdsp->mds_npgs;
				if (mdsp->mds_base >= base &&
				    p_end <= (base + npgs)) {
					*prv = mdsp->mds_next;
					mdsp->mds_next = NULL;
					free_delspans(mdsp);
				} else {
					prv = &mdsp->mds_next;
				}
				mdsp = *prv;
			}
			if (my_tlp->trl_spans == NULL)
				transit_list_remove(my_tlp);
		}
	}
	mutex_exit(&trh->trh_lock);
}
/*
 * Reserve interface for add to stop delete before add finished.
 * This list is only accessed through the delspan_insert/remove
 * functions and so is fully protected by the mutex in struct transit_list.
 */

static struct transit_list reserve_transit;

static int
delspan_reserve(pfn_t base, pgcnt_t npgs)
{
	struct memdelspan *mdsp;
	int ret;

	mdsp = kmem_zalloc(sizeof (struct memdelspan), KM_SLEEP);
	mdsp->mds_base = base;
	mdsp->mds_npgs = npgs;
	if ((ret = delspan_insert(&reserve_transit, mdsp)) == 0) {
		free_delspans(mdsp);
	}
	return (ret);
}

static void
delspan_unreserve(pfn_t base, pgcnt_t npgs)
{
	delspan_remove(&reserve_transit, base, npgs);
}
/*
 * Return whether memseg was created by kphysm_add_memory_dynamic().
 */
static int
memseg_is_dynamic(struct memseg *seg)
{
	return (seg->msegflags & MEMSEG_DYNAMIC);
}
int
kphysm_del_span(
	memhandle_t handle,
	pfn_t base,
	pgcnt_t npgs)
{
	struct mem_handle *mhp;
	struct memseg *seg;
	struct memdelspan *mdsp;
	struct memdelspan *mdsp_new;
	pgcnt_t phys_pages, vm_pages;
	pfn_t p_end;
	page_t *pp;
	int ret;

	mhp = kphysm_lookup_mem_handle(handle);
	if (mhp == NULL) {
		return (KPHYSM_EHANDLE);
	}
	if (mhp->mh_state != MHND_INIT) {
		mutex_exit(&mhp->mh_mutex);
		return (KPHYSM_ESEQUENCE);
	}

	/*
	 * Intersect the span with the installed memory list (phys_install).
	 */
	mdsp_new = span_to_install(base, npgs);
	if (mdsp_new == NULL) {
		/*
		 * No physical memory in this range. Is this an
		 * error? If an attempt to start the delete is made
		 * for OK returns from del_span such as this, start will
		 * return an error.
		 * Could return KPHYSM_ENOWORK.
		 */
		/*
		 * It is assumed that there are no error returns
		 * from span_to_install() due to kmem_alloc failure.
		 */
		mutex_exit(&mhp->mh_mutex);
		return (KPHYSM_OK);
	}
	/*
	 * Does this span overlap an existing span?
	 */
	if (delspan_insert(&mhp->mh_transit, mdsp_new) == 0) {
		/*
		 * Differentiate between already on list for this handle
		 * (KPHYSM_EDUP) and busy elsewhere (KPHYSM_EBUSY).
		 */
		ret = KPHYSM_EBUSY;
		for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
		    mdsp = mdsp->mds_next) {
			if (overlapping(mdsp->mds_base, mdsp->mds_npgs,
			    base, npgs)) {
				ret = KPHYSM_EDUP;
				break;
			}
		}
		mutex_exit(&mhp->mh_mutex);
		free_delspans(mdsp_new);
		return (ret);
	}
	/*
	 * At this point the spans in mdsp_new have been inserted into the
	 * list of spans for this handle and thereby to the global list of
	 * spans being processed. Each of these spans must now be checked
	 * for relocatability. As a side-effect segments in the memseg list
	 * may be split.
	 *
	 * Note that mdsp_new can no longer be used as it is now part of
	 * a larger list. Select elements of this larger list based
	 * on base and npgs.
	 */
restart:
	phys_pages = 0;
	vm_pages = 0;
	ret = KPHYSM_OK;
	for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
	    mdsp = mdsp->mds_next) {
		pgcnt_t pages_checked;

		if (!overlapping(mdsp->mds_base, mdsp->mds_npgs, base, npgs)) {
			continue;
		}
		p_end = mdsp->mds_base + mdsp->mds_npgs;
		/*
		 * The pages_checked count is a hack. All pages should be
		 * checked for relocatability. Those not covered by memsegs
		 * should be tested with arch_kphysm_del_span_ok().
		 */
		pages_checked = 0;
		for (seg = memsegs; seg; seg = seg->next) {
			pfn_t mseg_start;

			if (seg->pages_base >= p_end ||
			    seg->pages_end <= mdsp->mds_base) {
				/* Span and memseg don't overlap. */
				continue;
			}
			mseg_start = memseg_get_start(seg);
			/* Check that segment is suitable for delete. */
			if (memseg_includes_meta(seg)) {
				/*
				 * Check that this segment is completely
				 * within the span.
				 */
				if (mseg_start < mdsp->mds_base ||
				    seg->pages_end > p_end) {
					ret = KPHYSM_EBUSY;
					break;
				}
				pages_checked += seg->pages_end - mseg_start;
			} else {
				/*
				 * If this segment is larger than the span,
				 * try to split it. After the split, it
				 * is necessary to restart.
				 */
				if (seg->pages_base < mdsp->mds_base ||
				    seg->pages_end > p_end) {
					pfn_t abase;
					pgcnt_t anpgs;
					int s_ret;

					/* Split required.  */
					if (mdsp->mds_base < seg->pages_base)
						abase = seg->pages_base;
					else
						abase = mdsp->mds_base;
					if (p_end > seg->pages_end)
						anpgs = seg->pages_end - abase;
					else
						anpgs = p_end - abase;
					s_ret = kphysm_split_memseg(abase,
					    anpgs);
					if (s_ret == 0) {
						/* Split failed. */
						ret = KPHYSM_ERESOURCE;
						break;
					}
					goto restart;
				}
				pages_checked +=
				    seg->pages_end - seg->pages_base;
			}
			/*
			 * The memseg is wholly within the delete span.
			 * The individual pages can now be checked.
			 */
			for (pp = seg->pages; pp < seg->epages; pp++) {
				if (PP_ISNORELOC(pp)) {
					ret = KPHYSM_ENONRELOC;
					break;
				}
			}
			if (ret != KPHYSM_OK) {
				break;
			}
			phys_pages += (seg->pages_end - mseg_start);
			vm_pages += MSEG_NPAGES(seg);
		}
		if (ret != KPHYSM_OK)
			break;
		if (pages_checked != mdsp->mds_npgs) {
			ret = KPHYSM_ENONRELOC;
			break;
		}
	}

	if (ret == KPHYSM_OK) {
		mhp->mh_phys_pages += phys_pages;
		mhp->mh_vm_pages += vm_pages;
	} else {
		/*
		 * Keep holding the mh_mutex to prevent it going away.
		 */
		delspan_remove(&mhp->mh_transit, base, npgs);
	}
	mutex_exit(&mhp->mh_mutex);
	return (ret);
}
int
kphysm_del_span_query(
	pfn_t base,
	pgcnt_t npgs,
	memquery_t *mqp)
{
	struct memdelspan *mdsp;
	struct memdelspan *mdsp_new;
	int done_first_nonreloc;

	mqp->phys_pages = 0;
	mqp->managed = 0;
	mqp->nonrelocatable = 0;
	mqp->first_nonrelocatable = 0;
	mqp->last_nonrelocatable = 0;

	mdsp_new = span_to_install(base, npgs);
	/*
	 * It is OK to proceed here if mdsp_new == NULL.
	 */
	done_first_nonreloc = 0;
	for (mdsp = mdsp_new; mdsp != NULL; mdsp = mdsp->mds_next) {
		pfn_t sbase;
		pgcnt_t snpgs;

		mqp->phys_pages += mdsp->mds_npgs;
		sbase = mdsp->mds_base;
		snpgs = mdsp->mds_npgs;
		while (snpgs != 0) {
			struct memseg *lseg, *seg;
			pfn_t p_end;
			page_t *pp;
			pfn_t mseg_start;

			p_end = sbase + snpgs;
			/*
			 * Find the lowest addressed memseg that starts
			 * after sbase and account for it.
			 * This is to catch dynamic memsegs whose start
			 * is hidden.
			 */
			seg = NULL;
			for (lseg = memsegs; lseg != NULL; lseg = lseg->next) {
				if ((lseg->pages_base >= sbase) ||
				    (lseg->pages_base < p_end &&
				    lseg->pages_end > sbase)) {
					if (seg == NULL ||
					    seg->pages_base > lseg->pages_base)
						seg = lseg;
				}
			}
			if (seg != NULL) {
				mseg_start = memseg_get_start(seg);
				/*
				 * Now have the full extent of the memseg so
				 * do the range check.
				 */
				if (mseg_start >= p_end ||
				    seg->pages_end <= sbase) {
					/* Span does not overlap memseg. */
					seg = NULL;
				}
			}
			/*
			 * Account for gap either before the segment if
			 * there is one or to the end of the span.
			 */
			if (seg == NULL || mseg_start > sbase) {
				pfn_t a_end;

				a_end = (seg == NULL) ? p_end : mseg_start;
				/*
				 * Check with arch layer for relocatability.
				 */
				if (arch_kphysm_del_span_ok(sbase,
				    (a_end - sbase))) {
					/*
					 * No non-relocatable pages in this
					 * area, avoid the fine-grained
					 * test.
					 */
					snpgs -= (a_end - sbase);
					sbase = a_end;
				}
				while (sbase < a_end) {
					if (!arch_kphysm_del_span_ok(sbase,
					    1)) {
						mqp->nonrelocatable++;
						if (!done_first_nonreloc) {
							mqp->
							    first_nonrelocatable
							    = sbase;
							done_first_nonreloc = 1;
						}
						mqp->last_nonrelocatable =
						    sbase;
					}
					sbase++;
					snpgs--;
				}
			}
			if (seg != NULL) {
				ASSERT(mseg_start <= sbase);
				if (seg->pages_base != mseg_start &&
				    seg->pages_base > sbase) {
					pgcnt_t skip_pgs;

					/*
					 * Skip the page_t area of a
					 * dynamic memseg.
					 */
					skip_pgs = seg->pages_base - sbase;
					if (snpgs <= skip_pgs) {
						sbase += snpgs;
						snpgs = 0;
						continue;
					}
					snpgs -= skip_pgs;
					sbase += skip_pgs;
				}
				ASSERT(seg->pages_base <= sbase);
				/*
				 * The individual pages can now be checked.
				 */
				for (pp = seg->pages +
				    (sbase - seg->pages_base);
				    snpgs != 0 && pp < seg->epages; pp++) {
					mqp->managed++;
					if (PP_ISNORELOC(pp)) {
						mqp->nonrelocatable++;
						if (!done_first_nonreloc) {
							mqp->
							    first_nonrelocatable
							    = sbase;
							done_first_nonreloc = 1;
						}
						mqp->last_nonrelocatable =
						    sbase;
					}
					sbase++;
					snpgs--;
				}
			}
		}
	}

	free_delspans(mdsp_new);

	return (KPHYSM_OK);
}
/*
 * This release function can be called at any stage as follows:
 *	_gethandle only called
 *	_span(s) only called
 *	_start called but failed
 *	delete thread exited
 */
int
kphysm_del_release(memhandle_t handle)
{
	struct mem_handle *mhp;

	mhp = kphysm_lookup_mem_handle(handle);
	if (mhp == NULL) {
		return (KPHYSM_EHANDLE);
	}
	switch (mhp->mh_state) {
	case MHND_STARTING:
	case MHND_RUNNING:
		mutex_exit(&mhp->mh_mutex);
		return (KPHYSM_ENOTFINISHED);
	case MHND_FREE:
		ASSERT(mhp->mh_state != MHND_FREE);
		mutex_exit(&mhp->mh_mutex);
		return (KPHYSM_EHANDLE);
	case MHND_INIT:
		break;
	case MHND_DONE:
		break;
	case MHND_RELEASE:
		mutex_exit(&mhp->mh_mutex);
		return (KPHYSM_ESEQUENCE);
	default:
		cmn_err(CE_WARN, "kphysm_del_release(0x%p) state corrupt %d",
		    (void *)mhp, mhp->mh_state);
		mutex_exit(&mhp->mh_mutex);
		return (KPHYSM_EHANDLE);
	}
	/*
	 * Set state so that we can wait if necessary.
	 * Also this means that we have read/write access to all
	 * fields except mh_exthandle and mh_state.
	 */
	mhp->mh_state = MHND_RELEASE;
	/*
	 * The mem_handle cannot be de-allocated by any other operation
	 * now, so no need to hold mh_mutex.
	 */
	mutex_exit(&mhp->mh_mutex);

	delspan_remove(&mhp->mh_transit, 0, 0);
	mhp->mh_phys_pages = 0;
	mhp->mh_vm_pages = 0;
	mhp->mh_hold_todo = 0;
	mhp->mh_delete_complete = NULL;
	mhp->mh_delete_complete_arg = NULL;
	mhp->mh_cancel = 0;

	mutex_enter(&mhp->mh_mutex);
	ASSERT(mhp->mh_state == MHND_RELEASE);
	mhp->mh_state = MHND_FREE;

	kphysm_free_mem_handle(mhp);

	return (KPHYSM_OK);
}
/*
 * This cancel function can only be called with the thread running.
 */
int
kphysm_del_cancel(memhandle_t handle)
{
	struct mem_handle *mhp;

	mhp = kphysm_lookup_mem_handle(handle);
	if (mhp == NULL) {
		return (KPHYSM_EHANDLE);
	}
	if (mhp->mh_state != MHND_STARTING && mhp->mh_state != MHND_RUNNING) {
		mutex_exit(&mhp->mh_mutex);
		return (KPHYSM_ENOTRUNNING);
	}
	/*
	 * Set the cancel flag and wake the delete thread up.
	 * The thread may be waiting on I/O, so the effect of the cancel
	 * may be delayed.
	 */
	if (mhp->mh_cancel == 0) {
		mhp->mh_cancel = KPHYSM_ECANCELLED;
		cv_signal(&mhp->mh_cv);
	}
	mutex_exit(&mhp->mh_mutex);
	return (KPHYSM_OK);
}
*mdstp
)
1524 struct mem_handle
*mhp
;
1526 mhp
= kphysm_lookup_mem_handle(handle
);
1528 return (KPHYSM_EHANDLE
);
1531 * Calling kphysm_del_status() is allowed before the delete
1532 * is started to allow for status display.
1534 if (mhp
->mh_state
!= MHND_INIT
&& mhp
->mh_state
!= MHND_STARTING
&&
1535 mhp
->mh_state
!= MHND_RUNNING
) {
1536 mutex_exit(&mhp
->mh_mutex
);
1537 return (KPHYSM_ENOTRUNNING
);
1539 mdstp
->phys_pages
= mhp
->mh_phys_pages
;
1540 mdstp
->managed
= mhp
->mh_vm_pages
;
1541 mdstp
->collected
= mhp
->mh_vm_pages
- mhp
->mh_hold_todo
;
1542 mutex_exit(&mhp
->mh_mutex
);
static int mem_delete_additional_pages = 100;

static int
can_remove_pgs(pgcnt_t npgs)
{
	/*
	 * If all pageable pages were paged out, freemem would
	 * equal availrmem. There is a minimum requirement for
	 * availrmem.
	 */
	if ((availrmem - (tune.t_minarmem + mem_delete_additional_pages))
	    < npgs)
		return (0);

	/* TODO: check swap space, etc. */
	return (1);
}

static int
get_availrmem(pgcnt_t npgs)
{
	int ret;

	mutex_enter(&freemem_lock);
	ret = can_remove_pgs(npgs);
	if (ret != 0)
		availrmem -= npgs;
	mutex_exit(&freemem_lock);
	return (ret);
}

static void
put_availrmem(pgcnt_t npgs)
{
	mutex_enter(&freemem_lock);
	availrmem += npgs;
	mutex_exit(&freemem_lock);
}
#define	FREEMEM_INCR	100
static pgcnt_t freemem_incr = FREEMEM_INCR;
#define	DEL_FREE_WAIT_FRAC	4
#define	DEL_FREE_WAIT_TICKS	((hz+DEL_FREE_WAIT_FRAC-1)/DEL_FREE_WAIT_FRAC)

#define	DEL_BUSY_WAIT_FRAC	20
#define	DEL_BUSY_WAIT_TICKS	((hz+DEL_BUSY_WAIT_FRAC-1)/DEL_BUSY_WAIT_FRAC)
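/*
 * Illustrative arithmetic (not from the original source): both macros
 * are ceiling divisions of hz, so with hz == 100 the free wait is
 * (100 + 3) / 4 == 25 ticks (about 1/4 second) and the busy wait is
 * (100 + 19) / 20 == 5 ticks (about 1/20 second).
 */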
static void kphysm_del_cleanup(struct mem_handle *);

static void page_delete_collect(page_t *, struct mem_handle *);
static pgcnt_t
delthr_get_freemem(struct mem_handle *mhp)
{
	pgcnt_t free_get;
	int ret;

	ASSERT(MUTEX_HELD(&mhp->mh_mutex));

	MDSTAT_INCR(mhp, need_free);
	/*
	 * Get up to freemem_incr pages.
	 */
	free_get = freemem_incr;
	if (free_get > mhp->mh_hold_todo)
		free_get = mhp->mh_hold_todo;
	/*
	 * Take free_get pages away from freemem,
	 * waiting if necessary.
	 */

	while (!mhp->mh_cancel) {
		mutex_exit(&mhp->mh_mutex);
		MDSTAT_INCR(mhp, free_loop);
		/*
		 * Duplicate test from page_create_throttle()
		 * but don't override with !PG_WAIT.
		 */
		if (freemem < (free_get + throttlefree)) {
			MDSTAT_INCR(mhp, free_low);
			ret = 0;
		} else {
			ret = page_create_wait(free_get, 0);
			if (ret == 0) {
				/* EMPTY */
				MDSTAT_INCR(mhp, free_failed);
			}
		}
		if (ret != 0) {
			mutex_enter(&mhp->mh_mutex);
			return (free_get);
		}

		/*
		 * Put pressure on pageout.
		 */
		page_needfree(free_get);
		WAKE_PAGEOUT_SCANNER(delthr);

		mutex_enter(&mhp->mh_mutex);
		(void) cv_reltimedwait(&mhp->mh_cv, &mhp->mh_mutex,
		    DEL_FREE_WAIT_TICKS, TR_CLOCK_TICK);
		mutex_exit(&mhp->mh_mutex);
		page_needfree(-(spgcnt_t)free_get);

		mutex_enter(&mhp->mh_mutex);
	}
	return (0);
}
#define	DR_AIO_CLEANUP_DELAY	25000	/* 0.025secs, in usec */
#define	DR_AIO_CLEANUP_MAXLOOPS_NODELAY	100
/*
 * This function is run as a helper thread for delete_memory_thread.
 * It is needed in order to force kaio cleanup, so that pages used in kaio
 * will be unlocked and subsequently relocated by delete_memory_thread.
 * The address of the delete_memory_thread's mem_handle is passed in to
 * this thread function, and is used to set the mh_aio_cleanup_done member
 * prior to calling thread_exit().
 */
static void
dr_aio_cleanup_thread(caddr_t amhp)
{
	proc_t *procp;
	int (*aio_cleanup_dr_delete_memory)(proc_t *);
	int cleaned;
	int n = 0;
	struct mem_handle *mhp;
	volatile uint_t *pcancel;

	mhp = (struct mem_handle *)amhp;
	ASSERT(mhp != NULL);
	pcancel = &mhp->mh_dr_aio_cleanup_cancel;
	if (modload("sys", "kaio") == -1) {
		mhp->mh_aio_cleanup_done = 1;
		cmn_err(CE_WARN, "dr_aio_cleanup_thread: cannot load kaio");
		thread_exit();
	}
	aio_cleanup_dr_delete_memory = (int (*)(proc_t *))
	    modgetsymvalue("aio_cleanup_dr_delete_memory", 0);
	if (aio_cleanup_dr_delete_memory == NULL) {
		mhp->mh_aio_cleanup_done = 1;
		cmn_err(CE_WARN,
		    "aio_cleanup_dr_delete_memory not found in kaio");
		thread_exit();
	}
	do {
		cleaned = 0;
		mutex_enter(&pidlock);
		for (procp = practive; (*pcancel == 0) && (procp != NULL);
		    procp = procp->p_next) {
			mutex_enter(&procp->p_lock);
			if (procp->p_aio != NULL) {
				/* cleanup proc's outstanding kaio */
				cleaned +=
				    (*aio_cleanup_dr_delete_memory)(procp);
			}
			mutex_exit(&procp->p_lock);
		}
		mutex_exit(&pidlock);
		if ((*pcancel == 0) &&
		    (!cleaned || (++n == DR_AIO_CLEANUP_MAXLOOPS_NODELAY))) {
			/* delay a bit before retrying all procs again */
			delay(drv_usectohz(DR_AIO_CLEANUP_DELAY));
			n = 0;
		}
	} while (*pcancel == 0);
	mhp->mh_aio_cleanup_done = 1;
	thread_exit();
}
)
1719 struct mem_handle
*mhp
;
1720 struct memdelspan
*mdsp
;
1721 callb_cpr_t cprinfo
;
1723 spgcnt_t freemem_left
;
1724 void (*del_complete_funcp
)(void *, int error
);
1725 void *del_complete_arg
;
1730 #ifdef MEM_DEL_STATS
1731 uint64_t start_total
, ntick_total
;
1732 uint64_t start_pgrp
, ntick_pgrp
;
1733 #endif /* MEM_DEL_STATS */
1735 mhp
= (struct mem_handle
*)amhp
;
1737 #ifdef MEM_DEL_STATS
1738 start_total
= ddi_get_lbolt();
1739 #endif /* MEM_DEL_STATS */
1741 CALLB_CPR_INIT(&cprinfo
, &mhp
->mh_mutex
,
1742 callb_generic_cpr
, "memdel");
1744 mutex_enter(&mhp
->mh_mutex
);
1745 ASSERT(mhp
->mh_state
== MHND_STARTING
);
1747 mhp
->mh_state
= MHND_RUNNING
;
1748 mhp
->mh_thread_id
= curthread
;
1750 mhp
->mh_hold_todo
= mhp
->mh_vm_pages
;
1751 mutex_exit(&mhp
->mh_mutex
);
	/* Allocate the remap pages now, if necessary. */
	memseg_remap_init();

	/*
	 * Subtract from availrmem now if possible as availrmem
	 * may not be available by the end of the delete.
	 */
	if (!get_availrmem(mhp->mh_vm_pages)) {
		comp_code = KPHYSM_ENOTVIABLE;
		mutex_enter(&mhp->mh_mutex);
		goto early_exit;
	}

	ret = kphysm_setup_pre_del(mhp->mh_vm_pages);

	mutex_enter(&mhp->mh_mutex);

	if (ret != 0) {
		mhp->mh_cancel = KPHYSM_EREFUSED;
		goto refused;
	}

	transit_list_collect(mhp, 1);

	for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
	    mdsp = mdsp->mds_next) {
		ASSERT(mdsp->mds_bitmap == NULL);
		mdsp->mds_bitmap = kmem_zalloc(MDS_BITMAPBYTES(mdsp),
		    KM_SLEEP);
		mdsp->mds_bitmap_retired = kmem_zalloc(MDS_BITMAPBYTES(mdsp),
		    KM_SLEEP);
	}

	first_scan = 1;
	freemem_left = 0;
	/*
	 * Start dr_aio_cleanup_thread, which periodically iterates
	 * through the process list and invokes aio cleanup.  This
	 * is needed in order to avoid a deadly embrace between the
	 * delete_memory_thread (waiting on writer lock for page, with the
	 * exclusive-wanted bit set), kaio read request threads (waiting for a
	 * reader lock on the same page that is wanted by the
	 * delete_memory_thread), and threads waiting for kaio completion
	 * (blocked on spt_amp->lock).
	 */
	mhp->mh_dr_aio_cleanup_cancel = 0;
	mhp->mh_aio_cleanup_done = 0;
	(void) thread_create(NULL, 0, dr_aio_cleanup_thread,
	    (caddr_t)mhp, 0, &p0, TS_RUN, maxclsyspri - 1);
	while ((mhp->mh_hold_todo != 0) && (mhp->mh_cancel == 0)) {
		pgcnt_t collected;
		MDSTAT_INCR(mhp, nloop);
		collected = 0;
		for (mdsp = mhp->mh_transit.trl_spans; (mdsp != NULL) &&
		    (mhp->mh_cancel == 0); mdsp = mdsp->mds_next) {
			pfn_t pfn, p_end;

			p_end = mdsp->mds_base + mdsp->mds_npgs;
			for (pfn = mdsp->mds_base; (pfn < p_end) &&
			    (mhp->mh_cancel == 0); pfn++) {
				page_t *pp, *tpp, *tpp_targ;
				pgcnt_t bit;
				struct vnode *vp;
				u_offset_t offset;
				int mod, result;
				spgcnt_t pgcnt;

				bit = pfn - mdsp->mds_base;
				if ((mdsp->mds_bitmap[bit / NBPBMW] &
				    (1 << (bit % NBPBMW))) != 0) {
					MDSTAT_INCR(mhp, already_done);
					continue;
				}
				if (freemem_left == 0) {
					freemem_left += delthr_get_freemem(mhp);
					if (freemem_left == 0)
						break;
				}

				/*
				 * Release mh_mutex - some of this
				 * stuff takes some time (eg PUTPAGE).
				 */

				mutex_exit(&mhp->mh_mutex);
				MDSTAT_INCR(mhp, ncheck);

				pp = page_numtopp_nolock(pfn);
				if (pp == NULL) {
					/*
					 * Not covered by a page_t - will
					 * be dealt with elsewhere.
					 */
					MDSTAT_INCR(mhp, nopaget);
					mutex_enter(&mhp->mh_mutex);
					mdsp->mds_bitmap[bit / NBPBMW] |=
					    (1 << (bit % NBPBMW));
					continue;
				}

				if (!page_try_reclaim_lock(pp, SE_EXCL,
				    SE_EXCL_WANTED | SE_RETIRED)) {
					/*
					 * Page in use elsewhere.  Skip it.
					 */
					MDSTAT_INCR(mhp, lockfail);
					mutex_enter(&mhp->mh_mutex);
					continue;
				}
				/*
				 * See if the cage expanded into the delete.
				 * This can happen as we have to allow the
				 * cage to expand.
				 */
				if (PP_ISNORELOC(pp)) {
					page_unlock(pp);
					mutex_enter(&mhp->mh_mutex);
					mhp->mh_cancel = KPHYSM_ENONRELOC;
					break;
				}
				if (PP_RETIRED(pp)) {
					/*
					 * Page has been retired and is
					 * not part of the cage so we
					 * can now do the accounting for
					 * it.
					 */
					MDSTAT_INCR(mhp, retired);
					mutex_enter(&mhp->mh_mutex);
					mdsp->mds_bitmap[bit / NBPBMW]
					    |= (1 << (bit % NBPBMW));
					mdsp->mds_bitmap_retired[bit /
					    NBPBMW] |=
					    (1 << (bit % NBPBMW));
					mhp->mh_hold_todo--;
					continue;
				}
				ASSERT(freemem_left != 0);
				if (PP_ISFREE(pp)) {
					/*
					 * Like page_reclaim() only 'freemem'
					 * processing is already done.
					 */
					MDSTAT_INCR(mhp, nfree);
				free_page_collect:
					if (PP_ISAGED(pp)) {
						page_list_sub(pp,
						    PG_FREE_LIST);
					} else {
						page_list_sub(pp,
						    PG_CACHE_LIST);
					}
					PP_CLRFREE(pp);
					PP_CLRAGED(pp);
					collected++;
					mutex_enter(&mhp->mh_mutex);
					page_delete_collect(pp, mhp);
					mdsp->mds_bitmap[bit / NBPBMW] |=
					    (1 << (bit % NBPBMW));
					freemem_left--;
					continue;
				}
				ASSERT(pp->p_vnode != NULL);
				if (first_scan) {
					MDSTAT_INCR(mhp, first_notfree);
					page_unlock(pp);
					mutex_enter(&mhp->mh_mutex);
					continue;
				}
				/*
				 * Keep stats on pages encountered that
				 * are marked for retirement.
				 */
				if (PP_TOXIC(pp)) {
					MDSTAT_INCR(mhp, toxic);
				} else if (PP_PR_REQ(pp)) {
					MDSTAT_INCR(mhp, failing);
				}
				/*
				 * In certain cases below, special exceptions
				 * are made for pages that are toxic.  This
				 * is because the current meaning of toxic
				 * is that an uncorrectable error has been
				 * previously associated with the page.
				 */
				if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
					if (!PP_TOXIC(pp)) {
						/*
						 * Must relocate locked in
						 * memory pages.
						 */
#ifdef MEM_DEL_STATS
						start_pgrp = ddi_get_lbolt();
#endif /* MEM_DEL_STATS */
						/*
						 * Lock all constituent pages
						 * of a large page to ensure
						 * that p_szc won't change.
						 */
						if (!group_page_trylock(pp,
						    SE_EXCL)) {
							MDSTAT_INCR(mhp,
							    gptllckfail);
							page_unlock(pp);
							mutex_enter(
							    &mhp->mh_mutex);
							continue;
						}
						MDSTAT_INCR(mhp, npplocked);
						pp_targ =
						    page_get_replacement_page(
						    pp, NULL, 0);
						if (pp_targ != NULL) {
#ifdef MEM_DEL_STATS
							ntick_pgrp =
							    (uint64_t)
							    ddi_get_lbolt() -
							    start_pgrp;
#endif /* MEM_DEL_STATS */
							MDSTAT_PGRP(mhp,
							    ntick_pgrp);
							MDSTAT_INCR(mhp,
							    nlockreloc);
							goto reloc;
						}
						group_page_unlock(pp);
						page_unlock(pp);
#ifdef MEM_DEL_STATS
						ntick_pgrp =
						    (uint64_t)ddi_get_lbolt() -
						    start_pgrp;
#endif /* MEM_DEL_STATS */
						MDSTAT_PGRP(mhp, ntick_pgrp);
						MDSTAT_INCR(mhp, nnorepl);
						mutex_enter(&mhp->mh_mutex);
						continue;
					} else {
						/*
						 * Cannot do anything about
						 * this page because it is
						 * toxic.
						 */
						MDSTAT_INCR(mhp, npplkdtoxic);
						page_unlock(pp);
						mutex_enter(&mhp->mh_mutex);
						continue;
					}
				}
				/*
				 * Unload the mappings and check if mod bit
				 * is set.
				 */
				ASSERT(!PP_ISKAS(pp));
				(void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
				mod = hat_ismod(pp);

#ifdef MEM_DEL_STATS
				start_pgrp = ddi_get_lbolt();
#endif /* MEM_DEL_STATS */
				if (mod && !PP_TOXIC(pp)) {
					/*
					 * Lock all constituent pages
					 * of a large page to ensure
					 * that p_szc won't change.
					 */
					if (!group_page_trylock(pp, SE_EXCL)) {
						MDSTAT_INCR(mhp, gptlmodfail);
						page_unlock(pp);
						mutex_enter(&mhp->mh_mutex);
						continue;
					}
					pp_targ = page_get_replacement_page(pp,
					    NULL, 0);
					if (pp_targ != NULL) {
						MDSTAT_INCR(mhp, nmodreloc);
#ifdef MEM_DEL_STATS
						ntick_pgrp =
						    (uint64_t)ddi_get_lbolt() -
						    start_pgrp;
#endif /* MEM_DEL_STATS */
						MDSTAT_PGRP(mhp, ntick_pgrp);
						goto reloc;
					}
					group_page_unlock(pp);
				}

				if (!page_try_demote_pages(pp)) {
					MDSTAT_INCR(mhp, demotefail);
					page_unlock(pp);
#ifdef MEM_DEL_STATS
					ntick_pgrp = (uint64_t)ddi_get_lbolt() -
					    start_pgrp;
#endif /* MEM_DEL_STATS */
					MDSTAT_PGRP(mhp, ntick_pgrp);
					mutex_enter(&mhp->mh_mutex);
					continue;
				}

				/*
				 * Regular 'page-out'.
				 */
				if (!mod) {
					MDSTAT_INCR(mhp, ndestroy);
					page_destroy(pp, 1);
					/*
					 * page_destroy was called with
					 * dontfree. As long as p_lckcnt
					 * and p_cowcnt are both zero, the
					 * only additional action of
					 * page_destroy with !dontfree is to
					 * call page_free, so we can collect
					 * the page here.
					 */
					collected++;
#ifdef MEM_DEL_STATS
					ntick_pgrp = (uint64_t)ddi_get_lbolt() -
					    start_pgrp;
#endif /* MEM_DEL_STATS */
					MDSTAT_PGRP(mhp, ntick_pgrp);
					mutex_enter(&mhp->mh_mutex);
					page_delete_collect(pp, mhp);
					mdsp->mds_bitmap[bit / NBPBMW] |=
					    (1 << (bit % NBPBMW));
					continue;
				}
				/*
				 * The page is toxic and the mod bit is
				 * set, we cannot do anything here to deal
				 * with it.
				 */
				if (PP_TOXIC(pp)) {
					page_unlock(pp);
#ifdef MEM_DEL_STATS
					ntick_pgrp = (uint64_t)ddi_get_lbolt() -
					    start_pgrp;
#endif /* MEM_DEL_STATS */
					MDSTAT_PGRP(mhp, ntick_pgrp);
					MDSTAT_INCR(mhp, modtoxic);
					mutex_enter(&mhp->mh_mutex);
					continue;
				}
				MDSTAT_INCR(mhp, nputpage);
				vp = pp->p_vnode;
				offset = pp->p_offset;
				VN_HOLD(vp);
				page_unlock(pp);
				(void) VOP_PUTPAGE(vp, offset, PAGESIZE,
				    B_INVAL|B_FORCE, kcred, NULL);
				VN_RELE(vp);
#ifdef MEM_DEL_STATS
				ntick_pgrp = (uint64_t)ddi_get_lbolt() -
				    start_pgrp;
#endif /* MEM_DEL_STATS */
				MDSTAT_PGRP(mhp, ntick_pgrp);
				/*
				 * Try to get the page back immediately
				 * so that it can be collected.
				 */
				pp = page_numtopp_nolock(pfn);
				if (pp == NULL) {
					MDSTAT_INCR(mhp, nnoreclaim);
					/*
					 * This should not happen as this
					 * thread is deleting the page.
					 * If this code is generalized, this
					 * becomes a reality.
					 */
#ifdef DEBUG
					cmn_err(CE_WARN,
					    "delete_memory_thread(0x%p) "
					    "pfn 0x%lx has no page_t",
					    (void *)mhp, pfn);
#endif /* DEBUG */
					mutex_enter(&mhp->mh_mutex);
					continue;
				}
				if (page_try_reclaim_lock(pp, SE_EXCL,
				    SE_EXCL_WANTED | SE_RETIRED)) {
					if (PP_ISFREE(pp)) {
						goto free_page_collect;
					}
					page_unlock(pp);
				}
				MDSTAT_INCR(mhp, nnoreclaim);
				mutex_enter(&mhp->mh_mutex);
				continue;
2142 * Got some freemem and a target
2143 * page, so move the data to avoid
2144 * I/O and lock problems.
2146 ASSERT(!page_iolock_assert(pp
));
2147 MDSTAT_INCR(mhp
, nreloc
);
2149 * page_relocate() will return pgcnt: the
2150 * number of consecutive pages relocated.
2151 * If it is successful, pp will be a
2152 * linked list of the page structs that
2153 * were relocated. If page_relocate() is
2154 * unsuccessful, pp will be unmodified.
2156 #ifdef MEM_DEL_STATS
2157 start_pgrp
= ddi_get_lbolt();
2158 #endif /* MEM_DEL_STATS */
2159 result
= page_relocate(&pp
, &pp_targ
, 0, 0,
2161 #ifdef MEM_DEL_STATS
2162 ntick_pgrp
= (uint64_t)ddi_get_lbolt() -
2164 #endif /* MEM_DEL_STATS */
2165 MDSTAT_PGRP(mhp
, ntick_pgrp
);
2167 MDSTAT_INCR(mhp
, nrelocfail
);
2169 * We did not succeed. We need
2170 * to give the pp_targ pages back.
2171 * page_free(pp_targ, 1) without
2172 * the freemem accounting.
2174 group_page_unlock(pp
);
2175 page_free_replacement_page(pp_targ
);
2177 mutex_enter(&mhp
->mh_mutex
);
2182 * We will then collect pgcnt pages.
2185 mutex_enter(&mhp
->mh_mutex
);
2187 * We need to make sure freemem_left is
2190 while ((freemem_left
< pgcnt
) &&
2191 (!mhp
->mh_cancel
)) {
2193 delthr_get_freemem(mhp
);
2197 * Do not proceed if mh_cancel is set.
2199 if (mhp
->mh_cancel
) {
2200 while (pp_targ
!= NULL
) {
2202 * Unlink and unlock each page.
2205 page_sub(&pp_targ
, tpp_targ
);
2206 page_unlock(tpp_targ
);
2209 * We need to give the pp pages back.
2210 * page_free(pp, 1) without the
2211 * freemem accounting.
2213 page_free_replacement_page(pp
);
2217 /* Now remove pgcnt from freemem_left */
2218 freemem_left
-= pgcnt
;
2219 ASSERT(freemem_left
>= 0);
2221 while (pp
!= NULL
) {
2223 * pp and pp_targ were passed back as
2224 * a linked list of pages.
2225 * Unlink and unlock each page.
2228 page_sub(&pp_targ
, tpp_targ
);
2229 page_unlock(tpp_targ
);
2231 * The original page is now free
2232 * so remove it from the linked
2233 * list and collect it.
2237 pfn
= page_pptonum(tpp
);
2239 ASSERT(PAGE_EXCL(tpp
));
2240 ASSERT(tpp
->p_vnode
== NULL
);
2241 ASSERT(!hat_page_is_mapped(tpp
));
2242 ASSERT(tpp
->p_szc
== szc
);
2244 page_delete_collect(tpp
, mhp
);
2245 bit
= pfn
- mdsp
->mds_base
;
2246 mdsp
->mds_bitmap
[bit
/ NBPBMW
] |=
2247 (1 << (bit
% NBPBMW
));
2249 ASSERT(pp_targ
== NULL
);
		first_scan = 0;
		if ((mhp->mh_cancel == 0) && (mhp->mh_hold_todo != 0) &&
		    (collected == 0)) {
			/*
			 * This code is needed as we cannot wait
			 * for a page to be locked OR the delete to
			 * be cancelled.  Also, we must delay so
			 * that other threads get a chance to run
			 * on our cpu, otherwise page locks may be
			 * held indefinitely by those threads.
			 */
			MDSTAT_INCR(mhp, ndelay);
			CALLB_CPR_SAFE_BEGIN(&cprinfo);
			(void) cv_reltimedwait(&mhp->mh_cv, &mhp->mh_mutex,
			    DEL_BUSY_WAIT_TICKS, TR_CLOCK_TICK);
			CALLB_CPR_SAFE_END(&cprinfo, &mhp->mh_mutex);
		}
	}
	/* stop the dr aio cleanup thread */
	mhp->mh_dr_aio_cleanup_cancel = 1;
	transit_list_collect(mhp, 0);
	if (freemem_left != 0) {
		/* Return any surplus. */
		page_create_putback(freemem_left);
		freemem_left = 0;
	}
#ifdef MEM_DEL_STATS
	ntick_total = (uint64_t)ddi_get_lbolt() - start_total;
#endif /* MEM_DEL_STATS */
	MDSTAT_TOTAL(mhp, ntick_total);
	/*
	 * If the memory delete was cancelled, exclusive-wanted bits must
	 * be cleared. If there are retired pages being deleted, they need
	 * to be unretired.
	 */
	for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
	    mdsp = mdsp->mds_next) {
		pfn_t pfn, p_end;

		p_end = mdsp->mds_base + mdsp->mds_npgs;
		for (pfn = mdsp->mds_base; pfn < p_end; pfn++) {
			page_t *pp;
			pgcnt_t bit;

			bit = pfn - mdsp->mds_base;
			if (mhp->mh_cancel) {
				pp = page_numtopp_nolock(pfn);
				if (pp != NULL) {
					if ((mdsp->mds_bitmap[bit / NBPBMW] &
					    (1 << (bit % NBPBMW))) == 0) {
						page_lock_clr_exclwanted(pp);
					}
				}
			} else {
				pp = NULL;
			}
			if ((mdsp->mds_bitmap_retired[bit / NBPBMW] &
			    (1 << (bit % NBPBMW))) != 0) {
				/* do we already have pp? */
				if (pp == NULL) {
					pp = page_numtopp_nolock(pfn);
				}
				ASSERT(pp != NULL);
				ASSERT(PP_RETIRED(pp));
				if (mhp->mh_cancel != 0) {
					page_unlock(pp);
					/*
					 * To satisfy ASSERT below in
					 * cancel code.
					 */
					mhp->mh_hold_todo++;
				} else {
					(void) page_unretire_pp(pp,
					    PR_UNR_CLEAN);
				}
			}
		}
	}
2335 for (mdsp
= mhp
->mh_transit
.trl_spans
; mdsp
!= NULL
;
2336 mdsp
= mdsp
->mds_next
) {
2337 ASSERT(mdsp
->mds_bitmap_retired
!= NULL
);
2338 kmem_free(mdsp
->mds_bitmap_retired
, MDS_BITMAPBYTES(mdsp
));
2339 mdsp
->mds_bitmap_retired
= NULL
; /* Paranoia. */
2340 ASSERT(mdsp
->mds_bitmap
!= NULL
);
2341 kmem_free(mdsp
->mds_bitmap
, MDS_BITMAPBYTES(mdsp
));
2342 mdsp
->mds_bitmap
= NULL
; /* Paranoia. */
2345 /* wait for our dr aio cancel thread to exit */
2346 while (!(mhp
->mh_aio_cleanup_done
)) {
2347 CALLB_CPR_SAFE_BEGIN(&cprinfo
);
2348 delay(drv_usectohz(DR_AIO_CLEANUP_DELAY
));
2349 CALLB_CPR_SAFE_END(&cprinfo
, &mhp
->mh_mutex
);
refused:
	if (mhp->mh_cancel != 0) {
		page_t *pp;

		comp_code = mhp->mh_cancel;
		/*
		 * Go through list of deleted pages (mh_deleted) freeing
		 * them.
		 */
		while ((pp = mhp->mh_deleted) != NULL) {
			mhp->mh_deleted = pp->p_next;
			mhp->mh_hold_todo++;
			mutex_exit(&mhp->mh_mutex);
			/* Restore p_next. */
			pp->p_next = pp->p_prev;
			if (PP_ISFREE(pp)) {
				cmn_err(CE_PANIC,
				    "page %p is free",
				    (void *)pp);
			}
			page_free(pp, 1);
			mutex_enter(&mhp->mh_mutex);
		}
		ASSERT(mhp->mh_hold_todo == mhp->mh_vm_pages);

		mutex_exit(&mhp->mh_mutex);
		put_availrmem(mhp->mh_vm_pages);
		mutex_enter(&mhp->mh_mutex);

		goto t_exit;
	}
	/*
	 * All the pages are no longer in use and are exclusively locked.
	 */

	mhp->mh_deleted = NULL;

	kphysm_del_cleanup(mhp);

	/*
	 * mem_node_del_range needs to be after kphysm_del_cleanup so
	 * that the mem_node_config[] will remain intact for the cleanup.
	 */
	for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
	    mdsp = mdsp->mds_next) {
		mem_node_del_range(mdsp->mds_base,
		    mdsp->mds_base + mdsp->mds_npgs - 1);
	}
	/* cleanup the page counters */
	page_ctrs_cleanup();

	comp_code = KPHYSM_OK;

t_exit:
	mutex_exit(&mhp->mh_mutex);
	kphysm_setup_post_del(mhp->mh_vm_pages,
	    (comp_code == KPHYSM_OK) ? 0 : 1);
	mutex_enter(&mhp->mh_mutex);

early_exit:
	/* mhp->mh_mutex exited by CALLB_CPR_EXIT() */
	mhp->mh_state = MHND_DONE;
	del_complete_funcp = mhp->mh_delete_complete;
	del_complete_arg = mhp->mh_delete_complete_arg;
	CALLB_CPR_EXIT(&cprinfo);
	(*del_complete_funcp)(del_complete_arg, comp_code);
	thread_exit();
	/*NOTREACHED*/
}
/*
 * Start the delete of the memory from the system.
 */
int
kphysm_del_start(
	memhandle_t handle,
	void (*complete)(void *, int),
	void *complete_arg)
{
	struct mem_handle *mhp;

	mhp = kphysm_lookup_mem_handle(handle);
	if (mhp == NULL) {
		return (KPHYSM_EHANDLE);
	}
	switch (mhp->mh_state) {
	case MHND_FREE:
		ASSERT(mhp->mh_state != MHND_FREE);
		mutex_exit(&mhp->mh_mutex);
		return (KPHYSM_EHANDLE);
	case MHND_INIT:
		break;
	case MHND_STARTING:
	case MHND_RUNNING:
		mutex_exit(&mhp->mh_mutex);
		return (KPHYSM_ESEQUENCE);
	case MHND_DONE:
		mutex_exit(&mhp->mh_mutex);
		return (KPHYSM_ESEQUENCE);
	case MHND_RELEASE:
		mutex_exit(&mhp->mh_mutex);
		return (KPHYSM_ESEQUENCE);
	default:
		cmn_err(CE_WARN, "kphysm_del_start(0x%p) state corrupt %d",
		    (void *)mhp, mhp->mh_state);
		mutex_exit(&mhp->mh_mutex);
		return (KPHYSM_EHANDLE);
	}

	if (mhp->mh_transit.trl_spans == NULL) {
		mutex_exit(&mhp->mh_mutex);
		return (KPHYSM_ENOWORK);
	}

	ASSERT(complete != NULL);
	mhp->mh_delete_complete = complete;
	mhp->mh_delete_complete_arg = complete_arg;
	mhp->mh_state = MHND_STARTING;
	/*
	 * Release the mutex in case thread_create sleeps.
	 */
	mutex_exit(&mhp->mh_mutex);

	/*
	 * The "obvious" process for this thread is pageout (proc_pageout)
	 * but this gives the thread too much power over freemem
	 * which results in freemem starvation.
	 */
	(void) thread_create(NULL, 0, delete_memory_thread, mhp, 0, &p0,
	    TS_RUN, maxclsyspri - 1);

	return (KPHYSM_OK);
}
static kmutex_t pp_dummy_lock;		/* Protects init. of pp_dummy. */
static caddr_t pp_dummy;
static pgcnt_t pp_dummy_npages;
static pfn_t *pp_dummy_pfn;		/* Array of dummy pfns. */

static void
memseg_remap_init_pages(page_t *pages, page_t *epages)
{
	page_t *pp;

	for (pp = pages; pp < epages; pp++) {
		pp->p_pagenum = PFN_INVALID;	/* XXXX */
		pp->p_offset = (u_offset_t)-1;
		page_iolock_init(pp);
		while (!page_lock(pp, SE_EXCL, (kmutex_t *)NULL, P_RECLAIM))
			continue;
		page_lock_delete(pp);
	}
}

void
memseg_remap_init()
{
	mutex_enter(&pp_dummy_lock);
	if (pp_dummy == NULL) {
		pgcnt_t dpages;
		int i;

		/*
		 * dpages starts off as the size of the structure and
		 * ends up as the minimum number of pages that will
		 * hold a whole number of page_t structures.
		 */
		dpages = sizeof (page_t);
		ASSERT(dpages != 0);
		ASSERT(dpages <= MMU_PAGESIZE);

		while ((dpages & 1) == 0)
			dpages >>= 1;

		pp_dummy_npages = dpages;
		/*
		 * Allocate pp_dummy pages directly from static_arena,
		 * since these are whole page allocations and are
		 * referenced by physical address.  This also has the
		 * nice fringe benefit of hiding the memory from
		 * ::findleaks since it doesn't deal well with allocated
		 * kernel heap memory that doesn't have any mappings.
		 */
		pp_dummy = vmem_xalloc(static_arena, ptob(pp_dummy_npages),
		    PAGESIZE, 0, 0, NULL, NULL, VM_SLEEP);
		bzero(pp_dummy, ptob(pp_dummy_npages));
		ASSERT(((uintptr_t)pp_dummy & MMU_PAGEOFFSET) == 0);
		pp_dummy_pfn = kmem_alloc(sizeof (*pp_dummy_pfn) *
		    pp_dummy_npages, KM_SLEEP);
		for (i = 0; i < pp_dummy_npages; i++) {
			pp_dummy_pfn[i] = hat_getpfnum(kas.a_hat,
			    &pp_dummy[MMU_PAGESIZE * i]);
			ASSERT(pp_dummy_pfn[i] != PFN_INVALID);
		}
		/*
		 * Initialize the page_t's to a known 'deleted' state
		 * that matches the state of deleted pages.
		 */
		memseg_remap_init_pages((page_t *)pp_dummy,
		    (page_t *)(pp_dummy + ptob(pp_dummy_npages)));
		/* Remove kmem mappings for the pages for safety. */
		hat_unload(kas.a_hat, pp_dummy, ptob(pp_dummy_npages),
		    HAT_UNLOAD_UNLOCK);
		/* Leave pp_dummy pointer set as flag that init is done. */
	}
	mutex_exit(&pp_dummy_lock);
}

/*
 * Remap a page-aligned range of page_t's to dummy pages.
 */
void
remap_to_dummy(caddr_t va, pgcnt_t metapgs)
{
	int phase;

	ASSERT(IS_P2ALIGNED((uint64_t)(uintptr_t)va, PAGESIZE));

	/*
	 * We may start remapping at a non-zero page offset
	 * within the dummy pages since the low/high ends
	 * of the outgoing pp's could be shared by other
	 * memsegs (see memseg_remap_meta).
	 */
	phase = btop((uint64_t)(uintptr_t)va) % pp_dummy_npages;
	/*CONSTCOND*/
	ASSERT(PAGESIZE % sizeof (page_t) || phase == 0);

	while (metapgs != 0) {
		pgcnt_t n;
		int i, j;

		n = pp_dummy_npages;
		if (n > metapgs)
			n = metapgs;
		for (i = 0; i < n; i++) {
			j = (i + phase) % pp_dummy_npages;
			hat_devload(kas.a_hat, va, ptob(1), pp_dummy_pfn[j],
			    PROT_READ,
			    HAT_LOAD | HAT_LOAD_NOCONSIST |
			    HAT_LOAD_REMAP);
			va += ptob(1);
		}
		metapgs -= n;
	}
}
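
/*
 * Note on 'phase' above: virtual page k of a metadata range always maps
 * to dummy page (k % pp_dummy_npages), because j depends only on the
 * absolute VA page number. A page_t that straddles a page boundary
 * shared between two memsegs therefore sees the same dummy backing no
 * matter which memseg's metadata happens to be remapped first.
 */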

static void
memseg_remap_to_dummy(struct memseg *seg)
{
	caddr_t pp;
	pgcnt_t metapgs;

	ASSERT(memseg_is_dynamic(seg));
	ASSERT(pp_dummy != NULL);

	if (!memseg_includes_meta(seg)) {
		memseg_remap_meta(seg);
		return;
	}

	pp = (caddr_t)seg->pages;
	metapgs = seg->pages_base - memseg_get_start(seg);
	ASSERT(metapgs != 0);

	seg->pages_end = seg->pages_base;

	remap_to_dummy(pp, metapgs);
}

/*
 * Transition all the deleted pages to the deleted state so that
 * page_lock will not wait. The page_lock_delete call will
 * also wake up any waiters.
 */
void
memseg_lock_delete_all(struct memseg *seg)
{
	page_t *pp;

	for (pp = seg->pages; pp < seg->epages; pp++) {
		pp->p_pagenum = PFN_INVALID;	/* XXXX */
		page_lock_delete(pp);
	}
}

static void
kphysm_del_cleanup(struct mem_handle *mhp)
{
	struct memdelspan	*mdsp;
	struct memseg		*seg;
	struct memseg		**segpp;
	struct memseg		*seglist;
	pfn_t			p_end;
	uint64_t		avmem;
	pgcnt_t			avpgs;
	pgcnt_t			npgs;

	avpgs = mhp->mh_vm_pages;

	memsegs_lock(1);

	/*
	 * remove from main segment list.
	 */
	npgs = 0;
	seglist = NULL;
	for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
	    mdsp = mdsp->mds_next) {
		p_end = mdsp->mds_base + mdsp->mds_npgs;
		for (segpp = &memsegs; (seg = *segpp) != NULL; ) {
			if (seg->pages_base >= p_end ||
			    seg->pages_end <= mdsp->mds_base) {
				/* Span and memseg don't overlap. */
				segpp = &((*segpp)->next);
				continue;
			}
			ASSERT(seg->pages_base >= mdsp->mds_base);
			ASSERT(seg->pages_end <= p_end);

			PLCNT_MODIFY_MAX(seg->pages_base,
			    seg->pages_base - seg->pages_end);

			/* Hide the memseg from future scans. */
			hat_kpm_delmem_mseg_update(seg, segpp);
			*segpp = seg->next;
			membar_producer();	/* TODO: Needed? */
			npgs += MSEG_NPAGES(seg);

			/*
			 * Leave the deleted segment's next pointer intact
			 * in case a memsegs scanning loop is walking this
			 * segment concurrently.
			 */
			seg->lnext = seglist;
			seglist = seg;
		}
	}

	build_pfn_hash();

	ASSERT(npgs < total_pages);
	total_pages -= npgs;

	/*
	 * Recalculate the paging parameters now total_pages has changed.
	 * This will also cause the clock hands to be reset before next use.
	 */
	setupclock(1);

	memsegs_unlock(1);

	mutex_exit(&mhp->mh_mutex);

	while ((seg = seglist) != NULL) {
		pfn_t mseg_start;
		pfn_t mseg_base, mseg_end;
		pgcnt_t mseg_npgs;
		int mlret;

		seglist = seg->lnext;

		/*
		 * Put the page_t's into the deleted state to stop
		 * cv_wait()s on the pages. When we remap, the dummy
		 * page_t's will be in the same state.
		 */
		memseg_lock_delete_all(seg);
		/*
		 * Collect up information based on pages_base and pages_end
		 * early so that we can flag early that the memseg has been
		 * deleted by setting pages_end == pages_base.
		 */
		mseg_base = seg->pages_base;
		mseg_end = seg->pages_end;
		mseg_npgs = MSEG_NPAGES(seg);
		mseg_start = memseg_get_start(seg);

		if (memseg_is_dynamic(seg)) {
			/* Remap the meta data to our special dummy area. */
			memseg_remap_to_dummy(seg);

			mutex_enter(&memseg_lists_lock);
			seg->lnext = memseg_va_avail;
			memseg_va_avail = seg;
			mutex_exit(&memseg_lists_lock);
		} else {
			/*
			 * For memory whose page_ts were allocated
			 * at boot, we need to find a new use for
			 * the page_t memory.
			 * For the moment, just leak it.
			 * (It is held in the memseg_delete_junk list.)
			 */
			seg->pages_end = seg->pages_base;

			mutex_enter(&memseg_lists_lock);
			seg->lnext = memseg_delete_junk;
			memseg_delete_junk = seg;
			mutex_exit(&memseg_lists_lock);
		}

		/* Must not use seg now as it could be re-used. */

		memlist_write_lock();

		mlret = memlist_delete_span(
		    (uint64_t)(mseg_base) << PAGESHIFT,
		    (uint64_t)(mseg_npgs) << PAGESHIFT,
		    &phys_avail);
		ASSERT(mlret == MEML_SPANOP_OK);

		mlret = memlist_delete_span(
		    (uint64_t)(mseg_start) << PAGESHIFT,
		    (uint64_t)(mseg_end - mseg_start) <<
		    PAGESHIFT,
		    &phys_install);
		ASSERT(mlret == MEML_SPANOP_OK);
		phys_install_has_changed();

		memlist_write_unlock();
	}

	memlist_read_lock();
	installed_top_size(phys_install, &physmax, &physinstalled);
	memlist_read_unlock();

	mutex_enter(&freemem_lock);
	maxmem -= avpgs;
	physmem -= avpgs;
	/* availrmem is adjusted during the delete. */
	availrmem_initial -= avpgs;

	mutex_exit(&freemem_lock);

	dump_resize();

	cmn_err(CE_CONT, "?kphysm_delete: mem = %ldK "
	    "(0x%" PRIx64 ")\n",
	    physinstalled << (PAGESHIFT - 10),
	    (uint64_t)physinstalled << PAGESHIFT);

	avmem = (uint64_t)freemem << PAGESHIFT;
	cmn_err(CE_CONT, "?kphysm_delete: "
	    "avail mem = %" PRId64 "\n", avmem);

	/*
	 * Update lgroup generation number on single lgroup systems
	 */
	if (nlgrps == 1)
		lgrp_config(LGRP_CONFIG_GEN_UPDATE, 0, 0);

	/* Successfully deleted system memory */
	mutex_enter(&mhp->mh_mutex);
}
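
/*
 * Summary of the teardown ordering above: each memseg is first unhooked
 * from the global list (so new scans cannot find it), its page_t's are
 * then pushed into the locked/deleted state to release any waiters, the
 * metadata is remapped to the dummy pages (or leaked, for boot-time
 * memsegs), and only then are the phys_avail/phys_install memlists and
 * the global memory counters updated.
 */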

static uint_t mdel_nullvp_waiter;

static void
page_delete_collect(
	page_t *pp,
	struct mem_handle *mhp)
{
	if (pp->p_vnode) {
		page_hashout(pp, (kmutex_t *)NULL);
		/* do not do PP_SETAGED(pp); */
	} else {
		kmutex_t *sep;

		sep = page_se_mutex(pp);
		mutex_enter(sep);
		if (CV_HAS_WAITERS(&pp->p_cv)) {
			mdel_nullvp_waiter++;
			cv_broadcast(&pp->p_cv);
		}
		mutex_exit(sep);
	}
	ASSERT(pp->p_next == pp->p_prev);
	ASSERT(pp->p_next == NULL || pp->p_next == pp);
	pp->p_next = mhp->mh_deleted;
	mhp->mh_deleted = pp;
	ASSERT(mhp->mh_hold_todo != 0);
	mhp->mh_hold_todo--;
}

static void
transit_list_collect(struct mem_handle *mhp, int v)
{
	struct transit_list_head *trh;

	trh = &transit_list_head;
	mutex_enter(&trh->trh_lock);
	mhp->mh_transit.trl_collect = v;
	mutex_exit(&trh->trh_lock);
}

static void
transit_list_insert(struct transit_list *tlp)
{
	struct transit_list_head *trh;

	trh = &transit_list_head;
	ASSERT(MUTEX_HELD(&trh->trh_lock));
	tlp->trl_next = trh->trh_head;
	trh->trh_head = tlp;
}

static void
transit_list_remove(struct transit_list *tlp)
{
	struct transit_list_head *trh;
	struct transit_list **tlpp;

	trh = &transit_list_head;
	tlpp = &trh->trh_head;
	ASSERT(MUTEX_HELD(&trh->trh_lock));
	while (*tlpp != NULL && *tlpp != tlp)
		tlpp = &(*tlpp)->trl_next;
	ASSERT(*tlpp != NULL);
	if (*tlpp != NULL) {
		*tlpp = tlp->trl_next;
		tlp->trl_next = NULL;
	}
}

static struct transit_list *
pfnum_to_transit_list(struct transit_list_head *trh, pfn_t pfnum)
{
	struct transit_list *tlp;

	for (tlp = trh->trh_head; tlp != NULL; tlp = tlp->trl_next) {
		struct memdelspan *mdsp;

		for (mdsp = tlp->trl_spans; mdsp != NULL;
		    mdsp = mdsp->mds_next) {
			if (pfnum >= mdsp->mds_base &&
			    pfnum < (mdsp->mds_base + mdsp->mds_npgs)) {
				return (tlp);
			}
		}
	}
	return (NULL);
}

int
pfn_is_being_deleted(pfn_t pfnum)
{
	struct transit_list_head *trh;
	struct transit_list *tlp;
	int ret;

	trh = &transit_list_head;
	if (trh->trh_head == NULL)
		return (0);

	mutex_enter(&trh->trh_lock);
	tlp = pfnum_to_transit_list(trh, pfnum);
	ret = (tlp != NULL && tlp->trl_collect);
	mutex_exit(&trh->trh_lock);

	return (ret);
}

#ifdef MEM_DEL_STATS

static void
mem_del_stat_print_func(struct mem_handle *mhp)
{
	uint64_t tmp;

	if (mem_del_stat_print) {
		printf("memory delete loop %x/%x, statistics%s\n",
		    (uint_t)mhp->mh_transit.trl_spans->mds_base,
		    (uint_t)mhp->mh_transit.trl_spans->mds_npgs,
		    (mhp->mh_cancel ? " (cancelled)" : ""));
		printf("\t%8u nloop\n", mhp->mh_delstat.nloop);
		printf("\t%8u need_free\n", mhp->mh_delstat.need_free);
		printf("\t%8u free_loop\n", mhp->mh_delstat.free_loop);
		printf("\t%8u free_low\n", mhp->mh_delstat.free_low);
		printf("\t%8u free_failed\n", mhp->mh_delstat.free_failed);
		printf("\t%8u ncheck\n", mhp->mh_delstat.ncheck);
		printf("\t%8u nopaget\n", mhp->mh_delstat.nopaget);
		printf("\t%8u lockfail\n", mhp->mh_delstat.lockfail);
		printf("\t%8u nfree\n", mhp->mh_delstat.nfree);
		printf("\t%8u nreloc\n", mhp->mh_delstat.nreloc);
		printf("\t%8u nrelocfail\n", mhp->mh_delstat.nrelocfail);
		printf("\t%8u already_done\n", mhp->mh_delstat.already_done);
		printf("\t%8u first_notfree\n", mhp->mh_delstat.first_notfree);
		printf("\t%8u npplocked\n", mhp->mh_delstat.npplocked);
		printf("\t%8u nlockreloc\n", mhp->mh_delstat.nlockreloc);
		printf("\t%8u nnorepl\n", mhp->mh_delstat.nnorepl);
		printf("\t%8u nmodreloc\n", mhp->mh_delstat.nmodreloc);
		printf("\t%8u ndestroy\n", mhp->mh_delstat.ndestroy);
		printf("\t%8u nputpage\n", mhp->mh_delstat.nputpage);
		printf("\t%8u nnoreclaim\n", mhp->mh_delstat.nnoreclaim);
		printf("\t%8u ndelay\n", mhp->mh_delstat.ndelay);
		printf("\t%8u demotefail\n", mhp->mh_delstat.demotefail);
		printf("\t%8u retired\n", mhp->mh_delstat.retired);
		printf("\t%8u toxic\n", mhp->mh_delstat.toxic);
		printf("\t%8u failing\n", mhp->mh_delstat.failing);
		printf("\t%8u modtoxic\n", mhp->mh_delstat.modtoxic);
		printf("\t%8u npplkdtoxic\n", mhp->mh_delstat.npplkdtoxic);
		printf("\t%8u gptlmodfail\n", mhp->mh_delstat.gptlmodfail);
		printf("\t%8u gptllckfail\n", mhp->mh_delstat.gptllckfail);
		tmp = mhp->mh_delstat.nticks_total / hz;  /* seconds */
		printf(
		    "\t%"PRIu64 " nticks_total - %"PRIu64 " min %"PRIu64 " sec\n",
		    mhp->mh_delstat.nticks_total, tmp / 60, tmp % 60);

		tmp = mhp->mh_delstat.nticks_pgrp / hz;  /* seconds */
		printf(
		    "\t%"PRIu64 " nticks_pgrp - %"PRIu64 " min %"PRIu64 " sec\n",
		    mhp->mh_delstat.nticks_pgrp, tmp / 60, tmp % 60);
	}
}
#endif /* MEM_DEL_STATS */

struct mem_callback {
	kphysm_setup_vector_t	*vec;
	void			*arg;
};

#define	NMEMCALLBACKS		100

static struct mem_callback mem_callbacks[NMEMCALLBACKS];
static uint_t nmemcallbacks;
static krwlock_t mem_callback_rwlock;

int
kphysm_setup_func_register(kphysm_setup_vector_t *vec, void *arg)
{
	uint_t i, found;

	/*
	 * This test will become more complicated when the version must
	 * change.
	 */
	if (vec->version != KPHYSM_SETUP_VECTOR_VERSION)
		return (EINVAL);

	if (vec->post_add == NULL || vec->pre_del == NULL ||
	    vec->post_del == NULL)
		return (EINVAL);

	rw_enter(&mem_callback_rwlock, RW_WRITER);
	for (i = 0, found = 0; i < nmemcallbacks; i++) {
		if (mem_callbacks[i].vec == NULL && found == 0)
			found = i + 1;
		if (mem_callbacks[i].vec == vec &&
		    mem_callbacks[i].arg == arg) {
#ifdef DEBUG
			/* Catch this in DEBUG kernels. */
			cmn_err(CE_WARN, "kphysm_setup_func_register"
			    "(0x%p, 0x%p) duplicate registration from 0x%p",
			    (void *)vec, arg, (void *)caller());
#endif /* DEBUG */
			rw_exit(&mem_callback_rwlock);
			return (EEXIST);
		}
	}
	if (found != 0) {
		i = found - 1;
	} else {
		ASSERT(nmemcallbacks < NMEMCALLBACKS);
		if (nmemcallbacks == NMEMCALLBACKS) {
			rw_exit(&mem_callback_rwlock);
			return (ENOMEM);
		}
		i = nmemcallbacks++;
	}
	mem_callbacks[i].vec = vec;
	mem_callbacks[i].arg = arg;
	rw_exit(&mem_callback_rwlock);
	return (0);
}
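
/*
 * Example registration (a sketch; the example_* names are hypothetical
 * and the initializer order is assumed to match the field checks above):
 * a client supplies all three callbacks and registers once, typically
 * from its initialization path.
 *
 *	static void
 *	example_post_add(void *arg, pgcnt_t delta_pages)
 *	{
 *		... grow caches/limits for the added pages ...
 *	}
 *
 *	static int
 *	example_pre_del(void *arg, pgcnt_t delta_pages)
 *	{
 *		... a non-zero return asks for the delete to be refused;
 *		... see the ret |= aret accumulation below.
 *		return (0);
 *	}
 *
 *	static void
 *	example_post_del(void *arg, pgcnt_t delta_pages, int cancelled)
 *	{
 *		... cancelled != 0 means the delete did not complete ...
 *	}
 *
 *	static kphysm_setup_vector_t example_vec = {
 *		KPHYSM_SETUP_VECTOR_VERSION,
 *		example_post_add,
 *		example_pre_del,
 *		example_post_del
 *	};
 *
 *	(void) kphysm_setup_func_register(&example_vec, NULL);
 */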

void
kphysm_setup_func_unregister(kphysm_setup_vector_t *vec, void *arg)
{
	uint_t i;

	rw_enter(&mem_callback_rwlock, RW_WRITER);
	for (i = 0; i < nmemcallbacks; i++) {
		if (mem_callbacks[i].vec == vec &&
		    mem_callbacks[i].arg == arg) {
			mem_callbacks[i].vec = NULL;
			mem_callbacks[i].arg = NULL;
			if (i == (nmemcallbacks - 1))
				nmemcallbacks--;
			break;
		}
	}
	rw_exit(&mem_callback_rwlock);
}

static void
kphysm_setup_post_add(pgcnt_t delta_pages)
{
	uint_t i;

	rw_enter(&mem_callback_rwlock, RW_READER);
	for (i = 0; i < nmemcallbacks; i++) {
		if (mem_callbacks[i].vec != NULL) {
			(*mem_callbacks[i].vec->post_add)
			    (mem_callbacks[i].arg, delta_pages);
		}
	}
	rw_exit(&mem_callback_rwlock);
}

/*
 * Note the locking between pre_del and post_del: The reader lock is held
 * between the two calls to stop the set of functions from changing.
 */

static int
kphysm_setup_pre_del(pgcnt_t delta_pages)
{
	uint_t i;
	int ret;
	int aret;

	ret = 0;

	rw_enter(&mem_callback_rwlock, RW_READER);
	for (i = 0; i < nmemcallbacks; i++) {
		if (mem_callbacks[i].vec != NULL) {
			aret = (*mem_callbacks[i].vec->pre_del)
			    (mem_callbacks[i].arg, delta_pages);
			ret |= aret;
		}
	}

	return (ret);
}

static void
kphysm_setup_post_del(pgcnt_t delta_pages, int cancelled)
{
	uint_t i;

	for (i = 0; i < nmemcallbacks; i++) {
		if (mem_callbacks[i].vec != NULL) {
			(*mem_callbacks[i].vec->post_del)
			    (mem_callbacks[i].arg, delta_pages, cancelled);
		}
	}
	rw_exit(&mem_callback_rwlock);
}
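
/*
 * Sketch of how the pre_del/post_del pair is intended to be driven by
 * the delete path (illustrative, not a verbatim excerpt):
 *
 *	if (kphysm_setup_pre_del(npgs) != 0)
 *		... flag the delete as refused/cancelled ...
 *	... perform or abandon the delete ...
 *	kphysm_setup_post_del(npgs, cancelled);
 *
 * Whatever the outcome, post_del must follow pre_del so that the reader
 * lock taken in pre_del is dropped.
 */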

static int
kphysm_split_memseg(
	pfn_t base,
	pgcnt_t npgs)
{
	struct memseg *seg;
	struct memseg **segpp;
	pgcnt_t size_low, size_high;
	struct memseg *seg_low, *seg_mid, *seg_high;

	/*
	 * Lock the memsegs list against other updates now
	 */
	memsegs_lock(1);

	/*
	 * Find boot time memseg that wholly covers this area.
	 */

	/* First find the memseg with page 'base' in it. */
	for (segpp = &memsegs; (seg = *segpp) != NULL;
	    segpp = &((*segpp)->next)) {
		if (base >= seg->pages_base && base < seg->pages_end)
			break;
	}
	if (seg == NULL) {
		memsegs_unlock(1);
		return (0);
	}
	if (memseg_includes_meta(seg)) {
		memsegs_unlock(1);
		return (0);
	}
	if ((base + npgs) > seg->pages_end) {
		memsegs_unlock(1);
		return (0);
	}

	/*
	 * Work out the size of the two segments that will
	 * surround the new segment, one for low address
	 * and one for high.
	 */
	ASSERT(base >= seg->pages_base);
	size_low = base - seg->pages_base;
	ASSERT(seg->pages_end >= (base + npgs));
	size_high = seg->pages_end - (base + npgs);

	/*
	 * Sanity check.
	 */
	if ((size_low + size_high) == 0) {
		memsegs_unlock(1);
		return (0);
	}

	/*
	 * Allocate the new structures. The old memseg will not be freed
	 * as there may be a reference to it.
	 */
	seg_low = NULL;
	seg_high = NULL;

	if (size_low != 0)
		seg_low = memseg_alloc();

	seg_mid = memseg_alloc();

	if (size_high != 0)
		seg_high = memseg_alloc();

	/*
	 * All allocation done now.
	 */
	if (size_low != 0) {
		seg_low->pages = seg->pages;
		seg_low->epages = seg_low->pages + size_low;
		seg_low->pages_base = seg->pages_base;
		seg_low->pages_end = seg_low->pages_base + size_low;
		seg_low->next = seg_mid;
		seg_low->msegflags = seg->msegflags;
	}
	if (size_high != 0) {
		seg_high->pages = seg->epages - size_high;
		seg_high->epages = seg_high->pages + size_high;
		seg_high->pages_base = seg->pages_end - size_high;
		seg_high->pages_end = seg_high->pages_base + size_high;
		seg_high->next = seg->next;
		seg_high->msegflags = seg->msegflags;
	}

	seg_mid->pages = seg->pages + size_low;
	seg_mid->pages_base = seg->pages_base + size_low;
	seg_mid->epages = seg->epages - size_high;
	seg_mid->pages_end = seg->pages_end - size_high;
	seg_mid->next = (seg_high != NULL) ? seg_high : seg->next;
	seg_mid->msegflags = seg->msegflags;

	/*
	 * Update hat_kpm specific info of all involved memsegs and
	 * allow hat_kpm specific global chain updates.
	 */
	hat_kpm_split_mseg_update(seg, segpp, seg_low, seg_mid, seg_high);

	/*
	 * At this point we have two equivalent memseg sub-chains,
	 * seg and seg_low/seg_mid/seg_high, which both chain on to
	 * the same place in the global chain. By re-writing the pointer
	 * in the previous element we switch atomically from using the old
	 * (seg) to the new.
	 */
	*segpp = (seg_low != NULL) ? seg_low : seg_mid;

	membar_enter();

	build_pfn_hash();
	memsegs_unlock(1);

	/*
	 * We leave the old segment, 'seg', intact as there may be
	 * references to it. Also, as the value of total_pages has not
	 * changed and the memsegs list is effectively the same when
	 * accessed via the old or the new pointer, we do not have to
	 * cause pageout_scanner() to re-evaluate its hand pointers.
	 *
	 * We currently do not re-use or reclaim the page_t memory.
	 * If we do, then this may have to change.
	 */

	mutex_enter(&memseg_lists_lock);
	seg->lnext = memseg_edit_junk;
	memseg_edit_junk = seg;
	mutex_exit(&memseg_lists_lock);

	return (1);
}
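
/*
 * Shape of a completed split (illustration): the original boot memseg
 * covering [pages_base, pages_end) is replaced in the chain by up to
 * three new memsegs that share its page_t array:
 *
 *	seg:     [pages_base ........................... pages_end)
 *	becomes: [ seg_low ][ seg_mid (base, npgs) ][ seg_high    ]
 *	          size_low   npgs                    size_high
 *
 * seg_low and/or seg_high are omitted when their size is zero; 'seg'
 * itself is kept on memseg_edit_junk because readers may still hold
 * references to it.
 */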

/*
 * The sfmmu hat layer (e.g.) accesses some parts of the memseg
 * structure using physical addresses. Therefore a kmem_cache is
 * used with KMC_NOHASH to avoid page crossings within a memseg
 * structure. KMC_NOHASH requires that no external (outside of
 * slab) information is allowed. This, in turn, implies that the
 * cache's slabsize must be exactly a single page, since per-slab
 * information (e.g. the freelist for the slab) is kept at the
 * end of the slab, where it is easy to locate. Should be changed
 * when a more obvious kmem_cache interface/flag will become
 * available.
 */
void
mem_config_init()
{
	memseg_cache = kmem_cache_create("memseg_cache", sizeof (struct memseg),
	    0, NULL, NULL, NULL, NULL, static_arena, KMC_NOHASH);
}

struct memseg *
memseg_alloc(void)
{
	struct memseg *seg;

	seg = kmem_cache_alloc(memseg_cache, KM_SLEEP);
	bzero(seg, sizeof (struct memseg));

	return (seg);
}

/*
 * Return whether the page_t memory for this memseg
 * is included in the memseg itself.
 */
static int
memseg_includes_meta(struct memseg *seg)
{
	return (seg->msegflags & MEMSEG_META_INCL);
}

pfn_t
memseg_get_start(struct memseg *seg)
{
	pfn_t		pt_start;

	if (memseg_includes_meta(seg)) {
		pt_start = hat_getpfnum(kas.a_hat, (caddr_t)seg->pages);

		/* Meta data is required to be at the beginning */
		ASSERT(pt_start < seg->pages_base);
	} else
		pt_start = seg->pages_base;

	return (pt_start);
}

/*
 * Invalidate memseg pointers in cpu private vm data caches.
 */
static void
memseg_cpu_vm_flush(void)
{
	cpu_t *cp;
	vm_cpu_data_t *vc;

	mutex_enter(&cpu_lock);
	pause_cpus(NULL, NULL);

	cp = cpu_list;
	do {
		vc = cp->cpu_vm_data;
		vc->vc_pnum_memseg = NULL;
		vc->vc_pnext_memseg = NULL;

	} while ((cp = cp->cpu_next) != cpu_list);

	start_cpus();
	mutex_exit(&cpu_lock);
}