6345 remove xhat support
[illumos-gate.git] / usr / src / uts / sfmmu / vm / hat_sfmmu.c
blob36d857b2a5acbf6e4b178496b7817477f28e538a
1 /*
2 * CDDL HEADER START
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
19 * CDDL HEADER END
22 * Copyright (c) 1993, 2010, Oracle and/or its affiliates. All rights reserved.
25 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
29 * VM - Hardware Address Translation management for Spitfire MMU.
31 * This file implements the machine specific hardware translation
32 * needed by the VM system. The machine independent interface is
33 * described in <vm/hat.h> while the machine dependent interface
34 * and data structures are described in <vm/hat_sfmmu.h>.
36 * The hat layer manages the address translation hardware as a cache
37 * driven by calls from the higher levels in the VM system.
40 #include <sys/types.h>
41 #include <sys/kstat.h>
42 #include <vm/hat.h>
43 #include <vm/hat_sfmmu.h>
44 #include <vm/page.h>
45 #include <sys/pte.h>
46 #include <sys/systm.h>
47 #include <sys/mman.h>
48 #include <sys/sysmacros.h>
49 #include <sys/machparam.h>
50 #include <sys/vtrace.h>
51 #include <sys/kmem.h>
52 #include <sys/mmu.h>
53 #include <sys/cmn_err.h>
54 #include <sys/cpu.h>
55 #include <sys/cpuvar.h>
56 #include <sys/debug.h>
57 #include <sys/lgrp.h>
58 #include <sys/archsystm.h>
59 #include <sys/machsystm.h>
60 #include <sys/vmsystm.h>
61 #include <vm/as.h>
62 #include <vm/seg.h>
63 #include <vm/seg_kp.h>
64 #include <vm/seg_kmem.h>
65 #include <vm/seg_kpm.h>
66 #include <vm/rm.h>
67 #include <sys/t_lock.h>
68 #include <sys/obpdefs.h>
69 #include <sys/vm_machparam.h>
70 #include <sys/var.h>
71 #include <sys/trap.h>
72 #include <sys/machtrap.h>
73 #include <sys/scb.h>
74 #include <sys/bitmap.h>
75 #include <sys/machlock.h>
76 #include <sys/membar.h>
77 #include <sys/atomic.h>
78 #include <sys/cpu_module.h>
79 #include <sys/prom_debug.h>
80 #include <sys/ksynch.h>
81 #include <sys/mem_config.h>
82 #include <sys/mem_cage.h>
83 #include <vm/vm_dep.h>
84 #include <sys/fpu/fpusystm.h>
85 #include <vm/mach_kpm.h>
86 #include <sys/callb.h>
88 #ifdef DEBUG
89 #define SFMMU_VALIDATE_HMERID(hat, rid, saddr, len) \
90 if (SFMMU_IS_SHMERID_VALID(rid)) { \
91 caddr_t _eaddr = (saddr) + (len); \
92 sf_srd_t *_srdp; \
93 sf_region_t *_rgnp; \
94 ASSERT((rid) < SFMMU_MAX_HME_REGIONS); \
95 ASSERT(SF_RGNMAP_TEST(hat->sfmmu_hmeregion_map, rid)); \
96 ASSERT((hat) != ksfmmup); \
97 _srdp = (hat)->sfmmu_srdp; \
98 ASSERT(_srdp != NULL); \
99 ASSERT(_srdp->srd_refcnt != 0); \
100 _rgnp = _srdp->srd_hmergnp[(rid)]; \
101 ASSERT(_rgnp != NULL && _rgnp->rgn_id == rid); \
102 ASSERT(_rgnp->rgn_refcnt != 0); \
103 ASSERT(!(_rgnp->rgn_flags & SFMMU_REGION_FREE)); \
104 ASSERT((_rgnp->rgn_flags & SFMMU_REGION_TYPE_MASK) == \
105 SFMMU_REGION_HME); \
106 ASSERT((saddr) >= _rgnp->rgn_saddr); \
107 ASSERT((saddr) < _rgnp->rgn_saddr + _rgnp->rgn_size); \
108 ASSERT(_eaddr > _rgnp->rgn_saddr); \
109 ASSERT(_eaddr <= _rgnp->rgn_saddr + _rgnp->rgn_size); \
112 #define SFMMU_VALIDATE_SHAREDHBLK(hmeblkp, srdp, rgnp, rid) \
114 caddr_t _hsva; \
115 caddr_t _heva; \
116 caddr_t _rsva; \
117 caddr_t _reva; \
118 int _ttesz = get_hblk_ttesz(hmeblkp); \
119 int _flagtte; \
120 ASSERT((srdp)->srd_refcnt != 0); \
121 ASSERT((rid) < SFMMU_MAX_HME_REGIONS); \
122 ASSERT((rgnp)->rgn_id == rid); \
123 ASSERT(!((rgnp)->rgn_flags & SFMMU_REGION_FREE)); \
124 ASSERT(((rgnp)->rgn_flags & SFMMU_REGION_TYPE_MASK) == \
125 SFMMU_REGION_HME); \
126 ASSERT(_ttesz <= (rgnp)->rgn_pgszc); \
127 _hsva = (caddr_t)get_hblk_base(hmeblkp); \
128 _heva = get_hblk_endaddr(hmeblkp); \
129 _rsva = (caddr_t)P2ALIGN( \
130 (uintptr_t)(rgnp)->rgn_saddr, HBLK_MIN_BYTES); \
131 _reva = (caddr_t)P2ROUNDUP( \
132 (uintptr_t)((rgnp)->rgn_saddr + (rgnp)->rgn_size), \
133 HBLK_MIN_BYTES); \
134 ASSERT(_hsva >= _rsva); \
135 ASSERT(_hsva < _reva); \
136 ASSERT(_heva > _rsva); \
137 ASSERT(_heva <= _reva); \
138 _flagtte = (_ttesz < HBLK_MIN_TTESZ) ? HBLK_MIN_TTESZ : \
139 _ttesz; \
140 ASSERT(rgnp->rgn_hmeflags & (0x1 << _flagtte)); \
143 #else /* DEBUG */
144 #define SFMMU_VALIDATE_HMERID(hat, rid, addr, len)
145 #define SFMMU_VALIDATE_SHAREDHBLK(hmeblkp, srdp, rgnp, rid)
146 #endif /* DEBUG */
148 #if defined(SF_ERRATA_57)
149 extern caddr_t errata57_limit;
150 #endif
152 #define HME8BLK_SZ_RND ((roundup(HME8BLK_SZ, sizeof (int64_t))) / \
153 (sizeof (int64_t)))
154 #define HBLK_RESERVE ((struct hme_blk *)hblk_reserve)
156 #define HBLK_RESERVE_CNT 128
157 #define HBLK_RESERVE_MIN 20
159 static struct hme_blk *freehblkp;
160 static kmutex_t freehblkp_lock;
161 static int freehblkcnt;
163 static int64_t hblk_reserve[HME8BLK_SZ_RND];
164 static kmutex_t hblk_reserve_lock;
165 static kthread_t *hblk_reserve_thread;
167 static nucleus_hblk8_info_t nucleus_hblk8;
168 static nucleus_hblk1_info_t nucleus_hblk1;
171 * Data to manage per-cpu hmeblk pending queues, hmeblks are queued here
172 * after the initial phase of removing an hmeblk from the hash chain, see
173 * the detailed comment in sfmmu_hblk_hash_rm() for further details.
175 static cpu_hme_pend_t *cpu_hme_pend;
176 static uint_t cpu_hme_pend_thresh;
178 * SFMMU specific hat functions
180 void hat_pagecachectl(struct page *, int);
182 /* flags for hat_pagecachectl */
183 #define HAT_CACHE 0x1
184 #define HAT_UNCACHE 0x2
185 #define HAT_TMPNC 0x4
188 * Flag to allow the creation of non-cacheable translations
189 * to system memory. It is off by default. At the moment this
190 * flag is used by the ecache error injector. The error injector
191 * will turn it on when creating such a translation then shut it
192 * off when it's finished.
195 int sfmmu_allow_nc_trans = 0;
198 * Flag to disable large page support.
199 * value of 1 => disable all large pages.
200 * bits 1, 2, and 3 are to disable 64K, 512K and 4M pages respectively.
202 * For example, use the value 0x4 to disable 512K pages.
205 #define LARGE_PAGES_OFF 0x1
208 * The disable_large_pages and disable_ism_large_pages variables control
209 * hat_memload_array and the page sizes to be used by ISM and the kernel.
211 * The disable_auto_data_large_pages and disable_auto_text_large_pages variables
212 * are only used to control which OOB pages to use at upper VM segment creation
213 * time, and are set in hat_init_pagesizes and used in the map_pgsz* routines.
214 * Their values may come from platform or CPU specific code to disable page
215 * sizes that should not be used.
217 * WARNING: 512K pages are currently not supported for ISM/DISM.
219 uint_t disable_large_pages = 0;
220 uint_t disable_ism_large_pages = (1 << TTE512K);
221 uint_t disable_auto_data_large_pages = 0;
222 uint_t disable_auto_text_large_pages = 0;
225 * Private sfmmu data structures for hat management
227 static struct kmem_cache *sfmmuid_cache;
228 static struct kmem_cache *mmuctxdom_cache;
231 * Private sfmmu data structures for tsb management
233 static struct kmem_cache *sfmmu_tsbinfo_cache;
234 static struct kmem_cache *sfmmu_tsb8k_cache;
235 static struct kmem_cache *sfmmu_tsb_cache[NLGRPS_MAX];
236 static vmem_t *kmem_bigtsb_arena;
237 static vmem_t *kmem_tsb_arena;
240 * sfmmu static variables for hmeblk resource management.
242 static vmem_t *hat_memload1_arena; /* HAT translation arena for sfmmu1_cache */
243 static struct kmem_cache *sfmmu8_cache;
244 static struct kmem_cache *sfmmu1_cache;
245 static struct kmem_cache *pa_hment_cache;
247 static kmutex_t ism_mlist_lock; /* mutex for ism mapping list */
249 * private data for ism
251 static struct kmem_cache *ism_blk_cache;
252 static struct kmem_cache *ism_ment_cache;
253 #define ISMID_STARTADDR NULL
256 * Region management data structures and function declarations.
259 static void sfmmu_leave_srd(sfmmu_t *);
260 static int sfmmu_srdcache_constructor(void *, void *, int);
261 static void sfmmu_srdcache_destructor(void *, void *);
262 static int sfmmu_rgncache_constructor(void *, void *, int);
263 static void sfmmu_rgncache_destructor(void *, void *);
264 static int sfrgnmap_isnull(sf_region_map_t *);
265 static int sfhmergnmap_isnull(sf_hmeregion_map_t *);
266 static int sfmmu_scdcache_constructor(void *, void *, int);
267 static void sfmmu_scdcache_destructor(void *, void *);
268 static void sfmmu_rgn_cb_noop(caddr_t, caddr_t, caddr_t,
269 size_t, void *, u_offset_t);
271 static uint_t srd_hashmask = SFMMU_MAX_SRD_BUCKETS - 1;
272 static sf_srd_bucket_t *srd_buckets;
273 static struct kmem_cache *srd_cache;
274 static uint_t srd_rgn_hashmask = SFMMU_MAX_REGION_BUCKETS - 1;
275 static struct kmem_cache *region_cache;
276 static struct kmem_cache *scd_cache;
278 #ifdef sun4v
279 int use_bigtsb_arena = 1;
280 #else
281 int use_bigtsb_arena = 0;
282 #endif
284 /* External /etc/system tunable, for turning on&off the shctx support */
285 int disable_shctx = 0;
286 /* Internal variable, set by MD if the HW supports shctx feature */
287 int shctx_on = 0;
289 #ifdef DEBUG
290 static void check_scd_sfmmu_list(sfmmu_t **, sfmmu_t *, int);
291 #endif
292 static void sfmmu_to_scd_list(sfmmu_t **, sfmmu_t *);
293 static void sfmmu_from_scd_list(sfmmu_t **, sfmmu_t *);
295 static sf_scd_t *sfmmu_alloc_scd(sf_srd_t *, sf_region_map_t *);
296 static void sfmmu_find_scd(sfmmu_t *);
297 static void sfmmu_join_scd(sf_scd_t *, sfmmu_t *);
298 static void sfmmu_finish_join_scd(sfmmu_t *);
299 static void sfmmu_leave_scd(sfmmu_t *, uchar_t);
300 static void sfmmu_destroy_scd(sf_srd_t *, sf_scd_t *, sf_region_map_t *);
301 static int sfmmu_alloc_scd_tsbs(sf_srd_t *, sf_scd_t *);
302 static void sfmmu_free_scd_tsbs(sfmmu_t *);
303 static void sfmmu_tsb_inv_ctx(sfmmu_t *);
304 static int find_ism_rid(sfmmu_t *, sfmmu_t *, caddr_t, uint_t *);
305 static void sfmmu_ism_hatflags(sfmmu_t *, int);
306 static int sfmmu_srd_lock_held(sf_srd_t *);
307 static void sfmmu_remove_scd(sf_scd_t **, sf_scd_t *);
308 static void sfmmu_add_scd(sf_scd_t **headp, sf_scd_t *);
309 static void sfmmu_link_scd_to_regions(sf_srd_t *, sf_scd_t *);
310 static void sfmmu_unlink_scd_from_regions(sf_srd_t *, sf_scd_t *);
311 static void sfmmu_link_to_hmeregion(sfmmu_t *, sf_region_t *);
312 static void sfmmu_unlink_from_hmeregion(sfmmu_t *, sf_region_t *);
315 * ``hat_lock'' is a hashed mutex lock for protecting sfmmu TSB lists,
316 * HAT flags, synchronizing TLB/TSB coherency, and context management.
317 * The lock is hashed on the sfmmup since the case where we need to lock
318 * all processes is rare but does occur (e.g. we need to unload a shared
319 * mapping from all processes using the mapping). We have a lot of buckets,
320 * and each slab of sfmmu_t's can use about a quarter of them, giving us
321 * a fairly good distribution without wasting too much space and overhead
322 * when we have to grab them all.
324 #define SFMMU_NUM_LOCK 128 /* must be power of two */
325 hatlock_t hat_lock[SFMMU_NUM_LOCK];
328 * Hash algorithm optimized for a small number of slabs.
329 * 7 is (highbit((sizeof sfmmu_t)) - 1)
330 * This hash algorithm is based upon the knowledge that sfmmu_t's come from a
331 * kmem_cache, and thus they will be sequential within that cache. In
332 * addition, each new slab will have a different "color" up to cache_maxcolor
333 * which will skew the hashing for each successive slab which is allocated.
334 * If the size of sfmmu_t changed to a larger size, this algorithm may need
335 * to be revisited.
337 #define TSB_HASH_SHIFT_BITS (7)
338 #define PTR_HASH(x) ((uintptr_t)x >> TSB_HASH_SHIFT_BITS)
340 #ifdef DEBUG
341 int tsb_hash_debug = 0;
342 #define TSB_HASH(sfmmup) \
343 (tsb_hash_debug ? &hat_lock[0] : \
344 &hat_lock[PTR_HASH(sfmmup) & (SFMMU_NUM_LOCK-1)])
345 #else /* DEBUG */
346 #define TSB_HASH(sfmmup) &hat_lock[PTR_HASH(sfmmup) & (SFMMU_NUM_LOCK-1)]
347 #endif /* DEBUG */
350 /* sfmmu_replace_tsb() return codes. */
351 typedef enum tsb_replace_rc {
352 TSB_SUCCESS,
353 TSB_ALLOCFAIL,
354 TSB_LOSTRACE,
355 TSB_ALREADY_SWAPPED,
356 TSB_CANTGROW
357 } tsb_replace_rc_t;
360 * Flags for TSB allocation routines.
362 #define TSB_ALLOC 0x01
363 #define TSB_FORCEALLOC 0x02
364 #define TSB_GROW 0x04
365 #define TSB_SHRINK 0x08
366 #define TSB_SWAPIN 0x10
369 * Support for HAT callbacks.
371 #define SFMMU_MAX_RELOC_CALLBACKS 10
372 int sfmmu_max_cb_id = SFMMU_MAX_RELOC_CALLBACKS;
373 static id_t sfmmu_cb_nextid = 0;
374 static id_t sfmmu_tsb_cb_id;
375 struct sfmmu_callback *sfmmu_cb_table;
377 kmutex_t kpr_mutex;
378 kmutex_t kpr_suspendlock;
379 kthread_t *kreloc_thread;
382 * Enable VA->PA translation sanity checking on DEBUG kernels.
383 * Disabled by default. This is incompatible with some
384 * drivers (error injector, RSM) so if it breaks you get
385 * to keep both pieces.
387 int hat_check_vtop = 0;
390 * Private sfmmu routines (prototypes)
392 static struct hme_blk *sfmmu_shadow_hcreate(sfmmu_t *, caddr_t, int, uint_t);
393 static struct hme_blk *sfmmu_hblk_alloc(sfmmu_t *, caddr_t,
394 struct hmehash_bucket *, uint_t, hmeblk_tag, uint_t,
395 uint_t);
396 static caddr_t sfmmu_hblk_unload(struct hat *, struct hme_blk *, caddr_t,
397 caddr_t, demap_range_t *, uint_t);
398 static caddr_t sfmmu_hblk_sync(struct hat *, struct hme_blk *, caddr_t,
399 caddr_t, int);
400 static void sfmmu_hblk_free(struct hme_blk **);
401 static void sfmmu_hblks_list_purge(struct hme_blk **, int);
402 static uint_t sfmmu_get_free_hblk(struct hme_blk **, uint_t);
403 static uint_t sfmmu_put_free_hblk(struct hme_blk *, uint_t);
404 static struct hme_blk *sfmmu_hblk_steal(int);
405 static int sfmmu_steal_this_hblk(struct hmehash_bucket *,
406 struct hme_blk *, uint64_t, struct hme_blk *);
407 static caddr_t sfmmu_hblk_unlock(struct hme_blk *, caddr_t, caddr_t);
409 static void hat_do_memload_array(struct hat *, caddr_t, size_t,
410 struct page **, uint_t, uint_t, uint_t);
411 static void hat_do_memload(struct hat *, caddr_t, struct page *,
412 uint_t, uint_t, uint_t);
413 static void sfmmu_memload_batchsmall(struct hat *, caddr_t, page_t **,
414 uint_t, uint_t, pgcnt_t, uint_t);
415 void sfmmu_tteload(struct hat *, tte_t *, caddr_t, page_t *,
416 uint_t);
417 static int sfmmu_tteload_array(sfmmu_t *, tte_t *, caddr_t, page_t **,
418 uint_t, uint_t);
419 static struct hmehash_bucket *sfmmu_tteload_acquire_hashbucket(sfmmu_t *,
420 caddr_t, int, uint_t);
421 static struct hme_blk *sfmmu_tteload_find_hmeblk(sfmmu_t *,
422 struct hmehash_bucket *, caddr_t, uint_t, uint_t,
423 uint_t);
424 static int sfmmu_tteload_addentry(sfmmu_t *, struct hme_blk *, tte_t *,
425 caddr_t, page_t **, uint_t, uint_t);
426 static void sfmmu_tteload_release_hashbucket(struct hmehash_bucket *);
428 static int sfmmu_pagearray_setup(caddr_t, page_t **, tte_t *, int);
429 static pfn_t sfmmu_uvatopfn(caddr_t, sfmmu_t *, tte_t *);
430 void sfmmu_memtte(tte_t *, pfn_t, uint_t, int);
431 #ifdef VAC
432 static void sfmmu_vac_conflict(struct hat *, caddr_t, page_t *);
433 static int sfmmu_vacconflict_array(caddr_t, page_t *, int *);
434 int tst_tnc(page_t *pp, pgcnt_t);
435 void conv_tnc(page_t *pp, int);
436 #endif
438 static void sfmmu_get_ctx(sfmmu_t *);
439 static void sfmmu_free_sfmmu(sfmmu_t *);
441 static void sfmmu_ttesync(struct hat *, caddr_t, tte_t *, page_t *);
442 static void sfmmu_chgattr(struct hat *, caddr_t, size_t, uint_t, int);
444 cpuset_t sfmmu_pageunload(page_t *, struct sf_hment *, int);
445 static void hat_pagereload(struct page *, struct page *);
446 static cpuset_t sfmmu_pagesync(page_t *, struct sf_hment *, uint_t);
447 #ifdef VAC
448 void sfmmu_page_cache_array(page_t *, int, int, pgcnt_t);
449 static void sfmmu_page_cache(page_t *, int, int, int);
450 #endif
452 cpuset_t sfmmu_rgntlb_demap(caddr_t, sf_region_t *,
453 struct hme_blk *, int);
454 static void sfmmu_tlbcache_demap(caddr_t, sfmmu_t *, struct hme_blk *,
455 pfn_t, int, int, int, int);
456 static void sfmmu_ismtlbcache_demap(caddr_t, sfmmu_t *, struct hme_blk *,
457 pfn_t, int);
458 static void sfmmu_tlb_demap(caddr_t, sfmmu_t *, struct hme_blk *, int, int);
459 static void sfmmu_tlb_range_demap(demap_range_t *);
460 static void sfmmu_invalidate_ctx(sfmmu_t *);
461 static void sfmmu_sync_mmustate(sfmmu_t *);
463 static void sfmmu_tsbinfo_setup_phys(struct tsb_info *, pfn_t);
464 static int sfmmu_tsbinfo_alloc(struct tsb_info **, int, int, uint_t,
465 sfmmu_t *);
466 static void sfmmu_tsb_free(struct tsb_info *);
467 static void sfmmu_tsbinfo_free(struct tsb_info *);
468 static int sfmmu_init_tsbinfo(struct tsb_info *, int, int, uint_t,
469 sfmmu_t *);
470 static void sfmmu_tsb_chk_reloc(sfmmu_t *, hatlock_t *);
471 static void sfmmu_tsb_swapin(sfmmu_t *, hatlock_t *);
472 static int sfmmu_select_tsb_szc(pgcnt_t);
473 static void sfmmu_mod_tsb(sfmmu_t *, caddr_t, tte_t *, int);
474 #define sfmmu_load_tsb(sfmmup, vaddr, tte, szc) \
475 sfmmu_mod_tsb(sfmmup, vaddr, tte, szc)
476 #define sfmmu_unload_tsb(sfmmup, vaddr, szc) \
477 sfmmu_mod_tsb(sfmmup, vaddr, NULL, szc)
478 static void sfmmu_copy_tsb(struct tsb_info *, struct tsb_info *);
479 static tsb_replace_rc_t sfmmu_replace_tsb(sfmmu_t *, struct tsb_info *, uint_t,
480 hatlock_t *, uint_t);
481 static void sfmmu_size_tsb(sfmmu_t *, int, uint64_t, uint64_t, int);
483 #ifdef VAC
484 void sfmmu_cache_flush(pfn_t, int);
485 void sfmmu_cache_flushcolor(int, pfn_t);
486 #endif
487 static caddr_t sfmmu_hblk_chgattr(sfmmu_t *, struct hme_blk *, caddr_t,
488 caddr_t, demap_range_t *, uint_t, int);
490 static uint64_t sfmmu_vtop_attr(uint_t, int mode, tte_t *);
491 static uint_t sfmmu_ptov_attr(tte_t *);
492 static caddr_t sfmmu_hblk_chgprot(sfmmu_t *, struct hme_blk *, caddr_t,
493 caddr_t, demap_range_t *, uint_t);
494 static uint_t sfmmu_vtop_prot(uint_t, uint_t *);
495 static int sfmmu_idcache_constructor(void *, void *, int);
496 static void sfmmu_idcache_destructor(void *, void *);
497 static int sfmmu_hblkcache_constructor(void *, void *, int);
498 static void sfmmu_hblkcache_destructor(void *, void *);
499 static void sfmmu_hblkcache_reclaim(void *);
500 static void sfmmu_shadow_hcleanup(sfmmu_t *, struct hme_blk *,
501 struct hmehash_bucket *);
502 static void sfmmu_hblk_hash_rm(struct hmehash_bucket *, struct hme_blk *,
503 struct hme_blk *, struct hme_blk **, int);
504 static void sfmmu_hblk_hash_add(struct hmehash_bucket *, struct hme_blk *,
505 uint64_t);
506 static struct hme_blk *sfmmu_check_pending_hblks(int);
507 static void sfmmu_free_hblks(sfmmu_t *, caddr_t, caddr_t, int);
508 static void sfmmu_cleanup_rhblk(sf_srd_t *, caddr_t, uint_t, int);
509 static void sfmmu_unload_hmeregion_va(sf_srd_t *, uint_t, caddr_t, caddr_t,
510 int, caddr_t *);
511 static void sfmmu_unload_hmeregion(sf_srd_t *, sf_region_t *);
513 static void sfmmu_rm_large_mappings(page_t *, int);
515 static void hat_lock_init(void);
516 static void hat_kstat_init(void);
517 static int sfmmu_kstat_percpu_update(kstat_t *ksp, int rw);
518 static void sfmmu_set_scd_rttecnt(sf_srd_t *, sf_scd_t *);
519 static int sfmmu_is_rgnva(sf_srd_t *, caddr_t, ulong_t, ulong_t);
520 static void sfmmu_check_page_sizes(sfmmu_t *, int);
521 int fnd_mapping_sz(page_t *);
522 static void iment_add(struct ism_ment *, struct hat *);
523 static void iment_sub(struct ism_ment *, struct hat *);
524 static pgcnt_t ism_tsb_entries(sfmmu_t *, int szc);
525 extern void sfmmu_setup_tsbinfo(sfmmu_t *);
526 extern void sfmmu_clear_utsbinfo(void);
528 static void sfmmu_ctx_wrap_around(mmu_ctx_t *, boolean_t);
530 extern int vpm_enable;
532 /* kpm globals */
533 #ifdef DEBUG
535 * Enable trap level tsbmiss handling
537 int kpm_tsbmtl = 1;
540 * Flush the TLB on kpm mapout. Note: Xcalls are used (again) for the
541 * required TLB shootdowns in this case, so handle w/ care. Off by default.
543 int kpm_tlb_flush;
544 #endif /* DEBUG */
546 static void *sfmmu_vmem_xalloc_aligned_wrapper(vmem_t *, size_t, int);
548 #ifdef DEBUG
549 static void sfmmu_check_hblk_flist();
550 #endif
553 * Semi-private sfmmu data structures. Some of them are initialize in
554 * startup or in hat_init. Some of them are private but accessed by
555 * assembly code or mach_sfmmu.c
557 struct hmehash_bucket *uhme_hash; /* user hmeblk hash table */
558 struct hmehash_bucket *khme_hash; /* kernel hmeblk hash table */
559 uint64_t uhme_hash_pa; /* PA of uhme_hash */
560 uint64_t khme_hash_pa; /* PA of khme_hash */
561 int uhmehash_num; /* # of buckets in user hash table */
562 int khmehash_num; /* # of buckets in kernel hash table */
564 uint_t max_mmu_ctxdoms = 0; /* max context domains in the system */
565 mmu_ctx_t **mmu_ctxs_tbl; /* global array of context domains */
566 uint64_t mmu_saved_gnum = 0; /* to init incoming MMUs' gnums */
568 #define DEFAULT_NUM_CTXS_PER_MMU 8192
569 static uint_t nctxs = DEFAULT_NUM_CTXS_PER_MMU;
571 int cache; /* describes system cache */
573 caddr_t ktsb_base; /* kernel 8k-indexed tsb base address */
574 uint64_t ktsb_pbase; /* kernel 8k-indexed tsb phys address */
575 int ktsb_szcode; /* kernel 8k-indexed tsb size code */
576 int ktsb_sz; /* kernel 8k-indexed tsb size */
578 caddr_t ktsb4m_base; /* kernel 4m-indexed tsb base address */
579 uint64_t ktsb4m_pbase; /* kernel 4m-indexed tsb phys address */
580 int ktsb4m_szcode; /* kernel 4m-indexed tsb size code */
581 int ktsb4m_sz; /* kernel 4m-indexed tsb size */
583 uint64_t kpm_tsbbase; /* kernel seg_kpm 4M TSB base address */
584 int kpm_tsbsz; /* kernel seg_kpm 4M TSB size code */
585 uint64_t kpmsm_tsbbase; /* kernel seg_kpm 8K TSB base address */
586 int kpmsm_tsbsz; /* kernel seg_kpm 8K TSB size code */
588 #ifndef sun4v
589 int utsb_dtlb_ttenum = -1; /* index in TLB for utsb locked TTE */
590 int utsb4m_dtlb_ttenum = -1; /* index in TLB for 4M TSB TTE */
591 int dtlb_resv_ttenum; /* index in TLB of first reserved TTE */
592 caddr_t utsb_vabase; /* reserved kernel virtual memory */
593 caddr_t utsb4m_vabase; /* for trap handler TSB accesses */
594 #endif /* sun4v */
595 uint64_t tsb_alloc_bytes = 0; /* bytes allocated to TSBs */
596 vmem_t *kmem_tsb_default_arena[NLGRPS_MAX]; /* For dynamic TSBs */
597 vmem_t *kmem_bigtsb_default_arena[NLGRPS_MAX]; /* dynamic 256M TSBs */
600 * Size to use for TSB slabs. Future platforms that support page sizes
601 * larger than 4M may wish to change these values, and provide their own
602 * assembly macros for building and decoding the TSB base register contents.
603 * Note disable_large_pages will override the value set here.
605 static uint_t tsb_slab_ttesz = TTE4M;
606 size_t tsb_slab_size = MMU_PAGESIZE4M;
607 uint_t tsb_slab_shift = MMU_PAGESHIFT4M;
608 /* PFN mask for TTE */
609 size_t tsb_slab_mask = MMU_PAGEOFFSET4M >> MMU_PAGESHIFT;
612 * Size to use for TSB slabs. These are used only when 256M tsb arenas
613 * exist.
615 static uint_t bigtsb_slab_ttesz = TTE256M;
616 static size_t bigtsb_slab_size = MMU_PAGESIZE256M;
617 static uint_t bigtsb_slab_shift = MMU_PAGESHIFT256M;
618 /* 256M page alignment for 8K pfn */
619 static size_t bigtsb_slab_mask = MMU_PAGEOFFSET256M >> MMU_PAGESHIFT;
621 /* largest TSB size to grow to, will be smaller on smaller memory systems */
622 static int tsb_max_growsize = 0;
625 * Tunable parameters dealing with TSB policies.
629 * This undocumented tunable forces all 8K TSBs to be allocated from
630 * the kernel heap rather than from the kmem_tsb_default_arena arenas.
632 #ifdef DEBUG
633 int tsb_forceheap = 0;
634 #endif /* DEBUG */
637 * Decide whether to use per-lgroup arenas, or one global set of
638 * TSB arenas. The default is not to break up per-lgroup, since
639 * most platforms don't recognize any tangible benefit from it.
641 int tsb_lgrp_affinity = 0;
644 * Used for growing the TSB based on the process RSS.
645 * tsb_rss_factor is based on the smallest TSB, and is
646 * shifted by the TSB size to determine if we need to grow.
647 * The default will grow the TSB if the number of TTEs for
648 * this page size exceeds 75% of the number of TSB entries,
649 * which should _almost_ eliminate all conflict misses
650 * (at the expense of using up lots and lots of memory).
652 #define TSB_RSS_FACTOR (TSB_ENTRIES(TSB_MIN_SZCODE) * 0.75)
653 #define SFMMU_RSS_TSBSIZE(tsbszc) (tsb_rss_factor << tsbszc)
654 #define SELECT_TSB_SIZECODE(pgcnt) ( \
655 (enable_tsb_rss_sizing)? sfmmu_select_tsb_szc(pgcnt) : \
656 default_tsb_size)
657 #define TSB_OK_SHRINK() \
658 (tsb_alloc_bytes > tsb_alloc_hiwater || freemem < desfree)
659 #define TSB_OK_GROW() \
660 (tsb_alloc_bytes < tsb_alloc_hiwater && freemem > desfree)
662 int enable_tsb_rss_sizing = 1;
663 int tsb_rss_factor = (int)TSB_RSS_FACTOR;
665 /* which TSB size code to use for new address spaces or if rss sizing off */
666 int default_tsb_size = TSB_8K_SZCODE;
668 static uint64_t tsb_alloc_hiwater; /* limit TSB reserved memory */
669 uint64_t tsb_alloc_hiwater_factor; /* tsb_alloc_hiwater = physmem / this */
670 #define TSB_ALLOC_HIWATER_FACTOR_DEFAULT 32
672 #ifdef DEBUG
673 static int tsb_random_size = 0; /* set to 1 to test random tsb sizes on alloc */
674 static int tsb_grow_stress = 0; /* if set to 1, keep replacing TSB w/ random */
675 static int tsb_alloc_mtbf = 0; /* fail allocation every n attempts */
676 static int tsb_alloc_fail_mtbf = 0;
677 static int tsb_alloc_count = 0;
678 #endif /* DEBUG */
680 /* if set to 1, will remap valid TTEs when growing TSB. */
681 int tsb_remap_ttes = 1;
684 * If we have more than this many mappings, allocate a second TSB.
685 * This default is chosen because the I/D fully associative TLBs are
686 * assumed to have at least 8 available entries. Platforms with a
687 * larger fully-associative TLB could probably override the default.
690 #ifdef sun4v
691 int tsb_sectsb_threshold = 0;
692 #else
693 int tsb_sectsb_threshold = 8;
694 #endif
697 * kstat data
699 struct sfmmu_global_stat sfmmu_global_stat;
700 struct sfmmu_tsbsize_stat sfmmu_tsbsize_stat;
703 * Global data
705 sfmmu_t *ksfmmup; /* kernel's hat id */
707 #ifdef DEBUG
708 static void chk_tte(tte_t *, tte_t *, tte_t *, struct hme_blk *);
709 #endif
711 /* sfmmu locking operations */
712 static kmutex_t *sfmmu_mlspl_enter(struct page *, int);
713 static int sfmmu_mlspl_held(struct page *, int);
715 kmutex_t *sfmmu_page_enter(page_t *);
716 void sfmmu_page_exit(kmutex_t *);
717 int sfmmu_page_spl_held(struct page *);
719 /* sfmmu internal locking operations - accessed directly */
720 static void sfmmu_mlist_reloc_enter(page_t *, page_t *,
721 kmutex_t **, kmutex_t **);
722 static void sfmmu_mlist_reloc_exit(kmutex_t *, kmutex_t *);
723 static hatlock_t *
724 sfmmu_hat_enter(sfmmu_t *);
725 static hatlock_t *
726 sfmmu_hat_tryenter(sfmmu_t *);
727 static void sfmmu_hat_exit(hatlock_t *);
728 static void sfmmu_hat_lock_all(void);
729 static void sfmmu_hat_unlock_all(void);
730 static void sfmmu_ismhat_enter(sfmmu_t *, int);
731 static void sfmmu_ismhat_exit(sfmmu_t *, int);
733 kpm_hlk_t *kpmp_table;
734 uint_t kpmp_table_sz; /* must be a power of 2 */
735 uchar_t kpmp_shift;
737 kpm_shlk_t *kpmp_stable;
738 uint_t kpmp_stable_sz; /* must be a power of 2 */
741 * SPL_TABLE_SIZE is 2 * NCPU, but no smaller than 128.
742 * SPL_SHIFT is log2(SPL_TABLE_SIZE).
744 #if ((2*NCPU_P2) > 128)
745 #define SPL_SHIFT ((unsigned)(NCPU_LOG2 + 1))
746 #else
747 #define SPL_SHIFT 7U
748 #endif
749 #define SPL_TABLE_SIZE (1U << SPL_SHIFT)
750 #define SPL_MASK (SPL_TABLE_SIZE - 1)
753 * We shift by PP_SHIFT to take care of the low-order 0 bits of a page_t
754 * and by multiples of SPL_SHIFT to get as many varied bits as we can.
756 #define SPL_INDEX(pp) \
757 ((((uintptr_t)(pp) >> PP_SHIFT) ^ \
758 ((uintptr_t)(pp) >> (PP_SHIFT + SPL_SHIFT)) ^ \
759 ((uintptr_t)(pp) >> (PP_SHIFT + SPL_SHIFT * 2)) ^ \
760 ((uintptr_t)(pp) >> (PP_SHIFT + SPL_SHIFT * 3))) & \
761 SPL_MASK)
763 #define SPL_HASH(pp) \
764 (&sfmmu_page_lock[SPL_INDEX(pp)].pad_mutex)
766 static pad_mutex_t sfmmu_page_lock[SPL_TABLE_SIZE];
768 /* Array of mutexes protecting a page's mapping list and p_nrm field. */
770 #define MML_TABLE_SIZE SPL_TABLE_SIZE
771 #define MLIST_HASH(pp) (&mml_table[SPL_INDEX(pp)].pad_mutex)
773 static pad_mutex_t mml_table[MML_TABLE_SIZE];
776 * hat_unload_callback() will group together callbacks in order
777 * to avoid xt_sync() calls. This is the maximum size of the group.
779 #define MAX_CB_ADDR 32
781 tte_t hw_tte;
782 static ulong_t sfmmu_dmr_maxbit = DMR_MAXBIT;
784 static char *mmu_ctx_kstat_names[] = {
785 "mmu_ctx_tsb_exceptions",
786 "mmu_ctx_tsb_raise_exception",
787 "mmu_ctx_wrap_around",
791 * Wrapper for vmem_xalloc since vmem_create only allows limited
792 * parameters for vm_source_alloc functions. This function allows us
793 * to specify alignment consistent with the size of the object being
794 * allocated.
796 static void *
797 sfmmu_vmem_xalloc_aligned_wrapper(vmem_t *vmp, size_t size, int vmflag)
799 return (vmem_xalloc(vmp, size, size, 0, 0, NULL, NULL, vmflag));
802 /* Common code for setting tsb_alloc_hiwater. */
803 #define SFMMU_SET_TSB_ALLOC_HIWATER(pages) tsb_alloc_hiwater = \
804 ptob(pages) / tsb_alloc_hiwater_factor
807 * Set tsb_max_growsize to allow at most all of physical memory to be mapped by
808 * a single TSB. physmem is the number of physical pages so we need physmem 8K
809 * TTEs to represent all those physical pages. We round this up by using
810 * 1<<highbit(). To figure out which size code to use, remember that the size
811 * code is just an amount to shift the smallest TSB size to get the size of
812 * this TSB. So we subtract that size, TSB_START_SIZE, from highbit() (or
813 * highbit() - 1) to get the size code for the smallest TSB that can represent
814 * all of physical memory, while erring on the side of too much.
816 * Restrict tsb_max_growsize to make sure that:
817 * 1) TSBs can't grow larger than the TSB slab size
818 * 2) TSBs can't grow larger than UTSB_MAX_SZCODE.
820 #define SFMMU_SET_TSB_MAX_GROWSIZE(pages) { \
821 int _i, _szc, _slabszc, _tsbszc; \
823 _i = highbit(pages); \
824 if ((1 << (_i - 1)) == (pages)) \
825 _i--; /* 2^n case, round down */ \
826 _szc = _i - TSB_START_SIZE; \
827 _slabszc = bigtsb_slab_shift - (TSB_START_SIZE + TSB_ENTRY_SHIFT); \
828 _tsbszc = MIN(_szc, _slabszc); \
829 tsb_max_growsize = MIN(_tsbszc, UTSB_MAX_SZCODE); \
833 * Given a pointer to an sfmmu and a TTE size code, return a pointer to the
834 * tsb_info which handles that TTE size.
836 #define SFMMU_GET_TSBINFO(tsbinfop, sfmmup, tte_szc) { \
837 (tsbinfop) = (sfmmup)->sfmmu_tsb; \
838 ASSERT(((tsbinfop)->tsb_flags & TSB_SHAREDCTX) || \
839 sfmmu_hat_lock_held(sfmmup)); \
840 if ((tte_szc) >= TTE4M) { \
841 ASSERT((tsbinfop) != NULL); \
842 (tsbinfop) = (tsbinfop)->tsb_next; \
847 * Macro to use to unload entries from the TSB.
848 * It has knowledge of which page sizes get replicated in the TSB
849 * and will call the appropriate unload routine for the appropriate size.
851 #define SFMMU_UNLOAD_TSB(addr, sfmmup, hmeblkp, ismhat) \
853 int ttesz = get_hblk_ttesz(hmeblkp); \
854 if (ttesz == TTE8K || ttesz == TTE4M) { \
855 sfmmu_unload_tsb(sfmmup, addr, ttesz); \
856 } else { \
857 caddr_t sva = ismhat ? addr : \
858 (caddr_t)get_hblk_base(hmeblkp); \
859 caddr_t eva = sva + get_hblk_span(hmeblkp); \
860 ASSERT(addr >= sva && addr < eva); \
861 sfmmu_unload_tsb_range(sfmmup, sva, eva, ttesz); \
866 /* Update tsb_alloc_hiwater after memory is configured. */
867 /*ARGSUSED*/
868 static void
869 sfmmu_update_post_add(void *arg, pgcnt_t delta_pages)
871 /* Assumes physmem has already been updated. */
872 SFMMU_SET_TSB_ALLOC_HIWATER(physmem);
873 SFMMU_SET_TSB_MAX_GROWSIZE(physmem);
877 * Update tsb_alloc_hiwater before memory is deleted. We'll do nothing here
878 * and update tsb_alloc_hiwater and tsb_max_growsize after the memory is
879 * deleted.
881 /*ARGSUSED*/
882 static int
883 sfmmu_update_pre_del(void *arg, pgcnt_t delta_pages)
885 return (0);
888 /* Update tsb_alloc_hiwater after memory fails to be unconfigured. */
889 /*ARGSUSED*/
890 static void
891 sfmmu_update_post_del(void *arg, pgcnt_t delta_pages, int cancelled)
894 * Whether the delete was cancelled or not, just go ahead and update
895 * tsb_alloc_hiwater and tsb_max_growsize.
897 SFMMU_SET_TSB_ALLOC_HIWATER(physmem);
898 SFMMU_SET_TSB_MAX_GROWSIZE(physmem);
901 static kphysm_setup_vector_t sfmmu_update_vec = {
902 KPHYSM_SETUP_VECTOR_VERSION, /* version */
903 sfmmu_update_post_add, /* post_add */
904 sfmmu_update_pre_del, /* pre_del */
905 sfmmu_update_post_del /* post_del */
910 * HME_BLK HASH PRIMITIVES
914 * Enter a hme on the mapping list for page pp.
915 * When large pages are more prevalent in the system we might want to
916 * keep the mapping list in ascending order by the hment size. For now,
917 * small pages are more frequent, so don't slow it down.
919 #define HME_ADD(hme, pp) \
921 ASSERT(sfmmu_mlist_held(pp)); \
923 hme->hme_prev = NULL; \
924 hme->hme_next = pp->p_mapping; \
925 hme->hme_page = pp; \
926 if (pp->p_mapping) { \
927 ((struct sf_hment *)(pp->p_mapping))->hme_prev = hme;\
928 ASSERT(pp->p_share > 0); \
929 } else { \
930 /* EMPTY */ \
931 ASSERT(pp->p_share == 0); \
933 pp->p_mapping = hme; \
934 pp->p_share++; \
938 * Enter a hme on the mapping list for page pp.
939 * If we are unmapping a large translation, we need to make sure that the
940 * change is reflect in the corresponding bit of the p_index field.
942 #define HME_SUB(hme, pp) \
944 ASSERT(sfmmu_mlist_held(pp)); \
945 ASSERT(hme->hme_page == pp || IS_PAHME(hme)); \
947 if (pp->p_mapping == NULL) { \
948 panic("hme_remove - no mappings"); \
951 membar_stst(); /* ensure previous stores finish */ \
953 ASSERT(pp->p_share > 0); \
954 pp->p_share--; \
956 if (hme->hme_prev) { \
957 ASSERT(pp->p_mapping != hme); \
958 ASSERT(hme->hme_prev->hme_page == pp || \
959 IS_PAHME(hme->hme_prev)); \
960 hme->hme_prev->hme_next = hme->hme_next; \
961 } else { \
962 ASSERT(pp->p_mapping == hme); \
963 pp->p_mapping = hme->hme_next; \
964 ASSERT((pp->p_mapping == NULL) ? \
965 (pp->p_share == 0) : 1); \
968 if (hme->hme_next) { \
969 ASSERT(hme->hme_next->hme_page == pp || \
970 IS_PAHME(hme->hme_next)); \
971 hme->hme_next->hme_prev = hme->hme_prev; \
974 /* zero out the entry */ \
975 hme->hme_next = NULL; \
976 hme->hme_prev = NULL; \
977 hme->hme_page = NULL; \
979 if (hme_size(hme) > TTE8K) { \
980 /* remove mappings for remainder of large pg */ \
981 sfmmu_rm_large_mappings(pp, hme_size(hme)); \
986 * This function returns the hment given the hme_blk and a vaddr.
987 * It assumes addr has already been checked to belong to hme_blk's
988 * range.
990 #define HBLKTOHME(hment, hmeblkp, addr) \
992 int index; \
993 HBLKTOHME_IDX(hment, hmeblkp, addr, index) \
997 * Version of HBLKTOHME that also returns the index in hmeblkp
998 * of the hment.
1000 #define HBLKTOHME_IDX(hment, hmeblkp, addr, idx) \
1002 ASSERT(in_hblk_range((hmeblkp), (addr))); \
1004 if (get_hblk_ttesz(hmeblkp) == TTE8K) { \
1005 idx = (((uintptr_t)(addr) >> MMU_PAGESHIFT) & (NHMENTS-1)); \
1006 } else \
1007 idx = 0; \
1009 (hment) = &(hmeblkp)->hblk_hme[idx]; \
1013 * Disable any page sizes not supported by the CPU
1015 void
1016 hat_init_pagesizes()
1018 int i;
1020 mmu_exported_page_sizes = 0;
1021 for (i = TTE8K; i < max_mmu_page_sizes; i++) {
1023 szc_2_userszc[i] = (uint_t)-1;
1024 userszc_2_szc[i] = (uint_t)-1;
1026 if ((mmu_exported_pagesize_mask & (1 << i)) == 0) {
1027 disable_large_pages |= (1 << i);
1028 } else {
1029 szc_2_userszc[i] = mmu_exported_page_sizes;
1030 userszc_2_szc[mmu_exported_page_sizes] = i;
1031 mmu_exported_page_sizes++;
1035 disable_ism_large_pages |= disable_large_pages;
1036 disable_auto_data_large_pages = disable_large_pages;
1037 disable_auto_text_large_pages = disable_large_pages;
1040 * Initialize mmu-specific large page sizes.
1042 if (&mmu_large_pages_disabled) {
1043 disable_large_pages |= mmu_large_pages_disabled(HAT_LOAD);
1044 disable_ism_large_pages |=
1045 mmu_large_pages_disabled(HAT_LOAD_SHARE);
1046 disable_auto_data_large_pages |=
1047 mmu_large_pages_disabled(HAT_AUTO_DATA);
1048 disable_auto_text_large_pages |=
1049 mmu_large_pages_disabled(HAT_AUTO_TEXT);
1054 * Initialize the hardware address translation structures.
1056 void
1057 hat_init(void)
1059 int i;
1060 uint_t sz;
1061 size_t size;
1063 hat_lock_init();
1064 hat_kstat_init();
1067 * Hardware-only bits in a TTE
1069 MAKE_TTE_MASK(&hw_tte);
1071 hat_init_pagesizes();
1073 /* Initialize the hash locks */
1074 for (i = 0; i < khmehash_num; i++) {
1075 mutex_init(&khme_hash[i].hmehash_mutex, NULL,
1076 MUTEX_DEFAULT, NULL);
1077 khme_hash[i].hmeh_nextpa = HMEBLK_ENDPA;
1079 for (i = 0; i < uhmehash_num; i++) {
1080 mutex_init(&uhme_hash[i].hmehash_mutex, NULL,
1081 MUTEX_DEFAULT, NULL);
1082 uhme_hash[i].hmeh_nextpa = HMEBLK_ENDPA;
1084 khmehash_num--; /* make sure counter starts from 0 */
1085 uhmehash_num--; /* make sure counter starts from 0 */
1088 * Allocate context domain structures.
1090 * A platform may choose to modify max_mmu_ctxdoms in
1091 * set_platform_defaults(). If a platform does not define
1092 * a set_platform_defaults() or does not choose to modify
1093 * max_mmu_ctxdoms, it gets one MMU context domain for every CPU.
1095 * For all platforms that have CPUs sharing MMUs, this
1096 * value must be defined.
1098 if (max_mmu_ctxdoms == 0)
1099 max_mmu_ctxdoms = max_ncpus;
1101 size = max_mmu_ctxdoms * sizeof (mmu_ctx_t *);
1102 mmu_ctxs_tbl = kmem_zalloc(size, KM_SLEEP);
1104 /* mmu_ctx_t is 64 bytes aligned */
1105 mmuctxdom_cache = kmem_cache_create("mmuctxdom_cache",
1106 sizeof (mmu_ctx_t), 64, NULL, NULL, NULL, NULL, NULL, 0);
1108 * MMU context domain initialization for the Boot CPU.
1109 * This needs the context domains array allocated above.
1111 mutex_enter(&cpu_lock);
1112 sfmmu_cpu_init(CPU);
1113 mutex_exit(&cpu_lock);
1116 * Intialize ism mapping list lock.
1119 mutex_init(&ism_mlist_lock, NULL, MUTEX_DEFAULT, NULL);
1122 * Each sfmmu structure carries an array of MMU context info
1123 * structures, one per context domain. The size of this array depends
1124 * on the maximum number of context domains. So, the size of the
1125 * sfmmu structure varies per platform.
1127 * sfmmu is allocated from static arena, because trap
1128 * handler at TL > 0 is not allowed to touch kernel relocatable
1129 * memory. sfmmu's alignment is changed to 64 bytes from
1130 * default 8 bytes, as the lower 6 bits will be used to pass
1131 * pgcnt to vtag_flush_pgcnt_tl1.
1133 size = sizeof (sfmmu_t) + sizeof (sfmmu_ctx_t) * (max_mmu_ctxdoms - 1);
1135 sfmmuid_cache = kmem_cache_create("sfmmuid_cache", size,
1136 64, sfmmu_idcache_constructor, sfmmu_idcache_destructor,
1137 NULL, NULL, static_arena, 0);
1139 sfmmu_tsbinfo_cache = kmem_cache_create("sfmmu_tsbinfo_cache",
1140 sizeof (struct tsb_info), 0, NULL, NULL, NULL, NULL, NULL, 0);
1143 * Since we only use the tsb8k cache to "borrow" pages for TSBs
1144 * from the heap when low on memory or when TSB_FORCEALLOC is
1145 * specified, don't use magazines to cache them--we want to return
1146 * them to the system as quickly as possible.
1148 sfmmu_tsb8k_cache = kmem_cache_create("sfmmu_tsb8k_cache",
1149 MMU_PAGESIZE, MMU_PAGESIZE, NULL, NULL, NULL, NULL,
1150 static_arena, KMC_NOMAGAZINE);
1153 * Set tsb_alloc_hiwater to 1/tsb_alloc_hiwater_factor of physical
1154 * memory, which corresponds to the old static reserve for TSBs.
1155 * tsb_alloc_hiwater_factor defaults to 32. This caps the amount of
1156 * memory we'll allocate for TSB slabs; beyond this point TSB
1157 * allocations will be taken from the kernel heap (via
1158 * sfmmu_tsb8k_cache) and will be throttled as would any other kmem
1159 * consumer.
1161 if (tsb_alloc_hiwater_factor == 0) {
1162 tsb_alloc_hiwater_factor = TSB_ALLOC_HIWATER_FACTOR_DEFAULT;
1164 SFMMU_SET_TSB_ALLOC_HIWATER(physmem);
1166 for (sz = tsb_slab_ttesz; sz > 0; sz--) {
1167 if (!(disable_large_pages & (1 << sz)))
1168 break;
1171 if (sz < tsb_slab_ttesz) {
1172 tsb_slab_ttesz = sz;
1173 tsb_slab_shift = MMU_PAGESHIFT + (sz << 1) + sz;
1174 tsb_slab_size = 1 << tsb_slab_shift;
1175 tsb_slab_mask = (1 << (tsb_slab_shift - MMU_PAGESHIFT)) - 1;
1176 use_bigtsb_arena = 0;
1177 } else if (use_bigtsb_arena &&
1178 (disable_large_pages & (1 << bigtsb_slab_ttesz))) {
1179 use_bigtsb_arena = 0;
1182 if (!use_bigtsb_arena) {
1183 bigtsb_slab_shift = tsb_slab_shift;
1185 SFMMU_SET_TSB_MAX_GROWSIZE(physmem);
1188 * On smaller memory systems, allocate TSB memory in smaller chunks
1189 * than the default 4M slab size. We also honor disable_large_pages
1190 * here.
1192 * The trap handlers need to be patched with the final slab shift,
1193 * since they need to be able to construct the TSB pointer at runtime.
1195 if ((tsb_max_growsize <= TSB_512K_SZCODE) &&
1196 !(disable_large_pages & (1 << TTE512K))) {
1197 tsb_slab_ttesz = TTE512K;
1198 tsb_slab_shift = MMU_PAGESHIFT512K;
1199 tsb_slab_size = MMU_PAGESIZE512K;
1200 tsb_slab_mask = MMU_PAGEOFFSET512K >> MMU_PAGESHIFT;
1201 use_bigtsb_arena = 0;
1204 if (!use_bigtsb_arena) {
1205 bigtsb_slab_ttesz = tsb_slab_ttesz;
1206 bigtsb_slab_shift = tsb_slab_shift;
1207 bigtsb_slab_size = tsb_slab_size;
1208 bigtsb_slab_mask = tsb_slab_mask;
1213 * Set up memory callback to update tsb_alloc_hiwater and
1214 * tsb_max_growsize.
1216 i = kphysm_setup_func_register(&sfmmu_update_vec, (void *) 0);
1217 ASSERT(i == 0);
1220 * kmem_tsb_arena is the source from which large TSB slabs are
1221 * drawn. The quantum of this arena corresponds to the largest
1222 * TSB size we can dynamically allocate for user processes.
1223 * Currently it must also be a supported page size since we
1224 * use exactly one translation entry to map each slab page.
1226 * The per-lgroup kmem_tsb_default_arena arenas are the arenas from
1227 * which most TSBs are allocated. Since most TSB allocations are
1228 * typically 8K we have a kmem cache we stack on top of each
1229 * kmem_tsb_default_arena to speed up those allocations.
1231 * Note the two-level scheme of arenas is required only
1232 * because vmem_create doesn't allow us to specify alignment
1233 * requirements. If this ever changes the code could be
1234 * simplified to use only one level of arenas.
1236 * If 256M page support exists on sun4v, 256MB kmem_bigtsb_arena
1237 * will be provided in addition to the 4M kmem_tsb_arena.
1239 if (use_bigtsb_arena) {
1240 kmem_bigtsb_arena = vmem_create("kmem_bigtsb", NULL, 0,
1241 bigtsb_slab_size, sfmmu_vmem_xalloc_aligned_wrapper,
1242 vmem_xfree, heap_arena, 0, VM_SLEEP);
1245 kmem_tsb_arena = vmem_create("kmem_tsb", NULL, 0, tsb_slab_size,
1246 sfmmu_vmem_xalloc_aligned_wrapper,
1247 vmem_xfree, heap_arena, 0, VM_SLEEP);
1249 if (tsb_lgrp_affinity) {
1250 char s[50];
1251 for (i = 0; i < NLGRPS_MAX; i++) {
1252 if (use_bigtsb_arena) {
1253 (void) sprintf(s, "kmem_bigtsb_lgrp%d", i);
1254 kmem_bigtsb_default_arena[i] = vmem_create(s,
1255 NULL, 0, 2 * tsb_slab_size,
1256 sfmmu_tsb_segkmem_alloc,
1257 sfmmu_tsb_segkmem_free, kmem_bigtsb_arena,
1258 0, VM_SLEEP | VM_BESTFIT);
1261 (void) sprintf(s, "kmem_tsb_lgrp%d", i);
1262 kmem_tsb_default_arena[i] = vmem_create(s,
1263 NULL, 0, PAGESIZE, sfmmu_tsb_segkmem_alloc,
1264 sfmmu_tsb_segkmem_free, kmem_tsb_arena, 0,
1265 VM_SLEEP | VM_BESTFIT);
1267 (void) sprintf(s, "sfmmu_tsb_lgrp%d_cache", i);
1268 sfmmu_tsb_cache[i] = kmem_cache_create(s,
1269 PAGESIZE, PAGESIZE, NULL, NULL, NULL, NULL,
1270 kmem_tsb_default_arena[i], 0);
1272 } else {
1273 if (use_bigtsb_arena) {
1274 kmem_bigtsb_default_arena[0] =
1275 vmem_create("kmem_bigtsb_default", NULL, 0,
1276 2 * tsb_slab_size, sfmmu_tsb_segkmem_alloc,
1277 sfmmu_tsb_segkmem_free, kmem_bigtsb_arena, 0,
1278 VM_SLEEP | VM_BESTFIT);
1281 kmem_tsb_default_arena[0] = vmem_create("kmem_tsb_default",
1282 NULL, 0, PAGESIZE, sfmmu_tsb_segkmem_alloc,
1283 sfmmu_tsb_segkmem_free, kmem_tsb_arena, 0,
1284 VM_SLEEP | VM_BESTFIT);
1285 sfmmu_tsb_cache[0] = kmem_cache_create("sfmmu_tsb_cache",
1286 PAGESIZE, PAGESIZE, NULL, NULL, NULL, NULL,
1287 kmem_tsb_default_arena[0], 0);
1290 sfmmu8_cache = kmem_cache_create("sfmmu8_cache", HME8BLK_SZ,
1291 HMEBLK_ALIGN, sfmmu_hblkcache_constructor,
1292 sfmmu_hblkcache_destructor,
1293 sfmmu_hblkcache_reclaim, (void *)HME8BLK_SZ,
1294 hat_memload_arena, KMC_NOHASH);
1296 hat_memload1_arena = vmem_create("hat_memload1", NULL, 0, PAGESIZE,
1297 segkmem_alloc_permanent, segkmem_free, heap_arena, 0,
1298 VMC_DUMPSAFE | VM_SLEEP);
1300 sfmmu1_cache = kmem_cache_create("sfmmu1_cache", HME1BLK_SZ,
1301 HMEBLK_ALIGN, sfmmu_hblkcache_constructor,
1302 sfmmu_hblkcache_destructor,
1303 NULL, (void *)HME1BLK_SZ,
1304 hat_memload1_arena, KMC_NOHASH);
1306 pa_hment_cache = kmem_cache_create("pa_hment_cache", PAHME_SZ,
1307 0, NULL, NULL, NULL, NULL, static_arena, KMC_NOHASH);
1309 ism_blk_cache = kmem_cache_create("ism_blk_cache",
1310 sizeof (ism_blk_t), ecache_alignsize, NULL, NULL,
1311 NULL, NULL, static_arena, KMC_NOHASH);
1313 ism_ment_cache = kmem_cache_create("ism_ment_cache",
1314 sizeof (ism_ment_t), 0, NULL, NULL,
1315 NULL, NULL, NULL, 0);
1318 * We grab the first hat for the kernel,
1320 AS_LOCK_ENTER(&kas, RW_WRITER);
1321 kas.a_hat = hat_alloc(&kas);
1322 AS_LOCK_EXIT(&kas);
1325 * Initialize hblk_reserve.
1327 ((struct hme_blk *)hblk_reserve)->hblk_nextpa =
1328 va_to_pa((caddr_t)hblk_reserve);
1330 #ifndef UTSB_PHYS
1332 * Reserve some kernel virtual address space for the locked TTEs
1333 * that allow us to probe the TSB from TL>0.
1335 utsb_vabase = vmem_xalloc(heap_arena, tsb_slab_size, tsb_slab_size,
1336 0, 0, NULL, NULL, VM_SLEEP);
1337 utsb4m_vabase = vmem_xalloc(heap_arena, tsb_slab_size, tsb_slab_size,
1338 0, 0, NULL, NULL, VM_SLEEP);
1339 #endif
1341 #ifdef VAC
1343 * The big page VAC handling code assumes VAC
1344 * will not be bigger than the smallest big
1345 * page- which is 64K.
1347 if (TTEPAGES(TTE64K) < CACHE_NUM_COLOR) {
1348 cmn_err(CE_PANIC, "VAC too big!");
1350 #endif
1352 uhme_hash_pa = va_to_pa(uhme_hash);
1353 khme_hash_pa = va_to_pa(khme_hash);
1356 * Initialize relocation locks. kpr_suspendlock is held
1357 * at PIL_MAX to prevent interrupts from pinning the holder
1358 * of a suspended TTE which may access it leading to a
1359 * deadlock condition.
1361 mutex_init(&kpr_mutex, NULL, MUTEX_DEFAULT, NULL);
1362 mutex_init(&kpr_suspendlock, NULL, MUTEX_SPIN, (void *)PIL_MAX);
1365 * If Shared context support is disabled via /etc/system
1366 * set shctx_on to 0 here if it was set to 1 earlier in boot
1367 * sequence by cpu module initialization code.
1369 if (shctx_on && disable_shctx) {
1370 shctx_on = 0;
1373 if (shctx_on) {
1374 srd_buckets = kmem_zalloc(SFMMU_MAX_SRD_BUCKETS *
1375 sizeof (srd_buckets[0]), KM_SLEEP);
1376 for (i = 0; i < SFMMU_MAX_SRD_BUCKETS; i++) {
1377 mutex_init(&srd_buckets[i].srdb_lock, NULL,
1378 MUTEX_DEFAULT, NULL);
1381 srd_cache = kmem_cache_create("srd_cache", sizeof (sf_srd_t),
1382 0, sfmmu_srdcache_constructor, sfmmu_srdcache_destructor,
1383 NULL, NULL, NULL, 0);
1384 region_cache = kmem_cache_create("region_cache",
1385 sizeof (sf_region_t), 0, sfmmu_rgncache_constructor,
1386 sfmmu_rgncache_destructor, NULL, NULL, NULL, 0);
1387 scd_cache = kmem_cache_create("scd_cache", sizeof (sf_scd_t),
1388 0, sfmmu_scdcache_constructor, sfmmu_scdcache_destructor,
1389 NULL, NULL, NULL, 0);
1393 * Pre-allocate hrm_hashtab before enabling the collection of
1394 * refmod statistics. Allocating on the fly would mean us
1395 * running the risk of suffering recursive mutex enters or
1396 * deadlocks.
1398 hrm_hashtab = kmem_zalloc(HRM_HASHSIZE * sizeof (struct hrmstat *),
1399 KM_SLEEP);
1401 /* Allocate per-cpu pending freelist of hmeblks */
1402 cpu_hme_pend = kmem_zalloc((NCPU * sizeof (cpu_hme_pend_t)) + 64,
1403 KM_SLEEP);
1404 cpu_hme_pend = (cpu_hme_pend_t *)P2ROUNDUP(
1405 (uintptr_t)cpu_hme_pend, 64);
1407 for (i = 0; i < NCPU; i++) {
1408 mutex_init(&cpu_hme_pend[i].chp_mutex, NULL, MUTEX_DEFAULT,
1409 NULL);
1412 if (cpu_hme_pend_thresh == 0) {
1413 cpu_hme_pend_thresh = CPU_HME_PEND_THRESH;
1418 * Initialize locking for the hat layer, called early during boot.
1420 static void
1421 hat_lock_init()
1423 int i;
1426 * initialize the array of mutexes protecting a page's mapping
1427 * list and p_nrm field.
1429 for (i = 0; i < MML_TABLE_SIZE; i++)
1430 mutex_init(&mml_table[i].pad_mutex, NULL, MUTEX_DEFAULT, NULL);
1432 if (kpm_enable) {
1433 for (i = 0; i < kpmp_table_sz; i++) {
1434 mutex_init(&kpmp_table[i].khl_mutex, NULL,
1435 MUTEX_DEFAULT, NULL);
1440 * Initialize array of mutex locks that protects sfmmu fields and
1441 * TSB lists.
1443 for (i = 0; i < SFMMU_NUM_LOCK; i++)
1444 mutex_init(HATLOCK_MUTEXP(&hat_lock[i]), NULL, MUTEX_DEFAULT,
1445 NULL);
1448 #define SFMMU_KERNEL_MAXVA \
1449 (kmem64_base ? (uintptr_t)kmem64_end : (SYSLIMIT))
1452 * Allocate a hat structure.
1453 * Called when an address space first uses a hat.
1455 struct hat *
1456 hat_alloc(struct as *as)
1458 sfmmu_t *sfmmup;
1459 int i;
1460 uint64_t cnum;
1461 extern uint_t get_color_start(struct as *);
1463 ASSERT(AS_WRITE_HELD(as));
1464 sfmmup = kmem_cache_alloc(sfmmuid_cache, KM_SLEEP);
1465 sfmmup->sfmmu_as = as;
1466 sfmmup->sfmmu_flags = 0;
1467 sfmmup->sfmmu_tteflags = 0;
1468 sfmmup->sfmmu_rtteflags = 0;
1469 LOCK_INIT_CLEAR(&sfmmup->sfmmu_ctx_lock);
1471 if (as == &kas) {
1472 ksfmmup = sfmmup;
1473 sfmmup->sfmmu_cext = 0;
1474 cnum = KCONTEXT;
1476 sfmmup->sfmmu_clrstart = 0;
1477 sfmmup->sfmmu_tsb = NULL;
1479 * hat_kern_setup() will call sfmmu_init_ktsbinfo()
1480 * to setup tsb_info for ksfmmup.
1482 } else {
1485 * Just set to invalid ctx. When it faults, it will
1486 * get a valid ctx. This would avoid the situation
1487 * where we get a ctx, but it gets stolen and then
1488 * we fault when we try to run and so have to get
1489 * another ctx.
1491 sfmmup->sfmmu_cext = 0;
1492 cnum = INVALID_CONTEXT;
1494 /* initialize original physical page coloring bin */
1495 sfmmup->sfmmu_clrstart = get_color_start(as);
1496 #ifdef DEBUG
1497 if (tsb_random_size) {
1498 uint32_t randval = (uint32_t)gettick() >> 4;
1499 int size = randval % (tsb_max_growsize + 1);
1501 /* chose a random tsb size for stress testing */
1502 (void) sfmmu_tsbinfo_alloc(&sfmmup->sfmmu_tsb, size,
1503 TSB8K|TSB64K|TSB512K, 0, sfmmup);
1504 } else
1505 #endif /* DEBUG */
1506 (void) sfmmu_tsbinfo_alloc(&sfmmup->sfmmu_tsb,
1507 default_tsb_size,
1508 TSB8K|TSB64K|TSB512K, 0, sfmmup);
1509 sfmmup->sfmmu_flags = HAT_SWAPPED | HAT_ALLCTX_INVALID;
1510 ASSERT(sfmmup->sfmmu_tsb != NULL);
1513 ASSERT(max_mmu_ctxdoms > 0);
1514 for (i = 0; i < max_mmu_ctxdoms; i++) {
1515 sfmmup->sfmmu_ctxs[i].cnum = cnum;
1516 sfmmup->sfmmu_ctxs[i].gnum = 0;
1519 for (i = 0; i < max_mmu_page_sizes; i++) {
1520 sfmmup->sfmmu_ttecnt[i] = 0;
1521 sfmmup->sfmmu_scdrttecnt[i] = 0;
1522 sfmmup->sfmmu_ismttecnt[i] = 0;
1523 sfmmup->sfmmu_scdismttecnt[i] = 0;
1524 sfmmup->sfmmu_pgsz[i] = TTE8K;
1526 sfmmup->sfmmu_tsb0_4minflcnt = 0;
1527 sfmmup->sfmmu_iblk = NULL;
1528 sfmmup->sfmmu_ismhat = 0;
1529 sfmmup->sfmmu_scdhat = 0;
1530 sfmmup->sfmmu_ismblkpa = (uint64_t)-1;
1531 if (sfmmup == ksfmmup) {
1532 CPUSET_ALL(sfmmup->sfmmu_cpusran);
1533 } else {
1534 CPUSET_ZERO(sfmmup->sfmmu_cpusran);
1536 sfmmup->sfmmu_free = 0;
1537 sfmmup->sfmmu_rmstat = 0;
1538 sfmmup->sfmmu_clrbin = sfmmup->sfmmu_clrstart;
1539 cv_init(&sfmmup->sfmmu_tsb_cv, NULL, CV_DEFAULT, NULL);
1540 sfmmup->sfmmu_srdp = NULL;
1541 SF_RGNMAP_ZERO(sfmmup->sfmmu_region_map);
1542 bzero(sfmmup->sfmmu_hmeregion_links, SFMMU_L1_HMERLINKS_SIZE);
1543 sfmmup->sfmmu_scdp = NULL;
1544 sfmmup->sfmmu_scd_link.next = NULL;
1545 sfmmup->sfmmu_scd_link.prev = NULL;
1546 return (sfmmup);
1550 * Create per-MMU context domain kstats for a given MMU ctx.
1552 static void
1553 sfmmu_mmu_kstat_create(mmu_ctx_t *mmu_ctxp)
1555 mmu_ctx_stat_t stat;
1556 kstat_t *mmu_kstat;
1558 ASSERT(MUTEX_HELD(&cpu_lock));
1559 ASSERT(mmu_ctxp->mmu_kstat == NULL);
1561 mmu_kstat = kstat_create("unix", mmu_ctxp->mmu_idx, "mmu_ctx",
1562 "hat", KSTAT_TYPE_NAMED, MMU_CTX_NUM_STATS, KSTAT_FLAG_VIRTUAL);
1564 if (mmu_kstat == NULL) {
1565 cmn_err(CE_WARN, "kstat_create for MMU %d failed",
1566 mmu_ctxp->mmu_idx);
1567 } else {
1568 mmu_kstat->ks_data = mmu_ctxp->mmu_kstat_data;
1569 for (stat = 0; stat < MMU_CTX_NUM_STATS; stat++)
1570 kstat_named_init(&mmu_ctxp->mmu_kstat_data[stat],
1571 mmu_ctx_kstat_names[stat], KSTAT_DATA_INT64);
1572 mmu_ctxp->mmu_kstat = mmu_kstat;
1573 kstat_install(mmu_kstat);
1578 * plat_cpuid_to_mmu_ctx_info() is a platform interface that returns MMU
1579 * context domain information for a given CPU. If a platform does not
1580 * specify that interface, then the function below is used instead to return
1581 * default information. The defaults are as follows:
1583 * - The number of MMU context IDs supported on any CPU in the
1584 * system is 8K.
1585 * - There is one MMU context domain per CPU.
1587 /*ARGSUSED*/
1588 static void
1589 sfmmu_cpuid_to_mmu_ctx_info(processorid_t cpuid, mmu_ctx_info_t *infop)
1591 infop->mmu_nctxs = nctxs;
1592 infop->mmu_idx = cpu[cpuid]->cpu_seqid;
1596 * Called during CPU initialization to set the MMU context-related information
1597 * for a CPU.
1599 * cpu_lock serializes accesses to mmu_ctxs and mmu_saved_gnum.
1601 void
1602 sfmmu_cpu_init(cpu_t *cp)
1604 mmu_ctx_info_t info;
1605 mmu_ctx_t *mmu_ctxp;
1607 ASSERT(MUTEX_HELD(&cpu_lock));
1609 if (&plat_cpuid_to_mmu_ctx_info == NULL)
1610 sfmmu_cpuid_to_mmu_ctx_info(cp->cpu_id, &info);
1611 else
1612 plat_cpuid_to_mmu_ctx_info(cp->cpu_id, &info);
1614 ASSERT(info.mmu_idx < max_mmu_ctxdoms);
1616 if ((mmu_ctxp = mmu_ctxs_tbl[info.mmu_idx]) == NULL) {
1617 /* Each mmu_ctx is cacheline aligned. */
1618 mmu_ctxp = kmem_cache_alloc(mmuctxdom_cache, KM_SLEEP);
1619 bzero(mmu_ctxp, sizeof (mmu_ctx_t));
1621 mutex_init(&mmu_ctxp->mmu_lock, NULL, MUTEX_SPIN,
1622 (void *)ipltospl(DISP_LEVEL));
1623 mmu_ctxp->mmu_idx = info.mmu_idx;
1624 mmu_ctxp->mmu_nctxs = info.mmu_nctxs;
1626 * Globally for lifetime of a system,
1627 * gnum must always increase.
1628 * mmu_saved_gnum is protected by the cpu_lock.
1630 mmu_ctxp->mmu_gnum = mmu_saved_gnum + 1;
1631 mmu_ctxp->mmu_cnum = NUM_LOCKED_CTXS;
1633 sfmmu_mmu_kstat_create(mmu_ctxp);
1635 mmu_ctxs_tbl[info.mmu_idx] = mmu_ctxp;
1636 } else {
1637 ASSERT(mmu_ctxp->mmu_idx == info.mmu_idx);
1638 ASSERT(mmu_ctxp->mmu_nctxs <= info.mmu_nctxs);
1642 * The mmu_lock is acquired here to prevent races with
1643 * the wrap-around code.
1645 mutex_enter(&mmu_ctxp->mmu_lock);
1648 mmu_ctxp->mmu_ncpus++;
1649 CPUSET_ADD(mmu_ctxp->mmu_cpuset, cp->cpu_id);
1650 CPU_MMU_IDX(cp) = info.mmu_idx;
1651 CPU_MMU_CTXP(cp) = mmu_ctxp;
1653 mutex_exit(&mmu_ctxp->mmu_lock);
1656 static void
1657 sfmmu_ctxdom_free(mmu_ctx_t *mmu_ctxp)
1659 ASSERT(MUTEX_HELD(&cpu_lock));
1660 ASSERT(!MUTEX_HELD(&mmu_ctxp->mmu_lock));
1662 mutex_destroy(&mmu_ctxp->mmu_lock);
1664 if (mmu_ctxp->mmu_kstat)
1665 kstat_delete(mmu_ctxp->mmu_kstat);
1667 /* mmu_saved_gnum is protected by the cpu_lock. */
1668 if (mmu_saved_gnum < mmu_ctxp->mmu_gnum)
1669 mmu_saved_gnum = mmu_ctxp->mmu_gnum;
1671 kmem_cache_free(mmuctxdom_cache, mmu_ctxp);
1675 * Called to perform MMU context-related cleanup for a CPU.
1677 void
1678 sfmmu_cpu_cleanup(cpu_t *cp)
1680 mmu_ctx_t *mmu_ctxp;
1682 ASSERT(MUTEX_HELD(&cpu_lock));
1684 mmu_ctxp = CPU_MMU_CTXP(cp);
1685 ASSERT(mmu_ctxp != NULL);
1688 * The mmu_lock is acquired here to prevent races with
1689 * the wrap-around code.
1691 mutex_enter(&mmu_ctxp->mmu_lock);
1693 CPU_MMU_CTXP(cp) = NULL;
1695 CPUSET_DEL(mmu_ctxp->mmu_cpuset, cp->cpu_id);
1696 if (--mmu_ctxp->mmu_ncpus == 0) {
1697 mmu_ctxs_tbl[mmu_ctxp->mmu_idx] = NULL;
1698 mutex_exit(&mmu_ctxp->mmu_lock);
1699 sfmmu_ctxdom_free(mmu_ctxp);
1700 return;
1703 mutex_exit(&mmu_ctxp->mmu_lock);
1706 uint_t
1707 sfmmu_ctxdom_nctxs(int idx)
1709 return (mmu_ctxs_tbl[idx]->mmu_nctxs);
1712 #ifdef sun4v
1714 * sfmmu_ctxdoms_* is an interface provided to help keep context domains
1715 * consistant after suspend/resume on system that can resume on a different
1716 * hardware than it was suspended.
1718 * sfmmu_ctxdom_lock(void) locks all context domains and prevents new contexts
1719 * from being allocated. It acquires all hat_locks, which blocks most access to
1720 * context data, except for a few cases that are handled separately or are
1721 * harmless. It wraps each domain to increment gnum and invalidate on-CPU
1722 * contexts, and forces cnum to its max. As a result of this call all user
1723 * threads that are running on CPUs trap and try to perform wrap around but
1724 * can't because hat_locks are taken. Threads that were not on CPUs but started
1725 * by scheduler go to sfmmu_alloc_ctx() to aquire context without checking
1726 * hat_lock, but fail, because cnum == nctxs, and therefore also trap and block
1727 * on hat_lock trying to wrap. sfmmu_ctxdom_lock() must be called before CPUs
1728 * are paused, else it could deadlock acquiring locks held by paused CPUs.
1730 * sfmmu_ctxdoms_remove() removes context domains from every CPUs and records
1731 * the CPUs that had them. It must be called after CPUs have been paused. This
1732 * ensures that no threads are in sfmmu_alloc_ctx() accessing domain data,
1733 * because pause_cpus sends a mondo interrupt to every CPU, and sfmmu_alloc_ctx
1734 * runs with interrupts disabled. When CPUs are later resumed, they may enter
1735 * sfmmu_alloc_ctx, but it will check for CPU_MMU_CTXP = NULL and immediately
1736 * return failure. Or, they will be blocked trying to acquire hat_lock. Thus
1737 * after sfmmu_ctxdoms_remove returns, we are guaranteed that no one is
1738 * accessing the old context domains.
1740 * sfmmu_ctxdoms_update(void) frees space used by old context domains and
1741 * allocates new context domains based on hardware layout. It initializes
1742 * every CPU that had context domain before migration to have one again.
1743 * sfmmu_ctxdoms_update must be called after CPUs are resumed, else it
1744 * could deadlock acquiring locks held by paused CPUs.
1746 * sfmmu_ctxdoms_unlock(void) releases all hat_locks after which user threads
1747 * acquire new context ids and continue execution.
1749 * Therefore functions should be called in the following order:
1750 * suspend_routine()
1751 * sfmmu_ctxdom_lock()
1752 * pause_cpus()
1753 * suspend()
1754 * if (suspend failed)
1755 * sfmmu_ctxdom_unlock()
1756 * ...
1757 * sfmmu_ctxdom_remove()
1758 * resume_cpus()
1759 * sfmmu_ctxdom_update()
1760 * sfmmu_ctxdom_unlock()
1762 static cpuset_t sfmmu_ctxdoms_pset;
1764 void
1765 sfmmu_ctxdoms_remove()
1767 processorid_t id;
1768 cpu_t *cp;
1771 * Record the CPUs that have domains in sfmmu_ctxdoms_pset, so they can
1772 * be restored post-migration. A CPU may be powered off and not have a
1773 * domain, for example.
1775 CPUSET_ZERO(sfmmu_ctxdoms_pset);
1777 for (id = 0; id < NCPU; id++) {
1778 if ((cp = cpu[id]) != NULL && CPU_MMU_CTXP(cp) != NULL) {
1779 CPUSET_ADD(sfmmu_ctxdoms_pset, id);
1780 CPU_MMU_CTXP(cp) = NULL;
1785 void
1786 sfmmu_ctxdoms_lock(void)
1788 int idx;
1789 mmu_ctx_t *mmu_ctxp;
1791 sfmmu_hat_lock_all();
1794 * At this point, no thread can be in sfmmu_ctx_wrap_around, because
1795 * hat_lock is always taken before calling it.
1797 * For each domain, set mmu_cnum to max so no more contexts can be
1798 * allocated, and wrap to flush on-CPU contexts and force threads to
1799 * acquire a new context when we later drop hat_lock after migration.
1800 * Setting mmu_cnum may race with sfmmu_alloc_ctx which also sets cnum,
1801 * but the latter uses CAS and will miscompare and not overwrite it.
1803 kpreempt_disable(); /* required by sfmmu_ctx_wrap_around */
1804 for (idx = 0; idx < max_mmu_ctxdoms; idx++) {
1805 if ((mmu_ctxp = mmu_ctxs_tbl[idx]) != NULL) {
1806 mutex_enter(&mmu_ctxp->mmu_lock);
1807 mmu_ctxp->mmu_cnum = mmu_ctxp->mmu_nctxs;
1808 /* make sure updated cnum visible */
1809 membar_enter();
1810 mutex_exit(&mmu_ctxp->mmu_lock);
1811 sfmmu_ctx_wrap_around(mmu_ctxp, B_FALSE);
1814 kpreempt_enable();
1817 void
1818 sfmmu_ctxdoms_unlock(void)
1820 sfmmu_hat_unlock_all();
1823 void
1824 sfmmu_ctxdoms_update(void)
1826 processorid_t id;
1827 cpu_t *cp;
1828 uint_t idx;
1829 mmu_ctx_t *mmu_ctxp;
1832 * Free all context domains. As side effect, this increases
1833 * mmu_saved_gnum to the maximum gnum over all domains, which is used to
1834 * init gnum in the new domains, which therefore will be larger than the
1835 * sfmmu gnum for any process, guaranteeing that every process will see
1836 * a new generation and allocate a new context regardless of what new
1837 * domain it runs in.
1839 mutex_enter(&cpu_lock);
1841 for (idx = 0; idx < max_mmu_ctxdoms; idx++) {
1842 if (mmu_ctxs_tbl[idx] != NULL) {
1843 mmu_ctxp = mmu_ctxs_tbl[idx];
1844 mmu_ctxs_tbl[idx] = NULL;
1845 sfmmu_ctxdom_free(mmu_ctxp);
1849 for (id = 0; id < NCPU; id++) {
1850 if (CPU_IN_SET(sfmmu_ctxdoms_pset, id) &&
1851 (cp = cpu[id]) != NULL)
1852 sfmmu_cpu_init(cp);
1854 mutex_exit(&cpu_lock);
1856 #endif
1859 * Hat_setup, makes an address space context the current active one.
1860 * In sfmmu this translates to setting the secondary context with the
1861 * corresponding context.
1863 void
1864 hat_setup(struct hat *sfmmup, int allocflag)
1866 hatlock_t *hatlockp;
1868 /* Init needs some special treatment. */
1869 if (allocflag == HAT_INIT) {
1871 * Make sure that we have
1872 * 1. a TSB
1873 * 2. a valid ctx that doesn't get stolen after this point.
1875 hatlockp = sfmmu_hat_enter(sfmmup);
1878 * Swap in the TSB. hat_init() allocates tsbinfos without
1879 * TSBs, but we need one for init, since the kernel does some
1880 * special things to set up its stack and needs the TSB to
1881 * resolve page faults.
1883 sfmmu_tsb_swapin(sfmmup, hatlockp);
1885 sfmmu_get_ctx(sfmmup);
1887 sfmmu_hat_exit(hatlockp);
1888 } else {
1889 ASSERT(allocflag == HAT_ALLOC);
1891 hatlockp = sfmmu_hat_enter(sfmmup);
1892 kpreempt_disable();
1894 CPUSET_ADD(sfmmup->sfmmu_cpusran, CPU->cpu_id);
1896 * sfmmu_setctx_sec takes <pgsz|cnum> as a parameter,
1897 * pagesize bits don't matter in this case since we are passing
1898 * INVALID_CONTEXT to it.
1899 * Compatibility Note: hw takes care of MMU_SCONTEXT1
1901 sfmmu_setctx_sec(INVALID_CONTEXT);
1902 sfmmu_clear_utsbinfo();
1904 kpreempt_enable();
1905 sfmmu_hat_exit(hatlockp);
1910 * Free all the translation resources for the specified address space.
1911 * Called from as_free when an address space is being destroyed.
1913 void
1914 hat_free_start(struct hat *sfmmup)
1916 ASSERT(AS_WRITE_HELD(sfmmup->sfmmu_as));
1917 ASSERT(sfmmup != ksfmmup);
1919 sfmmup->sfmmu_free = 1;
1920 if (sfmmup->sfmmu_scdp != NULL) {
1921 sfmmu_leave_scd(sfmmup, 0);
1924 ASSERT(sfmmup->sfmmu_scdp == NULL);
1927 void
1928 hat_free_end(struct hat *sfmmup)
1930 int i;
1932 ASSERT(sfmmup->sfmmu_free == 1);
1933 ASSERT(sfmmup->sfmmu_ttecnt[TTE8K] == 0);
1934 ASSERT(sfmmup->sfmmu_ttecnt[TTE64K] == 0);
1935 ASSERT(sfmmup->sfmmu_ttecnt[TTE512K] == 0);
1936 ASSERT(sfmmup->sfmmu_ttecnt[TTE4M] == 0);
1937 ASSERT(sfmmup->sfmmu_ttecnt[TTE32M] == 0);
1938 ASSERT(sfmmup->sfmmu_ttecnt[TTE256M] == 0);
1940 if (sfmmup->sfmmu_rmstat) {
1941 hat_freestat(sfmmup->sfmmu_as, NULL);
1944 while (sfmmup->sfmmu_tsb != NULL) {
1945 struct tsb_info *next = sfmmup->sfmmu_tsb->tsb_next;
1946 sfmmu_tsbinfo_free(sfmmup->sfmmu_tsb);
1947 sfmmup->sfmmu_tsb = next;
1950 if (sfmmup->sfmmu_srdp != NULL) {
1951 sfmmu_leave_srd(sfmmup);
1952 ASSERT(sfmmup->sfmmu_srdp == NULL);
1953 for (i = 0; i < SFMMU_L1_HMERLINKS; i++) {
1954 if (sfmmup->sfmmu_hmeregion_links[i] != NULL) {
1955 kmem_free(sfmmup->sfmmu_hmeregion_links[i],
1956 SFMMU_L2_HMERLINKS_SIZE);
1957 sfmmup->sfmmu_hmeregion_links[i] = NULL;
1961 sfmmu_free_sfmmu(sfmmup);
1963 #ifdef DEBUG
1964 for (i = 0; i < SFMMU_L1_HMERLINKS; i++) {
1965 ASSERT(sfmmup->sfmmu_hmeregion_links[i] == NULL);
1967 #endif
1969 kmem_cache_free(sfmmuid_cache, sfmmup);
1973 * Set up any translation structures, for the specified address space,
1974 * that are needed or preferred when the process is being swapped in.
1976 /* ARGSUSED */
1977 void
1978 hat_swapin(struct hat *hat)
1983 * Free all of the translation resources, for the specified address space,
1984 * that can be freed while the process is swapped out. Called from as_swapout.
1985 * Also, free up the ctx that this process was using.
1987 void
1988 hat_swapout(struct hat *sfmmup)
1990 struct hmehash_bucket *hmebp;
1991 struct hme_blk *hmeblkp;
1992 struct hme_blk *pr_hblk = NULL;
1993 struct hme_blk *nx_hblk;
1994 int i;
1995 struct hme_blk *list = NULL;
1996 hatlock_t *hatlockp;
1997 struct tsb_info *tsbinfop;
1998 struct free_tsb {
1999 struct free_tsb *next;
2000 struct tsb_info *tsbinfop;
2001 }; /* free list of TSBs */
2002 struct free_tsb *freelist, *last, *next;
2004 SFMMU_STAT(sf_swapout);
2007 * There is no way to go from an as to all its translations in sfmmu.
2008 * Here is one of the times when we take the big hit and traverse
2009 * the hash looking for hme_blks to free up. Not only do we free up
2010 * this as hme_blks but all those that are free. We are obviously
2011 * swapping because we need memory so let's free up as much
2012 * as we can.
2014 * Note that we don't flush TLB/TSB here -- it's not necessary
2015 * because:
2016 * 1) we free the ctx we're using and throw away the TSB(s);
2017 * 2) processes aren't runnable while being swapped out.
2019 ASSERT(sfmmup != KHATID);
2020 for (i = 0; i <= UHMEHASH_SZ; i++) {
2021 hmebp = &uhme_hash[i];
2022 SFMMU_HASH_LOCK(hmebp);
2023 hmeblkp = hmebp->hmeblkp;
2024 pr_hblk = NULL;
2025 while (hmeblkp) {
2027 if ((hmeblkp->hblk_tag.htag_id == sfmmup) &&
2028 !hmeblkp->hblk_shw_bit && !hmeblkp->hblk_lckcnt) {
2029 ASSERT(!hmeblkp->hblk_shared);
2030 (void) sfmmu_hblk_unload(sfmmup, hmeblkp,
2031 (caddr_t)get_hblk_base(hmeblkp),
2032 get_hblk_endaddr(hmeblkp),
2033 NULL, HAT_UNLOAD);
2035 nx_hblk = hmeblkp->hblk_next;
2036 if (!hmeblkp->hblk_vcnt && !hmeblkp->hblk_hmecnt) {
2037 ASSERT(!hmeblkp->hblk_lckcnt);
2038 sfmmu_hblk_hash_rm(hmebp, hmeblkp, pr_hblk,
2039 &list, 0);
2040 } else {
2041 pr_hblk = hmeblkp;
2043 hmeblkp = nx_hblk;
2045 SFMMU_HASH_UNLOCK(hmebp);
2048 sfmmu_hblks_list_purge(&list, 0);
2051 * Now free up the ctx so that others can reuse it.
2053 hatlockp = sfmmu_hat_enter(sfmmup);
2055 sfmmu_invalidate_ctx(sfmmup);
2058 * Free TSBs, but not tsbinfos, and set SWAPPED flag.
2059 * If TSBs were never swapped in, just return.
2060 * This implies that we don't support partial swapping
2061 * of TSBs -- either all are swapped out, or none are.
2063 * We must hold the HAT lock here to prevent racing with another
2064 * thread trying to unmap TTEs from the TSB or running the post-
2065 * relocator after relocating the TSB's memory. Unfortunately, we
2066 * can't free memory while holding the HAT lock or we could
2067 * deadlock, so we build a list of TSBs to be freed after marking
2068 * the tsbinfos as swapped out and free them after dropping the
2069 * lock.
2071 if (SFMMU_FLAGS_ISSET(sfmmup, HAT_SWAPPED)) {
2072 sfmmu_hat_exit(hatlockp);
2073 return;
2076 SFMMU_FLAGS_SET(sfmmup, HAT_SWAPPED);
2077 last = freelist = NULL;
2078 for (tsbinfop = sfmmup->sfmmu_tsb; tsbinfop != NULL;
2079 tsbinfop = tsbinfop->tsb_next) {
2080 ASSERT((tsbinfop->tsb_flags & TSB_SWAPPED) == 0);
2083 * Cast the TSB into a struct free_tsb and put it on the free
2084 * list.
2086 if (freelist == NULL) {
2087 last = freelist = (struct free_tsb *)tsbinfop->tsb_va;
2088 } else {
2089 last->next = (struct free_tsb *)tsbinfop->tsb_va;
2090 last = last->next;
2092 last->next = NULL;
2093 last->tsbinfop = tsbinfop;
2094 tsbinfop->tsb_flags |= TSB_SWAPPED;
2096 * Zero out the TTE to clear the valid bit.
2097 * Note we can't use a value like 0xbad because we want to
2098 * ensure diagnostic bits are NEVER set on TTEs that might
2099 * be loaded. The intent is to catch any invalid access
2100 * to the swapped TSB, such as a thread running with a valid
2101 * context without first calling sfmmu_tsb_swapin() to
2102 * allocate TSB memory.
2104 tsbinfop->tsb_tte.ll = 0;
2107 /* Now we can drop the lock and free the TSB memory. */
2108 sfmmu_hat_exit(hatlockp);
2109 for (; freelist != NULL; freelist = next) {
2110 next = freelist->next;
2111 sfmmu_tsb_free(freelist->tsbinfop);
2116 * Duplicate the translations of an as into another newas
2118 /* ARGSUSED */
2120 hat_dup(struct hat *hat, struct hat *newhat, caddr_t addr, size_t len,
2121 uint_t flag)
2123 sf_srd_t *srdp;
2124 sf_scd_t *scdp;
2125 int i;
2126 extern uint_t get_color_start(struct as *);
2128 ASSERT((flag == 0) || (flag == HAT_DUP_ALL) || (flag == HAT_DUP_COW) ||
2129 (flag == HAT_DUP_SRD));
2130 ASSERT(hat != ksfmmup);
2131 ASSERT(newhat != ksfmmup);
2132 ASSERT(flag != HAT_DUP_ALL || hat->sfmmu_srdp == newhat->sfmmu_srdp);
2134 if (flag == HAT_DUP_COW) {
2135 panic("hat_dup: HAT_DUP_COW not supported");
2138 if (flag == HAT_DUP_SRD && ((srdp = hat->sfmmu_srdp) != NULL)) {
2139 ASSERT(srdp->srd_evp != NULL);
2140 VN_HOLD(srdp->srd_evp);
2141 ASSERT(srdp->srd_refcnt > 0);
2142 newhat->sfmmu_srdp = srdp;
2143 atomic_inc_32((volatile uint_t *)&srdp->srd_refcnt);
2147 * HAT_DUP_ALL flag is used after as duplication is done.
2149 if (flag == HAT_DUP_ALL && ((srdp = newhat->sfmmu_srdp) != NULL)) {
2150 ASSERT(newhat->sfmmu_srdp->srd_refcnt >= 2);
2151 newhat->sfmmu_rtteflags = hat->sfmmu_rtteflags;
2152 if (hat->sfmmu_flags & HAT_4MTEXT_FLAG) {
2153 newhat->sfmmu_flags |= HAT_4MTEXT_FLAG;
2156 /* check if need to join scd */
2157 if ((scdp = hat->sfmmu_scdp) != NULL &&
2158 newhat->sfmmu_scdp != scdp) {
2159 int ret;
2160 SF_RGNMAP_IS_SUBSET(&newhat->sfmmu_region_map,
2161 &scdp->scd_region_map, ret);
2162 ASSERT(ret);
2163 sfmmu_join_scd(scdp, newhat);
2164 ASSERT(newhat->sfmmu_scdp == scdp &&
2165 scdp->scd_refcnt >= 2);
2166 for (i = 0; i < max_mmu_page_sizes; i++) {
2167 newhat->sfmmu_ismttecnt[i] =
2168 hat->sfmmu_ismttecnt[i];
2169 newhat->sfmmu_scdismttecnt[i] =
2170 hat->sfmmu_scdismttecnt[i];
2174 sfmmu_check_page_sizes(newhat, 1);
2177 if (flag == HAT_DUP_ALL && consistent_coloring == 0 &&
2178 update_proc_pgcolorbase_after_fork != 0) {
2179 hat->sfmmu_clrbin = get_color_start(hat->sfmmu_as);
2181 return (0);
2184 void
2185 hat_memload(struct hat *hat, caddr_t addr, struct page *pp,
2186 uint_t attr, uint_t flags)
2188 hat_do_memload(hat, addr, pp, attr, flags,
2189 SFMMU_INVALID_SHMERID);
2192 void
2193 hat_memload_region(struct hat *hat, caddr_t addr, struct page *pp,
2194 uint_t attr, uint_t flags, hat_region_cookie_t rcookie)
2196 uint_t rid;
2197 if (rcookie == HAT_INVALID_REGION_COOKIE) {
2198 hat_do_memload(hat, addr, pp, attr, flags,
2199 SFMMU_INVALID_SHMERID);
2200 return;
2202 rid = (uint_t)((uint64_t)rcookie);
2203 ASSERT(rid < SFMMU_MAX_HME_REGIONS);
2204 hat_do_memload(hat, addr, pp, attr, flags, rid);
2208 * Set up addr to map to page pp with protection prot.
2209 * As an optimization we also load the TSB with the
2210 * corresponding tte but it is no big deal if the tte gets kicked out.
2212 static void
2213 hat_do_memload(struct hat *hat, caddr_t addr, struct page *pp,
2214 uint_t attr, uint_t flags, uint_t rid)
2216 tte_t tte;
2219 ASSERT(hat != NULL);
2220 ASSERT(PAGE_LOCKED(pp));
2221 ASSERT(!((uintptr_t)addr & MMU_PAGEOFFSET));
2222 ASSERT(!(flags & ~SFMMU_LOAD_ALLFLAG));
2223 ASSERT(!(attr & ~SFMMU_LOAD_ALLATTR));
2224 SFMMU_VALIDATE_HMERID(hat, rid, addr, MMU_PAGESIZE);
2226 if (PP_ISFREE(pp)) {
2227 panic("hat_memload: loading a mapping to free page %p",
2228 (void *)pp);
2231 ASSERT((hat == ksfmmup) || AS_LOCK_HELD(hat->sfmmu_as));
2233 if (flags & ~SFMMU_LOAD_ALLFLAG)
2234 cmn_err(CE_NOTE, "hat_memload: unsupported flags %d",
2235 flags & ~SFMMU_LOAD_ALLFLAG);
2237 if (hat->sfmmu_rmstat)
2238 hat_resvstat(MMU_PAGESIZE, hat->sfmmu_as, addr);
2240 #if defined(SF_ERRATA_57)
2241 if ((hat != ksfmmup) && AS_TYPE_64BIT(hat->sfmmu_as) &&
2242 (addr < errata57_limit) && (attr & PROT_EXEC) &&
2243 !(flags & HAT_LOAD_SHARE)) {
2244 cmn_err(CE_WARN, "hat_memload: illegal attempt to make user "
2245 " page executable");
2246 attr &= ~PROT_EXEC;
2248 #endif
2250 sfmmu_memtte(&tte, pp->p_pagenum, attr, TTE8K);
2251 (void) sfmmu_tteload_array(hat, &tte, addr, &pp, flags, rid);
2254 * Check TSB and TLB page sizes.
2256 if ((flags & HAT_LOAD_SHARE) == 0) {
2257 sfmmu_check_page_sizes(hat, 1);
2262 * hat_devload can be called to map real memory (e.g.
2263 * /dev/kmem) and even though hat_devload will determine pf is
2264 * for memory, it will be unable to get a shared lock on the
2265 * page (because someone else has it exclusively) and will
2266 * pass dp = NULL. If tteload doesn't get a non-NULL
2267 * page pointer it can't cache memory.
2269 void
2270 hat_devload(struct hat *hat, caddr_t addr, size_t len, pfn_t pfn,
2271 uint_t attr, int flags)
2273 tte_t tte;
2274 struct page *pp = NULL;
2275 int use_lgpg = 0;
2277 ASSERT(hat != NULL);
2279 ASSERT(!(flags & ~SFMMU_LOAD_ALLFLAG));
2280 ASSERT(!(attr & ~SFMMU_LOAD_ALLATTR));
2281 ASSERT((hat == ksfmmup) || AS_LOCK_HELD(hat->sfmmu_as));
2282 if (len == 0)
2283 panic("hat_devload: zero len");
2284 if (flags & ~SFMMU_LOAD_ALLFLAG)
2285 cmn_err(CE_NOTE, "hat_devload: unsupported flags %d",
2286 flags & ~SFMMU_LOAD_ALLFLAG);
2288 #if defined(SF_ERRATA_57)
2289 if ((hat != ksfmmup) && AS_TYPE_64BIT(hat->sfmmu_as) &&
2290 (addr < errata57_limit) && (attr & PROT_EXEC) &&
2291 !(flags & HAT_LOAD_SHARE)) {
2292 cmn_err(CE_WARN, "hat_devload: illegal attempt to make user "
2293 " page executable");
2294 attr &= ~PROT_EXEC;
2296 #endif
2299 * If it's a memory page find its pp
2301 if (!(flags & HAT_LOAD_NOCONSIST) && pf_is_memory(pfn)) {
2302 pp = page_numtopp_nolock(pfn);
2303 if (pp == NULL) {
2304 flags |= HAT_LOAD_NOCONSIST;
2305 } else {
2306 if (PP_ISFREE(pp)) {
2307 panic("hat_memload: loading "
2308 "a mapping to free page %p",
2309 (void *)pp);
2311 if (!PAGE_LOCKED(pp) && !PP_ISNORELOC(pp)) {
2312 panic("hat_memload: loading a mapping "
2313 "to unlocked relocatable page %p",
2314 (void *)pp);
2316 ASSERT(len == MMU_PAGESIZE);
2320 if (hat->sfmmu_rmstat)
2321 hat_resvstat(len, hat->sfmmu_as, addr);
2323 if (flags & HAT_LOAD_NOCONSIST) {
2324 attr |= SFMMU_UNCACHEVTTE;
2325 use_lgpg = 1;
2327 if (!pf_is_memory(pfn)) {
2328 attr |= SFMMU_UNCACHEPTTE | HAT_NOSYNC;
2329 use_lgpg = 1;
2330 switch (attr & HAT_ORDER_MASK) {
2331 case HAT_STRICTORDER:
2332 case HAT_UNORDERED_OK:
2334 * we set the side effect bit for all non
2335 * memory mappings unless merging is ok
2337 attr |= SFMMU_SIDEFFECT;
2338 break;
2339 case HAT_MERGING_OK:
2340 case HAT_LOADCACHING_OK:
2341 case HAT_STORECACHING_OK:
2342 break;
2343 default:
2344 panic("hat_devload: bad attr");
2345 break;
2348 while (len) {
2349 if (!use_lgpg) {
2350 sfmmu_memtte(&tte, pfn, attr, TTE8K);
2351 (void) sfmmu_tteload_array(hat, &tte, addr, &pp,
2352 flags, SFMMU_INVALID_SHMERID);
2353 len -= MMU_PAGESIZE;
2354 addr += MMU_PAGESIZE;
2355 pfn++;
2356 continue;
2359 * try to use large pages, check va/pa alignments
2360 * Note that 32M/256M page sizes are not (yet) supported.
2362 if ((len >= MMU_PAGESIZE4M) &&
2363 !((uintptr_t)addr & MMU_PAGEOFFSET4M) &&
2364 !(disable_large_pages & (1 << TTE4M)) &&
2365 !(mmu_ptob(pfn) & MMU_PAGEOFFSET4M)) {
2366 sfmmu_memtte(&tte, pfn, attr, TTE4M);
2367 (void) sfmmu_tteload_array(hat, &tte, addr, &pp,
2368 flags, SFMMU_INVALID_SHMERID);
2369 len -= MMU_PAGESIZE4M;
2370 addr += MMU_PAGESIZE4M;
2371 pfn += MMU_PAGESIZE4M / MMU_PAGESIZE;
2372 } else if ((len >= MMU_PAGESIZE512K) &&
2373 !((uintptr_t)addr & MMU_PAGEOFFSET512K) &&
2374 !(disable_large_pages & (1 << TTE512K)) &&
2375 !(mmu_ptob(pfn) & MMU_PAGEOFFSET512K)) {
2376 sfmmu_memtte(&tte, pfn, attr, TTE512K);
2377 (void) sfmmu_tteload_array(hat, &tte, addr, &pp,
2378 flags, SFMMU_INVALID_SHMERID);
2379 len -= MMU_PAGESIZE512K;
2380 addr += MMU_PAGESIZE512K;
2381 pfn += MMU_PAGESIZE512K / MMU_PAGESIZE;
2382 } else if ((len >= MMU_PAGESIZE64K) &&
2383 !((uintptr_t)addr & MMU_PAGEOFFSET64K) &&
2384 !(disable_large_pages & (1 << TTE64K)) &&
2385 !(mmu_ptob(pfn) & MMU_PAGEOFFSET64K)) {
2386 sfmmu_memtte(&tte, pfn, attr, TTE64K);
2387 (void) sfmmu_tteload_array(hat, &tte, addr, &pp,
2388 flags, SFMMU_INVALID_SHMERID);
2389 len -= MMU_PAGESIZE64K;
2390 addr += MMU_PAGESIZE64K;
2391 pfn += MMU_PAGESIZE64K / MMU_PAGESIZE;
2392 } else {
2393 sfmmu_memtte(&tte, pfn, attr, TTE8K);
2394 (void) sfmmu_tteload_array(hat, &tte, addr, &pp,
2395 flags, SFMMU_INVALID_SHMERID);
2396 len -= MMU_PAGESIZE;
2397 addr += MMU_PAGESIZE;
2398 pfn++;
2403 * Check TSB and TLB page sizes.
2405 if ((flags & HAT_LOAD_SHARE) == 0) {
2406 sfmmu_check_page_sizes(hat, 1);
2410 void
2411 hat_memload_array(struct hat *hat, caddr_t addr, size_t len,
2412 struct page **pps, uint_t attr, uint_t flags)
2414 hat_do_memload_array(hat, addr, len, pps, attr, flags,
2415 SFMMU_INVALID_SHMERID);
2418 void
2419 hat_memload_array_region(struct hat *hat, caddr_t addr, size_t len,
2420 struct page **pps, uint_t attr, uint_t flags,
2421 hat_region_cookie_t rcookie)
2423 uint_t rid;
2424 if (rcookie == HAT_INVALID_REGION_COOKIE) {
2425 hat_do_memload_array(hat, addr, len, pps, attr, flags,
2426 SFMMU_INVALID_SHMERID);
2427 return;
2429 rid = (uint_t)((uint64_t)rcookie);
2430 ASSERT(rid < SFMMU_MAX_HME_REGIONS);
2431 hat_do_memload_array(hat, addr, len, pps, attr, flags, rid);
2435 * Map the largest extend possible out of the page array. The array may NOT
2436 * be in order. The largest possible mapping a page can have
2437 * is specified in the p_szc field. The p_szc field
2438 * cannot change as long as there any mappings (large or small)
2439 * to any of the pages that make up the large page. (ie. any
2440 * promotion/demotion of page size is not up to the hat but up to
2441 * the page free list manager). The array
2442 * should consist of properly aligned contigous pages that are
2443 * part of a big page for a large mapping to be created.
2445 static void
2446 hat_do_memload_array(struct hat *hat, caddr_t addr, size_t len,
2447 struct page **pps, uint_t attr, uint_t flags, uint_t rid)
2449 int ttesz;
2450 size_t mapsz;
2451 pgcnt_t numpg, npgs;
2452 tte_t tte;
2453 page_t *pp;
2454 uint_t large_pages_disable;
2456 ASSERT(!((uintptr_t)addr & MMU_PAGEOFFSET));
2457 SFMMU_VALIDATE_HMERID(hat, rid, addr, len);
2459 if (hat->sfmmu_rmstat)
2460 hat_resvstat(len, hat->sfmmu_as, addr);
2462 #if defined(SF_ERRATA_57)
2463 if ((hat != ksfmmup) && AS_TYPE_64BIT(hat->sfmmu_as) &&
2464 (addr < errata57_limit) && (attr & PROT_EXEC) &&
2465 !(flags & HAT_LOAD_SHARE)) {
2466 cmn_err(CE_WARN, "hat_memload_array: illegal attempt to make "
2467 "user page executable");
2468 attr &= ~PROT_EXEC;
2470 #endif
2472 /* Get number of pages */
2473 npgs = len >> MMU_PAGESHIFT;
2475 if (flags & HAT_LOAD_SHARE) {
2476 large_pages_disable = disable_ism_large_pages;
2477 } else {
2478 large_pages_disable = disable_large_pages;
2481 if (npgs < NHMENTS || large_pages_disable == LARGE_PAGES_OFF) {
2482 sfmmu_memload_batchsmall(hat, addr, pps, attr, flags, npgs,
2483 rid);
2484 return;
2487 while (npgs >= NHMENTS) {
2488 pp = *pps;
2489 for (ttesz = pp->p_szc; ttesz != TTE8K; ttesz--) {
2491 * Check if this page size is disabled.
2493 if (large_pages_disable & (1 << ttesz))
2494 continue;
2496 numpg = TTEPAGES(ttesz);
2497 mapsz = numpg << MMU_PAGESHIFT;
2498 if ((npgs >= numpg) &&
2499 IS_P2ALIGNED(addr, mapsz) &&
2500 IS_P2ALIGNED(pp->p_pagenum, numpg)) {
2502 * At this point we have enough pages and
2503 * we know the virtual address and the pfn
2504 * are properly aligned. We still need
2505 * to check for physical contiguity but since
2506 * it is very likely that this is the case
2507 * we will assume they are so and undo
2508 * the request if necessary. It would
2509 * be great if we could get a hint flag
2510 * like HAT_CONTIG which would tell us
2511 * the pages are contigous for sure.
2513 sfmmu_memtte(&tte, (*pps)->p_pagenum,
2514 attr, ttesz);
2515 if (!sfmmu_tteload_array(hat, &tte, addr,
2516 pps, flags, rid)) {
2517 break;
2521 if (ttesz == TTE8K) {
2523 * We were not able to map array using a large page
2524 * batch a hmeblk or fraction at a time.
2526 numpg = ((uintptr_t)addr >> MMU_PAGESHIFT)
2527 & (NHMENTS-1);
2528 numpg = NHMENTS - numpg;
2529 ASSERT(numpg <= npgs);
2530 mapsz = numpg * MMU_PAGESIZE;
2531 sfmmu_memload_batchsmall(hat, addr, pps, attr, flags,
2532 numpg, rid);
2534 addr += mapsz;
2535 npgs -= numpg;
2536 pps += numpg;
2539 if (npgs) {
2540 sfmmu_memload_batchsmall(hat, addr, pps, attr, flags, npgs,
2541 rid);
2545 * Check TSB and TLB page sizes.
2547 if ((flags & HAT_LOAD_SHARE) == 0) {
2548 sfmmu_check_page_sizes(hat, 1);
2553 * Function tries to batch 8K pages into the same hme blk.
2555 static void
2556 sfmmu_memload_batchsmall(struct hat *hat, caddr_t vaddr, page_t **pps,
2557 uint_t attr, uint_t flags, pgcnt_t npgs, uint_t rid)
2559 tte_t tte;
2560 page_t *pp;
2561 struct hmehash_bucket *hmebp;
2562 struct hme_blk *hmeblkp;
2563 int index;
2565 while (npgs) {
2567 * Acquire the hash bucket.
2569 hmebp = sfmmu_tteload_acquire_hashbucket(hat, vaddr, TTE8K,
2570 rid);
2571 ASSERT(hmebp);
2574 * Find the hment block.
2576 hmeblkp = sfmmu_tteload_find_hmeblk(hat, hmebp, vaddr,
2577 TTE8K, flags, rid);
2578 ASSERT(hmeblkp);
2580 do {
2582 * Make the tte.
2584 pp = *pps;
2585 sfmmu_memtte(&tte, pp->p_pagenum, attr, TTE8K);
2588 * Add the translation.
2590 (void) sfmmu_tteload_addentry(hat, hmeblkp, &tte,
2591 vaddr, pps, flags, rid);
2594 * Goto next page.
2596 pps++;
2597 npgs--;
2600 * Goto next address.
2602 vaddr += MMU_PAGESIZE;
2605 * Don't crossover into a different hmentblk.
2607 index = (int)(((uintptr_t)vaddr >> MMU_PAGESHIFT) &
2608 (NHMENTS-1));
2610 } while (index != 0 && npgs != 0);
2613 * Release the hash bucket.
2616 sfmmu_tteload_release_hashbucket(hmebp);
2621 * Construct a tte for a page:
2623 * tte_valid = 1
2624 * tte_size2 = size & TTE_SZ2_BITS (Panther and Olympus-C only)
2625 * tte_size = size
2626 * tte_nfo = attr & HAT_NOFAULT
2627 * tte_ie = attr & HAT_STRUCTURE_LE
2628 * tte_hmenum = hmenum
2629 * tte_pahi = pp->p_pagenum >> TTE_PASHIFT;
2630 * tte_palo = pp->p_pagenum & TTE_PALOMASK;
2631 * tte_ref = 1 (optimization)
2632 * tte_wr_perm = attr & PROT_WRITE;
2633 * tte_no_sync = attr & HAT_NOSYNC
2634 * tte_lock = attr & SFMMU_LOCKTTE
2635 * tte_cp = !(attr & SFMMU_UNCACHEPTTE)
2636 * tte_cv = !(attr & SFMMU_UNCACHEVTTE)
2637 * tte_e = attr & SFMMU_SIDEFFECT
2638 * tte_priv = !(attr & PROT_USER)
2639 * tte_hwwr = if nosync is set and it is writable we set the mod bit (opt)
2640 * tte_glb = 0
2642 void
2643 sfmmu_memtte(tte_t *ttep, pfn_t pfn, uint_t attr, int tte_sz)
2645 ASSERT(!(attr & ~SFMMU_LOAD_ALLATTR));
2647 ttep->tte_inthi = MAKE_TTE_INTHI(pfn, attr, tte_sz, 0 /* hmenum */);
2648 ttep->tte_intlo = MAKE_TTE_INTLO(pfn, attr, tte_sz, 0 /* hmenum */);
2650 if (TTE_IS_NOSYNC(ttep)) {
2651 TTE_SET_REF(ttep);
2652 if (TTE_IS_WRITABLE(ttep)) {
2653 TTE_SET_MOD(ttep);
2656 if (TTE_IS_NFO(ttep) && TTE_IS_EXECUTABLE(ttep)) {
2657 panic("sfmmu_memtte: can't set both NFO and EXEC bits");
2662 * This function will add a translation to the hme_blk and allocate the
2663 * hme_blk if one does not exist.
2664 * If a page structure is specified then it will add the
2665 * corresponding hment to the mapping list.
2666 * It will also update the hmenum field for the tte.
2668 * Currently this function is only used for kernel mappings.
2669 * So pass invalid region to sfmmu_tteload_array().
2671 void
2672 sfmmu_tteload(struct hat *sfmmup, tte_t *ttep, caddr_t vaddr, page_t *pp,
2673 uint_t flags)
2675 ASSERT(sfmmup == ksfmmup);
2676 (void) sfmmu_tteload_array(sfmmup, ttep, vaddr, &pp, flags,
2677 SFMMU_INVALID_SHMERID);
2681 * Load (ttep != NULL) or unload (ttep == NULL) one entry in the TSB.
2682 * Assumes that a particular page size may only be resident in one TSB.
2684 static void
2685 sfmmu_mod_tsb(sfmmu_t *sfmmup, caddr_t vaddr, tte_t *ttep, int ttesz)
2687 struct tsb_info *tsbinfop = NULL;
2688 uint64_t tag;
2689 struct tsbe *tsbe_addr;
2690 uint64_t tsb_base;
2691 uint_t tsb_size;
2692 int vpshift = MMU_PAGESHIFT;
2693 int phys = 0;
2695 if (sfmmup == ksfmmup) { /* No support for 32/256M ksfmmu pages */
2696 phys = ktsb_phys;
2697 if (ttesz >= TTE4M) {
2698 #ifndef sun4v
2699 ASSERT((ttesz != TTE32M) && (ttesz != TTE256M));
2700 #endif
2701 tsb_base = (phys)? ktsb4m_pbase : (uint64_t)ktsb4m_base;
2702 tsb_size = ktsb4m_szcode;
2703 } else {
2704 tsb_base = (phys)? ktsb_pbase : (uint64_t)ktsb_base;
2705 tsb_size = ktsb_szcode;
2707 } else {
2708 SFMMU_GET_TSBINFO(tsbinfop, sfmmup, ttesz);
2711 * If there isn't a TSB for this page size, or the TSB is
2712 * swapped out, there is nothing to do. Note that the latter
2713 * case seems impossible but can occur if hat_pageunload()
2714 * is called on an ISM mapping while the process is swapped
2715 * out.
2717 if (tsbinfop == NULL || (tsbinfop->tsb_flags & TSB_SWAPPED))
2718 return;
2721 * If another thread is in the middle of relocating a TSB
2722 * we can't unload the entry so set a flag so that the
2723 * TSB will be flushed before it can be accessed by the
2724 * process.
2726 if ((tsbinfop->tsb_flags & TSB_RELOC_FLAG) != 0) {
2727 if (ttep == NULL)
2728 tsbinfop->tsb_flags |= TSB_FLUSH_NEEDED;
2729 return;
2731 #if defined(UTSB_PHYS)
2732 phys = 1;
2733 tsb_base = (uint64_t)tsbinfop->tsb_pa;
2734 #else
2735 tsb_base = (uint64_t)tsbinfop->tsb_va;
2736 #endif
2737 tsb_size = tsbinfop->tsb_szc;
2739 if (ttesz >= TTE4M)
2740 vpshift = MMU_PAGESHIFT4M;
2742 tsbe_addr = sfmmu_get_tsbe(tsb_base, vaddr, vpshift, tsb_size);
2743 tag = sfmmu_make_tsbtag(vaddr);
2745 if (ttep == NULL) {
2746 sfmmu_unload_tsbe(tsbe_addr, tag, phys);
2747 } else {
2748 if (ttesz >= TTE4M) {
2749 SFMMU_STAT(sf_tsb_load4m);
2750 } else {
2751 SFMMU_STAT(sf_tsb_load8k);
2754 sfmmu_load_tsbe(tsbe_addr, tag, ttep, phys);
2759 * Unmap all entries from [start, end) matching the given page size.
2761 * This function is used primarily to unmap replicated 64K or 512K entries
2762 * from the TSB that are inserted using the base page size TSB pointer, but
2763 * it may also be called to unmap a range of addresses from the TSB.
2765 void
2766 sfmmu_unload_tsb_range(sfmmu_t *sfmmup, caddr_t start, caddr_t end, int ttesz)
2768 struct tsb_info *tsbinfop;
2769 uint64_t tag;
2770 struct tsbe *tsbe_addr;
2771 caddr_t vaddr;
2772 uint64_t tsb_base;
2773 int vpshift, vpgsz;
2774 uint_t tsb_size;
2775 int phys = 0;
2778 * Assumptions:
2779 * If ttesz == 8K, 64K or 512K, we walk through the range 8K
2780 * at a time shooting down any valid entries we encounter.
2782 * If ttesz >= 4M we walk the range 4M at a time shooting
2783 * down any valid mappings we find.
2785 if (sfmmup == ksfmmup) {
2786 phys = ktsb_phys;
2787 if (ttesz >= TTE4M) {
2788 #ifndef sun4v
2789 ASSERT((ttesz != TTE32M) && (ttesz != TTE256M));
2790 #endif
2791 tsb_base = (phys)? ktsb4m_pbase : (uint64_t)ktsb4m_base;
2792 tsb_size = ktsb4m_szcode;
2793 } else {
2794 tsb_base = (phys)? ktsb_pbase : (uint64_t)ktsb_base;
2795 tsb_size = ktsb_szcode;
2797 } else {
2798 SFMMU_GET_TSBINFO(tsbinfop, sfmmup, ttesz);
2801 * If there isn't a TSB for this page size, or the TSB is
2802 * swapped out, there is nothing to do. Note that the latter
2803 * case seems impossible but can occur if hat_pageunload()
2804 * is called on an ISM mapping while the process is swapped
2805 * out.
2807 if (tsbinfop == NULL || (tsbinfop->tsb_flags & TSB_SWAPPED))
2808 return;
2811 * If another thread is in the middle of relocating a TSB
2812 * we can't unload the entry so set a flag so that the
2813 * TSB will be flushed before it can be accessed by the
2814 * process.
2816 if ((tsbinfop->tsb_flags & TSB_RELOC_FLAG) != 0) {
2817 tsbinfop->tsb_flags |= TSB_FLUSH_NEEDED;
2818 return;
2820 #if defined(UTSB_PHYS)
2821 phys = 1;
2822 tsb_base = (uint64_t)tsbinfop->tsb_pa;
2823 #else
2824 tsb_base = (uint64_t)tsbinfop->tsb_va;
2825 #endif
2826 tsb_size = tsbinfop->tsb_szc;
2828 if (ttesz >= TTE4M) {
2829 vpshift = MMU_PAGESHIFT4M;
2830 vpgsz = MMU_PAGESIZE4M;
2831 } else {
2832 vpshift = MMU_PAGESHIFT;
2833 vpgsz = MMU_PAGESIZE;
2836 for (vaddr = start; vaddr < end; vaddr += vpgsz) {
2837 tag = sfmmu_make_tsbtag(vaddr);
2838 tsbe_addr = sfmmu_get_tsbe(tsb_base, vaddr, vpshift, tsb_size);
2839 sfmmu_unload_tsbe(tsbe_addr, tag, phys);
2844 * Select the optimum TSB size given the number of mappings
2845 * that need to be cached.
2847 static int
2848 sfmmu_select_tsb_szc(pgcnt_t pgcnt)
2850 int szc = 0;
2852 #ifdef DEBUG
2853 if (tsb_grow_stress) {
2854 uint32_t randval = (uint32_t)gettick() >> 4;
2855 return (randval % (tsb_max_growsize + 1));
2857 #endif /* DEBUG */
2859 while ((szc < tsb_max_growsize) && (pgcnt > SFMMU_RSS_TSBSIZE(szc)))
2860 szc++;
2861 return (szc);
2865 * This function will add a translation to the hme_blk and allocate the
2866 * hme_blk if one does not exist.
2867 * If a page structure is specified then it will add the
2868 * corresponding hment to the mapping list.
2869 * It will also update the hmenum field for the tte.
2870 * Furthermore, it attempts to create a large page translation
2871 * for <addr,hat> at page array pps. It assumes addr and first
2872 * pp is correctly aligned. It returns 0 if successful and 1 otherwise.
2874 static int
2875 sfmmu_tteload_array(sfmmu_t *sfmmup, tte_t *ttep, caddr_t vaddr,
2876 page_t **pps, uint_t flags, uint_t rid)
2878 struct hmehash_bucket *hmebp;
2879 struct hme_blk *hmeblkp;
2880 int ret;
2881 uint_t size;
2884 * Get mapping size.
2886 size = TTE_CSZ(ttep);
2887 ASSERT(!((uintptr_t)vaddr & TTE_PAGE_OFFSET(size)));
2890 * Acquire the hash bucket.
2892 hmebp = sfmmu_tteload_acquire_hashbucket(sfmmup, vaddr, size, rid);
2893 ASSERT(hmebp);
2896 * Find the hment block.
2898 hmeblkp = sfmmu_tteload_find_hmeblk(sfmmup, hmebp, vaddr, size, flags,
2899 rid);
2900 ASSERT(hmeblkp);
2903 * Add the translation.
2905 ret = sfmmu_tteload_addentry(sfmmup, hmeblkp, ttep, vaddr, pps, flags,
2906 rid);
2909 * Release the hash bucket.
2911 sfmmu_tteload_release_hashbucket(hmebp);
2913 return (ret);
2917 * Function locks and returns a pointer to the hash bucket for vaddr and size.
2919 static struct hmehash_bucket *
2920 sfmmu_tteload_acquire_hashbucket(sfmmu_t *sfmmup, caddr_t vaddr, int size,
2921 uint_t rid)
2923 struct hmehash_bucket *hmebp;
2924 int hmeshift;
2925 void *htagid = sfmmutohtagid(sfmmup, rid);
2927 ASSERT(htagid != NULL);
2929 hmeshift = HME_HASH_SHIFT(size);
2931 hmebp = HME_HASH_FUNCTION(htagid, vaddr, hmeshift);
2933 SFMMU_HASH_LOCK(hmebp);
2935 return (hmebp);
2939 * Function returns a pointer to an hmeblk in the hash bucket, hmebp. If the
2940 * hmeblk doesn't exists for the [sfmmup, vaddr & size] signature, a hmeblk is
2941 * allocated.
2943 static struct hme_blk *
2944 sfmmu_tteload_find_hmeblk(sfmmu_t *sfmmup, struct hmehash_bucket *hmebp,
2945 caddr_t vaddr, uint_t size, uint_t flags, uint_t rid)
2947 hmeblk_tag hblktag;
2948 int hmeshift;
2949 struct hme_blk *hmeblkp, *pr_hblk, *list = NULL;
2951 SFMMU_VALIDATE_HMERID(sfmmup, rid, vaddr, TTEBYTES(size));
2953 hblktag.htag_id = sfmmutohtagid(sfmmup, rid);
2954 ASSERT(hblktag.htag_id != NULL);
2955 hmeshift = HME_HASH_SHIFT(size);
2956 hblktag.htag_bspage = HME_HASH_BSPAGE(vaddr, hmeshift);
2957 hblktag.htag_rehash = HME_HASH_REHASH(size);
2958 hblktag.htag_rid = rid;
2960 ttearray_realloc:
2962 HME_HASH_SEARCH_PREV(hmebp, hblktag, hmeblkp, pr_hblk, &list);
2965 * We block until hblk_reserve_lock is released; it's held by
2966 * the thread, temporarily using hblk_reserve, until hblk_reserve is
2967 * replaced by a hblk from sfmmu8_cache.
2969 if (hmeblkp == (struct hme_blk *)hblk_reserve &&
2970 hblk_reserve_thread != curthread) {
2971 SFMMU_HASH_UNLOCK(hmebp);
2972 mutex_enter(&hblk_reserve_lock);
2973 mutex_exit(&hblk_reserve_lock);
2974 SFMMU_STAT(sf_hblk_reserve_hit);
2975 SFMMU_HASH_LOCK(hmebp);
2976 goto ttearray_realloc;
2979 if (hmeblkp == NULL) {
2980 hmeblkp = sfmmu_hblk_alloc(sfmmup, vaddr, hmebp, size,
2981 hblktag, flags, rid);
2982 ASSERT(!SFMMU_IS_SHMERID_VALID(rid) || hmeblkp->hblk_shared);
2983 ASSERT(SFMMU_IS_SHMERID_VALID(rid) || !hmeblkp->hblk_shared);
2984 } else {
2986 * It is possible for 8k and 64k hblks to collide since they
2987 * have the same rehash value. This is because we
2988 * lazily free hblks and 8K/64K blks could be lingering.
2989 * If we find size mismatch we free the block and & try again.
2991 if (get_hblk_ttesz(hmeblkp) != size) {
2992 ASSERT(!hmeblkp->hblk_vcnt);
2993 ASSERT(!hmeblkp->hblk_hmecnt);
2994 sfmmu_hblk_hash_rm(hmebp, hmeblkp, pr_hblk,
2995 &list, 0);
2996 goto ttearray_realloc;
2998 if (hmeblkp->hblk_shw_bit) {
3000 * if the hblk was previously used as a shadow hblk then
3001 * we will change it to a normal hblk
3003 ASSERT(!hmeblkp->hblk_shared);
3004 if (hmeblkp->hblk_shw_mask) {
3005 sfmmu_shadow_hcleanup(sfmmup, hmeblkp, hmebp);
3006 ASSERT(SFMMU_HASH_LOCK_ISHELD(hmebp));
3007 goto ttearray_realloc;
3008 } else {
3009 hmeblkp->hblk_shw_bit = 0;
3012 SFMMU_STAT(sf_hblk_hit);
3016 * hat_memload() should never call kmem_cache_free() for kernel hmeblks;
3017 * see block comment showing the stacktrace in sfmmu_hblk_alloc();
3018 * set the flag parameter to 1 so that sfmmu_hblks_list_purge() will
3019 * just add these hmeblks to the per-cpu pending queue.
3021 sfmmu_hblks_list_purge(&list, 1);
3023 ASSERT(get_hblk_ttesz(hmeblkp) == size);
3024 ASSERT(!hmeblkp->hblk_shw_bit);
3025 ASSERT(!SFMMU_IS_SHMERID_VALID(rid) || hmeblkp->hblk_shared);
3026 ASSERT(SFMMU_IS_SHMERID_VALID(rid) || !hmeblkp->hblk_shared);
3027 ASSERT(hmeblkp->hblk_tag.htag_rid == rid);
3029 return (hmeblkp);
3033 * Function adds a tte entry into the hmeblk. It returns 0 if successful and 1
3034 * otherwise.
3036 static int
3037 sfmmu_tteload_addentry(sfmmu_t *sfmmup, struct hme_blk *hmeblkp, tte_t *ttep,
3038 caddr_t vaddr, page_t **pps, uint_t flags, uint_t rid)
3040 page_t *pp = *pps;
3041 int hmenum, size, remap;
3042 tte_t tteold, flush_tte;
3043 #ifdef DEBUG
3044 tte_t orig_old;
3045 #endif /* DEBUG */
3046 struct sf_hment *sfhme;
3047 kmutex_t *pml, *pmtx;
3048 hatlock_t *hatlockp;
3049 int myflt;
3052 * remove this panic when we decide to let user virtual address
3053 * space be >= USERLIMIT.
3055 if (!TTE_IS_PRIVILEGED(ttep) && vaddr >= (caddr_t)USERLIMIT)
3056 panic("user addr %p in kernel space", (void *)vaddr);
3057 #if defined(TTE_IS_GLOBAL)
3058 if (TTE_IS_GLOBAL(ttep))
3059 panic("sfmmu_tteload: creating global tte");
3060 #endif
3062 #ifdef DEBUG
3063 if (pf_is_memory(sfmmu_ttetopfn(ttep, vaddr)) &&
3064 !TTE_IS_PCACHEABLE(ttep) && !sfmmu_allow_nc_trans)
3065 panic("sfmmu_tteload: non cacheable memory tte");
3066 #endif /* DEBUG */
3068 /* don't simulate dirty bit for writeable ISM/DISM mappings */
3069 if ((flags & HAT_LOAD_SHARE) && TTE_IS_WRITABLE(ttep)) {
3070 TTE_SET_REF(ttep);
3071 TTE_SET_MOD(ttep);
3074 if ((flags & HAT_LOAD_SHARE) || !TTE_IS_REF(ttep) ||
3075 !TTE_IS_MOD(ttep)) {
3077 * Don't load TSB for dummy as in ISM. Also don't preload
3078 * the TSB if the TTE isn't writable since we're likely to
3079 * fault on it again -- preloading can be fairly expensive.
3081 flags |= SFMMU_NO_TSBLOAD;
3084 size = TTE_CSZ(ttep);
3085 switch (size) {
3086 case TTE8K:
3087 SFMMU_STAT(sf_tteload8k);
3088 break;
3089 case TTE64K:
3090 SFMMU_STAT(sf_tteload64k);
3091 break;
3092 case TTE512K:
3093 SFMMU_STAT(sf_tteload512k);
3094 break;
3095 case TTE4M:
3096 SFMMU_STAT(sf_tteload4m);
3097 break;
3098 case (TTE32M):
3099 SFMMU_STAT(sf_tteload32m);
3100 ASSERT(mmu_page_sizes == max_mmu_page_sizes);
3101 break;
3102 case (TTE256M):
3103 SFMMU_STAT(sf_tteload256m);
3104 ASSERT(mmu_page_sizes == max_mmu_page_sizes);
3105 break;
3108 ASSERT(!((uintptr_t)vaddr & TTE_PAGE_OFFSET(size)));
3109 SFMMU_VALIDATE_HMERID(sfmmup, rid, vaddr, TTEBYTES(size));
3110 ASSERT(!SFMMU_IS_SHMERID_VALID(rid) || hmeblkp->hblk_shared);
3111 ASSERT(SFMMU_IS_SHMERID_VALID(rid) || !hmeblkp->hblk_shared);
3113 HBLKTOHME_IDX(sfhme, hmeblkp, vaddr, hmenum);
3116 * Need to grab mlist lock here so that pageunload
3117 * will not change tte behind us.
3119 if (pp) {
3120 pml = sfmmu_mlist_enter(pp);
3123 sfmmu_copytte(&sfhme->hme_tte, &tteold);
3125 * Look for corresponding hment and if valid verify
3126 * pfns are equal.
3128 remap = TTE_IS_VALID(&tteold);
3129 if (remap) {
3130 pfn_t new_pfn, old_pfn;
3132 old_pfn = TTE_TO_PFN(vaddr, &tteold);
3133 new_pfn = TTE_TO_PFN(vaddr, ttep);
3135 if (flags & HAT_LOAD_REMAP) {
3136 /* make sure we are remapping same type of pages */
3137 if (pf_is_memory(old_pfn) != pf_is_memory(new_pfn)) {
3138 panic("sfmmu_tteload - tte remap io<->memory");
3140 if (old_pfn != new_pfn &&
3141 (pp != NULL || sfhme->hme_page != NULL)) {
3142 panic("sfmmu_tteload - tte remap pp != NULL");
3144 } else if (old_pfn != new_pfn) {
3145 panic("sfmmu_tteload - tte remap, hmeblkp 0x%p",
3146 (void *)hmeblkp);
3148 ASSERT(TTE_CSZ(&tteold) == TTE_CSZ(ttep));
3151 if (pp) {
3152 if (size == TTE8K) {
3153 #ifdef VAC
3155 * Handle VAC consistency
3157 if (!remap && (cache & CACHE_VAC) && !PP_ISNC(pp)) {
3158 sfmmu_vac_conflict(sfmmup, vaddr, pp);
3160 #endif
3162 if (TTE_IS_WRITABLE(ttep) && PP_ISRO(pp)) {
3163 pmtx = sfmmu_page_enter(pp);
3164 PP_CLRRO(pp);
3165 sfmmu_page_exit(pmtx);
3166 } else if (!PP_ISMAPPED(pp) &&
3167 (!TTE_IS_WRITABLE(ttep)) && !(PP_ISMOD(pp))) {
3168 pmtx = sfmmu_page_enter(pp);
3169 if (!(PP_ISMOD(pp))) {
3170 PP_SETRO(pp);
3172 sfmmu_page_exit(pmtx);
3175 } else if (sfmmu_pagearray_setup(vaddr, pps, ttep, remap)) {
3177 * sfmmu_pagearray_setup failed so return
3179 sfmmu_mlist_exit(pml);
3180 return (1);
3185 * Make sure hment is not on a mapping list.
3187 ASSERT(remap || (sfhme->hme_page == NULL));
3189 /* if it is not a remap then hme->next better be NULL */
3190 ASSERT((!remap) ? sfhme->hme_next == NULL : 1);
3192 if (flags & HAT_LOAD_LOCK) {
3193 if ((hmeblkp->hblk_lckcnt + 1) >= MAX_HBLK_LCKCNT) {
3194 panic("too high lckcnt-hmeblk %p",
3195 (void *)hmeblkp);
3197 atomic_inc_32(&hmeblkp->hblk_lckcnt);
3199 HBLK_STACK_TRACE(hmeblkp, HBLK_LOCK);
3202 #ifdef VAC
3203 if (pp && PP_ISNC(pp)) {
3205 * If the physical page is marked to be uncacheable, like
3206 * by a vac conflict, make sure the new mapping is also
3207 * uncacheable.
3209 TTE_CLR_VCACHEABLE(ttep);
3210 ASSERT(PP_GET_VCOLOR(pp) == NO_VCOLOR);
3212 #endif
3213 ttep->tte_hmenum = hmenum;
3215 #ifdef DEBUG
3216 orig_old = tteold;
3217 #endif /* DEBUG */
3219 while (sfmmu_modifytte_try(&tteold, ttep, &sfhme->hme_tte) < 0) {
3220 if ((sfmmup == KHATID) &&
3221 (flags & (HAT_LOAD_LOCK | HAT_LOAD_REMAP))) {
3222 sfmmu_copytte(&sfhme->hme_tte, &tteold);
3224 #ifdef DEBUG
3225 chk_tte(&orig_old, &tteold, ttep, hmeblkp);
3226 #endif /* DEBUG */
3228 ASSERT(TTE_IS_VALID(&sfhme->hme_tte));
3230 if (!TTE_IS_VALID(&tteold)) {
3232 atomic_inc_16(&hmeblkp->hblk_vcnt);
3233 if (rid == SFMMU_INVALID_SHMERID) {
3234 atomic_inc_ulong(&sfmmup->sfmmu_ttecnt[size]);
3235 } else {
3236 sf_srd_t *srdp = sfmmup->sfmmu_srdp;
3237 sf_region_t *rgnp = srdp->srd_hmergnp[rid];
3239 * We already accounted for region ttecnt's in sfmmu
3240 * during hat_join_region() processing. Here we
3241 * only update ttecnt's in region struture.
3243 atomic_inc_ulong(&rgnp->rgn_ttecnt[size]);
3247 myflt = (astosfmmu(curthread->t_procp->p_as) == sfmmup);
3248 if (size > TTE8K && (flags & HAT_LOAD_SHARE) == 0 &&
3249 sfmmup != ksfmmup) {
3250 uchar_t tteflag = 1 << size;
3251 if (rid == SFMMU_INVALID_SHMERID) {
3252 if (!(sfmmup->sfmmu_tteflags & tteflag)) {
3253 hatlockp = sfmmu_hat_enter(sfmmup);
3254 sfmmup->sfmmu_tteflags |= tteflag;
3255 sfmmu_hat_exit(hatlockp);
3257 } else if (!(sfmmup->sfmmu_rtteflags & tteflag)) {
3258 hatlockp = sfmmu_hat_enter(sfmmup);
3259 sfmmup->sfmmu_rtteflags |= tteflag;
3260 sfmmu_hat_exit(hatlockp);
3263 * Update the current CPU tsbmiss area, so the current thread
3264 * won't need to take the tsbmiss for the new pagesize.
3265 * The other threads in the process will update their tsb
3266 * miss area lazily in sfmmu_tsbmiss_exception() when they
3267 * fail to find the translation for a newly added pagesize.
3269 if (size > TTE64K && myflt) {
3270 struct tsbmiss *tsbmp;
3271 kpreempt_disable();
3272 tsbmp = &tsbmiss_area[CPU->cpu_id];
3273 if (rid == SFMMU_INVALID_SHMERID) {
3274 if (!(tsbmp->uhat_tteflags & tteflag)) {
3275 tsbmp->uhat_tteflags |= tteflag;
3277 } else {
3278 if (!(tsbmp->uhat_rtteflags & tteflag)) {
3279 tsbmp->uhat_rtteflags |= tteflag;
3282 kpreempt_enable();
3286 if (size >= TTE4M && (flags & HAT_LOAD_TEXT) &&
3287 !SFMMU_FLAGS_ISSET(sfmmup, HAT_4MTEXT_FLAG)) {
3288 hatlockp = sfmmu_hat_enter(sfmmup);
3289 SFMMU_FLAGS_SET(sfmmup, HAT_4MTEXT_FLAG);
3290 sfmmu_hat_exit(hatlockp);
3293 flush_tte.tte_intlo = (tteold.tte_intlo ^ ttep->tte_intlo) &
3294 hw_tte.tte_intlo;
3295 flush_tte.tte_inthi = (tteold.tte_inthi ^ ttep->tte_inthi) &
3296 hw_tte.tte_inthi;
3298 if (remap && (flush_tte.tte_inthi || flush_tte.tte_intlo)) {
3300 * If remap and new tte differs from old tte we need
3301 * to sync the mod bit and flush TLB/TSB. We don't
3302 * need to sync ref bit because we currently always set
3303 * ref bit in tteload.
3305 ASSERT(TTE_IS_REF(ttep));
3306 if (TTE_IS_MOD(&tteold)) {
3307 sfmmu_ttesync(sfmmup, vaddr, &tteold, pp);
3310 * hwtte bits shouldn't change for SRD hmeblks as long as SRD
3311 * hmes are only used for read only text. Adding this code for
3312 * completeness and future use of shared hmeblks with writable
3313 * mappings of VMODSORT vnodes.
3315 if (hmeblkp->hblk_shared) {
3316 cpuset_t cpuset = sfmmu_rgntlb_demap(vaddr,
3317 sfmmup->sfmmu_srdp->srd_hmergnp[rid], hmeblkp, 1);
3318 xt_sync(cpuset);
3319 SFMMU_STAT_ADD(sf_region_remap_demap, 1);
3320 } else {
3321 sfmmu_tlb_demap(vaddr, sfmmup, hmeblkp, 0, 0);
3322 xt_sync(sfmmup->sfmmu_cpusran);
3326 if ((flags & SFMMU_NO_TSBLOAD) == 0) {
3328 * We only preload 8K and 4M mappings into the TSB, since
3329 * 64K and 512K mappings are replicated and hence don't
3330 * have a single, unique TSB entry. Ditto for 32M/256M.
3332 if (size == TTE8K || size == TTE4M) {
3333 sf_scd_t *scdp;
3334 hatlockp = sfmmu_hat_enter(sfmmup);
3336 * Don't preload private TSB if the mapping is used
3337 * by the shctx in the SCD.
3339 scdp = sfmmup->sfmmu_scdp;
3340 if (rid == SFMMU_INVALID_SHMERID || scdp == NULL ||
3341 !SF_RGNMAP_TEST(scdp->scd_hmeregion_map, rid)) {
3342 sfmmu_load_tsb(sfmmup, vaddr, &sfhme->hme_tte,
3343 size);
3345 sfmmu_hat_exit(hatlockp);
3348 if (pp) {
3349 if (!remap) {
3350 HME_ADD(sfhme, pp);
3351 atomic_inc_16(&hmeblkp->hblk_hmecnt);
3352 ASSERT(hmeblkp->hblk_hmecnt > 0);
3355 * Cannot ASSERT(hmeblkp->hblk_hmecnt <= NHMENTS)
3356 * see pageunload() for comment.
3359 sfmmu_mlist_exit(pml);
3362 return (0);
3365 * Function unlocks hash bucket.
3367 static void
3368 sfmmu_tteload_release_hashbucket(struct hmehash_bucket *hmebp)
3370 ASSERT(SFMMU_HASH_LOCK_ISHELD(hmebp));
3371 SFMMU_HASH_UNLOCK(hmebp);
3375 * function which checks and sets up page array for a large
3376 * translation. Will set p_vcolor, p_index, p_ro fields.
3377 * Assumes addr and pfnum of first page are properly aligned.
3378 * Will check for physical contiguity. If check fails it return
3379 * non null.
3381 static int
3382 sfmmu_pagearray_setup(caddr_t addr, page_t **pps, tte_t *ttep, int remap)
3384 int i, index, ttesz;
3385 pfn_t pfnum;
3386 pgcnt_t npgs;
3387 page_t *pp, *pp1;
3388 kmutex_t *pmtx;
3389 #ifdef VAC
3390 int osz;
3391 int cflags = 0;
3392 int vac_err = 0;
3393 #endif
3394 int newidx = 0;
3396 ttesz = TTE_CSZ(ttep);
3398 ASSERT(ttesz > TTE8K);
3400 npgs = TTEPAGES(ttesz);
3401 index = PAGESZ_TO_INDEX(ttesz);
3403 pfnum = (*pps)->p_pagenum;
3404 ASSERT(IS_P2ALIGNED(pfnum, npgs));
3407 * Save the first pp so we can do HAT_TMPNC at the end.
3409 pp1 = *pps;
3410 #ifdef VAC
3411 osz = fnd_mapping_sz(pp1);
3412 #endif
3414 for (i = 0; i < npgs; i++, pps++) {
3415 pp = *pps;
3416 ASSERT(PAGE_LOCKED(pp));
3417 ASSERT(pp->p_szc >= ttesz);
3418 ASSERT(pp->p_szc == pp1->p_szc);
3419 ASSERT(sfmmu_mlist_held(pp));
3422 * XXX is it possible to maintain P_RO on the root only?
3424 if (TTE_IS_WRITABLE(ttep) && PP_ISRO(pp)) {
3425 pmtx = sfmmu_page_enter(pp);
3426 PP_CLRRO(pp);
3427 sfmmu_page_exit(pmtx);
3428 } else if (!PP_ISMAPPED(pp) && !TTE_IS_WRITABLE(ttep) &&
3429 !PP_ISMOD(pp)) {
3430 pmtx = sfmmu_page_enter(pp);
3431 if (!(PP_ISMOD(pp))) {
3432 PP_SETRO(pp);
3434 sfmmu_page_exit(pmtx);
3438 * If this is a remap we skip vac & contiguity checks.
3440 if (remap)
3441 continue;
3444 * set p_vcolor and detect any vac conflicts.
3446 #ifdef VAC
3447 if (vac_err == 0) {
3448 vac_err = sfmmu_vacconflict_array(addr, pp, &cflags);
3451 #endif
3454 * Save current index in case we need to undo it.
3455 * Note: "PAGESZ_TO_INDEX(sz) (1 << (sz))"
3456 * "SFMMU_INDEX_SHIFT 6"
3457 * "SFMMU_INDEX_MASK ((1 << SFMMU_INDEX_SHIFT) - 1)"
3458 * "PP_MAPINDEX(p_index) (p_index & SFMMU_INDEX_MASK)"
3460 * So: index = PAGESZ_TO_INDEX(ttesz);
3461 * if ttesz == 1 then index = 0x2
3462 * 2 then index = 0x4
3463 * 3 then index = 0x8
3464 * 4 then index = 0x10
3465 * 5 then index = 0x20
3466 * The code below checks if it's a new pagesize (ie, newidx)
3467 * in case we need to take it back out of p_index,
3468 * and then or's the new index into the existing index.
3470 if ((PP_MAPINDEX(pp) & index) == 0)
3471 newidx = 1;
3472 pp->p_index = (PP_MAPINDEX(pp) | index);
3475 * contiguity check
3477 if (pp->p_pagenum != pfnum) {
3479 * If we fail the contiguity test then
3480 * the only thing we need to fix is the p_index field.
3481 * We might get a few extra flushes but since this
3482 * path is rare that is ok. The p_ro field will
3483 * get automatically fixed on the next tteload to
3484 * the page. NO TNC bit is set yet.
3486 while (i >= 0) {
3487 pp = *pps;
3488 if (newidx)
3489 pp->p_index = (PP_MAPINDEX(pp) &
3490 ~index);
3491 pps--;
3492 i--;
3494 return (1);
3496 pfnum++;
3497 addr += MMU_PAGESIZE;
3500 #ifdef VAC
3501 if (vac_err) {
3502 if (ttesz > osz) {
3504 * There are some smaller mappings that causes vac
3505 * conflicts. Convert all existing small mappings to
3506 * TNC.
3508 SFMMU_STAT_ADD(sf_uncache_conflict, npgs);
3509 sfmmu_page_cache_array(pp1, HAT_TMPNC, CACHE_FLUSH,
3510 npgs);
3511 } else {
3512 /* EMPTY */
3514 * If there exists an big page mapping,
3515 * that means the whole existing big page
3516 * has TNC setting already. No need to covert to
3517 * TNC again.
3519 ASSERT(PP_ISTNC(pp1));
3522 #endif /* VAC */
3524 return (0);
3527 #ifdef VAC
3529 * Routine that detects vac consistency for a large page. It also
3530 * sets virtual color for all pp's for this big mapping.
3532 static int
3533 sfmmu_vacconflict_array(caddr_t addr, page_t *pp, int *cflags)
3535 int vcolor, ocolor;
3537 ASSERT(sfmmu_mlist_held(pp));
3539 if (PP_ISNC(pp)) {
3540 return (HAT_TMPNC);
3543 vcolor = addr_to_vcolor(addr);
3544 if (PP_NEWPAGE(pp)) {
3545 PP_SET_VCOLOR(pp, vcolor);
3546 return (0);
3549 ocolor = PP_GET_VCOLOR(pp);
3550 if (ocolor == vcolor) {
3551 return (0);
3554 if (!PP_ISMAPPED(pp) && !PP_ISMAPPED_KPM(pp)) {
3556 * Previous user of page had a differnet color
3557 * but since there are no current users
3558 * we just flush the cache and change the color.
3559 * As an optimization for large pages we flush the
3560 * entire cache of that color and set a flag.
3562 SFMMU_STAT(sf_pgcolor_conflict);
3563 if (!CacheColor_IsFlushed(*cflags, ocolor)) {
3564 CacheColor_SetFlushed(*cflags, ocolor);
3565 sfmmu_cache_flushcolor(ocolor, pp->p_pagenum);
3567 PP_SET_VCOLOR(pp, vcolor);
3568 return (0);
3572 * We got a real conflict with a current mapping.
3573 * set flags to start unencaching all mappings
3574 * and return failure so we restart looping
3575 * the pp array from the beginning.
3577 return (HAT_TMPNC);
3579 #endif /* VAC */
3582 * creates a large page shadow hmeblk for a tte.
3583 * The purpose of this routine is to allow us to do quick unloads because
3584 * the vm layer can easily pass a very large but sparsely populated range.
3586 static struct hme_blk *
3587 sfmmu_shadow_hcreate(sfmmu_t *sfmmup, caddr_t vaddr, int ttesz, uint_t flags)
3589 struct hmehash_bucket *hmebp;
3590 hmeblk_tag hblktag;
3591 int hmeshift, size, vshift;
3592 uint_t shw_mask, newshw_mask;
3593 struct hme_blk *hmeblkp;
3595 ASSERT(sfmmup != KHATID);
3596 if (mmu_page_sizes == max_mmu_page_sizes) {
3597 ASSERT(ttesz < TTE256M);
3598 } else {
3599 ASSERT(ttesz < TTE4M);
3600 ASSERT(sfmmup->sfmmu_ttecnt[TTE32M] == 0);
3601 ASSERT(sfmmup->sfmmu_ttecnt[TTE256M] == 0);
3604 if (ttesz == TTE8K) {
3605 size = TTE512K;
3606 } else {
3607 size = ++ttesz;
3610 hblktag.htag_id = sfmmup;
3611 hmeshift = HME_HASH_SHIFT(size);
3612 hblktag.htag_bspage = HME_HASH_BSPAGE(vaddr, hmeshift);
3613 hblktag.htag_rehash = HME_HASH_REHASH(size);
3614 hblktag.htag_rid = SFMMU_INVALID_SHMERID;
3615 hmebp = HME_HASH_FUNCTION(sfmmup, vaddr, hmeshift);
3617 SFMMU_HASH_LOCK(hmebp);
3619 HME_HASH_FAST_SEARCH(hmebp, hblktag, hmeblkp);
3620 ASSERT(hmeblkp != (struct hme_blk *)hblk_reserve);
3621 if (hmeblkp == NULL) {
3622 hmeblkp = sfmmu_hblk_alloc(sfmmup, vaddr, hmebp, size,
3623 hblktag, flags, SFMMU_INVALID_SHMERID);
3625 ASSERT(hmeblkp);
3626 if (!hmeblkp->hblk_shw_mask) {
3628 * if this is a unused hblk it was just allocated or could
3629 * potentially be a previous large page hblk so we need to
3630 * set the shadow bit.
3632 ASSERT(!hmeblkp->hblk_vcnt && !hmeblkp->hblk_hmecnt);
3633 hmeblkp->hblk_shw_bit = 1;
3634 } else if (hmeblkp->hblk_shw_bit == 0) {
3635 panic("sfmmu_shadow_hcreate: shw bit not set in hmeblkp 0x%p",
3636 (void *)hmeblkp);
3638 ASSERT(hmeblkp->hblk_shw_bit == 1);
3639 ASSERT(!hmeblkp->hblk_shared);
3640 vshift = vaddr_to_vshift(hblktag, vaddr, size);
3641 ASSERT(vshift < 8);
3643 * Atomically set shw mask bit
3645 do {
3646 shw_mask = hmeblkp->hblk_shw_mask;
3647 newshw_mask = shw_mask | (1 << vshift);
3648 newshw_mask = atomic_cas_32(&hmeblkp->hblk_shw_mask, shw_mask,
3649 newshw_mask);
3650 } while (newshw_mask != shw_mask);
3652 SFMMU_HASH_UNLOCK(hmebp);
3654 return (hmeblkp);
3658 * This routine cleanup a previous shadow hmeblk and changes it to
3659 * a regular hblk. This happens rarely but it is possible
3660 * when a process wants to use large pages and there are hblks still
3661 * lying around from the previous as that used these hmeblks.
3662 * The alternative was to cleanup the shadow hblks at unload time
3663 * but since so few user processes actually use large pages, it is
3664 * better to be lazy and cleanup at this time.
3666 static void
3667 sfmmu_shadow_hcleanup(sfmmu_t *sfmmup, struct hme_blk *hmeblkp,
3668 struct hmehash_bucket *hmebp)
3670 caddr_t addr, endaddr;
3671 int hashno, size;
3673 ASSERT(hmeblkp->hblk_shw_bit);
3674 ASSERT(!hmeblkp->hblk_shared);
3676 ASSERT(SFMMU_HASH_LOCK_ISHELD(hmebp));
3678 if (!hmeblkp->hblk_shw_mask) {
3679 hmeblkp->hblk_shw_bit = 0;
3680 return;
3682 addr = (caddr_t)get_hblk_base(hmeblkp);
3683 endaddr = get_hblk_endaddr(hmeblkp);
3684 size = get_hblk_ttesz(hmeblkp);
3685 hashno = size - 1;
3686 ASSERT(hashno > 0);
3687 SFMMU_HASH_UNLOCK(hmebp);
3689 sfmmu_free_hblks(sfmmup, addr, endaddr, hashno);
3691 SFMMU_HASH_LOCK(hmebp);
3694 static void
3695 sfmmu_free_hblks(sfmmu_t *sfmmup, caddr_t addr, caddr_t endaddr,
3696 int hashno)
3698 int hmeshift, shadow = 0;
3699 hmeblk_tag hblktag;
3700 struct hmehash_bucket *hmebp;
3701 struct hme_blk *hmeblkp;
3702 struct hme_blk *nx_hblk, *pr_hblk, *list = NULL;
3704 ASSERT(hashno > 0);
3705 hblktag.htag_id = sfmmup;
3706 hblktag.htag_rehash = hashno;
3707 hblktag.htag_rid = SFMMU_INVALID_SHMERID;
3709 hmeshift = HME_HASH_SHIFT(hashno);
3711 while (addr < endaddr) {
3712 hblktag.htag_bspage = HME_HASH_BSPAGE(addr, hmeshift);
3713 hmebp = HME_HASH_FUNCTION(sfmmup, addr, hmeshift);
3714 SFMMU_HASH_LOCK(hmebp);
3715 /* inline HME_HASH_SEARCH */
3716 hmeblkp = hmebp->hmeblkp;
3717 pr_hblk = NULL;
3718 while (hmeblkp) {
3719 if (HTAGS_EQ(hmeblkp->hblk_tag, hblktag)) {
3720 /* found hme_blk */
3721 ASSERT(!hmeblkp->hblk_shared);
3722 if (hmeblkp->hblk_shw_bit) {
3723 if (hmeblkp->hblk_shw_mask) {
3724 shadow = 1;
3725 sfmmu_shadow_hcleanup(sfmmup,
3726 hmeblkp, hmebp);
3727 break;
3728 } else {
3729 hmeblkp->hblk_shw_bit = 0;
3734 * Hblk_hmecnt and hblk_vcnt could be non zero
3735 * since hblk_unload() does not gurantee that.
3737 * XXX - this could cause tteload() to spin
3738 * where sfmmu_shadow_hcleanup() is called.
3742 nx_hblk = hmeblkp->hblk_next;
3743 if (!hmeblkp->hblk_vcnt && !hmeblkp->hblk_hmecnt) {
3744 sfmmu_hblk_hash_rm(hmebp, hmeblkp, pr_hblk,
3745 &list, 0);
3746 } else {
3747 pr_hblk = hmeblkp;
3749 hmeblkp = nx_hblk;
3752 SFMMU_HASH_UNLOCK(hmebp);
3754 if (shadow) {
3756 * We found another shadow hblk so cleaned its
3757 * children. We need to go back and cleanup
3758 * the original hblk so we don't change the
3759 * addr.
3761 shadow = 0;
3762 } else {
3763 addr = (caddr_t)roundup((uintptr_t)addr + 1,
3764 (1 << hmeshift));
3767 sfmmu_hblks_list_purge(&list, 0);
3771 * This routine's job is to delete stale invalid shared hmeregions hmeblks that
3772 * may still linger on after pageunload.
3774 static void
3775 sfmmu_cleanup_rhblk(sf_srd_t *srdp, caddr_t addr, uint_t rid, int ttesz)
3777 int hmeshift;
3778 hmeblk_tag hblktag;
3779 struct hmehash_bucket *hmebp;
3780 struct hme_blk *hmeblkp;
3781 struct hme_blk *pr_hblk;
3782 struct hme_blk *list = NULL;
3784 ASSERT(SFMMU_IS_SHMERID_VALID(rid));
3785 ASSERT(rid < SFMMU_MAX_HME_REGIONS);
3787 hmeshift = HME_HASH_SHIFT(ttesz);
3788 hblktag.htag_bspage = HME_HASH_BSPAGE(addr, hmeshift);
3789 hblktag.htag_rehash = ttesz;
3790 hblktag.htag_rid = rid;
3791 hblktag.htag_id = srdp;
3792 hmebp = HME_HASH_FUNCTION(srdp, addr, hmeshift);
3794 SFMMU_HASH_LOCK(hmebp);
3795 HME_HASH_SEARCH_PREV(hmebp, hblktag, hmeblkp, pr_hblk, &list);
3796 if (hmeblkp != NULL) {
3797 ASSERT(hmeblkp->hblk_shared);
3798 ASSERT(!hmeblkp->hblk_shw_bit);
3799 if (hmeblkp->hblk_vcnt || hmeblkp->hblk_hmecnt) {
3800 panic("sfmmu_cleanup_rhblk: valid hmeblk");
3802 ASSERT(!hmeblkp->hblk_lckcnt);
3803 sfmmu_hblk_hash_rm(hmebp, hmeblkp, pr_hblk,
3804 &list, 0);
3806 SFMMU_HASH_UNLOCK(hmebp);
3807 sfmmu_hblks_list_purge(&list, 0);
3810 /* ARGSUSED */
3811 static void
3812 sfmmu_rgn_cb_noop(caddr_t saddr, caddr_t eaddr, caddr_t r_saddr,
3813 size_t r_size, void *r_obj, u_offset_t r_objoff)
3818 * Searches for an hmeblk which maps addr, then unloads this mapping
3819 * and updates *eaddrp, if the hmeblk is found.
3821 static void
3822 sfmmu_unload_hmeregion_va(sf_srd_t *srdp, uint_t rid, caddr_t addr,
3823 caddr_t eaddr, int ttesz, caddr_t *eaddrp)
3825 int hmeshift;
3826 hmeblk_tag hblktag;
3827 struct hmehash_bucket *hmebp;
3828 struct hme_blk *hmeblkp;
3829 struct hme_blk *pr_hblk;
3830 struct hme_blk *list = NULL;
3832 ASSERT(SFMMU_IS_SHMERID_VALID(rid));
3833 ASSERT(rid < SFMMU_MAX_HME_REGIONS);
3834 ASSERT(ttesz >= HBLK_MIN_TTESZ);
3836 hmeshift = HME_HASH_SHIFT(ttesz);
3837 hblktag.htag_bspage = HME_HASH_BSPAGE(addr, hmeshift);
3838 hblktag.htag_rehash = ttesz;
3839 hblktag.htag_rid = rid;
3840 hblktag.htag_id = srdp;
3841 hmebp = HME_HASH_FUNCTION(srdp, addr, hmeshift);
3843 SFMMU_HASH_LOCK(hmebp);
3844 HME_HASH_SEARCH_PREV(hmebp, hblktag, hmeblkp, pr_hblk, &list);
3845 if (hmeblkp != NULL) {
3846 ASSERT(hmeblkp->hblk_shared);
3847 ASSERT(!hmeblkp->hblk_lckcnt);
3848 if (hmeblkp->hblk_vcnt || hmeblkp->hblk_hmecnt) {
3849 *eaddrp = sfmmu_hblk_unload(NULL, hmeblkp, addr,
3850 eaddr, NULL, HAT_UNLOAD);
3851 ASSERT(*eaddrp > addr);
3853 ASSERT(!hmeblkp->hblk_vcnt && !hmeblkp->hblk_hmecnt);
3854 sfmmu_hblk_hash_rm(hmebp, hmeblkp, pr_hblk,
3855 &list, 0);
3857 SFMMU_HASH_UNLOCK(hmebp);
3858 sfmmu_hblks_list_purge(&list, 0);
3861 static void
3862 sfmmu_unload_hmeregion(sf_srd_t *srdp, sf_region_t *rgnp)
3864 int ttesz = rgnp->rgn_pgszc;
3865 size_t rsz = rgnp->rgn_size;
3866 caddr_t rsaddr = rgnp->rgn_saddr;
3867 caddr_t readdr = rsaddr + rsz;
3868 caddr_t rhsaddr;
3869 caddr_t va;
3870 uint_t rid = rgnp->rgn_id;
3871 caddr_t cbsaddr;
3872 caddr_t cbeaddr;
3873 hat_rgn_cb_func_t rcbfunc;
3874 ulong_t cnt;
3876 ASSERT(SFMMU_IS_SHMERID_VALID(rid));
3877 ASSERT(rid < SFMMU_MAX_HME_REGIONS);
3879 ASSERT(IS_P2ALIGNED(rsaddr, TTEBYTES(ttesz)));
3880 ASSERT(IS_P2ALIGNED(rsz, TTEBYTES(ttesz)));
3881 if (ttesz < HBLK_MIN_TTESZ) {
3882 ttesz = HBLK_MIN_TTESZ;
3883 rhsaddr = (caddr_t)P2ALIGN((uintptr_t)rsaddr, HBLK_MIN_BYTES);
3884 } else {
3885 rhsaddr = rsaddr;
3888 if ((rcbfunc = rgnp->rgn_cb_function) == NULL) {
3889 rcbfunc = sfmmu_rgn_cb_noop;
3892 while (ttesz >= HBLK_MIN_TTESZ) {
3893 cbsaddr = rsaddr;
3894 cbeaddr = rsaddr;
3895 if (!(rgnp->rgn_hmeflags & (1 << ttesz))) {
3896 ttesz--;
3897 continue;
3899 cnt = 0;
3900 va = rsaddr;
3901 while (va < readdr) {
3902 ASSERT(va >= rhsaddr);
3903 if (va != cbeaddr) {
3904 if (cbeaddr != cbsaddr) {
3905 ASSERT(cbeaddr > cbsaddr);
3906 (*rcbfunc)(cbsaddr, cbeaddr,
3907 rsaddr, rsz, rgnp->rgn_obj,
3908 rgnp->rgn_objoff);
3910 cbsaddr = va;
3911 cbeaddr = va;
3913 sfmmu_unload_hmeregion_va(srdp, rid, va, readdr,
3914 ttesz, &cbeaddr);
3915 cnt++;
3916 va = rhsaddr + (cnt << TTE_PAGE_SHIFT(ttesz));
3918 if (cbeaddr != cbsaddr) {
3919 ASSERT(cbeaddr > cbsaddr);
3920 (*rcbfunc)(cbsaddr, cbeaddr, rsaddr,
3921 rsz, rgnp->rgn_obj,
3922 rgnp->rgn_objoff);
3924 ttesz--;
3929 * Release one hardware address translation lock on the given address range.
3931 void
3932 hat_unlock(struct hat *sfmmup, caddr_t addr, size_t len)
3934 struct hmehash_bucket *hmebp;
3935 hmeblk_tag hblktag;
3936 int hmeshift, hashno = 1;
3937 struct hme_blk *hmeblkp, *list = NULL;
3938 caddr_t endaddr;
3940 ASSERT(sfmmup != NULL);
3942 ASSERT((sfmmup == ksfmmup) || AS_LOCK_HELD(sfmmup->sfmmu_as));
3943 ASSERT((len & MMU_PAGEOFFSET) == 0);
3944 endaddr = addr + len;
3945 hblktag.htag_id = sfmmup;
3946 hblktag.htag_rid = SFMMU_INVALID_SHMERID;
3949 * Spitfire supports 4 page sizes.
3950 * Most pages are expected to be of the smallest page size (8K) and
3951 * these will not need to be rehashed. 64K pages also don't need to be
3952 * rehashed because an hmeblk spans 64K of address space. 512K pages
3953 * might need 1 rehash and and 4M pages might need 2 rehashes.
3955 while (addr < endaddr) {
3956 hmeshift = HME_HASH_SHIFT(hashno);
3957 hblktag.htag_bspage = HME_HASH_BSPAGE(addr, hmeshift);
3958 hblktag.htag_rehash = hashno;
3959 hmebp = HME_HASH_FUNCTION(sfmmup, addr, hmeshift);
3961 SFMMU_HASH_LOCK(hmebp);
3963 HME_HASH_SEARCH(hmebp, hblktag, hmeblkp, &list);
3964 if (hmeblkp != NULL) {
3965 ASSERT(!hmeblkp->hblk_shared);
3967 * If we encounter a shadow hmeblk then
3968 * we know there are no valid hmeblks mapping
3969 * this address at this size or larger.
3970 * Just increment address by the smallest
3971 * page size.
3973 if (hmeblkp->hblk_shw_bit) {
3974 addr += MMU_PAGESIZE;
3975 } else {
3976 addr = sfmmu_hblk_unlock(hmeblkp, addr,
3977 endaddr);
3979 SFMMU_HASH_UNLOCK(hmebp);
3980 hashno = 1;
3981 continue;
3983 SFMMU_HASH_UNLOCK(hmebp);
3985 if (!HME_REHASH(sfmmup) || (hashno >= mmu_hashcnt)) {
3987 * We have traversed the whole list and rehashed
3988 * if necessary without finding the address to unlock
3989 * which should never happen.
3991 panic("sfmmu_unlock: addr not found. "
3992 "addr %p hat %p", (void *)addr, (void *)sfmmup);
3993 } else {
3994 hashno++;
3998 sfmmu_hblks_list_purge(&list, 0);
4001 void
4002 hat_unlock_region(struct hat *sfmmup, caddr_t addr, size_t len,
4003 hat_region_cookie_t rcookie)
4005 sf_srd_t *srdp;
4006 sf_region_t *rgnp;
4007 int ttesz;
4008 uint_t rid;
4009 caddr_t eaddr;
4010 caddr_t va;
4011 int hmeshift;
4012 hmeblk_tag hblktag;
4013 struct hmehash_bucket *hmebp;
4014 struct hme_blk *hmeblkp;
4015 struct hme_blk *pr_hblk;
4016 struct hme_blk *list;
4018 if (rcookie == HAT_INVALID_REGION_COOKIE) {
4019 hat_unlock(sfmmup, addr, len);
4020 return;
4023 ASSERT(sfmmup != NULL);
4024 ASSERT(sfmmup != ksfmmup);
4026 srdp = sfmmup->sfmmu_srdp;
4027 rid = (uint_t)((uint64_t)rcookie);
4028 VERIFY3U(rid, <, SFMMU_MAX_HME_REGIONS);
4029 eaddr = addr + len;
4030 va = addr;
4031 list = NULL;
4032 rgnp = srdp->srd_hmergnp[rid];
4033 SFMMU_VALIDATE_HMERID(sfmmup, rid, addr, len);
4035 ASSERT(IS_P2ALIGNED(addr, TTEBYTES(rgnp->rgn_pgszc)));
4036 ASSERT(IS_P2ALIGNED(len, TTEBYTES(rgnp->rgn_pgszc)));
4037 if (rgnp->rgn_pgszc < HBLK_MIN_TTESZ) {
4038 ttesz = HBLK_MIN_TTESZ;
4039 } else {
4040 ttesz = rgnp->rgn_pgszc;
4042 while (va < eaddr) {
4043 while (ttesz < rgnp->rgn_pgszc &&
4044 IS_P2ALIGNED(va, TTEBYTES(ttesz + 1))) {
4045 ttesz++;
4047 while (ttesz >= HBLK_MIN_TTESZ) {
4048 if (!(rgnp->rgn_hmeflags & (1 << ttesz))) {
4049 ttesz--;
4050 continue;
4052 hmeshift = HME_HASH_SHIFT(ttesz);
4053 hblktag.htag_bspage = HME_HASH_BSPAGE(va, hmeshift);
4054 hblktag.htag_rehash = ttesz;
4055 hblktag.htag_rid = rid;
4056 hblktag.htag_id = srdp;
4057 hmebp = HME_HASH_FUNCTION(srdp, va, hmeshift);
4058 SFMMU_HASH_LOCK(hmebp);
4059 HME_HASH_SEARCH_PREV(hmebp, hblktag, hmeblkp, pr_hblk,
4060 &list);
4061 if (hmeblkp == NULL) {
4062 SFMMU_HASH_UNLOCK(hmebp);
4063 ttesz--;
4064 continue;
4066 ASSERT(hmeblkp->hblk_shared);
4067 va = sfmmu_hblk_unlock(hmeblkp, va, eaddr);
4068 ASSERT(va >= eaddr ||
4069 IS_P2ALIGNED((uintptr_t)va, TTEBYTES(ttesz)));
4070 SFMMU_HASH_UNLOCK(hmebp);
4071 break;
4073 if (ttesz < HBLK_MIN_TTESZ) {
4074 panic("hat_unlock_region: addr not found "
4075 "addr %p hat %p", (void *)va, (void *)sfmmup);
4078 sfmmu_hblks_list_purge(&list, 0);
4082 * Function to unlock a range of addresses in an hmeblk. It returns the
4083 * next address that needs to be unlocked.
4084 * Should be called with the hash lock held.
4086 static caddr_t
4087 sfmmu_hblk_unlock(struct hme_blk *hmeblkp, caddr_t addr, caddr_t endaddr)
4089 struct sf_hment *sfhme;
4090 tte_t tteold, ttemod;
4091 int ttesz, ret;
4093 ASSERT(in_hblk_range(hmeblkp, addr));
4094 ASSERT(hmeblkp->hblk_shw_bit == 0);
4096 endaddr = MIN(endaddr, get_hblk_endaddr(hmeblkp));
4097 ttesz = get_hblk_ttesz(hmeblkp);
4099 HBLKTOHME(sfhme, hmeblkp, addr);
4100 while (addr < endaddr) {
4101 readtte:
4102 sfmmu_copytte(&sfhme->hme_tte, &tteold);
4103 if (TTE_IS_VALID(&tteold)) {
4105 ttemod = tteold;
4107 ret = sfmmu_modifytte_try(&tteold, &ttemod,
4108 &sfhme->hme_tte);
4110 if (ret < 0)
4111 goto readtte;
4113 if (hmeblkp->hblk_lckcnt == 0)
4114 panic("zero hblk lckcnt");
4116 if (((uintptr_t)addr + TTEBYTES(ttesz)) >
4117 (uintptr_t)endaddr)
4118 panic("can't unlock large tte");
4120 ASSERT(hmeblkp->hblk_lckcnt > 0);
4121 atomic_dec_32(&hmeblkp->hblk_lckcnt);
4122 HBLK_STACK_TRACE(hmeblkp, HBLK_UNLOCK);
4123 } else {
4124 panic("sfmmu_hblk_unlock: invalid tte");
4126 addr += TTEBYTES(ttesz);
4127 sfhme++;
4129 return (addr);
4133 * Physical Address Mapping Framework
4135 * General rules:
4137 * (1) Applies only to seg_kmem memory pages. To make things easier,
4138 * seg_kpm addresses are also accepted by the routines, but nothing
4139 * is done with them since by definition their PA mappings are static.
4140 * (2) hat_add_callback() may only be called while holding the page lock
4141 * SE_SHARED or SE_EXCL of the underlying page (e.g., as_pagelock()),
4142 * or passing HAC_PAGELOCK flag.
4143 * (3) prehandler() and posthandler() may not call hat_add_callback() or
4144 * hat_delete_callback(), nor should they allocate memory. Post quiesce
4145 * callbacks may not sleep or acquire adaptive mutex locks.
4146 * (4) Either prehandler() or posthandler() (but not both) may be specified
4147 * as being NULL. Specifying an errhandler() is optional.
4149 * Details of using the framework:
4151 * registering a callback (hat_register_callback())
4153 * Pass prehandler, posthandler, errhandler addresses
4154 * as described below. If capture_cpus argument is nonzero,
4155 * suspend callback to the prehandler will occur with CPUs
4156 * captured and executing xc_loop() and CPUs will remain
4157 * captured until after the posthandler suspend callback
4158 * occurs.
4160 * adding a callback (hat_add_callback())
4162 * as_pagelock();
4163 * hat_add_callback();
4164 * save returned pfn in private data structures or program registers;
4165 * as_pageunlock();
4167 * prehandler()
4169 * Stop all accesses by physical address to this memory page.
4170 * Called twice: the first, PRESUSPEND, is a context safe to acquire
4171 * adaptive locks. The second, SUSPEND, is called at high PIL with
4172 * CPUs captured so adaptive locks may NOT be acquired (and all spin
4173 * locks must be XCALL_PIL or higher locks).
4175 * May return the following errors:
4176 * EIO: A fatal error has occurred. This will result in panic.
4177 * EAGAIN: The page cannot be suspended. This will fail the
4178 * relocation.
4179 * 0: Success.
4181 * posthandler()
4183 * Save new pfn in private data structures or program registers;
4184 * not allowed to fail (non-zero return values will result in panic).
4186 * errhandler()
4188 * called when an error occurs related to the callback. Currently
4189 * the only such error is HAT_CB_ERR_LEAKED which indicates that
4190 * a page is being freed, but there are still outstanding callback(s)
4191 * registered on the page.
4193 * removing a callback (hat_delete_callback(); e.g., prior to freeing memory)
4195 * stop using physical address
4196 * hat_delete_callback();
4201 * Register a callback class. Each subsystem should do this once and
4202 * cache the id_t returned for use in setting up and tearing down callbacks.
4204 * There is no facility for removing callback IDs once they are created;
4205 * the "key" should be unique for each module, so in case a module is unloaded
4206 * and subsequently re-loaded, we can recycle the module's previous entry.
4208 id_t
4209 hat_register_callback(int key,
4210 int (*prehandler)(caddr_t, uint_t, uint_t, void *),
4211 int (*posthandler)(caddr_t, uint_t, uint_t, void *, pfn_t),
4212 int (*errhandler)(caddr_t, uint_t, uint_t, void *),
4213 int capture_cpus)
4215 id_t id;
4218 * Search the table for a pre-existing callback associated with
4219 * the identifier "key". If one exists, we re-use that entry in
4220 * the table for this instance, otherwise we assign the next
4221 * available table slot.
4223 for (id = 0; id < sfmmu_max_cb_id; id++) {
4224 if (sfmmu_cb_table[id].key == key)
4225 break;
4228 if (id == sfmmu_max_cb_id) {
4229 id = sfmmu_cb_nextid++;
4230 if (id >= sfmmu_max_cb_id)
4231 panic("hat_register_callback: out of callback IDs");
4234 ASSERT(prehandler != NULL || posthandler != NULL);
4236 sfmmu_cb_table[id].key = key;
4237 sfmmu_cb_table[id].prehandler = prehandler;
4238 sfmmu_cb_table[id].posthandler = posthandler;
4239 sfmmu_cb_table[id].errhandler = errhandler;
4240 sfmmu_cb_table[id].capture_cpus = capture_cpus;
4242 return (id);
4245 #define HAC_COOKIE_NONE (void *)-1
4248 * Add relocation callbacks to the specified addr/len which will be called
4249 * when relocating the associated page. See the description of pre and
4250 * posthandler above for more details.
4252 * If HAC_PAGELOCK is included in flags, the underlying memory page is
4253 * locked internally so the caller must be able to deal with the callback
4254 * running even before this function has returned. If HAC_PAGELOCK is not
4255 * set, it is assumed that the underlying memory pages are locked.
4257 * Since the caller must track the individual page boundaries anyway,
4258 * we only allow a callback to be added to a single page (large
4259 * or small). Thus [addr, addr + len) MUST be contained within a single
4260 * page.
4262 * Registering multiple callbacks on the same [addr, addr+len) is supported,
4263 * _provided_that_ a unique parameter is specified for each callback.
4264 * If multiple callbacks are registered on the same range the callback will
4265 * be invoked with each unique parameter. Registering the same callback with
4266 * the same argument more than once will result in corrupted kernel state.
4268 * Returns the pfn of the underlying kernel page in *rpfn
4269 * on success, or PFN_INVALID on failure.
4271 * cookiep (if passed) provides storage space for an opaque cookie
4272 * to return later to hat_delete_callback(). This cookie makes the callback
4273 * deletion significantly quicker by avoiding a potentially lengthy hash
4274 * search.
4276 * Returns values:
4277 * 0: success
4278 * ENOMEM: memory allocation failure (e.g. flags was passed as HAC_NOSLEEP)
4279 * EINVAL: callback ID is not valid
4280 * ENXIO: ["vaddr", "vaddr" + len) is not mapped in the kernel's address
4281 * space
4282 * ERANGE: ["vaddr", "vaddr" + len) crosses a page boundary
4285 hat_add_callback(id_t callback_id, caddr_t vaddr, uint_t len, uint_t flags,
4286 void *pvt, pfn_t *rpfn, void **cookiep)
4288 struct hmehash_bucket *hmebp;
4289 hmeblk_tag hblktag;
4290 struct hme_blk *hmeblkp;
4291 int hmeshift, hashno;
4292 caddr_t saddr, eaddr, baseaddr;
4293 struct pa_hment *pahmep;
4294 struct sf_hment *sfhmep, *osfhmep;
4295 kmutex_t *pml;
4296 tte_t tte;
4297 page_t *pp;
4298 vnode_t *vp;
4299 u_offset_t off;
4300 pfn_t pfn;
4301 int kmflags = (flags & HAC_SLEEP)? KM_SLEEP : KM_NOSLEEP;
4302 int locked = 0;
4305 * For KPM mappings, just return the physical address since we
4306 * don't need to register any callbacks.
4308 if (IS_KPM_ADDR(vaddr)) {
4309 uint64_t paddr;
4310 SFMMU_KPM_VTOP(vaddr, paddr);
4311 *rpfn = btop(paddr);
4312 if (cookiep != NULL)
4313 *cookiep = HAC_COOKIE_NONE;
4314 return (0);
4317 if (callback_id < (id_t)0 || callback_id >= sfmmu_cb_nextid) {
4318 *rpfn = PFN_INVALID;
4319 return (EINVAL);
4322 if ((pahmep = kmem_cache_alloc(pa_hment_cache, kmflags)) == NULL) {
4323 *rpfn = PFN_INVALID;
4324 return (ENOMEM);
4327 sfhmep = &pahmep->sfment;
4329 saddr = (caddr_t)((uintptr_t)vaddr & MMU_PAGEMASK);
4330 eaddr = saddr + len;
4332 rehash:
4333 /* Find the mapping(s) for this page */
4334 for (hashno = TTE64K, hmeblkp = NULL;
4335 hmeblkp == NULL && hashno <= mmu_hashcnt;
4336 hashno++) {
4337 hmeshift = HME_HASH_SHIFT(hashno);
4338 hblktag.htag_id = ksfmmup;
4339 hblktag.htag_rid = SFMMU_INVALID_SHMERID;
4340 hblktag.htag_bspage = HME_HASH_BSPAGE(saddr, hmeshift);
4341 hblktag.htag_rehash = hashno;
4342 hmebp = HME_HASH_FUNCTION(ksfmmup, saddr, hmeshift);
4344 SFMMU_HASH_LOCK(hmebp);
4346 HME_HASH_FAST_SEARCH(hmebp, hblktag, hmeblkp);
4348 if (hmeblkp == NULL)
4349 SFMMU_HASH_UNLOCK(hmebp);
4352 if (hmeblkp == NULL) {
4353 kmem_cache_free(pa_hment_cache, pahmep);
4354 *rpfn = PFN_INVALID;
4355 return (ENXIO);
4358 ASSERT(!hmeblkp->hblk_shared);
4360 HBLKTOHME(osfhmep, hmeblkp, saddr);
4361 sfmmu_copytte(&osfhmep->hme_tte, &tte);
4363 if (!TTE_IS_VALID(&tte)) {
4364 SFMMU_HASH_UNLOCK(hmebp);
4365 kmem_cache_free(pa_hment_cache, pahmep);
4366 *rpfn = PFN_INVALID;
4367 return (ENXIO);
4371 * Make sure the boundaries for the callback fall within this
4372 * single mapping.
4374 baseaddr = (caddr_t)get_hblk_base(hmeblkp);
4375 ASSERT(saddr >= baseaddr);
4376 if (eaddr > saddr + TTEBYTES(TTE_CSZ(&tte))) {
4377 SFMMU_HASH_UNLOCK(hmebp);
4378 kmem_cache_free(pa_hment_cache, pahmep);
4379 *rpfn = PFN_INVALID;
4380 return (ERANGE);
4383 pfn = sfmmu_ttetopfn(&tte, vaddr);
4386 * The pfn may not have a page_t underneath in which case we
4387 * just return it. This can happen if we are doing I/O to a
4388 * static portion of the kernel's address space, for instance.
4390 pp = osfhmep->hme_page;
4391 if (pp == NULL) {
4392 SFMMU_HASH_UNLOCK(hmebp);
4393 kmem_cache_free(pa_hment_cache, pahmep);
4394 *rpfn = pfn;
4395 if (cookiep)
4396 *cookiep = HAC_COOKIE_NONE;
4397 return (0);
4399 ASSERT(pp == PP_PAGEROOT(pp));
4401 vp = pp->p_vnode;
4402 off = pp->p_offset;
4404 pml = sfmmu_mlist_enter(pp);
4406 if (flags & HAC_PAGELOCK) {
4407 if (!page_trylock(pp, SE_SHARED)) {
4409 * Somebody is holding SE_EXCL lock. Might
4410 * even be hat_page_relocate(). Drop all
4411 * our locks, lookup the page in &kvp, and
4412 * retry. If it doesn't exist in &kvp and &zvp,
4413 * then we must be dealing with a kernel mapped
4414 * page which doesn't actually belong to
4415 * segkmem so we punt.
4417 sfmmu_mlist_exit(pml);
4418 SFMMU_HASH_UNLOCK(hmebp);
4419 pp = page_lookup(&kvp, (u_offset_t)saddr, SE_SHARED);
4421 /* check zvp before giving up */
4422 if (pp == NULL)
4423 pp = page_lookup(&zvp, (u_offset_t)saddr,
4424 SE_SHARED);
4426 /* Okay, we didn't find it, give up */
4427 if (pp == NULL) {
4428 kmem_cache_free(pa_hment_cache, pahmep);
4429 *rpfn = pfn;
4430 if (cookiep)
4431 *cookiep = HAC_COOKIE_NONE;
4432 return (0);
4434 page_unlock(pp);
4435 goto rehash;
4437 locked = 1;
4440 if (!PAGE_LOCKED(pp) && !panicstr)
4441 panic("hat_add_callback: page 0x%p not locked", (void *)pp);
4443 if (osfhmep->hme_page != pp || pp->p_vnode != vp ||
4444 pp->p_offset != off) {
4446 * The page moved before we got our hands on it. Drop
4447 * all the locks and try again.
4449 ASSERT((flags & HAC_PAGELOCK) != 0);
4450 sfmmu_mlist_exit(pml);
4451 SFMMU_HASH_UNLOCK(hmebp);
4452 page_unlock(pp);
4453 locked = 0;
4454 goto rehash;
4457 if (!VN_ISKAS(vp)) {
4459 * This is not a segkmem page but another page which
4460 * has been kernel mapped. It had better have at least
4461 * a share lock on it. Return the pfn.
4463 sfmmu_mlist_exit(pml);
4464 SFMMU_HASH_UNLOCK(hmebp);
4465 if (locked)
4466 page_unlock(pp);
4467 kmem_cache_free(pa_hment_cache, pahmep);
4468 ASSERT(PAGE_LOCKED(pp));
4469 *rpfn = pfn;
4470 if (cookiep)
4471 *cookiep = HAC_COOKIE_NONE;
4472 return (0);
4476 * Setup this pa_hment and link its embedded dummy sf_hment into
4477 * the mapping list.
4479 pp->p_share++;
4480 pahmep->cb_id = callback_id;
4481 pahmep->addr = vaddr;
4482 pahmep->len = len;
4483 pahmep->refcnt = 1;
4484 pahmep->flags = 0;
4485 pahmep->pvt = pvt;
4487 sfhmep->hme_tte.ll = 0;
4488 sfhmep->hme_data = pahmep;
4489 sfhmep->hme_prev = osfhmep;
4490 sfhmep->hme_next = osfhmep->hme_next;
4492 if (osfhmep->hme_next)
4493 osfhmep->hme_next->hme_prev = sfhmep;
4495 osfhmep->hme_next = sfhmep;
4497 sfmmu_mlist_exit(pml);
4498 SFMMU_HASH_UNLOCK(hmebp);
4500 if (locked)
4501 page_unlock(pp);
4503 *rpfn = pfn;
4504 if (cookiep)
4505 *cookiep = (void *)pahmep;
4507 return (0);
4511 * Remove the relocation callbacks from the specified addr/len.
4513 void
4514 hat_delete_callback(caddr_t vaddr, uint_t len, void *pvt, uint_t flags,
4515 void *cookie)
4517 struct hmehash_bucket *hmebp;
4518 hmeblk_tag hblktag;
4519 struct hme_blk *hmeblkp;
4520 int hmeshift, hashno;
4521 caddr_t saddr;
4522 struct pa_hment *pahmep;
4523 struct sf_hment *sfhmep, *osfhmep;
4524 kmutex_t *pml;
4525 tte_t tte;
4526 page_t *pp;
4527 vnode_t *vp;
4528 u_offset_t off;
4529 int locked = 0;
4532 * If the cookie is HAC_COOKIE_NONE then there is no pa_hment to
4533 * remove so just return.
4535 if (cookie == HAC_COOKIE_NONE || IS_KPM_ADDR(vaddr))
4536 return;
4538 saddr = (caddr_t)((uintptr_t)vaddr & MMU_PAGEMASK);
4540 rehash:
4541 /* Find the mapping(s) for this page */
4542 for (hashno = TTE64K, hmeblkp = NULL;
4543 hmeblkp == NULL && hashno <= mmu_hashcnt;
4544 hashno++) {
4545 hmeshift = HME_HASH_SHIFT(hashno);
4546 hblktag.htag_id = ksfmmup;
4547 hblktag.htag_rid = SFMMU_INVALID_SHMERID;
4548 hblktag.htag_bspage = HME_HASH_BSPAGE(saddr, hmeshift);
4549 hblktag.htag_rehash = hashno;
4550 hmebp = HME_HASH_FUNCTION(ksfmmup, saddr, hmeshift);
4552 SFMMU_HASH_LOCK(hmebp);
4554 HME_HASH_FAST_SEARCH(hmebp, hblktag, hmeblkp);
4556 if (hmeblkp == NULL)
4557 SFMMU_HASH_UNLOCK(hmebp);
4560 if (hmeblkp == NULL)
4561 return;
4563 ASSERT(!hmeblkp->hblk_shared);
4565 HBLKTOHME(osfhmep, hmeblkp, saddr);
4567 sfmmu_copytte(&osfhmep->hme_tte, &tte);
4568 if (!TTE_IS_VALID(&tte)) {
4569 SFMMU_HASH_UNLOCK(hmebp);
4570 return;
4573 pp = osfhmep->hme_page;
4574 if (pp == NULL) {
4575 SFMMU_HASH_UNLOCK(hmebp);
4576 ASSERT(cookie == NULL);
4577 return;
4580 vp = pp->p_vnode;
4581 off = pp->p_offset;
4583 pml = sfmmu_mlist_enter(pp);
4585 if (flags & HAC_PAGELOCK) {
4586 if (!page_trylock(pp, SE_SHARED)) {
4588 * Somebody is holding SE_EXCL lock. Might
4589 * even be hat_page_relocate(). Drop all
4590 * our locks, lookup the page in &kvp, and
4591 * retry. If it doesn't exist in &kvp and &zvp,
4592 * then we must be dealing with a kernel mapped
4593 * page which doesn't actually belong to
4594 * segkmem so we punt.
4596 sfmmu_mlist_exit(pml);
4597 SFMMU_HASH_UNLOCK(hmebp);
4598 pp = page_lookup(&kvp, (u_offset_t)saddr, SE_SHARED);
4599 /* check zvp before giving up */
4600 if (pp == NULL)
4601 pp = page_lookup(&zvp, (u_offset_t)saddr,
4602 SE_SHARED);
4604 if (pp == NULL) {
4605 ASSERT(cookie == NULL);
4606 return;
4608 page_unlock(pp);
4609 goto rehash;
4611 locked = 1;
4614 ASSERT(PAGE_LOCKED(pp));
4616 if (osfhmep->hme_page != pp || pp->p_vnode != vp ||
4617 pp->p_offset != off) {
4619 * The page moved before we got our hands on it. Drop
4620 * all the locks and try again.
4622 ASSERT((flags & HAC_PAGELOCK) != 0);
4623 sfmmu_mlist_exit(pml);
4624 SFMMU_HASH_UNLOCK(hmebp);
4625 page_unlock(pp);
4626 locked = 0;
4627 goto rehash;
4630 if (!VN_ISKAS(vp)) {
4632 * This is not a segkmem page but another page which
4633 * has been kernel mapped.
4635 sfmmu_mlist_exit(pml);
4636 SFMMU_HASH_UNLOCK(hmebp);
4637 if (locked)
4638 page_unlock(pp);
4639 ASSERT(cookie == NULL);
4640 return;
4643 if (cookie != NULL) {
4644 pahmep = (struct pa_hment *)cookie;
4645 sfhmep = &pahmep->sfment;
4646 } else {
4647 for (sfhmep = pp->p_mapping; sfhmep != NULL;
4648 sfhmep = sfhmep->hme_next) {
4651 * skip va<->pa mappings
4653 if (!IS_PAHME(sfhmep))
4654 continue;
4656 pahmep = sfhmep->hme_data;
4657 ASSERT(pahmep != NULL);
4660 * if pa_hment matches, remove it
4662 if ((pahmep->pvt == pvt) &&
4663 (pahmep->addr == vaddr) &&
4664 (pahmep->len == len)) {
4665 break;
4670 if (sfhmep == NULL) {
4671 if (!panicstr) {
4672 panic("hat_delete_callback: pa_hment not found, pp %p",
4673 (void *)pp);
4675 return;
4679 * Note: at this point a valid kernel mapping must still be
4680 * present on this page.
4682 pp->p_share--;
4683 if (pp->p_share <= 0)
4684 panic("hat_delete_callback: zero p_share");
4686 if (--pahmep->refcnt == 0) {
4687 if (pahmep->flags != 0)
4688 panic("hat_delete_callback: pa_hment is busy");
4691 * Remove sfhmep from the mapping list for the page.
4693 if (sfhmep->hme_prev) {
4694 sfhmep->hme_prev->hme_next = sfhmep->hme_next;
4695 } else {
4696 pp->p_mapping = sfhmep->hme_next;
4699 if (sfhmep->hme_next)
4700 sfhmep->hme_next->hme_prev = sfhmep->hme_prev;
4702 sfmmu_mlist_exit(pml);
4703 SFMMU_HASH_UNLOCK(hmebp);
4705 if (locked)
4706 page_unlock(pp);
4708 kmem_cache_free(pa_hment_cache, pahmep);
4709 return;
4712 sfmmu_mlist_exit(pml);
4713 SFMMU_HASH_UNLOCK(hmebp);
4714 if (locked)
4715 page_unlock(pp);
4719 * hat_probe returns 1 if the translation for the address 'addr' is
4720 * loaded, zero otherwise.
4722 * hat_probe should be used only for advisorary purposes because it may
4723 * occasionally return the wrong value. The implementation must guarantee that
4724 * returning the wrong value is a very rare event. hat_probe is used
4725 * to implement optimizations in the segment drivers.
4729 hat_probe(struct hat *sfmmup, caddr_t addr)
4731 pfn_t pfn;
4732 tte_t tte;
4734 ASSERT(sfmmup != NULL);
4736 ASSERT((sfmmup == ksfmmup) || AS_LOCK_HELD(sfmmup->sfmmu_as));
4738 if (sfmmup == ksfmmup) {
4739 while ((pfn = sfmmu_vatopfn(addr, sfmmup, &tte))
4740 == PFN_SUSPENDED) {
4741 sfmmu_vatopfn_suspended(addr, sfmmup, &tte);
4743 } else {
4744 pfn = sfmmu_uvatopfn(addr, sfmmup, NULL);
4747 if (pfn != PFN_INVALID)
4748 return (1);
4749 else
4750 return (0);
4753 ssize_t
4754 hat_getpagesize(struct hat *sfmmup, caddr_t addr)
4756 tte_t tte;
4758 if (sfmmup == ksfmmup) {
4759 if (sfmmu_vatopfn(addr, sfmmup, &tte) == PFN_INVALID) {
4760 return (-1);
4762 } else {
4763 if (sfmmu_uvatopfn(addr, sfmmup, &tte) == PFN_INVALID) {
4764 return (-1);
4768 ASSERT(TTE_IS_VALID(&tte));
4769 return (TTEBYTES(TTE_CSZ(&tte)));
4772 uint_t
4773 hat_getattr(struct hat *sfmmup, caddr_t addr, uint_t *attr)
4775 tte_t tte;
4777 if (sfmmup == ksfmmup) {
4778 if (sfmmu_vatopfn(addr, sfmmup, &tte) == PFN_INVALID) {
4779 tte.ll = 0;
4781 } else {
4782 if (sfmmu_uvatopfn(addr, sfmmup, &tte) == PFN_INVALID) {
4783 tte.ll = 0;
4786 if (TTE_IS_VALID(&tte)) {
4787 *attr = sfmmu_ptov_attr(&tte);
4788 return (0);
4790 *attr = 0;
4791 return ((uint_t)0xffffffff);
4795 * Enables more attributes on specified address range (ie. logical OR)
4797 void
4798 hat_setattr(struct hat *hat, caddr_t addr, size_t len, uint_t attr)
4800 ASSERT(hat->sfmmu_as != NULL);
4802 sfmmu_chgattr(hat, addr, len, attr, SFMMU_SETATTR);
4806 * Assigns attributes to the specified address range. All the attributes
4807 * are specified.
4809 void
4810 hat_chgattr(struct hat *hat, caddr_t addr, size_t len, uint_t attr)
4812 ASSERT(hat->sfmmu_as != NULL);
4814 sfmmu_chgattr(hat, addr, len, attr, SFMMU_CHGATTR);
4818 * Remove attributes on the specified address range (ie. loginal NAND)
4820 void
4821 hat_clrattr(struct hat *hat, caddr_t addr, size_t len, uint_t attr)
4823 ASSERT(hat->sfmmu_as != NULL);
4825 sfmmu_chgattr(hat, addr, len, attr, SFMMU_CLRATTR);
4829 * Change attributes on an address range to that specified by attr and mode.
4831 static void
4832 sfmmu_chgattr(struct hat *sfmmup, caddr_t addr, size_t len, uint_t attr,
4833 int mode)
4835 struct hmehash_bucket *hmebp;
4836 hmeblk_tag hblktag;
4837 int hmeshift, hashno = 1;
4838 struct hme_blk *hmeblkp, *list = NULL;
4839 caddr_t endaddr;
4840 cpuset_t cpuset;
4841 demap_range_t dmr;
4843 CPUSET_ZERO(cpuset);
4845 ASSERT((sfmmup == ksfmmup) || AS_LOCK_HELD(sfmmup->sfmmu_as));
4846 ASSERT((len & MMU_PAGEOFFSET) == 0);
4847 ASSERT(((uintptr_t)addr & MMU_PAGEOFFSET) == 0);
4849 if ((attr & PROT_USER) && (mode != SFMMU_CLRATTR) &&
4850 ((addr + len) > (caddr_t)USERLIMIT)) {
4851 panic("user addr %p in kernel space",
4852 (void *)addr);
4855 endaddr = addr + len;
4856 hblktag.htag_id = sfmmup;
4857 hblktag.htag_rid = SFMMU_INVALID_SHMERID;
4858 DEMAP_RANGE_INIT(sfmmup, &dmr);
4860 while (addr < endaddr) {
4861 hmeshift = HME_HASH_SHIFT(hashno);
4862 hblktag.htag_bspage = HME_HASH_BSPAGE(addr, hmeshift);
4863 hblktag.htag_rehash = hashno;
4864 hmebp = HME_HASH_FUNCTION(sfmmup, addr, hmeshift);
4866 SFMMU_HASH_LOCK(hmebp);
4868 HME_HASH_SEARCH(hmebp, hblktag, hmeblkp, &list);
4869 if (hmeblkp != NULL) {
4870 ASSERT(!hmeblkp->hblk_shared);
4872 * We've encountered a shadow hmeblk so skip the range
4873 * of the next smaller mapping size.
4875 if (hmeblkp->hblk_shw_bit) {
4876 ASSERT(sfmmup != ksfmmup);
4877 ASSERT(hashno > 1);
4878 addr = (caddr_t)P2END((uintptr_t)addr,
4879 TTEBYTES(hashno - 1));
4880 } else {
4881 addr = sfmmu_hblk_chgattr(sfmmup,
4882 hmeblkp, addr, endaddr, &dmr, attr, mode);
4884 SFMMU_HASH_UNLOCK(hmebp);
4885 hashno = 1;
4886 continue;
4888 SFMMU_HASH_UNLOCK(hmebp);
4890 if (!HME_REHASH(sfmmup) || (hashno >= mmu_hashcnt)) {
4892 * We have traversed the whole list and rehashed
4893 * if necessary without finding the address to chgattr.
4894 * This is ok, so we increment the address by the
4895 * smallest hmeblk range for kernel mappings or for
4896 * user mappings with no large pages, and the largest
4897 * hmeblk range, to account for shadow hmeblks, for
4898 * user mappings with large pages and continue.
4900 if (sfmmup == ksfmmup)
4901 addr = (caddr_t)P2END((uintptr_t)addr,
4902 TTEBYTES(1));
4903 else
4904 addr = (caddr_t)P2END((uintptr_t)addr,
4905 TTEBYTES(hashno));
4906 hashno = 1;
4907 } else {
4908 hashno++;
4912 sfmmu_hblks_list_purge(&list, 0);
4913 DEMAP_RANGE_FLUSH(&dmr);
4914 cpuset = sfmmup->sfmmu_cpusran;
4915 xt_sync(cpuset);
4919 * This function chgattr on a range of addresses in an hmeblk. It returns the
4920 * next addres that needs to be chgattr.
4921 * It should be called with the hash lock held.
4922 * XXX It should be possible to optimize chgattr by not flushing every time but
4923 * on the other hand:
4924 * 1. do one flush crosscall.
4925 * 2. only flush if we are increasing permissions (make sure this will work)
4927 static caddr_t
4928 sfmmu_hblk_chgattr(struct hat *sfmmup, struct hme_blk *hmeblkp, caddr_t addr,
4929 caddr_t endaddr, demap_range_t *dmrp, uint_t attr, int mode)
4931 tte_t tte, tteattr, tteflags, ttemod;
4932 struct sf_hment *sfhmep;
4933 int ttesz;
4934 struct page *pp = NULL;
4935 kmutex_t *pml, *pmtx;
4936 int ret;
4937 int use_demap_range;
4938 #if defined(SF_ERRATA_57)
4939 int check_exec;
4940 #endif
4942 ASSERT(in_hblk_range(hmeblkp, addr));
4943 ASSERT(hmeblkp->hblk_shw_bit == 0);
4944 ASSERT(!hmeblkp->hblk_shared);
4946 endaddr = MIN(endaddr, get_hblk_endaddr(hmeblkp));
4947 ttesz = get_hblk_ttesz(hmeblkp);
4950 * Flush the current demap region if addresses have been
4951 * skipped or the page size doesn't match.
4953 use_demap_range = (TTEBYTES(ttesz) == DEMAP_RANGE_PGSZ(dmrp));
4954 if (use_demap_range) {
4955 DEMAP_RANGE_CONTINUE(dmrp, addr, endaddr);
4956 } else if (dmrp != NULL) {
4957 DEMAP_RANGE_FLUSH(dmrp);
4960 tteattr.ll = sfmmu_vtop_attr(attr, mode, &tteflags);
4961 #if defined(SF_ERRATA_57)
4962 check_exec = (sfmmup != ksfmmup) &&
4963 AS_TYPE_64BIT(sfmmup->sfmmu_as) &&
4964 TTE_IS_EXECUTABLE(&tteattr);
4965 #endif
4966 HBLKTOHME(sfhmep, hmeblkp, addr);
4967 while (addr < endaddr) {
4968 sfmmu_copytte(&sfhmep->hme_tte, &tte);
4969 if (TTE_IS_VALID(&tte)) {
4970 if ((tte.ll & tteflags.ll) == tteattr.ll) {
4972 * if the new attr is the same as old
4973 * continue
4975 goto next_addr;
4977 if (!TTE_IS_WRITABLE(&tteattr)) {
4979 * make sure we clear hw modify bit if we
4980 * removing write protections
4982 tteflags.tte_intlo |= TTE_HWWR_INT;
4985 pml = NULL;
4986 pp = sfhmep->hme_page;
4987 if (pp) {
4988 pml = sfmmu_mlist_enter(pp);
4991 if (pp != sfhmep->hme_page) {
4993 * tte must have been unloaded.
4995 ASSERT(pml);
4996 sfmmu_mlist_exit(pml);
4997 continue;
5000 ASSERT(pp == NULL || sfmmu_mlist_held(pp));
5002 ttemod = tte;
5003 ttemod.ll = (ttemod.ll & ~tteflags.ll) | tteattr.ll;
5004 ASSERT(TTE_TO_TTEPFN(&ttemod) == TTE_TO_TTEPFN(&tte));
5006 #if defined(SF_ERRATA_57)
5007 if (check_exec && addr < errata57_limit)
5008 ttemod.tte_exec_perm = 0;
5009 #endif
5010 ret = sfmmu_modifytte_try(&tte, &ttemod,
5011 &sfhmep->hme_tte);
5013 if (ret < 0) {
5014 /* tte changed underneath us */
5015 if (pml) {
5016 sfmmu_mlist_exit(pml);
5018 continue;
5021 if (tteflags.tte_intlo & TTE_HWWR_INT) {
5023 * need to sync if we are clearing modify bit.
5025 sfmmu_ttesync(sfmmup, addr, &tte, pp);
5028 if (pp && PP_ISRO(pp)) {
5029 if (tteattr.tte_intlo & TTE_WRPRM_INT) {
5030 pmtx = sfmmu_page_enter(pp);
5031 PP_CLRRO(pp);
5032 sfmmu_page_exit(pmtx);
5036 if (ret > 0 && use_demap_range) {
5037 DEMAP_RANGE_MARKPG(dmrp, addr);
5038 } else if (ret > 0) {
5039 sfmmu_tlb_demap(addr, sfmmup, hmeblkp, 0, 0);
5042 if (pml) {
5043 sfmmu_mlist_exit(pml);
5046 next_addr:
5047 addr += TTEBYTES(ttesz);
5048 sfhmep++;
5049 DEMAP_RANGE_NEXTPG(dmrp);
5051 return (addr);
5055 * This routine converts virtual attributes to physical ones. It will
5056 * update the tteflags field with the tte mask corresponding to the attributes
5057 * affected and it returns the new attributes. It will also clear the modify
5058 * bit if we are taking away write permission. This is necessary since the
5059 * modify bit is the hardware permission bit and we need to clear it in order
5060 * to detect write faults.
5062 static uint64_t
5063 sfmmu_vtop_attr(uint_t attr, int mode, tte_t *ttemaskp)
5065 tte_t ttevalue;
5067 ASSERT(!(attr & ~SFMMU_LOAD_ALLATTR));
5069 switch (mode) {
5070 case SFMMU_CHGATTR:
5071 /* all attributes specified */
5072 ttevalue.tte_inthi = MAKE_TTEATTR_INTHI(attr);
5073 ttevalue.tte_intlo = MAKE_TTEATTR_INTLO(attr);
5074 ttemaskp->tte_inthi = TTEINTHI_ATTR;
5075 ttemaskp->tte_intlo = TTEINTLO_ATTR;
5076 break;
5077 case SFMMU_SETATTR:
5078 ASSERT(!(attr & ~HAT_PROT_MASK));
5079 ttemaskp->ll = 0;
5080 ttevalue.ll = 0;
5082 * a valid tte implies exec and read for sfmmu
5083 * so no need to do anything about them.
5084 * since priviledged access implies user access
5085 * PROT_USER doesn't make sense either.
5087 if (attr & PROT_WRITE) {
5088 ttemaskp->tte_intlo |= TTE_WRPRM_INT;
5089 ttevalue.tte_intlo |= TTE_WRPRM_INT;
5091 break;
5092 case SFMMU_CLRATTR:
5093 /* attributes will be nand with current ones */
5094 if (attr & ~(PROT_WRITE | PROT_USER)) {
5095 panic("sfmmu: attr %x not supported", attr);
5097 ttemaskp->ll = 0;
5098 ttevalue.ll = 0;
5099 if (attr & PROT_WRITE) {
5100 /* clear both writable and modify bit */
5101 ttemaskp->tte_intlo |= TTE_WRPRM_INT | TTE_HWWR_INT;
5103 if (attr & PROT_USER) {
5104 ttemaskp->tte_intlo |= TTE_PRIV_INT;
5105 ttevalue.tte_intlo |= TTE_PRIV_INT;
5107 break;
5108 default:
5109 panic("sfmmu_vtop_attr: bad mode %x", mode);
5111 ASSERT(TTE_TO_TTEPFN(&ttevalue) == 0);
5112 return (ttevalue.ll);
5115 static uint_t
5116 sfmmu_ptov_attr(tte_t *ttep)
5118 uint_t attr;
5120 ASSERT(TTE_IS_VALID(ttep));
5122 attr = PROT_READ;
5124 if (TTE_IS_WRITABLE(ttep)) {
5125 attr |= PROT_WRITE;
5127 if (TTE_IS_EXECUTABLE(ttep)) {
5128 attr |= PROT_EXEC;
5130 if (!TTE_IS_PRIVILEGED(ttep)) {
5131 attr |= PROT_USER;
5133 if (TTE_IS_NFO(ttep)) {
5134 attr |= HAT_NOFAULT;
5136 if (TTE_IS_NOSYNC(ttep)) {
5137 attr |= HAT_NOSYNC;
5139 if (TTE_IS_SIDEFFECT(ttep)) {
5140 attr |= SFMMU_SIDEFFECT;
5142 if (!TTE_IS_VCACHEABLE(ttep)) {
5143 attr |= SFMMU_UNCACHEVTTE;
5145 if (!TTE_IS_PCACHEABLE(ttep)) {
5146 attr |= SFMMU_UNCACHEPTTE;
5148 return (attr);
5152 * hat_chgprot is a deprecated hat call. New segment drivers
5153 * should store all attributes and use hat_*attr calls.
5155 * Change the protections in the virtual address range
5156 * given to the specified virtual protection. If vprot is ~PROT_WRITE,
5157 * then remove write permission, leaving the other
5158 * permissions unchanged. If vprot is ~PROT_USER, remove user permissions.
5161 void
5162 hat_chgprot(struct hat *sfmmup, caddr_t addr, size_t len, uint_t vprot)
5164 struct hmehash_bucket *hmebp;
5165 hmeblk_tag hblktag;
5166 int hmeshift, hashno = 1;
5167 struct hme_blk *hmeblkp, *list = NULL;
5168 caddr_t endaddr;
5169 cpuset_t cpuset;
5170 demap_range_t dmr;
5172 ASSERT((len & MMU_PAGEOFFSET) == 0);
5173 ASSERT(((uintptr_t)addr & MMU_PAGEOFFSET) == 0);
5175 ASSERT(sfmmup->sfmmu_as != NULL);
5177 CPUSET_ZERO(cpuset);
5179 if ((vprot != (uint_t)~PROT_WRITE) && (vprot & PROT_USER) &&
5180 ((addr + len) > (caddr_t)USERLIMIT)) {
5181 panic("user addr %p vprot %x in kernel space",
5182 (void *)addr, vprot);
5184 endaddr = addr + len;
5185 hblktag.htag_id = sfmmup;
5186 hblktag.htag_rid = SFMMU_INVALID_SHMERID;
5187 DEMAP_RANGE_INIT(sfmmup, &dmr);
5189 while (addr < endaddr) {
5190 hmeshift = HME_HASH_SHIFT(hashno);
5191 hblktag.htag_bspage = HME_HASH_BSPAGE(addr, hmeshift);
5192 hblktag.htag_rehash = hashno;
5193 hmebp = HME_HASH_FUNCTION(sfmmup, addr, hmeshift);
5195 SFMMU_HASH_LOCK(hmebp);
5197 HME_HASH_SEARCH(hmebp, hblktag, hmeblkp, &list);
5198 if (hmeblkp != NULL) {
5199 ASSERT(!hmeblkp->hblk_shared);
5201 * We've encountered a shadow hmeblk so skip the range
5202 * of the next smaller mapping size.
5204 if (hmeblkp->hblk_shw_bit) {
5205 ASSERT(sfmmup != ksfmmup);
5206 ASSERT(hashno > 1);
5207 addr = (caddr_t)P2END((uintptr_t)addr,
5208 TTEBYTES(hashno - 1));
5209 } else {
5210 addr = sfmmu_hblk_chgprot(sfmmup, hmeblkp,
5211 addr, endaddr, &dmr, vprot);
5213 SFMMU_HASH_UNLOCK(hmebp);
5214 hashno = 1;
5215 continue;
5217 SFMMU_HASH_UNLOCK(hmebp);
5219 if (!HME_REHASH(sfmmup) || (hashno >= mmu_hashcnt)) {
5221 * We have traversed the whole list and rehashed
5222 * if necessary without finding the address to chgprot.
5223 * This is ok so we increment the address by the
5224 * smallest hmeblk range for kernel mappings and the
5225 * largest hmeblk range, to account for shadow hmeblks,
5226 * for user mappings and continue.
5228 if (sfmmup == ksfmmup)
5229 addr = (caddr_t)P2END((uintptr_t)addr,
5230 TTEBYTES(1));
5231 else
5232 addr = (caddr_t)P2END((uintptr_t)addr,
5233 TTEBYTES(hashno));
5234 hashno = 1;
5235 } else {
5236 hashno++;
5240 sfmmu_hblks_list_purge(&list, 0);
5241 DEMAP_RANGE_FLUSH(&dmr);
5242 cpuset = sfmmup->sfmmu_cpusran;
5243 xt_sync(cpuset);
5247 * This function chgprots a range of addresses in an hmeblk. It returns the
5248 * next addres that needs to be chgprot.
5249 * It should be called with the hash lock held.
5250 * XXX It shold be possible to optimize chgprot by not flushing every time but
5251 * on the other hand:
5252 * 1. do one flush crosscall.
5253 * 2. only flush if we are increasing permissions (make sure this will work)
5255 static caddr_t
5256 sfmmu_hblk_chgprot(sfmmu_t *sfmmup, struct hme_blk *hmeblkp, caddr_t addr,
5257 caddr_t endaddr, demap_range_t *dmrp, uint_t vprot)
5259 uint_t pprot;
5260 tte_t tte, ttemod;
5261 struct sf_hment *sfhmep;
5262 uint_t tteflags;
5263 int ttesz;
5264 struct page *pp = NULL;
5265 kmutex_t *pml, *pmtx;
5266 int ret;
5267 int use_demap_range;
5268 #if defined(SF_ERRATA_57)
5269 int check_exec;
5270 #endif
5272 ASSERT(in_hblk_range(hmeblkp, addr));
5273 ASSERT(hmeblkp->hblk_shw_bit == 0);
5274 ASSERT(!hmeblkp->hblk_shared);
5276 #ifdef DEBUG
5277 if (get_hblk_ttesz(hmeblkp) != TTE8K &&
5278 (endaddr < get_hblk_endaddr(hmeblkp))) {
5279 panic("sfmmu_hblk_chgprot: partial chgprot of large page");
5281 #endif /* DEBUG */
5283 endaddr = MIN(endaddr, get_hblk_endaddr(hmeblkp));
5284 ttesz = get_hblk_ttesz(hmeblkp);
5286 pprot = sfmmu_vtop_prot(vprot, &tteflags);
5287 #if defined(SF_ERRATA_57)
5288 check_exec = (sfmmup != ksfmmup) &&
5289 AS_TYPE_64BIT(sfmmup->sfmmu_as) &&
5290 ((vprot & PROT_EXEC) == PROT_EXEC);
5291 #endif
5292 HBLKTOHME(sfhmep, hmeblkp, addr);
5295 * Flush the current demap region if addresses have been
5296 * skipped or the page size doesn't match.
5298 use_demap_range = (TTEBYTES(ttesz) == MMU_PAGESIZE);
5299 if (use_demap_range) {
5300 DEMAP_RANGE_CONTINUE(dmrp, addr, endaddr);
5301 } else if (dmrp != NULL) {
5302 DEMAP_RANGE_FLUSH(dmrp);
5305 while (addr < endaddr) {
5306 sfmmu_copytte(&sfhmep->hme_tte, &tte);
5307 if (TTE_IS_VALID(&tte)) {
5308 if (TTE_GET_LOFLAGS(&tte, tteflags) == pprot) {
5310 * if the new protection is the same as old
5311 * continue
5313 goto next_addr;
5315 pml = NULL;
5316 pp = sfhmep->hme_page;
5317 if (pp) {
5318 pml = sfmmu_mlist_enter(pp);
5320 if (pp != sfhmep->hme_page) {
5322 * tte most have been unloaded
5323 * underneath us. Recheck
5325 ASSERT(pml);
5326 sfmmu_mlist_exit(pml);
5327 continue;
5330 ASSERT(pp == NULL || sfmmu_mlist_held(pp));
5332 ttemod = tte;
5333 TTE_SET_LOFLAGS(&ttemod, tteflags, pprot);
5334 #if defined(SF_ERRATA_57)
5335 if (check_exec && addr < errata57_limit)
5336 ttemod.tte_exec_perm = 0;
5337 #endif
5338 ret = sfmmu_modifytte_try(&tte, &ttemod,
5339 &sfhmep->hme_tte);
5341 if (ret < 0) {
5342 /* tte changed underneath us */
5343 if (pml) {
5344 sfmmu_mlist_exit(pml);
5346 continue;
5349 if (tteflags & TTE_HWWR_INT) {
5351 * need to sync if we are clearing modify bit.
5353 sfmmu_ttesync(sfmmup, addr, &tte, pp);
5356 if (pp && PP_ISRO(pp)) {
5357 if (pprot & TTE_WRPRM_INT) {
5358 pmtx = sfmmu_page_enter(pp);
5359 PP_CLRRO(pp);
5360 sfmmu_page_exit(pmtx);
5364 if (ret > 0 && use_demap_range) {
5365 DEMAP_RANGE_MARKPG(dmrp, addr);
5366 } else if (ret > 0) {
5367 sfmmu_tlb_demap(addr, sfmmup, hmeblkp, 0, 0);
5370 if (pml) {
5371 sfmmu_mlist_exit(pml);
5374 next_addr:
5375 addr += TTEBYTES(ttesz);
5376 sfhmep++;
5377 DEMAP_RANGE_NEXTPG(dmrp);
5379 return (addr);
5383 * This routine is deprecated and should only be used by hat_chgprot.
5384 * The correct routine is sfmmu_vtop_attr.
5385 * This routine converts virtual page protections to physical ones. It will
5386 * update the tteflags field with the tte mask corresponding to the protections
5387 * affected and it returns the new protections. It will also clear the modify
5388 * bit if we are taking away write permission. This is necessary since the
5389 * modify bit is the hardware permission bit and we need to clear it in order
5390 * to detect write faults.
5391 * It accepts the following special protections:
5392 * ~PROT_WRITE = remove write permissions.
5393 * ~PROT_USER = remove user permissions.
5395 static uint_t
5396 sfmmu_vtop_prot(uint_t vprot, uint_t *tteflagsp)
5398 if (vprot == (uint_t)~PROT_WRITE) {
5399 *tteflagsp = TTE_WRPRM_INT | TTE_HWWR_INT;
5400 return (0); /* will cause wrprm to be cleared */
5402 if (vprot == (uint_t)~PROT_USER) {
5403 *tteflagsp = TTE_PRIV_INT;
5404 return (0); /* will cause privprm to be cleared */
5406 if ((vprot == 0) || (vprot == PROT_USER) ||
5407 ((vprot & PROT_ALL) != vprot)) {
5408 panic("sfmmu_vtop_prot -- bad prot %x", vprot);
5411 switch (vprot) {
5412 case (PROT_READ):
5413 case (PROT_EXEC):
5414 case (PROT_EXEC | PROT_READ):
5415 *tteflagsp = TTE_PRIV_INT | TTE_WRPRM_INT | TTE_HWWR_INT;
5416 return (TTE_PRIV_INT); /* set prv and clr wrt */
5417 case (PROT_WRITE):
5418 case (PROT_WRITE | PROT_READ):
5419 case (PROT_EXEC | PROT_WRITE):
5420 case (PROT_EXEC | PROT_WRITE | PROT_READ):
5421 *tteflagsp = TTE_PRIV_INT | TTE_WRPRM_INT;
5422 return (TTE_PRIV_INT | TTE_WRPRM_INT); /* set prv and wrt */
5423 case (PROT_USER | PROT_READ):
5424 case (PROT_USER | PROT_EXEC):
5425 case (PROT_USER | PROT_EXEC | PROT_READ):
5426 *tteflagsp = TTE_PRIV_INT | TTE_WRPRM_INT | TTE_HWWR_INT;
5427 return (0); /* clr prv and wrt */
5428 case (PROT_USER | PROT_WRITE):
5429 case (PROT_USER | PROT_WRITE | PROT_READ):
5430 case (PROT_USER | PROT_EXEC | PROT_WRITE):
5431 case (PROT_USER | PROT_EXEC | PROT_WRITE | PROT_READ):
5432 *tteflagsp = TTE_PRIV_INT | TTE_WRPRM_INT;
5433 return (TTE_WRPRM_INT); /* clr prv and set wrt */
5434 default:
5435 panic("sfmmu_vtop_prot -- bad prot %x", vprot);
5437 return (0);
5441 * Alternate unload for very large virtual ranges. With a true 64 bit VA,
5442 * the normal algorithm would take too long for a very large VA range with
5443 * few real mappings. This routine just walks thru all HMEs in the global
5444 * hash table to find and remove mappings.
5446 static void
5447 hat_unload_large_virtual(
5448 struct hat *sfmmup,
5449 caddr_t startaddr,
5450 size_t len,
5451 uint_t flags,
5452 hat_callback_t *callback)
5454 struct hmehash_bucket *hmebp;
5455 struct hme_blk *hmeblkp;
5456 struct hme_blk *pr_hblk = NULL;
5457 struct hme_blk *nx_hblk;
5458 struct hme_blk *list = NULL;
5459 int i;
5460 demap_range_t dmr, *dmrp;
5461 cpuset_t cpuset;
5462 caddr_t endaddr = startaddr + len;
5463 caddr_t sa;
5464 caddr_t ea;
5465 caddr_t cb_sa[MAX_CB_ADDR];
5466 caddr_t cb_ea[MAX_CB_ADDR];
5467 int addr_cnt = 0;
5468 int a = 0;
5470 if (sfmmup->sfmmu_free) {
5471 dmrp = NULL;
5472 } else {
5473 dmrp = &dmr;
5474 DEMAP_RANGE_INIT(sfmmup, dmrp);
5478 * Loop through all the hash buckets of HME blocks looking for matches.
5480 for (i = 0; i <= UHMEHASH_SZ; i++) {
5481 hmebp = &uhme_hash[i];
5482 SFMMU_HASH_LOCK(hmebp);
5483 hmeblkp = hmebp->hmeblkp;
5484 pr_hblk = NULL;
5485 while (hmeblkp) {
5486 nx_hblk = hmeblkp->hblk_next;
5489 * skip if not this context, if a shadow block or
5490 * if the mapping is not in the requested range
5492 if (hmeblkp->hblk_tag.htag_id != sfmmup ||
5493 hmeblkp->hblk_shw_bit ||
5494 (sa = (caddr_t)get_hblk_base(hmeblkp)) >= endaddr ||
5495 (ea = get_hblk_endaddr(hmeblkp)) <= startaddr) {
5496 pr_hblk = hmeblkp;
5497 goto next_block;
5500 ASSERT(!hmeblkp->hblk_shared);
5502 * unload if there are any current valid mappings
5504 if (hmeblkp->hblk_vcnt != 0 ||
5505 hmeblkp->hblk_hmecnt != 0)
5506 (void) sfmmu_hblk_unload(sfmmup, hmeblkp,
5507 sa, ea, dmrp, flags);
5510 * on unmap we also release the HME block itself, once
5511 * all mappings are gone.
5513 if ((flags & HAT_UNLOAD_UNMAP) != 0 &&
5514 !hmeblkp->hblk_vcnt &&
5515 !hmeblkp->hblk_hmecnt) {
5516 ASSERT(!hmeblkp->hblk_lckcnt);
5517 sfmmu_hblk_hash_rm(hmebp, hmeblkp, pr_hblk,
5518 &list, 0);
5519 } else {
5520 pr_hblk = hmeblkp;
5523 if (callback == NULL)
5524 goto next_block;
5527 * HME blocks may span more than one page, but we may be
5528 * unmapping only one page, so check for a smaller range
5529 * for the callback
5531 if (sa < startaddr)
5532 sa = startaddr;
5533 if (--ea > endaddr)
5534 ea = endaddr - 1;
5536 cb_sa[addr_cnt] = sa;
5537 cb_ea[addr_cnt] = ea;
5538 if (++addr_cnt == MAX_CB_ADDR) {
5539 if (dmrp != NULL) {
5540 DEMAP_RANGE_FLUSH(dmrp);
5541 cpuset = sfmmup->sfmmu_cpusran;
5542 xt_sync(cpuset);
5545 for (a = 0; a < MAX_CB_ADDR; ++a) {
5546 callback->hcb_start_addr = cb_sa[a];
5547 callback->hcb_end_addr = cb_ea[a];
5548 callback->hcb_function(callback);
5550 addr_cnt = 0;
5553 next_block:
5554 hmeblkp = nx_hblk;
5556 SFMMU_HASH_UNLOCK(hmebp);
5559 sfmmu_hblks_list_purge(&list, 0);
5560 if (dmrp != NULL) {
5561 DEMAP_RANGE_FLUSH(dmrp);
5562 cpuset = sfmmup->sfmmu_cpusran;
5563 xt_sync(cpuset);
5566 for (a = 0; a < addr_cnt; ++a) {
5567 callback->hcb_start_addr = cb_sa[a];
5568 callback->hcb_end_addr = cb_ea[a];
5569 callback->hcb_function(callback);
5573 * Check TSB and TLB page sizes if the process isn't exiting.
5575 if (!sfmmup->sfmmu_free)
5576 sfmmu_check_page_sizes(sfmmup, 0);
5580 * Unload all the mappings in the range [addr..addr+len). addr and len must
5581 * be MMU_PAGESIZE aligned.
5584 extern struct seg *segkmap;
5585 #define ISSEGKMAP(sfmmup, addr) (sfmmup == ksfmmup && \
5586 segkmap->s_base <= (addr) && (addr) < (segkmap->s_base + segkmap->s_size))
5589 void
5590 hat_unload_callback(
5591 struct hat *sfmmup,
5592 caddr_t addr,
5593 size_t len,
5594 uint_t flags,
5595 hat_callback_t *callback)
5597 struct hmehash_bucket *hmebp;
5598 hmeblk_tag hblktag;
5599 int hmeshift, hashno, iskernel;
5600 struct hme_blk *hmeblkp, *pr_hblk, *list = NULL;
5601 caddr_t endaddr;
5602 cpuset_t cpuset;
5603 int addr_count = 0;
5604 int a;
5605 caddr_t cb_start_addr[MAX_CB_ADDR];
5606 caddr_t cb_end_addr[MAX_CB_ADDR];
5607 int issegkmap = ISSEGKMAP(sfmmup, addr);
5608 demap_range_t dmr, *dmrp;
5610 ASSERT(sfmmup->sfmmu_as != NULL);
5612 ASSERT((sfmmup == ksfmmup) || (flags & HAT_UNLOAD_OTHER) || \
5613 AS_LOCK_HELD(sfmmup->sfmmu_as));
5615 ASSERT(sfmmup != NULL);
5616 ASSERT((len & MMU_PAGEOFFSET) == 0);
5617 ASSERT(!((uintptr_t)addr & MMU_PAGEOFFSET));
5620 * Probing through a large VA range (say 63 bits) will be slow, even
5621 * at 4 Meg steps between the probes. So, when the virtual address range
5622 * is very large, search the HME entries for what to unload.
5624 * len >> TTE_PAGE_SHIFT(TTE4M) is the # of 4Meg probes we'd need
5626 * UHMEHASH_SZ is number of hash buckets to examine
5629 if (sfmmup != KHATID && (len >> TTE_PAGE_SHIFT(TTE4M)) > UHMEHASH_SZ) {
5630 hat_unload_large_virtual(sfmmup, addr, len, flags, callback);
5631 return;
5634 CPUSET_ZERO(cpuset);
5637 * If the process is exiting, we can save a lot of fuss since
5638 * we'll flush the TLB when we free the ctx anyway.
5640 if (sfmmup->sfmmu_free) {
5641 dmrp = NULL;
5642 } else {
5643 dmrp = &dmr;
5644 DEMAP_RANGE_INIT(sfmmup, dmrp);
5647 endaddr = addr + len;
5648 hblktag.htag_id = sfmmup;
5649 hblktag.htag_rid = SFMMU_INVALID_SHMERID;
5652 * It is likely for the vm to call unload over a wide range of
5653 * addresses that are actually very sparsely populated by
5654 * translations. In order to speed this up the sfmmu hat supports
5655 * the concept of shadow hmeblks. Dummy large page hmeblks that
5656 * correspond to actual small translations are allocated at tteload
5657 * time and are referred to as shadow hmeblks. Now, during unload
5658 * time, we first check if we have a shadow hmeblk for that
5659 * translation. The absence of one means the corresponding address
5660 * range is empty and can be skipped.
5662 * The kernel is an exception to above statement and that is why
5663 * we don't use shadow hmeblks and hash starting from the smallest
5664 * page size.
5666 if (sfmmup == KHATID) {
5667 iskernel = 1;
5668 hashno = TTE64K;
5669 } else {
5670 iskernel = 0;
5671 if (mmu_page_sizes == max_mmu_page_sizes) {
5672 hashno = TTE256M;
5673 } else {
5674 hashno = TTE4M;
5677 while (addr < endaddr) {
5678 hmeshift = HME_HASH_SHIFT(hashno);
5679 hblktag.htag_bspage = HME_HASH_BSPAGE(addr, hmeshift);
5680 hblktag.htag_rehash = hashno;
5681 hmebp = HME_HASH_FUNCTION(sfmmup, addr, hmeshift);
5683 SFMMU_HASH_LOCK(hmebp);
5685 HME_HASH_SEARCH_PREV(hmebp, hblktag, hmeblkp, pr_hblk, &list);
5686 if (hmeblkp == NULL) {
5688 * didn't find an hmeblk. skip the appropiate
5689 * address range.
5691 SFMMU_HASH_UNLOCK(hmebp);
5692 if (iskernel) {
5693 if (hashno < mmu_hashcnt) {
5694 hashno++;
5695 continue;
5696 } else {
5697 hashno = TTE64K;
5698 addr = (caddr_t)roundup((uintptr_t)addr
5699 + 1, MMU_PAGESIZE64K);
5700 continue;
5703 addr = (caddr_t)roundup((uintptr_t)addr + 1,
5704 (1 << hmeshift));
5705 if ((uintptr_t)addr & MMU_PAGEOFFSET512K) {
5706 ASSERT(hashno == TTE64K);
5707 continue;
5709 if ((uintptr_t)addr & MMU_PAGEOFFSET4M) {
5710 hashno = TTE512K;
5711 continue;
5713 if (mmu_page_sizes == max_mmu_page_sizes) {
5714 if ((uintptr_t)addr & MMU_PAGEOFFSET32M) {
5715 hashno = TTE4M;
5716 continue;
5718 if ((uintptr_t)addr & MMU_PAGEOFFSET256M) {
5719 hashno = TTE32M;
5720 continue;
5722 hashno = TTE256M;
5723 continue;
5724 } else {
5725 hashno = TTE4M;
5726 continue;
5729 ASSERT(hmeblkp);
5730 ASSERT(!hmeblkp->hblk_shared);
5731 if (!hmeblkp->hblk_vcnt && !hmeblkp->hblk_hmecnt) {
5733 * If the valid count is zero we can skip the range
5734 * mapped by this hmeblk.
5735 * We free hblks in the case of HAT_UNMAP. HAT_UNMAP
5736 * is used by segment drivers as a hint
5737 * that the mapping resource won't be used any longer.
5738 * The best example of this is during exit().
5740 addr = (caddr_t)roundup((uintptr_t)addr + 1,
5741 get_hblk_span(hmeblkp));
5742 if ((flags & HAT_UNLOAD_UNMAP) ||
5743 (iskernel && !issegkmap)) {
5744 sfmmu_hblk_hash_rm(hmebp, hmeblkp, pr_hblk,
5745 &list, 0);
5747 SFMMU_HASH_UNLOCK(hmebp);
5749 if (iskernel) {
5750 hashno = TTE64K;
5751 continue;
5753 if ((uintptr_t)addr & MMU_PAGEOFFSET512K) {
5754 ASSERT(hashno == TTE64K);
5755 continue;
5757 if ((uintptr_t)addr & MMU_PAGEOFFSET4M) {
5758 hashno = TTE512K;
5759 continue;
5761 if (mmu_page_sizes == max_mmu_page_sizes) {
5762 if ((uintptr_t)addr & MMU_PAGEOFFSET32M) {
5763 hashno = TTE4M;
5764 continue;
5766 if ((uintptr_t)addr & MMU_PAGEOFFSET256M) {
5767 hashno = TTE32M;
5768 continue;
5770 hashno = TTE256M;
5771 continue;
5772 } else {
5773 hashno = TTE4M;
5774 continue;
5777 if (hmeblkp->hblk_shw_bit) {
5779 * If we encounter a shadow hmeblk we know there is
5780 * smaller sized hmeblks mapping the same address space.
5781 * Decrement the hash size and rehash.
5783 ASSERT(sfmmup != KHATID);
5784 hashno--;
5785 SFMMU_HASH_UNLOCK(hmebp);
5786 continue;
5790 * track callback address ranges.
5791 * only start a new range when it's not contiguous
5793 if (callback != NULL) {
5794 if (addr_count > 0 &&
5795 addr == cb_end_addr[addr_count - 1])
5796 --addr_count;
5797 else
5798 cb_start_addr[addr_count] = addr;
5801 addr = sfmmu_hblk_unload(sfmmup, hmeblkp, addr, endaddr,
5802 dmrp, flags);
5804 if (callback != NULL)
5805 cb_end_addr[addr_count++] = addr;
5807 if (((flags & HAT_UNLOAD_UNMAP) || (iskernel && !issegkmap)) &&
5808 !hmeblkp->hblk_vcnt && !hmeblkp->hblk_hmecnt) {
5809 sfmmu_hblk_hash_rm(hmebp, hmeblkp, pr_hblk, &list, 0);
5811 SFMMU_HASH_UNLOCK(hmebp);
5814 * Notify our caller as to exactly which pages
5815 * have been unloaded. We do these in clumps,
5816 * to minimize the number of xt_sync()s that need to occur.
5818 if (callback != NULL && addr_count == MAX_CB_ADDR) {
5819 if (dmrp != NULL) {
5820 DEMAP_RANGE_FLUSH(dmrp);
5821 cpuset = sfmmup->sfmmu_cpusran;
5822 xt_sync(cpuset);
5825 for (a = 0; a < MAX_CB_ADDR; ++a) {
5826 callback->hcb_start_addr = cb_start_addr[a];
5827 callback->hcb_end_addr = cb_end_addr[a];
5828 callback->hcb_function(callback);
5830 addr_count = 0;
5832 if (iskernel) {
5833 hashno = TTE64K;
5834 continue;
5836 if ((uintptr_t)addr & MMU_PAGEOFFSET512K) {
5837 ASSERT(hashno == TTE64K);
5838 continue;
5840 if ((uintptr_t)addr & MMU_PAGEOFFSET4M) {
5841 hashno = TTE512K;
5842 continue;
5844 if (mmu_page_sizes == max_mmu_page_sizes) {
5845 if ((uintptr_t)addr & MMU_PAGEOFFSET32M) {
5846 hashno = TTE4M;
5847 continue;
5849 if ((uintptr_t)addr & MMU_PAGEOFFSET256M) {
5850 hashno = TTE32M;
5851 continue;
5853 hashno = TTE256M;
5854 } else {
5855 hashno = TTE4M;
5859 sfmmu_hblks_list_purge(&list, 0);
5860 if (dmrp != NULL) {
5861 DEMAP_RANGE_FLUSH(dmrp);
5862 cpuset = sfmmup->sfmmu_cpusran;
5863 xt_sync(cpuset);
5865 if (callback && addr_count != 0) {
5866 for (a = 0; a < addr_count; ++a) {
5867 callback->hcb_start_addr = cb_start_addr[a];
5868 callback->hcb_end_addr = cb_end_addr[a];
5869 callback->hcb_function(callback);
5874 * Check TSB and TLB page sizes if the process isn't exiting.
5876 if (!sfmmup->sfmmu_free)
5877 sfmmu_check_page_sizes(sfmmup, 0);
5881 * Unload all the mappings in the range [addr..addr+len). addr and len must
5882 * be MMU_PAGESIZE aligned.
5884 void
5885 hat_unload(struct hat *sfmmup, caddr_t addr, size_t len, uint_t flags)
5887 hat_unload_callback(sfmmup, addr, len, flags, NULL);
5892 * Find the largest mapping size for this page.
5895 fnd_mapping_sz(page_t *pp)
5897 int sz;
5898 int p_index;
5900 p_index = PP_MAPINDEX(pp);
5902 sz = 0;
5903 p_index >>= 1; /* don't care about 8K bit */
5904 for (; p_index; p_index >>= 1) {
5905 sz++;
5908 return (sz);
5912 * This function unloads a range of addresses for an hmeblk.
5913 * It returns the next address to be unloaded.
5914 * It should be called with the hash lock held.
5916 static caddr_t
5917 sfmmu_hblk_unload(struct hat *sfmmup, struct hme_blk *hmeblkp, caddr_t addr,
5918 caddr_t endaddr, demap_range_t *dmrp, uint_t flags)
5920 tte_t tte, ttemod;
5921 struct sf_hment *sfhmep;
5922 int ttesz;
5923 long ttecnt;
5924 page_t *pp;
5925 kmutex_t *pml;
5926 int ret;
5927 int use_demap_range;
5929 ASSERT(in_hblk_range(hmeblkp, addr));
5930 ASSERT(!hmeblkp->hblk_shw_bit);
5931 ASSERT(sfmmup != NULL || hmeblkp->hblk_shared);
5932 ASSERT(sfmmup == NULL || !hmeblkp->hblk_shared);
5933 ASSERT(dmrp == NULL || !hmeblkp->hblk_shared);
5935 #ifdef DEBUG
5936 if (get_hblk_ttesz(hmeblkp) != TTE8K &&
5937 (endaddr < get_hblk_endaddr(hmeblkp))) {
5938 panic("sfmmu_hblk_unload: partial unload of large page");
5940 #endif /* DEBUG */
5942 endaddr = MIN(endaddr, get_hblk_endaddr(hmeblkp));
5943 ttesz = get_hblk_ttesz(hmeblkp);
5945 use_demap_range = ((dmrp == NULL) ||
5946 (TTEBYTES(ttesz) == DEMAP_RANGE_PGSZ(dmrp)));
5948 if (use_demap_range) {
5949 DEMAP_RANGE_CONTINUE(dmrp, addr, endaddr);
5950 } else if (dmrp != NULL) {
5951 DEMAP_RANGE_FLUSH(dmrp);
5953 ttecnt = 0;
5954 HBLKTOHME(sfhmep, hmeblkp, addr);
5956 while (addr < endaddr) {
5957 pml = NULL;
5958 sfmmu_copytte(&sfhmep->hme_tte, &tte);
5959 if (TTE_IS_VALID(&tte)) {
5960 pp = sfhmep->hme_page;
5961 if (pp != NULL) {
5962 pml = sfmmu_mlist_enter(pp);
5966 * Verify if hme still points to 'pp' now that
5967 * we have p_mapping lock.
5969 if (sfhmep->hme_page != pp) {
5970 if (pp != NULL && sfhmep->hme_page != NULL) {
5971 ASSERT(pml != NULL);
5972 sfmmu_mlist_exit(pml);
5973 /* Re-start this iteration. */
5974 continue;
5976 ASSERT((pp != NULL) &&
5977 (sfhmep->hme_page == NULL));
5978 goto tte_unloaded;
5982 * This point on we have both HASH and p_mapping
5983 * lock.
5985 ASSERT(pp == sfhmep->hme_page);
5986 ASSERT(pp == NULL || sfmmu_mlist_held(pp));
5989 * We need to loop on modify tte because it is
5990 * possible for pagesync to come along and
5991 * change the software bits beneath us.
5993 * Page_unload can also invalidate the tte after
5994 * we read tte outside of p_mapping lock.
5996 again:
5997 ttemod = tte;
5999 TTE_SET_INVALID(&ttemod);
6000 ret = sfmmu_modifytte_try(&tte, &ttemod,
6001 &sfhmep->hme_tte);
6003 if (ret <= 0) {
6004 if (TTE_IS_VALID(&tte)) {
6005 ASSERT(ret < 0);
6006 goto again;
6008 if (pp != NULL) {
6009 panic("sfmmu_hblk_unload: pp = 0x%p "
6010 "tte became invalid under mlist"
6011 " lock = 0x%p", (void *)pp,
6012 (void *)pml);
6014 continue;
6017 if (!(flags & HAT_UNLOAD_NOSYNC)) {
6018 sfmmu_ttesync(sfmmup, addr, &tte, pp);
6022 * Ok- we invalidated the tte. Do the rest of the job.
6024 ttecnt++;
6026 if (flags & HAT_UNLOAD_UNLOCK) {
6027 ASSERT(hmeblkp->hblk_lckcnt > 0);
6028 atomic_dec_32(&hmeblkp->hblk_lckcnt);
6029 HBLK_STACK_TRACE(hmeblkp, HBLK_UNLOCK);
6033 * Normally we would need to flush the page
6034 * from the virtual cache at this point in
6035 * order to prevent a potential cache alias
6036 * inconsistency.
6037 * The particular scenario we need to worry
6038 * about is:
6039 * Given: va1 and va2 are two virtual address
6040 * that alias and map the same physical
6041 * address.
6042 * 1. mapping exists from va1 to pa and data
6043 * has been read into the cache.
6044 * 2. unload va1.
6045 * 3. load va2 and modify data using va2.
6046 * 4 unload va2.
6047 * 5. load va1 and reference data. Unless we
6048 * flush the data cache when we unload we will
6049 * get stale data.
6050 * Fortunately, page coloring eliminates the
6051 * above scenario by remembering the color a
6052 * physical page was last or is currently
6053 * mapped to. Now, we delay the flush until
6054 * the loading of translations. Only when the
6055 * new translation is of a different color
6056 * are we forced to flush.
6058 if (use_demap_range) {
6060 * Mark this page as needing a demap.
6062 DEMAP_RANGE_MARKPG(dmrp, addr);
6063 } else {
6064 ASSERT(sfmmup != NULL);
6065 ASSERT(!hmeblkp->hblk_shared);
6066 sfmmu_tlb_demap(addr, sfmmup, hmeblkp,
6067 sfmmup->sfmmu_free, 0);
6070 if (pp) {
6072 * Remove the hment from the mapping list
6074 ASSERT(hmeblkp->hblk_hmecnt > 0);
6077 * Again, we cannot
6078 * ASSERT(hmeblkp->hblk_hmecnt <= NHMENTS);
6080 HME_SUB(sfhmep, pp);
6081 membar_stst();
6082 atomic_dec_16(&hmeblkp->hblk_hmecnt);
6085 ASSERT(hmeblkp->hblk_vcnt > 0);
6086 atomic_dec_16(&hmeblkp->hblk_vcnt);
6088 ASSERT(hmeblkp->hblk_hmecnt || hmeblkp->hblk_vcnt ||
6089 !hmeblkp->hblk_lckcnt);
6091 #ifdef VAC
6092 if (pp && (pp->p_nrm & (P_KPMC | P_KPMS | P_TNC))) {
6093 if (PP_ISTNC(pp)) {
6095 * If page was temporary
6096 * uncached, try to recache
6097 * it. Note that HME_SUB() was
6098 * called above so p_index and
6099 * mlist had been updated.
6101 conv_tnc(pp, ttesz);
6102 } else if (pp->p_mapping == NULL) {
6103 ASSERT(kpm_enable);
6105 * Page is marked to be in VAC conflict
6106 * to an existing kpm mapping and/or is
6107 * kpm mapped using only the regular
6108 * pagesize.
6110 sfmmu_kpm_hme_unload(pp);
6113 #endif /* VAC */
6114 } else if ((pp = sfhmep->hme_page) != NULL) {
6116 * TTE is invalid but the hme
6117 * still exists. let pageunload
6118 * complete its job.
6120 ASSERT(pml == NULL);
6121 pml = sfmmu_mlist_enter(pp);
6122 if (sfhmep->hme_page != NULL) {
6123 sfmmu_mlist_exit(pml);
6124 continue;
6126 ASSERT(sfhmep->hme_page == NULL);
6127 } else if (hmeblkp->hblk_hmecnt != 0) {
6129 * pageunload may have not finished decrementing
6130 * hblk_vcnt and hblk_hmecnt. Find page_t if any and
6131 * wait for pageunload to finish. Rely on pageunload
6132 * to decrement hblk_hmecnt after hblk_vcnt.
6134 pfn_t pfn = TTE_TO_TTEPFN(&tte);
6135 ASSERT(pml == NULL);
6136 if (pf_is_memory(pfn)) {
6137 pp = page_numtopp_nolock(pfn);
6138 if (pp != NULL) {
6139 pml = sfmmu_mlist_enter(pp);
6140 sfmmu_mlist_exit(pml);
6141 pml = NULL;
6146 tte_unloaded:
6148 * At this point, the tte we are looking at
6149 * should be unloaded, and hme has been unlinked
6150 * from page too. This is important because in
6151 * pageunload, it does ttesync() then HME_SUB.
6152 * We need to make sure HME_SUB has been completed
6153 * so we know ttesync() has been completed. Otherwise,
6154 * at exit time, after return from hat layer, VM will
6155 * release as structure which hat_setstat() (called
6156 * by ttesync()) needs.
6158 #ifdef DEBUG
6160 tte_t dtte;
6162 ASSERT(sfhmep->hme_page == NULL);
6164 sfmmu_copytte(&sfhmep->hme_tte, &dtte);
6165 ASSERT(!TTE_IS_VALID(&dtte));
6167 #endif
6169 if (pml) {
6170 sfmmu_mlist_exit(pml);
6173 addr += TTEBYTES(ttesz);
6174 sfhmep++;
6175 DEMAP_RANGE_NEXTPG(dmrp);
6178 * For shared hmeblks this routine is only called when region is freed
6179 * and no longer referenced. So no need to decrement ttecnt
6180 * in the region structure here.
6182 if (ttecnt > 0 && sfmmup != NULL) {
6183 atomic_add_long(&sfmmup->sfmmu_ttecnt[ttesz], -ttecnt);
6185 return (addr);
6189 * Invalidate a virtual address range for the local CPU.
6190 * For best performance ensure that the va range is completely
6191 * mapped, otherwise the entire TLB will be flushed.
6193 void
6194 hat_flush_range(struct hat *sfmmup, caddr_t va, size_t size)
6196 ssize_t sz;
6197 caddr_t endva = va + size;
6199 while (va < endva) {
6200 sz = hat_getpagesize(sfmmup, va);
6201 if (sz < 0) {
6202 vtag_flushall();
6203 break;
6205 vtag_flushpage(va, (uint64_t)sfmmup);
6206 va += sz;
6211 * Synchronize all the mappings in the range [addr..addr+len).
6212 * Can be called with clearflag having two states:
6213 * HAT_SYNC_DONTZERO means just return the rm stats
6214 * HAT_SYNC_ZERORM means zero rm bits in the tte and return the stats
6216 void
6217 hat_sync(struct hat *sfmmup, caddr_t addr, size_t len, uint_t clearflag)
6219 struct hmehash_bucket *hmebp;
6220 hmeblk_tag hblktag;
6221 int hmeshift, hashno = 1;
6222 struct hme_blk *hmeblkp, *list = NULL;
6223 caddr_t endaddr;
6224 cpuset_t cpuset;
6226 ASSERT((sfmmup == ksfmmup) || AS_LOCK_HELD(sfmmup->sfmmu_as));
6227 ASSERT((len & MMU_PAGEOFFSET) == 0);
6228 ASSERT((clearflag == HAT_SYNC_DONTZERO) ||
6229 (clearflag == HAT_SYNC_ZERORM));
6231 CPUSET_ZERO(cpuset);
6233 endaddr = addr + len;
6234 hblktag.htag_id = sfmmup;
6235 hblktag.htag_rid = SFMMU_INVALID_SHMERID;
6238 * Spitfire supports 4 page sizes.
6239 * Most pages are expected to be of the smallest page
6240 * size (8K) and these will not need to be rehashed. 64K
6241 * pages also don't need to be rehashed because the an hmeblk
6242 * spans 64K of address space. 512K pages might need 1 rehash and
6243 * and 4M pages 2 rehashes.
6245 while (addr < endaddr) {
6246 hmeshift = HME_HASH_SHIFT(hashno);
6247 hblktag.htag_bspage = HME_HASH_BSPAGE(addr, hmeshift);
6248 hblktag.htag_rehash = hashno;
6249 hmebp = HME_HASH_FUNCTION(sfmmup, addr, hmeshift);
6251 SFMMU_HASH_LOCK(hmebp);
6253 HME_HASH_SEARCH(hmebp, hblktag, hmeblkp, &list);
6254 if (hmeblkp != NULL) {
6255 ASSERT(!hmeblkp->hblk_shared);
6257 * We've encountered a shadow hmeblk so skip the range
6258 * of the next smaller mapping size.
6260 if (hmeblkp->hblk_shw_bit) {
6261 ASSERT(sfmmup != ksfmmup);
6262 ASSERT(hashno > 1);
6263 addr = (caddr_t)P2END((uintptr_t)addr,
6264 TTEBYTES(hashno - 1));
6265 } else {
6266 addr = sfmmu_hblk_sync(sfmmup, hmeblkp,
6267 addr, endaddr, clearflag);
6269 SFMMU_HASH_UNLOCK(hmebp);
6270 hashno = 1;
6271 continue;
6273 SFMMU_HASH_UNLOCK(hmebp);
6275 if (!HME_REHASH(sfmmup) || (hashno >= mmu_hashcnt)) {
6277 * We have traversed the whole list and rehashed
6278 * if necessary without finding the address to sync.
6279 * This is ok so we increment the address by the
6280 * smallest hmeblk range for kernel mappings and the
6281 * largest hmeblk range, to account for shadow hmeblks,
6282 * for user mappings and continue.
6284 if (sfmmup == ksfmmup)
6285 addr = (caddr_t)P2END((uintptr_t)addr,
6286 TTEBYTES(1));
6287 else
6288 addr = (caddr_t)P2END((uintptr_t)addr,
6289 TTEBYTES(hashno));
6290 hashno = 1;
6291 } else {
6292 hashno++;
6295 sfmmu_hblks_list_purge(&list, 0);
6296 cpuset = sfmmup->sfmmu_cpusran;
6297 xt_sync(cpuset);
6300 static caddr_t
6301 sfmmu_hblk_sync(struct hat *sfmmup, struct hme_blk *hmeblkp, caddr_t addr,
6302 caddr_t endaddr, int clearflag)
6304 tte_t tte, ttemod;
6305 struct sf_hment *sfhmep;
6306 int ttesz;
6307 struct page *pp;
6308 kmutex_t *pml;
6309 int ret;
6311 ASSERT(hmeblkp->hblk_shw_bit == 0);
6312 ASSERT(!hmeblkp->hblk_shared);
6314 endaddr = MIN(endaddr, get_hblk_endaddr(hmeblkp));
6316 ttesz = get_hblk_ttesz(hmeblkp);
6317 HBLKTOHME(sfhmep, hmeblkp, addr);
6319 while (addr < endaddr) {
6320 sfmmu_copytte(&sfhmep->hme_tte, &tte);
6321 if (TTE_IS_VALID(&tte)) {
6322 pml = NULL;
6323 pp = sfhmep->hme_page;
6324 if (pp) {
6325 pml = sfmmu_mlist_enter(pp);
6327 if (pp != sfhmep->hme_page) {
6329 * tte most have been unloaded
6330 * underneath us. Recheck
6332 ASSERT(pml);
6333 sfmmu_mlist_exit(pml);
6334 continue;
6337 ASSERT(pp == NULL || sfmmu_mlist_held(pp));
6339 if (clearflag == HAT_SYNC_ZERORM) {
6340 ttemod = tte;
6341 TTE_CLR_RM(&ttemod);
6342 ret = sfmmu_modifytte_try(&tte, &ttemod,
6343 &sfhmep->hme_tte);
6344 if (ret < 0) {
6345 if (pml) {
6346 sfmmu_mlist_exit(pml);
6348 continue;
6351 if (ret > 0) {
6352 sfmmu_tlb_demap(addr, sfmmup,
6353 hmeblkp, 0, 0);
6356 sfmmu_ttesync(sfmmup, addr, &tte, pp);
6357 if (pml) {
6358 sfmmu_mlist_exit(pml);
6361 addr += TTEBYTES(ttesz);
6362 sfhmep++;
6364 return (addr);
6368 * This function will sync a tte to the page struct and it will
6369 * update the hat stats. Currently it allows us to pass a NULL pp
6370 * and we will simply update the stats. We may want to change this
6371 * so we only keep stats for pages backed by pp's.
6373 static void
6374 sfmmu_ttesync(struct hat *sfmmup, caddr_t addr, tte_t *ttep, page_t *pp)
6376 uint_t rm = 0;
6377 int sz;
6378 pgcnt_t npgs;
6380 ASSERT(TTE_IS_VALID(ttep));
6382 if (TTE_IS_NOSYNC(ttep)) {
6383 return;
6386 if (TTE_IS_REF(ttep)) {
6387 rm = P_REF;
6389 if (TTE_IS_MOD(ttep)) {
6390 rm |= P_MOD;
6393 if (rm == 0) {
6394 return;
6397 sz = TTE_CSZ(ttep);
6398 if (sfmmup != NULL && sfmmup->sfmmu_rmstat) {
6399 int i;
6400 caddr_t vaddr = addr;
6402 for (i = 0; i < TTEPAGES(sz); i++, vaddr += MMU_PAGESIZE) {
6403 hat_setstat(sfmmup->sfmmu_as, vaddr, MMU_PAGESIZE, rm);
6409 * XXX I want to use cas to update nrm bits but they
6410 * currently belong in common/vm and not in hat where
6411 * they should be.
6412 * The nrm bits are protected by the same mutex as
6413 * the one that protects the page's mapping list.
6415 if (!pp)
6416 return;
6417 ASSERT(sfmmu_mlist_held(pp));
6419 * If the tte is for a large page, we need to sync all the
6420 * pages covered by the tte.
6422 if (sz != TTE8K) {
6423 ASSERT(pp->p_szc != 0);
6424 pp = PP_GROUPLEADER(pp, sz);
6425 ASSERT(sfmmu_mlist_held(pp));
6428 /* Get number of pages from tte size. */
6429 npgs = TTEPAGES(sz);
6431 do {
6432 ASSERT(pp);
6433 ASSERT(sfmmu_mlist_held(pp));
6434 if (((rm & P_REF) != 0 && !PP_ISREF(pp)) ||
6435 ((rm & P_MOD) != 0 && !PP_ISMOD(pp)))
6436 hat_page_setattr(pp, rm);
6439 * Are we done? If not, we must have a large mapping.
6440 * For large mappings we need to sync the rest of the pages
6441 * covered by this tte; goto the next page.
6443 } while (--npgs > 0 && (pp = PP_PAGENEXT(pp)));
6447 * Execute pre-callback handler of each pa_hment linked to pp
6449 * Inputs:
6450 * flag: either HAT_PRESUSPEND or HAT_SUSPEND.
6451 * capture_cpus: pointer to return value (below)
6453 * Returns:
6454 * Propagates the subsystem callback return values back to the caller;
6455 * returns 0 on success. If capture_cpus is non-NULL, the value returned
6456 * is zero if all of the pa_hments are of a type that do not require
6457 * capturing CPUs prior to suspending the mapping, else it is 1.
6459 static int
6460 hat_pageprocess_precallbacks(struct page *pp, uint_t flag, int *capture_cpus)
6462 struct sf_hment *sfhmep;
6463 struct pa_hment *pahmep;
6464 int (*f)(caddr_t, uint_t, uint_t, void *);
6465 int ret;
6466 id_t id;
6467 int locked = 0;
6468 kmutex_t *pml;
6470 ASSERT(PAGE_EXCL(pp));
6471 if (!sfmmu_mlist_held(pp)) {
6472 pml = sfmmu_mlist_enter(pp);
6473 locked = 1;
6476 if (capture_cpus)
6477 *capture_cpus = 0;
6479 top:
6480 for (sfhmep = pp->p_mapping; sfhmep; sfhmep = sfhmep->hme_next) {
6482 * skip sf_hments corresponding to VA<->PA mappings;
6483 * for pa_hment's, hme_tte.ll is zero
6485 if (!IS_PAHME(sfhmep))
6486 continue;
6488 pahmep = sfhmep->hme_data;
6489 ASSERT(pahmep != NULL);
6492 * skip if pre-handler has been called earlier in this loop
6494 if (pahmep->flags & flag)
6495 continue;
6497 id = pahmep->cb_id;
6498 ASSERT(id >= (id_t)0 && id < sfmmu_cb_nextid);
6499 if (capture_cpus && sfmmu_cb_table[id].capture_cpus != 0)
6500 *capture_cpus = 1;
6501 if ((f = sfmmu_cb_table[id].prehandler) == NULL) {
6502 pahmep->flags |= flag;
6503 continue;
6507 * Drop the mapping list lock to avoid locking order issues.
6509 if (locked)
6510 sfmmu_mlist_exit(pml);
6512 ret = f(pahmep->addr, pahmep->len, flag, pahmep->pvt);
6513 if (ret != 0)
6514 return (ret); /* caller must do the cleanup */
6516 if (locked) {
6517 pml = sfmmu_mlist_enter(pp);
6518 pahmep->flags |= flag;
6519 goto top;
6522 pahmep->flags |= flag;
6525 if (locked)
6526 sfmmu_mlist_exit(pml);
6528 return (0);
6532 * Execute post-callback handler of each pa_hment linked to pp
6534 * Same overall assumptions and restrictions apply as for
6535 * hat_pageprocess_precallbacks().
6537 static void
6538 hat_pageprocess_postcallbacks(struct page *pp, uint_t flag)
6540 pfn_t pgpfn = pp->p_pagenum;
6541 pfn_t pgmask = btop(page_get_pagesize(pp->p_szc)) - 1;
6542 pfn_t newpfn;
6543 struct sf_hment *sfhmep;
6544 struct pa_hment *pahmep;
6545 int (*f)(caddr_t, uint_t, uint_t, void *, pfn_t);
6546 id_t id;
6547 int locked = 0;
6548 kmutex_t *pml;
6550 ASSERT(PAGE_EXCL(pp));
6551 if (!sfmmu_mlist_held(pp)) {
6552 pml = sfmmu_mlist_enter(pp);
6553 locked = 1;
6556 top:
6557 for (sfhmep = pp->p_mapping; sfhmep; sfhmep = sfhmep->hme_next) {
6559 * skip sf_hments corresponding to VA<->PA mappings;
6560 * for pa_hment's, hme_tte.ll is zero
6562 if (!IS_PAHME(sfhmep))
6563 continue;
6565 pahmep = sfhmep->hme_data;
6566 ASSERT(pahmep != NULL);
6568 if ((pahmep->flags & flag) == 0)
6569 continue;
6571 pahmep->flags &= ~flag;
6573 id = pahmep->cb_id;
6574 ASSERT(id >= (id_t)0 && id < sfmmu_cb_nextid);
6575 if ((f = sfmmu_cb_table[id].posthandler) == NULL)
6576 continue;
6579 * Convert the base page PFN into the constituent PFN
6580 * which is needed by the callback handler.
6582 newpfn = pgpfn | (btop((uintptr_t)pahmep->addr) & pgmask);
6585 * Drop the mapping list lock to avoid locking order issues.
6587 if (locked)
6588 sfmmu_mlist_exit(pml);
6590 if (f(pahmep->addr, pahmep->len, flag, pahmep->pvt, newpfn)
6591 != 0)
6592 panic("sfmmu: posthandler failed");
6594 if (locked) {
6595 pml = sfmmu_mlist_enter(pp);
6596 goto top;
6600 if (locked)
6601 sfmmu_mlist_exit(pml);
6605 * Suspend locked kernel mapping
6607 void
6608 hat_pagesuspend(struct page *pp)
6610 struct sf_hment *sfhmep;
6611 sfmmu_t *sfmmup;
6612 tte_t tte, ttemod;
6613 struct hme_blk *hmeblkp;
6614 caddr_t addr;
6615 int index, cons;
6616 cpuset_t cpuset;
6618 ASSERT(PAGE_EXCL(pp));
6619 ASSERT(sfmmu_mlist_held(pp));
6621 mutex_enter(&kpr_suspendlock);
6624 * We're about to suspend a kernel mapping so mark this thread as
6625 * non-traceable by DTrace. This prevents us from running into issues
6626 * with probe context trying to touch a suspended page
6627 * in the relocation codepath itself.
6629 curthread->t_flag |= T_DONTDTRACE;
6631 index = PP_MAPINDEX(pp);
6632 cons = TTE8K;
6634 retry:
6635 for (sfhmep = pp->p_mapping; sfhmep; sfhmep = sfhmep->hme_next) {
6637 if (IS_PAHME(sfhmep))
6638 continue;
6640 if (get_hblk_ttesz(sfmmu_hmetohblk(sfhmep)) != cons)
6641 continue;
6644 * Loop until we successfully set the suspend bit in
6645 * the TTE.
6647 again:
6648 sfmmu_copytte(&sfhmep->hme_tte, &tte);
6649 ASSERT(TTE_IS_VALID(&tte));
6651 ttemod = tte;
6652 TTE_SET_SUSPEND(&ttemod);
6653 if (sfmmu_modifytte_try(&tte, &ttemod,
6654 &sfhmep->hme_tte) < 0)
6655 goto again;
6658 * Invalidate TSB entry
6660 hmeblkp = sfmmu_hmetohblk(sfhmep);
6662 sfmmup = hblktosfmmu(hmeblkp);
6663 ASSERT(sfmmup == ksfmmup);
6664 ASSERT(!hmeblkp->hblk_shared);
6666 addr = tte_to_vaddr(hmeblkp, tte);
6669 * No need to make sure that the TSB for this sfmmu is
6670 * not being relocated since it is ksfmmup and thus it
6671 * will never be relocated.
6673 SFMMU_UNLOAD_TSB(addr, sfmmup, hmeblkp, 0);
6676 * Update xcall stats
6678 cpuset = cpu_ready_set;
6679 CPUSET_DEL(cpuset, CPU->cpu_id);
6681 /* LINTED: constant in conditional context */
6682 SFMMU_XCALL_STATS(ksfmmup);
6685 * Flush TLB entry on remote CPU's
6687 xt_some(cpuset, vtag_flushpage_tl1, (uint64_t)addr,
6688 (uint64_t)ksfmmup);
6689 xt_sync(cpuset);
6692 * Flush TLB entry on local CPU
6694 vtag_flushpage(addr, (uint64_t)ksfmmup);
6697 while (index != 0) {
6698 index = index >> 1;
6699 if (index != 0)
6700 cons++;
6701 if (index & 0x1) {
6702 pp = PP_GROUPLEADER(pp, cons);
6703 goto retry;
6708 #ifdef DEBUG
6710 #define N_PRLE 1024
6711 struct prle {
6712 page_t *targ;
6713 page_t *repl;
6714 int status;
6715 int pausecpus;
6716 hrtime_t whence;
6719 static struct prle page_relocate_log[N_PRLE];
6720 static int prl_entry;
6721 static kmutex_t prl_mutex;
6723 #define PAGE_RELOCATE_LOG(t, r, s, p) \
6724 mutex_enter(&prl_mutex); \
6725 page_relocate_log[prl_entry].targ = *(t); \
6726 page_relocate_log[prl_entry].repl = *(r); \
6727 page_relocate_log[prl_entry].status = (s); \
6728 page_relocate_log[prl_entry].pausecpus = (p); \
6729 page_relocate_log[prl_entry].whence = gethrtime(); \
6730 prl_entry = (prl_entry == (N_PRLE - 1))? 0 : prl_entry + 1; \
6731 mutex_exit(&prl_mutex);
6733 #else /* !DEBUG */
6734 #define PAGE_RELOCATE_LOG(t, r, s, p)
6735 #endif
6738 * Core Kernel Page Relocation Algorithm
6740 * Input:
6742 * target : constituent pages are SE_EXCL locked.
6743 * replacement: constituent pages are SE_EXCL locked.
6745 * Output:
6747 * nrelocp: number of pages relocated
6750 hat_page_relocate(page_t **target, page_t **replacement, spgcnt_t *nrelocp)
6752 page_t *targ, *repl;
6753 page_t *tpp, *rpp;
6754 kmutex_t *low, *high;
6755 spgcnt_t npages, i;
6756 page_t *pl = NULL;
6757 int old_pil;
6758 cpuset_t cpuset;
6759 int cap_cpus;
6760 int ret;
6761 #ifdef VAC
6762 int cflags = 0;
6763 #endif
6765 if (!kcage_on || PP_ISNORELOC(*target)) {
6766 PAGE_RELOCATE_LOG(target, replacement, EAGAIN, -1);
6767 return (EAGAIN);
6770 mutex_enter(&kpr_mutex);
6771 kreloc_thread = curthread;
6773 targ = *target;
6774 repl = *replacement;
6775 ASSERT(repl != NULL);
6776 ASSERT(targ->p_szc == repl->p_szc);
6778 npages = page_get_pagecnt(targ->p_szc);
6781 * unload VA<->PA mappings that are not locked
6783 tpp = targ;
6784 for (i = 0; i < npages; i++) {
6785 (void) hat_pageunload(tpp, SFMMU_KERNEL_RELOC);
6786 tpp++;
6790 * Do "presuspend" callbacks, in a context from which we can still
6791 * block as needed. Note that we don't hold the mapping list lock
6792 * of "targ" at this point due to potential locking order issues;
6793 * we assume that between the hat_pageunload() above and holding
6794 * the SE_EXCL lock that the mapping list *cannot* change at this
6795 * point.
6797 ret = hat_pageprocess_precallbacks(targ, HAT_PRESUSPEND, &cap_cpus);
6798 if (ret != 0) {
6800 * EIO translates to fatal error, for all others cleanup
6801 * and return EAGAIN.
6803 ASSERT(ret != EIO);
6804 hat_pageprocess_postcallbacks(targ, HAT_POSTUNSUSPEND);
6805 PAGE_RELOCATE_LOG(target, replacement, ret, -1);
6806 kreloc_thread = NULL;
6807 mutex_exit(&kpr_mutex);
6808 return (EAGAIN);
6812 * acquire p_mapping list lock for both the target and replacement
6813 * root pages.
6815 * low and high refer to the need to grab the mlist locks in a
6816 * specific order in order to prevent race conditions. Thus the
6817 * lower lock must be grabbed before the higher lock.
6819 * This will block hat_unload's accessing p_mapping list. Since
6820 * we have SE_EXCL lock, hat_memload and hat_pageunload will be
6821 * blocked. Thus, no one else will be accessing the p_mapping list
6822 * while we suspend and reload the locked mapping below.
6824 tpp = targ;
6825 rpp = repl;
6826 sfmmu_mlist_reloc_enter(tpp, rpp, &low, &high);
6828 kpreempt_disable();
6831 * We raise our PIL to 13 so that we don't get captured by
6832 * another CPU or pinned by an interrupt thread. We can't go to
6833 * PIL 14 since the nexus driver(s) may need to interrupt at
6834 * that level in the case of IOMMU pseudo mappings.
6836 cpuset = cpu_ready_set;
6837 CPUSET_DEL(cpuset, CPU->cpu_id);
6838 if (!cap_cpus || CPUSET_ISNULL(cpuset)) {
6839 old_pil = splr(XCALL_PIL);
6840 } else {
6841 old_pil = -1;
6842 xc_attention(cpuset);
6844 ASSERT(getpil() == XCALL_PIL);
6847 * Now do suspend callbacks. In the case of an IOMMU mapping
6848 * this will suspend all DMA activity to the page while it is
6849 * being relocated. Since we are well above LOCK_LEVEL and CPUs
6850 * may be captured at this point we should have acquired any needed
6851 * locks in the presuspend callback.
6853 ret = hat_pageprocess_precallbacks(targ, HAT_SUSPEND, NULL);
6854 if (ret != 0) {
6855 repl = targ;
6856 goto suspend_fail;
6860 * Raise the PIL yet again, this time to block all high-level
6861 * interrupts on this CPU. This is necessary to prevent an
6862 * interrupt routine from pinning the thread which holds the
6863 * mapping suspended and then touching the suspended page.
6865 * Once the page is suspended we also need to be careful to
6866 * avoid calling any functions which touch any seg_kmem memory
6867 * since that memory may be backed by the very page we are
6868 * relocating in here!
6870 hat_pagesuspend(targ);
6873 * Now that we are confident everybody has stopped using this page,
6874 * copy the page contents. Note we use a physical copy to prevent
6875 * locking issues and to avoid fpRAS because we can't handle it in
6876 * this context.
6878 for (i = 0; i < npages; i++, tpp++, rpp++) {
6879 #ifdef VAC
6881 * If the replacement has a different vcolor than
6882 * the one being replacd, we need to handle VAC
6883 * consistency for it just as we were setting up
6884 * a new mapping to it.
6886 if ((PP_GET_VCOLOR(rpp) != NO_VCOLOR) &&
6887 (tpp->p_vcolor != rpp->p_vcolor) &&
6888 !CacheColor_IsFlushed(cflags, PP_GET_VCOLOR(rpp))) {
6889 CacheColor_SetFlushed(cflags, PP_GET_VCOLOR(rpp));
6890 sfmmu_cache_flushcolor(PP_GET_VCOLOR(rpp),
6891 rpp->p_pagenum);
6893 #endif
6895 * Copy the contents of the page.
6897 ppcopy_kernel(tpp, rpp);
6900 tpp = targ;
6901 rpp = repl;
6902 for (i = 0; i < npages; i++, tpp++, rpp++) {
6904 * Copy attributes. VAC consistency was handled above,
6905 * if required.
6907 rpp->p_nrm = tpp->p_nrm;
6908 tpp->p_nrm = 0;
6909 rpp->p_index = tpp->p_index;
6910 tpp->p_index = 0;
6911 #ifdef VAC
6912 rpp->p_vcolor = tpp->p_vcolor;
6913 #endif
6917 * First, unsuspend the page, if we set the suspend bit, and transfer
6918 * the mapping list from the target page to the replacement page.
6919 * Next process postcallbacks; since pa_hment's are linked only to the
6920 * p_mapping list of root page, we don't iterate over the constituent
6921 * pages.
6923 hat_pagereload(targ, repl);
6925 suspend_fail:
6926 hat_pageprocess_postcallbacks(repl, HAT_UNSUSPEND);
6929 * Now lower our PIL and release any captured CPUs since we
6930 * are out of the "danger zone". After this it will again be
6931 * safe to acquire adaptive mutex locks, or to drop them...
6933 if (old_pil != -1) {
6934 splx(old_pil);
6935 } else {
6936 xc_dismissed(cpuset);
6939 kpreempt_enable();
6941 sfmmu_mlist_reloc_exit(low, high);
6944 * Postsuspend callbacks should drop any locks held across
6945 * the suspend callbacks. As before, we don't hold the mapping
6946 * list lock at this point.. our assumption is that the mapping
6947 * list still can't change due to our holding SE_EXCL lock and
6948 * there being no unlocked mappings left. Hence the restriction
6949 * on calling context to hat_delete_callback()
6951 hat_pageprocess_postcallbacks(repl, HAT_POSTUNSUSPEND);
6952 if (ret != 0) {
6954 * The second presuspend call failed: we got here through
6955 * the suspend_fail label above.
6957 ASSERT(ret != EIO);
6958 PAGE_RELOCATE_LOG(target, replacement, ret, cap_cpus);
6959 kreloc_thread = NULL;
6960 mutex_exit(&kpr_mutex);
6961 return (EAGAIN);
6965 * Now that we're out of the performance critical section we can
6966 * take care of updating the hash table, since we still
6967 * hold all the pages locked SE_EXCL at this point we
6968 * needn't worry about things changing out from under us.
6970 tpp = targ;
6971 rpp = repl;
6972 for (i = 0; i < npages; i++, tpp++, rpp++) {
6975 * replace targ with replacement in page_hash table
6977 targ = tpp;
6978 page_relocate_hash(rpp, targ);
6981 * concatenate target; caller of platform_page_relocate()
6982 * expects target to be concatenated after returning.
6984 ASSERT(targ->p_next == targ);
6985 ASSERT(targ->p_prev == targ);
6986 page_list_concat(&pl, &targ);
6989 ASSERT(*target == pl);
6990 *nrelocp = npages;
6991 PAGE_RELOCATE_LOG(target, replacement, 0, cap_cpus);
6992 kreloc_thread = NULL;
6993 mutex_exit(&kpr_mutex);
6994 return (0);
6998 * Called when stray pa_hments are found attached to a page which is
6999 * being freed. Notify the subsystem which attached the pa_hment of
7000 * the error if it registered a suitable handler, else panic.
7002 static void
7003 sfmmu_pahment_leaked(struct pa_hment *pahmep)
7005 id_t cb_id = pahmep->cb_id;
7007 ASSERT(cb_id >= (id_t)0 && cb_id < sfmmu_cb_nextid);
7008 if (sfmmu_cb_table[cb_id].errhandler != NULL) {
7009 if (sfmmu_cb_table[cb_id].errhandler(pahmep->addr, pahmep->len,
7010 HAT_CB_ERR_LEAKED, pahmep->pvt) == 0)
7011 return; /* non-fatal */
7013 panic("pa_hment leaked: 0x%p", (void *)pahmep);
7017 * Remove all mappings to page 'pp'.
7020 hat_pageunload(struct page *pp, uint_t forceflag)
7022 struct page *origpp = pp;
7023 struct sf_hment *sfhme, *tmphme;
7024 struct hme_blk *hmeblkp;
7025 kmutex_t *pml;
7026 #ifdef VAC
7027 kmutex_t *pmtx;
7028 #endif
7029 cpuset_t cpuset, tset;
7030 int index, cons;
7031 int pa_hments;
7033 ASSERT(PAGE_EXCL(pp));
7035 tmphme = NULL;
7036 pa_hments = 0;
7037 CPUSET_ZERO(cpuset);
7039 pml = sfmmu_mlist_enter(pp);
7041 #ifdef VAC
7042 if (pp->p_kpmref)
7043 sfmmu_kpm_pageunload(pp);
7044 ASSERT(!PP_ISMAPPED_KPM(pp));
7045 #endif
7047 * Clear vpm reference. Since the page is exclusively locked
7048 * vpm cannot be referencing it.
7050 if (vpm_enable) {
7051 pp->p_vpmref = 0;
7054 index = PP_MAPINDEX(pp);
7055 cons = TTE8K;
7056 retry:
7057 for (sfhme = pp->p_mapping; sfhme; sfhme = tmphme) {
7058 tmphme = sfhme->hme_next;
7060 if (IS_PAHME(sfhme)) {
7061 ASSERT(sfhme->hme_data != NULL);
7062 pa_hments++;
7063 continue;
7066 hmeblkp = sfmmu_hmetohblk(sfhme);
7069 * If there are kernel mappings don't unload them, they will
7070 * be suspended.
7072 if (forceflag == SFMMU_KERNEL_RELOC && hmeblkp->hblk_lckcnt &&
7073 hmeblkp->hblk_tag.htag_id == ksfmmup)
7074 continue;
7076 tset = sfmmu_pageunload(pp, sfhme, cons);
7077 CPUSET_OR(cpuset, tset);
7080 while (index != 0) {
7081 index = index >> 1;
7082 if (index != 0)
7083 cons++;
7084 if (index & 0x1) {
7085 /* Go to leading page */
7086 pp = PP_GROUPLEADER(pp, cons);
7087 ASSERT(sfmmu_mlist_held(pp));
7088 goto retry;
7093 * cpuset may be empty if the page was only mapped by segkpm,
7094 * in which case we won't actually cross-trap.
7096 xt_sync(cpuset);
7099 * The page should have no mappings at this point, unless
7100 * we were called from hat_page_relocate() in which case we
7101 * leave the locked mappings which will be suspended later.
7103 ASSERT(!PP_ISMAPPED(origpp) || pa_hments ||
7104 (forceflag == SFMMU_KERNEL_RELOC));
7106 #ifdef VAC
7107 if (PP_ISTNC(pp)) {
7108 if (cons == TTE8K) {
7109 pmtx = sfmmu_page_enter(pp);
7110 PP_CLRTNC(pp);
7111 sfmmu_page_exit(pmtx);
7112 } else {
7113 conv_tnc(pp, cons);
7116 #endif /* VAC */
7118 if (pa_hments && forceflag != SFMMU_KERNEL_RELOC) {
7120 * Unlink any pa_hments and free them, calling back
7121 * the responsible subsystem to notify it of the error.
7122 * This can occur in situations such as drivers leaking
7123 * DMA handles: naughty, but common enough that we'd like
7124 * to keep the system running rather than bringing it
7125 * down with an obscure error like "pa_hment leaked"
7126 * which doesn't aid the user in debugging their driver.
7128 for (sfhme = pp->p_mapping; sfhme; sfhme = tmphme) {
7129 tmphme = sfhme->hme_next;
7130 if (IS_PAHME(sfhme)) {
7131 struct pa_hment *pahmep = sfhme->hme_data;
7132 sfmmu_pahment_leaked(pahmep);
7133 HME_SUB(sfhme, pp);
7134 kmem_cache_free(pa_hment_cache, pahmep);
7138 ASSERT(!PP_ISMAPPED(origpp));
7141 sfmmu_mlist_exit(pml);
7143 return (0);
7146 cpuset_t
7147 sfmmu_pageunload(page_t *pp, struct sf_hment *sfhme, int cons)
7149 struct hme_blk *hmeblkp;
7150 sfmmu_t *sfmmup;
7151 tte_t tte, ttemod;
7152 #ifdef DEBUG
7153 tte_t orig_old;
7154 #endif /* DEBUG */
7155 caddr_t addr;
7156 int ttesz;
7157 int ret;
7158 cpuset_t cpuset;
7160 ASSERT(pp != NULL);
7161 ASSERT(sfmmu_mlist_held(pp));
7162 ASSERT(!PP_ISKAS(pp));
7164 CPUSET_ZERO(cpuset);
7166 hmeblkp = sfmmu_hmetohblk(sfhme);
7168 readtte:
7169 sfmmu_copytte(&sfhme->hme_tte, &tte);
7170 if (TTE_IS_VALID(&tte)) {
7171 sfmmup = hblktosfmmu(hmeblkp);
7172 ttesz = get_hblk_ttesz(hmeblkp);
7174 * Only unload mappings of 'cons' size.
7176 if (ttesz != cons)
7177 return (cpuset);
7180 * Note that we have p_mapping lock, but no hash lock here.
7181 * hblk_unload() has to have both hash lock AND p_mapping
7182 * lock before it tries to modify tte. So, the tte could
7183 * not become invalid in the sfmmu_modifytte_try() below.
7185 ttemod = tte;
7186 #ifdef DEBUG
7187 orig_old = tte;
7188 #endif /* DEBUG */
7190 TTE_SET_INVALID(&ttemod);
7191 ret = sfmmu_modifytte_try(&tte, &ttemod, &sfhme->hme_tte);
7192 if (ret < 0) {
7193 #ifdef DEBUG
7194 /* only R/M bits can change. */
7195 chk_tte(&orig_old, &tte, &ttemod, hmeblkp);
7196 #endif /* DEBUG */
7197 goto readtte;
7200 if (ret == 0) {
7201 panic("pageunload: cas failed?");
7204 addr = tte_to_vaddr(hmeblkp, tte);
7206 if (hmeblkp->hblk_shared) {
7207 sf_srd_t *srdp = (sf_srd_t *)sfmmup;
7208 uint_t rid = hmeblkp->hblk_tag.htag_rid;
7209 sf_region_t *rgnp;
7210 ASSERT(SFMMU_IS_SHMERID_VALID(rid));
7211 ASSERT(rid < SFMMU_MAX_HME_REGIONS);
7212 ASSERT(srdp != NULL);
7213 rgnp = srdp->srd_hmergnp[rid];
7214 SFMMU_VALIDATE_SHAREDHBLK(hmeblkp, srdp, rgnp, rid);
7215 cpuset = sfmmu_rgntlb_demap(addr, rgnp, hmeblkp, 1);
7216 sfmmu_ttesync(NULL, addr, &tte, pp);
7217 ASSERT(rgnp->rgn_ttecnt[ttesz] > 0);
7218 atomic_dec_ulong(&rgnp->rgn_ttecnt[ttesz]);
7219 } else {
7220 sfmmu_ttesync(sfmmup, addr, &tte, pp);
7221 atomic_dec_ulong(&sfmmup->sfmmu_ttecnt[ttesz]);
7224 * We need to flush the page from the virtual cache
7225 * in order to prevent a virtual cache alias
7226 * inconsistency. The particular scenario we need
7227 * to worry about is:
7228 * Given: va1 and va2 are two virtual address that
7229 * alias and will map the same physical address.
7230 * 1. mapping exists from va1 to pa and data has
7231 * been read into the cache.
7232 * 2. unload va1.
7233 * 3. load va2 and modify data using va2.
7234 * 4 unload va2.
7235 * 5. load va1 and reference data. Unless we flush
7236 * the data cache when we unload we will get
7237 * stale data.
7238 * This scenario is taken care of by using virtual
7239 * page coloring.
7241 if (sfmmup->sfmmu_ismhat) {
7243 * Flush TSBs, TLBs and caches
7244 * of every process
7245 * sharing this ism segment.
7247 sfmmu_hat_lock_all();
7248 mutex_enter(&ism_mlist_lock);
7249 kpreempt_disable();
7250 sfmmu_ismtlbcache_demap(addr, sfmmup, hmeblkp,
7251 pp->p_pagenum, CACHE_NO_FLUSH);
7252 kpreempt_enable();
7253 mutex_exit(&ism_mlist_lock);
7254 sfmmu_hat_unlock_all();
7255 cpuset = cpu_ready_set;
7256 } else {
7257 sfmmu_tlb_demap(addr, sfmmup, hmeblkp, 0, 0);
7258 cpuset = sfmmup->sfmmu_cpusran;
7263 * Hme_sub has to run after ttesync() and a_rss update.
7264 * See hblk_unload().
7266 HME_SUB(sfhme, pp);
7267 membar_stst();
7270 * We can not make ASSERT(hmeblkp->hblk_hmecnt <= NHMENTS)
7271 * since pteload may have done a HME_ADD() right after
7272 * we did the HME_SUB() above. Hmecnt is now maintained
7273 * by cas only. no lock guranteed its value. The only
7274 * gurantee we have is the hmecnt should not be less than
7275 * what it should be so the hblk will not be taken away.
7276 * It's also important that we decremented the hmecnt after
7277 * we are done with hmeblkp so that this hmeblk won't be
7278 * stolen.
7280 ASSERT(hmeblkp->hblk_hmecnt > 0);
7281 ASSERT(hmeblkp->hblk_vcnt > 0);
7282 atomic_dec_16(&hmeblkp->hblk_vcnt);
7283 atomic_dec_16(&hmeblkp->hblk_hmecnt);
7285 * This is bug 4063182.
7286 * XXX: fixme
7287 * ASSERT(hmeblkp->hblk_hmecnt || hmeblkp->hblk_vcnt ||
7288 * !hmeblkp->hblk_lckcnt);
7290 } else {
7291 panic("invalid tte? pp %p &tte %p",
7292 (void *)pp, (void *)&tte);
7295 return (cpuset);
7299 * While relocating a kernel page, this function will move the mappings
7300 * from tpp to dpp and modify any associated data with these mappings.
7301 * It also unsuspends the suspended kernel mapping.
7303 static void
7304 hat_pagereload(struct page *tpp, struct page *dpp)
7306 struct sf_hment *sfhme;
7307 tte_t tte, ttemod;
7308 int index, cons;
7310 ASSERT(getpil() == PIL_MAX);
7311 ASSERT(sfmmu_mlist_held(tpp));
7312 ASSERT(sfmmu_mlist_held(dpp));
7314 index = PP_MAPINDEX(tpp);
7315 cons = TTE8K;
7317 /* Update real mappings to the page */
7318 retry:
7319 for (sfhme = tpp->p_mapping; sfhme != NULL; sfhme = sfhme->hme_next) {
7320 if (IS_PAHME(sfhme))
7321 continue;
7322 sfmmu_copytte(&sfhme->hme_tte, &tte);
7323 ttemod = tte;
7326 * replace old pfn with new pfn in TTE
7328 PFN_TO_TTE(ttemod, dpp->p_pagenum);
7331 * clear suspend bit
7333 ASSERT(TTE_IS_SUSPEND(&ttemod));
7334 TTE_CLR_SUSPEND(&ttemod);
7336 if (sfmmu_modifytte_try(&tte, &ttemod, &sfhme->hme_tte) < 0)
7337 panic("hat_pagereload(): sfmmu_modifytte_try() failed");
7340 * set hme_page point to new page
7342 sfhme->hme_page = dpp;
7346 * move p_mapping list from old page to new page
7348 dpp->p_mapping = tpp->p_mapping;
7349 tpp->p_mapping = NULL;
7350 dpp->p_share = tpp->p_share;
7351 tpp->p_share = 0;
7353 while (index != 0) {
7354 index = index >> 1;
7355 if (index != 0)
7356 cons++;
7357 if (index & 0x1) {
7358 tpp = PP_GROUPLEADER(tpp, cons);
7359 dpp = PP_GROUPLEADER(dpp, cons);
7360 goto retry;
7364 curthread->t_flag &= ~T_DONTDTRACE;
7365 mutex_exit(&kpr_suspendlock);
7368 uint_t
7369 hat_pagesync(struct page *pp, uint_t clearflag)
7371 struct sf_hment *sfhme, *tmphme = NULL;
7372 struct hme_blk *hmeblkp;
7373 kmutex_t *pml;
7374 cpuset_t cpuset, tset;
7375 int index, cons;
7376 extern ulong_t po_share;
7377 page_t *save_pp = pp;
7378 int stop_on_sh = 0;
7379 uint_t shcnt;
7381 CPUSET_ZERO(cpuset);
7383 if (PP_ISRO(pp) && (clearflag & HAT_SYNC_STOPON_MOD)) {
7384 return (PP_GENERIC_ATTR(pp));
7387 if ((clearflag & HAT_SYNC_ZERORM) == 0) {
7388 if ((clearflag & HAT_SYNC_STOPON_REF) && PP_ISREF(pp)) {
7389 return (PP_GENERIC_ATTR(pp));
7391 if ((clearflag & HAT_SYNC_STOPON_MOD) && PP_ISMOD(pp)) {
7392 return (PP_GENERIC_ATTR(pp));
7394 if (clearflag & HAT_SYNC_STOPON_SHARED) {
7395 if (pp->p_share > po_share) {
7396 hat_page_setattr(pp, P_REF);
7397 return (PP_GENERIC_ATTR(pp));
7399 stop_on_sh = 1;
7400 shcnt = 0;
7404 clearflag &= ~HAT_SYNC_STOPON_SHARED;
7405 pml = sfmmu_mlist_enter(pp);
7406 index = PP_MAPINDEX(pp);
7407 cons = TTE8K;
7408 retry:
7409 for (sfhme = pp->p_mapping; sfhme; sfhme = tmphme) {
7411 * We need to save the next hment on the list since
7412 * it is possible for pagesync to remove an invalid hment
7413 * from the list.
7415 tmphme = sfhme->hme_next;
7416 if (IS_PAHME(sfhme))
7417 continue;
7419 * If we are looking for large mappings and this hme doesn't
7420 * reach the range we are seeking, just ignore it.
7422 hmeblkp = sfmmu_hmetohblk(sfhme);
7424 if (hme_size(sfhme) < cons)
7425 continue;
7427 if (stop_on_sh) {
7428 if (hmeblkp->hblk_shared) {
7429 sf_srd_t *srdp = hblktosrd(hmeblkp);
7430 uint_t rid = hmeblkp->hblk_tag.htag_rid;
7431 sf_region_t *rgnp;
7432 ASSERT(SFMMU_IS_SHMERID_VALID(rid));
7433 ASSERT(rid < SFMMU_MAX_HME_REGIONS);
7434 ASSERT(srdp != NULL);
7435 rgnp = srdp->srd_hmergnp[rid];
7436 SFMMU_VALIDATE_SHAREDHBLK(hmeblkp, srdp,
7437 rgnp, rid);
7438 shcnt += rgnp->rgn_refcnt;
7439 } else {
7440 shcnt++;
7442 if (shcnt > po_share) {
7444 * tell the pager to spare the page this time
7445 * around.
7447 hat_page_setattr(save_pp, P_REF);
7448 index = 0;
7449 break;
7452 tset = sfmmu_pagesync(pp, sfhme,
7453 clearflag & ~HAT_SYNC_STOPON_RM);
7454 CPUSET_OR(cpuset, tset);
7457 * If clearflag is HAT_SYNC_DONTZERO, break out as soon
7458 * as the "ref" or "mod" is set or share cnt exceeds po_share.
7460 if ((clearflag & ~HAT_SYNC_STOPON_RM) == HAT_SYNC_DONTZERO &&
7461 (((clearflag & HAT_SYNC_STOPON_MOD) && PP_ISMOD(save_pp)) ||
7462 ((clearflag & HAT_SYNC_STOPON_REF) && PP_ISREF(save_pp)))) {
7463 index = 0;
7464 break;
7468 while (index) {
7469 index = index >> 1;
7470 cons++;
7471 if (index & 0x1) {
7472 /* Go to leading page */
7473 pp = PP_GROUPLEADER(pp, cons);
7474 goto retry;
7478 xt_sync(cpuset);
7479 sfmmu_mlist_exit(pml);
7480 return (PP_GENERIC_ATTR(save_pp));
7484 * Get all the hardware dependent attributes for a page struct
7486 static cpuset_t
7487 sfmmu_pagesync(struct page *pp, struct sf_hment *sfhme,
7488 uint_t clearflag)
7490 caddr_t addr;
7491 tte_t tte, ttemod;
7492 struct hme_blk *hmeblkp;
7493 int ret;
7494 sfmmu_t *sfmmup;
7495 cpuset_t cpuset;
7497 ASSERT(pp != NULL);
7498 ASSERT(sfmmu_mlist_held(pp));
7499 ASSERT((clearflag == HAT_SYNC_DONTZERO) ||
7500 (clearflag == HAT_SYNC_ZERORM));
7502 SFMMU_STAT(sf_pagesync);
7504 CPUSET_ZERO(cpuset);
7506 sfmmu_pagesync_retry:
7508 sfmmu_copytte(&sfhme->hme_tte, &tte);
7509 if (TTE_IS_VALID(&tte)) {
7510 hmeblkp = sfmmu_hmetohblk(sfhme);
7511 sfmmup = hblktosfmmu(hmeblkp);
7512 addr = tte_to_vaddr(hmeblkp, tte);
7513 if (clearflag == HAT_SYNC_ZERORM) {
7514 ttemod = tte;
7515 TTE_CLR_RM(&ttemod);
7516 ret = sfmmu_modifytte_try(&tte, &ttemod,
7517 &sfhme->hme_tte);
7518 if (ret < 0) {
7520 * cas failed and the new value is not what
7521 * we want.
7523 goto sfmmu_pagesync_retry;
7526 if (ret > 0) {
7527 /* we win the cas */
7528 if (hmeblkp->hblk_shared) {
7529 sf_srd_t *srdp = (sf_srd_t *)sfmmup;
7530 uint_t rid =
7531 hmeblkp->hblk_tag.htag_rid;
7532 sf_region_t *rgnp;
7533 ASSERT(SFMMU_IS_SHMERID_VALID(rid));
7534 ASSERT(rid < SFMMU_MAX_HME_REGIONS);
7535 ASSERT(srdp != NULL);
7536 rgnp = srdp->srd_hmergnp[rid];
7537 SFMMU_VALIDATE_SHAREDHBLK(hmeblkp,
7538 srdp, rgnp, rid);
7539 cpuset = sfmmu_rgntlb_demap(addr,
7540 rgnp, hmeblkp, 1);
7541 } else {
7542 sfmmu_tlb_demap(addr, sfmmup, hmeblkp,
7543 0, 0);
7544 cpuset = sfmmup->sfmmu_cpusran;
7548 sfmmu_ttesync(hmeblkp->hblk_shared ? NULL : sfmmup, addr,
7549 &tte, pp);
7551 return (cpuset);
7555 * Remove write permission from a mappings to a page, so that
7556 * we can detect the next modification of it. This requires modifying
7557 * the TTE then invalidating (demap) any TLB entry using that TTE.
7558 * This code is similar to sfmmu_pagesync().
7560 static cpuset_t
7561 sfmmu_pageclrwrt(struct page *pp, struct sf_hment *sfhme)
7563 caddr_t addr;
7564 tte_t tte;
7565 tte_t ttemod;
7566 struct hme_blk *hmeblkp;
7567 int ret;
7568 sfmmu_t *sfmmup;
7569 cpuset_t cpuset;
7571 ASSERT(pp != NULL);
7572 ASSERT(sfmmu_mlist_held(pp));
7574 CPUSET_ZERO(cpuset);
7575 SFMMU_STAT(sf_clrwrt);
7577 retry:
7579 sfmmu_copytte(&sfhme->hme_tte, &tte);
7580 if (TTE_IS_VALID(&tte) && TTE_IS_WRITABLE(&tte)) {
7581 hmeblkp = sfmmu_hmetohblk(sfhme);
7582 sfmmup = hblktosfmmu(hmeblkp);
7583 addr = tte_to_vaddr(hmeblkp, tte);
7585 ttemod = tte;
7586 TTE_CLR_WRT(&ttemod);
7587 TTE_CLR_MOD(&ttemod);
7588 ret = sfmmu_modifytte_try(&tte, &ttemod, &sfhme->hme_tte);
7591 * if cas failed and the new value is not what
7592 * we want retry
7594 if (ret < 0)
7595 goto retry;
7597 /* we win the cas */
7598 if (ret > 0) {
7599 if (hmeblkp->hblk_shared) {
7600 sf_srd_t *srdp = (sf_srd_t *)sfmmup;
7601 uint_t rid = hmeblkp->hblk_tag.htag_rid;
7602 sf_region_t *rgnp;
7603 ASSERT(SFMMU_IS_SHMERID_VALID(rid));
7604 ASSERT(rid < SFMMU_MAX_HME_REGIONS);
7605 ASSERT(srdp != NULL);
7606 rgnp = srdp->srd_hmergnp[rid];
7607 SFMMU_VALIDATE_SHAREDHBLK(hmeblkp,
7608 srdp, rgnp, rid);
7609 cpuset = sfmmu_rgntlb_demap(addr,
7610 rgnp, hmeblkp, 1);
7611 } else {
7612 sfmmu_tlb_demap(addr, sfmmup, hmeblkp, 0, 0);
7613 cpuset = sfmmup->sfmmu_cpusran;
7618 return (cpuset);
7622 * Walk all mappings of a page, removing write permission and clearing the
7623 * ref/mod bits. This code is similar to hat_pagesync()
7625 static void
7626 hat_page_clrwrt(page_t *pp)
7628 struct sf_hment *sfhme;
7629 struct sf_hment *tmphme = NULL;
7630 kmutex_t *pml;
7631 cpuset_t cpuset;
7632 cpuset_t tset;
7633 int index;
7634 int cons;
7636 CPUSET_ZERO(cpuset);
7638 pml = sfmmu_mlist_enter(pp);
7639 index = PP_MAPINDEX(pp);
7640 cons = TTE8K;
7641 retry:
7642 for (sfhme = pp->p_mapping; sfhme; sfhme = tmphme) {
7643 tmphme = sfhme->hme_next;
7646 * If we are looking for large mappings and this hme doesn't
7647 * reach the range we are seeking, just ignore its.
7650 if (hme_size(sfhme) < cons)
7651 continue;
7653 tset = sfmmu_pageclrwrt(pp, sfhme);
7654 CPUSET_OR(cpuset, tset);
7657 while (index) {
7658 index = index >> 1;
7659 cons++;
7660 if (index & 0x1) {
7661 /* Go to leading page */
7662 pp = PP_GROUPLEADER(pp, cons);
7663 goto retry;
7667 xt_sync(cpuset);
7668 sfmmu_mlist_exit(pml);
7672 * Set the given REF/MOD/RO bits for the given page.
7673 * For a vnode with a sorted v_pages list, we need to change
7674 * the attributes and the v_pages list together under page_vnode_mutex.
7676 void
7677 hat_page_setattr(page_t *pp, uint_t flag)
7679 vnode_t *vp = pp->p_vnode;
7680 page_t **listp;
7681 kmutex_t *pmtx;
7682 kmutex_t *vphm = NULL;
7683 int noshuffle;
7685 noshuffle = flag & P_NSH;
7686 flag &= ~P_NSH;
7688 ASSERT(!(flag & ~(P_MOD | P_REF | P_RO)));
7691 * nothing to do if attribute already set
7693 if ((pp->p_nrm & flag) == flag)
7694 return;
7696 if ((flag & P_MOD) != 0 && vp != NULL && IS_VMODSORT(vp) &&
7697 !noshuffle) {
7698 vphm = page_vnode_mutex(vp);
7699 mutex_enter(vphm);
7702 pmtx = sfmmu_page_enter(pp);
7703 pp->p_nrm |= flag;
7704 sfmmu_page_exit(pmtx);
7706 if (vphm != NULL) {
7708 * Some File Systems examine v_pages for NULL w/o
7709 * grabbing the vphm mutex. Must not let it become NULL when
7710 * pp is the only page on the list.
7712 if (pp->p_vpnext != pp) {
7713 page_vpsub(&vp->v_pages, pp);
7714 if (vp->v_pages != NULL)
7715 listp = &vp->v_pages->p_vpprev->p_vpnext;
7716 else
7717 listp = &vp->v_pages;
7718 page_vpadd(listp, pp);
7720 mutex_exit(vphm);
7724 void
7725 hat_page_clrattr(page_t *pp, uint_t flag)
7727 vnode_t *vp = pp->p_vnode;
7728 kmutex_t *pmtx;
7730 ASSERT(!(flag & ~(P_MOD | P_REF | P_RO)));
7732 pmtx = sfmmu_page_enter(pp);
7735 * Caller is expected to hold page's io lock for VMODSORT to work
7736 * correctly with pvn_vplist_dirty() and pvn_getdirty() when mod
7737 * bit is cleared.
7738 * We don't have assert to avoid tripping some existing third party
7739 * code. The dirty page is moved back to top of the v_page list
7740 * after IO is done in pvn_write_done().
7742 pp->p_nrm &= ~flag;
7743 sfmmu_page_exit(pmtx);
7745 if ((flag & P_MOD) != 0 && vp != NULL && IS_VMODSORT(vp)) {
7748 * VMODSORT works by removing write permissions and getting
7749 * a fault when a page is made dirty. At this point
7750 * we need to remove write permission from all mappings
7751 * to this page.
7753 hat_page_clrwrt(pp);
7757 uint_t
7758 hat_page_getattr(page_t *pp, uint_t flag)
7760 ASSERT(!(flag & ~(P_MOD | P_REF | P_RO)));
7761 return ((uint_t)(pp->p_nrm & flag));
7765 * DEBUG kernels: verify that a kernel va<->pa translation
7766 * is safe by checking the underlying page_t is in a page
7767 * relocation-safe state.
7769 #ifdef DEBUG
7770 void
7771 sfmmu_check_kpfn(pfn_t pfn)
7773 page_t *pp;
7774 int index, cons;
7776 if (hat_check_vtop == 0)
7777 return;
7779 if (kvseg.s_base == NULL || panicstr)
7780 return;
7782 pp = page_numtopp_nolock(pfn);
7783 if (!pp)
7784 return;
7786 if (PAGE_LOCKED(pp) || PP_ISNORELOC(pp))
7787 return;
7790 * Handed a large kernel page, we dig up the root page since we
7791 * know the root page might have the lock also.
7793 if (pp->p_szc != 0) {
7794 index = PP_MAPINDEX(pp);
7795 cons = TTE8K;
7796 again:
7797 while (index != 0) {
7798 index >>= 1;
7799 if (index != 0)
7800 cons++;
7801 if (index & 0x1) {
7802 pp = PP_GROUPLEADER(pp, cons);
7803 goto again;
7808 if (PAGE_LOCKED(pp) || PP_ISNORELOC(pp))
7809 return;
7812 * Pages need to be locked or allocated "permanent" (either from
7813 * static_arena arena or explicitly setting PG_NORELOC when calling
7814 * page_create_va()) for VA->PA translations to be valid.
7816 if (!PP_ISNORELOC(pp))
7817 panic("Illegal VA->PA translation, pp 0x%p not permanent",
7818 (void *)pp);
7819 else
7820 panic("Illegal VA->PA translation, pp 0x%p not locked",
7821 (void *)pp);
7823 #endif /* DEBUG */
7826 * Returns a page frame number for a given virtual address.
7827 * Returns PFN_INVALID to indicate an invalid mapping
7829 pfn_t
7830 hat_getpfnum(struct hat *hat, caddr_t addr)
7832 pfn_t pfn;
7833 tte_t tte;
7836 * We would like to
7837 * ASSERT(AS_LOCK_HELD(as));
7838 * but we can't because the iommu driver will call this
7839 * routine at interrupt time and it can't grab the as lock
7840 * or it will deadlock: A thread could have the as lock
7841 * and be waiting for io. The io can't complete
7842 * because the interrupt thread is blocked trying to grab
7843 * the as lock.
7846 if (hat == ksfmmup) {
7847 if (IS_KMEM_VA_LARGEPAGE(addr)) {
7848 ASSERT(segkmem_lpszc > 0);
7849 pfn = sfmmu_kvaszc2pfn(addr, segkmem_lpszc);
7850 if (pfn != PFN_INVALID) {
7851 sfmmu_check_kpfn(pfn);
7852 return (pfn);
7854 } else if (segkpm && IS_KPM_ADDR(addr)) {
7855 return (sfmmu_kpm_vatopfn(addr));
7857 while ((pfn = sfmmu_vatopfn(addr, ksfmmup, &tte))
7858 == PFN_SUSPENDED) {
7859 sfmmu_vatopfn_suspended(addr, ksfmmup, &tte);
7861 sfmmu_check_kpfn(pfn);
7862 return (pfn);
7863 } else {
7864 return (sfmmu_uvatopfn(addr, hat, NULL));
7869 * This routine will return both pfn and tte for the vaddr.
7871 static pfn_t
7872 sfmmu_uvatopfn(caddr_t vaddr, struct hat *sfmmup, tte_t *ttep)
7874 struct hmehash_bucket *hmebp;
7875 hmeblk_tag hblktag;
7876 int hmeshift, hashno = 1;
7877 struct hme_blk *hmeblkp = NULL;
7878 tte_t tte;
7880 struct sf_hment *sfhmep;
7881 pfn_t pfn;
7883 /* support for ISM */
7884 ism_map_t *ism_map;
7885 ism_blk_t *ism_blkp;
7886 int i;
7887 sfmmu_t *ism_hatid = NULL;
7888 sfmmu_t *locked_hatid = NULL;
7889 sfmmu_t *sv_sfmmup = sfmmup;
7890 caddr_t sv_vaddr = vaddr;
7891 sf_srd_t *srdp;
7893 if (ttep == NULL) {
7894 ttep = &tte;
7895 } else {
7896 ttep->ll = 0;
7899 ASSERT(sfmmup != ksfmmup);
7900 SFMMU_STAT(sf_user_vtop);
7902 * Set ism_hatid if vaddr falls in a ISM segment.
7904 ism_blkp = sfmmup->sfmmu_iblk;
7905 if (ism_blkp != NULL) {
7906 sfmmu_ismhat_enter(sfmmup, 0);
7907 locked_hatid = sfmmup;
7909 while (ism_blkp != NULL && ism_hatid == NULL) {
7910 ism_map = ism_blkp->iblk_maps;
7911 for (i = 0; ism_map[i].imap_ismhat && i < ISM_MAP_SLOTS; i++) {
7912 if (vaddr >= ism_start(ism_map[i]) &&
7913 vaddr < ism_end(ism_map[i])) {
7914 sfmmup = ism_hatid = ism_map[i].imap_ismhat;
7915 vaddr = (caddr_t)(vaddr -
7916 ism_start(ism_map[i]));
7917 break;
7920 ism_blkp = ism_blkp->iblk_next;
7922 if (locked_hatid) {
7923 sfmmu_ismhat_exit(locked_hatid, 0);
7926 hblktag.htag_id = sfmmup;
7927 hblktag.htag_rid = SFMMU_INVALID_SHMERID;
7928 do {
7929 hmeshift = HME_HASH_SHIFT(hashno);
7930 hblktag.htag_bspage = HME_HASH_BSPAGE(vaddr, hmeshift);
7931 hblktag.htag_rehash = hashno;
7932 hmebp = HME_HASH_FUNCTION(sfmmup, vaddr, hmeshift);
7934 SFMMU_HASH_LOCK(hmebp);
7936 HME_HASH_FAST_SEARCH(hmebp, hblktag, hmeblkp);
7937 if (hmeblkp != NULL) {
7938 ASSERT(!hmeblkp->hblk_shared);
7939 HBLKTOHME(sfhmep, hmeblkp, vaddr);
7940 sfmmu_copytte(&sfhmep->hme_tte, ttep);
7941 SFMMU_HASH_UNLOCK(hmebp);
7942 if (TTE_IS_VALID(ttep)) {
7943 pfn = TTE_TO_PFN(vaddr, ttep);
7944 return (pfn);
7946 break;
7948 SFMMU_HASH_UNLOCK(hmebp);
7949 hashno++;
7950 } while (HME_REHASH(sfmmup) && (hashno <= mmu_hashcnt));
7952 if (SF_HMERGNMAP_ISNULL(sv_sfmmup)) {
7953 return (PFN_INVALID);
7955 srdp = sv_sfmmup->sfmmu_srdp;
7956 ASSERT(srdp != NULL);
7957 ASSERT(srdp->srd_refcnt != 0);
7958 hblktag.htag_id = srdp;
7959 hashno = 1;
7960 do {
7961 hmeshift = HME_HASH_SHIFT(hashno);
7962 hblktag.htag_bspage = HME_HASH_BSPAGE(sv_vaddr, hmeshift);
7963 hblktag.htag_rehash = hashno;
7964 hmebp = HME_HASH_FUNCTION(srdp, sv_vaddr, hmeshift);
7966 SFMMU_HASH_LOCK(hmebp);
7967 for (hmeblkp = hmebp->hmeblkp; hmeblkp != NULL;
7968 hmeblkp = hmeblkp->hblk_next) {
7969 uint_t rid;
7970 sf_region_t *rgnp;
7971 caddr_t rsaddr;
7972 caddr_t readdr;
7974 if (!HTAGS_EQ_SHME(hmeblkp->hblk_tag, hblktag,
7975 sv_sfmmup->sfmmu_hmeregion_map)) {
7976 continue;
7978 ASSERT(hmeblkp->hblk_shared);
7979 rid = hmeblkp->hblk_tag.htag_rid;
7980 ASSERT(SFMMU_IS_SHMERID_VALID(rid));
7981 ASSERT(rid < SFMMU_MAX_HME_REGIONS);
7982 rgnp = srdp->srd_hmergnp[rid];
7983 SFMMU_VALIDATE_SHAREDHBLK(hmeblkp, srdp, rgnp, rid);
7984 HBLKTOHME(sfhmep, hmeblkp, sv_vaddr);
7985 sfmmu_copytte(&sfhmep->hme_tte, ttep);
7986 rsaddr = rgnp->rgn_saddr;
7987 readdr = rsaddr + rgnp->rgn_size;
7988 #ifdef DEBUG
7989 if (TTE_IS_VALID(ttep) ||
7990 get_hblk_ttesz(hmeblkp) > TTE8K) {
7991 caddr_t eva = tte_to_evaddr(hmeblkp, ttep);
7992 ASSERT(eva > sv_vaddr);
7993 ASSERT(sv_vaddr >= rsaddr);
7994 ASSERT(sv_vaddr < readdr);
7995 ASSERT(eva <= readdr);
7997 #endif /* DEBUG */
7999 * Continue the search if we
8000 * found an invalid 8K tte outside of the area
8001 * covered by this hmeblk's region.
8003 if (TTE_IS_VALID(ttep)) {
8004 SFMMU_HASH_UNLOCK(hmebp);
8005 pfn = TTE_TO_PFN(sv_vaddr, ttep);
8006 return (pfn);
8007 } else if (get_hblk_ttesz(hmeblkp) > TTE8K ||
8008 (sv_vaddr >= rsaddr && sv_vaddr < readdr)) {
8009 SFMMU_HASH_UNLOCK(hmebp);
8010 pfn = PFN_INVALID;
8011 return (pfn);
8014 SFMMU_HASH_UNLOCK(hmebp);
8015 hashno++;
8016 } while (hashno <= mmu_hashcnt);
8017 return (PFN_INVALID);
8022 * For compatability with AT&T and later optimizations
8024 /* ARGSUSED */
8025 void
8026 hat_map(struct hat *hat, caddr_t addr, size_t len, uint_t flags)
8028 ASSERT(hat != NULL);
8032 * Return the number of mappings to a particular page. This number is an
8033 * approximation of the number of people sharing the page.
8035 * shared hmeblks or ism hmeblks are counted as 1 mapping here.
8036 * hat_page_checkshare() can be used to compare threshold to share
8037 * count that reflects the number of region sharers albeit at higher cost.
8039 ulong_t
8040 hat_page_getshare(page_t *pp)
8042 page_t *spp = pp; /* start page */
8043 kmutex_t *pml;
8044 ulong_t cnt;
8045 int index, sz = TTE64K;
8048 * We need to grab the mlist lock to make sure any outstanding
8049 * load/unloads complete. Otherwise we could return zero
8050 * even though the unload(s) hasn't finished yet.
8052 pml = sfmmu_mlist_enter(spp);
8053 cnt = spp->p_share;
8055 #ifdef VAC
8056 if (kpm_enable)
8057 cnt += spp->p_kpmref;
8058 #endif
8059 if (vpm_enable && pp->p_vpmref) {
8060 cnt += 1;
8064 * If we have any large mappings, we count the number of
8065 * mappings that this large page is part of.
8067 index = PP_MAPINDEX(spp);
8068 index >>= 1;
8069 while (index) {
8070 pp = PP_GROUPLEADER(spp, sz);
8071 if ((index & 0x1) && pp != spp) {
8072 cnt += pp->p_share;
8073 spp = pp;
8075 index >>= 1;
8076 sz++;
8078 sfmmu_mlist_exit(pml);
8079 return (cnt);
8083 * Return 1 if the number of mappings exceeds sh_thresh. Return 0
8084 * otherwise. Count shared hmeblks by region's refcnt.
8087 hat_page_checkshare(page_t *pp, ulong_t sh_thresh)
8089 kmutex_t *pml;
8090 ulong_t cnt = 0;
8091 int index, sz = TTE8K;
8092 struct sf_hment *sfhme, *tmphme = NULL;
8093 struct hme_blk *hmeblkp;
8095 pml = sfmmu_mlist_enter(pp);
8097 #ifdef VAC
8098 if (kpm_enable)
8099 cnt = pp->p_kpmref;
8100 #endif
8102 if (vpm_enable && pp->p_vpmref) {
8103 cnt += 1;
8106 if (pp->p_share + cnt > sh_thresh) {
8107 sfmmu_mlist_exit(pml);
8108 return (1);
8111 index = PP_MAPINDEX(pp);
8113 again:
8114 for (sfhme = pp->p_mapping; sfhme; sfhme = tmphme) {
8115 tmphme = sfhme->hme_next;
8116 if (IS_PAHME(sfhme)) {
8117 continue;
8120 hmeblkp = sfmmu_hmetohblk(sfhme);
8121 if (hme_size(sfhme) != sz) {
8122 continue;
8125 if (hmeblkp->hblk_shared) {
8126 sf_srd_t *srdp = hblktosrd(hmeblkp);
8127 uint_t rid = hmeblkp->hblk_tag.htag_rid;
8128 sf_region_t *rgnp;
8129 ASSERT(SFMMU_IS_SHMERID_VALID(rid));
8130 ASSERT(rid < SFMMU_MAX_HME_REGIONS);
8131 ASSERT(srdp != NULL);
8132 rgnp = srdp->srd_hmergnp[rid];
8133 SFMMU_VALIDATE_SHAREDHBLK(hmeblkp, srdp,
8134 rgnp, rid);
8135 cnt += rgnp->rgn_refcnt;
8136 } else {
8137 cnt++;
8139 if (cnt > sh_thresh) {
8140 sfmmu_mlist_exit(pml);
8141 return (1);
8145 index >>= 1;
8146 sz++;
8147 while (index) {
8148 pp = PP_GROUPLEADER(pp, sz);
8149 ASSERT(sfmmu_mlist_held(pp));
8150 if (index & 0x1) {
8151 goto again;
8153 index >>= 1;
8154 sz++;
8156 sfmmu_mlist_exit(pml);
8157 return (0);
8161 * Unload all large mappings to the pp and reset the p_szc field of every
8162 * constituent page according to the remaining mappings.
8164 * pp must be locked SE_EXCL. Even though no other constituent pages are
8165 * locked it's legal to unload the large mappings to the pp because all
8166 * constituent pages of large locked mappings have to be locked SE_SHARED.
8167 * This means if we have SE_EXCL lock on one of constituent pages none of the
8168 * large mappings to pp are locked.
8170 * Decrease p_szc field starting from the last constituent page and ending
8171 * with the root page. This method is used because other threads rely on the
8172 * root's p_szc to find the lock to syncronize on. After a root page_t's p_szc
8173 * is demoted then other threads will succeed in sfmmu_mlspl_enter(). This
8174 * ensures that p_szc changes of the constituent pages appears atomic for all
8175 * threads that use sfmmu_mlspl_enter() to examine p_szc field.
8177 * This mechanism is only used for file system pages where it's not always
8178 * possible to get SE_EXCL locks on all constituent pages to demote the size
8179 * code (as is done for anonymous or kernel large pages).
8181 * See more comments in front of sfmmu_mlspl_enter().
8183 void
8184 hat_page_demote(page_t *pp)
8186 int index;
8187 int sz;
8188 cpuset_t cpuset;
8189 int sync = 0;
8190 page_t *rootpp;
8191 struct sf_hment *sfhme;
8192 struct sf_hment *tmphme = NULL;
8193 struct hme_blk *hmeblkp;
8194 uint_t pszc;
8195 page_t *lastpp;
8196 cpuset_t tset;
8197 pgcnt_t npgs;
8198 kmutex_t *pml;
8199 kmutex_t *pmtx = NULL;
8201 ASSERT(PAGE_EXCL(pp));
8202 ASSERT(!PP_ISFREE(pp));
8203 ASSERT(!PP_ISKAS(pp));
8204 ASSERT(page_szc_lock_assert(pp));
8205 pml = sfmmu_mlist_enter(pp);
8207 pszc = pp->p_szc;
8208 if (pszc == 0) {
8209 goto out;
8212 index = PP_MAPINDEX(pp) >> 1;
8214 if (index) {
8215 CPUSET_ZERO(cpuset);
8216 sz = TTE64K;
8217 sync = 1;
8220 while (index) {
8221 if (!(index & 0x1)) {
8222 index >>= 1;
8223 sz++;
8224 continue;
8226 ASSERT(sz <= pszc);
8227 rootpp = PP_GROUPLEADER(pp, sz);
8228 for (sfhme = rootpp->p_mapping; sfhme; sfhme = tmphme) {
8229 tmphme = sfhme->hme_next;
8230 ASSERT(!IS_PAHME(sfhme));
8231 hmeblkp = sfmmu_hmetohblk(sfhme);
8232 if (hme_size(sfhme) != sz) {
8233 continue;
8235 tset = sfmmu_pageunload(rootpp, sfhme, sz);
8236 CPUSET_OR(cpuset, tset);
8238 if (index >>= 1) {
8239 sz++;
8243 ASSERT(!PP_ISMAPPED_LARGE(pp));
8245 if (sync) {
8246 xt_sync(cpuset);
8247 #ifdef VAC
8248 if (PP_ISTNC(pp)) {
8249 conv_tnc(rootpp, sz);
8251 #endif /* VAC */
8254 pmtx = sfmmu_page_enter(pp);
8256 ASSERT(pp->p_szc == pszc);
8257 rootpp = PP_PAGEROOT(pp);
8258 ASSERT(rootpp->p_szc == pszc);
8259 lastpp = PP_PAGENEXT_N(rootpp, TTEPAGES(pszc) - 1);
8261 while (lastpp != rootpp) {
8262 sz = PP_MAPINDEX(lastpp) ? fnd_mapping_sz(lastpp) : 0;
8263 ASSERT(sz < pszc);
8264 npgs = (sz == 0) ? 1 : TTEPAGES(sz);
8265 ASSERT(P2PHASE(lastpp->p_pagenum, npgs) == npgs - 1);
8266 while (--npgs > 0) {
8267 lastpp->p_szc = (uchar_t)sz;
8268 lastpp = PP_PAGEPREV(lastpp);
8270 if (sz) {
8272 * make sure before current root's pszc
8273 * is updated all updates to constituent pages pszc
8274 * fields are globally visible.
8276 membar_producer();
8278 lastpp->p_szc = sz;
8279 ASSERT(IS_P2ALIGNED(lastpp->p_pagenum, TTEPAGES(sz)));
8280 if (lastpp != rootpp) {
8281 lastpp = PP_PAGEPREV(lastpp);
8284 if (sz == 0) {
8285 /* the loop above doesn't cover this case */
8286 rootpp->p_szc = 0;
8288 out:
8289 ASSERT(pp->p_szc == 0);
8290 if (pmtx != NULL) {
8291 sfmmu_page_exit(pmtx);
8293 sfmmu_mlist_exit(pml);
8297 * Refresh the HAT ismttecnt[] element for size szc.
8298 * Caller must have set ISM busy flag to prevent mapping
8299 * lists from changing while we're traversing them.
8301 pgcnt_t
8302 ism_tsb_entries(sfmmu_t *sfmmup, int szc)
8304 ism_blk_t *ism_blkp = sfmmup->sfmmu_iblk;
8305 ism_map_t *ism_map;
8306 pgcnt_t npgs = 0;
8307 pgcnt_t npgs_scd = 0;
8308 int j;
8309 sf_scd_t *scdp;
8310 uchar_t rid;
8312 ASSERT(SFMMU_FLAGS_ISSET(sfmmup, HAT_ISMBUSY));
8313 scdp = sfmmup->sfmmu_scdp;
8315 for (; ism_blkp != NULL; ism_blkp = ism_blkp->iblk_next) {
8316 ism_map = ism_blkp->iblk_maps;
8317 for (j = 0; ism_map[j].imap_ismhat && j < ISM_MAP_SLOTS; j++) {
8318 rid = ism_map[j].imap_rid;
8319 ASSERT(rid == SFMMU_INVALID_ISMRID ||
8320 rid < sfmmup->sfmmu_srdp->srd_next_ismrid);
8322 if (scdp != NULL && rid != SFMMU_INVALID_ISMRID &&
8323 SF_RGNMAP_TEST(scdp->scd_ismregion_map, rid)) {
8324 /* ISM is in sfmmup's SCD */
8325 npgs_scd +=
8326 ism_map[j].imap_ismhat->sfmmu_ttecnt[szc];
8327 } else {
8328 /* ISMs is not in SCD */
8329 npgs +=
8330 ism_map[j].imap_ismhat->sfmmu_ttecnt[szc];
8334 sfmmup->sfmmu_ismttecnt[szc] = npgs;
8335 sfmmup->sfmmu_scdismttecnt[szc] = npgs_scd;
8336 return (npgs);
8340 * Yield the memory claim requirement for an address space.
8342 * This is currently implemented as the number of bytes that have active
8343 * hardware translations that have page structures. Therefore, it can
8344 * underestimate the traditional resident set size, eg, if the
8345 * physical page is present and the hardware translation is missing;
8346 * and it can overestimate the rss, eg, if there are active
8347 * translations to a frame buffer with page structs.
8348 * Also, it does not take sharing into account.
8350 * Note that we don't acquire locks here since this function is most often
8351 * called from the clock thread.
8353 size_t
8354 hat_get_mapped_size(struct hat *hat)
8356 size_t assize = 0;
8357 int i;
8359 if (hat == NULL)
8360 return (0);
8362 for (i = 0; i < mmu_page_sizes; i++)
8363 assize += ((pgcnt_t)hat->sfmmu_ttecnt[i] +
8364 (pgcnt_t)hat->sfmmu_scdrttecnt[i]) * TTEBYTES(i);
8366 if (hat->sfmmu_iblk == NULL)
8367 return (assize);
8369 for (i = 0; i < mmu_page_sizes; i++)
8370 assize += ((pgcnt_t)hat->sfmmu_ismttecnt[i] +
8371 (pgcnt_t)hat->sfmmu_scdismttecnt[i]) * TTEBYTES(i);
8373 return (assize);
8377 hat_stats_enable(struct hat *hat)
8379 hatlock_t *hatlockp;
8381 hatlockp = sfmmu_hat_enter(hat);
8382 hat->sfmmu_rmstat++;
8383 sfmmu_hat_exit(hatlockp);
8384 return (1);
8387 void
8388 hat_stats_disable(struct hat *hat)
8390 hatlock_t *hatlockp;
8392 hatlockp = sfmmu_hat_enter(hat);
8393 hat->sfmmu_rmstat--;
8394 sfmmu_hat_exit(hatlockp);
8398 * Routines for entering or removing ourselves from the
8399 * ism_hat's mapping list. This is used for both private and
8400 * SCD hats.
8402 static void
8403 iment_add(struct ism_ment *iment, struct hat *ism_hat)
8405 ASSERT(MUTEX_HELD(&ism_mlist_lock));
8407 iment->iment_prev = NULL;
8408 iment->iment_next = ism_hat->sfmmu_iment;
8409 if (ism_hat->sfmmu_iment) {
8410 ism_hat->sfmmu_iment->iment_prev = iment;
8412 ism_hat->sfmmu_iment = iment;
8415 static void
8416 iment_sub(struct ism_ment *iment, struct hat *ism_hat)
8418 ASSERT(MUTEX_HELD(&ism_mlist_lock));
8420 if (ism_hat->sfmmu_iment == NULL) {
8421 panic("ism map entry remove - no entries");
8424 if (iment->iment_prev) {
8425 ASSERT(ism_hat->sfmmu_iment != iment);
8426 iment->iment_prev->iment_next = iment->iment_next;
8427 } else {
8428 ASSERT(ism_hat->sfmmu_iment == iment);
8429 ism_hat->sfmmu_iment = iment->iment_next;
8432 if (iment->iment_next) {
8433 iment->iment_next->iment_prev = iment->iment_prev;
8437 * zero out the entry
8439 iment->iment_next = NULL;
8440 iment->iment_prev = NULL;
8441 iment->iment_hat = NULL;
8442 iment->iment_base_va = 0;
8446 * Hat_share()/unshare() return an (non-zero) error
8447 * when saddr and daddr are not properly aligned.
8449 * The top level mapping element determines the alignment
8450 * requirement for saddr and daddr, depending on different
8451 * architectures.
8453 * When hat_share()/unshare() are not supported,
8454 * HATOP_SHARE()/UNSHARE() return 0
8457 hat_share(struct hat *sfmmup, caddr_t addr,
8458 struct hat *ism_hatid, caddr_t sptaddr, size_t len, uint_t ismszc)
8460 ism_blk_t *ism_blkp;
8461 ism_blk_t *new_iblk;
8462 ism_map_t *ism_map;
8463 ism_ment_t *ism_ment;
8464 int i, added;
8465 hatlock_t *hatlockp;
8466 int reload_mmu = 0;
8467 uint_t ismshift = page_get_shift(ismszc);
8468 size_t ismpgsz = page_get_pagesize(ismszc);
8469 uint_t ismmask = (uint_t)ismpgsz - 1;
8470 size_t sh_size = ISM_SHIFT(ismshift, len);
8471 ushort_t ismhatflag;
8472 hat_region_cookie_t rcookie;
8473 sf_scd_t *old_scdp;
8475 #ifdef DEBUG
8476 caddr_t eaddr = addr + len;
8477 #endif /* DEBUG */
8479 ASSERT(ism_hatid != NULL && sfmmup != NULL);
8480 ASSERT(sptaddr == ISMID_STARTADDR);
8482 * Check the alignment.
8484 if (!ISM_ALIGNED(ismshift, addr) || !ISM_ALIGNED(ismshift, sptaddr))
8485 return (EINVAL);
8488 * Check size alignment.
8490 if (!ISM_ALIGNED(ismshift, len))
8491 return (EINVAL);
8494 * Allocate ism_ment for the ism_hat's mapping list, and an
8495 * ism map blk in case we need one. We must do our
8496 * allocations before acquiring locks to prevent a deadlock
8497 * in the kmem allocator on the mapping list lock.
8499 new_iblk = kmem_cache_alloc(ism_blk_cache, KM_SLEEP);
8500 ism_ment = kmem_cache_alloc(ism_ment_cache, KM_SLEEP);
8503 * Serialize ISM mappings with the ISM busy flag, and also the
8504 * trap handlers.
8506 sfmmu_ismhat_enter(sfmmup, 0);
8509 * Allocate an ism map blk if necessary.
8511 if (sfmmup->sfmmu_iblk == NULL) {
8512 sfmmup->sfmmu_iblk = new_iblk;
8513 bzero(new_iblk, sizeof (*new_iblk));
8514 new_iblk->iblk_nextpa = (uint64_t)-1;
8515 membar_stst(); /* make sure next ptr visible to all CPUs */
8516 sfmmup->sfmmu_ismblkpa = va_to_pa((caddr_t)new_iblk);
8517 reload_mmu = 1;
8518 new_iblk = NULL;
8521 #ifdef DEBUG
8523 * Make sure mapping does not already exist.
8525 ism_blkp = sfmmup->sfmmu_iblk;
8526 while (ism_blkp != NULL) {
8527 ism_map = ism_blkp->iblk_maps;
8528 for (i = 0; i < ISM_MAP_SLOTS && ism_map[i].imap_ismhat; i++) {
8529 if ((addr >= ism_start(ism_map[i]) &&
8530 addr < ism_end(ism_map[i])) ||
8531 eaddr > ism_start(ism_map[i]) &&
8532 eaddr <= ism_end(ism_map[i])) {
8533 panic("sfmmu_share: Already mapped!");
8536 ism_blkp = ism_blkp->iblk_next;
8538 #endif /* DEBUG */
8540 ASSERT(ismszc >= TTE4M);
8541 if (ismszc == TTE4M) {
8542 ismhatflag = HAT_4M_FLAG;
8543 } else if (ismszc == TTE32M) {
8544 ismhatflag = HAT_32M_FLAG;
8545 } else if (ismszc == TTE256M) {
8546 ismhatflag = HAT_256M_FLAG;
8549 * Add mapping to first available mapping slot.
8551 ism_blkp = sfmmup->sfmmu_iblk;
8552 added = 0;
8553 while (!added) {
8554 ism_map = ism_blkp->iblk_maps;
8555 for (i = 0; i < ISM_MAP_SLOTS; i++) {
8556 if (ism_map[i].imap_ismhat == NULL) {
8558 ism_map[i].imap_ismhat = ism_hatid;
8559 ism_map[i].imap_vb_shift = (uchar_t)ismshift;
8560 ism_map[i].imap_rid = SFMMU_INVALID_ISMRID;
8561 ism_map[i].imap_hatflags = ismhatflag;
8562 ism_map[i].imap_sz_mask = ismmask;
8564 * imap_seg is checked in ISM_CHECK to see if
8565 * non-NULL, then other info assumed valid.
8567 membar_stst();
8568 ism_map[i].imap_seg = (uintptr_t)addr | sh_size;
8569 ism_map[i].imap_ment = ism_ment;
8572 * Now add ourselves to the ism_hat's
8573 * mapping list.
8575 ism_ment->iment_hat = sfmmup;
8576 ism_ment->iment_base_va = addr;
8577 ism_hatid->sfmmu_ismhat = 1;
8578 mutex_enter(&ism_mlist_lock);
8579 iment_add(ism_ment, ism_hatid);
8580 mutex_exit(&ism_mlist_lock);
8581 added = 1;
8582 break;
8585 if (!added && ism_blkp->iblk_next == NULL) {
8586 ism_blkp->iblk_next = new_iblk;
8587 new_iblk = NULL;
8588 bzero(ism_blkp->iblk_next,
8589 sizeof (*ism_blkp->iblk_next));
8590 ism_blkp->iblk_next->iblk_nextpa = (uint64_t)-1;
8591 membar_stst();
8592 ism_blkp->iblk_nextpa =
8593 va_to_pa((caddr_t)ism_blkp->iblk_next);
8595 ism_blkp = ism_blkp->iblk_next;
8599 * After calling hat_join_region, sfmmup may join a new SCD or
8600 * move from the old scd to a new scd, in which case, we want to
8601 * shrink the sfmmup's private tsb size, i.e., pass shrink to
8602 * sfmmu_check_page_sizes at the end of this routine.
8604 old_scdp = sfmmup->sfmmu_scdp;
8606 rcookie = hat_join_region(sfmmup, addr, len, (void *)ism_hatid, 0,
8607 PROT_ALL, ismszc, NULL, HAT_REGION_ISM);
8608 if (rcookie != HAT_INVALID_REGION_COOKIE) {
8609 ism_map[i].imap_rid = (uchar_t)((uint64_t)rcookie);
8612 * Update our counters for this sfmmup's ism mappings.
8614 for (i = 0; i <= ismszc; i++) {
8615 if (!(disable_ism_large_pages & (1 << i)))
8616 (void) ism_tsb_entries(sfmmup, i);
8620 * For ISM and DISM we do not support 512K pages, so we only only
8621 * search the 4M and 8K/64K hashes for 4 pagesize cpus, and search the
8622 * 256M or 32M, and 4M and 8K/64K hashes for 6 pagesize cpus.
8624 * Need to set 32M/256M ISM flags to make sure
8625 * sfmmu_check_page_sizes() enables them on Panther.
8627 ASSERT((disable_ism_large_pages & (1 << TTE512K)) != 0);
8629 switch (ismszc) {
8630 case TTE256M:
8631 if (!SFMMU_FLAGS_ISSET(sfmmup, HAT_256M_ISM)) {
8632 hatlockp = sfmmu_hat_enter(sfmmup);
8633 SFMMU_FLAGS_SET(sfmmup, HAT_256M_ISM);
8634 sfmmu_hat_exit(hatlockp);
8636 break;
8637 case TTE32M:
8638 if (!SFMMU_FLAGS_ISSET(sfmmup, HAT_32M_ISM)) {
8639 hatlockp = sfmmu_hat_enter(sfmmup);
8640 SFMMU_FLAGS_SET(sfmmup, HAT_32M_ISM);
8641 sfmmu_hat_exit(hatlockp);
8643 break;
8644 default:
8645 break;
8649 * If we updated the ismblkpa for this HAT we must make
8650 * sure all CPUs running this process reload their tsbmiss area.
8651 * Otherwise they will fail to load the mappings in the tsbmiss
8652 * handler and will loop calling pagefault().
8654 if (reload_mmu) {
8655 hatlockp = sfmmu_hat_enter(sfmmup);
8656 sfmmu_sync_mmustate(sfmmup);
8657 sfmmu_hat_exit(hatlockp);
8660 sfmmu_ismhat_exit(sfmmup, 0);
8663 * Free up ismblk if we didn't use it.
8665 if (new_iblk != NULL)
8666 kmem_cache_free(ism_blk_cache, new_iblk);
8669 * Check TSB and TLB page sizes.
8671 if (sfmmup->sfmmu_scdp != NULL && old_scdp != sfmmup->sfmmu_scdp) {
8672 sfmmu_check_page_sizes(sfmmup, 0);
8673 } else {
8674 sfmmu_check_page_sizes(sfmmup, 1);
8676 return (0);
8680 * hat_unshare removes exactly one ism_map from
8681 * this process's as. It expects multiple calls
8682 * to hat_unshare for multiple shm segments.
8684 void
8685 hat_unshare(struct hat *sfmmup, caddr_t addr, size_t len, uint_t ismszc)
8687 ism_map_t *ism_map;
8688 ism_ment_t *free_ment = NULL;
8689 ism_blk_t *ism_blkp;
8690 struct hat *ism_hatid;
8691 int found, i;
8692 hatlock_t *hatlockp;
8693 struct tsb_info *tsbinfo;
8694 uint_t ismshift = page_get_shift(ismszc);
8695 size_t sh_size = ISM_SHIFT(ismshift, len);
8696 uchar_t ism_rid;
8697 sf_scd_t *old_scdp;
8699 ASSERT(ISM_ALIGNED(ismshift, addr));
8700 ASSERT(ISM_ALIGNED(ismshift, len));
8701 ASSERT(sfmmup != NULL);
8702 ASSERT(sfmmup != ksfmmup);
8704 ASSERT(sfmmup->sfmmu_as != NULL);
8707 * Make sure that during the entire time ISM mappings are removed,
8708 * the trap handlers serialize behind us, and that no one else
8709 * can be mucking with ISM mappings. This also lets us get away
8710 * with not doing expensive cross calls to flush the TLB -- we
8711 * just discard the context, flush the entire TSB, and call it
8712 * a day.
8714 sfmmu_ismhat_enter(sfmmup, 0);
8717 * Remove the mapping.
8719 * We can't have any holes in the ism map.
8720 * The tsb miss code while searching the ism map will
8721 * stop on an empty map slot. So we must move
8722 * everyone past the hole up 1 if any.
8724 * Also empty ism map blks are not freed until the
8725 * process exits. This is to prevent a MT race condition
8726 * between sfmmu_unshare() and sfmmu_tsbmiss_exception().
8728 found = 0;
8729 ism_blkp = sfmmup->sfmmu_iblk;
8730 while (!found && ism_blkp != NULL) {
8731 ism_map = ism_blkp->iblk_maps;
8732 for (i = 0; i < ISM_MAP_SLOTS; i++) {
8733 if (addr == ism_start(ism_map[i]) &&
8734 sh_size == (size_t)(ism_size(ism_map[i]))) {
8735 found = 1;
8736 break;
8739 if (!found)
8740 ism_blkp = ism_blkp->iblk_next;
8743 if (found) {
8744 ism_hatid = ism_map[i].imap_ismhat;
8745 ism_rid = ism_map[i].imap_rid;
8746 ASSERT(ism_hatid != NULL);
8747 ASSERT(ism_hatid->sfmmu_ismhat == 1);
8750 * After hat_leave_region, the sfmmup may leave SCD,
8751 * in which case, we want to grow the private tsb size when
8752 * calling sfmmu_check_page_sizes at the end of the routine.
8754 old_scdp = sfmmup->sfmmu_scdp;
8756 * Then remove ourselves from the region.
8758 if (ism_rid != SFMMU_INVALID_ISMRID) {
8759 hat_leave_region(sfmmup, (void *)((uint64_t)ism_rid),
8760 HAT_REGION_ISM);
8764 * And now guarantee that any other cpu
8765 * that tries to process an ISM miss
8766 * will go to tl=0.
8768 hatlockp = sfmmu_hat_enter(sfmmup);
8769 sfmmu_invalidate_ctx(sfmmup);
8770 sfmmu_hat_exit(hatlockp);
8773 * Remove ourselves from the ism mapping list.
8775 mutex_enter(&ism_mlist_lock);
8776 iment_sub(ism_map[i].imap_ment, ism_hatid);
8777 mutex_exit(&ism_mlist_lock);
8778 free_ment = ism_map[i].imap_ment;
8781 * We delete the ism map by copying
8782 * the next map over the current one.
8783 * We will take the next one in the maps
8784 * array or from the next ism_blk.
8786 while (ism_blkp != NULL) {
8787 ism_map = ism_blkp->iblk_maps;
8788 while (i < (ISM_MAP_SLOTS - 1)) {
8789 ism_map[i] = ism_map[i + 1];
8790 i++;
8792 /* i == (ISM_MAP_SLOTS - 1) */
8793 ism_blkp = ism_blkp->iblk_next;
8794 if (ism_blkp != NULL) {
8795 ism_map[i] = ism_blkp->iblk_maps[0];
8796 i = 0;
8797 } else {
8798 ism_map[i].imap_seg = 0;
8799 ism_map[i].imap_vb_shift = 0;
8800 ism_map[i].imap_rid = SFMMU_INVALID_ISMRID;
8801 ism_map[i].imap_hatflags = 0;
8802 ism_map[i].imap_sz_mask = 0;
8803 ism_map[i].imap_ismhat = NULL;
8804 ism_map[i].imap_ment = NULL;
8809 * Now flush entire TSB for the process, since
8810 * demapping page by page can be too expensive.
8811 * We don't have to flush the TLB here anymore
8812 * since we switch to a new TLB ctx instead.
8813 * Also, there is no need to flush if the process
8814 * is exiting since the TSB will be freed later.
8816 if (!sfmmup->sfmmu_free) {
8817 hatlockp = sfmmu_hat_enter(sfmmup);
8818 for (tsbinfo = sfmmup->sfmmu_tsb; tsbinfo != NULL;
8819 tsbinfo = tsbinfo->tsb_next) {
8820 if (tsbinfo->tsb_flags & TSB_SWAPPED)
8821 continue;
8822 if (tsbinfo->tsb_flags & TSB_RELOC_FLAG) {
8823 tsbinfo->tsb_flags |=
8824 TSB_FLUSH_NEEDED;
8825 continue;
8828 sfmmu_inv_tsb(tsbinfo->tsb_va,
8829 TSB_BYTES(tsbinfo->tsb_szc));
8831 sfmmu_hat_exit(hatlockp);
8836 * Update our counters for this sfmmup's ism mappings.
8838 for (i = 0; i <= ismszc; i++) {
8839 if (!(disable_ism_large_pages & (1 << i)))
8840 (void) ism_tsb_entries(sfmmup, i);
8843 sfmmu_ismhat_exit(sfmmup, 0);
8846 * We must do our freeing here after dropping locks
8847 * to prevent a deadlock in the kmem allocator on the
8848 * mapping list lock.
8850 if (free_ment != NULL)
8851 kmem_cache_free(ism_ment_cache, free_ment);
8854 * Check TSB and TLB page sizes if the process isn't exiting.
8856 if (!sfmmup->sfmmu_free) {
8857 if (found && old_scdp != NULL && sfmmup->sfmmu_scdp == NULL) {
8858 sfmmu_check_page_sizes(sfmmup, 1);
8859 } else {
8860 sfmmu_check_page_sizes(sfmmup, 0);
8865 /* ARGSUSED */
8866 static int
8867 sfmmu_idcache_constructor(void *buf, void *cdrarg, int kmflags)
8869 /* void *buf is sfmmu_t pointer */
8870 bzero(buf, sizeof (sfmmu_t));
8872 return (0);
8875 /* ARGSUSED */
8876 static void
8877 sfmmu_idcache_destructor(void *buf, void *cdrarg)
8879 /* void *buf is sfmmu_t pointer */
8883 * setup kmem hmeblks by bzeroing all members and initializing the nextpa
8884 * field to be the pa of this hmeblk
8886 /* ARGSUSED */
8887 static int
8888 sfmmu_hblkcache_constructor(void *buf, void *cdrarg, int kmflags)
8890 struct hme_blk *hmeblkp;
8892 bzero(buf, (size_t)cdrarg);
8893 hmeblkp = (struct hme_blk *)buf;
8894 hmeblkp->hblk_nextpa = va_to_pa((caddr_t)hmeblkp);
8896 #ifdef HBLK_TRACE
8897 mutex_init(&hmeblkp->hblk_audit_lock, NULL, MUTEX_DEFAULT, NULL);
8898 #endif /* HBLK_TRACE */
8900 return (0);
8903 /* ARGSUSED */
8904 static void
8905 sfmmu_hblkcache_destructor(void *buf, void *cdrarg)
8908 #ifdef HBLK_TRACE
8910 struct hme_blk *hmeblkp;
8912 hmeblkp = (struct hme_blk *)buf;
8913 mutex_destroy(&hmeblkp->hblk_audit_lock);
8915 #endif /* HBLK_TRACE */
8918 #define SFMMU_CACHE_RECLAIM_SCAN_RATIO 8
8919 static int sfmmu_cache_reclaim_scan_ratio = SFMMU_CACHE_RECLAIM_SCAN_RATIO;
8921 * The kmem allocator will callback into our reclaim routine when the system
8922 * is running low in memory. We traverse the hash and free up all unused but
8923 * still cached hme_blks. We also traverse the free list and free them up
8924 * as well.
8926 /*ARGSUSED*/
8927 static void
8928 sfmmu_hblkcache_reclaim(void *cdrarg)
8930 int i;
8931 struct hmehash_bucket *hmebp;
8932 struct hme_blk *hmeblkp, *nx_hblk, *pr_hblk = NULL;
8933 static struct hmehash_bucket *uhmehash_reclaim_hand;
8934 static struct hmehash_bucket *khmehash_reclaim_hand;
8935 struct hme_blk *list = NULL, *last_hmeblkp;
8936 cpuset_t cpuset = cpu_ready_set;
8937 cpu_hme_pend_t *cpuhp;
8939 /* Free up hmeblks on the cpu pending lists */
8940 for (i = 0; i < NCPU; i++) {
8941 cpuhp = &cpu_hme_pend[i];
8942 if (cpuhp->chp_listp != NULL) {
8943 mutex_enter(&cpuhp->chp_mutex);
8944 if (cpuhp->chp_listp == NULL) {
8945 mutex_exit(&cpuhp->chp_mutex);
8946 continue;
8948 for (last_hmeblkp = cpuhp->chp_listp;
8949 last_hmeblkp->hblk_next != NULL;
8950 last_hmeblkp = last_hmeblkp->hblk_next)
8952 last_hmeblkp->hblk_next = list;
8953 list = cpuhp->chp_listp;
8954 cpuhp->chp_listp = NULL;
8955 cpuhp->chp_count = 0;
8956 mutex_exit(&cpuhp->chp_mutex);
8961 if (list != NULL) {
8962 kpreempt_disable();
8963 CPUSET_DEL(cpuset, CPU->cpu_id);
8964 xt_sync(cpuset);
8965 xt_sync(cpuset);
8966 kpreempt_enable();
8967 sfmmu_hblk_free(&list);
8968 list = NULL;
8971 hmebp = uhmehash_reclaim_hand;
8972 if (hmebp == NULL || hmebp > &uhme_hash[UHMEHASH_SZ])
8973 uhmehash_reclaim_hand = hmebp = uhme_hash;
8974 uhmehash_reclaim_hand += UHMEHASH_SZ / sfmmu_cache_reclaim_scan_ratio;
8976 for (i = UHMEHASH_SZ / sfmmu_cache_reclaim_scan_ratio; i; i--) {
8977 if (SFMMU_HASH_LOCK_TRYENTER(hmebp) != 0) {
8978 hmeblkp = hmebp->hmeblkp;
8979 pr_hblk = NULL;
8980 while (hmeblkp) {
8981 nx_hblk = hmeblkp->hblk_next;
8982 if (!hmeblkp->hblk_vcnt &&
8983 !hmeblkp->hblk_hmecnt) {
8984 sfmmu_hblk_hash_rm(hmebp, hmeblkp,
8985 pr_hblk, &list, 0);
8986 } else {
8987 pr_hblk = hmeblkp;
8989 hmeblkp = nx_hblk;
8991 SFMMU_HASH_UNLOCK(hmebp);
8993 if (hmebp++ == &uhme_hash[UHMEHASH_SZ])
8994 hmebp = uhme_hash;
8997 hmebp = khmehash_reclaim_hand;
8998 if (hmebp == NULL || hmebp > &khme_hash[KHMEHASH_SZ])
8999 khmehash_reclaim_hand = hmebp = khme_hash;
9000 khmehash_reclaim_hand += KHMEHASH_SZ / sfmmu_cache_reclaim_scan_ratio;
9002 for (i = KHMEHASH_SZ / sfmmu_cache_reclaim_scan_ratio; i; i--) {
9003 if (SFMMU_HASH_LOCK_TRYENTER(hmebp) != 0) {
9004 hmeblkp = hmebp->hmeblkp;
9005 pr_hblk = NULL;
9006 while (hmeblkp) {
9007 nx_hblk = hmeblkp->hblk_next;
9008 if (!hmeblkp->hblk_vcnt &&
9009 !hmeblkp->hblk_hmecnt) {
9010 sfmmu_hblk_hash_rm(hmebp, hmeblkp,
9011 pr_hblk, &list, 0);
9012 } else {
9013 pr_hblk = hmeblkp;
9015 hmeblkp = nx_hblk;
9017 SFMMU_HASH_UNLOCK(hmebp);
9019 if (hmebp++ == &khme_hash[KHMEHASH_SZ])
9020 hmebp = khme_hash;
9022 sfmmu_hblks_list_purge(&list, 0);
9026 * sfmmu_get_ppvcolor should become a vm_machdep or hatop interface.
9027 * same goes for sfmmu_get_addrvcolor().
9029 * This function will return the virtual color for the specified page. The
9030 * virtual color corresponds to this page current mapping or its last mapping.
9031 * It is used by memory allocators to choose addresses with the correct
9032 * alignment so vac consistency is automatically maintained. If the page
9033 * has no color it returns -1.
9035 /*ARGSUSED*/
9037 sfmmu_get_ppvcolor(struct page *pp)
9039 #ifdef VAC
9040 int color;
9042 if (!(cache & CACHE_VAC) || PP_NEWPAGE(pp)) {
9043 return (-1);
9045 color = PP_GET_VCOLOR(pp);
9046 ASSERT(color < mmu_btop(shm_alignment));
9047 return (color);
9048 #else
9049 return (-1);
9050 #endif /* VAC */
9054 * This function will return the desired alignment for vac consistency
9055 * (vac color) given a virtual address. If no vac is present it returns -1.
9057 /*ARGSUSED*/
9059 sfmmu_get_addrvcolor(caddr_t vaddr)
9061 #ifdef VAC
9062 if (cache & CACHE_VAC) {
9063 return (addr_to_vcolor(vaddr));
9064 } else {
9065 return (-1);
9067 #else
9068 return (-1);
9069 #endif /* VAC */
9072 #ifdef VAC
9074 * Check for conflicts.
9075 * A conflict exists if the new and existent mappings do not match in
9076 * their "shm_alignment fields. If conflicts exist, the existant mappings
9077 * are flushed unless one of them is locked. If one of them is locked, then
9078 * the mappings are flushed and converted to non-cacheable mappings.
9080 static void
9081 sfmmu_vac_conflict(struct hat *hat, caddr_t addr, page_t *pp)
9083 struct hat *tmphat;
9084 struct sf_hment *sfhmep, *tmphme = NULL;
9085 struct hme_blk *hmeblkp;
9086 int vcolor;
9087 tte_t tte;
9089 ASSERT(sfmmu_mlist_held(pp));
9090 ASSERT(!PP_ISNC(pp)); /* page better be cacheable */
9092 vcolor = addr_to_vcolor(addr);
9093 if (PP_NEWPAGE(pp)) {
9094 PP_SET_VCOLOR(pp, vcolor);
9095 return;
9098 if (PP_GET_VCOLOR(pp) == vcolor) {
9099 return;
9102 if (!PP_ISMAPPED(pp) && !PP_ISMAPPED_KPM(pp)) {
9104 * Previous user of page had a different color
9105 * but since there are no current users
9106 * we just flush the cache and change the color.
9108 SFMMU_STAT(sf_pgcolor_conflict);
9109 sfmmu_cache_flush(pp->p_pagenum, PP_GET_VCOLOR(pp));
9110 PP_SET_VCOLOR(pp, vcolor);
9111 return;
9115 * If we get here we have a vac conflict with a current
9116 * mapping. VAC conflict policy is as follows.
9117 * - The default is to unload the other mappings unless:
9118 * - If we have a large mapping we uncache the page.
9119 * We need to uncache the rest of the large page too.
9120 * - If any of the mappings are locked we uncache the page.
9121 * - If the requested mapping is inconsistent
9122 * with another mapping and that mapping
9123 * is in the same address space we have to
9124 * make it non-cached. The default thing
9125 * to do is unload the inconsistent mapping
9126 * but if they are in the same address space
9127 * we run the risk of unmapping the pc or the
9128 * stack which we will use as we return to the user,
9129 * in which case we can then fault on the thing
9130 * we just unloaded and get into an infinite loop.
9132 if (PP_ISMAPPED_LARGE(pp)) {
9133 int sz;
9136 * Existing mapping is for big pages. We don't unload
9137 * existing big mappings to satisfy new mappings.
9138 * Always convert all mappings to TNC.
9140 sz = fnd_mapping_sz(pp);
9141 pp = PP_GROUPLEADER(pp, sz);
9142 SFMMU_STAT_ADD(sf_uncache_conflict, TTEPAGES(sz));
9143 sfmmu_page_cache_array(pp, HAT_TMPNC, CACHE_FLUSH,
9144 TTEPAGES(sz));
9146 return;
9150 * check if any mapping is in same as or if it is locked
9151 * since in that case we need to uncache.
9153 for (sfhmep = pp->p_mapping; sfhmep; sfhmep = tmphme) {
9154 tmphme = sfhmep->hme_next;
9155 if (IS_PAHME(sfhmep))
9156 continue;
9157 hmeblkp = sfmmu_hmetohblk(sfhmep);
9158 tmphat = hblktosfmmu(hmeblkp);
9159 sfmmu_copytte(&sfhmep->hme_tte, &tte);
9160 ASSERT(TTE_IS_VALID(&tte));
9161 if (hmeblkp->hblk_shared || tmphat == hat ||
9162 hmeblkp->hblk_lckcnt) {
9164 * We have an uncache conflict
9166 SFMMU_STAT(sf_uncache_conflict);
9167 sfmmu_page_cache_array(pp, HAT_TMPNC, CACHE_FLUSH, 1);
9168 return;
9173 * We have an unload conflict
9174 * We have already checked for LARGE mappings, therefore
9175 * the remaining mapping(s) must be TTE8K.
9177 SFMMU_STAT(sf_unload_conflict);
9179 for (sfhmep = pp->p_mapping; sfhmep; sfhmep = tmphme) {
9180 tmphme = sfhmep->hme_next;
9181 if (IS_PAHME(sfhmep))
9182 continue;
9183 hmeblkp = sfmmu_hmetohblk(sfhmep);
9184 ASSERT(!hmeblkp->hblk_shared);
9185 (void) sfmmu_pageunload(pp, sfhmep, TTE8K);
9188 if (PP_ISMAPPED_KPM(pp))
9189 sfmmu_kpm_vac_unload(pp, addr);
9192 * Unloads only do TLB flushes so we need to flush the
9193 * cache here.
9195 sfmmu_cache_flush(pp->p_pagenum, PP_GET_VCOLOR(pp));
9196 PP_SET_VCOLOR(pp, vcolor);
9200 * Whenever a mapping is unloaded and the page is in TNC state,
9201 * we see if the page can be made cacheable again. 'pp' is
9202 * the page that we just unloaded a mapping from, the size
9203 * of mapping that was unloaded is 'ottesz'.
9204 * Remark:
9205 * The recache policy for mpss pages can leave a performance problem
9206 * under the following circumstances:
9207 * . A large page in uncached mode has just been unmapped.
9208 * . All constituent pages are TNC due to a conflicting small mapping.
9209 * . There are many other, non conflicting, small mappings around for
9210 * a lot of the constituent pages.
9211 * . We're called w/ the "old" groupleader page and the old ottesz,
9212 * but this is irrelevant, since we're no more "PP_ISMAPPED_LARGE", so
9213 * we end up w/ TTE8K or npages == 1.
9214 * . We call tst_tnc w/ the old groupleader only, and if there is no
9215 * conflict, we re-cache only this page.
9216 * . All other small mappings are not checked and will be left in TNC mode.
9217 * The problem is not very serious because:
9218 * . mpss is actually only defined for heap and stack, so the probability
9219 * is not very high that a large page mapping exists in parallel to a small
9220 * one (this is possible, but seems to be bad programming style in the
9221 * appl).
9222 * . The problem gets a little bit more serious, when those TNC pages
9223 * have to be mapped into kernel space, e.g. for networking.
9224 * . When VAC alias conflicts occur in applications, this is regarded
9225 * as an application bug. So if kstat's show them, the appl should
9226 * be changed anyway.
9228 void
9229 conv_tnc(page_t *pp, int ottesz)
9231 int cursz, dosz;
9232 pgcnt_t curnpgs, dopgs;
9233 pgcnt_t pg64k;
9234 page_t *pp2;
9237 * Determine how big a range we check for TNC and find
9238 * leader page. cursz is the size of the biggest
9239 * mapping that still exist on 'pp'.
9241 if (PP_ISMAPPED_LARGE(pp)) {
9242 cursz = fnd_mapping_sz(pp);
9243 } else {
9244 cursz = TTE8K;
9247 if (ottesz >= cursz) {
9248 dosz = ottesz;
9249 pp2 = pp;
9250 } else {
9251 dosz = cursz;
9252 pp2 = PP_GROUPLEADER(pp, dosz);
9255 pg64k = TTEPAGES(TTE64K);
9256 dopgs = TTEPAGES(dosz);
9258 ASSERT(dopgs == 1 || ((dopgs & (pg64k - 1)) == 0));
9260 while (dopgs != 0) {
9261 curnpgs = TTEPAGES(cursz);
9262 if (tst_tnc(pp2, curnpgs)) {
9263 SFMMU_STAT_ADD(sf_recache, curnpgs);
9264 sfmmu_page_cache_array(pp2, HAT_CACHE, CACHE_NO_FLUSH,
9265 curnpgs);
9268 ASSERT(dopgs >= curnpgs);
9269 dopgs -= curnpgs;
9271 if (dopgs == 0) {
9272 break;
9275 pp2 = PP_PAGENEXT_N(pp2, curnpgs);
9276 if (((dopgs & (pg64k - 1)) == 0) && PP_ISMAPPED_LARGE(pp2)) {
9277 cursz = fnd_mapping_sz(pp2);
9278 } else {
9279 cursz = TTE8K;
9285 * Returns 1 if page(s) can be converted from TNC to cacheable setting,
9286 * returns 0 otherwise. Note that oaddr argument is valid for only
9287 * 8k pages.
9290 tst_tnc(page_t *pp, pgcnt_t npages)
9292 struct sf_hment *sfhme;
9293 struct hme_blk *hmeblkp;
9294 tte_t tte;
9295 caddr_t vaddr;
9296 int clr_valid = 0;
9297 int color, color1, bcolor;
9298 int i, ncolors;
9300 ASSERT(pp != NULL);
9301 ASSERT(!(cache & CACHE_WRITEBACK));
9303 if (npages > 1) {
9304 ncolors = CACHE_NUM_COLOR;
9307 for (i = 0; i < npages; i++) {
9308 ASSERT(sfmmu_mlist_held(pp));
9309 ASSERT(PP_ISTNC(pp));
9310 ASSERT(PP_GET_VCOLOR(pp) == NO_VCOLOR);
9312 if (PP_ISPNC(pp)) {
9313 return (0);
9316 clr_valid = 0;
9317 if (PP_ISMAPPED_KPM(pp)) {
9318 caddr_t kpmvaddr;
9320 ASSERT(kpm_enable);
9321 kpmvaddr = hat_kpm_page2va(pp, 1);
9322 ASSERT(!(npages > 1 && IS_KPM_ALIAS_RANGE(kpmvaddr)));
9323 color1 = addr_to_vcolor(kpmvaddr);
9324 clr_valid = 1;
9327 for (sfhme = pp->p_mapping; sfhme; sfhme = sfhme->hme_next) {
9328 if (IS_PAHME(sfhme))
9329 continue;
9330 hmeblkp = sfmmu_hmetohblk(sfhme);
9332 sfmmu_copytte(&sfhme->hme_tte, &tte);
9333 ASSERT(TTE_IS_VALID(&tte));
9335 vaddr = tte_to_vaddr(hmeblkp, tte);
9336 color = addr_to_vcolor(vaddr);
9338 if (npages > 1) {
9340 * If there is a big mapping, make sure
9341 * 8K mapping is consistent with the big
9342 * mapping.
9344 bcolor = i % ncolors;
9345 if (color != bcolor) {
9346 return (0);
9349 if (!clr_valid) {
9350 clr_valid = 1;
9351 color1 = color;
9354 if (color1 != color) {
9355 return (0);
9359 pp = PP_PAGENEXT(pp);
9362 return (1);
9365 void
9366 sfmmu_page_cache_array(page_t *pp, int flags, int cache_flush_flag,
9367 pgcnt_t npages)
9369 kmutex_t *pmtx;
9370 int i, ncolors, bcolor;
9371 kpm_hlk_t *kpmp;
9372 cpuset_t cpuset;
9374 ASSERT(pp != NULL);
9375 ASSERT(!(cache & CACHE_WRITEBACK));
9377 kpmp = sfmmu_kpm_kpmp_enter(pp, npages);
9378 pmtx = sfmmu_page_enter(pp);
9381 * Fast path caching single unmapped page
9383 if (npages == 1 && !PP_ISMAPPED(pp) && !PP_ISMAPPED_KPM(pp) &&
9384 flags == HAT_CACHE) {
9385 PP_CLRTNC(pp);
9386 PP_CLRPNC(pp);
9387 sfmmu_page_exit(pmtx);
9388 sfmmu_kpm_kpmp_exit(kpmp);
9389 return;
9393 * We need to capture all cpus in order to change cacheability
9394 * because we can't allow one cpu to access the same physical
9395 * page using a cacheable and a non-cachebale mapping at the same
9396 * time. Since we may end up walking the ism mapping list
9397 * have to grab it's lock now since we can't after all the
9398 * cpus have been captured.
9400 sfmmu_hat_lock_all();
9401 mutex_enter(&ism_mlist_lock);
9402 kpreempt_disable();
9403 cpuset = cpu_ready_set;
9404 xc_attention(cpuset);
9406 if (npages > 1) {
9408 * Make sure all colors are flushed since the
9409 * sfmmu_page_cache() only flushes one color-
9410 * it does not know big pages.
9412 ncolors = CACHE_NUM_COLOR;
9413 if (flags & HAT_TMPNC) {
9414 for (i = 0; i < ncolors; i++) {
9415 sfmmu_cache_flushcolor(i, pp->p_pagenum);
9417 cache_flush_flag = CACHE_NO_FLUSH;
9421 for (i = 0; i < npages; i++) {
9423 ASSERT(sfmmu_mlist_held(pp));
9425 if (!(flags == HAT_TMPNC && PP_ISTNC(pp))) {
9427 if (npages > 1) {
9428 bcolor = i % ncolors;
9429 } else {
9430 bcolor = NO_VCOLOR;
9433 sfmmu_page_cache(pp, flags, cache_flush_flag,
9434 bcolor);
9437 pp = PP_PAGENEXT(pp);
9440 xt_sync(cpuset);
9441 xc_dismissed(cpuset);
9442 mutex_exit(&ism_mlist_lock);
9443 sfmmu_hat_unlock_all();
9444 sfmmu_page_exit(pmtx);
9445 sfmmu_kpm_kpmp_exit(kpmp);
9446 kpreempt_enable();
9450 * This function changes the virtual cacheability of all mappings to a
9451 * particular page. When changing from uncache to cacheable the mappings will
9452 * only be changed if all of them have the same virtual color.
9453 * We need to flush the cache in all cpus. It is possible that
9454 * a process referenced a page as cacheable but has sinced exited
9455 * and cleared the mapping list. We still to flush it but have no
9456 * state so all cpus is the only alternative.
9458 static void
9459 sfmmu_page_cache(page_t *pp, int flags, int cache_flush_flag, int bcolor)
9461 struct sf_hment *sfhme;
9462 struct hme_blk *hmeblkp;
9463 sfmmu_t *sfmmup;
9464 tte_t tte, ttemod;
9465 caddr_t vaddr;
9466 int ret, color;
9467 pfn_t pfn;
9469 color = bcolor;
9470 pfn = pp->p_pagenum;
9472 for (sfhme = pp->p_mapping; sfhme; sfhme = sfhme->hme_next) {
9474 if (IS_PAHME(sfhme))
9475 continue;
9476 hmeblkp = sfmmu_hmetohblk(sfhme);
9478 sfmmu_copytte(&sfhme->hme_tte, &tte);
9479 ASSERT(TTE_IS_VALID(&tte));
9480 vaddr = tte_to_vaddr(hmeblkp, tte);
9481 color = addr_to_vcolor(vaddr);
9483 #ifdef DEBUG
9484 if ((flags & HAT_CACHE) && bcolor != NO_VCOLOR) {
9485 ASSERT(color == bcolor);
9487 #endif
9489 ASSERT(flags != HAT_TMPNC || color == PP_GET_VCOLOR(pp));
9491 ttemod = tte;
9492 if (flags & (HAT_UNCACHE | HAT_TMPNC)) {
9493 TTE_CLR_VCACHEABLE(&ttemod);
9494 } else { /* flags & HAT_CACHE */
9495 TTE_SET_VCACHEABLE(&ttemod);
9497 ret = sfmmu_modifytte_try(&tte, &ttemod, &sfhme->hme_tte);
9498 if (ret < 0) {
9500 * Since all cpus are captured modifytte should not
9501 * fail.
9503 panic("sfmmu_page_cache: write to tte failed");
9506 sfmmup = hblktosfmmu(hmeblkp);
9507 if (cache_flush_flag == CACHE_FLUSH) {
9509 * Flush TSBs, TLBs and caches
9511 if (hmeblkp->hblk_shared) {
9512 sf_srd_t *srdp = (sf_srd_t *)sfmmup;
9513 uint_t rid = hmeblkp->hblk_tag.htag_rid;
9514 sf_region_t *rgnp;
9515 ASSERT(SFMMU_IS_SHMERID_VALID(rid));
9516 ASSERT(rid < SFMMU_MAX_HME_REGIONS);
9517 ASSERT(srdp != NULL);
9518 rgnp = srdp->srd_hmergnp[rid];
9519 SFMMU_VALIDATE_SHAREDHBLK(hmeblkp,
9520 srdp, rgnp, rid);
9521 (void) sfmmu_rgntlb_demap(vaddr, rgnp,
9522 hmeblkp, 0);
9523 sfmmu_cache_flush(pfn, addr_to_vcolor(vaddr));
9524 } else if (sfmmup->sfmmu_ismhat) {
9525 if (flags & HAT_CACHE) {
9526 SFMMU_STAT(sf_ism_recache);
9527 } else {
9528 SFMMU_STAT(sf_ism_uncache);
9530 sfmmu_ismtlbcache_demap(vaddr, sfmmup, hmeblkp,
9531 pfn, CACHE_FLUSH);
9532 } else {
9533 sfmmu_tlbcache_demap(vaddr, sfmmup, hmeblkp,
9534 pfn, 0, FLUSH_ALL_CPUS, CACHE_FLUSH, 1);
9538 * all cache entries belonging to this pfn are
9539 * now flushed.
9541 cache_flush_flag = CACHE_NO_FLUSH;
9542 } else {
9544 * Flush only TSBs and TLBs.
9546 if (hmeblkp->hblk_shared) {
9547 sf_srd_t *srdp = (sf_srd_t *)sfmmup;
9548 uint_t rid = hmeblkp->hblk_tag.htag_rid;
9549 sf_region_t *rgnp;
9550 ASSERT(SFMMU_IS_SHMERID_VALID(rid));
9551 ASSERT(rid < SFMMU_MAX_HME_REGIONS);
9552 ASSERT(srdp != NULL);
9553 rgnp = srdp->srd_hmergnp[rid];
9554 SFMMU_VALIDATE_SHAREDHBLK(hmeblkp,
9555 srdp, rgnp, rid);
9556 (void) sfmmu_rgntlb_demap(vaddr, rgnp,
9557 hmeblkp, 0);
9558 } else if (sfmmup->sfmmu_ismhat) {
9559 if (flags & HAT_CACHE) {
9560 SFMMU_STAT(sf_ism_recache);
9561 } else {
9562 SFMMU_STAT(sf_ism_uncache);
9564 sfmmu_ismtlbcache_demap(vaddr, sfmmup, hmeblkp,
9565 pfn, CACHE_NO_FLUSH);
9566 } else {
9567 sfmmu_tlb_demap(vaddr, sfmmup, hmeblkp, 0, 1);
9572 if (PP_ISMAPPED_KPM(pp))
9573 sfmmu_kpm_page_cache(pp, flags, cache_flush_flag);
9575 switch (flags) {
9577 default:
9578 panic("sfmmu_pagecache: unknown flags");
9579 break;
9581 case HAT_CACHE:
9582 PP_CLRTNC(pp);
9583 PP_CLRPNC(pp);
9584 PP_SET_VCOLOR(pp, color);
9585 break;
9587 case HAT_TMPNC:
9588 PP_SETTNC(pp);
9589 PP_SET_VCOLOR(pp, NO_VCOLOR);
9590 break;
9592 case HAT_UNCACHE:
9593 PP_SETPNC(pp);
9594 PP_CLRTNC(pp);
9595 PP_SET_VCOLOR(pp, NO_VCOLOR);
9596 break;
9599 #endif /* VAC */
9603 * Wrapper routine used to return a context.
9605 * It's the responsibility of the caller to guarantee that the
9606 * process serializes on calls here by taking the HAT lock for
9607 * the hat.
9610 static void
9611 sfmmu_get_ctx(sfmmu_t *sfmmup)
9613 mmu_ctx_t *mmu_ctxp;
9614 uint_t pstate_save;
9615 int ret;
9617 ASSERT(sfmmu_hat_lock_held(sfmmup));
9618 ASSERT(sfmmup != ksfmmup);
9620 if (SFMMU_FLAGS_ISSET(sfmmup, HAT_ALLCTX_INVALID)) {
9621 sfmmu_setup_tsbinfo(sfmmup);
9622 SFMMU_FLAGS_CLEAR(sfmmup, HAT_ALLCTX_INVALID);
9625 kpreempt_disable();
9627 mmu_ctxp = CPU_MMU_CTXP(CPU);
9628 ASSERT(mmu_ctxp);
9629 ASSERT(mmu_ctxp->mmu_idx < max_mmu_ctxdoms);
9630 ASSERT(mmu_ctxp == mmu_ctxs_tbl[mmu_ctxp->mmu_idx]);
9633 * Do a wrap-around if cnum reaches the max # cnum supported by a MMU.
9635 if (mmu_ctxp->mmu_cnum == mmu_ctxp->mmu_nctxs)
9636 sfmmu_ctx_wrap_around(mmu_ctxp, B_TRUE);
9639 * Let the MMU set up the page sizes to use for
9640 * this context in the TLB. Don't program 2nd dtlb for ism hat.
9642 if ((&mmu_set_ctx_page_sizes) && (sfmmup->sfmmu_ismhat == 0)) {
9643 mmu_set_ctx_page_sizes(sfmmup);
9647 * sfmmu_alloc_ctx and sfmmu_load_mmustate will be performed with
9648 * interrupts disabled to prevent race condition with wrap-around
9649 * ctx invalidatation. In sun4v, ctx invalidation also involves
9650 * a HV call to set the number of TSBs to 0. If interrupts are not
9651 * disabled until after sfmmu_load_mmustate is complete TSBs may
9652 * become assigned to INVALID_CONTEXT. This is not allowed.
9654 pstate_save = sfmmu_disable_intrs();
9656 if (sfmmu_alloc_ctx(sfmmup, 1, CPU, SFMMU_PRIVATE) &&
9657 sfmmup->sfmmu_scdp != NULL) {
9658 sf_scd_t *scdp = sfmmup->sfmmu_scdp;
9659 sfmmu_t *scsfmmup = scdp->scd_sfmmup;
9660 ret = sfmmu_alloc_ctx(scsfmmup, 1, CPU, SFMMU_SHARED);
9661 /* debug purpose only */
9662 ASSERT(!ret || scsfmmup->sfmmu_ctxs[CPU_MMU_IDX(CPU)].cnum
9663 != INVALID_CONTEXT);
9665 sfmmu_load_mmustate(sfmmup);
9667 sfmmu_enable_intrs(pstate_save);
9669 kpreempt_enable();
9673 * When all cnums are used up in a MMU, cnum will wrap around to the
9674 * next generation and start from 2.
9676 static void
9677 sfmmu_ctx_wrap_around(mmu_ctx_t *mmu_ctxp, boolean_t reset_cnum)
9680 /* caller must have disabled the preemption */
9681 ASSERT(curthread->t_preempt >= 1);
9682 ASSERT(mmu_ctxp != NULL);
9684 /* acquire Per-MMU (PM) spin lock */
9685 mutex_enter(&mmu_ctxp->mmu_lock);
9687 /* re-check to see if wrap-around is needed */
9688 if (mmu_ctxp->mmu_cnum < mmu_ctxp->mmu_nctxs)
9689 goto done;
9691 SFMMU_MMU_STAT(mmu_wrap_around);
9693 /* update gnum */
9694 ASSERT(mmu_ctxp->mmu_gnum != 0);
9695 mmu_ctxp->mmu_gnum++;
9696 if (mmu_ctxp->mmu_gnum == 0 ||
9697 mmu_ctxp->mmu_gnum > MAX_SFMMU_GNUM_VAL) {
9698 cmn_err(CE_PANIC, "mmu_gnum of mmu_ctx 0x%p is out of bound.",
9699 (void *)mmu_ctxp);
9702 if (mmu_ctxp->mmu_ncpus > 1) {
9703 cpuset_t cpuset;
9705 membar_enter(); /* make sure updated gnum visible */
9707 SFMMU_XCALL_STATS(NULL);
9709 /* xcall to others on the same MMU to invalidate ctx */
9710 cpuset = mmu_ctxp->mmu_cpuset;
9711 ASSERT(CPU_IN_SET(cpuset, CPU->cpu_id) || !reset_cnum);
9712 CPUSET_DEL(cpuset, CPU->cpu_id);
9713 CPUSET_AND(cpuset, cpu_ready_set);
9716 * Pass in INVALID_CONTEXT as the first parameter to
9717 * sfmmu_raise_tsb_exception, which invalidates the context
9718 * of any process running on the CPUs in the MMU.
9720 xt_some(cpuset, sfmmu_raise_tsb_exception,
9721 INVALID_CONTEXT, INVALID_CONTEXT);
9722 xt_sync(cpuset);
9724 SFMMU_MMU_STAT(mmu_tsb_raise_exception);
9727 if (sfmmu_getctx_sec() != INVALID_CONTEXT) {
9728 sfmmu_setctx_sec(INVALID_CONTEXT);
9729 sfmmu_clear_utsbinfo();
9733 * No xcall is needed here. For sun4u systems all CPUs in context
9734 * domain share a single physical MMU therefore it's enough to flush
9735 * TLB on local CPU. On sun4v systems we use 1 global context
9736 * domain and flush all remote TLBs in sfmmu_raise_tsb_exception
9737 * handler. Note that vtag_flushall_uctxs() is called
9738 * for Ultra II machine, where the equivalent flushall functionality
9739 * is implemented in SW, and only user ctx TLB entries are flushed.
9741 if (&vtag_flushall_uctxs != NULL) {
9742 vtag_flushall_uctxs();
9743 } else {
9744 vtag_flushall();
9747 /* reset mmu cnum, skips cnum 0 and 1 */
9748 if (reset_cnum == B_TRUE)
9749 mmu_ctxp->mmu_cnum = NUM_LOCKED_CTXS;
9751 done:
9752 mutex_exit(&mmu_ctxp->mmu_lock);
9757 * For multi-threaded process, set the process context to INVALID_CONTEXT
9758 * so that it faults and reloads the MMU state from TL=0. For single-threaded
9759 * process, we can just load the MMU state directly without having to
9760 * set context invalid. Caller must hold the hat lock since we don't
9761 * acquire it here.
9763 static void
9764 sfmmu_sync_mmustate(sfmmu_t *sfmmup)
9766 uint_t cnum;
9767 uint_t pstate_save;
9769 ASSERT(sfmmup != ksfmmup);
9770 ASSERT(sfmmu_hat_lock_held(sfmmup));
9772 kpreempt_disable();
9775 * We check whether the pass'ed-in sfmmup is the same as the
9776 * current running proc. This is to makes sure the current proc
9777 * stays single-threaded if it already is.
9779 if ((sfmmup == curthread->t_procp->p_as->a_hat) &&
9780 (curthread->t_procp->p_lwpcnt == 1)) {
9781 /* single-thread */
9782 cnum = sfmmup->sfmmu_ctxs[CPU_MMU_IDX(CPU)].cnum;
9783 if (cnum != INVALID_CONTEXT) {
9784 uint_t curcnum;
9786 * Disable interrupts to prevent race condition
9787 * with sfmmu_ctx_wrap_around ctx invalidation.
9788 * In sun4v, ctx invalidation involves setting
9789 * TSB to NULL, hence, interrupts should be disabled
9790 * untill after sfmmu_load_mmustate is completed.
9792 pstate_save = sfmmu_disable_intrs();
9793 curcnum = sfmmu_getctx_sec();
9794 if (curcnum == cnum)
9795 sfmmu_load_mmustate(sfmmup);
9796 sfmmu_enable_intrs(pstate_save);
9797 ASSERT(curcnum == cnum || curcnum == INVALID_CONTEXT);
9799 } else {
9801 * multi-thread
9802 * or when sfmmup is not the same as the curproc.
9804 sfmmu_invalidate_ctx(sfmmup);
9807 kpreempt_enable();
9812 * Replace the specified TSB with a new TSB. This function gets called when
9813 * we grow, shrink or swapin a TSB. When swapping in a TSB (TSB_SWAPIN), the
9814 * TSB_FORCEALLOC flag may be used to force allocation of a minimum-sized TSB
9815 * (8K).
9817 * Caller must hold the HAT lock, but should assume any tsb_info
9818 * pointers it has are no longer valid after calling this function.
9820 * Return values:
9821 * TSB_ALLOCFAIL Failed to allocate a TSB, due to memory constraints
9822 * TSB_LOSTRACE HAT is busy, i.e. another thread is already doing
9823 * something to this tsbinfo/TSB
9824 * TSB_SUCCESS Operation succeeded
9826 static tsb_replace_rc_t
9827 sfmmu_replace_tsb(sfmmu_t *sfmmup, struct tsb_info *old_tsbinfo, uint_t szc,
9828 hatlock_t *hatlockp, uint_t flags)
9830 struct tsb_info *new_tsbinfo = NULL;
9831 struct tsb_info *curtsb, *prevtsb;
9832 uint_t tte_sz_mask;
9833 int i;
9835 ASSERT(sfmmup != ksfmmup);
9836 ASSERT(sfmmup->sfmmu_ismhat == 0);
9837 ASSERT(sfmmu_hat_lock_held(sfmmup));
9838 ASSERT(szc <= tsb_max_growsize);
9840 if (SFMMU_FLAGS_ISSET(sfmmup, HAT_BUSY))
9841 return (TSB_LOSTRACE);
9844 * Find the tsb_info ahead of this one in the list, and
9845 * also make sure that the tsb_info passed in really
9846 * exists!
9848 for (prevtsb = NULL, curtsb = sfmmup->sfmmu_tsb;
9849 curtsb != old_tsbinfo && curtsb != NULL;
9850 prevtsb = curtsb, curtsb = curtsb->tsb_next)
9852 ASSERT(curtsb != NULL);
9854 if (!(flags & TSB_SWAPIN) && SFMMU_FLAGS_ISSET(sfmmup, HAT_SWAPPED)) {
9856 * The process is swapped out, so just set the new size
9857 * code. When it swaps back in, we'll allocate a new one
9858 * of the new chosen size.
9860 curtsb->tsb_szc = szc;
9861 return (TSB_SUCCESS);
9863 SFMMU_FLAGS_SET(sfmmup, HAT_BUSY);
9865 tte_sz_mask = old_tsbinfo->tsb_ttesz_mask;
9868 * All initialization is done inside of sfmmu_tsbinfo_alloc().
9869 * If we fail to allocate a TSB, exit.
9871 * If tsb grows with new tsb size > 4M and old tsb size < 4M,
9872 * then try 4M slab after the initial alloc fails.
9874 * If tsb swapin with tsb size > 4M, then try 4M after the
9875 * initial alloc fails.
9877 sfmmu_hat_exit(hatlockp);
9878 if (sfmmu_tsbinfo_alloc(&new_tsbinfo, szc,
9879 tte_sz_mask, flags, sfmmup) &&
9880 (!(flags & (TSB_GROW | TSB_SWAPIN)) || (szc <= TSB_4M_SZCODE) ||
9881 (!(flags & TSB_SWAPIN) &&
9882 (old_tsbinfo->tsb_szc >= TSB_4M_SZCODE)) ||
9883 sfmmu_tsbinfo_alloc(&new_tsbinfo, TSB_4M_SZCODE,
9884 tte_sz_mask, flags, sfmmup))) {
9885 (void) sfmmu_hat_enter(sfmmup);
9886 if (!(flags & TSB_SWAPIN))
9887 SFMMU_STAT(sf_tsb_resize_failures);
9888 SFMMU_FLAGS_CLEAR(sfmmup, HAT_BUSY);
9889 return (TSB_ALLOCFAIL);
9891 (void) sfmmu_hat_enter(sfmmup);
9894 * Re-check to make sure somebody else didn't muck with us while we
9895 * didn't hold the HAT lock. If the process swapped out, fine, just
9896 * exit; this can happen if we try to shrink the TSB from the context
9897 * of another process (such as on an ISM unmap), though it is rare.
9899 if (!(flags & TSB_SWAPIN) && SFMMU_FLAGS_ISSET(sfmmup, HAT_SWAPPED)) {
9900 SFMMU_STAT(sf_tsb_resize_failures);
9901 SFMMU_FLAGS_CLEAR(sfmmup, HAT_BUSY);
9902 sfmmu_hat_exit(hatlockp);
9903 sfmmu_tsbinfo_free(new_tsbinfo);
9904 (void) sfmmu_hat_enter(sfmmup);
9905 return (TSB_LOSTRACE);
9908 #ifdef DEBUG
9909 /* Reverify that the tsb_info still exists.. for debugging only */
9910 for (prevtsb = NULL, curtsb = sfmmup->sfmmu_tsb;
9911 curtsb != old_tsbinfo && curtsb != NULL;
9912 prevtsb = curtsb, curtsb = curtsb->tsb_next)
9914 ASSERT(curtsb != NULL);
9915 #endif /* DEBUG */
9918 * Quiesce any CPUs running this process on their next TLB miss
9919 * so they atomically see the new tsb_info. We temporarily set the
9920 * context to invalid context so new threads that come on processor
9921 * after we do the xcall to cpusran will also serialize behind the
9922 * HAT lock on TLB miss and will see the new TSB. Since this short
9923 * race with a new thread coming on processor is relatively rare,
9924 * this synchronization mechanism should be cheaper than always
9925 * pausing all CPUs for the duration of the setup, which is what
9926 * the old implementation did. This is particuarly true if we are
9927 * copying a huge chunk of memory around during that window.
9929 * The memory barriers are to make sure things stay consistent
9930 * with resume() since it does not hold the HAT lock while
9931 * walking the list of tsb_info structures.
9933 if ((flags & TSB_SWAPIN) != TSB_SWAPIN) {
9934 /* The TSB is either growing or shrinking. */
9935 sfmmu_invalidate_ctx(sfmmup);
9936 } else {
9938 * It is illegal to swap in TSBs from a process other
9939 * than a process being swapped in. This in turn
9940 * implies we do not have a valid MMU context here
9941 * since a process needs one to resolve translation
9942 * misses.
9944 ASSERT(curthread->t_procp->p_as->a_hat == sfmmup);
9947 #ifdef DEBUG
9948 ASSERT(max_mmu_ctxdoms > 0);
9951 * Process should have INVALID_CONTEXT on all MMUs
9953 for (i = 0; i < max_mmu_ctxdoms; i++) {
9955 ASSERT(sfmmup->sfmmu_ctxs[i].cnum == INVALID_CONTEXT);
9957 #endif
9959 new_tsbinfo->tsb_next = old_tsbinfo->tsb_next;
9960 membar_stst(); /* strict ordering required */
9961 if (prevtsb)
9962 prevtsb->tsb_next = new_tsbinfo;
9963 else
9964 sfmmup->sfmmu_tsb = new_tsbinfo;
9965 membar_enter(); /* make sure new TSB globally visible */
9968 * We need to migrate TSB entries from the old TSB to the new TSB
9969 * if tsb_remap_ttes is set and the TSB is growing.
9971 if (tsb_remap_ttes && ((flags & TSB_GROW) == TSB_GROW))
9972 sfmmu_copy_tsb(old_tsbinfo, new_tsbinfo);
9974 SFMMU_FLAGS_CLEAR(sfmmup, HAT_BUSY);
9977 * Drop the HAT lock to free our old tsb_info.
9979 sfmmu_hat_exit(hatlockp);
9981 if ((flags & TSB_GROW) == TSB_GROW) {
9982 SFMMU_STAT(sf_tsb_grow);
9983 } else if ((flags & TSB_SHRINK) == TSB_SHRINK) {
9984 SFMMU_STAT(sf_tsb_shrink);
9987 sfmmu_tsbinfo_free(old_tsbinfo);
9989 (void) sfmmu_hat_enter(sfmmup);
9990 return (TSB_SUCCESS);
9994 * This function will re-program hat pgsz array, and invalidate the
9995 * process' context, forcing the process to switch to another
9996 * context on the next TLB miss, and therefore start using the
9997 * TLB that is reprogrammed for the new page sizes.
9999 void
10000 sfmmu_reprog_pgsz_arr(sfmmu_t *sfmmup, uint8_t *tmp_pgsz)
10002 int i;
10003 hatlock_t *hatlockp = NULL;
10005 hatlockp = sfmmu_hat_enter(sfmmup);
10006 /* USIII+-IV+ optimization, requires hat lock */
10007 if (tmp_pgsz) {
10008 for (i = 0; i < mmu_page_sizes; i++)
10009 sfmmup->sfmmu_pgsz[i] = tmp_pgsz[i];
10011 SFMMU_STAT(sf_tlb_reprog_pgsz);
10013 sfmmu_invalidate_ctx(sfmmup);
10015 sfmmu_hat_exit(hatlockp);
10019 * The scd_rttecnt field in the SCD must be updated to take account of the
10020 * regions which it contains.
10022 static void
10023 sfmmu_set_scd_rttecnt(sf_srd_t *srdp, sf_scd_t *scdp)
10025 uint_t rid;
10026 uint_t i, j;
10027 ulong_t w;
10028 sf_region_t *rgnp;
10030 ASSERT(srdp != NULL);
10032 for (i = 0; i < SFMMU_HMERGNMAP_WORDS; i++) {
10033 if ((w = scdp->scd_region_map.bitmap[i]) == 0) {
10034 continue;
10037 j = 0;
10038 while (w) {
10039 if (!(w & 0x1)) {
10040 j++;
10041 w >>= 1;
10042 continue;
10044 rid = (i << BT_ULSHIFT) | j;
10045 j++;
10046 w >>= 1;
10048 ASSERT(SFMMU_IS_SHMERID_VALID(rid));
10049 ASSERT(rid < SFMMU_MAX_HME_REGIONS);
10050 rgnp = srdp->srd_hmergnp[rid];
10051 ASSERT(rgnp->rgn_refcnt > 0);
10052 ASSERT(rgnp->rgn_id == rid);
10054 scdp->scd_rttecnt[rgnp->rgn_pgszc] +=
10055 rgnp->rgn_size >> TTE_PAGE_SHIFT(rgnp->rgn_pgszc);
10058 * Maintain the tsb0 inflation cnt for the regions
10059 * in the SCD.
10061 if (rgnp->rgn_pgszc >= TTE4M) {
10062 scdp->scd_sfmmup->sfmmu_tsb0_4minflcnt +=
10063 rgnp->rgn_size >>
10064 (TTE_PAGE_SHIFT(TTE8K) + 2);
10071 * This function assumes that there are either four or six supported page
10072 * sizes and at most two programmable TLBs, so we need to decide which
10073 * page sizes are most important and then tell the MMU layer so it
10074 * can adjust the TLB page sizes accordingly (if supported).
10076 * If these assumptions change, this function will need to be
10077 * updated to support whatever the new limits are.
10079 * The growing flag is nonzero if we are growing the address space,
10080 * and zero if it is shrinking. This allows us to decide whether
10081 * to grow or shrink our TSB, depending upon available memory
10082 * conditions.
10084 static void
10085 sfmmu_check_page_sizes(sfmmu_t *sfmmup, int growing)
10087 uint64_t ttecnt[MMU_PAGE_SIZES];
10088 uint64_t tte8k_cnt, tte4m_cnt;
10089 uint8_t i;
10090 int sectsb_thresh;
10093 * Kernel threads, processes with small address spaces not using
10094 * large pages, and dummy ISM HATs need not apply.
10096 if (sfmmup == ksfmmup || sfmmup->sfmmu_ismhat != NULL)
10097 return;
10099 if (!SFMMU_LGPGS_INUSE(sfmmup) &&
10100 sfmmup->sfmmu_ttecnt[TTE8K] <= tsb_rss_factor)
10101 return;
10103 for (i = 0; i < mmu_page_sizes; i++) {
10104 ttecnt[i] = sfmmup->sfmmu_ttecnt[i] +
10105 sfmmup->sfmmu_ismttecnt[i];
10108 /* Check pagesizes in use, and possibly reprogram DTLB. */
10109 if (&mmu_check_page_sizes)
10110 mmu_check_page_sizes(sfmmup, ttecnt);
10113 * Calculate the number of 8k ttes to represent the span of these
10114 * pages.
10116 tte8k_cnt = ttecnt[TTE8K] +
10117 (ttecnt[TTE64K] << (MMU_PAGESHIFT64K - MMU_PAGESHIFT)) +
10118 (ttecnt[TTE512K] << (MMU_PAGESHIFT512K - MMU_PAGESHIFT));
10119 if (mmu_page_sizes == max_mmu_page_sizes) {
10120 tte4m_cnt = ttecnt[TTE4M] +
10121 (ttecnt[TTE32M] << (MMU_PAGESHIFT32M - MMU_PAGESHIFT4M)) +
10122 (ttecnt[TTE256M] << (MMU_PAGESHIFT256M - MMU_PAGESHIFT4M));
10123 } else {
10124 tte4m_cnt = ttecnt[TTE4M];
10128 * Inflate tte8k_cnt to allow for region large page allocation failure.
10130 tte8k_cnt += sfmmup->sfmmu_tsb0_4minflcnt;
10133 * Inflate TSB sizes by a factor of 2 if this process
10134 * uses 4M text pages to minimize extra conflict misses
10135 * in the first TSB since without counting text pages
10136 * 8K TSB may become too small.
10138 * Also double the size of the second TSB to minimize
10139 * extra conflict misses due to competition between 4M text pages
10140 * and data pages.
10142 * We need to adjust the second TSB allocation threshold by the
10143 * inflation factor, since there is no point in creating a second
10144 * TSB when we know all the mappings can fit in the I/D TLBs.
10146 sectsb_thresh = tsb_sectsb_threshold;
10147 if (sfmmup->sfmmu_flags & HAT_4MTEXT_FLAG) {
10148 tte8k_cnt <<= 1;
10149 tte4m_cnt <<= 1;
10150 sectsb_thresh <<= 1;
10154 * Check to see if our TSB is the right size; we may need to
10155 * grow or shrink it. If the process is small, our work is
10156 * finished at this point.
10158 if (tte8k_cnt <= tsb_rss_factor && tte4m_cnt <= sectsb_thresh) {
10159 return;
10161 sfmmu_size_tsb(sfmmup, growing, tte8k_cnt, tte4m_cnt, sectsb_thresh);
10164 static void
10165 sfmmu_size_tsb(sfmmu_t *sfmmup, int growing, uint64_t tte8k_cnt,
10166 uint64_t tte4m_cnt, int sectsb_thresh)
10168 int tsb_bits;
10169 uint_t tsb_szc;
10170 struct tsb_info *tsbinfop;
10171 hatlock_t *hatlockp = NULL;
10173 hatlockp = sfmmu_hat_enter(sfmmup);
10174 ASSERT(hatlockp != NULL);
10175 tsbinfop = sfmmup->sfmmu_tsb;
10176 ASSERT(tsbinfop != NULL);
10179 * If we're growing, select the size based on RSS. If we're
10180 * shrinking, leave some room so we don't have to turn around and
10181 * grow again immediately.
10183 if (growing)
10184 tsb_szc = SELECT_TSB_SIZECODE(tte8k_cnt);
10185 else
10186 tsb_szc = SELECT_TSB_SIZECODE(tte8k_cnt << 1);
10188 if (!growing && (tsb_szc < tsbinfop->tsb_szc) &&
10189 (tsb_szc >= default_tsb_size) && TSB_OK_SHRINK()) {
10190 (void) sfmmu_replace_tsb(sfmmup, tsbinfop, tsb_szc,
10191 hatlockp, TSB_SHRINK);
10192 } else if (growing && tsb_szc > tsbinfop->tsb_szc && TSB_OK_GROW()) {
10193 (void) sfmmu_replace_tsb(sfmmup, tsbinfop, tsb_szc,
10194 hatlockp, TSB_GROW);
10196 tsbinfop = sfmmup->sfmmu_tsb;
10199 * With the TLB and first TSB out of the way, we need to see if
10200 * we need a second TSB for 4M pages. If we managed to reprogram
10201 * the TLB page sizes above, the process will start using this new
10202 * TSB right away; otherwise, it will start using it on the next
10203 * context switch. Either way, it's no big deal so there's no
10204 * synchronization with the trap handlers here unless we grow the
10205 * TSB (in which case it's required to prevent using the old one
10206 * after it's freed). Note: second tsb is required for 32M/256M
10207 * page sizes.
10209 if (tte4m_cnt > sectsb_thresh) {
10211 * If we're growing, select the size based on RSS. If we're
10212 * shrinking, leave some room so we don't have to turn
10213 * around and grow again immediately.
10215 if (growing)
10216 tsb_szc = SELECT_TSB_SIZECODE(tte4m_cnt);
10217 else
10218 tsb_szc = SELECT_TSB_SIZECODE(tte4m_cnt << 1);
10219 if (tsbinfop->tsb_next == NULL) {
10220 struct tsb_info *newtsb;
10221 int allocflags = SFMMU_FLAGS_ISSET(sfmmup, HAT_SWAPPED)?
10222 0 : TSB_ALLOC;
10224 sfmmu_hat_exit(hatlockp);
10227 * Try to allocate a TSB for 4[32|256]M pages. If we
10228 * can't get the size we want, retry w/a minimum sized
10229 * TSB. If that still didn't work, give up; we can
10230 * still run without one.
10232 tsb_bits = (mmu_page_sizes == max_mmu_page_sizes)?
10233 TSB4M|TSB32M|TSB256M:TSB4M;
10234 if ((sfmmu_tsbinfo_alloc(&newtsb, tsb_szc, tsb_bits,
10235 allocflags, sfmmup)) &&
10236 (tsb_szc <= TSB_4M_SZCODE ||
10237 sfmmu_tsbinfo_alloc(&newtsb, TSB_4M_SZCODE,
10238 tsb_bits, allocflags, sfmmup)) &&
10239 sfmmu_tsbinfo_alloc(&newtsb, TSB_MIN_SZCODE,
10240 tsb_bits, allocflags, sfmmup)) {
10241 return;
10244 hatlockp = sfmmu_hat_enter(sfmmup);
10246 sfmmu_invalidate_ctx(sfmmup);
10248 if (sfmmup->sfmmu_tsb->tsb_next == NULL) {
10249 sfmmup->sfmmu_tsb->tsb_next = newtsb;
10250 SFMMU_STAT(sf_tsb_sectsb_create);
10251 sfmmu_hat_exit(hatlockp);
10252 return;
10253 } else {
10255 * It's annoying, but possible for us
10256 * to get here.. we dropped the HAT lock
10257 * because of locking order in the kmem
10258 * allocator, and while we were off getting
10259 * our memory, some other thread decided to
10260 * do us a favor and won the race to get a
10261 * second TSB for this process. Sigh.
10263 sfmmu_hat_exit(hatlockp);
10264 sfmmu_tsbinfo_free(newtsb);
10265 return;
10270 * We have a second TSB, see if it's big enough.
10272 tsbinfop = tsbinfop->tsb_next;
10275 * Check to see if our second TSB is the right size;
10276 * we may need to grow or shrink it.
10277 * To prevent thrashing (e.g. growing the TSB on a
10278 * subsequent map operation), only try to shrink if
10279 * the TSB reach exceeds twice the virtual address
10280 * space size.
10282 if (!growing && (tsb_szc < tsbinfop->tsb_szc) &&
10283 (tsb_szc >= default_tsb_size) && TSB_OK_SHRINK()) {
10284 (void) sfmmu_replace_tsb(sfmmup, tsbinfop,
10285 tsb_szc, hatlockp, TSB_SHRINK);
10286 } else if (growing && tsb_szc > tsbinfop->tsb_szc &&
10287 TSB_OK_GROW()) {
10288 (void) sfmmu_replace_tsb(sfmmup, tsbinfop,
10289 tsb_szc, hatlockp, TSB_GROW);
10293 sfmmu_hat_exit(hatlockp);
10297 * Free up a sfmmu
10298 * Since the sfmmu is currently embedded in the hat struct we simply zero
10299 * out our fields and free up the ism map blk list if any.
10301 static void
10302 sfmmu_free_sfmmu(sfmmu_t *sfmmup)
10304 ism_blk_t *blkp, *nx_blkp;
10305 #ifdef DEBUG
10306 ism_map_t *map;
10307 int i;
10308 #endif
10310 ASSERT(sfmmup->sfmmu_ttecnt[TTE8K] == 0);
10311 ASSERT(sfmmup->sfmmu_ttecnt[TTE64K] == 0);
10312 ASSERT(sfmmup->sfmmu_ttecnt[TTE512K] == 0);
10313 ASSERT(sfmmup->sfmmu_ttecnt[TTE4M] == 0);
10314 ASSERT(sfmmup->sfmmu_ttecnt[TTE32M] == 0);
10315 ASSERT(sfmmup->sfmmu_ttecnt[TTE256M] == 0);
10316 ASSERT(SF_RGNMAP_ISNULL(sfmmup));
10318 sfmmup->sfmmu_free = 0;
10319 sfmmup->sfmmu_ismhat = 0;
10321 blkp = sfmmup->sfmmu_iblk;
10322 sfmmup->sfmmu_iblk = NULL;
10324 while (blkp) {
10325 #ifdef DEBUG
10326 map = blkp->iblk_maps;
10327 for (i = 0; i < ISM_MAP_SLOTS; i++) {
10328 ASSERT(map[i].imap_seg == 0);
10329 ASSERT(map[i].imap_ismhat == NULL);
10330 ASSERT(map[i].imap_ment == NULL);
10332 #endif
10333 nx_blkp = blkp->iblk_next;
10334 blkp->iblk_next = NULL;
10335 blkp->iblk_nextpa = (uint64_t)-1;
10336 kmem_cache_free(ism_blk_cache, blkp);
10337 blkp = nx_blkp;
10342 * Locking primitves accessed by HATLOCK macros
10345 #define SFMMU_SPL_MTX (0x0)
10346 #define SFMMU_ML_MTX (0x1)
10348 #define SFMMU_MLSPL_MTX(type, pg) (((type) == SFMMU_SPL_MTX) ? \
10349 SPL_HASH(pg) : MLIST_HASH(pg))
10351 kmutex_t *
10352 sfmmu_page_enter(struct page *pp)
10354 return (sfmmu_mlspl_enter(pp, SFMMU_SPL_MTX));
10357 void
10358 sfmmu_page_exit(kmutex_t *spl)
10360 mutex_exit(spl);
10364 sfmmu_page_spl_held(struct page *pp)
10366 return (sfmmu_mlspl_held(pp, SFMMU_SPL_MTX));
10369 kmutex_t *
10370 sfmmu_mlist_enter(struct page *pp)
10372 return (sfmmu_mlspl_enter(pp, SFMMU_ML_MTX));
10375 void
10376 sfmmu_mlist_exit(kmutex_t *mml)
10378 mutex_exit(mml);
10382 sfmmu_mlist_held(struct page *pp)
10385 return (sfmmu_mlspl_held(pp, SFMMU_ML_MTX));
10389 * Common code for sfmmu_mlist_enter() and sfmmu_page_enter(). For
10390 * sfmmu_mlist_enter() case mml_table lock array is used and for
10391 * sfmmu_page_enter() sfmmu_page_lock lock array is used.
10393 * The lock is taken on a root page so that it protects an operation on all
10394 * constituent pages of a large page pp belongs to.
10396 * The routine takes a lock from the appropriate array. The lock is determined
10397 * by hashing the root page. After taking the lock this routine checks if the
10398 * root page has the same size code that was used to determine the root (i.e
10399 * that root hasn't changed). If root page has the expected p_szc field we
10400 * have the right lock and it's returned to the caller. If root's p_szc
10401 * decreased we release the lock and retry from the beginning. This case can
10402 * happen due to hat_page_demote() decreasing p_szc between our load of p_szc
10403 * value and taking the lock. The number of retries due to p_szc decrease is
10404 * limited by the maximum p_szc value. If p_szc is 0 we return the lock
10405 * determined by hashing pp itself.
10407 * If our caller doesn't hold a SE_SHARED or SE_EXCL lock on pp it's also
10408 * possible that p_szc can increase. To increase p_szc a thread has to lock
10409 * all constituent pages EXCL and do hat_pageunload() on all of them. All the
10410 * callers that don't hold a page locked recheck if hmeblk through which pp
10411 * was found still maps this pp. If it doesn't map it anymore returned lock
10412 * is immediately dropped. Therefore if sfmmu_mlspl_enter() hits the case of
10413 * p_szc increase after taking the lock it returns this lock without further
10414 * retries because in this case the caller doesn't care about which lock was
10415 * taken. The caller will drop it right away.
10417 * After the routine returns it's guaranteed that hat_page_demote() can't
10418 * change p_szc field of any of constituent pages of a large page pp belongs
10419 * to as long as pp was either locked at least SHARED prior to this call or
10420 * the caller finds that hment that pointed to this pp still references this
10421 * pp (this also assumes that the caller holds hme hash bucket lock so that
10422 * the same pp can't be remapped into the same hmeblk after it was unmapped by
10423 * hat_pageunload()).
10425 static kmutex_t *
10426 sfmmu_mlspl_enter(struct page *pp, int type)
10428 kmutex_t *mtx;
10429 uint_t prev_rszc = UINT_MAX;
10430 page_t *rootpp;
10431 uint_t szc;
10432 uint_t rszc;
10433 uint_t pszc = pp->p_szc;
10435 ASSERT(pp != NULL);
10437 again:
10438 if (pszc == 0) {
10439 mtx = SFMMU_MLSPL_MTX(type, pp);
10440 mutex_enter(mtx);
10441 return (mtx);
10444 /* The lock lives in the root page */
10445 rootpp = PP_GROUPLEADER(pp, pszc);
10446 mtx = SFMMU_MLSPL_MTX(type, rootpp);
10447 mutex_enter(mtx);
10450 * Return mml in the following 3 cases:
10452 * 1) If pp itself is root since if its p_szc decreased before we took
10453 * the lock pp is still the root of smaller szc page. And if its p_szc
10454 * increased it doesn't matter what lock we return (see comment in
10455 * front of this routine).
10457 * 2) If pp's not root but rootpp is the root of a rootpp->p_szc size
10458 * large page we have the right lock since any previous potential
10459 * hat_page_demote() is done demoting from greater than current root's
10460 * p_szc because hat_page_demote() changes root's p_szc last. No
10461 * further hat_page_demote() can start or be in progress since it
10462 * would need the same lock we currently hold.
10464 * 3) If rootpp's p_szc increased since previous iteration it doesn't
10465 * matter what lock we return (see comment in front of this routine).
10467 if (pp == rootpp || (rszc = rootpp->p_szc) == pszc ||
10468 rszc >= prev_rszc) {
10469 return (mtx);
10473 * hat_page_demote() could have decreased root's p_szc.
10474 * In this case pp's p_szc must also be smaller than pszc.
10475 * Retry.
10477 if (rszc < pszc) {
10478 szc = pp->p_szc;
10479 if (szc < pszc) {
10480 mutex_exit(mtx);
10481 pszc = szc;
10482 goto again;
10485 * pp's p_szc increased after it was decreased.
10486 * page cannot be mapped. Return current lock. The caller
10487 * will drop it right away.
10489 return (mtx);
10493 * root's p_szc is greater than pp's p_szc.
10494 * hat_page_demote() is not done with all pages
10495 * yet. Wait for it to complete.
10497 mutex_exit(mtx);
10498 rootpp = PP_GROUPLEADER(rootpp, rszc);
10499 mtx = SFMMU_MLSPL_MTX(type, rootpp);
10500 mutex_enter(mtx);
10501 mutex_exit(mtx);
10502 prev_rszc = rszc;
10503 goto again;
10506 static int
10507 sfmmu_mlspl_held(struct page *pp, int type)
10509 kmutex_t *mtx;
10511 ASSERT(pp != NULL);
10512 /* The lock lives in the root page */
10513 pp = PP_PAGEROOT(pp);
10514 ASSERT(pp != NULL);
10516 mtx = SFMMU_MLSPL_MTX(type, pp);
10517 return (MUTEX_HELD(mtx));
10520 static uint_t
10521 sfmmu_get_free_hblk(struct hme_blk **hmeblkpp, uint_t critical)
10523 struct hme_blk *hblkp;
10526 if (freehblkp != NULL) {
10527 mutex_enter(&freehblkp_lock);
10528 if (freehblkp != NULL) {
10530 * If the current thread is owning hblk_reserve OR
10531 * critical request from sfmmu_hblk_steal()
10532 * let it succeed even if freehblkcnt is really low.
10534 if (freehblkcnt <= HBLK_RESERVE_MIN && !critical) {
10535 SFMMU_STAT(sf_get_free_throttle);
10536 mutex_exit(&freehblkp_lock);
10537 return (0);
10539 freehblkcnt--;
10540 *hmeblkpp = freehblkp;
10541 hblkp = *hmeblkpp;
10542 freehblkp = hblkp->hblk_next;
10543 mutex_exit(&freehblkp_lock);
10544 hblkp->hblk_next = NULL;
10545 SFMMU_STAT(sf_get_free_success);
10547 ASSERT(hblkp->hblk_hmecnt == 0);
10548 ASSERT(hblkp->hblk_vcnt == 0);
10549 ASSERT(hblkp->hblk_nextpa == va_to_pa((caddr_t)hblkp));
10551 return (1);
10553 mutex_exit(&freehblkp_lock);
10556 /* Check cpu hblk pending queues */
10557 if ((*hmeblkpp = sfmmu_check_pending_hblks(TTE8K)) != NULL) {
10558 hblkp = *hmeblkpp;
10559 hblkp->hblk_next = NULL;
10560 hblkp->hblk_nextpa = va_to_pa((caddr_t)hblkp);
10562 ASSERT(hblkp->hblk_hmecnt == 0);
10563 ASSERT(hblkp->hblk_vcnt == 0);
10565 return (1);
10568 SFMMU_STAT(sf_get_free_fail);
10569 return (0);
10572 static uint_t
10573 sfmmu_put_free_hblk(struct hme_blk *hmeblkp, uint_t critical)
10575 struct hme_blk *hblkp;
10577 ASSERT(hmeblkp->hblk_hmecnt == 0);
10578 ASSERT(hmeblkp->hblk_vcnt == 0);
10579 ASSERT(hmeblkp->hblk_nextpa == va_to_pa((caddr_t)hmeblkp));
10582 * If the current thread is mapping into kernel space,
10583 * let it succede even if freehblkcnt is max
10584 * so that it will avoid freeing it to kmem.
10585 * This will prevent stack overflow due to
10586 * possible recursion since kmem_cache_free()
10587 * might require creation of a slab which
10588 * in turn needs an hmeblk to map that slab;
10589 * let's break this vicious chain at the first
10590 * opportunity.
10592 if (freehblkcnt < HBLK_RESERVE_CNT || critical) {
10593 mutex_enter(&freehblkp_lock);
10594 if (freehblkcnt < HBLK_RESERVE_CNT || critical) {
10595 SFMMU_STAT(sf_put_free_success);
10596 freehblkcnt++;
10597 hmeblkp->hblk_next = freehblkp;
10598 freehblkp = hmeblkp;
10599 mutex_exit(&freehblkp_lock);
10600 return (1);
10602 mutex_exit(&freehblkp_lock);
10606 * Bring down freehblkcnt to HBLK_RESERVE_CNT. We are here
10607 * only if freehblkcnt is at least HBLK_RESERVE_CNT *and*
10608 * we are not in the process of mapping into kernel space.
10610 ASSERT(!critical);
10611 while (freehblkcnt > HBLK_RESERVE_CNT) {
10612 mutex_enter(&freehblkp_lock);
10613 if (freehblkcnt > HBLK_RESERVE_CNT) {
10614 freehblkcnt--;
10615 hblkp = freehblkp;
10616 freehblkp = hblkp->hblk_next;
10617 mutex_exit(&freehblkp_lock);
10618 ASSERT(get_hblk_cache(hblkp) == sfmmu8_cache);
10619 kmem_cache_free(sfmmu8_cache, hblkp);
10620 continue;
10622 mutex_exit(&freehblkp_lock);
10624 SFMMU_STAT(sf_put_free_fail);
10625 return (0);
10628 static void
10629 sfmmu_hblk_swap(struct hme_blk *new)
10631 struct hme_blk *old, *hblkp, *prev;
10632 uint64_t newpa;
10633 caddr_t base, vaddr, endaddr;
10634 struct hmehash_bucket *hmebp;
10635 struct sf_hment *osfhme, *nsfhme;
10636 page_t *pp;
10637 kmutex_t *pml;
10638 tte_t tte;
10639 struct hme_blk *list = NULL;
10641 #ifdef DEBUG
10642 hmeblk_tag hblktag;
10643 struct hme_blk *found;
10644 #endif
10645 old = HBLK_RESERVE;
10646 ASSERT(!old->hblk_shared);
10649 * save pa before bcopy clobbers it
10651 newpa = new->hblk_nextpa;
10653 base = (caddr_t)get_hblk_base(old);
10654 endaddr = base + get_hblk_span(old);
10657 * acquire hash bucket lock.
10659 hmebp = sfmmu_tteload_acquire_hashbucket(ksfmmup, base, TTE8K,
10660 SFMMU_INVALID_SHMERID);
10663 * copy contents from old to new
10665 bcopy((void *)old, (void *)new, HME8BLK_SZ);
10668 * add new to hash chain
10670 sfmmu_hblk_hash_add(hmebp, new, newpa);
10673 * search hash chain for hblk_reserve; this needs to be performed
10674 * after adding new, otherwise prev won't correspond to the hblk which
10675 * is prior to old in hash chain when we call sfmmu_hblk_hash_rm to
10676 * remove old later.
10678 for (prev = NULL,
10679 hblkp = hmebp->hmeblkp; hblkp != NULL && hblkp != old;
10680 prev = hblkp, hblkp = hblkp->hblk_next)
10683 if (hblkp != old)
10684 panic("sfmmu_hblk_swap: hblk_reserve not found");
10687 * p_mapping list is still pointing to hments in hblk_reserve;
10688 * fix up p_mapping list so that they point to hments in new.
10690 * Since all these mappings are created by hblk_reserve_thread
10691 * on the way and it's using at least one of the buffers from each of
10692 * the newly minted slabs, there is no danger of any of these
10693 * mappings getting unloaded by another thread.
10695 * tsbmiss could only modify ref/mod bits of hments in old/new.
10696 * Since all of these hments hold mappings established by segkmem
10697 * and mappings in segkmem are setup with HAT_NOSYNC, ref/mod bits
10698 * have no meaning for the mappings in hblk_reserve. hments in
10699 * old and new are identical except for ref/mod bits.
10701 for (vaddr = base; vaddr < endaddr; vaddr += TTEBYTES(TTE8K)) {
10703 HBLKTOHME(osfhme, old, vaddr);
10704 sfmmu_copytte(&osfhme->hme_tte, &tte);
10706 if (TTE_IS_VALID(&tte)) {
10707 if ((pp = osfhme->hme_page) == NULL)
10708 panic("sfmmu_hblk_swap: page not mapped");
10710 pml = sfmmu_mlist_enter(pp);
10712 if (pp != osfhme->hme_page)
10713 panic("sfmmu_hblk_swap: mapping changed");
10715 HBLKTOHME(nsfhme, new, vaddr);
10717 HME_ADD(nsfhme, pp);
10718 HME_SUB(osfhme, pp);
10720 sfmmu_mlist_exit(pml);
10725 * remove old from hash chain
10727 sfmmu_hblk_hash_rm(hmebp, old, prev, &list, 1);
10729 #ifdef DEBUG
10731 hblktag.htag_id = ksfmmup;
10732 hblktag.htag_rid = SFMMU_INVALID_SHMERID;
10733 hblktag.htag_bspage = HME_HASH_BSPAGE(base, HME_HASH_SHIFT(TTE8K));
10734 hblktag.htag_rehash = HME_HASH_REHASH(TTE8K);
10735 HME_HASH_FAST_SEARCH(hmebp, hblktag, found);
10737 if (found != new)
10738 panic("sfmmu_hblk_swap: new hblk not found");
10739 #endif
10741 SFMMU_HASH_UNLOCK(hmebp);
10744 * Reset hblk_reserve
10746 bzero((void *)old, HME8BLK_SZ);
10747 old->hblk_nextpa = va_to_pa((caddr_t)old);
10751 * Grab the mlist mutex for both pages passed in.
10753 * low and high will be returned as pointers to the mutexes for these pages.
10754 * low refers to the mutex residing in the lower bin of the mlist hash, while
10755 * high refers to the mutex residing in the higher bin of the mlist hash. This
10756 * is due to the locking order restrictions on the same thread grabbing
10757 * multiple mlist mutexes. The low lock must be acquired before the high lock.
10759 * If both pages hash to the same mutex, only grab that single mutex, and
10760 * high will be returned as NULL
10761 * If the pages hash to different bins in the hash, grab the lower addressed
10762 * lock first and then the higher addressed lock in order to follow the locking
10763 * rules involved with the same thread grabbing multiple mlist mutexes.
10764 * low and high will both have non-NULL values.
10766 static void
10767 sfmmu_mlist_reloc_enter(struct page *targ, struct page *repl,
10768 kmutex_t **low, kmutex_t **high)
10770 kmutex_t *mml_targ, *mml_repl;
10773 * no need to do the dance around szc as in sfmmu_mlist_enter()
10774 * because this routine is only called by hat_page_relocate() and all
10775 * targ and repl pages are already locked EXCL so szc can't change.
10778 mml_targ = MLIST_HASH(PP_PAGEROOT(targ));
10779 mml_repl = MLIST_HASH(PP_PAGEROOT(repl));
10781 if (mml_targ == mml_repl) {
10782 *low = mml_targ;
10783 *high = NULL;
10784 } else {
10785 if (mml_targ < mml_repl) {
10786 *low = mml_targ;
10787 *high = mml_repl;
10788 } else {
10789 *low = mml_repl;
10790 *high = mml_targ;
10794 mutex_enter(*low);
10795 if (*high)
10796 mutex_enter(*high);
10799 static void
10800 sfmmu_mlist_reloc_exit(kmutex_t *low, kmutex_t *high)
10802 if (high)
10803 mutex_exit(high);
10804 mutex_exit(low);
10807 static hatlock_t *
10808 sfmmu_hat_enter(sfmmu_t *sfmmup)
10810 hatlock_t *hatlockp;
10812 if (sfmmup != ksfmmup) {
10813 hatlockp = TSB_HASH(sfmmup);
10814 mutex_enter(HATLOCK_MUTEXP(hatlockp));
10815 return (hatlockp);
10817 return (NULL);
10820 static hatlock_t *
10821 sfmmu_hat_tryenter(sfmmu_t *sfmmup)
10823 hatlock_t *hatlockp;
10825 if (sfmmup != ksfmmup) {
10826 hatlockp = TSB_HASH(sfmmup);
10827 if (mutex_tryenter(HATLOCK_MUTEXP(hatlockp)) == 0)
10828 return (NULL);
10829 return (hatlockp);
10831 return (NULL);
10834 static void
10835 sfmmu_hat_exit(hatlock_t *hatlockp)
10837 if (hatlockp != NULL)
10838 mutex_exit(HATLOCK_MUTEXP(hatlockp));
10841 static void
10842 sfmmu_hat_lock_all(void)
10844 int i;
10845 for (i = 0; i < SFMMU_NUM_LOCK; i++)
10846 mutex_enter(HATLOCK_MUTEXP(&hat_lock[i]));
10849 static void
10850 sfmmu_hat_unlock_all(void)
10852 int i;
10853 for (i = SFMMU_NUM_LOCK - 1; i >= 0; i--)
10854 mutex_exit(HATLOCK_MUTEXP(&hat_lock[i]));
10858 sfmmu_hat_lock_held(sfmmu_t *sfmmup)
10860 ASSERT(sfmmup != ksfmmup);
10861 return (MUTEX_HELD(HATLOCK_MUTEXP(TSB_HASH(sfmmup))));
10865 * Locking primitives to provide consistency between ISM unmap
10866 * and other operations. Since ISM unmap can take a long time, we
10867 * use HAT_ISMBUSY flag (protected by the hatlock) to avoid creating
10868 * contention on the hatlock buckets while ISM segments are being
10869 * unmapped. The tradeoff is that the flags don't prevent priority
10870 * inversion from occurring, so we must request kernel priority in
10871 * case we have to sleep to keep from getting buried while holding
10872 * the HAT_ISMBUSY flag set, which in turn could block other kernel
10873 * threads from running (for example, in sfmmu_uvatopfn()).
10875 static void
10876 sfmmu_ismhat_enter(sfmmu_t *sfmmup, int hatlock_held)
10878 hatlock_t *hatlockp;
10880 THREAD_KPRI_REQUEST();
10881 if (!hatlock_held)
10882 hatlockp = sfmmu_hat_enter(sfmmup);
10883 while (SFMMU_FLAGS_ISSET(sfmmup, HAT_ISMBUSY))
10884 cv_wait(&sfmmup->sfmmu_tsb_cv, HATLOCK_MUTEXP(hatlockp));
10885 SFMMU_FLAGS_SET(sfmmup, HAT_ISMBUSY);
10886 if (!hatlock_held)
10887 sfmmu_hat_exit(hatlockp);
10890 static void
10891 sfmmu_ismhat_exit(sfmmu_t *sfmmup, int hatlock_held)
10893 hatlock_t *hatlockp;
10895 if (!hatlock_held)
10896 hatlockp = sfmmu_hat_enter(sfmmup);
10897 ASSERT(SFMMU_FLAGS_ISSET(sfmmup, HAT_ISMBUSY));
10898 SFMMU_FLAGS_CLEAR(sfmmup, HAT_ISMBUSY);
10899 cv_broadcast(&sfmmup->sfmmu_tsb_cv);
10900 if (!hatlock_held)
10901 sfmmu_hat_exit(hatlockp);
10902 THREAD_KPRI_RELEASE();
10907 * Algorithm:
10909 * (1) if segkmem is not ready, allocate hblk from an array of pre-alloc'ed
10910 * hblks.
10912 * (2) if we are allocating an hblk for mapping a slab in sfmmu_cache,
10914 * (a) try to return an hblk from reserve pool of free hblks;
10915 * (b) if the reserve pool is empty, acquire hblk_reserve_lock
10916 * and return hblk_reserve.
10918 * (3) call kmem_cache_alloc() to allocate hblk;
10920 * (a) if hblk_reserve_lock is held by the current thread,
10921 * atomically replace hblk_reserve by the hblk that is
10922 * returned by kmem_cache_alloc; release hblk_reserve_lock
10923 * and call kmem_cache_alloc() again.
10924 * (b) if reserve pool is not full, add the hblk that is
10925 * returned by kmem_cache_alloc to reserve pool and
10926 * call kmem_cache_alloc again.
10929 static struct hme_blk *
10930 sfmmu_hblk_alloc(sfmmu_t *sfmmup, caddr_t vaddr,
10931 struct hmehash_bucket *hmebp, uint_t size, hmeblk_tag hblktag,
10932 uint_t flags, uint_t rid)
10934 struct hme_blk *hmeblkp = NULL;
10935 struct hme_blk *newhblkp;
10936 struct hme_blk *shw_hblkp = NULL;
10937 struct kmem_cache *sfmmu_cache = NULL;
10938 uint64_t hblkpa;
10939 ulong_t index;
10940 uint_t owner; /* set to 1 if using hblk_reserve */
10941 uint_t forcefree;
10942 int sleep;
10943 sf_srd_t *srdp;
10944 sf_region_t *rgnp;
10946 ASSERT(SFMMU_HASH_LOCK_ISHELD(hmebp));
10947 ASSERT(hblktag.htag_rid == rid);
10948 SFMMU_VALIDATE_HMERID(sfmmup, rid, vaddr, TTEBYTES(size));
10949 ASSERT(!SFMMU_IS_SHMERID_VALID(rid) ||
10950 IS_P2ALIGNED(vaddr, TTEBYTES(size)));
10953 * If segkmem is not created yet, allocate from static hmeblks
10954 * created at the end of startup_modules(). See the block comment
10955 * in startup_modules() describing how we estimate the number of
10956 * static hmeblks that will be needed during re-map.
10958 if (!hblk_alloc_dynamic) {
10960 ASSERT(!SFMMU_IS_SHMERID_VALID(rid));
10962 if (size == TTE8K) {
10963 index = nucleus_hblk8.index;
10964 if (index >= nucleus_hblk8.len) {
10966 * If we panic here, see startup_modules() to
10967 * make sure that we are calculating the
10968 * number of hblk8's that we need correctly.
10970 prom_panic("no nucleus hblk8 to allocate");
10972 hmeblkp =
10973 (struct hme_blk *)&nucleus_hblk8.list[index];
10974 nucleus_hblk8.index++;
10975 SFMMU_STAT(sf_hblk8_nalloc);
10976 } else {
10977 index = nucleus_hblk1.index;
10978 if (nucleus_hblk1.index >= nucleus_hblk1.len) {
10980 * If we panic here, see startup_modules().
10981 * Most likely you need to update the
10982 * calculation of the number of hblk1 elements
10983 * that the kernel needs to boot.
10985 prom_panic("no nucleus hblk1 to allocate");
10987 hmeblkp =
10988 (struct hme_blk *)&nucleus_hblk1.list[index];
10989 nucleus_hblk1.index++;
10990 SFMMU_STAT(sf_hblk1_nalloc);
10993 goto hblk_init;
10996 SFMMU_HASH_UNLOCK(hmebp);
10998 if (sfmmup != KHATID && !SFMMU_IS_SHMERID_VALID(rid)) {
10999 if (mmu_page_sizes == max_mmu_page_sizes) {
11000 if (size < TTE256M)
11001 shw_hblkp = sfmmu_shadow_hcreate(sfmmup, vaddr,
11002 size, flags);
11003 } else {
11004 if (size < TTE4M)
11005 shw_hblkp = sfmmu_shadow_hcreate(sfmmup, vaddr,
11006 size, flags);
11008 } else if (SFMMU_IS_SHMERID_VALID(rid)) {
11010 * Shared hmes use per region bitmaps in rgn_hmeflag
11011 * rather than shadow hmeblks to keep track of the
11012 * mapping sizes which have been allocated for the region.
11013 * Here we cleanup old invalid hmeblks with this rid,
11014 * which may be left around by pageunload().
11016 int ttesz;
11017 caddr_t va;
11018 caddr_t eva = vaddr + TTEBYTES(size);
11020 ASSERT(sfmmup != KHATID);
11022 srdp = sfmmup->sfmmu_srdp;
11023 ASSERT(srdp != NULL && srdp->srd_refcnt != 0);
11024 rgnp = srdp->srd_hmergnp[rid];
11025 ASSERT(rgnp != NULL && rgnp->rgn_id == rid);
11026 ASSERT(rgnp->rgn_refcnt != 0);
11027 ASSERT(size <= rgnp->rgn_pgszc);
11029 ttesz = HBLK_MIN_TTESZ;
11030 do {
11031 if (!(rgnp->rgn_hmeflags & (0x1 << ttesz))) {
11032 continue;
11035 if (ttesz > size && ttesz != HBLK_MIN_TTESZ) {
11036 sfmmu_cleanup_rhblk(srdp, vaddr, rid, ttesz);
11037 } else if (ttesz < size) {
11038 for (va = vaddr; va < eva;
11039 va += TTEBYTES(ttesz)) {
11040 sfmmu_cleanup_rhblk(srdp, va, rid,
11041 ttesz);
11044 } while (++ttesz <= rgnp->rgn_pgszc);
11047 fill_hblk:
11048 owner = (hblk_reserve_thread == curthread) ? 1 : 0;
11050 if (owner && size == TTE8K) {
11052 ASSERT(!SFMMU_IS_SHMERID_VALID(rid));
11054 * We are really in a tight spot. We already own
11055 * hblk_reserve and we need another hblk. In anticipation
11056 * of this kind of scenario, we specifically set aside
11057 * HBLK_RESERVE_MIN number of hblks to be used exclusively
11058 * by owner of hblk_reserve.
11060 SFMMU_STAT(sf_hblk_recurse_cnt);
11062 if (!sfmmu_get_free_hblk(&hmeblkp, 1))
11063 panic("sfmmu_hblk_alloc: reserve list is empty");
11065 goto hblk_verify;
11068 ASSERT(!owner);
11070 if ((flags & HAT_NO_KALLOC) == 0) {
11072 sfmmu_cache = ((size == TTE8K) ? sfmmu8_cache : sfmmu1_cache);
11073 sleep = ((sfmmup == KHATID) ? KM_NOSLEEP : KM_SLEEP);
11075 if ((hmeblkp = kmem_cache_alloc(sfmmu_cache, sleep)) == NULL) {
11076 hmeblkp = sfmmu_hblk_steal(size);
11077 } else {
11079 * if we are the owner of hblk_reserve,
11080 * swap hblk_reserve with hmeblkp and
11081 * start a fresh life. Hope things go
11082 * better this time.
11084 if (hblk_reserve_thread == curthread) {
11085 ASSERT(sfmmu_cache == sfmmu8_cache);
11086 sfmmu_hblk_swap(hmeblkp);
11087 hblk_reserve_thread = NULL;
11088 mutex_exit(&hblk_reserve_lock);
11089 goto fill_hblk;
11092 * let's donate this hblk to our reserve list if
11093 * we are not mapping kernel range
11095 if (size == TTE8K && sfmmup != KHATID) {
11096 if (sfmmu_put_free_hblk(hmeblkp, 0))
11097 goto fill_hblk;
11100 } else {
11102 * We are here to map the slab in sfmmu8_cache; let's
11103 * check if we could tap our reserve list; if successful,
11104 * this will avoid the pain of going thru sfmmu_hblk_swap
11106 SFMMU_STAT(sf_hblk_slab_cnt);
11107 if (!sfmmu_get_free_hblk(&hmeblkp, 0)) {
11109 * let's start hblk_reserve dance
11111 SFMMU_STAT(sf_hblk_reserve_cnt);
11112 owner = 1;
11113 mutex_enter(&hblk_reserve_lock);
11114 hmeblkp = HBLK_RESERVE;
11115 hblk_reserve_thread = curthread;
11119 hblk_verify:
11120 ASSERT(hmeblkp != NULL);
11121 set_hblk_sz(hmeblkp, size);
11122 ASSERT(hmeblkp->hblk_nextpa == va_to_pa((caddr_t)hmeblkp));
11123 SFMMU_HASH_LOCK(hmebp);
11124 HME_HASH_FAST_SEARCH(hmebp, hblktag, newhblkp);
11125 if (newhblkp != NULL) {
11126 SFMMU_HASH_UNLOCK(hmebp);
11127 if (hmeblkp != HBLK_RESERVE) {
11129 * This is really tricky!
11131 * vmem_alloc(vmem_seg_arena)
11132 * vmem_alloc(vmem_internal_arena)
11133 * segkmem_alloc(heap_arena)
11134 * vmem_alloc(heap_arena)
11135 * page_create()
11136 * hat_memload()
11137 * kmem_cache_free()
11138 * kmem_cache_alloc()
11139 * kmem_slab_create()
11140 * vmem_alloc(kmem_internal_arena)
11141 * segkmem_alloc(heap_arena)
11142 * vmem_alloc(heap_arena)
11143 * page_create()
11144 * hat_memload()
11145 * kmem_cache_free()
11146 * ...
11148 * Thus, hat_memload() could call kmem_cache_free
11149 * for enough number of times that we could easily
11150 * hit the bottom of the stack or run out of reserve
11151 * list of vmem_seg structs. So, we must donate
11152 * this hblk to reserve list if it's allocated
11153 * from sfmmu8_cache *and* mapping kernel range.
11154 * We don't need to worry about freeing hmeblk1's
11155 * to kmem since they don't map any kmem slabs.
11157 * Note: When segkmem supports largepages, we must
11158 * free hmeblk1's to reserve list as well.
11160 forcefree = (sfmmup == KHATID) ? 1 : 0;
11161 if (size == TTE8K &&
11162 sfmmu_put_free_hblk(hmeblkp, forcefree)) {
11163 goto re_verify;
11165 ASSERT(sfmmup != KHATID);
11166 kmem_cache_free(get_hblk_cache(hmeblkp), hmeblkp);
11167 } else {
11169 * Hey! we don't need hblk_reserve any more.
11171 ASSERT(owner);
11172 hblk_reserve_thread = NULL;
11173 mutex_exit(&hblk_reserve_lock);
11174 owner = 0;
11176 re_verify:
11178 * let's check if the goodies are still present
11180 SFMMU_HASH_LOCK(hmebp);
11181 HME_HASH_FAST_SEARCH(hmebp, hblktag, newhblkp);
11182 if (newhblkp != NULL) {
11184 * return newhblkp if it's not hblk_reserve;
11185 * if newhblkp is hblk_reserve, return it
11186 * _only if_ we are the owner of hblk_reserve.
11188 if (newhblkp != HBLK_RESERVE || owner) {
11189 ASSERT(!SFMMU_IS_SHMERID_VALID(rid) ||
11190 newhblkp->hblk_shared);
11191 ASSERT(SFMMU_IS_SHMERID_VALID(rid) ||
11192 !newhblkp->hblk_shared);
11193 return (newhblkp);
11194 } else {
11196 * we just hit hblk_reserve in the hash and
11197 * we are not the owner of that;
11199 * block until hblk_reserve_thread completes
11200 * swapping hblk_reserve and try the dance
11201 * once again.
11203 SFMMU_HASH_UNLOCK(hmebp);
11204 mutex_enter(&hblk_reserve_lock);
11205 mutex_exit(&hblk_reserve_lock);
11206 SFMMU_STAT(sf_hblk_reserve_hit);
11207 goto fill_hblk;
11209 } else {
11211 * it's no more! try the dance once again.
11213 SFMMU_HASH_UNLOCK(hmebp);
11214 goto fill_hblk;
11218 hblk_init:
11219 if (SFMMU_IS_SHMERID_VALID(rid)) {
11220 uint16_t tteflag = 0x1 <<
11221 ((size < HBLK_MIN_TTESZ) ? HBLK_MIN_TTESZ : size);
11223 if (!(rgnp->rgn_hmeflags & tteflag)) {
11224 atomic_or_16(&rgnp->rgn_hmeflags, tteflag);
11226 hmeblkp->hblk_shared = 1;
11227 } else {
11228 hmeblkp->hblk_shared = 0;
11230 set_hblk_sz(hmeblkp, size);
11231 ASSERT(SFMMU_HASH_LOCK_ISHELD(hmebp));
11232 hmeblkp->hblk_next = (struct hme_blk *)NULL;
11233 hmeblkp->hblk_tag = hblktag;
11234 hmeblkp->hblk_shadow = shw_hblkp;
11235 hblkpa = hmeblkp->hblk_nextpa;
11236 hmeblkp->hblk_nextpa = HMEBLK_ENDPA;
11238 ASSERT(get_hblk_ttesz(hmeblkp) == size);
11239 ASSERT(get_hblk_span(hmeblkp) == HMEBLK_SPAN(size));
11240 ASSERT(hmeblkp->hblk_hmecnt == 0);
11241 ASSERT(hmeblkp->hblk_vcnt == 0);
11242 ASSERT(hmeblkp->hblk_lckcnt == 0);
11243 ASSERT(hblkpa == va_to_pa((caddr_t)hmeblkp));
11244 sfmmu_hblk_hash_add(hmebp, hmeblkp, hblkpa);
11245 return (hmeblkp);
11249 * This function cleans up the hme_blk and returns it to the free list.
11251 /* ARGSUSED */
11252 static void
11253 sfmmu_hblk_free(struct hme_blk **listp)
11255 struct hme_blk *hmeblkp, *next_hmeblkp;
11256 int size;
11257 uint_t critical;
11258 uint64_t hblkpa;
11260 ASSERT(*listp != NULL);
11262 hmeblkp = *listp;
11263 while (hmeblkp != NULL) {
11264 next_hmeblkp = hmeblkp->hblk_next;
11265 ASSERT(!hmeblkp->hblk_hmecnt);
11266 ASSERT(!hmeblkp->hblk_vcnt);
11267 ASSERT(!hmeblkp->hblk_lckcnt);
11268 ASSERT(hmeblkp != (struct hme_blk *)hblk_reserve);
11269 ASSERT(hmeblkp->hblk_shared == 0);
11270 ASSERT(hmeblkp->hblk_shw_bit == 0);
11271 ASSERT(hmeblkp->hblk_shadow == NULL);
11273 hblkpa = va_to_pa((caddr_t)hmeblkp);
11274 ASSERT(hblkpa != (uint64_t)-1);
11275 critical = (hblktosfmmu(hmeblkp) == KHATID) ? 1 : 0;
11277 size = get_hblk_ttesz(hmeblkp);
11278 hmeblkp->hblk_next = NULL;
11279 hmeblkp->hblk_nextpa = hblkpa;
11281 if (hmeblkp->hblk_nuc_bit == 0) {
11283 if (size != TTE8K ||
11284 !sfmmu_put_free_hblk(hmeblkp, critical))
11285 kmem_cache_free(get_hblk_cache(hmeblkp),
11286 hmeblkp);
11288 hmeblkp = next_hmeblkp;
11292 #define BUCKETS_TO_SEARCH_BEFORE_UNLOAD 30
11293 #define SFMMU_HBLK_STEAL_THRESHOLD 5
11295 static uint_t sfmmu_hblk_steal_twice;
11296 static uint_t sfmmu_hblk_steal_count, sfmmu_hblk_steal_unload_count;
11299 * Steal a hmeblk from user or kernel hme hash lists.
11300 * For 8K tte grab one from reserve pool (freehblkp) before proceeding to
11301 * steal and if we fail to steal after SFMMU_HBLK_STEAL_THRESHOLD attempts
11302 * tap into critical reserve of freehblkp.
11303 * Note: We remain looping in this routine until we find one.
11305 static struct hme_blk *
11306 sfmmu_hblk_steal(int size)
11308 static struct hmehash_bucket *uhmehash_steal_hand = NULL;
11309 struct hmehash_bucket *hmebp;
11310 struct hme_blk *hmeblkp = NULL, *pr_hblk;
11311 uint64_t hblkpa;
11312 int i;
11313 uint_t loop_cnt = 0, critical;
11315 for (;;) {
11316 /* Check cpu hblk pending queues */
11317 if ((hmeblkp = sfmmu_check_pending_hblks(size)) != NULL) {
11318 hmeblkp->hblk_nextpa = va_to_pa((caddr_t)hmeblkp);
11319 ASSERT(hmeblkp->hblk_hmecnt == 0);
11320 ASSERT(hmeblkp->hblk_vcnt == 0);
11321 return (hmeblkp);
11324 if (size == TTE8K) {
11325 critical =
11326 (++loop_cnt > SFMMU_HBLK_STEAL_THRESHOLD) ? 1 : 0;
11327 if (sfmmu_get_free_hblk(&hmeblkp, critical))
11328 return (hmeblkp);
11331 hmebp = (uhmehash_steal_hand == NULL) ? uhme_hash :
11332 uhmehash_steal_hand;
11333 ASSERT(hmebp >= uhme_hash && hmebp <= &uhme_hash[UHMEHASH_SZ]);
11335 for (i = 0; hmeblkp == NULL && i <= UHMEHASH_SZ +
11336 BUCKETS_TO_SEARCH_BEFORE_UNLOAD; i++) {
11337 SFMMU_HASH_LOCK(hmebp);
11338 hmeblkp = hmebp->hmeblkp;
11339 hblkpa = hmebp->hmeh_nextpa;
11340 pr_hblk = NULL;
11341 while (hmeblkp) {
11343 * check if it is a hmeblk that is not locked
11344 * and not shared. skip shadow hmeblks with
11345 * shadow_mask set i.e valid count non zero.
11347 if ((get_hblk_ttesz(hmeblkp) == size) &&
11348 (hmeblkp->hblk_shw_bit == 0 ||
11349 hmeblkp->hblk_vcnt == 0) &&
11350 (hmeblkp->hblk_lckcnt == 0)) {
11352 * there is a high probability that we
11353 * will find a free one. search some
11354 * buckets for a free hmeblk initially
11355 * before unloading a valid hmeblk.
11357 if ((hmeblkp->hblk_vcnt == 0 &&
11358 hmeblkp->hblk_hmecnt == 0) || (i >=
11359 BUCKETS_TO_SEARCH_BEFORE_UNLOAD)) {
11360 if (sfmmu_steal_this_hblk(hmebp,
11361 hmeblkp, hblkpa, pr_hblk)) {
11363 * Hblk is unloaded
11364 * successfully
11366 break;
11370 pr_hblk = hmeblkp;
11371 hblkpa = hmeblkp->hblk_nextpa;
11372 hmeblkp = hmeblkp->hblk_next;
11375 SFMMU_HASH_UNLOCK(hmebp);
11376 if (hmebp++ == &uhme_hash[UHMEHASH_SZ])
11377 hmebp = uhme_hash;
11379 uhmehash_steal_hand = hmebp;
11381 if (hmeblkp != NULL)
11382 break;
11385 * in the worst case, look for a free one in the kernel
11386 * hash table.
11388 for (i = 0, hmebp = khme_hash; i <= KHMEHASH_SZ; i++) {
11389 SFMMU_HASH_LOCK(hmebp);
11390 hmeblkp = hmebp->hmeblkp;
11391 hblkpa = hmebp->hmeh_nextpa;
11392 pr_hblk = NULL;
11393 while (hmeblkp) {
11395 * check if it is free hmeblk
11397 if ((get_hblk_ttesz(hmeblkp) == size) &&
11398 (hmeblkp->hblk_lckcnt == 0) &&
11399 (hmeblkp->hblk_vcnt == 0) &&
11400 (hmeblkp->hblk_hmecnt == 0)) {
11401 if (sfmmu_steal_this_hblk(hmebp,
11402 hmeblkp, hblkpa, pr_hblk)) {
11403 break;
11404 } else {
11406 * Cannot fail since we have
11407 * hash lock.
11409 panic("fail to steal?");
11413 pr_hblk = hmeblkp;
11414 hblkpa = hmeblkp->hblk_nextpa;
11415 hmeblkp = hmeblkp->hblk_next;
11418 SFMMU_HASH_UNLOCK(hmebp);
11419 if (hmebp++ == &khme_hash[KHMEHASH_SZ])
11420 hmebp = khme_hash;
11423 if (hmeblkp != NULL)
11424 break;
11425 sfmmu_hblk_steal_twice++;
11427 return (hmeblkp);
11431 * This routine does real work to prepare a hblk to be "stolen" by
11432 * unloading the mappings, updating shadow counts ....
11433 * It returns 1 if the block is ready to be reused (stolen), or 0
11434 * means the block cannot be stolen yet- pageunload is still working
11435 * on this hblk.
11437 static int
11438 sfmmu_steal_this_hblk(struct hmehash_bucket *hmebp, struct hme_blk *hmeblkp,
11439 uint64_t hblkpa, struct hme_blk *pr_hblk)
11441 int shw_size, vshift;
11442 struct hme_blk *shw_hblkp;
11443 caddr_t vaddr;
11444 uint_t shw_mask, newshw_mask;
11445 struct hme_blk *list = NULL;
11447 ASSERT(SFMMU_HASH_LOCK_ISHELD(hmebp));
11450 * check if the hmeblk is free, unload if necessary
11452 if (hmeblkp->hblk_vcnt || hmeblkp->hblk_hmecnt) {
11453 sfmmu_t *sfmmup;
11454 demap_range_t dmr;
11456 sfmmup = hblktosfmmu(hmeblkp);
11457 if (hmeblkp->hblk_shared || sfmmup->sfmmu_ismhat) {
11458 return (0);
11460 DEMAP_RANGE_INIT(sfmmup, &dmr);
11461 (void) sfmmu_hblk_unload(sfmmup, hmeblkp,
11462 (caddr_t)get_hblk_base(hmeblkp),
11463 get_hblk_endaddr(hmeblkp), &dmr, HAT_UNLOAD);
11464 DEMAP_RANGE_FLUSH(&dmr);
11465 if (hmeblkp->hblk_vcnt || hmeblkp->hblk_hmecnt) {
11467 * Pageunload is working on the same hblk.
11469 return (0);
11472 sfmmu_hblk_steal_unload_count++;
11475 ASSERT(hmeblkp->hblk_lckcnt == 0);
11476 ASSERT(hmeblkp->hblk_vcnt == 0 && hmeblkp->hblk_hmecnt == 0);
11478 sfmmu_hblk_hash_rm(hmebp, hmeblkp, pr_hblk, &list, 1);
11479 hmeblkp->hblk_nextpa = hblkpa;
11481 shw_hblkp = hmeblkp->hblk_shadow;
11482 if (shw_hblkp) {
11483 ASSERT(!hmeblkp->hblk_shared);
11484 shw_size = get_hblk_ttesz(shw_hblkp);
11485 vaddr = (caddr_t)get_hblk_base(hmeblkp);
11486 vshift = vaddr_to_vshift(shw_hblkp->hblk_tag, vaddr, shw_size);
11487 ASSERT(vshift < 8);
11489 * Atomically clear shadow mask bit
11491 do {
11492 shw_mask = shw_hblkp->hblk_shw_mask;
11493 ASSERT(shw_mask & (1 << vshift));
11494 newshw_mask = shw_mask & ~(1 << vshift);
11495 newshw_mask = atomic_cas_32(&shw_hblkp->hblk_shw_mask,
11496 shw_mask, newshw_mask);
11497 } while (newshw_mask != shw_mask);
11498 hmeblkp->hblk_shadow = NULL;
11502 * remove shadow bit if we are stealing an unused shadow hmeblk.
11503 * sfmmu_hblk_alloc needs it that way, will set shadow bit later if
11504 * we are indeed allocating a shadow hmeblk.
11506 hmeblkp->hblk_shw_bit = 0;
11508 if (hmeblkp->hblk_shared) {
11509 sf_srd_t *srdp;
11510 sf_region_t *rgnp;
11511 uint_t rid;
11513 srdp = hblktosrd(hmeblkp);
11514 ASSERT(srdp != NULL && srdp->srd_refcnt != 0);
11515 rid = hmeblkp->hblk_tag.htag_rid;
11516 ASSERT(SFMMU_IS_SHMERID_VALID(rid));
11517 ASSERT(rid < SFMMU_MAX_HME_REGIONS);
11518 rgnp = srdp->srd_hmergnp[rid];
11519 ASSERT(rgnp != NULL);
11520 SFMMU_VALIDATE_SHAREDHBLK(hmeblkp, srdp, rgnp, rid);
11521 hmeblkp->hblk_shared = 0;
11524 sfmmu_hblk_steal_count++;
11525 SFMMU_STAT(sf_steal_count);
11527 return (1);
11530 struct hme_blk *
11531 sfmmu_hmetohblk(struct sf_hment *sfhme)
11533 struct hme_blk *hmeblkp;
11534 struct sf_hment *sfhme0;
11535 struct hme_blk *hblk_dummy = 0;
11538 * No dummy sf_hments, please.
11540 ASSERT(sfhme->hme_tte.ll != 0);
11542 sfhme0 = sfhme - sfhme->hme_tte.tte_hmenum;
11543 hmeblkp = (struct hme_blk *)((uintptr_t)sfhme0 -
11544 (uintptr_t)&hblk_dummy->hblk_hme[0]);
11546 return (hmeblkp);
11550 * On swapin, get appropriately sized TSB(s) and clear the HAT_SWAPPED flag.
11551 * If we can't get appropriately sized TSB(s), try for 8K TSB(s) using
11552 * KM_SLEEP allocation.
11554 * Return 0 on success, -1 otherwise.
11556 static void
11557 sfmmu_tsb_swapin(sfmmu_t *sfmmup, hatlock_t *hatlockp)
11559 struct tsb_info *tsbinfop, *next;
11560 tsb_replace_rc_t rc;
11561 boolean_t gotfirst = B_FALSE;
11563 ASSERT(sfmmup != ksfmmup);
11564 ASSERT(sfmmu_hat_lock_held(sfmmup));
11566 while (SFMMU_FLAGS_ISSET(sfmmup, HAT_SWAPIN)) {
11567 cv_wait(&sfmmup->sfmmu_tsb_cv, HATLOCK_MUTEXP(hatlockp));
11570 if (SFMMU_FLAGS_ISSET(sfmmup, HAT_SWAPPED)) {
11571 SFMMU_FLAGS_SET(sfmmup, HAT_SWAPIN);
11572 } else {
11573 return;
11576 ASSERT(sfmmup->sfmmu_tsb != NULL);
11579 * Loop over all tsbinfo's replacing them with ones that actually have
11580 * a TSB. If any of the replacements ever fail, bail out of the loop.
11582 for (tsbinfop = sfmmup->sfmmu_tsb; tsbinfop != NULL; tsbinfop = next) {
11583 ASSERT(tsbinfop->tsb_flags & TSB_SWAPPED);
11584 next = tsbinfop->tsb_next;
11585 rc = sfmmu_replace_tsb(sfmmup, tsbinfop, tsbinfop->tsb_szc,
11586 hatlockp, TSB_SWAPIN);
11587 if (rc != TSB_SUCCESS) {
11588 break;
11590 gotfirst = B_TRUE;
11593 switch (rc) {
11594 case TSB_SUCCESS:
11595 SFMMU_FLAGS_CLEAR(sfmmup, HAT_SWAPPED|HAT_SWAPIN);
11596 cv_broadcast(&sfmmup->sfmmu_tsb_cv);
11597 return;
11598 case TSB_LOSTRACE:
11599 break;
11600 case TSB_ALLOCFAIL:
11601 break;
11602 default:
11603 panic("sfmmu_replace_tsb returned unrecognized failure code "
11604 "%d", rc);
11608 * In this case, we failed to get one of our TSBs. If we failed to
11609 * get the first TSB, get one of minimum size (8KB). Walk the list
11610 * and throw away the tsbinfos, starting where the allocation failed;
11611 * we can get by with just one TSB as long as we don't leave the
11612 * SWAPPED tsbinfo structures lying around.
11614 tsbinfop = sfmmup->sfmmu_tsb;
11615 next = tsbinfop->tsb_next;
11616 tsbinfop->tsb_next = NULL;
11618 sfmmu_hat_exit(hatlockp);
11619 for (tsbinfop = next; tsbinfop != NULL; tsbinfop = next) {
11620 next = tsbinfop->tsb_next;
11621 sfmmu_tsbinfo_free(tsbinfop);
11623 hatlockp = sfmmu_hat_enter(sfmmup);
11626 * If we don't have any TSBs, get a single 8K TSB for 8K, 64K and 512K
11627 * pages.
11629 if (!gotfirst) {
11630 tsbinfop = sfmmup->sfmmu_tsb;
11631 rc = sfmmu_replace_tsb(sfmmup, tsbinfop, TSB_MIN_SZCODE,
11632 hatlockp, TSB_SWAPIN | TSB_FORCEALLOC);
11633 ASSERT(rc == TSB_SUCCESS);
11636 SFMMU_FLAGS_CLEAR(sfmmup, HAT_SWAPPED|HAT_SWAPIN);
11637 cv_broadcast(&sfmmup->sfmmu_tsb_cv);
11640 static int
11641 sfmmu_is_rgnva(sf_srd_t *srdp, caddr_t addr, ulong_t w, ulong_t bmw)
11643 ulong_t bix = 0;
11644 uint_t rid;
11645 sf_region_t *rgnp;
11647 ASSERT(srdp != NULL);
11648 ASSERT(srdp->srd_refcnt != 0);
11650 w <<= BT_ULSHIFT;
11651 while (bmw) {
11652 if (!(bmw & 0x1)) {
11653 bix++;
11654 bmw >>= 1;
11655 continue;
11657 rid = w | bix;
11658 rgnp = srdp->srd_hmergnp[rid];
11659 ASSERT(rgnp->rgn_refcnt > 0);
11660 ASSERT(rgnp->rgn_id == rid);
11661 if (addr < rgnp->rgn_saddr ||
11662 addr >= (rgnp->rgn_saddr + rgnp->rgn_size)) {
11663 bix++;
11664 bmw >>= 1;
11665 } else {
11666 return (1);
11669 return (0);
11673 * Handle exceptions for low level tsb_handler.
11675 * There are many scenarios that could land us here:
11677 * If the context is invalid we land here. The context can be invalid
11678 * for 3 reasons: 1) we couldn't allocate a new context and now need to
11679 * perform a wrap around operation in order to allocate a new context.
11680 * 2) Context was invalidated to change pagesize programming 3) ISMs or
11681 * TSBs configuration is changeing for this process and we are forced into
11682 * here to do a syncronization operation. If the context is valid we can
11683 * be here from window trap hanlder. In this case just call trap to handle
11684 * the fault.
11686 * Note that the process will run in INVALID_CONTEXT before
11687 * faulting into here and subsequently loading the MMU registers
11688 * (including the TSB base register) associated with this process.
11689 * For this reason, the trap handlers must all test for
11690 * INVALID_CONTEXT before attempting to access any registers other
11691 * than the context registers.
11693 void
11694 sfmmu_tsbmiss_exception(struct regs *rp, uintptr_t tagaccess, uint_t traptype)
11696 sfmmu_t *sfmmup, *shsfmmup;
11697 uint_t ctxtype;
11698 klwp_id_t lwp;
11699 char lwp_save_state;
11700 hatlock_t *hatlockp, *shatlockp;
11701 struct tsb_info *tsbinfop;
11702 struct tsbmiss *tsbmp;
11703 sf_scd_t *scdp;
11705 SFMMU_STAT(sf_tsb_exceptions);
11706 SFMMU_MMU_STAT(mmu_tsb_exceptions);
11707 sfmmup = astosfmmu(curthread->t_procp->p_as);
11709 * note that in sun4u, tagacces register contains ctxnum
11710 * while sun4v passes ctxtype in the tagaccess register.
11712 ctxtype = tagaccess & TAGACC_CTX_MASK;
11714 ASSERT(sfmmup != ksfmmup && ctxtype != KCONTEXT);
11715 ASSERT(sfmmup->sfmmu_ismhat == 0);
11716 ASSERT(!SFMMU_FLAGS_ISSET(sfmmup, HAT_SWAPPED) ||
11717 ctxtype == INVALID_CONTEXT);
11719 if (ctxtype != INVALID_CONTEXT && traptype != T_DATA_PROT) {
11721 * We may land here because shme bitmap and pagesize
11722 * flags are updated lazily in tsbmiss area on other cpus.
11723 * If we detect here that tsbmiss area is out of sync with
11724 * sfmmu update it and retry the trapped instruction.
11725 * Otherwise call trap().
11727 int ret = 0;
11728 uchar_t tteflag_mask = (1 << TTE64K) | (1 << TTE8K);
11729 caddr_t addr = (caddr_t)(tagaccess & TAGACC_VADDR_MASK);
11732 * Must set lwp state to LWP_SYS before
11733 * trying to acquire any adaptive lock
11735 lwp = ttolwp(curthread);
11736 ASSERT(lwp);
11737 lwp_save_state = lwp->lwp_state;
11738 lwp->lwp_state = LWP_SYS;
11740 hatlockp = sfmmu_hat_enter(sfmmup);
11741 kpreempt_disable();
11742 tsbmp = &tsbmiss_area[CPU->cpu_id];
11743 ASSERT(sfmmup == tsbmp->usfmmup);
11744 if (((tsbmp->uhat_tteflags ^ sfmmup->sfmmu_tteflags) &
11745 ~tteflag_mask) ||
11746 ((tsbmp->uhat_rtteflags ^ sfmmup->sfmmu_rtteflags) &
11747 ~tteflag_mask)) {
11748 tsbmp->uhat_tteflags = sfmmup->sfmmu_tteflags;
11749 tsbmp->uhat_rtteflags = sfmmup->sfmmu_rtteflags;
11750 ret = 1;
11752 if (sfmmup->sfmmu_srdp != NULL) {
11753 ulong_t *sm = sfmmup->sfmmu_hmeregion_map.bitmap;
11754 ulong_t *tm = tsbmp->shmermap;
11755 ulong_t i;
11756 for (i = 0; i < SFMMU_HMERGNMAP_WORDS; i++) {
11757 ulong_t d = tm[i] ^ sm[i];
11758 if (d) {
11759 if (d & sm[i]) {
11760 if (!ret && sfmmu_is_rgnva(
11761 sfmmup->sfmmu_srdp,
11762 addr, i, d & sm[i])) {
11763 ret = 1;
11766 tm[i] = sm[i];
11770 kpreempt_enable();
11771 sfmmu_hat_exit(hatlockp);
11772 lwp->lwp_state = lwp_save_state;
11773 if (ret) {
11774 return;
11776 } else if (ctxtype == INVALID_CONTEXT) {
11778 * First, make sure we come out of here with a valid ctx,
11779 * since if we don't get one we'll simply loop on the
11780 * faulting instruction.
11782 * If the ISM mappings are changing, the TSB is relocated,
11783 * the process is swapped, the process is joining SCD or
11784 * leaving SCD or shared regions we serialize behind the
11785 * controlling thread with hat lock, sfmmu_flags and
11786 * sfmmu_tsb_cv condition variable.
11790 * Must set lwp state to LWP_SYS before
11791 * trying to acquire any adaptive lock
11793 lwp = ttolwp(curthread);
11794 ASSERT(lwp);
11795 lwp_save_state = lwp->lwp_state;
11796 lwp->lwp_state = LWP_SYS;
11798 hatlockp = sfmmu_hat_enter(sfmmup);
11799 retry:
11800 if ((scdp = sfmmup->sfmmu_scdp) != NULL) {
11801 shsfmmup = scdp->scd_sfmmup;
11802 ASSERT(shsfmmup != NULL);
11804 for (tsbinfop = shsfmmup->sfmmu_tsb; tsbinfop != NULL;
11805 tsbinfop = tsbinfop->tsb_next) {
11806 if (tsbinfop->tsb_flags & TSB_RELOC_FLAG) {
11807 /* drop the private hat lock */
11808 sfmmu_hat_exit(hatlockp);
11809 /* acquire the shared hat lock */
11810 shatlockp = sfmmu_hat_enter(shsfmmup);
11812 * recheck to see if anything changed
11813 * after we drop the private hat lock.
11815 if (sfmmup->sfmmu_scdp == scdp &&
11816 shsfmmup == scdp->scd_sfmmup) {
11817 sfmmu_tsb_chk_reloc(shsfmmup,
11818 shatlockp);
11820 sfmmu_hat_exit(shatlockp);
11821 hatlockp = sfmmu_hat_enter(sfmmup);
11822 goto retry;
11827 for (tsbinfop = sfmmup->sfmmu_tsb; tsbinfop != NULL;
11828 tsbinfop = tsbinfop->tsb_next) {
11829 if (tsbinfop->tsb_flags & TSB_RELOC_FLAG) {
11830 cv_wait(&sfmmup->sfmmu_tsb_cv,
11831 HATLOCK_MUTEXP(hatlockp));
11832 goto retry;
11837 * Wait for ISM maps to be updated.
11839 if (SFMMU_FLAGS_ISSET(sfmmup, HAT_ISMBUSY)) {
11840 cv_wait(&sfmmup->sfmmu_tsb_cv,
11841 HATLOCK_MUTEXP(hatlockp));
11842 goto retry;
11845 /* Is this process joining an SCD? */
11846 if (SFMMU_FLAGS_ISSET(sfmmup, HAT_JOIN_SCD)) {
11848 * Flush private TSB and setup shared TSB.
11849 * sfmmu_finish_join_scd() does not drop the
11850 * hat lock.
11852 sfmmu_finish_join_scd(sfmmup);
11853 SFMMU_FLAGS_CLEAR(sfmmup, HAT_JOIN_SCD);
11857 * If we're swapping in, get TSB(s). Note that we must do
11858 * this before we get a ctx or load the MMU state. Once
11859 * we swap in we have to recheck to make sure the TSB(s) and
11860 * ISM mappings didn't change while we slept.
11862 if (SFMMU_FLAGS_ISSET(sfmmup, HAT_SWAPPED)) {
11863 sfmmu_tsb_swapin(sfmmup, hatlockp);
11864 goto retry;
11867 sfmmu_get_ctx(sfmmup);
11869 sfmmu_hat_exit(hatlockp);
11871 * Must restore lwp_state if not calling
11872 * trap() for further processing. Restore
11873 * it anyway.
11875 lwp->lwp_state = lwp_save_state;
11876 return;
11878 trap(rp, (caddr_t)tagaccess, traptype, 0);
11881 static void
11882 sfmmu_tsb_chk_reloc(sfmmu_t *sfmmup, hatlock_t *hatlockp)
11884 struct tsb_info *tp;
11886 ASSERT(sfmmu_hat_lock_held(sfmmup));
11888 for (tp = sfmmup->sfmmu_tsb; tp != NULL; tp = tp->tsb_next) {
11889 if (tp->tsb_flags & TSB_RELOC_FLAG) {
11890 cv_wait(&sfmmup->sfmmu_tsb_cv,
11891 HATLOCK_MUTEXP(hatlockp));
11892 break;
11898 * sfmmu_vatopfn_suspended is called from GET_TTE when TL=0 and
11899 * TTE_SUSPENDED bit set in tte we block on aquiring a page lock
11900 * rather than spinning to avoid send mondo timeouts with
11901 * interrupts enabled. When the lock is acquired it is immediately
11902 * released and we return back to sfmmu_vatopfn just after
11903 * the GET_TTE call.
11905 void
11906 sfmmu_vatopfn_suspended(caddr_t vaddr, sfmmu_t *sfmmu, tte_t *ttep)
11908 struct page **pp;
11910 (void) as_pagelock(sfmmu->sfmmu_as, &pp, vaddr, TTE_CSZ(ttep), S_WRITE);
11911 as_pageunlock(sfmmu->sfmmu_as, pp, vaddr, TTE_CSZ(ttep), S_WRITE);
11915 * sfmmu_tsbmiss_suspended is called from GET_TTE when TL>0 and
11916 * TTE_SUSPENDED bit set in tte. We do this so that we can handle
11917 * cross traps which cannot be handled while spinning in the
11918 * trap handlers. Simply enter and exit the kpr_suspendlock spin
11919 * mutex, which is held by the holder of the suspend bit, and then
11920 * retry the trapped instruction after unwinding.
11922 /*ARGSUSED*/
11923 void
11924 sfmmu_tsbmiss_suspended(struct regs *rp, uintptr_t tagacc, uint_t traptype)
11926 ASSERT(curthread != kreloc_thread);
11927 mutex_enter(&kpr_suspendlock);
11928 mutex_exit(&kpr_suspendlock);
11932 * This routine could be optimized to reduce the number of xcalls by flushing
11933 * the entire TLBs if region reference count is above some threshold but the
11934 * tradeoff will depend on the size of the TLB. So for now flush the specific
11935 * page a context at a time.
11937 * If uselocks is 0 then it's called after all cpus were captured and all the
11938 * hat locks were taken. In this case don't take the region lock by relying on
11939 * the order of list region update operations in hat_join_region(),
11940 * hat_leave_region() and hat_dup_region(). The ordering in those routines
11941 * guarantees that list is always forward walkable and reaches active sfmmus
11942 * regardless of where xc_attention() captures a cpu.
11944 cpuset_t
11945 sfmmu_rgntlb_demap(caddr_t addr, sf_region_t *rgnp,
11946 struct hme_blk *hmeblkp, int uselocks)
11948 sfmmu_t *sfmmup;
11949 cpuset_t cpuset;
11950 cpuset_t rcpuset;
11951 hatlock_t *hatlockp;
11952 uint_t rid = rgnp->rgn_id;
11953 sf_rgn_link_t *rlink;
11954 sf_scd_t *scdp;
11956 ASSERT(hmeblkp->hblk_shared);
11957 ASSERT(SFMMU_IS_SHMERID_VALID(rid));
11958 ASSERT(rid < SFMMU_MAX_HME_REGIONS);
11960 CPUSET_ZERO(rcpuset);
11961 if (uselocks) {
11962 mutex_enter(&rgnp->rgn_mutex);
11964 sfmmup = rgnp->rgn_sfmmu_head;
11965 while (sfmmup != NULL) {
11966 if (uselocks) {
11967 hatlockp = sfmmu_hat_enter(sfmmup);
11971 * When an SCD is created the SCD hat is linked on the sfmmu
11972 * region lists for each hme region which is part of the
11973 * SCD. If we find an SCD hat, when walking these lists,
11974 * then we flush the shared TSBs, if we find a private hat,
11975 * which is part of an SCD, but where the region
11976 * is not part of the SCD then we flush the private TSBs.
11978 if (!sfmmup->sfmmu_scdhat && sfmmup->sfmmu_scdp != NULL &&
11979 !SFMMU_FLAGS_ISSET(sfmmup, HAT_JOIN_SCD)) {
11980 scdp = sfmmup->sfmmu_scdp;
11981 if (SF_RGNMAP_TEST(scdp->scd_hmeregion_map, rid)) {
11982 if (uselocks) {
11983 sfmmu_hat_exit(hatlockp);
11985 goto next;
11989 SFMMU_UNLOAD_TSB(addr, sfmmup, hmeblkp, 0);
11991 kpreempt_disable();
11992 cpuset = sfmmup->sfmmu_cpusran;
11993 CPUSET_AND(cpuset, cpu_ready_set);
11994 CPUSET_DEL(cpuset, CPU->cpu_id);
11995 SFMMU_XCALL_STATS(sfmmup);
11996 xt_some(cpuset, vtag_flushpage_tl1,
11997 (uint64_t)addr, (uint64_t)sfmmup);
11998 vtag_flushpage(addr, (uint64_t)sfmmup);
11999 if (uselocks) {
12000 sfmmu_hat_exit(hatlockp);
12002 kpreempt_enable();
12003 CPUSET_OR(rcpuset, cpuset);
12005 next:
12006 /* LINTED: constant in conditional context */
12007 SFMMU_HMERID2RLINKP(sfmmup, rid, rlink, 0, 0);
12008 ASSERT(rlink != NULL);
12009 sfmmup = rlink->next;
12011 if (uselocks) {
12012 mutex_exit(&rgnp->rgn_mutex);
12014 return (rcpuset);
12018 * This routine takes an sfmmu pointer and the va for an adddress in an
12019 * ISM region as input and returns the corresponding region id in ism_rid.
12020 * The return value of 1 indicates that a region has been found and ism_rid
12021 * is valid, otherwise 0 is returned.
12023 static int
12024 find_ism_rid(sfmmu_t *sfmmup, sfmmu_t *ism_sfmmup, caddr_t va, uint_t *ism_rid)
12026 ism_blk_t *ism_blkp;
12027 int i;
12028 ism_map_t *ism_map;
12029 #ifdef DEBUG
12030 struct hat *ism_hatid;
12031 #endif
12032 ASSERT(sfmmu_hat_lock_held(sfmmup));
12034 ism_blkp = sfmmup->sfmmu_iblk;
12035 while (ism_blkp != NULL) {
12036 ism_map = ism_blkp->iblk_maps;
12037 for (i = 0; i < ISM_MAP_SLOTS && ism_map[i].imap_ismhat; i++) {
12038 if ((va >= ism_start(ism_map[i])) &&
12039 (va < ism_end(ism_map[i]))) {
12041 *ism_rid = ism_map[i].imap_rid;
12042 #ifdef DEBUG
12043 ism_hatid = ism_map[i].imap_ismhat;
12044 ASSERT(ism_hatid == ism_sfmmup);
12045 ASSERT(ism_hatid->sfmmu_ismhat);
12046 #endif
12047 return (1);
12050 ism_blkp = ism_blkp->iblk_next;
12052 return (0);
12056 * Special routine to flush out ism mappings- TSBs, TLBs and D-caches.
12057 * This routine may be called with all cpu's captured. Therefore, the
12058 * caller is responsible for holding all locks and disabling kernel
12059 * preemption.
12061 /* ARGSUSED */
12062 static void
12063 sfmmu_ismtlbcache_demap(caddr_t addr, sfmmu_t *ism_sfmmup,
12064 struct hme_blk *hmeblkp, pfn_t pfnum, int cache_flush_flag)
12066 cpuset_t cpuset;
12067 caddr_t va;
12068 ism_ment_t *ment;
12069 sfmmu_t *sfmmup;
12070 #ifdef VAC
12071 int vcolor;
12072 #endif
12074 sf_scd_t *scdp;
12075 uint_t ism_rid;
12077 ASSERT(!hmeblkp->hblk_shared);
12079 * Walk the ism_hat's mapping list and flush the page
12080 * from every hat sharing this ism_hat. This routine
12081 * may be called while all cpu's have been captured.
12082 * Therefore we can't attempt to grab any locks. For now
12083 * this means we will protect the ism mapping list under
12084 * a single lock which will be grabbed by the caller.
12085 * If hat_share/unshare scalibility becomes a performance
12086 * problem then we may need to re-think ism mapping list locking.
12088 ASSERT(ism_sfmmup->sfmmu_ismhat);
12089 ASSERT(MUTEX_HELD(&ism_mlist_lock));
12090 addr = addr - ISMID_STARTADDR;
12092 for (ment = ism_sfmmup->sfmmu_iment; ment; ment = ment->iment_next) {
12094 sfmmup = ment->iment_hat;
12096 va = ment->iment_base_va;
12097 va = (caddr_t)((uintptr_t)va + (uintptr_t)addr);
12100 * When an SCD is created the SCD hat is linked on the ism
12101 * mapping lists for each ISM segment which is part of the
12102 * SCD. If we find an SCD hat, when walking these lists,
12103 * then we flush the shared TSBs, if we find a private hat,
12104 * which is part of an SCD, but where the region
12105 * corresponding to this va is not part of the SCD then we
12106 * flush the private TSBs.
12108 if (!sfmmup->sfmmu_scdhat && sfmmup->sfmmu_scdp != NULL &&
12109 !SFMMU_FLAGS_ISSET(sfmmup, HAT_JOIN_SCD) &&
12110 !SFMMU_FLAGS_ISSET(sfmmup, HAT_ISMBUSY)) {
12111 if (!find_ism_rid(sfmmup, ism_sfmmup, va,
12112 &ism_rid)) {
12113 cmn_err(CE_PANIC,
12114 "can't find matching ISM rid!");
12117 scdp = sfmmup->sfmmu_scdp;
12118 if (SFMMU_IS_ISMRID_VALID(ism_rid) &&
12119 SF_RGNMAP_TEST(scdp->scd_ismregion_map,
12120 ism_rid)) {
12121 continue;
12124 SFMMU_UNLOAD_TSB(va, sfmmup, hmeblkp, 1);
12126 cpuset = sfmmup->sfmmu_cpusran;
12127 CPUSET_AND(cpuset, cpu_ready_set);
12128 CPUSET_DEL(cpuset, CPU->cpu_id);
12129 SFMMU_XCALL_STATS(sfmmup);
12130 xt_some(cpuset, vtag_flushpage_tl1, (uint64_t)va,
12131 (uint64_t)sfmmup);
12132 vtag_flushpage(va, (uint64_t)sfmmup);
12134 #ifdef VAC
12136 * Flush D$
12137 * When flushing D$ we must flush all
12138 * cpu's. See sfmmu_cache_flush().
12140 if (cache_flush_flag == CACHE_FLUSH) {
12141 cpuset = cpu_ready_set;
12142 CPUSET_DEL(cpuset, CPU->cpu_id);
12144 SFMMU_XCALL_STATS(sfmmup);
12145 vcolor = addr_to_vcolor(va);
12146 xt_some(cpuset, vac_flushpage_tl1, pfnum, vcolor);
12147 vac_flushpage(pfnum, vcolor);
12149 #endif /* VAC */
12154 * Demaps the TSB, CPU caches, and flushes all TLBs on all CPUs of
12155 * a particular virtual address and ctx. If noflush is set we do not
12156 * flush the TLB/TSB. This function may or may not be called with the
12157 * HAT lock held.
12159 static void
12160 sfmmu_tlbcache_demap(caddr_t addr, sfmmu_t *sfmmup, struct hme_blk *hmeblkp,
12161 pfn_t pfnum, int tlb_noflush, int cpu_flag, int cache_flush_flag,
12162 int hat_lock_held)
12164 #ifdef VAC
12165 int vcolor;
12166 #endif
12167 cpuset_t cpuset;
12168 hatlock_t *hatlockp;
12170 ASSERT(!hmeblkp->hblk_shared);
12172 #if defined(lint) && !defined(VAC)
12173 pfnum = pfnum;
12174 cpu_flag = cpu_flag;
12175 cache_flush_flag = cache_flush_flag;
12176 #endif
12179 * There is no longer a need to protect against ctx being
12180 * stolen here since we don't store the ctx in the TSB anymore.
12182 #ifdef VAC
12183 vcolor = addr_to_vcolor(addr);
12184 #endif
12187 * We must hold the hat lock during the flush of TLB,
12188 * to avoid a race with sfmmu_invalidate_ctx(), where
12189 * sfmmu_cnum on a MMU could be set to INVALID_CONTEXT,
12190 * causing TLB demap routine to skip flush on that MMU.
12191 * If the context on a MMU has already been set to
12192 * INVALID_CONTEXT, we just get an extra flush on
12193 * that MMU.
12195 if (!hat_lock_held && !tlb_noflush)
12196 hatlockp = sfmmu_hat_enter(sfmmup);
12198 kpreempt_disable();
12199 if (!tlb_noflush) {
12201 * Flush the TSB and TLB.
12203 SFMMU_UNLOAD_TSB(addr, sfmmup, hmeblkp, 0);
12205 cpuset = sfmmup->sfmmu_cpusran;
12206 CPUSET_AND(cpuset, cpu_ready_set);
12207 CPUSET_DEL(cpuset, CPU->cpu_id);
12209 SFMMU_XCALL_STATS(sfmmup);
12211 xt_some(cpuset, vtag_flushpage_tl1, (uint64_t)addr,
12212 (uint64_t)sfmmup);
12214 vtag_flushpage(addr, (uint64_t)sfmmup);
12217 if (!hat_lock_held && !tlb_noflush)
12218 sfmmu_hat_exit(hatlockp);
12220 #ifdef VAC
12222 * Flush the D$
12224 * Even if the ctx is stolen, we need to flush the
12225 * cache. Our ctx stealer only flushes the TLBs.
12227 if (cache_flush_flag == CACHE_FLUSH) {
12228 if (cpu_flag & FLUSH_ALL_CPUS) {
12229 cpuset = cpu_ready_set;
12230 } else {
12231 cpuset = sfmmup->sfmmu_cpusran;
12232 CPUSET_AND(cpuset, cpu_ready_set);
12234 CPUSET_DEL(cpuset, CPU->cpu_id);
12235 SFMMU_XCALL_STATS(sfmmup);
12236 xt_some(cpuset, vac_flushpage_tl1, pfnum, vcolor);
12237 vac_flushpage(pfnum, vcolor);
12239 #endif /* VAC */
12240 kpreempt_enable();
12244 * Demaps the TSB and flushes all TLBs on all cpus for a particular virtual
12245 * address and ctx. If noflush is set we do not currently do anything.
12246 * This function may or may not be called with the HAT lock held.
12248 static void
12249 sfmmu_tlb_demap(caddr_t addr, sfmmu_t *sfmmup, struct hme_blk *hmeblkp,
12250 int tlb_noflush, int hat_lock_held)
12252 cpuset_t cpuset;
12253 hatlock_t *hatlockp;
12255 ASSERT(!hmeblkp->hblk_shared);
12258 * If the process is exiting we have nothing to do.
12260 if (tlb_noflush)
12261 return;
12264 * Flush TSB.
12266 if (!hat_lock_held)
12267 hatlockp = sfmmu_hat_enter(sfmmup);
12268 SFMMU_UNLOAD_TSB(addr, sfmmup, hmeblkp, 0);
12270 kpreempt_disable();
12272 cpuset = sfmmup->sfmmu_cpusran;
12273 CPUSET_AND(cpuset, cpu_ready_set);
12274 CPUSET_DEL(cpuset, CPU->cpu_id);
12276 SFMMU_XCALL_STATS(sfmmup);
12277 xt_some(cpuset, vtag_flushpage_tl1, (uint64_t)addr, (uint64_t)sfmmup);
12279 vtag_flushpage(addr, (uint64_t)sfmmup);
12281 if (!hat_lock_held)
12282 sfmmu_hat_exit(hatlockp);
12284 kpreempt_enable();
12289 * Special case of sfmmu_tlb_demap for MMU_PAGESIZE hblks. Use the xcall
12290 * call handler that can flush a range of pages to save on xcalls.
12292 static int sfmmu_xcall_save;
12295 * this routine is never used for demaping addresses backed by SRD hmeblks.
12297 static void
12298 sfmmu_tlb_range_demap(demap_range_t *dmrp)
12300 sfmmu_t *sfmmup = dmrp->dmr_sfmmup;
12301 hatlock_t *hatlockp;
12302 cpuset_t cpuset;
12303 uint64_t sfmmu_pgcnt;
12304 pgcnt_t pgcnt = 0;
12305 int pgunload = 0;
12306 int dirtypg = 0;
12307 caddr_t addr = dmrp->dmr_addr;
12308 caddr_t eaddr;
12309 uint64_t bitvec = dmrp->dmr_bitvec;
12311 ASSERT(bitvec & 1);
12314 * Flush TSB and calculate number of pages to flush.
12316 while (bitvec != 0) {
12317 dirtypg = 0;
12319 * Find the first page to flush and then count how many
12320 * pages there are after it that also need to be flushed.
12321 * This way the number of TSB flushes is minimized.
12323 while ((bitvec & 1) == 0) {
12324 pgcnt++;
12325 addr += MMU_PAGESIZE;
12326 bitvec >>= 1;
12328 while (bitvec & 1) {
12329 dirtypg++;
12330 bitvec >>= 1;
12332 eaddr = addr + ptob(dirtypg);
12333 hatlockp = sfmmu_hat_enter(sfmmup);
12334 sfmmu_unload_tsb_range(sfmmup, addr, eaddr, TTE8K);
12335 sfmmu_hat_exit(hatlockp);
12336 pgunload += dirtypg;
12337 addr = eaddr;
12338 pgcnt += dirtypg;
12341 ASSERT((pgcnt<<MMU_PAGESHIFT) <= dmrp->dmr_endaddr - dmrp->dmr_addr);
12342 if (sfmmup->sfmmu_free == 0) {
12343 addr = dmrp->dmr_addr;
12344 bitvec = dmrp->dmr_bitvec;
12347 * make sure it has SFMMU_PGCNT_SHIFT bits only,
12348 * as it will be used to pack argument for xt_some
12350 ASSERT((pgcnt > 0) &&
12351 (pgcnt <= (1 << SFMMU_PGCNT_SHIFT)));
12354 * Encode pgcnt as (pgcnt -1 ), and pass (pgcnt - 1) in
12355 * the low 6 bits of sfmmup. This is doable since pgcnt
12356 * always >= 1.
12358 ASSERT(!((uint64_t)sfmmup & SFMMU_PGCNT_MASK));
12359 sfmmu_pgcnt = (uint64_t)sfmmup |
12360 ((pgcnt - 1) & SFMMU_PGCNT_MASK);
12363 * We must hold the hat lock during the flush of TLB,
12364 * to avoid a race with sfmmu_invalidate_ctx(), where
12365 * sfmmu_cnum on a MMU could be set to INVALID_CONTEXT,
12366 * causing TLB demap routine to skip flush on that MMU.
12367 * If the context on a MMU has already been set to
12368 * INVALID_CONTEXT, we just get an extra flush on
12369 * that MMU.
12371 hatlockp = sfmmu_hat_enter(sfmmup);
12372 kpreempt_disable();
12374 cpuset = sfmmup->sfmmu_cpusran;
12375 CPUSET_AND(cpuset, cpu_ready_set);
12376 CPUSET_DEL(cpuset, CPU->cpu_id);
12378 SFMMU_XCALL_STATS(sfmmup);
12379 xt_some(cpuset, vtag_flush_pgcnt_tl1, (uint64_t)addr,
12380 sfmmu_pgcnt);
12382 for (; bitvec != 0; bitvec >>= 1) {
12383 if (bitvec & 1)
12384 vtag_flushpage(addr, (uint64_t)sfmmup);
12385 addr += MMU_PAGESIZE;
12387 kpreempt_enable();
12388 sfmmu_hat_exit(hatlockp);
12390 sfmmu_xcall_save += (pgunload-1);
12392 dmrp->dmr_bitvec = 0;
12396 * In cases where we need to synchronize with TLB/TSB miss trap
12397 * handlers, _and_ need to flush the TLB, it's a lot easier to
12398 * throw away the context from the process than to do a
12399 * special song and dance to keep things consistent for the
12400 * handlers.
12402 * Since the process suddenly ends up without a context and our caller
12403 * holds the hat lock, threads that fault after this function is called
12404 * will pile up on the lock. We can then do whatever we need to
12405 * atomically from the context of the caller. The first blocked thread
12406 * to resume executing will get the process a new context, and the
12407 * process will resume executing.
12409 * One added advantage of this approach is that on MMUs that
12410 * support a "flush all" operation, we will delay the flush until
12411 * cnum wrap-around, and then flush the TLB one time. This
12412 * is rather rare, so it's a lot less expensive than making 8000
12413 * x-calls to flush the TLB 8000 times.
12415 * A per-process (PP) lock is used to synchronize ctx allocations in
12416 * resume() and ctx invalidations here.
12418 static void
12419 sfmmu_invalidate_ctx(sfmmu_t *sfmmup)
12421 cpuset_t cpuset;
12422 int cnum, currcnum;
12423 mmu_ctx_t *mmu_ctxp;
12424 int i;
12425 uint_t pstate_save;
12427 SFMMU_STAT(sf_ctx_inv);
12429 ASSERT(sfmmu_hat_lock_held(sfmmup));
12430 ASSERT(sfmmup != ksfmmup);
12432 kpreempt_disable();
12434 mmu_ctxp = CPU_MMU_CTXP(CPU);
12435 ASSERT(mmu_ctxp);
12436 ASSERT(mmu_ctxp->mmu_idx < max_mmu_ctxdoms);
12437 ASSERT(mmu_ctxp == mmu_ctxs_tbl[mmu_ctxp->mmu_idx]);
12439 currcnum = sfmmup->sfmmu_ctxs[mmu_ctxp->mmu_idx].cnum;
12441 pstate_save = sfmmu_disable_intrs();
12443 lock_set(&sfmmup->sfmmu_ctx_lock); /* acquire PP lock */
12444 /* set HAT cnum invalid across all context domains. */
12445 for (i = 0; i < max_mmu_ctxdoms; i++) {
12447 cnum = sfmmup->sfmmu_ctxs[i].cnum;
12448 if (cnum == INVALID_CONTEXT) {
12449 continue;
12452 sfmmup->sfmmu_ctxs[i].cnum = INVALID_CONTEXT;
12454 membar_enter(); /* make sure globally visible to all CPUs */
12455 lock_clear(&sfmmup->sfmmu_ctx_lock); /* release PP lock */
12457 sfmmu_enable_intrs(pstate_save);
12459 cpuset = sfmmup->sfmmu_cpusran;
12460 CPUSET_DEL(cpuset, CPU->cpu_id);
12461 CPUSET_AND(cpuset, cpu_ready_set);
12462 if (!CPUSET_ISNULL(cpuset)) {
12463 SFMMU_XCALL_STATS(sfmmup);
12464 xt_some(cpuset, sfmmu_raise_tsb_exception,
12465 (uint64_t)sfmmup, INVALID_CONTEXT);
12466 xt_sync(cpuset);
12467 SFMMU_STAT(sf_tsb_raise_exception);
12468 SFMMU_MMU_STAT(mmu_tsb_raise_exception);
12472 * If the hat to-be-invalidated is the same as the current
12473 * process on local CPU we need to invalidate
12474 * this CPU context as well.
12476 if ((sfmmu_getctx_sec() == currcnum) &&
12477 (currcnum != INVALID_CONTEXT)) {
12478 /* sets shared context to INVALID too */
12479 sfmmu_setctx_sec(INVALID_CONTEXT);
12480 sfmmu_clear_utsbinfo();
12483 SFMMU_FLAGS_SET(sfmmup, HAT_ALLCTX_INVALID);
12485 kpreempt_enable();
12488 * we hold the hat lock, so nobody should allocate a context
12489 * for us yet
12491 ASSERT(sfmmup->sfmmu_ctxs[mmu_ctxp->mmu_idx].cnum == INVALID_CONTEXT);
12494 #ifdef VAC
12496 * We need to flush the cache in all cpus. It is possible that
12497 * a process referenced a page as cacheable but has sinced exited
12498 * and cleared the mapping list. We still to flush it but have no
12499 * state so all cpus is the only alternative.
12501 void
12502 sfmmu_cache_flush(pfn_t pfnum, int vcolor)
12504 cpuset_t cpuset;
12506 kpreempt_disable();
12507 cpuset = cpu_ready_set;
12508 CPUSET_DEL(cpuset, CPU->cpu_id);
12509 SFMMU_XCALL_STATS(NULL); /* account to any ctx */
12510 xt_some(cpuset, vac_flushpage_tl1, pfnum, vcolor);
12511 xt_sync(cpuset);
12512 vac_flushpage(pfnum, vcolor);
12513 kpreempt_enable();
12516 void
12517 sfmmu_cache_flushcolor(int vcolor, pfn_t pfnum)
12519 cpuset_t cpuset;
12521 ASSERT(vcolor >= 0);
12523 kpreempt_disable();
12524 cpuset = cpu_ready_set;
12525 CPUSET_DEL(cpuset, CPU->cpu_id);
12526 SFMMU_XCALL_STATS(NULL); /* account to any ctx */
12527 xt_some(cpuset, vac_flushcolor_tl1, vcolor, pfnum);
12528 xt_sync(cpuset);
12529 vac_flushcolor(vcolor, pfnum);
12530 kpreempt_enable();
12532 #endif /* VAC */
12535 * We need to prevent processes from accessing the TSB using a cached physical
12536 * address. It's alright if they try to access the TSB via virtual address
12537 * since they will just fault on that virtual address once the mapping has
12538 * been suspended.
12540 #pragma weak sendmondo_in_recover
12542 /* ARGSUSED */
12543 static int
12544 sfmmu_tsb_pre_relocator(caddr_t va, uint_t tsbsz, uint_t flags, void *tsbinfo)
12546 struct tsb_info *tsbinfop = (struct tsb_info *)tsbinfo;
12547 sfmmu_t *sfmmup = tsbinfop->tsb_sfmmu;
12548 hatlock_t *hatlockp;
12549 sf_scd_t *scdp;
12551 if (flags != HAT_PRESUSPEND)
12552 return (0);
12555 * If tsb is a shared TSB with TSB_SHAREDCTX set, sfmmup must
12556 * be a shared hat, then set SCD's tsbinfo's flag.
12557 * If tsb is not shared, sfmmup is a private hat, then set
12558 * its private tsbinfo's flag.
12560 hatlockp = sfmmu_hat_enter(sfmmup);
12561 tsbinfop->tsb_flags |= TSB_RELOC_FLAG;
12563 if (!(tsbinfop->tsb_flags & TSB_SHAREDCTX)) {
12564 sfmmu_tsb_inv_ctx(sfmmup);
12565 sfmmu_hat_exit(hatlockp);
12566 } else {
12567 /* release lock on the shared hat */
12568 sfmmu_hat_exit(hatlockp);
12569 /* sfmmup is a shared hat */
12570 ASSERT(sfmmup->sfmmu_scdhat);
12571 scdp = sfmmup->sfmmu_scdp;
12572 ASSERT(scdp != NULL);
12573 /* get private hat from the scd list */
12574 mutex_enter(&scdp->scd_mutex);
12575 sfmmup = scdp->scd_sf_list;
12576 while (sfmmup != NULL) {
12577 hatlockp = sfmmu_hat_enter(sfmmup);
12579 * We do not call sfmmu_tsb_inv_ctx here because
12580 * sendmondo_in_recover check is only needed for
12581 * sun4u.
12583 sfmmu_invalidate_ctx(sfmmup);
12584 sfmmu_hat_exit(hatlockp);
12585 sfmmup = sfmmup->sfmmu_scd_link.next;
12588 mutex_exit(&scdp->scd_mutex);
12590 return (0);
12593 static void
12594 sfmmu_tsb_inv_ctx(sfmmu_t *sfmmup)
12596 extern uint32_t sendmondo_in_recover;
12598 ASSERT(sfmmu_hat_lock_held(sfmmup));
12601 * For Cheetah+ Erratum 25:
12602 * Wait for any active recovery to finish. We can't risk
12603 * relocating the TSB of the thread running mondo_recover_proc()
12604 * since, if we did that, we would deadlock. The scenario we are
12605 * trying to avoid is as follows:
12607 * THIS CPU RECOVER CPU
12608 * -------- -----------
12609 * Begins recovery, walking through TSB
12610 * hat_pagesuspend() TSB TTE
12611 * TLB miss on TSB TTE, spins at TL1
12612 * xt_sync()
12613 * send_mondo_timeout()
12614 * mondo_recover_proc()
12615 * ((deadlocked))
12617 * The second half of the workaround is that mondo_recover_proc()
12618 * checks to see if the tsb_info has the RELOC flag set, and if it
12619 * does, it skips over that TSB without ever touching tsbinfop->tsb_va
12620 * and hence avoiding the TLB miss that could result in a deadlock.
12622 if (&sendmondo_in_recover) {
12623 membar_enter(); /* make sure RELOC flag visible */
12624 while (sendmondo_in_recover) {
12625 drv_usecwait(1);
12626 membar_consumer();
12630 sfmmu_invalidate_ctx(sfmmup);
12633 /* ARGSUSED */
12634 static int
12635 sfmmu_tsb_post_relocator(caddr_t va, uint_t tsbsz, uint_t flags,
12636 void *tsbinfo, pfn_t newpfn)
12638 hatlock_t *hatlockp;
12639 struct tsb_info *tsbinfop = (struct tsb_info *)tsbinfo;
12640 sfmmu_t *sfmmup = tsbinfop->tsb_sfmmu;
12642 if (flags != HAT_POSTUNSUSPEND)
12643 return (0);
12645 hatlockp = sfmmu_hat_enter(sfmmup);
12647 SFMMU_STAT(sf_tsb_reloc);
12650 * The process may have swapped out while we were relocating one
12651 * of its TSBs. If so, don't bother doing the setup since the
12652 * process can't be using the memory anymore.
12654 if ((tsbinfop->tsb_flags & TSB_SWAPPED) == 0) {
12655 ASSERT(va == tsbinfop->tsb_va);
12656 sfmmu_tsbinfo_setup_phys(tsbinfop, newpfn);
12658 if (tsbinfop->tsb_flags & TSB_FLUSH_NEEDED) {
12659 sfmmu_inv_tsb(tsbinfop->tsb_va,
12660 TSB_BYTES(tsbinfop->tsb_szc));
12661 tsbinfop->tsb_flags &= ~TSB_FLUSH_NEEDED;
12665 membar_exit();
12666 tsbinfop->tsb_flags &= ~TSB_RELOC_FLAG;
12667 cv_broadcast(&sfmmup->sfmmu_tsb_cv);
12669 sfmmu_hat_exit(hatlockp);
12671 return (0);
12675 * Allocate and initialize a tsb_info structure. Note that we may or may not
12676 * allocate a TSB here, depending on the flags passed in.
12678 static int
12679 sfmmu_tsbinfo_alloc(struct tsb_info **tsbinfopp, int tsb_szc, int tte_sz_mask,
12680 uint_t flags, sfmmu_t *sfmmup)
12682 int err;
12684 *tsbinfopp = (struct tsb_info *)kmem_cache_alloc(
12685 sfmmu_tsbinfo_cache, KM_SLEEP);
12687 if ((err = sfmmu_init_tsbinfo(*tsbinfopp, tte_sz_mask,
12688 tsb_szc, flags, sfmmup)) != 0) {
12689 kmem_cache_free(sfmmu_tsbinfo_cache, *tsbinfopp);
12690 SFMMU_STAT(sf_tsb_allocfail);
12691 *tsbinfopp = NULL;
12692 return (err);
12694 SFMMU_STAT(sf_tsb_alloc);
12697 * Bump the TSB size counters for this TSB size.
12699 (*(((int *)&sfmmu_tsbsize_stat) + tsb_szc))++;
12700 return (0);
12703 static void
12704 sfmmu_tsb_free(struct tsb_info *tsbinfo)
12706 caddr_t tsbva = tsbinfo->tsb_va;
12707 uint_t tsb_size = TSB_BYTES(tsbinfo->tsb_szc);
12708 struct kmem_cache *kmem_cachep = tsbinfo->tsb_cache;
12709 vmem_t *vmp = tsbinfo->tsb_vmp;
12712 * If we allocated this TSB from relocatable kernel memory, then we
12713 * need to uninstall the callback handler.
12715 if (tsbinfo->tsb_cache != sfmmu_tsb8k_cache) {
12716 uintptr_t slab_mask;
12717 caddr_t slab_vaddr;
12718 page_t **ppl;
12719 int ret;
12721 ASSERT(tsb_size <= MMU_PAGESIZE4M || use_bigtsb_arena);
12722 if (tsb_size > MMU_PAGESIZE4M)
12723 slab_mask = ~((uintptr_t)bigtsb_slab_mask) << PAGESHIFT;
12724 else
12725 slab_mask = ~((uintptr_t)tsb_slab_mask) << PAGESHIFT;
12726 slab_vaddr = (caddr_t)((uintptr_t)tsbva & slab_mask);
12728 ret = as_pagelock(&kas, &ppl, slab_vaddr, PAGESIZE, S_WRITE);
12729 ASSERT(ret == 0);
12730 hat_delete_callback(tsbva, (uint_t)tsb_size, (void *)tsbinfo,
12731 0, NULL);
12732 as_pageunlock(&kas, ppl, slab_vaddr, PAGESIZE, S_WRITE);
12735 if (kmem_cachep != NULL) {
12736 kmem_cache_free(kmem_cachep, tsbva);
12737 } else {
12738 vmem_xfree(vmp, (void *)tsbva, tsb_size);
12740 tsbinfo->tsb_va = (caddr_t)0xbad00bad;
12741 atomic_add_64(&tsb_alloc_bytes, -(int64_t)tsb_size);
12744 static void
12745 sfmmu_tsbinfo_free(struct tsb_info *tsbinfo)
12747 if ((tsbinfo->tsb_flags & TSB_SWAPPED) == 0) {
12748 sfmmu_tsb_free(tsbinfo);
12750 kmem_cache_free(sfmmu_tsbinfo_cache, tsbinfo);
12755 * Setup all the references to physical memory for this tsbinfo.
12756 * The underlying page(s) must be locked.
12758 static void
12759 sfmmu_tsbinfo_setup_phys(struct tsb_info *tsbinfo, pfn_t pfn)
12761 ASSERT(pfn != PFN_INVALID);
12762 ASSERT(pfn == va_to_pfn(tsbinfo->tsb_va));
12764 #ifndef sun4v
12765 if (tsbinfo->tsb_szc == 0) {
12766 sfmmu_memtte(&tsbinfo->tsb_tte, pfn,
12767 PROT_WRITE|PROT_READ, TTE8K);
12768 } else {
12770 * Round down PA and use a large mapping; the handlers will
12771 * compute the TSB pointer at the correct offset into the
12772 * big virtual page. NOTE: this assumes all TSBs larger
12773 * than 8K must come from physically contiguous slabs of
12774 * size tsb_slab_size.
12776 sfmmu_memtte(&tsbinfo->tsb_tte, pfn & ~tsb_slab_mask,
12777 PROT_WRITE|PROT_READ, tsb_slab_ttesz);
12779 tsbinfo->tsb_pa = ptob(pfn);
12781 TTE_SET_LOCKED(&tsbinfo->tsb_tte); /* lock the tte into dtlb */
12782 TTE_SET_MOD(&tsbinfo->tsb_tte); /* enable writes */
12784 ASSERT(TTE_IS_PRIVILEGED(&tsbinfo->tsb_tte));
12785 ASSERT(TTE_IS_LOCKED(&tsbinfo->tsb_tte));
12786 #else /* sun4v */
12787 tsbinfo->tsb_pa = ptob(pfn);
12788 #endif /* sun4v */
12793 * Returns zero on success, ENOMEM if over the high water mark,
12794 * or EAGAIN if the caller needs to retry with a smaller TSB
12795 * size (or specify TSB_FORCEALLOC if the allocation can't fail).
12797 * This call cannot fail to allocate a TSB if TSB_FORCEALLOC
12798 * is specified and the TSB requested is PAGESIZE, though it
12799 * may sleep waiting for memory if sufficient memory is not
12800 * available.
12802 static int
12803 sfmmu_init_tsbinfo(struct tsb_info *tsbinfo, int tteszmask,
12804 int tsbcode, uint_t flags, sfmmu_t *sfmmup)
12806 caddr_t vaddr = NULL;
12807 caddr_t slab_vaddr;
12808 uintptr_t slab_mask;
12809 int tsbbytes = TSB_BYTES(tsbcode);
12810 int lowmem = 0;
12811 struct kmem_cache *kmem_cachep = NULL;
12812 vmem_t *vmp = NULL;
12813 lgrp_id_t lgrpid = LGRP_NONE;
12814 pfn_t pfn;
12815 uint_t cbflags = HAC_SLEEP;
12816 page_t **pplist;
12817 int ret;
12819 ASSERT(tsbbytes <= MMU_PAGESIZE4M || use_bigtsb_arena);
12820 if (tsbbytes > MMU_PAGESIZE4M)
12821 slab_mask = ~((uintptr_t)bigtsb_slab_mask) << PAGESHIFT;
12822 else
12823 slab_mask = ~((uintptr_t)tsb_slab_mask) << PAGESHIFT;
12825 if (flags & (TSB_FORCEALLOC | TSB_SWAPIN | TSB_GROW | TSB_SHRINK))
12826 flags |= TSB_ALLOC;
12828 ASSERT((flags & TSB_FORCEALLOC) == 0 || tsbcode == TSB_MIN_SZCODE);
12830 tsbinfo->tsb_sfmmu = sfmmup;
12833 * If not allocating a TSB, set up the tsbinfo, set TSB_SWAPPED, and
12834 * return.
12836 if ((flags & TSB_ALLOC) == 0) {
12837 tsbinfo->tsb_szc = tsbcode;
12838 tsbinfo->tsb_ttesz_mask = tteszmask;
12839 tsbinfo->tsb_va = (caddr_t)0xbadbadbeef;
12840 tsbinfo->tsb_pa = -1;
12841 tsbinfo->tsb_tte.ll = 0;
12842 tsbinfo->tsb_next = NULL;
12843 tsbinfo->tsb_flags = TSB_SWAPPED;
12844 tsbinfo->tsb_cache = NULL;
12845 tsbinfo->tsb_vmp = NULL;
12846 return (0);
12849 #ifdef DEBUG
12851 * For debugging:
12852 * Randomly force allocation failures every tsb_alloc_mtbf
12853 * tries if TSB_FORCEALLOC is not specified. This will
12854 * return ENOMEM if tsb_alloc_mtbf is odd, or EAGAIN if
12855 * it is even, to allow testing of both failure paths...
12857 if (tsb_alloc_mtbf && ((flags & TSB_FORCEALLOC) == 0) &&
12858 (tsb_alloc_count++ == tsb_alloc_mtbf)) {
12859 tsb_alloc_count = 0;
12860 tsb_alloc_fail_mtbf++;
12861 return ((tsb_alloc_mtbf & 1)? ENOMEM : EAGAIN);
12863 #endif /* DEBUG */
12866 * Enforce high water mark if we are not doing a forced allocation
12867 * and are not shrinking a process' TSB.
12869 if ((flags & TSB_SHRINK) == 0 &&
12870 (tsbbytes + tsb_alloc_bytes) > tsb_alloc_hiwater) {
12871 if ((flags & TSB_FORCEALLOC) == 0)
12872 return (ENOMEM);
12873 lowmem = 1;
12877 * Allocate from the correct location based upon the size of the TSB
12878 * compared to the base page size, and what memory conditions dictate.
12879 * Note we always do nonblocking allocations from the TSB arena since
12880 * we don't want memory fragmentation to cause processes to block
12881 * indefinitely waiting for memory; until the kernel algorithms that
12882 * coalesce large pages are improved this is our best option.
12884 * Algorithm:
12885 * If allocating a "large" TSB (>8K), allocate from the
12886 * appropriate kmem_tsb_default_arena vmem arena
12887 * else if low on memory or the TSB_FORCEALLOC flag is set or
12888 * tsb_forceheap is set
12889 * Allocate from kernel heap via sfmmu_tsb8k_cache with
12890 * KM_SLEEP (never fails)
12891 * else
12892 * Allocate from appropriate sfmmu_tsb_cache with
12893 * KM_NOSLEEP
12894 * endif
12896 if (tsb_lgrp_affinity)
12897 lgrpid = lgrp_home_id(curthread);
12898 if (lgrpid == LGRP_NONE)
12899 lgrpid = 0; /* use lgrp of boot CPU */
12901 if (tsbbytes > MMU_PAGESIZE) {
12902 if (tsbbytes > MMU_PAGESIZE4M) {
12903 vmp = kmem_bigtsb_default_arena[lgrpid];
12904 vaddr = (caddr_t)vmem_xalloc(vmp, tsbbytes, tsbbytes,
12905 0, 0, NULL, NULL, VM_NOSLEEP);
12906 } else {
12907 vmp = kmem_tsb_default_arena[lgrpid];
12908 vaddr = (caddr_t)vmem_xalloc(vmp, tsbbytes, tsbbytes,
12909 0, 0, NULL, NULL, VM_NOSLEEP);
12911 #ifdef DEBUG
12912 } else if (lowmem || (flags & TSB_FORCEALLOC) || tsb_forceheap) {
12913 #else /* !DEBUG */
12914 } else if (lowmem || (flags & TSB_FORCEALLOC)) {
12915 #endif /* DEBUG */
12916 kmem_cachep = sfmmu_tsb8k_cache;
12917 vaddr = (caddr_t)kmem_cache_alloc(kmem_cachep, KM_SLEEP);
12918 ASSERT(vaddr != NULL);
12919 } else {
12920 kmem_cachep = sfmmu_tsb_cache[lgrpid];
12921 vaddr = (caddr_t)kmem_cache_alloc(kmem_cachep, KM_NOSLEEP);
12924 tsbinfo->tsb_cache = kmem_cachep;
12925 tsbinfo->tsb_vmp = vmp;
12927 if (vaddr == NULL) {
12928 return (EAGAIN);
12931 atomic_add_64(&tsb_alloc_bytes, (int64_t)tsbbytes);
12932 kmem_cachep = tsbinfo->tsb_cache;
12935 * If we are allocating from outside the cage, then we need to
12936 * register a relocation callback handler. Note that for now
12937 * since pseudo mappings always hang off of the slab's root page,
12938 * we need only lock the first 8K of the TSB slab. This is a bit
12939 * hacky but it is good for performance.
12941 if (kmem_cachep != sfmmu_tsb8k_cache) {
12942 slab_vaddr = (caddr_t)((uintptr_t)vaddr & slab_mask);
12943 ret = as_pagelock(&kas, &pplist, slab_vaddr, PAGESIZE, S_WRITE);
12944 ASSERT(ret == 0);
12945 ret = hat_add_callback(sfmmu_tsb_cb_id, vaddr, (uint_t)tsbbytes,
12946 cbflags, (void *)tsbinfo, &pfn, NULL);
12949 * Need to free up resources if we could not successfully
12950 * add the callback function and return an error condition.
12952 if (ret != 0) {
12953 if (kmem_cachep) {
12954 kmem_cache_free(kmem_cachep, vaddr);
12955 } else {
12956 vmem_xfree(vmp, (void *)vaddr, tsbbytes);
12958 as_pageunlock(&kas, pplist, slab_vaddr, PAGESIZE,
12959 S_WRITE);
12960 return (EAGAIN);
12962 } else {
12964 * Since allocation of 8K TSBs from heap is rare and occurs
12965 * during memory pressure we allocate them from permanent
12966 * memory rather than using callbacks to get the PFN.
12968 pfn = hat_getpfnum(kas.a_hat, vaddr);
12971 tsbinfo->tsb_va = vaddr;
12972 tsbinfo->tsb_szc = tsbcode;
12973 tsbinfo->tsb_ttesz_mask = tteszmask;
12974 tsbinfo->tsb_next = NULL;
12975 tsbinfo->tsb_flags = 0;
12977 sfmmu_tsbinfo_setup_phys(tsbinfo, pfn);
12979 sfmmu_inv_tsb(vaddr, tsbbytes);
12981 if (kmem_cachep != sfmmu_tsb8k_cache) {
12982 as_pageunlock(&kas, pplist, slab_vaddr, PAGESIZE, S_WRITE);
12985 return (0);
12989 * Initialize per cpu tsb and per cpu tsbmiss_area
12991 void
12992 sfmmu_init_tsbs(void)
12994 int i;
12995 struct tsbmiss *tsbmissp;
12996 struct kpmtsbm *kpmtsbmp;
12997 #ifndef sun4v
12998 extern int dcache_line_mask;
12999 #endif /* sun4v */
13000 extern uint_t vac_colors;
13003 * Init. tsb miss area.
13005 tsbmissp = tsbmiss_area;
13007 for (i = 0; i < NCPU; tsbmissp++, i++) {
13009 * initialize the tsbmiss area.
13010 * Do this for all possible CPUs as some may be added
13011 * while the system is running. There is no cost to this.
13013 tsbmissp->ksfmmup = ksfmmup;
13014 #ifndef sun4v
13015 tsbmissp->dcache_line_mask = (uint16_t)dcache_line_mask;
13016 #endif /* sun4v */
13017 tsbmissp->khashstart =
13018 (struct hmehash_bucket *)va_to_pa((caddr_t)khme_hash);
13019 tsbmissp->uhashstart =
13020 (struct hmehash_bucket *)va_to_pa((caddr_t)uhme_hash);
13021 tsbmissp->khashsz = khmehash_num;
13022 tsbmissp->uhashsz = uhmehash_num;
13025 sfmmu_tsb_cb_id = hat_register_callback('T'<<16 | 'S' << 8 | 'B',
13026 sfmmu_tsb_pre_relocator, sfmmu_tsb_post_relocator, NULL, 0);
13028 if (kpm_enable == 0)
13029 return;
13031 /* -- Begin KPM specific init -- */
13033 if (kpm_smallpages) {
13035 * If we're using base pagesize pages for seg_kpm
13036 * mappings, we use the kernel TSB since we can't afford
13037 * to allocate a second huge TSB for these mappings.
13039 kpm_tsbbase = ktsb_phys? ktsb_pbase : (uint64_t)ktsb_base;
13040 kpm_tsbsz = ktsb_szcode;
13041 kpmsm_tsbbase = kpm_tsbbase;
13042 kpmsm_tsbsz = kpm_tsbsz;
13043 } else {
13045 * In VAC conflict case, just put the entries in the
13046 * kernel 8K indexed TSB for now so we can find them.
13047 * This could really be changed in the future if we feel
13048 * the need...
13050 kpmsm_tsbbase = ktsb_phys? ktsb_pbase : (uint64_t)ktsb_base;
13051 kpmsm_tsbsz = ktsb_szcode;
13052 kpm_tsbbase = ktsb_phys? ktsb4m_pbase : (uint64_t)ktsb4m_base;
13053 kpm_tsbsz = ktsb4m_szcode;
13056 kpmtsbmp = kpmtsbm_area;
13057 for (i = 0; i < NCPU; kpmtsbmp++, i++) {
13059 * Initialize the kpmtsbm area.
13060 * Do this for all possible CPUs as some may be added
13061 * while the system is running. There is no cost to this.
13063 kpmtsbmp->vbase = kpm_vbase;
13064 kpmtsbmp->vend = kpm_vbase + kpm_size * vac_colors;
13065 kpmtsbmp->sz_shift = kpm_size_shift;
13066 kpmtsbmp->kpmp_shift = kpmp_shift;
13067 kpmtsbmp->kpmp2pshft = (uchar_t)kpmp2pshft;
13068 if (kpm_smallpages == 0) {
13069 kpmtsbmp->kpmp_table_sz = kpmp_table_sz;
13070 kpmtsbmp->kpmp_tablepa = va_to_pa(kpmp_table);
13071 } else {
13072 kpmtsbmp->kpmp_table_sz = kpmp_stable_sz;
13073 kpmtsbmp->kpmp_tablepa = va_to_pa(kpmp_stable);
13075 kpmtsbmp->msegphashpa = va_to_pa(memseg_phash);
13076 kpmtsbmp->flags = KPMTSBM_ENABLE_FLAG;
13077 #ifdef DEBUG
13078 kpmtsbmp->flags |= (kpm_tsbmtl) ? KPMTSBM_TLTSBM_FLAG : 0;
13079 #endif /* DEBUG */
13080 if (ktsb_phys)
13081 kpmtsbmp->flags |= KPMTSBM_TSBPHYS_FLAG;
13084 /* -- End KPM specific init -- */
13087 /* Avoid using sfmmu_tsbinfo_alloc() to avoid kmem_alloc - no real reason */
13088 struct tsb_info ktsb_info[2];
13091 * Called from hat_kern_setup() to setup the tsb_info for ksfmmup.
13093 void
13094 sfmmu_init_ktsbinfo()
13096 ASSERT(ksfmmup != NULL);
13097 ASSERT(ksfmmup->sfmmu_tsb == NULL);
13099 * Allocate tsbinfos for kernel and copy in data
13100 * to make debug easier and sun4v setup easier.
13102 ktsb_info[0].tsb_sfmmu = ksfmmup;
13103 ktsb_info[0].tsb_szc = ktsb_szcode;
13104 ktsb_info[0].tsb_ttesz_mask = TSB8K|TSB64K|TSB512K;
13105 ktsb_info[0].tsb_va = ktsb_base;
13106 ktsb_info[0].tsb_pa = ktsb_pbase;
13107 ktsb_info[0].tsb_flags = 0;
13108 ktsb_info[0].tsb_tte.ll = 0;
13109 ktsb_info[0].tsb_cache = NULL;
13111 ktsb_info[1].tsb_sfmmu = ksfmmup;
13112 ktsb_info[1].tsb_szc = ktsb4m_szcode;
13113 ktsb_info[1].tsb_ttesz_mask = TSB4M;
13114 ktsb_info[1].tsb_va = ktsb4m_base;
13115 ktsb_info[1].tsb_pa = ktsb4m_pbase;
13116 ktsb_info[1].tsb_flags = 0;
13117 ktsb_info[1].tsb_tte.ll = 0;
13118 ktsb_info[1].tsb_cache = NULL;
13120 /* Link them into ksfmmup. */
13121 ktsb_info[0].tsb_next = &ktsb_info[1];
13122 ktsb_info[1].tsb_next = NULL;
13123 ksfmmup->sfmmu_tsb = &ktsb_info[0];
13125 sfmmu_setup_tsbinfo(ksfmmup);
13129 * Cache the last value returned from va_to_pa(). If the VA specified
13130 * in the current call to cached_va_to_pa() maps to the same Page (as the
13131 * previous call to cached_va_to_pa()), then compute the PA using
13132 * cached info, else call va_to_pa().
13134 * Note: this function is neither MT-safe nor consistent in the presence
13135 * of multiple, interleaved threads. This function was created to enable
13136 * an optimization used during boot (at a point when there's only one thread
13137 * executing on the "boot CPU", and before startup_vm() has been called).
13139 static uint64_t
13140 cached_va_to_pa(void *vaddr)
13142 static uint64_t prev_vaddr_base = 0;
13143 static uint64_t prev_pfn = 0;
13145 if ((((uint64_t)vaddr) & MMU_PAGEMASK) == prev_vaddr_base) {
13146 return (prev_pfn | ((uint64_t)vaddr & MMU_PAGEOFFSET));
13147 } else {
13148 uint64_t pa = va_to_pa(vaddr);
13150 if (pa != ((uint64_t)-1)) {
13152 * Computed physical address is valid. Cache its
13153 * related info for the next cached_va_to_pa() call.
13155 prev_pfn = pa & MMU_PAGEMASK;
13156 prev_vaddr_base = ((uint64_t)vaddr) & MMU_PAGEMASK;
13159 return (pa);
13164 * Carve up our nucleus hblk region. We may allocate more hblks than
13165 * asked due to rounding errors but we are guaranteed to have at least
13166 * enough space to allocate the requested number of hblk8's and hblk1's.
13168 void
13169 sfmmu_init_nucleus_hblks(caddr_t addr, size_t size, int nhblk8, int nhblk1)
13171 struct hme_blk *hmeblkp;
13172 size_t hme8blk_sz, hme1blk_sz;
13173 size_t i;
13174 size_t hblk8_bound;
13175 ulong_t j = 0, k = 0;
13177 ASSERT(addr != NULL && size != 0);
13179 /* Need to use proper structure alignment */
13180 hme8blk_sz = roundup(HME8BLK_SZ, sizeof (int64_t));
13181 hme1blk_sz = roundup(HME1BLK_SZ, sizeof (int64_t));
13183 nucleus_hblk8.list = (void *)addr;
13184 nucleus_hblk8.index = 0;
13187 * Use as much memory as possible for hblk8's since we
13188 * expect all bop_alloc'ed memory to be allocated in 8k chunks.
13189 * We need to hold back enough space for the hblk1's which
13190 * we'll allocate next.
13192 hblk8_bound = size - (nhblk1 * hme1blk_sz) - hme8blk_sz;
13193 for (i = 0; i <= hblk8_bound; i += hme8blk_sz, j++) {
13194 hmeblkp = (struct hme_blk *)addr;
13195 addr += hme8blk_sz;
13196 hmeblkp->hblk_nuc_bit = 1;
13197 hmeblkp->hblk_nextpa = cached_va_to_pa((caddr_t)hmeblkp);
13199 nucleus_hblk8.len = j;
13200 ASSERT(j >= nhblk8);
13201 SFMMU_STAT_ADD(sf_hblk8_ncreate, j);
13203 nucleus_hblk1.list = (void *)addr;
13204 nucleus_hblk1.index = 0;
13205 for (; i <= (size - hme1blk_sz); i += hme1blk_sz, k++) {
13206 hmeblkp = (struct hme_blk *)addr;
13207 addr += hme1blk_sz;
13208 hmeblkp->hblk_nuc_bit = 1;
13209 hmeblkp->hblk_nextpa = cached_va_to_pa((caddr_t)hmeblkp);
13211 ASSERT(k >= nhblk1);
13212 nucleus_hblk1.len = k;
13213 SFMMU_STAT_ADD(sf_hblk1_ncreate, k);
13217 * This function is currently not supported on this platform. For what
13218 * it's supposed to do, see hat.c and hat_srmmu.c
13220 /* ARGSUSED */
13221 faultcode_t
13222 hat_softlock(struct hat *hat, caddr_t addr, size_t *lenp, page_t **ppp,
13223 uint_t flags)
13225 return (FC_NOSUPPORT);
13229 * Searchs the mapping list of the page for a mapping of the same size. If not
13230 * found the corresponding bit is cleared in the p_index field. When large
13231 * pages are more prevalent in the system, we can maintain the mapping list
13232 * in order and we don't have to traverse the list each time. Just check the
13233 * next and prev entries, and if both are of different size, we clear the bit.
13235 static void
13236 sfmmu_rm_large_mappings(page_t *pp, int ttesz)
13238 struct sf_hment *sfhmep;
13239 struct hme_blk *hmeblkp;
13240 int index;
13241 pgcnt_t npgs;
13243 ASSERT(ttesz > TTE8K);
13245 ASSERT(sfmmu_mlist_held(pp));
13247 ASSERT(PP_ISMAPPED_LARGE(pp));
13250 * Traverse mapping list looking for another mapping of same size.
13251 * since we only want to clear index field if all mappings of
13252 * that size are gone.
13255 for (sfhmep = pp->p_mapping; sfhmep; sfhmep = sfhmep->hme_next) {
13256 if (IS_PAHME(sfhmep))
13257 continue;
13258 hmeblkp = sfmmu_hmetohblk(sfhmep);
13259 if (hme_size(sfhmep) == ttesz) {
13261 * another mapping of the same size. don't clear index.
13263 return;
13268 * Clear the p_index bit for large page.
13270 index = PAGESZ_TO_INDEX(ttesz);
13271 npgs = TTEPAGES(ttesz);
13272 while (npgs-- > 0) {
13273 ASSERT(pp->p_index & index);
13274 pp->p_index &= ~index;
13275 pp = PP_PAGENEXT(pp);
13280 * return supported features
13282 /* ARGSUSED */
13284 hat_supported(enum hat_features feature, void *arg)
13286 switch (feature) {
13287 case HAT_SHARED_PT:
13288 case HAT_DYNAMIC_ISM_UNMAP:
13289 case HAT_VMODSORT:
13290 return (1);
13291 case HAT_SHARED_REGIONS:
13292 if (shctx_on)
13293 return (1);
13294 else
13295 return (0);
13296 default:
13297 return (0);
13301 void
13302 hat_enter(struct hat *hat)
13304 hatlock_t *hatlockp;
13306 if (hat != ksfmmup) {
13307 hatlockp = TSB_HASH(hat);
13308 mutex_enter(HATLOCK_MUTEXP(hatlockp));
13312 void
13313 hat_exit(struct hat *hat)
13315 hatlock_t *hatlockp;
13317 if (hat != ksfmmup) {
13318 hatlockp = TSB_HASH(hat);
13319 mutex_exit(HATLOCK_MUTEXP(hatlockp));
13323 /*ARGSUSED*/
13324 void
13325 hat_reserve(struct as *as, caddr_t addr, size_t len)
13329 static void
13330 hat_kstat_init(void)
13332 kstat_t *ksp;
13334 ksp = kstat_create("unix", 0, "sfmmu_global_stat", "hat",
13335 KSTAT_TYPE_RAW, sizeof (struct sfmmu_global_stat),
13336 KSTAT_FLAG_VIRTUAL);
13337 if (ksp) {
13338 ksp->ks_data = (void *) &sfmmu_global_stat;
13339 kstat_install(ksp);
13341 ksp = kstat_create("unix", 0, "sfmmu_tsbsize_stat", "hat",
13342 KSTAT_TYPE_RAW, sizeof (struct sfmmu_tsbsize_stat),
13343 KSTAT_FLAG_VIRTUAL);
13344 if (ksp) {
13345 ksp->ks_data = (void *) &sfmmu_tsbsize_stat;
13346 kstat_install(ksp);
13348 ksp = kstat_create("unix", 0, "sfmmu_percpu_stat", "hat",
13349 KSTAT_TYPE_RAW, sizeof (struct sfmmu_percpu_stat) * NCPU,
13350 KSTAT_FLAG_WRITABLE);
13351 if (ksp) {
13352 ksp->ks_update = sfmmu_kstat_percpu_update;
13353 kstat_install(ksp);
13357 /* ARGSUSED */
13358 static int
13359 sfmmu_kstat_percpu_update(kstat_t *ksp, int rw)
13361 struct sfmmu_percpu_stat *cpu_kstat = ksp->ks_data;
13362 struct tsbmiss *tsbm = tsbmiss_area;
13363 struct kpmtsbm *kpmtsbm = kpmtsbm_area;
13364 int i;
13366 ASSERT(cpu_kstat);
13367 if (rw == KSTAT_READ) {
13368 for (i = 0; i < NCPU; cpu_kstat++, tsbm++, kpmtsbm++, i++) {
13369 cpu_kstat->sf_itlb_misses = 0;
13370 cpu_kstat->sf_dtlb_misses = 0;
13371 cpu_kstat->sf_utsb_misses = tsbm->utsb_misses -
13372 tsbm->uprot_traps;
13373 cpu_kstat->sf_ktsb_misses = tsbm->ktsb_misses +
13374 kpmtsbm->kpm_tsb_misses - tsbm->kprot_traps;
13375 cpu_kstat->sf_tsb_hits = 0;
13376 cpu_kstat->sf_umod_faults = tsbm->uprot_traps;
13377 cpu_kstat->sf_kmod_faults = tsbm->kprot_traps;
13379 } else {
13380 /* KSTAT_WRITE is used to clear stats */
13381 for (i = 0; i < NCPU; tsbm++, kpmtsbm++, i++) {
13382 tsbm->utsb_misses = 0;
13383 tsbm->ktsb_misses = 0;
13384 tsbm->uprot_traps = 0;
13385 tsbm->kprot_traps = 0;
13386 kpmtsbm->kpm_dtlb_misses = 0;
13387 kpmtsbm->kpm_tsb_misses = 0;
13390 return (0);
13393 #ifdef DEBUG
13395 tte_t *gorig[NCPU], *gcur[NCPU], *gnew[NCPU];
13398 * A tte checker. *orig_old is the value we read before cas.
13399 * *cur is the value returned by cas.
13400 * *new is the desired value when we do the cas.
13402 * *hmeblkp is currently unused.
13405 /* ARGSUSED */
13406 void
13407 chk_tte(tte_t *orig_old, tte_t *cur, tte_t *new, struct hme_blk *hmeblkp)
13409 pfn_t i, j, k;
13410 int cpuid = CPU->cpu_id;
13412 gorig[cpuid] = orig_old;
13413 gcur[cpuid] = cur;
13414 gnew[cpuid] = new;
13416 #ifdef lint
13417 hmeblkp = hmeblkp;
13418 #endif
13420 if (TTE_IS_VALID(orig_old)) {
13421 if (TTE_IS_VALID(cur)) {
13422 i = TTE_TO_TTEPFN(orig_old);
13423 j = TTE_TO_TTEPFN(cur);
13424 k = TTE_TO_TTEPFN(new);
13425 if (i != j) {
13426 /* remap error? */
13427 panic("chk_tte: bad pfn, 0x%lx, 0x%lx", i, j);
13430 if (i != k) {
13431 /* remap error? */
13432 panic("chk_tte: bad pfn2, 0x%lx, 0x%lx", i, k);
13434 } else {
13435 if (TTE_IS_VALID(new)) {
13436 panic("chk_tte: invalid cur? ");
13439 i = TTE_TO_TTEPFN(orig_old);
13440 k = TTE_TO_TTEPFN(new);
13441 if (i != k) {
13442 panic("chk_tte: bad pfn3, 0x%lx, 0x%lx", i, k);
13445 } else {
13446 if (TTE_IS_VALID(cur)) {
13447 j = TTE_TO_TTEPFN(cur);
13448 if (TTE_IS_VALID(new)) {
13449 k = TTE_TO_TTEPFN(new);
13450 if (j != k) {
13451 panic("chk_tte: bad pfn4, 0x%lx, 0x%lx",
13452 j, k);
13454 } else {
13455 panic("chk_tte: why here?");
13457 } else {
13458 if (!TTE_IS_VALID(new)) {
13459 panic("chk_tte: why here2 ?");
13465 #endif /* DEBUG */
13467 extern void prefetch_tsbe_read(struct tsbe *);
13468 extern void prefetch_tsbe_write(struct tsbe *);
13472 * We want to prefetch 7 cache lines ahead for our read prefetch. This gives
13473 * us optimal performance on Cheetah+. You can only have 8 outstanding
13474 * prefetches at any one time, so we opted for 7 read prefetches and 1 write
13475 * prefetch to make the most utilization of the prefetch capability.
13477 #define TSBE_PREFETCH_STRIDE (7)
13479 void
13480 sfmmu_copy_tsb(struct tsb_info *old_tsbinfo, struct tsb_info *new_tsbinfo)
13482 int old_bytes = TSB_BYTES(old_tsbinfo->tsb_szc);
13483 int new_bytes = TSB_BYTES(new_tsbinfo->tsb_szc);
13484 int old_entries = TSB_ENTRIES(old_tsbinfo->tsb_szc);
13485 int new_entries = TSB_ENTRIES(new_tsbinfo->tsb_szc);
13486 struct tsbe *old;
13487 struct tsbe *new;
13488 struct tsbe *new_base = (struct tsbe *)new_tsbinfo->tsb_va;
13489 uint64_t va;
13490 int new_offset;
13491 int i;
13492 int vpshift;
13493 int last_prefetch;
13495 if (old_bytes == new_bytes) {
13496 bcopy(old_tsbinfo->tsb_va, new_tsbinfo->tsb_va, new_bytes);
13497 } else {
13500 * A TSBE is 16 bytes which means there are four TSBE's per
13501 * P$ line (64 bytes), thus every 4 TSBE's we prefetch.
13503 old = (struct tsbe *)old_tsbinfo->tsb_va;
13504 last_prefetch = old_entries - (4*(TSBE_PREFETCH_STRIDE+1));
13505 for (i = 0; i < old_entries; i++, old++) {
13506 if (((i & (4-1)) == 0) && (i < last_prefetch))
13507 prefetch_tsbe_read(old);
13508 if (!old->tte_tag.tag_invalid) {
13510 * We have a valid TTE to remap. Check the
13511 * size. We won't remap 64K or 512K TTEs
13512 * because they span more than one TSB entry
13513 * and are indexed using an 8K virt. page.
13514 * Ditto for 32M and 256M TTEs.
13516 if (TTE_CSZ(&old->tte_data) == TTE64K ||
13517 TTE_CSZ(&old->tte_data) == TTE512K)
13518 continue;
13519 if (mmu_page_sizes == max_mmu_page_sizes) {
13520 if (TTE_CSZ(&old->tte_data) == TTE32M ||
13521 TTE_CSZ(&old->tte_data) == TTE256M)
13522 continue;
13525 /* clear the lower 22 bits of the va */
13526 va = *(uint64_t *)old << 22;
13527 /* turn va into a virtual pfn */
13528 va >>= 22 - TSB_START_SIZE;
13530 * or in bits from the offset in the tsb
13531 * to get the real virtual pfn. These
13532 * correspond to bits [21:13] in the va
13534 vpshift =
13535 TTE_BSZS_SHIFT(TTE_CSZ(&old->tte_data)) &
13536 0x1ff;
13537 va |= (i << vpshift);
13538 va >>= vpshift;
13539 new_offset = va & (new_entries - 1);
13540 new = new_base + new_offset;
13541 prefetch_tsbe_write(new);
13542 *new = *old;
13549 * unused in sfmmu
13551 void
13552 hat_dump(void)
13557 * Called when a thread is exiting and we have switched to the kernel address
13558 * space. Perform the same VM initialization resume() uses when switching
13559 * processes.
13561 * Note that sfmmu_load_mmustate() is currently a no-op for kernel threads, but
13562 * we call it anyway in case the semantics change in the future.
13564 /*ARGSUSED*/
13565 void
13566 hat_thread_exit(kthread_t *thd)
13568 uint_t pgsz_cnum;
13569 uint_t pstate_save;
13571 ASSERT(thd->t_procp->p_as == &kas);
13573 pgsz_cnum = KCONTEXT;
13574 #ifdef sun4u
13575 pgsz_cnum |= (ksfmmup->sfmmu_cext << CTXREG_EXT_SHIFT);
13576 #endif
13579 * Note that sfmmu_load_mmustate() is currently a no-op for
13580 * kernel threads. We need to disable interrupts here,
13581 * simply because otherwise sfmmu_load_mmustate() would panic
13582 * if the caller does not disable interrupts.
13584 pstate_save = sfmmu_disable_intrs();
13586 /* Compatibility Note: hw takes care of MMU_SCONTEXT1 */
13587 sfmmu_setctx_sec(pgsz_cnum);
13588 sfmmu_load_mmustate(ksfmmup);
13589 sfmmu_enable_intrs(pstate_save);
13594 * SRD support
13596 #define SRD_HASH_FUNCTION(vp) (((((uintptr_t)(vp)) >> 4) ^ \
13597 (((uintptr_t)(vp)) >> 11)) & \
13598 srd_hashmask)
13601 * Attach the process to the srd struct associated with the exec vnode
13602 * from which the process is started.
13604 void
13605 hat_join_srd(struct hat *sfmmup, vnode_t *evp)
13607 uint_t hash = SRD_HASH_FUNCTION(evp);
13608 sf_srd_t *srdp;
13609 sf_srd_t *newsrdp;
13611 ASSERT(sfmmup != ksfmmup);
13612 ASSERT(sfmmup->sfmmu_srdp == NULL);
13614 if (!shctx_on) {
13615 return;
13618 VN_HOLD(evp);
13620 if (srd_buckets[hash].srdb_srdp != NULL) {
13621 mutex_enter(&srd_buckets[hash].srdb_lock);
13622 for (srdp = srd_buckets[hash].srdb_srdp; srdp != NULL;
13623 srdp = srdp->srd_hash) {
13624 if (srdp->srd_evp == evp) {
13625 ASSERT(srdp->srd_refcnt >= 0);
13626 sfmmup->sfmmu_srdp = srdp;
13627 atomic_inc_32(
13628 (volatile uint_t *)&srdp->srd_refcnt);
13629 mutex_exit(&srd_buckets[hash].srdb_lock);
13630 return;
13633 mutex_exit(&srd_buckets[hash].srdb_lock);
13635 newsrdp = kmem_cache_alloc(srd_cache, KM_SLEEP);
13636 ASSERT(newsrdp->srd_next_ismrid == 0 && newsrdp->srd_next_hmerid == 0);
13638 newsrdp->srd_evp = evp;
13639 newsrdp->srd_refcnt = 1;
13640 newsrdp->srd_hmergnfree = NULL;
13641 newsrdp->srd_ismrgnfree = NULL;
13643 mutex_enter(&srd_buckets[hash].srdb_lock);
13644 for (srdp = srd_buckets[hash].srdb_srdp; srdp != NULL;
13645 srdp = srdp->srd_hash) {
13646 if (srdp->srd_evp == evp) {
13647 ASSERT(srdp->srd_refcnt >= 0);
13648 sfmmup->sfmmu_srdp = srdp;
13649 atomic_inc_32((volatile uint_t *)&srdp->srd_refcnt);
13650 mutex_exit(&srd_buckets[hash].srdb_lock);
13651 kmem_cache_free(srd_cache, newsrdp);
13652 return;
13655 newsrdp->srd_hash = srd_buckets[hash].srdb_srdp;
13656 srd_buckets[hash].srdb_srdp = newsrdp;
13657 sfmmup->sfmmu_srdp = newsrdp;
13659 mutex_exit(&srd_buckets[hash].srdb_lock);
13663 static void
13664 sfmmu_leave_srd(sfmmu_t *sfmmup)
13666 vnode_t *evp;
13667 sf_srd_t *srdp = sfmmup->sfmmu_srdp;
13668 uint_t hash;
13669 sf_srd_t **prev_srdpp;
13670 sf_region_t *rgnp;
13671 sf_region_t *nrgnp;
13672 #ifdef DEBUG
13673 int rgns = 0;
13674 #endif
13675 int i;
13677 ASSERT(sfmmup != ksfmmup);
13678 ASSERT(srdp != NULL);
13679 ASSERT(srdp->srd_refcnt > 0);
13680 ASSERT(sfmmup->sfmmu_scdp == NULL);
13681 ASSERT(sfmmup->sfmmu_free == 1);
13683 sfmmup->sfmmu_srdp = NULL;
13684 evp = srdp->srd_evp;
13685 ASSERT(evp != NULL);
13686 if (atomic_dec_32_nv((volatile uint_t *)&srdp->srd_refcnt)) {
13687 VN_RELE(evp);
13688 return;
13691 hash = SRD_HASH_FUNCTION(evp);
13692 mutex_enter(&srd_buckets[hash].srdb_lock);
13693 for (prev_srdpp = &srd_buckets[hash].srdb_srdp;
13694 (srdp = *prev_srdpp) != NULL; prev_srdpp = &srdp->srd_hash) {
13695 if (srdp->srd_evp == evp) {
13696 break;
13699 if (srdp == NULL || srdp->srd_refcnt) {
13700 mutex_exit(&srd_buckets[hash].srdb_lock);
13701 VN_RELE(evp);
13702 return;
13704 *prev_srdpp = srdp->srd_hash;
13705 mutex_exit(&srd_buckets[hash].srdb_lock);
13707 ASSERT(srdp->srd_refcnt == 0);
13708 VN_RELE(evp);
13710 #ifdef DEBUG
13711 for (i = 0; i < SFMMU_MAX_REGION_BUCKETS; i++) {
13712 ASSERT(srdp->srd_rgnhash[i] == NULL);
13714 #endif /* DEBUG */
13716 /* free each hme regions in the srd */
13717 for (rgnp = srdp->srd_hmergnfree; rgnp != NULL; rgnp = nrgnp) {
13718 nrgnp = rgnp->rgn_next;
13719 ASSERT(rgnp->rgn_id < srdp->srd_next_hmerid);
13720 ASSERT(rgnp->rgn_refcnt == 0);
13721 ASSERT(rgnp->rgn_sfmmu_head == NULL);
13722 ASSERT(rgnp->rgn_flags & SFMMU_REGION_FREE);
13723 ASSERT(rgnp->rgn_hmeflags == 0);
13724 ASSERT(srdp->srd_hmergnp[rgnp->rgn_id] == rgnp);
13725 #ifdef DEBUG
13726 for (i = 0; i < MMU_PAGE_SIZES; i++) {
13727 ASSERT(rgnp->rgn_ttecnt[i] == 0);
13729 rgns++;
13730 #endif /* DEBUG */
13731 kmem_cache_free(region_cache, rgnp);
13733 ASSERT(rgns == srdp->srd_next_hmerid);
13735 #ifdef DEBUG
13736 rgns = 0;
13737 #endif
13738 /* free each ism rgns in the srd */
13739 for (rgnp = srdp->srd_ismrgnfree; rgnp != NULL; rgnp = nrgnp) {
13740 nrgnp = rgnp->rgn_next;
13741 ASSERT(rgnp->rgn_id < srdp->srd_next_ismrid);
13742 ASSERT(rgnp->rgn_refcnt == 0);
13743 ASSERT(rgnp->rgn_sfmmu_head == NULL);
13744 ASSERT(rgnp->rgn_flags & SFMMU_REGION_FREE);
13745 ASSERT(srdp->srd_ismrgnp[rgnp->rgn_id] == rgnp);
13746 #ifdef DEBUG
13747 for (i = 0; i < MMU_PAGE_SIZES; i++) {
13748 ASSERT(rgnp->rgn_ttecnt[i] == 0);
13750 rgns++;
13751 #endif /* DEBUG */
13752 kmem_cache_free(region_cache, rgnp);
13754 ASSERT(rgns == srdp->srd_next_ismrid);
13755 ASSERT(srdp->srd_ismbusyrgns == 0);
13756 ASSERT(srdp->srd_hmebusyrgns == 0);
13758 srdp->srd_next_ismrid = 0;
13759 srdp->srd_next_hmerid = 0;
13761 bzero((void *)srdp->srd_ismrgnp,
13762 sizeof (sf_region_t *) * SFMMU_MAX_ISM_REGIONS);
13763 bzero((void *)srdp->srd_hmergnp,
13764 sizeof (sf_region_t *) * SFMMU_MAX_HME_REGIONS);
13766 ASSERT(srdp->srd_scdp == NULL);
13767 kmem_cache_free(srd_cache, srdp);
13770 /* ARGSUSED */
13771 static int
13772 sfmmu_srdcache_constructor(void *buf, void *cdrarg, int kmflags)
13774 sf_srd_t *srdp = (sf_srd_t *)buf;
13775 bzero(buf, sizeof (*srdp));
13777 mutex_init(&srdp->srd_mutex, NULL, MUTEX_DEFAULT, NULL);
13778 mutex_init(&srdp->srd_scd_mutex, NULL, MUTEX_DEFAULT, NULL);
13779 return (0);
13782 /* ARGSUSED */
13783 static void
13784 sfmmu_srdcache_destructor(void *buf, void *cdrarg)
13786 sf_srd_t *srdp = (sf_srd_t *)buf;
13788 mutex_destroy(&srdp->srd_mutex);
13789 mutex_destroy(&srdp->srd_scd_mutex);
13793 * The caller makes sure hat_join_region()/hat_leave_region() can't be called
13794 * at the same time for the same process and address range. This is ensured by
13795 * the fact that address space is locked as writer when a process joins the
13796 * regions. Therefore there's no need to hold an srd lock during the entire
13797 * execution of hat_join_region()/hat_leave_region().
13800 #define RGN_HASH_FUNCTION(obj) (((((uintptr_t)(obj)) >> 4) ^ \
13801 (((uintptr_t)(obj)) >> 11)) & \
13802 srd_rgn_hashmask)
13804 * This routine implements the shared context functionality required when
13805 * attaching a segment to an address space. It must be called from
13806 * hat_share() for D(ISM) segments and from segvn_create() for segments
13807 * with the MAP_PRIVATE and MAP_TEXT flags set. It returns a region_cookie
13808 * which is saved in the private segment data for hme segments and
13809 * the ism_map structure for ism segments.
13811 hat_region_cookie_t
13812 hat_join_region(struct hat *sfmmup,
13813 caddr_t r_saddr,
13814 size_t r_size,
13815 void *r_obj,
13816 u_offset_t r_objoff,
13817 uchar_t r_perm,
13818 uchar_t r_pgszc,
13819 hat_rgn_cb_func_t r_cb_function,
13820 uint_t flags)
13822 sf_srd_t *srdp = sfmmup->sfmmu_srdp;
13823 uint_t rhash;
13824 uint_t rid;
13825 hatlock_t *hatlockp;
13826 sf_region_t *rgnp;
13827 sf_region_t *new_rgnp = NULL;
13828 int i;
13829 uint16_t *nextidp;
13830 sf_region_t **freelistp;
13831 int maxids;
13832 sf_region_t **rarrp;
13833 uint16_t *busyrgnsp;
13834 ulong_t rttecnt;
13835 uchar_t tteflag;
13836 uchar_t r_type = flags & HAT_REGION_TYPE_MASK;
13837 int text = (r_type == HAT_REGION_TEXT);
13839 if (srdp == NULL || r_size == 0) {
13840 return (HAT_INVALID_REGION_COOKIE);
13843 ASSERT(sfmmup != ksfmmup);
13844 ASSERT(AS_WRITE_HELD(sfmmup->sfmmu_as));
13845 ASSERT(srdp->srd_refcnt > 0);
13846 ASSERT(!(flags & ~HAT_REGION_TYPE_MASK));
13847 ASSERT(flags == HAT_REGION_TEXT || flags == HAT_REGION_ISM);
13848 ASSERT(r_pgszc < mmu_page_sizes);
13849 if (!IS_P2ALIGNED(r_saddr, TTEBYTES(r_pgszc)) ||
13850 !IS_P2ALIGNED(r_size, TTEBYTES(r_pgszc))) {
13851 panic("hat_join_region: region addr or size is not aligned\n");
13855 r_type = (r_type == HAT_REGION_ISM) ? SFMMU_REGION_ISM :
13856 SFMMU_REGION_HME;
13858 * Currently only support shared hmes for the read only main text
13859 * region.
13861 if (r_type == SFMMU_REGION_HME && ((r_obj != srdp->srd_evp) ||
13862 (r_perm & PROT_WRITE))) {
13863 return (HAT_INVALID_REGION_COOKIE);
13866 rhash = RGN_HASH_FUNCTION(r_obj);
13868 if (r_type == SFMMU_REGION_ISM) {
13869 nextidp = &srdp->srd_next_ismrid;
13870 freelistp = &srdp->srd_ismrgnfree;
13871 maxids = SFMMU_MAX_ISM_REGIONS;
13872 rarrp = srdp->srd_ismrgnp;
13873 busyrgnsp = &srdp->srd_ismbusyrgns;
13874 } else {
13875 nextidp = &srdp->srd_next_hmerid;
13876 freelistp = &srdp->srd_hmergnfree;
13877 maxids = SFMMU_MAX_HME_REGIONS;
13878 rarrp = srdp->srd_hmergnp;
13879 busyrgnsp = &srdp->srd_hmebusyrgns;
13882 mutex_enter(&srdp->srd_mutex);
13884 for (rgnp = srdp->srd_rgnhash[rhash]; rgnp != NULL;
13885 rgnp = rgnp->rgn_hash) {
13886 if (rgnp->rgn_saddr == r_saddr && rgnp->rgn_size == r_size &&
13887 rgnp->rgn_obj == r_obj && rgnp->rgn_objoff == r_objoff &&
13888 rgnp->rgn_perm == r_perm && rgnp->rgn_pgszc == r_pgszc) {
13889 break;
13893 rfound:
13894 if (rgnp != NULL) {
13895 ASSERT((rgnp->rgn_flags & SFMMU_REGION_TYPE_MASK) == r_type);
13896 ASSERT(rgnp->rgn_cb_function == r_cb_function);
13897 ASSERT(rgnp->rgn_refcnt >= 0);
13898 rid = rgnp->rgn_id;
13899 ASSERT(rid < maxids);
13900 ASSERT(rarrp[rid] == rgnp);
13901 ASSERT(rid < *nextidp);
13902 atomic_inc_32((volatile uint_t *)&rgnp->rgn_refcnt);
13903 mutex_exit(&srdp->srd_mutex);
13904 if (new_rgnp != NULL) {
13905 kmem_cache_free(region_cache, new_rgnp);
13907 if (r_type == SFMMU_REGION_HME) {
13908 int myjoin =
13909 (sfmmup == astosfmmu(curthread->t_procp->p_as));
13911 sfmmu_link_to_hmeregion(sfmmup, rgnp);
13913 * bitmap should be updated after linking sfmmu on
13914 * region list so that pageunload() doesn't skip
13915 * TSB/TLB flush. As soon as bitmap is updated another
13916 * thread in this process can already start accessing
13917 * this region.
13920 * Normally ttecnt accounting is done as part of
13921 * pagefault handling. But a process may not take any
13922 * pagefaults on shared hmeblks created by some other
13923 * process. To compensate for this assume that the
13924 * entire region will end up faulted in using
13925 * the region's pagesize.
13928 if (r_pgszc > TTE8K) {
13929 tteflag = 1 << r_pgszc;
13930 if (disable_large_pages & tteflag) {
13931 tteflag = 0;
13933 } else {
13934 tteflag = 0;
13936 if (tteflag && !(sfmmup->sfmmu_rtteflags & tteflag)) {
13937 hatlockp = sfmmu_hat_enter(sfmmup);
13938 sfmmup->sfmmu_rtteflags |= tteflag;
13939 sfmmu_hat_exit(hatlockp);
13941 hatlockp = sfmmu_hat_enter(sfmmup);
13944 * Preallocate 1/4 of ttecnt's in 8K TSB for >= 4M
13945 * region to allow for large page allocation failure.
13947 if (r_pgszc >= TTE4M) {
13948 sfmmup->sfmmu_tsb0_4minflcnt +=
13949 r_size >> (TTE_PAGE_SHIFT(TTE8K) + 2);
13952 /* update sfmmu_ttecnt with the shme rgn ttecnt */
13953 rttecnt = r_size >> TTE_PAGE_SHIFT(r_pgszc);
13954 atomic_add_long(&sfmmup->sfmmu_ttecnt[r_pgszc],
13955 rttecnt);
13957 if (text && r_pgszc >= TTE4M &&
13958 (tteflag || ((disable_large_pages >> TTE4M) &
13959 ((1 << (r_pgszc - TTE4M + 1)) - 1))) &&
13960 !SFMMU_FLAGS_ISSET(sfmmup, HAT_4MTEXT_FLAG)) {
13961 SFMMU_FLAGS_SET(sfmmup, HAT_4MTEXT_FLAG);
13964 sfmmu_hat_exit(hatlockp);
13966 * On Panther we need to make sure TLB is programmed
13967 * to accept 32M/256M pages. Call
13968 * sfmmu_check_page_sizes() now to make sure TLB is
13969 * setup before making hmeregions visible to other
13970 * threads.
13972 sfmmu_check_page_sizes(sfmmup, 1);
13973 hatlockp = sfmmu_hat_enter(sfmmup);
13974 SF_RGNMAP_ADD(sfmmup->sfmmu_hmeregion_map, rid);
13977 * if context is invalid tsb miss exception code will
13978 * call sfmmu_check_page_sizes() and update tsbmiss
13979 * area later.
13981 kpreempt_disable();
13982 if (myjoin &&
13983 (sfmmup->sfmmu_ctxs[CPU_MMU_IDX(CPU)].cnum
13984 != INVALID_CONTEXT)) {
13985 struct tsbmiss *tsbmp;
13987 tsbmp = &tsbmiss_area[CPU->cpu_id];
13988 ASSERT(sfmmup == tsbmp->usfmmup);
13989 BT_SET(tsbmp->shmermap, rid);
13990 if (r_pgszc > TTE64K) {
13991 tsbmp->uhat_rtteflags |= tteflag;
13995 kpreempt_enable();
13997 sfmmu_hat_exit(hatlockp);
13998 ASSERT((hat_region_cookie_t)((uint64_t)rid) !=
13999 HAT_INVALID_REGION_COOKIE);
14000 } else {
14001 hatlockp = sfmmu_hat_enter(sfmmup);
14002 SF_RGNMAP_ADD(sfmmup->sfmmu_ismregion_map, rid);
14003 sfmmu_hat_exit(hatlockp);
14005 ASSERT(rid < maxids);
14007 if (r_type == SFMMU_REGION_ISM) {
14008 sfmmu_find_scd(sfmmup);
14010 return ((hat_region_cookie_t)((uint64_t)rid));
14013 ASSERT(new_rgnp == NULL);
14015 if (*busyrgnsp >= maxids) {
14016 mutex_exit(&srdp->srd_mutex);
14017 return (HAT_INVALID_REGION_COOKIE);
14020 ASSERT(MUTEX_HELD(&srdp->srd_mutex));
14021 if (*freelistp != NULL) {
14022 rgnp = *freelistp;
14023 *freelistp = rgnp->rgn_next;
14024 ASSERT(rgnp->rgn_id < *nextidp);
14025 ASSERT(rgnp->rgn_id < maxids);
14026 ASSERT(rgnp->rgn_flags & SFMMU_REGION_FREE);
14027 ASSERT((rgnp->rgn_flags & SFMMU_REGION_TYPE_MASK)
14028 == r_type);
14029 ASSERT(rarrp[rgnp->rgn_id] == rgnp);
14030 ASSERT(rgnp->rgn_hmeflags == 0);
14031 } else {
14033 * release local locks before memory allocation.
14035 mutex_exit(&srdp->srd_mutex);
14037 new_rgnp = kmem_cache_alloc(region_cache, KM_SLEEP);
14039 mutex_enter(&srdp->srd_mutex);
14040 for (rgnp = srdp->srd_rgnhash[rhash]; rgnp != NULL;
14041 rgnp = rgnp->rgn_hash) {
14042 if (rgnp->rgn_saddr == r_saddr &&
14043 rgnp->rgn_size == r_size &&
14044 rgnp->rgn_obj == r_obj &&
14045 rgnp->rgn_objoff == r_objoff &&
14046 rgnp->rgn_perm == r_perm &&
14047 rgnp->rgn_pgszc == r_pgszc) {
14048 break;
14051 if (rgnp != NULL) {
14052 goto rfound;
14055 if (*nextidp >= maxids) {
14056 mutex_exit(&srdp->srd_mutex);
14057 goto fail;
14059 rgnp = new_rgnp;
14060 new_rgnp = NULL;
14061 rgnp->rgn_id = (*nextidp)++;
14062 ASSERT(rgnp->rgn_id < maxids);
14063 ASSERT(rarrp[rgnp->rgn_id] == NULL);
14064 rarrp[rgnp->rgn_id] = rgnp;
14067 ASSERT(rgnp->rgn_sfmmu_head == NULL);
14068 ASSERT(rgnp->rgn_hmeflags == 0);
14069 #ifdef DEBUG
14070 for (i = 0; i < MMU_PAGE_SIZES; i++) {
14071 ASSERT(rgnp->rgn_ttecnt[i] == 0);
14073 #endif
14074 rgnp->rgn_saddr = r_saddr;
14075 rgnp->rgn_size = r_size;
14076 rgnp->rgn_obj = r_obj;
14077 rgnp->rgn_objoff = r_objoff;
14078 rgnp->rgn_perm = r_perm;
14079 rgnp->rgn_pgszc = r_pgszc;
14080 rgnp->rgn_flags = r_type;
14081 rgnp->rgn_refcnt = 0;
14082 rgnp->rgn_cb_function = r_cb_function;
14083 rgnp->rgn_hash = srdp->srd_rgnhash[rhash];
14084 srdp->srd_rgnhash[rhash] = rgnp;
14085 (*busyrgnsp)++;
14086 ASSERT(*busyrgnsp <= maxids);
14087 goto rfound;
14089 fail:
14090 ASSERT(new_rgnp != NULL);
14091 kmem_cache_free(region_cache, new_rgnp);
14092 return (HAT_INVALID_REGION_COOKIE);
14096 * This function implements the shared context functionality required
14097 * when detaching a segment from an address space. It must be called
14098 * from hat_unshare() for all D(ISM) segments and from segvn_unmap(),
14099 * for segments with a valid region_cookie.
14100 * It will also be called from all seg_vn routines which change a
14101 * segment's attributes such as segvn_setprot(), segvn_setpagesize(),
14102 * segvn_clrszc() & segvn_advise(), as well as in the case of COW fault
14103 * from segvn_fault().
14105 void
14106 hat_leave_region(struct hat *sfmmup, hat_region_cookie_t rcookie, uint_t flags)
14108 sf_srd_t *srdp = sfmmup->sfmmu_srdp;
14109 sf_scd_t *scdp;
14110 uint_t rhash;
14111 uint_t rid = (uint_t)((uint64_t)rcookie);
14112 hatlock_t *hatlockp = NULL;
14113 sf_region_t *rgnp;
14114 sf_region_t **prev_rgnpp;
14115 sf_region_t *cur_rgnp;
14116 void *r_obj;
14117 int i;
14118 caddr_t r_saddr;
14119 caddr_t r_eaddr;
14120 size_t r_size;
14121 uchar_t r_pgszc;
14122 uchar_t r_type = flags & HAT_REGION_TYPE_MASK;
14124 ASSERT(sfmmup != ksfmmup);
14125 ASSERT(srdp != NULL);
14126 ASSERT(srdp->srd_refcnt > 0);
14127 ASSERT(!(flags & ~HAT_REGION_TYPE_MASK));
14128 ASSERT(flags == HAT_REGION_TEXT || flags == HAT_REGION_ISM);
14129 ASSERT(!sfmmup->sfmmu_free || sfmmup->sfmmu_scdp == NULL);
14131 r_type = (r_type == HAT_REGION_ISM) ? SFMMU_REGION_ISM :
14132 SFMMU_REGION_HME;
14134 if (r_type == SFMMU_REGION_ISM) {
14135 ASSERT(SFMMU_IS_ISMRID_VALID(rid));
14136 ASSERT(rid < SFMMU_MAX_ISM_REGIONS);
14137 rgnp = srdp->srd_ismrgnp[rid];
14138 } else {
14139 ASSERT(SFMMU_IS_SHMERID_VALID(rid));
14140 ASSERT(rid < SFMMU_MAX_HME_REGIONS);
14141 rgnp = srdp->srd_hmergnp[rid];
14143 ASSERT(rgnp != NULL);
14144 ASSERT(rgnp->rgn_id == rid);
14145 ASSERT((rgnp->rgn_flags & SFMMU_REGION_TYPE_MASK) == r_type);
14146 ASSERT(!(rgnp->rgn_flags & SFMMU_REGION_FREE));
14147 ASSERT(AS_LOCK_HELD(sfmmup->sfmmu_as));
14149 if (sfmmup->sfmmu_free) {
14150 ulong_t rttecnt;
14151 r_pgszc = rgnp->rgn_pgszc;
14152 r_size = rgnp->rgn_size;
14154 ASSERT(sfmmup->sfmmu_scdp == NULL);
14155 if (r_type == SFMMU_REGION_ISM) {
14156 SF_RGNMAP_DEL(sfmmup->sfmmu_ismregion_map, rid);
14157 } else {
14158 /* update shme rgns ttecnt in sfmmu_ttecnt */
14159 rttecnt = r_size >> TTE_PAGE_SHIFT(r_pgszc);
14160 ASSERT(sfmmup->sfmmu_ttecnt[r_pgszc] >= rttecnt);
14162 atomic_add_long(&sfmmup->sfmmu_ttecnt[r_pgszc],
14163 -rttecnt);
14165 SF_RGNMAP_DEL(sfmmup->sfmmu_hmeregion_map, rid);
14167 } else if (r_type == SFMMU_REGION_ISM) {
14168 hatlockp = sfmmu_hat_enter(sfmmup);
14169 ASSERT(rid < srdp->srd_next_ismrid);
14170 SF_RGNMAP_DEL(sfmmup->sfmmu_ismregion_map, rid);
14171 scdp = sfmmup->sfmmu_scdp;
14172 if (scdp != NULL &&
14173 SF_RGNMAP_TEST(scdp->scd_ismregion_map, rid)) {
14174 sfmmu_leave_scd(sfmmup, r_type);
14175 ASSERT(sfmmu_hat_lock_held(sfmmup));
14177 sfmmu_hat_exit(hatlockp);
14178 } else {
14179 ulong_t rttecnt;
14180 r_pgszc = rgnp->rgn_pgszc;
14181 r_saddr = rgnp->rgn_saddr;
14182 r_size = rgnp->rgn_size;
14183 r_eaddr = r_saddr + r_size;
14185 ASSERT(r_type == SFMMU_REGION_HME);
14186 hatlockp = sfmmu_hat_enter(sfmmup);
14187 ASSERT(rid < srdp->srd_next_hmerid);
14188 SF_RGNMAP_DEL(sfmmup->sfmmu_hmeregion_map, rid);
14191 * If region is part of an SCD call sfmmu_leave_scd().
14192 * Otherwise if process is not exiting and has valid context
14193 * just drop the context on the floor to lose stale TLB
14194 * entries and force the update of tsb miss area to reflect
14195 * the new region map. After that clean our TSB entries.
14197 scdp = sfmmup->sfmmu_scdp;
14198 if (scdp != NULL &&
14199 SF_RGNMAP_TEST(scdp->scd_hmeregion_map, rid)) {
14200 sfmmu_leave_scd(sfmmup, r_type);
14201 ASSERT(sfmmu_hat_lock_held(sfmmup));
14203 sfmmu_invalidate_ctx(sfmmup);
14205 i = TTE8K;
14206 while (i < mmu_page_sizes) {
14207 if (rgnp->rgn_ttecnt[i] != 0) {
14208 sfmmu_unload_tsb_range(sfmmup, r_saddr,
14209 r_eaddr, i);
14210 if (i < TTE4M) {
14211 i = TTE4M;
14212 continue;
14213 } else {
14214 break;
14217 i++;
14219 /* Remove the preallocated 1/4 8k ttecnt for 4M regions. */
14220 if (r_pgszc >= TTE4M) {
14221 rttecnt = r_size >> (TTE_PAGE_SHIFT(TTE8K) + 2);
14222 ASSERT(sfmmup->sfmmu_tsb0_4minflcnt >=
14223 rttecnt);
14224 sfmmup->sfmmu_tsb0_4minflcnt -= rttecnt;
14227 /* update shme rgns ttecnt in sfmmu_ttecnt */
14228 rttecnt = r_size >> TTE_PAGE_SHIFT(r_pgszc);
14229 ASSERT(sfmmup->sfmmu_ttecnt[r_pgszc] >= rttecnt);
14230 atomic_add_long(&sfmmup->sfmmu_ttecnt[r_pgszc], -rttecnt);
14232 sfmmu_hat_exit(hatlockp);
14233 if (scdp != NULL && sfmmup->sfmmu_scdp == NULL) {
14234 /* sfmmup left the scd, grow private tsb */
14235 sfmmu_check_page_sizes(sfmmup, 1);
14236 } else {
14237 sfmmu_check_page_sizes(sfmmup, 0);
14241 if (r_type == SFMMU_REGION_HME) {
14242 sfmmu_unlink_from_hmeregion(sfmmup, rgnp);
14245 r_obj = rgnp->rgn_obj;
14246 if (atomic_dec_32_nv((volatile uint_t *)&rgnp->rgn_refcnt)) {
14247 return;
14251 * looks like nobody uses this region anymore. Free it.
14253 rhash = RGN_HASH_FUNCTION(r_obj);
14254 mutex_enter(&srdp->srd_mutex);
14255 for (prev_rgnpp = &srdp->srd_rgnhash[rhash];
14256 (cur_rgnp = *prev_rgnpp) != NULL;
14257 prev_rgnpp = &cur_rgnp->rgn_hash) {
14258 if (cur_rgnp == rgnp && cur_rgnp->rgn_refcnt == 0) {
14259 break;
14263 if (cur_rgnp == NULL) {
14264 mutex_exit(&srdp->srd_mutex);
14265 return;
14268 ASSERT((rgnp->rgn_flags & SFMMU_REGION_TYPE_MASK) == r_type);
14269 *prev_rgnpp = rgnp->rgn_hash;
14270 if (r_type == SFMMU_REGION_ISM) {
14271 rgnp->rgn_flags |= SFMMU_REGION_FREE;
14272 ASSERT(rid < srdp->srd_next_ismrid);
14273 rgnp->rgn_next = srdp->srd_ismrgnfree;
14274 srdp->srd_ismrgnfree = rgnp;
14275 ASSERT(srdp->srd_ismbusyrgns > 0);
14276 srdp->srd_ismbusyrgns--;
14277 mutex_exit(&srdp->srd_mutex);
14278 return;
14280 mutex_exit(&srdp->srd_mutex);
14283 * Destroy region's hmeblks.
14285 sfmmu_unload_hmeregion(srdp, rgnp);
14287 rgnp->rgn_hmeflags = 0;
14289 ASSERT(rgnp->rgn_sfmmu_head == NULL);
14290 ASSERT(rgnp->rgn_id == rid);
14291 for (i = 0; i < MMU_PAGE_SIZES; i++) {
14292 rgnp->rgn_ttecnt[i] = 0;
14294 rgnp->rgn_flags |= SFMMU_REGION_FREE;
14295 mutex_enter(&srdp->srd_mutex);
14296 ASSERT(rid < srdp->srd_next_hmerid);
14297 rgnp->rgn_next = srdp->srd_hmergnfree;
14298 srdp->srd_hmergnfree = rgnp;
14299 ASSERT(srdp->srd_hmebusyrgns > 0);
14300 srdp->srd_hmebusyrgns--;
14301 mutex_exit(&srdp->srd_mutex);
14305 * For now only called for hmeblk regions and not for ISM regions.
14307 void
14308 hat_dup_region(struct hat *sfmmup, hat_region_cookie_t rcookie)
14310 sf_srd_t *srdp = sfmmup->sfmmu_srdp;
14311 uint_t rid = (uint_t)((uint64_t)rcookie);
14312 sf_region_t *rgnp;
14313 sf_rgn_link_t *rlink;
14314 sf_rgn_link_t *hrlink;
14315 ulong_t rttecnt;
14317 ASSERT(sfmmup != ksfmmup);
14318 ASSERT(srdp != NULL);
14319 ASSERT(srdp->srd_refcnt > 0);
14321 ASSERT(rid < srdp->srd_next_hmerid);
14322 ASSERT(SFMMU_IS_SHMERID_VALID(rid));
14323 ASSERT(rid < SFMMU_MAX_HME_REGIONS);
14325 rgnp = srdp->srd_hmergnp[rid];
14326 ASSERT(rgnp->rgn_refcnt > 0);
14327 ASSERT(rgnp->rgn_id == rid);
14328 ASSERT((rgnp->rgn_flags & SFMMU_REGION_TYPE_MASK) == SFMMU_REGION_HME);
14329 ASSERT(!(rgnp->rgn_flags & SFMMU_REGION_FREE));
14331 atomic_inc_32((volatile uint_t *)&rgnp->rgn_refcnt);
14333 /* LINTED: constant in conditional context */
14334 SFMMU_HMERID2RLINKP(sfmmup, rid, rlink, 1, 0);
14335 ASSERT(rlink != NULL);
14336 mutex_enter(&rgnp->rgn_mutex);
14337 ASSERT(rgnp->rgn_sfmmu_head != NULL);
14338 /* LINTED: constant in conditional context */
14339 SFMMU_HMERID2RLINKP(rgnp->rgn_sfmmu_head, rid, hrlink, 0, 0);
14340 ASSERT(hrlink != NULL);
14341 ASSERT(hrlink->prev == NULL);
14342 rlink->next = rgnp->rgn_sfmmu_head;
14343 rlink->prev = NULL;
14344 hrlink->prev = sfmmup;
14346 * make sure rlink's next field is correct
14347 * before making this link visible.
14349 membar_stst();
14350 rgnp->rgn_sfmmu_head = sfmmup;
14351 mutex_exit(&rgnp->rgn_mutex);
14353 /* update sfmmu_ttecnt with the shme rgn ttecnt */
14354 rttecnt = rgnp->rgn_size >> TTE_PAGE_SHIFT(rgnp->rgn_pgszc);
14355 atomic_add_long(&sfmmup->sfmmu_ttecnt[rgnp->rgn_pgszc], rttecnt);
14356 /* update tsb0 inflation count */
14357 if (rgnp->rgn_pgszc >= TTE4M) {
14358 sfmmup->sfmmu_tsb0_4minflcnt +=
14359 rgnp->rgn_size >> (TTE_PAGE_SHIFT(TTE8K) + 2);
14362 * Update regionid bitmask without hat lock since no other thread
14363 * can update this region bitmask right now.
14365 SF_RGNMAP_ADD(sfmmup->sfmmu_hmeregion_map, rid);
14368 /* ARGSUSED */
14369 static int
14370 sfmmu_rgncache_constructor(void *buf, void *cdrarg, int kmflags)
14372 sf_region_t *rgnp = (sf_region_t *)buf;
14373 bzero(buf, sizeof (*rgnp));
14375 mutex_init(&rgnp->rgn_mutex, NULL, MUTEX_DEFAULT, NULL);
14377 return (0);
14380 /* ARGSUSED */
14381 static void
14382 sfmmu_rgncache_destructor(void *buf, void *cdrarg)
14384 sf_region_t *rgnp = (sf_region_t *)buf;
14385 mutex_destroy(&rgnp->rgn_mutex);
14388 static int
14389 sfrgnmap_isnull(sf_region_map_t *map)
14391 int i;
14393 for (i = 0; i < SFMMU_RGNMAP_WORDS; i++) {
14394 if (map->bitmap[i] != 0) {
14395 return (0);
14398 return (1);
14401 static int
14402 sfhmergnmap_isnull(sf_hmeregion_map_t *map)
14404 int i;
14406 for (i = 0; i < SFMMU_HMERGNMAP_WORDS; i++) {
14407 if (map->bitmap[i] != 0) {
14408 return (0);
14411 return (1);
14414 #ifdef DEBUG
14415 static void
14416 check_scd_sfmmu_list(sfmmu_t **headp, sfmmu_t *sfmmup, int onlist)
14418 sfmmu_t *sp;
14419 sf_srd_t *srdp = sfmmup->sfmmu_srdp;
14421 for (sp = *headp; sp != NULL; sp = sp->sfmmu_scd_link.next) {
14422 ASSERT(srdp == sp->sfmmu_srdp);
14423 if (sp == sfmmup) {
14424 if (onlist) {
14425 return;
14426 } else {
14427 panic("shctx: sfmmu 0x%p found on scd"
14428 "list 0x%p", (void *)sfmmup,
14429 (void *)*headp);
14433 if (onlist) {
14434 panic("shctx: sfmmu 0x%p not found on scd list 0x%p",
14435 (void *)sfmmup, (void *)*headp);
14436 } else {
14437 return;
14440 #else /* DEBUG */
14441 #define check_scd_sfmmu_list(headp, sfmmup, onlist)
14442 #endif /* DEBUG */
14445 * Removes an sfmmu from the SCD sfmmu list.
14447 static void
14448 sfmmu_from_scd_list(sfmmu_t **headp, sfmmu_t *sfmmup)
14450 ASSERT(sfmmup->sfmmu_srdp != NULL);
14451 check_scd_sfmmu_list(headp, sfmmup, 1);
14452 if (sfmmup->sfmmu_scd_link.prev != NULL) {
14453 ASSERT(*headp != sfmmup);
14454 sfmmup->sfmmu_scd_link.prev->sfmmu_scd_link.next =
14455 sfmmup->sfmmu_scd_link.next;
14456 } else {
14457 ASSERT(*headp == sfmmup);
14458 *headp = sfmmup->sfmmu_scd_link.next;
14460 if (sfmmup->sfmmu_scd_link.next != NULL) {
14461 sfmmup->sfmmu_scd_link.next->sfmmu_scd_link.prev =
14462 sfmmup->sfmmu_scd_link.prev;
14468 * Adds an sfmmu to the start of the queue.
14470 static void
14471 sfmmu_to_scd_list(sfmmu_t **headp, sfmmu_t *sfmmup)
14473 check_scd_sfmmu_list(headp, sfmmup, 0);
14474 sfmmup->sfmmu_scd_link.prev = NULL;
14475 sfmmup->sfmmu_scd_link.next = *headp;
14476 if (*headp != NULL)
14477 (*headp)->sfmmu_scd_link.prev = sfmmup;
14478 *headp = sfmmup;
14482 * Remove an scd from the start of the queue.
14484 static void
14485 sfmmu_remove_scd(sf_scd_t **headp, sf_scd_t *scdp)
14487 if (scdp->scd_prev != NULL) {
14488 ASSERT(*headp != scdp);
14489 scdp->scd_prev->scd_next = scdp->scd_next;
14490 } else {
14491 ASSERT(*headp == scdp);
14492 *headp = scdp->scd_next;
14495 if (scdp->scd_next != NULL) {
14496 scdp->scd_next->scd_prev = scdp->scd_prev;
14501 * Add an scd to the start of the queue.
14503 static void
14504 sfmmu_add_scd(sf_scd_t **headp, sf_scd_t *scdp)
14506 scdp->scd_prev = NULL;
14507 scdp->scd_next = *headp;
14508 if (*headp != NULL) {
14509 (*headp)->scd_prev = scdp;
14511 *headp = scdp;
14514 static int
14515 sfmmu_alloc_scd_tsbs(sf_srd_t *srdp, sf_scd_t *scdp)
14517 uint_t rid;
14518 uint_t i;
14519 uint_t j;
14520 ulong_t w;
14521 sf_region_t *rgnp;
14522 ulong_t tte8k_cnt = 0;
14523 ulong_t tte4m_cnt = 0;
14524 uint_t tsb_szc;
14525 sfmmu_t *scsfmmup = scdp->scd_sfmmup;
14526 sfmmu_t *ism_hatid;
14527 struct tsb_info *newtsb;
14528 int szc;
14530 ASSERT(srdp != NULL);
14532 for (i = 0; i < SFMMU_RGNMAP_WORDS; i++) {
14533 if ((w = scdp->scd_region_map.bitmap[i]) == 0) {
14534 continue;
14536 j = 0;
14537 while (w) {
14538 if (!(w & 0x1)) {
14539 j++;
14540 w >>= 1;
14541 continue;
14543 rid = (i << BT_ULSHIFT) | j;
14544 j++;
14545 w >>= 1;
14547 if (rid < SFMMU_MAX_HME_REGIONS) {
14548 rgnp = srdp->srd_hmergnp[rid];
14549 ASSERT(rgnp->rgn_id == rid);
14550 ASSERT(rgnp->rgn_refcnt > 0);
14552 if (rgnp->rgn_pgszc < TTE4M) {
14553 tte8k_cnt += rgnp->rgn_size >>
14554 TTE_PAGE_SHIFT(TTE8K);
14555 } else {
14556 ASSERT(rgnp->rgn_pgszc >= TTE4M);
14557 tte4m_cnt += rgnp->rgn_size >>
14558 TTE_PAGE_SHIFT(TTE4M);
14560 * Inflate SCD tsb0 by preallocating
14561 * 1/4 8k ttecnt for 4M regions to
14562 * allow for lgpg alloc failure.
14564 tte8k_cnt += rgnp->rgn_size >>
14565 (TTE_PAGE_SHIFT(TTE8K) + 2);
14567 } else {
14568 rid -= SFMMU_MAX_HME_REGIONS;
14569 rgnp = srdp->srd_ismrgnp[rid];
14570 ASSERT(rgnp->rgn_id == rid);
14571 ASSERT(rgnp->rgn_refcnt > 0);
14573 ism_hatid = (sfmmu_t *)rgnp->rgn_obj;
14574 ASSERT(ism_hatid->sfmmu_ismhat);
14576 for (szc = 0; szc < TTE4M; szc++) {
14577 tte8k_cnt +=
14578 ism_hatid->sfmmu_ttecnt[szc] <<
14579 TTE_BSZS_SHIFT(szc);
14582 ASSERT(rgnp->rgn_pgszc >= TTE4M);
14583 if (rgnp->rgn_pgszc >= TTE4M) {
14584 tte4m_cnt += rgnp->rgn_size >>
14585 TTE_PAGE_SHIFT(TTE4M);
14591 tsb_szc = SELECT_TSB_SIZECODE(tte8k_cnt);
14593 /* Allocate both the SCD TSBs here. */
14594 if (sfmmu_tsbinfo_alloc(&scsfmmup->sfmmu_tsb,
14595 tsb_szc, TSB8K|TSB64K|TSB512K, TSB_ALLOC, scsfmmup) &&
14596 (tsb_szc <= TSB_4M_SZCODE ||
14597 sfmmu_tsbinfo_alloc(&scsfmmup->sfmmu_tsb,
14598 TSB_4M_SZCODE, TSB8K|TSB64K|TSB512K,
14599 TSB_ALLOC, scsfmmup))) {
14601 SFMMU_STAT(sf_scd_1sttsb_allocfail);
14602 return (TSB_ALLOCFAIL);
14603 } else {
14604 scsfmmup->sfmmu_tsb->tsb_flags |= TSB_SHAREDCTX;
14606 if (tte4m_cnt) {
14607 tsb_szc = SELECT_TSB_SIZECODE(tte4m_cnt);
14608 if (sfmmu_tsbinfo_alloc(&newtsb, tsb_szc,
14609 TSB4M|TSB32M|TSB256M, TSB_ALLOC, scsfmmup) &&
14610 (tsb_szc <= TSB_4M_SZCODE ||
14611 sfmmu_tsbinfo_alloc(&newtsb, TSB_4M_SZCODE,
14612 TSB4M|TSB32M|TSB256M,
14613 TSB_ALLOC, scsfmmup))) {
14615 * If we fail to allocate the 2nd shared tsb,
14616 * just free the 1st tsb, return failure.
14618 sfmmu_tsbinfo_free(scsfmmup->sfmmu_tsb);
14619 SFMMU_STAT(sf_scd_2ndtsb_allocfail);
14620 return (TSB_ALLOCFAIL);
14621 } else {
14622 ASSERT(scsfmmup->sfmmu_tsb->tsb_next == NULL);
14623 newtsb->tsb_flags |= TSB_SHAREDCTX;
14624 scsfmmup->sfmmu_tsb->tsb_next = newtsb;
14625 SFMMU_STAT(sf_scd_2ndtsb_alloc);
14628 SFMMU_STAT(sf_scd_1sttsb_alloc);
14630 return (TSB_SUCCESS);
14633 static void
14634 sfmmu_free_scd_tsbs(sfmmu_t *scd_sfmmu)
14636 while (scd_sfmmu->sfmmu_tsb != NULL) {
14637 struct tsb_info *next = scd_sfmmu->sfmmu_tsb->tsb_next;
14638 sfmmu_tsbinfo_free(scd_sfmmu->sfmmu_tsb);
14639 scd_sfmmu->sfmmu_tsb = next;
14644 * Link the sfmmu onto the hme region list.
14646 void
14647 sfmmu_link_to_hmeregion(sfmmu_t *sfmmup, sf_region_t *rgnp)
14649 uint_t rid;
14650 sf_rgn_link_t *rlink;
14651 sfmmu_t *head;
14652 sf_rgn_link_t *hrlink;
14654 rid = rgnp->rgn_id;
14655 ASSERT(SFMMU_IS_SHMERID_VALID(rid));
14657 /* LINTED: constant in conditional context */
14658 SFMMU_HMERID2RLINKP(sfmmup, rid, rlink, 1, 1);
14659 ASSERT(rlink != NULL);
14660 mutex_enter(&rgnp->rgn_mutex);
14661 if ((head = rgnp->rgn_sfmmu_head) == NULL) {
14662 rlink->next = NULL;
14663 rlink->prev = NULL;
14665 * make sure rlink's next field is NULL
14666 * before making this link visible.
14668 membar_stst();
14669 rgnp->rgn_sfmmu_head = sfmmup;
14670 } else {
14671 /* LINTED: constant in conditional context */
14672 SFMMU_HMERID2RLINKP(head, rid, hrlink, 0, 0);
14673 ASSERT(hrlink != NULL);
14674 ASSERT(hrlink->prev == NULL);
14675 rlink->next = head;
14676 rlink->prev = NULL;
14677 hrlink->prev = sfmmup;
14679 * make sure rlink's next field is correct
14680 * before making this link visible.
14682 membar_stst();
14683 rgnp->rgn_sfmmu_head = sfmmup;
14685 mutex_exit(&rgnp->rgn_mutex);
14689 * Unlink the sfmmu from the hme region list.
14691 void
14692 sfmmu_unlink_from_hmeregion(sfmmu_t *sfmmup, sf_region_t *rgnp)
14694 uint_t rid;
14695 sf_rgn_link_t *rlink;
14697 rid = rgnp->rgn_id;
14698 ASSERT(SFMMU_IS_SHMERID_VALID(rid));
14700 /* LINTED: constant in conditional context */
14701 SFMMU_HMERID2RLINKP(sfmmup, rid, rlink, 0, 0);
14702 ASSERT(rlink != NULL);
14703 mutex_enter(&rgnp->rgn_mutex);
14704 if (rgnp->rgn_sfmmu_head == sfmmup) {
14705 sfmmu_t *next = rlink->next;
14706 rgnp->rgn_sfmmu_head = next;
14708 * if we are stopped by xc_attention() after this
14709 * point the forward link walking in
14710 * sfmmu_rgntlb_demap() will work correctly since the
14711 * head correctly points to the next element.
14713 membar_stst();
14714 rlink->next = NULL;
14715 ASSERT(rlink->prev == NULL);
14716 if (next != NULL) {
14717 sf_rgn_link_t *nrlink;
14718 /* LINTED: constant in conditional context */
14719 SFMMU_HMERID2RLINKP(next, rid, nrlink, 0, 0);
14720 ASSERT(nrlink != NULL);
14721 ASSERT(nrlink->prev == sfmmup);
14722 nrlink->prev = NULL;
14724 } else {
14725 sfmmu_t *next = rlink->next;
14726 sfmmu_t *prev = rlink->prev;
14727 sf_rgn_link_t *prlink;
14729 ASSERT(prev != NULL);
14730 /* LINTED: constant in conditional context */
14731 SFMMU_HMERID2RLINKP(prev, rid, prlink, 0, 0);
14732 ASSERT(prlink != NULL);
14733 ASSERT(prlink->next == sfmmup);
14734 prlink->next = next;
14736 * if we are stopped by xc_attention()
14737 * after this point the forward link walking
14738 * will work correctly since the prev element
14739 * correctly points to the next element.
14741 membar_stst();
14742 rlink->next = NULL;
14743 rlink->prev = NULL;
14744 if (next != NULL) {
14745 sf_rgn_link_t *nrlink;
14746 /* LINTED: constant in conditional context */
14747 SFMMU_HMERID2RLINKP(next, rid, nrlink, 0, 0);
14748 ASSERT(nrlink != NULL);
14749 ASSERT(nrlink->prev == sfmmup);
14750 nrlink->prev = prev;
14753 mutex_exit(&rgnp->rgn_mutex);
14757 * Link scd sfmmu onto ism or hme region list for each region in the
14758 * scd region map.
14760 void
14761 sfmmu_link_scd_to_regions(sf_srd_t *srdp, sf_scd_t *scdp)
14763 uint_t rid;
14764 uint_t i;
14765 uint_t j;
14766 ulong_t w;
14767 sf_region_t *rgnp;
14768 sfmmu_t *scsfmmup;
14770 scsfmmup = scdp->scd_sfmmup;
14771 ASSERT(scsfmmup->sfmmu_scdhat);
14772 for (i = 0; i < SFMMU_RGNMAP_WORDS; i++) {
14773 if ((w = scdp->scd_region_map.bitmap[i]) == 0) {
14774 continue;
14776 j = 0;
14777 while (w) {
14778 if (!(w & 0x1)) {
14779 j++;
14780 w >>= 1;
14781 continue;
14783 rid = (i << BT_ULSHIFT) | j;
14784 j++;
14785 w >>= 1;
14787 if (rid < SFMMU_MAX_HME_REGIONS) {
14788 rgnp = srdp->srd_hmergnp[rid];
14789 ASSERT(rgnp->rgn_id == rid);
14790 ASSERT(rgnp->rgn_refcnt > 0);
14791 sfmmu_link_to_hmeregion(scsfmmup, rgnp);
14792 } else {
14793 sfmmu_t *ism_hatid = NULL;
14794 ism_ment_t *ism_ment;
14795 rid -= SFMMU_MAX_HME_REGIONS;
14796 rgnp = srdp->srd_ismrgnp[rid];
14797 ASSERT(rgnp->rgn_id == rid);
14798 ASSERT(rgnp->rgn_refcnt > 0);
14800 ism_hatid = (sfmmu_t *)rgnp->rgn_obj;
14801 ASSERT(ism_hatid->sfmmu_ismhat);
14802 ism_ment = &scdp->scd_ism_links[rid];
14803 ism_ment->iment_hat = scsfmmup;
14804 ism_ment->iment_base_va = rgnp->rgn_saddr;
14805 mutex_enter(&ism_mlist_lock);
14806 iment_add(ism_ment, ism_hatid);
14807 mutex_exit(&ism_mlist_lock);
14814 * Unlink scd sfmmu from ism or hme region list for each region in the
14815 * scd region map.
14817 void
14818 sfmmu_unlink_scd_from_regions(sf_srd_t *srdp, sf_scd_t *scdp)
14820 uint_t rid;
14821 uint_t i;
14822 uint_t j;
14823 ulong_t w;
14824 sf_region_t *rgnp;
14825 sfmmu_t *scsfmmup;
14827 scsfmmup = scdp->scd_sfmmup;
14828 for (i = 0; i < SFMMU_RGNMAP_WORDS; i++) {
14829 if ((w = scdp->scd_region_map.bitmap[i]) == 0) {
14830 continue;
14832 j = 0;
14833 while (w) {
14834 if (!(w & 0x1)) {
14835 j++;
14836 w >>= 1;
14837 continue;
14839 rid = (i << BT_ULSHIFT) | j;
14840 j++;
14841 w >>= 1;
14843 if (rid < SFMMU_MAX_HME_REGIONS) {
14844 rgnp = srdp->srd_hmergnp[rid];
14845 ASSERT(rgnp->rgn_id == rid);
14846 ASSERT(rgnp->rgn_refcnt > 0);
14847 sfmmu_unlink_from_hmeregion(scsfmmup,
14848 rgnp);
14850 } else {
14851 sfmmu_t *ism_hatid = NULL;
14852 ism_ment_t *ism_ment;
14853 rid -= SFMMU_MAX_HME_REGIONS;
14854 rgnp = srdp->srd_ismrgnp[rid];
14855 ASSERT(rgnp->rgn_id == rid);
14856 ASSERT(rgnp->rgn_refcnt > 0);
14858 ism_hatid = (sfmmu_t *)rgnp->rgn_obj;
14859 ASSERT(ism_hatid->sfmmu_ismhat);
14860 ism_ment = &scdp->scd_ism_links[rid];
14861 ASSERT(ism_ment->iment_hat == scdp->scd_sfmmup);
14862 ASSERT(ism_ment->iment_base_va ==
14863 rgnp->rgn_saddr);
14864 mutex_enter(&ism_mlist_lock);
14865 iment_sub(ism_ment, ism_hatid);
14866 mutex_exit(&ism_mlist_lock);
14873 * Allocates and initialises a new SCD structure, this is called with
14874 * the srd_scd_mutex held and returns with the reference count
14875 * initialised to 1.
14877 static sf_scd_t *
14878 sfmmu_alloc_scd(sf_srd_t *srdp, sf_region_map_t *new_map)
14880 sf_scd_t *new_scdp;
14881 sfmmu_t *scsfmmup;
14882 int i;
14884 ASSERT(MUTEX_HELD(&srdp->srd_scd_mutex));
14885 new_scdp = kmem_cache_alloc(scd_cache, KM_SLEEP);
14887 scsfmmup = kmem_cache_alloc(sfmmuid_cache, KM_SLEEP);
14888 new_scdp->scd_sfmmup = scsfmmup;
14889 scsfmmup->sfmmu_srdp = srdp;
14890 scsfmmup->sfmmu_scdp = new_scdp;
14891 scsfmmup->sfmmu_tsb0_4minflcnt = 0;
14892 scsfmmup->sfmmu_scdhat = 1;
14893 CPUSET_ALL(scsfmmup->sfmmu_cpusran);
14894 bzero(scsfmmup->sfmmu_hmeregion_links, SFMMU_L1_HMERLINKS_SIZE);
14896 ASSERT(max_mmu_ctxdoms > 0);
14897 for (i = 0; i < max_mmu_ctxdoms; i++) {
14898 scsfmmup->sfmmu_ctxs[i].cnum = INVALID_CONTEXT;
14899 scsfmmup->sfmmu_ctxs[i].gnum = 0;
14902 for (i = 0; i < MMU_PAGE_SIZES; i++) {
14903 new_scdp->scd_rttecnt[i] = 0;
14906 new_scdp->scd_region_map = *new_map;
14907 new_scdp->scd_refcnt = 1;
14908 if (sfmmu_alloc_scd_tsbs(srdp, new_scdp) != TSB_SUCCESS) {
14909 kmem_cache_free(scd_cache, new_scdp);
14910 kmem_cache_free(sfmmuid_cache, scsfmmup);
14911 return (NULL);
14913 if (&mmu_init_scd) {
14914 mmu_init_scd(new_scdp);
14916 return (new_scdp);
14920 * The first phase of a process joining an SCD. The hat structure is
14921 * linked to the SCD queue and then the HAT_JOIN_SCD sfmmu flag is set
14922 * and a cross-call with context invalidation is used to cause the
14923 * remaining work to be carried out in the sfmmu_tsbmiss_exception()
14924 * routine.
14926 static void
14927 sfmmu_join_scd(sf_scd_t *scdp, sfmmu_t *sfmmup)
14929 hatlock_t *hatlockp;
14930 sf_srd_t *srdp = sfmmup->sfmmu_srdp;
14931 int i;
14932 sf_scd_t *old_scdp;
14934 ASSERT(srdp != NULL);
14935 ASSERT(scdp != NULL);
14936 ASSERT(scdp->scd_refcnt > 0);
14937 ASSERT(AS_WRITE_HELD(sfmmup->sfmmu_as));
14939 if ((old_scdp = sfmmup->sfmmu_scdp) != NULL) {
14940 ASSERT(old_scdp != scdp);
14942 mutex_enter(&old_scdp->scd_mutex);
14943 sfmmu_from_scd_list(&old_scdp->scd_sf_list, sfmmup);
14944 mutex_exit(&old_scdp->scd_mutex);
14946 * sfmmup leaves the old scd. Update sfmmu_ttecnt to
14947 * include the shme rgn ttecnt for rgns that
14948 * were in the old SCD
14950 for (i = 0; i < mmu_page_sizes; i++) {
14951 ASSERT(sfmmup->sfmmu_scdrttecnt[i] ==
14952 old_scdp->scd_rttecnt[i]);
14953 atomic_add_long(&sfmmup->sfmmu_ttecnt[i],
14954 sfmmup->sfmmu_scdrttecnt[i]);
14959 * Move sfmmu to the scd lists.
14961 mutex_enter(&scdp->scd_mutex);
14962 sfmmu_to_scd_list(&scdp->scd_sf_list, sfmmup);
14963 mutex_exit(&scdp->scd_mutex);
14964 SF_SCD_INCR_REF(scdp);
14966 hatlockp = sfmmu_hat_enter(sfmmup);
14968 * For a multi-thread process, we must stop
14969 * all the other threads before joining the scd.
14972 SFMMU_FLAGS_SET(sfmmup, HAT_JOIN_SCD);
14974 sfmmu_invalidate_ctx(sfmmup);
14975 sfmmup->sfmmu_scdp = scdp;
14978 * Copy scd_rttecnt into sfmmup's sfmmu_scdrttecnt, and update
14979 * sfmmu_ttecnt to not include the rgn ttecnt just joined in SCD.
14981 for (i = 0; i < mmu_page_sizes; i++) {
14982 sfmmup->sfmmu_scdrttecnt[i] = scdp->scd_rttecnt[i];
14983 ASSERT(sfmmup->sfmmu_ttecnt[i] >= scdp->scd_rttecnt[i]);
14984 atomic_add_long(&sfmmup->sfmmu_ttecnt[i],
14985 -sfmmup->sfmmu_scdrttecnt[i]);
14987 /* update tsb0 inflation count */
14988 if (old_scdp != NULL) {
14989 sfmmup->sfmmu_tsb0_4minflcnt +=
14990 old_scdp->scd_sfmmup->sfmmu_tsb0_4minflcnt;
14992 ASSERT(sfmmup->sfmmu_tsb0_4minflcnt >=
14993 scdp->scd_sfmmup->sfmmu_tsb0_4minflcnt);
14994 sfmmup->sfmmu_tsb0_4minflcnt -= scdp->scd_sfmmup->sfmmu_tsb0_4minflcnt;
14996 sfmmu_hat_exit(hatlockp);
14998 if (old_scdp != NULL) {
14999 SF_SCD_DECR_REF(srdp, old_scdp);
15005 * This routine is called by a process to become part of an SCD. It is called
15006 * from sfmmu_tsbmiss_exception() once most of the initial work has been
15007 * done by sfmmu_join_scd(). This routine must not drop the hat lock.
15009 static void
15010 sfmmu_finish_join_scd(sfmmu_t *sfmmup)
15012 struct tsb_info *tsbinfop;
15014 ASSERT(sfmmu_hat_lock_held(sfmmup));
15015 ASSERT(sfmmup->sfmmu_scdp != NULL);
15016 ASSERT(SFMMU_FLAGS_ISSET(sfmmup, HAT_JOIN_SCD));
15017 ASSERT(!SFMMU_FLAGS_ISSET(sfmmup, HAT_ISMBUSY));
15018 ASSERT(SFMMU_FLAGS_ISSET(sfmmup, HAT_ALLCTX_INVALID));
15020 for (tsbinfop = sfmmup->sfmmu_tsb; tsbinfop != NULL;
15021 tsbinfop = tsbinfop->tsb_next) {
15022 if (tsbinfop->tsb_flags & TSB_SWAPPED) {
15023 continue;
15025 ASSERT(!(tsbinfop->tsb_flags & TSB_RELOC_FLAG));
15027 sfmmu_inv_tsb(tsbinfop->tsb_va,
15028 TSB_BYTES(tsbinfop->tsb_szc));
15031 /* Set HAT_CTX1_FLAG for all SCD ISMs */
15032 sfmmu_ism_hatflags(sfmmup, 1);
15034 SFMMU_STAT(sf_join_scd);
15038 * This routine is called in order to check if there is an SCD which matches
15039 * the process's region map if not then a new SCD may be created.
15041 static void
15042 sfmmu_find_scd(sfmmu_t *sfmmup)
15044 sf_srd_t *srdp = sfmmup->sfmmu_srdp;
15045 sf_scd_t *scdp, *new_scdp;
15046 int ret;
15048 ASSERT(srdp != NULL);
15049 ASSERT(AS_WRITE_HELD(sfmmup->sfmmu_as));
15051 mutex_enter(&srdp->srd_scd_mutex);
15052 for (scdp = srdp->srd_scdp; scdp != NULL;
15053 scdp = scdp->scd_next) {
15054 SF_RGNMAP_EQUAL(&scdp->scd_region_map,
15055 &sfmmup->sfmmu_region_map, ret);
15056 if (ret == 1) {
15057 SF_SCD_INCR_REF(scdp);
15058 mutex_exit(&srdp->srd_scd_mutex);
15059 sfmmu_join_scd(scdp, sfmmup);
15060 ASSERT(scdp->scd_refcnt >= 2);
15061 atomic_dec_32((volatile uint32_t *)&scdp->scd_refcnt);
15062 return;
15063 } else {
15065 * If the sfmmu region map is a subset of the scd
15066 * region map, then the assumption is that this process
15067 * will continue attaching to ISM segments until the
15068 * region maps are equal.
15070 SF_RGNMAP_IS_SUBSET(&scdp->scd_region_map,
15071 &sfmmup->sfmmu_region_map, ret);
15072 if (ret == 1) {
15073 mutex_exit(&srdp->srd_scd_mutex);
15074 return;
15079 ASSERT(scdp == NULL);
15081 * No matching SCD has been found, create a new one.
15083 if ((new_scdp = sfmmu_alloc_scd(srdp, &sfmmup->sfmmu_region_map)) ==
15084 NULL) {
15085 mutex_exit(&srdp->srd_scd_mutex);
15086 return;
15090 * sfmmu_alloc_scd() returns with a ref count of 1 on the scd.
15093 /* Set scd_rttecnt for shme rgns in SCD */
15094 sfmmu_set_scd_rttecnt(srdp, new_scdp);
15097 * Link scd onto srd_scdp list and scd sfmmu onto region/iment lists.
15099 sfmmu_link_scd_to_regions(srdp, new_scdp);
15100 sfmmu_add_scd(&srdp->srd_scdp, new_scdp);
15101 SFMMU_STAT_ADD(sf_create_scd, 1);
15103 mutex_exit(&srdp->srd_scd_mutex);
15104 sfmmu_join_scd(new_scdp, sfmmup);
15105 ASSERT(new_scdp->scd_refcnt >= 2);
15106 atomic_dec_32((volatile uint32_t *)&new_scdp->scd_refcnt);
15110 * This routine is called by a process to remove itself from an SCD. It is
15111 * either called when the processes has detached from a segment or from
15112 * hat_free_start() as a result of calling exit.
15114 static void
15115 sfmmu_leave_scd(sfmmu_t *sfmmup, uchar_t r_type)
15117 sf_scd_t *scdp = sfmmup->sfmmu_scdp;
15118 sf_srd_t *srdp = sfmmup->sfmmu_srdp;
15119 hatlock_t *hatlockp = TSB_HASH(sfmmup);
15120 int i;
15122 ASSERT(scdp != NULL);
15123 ASSERT(srdp != NULL);
15125 if (sfmmup->sfmmu_free) {
15127 * If the process is part of an SCD the sfmmu is unlinked
15128 * from scd_sf_list.
15130 mutex_enter(&scdp->scd_mutex);
15131 sfmmu_from_scd_list(&scdp->scd_sf_list, sfmmup);
15132 mutex_exit(&scdp->scd_mutex);
15134 * Update sfmmu_ttecnt to include the rgn ttecnt for rgns that
15135 * are about to leave the SCD
15137 for (i = 0; i < mmu_page_sizes; i++) {
15138 ASSERT(sfmmup->sfmmu_scdrttecnt[i] ==
15139 scdp->scd_rttecnt[i]);
15140 atomic_add_long(&sfmmup->sfmmu_ttecnt[i],
15141 sfmmup->sfmmu_scdrttecnt[i]);
15142 sfmmup->sfmmu_scdrttecnt[i] = 0;
15144 sfmmup->sfmmu_scdp = NULL;
15146 SF_SCD_DECR_REF(srdp, scdp);
15147 return;
15150 ASSERT(r_type != SFMMU_REGION_ISM ||
15151 SFMMU_FLAGS_ISSET(sfmmup, HAT_ISMBUSY));
15152 ASSERT(scdp->scd_refcnt);
15153 ASSERT(!sfmmup->sfmmu_free);
15154 ASSERT(sfmmu_hat_lock_held(sfmmup));
15155 ASSERT(AS_LOCK_HELD(sfmmup->sfmmu_as));
15158 * Wait for ISM maps to be updated.
15160 if (r_type != SFMMU_REGION_ISM) {
15161 while (SFMMU_FLAGS_ISSET(sfmmup, HAT_ISMBUSY) &&
15162 sfmmup->sfmmu_scdp != NULL) {
15163 cv_wait(&sfmmup->sfmmu_tsb_cv,
15164 HATLOCK_MUTEXP(hatlockp));
15167 if (sfmmup->sfmmu_scdp == NULL) {
15168 sfmmu_hat_exit(hatlockp);
15169 return;
15171 SFMMU_FLAGS_SET(sfmmup, HAT_ISMBUSY);
15174 if (SFMMU_FLAGS_ISSET(sfmmup, HAT_JOIN_SCD)) {
15175 SFMMU_FLAGS_CLEAR(sfmmup, HAT_JOIN_SCD);
15177 * Since HAT_JOIN_SCD was set our context
15178 * is still invalid.
15180 } else {
15182 * For a multi-thread process, we must stop
15183 * all the other threads before leaving the scd.
15186 sfmmu_invalidate_ctx(sfmmup);
15189 /* Clear all the rid's for ISM, delete flags, etc */
15190 ASSERT(SFMMU_FLAGS_ISSET(sfmmup, HAT_ISMBUSY));
15191 sfmmu_ism_hatflags(sfmmup, 0);
15194 * Update sfmmu_ttecnt to include the rgn ttecnt for rgns that
15195 * are in SCD before this sfmmup leaves the SCD.
15197 for (i = 0; i < mmu_page_sizes; i++) {
15198 ASSERT(sfmmup->sfmmu_scdrttecnt[i] ==
15199 scdp->scd_rttecnt[i]);
15200 atomic_add_long(&sfmmup->sfmmu_ttecnt[i],
15201 sfmmup->sfmmu_scdrttecnt[i]);
15202 sfmmup->sfmmu_scdrttecnt[i] = 0;
15203 /* update ismttecnt to include SCD ism before hat leaves SCD */
15204 sfmmup->sfmmu_ismttecnt[i] += sfmmup->sfmmu_scdismttecnt[i];
15205 sfmmup->sfmmu_scdismttecnt[i] = 0;
15207 /* update tsb0 inflation count */
15208 sfmmup->sfmmu_tsb0_4minflcnt += scdp->scd_sfmmup->sfmmu_tsb0_4minflcnt;
15210 if (r_type != SFMMU_REGION_ISM) {
15211 SFMMU_FLAGS_CLEAR(sfmmup, HAT_ISMBUSY);
15213 sfmmup->sfmmu_scdp = NULL;
15215 sfmmu_hat_exit(hatlockp);
15218 * Unlink sfmmu from scd_sf_list this can be done without holding
15219 * the hat lock as we hold the sfmmu_as lock which prevents
15220 * hat_join_region from adding this thread to the scd again. Other
15221 * threads check if sfmmu_scdp is NULL under hat lock and if it's NULL
15222 * they won't get here, since sfmmu_leave_scd() clears sfmmu_scdp
15223 * while holding the hat lock.
15225 mutex_enter(&scdp->scd_mutex);
15226 sfmmu_from_scd_list(&scdp->scd_sf_list, sfmmup);
15227 mutex_exit(&scdp->scd_mutex);
15228 SFMMU_STAT(sf_leave_scd);
15230 SF_SCD_DECR_REF(srdp, scdp);
15231 hatlockp = sfmmu_hat_enter(sfmmup);
15236 * Unlink and free up an SCD structure with a reference count of 0.
15238 static void
15239 sfmmu_destroy_scd(sf_srd_t *srdp, sf_scd_t *scdp, sf_region_map_t *scd_rmap)
15241 sfmmu_t *scsfmmup;
15242 sf_scd_t *sp;
15243 hatlock_t *shatlockp;
15244 int i, ret;
15246 mutex_enter(&srdp->srd_scd_mutex);
15247 for (sp = srdp->srd_scdp; sp != NULL; sp = sp->scd_next) {
15248 if (sp == scdp)
15249 break;
15251 if (sp == NULL || sp->scd_refcnt) {
15252 mutex_exit(&srdp->srd_scd_mutex);
15253 return;
15257 * It is possible that the scd has been freed and reallocated with a
15258 * different region map while we've been waiting for the srd_scd_mutex.
15260 SF_RGNMAP_EQUAL(scd_rmap, &sp->scd_region_map, ret);
15261 if (ret != 1) {
15262 mutex_exit(&srdp->srd_scd_mutex);
15263 return;
15266 ASSERT(scdp->scd_sf_list == NULL);
15268 * Unlink scd from srd_scdp list.
15270 sfmmu_remove_scd(&srdp->srd_scdp, scdp);
15271 mutex_exit(&srdp->srd_scd_mutex);
15273 sfmmu_unlink_scd_from_regions(srdp, scdp);
15275 /* Clear shared context tsb and release ctx */
15276 scsfmmup = scdp->scd_sfmmup;
15279 * create a barrier so that scd will not be destroyed
15280 * if other thread still holds the same shared hat lock.
15281 * E.g., sfmmu_tsbmiss_exception() needs to acquire the
15282 * shared hat lock before checking the shared tsb reloc flag.
15284 shatlockp = sfmmu_hat_enter(scsfmmup);
15285 sfmmu_hat_exit(shatlockp);
15287 sfmmu_free_scd_tsbs(scsfmmup);
15289 for (i = 0; i < SFMMU_L1_HMERLINKS; i++) {
15290 if (scsfmmup->sfmmu_hmeregion_links[i] != NULL) {
15291 kmem_free(scsfmmup->sfmmu_hmeregion_links[i],
15292 SFMMU_L2_HMERLINKS_SIZE);
15293 scsfmmup->sfmmu_hmeregion_links[i] = NULL;
15296 kmem_cache_free(sfmmuid_cache, scsfmmup);
15297 kmem_cache_free(scd_cache, scdp);
15298 SFMMU_STAT(sf_destroy_scd);
15302 * Modifies the HAT_CTX1_FLAG for each of the ISM segments which correspond to
15303 * bits which are set in the ism_region_map parameter. This flag indicates to
15304 * the tsbmiss handler that mapping for these segments should be loaded using
15305 * the shared context.
15307 static void
15308 sfmmu_ism_hatflags(sfmmu_t *sfmmup, int addflag)
15310 sf_scd_t *scdp = sfmmup->sfmmu_scdp;
15311 ism_blk_t *ism_blkp;
15312 ism_map_t *ism_map;
15313 int i, rid;
15315 ASSERT(sfmmup->sfmmu_iblk != NULL);
15316 ASSERT(scdp != NULL);
15318 * Note that the caller either set HAT_ISMBUSY flag or checked
15319 * under hat lock that HAT_ISMBUSY was not set by another thread.
15321 ASSERT(sfmmu_hat_lock_held(sfmmup));
15323 ism_blkp = sfmmup->sfmmu_iblk;
15324 while (ism_blkp != NULL) {
15325 ism_map = ism_blkp->iblk_maps;
15326 for (i = 0; ism_map[i].imap_ismhat && i < ISM_MAP_SLOTS; i++) {
15327 rid = ism_map[i].imap_rid;
15328 if (rid == SFMMU_INVALID_ISMRID) {
15329 continue;
15331 ASSERT(rid >= 0 && rid < SFMMU_MAX_ISM_REGIONS);
15332 if (SF_RGNMAP_TEST(scdp->scd_ismregion_map, rid) &&
15333 addflag) {
15334 ism_map[i].imap_hatflags |=
15335 HAT_CTX1_FLAG;
15336 } else {
15337 ism_map[i].imap_hatflags &=
15338 ~HAT_CTX1_FLAG;
15341 ism_blkp = ism_blkp->iblk_next;
15345 static int
15346 sfmmu_srd_lock_held(sf_srd_t *srdp)
15348 return (MUTEX_HELD(&srdp->srd_mutex));
15351 /* ARGSUSED */
15352 static int
15353 sfmmu_scdcache_constructor(void *buf, void *cdrarg, int kmflags)
15355 sf_scd_t *scdp = (sf_scd_t *)buf;
15357 bzero(buf, sizeof (sf_scd_t));
15358 mutex_init(&scdp->scd_mutex, NULL, MUTEX_DEFAULT, NULL);
15359 return (0);
15362 /* ARGSUSED */
15363 static void
15364 sfmmu_scdcache_destructor(void *buf, void *cdrarg)
15366 sf_scd_t *scdp = (sf_scd_t *)buf;
15368 mutex_destroy(&scdp->scd_mutex);
15372 * The listp parameter is a pointer to a list of hmeblks which are partially
15373 * freed as result of calling sfmmu_hblk_hash_rm(), the last phase of the
15374 * freeing process is to cross-call all cpus to ensure that there are no
15375 * remaining cached references.
15377 * If the local generation number is less than the global then we can free
15378 * hmeblks which are already on the pending queue as another cpu has completed
15379 * the cross-call.
15381 * We cross-call to make sure that there are no threads on other cpus accessing
15382 * these hmblks and then complete the process of freeing them under the
15383 * following conditions:
15384 * The total number of pending hmeblks is greater than the threshold
15385 * The reserve list has fewer than HBLK_RESERVE_CNT hmeblks
15386 * It is at least 1 second since the last time we cross-called
15388 * Otherwise, we add the hmeblks to the per-cpu pending queue.
15390 static void
15391 sfmmu_hblks_list_purge(struct hme_blk **listp, int dontfree)
15393 struct hme_blk *hblkp, *pr_hblkp = NULL;
15394 int count = 0;
15395 cpuset_t cpuset = cpu_ready_set;
15396 cpu_hme_pend_t *cpuhp;
15397 timestruc_t now;
15398 int one_second_expired = 0;
15400 gethrestime_lasttick(&now);
15402 for (hblkp = *listp; hblkp != NULL; hblkp = hblkp->hblk_next) {
15403 ASSERT(hblkp->hblk_shw_bit == 0);
15404 ASSERT(hblkp->hblk_shared == 0);
15405 count++;
15406 pr_hblkp = hblkp;
15409 cpuhp = &cpu_hme_pend[CPU->cpu_seqid];
15410 mutex_enter(&cpuhp->chp_mutex);
15412 if ((cpuhp->chp_count + count) == 0) {
15413 mutex_exit(&cpuhp->chp_mutex);
15414 return;
15417 if ((now.tv_sec - cpuhp->chp_timestamp) > 1) {
15418 one_second_expired = 1;
15421 if (!dontfree && (freehblkcnt < HBLK_RESERVE_CNT ||
15422 (cpuhp->chp_count + count) > cpu_hme_pend_thresh ||
15423 one_second_expired)) {
15424 /* Append global list to local */
15425 if (pr_hblkp == NULL) {
15426 *listp = cpuhp->chp_listp;
15427 } else {
15428 pr_hblkp->hblk_next = cpuhp->chp_listp;
15430 cpuhp->chp_listp = NULL;
15431 cpuhp->chp_count = 0;
15432 cpuhp->chp_timestamp = now.tv_sec;
15433 mutex_exit(&cpuhp->chp_mutex);
15435 kpreempt_disable();
15436 CPUSET_DEL(cpuset, CPU->cpu_id);
15437 xt_sync(cpuset);
15438 xt_sync(cpuset);
15439 kpreempt_enable();
15442 * At this stage we know that no trap handlers on other
15443 * cpus can have references to hmeblks on the list.
15445 sfmmu_hblk_free(listp);
15446 } else if (*listp != NULL) {
15447 pr_hblkp->hblk_next = cpuhp->chp_listp;
15448 cpuhp->chp_listp = *listp;
15449 cpuhp->chp_count += count;
15450 *listp = NULL;
15451 mutex_exit(&cpuhp->chp_mutex);
15452 } else {
15453 mutex_exit(&cpuhp->chp_mutex);
15458 * Add an hmeblk to the the hash list.
15460 void
15461 sfmmu_hblk_hash_add(struct hmehash_bucket *hmebp, struct hme_blk *hmeblkp,
15462 uint64_t hblkpa)
15464 ASSERT(SFMMU_HASH_LOCK_ISHELD(hmebp));
15465 #ifdef DEBUG
15466 if (hmebp->hmeblkp == NULL) {
15467 ASSERT(hmebp->hmeh_nextpa == HMEBLK_ENDPA);
15469 #endif /* DEBUG */
15471 hmeblkp->hblk_nextpa = hmebp->hmeh_nextpa;
15473 * Since the TSB miss handler now does not lock the hash chain before
15474 * walking it, make sure that the hmeblks nextpa is globally visible
15475 * before we make the hmeblk globally visible by updating the chain root
15476 * pointer in the hash bucket.
15478 membar_producer();
15479 hmebp->hmeh_nextpa = hblkpa;
15480 hmeblkp->hblk_next = hmebp->hmeblkp;
15481 hmebp->hmeblkp = hmeblkp;
15486 * This function is the first part of a 2 part process to remove an hmeblk
15487 * from the hash chain. In this phase we unlink the hmeblk from the hash chain
15488 * but leave the next physical pointer unchanged. The hmeblk is then linked onto
15489 * a per-cpu pending list using the virtual address pointer.
15491 * TSB miss trap handlers that start after this phase will no longer see
15492 * this hmeblk. TSB miss handlers that still cache this hmeblk in a register
15493 * can still use it for further chain traversal because we haven't yet modifed
15494 * the next physical pointer or freed it.
15496 * In the second phase of hmeblk removal we'll issue a barrier xcall before
15497 * we reuse or free this hmeblk. This will make sure all lingering references to
15498 * the hmeblk after first phase disappear before we finally reclaim it.
15499 * This scheme eliminates the need for TSB miss handlers to lock hmeblk chains
15500 * during their traversal.
15502 * The hmehash_mutex must be held when calling this function.
15504 * Input:
15505 * hmebp - hme hash bucket pointer
15506 * hmeblkp - address of hmeblk to be removed
15507 * pr_hblk - virtual address of previous hmeblkp
15508 * listp - pointer to list of hmeblks linked by virtual address
15509 * free_now flag - indicates that a complete removal from the hash chains
15510 * is necessary.
15512 * It is inefficient to use the free_now flag as a cross-call is required to
15513 * remove a single hmeblk from the hash chain but is necessary when hmeblks are
15514 * in short supply.
15516 void
15517 sfmmu_hblk_hash_rm(struct hmehash_bucket *hmebp, struct hme_blk *hmeblkp,
15518 struct hme_blk *pr_hblk, struct hme_blk **listp,
15519 int free_now)
15521 int shw_size, vshift;
15522 struct hme_blk *shw_hblkp;
15523 uint_t shw_mask, newshw_mask;
15524 caddr_t vaddr;
15525 int size;
15526 cpuset_t cpuset = cpu_ready_set;
15528 ASSERT(SFMMU_HASH_LOCK_ISHELD(hmebp));
15530 if (hmebp->hmeblkp == hmeblkp) {
15531 hmebp->hmeh_nextpa = hmeblkp->hblk_nextpa;
15532 hmebp->hmeblkp = hmeblkp->hblk_next;
15533 } else {
15534 pr_hblk->hblk_nextpa = hmeblkp->hblk_nextpa;
15535 pr_hblk->hblk_next = hmeblkp->hblk_next;
15538 size = get_hblk_ttesz(hmeblkp);
15539 shw_hblkp = hmeblkp->hblk_shadow;
15540 if (shw_hblkp) {
15541 ASSERT(hblktosfmmu(hmeblkp) != KHATID);
15542 ASSERT(!hmeblkp->hblk_shared);
15543 #ifdef DEBUG
15544 if (mmu_page_sizes == max_mmu_page_sizes) {
15545 ASSERT(size < TTE256M);
15546 } else {
15547 ASSERT(size < TTE4M);
15549 #endif /* DEBUG */
15551 shw_size = get_hblk_ttesz(shw_hblkp);
15552 vaddr = (caddr_t)get_hblk_base(hmeblkp);
15553 vshift = vaddr_to_vshift(shw_hblkp->hblk_tag, vaddr, shw_size);
15554 ASSERT(vshift < 8);
15556 * Atomically clear shadow mask bit
15558 do {
15559 shw_mask = shw_hblkp->hblk_shw_mask;
15560 ASSERT(shw_mask & (1 << vshift));
15561 newshw_mask = shw_mask & ~(1 << vshift);
15562 newshw_mask = atomic_cas_32(&shw_hblkp->hblk_shw_mask,
15563 shw_mask, newshw_mask);
15564 } while (newshw_mask != shw_mask);
15565 hmeblkp->hblk_shadow = NULL;
15567 hmeblkp->hblk_shw_bit = 0;
15569 if (hmeblkp->hblk_shared) {
15570 #ifdef DEBUG
15571 sf_srd_t *srdp;
15572 sf_region_t *rgnp;
15573 uint_t rid;
15575 srdp = hblktosrd(hmeblkp);
15576 ASSERT(srdp != NULL && srdp->srd_refcnt != 0);
15577 rid = hmeblkp->hblk_tag.htag_rid;
15578 ASSERT(SFMMU_IS_SHMERID_VALID(rid));
15579 ASSERT(rid < SFMMU_MAX_HME_REGIONS);
15580 rgnp = srdp->srd_hmergnp[rid];
15581 ASSERT(rgnp != NULL);
15582 SFMMU_VALIDATE_SHAREDHBLK(hmeblkp, srdp, rgnp, rid);
15583 #endif /* DEBUG */
15584 hmeblkp->hblk_shared = 0;
15586 if (free_now) {
15587 kpreempt_disable();
15588 CPUSET_DEL(cpuset, CPU->cpu_id);
15589 xt_sync(cpuset);
15590 xt_sync(cpuset);
15591 kpreempt_enable();
15593 hmeblkp->hblk_nextpa = HMEBLK_ENDPA;
15594 hmeblkp->hblk_next = NULL;
15595 } else {
15596 /* Append hmeblkp to listp for processing later. */
15597 hmeblkp->hblk_next = *listp;
15598 *listp = hmeblkp;
15603 * This routine is called when memory is in short supply and returns a free
15604 * hmeblk of the requested size from the cpu pending lists.
15606 static struct hme_blk *
15607 sfmmu_check_pending_hblks(int size)
15609 int i;
15610 struct hme_blk *hmeblkp = NULL, *last_hmeblkp;
15611 int found_hmeblk;
15612 cpuset_t cpuset = cpu_ready_set;
15613 cpu_hme_pend_t *cpuhp;
15615 /* Flush cpu hblk pending queues */
15616 for (i = 0; i < NCPU; i++) {
15617 cpuhp = &cpu_hme_pend[i];
15618 if (cpuhp->chp_listp != NULL) {
15619 mutex_enter(&cpuhp->chp_mutex);
15620 if (cpuhp->chp_listp == NULL) {
15621 mutex_exit(&cpuhp->chp_mutex);
15622 continue;
15624 found_hmeblk = 0;
15625 last_hmeblkp = NULL;
15626 for (hmeblkp = cpuhp->chp_listp; hmeblkp != NULL;
15627 hmeblkp = hmeblkp->hblk_next) {
15628 if (get_hblk_ttesz(hmeblkp) == size) {
15629 if (last_hmeblkp == NULL) {
15630 cpuhp->chp_listp =
15631 hmeblkp->hblk_next;
15632 } else {
15633 last_hmeblkp->hblk_next =
15634 hmeblkp->hblk_next;
15636 ASSERT(cpuhp->chp_count > 0);
15637 cpuhp->chp_count--;
15638 found_hmeblk = 1;
15639 break;
15640 } else {
15641 last_hmeblkp = hmeblkp;
15644 mutex_exit(&cpuhp->chp_mutex);
15646 if (found_hmeblk) {
15647 kpreempt_disable();
15648 CPUSET_DEL(cpuset, CPU->cpu_id);
15649 xt_sync(cpuset);
15650 xt_sync(cpuset);
15651 kpreempt_enable();
15652 return (hmeblkp);
15656 return (NULL);