6514 AS_* lock macros simplification
[illumos-gate.git] usr/src/uts/sfmmu/vm/hat_sfmmu.c
1 /*
2 * CDDL HEADER START
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
19 * CDDL HEADER END
22 * Copyright (c) 1993, 2010, Oracle and/or its affiliates. All rights reserved.
25 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
29 * VM - Hardware Address Translation management for Spitfire MMU.
31 * This file implements the machine specific hardware translation
32 * needed by the VM system. The machine independent interface is
33 * described in <vm/hat.h> while the machine dependent interface
34 * and data structures are described in <vm/hat_sfmmu.h>.
36 * The hat layer manages the address translation hardware as a cache
37 * driven by calls from the higher levels in the VM system.
40 #include <sys/types.h>
41 #include <sys/kstat.h>
42 #include <vm/hat.h>
43 #include <vm/hat_sfmmu.h>
44 #include <vm/page.h>
45 #include <sys/pte.h>
46 #include <sys/systm.h>
47 #include <sys/mman.h>
48 #include <sys/sysmacros.h>
49 #include <sys/machparam.h>
50 #include <sys/vtrace.h>
51 #include <sys/kmem.h>
52 #include <sys/mmu.h>
53 #include <sys/cmn_err.h>
54 #include <sys/cpu.h>
55 #include <sys/cpuvar.h>
56 #include <sys/debug.h>
57 #include <sys/lgrp.h>
58 #include <sys/archsystm.h>
59 #include <sys/machsystm.h>
60 #include <sys/vmsystm.h>
61 #include <vm/as.h>
62 #include <vm/seg.h>
63 #include <vm/seg_kp.h>
64 #include <vm/seg_kmem.h>
65 #include <vm/seg_kpm.h>
66 #include <vm/rm.h>
67 #include <sys/t_lock.h>
68 #include <sys/obpdefs.h>
69 #include <sys/vm_machparam.h>
70 #include <sys/var.h>
71 #include <sys/trap.h>
72 #include <sys/machtrap.h>
73 #include <sys/scb.h>
74 #include <sys/bitmap.h>
75 #include <sys/machlock.h>
76 #include <sys/membar.h>
77 #include <sys/atomic.h>
78 #include <sys/cpu_module.h>
79 #include <sys/prom_debug.h>
80 #include <sys/ksynch.h>
81 #include <sys/mem_config.h>
82 #include <sys/mem_cage.h>
83 #include <vm/vm_dep.h>
84 #include <vm/xhat_sfmmu.h>
85 #include <sys/fpu/fpusystm.h>
86 #include <vm/mach_kpm.h>
87 #include <sys/callb.h>
89 #ifdef DEBUG
90 #define SFMMU_VALIDATE_HMERID(hat, rid, saddr, len) \
91 if (SFMMU_IS_SHMERID_VALID(rid)) { \
92 caddr_t _eaddr = (saddr) + (len); \
93 sf_srd_t *_srdp; \
94 sf_region_t *_rgnp; \
95 ASSERT((rid) < SFMMU_MAX_HME_REGIONS); \
96 ASSERT(SF_RGNMAP_TEST(hat->sfmmu_hmeregion_map, rid)); \
97 ASSERT((hat) != ksfmmup); \
98 _srdp = (hat)->sfmmu_srdp; \
99 ASSERT(_srdp != NULL); \
100 ASSERT(_srdp->srd_refcnt != 0); \
101 _rgnp = _srdp->srd_hmergnp[(rid)]; \
102 ASSERT(_rgnp != NULL && _rgnp->rgn_id == rid); \
103 ASSERT(_rgnp->rgn_refcnt != 0); \
104 ASSERT(!(_rgnp->rgn_flags & SFMMU_REGION_FREE)); \
105 ASSERT((_rgnp->rgn_flags & SFMMU_REGION_TYPE_MASK) == \
106 SFMMU_REGION_HME); \
107 ASSERT((saddr) >= _rgnp->rgn_saddr); \
108 ASSERT((saddr) < _rgnp->rgn_saddr + _rgnp->rgn_size); \
109 ASSERT(_eaddr > _rgnp->rgn_saddr); \
110 ASSERT(_eaddr <= _rgnp->rgn_saddr + _rgnp->rgn_size); \
113 #define SFMMU_VALIDATE_SHAREDHBLK(hmeblkp, srdp, rgnp, rid) \
115 caddr_t _hsva; \
116 caddr_t _heva; \
117 caddr_t _rsva; \
118 caddr_t _reva; \
119 int _ttesz = get_hblk_ttesz(hmeblkp); \
120 int _flagtte; \
121 ASSERT((srdp)->srd_refcnt != 0); \
122 ASSERT((rid) < SFMMU_MAX_HME_REGIONS); \
123 ASSERT((rgnp)->rgn_id == rid); \
124 ASSERT(!((rgnp)->rgn_flags & SFMMU_REGION_FREE)); \
125 ASSERT(((rgnp)->rgn_flags & SFMMU_REGION_TYPE_MASK) == \
126 SFMMU_REGION_HME); \
127 ASSERT(_ttesz <= (rgnp)->rgn_pgszc); \
128 _hsva = (caddr_t)get_hblk_base(hmeblkp); \
129 _heva = get_hblk_endaddr(hmeblkp); \
130 _rsva = (caddr_t)P2ALIGN( \
131 (uintptr_t)(rgnp)->rgn_saddr, HBLK_MIN_BYTES); \
132 _reva = (caddr_t)P2ROUNDUP( \
133 (uintptr_t)((rgnp)->rgn_saddr + (rgnp)->rgn_size), \
134 HBLK_MIN_BYTES); \
135 ASSERT(_hsva >= _rsva); \
136 ASSERT(_hsva < _reva); \
137 ASSERT(_heva > _rsva); \
138 ASSERT(_heva <= _reva); \
139 _flagtte = (_ttesz < HBLK_MIN_TTESZ) ? HBLK_MIN_TTESZ : \
140 _ttesz; \
141 ASSERT(rgnp->rgn_hmeflags & (0x1 << _flagtte)); \
144 #else /* DEBUG */
145 #define SFMMU_VALIDATE_HMERID(hat, rid, addr, len)
146 #define SFMMU_VALIDATE_SHAREDHBLK(hmeblkp, srdp, rgnp, rid)
147 #endif /* DEBUG */
149 #if defined(SF_ERRATA_57)
150 extern caddr_t errata57_limit;
151 #endif
153 #define HME8BLK_SZ_RND ((roundup(HME8BLK_SZ, sizeof (int64_t))) / \
154 (sizeof (int64_t)))
155 #define HBLK_RESERVE ((struct hme_blk *)hblk_reserve)
157 #define HBLK_RESERVE_CNT 128
158 #define HBLK_RESERVE_MIN 20
160 static struct hme_blk *freehblkp;
161 static kmutex_t freehblkp_lock;
162 static int freehblkcnt;
164 static int64_t hblk_reserve[HME8BLK_SZ_RND];
165 static kmutex_t hblk_reserve_lock;
166 static kthread_t *hblk_reserve_thread;
168 static nucleus_hblk8_info_t nucleus_hblk8;
169 static nucleus_hblk1_info_t nucleus_hblk1;
 172  * Data to manage per-cpu hmeblk pending queues. Hmeblks are queued here
 173  * after the initial phase of removing an hmeblk from the hash chain; see
 174  * the detailed comment in sfmmu_hblk_hash_rm() for further details.
176 static cpu_hme_pend_t *cpu_hme_pend;
177 static uint_t cpu_hme_pend_thresh;
179 * SFMMU specific hat functions
181 void hat_pagecachectl(struct page *, int);
183 /* flags for hat_pagecachectl */
184 #define HAT_CACHE 0x1
185 #define HAT_UNCACHE 0x2
186 #define HAT_TMPNC 0x4
189 * Flag to allow the creation of non-cacheable translations
190 * to system memory. It is off by default. At the moment this
191 * flag is used by the ecache error injector. The error injector
192 * will turn it on when creating such a translation then shut it
193 * off when it's finished.
196 int sfmmu_allow_nc_trans = 0;
199 * Flag to disable large page support.
200 * value of 1 => disable all large pages.
201 * bits 1, 2, and 3 are to disable 64K, 512K and 4M pages respectively.
203 * For example, use the value 0x4 to disable 512K pages.
206 #define LARGE_PAGES_OFF 0x1
209 * The disable_large_pages and disable_ism_large_pages variables control
210 * hat_memload_array and the page sizes to be used by ISM and the kernel.
212 * The disable_auto_data_large_pages and disable_auto_text_large_pages variables
213 * are only used to control which OOB pages to use at upper VM segment creation
214 * time, and are set in hat_init_pagesizes and used in the map_pgsz* routines.
215 * Their values may come from platform or CPU specific code to disable page
216 * sizes that should not be used.
218 * WARNING: 512K pages are currently not supported for ISM/DISM.
220 uint_t disable_large_pages = 0;
221 uint_t disable_ism_large_pages = (1 << TTE512K);
222 uint_t disable_auto_data_large_pages = 0;
223 uint_t disable_auto_text_large_pages = 0;
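/*
 * Editor's illustration (not part of the original source): the disable
 * masks are bit vectors indexed by TTE size code, so disallowing 512K and
 * 4M pages for ISM, for example, would look like:
 *
 *	disable_ism_large_pages |= (1 << TTE512K) | (1 << TTE4M);
 *
 * Setting LARGE_PAGES_OFF (0x1) disables every large page size, matching
 * "value of 1 => disable all large pages" above.
 */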
226 * Private sfmmu data structures for hat management
228 static struct kmem_cache *sfmmuid_cache;
229 static struct kmem_cache *mmuctxdom_cache;
232 * Private sfmmu data structures for tsb management
234 static struct kmem_cache *sfmmu_tsbinfo_cache;
235 static struct kmem_cache *sfmmu_tsb8k_cache;
236 static struct kmem_cache *sfmmu_tsb_cache[NLGRPS_MAX];
237 static vmem_t *kmem_bigtsb_arena;
238 static vmem_t *kmem_tsb_arena;
241 * sfmmu static variables for hmeblk resource management.
243 static vmem_t *hat_memload1_arena; /* HAT translation arena for sfmmu1_cache */
244 static struct kmem_cache *sfmmu8_cache;
245 static struct kmem_cache *sfmmu1_cache;
246 static struct kmem_cache *pa_hment_cache;
248 static kmutex_t ism_mlist_lock; /* mutex for ism mapping list */
250 * private data for ism
252 static struct kmem_cache *ism_blk_cache;
253 static struct kmem_cache *ism_ment_cache;
254 #define ISMID_STARTADDR NULL
257 * Region management data structures and function declarations.
260 static void sfmmu_leave_srd(sfmmu_t *);
261 static int sfmmu_srdcache_constructor(void *, void *, int);
262 static void sfmmu_srdcache_destructor(void *, void *);
263 static int sfmmu_rgncache_constructor(void *, void *, int);
264 static void sfmmu_rgncache_destructor(void *, void *);
265 static int sfrgnmap_isnull(sf_region_map_t *);
266 static int sfhmergnmap_isnull(sf_hmeregion_map_t *);
267 static int sfmmu_scdcache_constructor(void *, void *, int);
268 static void sfmmu_scdcache_destructor(void *, void *);
269 static void sfmmu_rgn_cb_noop(caddr_t, caddr_t, caddr_t,
270 size_t, void *, u_offset_t);
272 static uint_t srd_hashmask = SFMMU_MAX_SRD_BUCKETS - 1;
273 static sf_srd_bucket_t *srd_buckets;
274 static struct kmem_cache *srd_cache;
275 static uint_t srd_rgn_hashmask = SFMMU_MAX_REGION_BUCKETS - 1;
276 static struct kmem_cache *region_cache;
277 static struct kmem_cache *scd_cache;
279 #ifdef sun4v
280 int use_bigtsb_arena = 1;
281 #else
282 int use_bigtsb_arena = 0;
283 #endif
285 /* External /etc/system tunable, for turning on&off the shctx support */
286 int disable_shctx = 0;
287 /* Internal variable, set by MD if the HW supports shctx feature */
288 int shctx_on = 0;
290 #ifdef DEBUG
291 static void check_scd_sfmmu_list(sfmmu_t **, sfmmu_t *, int);
292 #endif
293 static void sfmmu_to_scd_list(sfmmu_t **, sfmmu_t *);
294 static void sfmmu_from_scd_list(sfmmu_t **, sfmmu_t *);
296 static sf_scd_t *sfmmu_alloc_scd(sf_srd_t *, sf_region_map_t *);
297 static void sfmmu_find_scd(sfmmu_t *);
298 static void sfmmu_join_scd(sf_scd_t *, sfmmu_t *);
299 static void sfmmu_finish_join_scd(sfmmu_t *);
300 static void sfmmu_leave_scd(sfmmu_t *, uchar_t);
301 static void sfmmu_destroy_scd(sf_srd_t *, sf_scd_t *, sf_region_map_t *);
302 static int sfmmu_alloc_scd_tsbs(sf_srd_t *, sf_scd_t *);
303 static void sfmmu_free_scd_tsbs(sfmmu_t *);
304 static void sfmmu_tsb_inv_ctx(sfmmu_t *);
305 static int find_ism_rid(sfmmu_t *, sfmmu_t *, caddr_t, uint_t *);
306 static void sfmmu_ism_hatflags(sfmmu_t *, int);
307 static int sfmmu_srd_lock_held(sf_srd_t *);
308 static void sfmmu_remove_scd(sf_scd_t **, sf_scd_t *);
309 static void sfmmu_add_scd(sf_scd_t **headp, sf_scd_t *);
310 static void sfmmu_link_scd_to_regions(sf_srd_t *, sf_scd_t *);
311 static void sfmmu_unlink_scd_from_regions(sf_srd_t *, sf_scd_t *);
312 static void sfmmu_link_to_hmeregion(sfmmu_t *, sf_region_t *);
313 static void sfmmu_unlink_from_hmeregion(sfmmu_t *, sf_region_t *);
316 * ``hat_lock'' is a hashed mutex lock for protecting sfmmu TSB lists,
317 * HAT flags, synchronizing TLB/TSB coherency, and context management.
318 * The lock is hashed on the sfmmup since the case where we need to lock
319 * all processes is rare but does occur (e.g. we need to unload a shared
320 * mapping from all processes using the mapping). We have a lot of buckets,
321 * and each slab of sfmmu_t's can use about a quarter of them, giving us
322 * a fairly good distribution without wasting too much space and overhead
323 * when we have to grab them all.
325 #define SFMMU_NUM_LOCK 128 /* must be power of two */
326 hatlock_t hat_lock[SFMMU_NUM_LOCK];
329 * Hash algorithm optimized for a small number of slabs.
330 * 7 is (highbit((sizeof sfmmu_t)) - 1)
331 * This hash algorithm is based upon the knowledge that sfmmu_t's come from a
332 * kmem_cache, and thus they will be sequential within that cache. In
333 * addition, each new slab will have a different "color" up to cache_maxcolor
334 * which will skew the hashing for each successive slab which is allocated.
335 * If the size of sfmmu_t changed to a larger size, this algorithm may need
336 * to be revisited.
338 #define TSB_HASH_SHIFT_BITS (7)
339 #define PTR_HASH(x) ((uintptr_t)x >> TSB_HASH_SHIFT_BITS)
341 #ifdef DEBUG
342 int tsb_hash_debug = 0;
343 #define TSB_HASH(sfmmup) \
344 (tsb_hash_debug ? &hat_lock[0] : \
345 &hat_lock[PTR_HASH(sfmmup) & (SFMMU_NUM_LOCK-1)])
346 #else /* DEBUG */
347 #define TSB_HASH(sfmmup) &hat_lock[PTR_HASH(sfmmup) & (SFMMU_NUM_LOCK-1)]
348 #endif /* DEBUG */
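/*
 * Editor's illustration (assumption, not in the original file): PTR_HASH
 * discards the low TSB_HASH_SHIFT_BITS bits of the sfmmu_t pointer (the
 * within-object bits) and the bucket index is taken modulo SFMMU_NUM_LOCK,
 * so neighbouring sfmmu_t's in a slab land on different locks.  With the
 * hypothetical pointers below, 128 bytes apart:
 *
 *	sfmmu_t *a = first;
 *	sfmmu_t *b = (sfmmu_t *)((caddr_t)a + 128);
 *
 * TSB_HASH(a) and TSB_HASH(b) select adjacent hat_lock buckets (indices
 * differ by one, modulo SFMMU_NUM_LOCK).
 */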
351 /* sfmmu_replace_tsb() return codes. */
352 typedef enum tsb_replace_rc {
353 TSB_SUCCESS,
354 TSB_ALLOCFAIL,
355 TSB_LOSTRACE,
356 TSB_ALREADY_SWAPPED,
357 TSB_CANTGROW
358 } tsb_replace_rc_t;
361 * Flags for TSB allocation routines.
363 #define TSB_ALLOC 0x01
364 #define TSB_FORCEALLOC 0x02
365 #define TSB_GROW 0x04
366 #define TSB_SHRINK 0x08
367 #define TSB_SWAPIN 0x10
370 * Support for HAT callbacks.
372 #define SFMMU_MAX_RELOC_CALLBACKS 10
373 int sfmmu_max_cb_id = SFMMU_MAX_RELOC_CALLBACKS;
374 static id_t sfmmu_cb_nextid = 0;
375 static id_t sfmmu_tsb_cb_id;
376 struct sfmmu_callback *sfmmu_cb_table;
378 kmutex_t kpr_mutex;
379 kmutex_t kpr_suspendlock;
380 kthread_t *kreloc_thread;
383 * Enable VA->PA translation sanity checking on DEBUG kernels.
384 * Disabled by default. This is incompatible with some
385 * drivers (error injector, RSM) so if it breaks you get
386 * to keep both pieces.
388 int hat_check_vtop = 0;
391 * Private sfmmu routines (prototypes)
393 static struct hme_blk *sfmmu_shadow_hcreate(sfmmu_t *, caddr_t, int, uint_t);
394 static struct hme_blk *sfmmu_hblk_alloc(sfmmu_t *, caddr_t,
395 struct hmehash_bucket *, uint_t, hmeblk_tag, uint_t,
396 uint_t);
397 static caddr_t sfmmu_hblk_unload(struct hat *, struct hme_blk *, caddr_t,
398 caddr_t, demap_range_t *, uint_t);
399 static caddr_t sfmmu_hblk_sync(struct hat *, struct hme_blk *, caddr_t,
400 caddr_t, int);
401 static void sfmmu_hblk_free(struct hme_blk **);
402 static void sfmmu_hblks_list_purge(struct hme_blk **, int);
403 static uint_t sfmmu_get_free_hblk(struct hme_blk **, uint_t);
404 static uint_t sfmmu_put_free_hblk(struct hme_blk *, uint_t);
405 static struct hme_blk *sfmmu_hblk_steal(int);
406 static int sfmmu_steal_this_hblk(struct hmehash_bucket *,
407 struct hme_blk *, uint64_t, struct hme_blk *);
408 static caddr_t sfmmu_hblk_unlock(struct hme_blk *, caddr_t, caddr_t);
410 static void hat_do_memload_array(struct hat *, caddr_t, size_t,
411 struct page **, uint_t, uint_t, uint_t);
412 static void hat_do_memload(struct hat *, caddr_t, struct page *,
413 uint_t, uint_t, uint_t);
414 static void sfmmu_memload_batchsmall(struct hat *, caddr_t, page_t **,
415 uint_t, uint_t, pgcnt_t, uint_t);
416 void sfmmu_tteload(struct hat *, tte_t *, caddr_t, page_t *,
417 uint_t);
418 static int sfmmu_tteload_array(sfmmu_t *, tte_t *, caddr_t, page_t **,
419 uint_t, uint_t);
420 static struct hmehash_bucket *sfmmu_tteload_acquire_hashbucket(sfmmu_t *,
421 caddr_t, int, uint_t);
422 static struct hme_blk *sfmmu_tteload_find_hmeblk(sfmmu_t *,
423 struct hmehash_bucket *, caddr_t, uint_t, uint_t,
424 uint_t);
425 static int sfmmu_tteload_addentry(sfmmu_t *, struct hme_blk *, tte_t *,
426 caddr_t, page_t **, uint_t, uint_t);
427 static void sfmmu_tteload_release_hashbucket(struct hmehash_bucket *);
429 static int sfmmu_pagearray_setup(caddr_t, page_t **, tte_t *, int);
430 static pfn_t sfmmu_uvatopfn(caddr_t, sfmmu_t *, tte_t *);
431 void sfmmu_memtte(tte_t *, pfn_t, uint_t, int);
432 #ifdef VAC
433 static void sfmmu_vac_conflict(struct hat *, caddr_t, page_t *);
434 static int sfmmu_vacconflict_array(caddr_t, page_t *, int *);
435 int tst_tnc(page_t *pp, pgcnt_t);
436 void conv_tnc(page_t *pp, int);
437 #endif
439 static void sfmmu_get_ctx(sfmmu_t *);
440 static void sfmmu_free_sfmmu(sfmmu_t *);
442 static void sfmmu_ttesync(struct hat *, caddr_t, tte_t *, page_t *);
443 static void sfmmu_chgattr(struct hat *, caddr_t, size_t, uint_t, int);
445 cpuset_t sfmmu_pageunload(page_t *, struct sf_hment *, int);
446 static void hat_pagereload(struct page *, struct page *);
447 static cpuset_t sfmmu_pagesync(page_t *, struct sf_hment *, uint_t);
448 #ifdef VAC
449 void sfmmu_page_cache_array(page_t *, int, int, pgcnt_t);
450 static void sfmmu_page_cache(page_t *, int, int, int);
451 #endif
453 cpuset_t sfmmu_rgntlb_demap(caddr_t, sf_region_t *,
454 struct hme_blk *, int);
455 static void sfmmu_tlbcache_demap(caddr_t, sfmmu_t *, struct hme_blk *,
456 pfn_t, int, int, int, int);
457 static void sfmmu_ismtlbcache_demap(caddr_t, sfmmu_t *, struct hme_blk *,
458 pfn_t, int);
459 static void sfmmu_tlb_demap(caddr_t, sfmmu_t *, struct hme_blk *, int, int);
460 static void sfmmu_tlb_range_demap(demap_range_t *);
461 static void sfmmu_invalidate_ctx(sfmmu_t *);
462 static void sfmmu_sync_mmustate(sfmmu_t *);
464 static void sfmmu_tsbinfo_setup_phys(struct tsb_info *, pfn_t);
465 static int sfmmu_tsbinfo_alloc(struct tsb_info **, int, int, uint_t,
466 sfmmu_t *);
467 static void sfmmu_tsb_free(struct tsb_info *);
468 static void sfmmu_tsbinfo_free(struct tsb_info *);
469 static int sfmmu_init_tsbinfo(struct tsb_info *, int, int, uint_t,
470 sfmmu_t *);
471 static void sfmmu_tsb_chk_reloc(sfmmu_t *, hatlock_t *);
472 static void sfmmu_tsb_swapin(sfmmu_t *, hatlock_t *);
473 static int sfmmu_select_tsb_szc(pgcnt_t);
474 static void sfmmu_mod_tsb(sfmmu_t *, caddr_t, tte_t *, int);
475 #define sfmmu_load_tsb(sfmmup, vaddr, tte, szc) \
476 sfmmu_mod_tsb(sfmmup, vaddr, tte, szc)
477 #define sfmmu_unload_tsb(sfmmup, vaddr, szc) \
478 sfmmu_mod_tsb(sfmmup, vaddr, NULL, szc)
479 static void sfmmu_copy_tsb(struct tsb_info *, struct tsb_info *);
480 static tsb_replace_rc_t sfmmu_replace_tsb(sfmmu_t *, struct tsb_info *, uint_t,
481 hatlock_t *, uint_t);
482 static void sfmmu_size_tsb(sfmmu_t *, int, uint64_t, uint64_t, int);
484 #ifdef VAC
485 void sfmmu_cache_flush(pfn_t, int);
486 void sfmmu_cache_flushcolor(int, pfn_t);
487 #endif
488 static caddr_t sfmmu_hblk_chgattr(sfmmu_t *, struct hme_blk *, caddr_t,
489 caddr_t, demap_range_t *, uint_t, int);
491 static uint64_t sfmmu_vtop_attr(uint_t, int mode, tte_t *);
492 static uint_t sfmmu_ptov_attr(tte_t *);
493 static caddr_t sfmmu_hblk_chgprot(sfmmu_t *, struct hme_blk *, caddr_t,
494 caddr_t, demap_range_t *, uint_t);
495 static uint_t sfmmu_vtop_prot(uint_t, uint_t *);
496 static int sfmmu_idcache_constructor(void *, void *, int);
497 static void sfmmu_idcache_destructor(void *, void *);
498 static int sfmmu_hblkcache_constructor(void *, void *, int);
499 static void sfmmu_hblkcache_destructor(void *, void *);
500 static void sfmmu_hblkcache_reclaim(void *);
501 static void sfmmu_shadow_hcleanup(sfmmu_t *, struct hme_blk *,
502 struct hmehash_bucket *);
503 static void sfmmu_hblk_hash_rm(struct hmehash_bucket *, struct hme_blk *,
504 struct hme_blk *, struct hme_blk **, int);
505 static void sfmmu_hblk_hash_add(struct hmehash_bucket *, struct hme_blk *,
506 uint64_t);
507 static struct hme_blk *sfmmu_check_pending_hblks(int);
508 static void sfmmu_free_hblks(sfmmu_t *, caddr_t, caddr_t, int);
509 static void sfmmu_cleanup_rhblk(sf_srd_t *, caddr_t, uint_t, int);
510 static void sfmmu_unload_hmeregion_va(sf_srd_t *, uint_t, caddr_t, caddr_t,
511 int, caddr_t *);
512 static void sfmmu_unload_hmeregion(sf_srd_t *, sf_region_t *);
514 static void sfmmu_rm_large_mappings(page_t *, int);
516 static void hat_lock_init(void);
517 static void hat_kstat_init(void);
518 static int sfmmu_kstat_percpu_update(kstat_t *ksp, int rw);
519 static void sfmmu_set_scd_rttecnt(sf_srd_t *, sf_scd_t *);
520 static int sfmmu_is_rgnva(sf_srd_t *, caddr_t, ulong_t, ulong_t);
521 static void sfmmu_check_page_sizes(sfmmu_t *, int);
522 int fnd_mapping_sz(page_t *);
523 static void iment_add(struct ism_ment *, struct hat *);
524 static void iment_sub(struct ism_ment *, struct hat *);
525 static pgcnt_t ism_tsb_entries(sfmmu_t *, int szc);
526 extern void sfmmu_setup_tsbinfo(sfmmu_t *);
527 extern void sfmmu_clear_utsbinfo(void);
529 static void sfmmu_ctx_wrap_around(mmu_ctx_t *, boolean_t);
531 extern int vpm_enable;
533 /* kpm globals */
534 #ifdef DEBUG
536 * Enable trap level tsbmiss handling
538 int kpm_tsbmtl = 1;
541 * Flush the TLB on kpm mapout. Note: Xcalls are used (again) for the
542 * required TLB shootdowns in this case, so handle w/ care. Off by default.
544 int kpm_tlb_flush;
545 #endif /* DEBUG */
547 static void *sfmmu_vmem_xalloc_aligned_wrapper(vmem_t *, size_t, int);
549 #ifdef DEBUG
550 static void sfmmu_check_hblk_flist();
551 #endif
 554  * Semi-private sfmmu data structures. Some of them are initialized in
555 * startup or in hat_init. Some of them are private but accessed by
556 * assembly code or mach_sfmmu.c
558 struct hmehash_bucket *uhme_hash; /* user hmeblk hash table */
559 struct hmehash_bucket *khme_hash; /* kernel hmeblk hash table */
560 uint64_t uhme_hash_pa; /* PA of uhme_hash */
561 uint64_t khme_hash_pa; /* PA of khme_hash */
562 int uhmehash_num; /* # of buckets in user hash table */
563 int khmehash_num; /* # of buckets in kernel hash table */
565 uint_t max_mmu_ctxdoms = 0; /* max context domains in the system */
566 mmu_ctx_t **mmu_ctxs_tbl; /* global array of context domains */
567 uint64_t mmu_saved_gnum = 0; /* to init incoming MMUs' gnums */
569 #define DEFAULT_NUM_CTXS_PER_MMU 8192
570 static uint_t nctxs = DEFAULT_NUM_CTXS_PER_MMU;
572 int cache; /* describes system cache */
574 caddr_t ktsb_base; /* kernel 8k-indexed tsb base address */
575 uint64_t ktsb_pbase; /* kernel 8k-indexed tsb phys address */
576 int ktsb_szcode; /* kernel 8k-indexed tsb size code */
577 int ktsb_sz; /* kernel 8k-indexed tsb size */
579 caddr_t ktsb4m_base; /* kernel 4m-indexed tsb base address */
580 uint64_t ktsb4m_pbase; /* kernel 4m-indexed tsb phys address */
581 int ktsb4m_szcode; /* kernel 4m-indexed tsb size code */
582 int ktsb4m_sz; /* kernel 4m-indexed tsb size */
584 uint64_t kpm_tsbbase; /* kernel seg_kpm 4M TSB base address */
585 int kpm_tsbsz; /* kernel seg_kpm 4M TSB size code */
586 uint64_t kpmsm_tsbbase; /* kernel seg_kpm 8K TSB base address */
587 int kpmsm_tsbsz; /* kernel seg_kpm 8K TSB size code */
589 #ifndef sun4v
590 int utsb_dtlb_ttenum = -1; /* index in TLB for utsb locked TTE */
591 int utsb4m_dtlb_ttenum = -1; /* index in TLB for 4M TSB TTE */
592 int dtlb_resv_ttenum; /* index in TLB of first reserved TTE */
593 caddr_t utsb_vabase; /* reserved kernel virtual memory */
594 caddr_t utsb4m_vabase; /* for trap handler TSB accesses */
595 #endif /* sun4v */
596 uint64_t tsb_alloc_bytes = 0; /* bytes allocated to TSBs */
597 vmem_t *kmem_tsb_default_arena[NLGRPS_MAX]; /* For dynamic TSBs */
598 vmem_t *kmem_bigtsb_default_arena[NLGRPS_MAX]; /* dynamic 256M TSBs */
601 * Size to use for TSB slabs. Future platforms that support page sizes
602 * larger than 4M may wish to change these values, and provide their own
603 * assembly macros for building and decoding the TSB base register contents.
604 * Note disable_large_pages will override the value set here.
606 static uint_t tsb_slab_ttesz = TTE4M;
607 size_t tsb_slab_size = MMU_PAGESIZE4M;
608 uint_t tsb_slab_shift = MMU_PAGESHIFT4M;
609 /* PFN mask for TTE */
610 size_t tsb_slab_mask = MMU_PAGEOFFSET4M >> MMU_PAGESHIFT;
613 * Size to use for TSB slabs. These are used only when 256M tsb arenas
614 * exist.
616 static uint_t bigtsb_slab_ttesz = TTE256M;
617 static size_t bigtsb_slab_size = MMU_PAGESIZE256M;
618 static uint_t bigtsb_slab_shift = MMU_PAGESHIFT256M;
619 /* 256M page alignment for 8K pfn */
620 static size_t bigtsb_slab_mask = MMU_PAGEOFFSET256M >> MMU_PAGESHIFT;
622 /* largest TSB size to grow to, will be smaller on smaller memory systems */
623 static int tsb_max_growsize = 0;
626 * Tunable parameters dealing with TSB policies.
630 * This undocumented tunable forces all 8K TSBs to be allocated from
631 * the kernel heap rather than from the kmem_tsb_default_arena arenas.
633 #ifdef DEBUG
634 int tsb_forceheap = 0;
635 #endif /* DEBUG */
638 * Decide whether to use per-lgroup arenas, or one global set of
639 * TSB arenas. The default is not to break up per-lgroup, since
640 * most platforms don't recognize any tangible benefit from it.
642 int tsb_lgrp_affinity = 0;
645 * Used for growing the TSB based on the process RSS.
646 * tsb_rss_factor is based on the smallest TSB, and is
647 * shifted by the TSB size to determine if we need to grow.
648 * The default will grow the TSB if the number of TTEs for
649 * this page size exceeds 75% of the number of TSB entries,
650 * which should _almost_ eliminate all conflict misses
651 * (at the expense of using up lots and lots of memory).
653 #define TSB_RSS_FACTOR (TSB_ENTRIES(TSB_MIN_SZCODE) * 0.75)
654 #define SFMMU_RSS_TSBSIZE(tsbszc) (tsb_rss_factor << tsbszc)
655 #define SELECT_TSB_SIZECODE(pgcnt) ( \
656 (enable_tsb_rss_sizing)? sfmmu_select_tsb_szc(pgcnt) : \
657 default_tsb_size)
658 #define TSB_OK_SHRINK() \
659 (tsb_alloc_bytes > tsb_alloc_hiwater || freemem < desfree)
660 #define TSB_OK_GROW() \
661 (tsb_alloc_bytes < tsb_alloc_hiwater && freemem > desfree)
663 int enable_tsb_rss_sizing = 1;
664 int tsb_rss_factor = (int)TSB_RSS_FACTOR;
666 /* which TSB size code to use for new address spaces or if rss sizing off */
667 int default_tsb_size = TSB_8K_SZCODE;
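/*
 * Editor's worked example (assumption: the smallest TSB is 8K with
 * 16-byte entries, i.e. TSB_ENTRIES(TSB_MIN_SZCODE) == 512):
 *
 *	tsb_rss_factor        = 512 * 0.75 = 384
 *	SFMMU_RSS_TSBSIZE(0)  = 384	grow past size code 0 at ~384 TTEs
 *	SFMMU_RSS_TSBSIZE(1)  = 768	grow past size code 1 at ~768 TTEs
 *
 * In other words a TSB is grown once the TTE count for its page size
 * exceeds 75% of the entries the current TSB can hold, subject to
 * TSB_OK_GROW() and tsb_max_growsize.
 */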
669 static uint64_t tsb_alloc_hiwater; /* limit TSB reserved memory */
670 uint64_t tsb_alloc_hiwater_factor; /* tsb_alloc_hiwater = physmem / this */
671 #define TSB_ALLOC_HIWATER_FACTOR_DEFAULT 32
673 #ifdef DEBUG
674 static int tsb_random_size = 0; /* set to 1 to test random tsb sizes on alloc */
675 static int tsb_grow_stress = 0; /* if set to 1, keep replacing TSB w/ random */
676 static int tsb_alloc_mtbf = 0; /* fail allocation every n attempts */
677 static int tsb_alloc_fail_mtbf = 0;
678 static int tsb_alloc_count = 0;
679 #endif /* DEBUG */
681 /* if set to 1, will remap valid TTEs when growing TSB. */
682 int tsb_remap_ttes = 1;
685 * If we have more than this many mappings, allocate a second TSB.
686 * This default is chosen because the I/D fully associative TLBs are
687 * assumed to have at least 8 available entries. Platforms with a
688 * larger fully-associative TLB could probably override the default.
691 #ifdef sun4v
692 int tsb_sectsb_threshold = 0;
693 #else
694 int tsb_sectsb_threshold = 8;
695 #endif
698 * kstat data
700 struct sfmmu_global_stat sfmmu_global_stat;
701 struct sfmmu_tsbsize_stat sfmmu_tsbsize_stat;
704 * Global data
706 sfmmu_t *ksfmmup; /* kernel's hat id */
708 #ifdef DEBUG
709 static void chk_tte(tte_t *, tte_t *, tte_t *, struct hme_blk *);
710 #endif
712 /* sfmmu locking operations */
713 static kmutex_t *sfmmu_mlspl_enter(struct page *, int);
714 static int sfmmu_mlspl_held(struct page *, int);
716 kmutex_t *sfmmu_page_enter(page_t *);
717 void sfmmu_page_exit(kmutex_t *);
718 int sfmmu_page_spl_held(struct page *);
720 /* sfmmu internal locking operations - accessed directly */
721 static void sfmmu_mlist_reloc_enter(page_t *, page_t *,
722 kmutex_t **, kmutex_t **);
723 static void sfmmu_mlist_reloc_exit(kmutex_t *, kmutex_t *);
724 static hatlock_t *
725 sfmmu_hat_enter(sfmmu_t *);
726 static hatlock_t *
727 sfmmu_hat_tryenter(sfmmu_t *);
728 static void sfmmu_hat_exit(hatlock_t *);
729 static void sfmmu_hat_lock_all(void);
730 static void sfmmu_hat_unlock_all(void);
731 static void sfmmu_ismhat_enter(sfmmu_t *, int);
732 static void sfmmu_ismhat_exit(sfmmu_t *, int);
734 kpm_hlk_t *kpmp_table;
735 uint_t kpmp_table_sz; /* must be a power of 2 */
736 uchar_t kpmp_shift;
738 kpm_shlk_t *kpmp_stable;
739 uint_t kpmp_stable_sz; /* must be a power of 2 */
742 * SPL_TABLE_SIZE is 2 * NCPU, but no smaller than 128.
743 * SPL_SHIFT is log2(SPL_TABLE_SIZE).
745 #if ((2*NCPU_P2) > 128)
746 #define SPL_SHIFT ((unsigned)(NCPU_LOG2 + 1))
747 #else
748 #define SPL_SHIFT 7U
749 #endif
750 #define SPL_TABLE_SIZE (1U << SPL_SHIFT)
751 #define SPL_MASK (SPL_TABLE_SIZE - 1)
754 * We shift by PP_SHIFT to take care of the low-order 0 bits of a page_t
755 * and by multiples of SPL_SHIFT to get as many varied bits as we can.
757 #define SPL_INDEX(pp) \
758 ((((uintptr_t)(pp) >> PP_SHIFT) ^ \
759 ((uintptr_t)(pp) >> (PP_SHIFT + SPL_SHIFT)) ^ \
760 ((uintptr_t)(pp) >> (PP_SHIFT + SPL_SHIFT * 2)) ^ \
761 ((uintptr_t)(pp) >> (PP_SHIFT + SPL_SHIFT * 3))) & \
762 SPL_MASK)
764 #define SPL_HASH(pp) \
765 (&sfmmu_page_lock[SPL_INDEX(pp)].pad_mutex)
767 static pad_mutex_t sfmmu_page_lock[SPL_TABLE_SIZE];
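/*
 * Editor's illustration (not from the original source): SPL_INDEX() xors
 * several disjoint bit fields of the page_t address so that nearby pages
 * still scatter across the lock table.  A minimal sketch of taking a
 * page's spl lock directly (sfmmu_page_enter()/sfmmu_page_exit() wrap
 * essentially this pattern):
 *
 *	kmutex_t *spl = SPL_HASH(pp);
 *
 *	mutex_enter(spl);
 *	... examine or update pp under the lock ...
 *	mutex_exit(spl);
 */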
769 /* Array of mutexes protecting a page's mapping list and p_nrm field. */
771 #define MML_TABLE_SIZE SPL_TABLE_SIZE
772 #define MLIST_HASH(pp) (&mml_table[SPL_INDEX(pp)].pad_mutex)
774 static pad_mutex_t mml_table[MML_TABLE_SIZE];
777 * hat_unload_callback() will group together callbacks in order
778 * to avoid xt_sync() calls. This is the maximum size of the group.
780 #define MAX_CB_ADDR 32
782 tte_t hw_tte;
783 static ulong_t sfmmu_dmr_maxbit = DMR_MAXBIT;
785 static char *mmu_ctx_kstat_names[] = {
786 "mmu_ctx_tsb_exceptions",
787 "mmu_ctx_tsb_raise_exception",
788 "mmu_ctx_wrap_around",
792 * Wrapper for vmem_xalloc since vmem_create only allows limited
793 * parameters for vm_source_alloc functions. This function allows us
794 * to specify alignment consistent with the size of the object being
795 * allocated.
797 static void *
798 sfmmu_vmem_xalloc_aligned_wrapper(vmem_t *vmp, size_t size, int vmflag)
800 return (vmem_xalloc(vmp, size, size, 0, 0, NULL, NULL, vmflag));
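/*
 * Editor's note (illustrative assumption, not from the original file):
 * passing "size" as the alignment argument means a tsb_slab_size slab
 * comes back naturally aligned, which is what allows a single large-page
 * TTE to map the whole slab.  A hypothetical direct call:
 *
 *	void *slab = sfmmu_vmem_xalloc_aligned_wrapper(kmem_tsb_arena,
 *	    tsb_slab_size, VM_SLEEP);
 *	ASSERT(((uintptr_t)slab & (tsb_slab_size - 1)) == 0);
 */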
803 /* Common code for setting tsb_alloc_hiwater. */
804 #define SFMMU_SET_TSB_ALLOC_HIWATER(pages) tsb_alloc_hiwater = \
805 ptob(pages) / tsb_alloc_hiwater_factor
808 * Set tsb_max_growsize to allow at most all of physical memory to be mapped by
809 * a single TSB. physmem is the number of physical pages so we need physmem 8K
810 * TTEs to represent all those physical pages. We round this up by using
811 * 1<<highbit(). To figure out which size code to use, remember that the size
812 * code is just an amount to shift the smallest TSB size to get the size of
813 * this TSB. So we subtract that size, TSB_START_SIZE, from highbit() (or
814 * highbit() - 1) to get the size code for the smallest TSB that can represent
815 * all of physical memory, while erring on the side of too much.
817 * Restrict tsb_max_growsize to make sure that:
818 * 1) TSBs can't grow larger than the TSB slab size
819 * 2) TSBs can't grow larger than UTSB_MAX_SZCODE.
821 #define SFMMU_SET_TSB_MAX_GROWSIZE(pages) { \
822 int _i, _szc, _slabszc, _tsbszc; \
824 _i = highbit(pages); \
825 if ((1 << (_i - 1)) == (pages)) \
826 _i--; /* 2^n case, round down */ \
827 _szc = _i - TSB_START_SIZE; \
828 _slabszc = bigtsb_slab_shift - (TSB_START_SIZE + TSB_ENTRY_SHIFT); \
829 _tsbszc = MIN(_szc, _slabszc); \
830 tsb_max_growsize = MIN(_tsbszc, UTSB_MAX_SZCODE); \
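/*
 * Editor's worked example (assumptions: 8K base pages, physmem of 4GB =
 * 0x80000 pages, tsb_alloc_hiwater_factor at its default of 32):
 *
 *	SFMMU_SET_TSB_ALLOC_HIWATER(0x80000)
 *		tsb_alloc_hiwater = ptob(0x80000) / 32 = 128MB
 *
 *	SFMMU_SET_TSB_MAX_GROWSIZE(0x80000)
 *		highbit(0x80000) == 20, and 2^19 == 0x80000, so _i is
 *		rounded down to 19; _szc = 19 - TSB_START_SIZE, then
 *		clamped against the slab size code and UTSB_MAX_SZCODE
 */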
834 * Given a pointer to an sfmmu and a TTE size code, return a pointer to the
835 * tsb_info which handles that TTE size.
837 #define SFMMU_GET_TSBINFO(tsbinfop, sfmmup, tte_szc) { \
838 (tsbinfop) = (sfmmup)->sfmmu_tsb; \
839 ASSERT(((tsbinfop)->tsb_flags & TSB_SHAREDCTX) || \
840 sfmmu_hat_lock_held(sfmmup)); \
841 if ((tte_szc) >= TTE4M) { \
842 ASSERT((tsbinfop) != NULL); \
843 (tsbinfop) = (tsbinfop)->tsb_next; \
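/*
 * Editor's sketch of a caller (assumption, mirrors how the TTE loaders use
 * it): pick the tsb_info that services a given TTE size while holding the
 * hat lock.
 *
 *	struct tsb_info *tsbinfop;
 *
 *	SFMMU_GET_TSBINFO(tsbinfop, sfmmup, TTE4M);
 *	if (tsbinfop != NULL) {
 *		... tsbinfop is the second (large-page) TSB ...
 *	}
 *
 * For sizes below TTE4M the macro returns sfmmu_tsb itself; the tsb_next
 * TSB typically exists only once the process has enough large mappings,
 * hence the NULL check.
 */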
848 * Macro to use to unload entries from the TSB.
849 * It has knowledge of which page sizes get replicated in the TSB
850 * and will call the appropriate unload routine for the appropriate size.
852 #define SFMMU_UNLOAD_TSB(addr, sfmmup, hmeblkp, ismhat) \
854 int ttesz = get_hblk_ttesz(hmeblkp); \
855 if (ttesz == TTE8K || ttesz == TTE4M) { \
856 sfmmu_unload_tsb(sfmmup, addr, ttesz); \
857 } else { \
858 caddr_t sva = ismhat ? addr : \
859 (caddr_t)get_hblk_base(hmeblkp); \
860 caddr_t eva = sva + get_hblk_span(hmeblkp); \
861 ASSERT(addr >= sva && addr < eva); \
862 sfmmu_unload_tsb_range(sfmmup, sva, eva, ttesz); \
867 /* Update tsb_alloc_hiwater after memory is configured. */
868 /*ARGSUSED*/
869 static void
870 sfmmu_update_post_add(void *arg, pgcnt_t delta_pages)
872 /* Assumes physmem has already been updated. */
873 SFMMU_SET_TSB_ALLOC_HIWATER(physmem);
874 SFMMU_SET_TSB_MAX_GROWSIZE(physmem);
878 * Update tsb_alloc_hiwater before memory is deleted. We'll do nothing here
879 * and update tsb_alloc_hiwater and tsb_max_growsize after the memory is
880 * deleted.
882 /*ARGSUSED*/
883 static int
884 sfmmu_update_pre_del(void *arg, pgcnt_t delta_pages)
886 return (0);
889 /* Update tsb_alloc_hiwater after memory fails to be unconfigured. */
890 /*ARGSUSED*/
891 static void
892 sfmmu_update_post_del(void *arg, pgcnt_t delta_pages, int cancelled)
895 * Whether the delete was cancelled or not, just go ahead and update
896 * tsb_alloc_hiwater and tsb_max_growsize.
898 SFMMU_SET_TSB_ALLOC_HIWATER(physmem);
899 SFMMU_SET_TSB_MAX_GROWSIZE(physmem);
902 static kphysm_setup_vector_t sfmmu_update_vec = {
903 KPHYSM_SETUP_VECTOR_VERSION, /* version */
904 sfmmu_update_post_add, /* post_add */
905 sfmmu_update_pre_del, /* pre_del */
906 sfmmu_update_post_del /* post_del */
911 * HME_BLK HASH PRIMITIVES
915 * Enter a hme on the mapping list for page pp.
916 * When large pages are more prevalent in the system we might want to
917 * keep the mapping list in ascending order by the hment size. For now,
918 * small pages are more frequent, so don't slow it down.
920 #define HME_ADD(hme, pp) \
922 ASSERT(sfmmu_mlist_held(pp)); \
924 hme->hme_prev = NULL; \
925 hme->hme_next = pp->p_mapping; \
926 hme->hme_page = pp; \
927 if (pp->p_mapping) { \
928 ((struct sf_hment *)(pp->p_mapping))->hme_prev = hme;\
929 ASSERT(pp->p_share > 0); \
930 } else { \
931 /* EMPTY */ \
932 ASSERT(pp->p_share == 0); \
934 pp->p_mapping = hme; \
935 pp->p_share++; \
 939  * Remove a hme from the mapping list for page pp.
 940  * If we are unmapping a large translation, we need to make sure that the
 941  * change is reflected in the corresponding bit of the p_index field.
943 #define HME_SUB(hme, pp) \
945 ASSERT(sfmmu_mlist_held(pp)); \
946 ASSERT(hme->hme_page == pp || IS_PAHME(hme)); \
948 if (pp->p_mapping == NULL) { \
949 panic("hme_remove - no mappings"); \
952 membar_stst(); /* ensure previous stores finish */ \
954 ASSERT(pp->p_share > 0); \
955 pp->p_share--; \
957 if (hme->hme_prev) { \
958 ASSERT(pp->p_mapping != hme); \
959 ASSERT(hme->hme_prev->hme_page == pp || \
960 IS_PAHME(hme->hme_prev)); \
961 hme->hme_prev->hme_next = hme->hme_next; \
962 } else { \
963 ASSERT(pp->p_mapping == hme); \
964 pp->p_mapping = hme->hme_next; \
965 ASSERT((pp->p_mapping == NULL) ? \
966 (pp->p_share == 0) : 1); \
969 if (hme->hme_next) { \
970 ASSERT(hme->hme_next->hme_page == pp || \
971 IS_PAHME(hme->hme_next)); \
972 hme->hme_next->hme_prev = hme->hme_prev; \
975 /* zero out the entry */ \
976 hme->hme_next = NULL; \
977 hme->hme_prev = NULL; \
978 hme->hme_page = NULL; \
980 if (hme_size(hme) > TTE8K) { \
981 /* remove mappings for remainder of large pg */ \
982 sfmmu_rm_large_mappings(pp, hme_size(hme)); \
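/*
 * Editor's illustration (assumption, not in the original source):
 * pp->p_mapping heads a doubly linked list of sf_hment's and p_share
 * counts them, so walking every translation of a page looks like:
 *
 *	struct sf_hment *sfhme;
 *
 *	ASSERT(sfmmu_mlist_held(pp));
 *	for (sfhme = pp->p_mapping; sfhme != NULL;
 *	    sfhme = sfhme->hme_next) {
 *		... one mapping of pp ...
 *	}
 *
 * HME_ADD() pushes at the head and HME_SUB() can unlink from the middle,
 * which is why both neighbour links are patched above.
 */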
987 * This function returns the hment given the hme_blk and a vaddr.
988 * It assumes addr has already been checked to belong to hme_blk's
989 * range.
991 #define HBLKTOHME(hment, hmeblkp, addr) \
993 int index; \
994 HBLKTOHME_IDX(hment, hmeblkp, addr, index) \
998 * Version of HBLKTOHME that also returns the index in hmeblkp
999 * of the hment.
1001 #define HBLKTOHME_IDX(hment, hmeblkp, addr, idx) \
1003 ASSERT(in_hblk_range((hmeblkp), (addr))); \
1005 if (get_hblk_ttesz(hmeblkp) == TTE8K) { \
1006 idx = (((uintptr_t)(addr) >> MMU_PAGESHIFT) & (NHMENTS-1)); \
1007 } else \
1008 idx = 0; \
1010 (hment) = &(hmeblkp)->hblk_hme[idx]; \
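/*
 * Editor's worked example (assumption: NHMENTS == 8, i.e. an 8K-page
 * hmeblk covers eight pages): for an 8K hmeblk whose base is B,
 *
 *	addr = B + 3 * MMU_PAGESIZE;
 *	idx  = ((uintptr_t)addr >> MMU_PAGESHIFT) & (NHMENTS - 1);	== 3
 *
 * so the hment index is simply the page offset within the block.  Hmeblks
 * that map a single larger page carry one hment, so idx is forced to 0.
 */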
1014 * Disable any page sizes not supported by the CPU
1016 void
1017 hat_init_pagesizes()
1019 int i;
1021 mmu_exported_page_sizes = 0;
1022 for (i = TTE8K; i < max_mmu_page_sizes; i++) {
1024 szc_2_userszc[i] = (uint_t)-1;
1025 userszc_2_szc[i] = (uint_t)-1;
1027 if ((mmu_exported_pagesize_mask & (1 << i)) == 0) {
1028 disable_large_pages |= (1 << i);
1029 } else {
1030 szc_2_userszc[i] = mmu_exported_page_sizes;
1031 userszc_2_szc[mmu_exported_page_sizes] = i;
1032 mmu_exported_page_sizes++;
1036 disable_ism_large_pages |= disable_large_pages;
1037 disable_auto_data_large_pages = disable_large_pages;
1038 disable_auto_text_large_pages = disable_large_pages;
1041 * Initialize mmu-specific large page sizes.
1043 if (&mmu_large_pages_disabled) {
1044 disable_large_pages |= mmu_large_pages_disabled(HAT_LOAD);
1045 disable_ism_large_pages |=
1046 mmu_large_pages_disabled(HAT_LOAD_SHARE);
1047 disable_auto_data_large_pages |=
1048 mmu_large_pages_disabled(HAT_AUTO_DATA);
1049 disable_auto_text_large_pages |=
1050 mmu_large_pages_disabled(HAT_AUTO_TEXT);
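/*
 * Editor's worked example (assumption: the hardware exports 8K, 64K and 4M
 * but not 512K, i.e. mmu_exported_pagesize_mask lacks the TTE512K bit).
 * The loop above then yields:
 *
 *	mmu_exported_page_sizes = 3
 *	szc_2_userszc[TTE8K]  == 0	userszc_2_szc[0] == TTE8K
 *	szc_2_userszc[TTE64K] == 1	userszc_2_szc[1] == TTE64K
 *	szc_2_userszc[TTE4M]  == 2	userszc_2_szc[2] == TTE4M
 *	szc_2_userszc[TTE512K] == (uint_t)-1
 *
 * and disable_large_pages gains the (1 << TTE512K) bit.
 */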
1055 * Initialize the hardware address translation structures.
1057 void
1058 hat_init(void)
1060 int i;
1061 uint_t sz;
1062 size_t size;
1064 hat_lock_init();
1065 hat_kstat_init();
1068 * Hardware-only bits in a TTE
1070 MAKE_TTE_MASK(&hw_tte);
1072 hat_init_pagesizes();
1074 /* Initialize the hash locks */
1075 for (i = 0; i < khmehash_num; i++) {
1076 mutex_init(&khme_hash[i].hmehash_mutex, NULL,
1077 MUTEX_DEFAULT, NULL);
1078 khme_hash[i].hmeh_nextpa = HMEBLK_ENDPA;
1080 for (i = 0; i < uhmehash_num; i++) {
1081 mutex_init(&uhme_hash[i].hmehash_mutex, NULL,
1082 MUTEX_DEFAULT, NULL);
1083 uhme_hash[i].hmeh_nextpa = HMEBLK_ENDPA;
1085 khmehash_num--; /* make sure counter starts from 0 */
1086 uhmehash_num--; /* make sure counter starts from 0 */
1089 * Allocate context domain structures.
1091 * A platform may choose to modify max_mmu_ctxdoms in
1092 * set_platform_defaults(). If a platform does not define
1093 * a set_platform_defaults() or does not choose to modify
1094 * max_mmu_ctxdoms, it gets one MMU context domain for every CPU.
1096 * For all platforms that have CPUs sharing MMUs, this
1097 * value must be defined.
1099 if (max_mmu_ctxdoms == 0)
1100 max_mmu_ctxdoms = max_ncpus;
1102 size = max_mmu_ctxdoms * sizeof (mmu_ctx_t *);
1103 mmu_ctxs_tbl = kmem_zalloc(size, KM_SLEEP);
1105 /* mmu_ctx_t is 64 bytes aligned */
1106 mmuctxdom_cache = kmem_cache_create("mmuctxdom_cache",
1107 sizeof (mmu_ctx_t), 64, NULL, NULL, NULL, NULL, NULL, 0);
1109 * MMU context domain initialization for the Boot CPU.
1110 * This needs the context domains array allocated above.
1112 mutex_enter(&cpu_lock);
1113 sfmmu_cpu_init(CPU);
1114 mutex_exit(&cpu_lock);
 1117  * Initialize the ism mapping list lock.
1120 mutex_init(&ism_mlist_lock, NULL, MUTEX_DEFAULT, NULL);
1123 * Each sfmmu structure carries an array of MMU context info
1124 * structures, one per context domain. The size of this array depends
1125 * on the maximum number of context domains. So, the size of the
1126 * sfmmu structure varies per platform.
 1128  * sfmmu is allocated from the static arena, because the trap
 1129  * handler at TL > 0 is not allowed to touch kernel relocatable
1130 * memory. sfmmu's alignment is changed to 64 bytes from
1131 * default 8 bytes, as the lower 6 bits will be used to pass
1132 * pgcnt to vtag_flush_pgcnt_tl1.
1134 size = sizeof (sfmmu_t) + sizeof (sfmmu_ctx_t) * (max_mmu_ctxdoms - 1);
1136 sfmmuid_cache = kmem_cache_create("sfmmuid_cache", size,
1137 64, sfmmu_idcache_constructor, sfmmu_idcache_destructor,
1138 NULL, NULL, static_arena, 0);
1140 sfmmu_tsbinfo_cache = kmem_cache_create("sfmmu_tsbinfo_cache",
1141 sizeof (struct tsb_info), 0, NULL, NULL, NULL, NULL, NULL, 0);
1144 * Since we only use the tsb8k cache to "borrow" pages for TSBs
1145 * from the heap when low on memory or when TSB_FORCEALLOC is
1146 * specified, don't use magazines to cache them--we want to return
1147 * them to the system as quickly as possible.
1149 sfmmu_tsb8k_cache = kmem_cache_create("sfmmu_tsb8k_cache",
1150 MMU_PAGESIZE, MMU_PAGESIZE, NULL, NULL, NULL, NULL,
1151 static_arena, KMC_NOMAGAZINE);
1154 * Set tsb_alloc_hiwater to 1/tsb_alloc_hiwater_factor of physical
1155 * memory, which corresponds to the old static reserve for TSBs.
1156 * tsb_alloc_hiwater_factor defaults to 32. This caps the amount of
1157 * memory we'll allocate for TSB slabs; beyond this point TSB
1158 * allocations will be taken from the kernel heap (via
1159 * sfmmu_tsb8k_cache) and will be throttled as would any other kmem
1160 * consumer.
1162 if (tsb_alloc_hiwater_factor == 0) {
1163 tsb_alloc_hiwater_factor = TSB_ALLOC_HIWATER_FACTOR_DEFAULT;
1165 SFMMU_SET_TSB_ALLOC_HIWATER(physmem);
1167 for (sz = tsb_slab_ttesz; sz > 0; sz--) {
1168 if (!(disable_large_pages & (1 << sz)))
1169 break;
1172 if (sz < tsb_slab_ttesz) {
1173 tsb_slab_ttesz = sz;
1174 tsb_slab_shift = MMU_PAGESHIFT + (sz << 1) + sz;
1175 tsb_slab_size = 1 << tsb_slab_shift;
1176 tsb_slab_mask = (1 << (tsb_slab_shift - MMU_PAGESHIFT)) - 1;
1177 use_bigtsb_arena = 0;
1178 } else if (use_bigtsb_arena &&
1179 (disable_large_pages & (1 << bigtsb_slab_ttesz))) {
1180 use_bigtsb_arena = 0;
1183 if (!use_bigtsb_arena) {
1184 bigtsb_slab_shift = tsb_slab_shift;
1186 SFMMU_SET_TSB_MAX_GROWSIZE(physmem);
1189 * On smaller memory systems, allocate TSB memory in smaller chunks
1190 * than the default 4M slab size. We also honor disable_large_pages
1191 * here.
1193 * The trap handlers need to be patched with the final slab shift,
1194 * since they need to be able to construct the TSB pointer at runtime.
1196 if ((tsb_max_growsize <= TSB_512K_SZCODE) &&
1197 !(disable_large_pages & (1 << TTE512K))) {
1198 tsb_slab_ttesz = TTE512K;
1199 tsb_slab_shift = MMU_PAGESHIFT512K;
1200 tsb_slab_size = MMU_PAGESIZE512K;
1201 tsb_slab_mask = MMU_PAGEOFFSET512K >> MMU_PAGESHIFT;
1202 use_bigtsb_arena = 0;
1205 if (!use_bigtsb_arena) {
1206 bigtsb_slab_ttesz = tsb_slab_ttesz;
1207 bigtsb_slab_shift = tsb_slab_shift;
1208 bigtsb_slab_size = tsb_slab_size;
1209 bigtsb_slab_mask = tsb_slab_mask;
1214 * Set up memory callback to update tsb_alloc_hiwater and
1215 * tsb_max_growsize.
1217 i = kphysm_setup_func_register(&sfmmu_update_vec, (void *) 0);
1218 ASSERT(i == 0);
1221 * kmem_tsb_arena is the source from which large TSB slabs are
1222 * drawn. The quantum of this arena corresponds to the largest
1223 * TSB size we can dynamically allocate for user processes.
1224 * Currently it must also be a supported page size since we
1225 * use exactly one translation entry to map each slab page.
1227 * The per-lgroup kmem_tsb_default_arena arenas are the arenas from
1228 * which most TSBs are allocated. Since most TSB allocations are
1229 * typically 8K we have a kmem cache we stack on top of each
1230 * kmem_tsb_default_arena to speed up those allocations.
1232 * Note the two-level scheme of arenas is required only
1233 * because vmem_create doesn't allow us to specify alignment
1234 * requirements. If this ever changes the code could be
1235 * simplified to use only one level of arenas.
1237 * If 256M page support exists on sun4v, 256MB kmem_bigtsb_arena
1238 * will be provided in addition to the 4M kmem_tsb_arena.
1240 if (use_bigtsb_arena) {
1241 kmem_bigtsb_arena = vmem_create("kmem_bigtsb", NULL, 0,
1242 bigtsb_slab_size, sfmmu_vmem_xalloc_aligned_wrapper,
1243 vmem_xfree, heap_arena, 0, VM_SLEEP);
1246 kmem_tsb_arena = vmem_create("kmem_tsb", NULL, 0, tsb_slab_size,
1247 sfmmu_vmem_xalloc_aligned_wrapper,
1248 vmem_xfree, heap_arena, 0, VM_SLEEP);
1250 if (tsb_lgrp_affinity) {
1251 char s[50];
1252 for (i = 0; i < NLGRPS_MAX; i++) {
1253 if (use_bigtsb_arena) {
1254 (void) sprintf(s, "kmem_bigtsb_lgrp%d", i);
1255 kmem_bigtsb_default_arena[i] = vmem_create(s,
1256 NULL, 0, 2 * tsb_slab_size,
1257 sfmmu_tsb_segkmem_alloc,
1258 sfmmu_tsb_segkmem_free, kmem_bigtsb_arena,
1259 0, VM_SLEEP | VM_BESTFIT);
1262 (void) sprintf(s, "kmem_tsb_lgrp%d", i);
1263 kmem_tsb_default_arena[i] = vmem_create(s,
1264 NULL, 0, PAGESIZE, sfmmu_tsb_segkmem_alloc,
1265 sfmmu_tsb_segkmem_free, kmem_tsb_arena, 0,
1266 VM_SLEEP | VM_BESTFIT);
1268 (void) sprintf(s, "sfmmu_tsb_lgrp%d_cache", i);
1269 sfmmu_tsb_cache[i] = kmem_cache_create(s,
1270 PAGESIZE, PAGESIZE, NULL, NULL, NULL, NULL,
1271 kmem_tsb_default_arena[i], 0);
1273 } else {
1274 if (use_bigtsb_arena) {
1275 kmem_bigtsb_default_arena[0] =
1276 vmem_create("kmem_bigtsb_default", NULL, 0,
1277 2 * tsb_slab_size, sfmmu_tsb_segkmem_alloc,
1278 sfmmu_tsb_segkmem_free, kmem_bigtsb_arena, 0,
1279 VM_SLEEP | VM_BESTFIT);
1282 kmem_tsb_default_arena[0] = vmem_create("kmem_tsb_default",
1283 NULL, 0, PAGESIZE, sfmmu_tsb_segkmem_alloc,
1284 sfmmu_tsb_segkmem_free, kmem_tsb_arena, 0,
1285 VM_SLEEP | VM_BESTFIT);
1286 sfmmu_tsb_cache[0] = kmem_cache_create("sfmmu_tsb_cache",
1287 PAGESIZE, PAGESIZE, NULL, NULL, NULL, NULL,
1288 kmem_tsb_default_arena[0], 0);
1291 sfmmu8_cache = kmem_cache_create("sfmmu8_cache", HME8BLK_SZ,
1292 HMEBLK_ALIGN, sfmmu_hblkcache_constructor,
1293 sfmmu_hblkcache_destructor,
1294 sfmmu_hblkcache_reclaim, (void *)HME8BLK_SZ,
1295 hat_memload_arena, KMC_NOHASH);
1297 hat_memload1_arena = vmem_create("hat_memload1", NULL, 0, PAGESIZE,
1298 segkmem_alloc_permanent, segkmem_free, heap_arena, 0,
1299 VMC_DUMPSAFE | VM_SLEEP);
1301 sfmmu1_cache = kmem_cache_create("sfmmu1_cache", HME1BLK_SZ,
1302 HMEBLK_ALIGN, sfmmu_hblkcache_constructor,
1303 sfmmu_hblkcache_destructor,
1304 NULL, (void *)HME1BLK_SZ,
1305 hat_memload1_arena, KMC_NOHASH);
1307 pa_hment_cache = kmem_cache_create("pa_hment_cache", PAHME_SZ,
1308 0, NULL, NULL, NULL, NULL, static_arena, KMC_NOHASH);
1310 ism_blk_cache = kmem_cache_create("ism_blk_cache",
1311 sizeof (ism_blk_t), ecache_alignsize, NULL, NULL,
1312 NULL, NULL, static_arena, KMC_NOHASH);
1314 ism_ment_cache = kmem_cache_create("ism_ment_cache",
1315 sizeof (ism_ment_t), 0, NULL, NULL,
1316 NULL, NULL, NULL, 0);
1319 * We grab the first hat for the kernel,
1321 AS_LOCK_ENTER(&kas, RW_WRITER);
1322 kas.a_hat = hat_alloc(&kas);
1323 AS_LOCK_EXIT(&kas);
1326 * Initialize hblk_reserve.
1328 ((struct hme_blk *)hblk_reserve)->hblk_nextpa =
1329 va_to_pa((caddr_t)hblk_reserve);
1331 #ifndef UTSB_PHYS
1333 * Reserve some kernel virtual address space for the locked TTEs
1334 * that allow us to probe the TSB from TL>0.
1336 utsb_vabase = vmem_xalloc(heap_arena, tsb_slab_size, tsb_slab_size,
1337 0, 0, NULL, NULL, VM_SLEEP);
1338 utsb4m_vabase = vmem_xalloc(heap_arena, tsb_slab_size, tsb_slab_size,
1339 0, 0, NULL, NULL, VM_SLEEP);
1340 #endif
1342 #ifdef VAC
1344 * The big page VAC handling code assumes VAC
1345 * will not be bigger than the smallest big
1346 * page- which is 64K.
1348 if (TTEPAGES(TTE64K) < CACHE_NUM_COLOR) {
1349 cmn_err(CE_PANIC, "VAC too big!");
1351 #endif
1353 (void) xhat_init();
1355 uhme_hash_pa = va_to_pa(uhme_hash);
1356 khme_hash_pa = va_to_pa(khme_hash);
1359 * Initialize relocation locks. kpr_suspendlock is held
1360 * at PIL_MAX to prevent interrupts from pinning the holder
1361 * of a suspended TTE which may access it leading to a
1362 * deadlock condition.
1364 mutex_init(&kpr_mutex, NULL, MUTEX_DEFAULT, NULL);
1365 mutex_init(&kpr_suspendlock, NULL, MUTEX_SPIN, (void *)PIL_MAX);
1368 * If Shared context support is disabled via /etc/system
1369 * set shctx_on to 0 here if it was set to 1 earlier in boot
1370 * sequence by cpu module initialization code.
1372 if (shctx_on && disable_shctx) {
1373 shctx_on = 0;
1376 if (shctx_on) {
1377 srd_buckets = kmem_zalloc(SFMMU_MAX_SRD_BUCKETS *
1378 sizeof (srd_buckets[0]), KM_SLEEP);
1379 for (i = 0; i < SFMMU_MAX_SRD_BUCKETS; i++) {
1380 mutex_init(&srd_buckets[i].srdb_lock, NULL,
1381 MUTEX_DEFAULT, NULL);
1384 srd_cache = kmem_cache_create("srd_cache", sizeof (sf_srd_t),
1385 0, sfmmu_srdcache_constructor, sfmmu_srdcache_destructor,
1386 NULL, NULL, NULL, 0);
1387 region_cache = kmem_cache_create("region_cache",
1388 sizeof (sf_region_t), 0, sfmmu_rgncache_constructor,
1389 sfmmu_rgncache_destructor, NULL, NULL, NULL, 0);
1390 scd_cache = kmem_cache_create("scd_cache", sizeof (sf_scd_t),
1391 0, sfmmu_scdcache_constructor, sfmmu_scdcache_destructor,
1392 NULL, NULL, NULL, 0);
1396 * Pre-allocate hrm_hashtab before enabling the collection of
 1397  * refmod statistics. Allocating it on the fly would run the risk
 1398  * of recursive mutex enters or
1399 * deadlocks.
1401 hrm_hashtab = kmem_zalloc(HRM_HASHSIZE * sizeof (struct hrmstat *),
1402 KM_SLEEP);
1404 /* Allocate per-cpu pending freelist of hmeblks */
1405 cpu_hme_pend = kmem_zalloc((NCPU * sizeof (cpu_hme_pend_t)) + 64,
1406 KM_SLEEP);
1407 cpu_hme_pend = (cpu_hme_pend_t *)P2ROUNDUP(
1408 (uintptr_t)cpu_hme_pend, 64);
1410 for (i = 0; i < NCPU; i++) {
1411 mutex_init(&cpu_hme_pend[i].chp_mutex, NULL, MUTEX_DEFAULT,
1412 NULL);
1415 if (cpu_hme_pend_thresh == 0) {
1416 cpu_hme_pend_thresh = CPU_HME_PEND_THRESH;
1421 * Initialize locking for the hat layer, called early during boot.
1423 static void
1424 hat_lock_init()
1426 int i;
1429 * initialize the array of mutexes protecting a page's mapping
1430 * list and p_nrm field.
1432 for (i = 0; i < MML_TABLE_SIZE; i++)
1433 mutex_init(&mml_table[i].pad_mutex, NULL, MUTEX_DEFAULT, NULL);
1435 if (kpm_enable) {
1436 for (i = 0; i < kpmp_table_sz; i++) {
1437 mutex_init(&kpmp_table[i].khl_mutex, NULL,
1438 MUTEX_DEFAULT, NULL);
1443 * Initialize array of mutex locks that protects sfmmu fields and
1444 * TSB lists.
1446 for (i = 0; i < SFMMU_NUM_LOCK; i++)
1447 mutex_init(HATLOCK_MUTEXP(&hat_lock[i]), NULL, MUTEX_DEFAULT,
1448 NULL);
1451 #define SFMMU_KERNEL_MAXVA \
1452 (kmem64_base ? (uintptr_t)kmem64_end : (SYSLIMIT))
1455 * Allocate a hat structure.
1456 * Called when an address space first uses a hat.
1458 struct hat *
1459 hat_alloc(struct as *as)
1461 sfmmu_t *sfmmup;
1462 int i;
1463 uint64_t cnum;
1464 extern uint_t get_color_start(struct as *);
1466 ASSERT(AS_WRITE_HELD(as));
1467 sfmmup = kmem_cache_alloc(sfmmuid_cache, KM_SLEEP);
1468 sfmmup->sfmmu_as = as;
1469 sfmmup->sfmmu_flags = 0;
1470 sfmmup->sfmmu_tteflags = 0;
1471 sfmmup->sfmmu_rtteflags = 0;
1472 LOCK_INIT_CLEAR(&sfmmup->sfmmu_ctx_lock);
1474 if (as == &kas) {
1475 ksfmmup = sfmmup;
1476 sfmmup->sfmmu_cext = 0;
1477 cnum = KCONTEXT;
1479 sfmmup->sfmmu_clrstart = 0;
1480 sfmmup->sfmmu_tsb = NULL;
1482 * hat_kern_setup() will call sfmmu_init_ktsbinfo()
1483 * to setup tsb_info for ksfmmup.
1485 } else {
1488 * Just set to invalid ctx. When it faults, it will
1489 * get a valid ctx. This would avoid the situation
1490 * where we get a ctx, but it gets stolen and then
1491 * we fault when we try to run and so have to get
1492 * another ctx.
1494 sfmmup->sfmmu_cext = 0;
1495 cnum = INVALID_CONTEXT;
1497 /* initialize original physical page coloring bin */
1498 sfmmup->sfmmu_clrstart = get_color_start(as);
1499 #ifdef DEBUG
1500 if (tsb_random_size) {
1501 uint32_t randval = (uint32_t)gettick() >> 4;
1502 int size = randval % (tsb_max_growsize + 1);
1504 /* chose a random tsb size for stress testing */
1505 (void) sfmmu_tsbinfo_alloc(&sfmmup->sfmmu_tsb, size,
1506 TSB8K|TSB64K|TSB512K, 0, sfmmup);
1507 } else
1508 #endif /* DEBUG */
1509 (void) sfmmu_tsbinfo_alloc(&sfmmup->sfmmu_tsb,
1510 default_tsb_size,
1511 TSB8K|TSB64K|TSB512K, 0, sfmmup);
1512 sfmmup->sfmmu_flags = HAT_SWAPPED | HAT_ALLCTX_INVALID;
1513 ASSERT(sfmmup->sfmmu_tsb != NULL);
1516 ASSERT(max_mmu_ctxdoms > 0);
1517 for (i = 0; i < max_mmu_ctxdoms; i++) {
1518 sfmmup->sfmmu_ctxs[i].cnum = cnum;
1519 sfmmup->sfmmu_ctxs[i].gnum = 0;
1522 for (i = 0; i < max_mmu_page_sizes; i++) {
1523 sfmmup->sfmmu_ttecnt[i] = 0;
1524 sfmmup->sfmmu_scdrttecnt[i] = 0;
1525 sfmmup->sfmmu_ismttecnt[i] = 0;
1526 sfmmup->sfmmu_scdismttecnt[i] = 0;
1527 sfmmup->sfmmu_pgsz[i] = TTE8K;
1529 sfmmup->sfmmu_tsb0_4minflcnt = 0;
1530 sfmmup->sfmmu_iblk = NULL;
1531 sfmmup->sfmmu_ismhat = 0;
1532 sfmmup->sfmmu_scdhat = 0;
1533 sfmmup->sfmmu_ismblkpa = (uint64_t)-1;
1534 if (sfmmup == ksfmmup) {
1535 CPUSET_ALL(sfmmup->sfmmu_cpusran);
1536 } else {
1537 CPUSET_ZERO(sfmmup->sfmmu_cpusran);
1539 sfmmup->sfmmu_free = 0;
1540 sfmmup->sfmmu_rmstat = 0;
1541 sfmmup->sfmmu_clrbin = sfmmup->sfmmu_clrstart;
1542 sfmmup->sfmmu_xhat_provider = NULL;
1543 cv_init(&sfmmup->sfmmu_tsb_cv, NULL, CV_DEFAULT, NULL);
1544 sfmmup->sfmmu_srdp = NULL;
1545 SF_RGNMAP_ZERO(sfmmup->sfmmu_region_map);
1546 bzero(sfmmup->sfmmu_hmeregion_links, SFMMU_L1_HMERLINKS_SIZE);
1547 sfmmup->sfmmu_scdp = NULL;
1548 sfmmup->sfmmu_scd_link.next = NULL;
1549 sfmmup->sfmmu_scd_link.prev = NULL;
1550 return (sfmmup);
1554 * Create per-MMU context domain kstats for a given MMU ctx.
1556 static void
1557 sfmmu_mmu_kstat_create(mmu_ctx_t *mmu_ctxp)
1559 mmu_ctx_stat_t stat;
1560 kstat_t *mmu_kstat;
1562 ASSERT(MUTEX_HELD(&cpu_lock));
1563 ASSERT(mmu_ctxp->mmu_kstat == NULL);
1565 mmu_kstat = kstat_create("unix", mmu_ctxp->mmu_idx, "mmu_ctx",
1566 "hat", KSTAT_TYPE_NAMED, MMU_CTX_NUM_STATS, KSTAT_FLAG_VIRTUAL);
1568 if (mmu_kstat == NULL) {
1569 cmn_err(CE_WARN, "kstat_create for MMU %d failed",
1570 mmu_ctxp->mmu_idx);
1571 } else {
1572 mmu_kstat->ks_data = mmu_ctxp->mmu_kstat_data;
1573 for (stat = 0; stat < MMU_CTX_NUM_STATS; stat++)
1574 kstat_named_init(&mmu_ctxp->mmu_kstat_data[stat],
1575 mmu_ctx_kstat_names[stat], KSTAT_DATA_INT64);
1576 mmu_ctxp->mmu_kstat = mmu_kstat;
1577 kstat_install(mmu_kstat);
1582 * plat_cpuid_to_mmu_ctx_info() is a platform interface that returns MMU
1583 * context domain information for a given CPU. If a platform does not
1584 * specify that interface, then the function below is used instead to return
1585 * default information. The defaults are as follows:
1587 * - The number of MMU context IDs supported on any CPU in the
1588 * system is 8K.
1589 * - There is one MMU context domain per CPU.
1591 /*ARGSUSED*/
1592 static void
1593 sfmmu_cpuid_to_mmu_ctx_info(processorid_t cpuid, mmu_ctx_info_t *infop)
1595 infop->mmu_nctxs = nctxs;
1596 infop->mmu_idx = cpu[cpuid]->cpu_seqid;
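/*
 * Editor's sketch (hypothetical, not from this file): a platform on which
 * four CPUs share one MMU could provide its own override, for example:
 *
 *	void
 *	plat_cpuid_to_mmu_ctx_info(processorid_t cpuid, mmu_ctx_info_t *infop)
 *	{
 *		infop->mmu_nctxs = 8192;
 *		infop->mmu_idx = cpuid / 4;	four CPUs per context domain
 *	}
 *
 * sfmmu_cpu_init() below would then attach those CPUs to a shared
 * mmu_ctx_t.
 */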
1600 * Called during CPU initialization to set the MMU context-related information
1601 * for a CPU.
1603 * cpu_lock serializes accesses to mmu_ctxs and mmu_saved_gnum.
1605 void
1606 sfmmu_cpu_init(cpu_t *cp)
1608 mmu_ctx_info_t info;
1609 mmu_ctx_t *mmu_ctxp;
1611 ASSERT(MUTEX_HELD(&cpu_lock));
1613 if (&plat_cpuid_to_mmu_ctx_info == NULL)
1614 sfmmu_cpuid_to_mmu_ctx_info(cp->cpu_id, &info);
1615 else
1616 plat_cpuid_to_mmu_ctx_info(cp->cpu_id, &info);
1618 ASSERT(info.mmu_idx < max_mmu_ctxdoms);
1620 if ((mmu_ctxp = mmu_ctxs_tbl[info.mmu_idx]) == NULL) {
1621 /* Each mmu_ctx is cacheline aligned. */
1622 mmu_ctxp = kmem_cache_alloc(mmuctxdom_cache, KM_SLEEP);
1623 bzero(mmu_ctxp, sizeof (mmu_ctx_t));
1625 mutex_init(&mmu_ctxp->mmu_lock, NULL, MUTEX_SPIN,
1626 (void *)ipltospl(DISP_LEVEL));
1627 mmu_ctxp->mmu_idx = info.mmu_idx;
1628 mmu_ctxp->mmu_nctxs = info.mmu_nctxs;
 1630  * Globally, for the lifetime of a system,
1631 * gnum must always increase.
1632 * mmu_saved_gnum is protected by the cpu_lock.
1634 mmu_ctxp->mmu_gnum = mmu_saved_gnum + 1;
1635 mmu_ctxp->mmu_cnum = NUM_LOCKED_CTXS;
1637 sfmmu_mmu_kstat_create(mmu_ctxp);
1639 mmu_ctxs_tbl[info.mmu_idx] = mmu_ctxp;
1640 } else {
1641 ASSERT(mmu_ctxp->mmu_idx == info.mmu_idx);
1642 ASSERT(mmu_ctxp->mmu_nctxs <= info.mmu_nctxs);
1646 * The mmu_lock is acquired here to prevent races with
1647 * the wrap-around code.
1649 mutex_enter(&mmu_ctxp->mmu_lock);
1652 mmu_ctxp->mmu_ncpus++;
1653 CPUSET_ADD(mmu_ctxp->mmu_cpuset, cp->cpu_id);
1654 CPU_MMU_IDX(cp) = info.mmu_idx;
1655 CPU_MMU_CTXP(cp) = mmu_ctxp;
1657 mutex_exit(&mmu_ctxp->mmu_lock);
1660 static void
1661 sfmmu_ctxdom_free(mmu_ctx_t *mmu_ctxp)
1663 ASSERT(MUTEX_HELD(&cpu_lock));
1664 ASSERT(!MUTEX_HELD(&mmu_ctxp->mmu_lock));
1666 mutex_destroy(&mmu_ctxp->mmu_lock);
1668 if (mmu_ctxp->mmu_kstat)
1669 kstat_delete(mmu_ctxp->mmu_kstat);
1671 /* mmu_saved_gnum is protected by the cpu_lock. */
1672 if (mmu_saved_gnum < mmu_ctxp->mmu_gnum)
1673 mmu_saved_gnum = mmu_ctxp->mmu_gnum;
1675 kmem_cache_free(mmuctxdom_cache, mmu_ctxp);
1679 * Called to perform MMU context-related cleanup for a CPU.
1681 void
1682 sfmmu_cpu_cleanup(cpu_t *cp)
1684 mmu_ctx_t *mmu_ctxp;
1686 ASSERT(MUTEX_HELD(&cpu_lock));
1688 mmu_ctxp = CPU_MMU_CTXP(cp);
1689 ASSERT(mmu_ctxp != NULL);
1692 * The mmu_lock is acquired here to prevent races with
1693 * the wrap-around code.
1695 mutex_enter(&mmu_ctxp->mmu_lock);
1697 CPU_MMU_CTXP(cp) = NULL;
1699 CPUSET_DEL(mmu_ctxp->mmu_cpuset, cp->cpu_id);
1700 if (--mmu_ctxp->mmu_ncpus == 0) {
1701 mmu_ctxs_tbl[mmu_ctxp->mmu_idx] = NULL;
1702 mutex_exit(&mmu_ctxp->mmu_lock);
1703 sfmmu_ctxdom_free(mmu_ctxp);
1704 return;
1707 mutex_exit(&mmu_ctxp->mmu_lock);
1710 uint_t
1711 sfmmu_ctxdom_nctxs(int idx)
1713 return (mmu_ctxs_tbl[idx]->mmu_nctxs);
1716 #ifdef sun4v
1718 * sfmmu_ctxdoms_* is an interface provided to help keep context domains
 1719  * consistent after suspend/resume on a system that can resume on different
 1720  * hardware than it was suspended on.
1722 * sfmmu_ctxdom_lock(void) locks all context domains and prevents new contexts
1723 * from being allocated. It acquires all hat_locks, which blocks most access to
1724 * context data, except for a few cases that are handled separately or are
1725 * harmless. It wraps each domain to increment gnum and invalidate on-CPU
1726 * contexts, and forces cnum to its max. As a result of this call all user
1727 * threads that are running on CPUs trap and try to perform wrap around but
1728 * can't because hat_locks are taken. Threads that were not on CPUs but started
1729 * by the scheduler go to sfmmu_alloc_ctx() to acquire a context without checking
1730 * hat_lock, but fail, because cnum == nctxs, and therefore also trap and block
1731 * on hat_lock trying to wrap. sfmmu_ctxdoms_lock() must be called before CPUs
1732 * are paused, else it could deadlock acquiring locks held by paused CPUs.
1734 * sfmmu_ctxdoms_remove() removes context domains from every CPU and records
1735 * the CPUs that had them. It must be called after CPUs have been paused. This
1736 * ensures that no threads are in sfmmu_alloc_ctx() accessing domain data,
1737 * because pause_cpus sends a mondo interrupt to every CPU, and sfmmu_alloc_ctx
1738 * runs with interrupts disabled. When CPUs are later resumed, they may enter
1739 * sfmmu_alloc_ctx, but it will check for CPU_MMU_CTXP == NULL and immediately
1740 * return failure. Or, they will be blocked trying to acquire hat_lock. Thus
1741 * after sfmmu_ctxdoms_remove returns, we are guaranteed that no one is
1742 * accessing the old context domains.
1744 * sfmmu_ctxdoms_update(void) frees space used by old context domains and
1745 * allocates new context domains based on hardware layout. It initializes
1746 * every CPU that had a context domain before migration to have one again.
1747 * sfmmu_ctxdoms_update must be called after CPUs are resumed, else it
1748 * could deadlock acquiring locks held by paused CPUs.
1750 * sfmmu_ctxdoms_unlock(void) releases all hat_locks after which user threads
1751 * acquire new context ids and continue execution.
1753 * Therefore, the functions should be called in the following order:
1754 * suspend_routine()
1755 * sfmmu_ctxdoms_lock()
1756 * pause_cpus()
1757 * suspend()
1758 * if (suspend failed)
1759 * sfmmu_ctxdoms_unlock()
1760 * ...
1761 * sfmmu_ctxdoms_remove()
1762 * resume_cpus()
1763 * sfmmu_ctxdoms_update()
1764 * sfmmu_ctxdoms_unlock()
1766 static cpuset_t sfmmu_ctxdoms_pset;
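/*
 * A condensed sketch of the ordering above in C form.  suspend_routine(),
 * suspend(), pause_cpus() and resume_cpus() stand in for the platform's
 * suspend path and are placeholders here; only the sfmmu_ctxdoms_*
 * calls are implemented in this file.
 *
 *	sfmmu_ctxdoms_lock();
 *	pause_cpus();
 *	if (suspend() != 0) {
 *		sfmmu_ctxdoms_unlock();
 *		... (remaining failure handling elided, as in the
 *		outline above)
 *	}
 *	sfmmu_ctxdoms_remove();
 *	resume_cpus();
 *	sfmmu_ctxdoms_update();
 *	sfmmu_ctxdoms_unlock();
 */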
1768 void
1769 sfmmu_ctxdoms_remove()
1771 processorid_t id;
1772 cpu_t *cp;
1775 * Record the CPUs that have domains in sfmmu_ctxdoms_pset, so they can
1776 * be restored post-migration. A CPU may be powered off and not have a
1777 * domain, for example.
1779 CPUSET_ZERO(sfmmu_ctxdoms_pset);
1781 for (id = 0; id < NCPU; id++) {
1782 if ((cp = cpu[id]) != NULL && CPU_MMU_CTXP(cp) != NULL) {
1783 CPUSET_ADD(sfmmu_ctxdoms_pset, id);
1784 CPU_MMU_CTXP(cp) = NULL;
1789 void
1790 sfmmu_ctxdoms_lock(void)
1792 int idx;
1793 mmu_ctx_t *mmu_ctxp;
1795 sfmmu_hat_lock_all();
1798 * At this point, no thread can be in sfmmu_ctx_wrap_around, because
1799 * hat_lock is always taken before calling it.
1801 * For each domain, set mmu_cnum to max so no more contexts can be
1802 * allocated, and wrap to flush on-CPU contexts and force threads to
1803 * acquire a new context when we later drop hat_lock after migration.
1804 * Setting mmu_cnum may race with sfmmu_alloc_ctx which also sets cnum,
1805 * but the latter uses CAS and will miscompare and not overwrite it.
1807 kpreempt_disable(); /* required by sfmmu_ctx_wrap_around */
1808 for (idx = 0; idx < max_mmu_ctxdoms; idx++) {
1809 if ((mmu_ctxp = mmu_ctxs_tbl[idx]) != NULL) {
1810 mutex_enter(&mmu_ctxp->mmu_lock);
1811 mmu_ctxp->mmu_cnum = mmu_ctxp->mmu_nctxs;
1812 /* make sure updated cnum visible */
1813 membar_enter();
1814 mutex_exit(&mmu_ctxp->mmu_lock);
1815 sfmmu_ctx_wrap_around(mmu_ctxp, B_FALSE);
1818 kpreempt_enable();
1821 void
1822 sfmmu_ctxdoms_unlock(void)
1824 sfmmu_hat_unlock_all();
1827 void
1828 sfmmu_ctxdoms_update(void)
1830 processorid_t id;
1831 cpu_t *cp;
1832 uint_t idx;
1833 mmu_ctx_t *mmu_ctxp;
1836 * Free all context domains. As a side effect, this increases
1837 * mmu_saved_gnum to the maximum gnum over all domains, which is used to
1838 * init gnum in the new domains, which therefore will be larger than the
1839 * sfmmu gnum for any process, guaranteeing that every process will see
1840 * a new generation and allocate a new context regardless of what new
1841 * domain it runs in.
1843 mutex_enter(&cpu_lock);
1845 for (idx = 0; idx < max_mmu_ctxdoms; idx++) {
1846 if (mmu_ctxs_tbl[idx] != NULL) {
1847 mmu_ctxp = mmu_ctxs_tbl[idx];
1848 mmu_ctxs_tbl[idx] = NULL;
1849 sfmmu_ctxdom_free(mmu_ctxp);
1853 for (id = 0; id < NCPU; id++) {
1854 if (CPU_IN_SET(sfmmu_ctxdoms_pset, id) &&
1855 (cp = cpu[id]) != NULL)
1856 sfmmu_cpu_init(cp);
1858 mutex_exit(&cpu_lock);
1860 #endif
1863 * hat_setup() makes an address space context the current active one.
1864 * In sfmmu this translates to setting the secondary context register to
1865 * the corresponding context number.
1867 void
1868 hat_setup(struct hat *sfmmup, int allocflag)
1870 hatlock_t *hatlockp;
1872 /* Init needs some special treatment. */
1873 if (allocflag == HAT_INIT) {
1875 * Make sure that we have
1876 * 1. a TSB
1877 * 2. a valid ctx that doesn't get stolen after this point.
1879 hatlockp = sfmmu_hat_enter(sfmmup);
1882 * Swap in the TSB. hat_init() allocates tsbinfos without
1883 * TSBs, but we need one for init, since the kernel does some
1884 * special things to set up its stack and needs the TSB to
1885 * resolve page faults.
1887 sfmmu_tsb_swapin(sfmmup, hatlockp);
1889 sfmmu_get_ctx(sfmmup);
1891 sfmmu_hat_exit(hatlockp);
1892 } else {
1893 ASSERT(allocflag == HAT_ALLOC);
1895 hatlockp = sfmmu_hat_enter(sfmmup);
1896 kpreempt_disable();
1898 CPUSET_ADD(sfmmup->sfmmu_cpusran, CPU->cpu_id);
1900 * sfmmu_setctx_sec takes <pgsz|cnum> as a parameter; the
1901 * pagesize bits don't matter in this case since we are passing
1902 * INVALID_CONTEXT to it.
1903 * Compatibility Note: hw takes care of MMU_SCONTEXT1
1905 sfmmu_setctx_sec(INVALID_CONTEXT);
1906 sfmmu_clear_utsbinfo();
1908 kpreempt_enable();
1909 sfmmu_hat_exit(hatlockp);
1914 * Free all the translation resources for the specified address space.
1915 * Called from as_free when an address space is being destroyed.
1917 void
1918 hat_free_start(struct hat *sfmmup)
1920 ASSERT(AS_WRITE_HELD(sfmmup->sfmmu_as));
1921 ASSERT(sfmmup != ksfmmup);
1922 ASSERT(sfmmup->sfmmu_xhat_provider == NULL);
1924 sfmmup->sfmmu_free = 1;
1925 if (sfmmup->sfmmu_scdp != NULL) {
1926 sfmmu_leave_scd(sfmmup, 0);
1929 ASSERT(sfmmup->sfmmu_scdp == NULL);
1932 void
1933 hat_free_end(struct hat *sfmmup)
1935 int i;
1937 ASSERT(sfmmup->sfmmu_xhat_provider == NULL);
1938 ASSERT(sfmmup->sfmmu_free == 1);
1939 ASSERT(sfmmup->sfmmu_ttecnt[TTE8K] == 0);
1940 ASSERT(sfmmup->sfmmu_ttecnt[TTE64K] == 0);
1941 ASSERT(sfmmup->sfmmu_ttecnt[TTE512K] == 0);
1942 ASSERT(sfmmup->sfmmu_ttecnt[TTE4M] == 0);
1943 ASSERT(sfmmup->sfmmu_ttecnt[TTE32M] == 0);
1944 ASSERT(sfmmup->sfmmu_ttecnt[TTE256M] == 0);
1946 if (sfmmup->sfmmu_rmstat) {
1947 hat_freestat(sfmmup->sfmmu_as, NULL);
1950 while (sfmmup->sfmmu_tsb != NULL) {
1951 struct tsb_info *next = sfmmup->sfmmu_tsb->tsb_next;
1952 sfmmu_tsbinfo_free(sfmmup->sfmmu_tsb);
1953 sfmmup->sfmmu_tsb = next;
1956 if (sfmmup->sfmmu_srdp != NULL) {
1957 sfmmu_leave_srd(sfmmup);
1958 ASSERT(sfmmup->sfmmu_srdp == NULL);
1959 for (i = 0; i < SFMMU_L1_HMERLINKS; i++) {
1960 if (sfmmup->sfmmu_hmeregion_links[i] != NULL) {
1961 kmem_free(sfmmup->sfmmu_hmeregion_links[i],
1962 SFMMU_L2_HMERLINKS_SIZE);
1963 sfmmup->sfmmu_hmeregion_links[i] = NULL;
1967 sfmmu_free_sfmmu(sfmmup);
1969 #ifdef DEBUG
1970 for (i = 0; i < SFMMU_L1_HMERLINKS; i++) {
1971 ASSERT(sfmmup->sfmmu_hmeregion_links[i] == NULL);
1973 #endif
1975 kmem_cache_free(sfmmuid_cache, sfmmup);
1979 * Set up any translation structures, for the specified address space,
1980 * that are needed or preferred when the process is being swapped in.
1982 /* ARGSUSED */
1983 void
1984 hat_swapin(struct hat *hat)
1986 ASSERT(hat->sfmmu_xhat_provider == NULL);
1990 * Free all of the translation resources, for the specified address space,
1991 * that can be freed while the process is swapped out. Called from as_swapout.
1992 * Also, free up the ctx that this process was using.
1994 void
1995 hat_swapout(struct hat *sfmmup)
1997 struct hmehash_bucket *hmebp;
1998 struct hme_blk *hmeblkp;
1999 struct hme_blk *pr_hblk = NULL;
2000 struct hme_blk *nx_hblk;
2001 int i;
2002 struct hme_blk *list = NULL;
2003 hatlock_t *hatlockp;
2004 struct tsb_info *tsbinfop;
2005 struct free_tsb {
2006 struct free_tsb *next;
2007 struct tsb_info *tsbinfop;
2008 }; /* free list of TSBs */
2009 struct free_tsb *freelist, *last, *next;
2011 ASSERT(sfmmup->sfmmu_xhat_provider == NULL);
2012 SFMMU_STAT(sf_swapout);
2015 * There is no way to go from an as to all its translations in sfmmu.
2016 * Here is one of the times when we take the big hit and traverse
2017 * the hash looking for hme_blks to free up. Not only do we free up
2018 * this as's hme_blks but all those that are free. We are obviously
2019 * swapping because we need memory, so let's free up as much
2020 * as we can.
2022 * Note that we don't flush TLB/TSB here -- it's not necessary
2023 * because:
2024 * 1) we free the ctx we're using and throw away the TSB(s);
2025 * 2) processes aren't runnable while being swapped out.
2027 ASSERT(sfmmup != KHATID);
2028 for (i = 0; i <= UHMEHASH_SZ; i++) {
2029 hmebp = &uhme_hash[i];
2030 SFMMU_HASH_LOCK(hmebp);
2031 hmeblkp = hmebp->hmeblkp;
2032 pr_hblk = NULL;
2033 while (hmeblkp) {
2035 ASSERT(!hmeblkp->hblk_xhat_bit);
2037 if ((hmeblkp->hblk_tag.htag_id == sfmmup) &&
2038 !hmeblkp->hblk_shw_bit && !hmeblkp->hblk_lckcnt) {
2039 ASSERT(!hmeblkp->hblk_shared);
2040 (void) sfmmu_hblk_unload(sfmmup, hmeblkp,
2041 (caddr_t)get_hblk_base(hmeblkp),
2042 get_hblk_endaddr(hmeblkp),
2043 NULL, HAT_UNLOAD);
2045 nx_hblk = hmeblkp->hblk_next;
2046 if (!hmeblkp->hblk_vcnt && !hmeblkp->hblk_hmecnt) {
2047 ASSERT(!hmeblkp->hblk_lckcnt);
2048 sfmmu_hblk_hash_rm(hmebp, hmeblkp, pr_hblk,
2049 &list, 0);
2050 } else {
2051 pr_hblk = hmeblkp;
2053 hmeblkp = nx_hblk;
2055 SFMMU_HASH_UNLOCK(hmebp);
2058 sfmmu_hblks_list_purge(&list, 0);
2061 * Now free up the ctx so that others can reuse it.
2063 hatlockp = sfmmu_hat_enter(sfmmup);
2065 sfmmu_invalidate_ctx(sfmmup);
2068 * Free TSBs, but not tsbinfos, and set SWAPPED flag.
2069 * If TSBs were never swapped in, just return.
2070 * This implies that we don't support partial swapping
2071 * of TSBs -- either all are swapped out, or none are.
2073 * We must hold the HAT lock here to prevent racing with another
2074 * thread trying to unmap TTEs from the TSB or running the post-
2075 * relocator after relocating the TSB's memory. Unfortunately, we
2076 * can't free memory while holding the HAT lock or we could
2077 * deadlock, so we build a list of TSBs to be freed after marking
2078 * the tsbinfos as swapped out and free them after dropping the
2079 * lock.
2081 if (SFMMU_FLAGS_ISSET(sfmmup, HAT_SWAPPED)) {
2082 sfmmu_hat_exit(hatlockp);
2083 return;
2086 SFMMU_FLAGS_SET(sfmmup, HAT_SWAPPED);
2087 last = freelist = NULL;
2088 for (tsbinfop = sfmmup->sfmmu_tsb; tsbinfop != NULL;
2089 tsbinfop = tsbinfop->tsb_next) {
2090 ASSERT((tsbinfop->tsb_flags & TSB_SWAPPED) == 0);
2093 * Cast the TSB into a struct free_tsb and put it on the free
2094 * list.
2096 if (freelist == NULL) {
2097 last = freelist = (struct free_tsb *)tsbinfop->tsb_va;
2098 } else {
2099 last->next = (struct free_tsb *)tsbinfop->tsb_va;
2100 last = last->next;
2102 last->next = NULL;
2103 last->tsbinfop = tsbinfop;
2104 tsbinfop->tsb_flags |= TSB_SWAPPED;
2106 * Zero out the TTE to clear the valid bit.
2107 * Note we can't use a value like 0xbad because we want to
2108 * ensure diagnostic bits are NEVER set on TTEs that might
2109 * be loaded. The intent is to catch any invalid access
2110 * to the swapped TSB, such as a thread running with a valid
2111 * context without first calling sfmmu_tsb_swapin() to
2112 * allocate TSB memory.
2114 tsbinfop->tsb_tte.ll = 0;
2117 /* Now we can drop the lock and free the TSB memory. */
2118 sfmmu_hat_exit(hatlockp);
2119 for (; freelist != NULL; freelist = next) {
2120 next = freelist->next;
2121 sfmmu_tsb_free(freelist->tsbinfop);
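/*
 * The swapout path above defers the actual frees by threading a list
 * through the memory being released: each TSB's own virtual address
 * doubles as a "struct free_tsb" node while the HAT lock is held, and
 * sfmmu_tsb_free() runs only after the lock is dropped.  A minimal
 * sketch of the same pattern, with hypothetical names (defer_node,
 * really_free, buf, cookie):
 *
 *	struct defer_node {
 *		struct defer_node *next;
 *		void *cookie;
 *	};
 *
 *	With the lock held, push each buffer onto the list by reusing
 *	the buffer itself as the list node:
 *
 *		struct defer_node *dn = (struct defer_node *)buf;
 *		dn->next = head;
 *		dn->cookie = cookie;
 *		head = dn;
 *
 *	After dropping the lock, walk the list and free for real:
 *
 *		while (head != NULL) {
 *			struct defer_node *next = head->next;
 *			really_free(head->cookie);
 *			head = next;
 *		}
 */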
2126 * Duplicate the translations of an as into another as (newhat)
2128 /* ARGSUSED */
2130 hat_dup(struct hat *hat, struct hat *newhat, caddr_t addr, size_t len,
2131 uint_t flag)
2133 sf_srd_t *srdp;
2134 sf_scd_t *scdp;
2135 int i;
2136 extern uint_t get_color_start(struct as *);
2138 ASSERT(hat->sfmmu_xhat_provider == NULL);
2139 ASSERT((flag == 0) || (flag == HAT_DUP_ALL) || (flag == HAT_DUP_COW) ||
2140 (flag == HAT_DUP_SRD));
2141 ASSERT(hat != ksfmmup);
2142 ASSERT(newhat != ksfmmup);
2143 ASSERT(flag != HAT_DUP_ALL || hat->sfmmu_srdp == newhat->sfmmu_srdp);
2145 if (flag == HAT_DUP_COW) {
2146 panic("hat_dup: HAT_DUP_COW not supported");
2149 if (flag == HAT_DUP_SRD && ((srdp = hat->sfmmu_srdp) != NULL)) {
2150 ASSERT(srdp->srd_evp != NULL);
2151 VN_HOLD(srdp->srd_evp);
2152 ASSERT(srdp->srd_refcnt > 0);
2153 newhat->sfmmu_srdp = srdp;
2154 atomic_inc_32((volatile uint_t *)&srdp->srd_refcnt);
2158 * The HAT_DUP_ALL flag is used after as duplication is done.
2160 if (flag == HAT_DUP_ALL && ((srdp = newhat->sfmmu_srdp) != NULL)) {
2161 ASSERT(newhat->sfmmu_srdp->srd_refcnt >= 2);
2162 newhat->sfmmu_rtteflags = hat->sfmmu_rtteflags;
2163 if (hat->sfmmu_flags & HAT_4MTEXT_FLAG) {
2164 newhat->sfmmu_flags |= HAT_4MTEXT_FLAG;
2167 /* check if we need to join the scd */
2168 if ((scdp = hat->sfmmu_scdp) != NULL &&
2169 newhat->sfmmu_scdp != scdp) {
2170 int ret;
2171 SF_RGNMAP_IS_SUBSET(&newhat->sfmmu_region_map,
2172 &scdp->scd_region_map, ret);
2173 ASSERT(ret);
2174 sfmmu_join_scd(scdp, newhat);
2175 ASSERT(newhat->sfmmu_scdp == scdp &&
2176 scdp->scd_refcnt >= 2);
2177 for (i = 0; i < max_mmu_page_sizes; i++) {
2178 newhat->sfmmu_ismttecnt[i] =
2179 hat->sfmmu_ismttecnt[i];
2180 newhat->sfmmu_scdismttecnt[i] =
2181 hat->sfmmu_scdismttecnt[i];
2185 sfmmu_check_page_sizes(newhat, 1);
2188 if (flag == HAT_DUP_ALL && consistent_coloring == 0 &&
2189 update_proc_pgcolorbase_after_fork != 0) {
2190 hat->sfmmu_clrbin = get_color_start(hat->sfmmu_as);
2192 return (0);
2195 void
2196 hat_memload(struct hat *hat, caddr_t addr, struct page *pp,
2197 uint_t attr, uint_t flags)
2199 hat_do_memload(hat, addr, pp, attr, flags,
2200 SFMMU_INVALID_SHMERID);
2203 void
2204 hat_memload_region(struct hat *hat, caddr_t addr, struct page *pp,
2205 uint_t attr, uint_t flags, hat_region_cookie_t rcookie)
2207 uint_t rid;
2208 if (rcookie == HAT_INVALID_REGION_COOKIE ||
2209 hat->sfmmu_xhat_provider != NULL) {
2210 hat_do_memload(hat, addr, pp, attr, flags,
2211 SFMMU_INVALID_SHMERID);
2212 return;
2214 rid = (uint_t)((uint64_t)rcookie);
2215 ASSERT(rid < SFMMU_MAX_HME_REGIONS);
2216 hat_do_memload(hat, addr, pp, attr, flags, rid);
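/*
 * hat_memload() and hat_memload_region() above are the entry points a
 * segment driver uses to establish a translation for a single locked
 * page; both funnel into hat_do_memload() below.  A minimal caller
 * sketch, where as, addr and pp are placeholders for the caller's
 * address space, mapping address and locked page:
 *
 *	hat_memload(as->a_hat, addr, pp, PROT_READ | PROT_WRITE, HAT_LOAD);
 */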
2220 * Set up addr to map to page pp with protection prot.
2221 * As an optimization we also load the TSB with the
2222 * corresponding tte, but it is no big deal if the tte gets kicked out.
2224 static void
2225 hat_do_memload(struct hat *hat, caddr_t addr, struct page *pp,
2226 uint_t attr, uint_t flags, uint_t rid)
2228 tte_t tte;
2231 ASSERT(hat != NULL);
2232 ASSERT(PAGE_LOCKED(pp));
2233 ASSERT(!((uintptr_t)addr & MMU_PAGEOFFSET));
2234 ASSERT(!(flags & ~SFMMU_LOAD_ALLFLAG));
2235 ASSERT(!(attr & ~SFMMU_LOAD_ALLATTR));
2236 SFMMU_VALIDATE_HMERID(hat, rid, addr, MMU_PAGESIZE);
2238 if (PP_ISFREE(pp)) {
2239 panic("hat_memload: loading a mapping to free page %p",
2240 (void *)pp);
2243 if (hat->sfmmu_xhat_provider) {
2244 /* no regions for xhats */
2245 ASSERT(!SFMMU_IS_SHMERID_VALID(rid));
2246 XHAT_MEMLOAD(hat, addr, pp, attr, flags);
2247 return;
2250 ASSERT((hat == ksfmmup) || AS_LOCK_HELD(hat->sfmmu_as));
2252 if (flags & ~SFMMU_LOAD_ALLFLAG)
2253 cmn_err(CE_NOTE, "hat_memload: unsupported flags %d",
2254 flags & ~SFMMU_LOAD_ALLFLAG);
2256 if (hat->sfmmu_rmstat)
2257 hat_resvstat(MMU_PAGESIZE, hat->sfmmu_as, addr);
2259 #if defined(SF_ERRATA_57)
2260 if ((hat != ksfmmup) && AS_TYPE_64BIT(hat->sfmmu_as) &&
2261 (addr < errata57_limit) && (attr & PROT_EXEC) &&
2262 !(flags & HAT_LOAD_SHARE)) {
2263 cmn_err(CE_WARN, "hat_memload: illegal attempt to make user "
2264 "page executable");
2265 attr &= ~PROT_EXEC;
2267 #endif
2269 sfmmu_memtte(&tte, pp->p_pagenum, attr, TTE8K);
2270 (void) sfmmu_tteload_array(hat, &tte, addr, &pp, flags, rid);
2273 * Check TSB and TLB page sizes.
2275 if ((flags & HAT_LOAD_SHARE) == 0) {
2276 sfmmu_check_page_sizes(hat, 1);
2281 * hat_devload can be called to map real memory (e.g.
2282 * /dev/kmem) and even though hat_devload will determine the pfn is
2283 * for memory, it will be unable to get a shared lock on the
2284 * page (because someone else has it exclusively) and will
2285 * pass pp = NULL. If tteload doesn't get a non-NULL
2286 * page pointer it can't cache memory.
2288 void
2289 hat_devload(struct hat *hat, caddr_t addr, size_t len, pfn_t pfn,
2290 uint_t attr, int flags)
2292 tte_t tte;
2293 struct page *pp = NULL;
2294 int use_lgpg = 0;
2296 ASSERT(hat != NULL);
2298 if (hat->sfmmu_xhat_provider) {
2299 XHAT_DEVLOAD(hat, addr, len, pfn, attr, flags);
2300 return;
2303 ASSERT(!(flags & ~SFMMU_LOAD_ALLFLAG));
2304 ASSERT(!(attr & ~SFMMU_LOAD_ALLATTR));
2305 ASSERT((hat == ksfmmup) || AS_LOCK_HELD(hat->sfmmu_as));
2306 if (len == 0)
2307 panic("hat_devload: zero len");
2308 if (flags & ~SFMMU_LOAD_ALLFLAG)
2309 cmn_err(CE_NOTE, "hat_devload: unsupported flags %d",
2310 flags & ~SFMMU_LOAD_ALLFLAG);
2312 #if defined(SF_ERRATA_57)
2313 if ((hat != ksfmmup) && AS_TYPE_64BIT(hat->sfmmu_as) &&
2314 (addr < errata57_limit) && (attr & PROT_EXEC) &&
2315 !(flags & HAT_LOAD_SHARE)) {
2316 cmn_err(CE_WARN, "hat_devload: illegal attempt to make user "
2317 "page executable");
2318 attr &= ~PROT_EXEC;
2320 #endif
2323 * If it's a memory page, find its pp.
2325 if (!(flags & HAT_LOAD_NOCONSIST) && pf_is_memory(pfn)) {
2326 pp = page_numtopp_nolock(pfn);
2327 if (pp == NULL) {
2328 flags |= HAT_LOAD_NOCONSIST;
2329 } else {
2330 if (PP_ISFREE(pp)) {
2331 panic("hat_devload: loading "
2332 "a mapping to free page %p",
2333 (void *)pp);
2335 if (!PAGE_LOCKED(pp) && !PP_ISNORELOC(pp)) {
2336 panic("hat_devload: loading a mapping "
2337 "to unlocked relocatable page %p",
2338 (void *)pp);
2340 ASSERT(len == MMU_PAGESIZE);
2344 if (hat->sfmmu_rmstat)
2345 hat_resvstat(len, hat->sfmmu_as, addr);
2347 if (flags & HAT_LOAD_NOCONSIST) {
2348 attr |= SFMMU_UNCACHEVTTE;
2349 use_lgpg = 1;
2351 if (!pf_is_memory(pfn)) {
2352 attr |= SFMMU_UNCACHEPTTE | HAT_NOSYNC;
2353 use_lgpg = 1;
2354 switch (attr & HAT_ORDER_MASK) {
2355 case HAT_STRICTORDER:
2356 case HAT_UNORDERED_OK:
2358 * we set the side-effect bit for all
2359 * non-memory mappings unless merging is ok
2361 attr |= SFMMU_SIDEFFECT;
2362 break;
2363 case HAT_MERGING_OK:
2364 case HAT_LOADCACHING_OK:
2365 case HAT_STORECACHING_OK:
2366 break;
2367 default:
2368 panic("hat_devload: bad attr");
2369 break;
2372 while (len) {
2373 if (!use_lgpg) {
2374 sfmmu_memtte(&tte, pfn, attr, TTE8K);
2375 (void) sfmmu_tteload_array(hat, &tte, addr, &pp,
2376 flags, SFMMU_INVALID_SHMERID);
2377 len -= MMU_PAGESIZE;
2378 addr += MMU_PAGESIZE;
2379 pfn++;
2380 continue;
2383 * Try to use large pages; check va/pa alignments.
2384 * Note that 32M/256M page sizes are not (yet) supported.
2386 if ((len >= MMU_PAGESIZE4M) &&
2387 !((uintptr_t)addr & MMU_PAGEOFFSET4M) &&
2388 !(disable_large_pages & (1 << TTE4M)) &&
2389 !(mmu_ptob(pfn) & MMU_PAGEOFFSET4M)) {
2390 sfmmu_memtte(&tte, pfn, attr, TTE4M);
2391 (void) sfmmu_tteload_array(hat, &tte, addr, &pp,
2392 flags, SFMMU_INVALID_SHMERID);
2393 len -= MMU_PAGESIZE4M;
2394 addr += MMU_PAGESIZE4M;
2395 pfn += MMU_PAGESIZE4M / MMU_PAGESIZE;
2396 } else if ((len >= MMU_PAGESIZE512K) &&
2397 !((uintptr_t)addr & MMU_PAGEOFFSET512K) &&
2398 !(disable_large_pages & (1 << TTE512K)) &&
2399 !(mmu_ptob(pfn) & MMU_PAGEOFFSET512K)) {
2400 sfmmu_memtte(&tte, pfn, attr, TTE512K);
2401 (void) sfmmu_tteload_array(hat, &tte, addr, &pp,
2402 flags, SFMMU_INVALID_SHMERID);
2403 len -= MMU_PAGESIZE512K;
2404 addr += MMU_PAGESIZE512K;
2405 pfn += MMU_PAGESIZE512K / MMU_PAGESIZE;
2406 } else if ((len >= MMU_PAGESIZE64K) &&
2407 !((uintptr_t)addr & MMU_PAGEOFFSET64K) &&
2408 !(disable_large_pages & (1 << TTE64K)) &&
2409 !(mmu_ptob(pfn) & MMU_PAGEOFFSET64K)) {
2410 sfmmu_memtte(&tte, pfn, attr, TTE64K);
2411 (void) sfmmu_tteload_array(hat, &tte, addr, &pp,
2412 flags, SFMMU_INVALID_SHMERID);
2413 len -= MMU_PAGESIZE64K;
2414 addr += MMU_PAGESIZE64K;
2415 pfn += MMU_PAGESIZE64K / MMU_PAGESIZE;
2416 } else {
2417 sfmmu_memtte(&tte, pfn, attr, TTE8K);
2418 (void) sfmmu_tteload_array(hat, &tte, addr, &pp,
2419 flags, SFMMU_INVALID_SHMERID);
2420 len -= MMU_PAGESIZE;
2421 addr += MMU_PAGESIZE;
2422 pfn++;
2427 * Check TSB and TLB page sizes.
2429 if ((flags & HAT_LOAD_SHARE) == 0) {
2430 sfmmu_check_page_sizes(hat, 1);
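/*
 * The loop above prefers 4M, then 512K, then 64K translations whenever
 * the remaining length, the virtual address and the physical address
 * are all aligned to that page size and the size is not disabled.  A
 * sketch of just that selection test, using the same macros; the
 * helper name is hypothetical, and hat_devload() performs the
 * equivalent tests inline rather than calling such a function:
 *
 *	static int
 *	devload_pick_ttesz(caddr_t addr, pfn_t pfn, size_t len)
 *	{
 *		if (len >= MMU_PAGESIZE4M &&
 *		    !((uintptr_t)addr & MMU_PAGEOFFSET4M) &&
 *		    !(disable_large_pages & (1 << TTE4M)) &&
 *		    !(mmu_ptob(pfn) & MMU_PAGEOFFSET4M))
 *			return (TTE4M);
 *		if (len >= MMU_PAGESIZE512K &&
 *		    !((uintptr_t)addr & MMU_PAGEOFFSET512K) &&
 *		    !(disable_large_pages & (1 << TTE512K)) &&
 *		    !(mmu_ptob(pfn) & MMU_PAGEOFFSET512K))
 *			return (TTE512K);
 *		if (len >= MMU_PAGESIZE64K &&
 *		    !((uintptr_t)addr & MMU_PAGEOFFSET64K) &&
 *		    !(disable_large_pages & (1 << TTE64K)) &&
 *		    !(mmu_ptob(pfn) & MMU_PAGEOFFSET64K))
 *			return (TTE64K);
 *		return (TTE8K);
 *	}
 */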
2434 void
2435 hat_memload_array(struct hat *hat, caddr_t addr, size_t len,
2436 struct page **pps, uint_t attr, uint_t flags)
2438 hat_do_memload_array(hat, addr, len, pps, attr, flags,
2439 SFMMU_INVALID_SHMERID);
2442 void
2443 hat_memload_array_region(struct hat *hat, caddr_t addr, size_t len,
2444 struct page **pps, uint_t attr, uint_t flags,
2445 hat_region_cookie_t rcookie)
2447 uint_t rid;
2448 if (rcookie == HAT_INVALID_REGION_COOKIE ||
2449 hat->sfmmu_xhat_provider != NULL) {
2450 hat_do_memload_array(hat, addr, len, pps, attr, flags,
2451 SFMMU_INVALID_SHMERID);
2452 return;
2454 rid = (uint_t)((uint64_t)rcookie);
2455 ASSERT(rid < SFMMU_MAX_HME_REGIONS);
2456 hat_do_memload_array(hat, addr, len, pps, attr, flags, rid);
2460 * Map the largest extent possible out of the page array. The array may NOT
2461 * be in order. The largest possible mapping a page can have
2462 * is specified in the p_szc field. The p_szc field
2463 * cannot change as long as there are any mappings (large or small)
2464 * to any of the pages that make up the large page. (ie. any
2465 * promotion/demotion of page size is not up to the hat but up to
2466 * the page free list manager). The array
2467 * should consist of properly aligned contiguous pages that are
2468 * part of a big page for a large mapping to be created.
2470 static void
2471 hat_do_memload_array(struct hat *hat, caddr_t addr, size_t len,
2472 struct page **pps, uint_t attr, uint_t flags, uint_t rid)
2474 int ttesz;
2475 size_t mapsz;
2476 pgcnt_t numpg, npgs;
2477 tte_t tte;
2478 page_t *pp;
2479 uint_t large_pages_disable;
2481 ASSERT(!((uintptr_t)addr & MMU_PAGEOFFSET));
2482 SFMMU_VALIDATE_HMERID(hat, rid, addr, len);
2484 if (hat->sfmmu_xhat_provider) {
2485 ASSERT(!SFMMU_IS_SHMERID_VALID(rid));
2486 XHAT_MEMLOAD_ARRAY(hat, addr, len, pps, attr, flags);
2487 return;
2490 if (hat->sfmmu_rmstat)
2491 hat_resvstat(len, hat->sfmmu_as, addr);
2493 #if defined(SF_ERRATA_57)
2494 if ((hat != ksfmmup) && AS_TYPE_64BIT(hat->sfmmu_as) &&
2495 (addr < errata57_limit) && (attr & PROT_EXEC) &&
2496 !(flags & HAT_LOAD_SHARE)) {
2497 cmn_err(CE_WARN, "hat_memload_array: illegal attempt to make "
2498 "user page executable");
2499 attr &= ~PROT_EXEC;
2501 #endif
2503 /* Get number of pages */
2504 npgs = len >> MMU_PAGESHIFT;
2506 if (flags & HAT_LOAD_SHARE) {
2507 large_pages_disable = disable_ism_large_pages;
2508 } else {
2509 large_pages_disable = disable_large_pages;
2512 if (npgs < NHMENTS || large_pages_disable == LARGE_PAGES_OFF) {
2513 sfmmu_memload_batchsmall(hat, addr, pps, attr, flags, npgs,
2514 rid);
2515 return;
2518 while (npgs >= NHMENTS) {
2519 pp = *pps;
2520 for (ttesz = pp->p_szc; ttesz != TTE8K; ttesz--) {
2522 * Check if this page size is disabled.
2524 if (large_pages_disable & (1 << ttesz))
2525 continue;
2527 numpg = TTEPAGES(ttesz);
2528 mapsz = numpg << MMU_PAGESHIFT;
2529 if ((npgs >= numpg) &&
2530 IS_P2ALIGNED(addr, mapsz) &&
2531 IS_P2ALIGNED(pp->p_pagenum, numpg)) {
2533 * At this point we have enough pages and
2534 * we know the virtual address and the pfn
2535 * are properly aligned. We still need
2536 * to check for physical contiguity but since
2537 * it is very likely that this is the case
2538 * we will assume they are so and undo
2539 * the request if necessary. It would
2540 * be great if we could get a hint flag
2541 * like HAT_CONTIG which would tell us
2542 * the pages are contiguous for sure.
2544 sfmmu_memtte(&tte, (*pps)->p_pagenum,
2545 attr, ttesz);
2546 if (!sfmmu_tteload_array(hat, &tte, addr,
2547 pps, flags, rid)) {
2548 break;
2552 if (ttesz == TTE8K) {
2554 * We were not able to map the array using a large page;
2555 * batch an hmeblk (or a fraction of one) at a time.
2557 numpg = ((uintptr_t)addr >> MMU_PAGESHIFT)
2558 & (NHMENTS-1);
2559 numpg = NHMENTS - numpg;
2560 ASSERT(numpg <= npgs);
2561 mapsz = numpg * MMU_PAGESIZE;
2562 sfmmu_memload_batchsmall(hat, addr, pps, attr, flags,
2563 numpg, rid);
2565 addr += mapsz;
2566 npgs -= numpg;
2567 pps += numpg;
2570 if (npgs) {
2571 sfmmu_memload_batchsmall(hat, addr, pps, attr, flags, npgs,
2572 rid);
2576 * Check TSB and TLB page sizes.
2578 if ((flags & HAT_LOAD_SHARE) == 0) {
2579 sfmmu_check_page_sizes(hat, 1);
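/*
 * When a large page cannot be used, the code above falls back to
 * batching at most one hmeblk's worth of 8K entries at a time; the
 * batch size is the number of 8K slots left before addr crosses the
 * next NHMENTS-entry boundary.  A sketch of that computation (the
 * helper name is hypothetical):
 *
 *	pgcnt_t
 *	slots_to_hmeblk_boundary(caddr_t addr)
 *	{
 *		pgcnt_t slot;
 *
 *		slot = ((uintptr_t)addr >> MMU_PAGESHIFT) & (NHMENTS - 1);
 *		return (NHMENTS - slot);
 *	}
 *
 * For example, with NHMENTS == 8 and an addr that is three 8K pages
 * into an hmeblk (slot 3), five pages are batched before the loop
 * moves on to the next hmeblk.
 */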
2584 * Function tries to batch 8K pages into the same hme blk.
2586 static void
2587 sfmmu_memload_batchsmall(struct hat *hat, caddr_t vaddr, page_t **pps,
2588 uint_t attr, uint_t flags, pgcnt_t npgs, uint_t rid)
2590 tte_t tte;
2591 page_t *pp;
2592 struct hmehash_bucket *hmebp;
2593 struct hme_blk *hmeblkp;
2594 int index;
2596 while (npgs) {
2598 * Acquire the hash bucket.
2600 hmebp = sfmmu_tteload_acquire_hashbucket(hat, vaddr, TTE8K,
2601 rid);
2602 ASSERT(hmebp);
2605 * Find the hment block.
2607 hmeblkp = sfmmu_tteload_find_hmeblk(hat, hmebp, vaddr,
2608 TTE8K, flags, rid);
2609 ASSERT(hmeblkp);
2611 do {
2613 * Make the tte.
2615 pp = *pps;
2616 sfmmu_memtte(&tte, pp->p_pagenum, attr, TTE8K);
2619 * Add the translation.
2621 (void) sfmmu_tteload_addentry(hat, hmeblkp, &tte,
2622 vaddr, pps, flags, rid);
2625 * Go to the next page.
2627 pps++;
2628 npgs--;
2631 * Go to the next address.
2633 vaddr += MMU_PAGESIZE;
2636 * Don't cross over into a different hmeblk.
2638 index = (int)(((uintptr_t)vaddr >> MMU_PAGESHIFT) &
2639 (NHMENTS-1));
2641 } while (index != 0 && npgs != 0);
2644 * Release the hash bucket.
2647 sfmmu_tteload_release_hashbucket(hmebp);
2652 * Construct a tte for a page:
2654 * tte_valid = 1
2655 * tte_size2 = size & TTE_SZ2_BITS (Panther and Olympus-C only)
2656 * tte_size = size
2657 * tte_nfo = attr & HAT_NOFAULT
2658 * tte_ie = attr & HAT_STRUCTURE_LE
2659 * tte_hmenum = hmenum
2660 * tte_pahi = pp->p_pagenum >> TTE_PASHIFT;
2661 * tte_palo = pp->p_pagenum & TTE_PALOMASK;
2662 * tte_ref = 1 (optimization)
2663 * tte_wr_perm = attr & PROT_WRITE;
2664 * tte_no_sync = attr & HAT_NOSYNC
2665 * tte_lock = attr & SFMMU_LOCKTTE
2666 * tte_cp = !(attr & SFMMU_UNCACHEPTTE)
2667 * tte_cv = !(attr & SFMMU_UNCACHEVTTE)
2668 * tte_e = attr & SFMMU_SIDEFFECT
2669 * tte_priv = !(attr & PROT_USER)
2670 * tte_hwwr = if nosync is set and it is writable we set the mod bit (opt)
2671 * tte_glb = 0
2673 void
2674 sfmmu_memtte(tte_t *ttep, pfn_t pfn, uint_t attr, int tte_sz)
2676 ASSERT(!(attr & ~SFMMU_LOAD_ALLATTR));
2678 ttep->tte_inthi = MAKE_TTE_INTHI(pfn, attr, tte_sz, 0 /* hmenum */);
2679 ttep->tte_intlo = MAKE_TTE_INTLO(pfn, attr, tte_sz, 0 /* hmenum */);
2681 if (TTE_IS_NOSYNC(ttep)) {
2682 TTE_SET_REF(ttep);
2683 if (TTE_IS_WRITABLE(ttep)) {
2684 TTE_SET_MOD(ttep);
2687 if (TTE_IS_NFO(ttep) && TTE_IS_EXECUTABLE(ttep)) {
2688 panic("sfmmu_memtte: can't set both NFO and EXEC bits");
2693 * This function will add a translation to the hme_blk and allocate the
2694 * hme_blk if one does not exist.
2695 * If a page structure is specified then it will add the
2696 * corresponding hment to the mapping list.
2697 * It will also update the hmenum field for the tte.
2699 * Currently this function is only used for kernel mappings.
2700 * So pass invalid region to sfmmu_tteload_array().
2702 void
2703 sfmmu_tteload(struct hat *sfmmup, tte_t *ttep, caddr_t vaddr, page_t *pp,
2704 uint_t flags)
2706 ASSERT(sfmmup == ksfmmup);
2707 (void) sfmmu_tteload_array(sfmmup, ttep, vaddr, &pp, flags,
2708 SFMMU_INVALID_SHMERID);
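/*
 * A minimal sketch of the kernel-only path above: build a TTE from a
 * pfn and HAT attributes with sfmmu_memtte(), then install it with
 * sfmmu_tteload().  vaddr, pfn and pp are placeholders; pp is the
 * locked page backing pfn:
 *
 *	tte_t tte;
 *
 *	sfmmu_memtte(&tte, pfn, PROT_READ | PROT_WRITE | HAT_NOSYNC, TTE8K);
 *	sfmmu_tteload(ksfmmup, &tte, vaddr, pp, HAT_LOAD_LOCK);
 */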
2712 * Load (ttep != NULL) or unload (ttep == NULL) one entry in the TSB.
2713 * Assumes that a particular page size may only be resident in one TSB.
2715 static void
2716 sfmmu_mod_tsb(sfmmu_t *sfmmup, caddr_t vaddr, tte_t *ttep, int ttesz)
2718 struct tsb_info *tsbinfop = NULL;
2719 uint64_t tag;
2720 struct tsbe *tsbe_addr;
2721 uint64_t tsb_base;
2722 uint_t tsb_size;
2723 int vpshift = MMU_PAGESHIFT;
2724 int phys = 0;
2726 if (sfmmup == ksfmmup) { /* No support for 32/256M ksfmmu pages */
2727 phys = ktsb_phys;
2728 if (ttesz >= TTE4M) {
2729 #ifndef sun4v
2730 ASSERT((ttesz != TTE32M) && (ttesz != TTE256M));
2731 #endif
2732 tsb_base = (phys)? ktsb4m_pbase : (uint64_t)ktsb4m_base;
2733 tsb_size = ktsb4m_szcode;
2734 } else {
2735 tsb_base = (phys)? ktsb_pbase : (uint64_t)ktsb_base;
2736 tsb_size = ktsb_szcode;
2738 } else {
2739 SFMMU_GET_TSBINFO(tsbinfop, sfmmup, ttesz);
2742 * If there isn't a TSB for this page size, or the TSB is
2743 * swapped out, there is nothing to do. Note that the latter
2744 * case seems impossible but can occur if hat_pageunload()
2745 * is called on an ISM mapping while the process is swapped
2746 * out.
2748 if (tsbinfop == NULL || (tsbinfop->tsb_flags & TSB_SWAPPED))
2749 return;
2752 * If another thread is in the middle of relocating a TSB
2753 * we can't unload the entry so set a flag so that the
2754 * TSB will be flushed before it can be accessed by the
2755 * process.
2757 if ((tsbinfop->tsb_flags & TSB_RELOC_FLAG) != 0) {
2758 if (ttep == NULL)
2759 tsbinfop->tsb_flags |= TSB_FLUSH_NEEDED;
2760 return;
2762 #if defined(UTSB_PHYS)
2763 phys = 1;
2764 tsb_base = (uint64_t)tsbinfop->tsb_pa;
2765 #else
2766 tsb_base = (uint64_t)tsbinfop->tsb_va;
2767 #endif
2768 tsb_size = tsbinfop->tsb_szc;
2770 if (ttesz >= TTE4M)
2771 vpshift = MMU_PAGESHIFT4M;
2773 tsbe_addr = sfmmu_get_tsbe(tsb_base, vaddr, vpshift, tsb_size);
2774 tag = sfmmu_make_tsbtag(vaddr);
2776 if (ttep == NULL) {
2777 sfmmu_unload_tsbe(tsbe_addr, tag, phys);
2778 } else {
2779 if (ttesz >= TTE4M) {
2780 SFMMU_STAT(sf_tsb_load4m);
2781 } else {
2782 SFMMU_STAT(sf_tsb_load8k);
2785 sfmmu_load_tsbe(tsbe_addr, tag, ttep, phys);
2790 * Unmap all entries from [start, end) matching the given page size.
2792 * This function is used primarily to unmap replicated 64K or 512K entries
2793 * from the TSB that are inserted using the base page size TSB pointer, but
2794 * it may also be called to unmap a range of addresses from the TSB.
2796 void
2797 sfmmu_unload_tsb_range(sfmmu_t *sfmmup, caddr_t start, caddr_t end, int ttesz)
2799 struct tsb_info *tsbinfop;
2800 uint64_t tag;
2801 struct tsbe *tsbe_addr;
2802 caddr_t vaddr;
2803 uint64_t tsb_base;
2804 int vpshift, vpgsz;
2805 uint_t tsb_size;
2806 int phys = 0;
2809 * Assumptions:
2810 * If ttesz == 8K, 64K or 512K, we walk through the range 8K
2811 * at a time shooting down any valid entries we encounter.
2813 * If ttesz >= 4M we walk the range 4M at a time shooting
2814 * down any valid mappings we find.
2816 if (sfmmup == ksfmmup) {
2817 phys = ktsb_phys;
2818 if (ttesz >= TTE4M) {
2819 #ifndef sun4v
2820 ASSERT((ttesz != TTE32M) && (ttesz != TTE256M));
2821 #endif
2822 tsb_base = (phys)? ktsb4m_pbase : (uint64_t)ktsb4m_base;
2823 tsb_size = ktsb4m_szcode;
2824 } else {
2825 tsb_base = (phys)? ktsb_pbase : (uint64_t)ktsb_base;
2826 tsb_size = ktsb_szcode;
2828 } else {
2829 SFMMU_GET_TSBINFO(tsbinfop, sfmmup, ttesz);
2832 * If there isn't a TSB for this page size, or the TSB is
2833 * swapped out, there is nothing to do. Note that the latter
2834 * case seems impossible but can occur if hat_pageunload()
2835 * is called on an ISM mapping while the process is swapped
2836 * out.
2838 if (tsbinfop == NULL || (tsbinfop->tsb_flags & TSB_SWAPPED))
2839 return;
2842 * If another thread is in the middle of relocating a TSB
2843 * we can't unload the entry so set a flag so that the
2844 * TSB will be flushed before it can be accessed by the
2845 * process.
2847 if ((tsbinfop->tsb_flags & TSB_RELOC_FLAG) != 0) {
2848 tsbinfop->tsb_flags |= TSB_FLUSH_NEEDED;
2849 return;
2851 #if defined(UTSB_PHYS)
2852 phys = 1;
2853 tsb_base = (uint64_t)tsbinfop->tsb_pa;
2854 #else
2855 tsb_base = (uint64_t)tsbinfop->tsb_va;
2856 #endif
2857 tsb_size = tsbinfop->tsb_szc;
2859 if (ttesz >= TTE4M) {
2860 vpshift = MMU_PAGESHIFT4M;
2861 vpgsz = MMU_PAGESIZE4M;
2862 } else {
2863 vpshift = MMU_PAGESHIFT;
2864 vpgsz = MMU_PAGESIZE;
2867 for (vaddr = start; vaddr < end; vaddr += vpgsz) {
2868 tag = sfmmu_make_tsbtag(vaddr);
2869 tsbe_addr = sfmmu_get_tsbe(tsb_base, vaddr, vpshift, tsb_size);
2870 sfmmu_unload_tsbe(tsbe_addr, tag, phys);
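/*
 * A minimal usage sketch: a 64K translation is entered into the
 * 8K-indexed TSB as replicated entries, so tearing one down means
 * shooting down every 8K-indexed slot it covers (sfmmup and vaddr are
 * placeholders):
 *
 *	sfmmu_unload_tsb_range(sfmmup, vaddr, vaddr + MMU_PAGESIZE64K,
 *	    TTE64K);
 */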
2875 * Select the optimum TSB size given the number of mappings
2876 * that need to be cached.
2878 static int
2879 sfmmu_select_tsb_szc(pgcnt_t pgcnt)
2881 int szc = 0;
2883 #ifdef DEBUG
2884 if (tsb_grow_stress) {
2885 uint32_t randval = (uint32_t)gettick() >> 4;
2886 return (randval % (tsb_max_growsize + 1));
2888 #endif /* DEBUG */
2890 while ((szc < tsb_max_growsize) && (pgcnt > SFMMU_RSS_TSBSIZE(szc)))
2891 szc++;
2892 return (szc);
2896 * This function will add a translation to the hme_blk and allocate the
2897 * hme_blk if one does not exist.
2898 * If a page structure is specified then it will add the
2899 * corresponding hment to the mapping list.
2900 * It will also update the hmenum field for the tte.
2901 * Furthermore, it attempts to create a large page translation
2902 * for <addr,hat> at page array pps. It assumes addr and the first
2903 * pp are correctly aligned. It returns 0 if successful and 1 otherwise.
2905 static int
2906 sfmmu_tteload_array(sfmmu_t *sfmmup, tte_t *ttep, caddr_t vaddr,
2907 page_t **pps, uint_t flags, uint_t rid)
2909 struct hmehash_bucket *hmebp;
2910 struct hme_blk *hmeblkp;
2911 int ret;
2912 uint_t size;
2915 * Get mapping size.
2917 size = TTE_CSZ(ttep);
2918 ASSERT(!((uintptr_t)vaddr & TTE_PAGE_OFFSET(size)));
2921 * Acquire the hash bucket.
2923 hmebp = sfmmu_tteload_acquire_hashbucket(sfmmup, vaddr, size, rid);
2924 ASSERT(hmebp);
2927 * Find the hment block.
2929 hmeblkp = sfmmu_tteload_find_hmeblk(sfmmup, hmebp, vaddr, size, flags,
2930 rid);
2931 ASSERT(hmeblkp);
2934 * Add the translation.
2936 ret = sfmmu_tteload_addentry(sfmmup, hmeblkp, ttep, vaddr, pps, flags,
2937 rid);
2940 * Release the hash bucket.
2942 sfmmu_tteload_release_hashbucket(hmebp);
2944 return (ret);
2948 * Function locks and returns a pointer to the hash bucket for vaddr and size.
2950 static struct hmehash_bucket *
2951 sfmmu_tteload_acquire_hashbucket(sfmmu_t *sfmmup, caddr_t vaddr, int size,
2952 uint_t rid)
2954 struct hmehash_bucket *hmebp;
2955 int hmeshift;
2956 void *htagid = sfmmutohtagid(sfmmup, rid);
2958 ASSERT(htagid != NULL);
2960 hmeshift = HME_HASH_SHIFT(size);
2962 hmebp = HME_HASH_FUNCTION(htagid, vaddr, hmeshift);
2964 SFMMU_HASH_LOCK(hmebp);
2966 return (hmebp);
2970 * Function returns a pointer to an hmeblk in the hash bucket, hmebp. If the
2971 * hmeblk doesn't exist for the [sfmmup, vaddr & size] signature, an hmeblk is
2972 * allocated.
2974 static struct hme_blk *
2975 sfmmu_tteload_find_hmeblk(sfmmu_t *sfmmup, struct hmehash_bucket *hmebp,
2976 caddr_t vaddr, uint_t size, uint_t flags, uint_t rid)
2978 hmeblk_tag hblktag;
2979 int hmeshift;
2980 struct hme_blk *hmeblkp, *pr_hblk, *list = NULL;
2982 SFMMU_VALIDATE_HMERID(sfmmup, rid, vaddr, TTEBYTES(size));
2984 hblktag.htag_id = sfmmutohtagid(sfmmup, rid);
2985 ASSERT(hblktag.htag_id != NULL);
2986 hmeshift = HME_HASH_SHIFT(size);
2987 hblktag.htag_bspage = HME_HASH_BSPAGE(vaddr, hmeshift);
2988 hblktag.htag_rehash = HME_HASH_REHASH(size);
2989 hblktag.htag_rid = rid;
2991 ttearray_realloc:
2993 HME_HASH_SEARCH_PREV(hmebp, hblktag, hmeblkp, pr_hblk, &list);
2996 * We block until hblk_reserve_lock is released; it's held by
2997 * the thread temporarily using hblk_reserve, until hblk_reserve is
2998 * replaced by an hblk from sfmmu8_cache.
3000 if (hmeblkp == (struct hme_blk *)hblk_reserve &&
3001 hblk_reserve_thread != curthread) {
3002 SFMMU_HASH_UNLOCK(hmebp);
3003 mutex_enter(&hblk_reserve_lock);
3004 mutex_exit(&hblk_reserve_lock);
3005 SFMMU_STAT(sf_hblk_reserve_hit);
3006 SFMMU_HASH_LOCK(hmebp);
3007 goto ttearray_realloc;
3010 if (hmeblkp == NULL) {
3011 hmeblkp = sfmmu_hblk_alloc(sfmmup, vaddr, hmebp, size,
3012 hblktag, flags, rid);
3013 ASSERT(!SFMMU_IS_SHMERID_VALID(rid) || hmeblkp->hblk_shared);
3014 ASSERT(SFMMU_IS_SHMERID_VALID(rid) || !hmeblkp->hblk_shared);
3015 } else {
3017 * It is possible for 8k and 64k hblks to collide since they
3018 * have the same rehash value. This is because we
3019 * lazily free hblks and 8K/64K blks could be lingering.
3020 * If we find a size mismatch we free the block and try again.
3022 if (get_hblk_ttesz(hmeblkp) != size) {
3023 ASSERT(!hmeblkp->hblk_vcnt);
3024 ASSERT(!hmeblkp->hblk_hmecnt);
3025 sfmmu_hblk_hash_rm(hmebp, hmeblkp, pr_hblk,
3026 &list, 0);
3027 goto ttearray_realloc;
3029 if (hmeblkp->hblk_shw_bit) {
3031 * if the hblk was previously used as a shadow hblk then
3032 * we will change it to a normal hblk
3034 ASSERT(!hmeblkp->hblk_shared);
3035 if (hmeblkp->hblk_shw_mask) {
3036 sfmmu_shadow_hcleanup(sfmmup, hmeblkp, hmebp);
3037 ASSERT(SFMMU_HASH_LOCK_ISHELD(hmebp));
3038 goto ttearray_realloc;
3039 } else {
3040 hmeblkp->hblk_shw_bit = 0;
3043 SFMMU_STAT(sf_hblk_hit);
3047 * hat_memload() should never call kmem_cache_free() for kernel hmeblks;
3048 * see block comment showing the stacktrace in sfmmu_hblk_alloc();
3049 * set the flag parameter to 1 so that sfmmu_hblks_list_purge() will
3050 * just add these hmeblks to the per-cpu pending queue.
3052 sfmmu_hblks_list_purge(&list, 1);
3054 ASSERT(get_hblk_ttesz(hmeblkp) == size);
3055 ASSERT(!hmeblkp->hblk_shw_bit);
3056 ASSERT(!SFMMU_IS_SHMERID_VALID(rid) || hmeblkp->hblk_shared);
3057 ASSERT(SFMMU_IS_SHMERID_VALID(rid) || !hmeblkp->hblk_shared);
3058 ASSERT(hmeblkp->hblk_tag.htag_rid == rid);
3060 return (hmeblkp);
3064 * Function adds a tte entry into the hmeblk. It returns 0 if successful and 1
3065 * otherwise.
3067 static int
3068 sfmmu_tteload_addentry(sfmmu_t *sfmmup, struct hme_blk *hmeblkp, tte_t *ttep,
3069 caddr_t vaddr, page_t **pps, uint_t flags, uint_t rid)
3071 page_t *pp = *pps;
3072 int hmenum, size, remap;
3073 tte_t tteold, flush_tte;
3074 #ifdef DEBUG
3075 tte_t orig_old;
3076 #endif /* DEBUG */
3077 struct sf_hment *sfhme;
3078 kmutex_t *pml, *pmtx;
3079 hatlock_t *hatlockp;
3080 int myflt;
3083 * remove this panic when we decide to let user virtual address
3084 * space be >= USERLIMIT.
3086 if (!TTE_IS_PRIVILEGED(ttep) && vaddr >= (caddr_t)USERLIMIT)
3087 panic("user addr %p in kernel space", (void *)vaddr);
3088 #if defined(TTE_IS_GLOBAL)
3089 if (TTE_IS_GLOBAL(ttep))
3090 panic("sfmmu_tteload: creating global tte");
3091 #endif
3093 #ifdef DEBUG
3094 if (pf_is_memory(sfmmu_ttetopfn(ttep, vaddr)) &&
3095 !TTE_IS_PCACHEABLE(ttep) && !sfmmu_allow_nc_trans)
3096 panic("sfmmu_tteload: non cacheable memory tte");
3097 #endif /* DEBUG */
3099 /* don't simulate dirty bit for writeable ISM/DISM mappings */
3100 if ((flags & HAT_LOAD_SHARE) && TTE_IS_WRITABLE(ttep)) {
3101 TTE_SET_REF(ttep);
3102 TTE_SET_MOD(ttep);
3105 if ((flags & HAT_LOAD_SHARE) || !TTE_IS_REF(ttep) ||
3106 !TTE_IS_MOD(ttep)) {
3108 * Don't load TSB for dummy as in ISM. Also don't preload
3109 * the TSB if the TTE isn't writable since we're likely to
3110 * fault on it again -- preloading can be fairly expensive.
3112 flags |= SFMMU_NO_TSBLOAD;
3115 size = TTE_CSZ(ttep);
3116 switch (size) {
3117 case TTE8K:
3118 SFMMU_STAT(sf_tteload8k);
3119 break;
3120 case TTE64K:
3121 SFMMU_STAT(sf_tteload64k);
3122 break;
3123 case TTE512K:
3124 SFMMU_STAT(sf_tteload512k);
3125 break;
3126 case TTE4M:
3127 SFMMU_STAT(sf_tteload4m);
3128 break;
3129 case (TTE32M):
3130 SFMMU_STAT(sf_tteload32m);
3131 ASSERT(mmu_page_sizes == max_mmu_page_sizes);
3132 break;
3133 case (TTE256M):
3134 SFMMU_STAT(sf_tteload256m);
3135 ASSERT(mmu_page_sizes == max_mmu_page_sizes);
3136 break;
3139 ASSERT(!((uintptr_t)vaddr & TTE_PAGE_OFFSET(size)));
3140 SFMMU_VALIDATE_HMERID(sfmmup, rid, vaddr, TTEBYTES(size));
3141 ASSERT(!SFMMU_IS_SHMERID_VALID(rid) || hmeblkp->hblk_shared);
3142 ASSERT(SFMMU_IS_SHMERID_VALID(rid) || !hmeblkp->hblk_shared);
3144 HBLKTOHME_IDX(sfhme, hmeblkp, vaddr, hmenum);
3147 * Need to grab mlist lock here so that pageunload
3148 * will not change tte behind us.
3150 if (pp) {
3151 pml = sfmmu_mlist_enter(pp);
3154 sfmmu_copytte(&sfhme->hme_tte, &tteold);
3156 * Look for the corresponding hment and, if it is valid, verify
3157 * that the pfns are equal.
3159 remap = TTE_IS_VALID(&tteold);
3160 if (remap) {
3161 pfn_t new_pfn, old_pfn;
3163 old_pfn = TTE_TO_PFN(vaddr, &tteold);
3164 new_pfn = TTE_TO_PFN(vaddr, ttep);
3166 if (flags & HAT_LOAD_REMAP) {
3167 /* make sure we are remapping the same type of pages */
3168 if (pf_is_memory(old_pfn) != pf_is_memory(new_pfn)) {
3169 panic("sfmmu_tteload - tte remap io<->memory");
3171 if (old_pfn != new_pfn &&
3172 (pp != NULL || sfhme->hme_page != NULL)) {
3173 panic("sfmmu_tteload - tte remap pp != NULL");
3175 } else if (old_pfn != new_pfn) {
3176 panic("sfmmu_tteload - tte remap, hmeblkp 0x%p",
3177 (void *)hmeblkp);
3179 ASSERT(TTE_CSZ(&tteold) == TTE_CSZ(ttep));
3182 if (pp) {
3183 if (size == TTE8K) {
3184 #ifdef VAC
3186 * Handle VAC consistency
3188 if (!remap && (cache & CACHE_VAC) && !PP_ISNC(pp)) {
3189 sfmmu_vac_conflict(sfmmup, vaddr, pp);
3191 #endif
3193 if (TTE_IS_WRITABLE(ttep) && PP_ISRO(pp)) {
3194 pmtx = sfmmu_page_enter(pp);
3195 PP_CLRRO(pp);
3196 sfmmu_page_exit(pmtx);
3197 } else if (!PP_ISMAPPED(pp) &&
3198 (!TTE_IS_WRITABLE(ttep)) && !(PP_ISMOD(pp))) {
3199 pmtx = sfmmu_page_enter(pp);
3200 if (!(PP_ISMOD(pp))) {
3201 PP_SETRO(pp);
3203 sfmmu_page_exit(pmtx);
3206 } else if (sfmmu_pagearray_setup(vaddr, pps, ttep, remap)) {
3208 * sfmmu_pagearray_setup failed so return
3210 sfmmu_mlist_exit(pml);
3211 return (1);
3216 * Make sure hment is not on a mapping list.
3218 ASSERT(remap || (sfhme->hme_page == NULL));
3220 /* if it is not a remap then hme->next better be NULL */
3221 ASSERT((!remap) ? sfhme->hme_next == NULL : 1);
3223 if (flags & HAT_LOAD_LOCK) {
3224 if ((hmeblkp->hblk_lckcnt + 1) >= MAX_HBLK_LCKCNT) {
3225 panic("too high lckcnt-hmeblk %p",
3226 (void *)hmeblkp);
3228 atomic_inc_32(&hmeblkp->hblk_lckcnt);
3230 HBLK_STACK_TRACE(hmeblkp, HBLK_LOCK);
3233 #ifdef VAC
3234 if (pp && PP_ISNC(pp)) {
3236 * If the physical page is marked uncacheable, e.g.
3237 * by a vac conflict, make sure the new mapping is also
3238 * uncacheable.
3240 TTE_CLR_VCACHEABLE(ttep);
3241 ASSERT(PP_GET_VCOLOR(pp) == NO_VCOLOR);
3243 #endif
3244 ttep->tte_hmenum = hmenum;
3246 #ifdef DEBUG
3247 orig_old = tteold;
3248 #endif /* DEBUG */
3250 while (sfmmu_modifytte_try(&tteold, ttep, &sfhme->hme_tte) < 0) {
3251 if ((sfmmup == KHATID) &&
3252 (flags & (HAT_LOAD_LOCK | HAT_LOAD_REMAP))) {
3253 sfmmu_copytte(&sfhme->hme_tte, &tteold);
3255 #ifdef DEBUG
3256 chk_tte(&orig_old, &tteold, ttep, hmeblkp);
3257 #endif /* DEBUG */
3259 ASSERT(TTE_IS_VALID(&sfhme->hme_tte));
3261 if (!TTE_IS_VALID(&tteold)) {
3263 atomic_inc_16(&hmeblkp->hblk_vcnt);
3264 if (rid == SFMMU_INVALID_SHMERID) {
3265 atomic_inc_ulong(&sfmmup->sfmmu_ttecnt[size]);
3266 } else {
3267 sf_srd_t *srdp = sfmmup->sfmmu_srdp;
3268 sf_region_t *rgnp = srdp->srd_hmergnp[rid];
3270 * We already accounted for region ttecnt's in sfmmu
3271 * during hat_join_region() processing. Here we
3272 * only update the ttecnt's in the region structure.
3274 atomic_inc_ulong(&rgnp->rgn_ttecnt[size]);
3278 myflt = (astosfmmu(curthread->t_procp->p_as) == sfmmup);
3279 if (size > TTE8K && (flags & HAT_LOAD_SHARE) == 0 &&
3280 sfmmup != ksfmmup) {
3281 uchar_t tteflag = 1 << size;
3282 if (rid == SFMMU_INVALID_SHMERID) {
3283 if (!(sfmmup->sfmmu_tteflags & tteflag)) {
3284 hatlockp = sfmmu_hat_enter(sfmmup);
3285 sfmmup->sfmmu_tteflags |= tteflag;
3286 sfmmu_hat_exit(hatlockp);
3288 } else if (!(sfmmup->sfmmu_rtteflags & tteflag)) {
3289 hatlockp = sfmmu_hat_enter(sfmmup);
3290 sfmmup->sfmmu_rtteflags |= tteflag;
3291 sfmmu_hat_exit(hatlockp);
3294 * Update the current CPU tsbmiss area, so the current thread
3295 * won't need to take the tsbmiss for the new pagesize.
3296 * The other threads in the process will update their tsb
3297 * miss area lazily in sfmmu_tsbmiss_exception() when they
3298 * fail to find the translation for a newly added pagesize.
3300 if (size > TTE64K && myflt) {
3301 struct tsbmiss *tsbmp;
3302 kpreempt_disable();
3303 tsbmp = &tsbmiss_area[CPU->cpu_id];
3304 if (rid == SFMMU_INVALID_SHMERID) {
3305 if (!(tsbmp->uhat_tteflags & tteflag)) {
3306 tsbmp->uhat_tteflags |= tteflag;
3308 } else {
3309 if (!(tsbmp->uhat_rtteflags & tteflag)) {
3310 tsbmp->uhat_rtteflags |= tteflag;
3313 kpreempt_enable();
3317 if (size >= TTE4M && (flags & HAT_LOAD_TEXT) &&
3318 !SFMMU_FLAGS_ISSET(sfmmup, HAT_4MTEXT_FLAG)) {
3319 hatlockp = sfmmu_hat_enter(sfmmup);
3320 SFMMU_FLAGS_SET(sfmmup, HAT_4MTEXT_FLAG);
3321 sfmmu_hat_exit(hatlockp);
3324 flush_tte.tte_intlo = (tteold.tte_intlo ^ ttep->tte_intlo) &
3325 hw_tte.tte_intlo;
3326 flush_tte.tte_inthi = (tteold.tte_inthi ^ ttep->tte_inthi) &
3327 hw_tte.tte_inthi;
3329 if (remap && (flush_tte.tte_inthi || flush_tte.tte_intlo)) {
3331 * If this is a remap and the new tte differs from the old tte, we
3332 * need to sync the mod bit and flush the TLB/TSB. We don't
3333 * need to sync the ref bit because we currently always set the
3334 * ref bit in tteload.
3336 ASSERT(TTE_IS_REF(ttep));
3337 if (TTE_IS_MOD(&tteold)) {
3338 sfmmu_ttesync(sfmmup, vaddr, &tteold, pp);
3341 * hwtte bits shouldn't change for SRD hmeblks as long as SRD
3342 * hmes are only used for read-only text. Adding this code for
3343 * completeness and future use of shared hmeblks with writable
3344 * mappings of VMODSORT vnodes.
3346 if (hmeblkp->hblk_shared) {
3347 cpuset_t cpuset = sfmmu_rgntlb_demap(vaddr,
3348 sfmmup->sfmmu_srdp->srd_hmergnp[rid], hmeblkp, 1);
3349 xt_sync(cpuset);
3350 SFMMU_STAT_ADD(sf_region_remap_demap, 1);
3351 } else {
3352 sfmmu_tlb_demap(vaddr, sfmmup, hmeblkp, 0, 0);
3353 xt_sync(sfmmup->sfmmu_cpusran);
3357 if ((flags & SFMMU_NO_TSBLOAD) == 0) {
3359 * We only preload 8K and 4M mappings into the TSB, since
3360 * 64K and 512K mappings are replicated and hence don't
3361 * have a single, unique TSB entry. Ditto for 32M/256M.
3363 if (size == TTE8K || size == TTE4M) {
3364 sf_scd_t *scdp;
3365 hatlockp = sfmmu_hat_enter(sfmmup);
3367 * Don't preload private TSB if the mapping is used
3368 * by the shctx in the SCD.
3370 scdp = sfmmup->sfmmu_scdp;
3371 if (rid == SFMMU_INVALID_SHMERID || scdp == NULL ||
3372 !SF_RGNMAP_TEST(scdp->scd_hmeregion_map, rid)) {
3373 sfmmu_load_tsb(sfmmup, vaddr, &sfhme->hme_tte,
3374 size);
3376 sfmmu_hat_exit(hatlockp);
3379 if (pp) {
3380 if (!remap) {
3381 HME_ADD(sfhme, pp);
3382 atomic_inc_16(&hmeblkp->hblk_hmecnt);
3383 ASSERT(hmeblkp->hblk_hmecnt > 0);
3386 * Cannot ASSERT(hmeblkp->hblk_hmecnt <= NHMENTS);
3387 * see pageunload() for the reason.
3390 sfmmu_mlist_exit(pml);
3393 return (0);
3396 * Function unlocks hash bucket.
3398 static void
3399 sfmmu_tteload_release_hashbucket(struct hmehash_bucket *hmebp)
3401 ASSERT(SFMMU_HASH_LOCK_ISHELD(hmebp));
3402 SFMMU_HASH_UNLOCK(hmebp);
3406 * Function which checks and sets up the page array for a large
3407 * translation. Will set the p_vcolor, p_index and p_ro fields.
3408 * Assumes addr and the pfnum of the first page are properly aligned.
3409 * Will check for physical contiguity. If the check fails it returns
3410 * nonzero.
3412 static int
3413 sfmmu_pagearray_setup(caddr_t addr, page_t **pps, tte_t *ttep, int remap)
3415 int i, index, ttesz;
3416 pfn_t pfnum;
3417 pgcnt_t npgs;
3418 page_t *pp, *pp1;
3419 kmutex_t *pmtx;
3420 #ifdef VAC
3421 int osz;
3422 int cflags = 0;
3423 int vac_err = 0;
3424 #endif
3425 int newidx = 0;
3427 ttesz = TTE_CSZ(ttep);
3429 ASSERT(ttesz > TTE8K);
3431 npgs = TTEPAGES(ttesz);
3432 index = PAGESZ_TO_INDEX(ttesz);
3434 pfnum = (*pps)->p_pagenum;
3435 ASSERT(IS_P2ALIGNED(pfnum, npgs));
3438 * Save the first pp so we can do HAT_TMPNC at the end.
3440 pp1 = *pps;
3441 #ifdef VAC
3442 osz = fnd_mapping_sz(pp1);
3443 #endif
3445 for (i = 0; i < npgs; i++, pps++) {
3446 pp = *pps;
3447 ASSERT(PAGE_LOCKED(pp));
3448 ASSERT(pp->p_szc >= ttesz);
3449 ASSERT(pp->p_szc == pp1->p_szc);
3450 ASSERT(sfmmu_mlist_held(pp));
3453 * XXX is it possible to maintain P_RO on the root only?
3455 if (TTE_IS_WRITABLE(ttep) && PP_ISRO(pp)) {
3456 pmtx = sfmmu_page_enter(pp);
3457 PP_CLRRO(pp);
3458 sfmmu_page_exit(pmtx);
3459 } else if (!PP_ISMAPPED(pp) && !TTE_IS_WRITABLE(ttep) &&
3460 !PP_ISMOD(pp)) {
3461 pmtx = sfmmu_page_enter(pp);
3462 if (!(PP_ISMOD(pp))) {
3463 PP_SETRO(pp);
3465 sfmmu_page_exit(pmtx);
3469 * If this is a remap we skip vac & contiguity checks.
3471 if (remap)
3472 continue;
3475 * set p_vcolor and detect any vac conflicts.
3477 #ifdef VAC
3478 if (vac_err == 0) {
3479 vac_err = sfmmu_vacconflict_array(addr, pp, &cflags);
3482 #endif
3485 * Save current index in case we need to undo it.
3486 * Note: "PAGESZ_TO_INDEX(sz) (1 << (sz))"
3487 * "SFMMU_INDEX_SHIFT 6"
3488 * "SFMMU_INDEX_MASK ((1 << SFMMU_INDEX_SHIFT) - 1)"
3489 * "PP_MAPINDEX(p_index) (p_index & SFMMU_INDEX_MASK)"
3491 * So: index = PAGESZ_TO_INDEX(ttesz);
3492 * if ttesz == 1 then index = 0x2
3493 * 2 then index = 0x4
3494 * 3 then index = 0x8
3495 * 4 then index = 0x10
3496 * 5 then index = 0x20
3497 * The code below checks if it's a new pagesize (ie, newidx)
3498 * in case we need to take it back out of p_index,
3499 * and then or's the new index into the existing index.
3501 if ((PP_MAPINDEX(pp) & index) == 0)
3502 newidx = 1;
3503 pp->p_index = (PP_MAPINDEX(pp) | index);
3506 * contiguity check
3508 if (pp->p_pagenum != pfnum) {
3510 * If we fail the contiguity test then
3511 * the only thing we need to fix is the p_index field.
3512 * We might get a few extra flushes but since this
3513 * path is rare that is ok. The p_ro field will
3514 * get automatically fixed on the next tteload to
3515 * the page. NO TNC bit is set yet.
3517 while (i >= 0) {
3518 pp = *pps;
3519 if (newidx)
3520 pp->p_index = (PP_MAPINDEX(pp) &
3521 ~index);
3522 pps--;
3523 i--;
3525 return (1);
3527 pfnum++;
3528 addr += MMU_PAGESIZE;
3531 #ifdef VAC
3532 if (vac_err) {
3533 if (ttesz > osz) {
3535 * There are some smaller mappings that cause vac
3536 * conflicts. Convert all existing small mappings to
3537 * TNC.
3539 SFMMU_STAT_ADD(sf_uncache_conflict, npgs);
3540 sfmmu_page_cache_array(pp1, HAT_TMPNC, CACHE_FLUSH,
3541 npgs);
3542 } else {
3543 /* EMPTY */
3545 * If there exists a big page mapping,
3546 * that means the whole existing big page
3547 * already has the TNC setting. No need to convert to
3548 * TNC again.
3550 ASSERT(PP_ISTNC(pp1));
3553 #endif /* VAC */
3555 return (0);
3558 #ifdef VAC
3560 * Routine that checks vac consistency for a large page. It also
3561 * sets the virtual color for all pp's of this big mapping.
3563 static int
3564 sfmmu_vacconflict_array(caddr_t addr, page_t *pp, int *cflags)
3566 int vcolor, ocolor;
3568 ASSERT(sfmmu_mlist_held(pp));
3570 if (PP_ISNC(pp)) {
3571 return (HAT_TMPNC);
3574 vcolor = addr_to_vcolor(addr);
3575 if (PP_NEWPAGE(pp)) {
3576 PP_SET_VCOLOR(pp, vcolor);
3577 return (0);
3580 ocolor = PP_GET_VCOLOR(pp);
3581 if (ocolor == vcolor) {
3582 return (0);
3585 if (!PP_ISMAPPED(pp) && !PP_ISMAPPED_KPM(pp)) {
3587 * The previous user of the page had a different color
3588 * but since there are no current users
3589 * we just flush the cache and change the color.
3590 * As an optimization for large pages we flush the
3591 * entire cache of that color and set a flag.
3593 SFMMU_STAT(sf_pgcolor_conflict);
3594 if (!CacheColor_IsFlushed(*cflags, ocolor)) {
3595 CacheColor_SetFlushed(*cflags, ocolor);
3596 sfmmu_cache_flushcolor(ocolor, pp->p_pagenum);
3598 PP_SET_VCOLOR(pp, vcolor);
3599 return (0);
3603 * We got a real conflict with a current mapping.
3604 * Set flags to start uncaching all mappings
3605 * and return failure so we restart looping
3606 * over the pp array from the beginning.
3608 return (HAT_TMPNC);
3610 #endif /* VAC */
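/*
 * The virtual color used above is derived from the low-order virtual
 * page-index bits that also index the virtually indexed cache; two
 * mappings of the same page with different colors would alias.  A
 * sketch of the typical color computation; the real addr_to_vcolor()
 * and the number of colors come from the machine-dependent layer, and
 * this helper is only an illustration assuming a power-of-two number
 * of colors:
 *
 *	static int
 *	example_addr_to_vcolor(caddr_t vaddr, int vac_colors)
 *	{
 *		return ((int)(((uintptr_t)vaddr >> MMU_PAGESHIFT) &
 *		    (vac_colors - 1)));
 *	}
 */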
3613 * Creates a large page shadow hmeblk for a tte.
3614 * The purpose of this routine is to allow us to do quick unloads because
3615 * the vm layer can easily pass a very large but sparsely populated range.
3617 static struct hme_blk *
3618 sfmmu_shadow_hcreate(sfmmu_t *sfmmup, caddr_t vaddr, int ttesz, uint_t flags)
3620 struct hmehash_bucket *hmebp;
3621 hmeblk_tag hblktag;
3622 int hmeshift, size, vshift;
3623 uint_t shw_mask, newshw_mask;
3624 struct hme_blk *hmeblkp;
3626 ASSERT(sfmmup != KHATID);
3627 if (mmu_page_sizes == max_mmu_page_sizes) {
3628 ASSERT(ttesz < TTE256M);
3629 } else {
3630 ASSERT(ttesz < TTE4M);
3631 ASSERT(sfmmup->sfmmu_ttecnt[TTE32M] == 0);
3632 ASSERT(sfmmup->sfmmu_ttecnt[TTE256M] == 0);
3635 if (ttesz == TTE8K) {
3636 size = TTE512K;
3637 } else {
3638 size = ++ttesz;
3641 hblktag.htag_id = sfmmup;
3642 hmeshift = HME_HASH_SHIFT(size);
3643 hblktag.htag_bspage = HME_HASH_BSPAGE(vaddr, hmeshift);
3644 hblktag.htag_rehash = HME_HASH_REHASH(size);
3645 hblktag.htag_rid = SFMMU_INVALID_SHMERID;
3646 hmebp = HME_HASH_FUNCTION(sfmmup, vaddr, hmeshift);
3648 SFMMU_HASH_LOCK(hmebp);
3650 HME_HASH_FAST_SEARCH(hmebp, hblktag, hmeblkp);
3651 ASSERT(hmeblkp != (struct hme_blk *)hblk_reserve);
3652 if (hmeblkp == NULL) {
3653 hmeblkp = sfmmu_hblk_alloc(sfmmup, vaddr, hmebp, size,
3654 hblktag, flags, SFMMU_INVALID_SHMERID);
3656 ASSERT(hmeblkp);
3657 if (!hmeblkp->hblk_shw_mask) {
3659 * if this is an unused hblk it was just allocated or could
3660 * potentially be a previous large page hblk, so we need to
3661 * set the shadow bit.
3663 ASSERT(!hmeblkp->hblk_vcnt && !hmeblkp->hblk_hmecnt);
3664 hmeblkp->hblk_shw_bit = 1;
3665 } else if (hmeblkp->hblk_shw_bit == 0) {
3666 panic("sfmmu_shadow_hcreate: shw bit not set in hmeblkp 0x%p",
3667 (void *)hmeblkp);
3669 ASSERT(hmeblkp->hblk_shw_bit == 1);
3670 ASSERT(!hmeblkp->hblk_shared);
3671 vshift = vaddr_to_vshift(hblktag, vaddr, size);
3672 ASSERT(vshift < 8);
3674 * Atomically set shw mask bit
3676 do {
3677 shw_mask = hmeblkp->hblk_shw_mask;
3678 newshw_mask = shw_mask | (1 << vshift);
3679 newshw_mask = atomic_cas_32(&hmeblkp->hblk_shw_mask, shw_mask,
3680 newshw_mask);
3681 } while (newshw_mask != shw_mask);
3683 SFMMU_HASH_UNLOCK(hmebp);
3685 return (hmeblkp);
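/*
 * The shw_mask update above is the usual compare-and-swap retry loop:
 * reread the mask, OR in the new bit, and retry until the CAS
 * succeeds, so concurrent loaders can set different vshift bits
 * without taking a lock on the hmeblk.  The same pattern in isolation
 * (the helper name is hypothetical; atomic_or_32() could also be used
 * for this):
 *
 *	static void
 *	example_atomic_set_bit32(volatile uint32_t *maskp, uint_t bit)
 *	{
 *		uint32_t old, new;
 *
 *		do {
 *			old = *maskp;
 *			new = old | (1U << bit);
 *		} while (atomic_cas_32(maskp, old, new) != old);
 *	}
 */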
3689 * This routine cleans up a previous shadow hmeblk and changes it to
3690 * a regular hblk. This happens rarely but it is possible
3691 * when a process wants to use large pages and there are hblks still
3692 * lying around from the previous as that used these hmeblks.
3693 * The alternative was to clean up the shadow hblks at unload time,
3694 * but since so few user processes actually use large pages, it is
3695 * better to be lazy and clean up at this time.
3697 static void
3698 sfmmu_shadow_hcleanup(sfmmu_t *sfmmup, struct hme_blk *hmeblkp,
3699 struct hmehash_bucket *hmebp)
3701 caddr_t addr, endaddr;
3702 int hashno, size;
3704 ASSERT(hmeblkp->hblk_shw_bit);
3705 ASSERT(!hmeblkp->hblk_shared);
3707 ASSERT(SFMMU_HASH_LOCK_ISHELD(hmebp));
3709 if (!hmeblkp->hblk_shw_mask) {
3710 hmeblkp->hblk_shw_bit = 0;
3711 return;
3713 addr = (caddr_t)get_hblk_base(hmeblkp);
3714 endaddr = get_hblk_endaddr(hmeblkp);
3715 size = get_hblk_ttesz(hmeblkp);
3716 hashno = size - 1;
3717 ASSERT(hashno > 0);
3718 SFMMU_HASH_UNLOCK(hmebp);
3720 sfmmu_free_hblks(sfmmup, addr, endaddr, hashno);
3722 SFMMU_HASH_LOCK(hmebp);
3725 static void
3726 sfmmu_free_hblks(sfmmu_t *sfmmup, caddr_t addr, caddr_t endaddr,
3727 int hashno)
3729 int hmeshift, shadow = 0;
3730 hmeblk_tag hblktag;
3731 struct hmehash_bucket *hmebp;
3732 struct hme_blk *hmeblkp;
3733 struct hme_blk *nx_hblk, *pr_hblk, *list = NULL;
3735 ASSERT(hashno > 0);
3736 hblktag.htag_id = sfmmup;
3737 hblktag.htag_rehash = hashno;
3738 hblktag.htag_rid = SFMMU_INVALID_SHMERID;
3740 hmeshift = HME_HASH_SHIFT(hashno);
3742 while (addr < endaddr) {
3743 hblktag.htag_bspage = HME_HASH_BSPAGE(addr, hmeshift);
3744 hmebp = HME_HASH_FUNCTION(sfmmup, addr, hmeshift);
3745 SFMMU_HASH_LOCK(hmebp);
3746 /* inline HME_HASH_SEARCH */
3747 hmeblkp = hmebp->hmeblkp;
3748 pr_hblk = NULL;
3749 while (hmeblkp) {
3750 if (HTAGS_EQ(hmeblkp->hblk_tag, hblktag)) {
3751 /* found hme_blk */
3752 ASSERT(!hmeblkp->hblk_shared);
3753 if (hmeblkp->hblk_shw_bit) {
3754 if (hmeblkp->hblk_shw_mask) {
3755 shadow = 1;
3756 sfmmu_shadow_hcleanup(sfmmup,
3757 hmeblkp, hmebp);
3758 break;
3759 } else {
3760 hmeblkp->hblk_shw_bit = 0;
3765 * Hblk_hmecnt and hblk_vcnt could be non-zero
3766 * since hblk_unload() does not guarantee that.
3768 * XXX - this could cause tteload() to spin
3769 * where sfmmu_shadow_hcleanup() is called.
3773 nx_hblk = hmeblkp->hblk_next;
3774 if (!hmeblkp->hblk_vcnt && !hmeblkp->hblk_hmecnt) {
3775 sfmmu_hblk_hash_rm(hmebp, hmeblkp, pr_hblk,
3776 &list, 0);
3777 } else {
3778 pr_hblk = hmeblkp;
3780 hmeblkp = nx_hblk;
3783 SFMMU_HASH_UNLOCK(hmebp);
3785 if (shadow) {
3787 * We found another shadow hblk and cleaned up its
3788 * children. We need to go back and clean up
3789 * the original hblk so we don't change the
3790 * addr.
3792 shadow = 0;
3793 } else {
3794 addr = (caddr_t)roundup((uintptr_t)addr + 1,
3795 (1 << hmeshift));
3798 sfmmu_hblks_list_purge(&list, 0);
3802 * This routine's job is to delete stale invalid shared hmeregion hmeblks that
3803 * may still linger on after pageunload.
3805 static void
3806 sfmmu_cleanup_rhblk(sf_srd_t *srdp, caddr_t addr, uint_t rid, int ttesz)
3808 int hmeshift;
3809 hmeblk_tag hblktag;
3810 struct hmehash_bucket *hmebp;
3811 struct hme_blk *hmeblkp;
3812 struct hme_blk *pr_hblk;
3813 struct hme_blk *list = NULL;
3815 ASSERT(SFMMU_IS_SHMERID_VALID(rid));
3816 ASSERT(rid < SFMMU_MAX_HME_REGIONS);
3818 hmeshift = HME_HASH_SHIFT(ttesz);
3819 hblktag.htag_bspage = HME_HASH_BSPAGE(addr, hmeshift);
3820 hblktag.htag_rehash = ttesz;
3821 hblktag.htag_rid = rid;
3822 hblktag.htag_id = srdp;
3823 hmebp = HME_HASH_FUNCTION(srdp, addr, hmeshift);
3825 SFMMU_HASH_LOCK(hmebp);
3826 HME_HASH_SEARCH_PREV(hmebp, hblktag, hmeblkp, pr_hblk, &list);
3827 if (hmeblkp != NULL) {
3828 ASSERT(hmeblkp->hblk_shared);
3829 ASSERT(!hmeblkp->hblk_shw_bit);
3830 if (hmeblkp->hblk_vcnt || hmeblkp->hblk_hmecnt) {
3831 panic("sfmmu_cleanup_rhblk: valid hmeblk");
3833 ASSERT(!hmeblkp->hblk_lckcnt);
3834 sfmmu_hblk_hash_rm(hmebp, hmeblkp, pr_hblk,
3835 &list, 0);
3837 SFMMU_HASH_UNLOCK(hmebp);
3838 sfmmu_hblks_list_purge(&list, 0);
3841 /* ARGSUSED */
3842 static void
3843 sfmmu_rgn_cb_noop(caddr_t saddr, caddr_t eaddr, caddr_t r_saddr,
3844 size_t r_size, void *r_obj, u_offset_t r_objoff)
3849 * Searches for an hmeblk which maps addr, then unloads this mapping
3850 * and updates *eaddrp, if the hmeblk is found.
3852 static void
3853 sfmmu_unload_hmeregion_va(sf_srd_t *srdp, uint_t rid, caddr_t addr,
3854 caddr_t eaddr, int ttesz, caddr_t *eaddrp)
3856 int hmeshift;
3857 hmeblk_tag hblktag;
3858 struct hmehash_bucket *hmebp;
3859 struct hme_blk *hmeblkp;
3860 struct hme_blk *pr_hblk;
3861 struct hme_blk *list = NULL;
3863 ASSERT(SFMMU_IS_SHMERID_VALID(rid));
3864 ASSERT(rid < SFMMU_MAX_HME_REGIONS);
3865 ASSERT(ttesz >= HBLK_MIN_TTESZ);
3867 hmeshift = HME_HASH_SHIFT(ttesz);
3868 hblktag.htag_bspage = HME_HASH_BSPAGE(addr, hmeshift);
3869 hblktag.htag_rehash = ttesz;
3870 hblktag.htag_rid = rid;
3871 hblktag.htag_id = srdp;
3872 hmebp = HME_HASH_FUNCTION(srdp, addr, hmeshift);
3874 SFMMU_HASH_LOCK(hmebp);
3875 HME_HASH_SEARCH_PREV(hmebp, hblktag, hmeblkp, pr_hblk, &list);
3876 if (hmeblkp != NULL) {
3877 ASSERT(hmeblkp->hblk_shared);
3878 ASSERT(!hmeblkp->hblk_lckcnt);
3879 if (hmeblkp->hblk_vcnt || hmeblkp->hblk_hmecnt) {
3880 *eaddrp = sfmmu_hblk_unload(NULL, hmeblkp, addr,
3881 eaddr, NULL, HAT_UNLOAD);
3882 ASSERT(*eaddrp > addr);
3884 ASSERT(!hmeblkp->hblk_vcnt && !hmeblkp->hblk_hmecnt);
3885 sfmmu_hblk_hash_rm(hmebp, hmeblkp, pr_hblk,
3886 &list, 0);
3888 SFMMU_HASH_UNLOCK(hmebp);
3889 sfmmu_hblks_list_purge(&list, 0);
3892 static void
3893 sfmmu_unload_hmeregion(sf_srd_t *srdp, sf_region_t *rgnp)
3895 int ttesz = rgnp->rgn_pgszc;
3896 size_t rsz = rgnp->rgn_size;
3897 caddr_t rsaddr = rgnp->rgn_saddr;
3898 caddr_t readdr = rsaddr + rsz;
3899 caddr_t rhsaddr;
3900 caddr_t va;
3901 uint_t rid = rgnp->rgn_id;
3902 caddr_t cbsaddr;
3903 caddr_t cbeaddr;
3904 hat_rgn_cb_func_t rcbfunc;
3905 ulong_t cnt;
3907 ASSERT(SFMMU_IS_SHMERID_VALID(rid));
3908 ASSERT(rid < SFMMU_MAX_HME_REGIONS);
3910 ASSERT(IS_P2ALIGNED(rsaddr, TTEBYTES(ttesz)));
3911 ASSERT(IS_P2ALIGNED(rsz, TTEBYTES(ttesz)));
3912 if (ttesz < HBLK_MIN_TTESZ) {
3913 ttesz = HBLK_MIN_TTESZ;
3914 rhsaddr = (caddr_t)P2ALIGN((uintptr_t)rsaddr, HBLK_MIN_BYTES);
3915 } else {
3916 rhsaddr = rsaddr;
3919 if ((rcbfunc = rgnp->rgn_cb_function) == NULL) {
3920 rcbfunc = sfmmu_rgn_cb_noop;
3923 while (ttesz >= HBLK_MIN_TTESZ) {
3924 cbsaddr = rsaddr;
3925 cbeaddr = rsaddr;
3926 if (!(rgnp->rgn_hmeflags & (1 << ttesz))) {
3927 ttesz--;
3928 continue;
3930 cnt = 0;
3931 va = rsaddr;
3932 while (va < readdr) {
3933 ASSERT(va >= rhsaddr);
3934 if (va != cbeaddr) {
3935 if (cbeaddr != cbsaddr) {
3936 ASSERT(cbeaddr > cbsaddr);
3937 (*rcbfunc)(cbsaddr, cbeaddr,
3938 rsaddr, rsz, rgnp->rgn_obj,
3939 rgnp->rgn_objoff);
3941 cbsaddr = va;
3942 cbeaddr = va;
3944 sfmmu_unload_hmeregion_va(srdp, rid, va, readdr,
3945 ttesz, &cbeaddr);
3946 cnt++;
3947 va = rhsaddr + (cnt << TTE_PAGE_SHIFT(ttesz));
3949 if (cbeaddr != cbsaddr) {
3950 ASSERT(cbeaddr > cbsaddr);
3951 (*rcbfunc)(cbsaddr, cbeaddr, rsaddr,
3952 rsz, rgnp->rgn_obj,
3953 rgnp->rgn_objoff);
3955 ttesz--;
3960 * Release one hardware address translation lock on the given address range.
3962 void
3963 hat_unlock(struct hat *sfmmup, caddr_t addr, size_t len)
3965 struct hmehash_bucket *hmebp;
3966 hmeblk_tag hblktag;
3967 int hmeshift, hashno = 1;
3968 struct hme_blk *hmeblkp, *list = NULL;
3969 caddr_t endaddr;
3971 ASSERT(sfmmup != NULL);
3972 ASSERT(sfmmup->sfmmu_xhat_provider == NULL);
3974 ASSERT((sfmmup == ksfmmup) || AS_LOCK_HELD(sfmmup->sfmmu_as));
3975 ASSERT((len & MMU_PAGEOFFSET) == 0);
3976 endaddr = addr + len;
3977 hblktag.htag_id = sfmmup;
3978 hblktag.htag_rid = SFMMU_INVALID_SHMERID;
3981 * Spitfire supports 4 page sizes.
3982 * Most pages are expected to be of the smallest page size (8K) and
3983 * these will not need to be rehashed. 64K pages also don't need to be
3984 * rehashed because an hmeblk spans 64K of address space. 512K pages
3985 * might need 1 rehash and 4M pages might need 2 rehashes.
3987 while (addr < endaddr) {
3988 hmeshift = HME_HASH_SHIFT(hashno);
3989 hblktag.htag_bspage = HME_HASH_BSPAGE(addr, hmeshift);
3990 hblktag.htag_rehash = hashno;
3991 hmebp = HME_HASH_FUNCTION(sfmmup, addr, hmeshift);
3993 SFMMU_HASH_LOCK(hmebp);
3995 HME_HASH_SEARCH(hmebp, hblktag, hmeblkp, &list);
3996 if (hmeblkp != NULL) {
3997 ASSERT(!hmeblkp->hblk_shared);
3999 * If we encounter a shadow hmeblk then
4000 * we know there are no valid hmeblks mapping
4001 * this address at this size or larger.
4002 * Just increment address by the smallest
4003 * page size.
4005 if (hmeblkp->hblk_shw_bit) {
4006 addr += MMU_PAGESIZE;
4007 } else {
4008 addr = sfmmu_hblk_unlock(hmeblkp, addr,
4009 endaddr);
4011 SFMMU_HASH_UNLOCK(hmebp);
4012 hashno = 1;
4013 continue;
4015 SFMMU_HASH_UNLOCK(hmebp);
4017 if (!HME_REHASH(sfmmup) || (hashno >= mmu_hashcnt)) {
4019 * We have traversed the whole list and rehashed
4020 * if necessary without finding the address to unlock,
4021 * which should never happen.
4023 panic("sfmmu_unlock: addr not found. "
4024 "addr %p hat %p", (void *)addr, (void *)sfmmup);
4025 } else {
4026 hashno++;
4030 sfmmu_hblks_list_purge(&list, 0);
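/*
 * Illustrative sketch (hypothetical usage, not taken from this file):
 * hat_unlock() pairs with translations that were locked down earlier,
 * e.g. by loading them with the HAT_LOAD_LOCK flag.  Assuming a page
 * 'pp' mapped at a page-aligned kernel address 'va', the pairing looks
 * roughly like:
 *
 *	hat_memload(kas.a_hat, va, pp, PROT_READ | PROT_WRITE,
 *	    HAT_LOAD_LOCK);
 *	... use the mapping; it cannot be unloaded while locked ...
 *	hat_unlock(kas.a_hat, va, MMU_PAGESIZE);
 *
 * As asserted above, the length passed to hat_unlock() must be a
 * multiple of MMU_PAGESIZE.
 */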
4033 void
4034 hat_unlock_region(struct hat *sfmmup, caddr_t addr, size_t len,
4035 hat_region_cookie_t rcookie)
4037 sf_srd_t *srdp;
4038 sf_region_t *rgnp;
4039 int ttesz;
4040 uint_t rid;
4041 caddr_t eaddr;
4042 caddr_t va;
4043 int hmeshift;
4044 hmeblk_tag hblktag;
4045 struct hmehash_bucket *hmebp;
4046 struct hme_blk *hmeblkp;
4047 struct hme_blk *pr_hblk;
4048 struct hme_blk *list;
4050 if (rcookie == HAT_INVALID_REGION_COOKIE) {
4051 hat_unlock(sfmmup, addr, len);
4052 return;
4055 ASSERT(sfmmup != NULL);
4056 ASSERT(sfmmup->sfmmu_xhat_provider == NULL);
4057 ASSERT(sfmmup != ksfmmup);
4059 srdp = sfmmup->sfmmu_srdp;
4060 rid = (uint_t)((uint64_t)rcookie);
4061 VERIFY3U(rid, <, SFMMU_MAX_HME_REGIONS);
4062 eaddr = addr + len;
4063 va = addr;
4064 list = NULL;
4065 rgnp = srdp->srd_hmergnp[rid];
4066 SFMMU_VALIDATE_HMERID(sfmmup, rid, addr, len);
4068 ASSERT(IS_P2ALIGNED(addr, TTEBYTES(rgnp->rgn_pgszc)));
4069 ASSERT(IS_P2ALIGNED(len, TTEBYTES(rgnp->rgn_pgszc)));
4070 if (rgnp->rgn_pgszc < HBLK_MIN_TTESZ) {
4071 ttesz = HBLK_MIN_TTESZ;
4072 } else {
4073 ttesz = rgnp->rgn_pgszc;
4075 while (va < eaddr) {
4076 while (ttesz < rgnp->rgn_pgszc &&
4077 IS_P2ALIGNED(va, TTEBYTES(ttesz + 1))) {
4078 ttesz++;
4080 while (ttesz >= HBLK_MIN_TTESZ) {
4081 if (!(rgnp->rgn_hmeflags & (1 << ttesz))) {
4082 ttesz--;
4083 continue;
4085 hmeshift = HME_HASH_SHIFT(ttesz);
4086 hblktag.htag_bspage = HME_HASH_BSPAGE(va, hmeshift);
4087 hblktag.htag_rehash = ttesz;
4088 hblktag.htag_rid = rid;
4089 hblktag.htag_id = srdp;
4090 hmebp = HME_HASH_FUNCTION(srdp, va, hmeshift);
4091 SFMMU_HASH_LOCK(hmebp);
4092 HME_HASH_SEARCH_PREV(hmebp, hblktag, hmeblkp, pr_hblk,
4093 &list);
4094 if (hmeblkp == NULL) {
4095 SFMMU_HASH_UNLOCK(hmebp);
4096 ttesz--;
4097 continue;
4099 ASSERT(hmeblkp->hblk_shared);
4100 va = sfmmu_hblk_unlock(hmeblkp, va, eaddr);
4101 ASSERT(va >= eaddr ||
4102 IS_P2ALIGNED((uintptr_t)va, TTEBYTES(ttesz)));
4103 SFMMU_HASH_UNLOCK(hmebp);
4104 break;
4106 if (ttesz < HBLK_MIN_TTESZ) {
4107 panic("hat_unlock_region: addr not found "
4108 "addr %p hat %p", (void *)va, (void *)sfmmup);
4111 sfmmu_hblks_list_purge(&list, 0);
4115 * Function to unlock a range of addresses in an hmeblk. It returns the
4116 * next address that needs to be unlocked.
4117 * Should be called with the hash lock held.
4119 static caddr_t
4120 sfmmu_hblk_unlock(struct hme_blk *hmeblkp, caddr_t addr, caddr_t endaddr)
4122 struct sf_hment *sfhme;
4123 tte_t tteold, ttemod;
4124 int ttesz, ret;
4126 ASSERT(in_hblk_range(hmeblkp, addr));
4127 ASSERT(hmeblkp->hblk_shw_bit == 0);
4129 endaddr = MIN(endaddr, get_hblk_endaddr(hmeblkp));
4130 ttesz = get_hblk_ttesz(hmeblkp);
4132 HBLKTOHME(sfhme, hmeblkp, addr);
4133 while (addr < endaddr) {
4134 readtte:
4135 sfmmu_copytte(&sfhme->hme_tte, &tteold);
4136 if (TTE_IS_VALID(&tteold)) {
4138 ttemod = tteold;
4140 ret = sfmmu_modifytte_try(&tteold, &ttemod,
4141 &sfhme->hme_tte);
4143 if (ret < 0)
4144 goto readtte;
4146 if (hmeblkp->hblk_lckcnt == 0)
4147 panic("zero hblk lckcnt");
4149 if (((uintptr_t)addr + TTEBYTES(ttesz)) >
4150 (uintptr_t)endaddr)
4151 panic("can't unlock large tte");
4153 ASSERT(hmeblkp->hblk_lckcnt > 0);
4154 atomic_dec_32(&hmeblkp->hblk_lckcnt);
4155 HBLK_STACK_TRACE(hmeblkp, HBLK_UNLOCK);
4156 } else {
4157 panic("sfmmu_hblk_unlock: invalid tte");
4159 addr += TTEBYTES(ttesz);
4160 sfhme++;
4162 return (addr);
4166 * Physical Address Mapping Framework
4168 * General rules:
4170 * (1) Applies only to seg_kmem memory pages. To make things easier,
4171 * seg_kpm addresses are also accepted by the routines, but nothing
4172 * is done with them since by definition their PA mappings are static.
4173 * (2) hat_add_callback() may only be called while holding the page lock
4174 * SE_SHARED or SE_EXCL of the underlying page (e.g., as_pagelock()),
4175 * or passing HAC_PAGELOCK flag.
4176 * (3) prehandler() and posthandler() may not call hat_add_callback() or
4177 * hat_delete_callback(), nor should they allocate memory. Post quiesce
4178 * callbacks may not sleep or acquire adaptive mutex locks.
4179 * (4) Either prehandler() or posthandler() (but not both) may be specified
4180 * as being NULL. Specifying an errhandler() is optional.
4182 * Details of using the framework:
4184 * registering a callback (hat_register_callback())
4186 * Pass prehandler, posthandler, errhandler addresses
4187 * as described below. If capture_cpus argument is nonzero,
4188 * suspend callback to the prehandler will occur with CPUs
4189 * captured and executing xc_loop() and CPUs will remain
4190 * captured until after the posthandler suspend callback
4191 * occurs.
4193 * adding a callback (hat_add_callback())
4195 * as_pagelock();
4196 * hat_add_callback();
4197 * save returned pfn in private data structures or program registers;
4198 * as_pageunlock();
4200 * prehandler()
4202 * Stop all accesses by physical address to this memory page.
4203 * Called twice: the first call, PRESUSPEND, runs in a context where it
4204 * is safe to acquire adaptive locks. The second call, SUSPEND, runs at
4205 * high PIL with CPUs captured, so adaptive locks may NOT be acquired
4206 * (and all spin locks must be XCALL_PIL or higher locks).
4208 * May return the following errors:
4209 * EIO: A fatal error has occurred. This will result in panic.
4210 * EAGAIN: The page cannot be suspended. This will fail the
4211 * relocation.
4212 * 0: Success.
4214 * posthandler()
4216 * Save new pfn in private data structures or program registers;
4217 * not allowed to fail (non-zero return values will result in panic).
4219 * errhandler()
4221 * called when an error occurs related to the callback. Currently
4222 * the only such error is HAT_CB_ERR_LEAKED which indicates that
4223 * a page is being freed, but there are still outstanding callback(s)
4224 * registered on the page.
4226 * removing a callback (hat_delete_callback(); e.g., prior to freeing memory)
4228 * stop using physical address
4229 * hat_delete_callback();
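/*
 * Illustrative sketch of the sequence described above, for a
 * hypothetical client "xx" that programs a device with physical
 * addresses.  The handler names, the key value and the xx_* wrappers
 * are made up for the example; hat_register_callback(),
 * hat_add_callback(), hat_delete_callback() and the HAC_* flags are
 * the interfaces defined below.
 *
 *	static id_t xx_cb_id;
 *
 *	void
 *	xx_init(void)
 *	{
 *		xx_cb_id = hat_register_callback(0x78780001,
 *		    xx_prehandler, xx_posthandler, NULL, 0);
 *	}
 *
 *	int
 *	xx_map(caddr_t va, uint_t len, void *pvt, pfn_t *pfnp, void **ckp)
 *	{
 *		... caller either holds the page lock or passes
 *		    HAC_PAGELOCK so the page is locked internally ...
 *		return (hat_add_callback(xx_cb_id, va, len,
 *		    HAC_SLEEP | HAC_PAGELOCK, pvt, pfnp, ckp));
 *	}
 *
 *	void
 *	xx_unmap(caddr_t va, uint_t len, void *pvt, void *cookie)
 *	{
 *		... stop all accesses by physical address first ...
 *		hat_delete_callback(va, len, pvt, HAC_PAGELOCK, cookie);
 *	}
 *
 * The pfn returned in *pfnp is only valid until the posthandler
 * reports a new one, which is why it should be kept somewhere the
 * posthandler can update.
 */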
4234 * Register a callback class. Each subsystem should do this once and
4235 * cache the id_t returned for use in setting up and tearing down callbacks.
4237 * There is no facility for removing callback IDs once they are created;
4238 * the "key" should be unique for each module, so in case a module is unloaded
4239 * and subsequently re-loaded, we can recycle the module's previous entry.
4241 id_t
4242 hat_register_callback(int key,
4243 int (*prehandler)(caddr_t, uint_t, uint_t, void *),
4244 int (*posthandler)(caddr_t, uint_t, uint_t, void *, pfn_t),
4245 int (*errhandler)(caddr_t, uint_t, uint_t, void *),
4246 int capture_cpus)
4248 id_t id;
4251 * Search the table for a pre-existing callback associated with
4252 * the identifier "key". If one exists, we re-use that entry in
4253 * the table for this instance, otherwise we assign the next
4254 * available table slot.
4256 for (id = 0; id < sfmmu_max_cb_id; id++) {
4257 if (sfmmu_cb_table[id].key == key)
4258 break;
4261 if (id == sfmmu_max_cb_id) {
4262 id = sfmmu_cb_nextid++;
4263 if (id >= sfmmu_max_cb_id)
4264 panic("hat_register_callback: out of callback IDs");
4267 ASSERT(prehandler != NULL || posthandler != NULL);
4269 sfmmu_cb_table[id].key = key;
4270 sfmmu_cb_table[id].prehandler = prehandler;
4271 sfmmu_cb_table[id].posthandler = posthandler;
4272 sfmmu_cb_table[id].errhandler = errhandler;
4273 sfmmu_cb_table[id].capture_cpus = capture_cpus;
4275 return (id);
4278 #define HAC_COOKIE_NONE (void *)-1
4281 * Add relocation callbacks to the specified addr/len which will be called
4282 * when relocating the associated page. See the description of pre and
4283 * posthandler above for more details.
4285 * If HAC_PAGELOCK is included in flags, the underlying memory page is
4286 * locked internally so the caller must be able to deal with the callback
4287 * running even before this function has returned. If HAC_PAGELOCK is not
4288 * set, it is assumed that the underlying memory pages are locked.
4290 * Since the caller must track the individual page boundaries anyway,
4291 * we only allow a callback to be added to a single page (large
4292 * or small). Thus [addr, addr + len) MUST be contained within a single
4293 * page.
4295 * Registering multiple callbacks on the same [addr, addr+len) is supported,
4296 * _provided_that_ a unique parameter is specified for each callback.
4297 * If multiple callbacks are registered on the same range the callback will
4298 * be invoked with each unique parameter. Registering the same callback with
4299 * the same argument more than once will result in corrupted kernel state.
4301 * Returns the pfn of the underlying kernel page in *rpfn
4302 * on success, or PFN_INVALID on failure.
4304 * cookiep (if passed) provides storage space for an opaque cookie
4305 * to return later to hat_delete_callback(). This cookie makes the callback
4306 * deletion significantly quicker by avoiding a potentially lengthy hash
4307 * search.
4309 * Return values:
4310 * 0: success
4311 * ENOMEM: memory allocation failure (e.g. flags was passed as HAC_NOSLEEP)
4312 * EINVAL: callback ID is not valid
4313 * ENXIO: ["vaddr", "vaddr" + len) is not mapped in the kernel's address
4314 * space
4315 * ERANGE: ["vaddr", "vaddr" + len) crosses a page boundary
4318 hat_add_callback(id_t callback_id, caddr_t vaddr, uint_t len, uint_t flags,
4319 void *pvt, pfn_t *rpfn, void **cookiep)
4321 struct hmehash_bucket *hmebp;
4322 hmeblk_tag hblktag;
4323 struct hme_blk *hmeblkp;
4324 int hmeshift, hashno;
4325 caddr_t saddr, eaddr, baseaddr;
4326 struct pa_hment *pahmep;
4327 struct sf_hment *sfhmep, *osfhmep;
4328 kmutex_t *pml;
4329 tte_t tte;
4330 page_t *pp;
4331 vnode_t *vp;
4332 u_offset_t off;
4333 pfn_t pfn;
4334 int kmflags = (flags & HAC_SLEEP)? KM_SLEEP : KM_NOSLEEP;
4335 int locked = 0;
4338 * For KPM mappings, just return the physical address since we
4339 * don't need to register any callbacks.
4341 if (IS_KPM_ADDR(vaddr)) {
4342 uint64_t paddr;
4343 SFMMU_KPM_VTOP(vaddr, paddr);
4344 *rpfn = btop(paddr);
4345 if (cookiep != NULL)
4346 *cookiep = HAC_COOKIE_NONE;
4347 return (0);
4350 if (callback_id < (id_t)0 || callback_id >= sfmmu_cb_nextid) {
4351 *rpfn = PFN_INVALID;
4352 return (EINVAL);
4355 if ((pahmep = kmem_cache_alloc(pa_hment_cache, kmflags)) == NULL) {
4356 *rpfn = PFN_INVALID;
4357 return (ENOMEM);
4360 sfhmep = &pahmep->sfment;
4362 saddr = (caddr_t)((uintptr_t)vaddr & MMU_PAGEMASK);
4363 eaddr = saddr + len;
4365 rehash:
4366 /* Find the mapping(s) for this page */
4367 for (hashno = TTE64K, hmeblkp = NULL;
4368 hmeblkp == NULL && hashno <= mmu_hashcnt;
4369 hashno++) {
4370 hmeshift = HME_HASH_SHIFT(hashno);
4371 hblktag.htag_id = ksfmmup;
4372 hblktag.htag_rid = SFMMU_INVALID_SHMERID;
4373 hblktag.htag_bspage = HME_HASH_BSPAGE(saddr, hmeshift);
4374 hblktag.htag_rehash = hashno;
4375 hmebp = HME_HASH_FUNCTION(ksfmmup, saddr, hmeshift);
4377 SFMMU_HASH_LOCK(hmebp);
4379 HME_HASH_FAST_SEARCH(hmebp, hblktag, hmeblkp);
4381 if (hmeblkp == NULL)
4382 SFMMU_HASH_UNLOCK(hmebp);
4385 if (hmeblkp == NULL) {
4386 kmem_cache_free(pa_hment_cache, pahmep);
4387 *rpfn = PFN_INVALID;
4388 return (ENXIO);
4391 ASSERT(!hmeblkp->hblk_shared);
4393 HBLKTOHME(osfhmep, hmeblkp, saddr);
4394 sfmmu_copytte(&osfhmep->hme_tte, &tte);
4396 if (!TTE_IS_VALID(&tte)) {
4397 SFMMU_HASH_UNLOCK(hmebp);
4398 kmem_cache_free(pa_hment_cache, pahmep);
4399 *rpfn = PFN_INVALID;
4400 return (ENXIO);
4404 * Make sure the boundaries for the callback fall within this
4405 * single mapping.
4407 baseaddr = (caddr_t)get_hblk_base(hmeblkp);
4408 ASSERT(saddr >= baseaddr);
4409 if (eaddr > saddr + TTEBYTES(TTE_CSZ(&tte))) {
4410 SFMMU_HASH_UNLOCK(hmebp);
4411 kmem_cache_free(pa_hment_cache, pahmep);
4412 *rpfn = PFN_INVALID;
4413 return (ERANGE);
4416 pfn = sfmmu_ttetopfn(&tte, vaddr);
4419 * The pfn may not have a page_t underneath in which case we
4420 * just return it. This can happen if we are doing I/O to a
4421 * static portion of the kernel's address space, for instance.
4423 pp = osfhmep->hme_page;
4424 if (pp == NULL) {
4425 SFMMU_HASH_UNLOCK(hmebp);
4426 kmem_cache_free(pa_hment_cache, pahmep);
4427 *rpfn = pfn;
4428 if (cookiep)
4429 *cookiep = HAC_COOKIE_NONE;
4430 return (0);
4432 ASSERT(pp == PP_PAGEROOT(pp));
4434 vp = pp->p_vnode;
4435 off = pp->p_offset;
4437 pml = sfmmu_mlist_enter(pp);
4439 if (flags & HAC_PAGELOCK) {
4440 if (!page_trylock(pp, SE_SHARED)) {
4442 * Somebody is holding SE_EXCL lock. Might
4443 * even be hat_page_relocate(). Drop all
4444 * our locks, lookup the page in &kvp, and
4445 * retry. If it doesn't exist in &kvp and &zvp,
4446 * then we must be dealing with a kernel mapped
4447 * page which doesn't actually belong to
4448 * segkmem so we punt.
4450 sfmmu_mlist_exit(pml);
4451 SFMMU_HASH_UNLOCK(hmebp);
4452 pp = page_lookup(&kvp, (u_offset_t)saddr, SE_SHARED);
4454 /* check zvp before giving up */
4455 if (pp == NULL)
4456 pp = page_lookup(&zvp, (u_offset_t)saddr,
4457 SE_SHARED);
4459 /* Okay, we didn't find it, give up */
4460 if (pp == NULL) {
4461 kmem_cache_free(pa_hment_cache, pahmep);
4462 *rpfn = pfn;
4463 if (cookiep)
4464 *cookiep = HAC_COOKIE_NONE;
4465 return (0);
4467 page_unlock(pp);
4468 goto rehash;
4470 locked = 1;
4473 if (!PAGE_LOCKED(pp) && !panicstr)
4474 panic("hat_add_callback: page 0x%p not locked", (void *)pp);
4476 if (osfhmep->hme_page != pp || pp->p_vnode != vp ||
4477 pp->p_offset != off) {
4479 * The page moved before we got our hands on it. Drop
4480 * all the locks and try again.
4482 ASSERT((flags & HAC_PAGELOCK) != 0);
4483 sfmmu_mlist_exit(pml);
4484 SFMMU_HASH_UNLOCK(hmebp);
4485 page_unlock(pp);
4486 locked = 0;
4487 goto rehash;
4490 if (!VN_ISKAS(vp)) {
4492 * This is not a segkmem page but another page which
4493 * has been kernel mapped. It had better have at least
4494 * a share lock on it. Return the pfn.
4496 sfmmu_mlist_exit(pml);
4497 SFMMU_HASH_UNLOCK(hmebp);
4498 if (locked)
4499 page_unlock(pp);
4500 kmem_cache_free(pa_hment_cache, pahmep);
4501 ASSERT(PAGE_LOCKED(pp));
4502 *rpfn = pfn;
4503 if (cookiep)
4504 *cookiep = HAC_COOKIE_NONE;
4505 return (0);
4509 * Setup this pa_hment and link its embedded dummy sf_hment into
4510 * the mapping list.
4512 pp->p_share++;
4513 pahmep->cb_id = callback_id;
4514 pahmep->addr = vaddr;
4515 pahmep->len = len;
4516 pahmep->refcnt = 1;
4517 pahmep->flags = 0;
4518 pahmep->pvt = pvt;
4520 sfhmep->hme_tte.ll = 0;
4521 sfhmep->hme_data = pahmep;
4522 sfhmep->hme_prev = osfhmep;
4523 sfhmep->hme_next = osfhmep->hme_next;
4525 if (osfhmep->hme_next)
4526 osfhmep->hme_next->hme_prev = sfhmep;
4528 osfhmep->hme_next = sfhmep;
4530 sfmmu_mlist_exit(pml);
4531 SFMMU_HASH_UNLOCK(hmebp);
4533 if (locked)
4534 page_unlock(pp);
4536 *rpfn = pfn;
4537 if (cookiep)
4538 *cookiep = (void *)pahmep;
4540 return (0);
4544 * Remove the relocation callbacks from the specified addr/len.
4546 void
4547 hat_delete_callback(caddr_t vaddr, uint_t len, void *pvt, uint_t flags,
4548 void *cookie)
4550 struct hmehash_bucket *hmebp;
4551 hmeblk_tag hblktag;
4552 struct hme_blk *hmeblkp;
4553 int hmeshift, hashno;
4554 caddr_t saddr;
4555 struct pa_hment *pahmep;
4556 struct sf_hment *sfhmep, *osfhmep;
4557 kmutex_t *pml;
4558 tte_t tte;
4559 page_t *pp;
4560 vnode_t *vp;
4561 u_offset_t off;
4562 int locked = 0;
4565 * If the cookie is HAC_COOKIE_NONE then there is no pa_hment to
4566 * remove so just return.
4568 if (cookie == HAC_COOKIE_NONE || IS_KPM_ADDR(vaddr))
4569 return;
4571 saddr = (caddr_t)((uintptr_t)vaddr & MMU_PAGEMASK);
4573 rehash:
4574 /* Find the mapping(s) for this page */
4575 for (hashno = TTE64K, hmeblkp = NULL;
4576 hmeblkp == NULL && hashno <= mmu_hashcnt;
4577 hashno++) {
4578 hmeshift = HME_HASH_SHIFT(hashno);
4579 hblktag.htag_id = ksfmmup;
4580 hblktag.htag_rid = SFMMU_INVALID_SHMERID;
4581 hblktag.htag_bspage = HME_HASH_BSPAGE(saddr, hmeshift);
4582 hblktag.htag_rehash = hashno;
4583 hmebp = HME_HASH_FUNCTION(ksfmmup, saddr, hmeshift);
4585 SFMMU_HASH_LOCK(hmebp);
4587 HME_HASH_FAST_SEARCH(hmebp, hblktag, hmeblkp);
4589 if (hmeblkp == NULL)
4590 SFMMU_HASH_UNLOCK(hmebp);
4593 if (hmeblkp == NULL)
4594 return;
4596 ASSERT(!hmeblkp->hblk_shared);
4598 HBLKTOHME(osfhmep, hmeblkp, saddr);
4600 sfmmu_copytte(&osfhmep->hme_tte, &tte);
4601 if (!TTE_IS_VALID(&tte)) {
4602 SFMMU_HASH_UNLOCK(hmebp);
4603 return;
4606 pp = osfhmep->hme_page;
4607 if (pp == NULL) {
4608 SFMMU_HASH_UNLOCK(hmebp);
4609 ASSERT(cookie == NULL);
4610 return;
4613 vp = pp->p_vnode;
4614 off = pp->p_offset;
4616 pml = sfmmu_mlist_enter(pp);
4618 if (flags & HAC_PAGELOCK) {
4619 if (!page_trylock(pp, SE_SHARED)) {
4621 * Somebody is holding SE_EXCL lock. Might
4622 * even be hat_page_relocate(). Drop all
4623 * our locks, lookup the page in &kvp, and
4624 * retry. If it doesn't exist in &kvp and &zvp,
4625 * then we must be dealing with a kernel mapped
4626 * page which doesn't actually belong to
4627 * segkmem so we punt.
4629 sfmmu_mlist_exit(pml);
4630 SFMMU_HASH_UNLOCK(hmebp);
4631 pp = page_lookup(&kvp, (u_offset_t)saddr, SE_SHARED);
4632 /* check zvp before giving up */
4633 if (pp == NULL)
4634 pp = page_lookup(&zvp, (u_offset_t)saddr,
4635 SE_SHARED);
4637 if (pp == NULL) {
4638 ASSERT(cookie == NULL);
4639 return;
4641 page_unlock(pp);
4642 goto rehash;
4644 locked = 1;
4647 ASSERT(PAGE_LOCKED(pp));
4649 if (osfhmep->hme_page != pp || pp->p_vnode != vp ||
4650 pp->p_offset != off) {
4652 * The page moved before we got our hands on it. Drop
4653 * all the locks and try again.
4655 ASSERT((flags & HAC_PAGELOCK) != 0);
4656 sfmmu_mlist_exit(pml);
4657 SFMMU_HASH_UNLOCK(hmebp);
4658 page_unlock(pp);
4659 locked = 0;
4660 goto rehash;
4663 if (!VN_ISKAS(vp)) {
4665 * This is not a segkmem page but another page which
4666 * has been kernel mapped.
4668 sfmmu_mlist_exit(pml);
4669 SFMMU_HASH_UNLOCK(hmebp);
4670 if (locked)
4671 page_unlock(pp);
4672 ASSERT(cookie == NULL);
4673 return;
4676 if (cookie != NULL) {
4677 pahmep = (struct pa_hment *)cookie;
4678 sfhmep = &pahmep->sfment;
4679 } else {
4680 for (sfhmep = pp->p_mapping; sfhmep != NULL;
4681 sfhmep = sfhmep->hme_next) {
4684 * skip va<->pa mappings
4686 if (!IS_PAHME(sfhmep))
4687 continue;
4689 pahmep = sfhmep->hme_data;
4690 ASSERT(pahmep != NULL);
4693 * if pa_hment matches, remove it
4695 if ((pahmep->pvt == pvt) &&
4696 (pahmep->addr == vaddr) &&
4697 (pahmep->len == len)) {
4698 break;
4703 if (sfhmep == NULL) {
4704 if (!panicstr) {
4705 panic("hat_delete_callback: pa_hment not found, pp %p",
4706 (void *)pp);
4708 return;
4712 * Note: at this point a valid kernel mapping must still be
4713 * present on this page.
4715 pp->p_share--;
4716 if (pp->p_share <= 0)
4717 panic("hat_delete_callback: zero p_share");
4719 if (--pahmep->refcnt == 0) {
4720 if (pahmep->flags != 0)
4721 panic("hat_delete_callback: pa_hment is busy");
4724 * Remove sfhmep from the mapping list for the page.
4726 if (sfhmep->hme_prev) {
4727 sfhmep->hme_prev->hme_next = sfhmep->hme_next;
4728 } else {
4729 pp->p_mapping = sfhmep->hme_next;
4732 if (sfhmep->hme_next)
4733 sfhmep->hme_next->hme_prev = sfhmep->hme_prev;
4735 sfmmu_mlist_exit(pml);
4736 SFMMU_HASH_UNLOCK(hmebp);
4738 if (locked)
4739 page_unlock(pp);
4741 kmem_cache_free(pa_hment_cache, pahmep);
4742 return;
4745 sfmmu_mlist_exit(pml);
4746 SFMMU_HASH_UNLOCK(hmebp);
4747 if (locked)
4748 page_unlock(pp);
4752 * hat_probe returns 1 if the translation for the address 'addr' is
4753 * loaded, zero otherwise.
4755 * hat_probe should be used only for advisory purposes because it may
4756 * occasionally return the wrong value. The implementation must guarantee that
4757 * returning the wrong value is a very rare event. hat_probe is used
4758 * to implement optimizations in the segment drivers.
4762 hat_probe(struct hat *sfmmup, caddr_t addr)
4764 pfn_t pfn;
4765 tte_t tte;
4767 ASSERT(sfmmup != NULL);
4768 ASSERT(sfmmup->sfmmu_xhat_provider == NULL);
4770 ASSERT((sfmmup == ksfmmup) || AS_LOCK_HELD(sfmmup->sfmmu_as));
4772 if (sfmmup == ksfmmup) {
4773 while ((pfn = sfmmu_vatopfn(addr, sfmmup, &tte))
4774 == PFN_SUSPENDED) {
4775 sfmmu_vatopfn_suspended(addr, sfmmup, &tte);
4777 } else {
4778 pfn = sfmmu_uvatopfn(addr, sfmmup, NULL);
4781 if (pfn != PFN_INVALID)
4782 return (1);
4783 else
4784 return (0);
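/*
 * Illustrative sketch (hypothetical caller, not taken from this file):
 * a segment driver can use hat_probe() as a cheap, advisory check
 * before doing more expensive work, e.g.:
 *
 *	if (hat_probe(as->a_hat, addr)) {
 *		... translation is very likely loaded already, take
 *		    the fast path ...
 *	} else {
 *		... fall back to the full fault/load path ...
 *	}
 *
 * Since the result is only advisory, both paths must remain correct
 * when hat_probe() occasionally returns the wrong answer.
 */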
4787 ssize_t
4788 hat_getpagesize(struct hat *sfmmup, caddr_t addr)
4790 tte_t tte;
4792 ASSERT(sfmmup->sfmmu_xhat_provider == NULL);
4794 if (sfmmup == ksfmmup) {
4795 if (sfmmu_vatopfn(addr, sfmmup, &tte) == PFN_INVALID) {
4796 return (-1);
4798 } else {
4799 if (sfmmu_uvatopfn(addr, sfmmup, &tte) == PFN_INVALID) {
4800 return (-1);
4804 ASSERT(TTE_IS_VALID(&tte));
4805 return (TTEBYTES(TTE_CSZ(&tte)));
4808 uint_t
4809 hat_getattr(struct hat *sfmmup, caddr_t addr, uint_t *attr)
4811 tte_t tte;
4813 ASSERT(sfmmup->sfmmu_xhat_provider == NULL);
4815 if (sfmmup == ksfmmup) {
4816 if (sfmmu_vatopfn(addr, sfmmup, &tte) == PFN_INVALID) {
4817 tte.ll = 0;
4819 } else {
4820 if (sfmmu_uvatopfn(addr, sfmmup, &tte) == PFN_INVALID) {
4821 tte.ll = 0;
4824 if (TTE_IS_VALID(&tte)) {
4825 *attr = sfmmu_ptov_attr(&tte);
4826 return (0);
4828 *attr = 0;
4829 return ((uint_t)0xffffffff);
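/*
 * Illustrative sketch (hypothetical caller): typical use of the two
 * query routines above.  hat_getpagesize() returns -1 and hat_getattr()
 * returns 0xffffffff when no valid translation exists for 'addr':
 *
 *	uint_t attr;
 *	ssize_t pgsz;
 *
 *	pgsz = hat_getpagesize(hat, addr);
 *	if (pgsz != -1 && hat_getattr(hat, addr, &attr) == 0 &&
 *	    (attr & PROT_WRITE)) {
 *		... addr is mapped writable by a pgsz-byte page ...
 *	}
 */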
4833 * Enables more attributes on the specified address range (i.e., logical OR)
4835 void
4836 hat_setattr(struct hat *hat, caddr_t addr, size_t len, uint_t attr)
4838 if (hat->sfmmu_xhat_provider) {
4839 XHAT_SETATTR(hat, addr, len, attr);
4840 return;
4841 } else {
4843 * This must be a CPU HAT. If the address space has
4844 * XHATs attached, change attributes for all of them,
4845 * just in case
4847 ASSERT(hat->sfmmu_as != NULL);
4848 if (hat->sfmmu_as->a_xhat != NULL)
4849 xhat_setattr_all(hat->sfmmu_as, addr, len, attr);
4852 sfmmu_chgattr(hat, addr, len, attr, SFMMU_SETATTR);
4856 * Assigns attributes to the specified address range. All the attributes
4857 * are specified.
4859 void
4860 hat_chgattr(struct hat *hat, caddr_t addr, size_t len, uint_t attr)
4862 if (hat->sfmmu_xhat_provider) {
4863 XHAT_CHGATTR(hat, addr, len, attr);
4864 return;
4865 } else {
4867 * This must be a CPU HAT. If the address space has
4868 * XHATs attached, change attributes for all of them,
4869 * just in case
4871 ASSERT(hat->sfmmu_as != NULL);
4872 if (hat->sfmmu_as->a_xhat != NULL)
4873 xhat_chgattr_all(hat->sfmmu_as, addr, len, attr);
4876 sfmmu_chgattr(hat, addr, len, attr, SFMMU_CHGATTR);
4880 * Remove attributes on the specified address range (i.e., logical NAND)
4882 void
4883 hat_clrattr(struct hat *hat, caddr_t addr, size_t len, uint_t attr)
4885 if (hat->sfmmu_xhat_provider) {
4886 XHAT_CLRATTR(hat, addr, len, attr);
4887 return;
4888 } else {
4890 * This must be a CPU HAT. If the address space has
4891 * XHATs attached, change attributes for all of them,
4892 * just in case
4894 ASSERT(hat->sfmmu_as != NULL);
4895 if (hat->sfmmu_as->a_xhat != NULL)
4896 xhat_clrattr_all(hat->sfmmu_as, addr, len, attr);
4899 sfmmu_chgattr(hat, addr, len, attr, SFMMU_CLRATTR);
4903 * Change attributes on an address range to that specified by attr and mode.
4905 static void
4906 sfmmu_chgattr(struct hat *sfmmup, caddr_t addr, size_t len, uint_t attr,
4907 int mode)
4909 struct hmehash_bucket *hmebp;
4910 hmeblk_tag hblktag;
4911 int hmeshift, hashno = 1;
4912 struct hme_blk *hmeblkp, *list = NULL;
4913 caddr_t endaddr;
4914 cpuset_t cpuset;
4915 demap_range_t dmr;
4917 CPUSET_ZERO(cpuset);
4919 ASSERT((sfmmup == ksfmmup) || AS_LOCK_HELD(sfmmup->sfmmu_as));
4920 ASSERT((len & MMU_PAGEOFFSET) == 0);
4921 ASSERT(((uintptr_t)addr & MMU_PAGEOFFSET) == 0);
4923 if ((attr & PROT_USER) && (mode != SFMMU_CLRATTR) &&
4924 ((addr + len) > (caddr_t)USERLIMIT)) {
4925 panic("user addr %p in kernel space",
4926 (void *)addr);
4929 endaddr = addr + len;
4930 hblktag.htag_id = sfmmup;
4931 hblktag.htag_rid = SFMMU_INVALID_SHMERID;
4932 DEMAP_RANGE_INIT(sfmmup, &dmr);
4934 while (addr < endaddr) {
4935 hmeshift = HME_HASH_SHIFT(hashno);
4936 hblktag.htag_bspage = HME_HASH_BSPAGE(addr, hmeshift);
4937 hblktag.htag_rehash = hashno;
4938 hmebp = HME_HASH_FUNCTION(sfmmup, addr, hmeshift);
4940 SFMMU_HASH_LOCK(hmebp);
4942 HME_HASH_SEARCH(hmebp, hblktag, hmeblkp, &list);
4943 if (hmeblkp != NULL) {
4944 ASSERT(!hmeblkp->hblk_shared);
4946 * We've encountered a shadow hmeblk so skip the range
4947 * of the next smaller mapping size.
4949 if (hmeblkp->hblk_shw_bit) {
4950 ASSERT(sfmmup != ksfmmup);
4951 ASSERT(hashno > 1);
4952 addr = (caddr_t)P2END((uintptr_t)addr,
4953 TTEBYTES(hashno - 1));
4954 } else {
4955 addr = sfmmu_hblk_chgattr(sfmmup,
4956 hmeblkp, addr, endaddr, &dmr, attr, mode);
4958 SFMMU_HASH_UNLOCK(hmebp);
4959 hashno = 1;
4960 continue;
4962 SFMMU_HASH_UNLOCK(hmebp);
4964 if (!HME_REHASH(sfmmup) || (hashno >= mmu_hashcnt)) {
4966 * We have traversed the whole list and rehashed
4967 * if necessary without finding the address to chgattr.
4968 * This is ok, so we increment the address by the
4969 * smallest hmeblk range for kernel mappings and for
4970 * user mappings with no large pages, or by the largest
4971 * hmeblk range, to account for shadow hmeblks, for
4972 * user mappings with large pages, and continue.
4974 if (sfmmup == ksfmmup)
4975 addr = (caddr_t)P2END((uintptr_t)addr,
4976 TTEBYTES(1));
4977 else
4978 addr = (caddr_t)P2END((uintptr_t)addr,
4979 TTEBYTES(hashno));
4980 hashno = 1;
4981 } else {
4982 hashno++;
4986 sfmmu_hblks_list_purge(&list, 0);
4987 DEMAP_RANGE_FLUSH(&dmr);
4988 cpuset = sfmmup->sfmmu_cpusran;
4989 xt_sync(cpuset);
4993 * This function changes attributes on a range of addresses in an hmeblk. It
4994 * returns the next address whose attributes need to be changed.
4995 * It should be called with the hash lock held.
4996 * XXX It should be possible to optimize chgattr by not flushing every time but
4997 * on the other hand:
4998 * 1. do one flush crosscall.
4999 * 2. only flush if we are increasing permissions (make sure this will work)
5001 static caddr_t
5002 sfmmu_hblk_chgattr(struct hat *sfmmup, struct hme_blk *hmeblkp, caddr_t addr,
5003 caddr_t endaddr, demap_range_t *dmrp, uint_t attr, int mode)
5005 tte_t tte, tteattr, tteflags, ttemod;
5006 struct sf_hment *sfhmep;
5007 int ttesz;
5008 struct page *pp = NULL;
5009 kmutex_t *pml, *pmtx;
5010 int ret;
5011 int use_demap_range;
5012 #if defined(SF_ERRATA_57)
5013 int check_exec;
5014 #endif
5016 ASSERT(in_hblk_range(hmeblkp, addr));
5017 ASSERT(hmeblkp->hblk_shw_bit == 0);
5018 ASSERT(!hmeblkp->hblk_shared);
5020 endaddr = MIN(endaddr, get_hblk_endaddr(hmeblkp));
5021 ttesz = get_hblk_ttesz(hmeblkp);
5024 * Flush the current demap region if addresses have been
5025 * skipped or the page size doesn't match.
5027 use_demap_range = (TTEBYTES(ttesz) == DEMAP_RANGE_PGSZ(dmrp));
5028 if (use_demap_range) {
5029 DEMAP_RANGE_CONTINUE(dmrp, addr, endaddr);
5030 } else if (dmrp != NULL) {
5031 DEMAP_RANGE_FLUSH(dmrp);
5034 tteattr.ll = sfmmu_vtop_attr(attr, mode, &tteflags);
5035 #if defined(SF_ERRATA_57)
5036 check_exec = (sfmmup != ksfmmup) &&
5037 AS_TYPE_64BIT(sfmmup->sfmmu_as) &&
5038 TTE_IS_EXECUTABLE(&tteattr);
5039 #endif
5040 HBLKTOHME(sfhmep, hmeblkp, addr);
5041 while (addr < endaddr) {
5042 sfmmu_copytte(&sfhmep->hme_tte, &tte);
5043 if (TTE_IS_VALID(&tte)) {
5044 if ((tte.ll & tteflags.ll) == tteattr.ll) {
5046 * if the new attr is the same as old
5047 * continue
5049 goto next_addr;
5051 if (!TTE_IS_WRITABLE(&tteattr)) {
5053 * make sure we clear the hw modify bit if we are
5054 * removing write protection
5056 tteflags.tte_intlo |= TTE_HWWR_INT;
5059 pml = NULL;
5060 pp = sfhmep->hme_page;
5061 if (pp) {
5062 pml = sfmmu_mlist_enter(pp);
5065 if (pp != sfhmep->hme_page) {
5067 * tte must have been unloaded.
5069 ASSERT(pml);
5070 sfmmu_mlist_exit(pml);
5071 continue;
5074 ASSERT(pp == NULL || sfmmu_mlist_held(pp));
5076 ttemod = tte;
5077 ttemod.ll = (ttemod.ll & ~tteflags.ll) | tteattr.ll;
5078 ASSERT(TTE_TO_TTEPFN(&ttemod) == TTE_TO_TTEPFN(&tte));
5080 #if defined(SF_ERRATA_57)
5081 if (check_exec && addr < errata57_limit)
5082 ttemod.tte_exec_perm = 0;
5083 #endif
5084 ret = sfmmu_modifytte_try(&tte, &ttemod,
5085 &sfhmep->hme_tte);
5087 if (ret < 0) {
5088 /* tte changed underneath us */
5089 if (pml) {
5090 sfmmu_mlist_exit(pml);
5092 continue;
5095 if (tteflags.tte_intlo & TTE_HWWR_INT) {
5097 * need to sync if we are clearing modify bit.
5099 sfmmu_ttesync(sfmmup, addr, &tte, pp);
5102 if (pp && PP_ISRO(pp)) {
5103 if (tteattr.tte_intlo & TTE_WRPRM_INT) {
5104 pmtx = sfmmu_page_enter(pp);
5105 PP_CLRRO(pp);
5106 sfmmu_page_exit(pmtx);
5110 if (ret > 0 && use_demap_range) {
5111 DEMAP_RANGE_MARKPG(dmrp, addr);
5112 } else if (ret > 0) {
5113 sfmmu_tlb_demap(addr, sfmmup, hmeblkp, 0, 0);
5116 if (pml) {
5117 sfmmu_mlist_exit(pml);
5120 next_addr:
5121 addr += TTEBYTES(ttesz);
5122 sfhmep++;
5123 DEMAP_RANGE_NEXTPG(dmrp);
5125 return (addr);
5129 * This routine converts virtual attributes to physical ones. It will
5130 * update the tteflags field with the tte mask corresponding to the attributes
5131 * affected and it returns the new attributes. It will also clear the modify
5132 * bit if we are taking away write permission. This is necessary since the
5133 * modify bit is the hardware permission bit and we need to clear it in order
5134 * to detect write faults.
5136 static uint64_t
5137 sfmmu_vtop_attr(uint_t attr, int mode, tte_t *ttemaskp)
5139 tte_t ttevalue;
5141 ASSERT(!(attr & ~SFMMU_LOAD_ALLATTR));
5143 switch (mode) {
5144 case SFMMU_CHGATTR:
5145 /* all attributes specified */
5146 ttevalue.tte_inthi = MAKE_TTEATTR_INTHI(attr);
5147 ttevalue.tte_intlo = MAKE_TTEATTR_INTLO(attr);
5148 ttemaskp->tte_inthi = TTEINTHI_ATTR;
5149 ttemaskp->tte_intlo = TTEINTLO_ATTR;
5150 break;
5151 case SFMMU_SETATTR:
5152 ASSERT(!(attr & ~HAT_PROT_MASK));
5153 ttemaskp->ll = 0;
5154 ttevalue.ll = 0;
5156 * a valid tte implies exec and read for sfmmu
5157 * so no need to do anything about them.
5158 * since privileged access implies user access,
5159 * PROT_USER doesn't make sense either.
5161 if (attr & PROT_WRITE) {
5162 ttemaskp->tte_intlo |= TTE_WRPRM_INT;
5163 ttevalue.tte_intlo |= TTE_WRPRM_INT;
5165 break;
5166 case SFMMU_CLRATTR:
5167 /* attributes will be nand with current ones */
5168 if (attr & ~(PROT_WRITE | PROT_USER)) {
5169 panic("sfmmu: attr %x not supported", attr);
5171 ttemaskp->ll = 0;
5172 ttevalue.ll = 0;
5173 if (attr & PROT_WRITE) {
5174 /* clear both writable and modify bit */
5175 ttemaskp->tte_intlo |= TTE_WRPRM_INT | TTE_HWWR_INT;
5177 if (attr & PROT_USER) {
5178 ttemaskp->tte_intlo |= TTE_PRIV_INT;
5179 ttevalue.tte_intlo |= TTE_PRIV_INT;
5181 break;
5182 default:
5183 panic("sfmmu_vtop_attr: bad mode %x", mode);
5185 ASSERT(TTE_TO_TTEPFN(&ttevalue) == 0);
5186 return (ttevalue.ll);
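/*
 * Worked example for the routine above: clearing write permission with
 * mode == SFMMU_CLRATTR and attr == PROT_WRITE takes the branch that
 * sets both TTE_WRPRM_INT and TTE_HWWR_INT in the mask while leaving
 * the returned value zero, so the caller clears the writable bit and
 * the hardware modify bit together:
 *
 *	tte_t mask;
 *	uint64_t val = sfmmu_vtop_attr(PROT_WRITE, SFMMU_CLRATTR, &mask);
 *	... val == 0, mask.tte_intlo == (TTE_WRPRM_INT | TTE_HWWR_INT) ...
 */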
5189 static uint_t
5190 sfmmu_ptov_attr(tte_t *ttep)
5192 uint_t attr;
5194 ASSERT(TTE_IS_VALID(ttep));
5196 attr = PROT_READ;
5198 if (TTE_IS_WRITABLE(ttep)) {
5199 attr |= PROT_WRITE;
5201 if (TTE_IS_EXECUTABLE(ttep)) {
5202 attr |= PROT_EXEC;
5204 if (!TTE_IS_PRIVILEGED(ttep)) {
5205 attr |= PROT_USER;
5207 if (TTE_IS_NFO(ttep)) {
5208 attr |= HAT_NOFAULT;
5210 if (TTE_IS_NOSYNC(ttep)) {
5211 attr |= HAT_NOSYNC;
5213 if (TTE_IS_SIDEFFECT(ttep)) {
5214 attr |= SFMMU_SIDEFFECT;
5216 if (!TTE_IS_VCACHEABLE(ttep)) {
5217 attr |= SFMMU_UNCACHEVTTE;
5219 if (!TTE_IS_PCACHEABLE(ttep)) {
5220 attr |= SFMMU_UNCACHEPTTE;
5222 return (attr);
5226 * hat_chgprot is a deprecated hat call. New segment drivers
5227 * should store all attributes and use hat_*attr calls.
5229 * Change the protections in the virtual address range
5230 * given to the specified virtual protection. If vprot is ~PROT_WRITE,
5231 * then remove write permission, leaving the other
5232 * permissions unchanged. If vprot is ~PROT_USER, remove user permissions.
5235 void
5236 hat_chgprot(struct hat *sfmmup, caddr_t addr, size_t len, uint_t vprot)
5238 struct hmehash_bucket *hmebp;
5239 hmeblk_tag hblktag;
5240 int hmeshift, hashno = 1;
5241 struct hme_blk *hmeblkp, *list = NULL;
5242 caddr_t endaddr;
5243 cpuset_t cpuset;
5244 demap_range_t dmr;
5246 ASSERT((len & MMU_PAGEOFFSET) == 0);
5247 ASSERT(((uintptr_t)addr & MMU_PAGEOFFSET) == 0);
5249 if (sfmmup->sfmmu_xhat_provider) {
5250 XHAT_CHGPROT(sfmmup, addr, len, vprot);
5251 return;
5252 } else {
5254 * This must be a CPU HAT. If the address space has
5255 * XHATs attached, change attributes for all of them,
5256 * just in case
5258 ASSERT(sfmmup->sfmmu_as != NULL);
5259 if (sfmmup->sfmmu_as->a_xhat != NULL)
5260 xhat_chgprot_all(sfmmup->sfmmu_as, addr, len, vprot);
5263 CPUSET_ZERO(cpuset);
5265 if ((vprot != (uint_t)~PROT_WRITE) && (vprot & PROT_USER) &&
5266 ((addr + len) > (caddr_t)USERLIMIT)) {
5267 panic("user addr %p vprot %x in kernel space",
5268 (void *)addr, vprot);
5270 endaddr = addr + len;
5271 hblktag.htag_id = sfmmup;
5272 hblktag.htag_rid = SFMMU_INVALID_SHMERID;
5273 DEMAP_RANGE_INIT(sfmmup, &dmr);
5275 while (addr < endaddr) {
5276 hmeshift = HME_HASH_SHIFT(hashno);
5277 hblktag.htag_bspage = HME_HASH_BSPAGE(addr, hmeshift);
5278 hblktag.htag_rehash = hashno;
5279 hmebp = HME_HASH_FUNCTION(sfmmup, addr, hmeshift);
5281 SFMMU_HASH_LOCK(hmebp);
5283 HME_HASH_SEARCH(hmebp, hblktag, hmeblkp, &list);
5284 if (hmeblkp != NULL) {
5285 ASSERT(!hmeblkp->hblk_shared);
5287 * We've encountered a shadow hmeblk so skip the range
5288 * of the next smaller mapping size.
5290 if (hmeblkp->hblk_shw_bit) {
5291 ASSERT(sfmmup != ksfmmup);
5292 ASSERT(hashno > 1);
5293 addr = (caddr_t)P2END((uintptr_t)addr,
5294 TTEBYTES(hashno - 1));
5295 } else {
5296 addr = sfmmu_hblk_chgprot(sfmmup, hmeblkp,
5297 addr, endaddr, &dmr, vprot);
5299 SFMMU_HASH_UNLOCK(hmebp);
5300 hashno = 1;
5301 continue;
5303 SFMMU_HASH_UNLOCK(hmebp);
5305 if (!HME_REHASH(sfmmup) || (hashno >= mmu_hashcnt)) {
5307 * We have traversed the whole list and rehashed
5308 * if necessary without finding the address to chgprot.
5309 * This is ok, so we increment the address by the
5310 * smallest hmeblk range for kernel mappings and the
5311 * largest hmeblk range, to account for shadow hmeblks,
5312 * for user mappings and continue.
5314 if (sfmmup == ksfmmup)
5315 addr = (caddr_t)P2END((uintptr_t)addr,
5316 TTEBYTES(1));
5317 else
5318 addr = (caddr_t)P2END((uintptr_t)addr,
5319 TTEBYTES(hashno));
5320 hashno = 1;
5321 } else {
5322 hashno++;
5326 sfmmu_hblks_list_purge(&list, 0);
5327 DEMAP_RANGE_FLUSH(&dmr);
5328 cpuset = sfmmup->sfmmu_cpusran;
5329 xt_sync(cpuset);
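/*
 * Illustrative sketch (hypothetical caller) of the two special vprot
 * values accepted by this deprecated interface, e.g. to write-protect
 * or to drop user access on one MMU_PAGESIZE page at 'addr':
 *
 *	hat_chgprot(hat, addr, MMU_PAGESIZE, (uint_t)~PROT_WRITE);
 *	hat_chgprot(hat, addr, MMU_PAGESIZE, (uint_t)~PROT_USER);
 *
 * New callers should store all attributes and use the hat_*attr()
 * entry points above instead, as noted in the comment for hat_chgprot.
 */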
5333 * This function changes protections on a range of addresses in an hmeblk.
5334 * It returns the next address whose protections need to be changed.
5335 * It should be called with the hash lock held.
5336 * XXX It should be possible to optimize chgprot by not flushing every time but
5337 * on the other hand:
5338 * 1. do one flush crosscall.
5339 * 2. only flush if we are increasing permissions (make sure this will work)
5341 static caddr_t
5342 sfmmu_hblk_chgprot(sfmmu_t *sfmmup, struct hme_blk *hmeblkp, caddr_t addr,
5343 caddr_t endaddr, demap_range_t *dmrp, uint_t vprot)
5345 uint_t pprot;
5346 tte_t tte, ttemod;
5347 struct sf_hment *sfhmep;
5348 uint_t tteflags;
5349 int ttesz;
5350 struct page *pp = NULL;
5351 kmutex_t *pml, *pmtx;
5352 int ret;
5353 int use_demap_range;
5354 #if defined(SF_ERRATA_57)
5355 int check_exec;
5356 #endif
5358 ASSERT(in_hblk_range(hmeblkp, addr));
5359 ASSERT(hmeblkp->hblk_shw_bit == 0);
5360 ASSERT(!hmeblkp->hblk_shared);
5362 #ifdef DEBUG
5363 if (get_hblk_ttesz(hmeblkp) != TTE8K &&
5364 (endaddr < get_hblk_endaddr(hmeblkp))) {
5365 panic("sfmmu_hblk_chgprot: partial chgprot of large page");
5367 #endif /* DEBUG */
5369 endaddr = MIN(endaddr, get_hblk_endaddr(hmeblkp));
5370 ttesz = get_hblk_ttesz(hmeblkp);
5372 pprot = sfmmu_vtop_prot(vprot, &tteflags);
5373 #if defined(SF_ERRATA_57)
5374 check_exec = (sfmmup != ksfmmup) &&
5375 AS_TYPE_64BIT(sfmmup->sfmmu_as) &&
5376 ((vprot & PROT_EXEC) == PROT_EXEC);
5377 #endif
5378 HBLKTOHME(sfhmep, hmeblkp, addr);
5381 * Flush the current demap region if addresses have been
5382 * skipped or the page size doesn't match.
5384 use_demap_range = (TTEBYTES(ttesz) == MMU_PAGESIZE);
5385 if (use_demap_range) {
5386 DEMAP_RANGE_CONTINUE(dmrp, addr, endaddr);
5387 } else if (dmrp != NULL) {
5388 DEMAP_RANGE_FLUSH(dmrp);
5391 while (addr < endaddr) {
5392 sfmmu_copytte(&sfhmep->hme_tte, &tte);
5393 if (TTE_IS_VALID(&tte)) {
5394 if (TTE_GET_LOFLAGS(&tte, tteflags) == pprot) {
5396 * if the new protection is the same as old
5397 * continue
5399 goto next_addr;
5401 pml = NULL;
5402 pp = sfhmep->hme_page;
5403 if (pp) {
5404 pml = sfmmu_mlist_enter(pp);
5406 if (pp != sfhmep->hme_page) {
5408 * tte must have been unloaded
5409 * underneath us. Recheck
5411 ASSERT(pml);
5412 sfmmu_mlist_exit(pml);
5413 continue;
5416 ASSERT(pp == NULL || sfmmu_mlist_held(pp));
5418 ttemod = tte;
5419 TTE_SET_LOFLAGS(&ttemod, tteflags, pprot);
5420 #if defined(SF_ERRATA_57)
5421 if (check_exec && addr < errata57_limit)
5422 ttemod.tte_exec_perm = 0;
5423 #endif
5424 ret = sfmmu_modifytte_try(&tte, &ttemod,
5425 &sfhmep->hme_tte);
5427 if (ret < 0) {
5428 /* tte changed underneath us */
5429 if (pml) {
5430 sfmmu_mlist_exit(pml);
5432 continue;
5435 if (tteflags & TTE_HWWR_INT) {
5437 * need to sync if we are clearing modify bit.
5439 sfmmu_ttesync(sfmmup, addr, &tte, pp);
5442 if (pp && PP_ISRO(pp)) {
5443 if (pprot & TTE_WRPRM_INT) {
5444 pmtx = sfmmu_page_enter(pp);
5445 PP_CLRRO(pp);
5446 sfmmu_page_exit(pmtx);
5450 if (ret > 0 && use_demap_range) {
5451 DEMAP_RANGE_MARKPG(dmrp, addr);
5452 } else if (ret > 0) {
5453 sfmmu_tlb_demap(addr, sfmmup, hmeblkp, 0, 0);
5456 if (pml) {
5457 sfmmu_mlist_exit(pml);
5460 next_addr:
5461 addr += TTEBYTES(ttesz);
5462 sfhmep++;
5463 DEMAP_RANGE_NEXTPG(dmrp);
5465 return (addr);
5469 * This routine is deprecated and should only be used by hat_chgprot.
5470 * The correct routine is sfmmu_vtop_attr.
5471 * This routine converts virtual page protections to physical ones. It will
5472 * update the tteflags field with the tte mask corresponding to the protections
5473 * affected and it returns the new protections. It will also clear the modify
5474 * bit if we are taking away write permission. This is necessary since the
5475 * modify bit is the hardware permission bit and we need to clear it in order
5476 * to detect write faults.
5477 * It accepts the following special protections:
5478 * ~PROT_WRITE = remove write permissions.
5479 * ~PROT_USER = remove user permissions.
5481 static uint_t
5482 sfmmu_vtop_prot(uint_t vprot, uint_t *tteflagsp)
5484 if (vprot == (uint_t)~PROT_WRITE) {
5485 *tteflagsp = TTE_WRPRM_INT | TTE_HWWR_INT;
5486 return (0); /* will cause wrprm to be cleared */
5488 if (vprot == (uint_t)~PROT_USER) {
5489 *tteflagsp = TTE_PRIV_INT;
5490 return (0); /* will cause privprm to be cleared */
5492 if ((vprot == 0) || (vprot == PROT_USER) ||
5493 ((vprot & PROT_ALL) != vprot)) {
5494 panic("sfmmu_vtop_prot -- bad prot %x", vprot);
5497 switch (vprot) {
5498 case (PROT_READ):
5499 case (PROT_EXEC):
5500 case (PROT_EXEC | PROT_READ):
5501 *tteflagsp = TTE_PRIV_INT | TTE_WRPRM_INT | TTE_HWWR_INT;
5502 return (TTE_PRIV_INT); /* set prv and clr wrt */
5503 case (PROT_WRITE):
5504 case (PROT_WRITE | PROT_READ):
5505 case (PROT_EXEC | PROT_WRITE):
5506 case (PROT_EXEC | PROT_WRITE | PROT_READ):
5507 *tteflagsp = TTE_PRIV_INT | TTE_WRPRM_INT;
5508 return (TTE_PRIV_INT | TTE_WRPRM_INT); /* set prv and wrt */
5509 case (PROT_USER | PROT_READ):
5510 case (PROT_USER | PROT_EXEC):
5511 case (PROT_USER | PROT_EXEC | PROT_READ):
5512 *tteflagsp = TTE_PRIV_INT | TTE_WRPRM_INT | TTE_HWWR_INT;
5513 return (0); /* clr prv and wrt */
5514 case (PROT_USER | PROT_WRITE):
5515 case (PROT_USER | PROT_WRITE | PROT_READ):
5516 case (PROT_USER | PROT_EXEC | PROT_WRITE):
5517 case (PROT_USER | PROT_EXEC | PROT_WRITE | PROT_READ):
5518 *tteflagsp = TTE_PRIV_INT | TTE_WRPRM_INT;
5519 return (TTE_WRPRM_INT); /* clr prv and set wrt */
5520 default:
5521 panic("sfmmu_vtop_prot -- bad prot %x", vprot);
5523 return (0);
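/*
 * Worked example for the routine above: vprot == (PROT_READ | PROT_WRITE)
 * selects the (PROT_WRITE | PROT_READ) case, so the mask and the return
 * value are both TTE_PRIV_INT | TTE_WRPRM_INT, i.e. a privileged,
 * writable mapping:
 *
 *	uint_t mask;
 *	uint_t pprot = sfmmu_vtop_prot(PROT_READ | PROT_WRITE, &mask);
 *	... pprot == mask == (TTE_PRIV_INT | TTE_WRPRM_INT) ...
 */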
5527 * Alternate unload for very large virtual ranges. With a true 64 bit VA,
5528 * the normal algorithm would take too long for a very large VA range with
5529 * few real mappings. This routine just walks through all HMEs in the global
5530 * hash table to find and remove mappings.
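/*
 * Illustrative arithmetic (derived from the caller's check in
 * hat_unload_callback(), which compares len >> TTE_PAGE_SHIFT(TTE4M)
 * against UHMEHASH_SZ): probing a range at 4M steps costs one hash
 * lookup per 4M, so a sparse 1TB range would need 1TB / 4MB = 262144
 * probes.  Whenever that probe count exceeds the number of user hash
 * buckets, it is cheaper to walk all UHMEHASH_SZ buckets once, which
 * is what this routine does.
 */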
5532 static void
5533 hat_unload_large_virtual(
5534 struct hat *sfmmup,
5535 caddr_t startaddr,
5536 size_t len,
5537 uint_t flags,
5538 hat_callback_t *callback)
5540 struct hmehash_bucket *hmebp;
5541 struct hme_blk *hmeblkp;
5542 struct hme_blk *pr_hblk = NULL;
5543 struct hme_blk *nx_hblk;
5544 struct hme_blk *list = NULL;
5545 int i;
5546 demap_range_t dmr, *dmrp;
5547 cpuset_t cpuset;
5548 caddr_t endaddr = startaddr + len;
5549 caddr_t sa;
5550 caddr_t ea;
5551 caddr_t cb_sa[MAX_CB_ADDR];
5552 caddr_t cb_ea[MAX_CB_ADDR];
5553 int addr_cnt = 0;
5554 int a = 0;
5556 if (sfmmup->sfmmu_free) {
5557 dmrp = NULL;
5558 } else {
5559 dmrp = &dmr;
5560 DEMAP_RANGE_INIT(sfmmup, dmrp);
5564 * Loop through all the hash buckets of HME blocks looking for matches.
5566 for (i = 0; i <= UHMEHASH_SZ; i++) {
5567 hmebp = &uhme_hash[i];
5568 SFMMU_HASH_LOCK(hmebp);
5569 hmeblkp = hmebp->hmeblkp;
5570 pr_hblk = NULL;
5571 while (hmeblkp) {
5572 nx_hblk = hmeblkp->hblk_next;
5575 * skip if not this context, if a shadow block or
5576 * if the mapping is not in the requested range
5578 if (hmeblkp->hblk_tag.htag_id != sfmmup ||
5579 hmeblkp->hblk_shw_bit ||
5580 (sa = (caddr_t)get_hblk_base(hmeblkp)) >= endaddr ||
5581 (ea = get_hblk_endaddr(hmeblkp)) <= startaddr) {
5582 pr_hblk = hmeblkp;
5583 goto next_block;
5586 ASSERT(!hmeblkp->hblk_shared);
5588 * unload if there are any current valid mappings
5590 if (hmeblkp->hblk_vcnt != 0 ||
5591 hmeblkp->hblk_hmecnt != 0)
5592 (void) sfmmu_hblk_unload(sfmmup, hmeblkp,
5593 sa, ea, dmrp, flags);
5596 * on unmap we also release the HME block itself, once
5597 * all mappings are gone.
5599 if ((flags & HAT_UNLOAD_UNMAP) != 0 &&
5600 !hmeblkp->hblk_vcnt &&
5601 !hmeblkp->hblk_hmecnt) {
5602 ASSERT(!hmeblkp->hblk_lckcnt);
5603 sfmmu_hblk_hash_rm(hmebp, hmeblkp, pr_hblk,
5604 &list, 0);
5605 } else {
5606 pr_hblk = hmeblkp;
5609 if (callback == NULL)
5610 goto next_block;
5613 * HME blocks may span more than one page, but we may be
5614 * unmapping only one page, so check for a smaller range
5615 * for the callback
5617 if (sa < startaddr)
5618 sa = startaddr;
5619 if (--ea > endaddr)
5620 ea = endaddr - 1;
5622 cb_sa[addr_cnt] = sa;
5623 cb_ea[addr_cnt] = ea;
5624 if (++addr_cnt == MAX_CB_ADDR) {
5625 if (dmrp != NULL) {
5626 DEMAP_RANGE_FLUSH(dmrp);
5627 cpuset = sfmmup->sfmmu_cpusran;
5628 xt_sync(cpuset);
5631 for (a = 0; a < MAX_CB_ADDR; ++a) {
5632 callback->hcb_start_addr = cb_sa[a];
5633 callback->hcb_end_addr = cb_ea[a];
5634 callback->hcb_function(callback);
5636 addr_cnt = 0;
5639 next_block:
5640 hmeblkp = nx_hblk;
5642 SFMMU_HASH_UNLOCK(hmebp);
5645 sfmmu_hblks_list_purge(&list, 0);
5646 if (dmrp != NULL) {
5647 DEMAP_RANGE_FLUSH(dmrp);
5648 cpuset = sfmmup->sfmmu_cpusran;
5649 xt_sync(cpuset);
5652 for (a = 0; a < addr_cnt; ++a) {
5653 callback->hcb_start_addr = cb_sa[a];
5654 callback->hcb_end_addr = cb_ea[a];
5655 callback->hcb_function(callback);
5659 * Check TSB and TLB page sizes if the process isn't exiting.
5661 if (!sfmmup->sfmmu_free)
5662 sfmmu_check_page_sizes(sfmmup, 0);
5666 * Unload all the mappings in the range [addr..addr+len). addr and len must
5667 * be MMU_PAGESIZE aligned.
5670 extern struct seg *segkmap;
5671 #define ISSEGKMAP(sfmmup, addr) (sfmmup == ksfmmup && \
5672 segkmap->s_base <= (addr) && (addr) < (segkmap->s_base + segkmap->s_size))
5675 void
5676 hat_unload_callback(
5677 struct hat *sfmmup,
5678 caddr_t addr,
5679 size_t len,
5680 uint_t flags,
5681 hat_callback_t *callback)
5683 struct hmehash_bucket *hmebp;
5684 hmeblk_tag hblktag;
5685 int hmeshift, hashno, iskernel;
5686 struct hme_blk *hmeblkp, *pr_hblk, *list = NULL;
5687 caddr_t endaddr;
5688 cpuset_t cpuset;
5689 int addr_count = 0;
5690 int a;
5691 caddr_t cb_start_addr[MAX_CB_ADDR];
5692 caddr_t cb_end_addr[MAX_CB_ADDR];
5693 int issegkmap = ISSEGKMAP(sfmmup, addr);
5694 demap_range_t dmr, *dmrp;
5696 if (sfmmup->sfmmu_xhat_provider) {
5697 XHAT_UNLOAD_CALLBACK(sfmmup, addr, len, flags, callback);
5698 return;
5699 } else {
5701 * This must be a CPU HAT. If the address space has
5702 * XHATs attached, unload the mappings for all of them,
5703 * just in case
5705 ASSERT(sfmmup->sfmmu_as != NULL);
5706 if (sfmmup->sfmmu_as->a_xhat != NULL)
5707 xhat_unload_callback_all(sfmmup->sfmmu_as, addr,
5708 len, flags, callback);
5711 ASSERT((sfmmup == ksfmmup) || (flags & HAT_UNLOAD_OTHER) || \
5712 AS_LOCK_HELD(sfmmup->sfmmu_as));
5714 ASSERT(sfmmup != NULL);
5715 ASSERT((len & MMU_PAGEOFFSET) == 0);
5716 ASSERT(!((uintptr_t)addr & MMU_PAGEOFFSET));
5719 * Probing through a large VA range (say 63 bits) will be slow, even
5720 * at 4 Meg steps between the probes. So, when the virtual address range
5721 * is very large, search the HME entries for what to unload.
5723 * len >> TTE_PAGE_SHIFT(TTE4M) is the # of 4Meg probes we'd need
5725 * UHMEHASH_SZ is number of hash buckets to examine
5728 if (sfmmup != KHATID && (len >> TTE_PAGE_SHIFT(TTE4M)) > UHMEHASH_SZ) {
5729 hat_unload_large_virtual(sfmmup, addr, len, flags, callback);
5730 return;
5733 CPUSET_ZERO(cpuset);
5736 * If the process is exiting, we can save a lot of fuss since
5737 * we'll flush the TLB when we free the ctx anyway.
5739 if (sfmmup->sfmmu_free) {
5740 dmrp = NULL;
5741 } else {
5742 dmrp = &dmr;
5743 DEMAP_RANGE_INIT(sfmmup, dmrp);
5746 endaddr = addr + len;
5747 hblktag.htag_id = sfmmup;
5748 hblktag.htag_rid = SFMMU_INVALID_SHMERID;
5751 * It is likely for the vm to call unload over a wide range of
5752 * addresses that are actually very sparsely populated by
5753 * translations. In order to speed this up the sfmmu hat supports
5754 * the concept of shadow hmeblks. Dummy large page hmeblks that
5755 * correspond to actual small translations are allocated at tteload
5756 * time and are referred to as shadow hmeblks. Now, during unload
5757 * time, we first check if we have a shadow hmeblk for that
5758 * translation. The absence of one means the corresponding address
5759 * range is empty and can be skipped.
5761 * The kernel is an exception to above statement and that is why
5762 * we don't use shadow hmeblks and hash starting from the smallest
5763 * page size.
5765 if (sfmmup == KHATID) {
5766 iskernel = 1;
5767 hashno = TTE64K;
5768 } else {
5769 iskernel = 0;
5770 if (mmu_page_sizes == max_mmu_page_sizes) {
5771 hashno = TTE256M;
5772 } else {
5773 hashno = TTE4M;
5776 while (addr < endaddr) {
5777 hmeshift = HME_HASH_SHIFT(hashno);
5778 hblktag.htag_bspage = HME_HASH_BSPAGE(addr, hmeshift);
5779 hblktag.htag_rehash = hashno;
5780 hmebp = HME_HASH_FUNCTION(sfmmup, addr, hmeshift);
5782 SFMMU_HASH_LOCK(hmebp);
5784 HME_HASH_SEARCH_PREV(hmebp, hblktag, hmeblkp, pr_hblk, &list);
5785 if (hmeblkp == NULL) {
5787 * didn't find an hmeblk. skip the appropriate
5788 * address range.
5790 SFMMU_HASH_UNLOCK(hmebp);
5791 if (iskernel) {
5792 if (hashno < mmu_hashcnt) {
5793 hashno++;
5794 continue;
5795 } else {
5796 hashno = TTE64K;
5797 addr = (caddr_t)roundup((uintptr_t)addr
5798 + 1, MMU_PAGESIZE64K);
5799 continue;
5802 addr = (caddr_t)roundup((uintptr_t)addr + 1,
5803 (1 << hmeshift));
5804 if ((uintptr_t)addr & MMU_PAGEOFFSET512K) {
5805 ASSERT(hashno == TTE64K);
5806 continue;
5808 if ((uintptr_t)addr & MMU_PAGEOFFSET4M) {
5809 hashno = TTE512K;
5810 continue;
5812 if (mmu_page_sizes == max_mmu_page_sizes) {
5813 if ((uintptr_t)addr & MMU_PAGEOFFSET32M) {
5814 hashno = TTE4M;
5815 continue;
5817 if ((uintptr_t)addr & MMU_PAGEOFFSET256M) {
5818 hashno = TTE32M;
5819 continue;
5821 hashno = TTE256M;
5822 continue;
5823 } else {
5824 hashno = TTE4M;
5825 continue;
5828 ASSERT(hmeblkp);
5829 ASSERT(!hmeblkp->hblk_shared);
5830 if (!hmeblkp->hblk_vcnt && !hmeblkp->hblk_hmecnt) {
5832 * If the valid count is zero we can skip the range
5833 * mapped by this hmeblk.
5834 * We free hblks in the case of HAT_UNMAP. HAT_UNMAP
5835 * is used by segment drivers as a hint
5836 * that the mapping resource won't be used any longer.
5837 * The best example of this is during exit().
5839 addr = (caddr_t)roundup((uintptr_t)addr + 1,
5840 get_hblk_span(hmeblkp));
5841 if ((flags & HAT_UNLOAD_UNMAP) ||
5842 (iskernel && !issegkmap)) {
5843 sfmmu_hblk_hash_rm(hmebp, hmeblkp, pr_hblk,
5844 &list, 0);
5846 SFMMU_HASH_UNLOCK(hmebp);
5848 if (iskernel) {
5849 hashno = TTE64K;
5850 continue;
5852 if ((uintptr_t)addr & MMU_PAGEOFFSET512K) {
5853 ASSERT(hashno == TTE64K);
5854 continue;
5856 if ((uintptr_t)addr & MMU_PAGEOFFSET4M) {
5857 hashno = TTE512K;
5858 continue;
5860 if (mmu_page_sizes == max_mmu_page_sizes) {
5861 if ((uintptr_t)addr & MMU_PAGEOFFSET32M) {
5862 hashno = TTE4M;
5863 continue;
5865 if ((uintptr_t)addr & MMU_PAGEOFFSET256M) {
5866 hashno = TTE32M;
5867 continue;
5869 hashno = TTE256M;
5870 continue;
5871 } else {
5872 hashno = TTE4M;
5873 continue;
5876 if (hmeblkp->hblk_shw_bit) {
5878 * If we encounter a shadow hmeblk we know there are
5879 * smaller sized hmeblks mapping the same address space.
5880 * Decrement the hash size and rehash.
5882 ASSERT(sfmmup != KHATID);
5883 hashno--;
5884 SFMMU_HASH_UNLOCK(hmebp);
5885 continue;
5889 * track callback address ranges.
5890 * only start a new range when it's not contiguous
5892 if (callback != NULL) {
5893 if (addr_count > 0 &&
5894 addr == cb_end_addr[addr_count - 1])
5895 --addr_count;
5896 else
5897 cb_start_addr[addr_count] = addr;
5900 addr = sfmmu_hblk_unload(sfmmup, hmeblkp, addr, endaddr,
5901 dmrp, flags);
5903 if (callback != NULL)
5904 cb_end_addr[addr_count++] = addr;
5906 if (((flags & HAT_UNLOAD_UNMAP) || (iskernel && !issegkmap)) &&
5907 !hmeblkp->hblk_vcnt && !hmeblkp->hblk_hmecnt) {
5908 sfmmu_hblk_hash_rm(hmebp, hmeblkp, pr_hblk, &list, 0);
5910 SFMMU_HASH_UNLOCK(hmebp);
5913 * Notify our caller as to exactly which pages
5914 * have been unloaded. We do these in clumps,
5915 * to minimize the number of xt_sync()s that need to occur.
5917 if (callback != NULL && addr_count == MAX_CB_ADDR) {
5918 if (dmrp != NULL) {
5919 DEMAP_RANGE_FLUSH(dmrp);
5920 cpuset = sfmmup->sfmmu_cpusran;
5921 xt_sync(cpuset);
5924 for (a = 0; a < MAX_CB_ADDR; ++a) {
5925 callback->hcb_start_addr = cb_start_addr[a];
5926 callback->hcb_end_addr = cb_end_addr[a];
5927 callback->hcb_function(callback);
5929 addr_count = 0;
5931 if (iskernel) {
5932 hashno = TTE64K;
5933 continue;
5935 if ((uintptr_t)addr & MMU_PAGEOFFSET512K) {
5936 ASSERT(hashno == TTE64K);
5937 continue;
5939 if ((uintptr_t)addr & MMU_PAGEOFFSET4M) {
5940 hashno = TTE512K;
5941 continue;
5943 if (mmu_page_sizes == max_mmu_page_sizes) {
5944 if ((uintptr_t)addr & MMU_PAGEOFFSET32M) {
5945 hashno = TTE4M;
5946 continue;
5948 if ((uintptr_t)addr & MMU_PAGEOFFSET256M) {
5949 hashno = TTE32M;
5950 continue;
5952 hashno = TTE256M;
5953 } else {
5954 hashno = TTE4M;
5958 sfmmu_hblks_list_purge(&list, 0);
5959 if (dmrp != NULL) {
5960 DEMAP_RANGE_FLUSH(dmrp);
5961 cpuset = sfmmup->sfmmu_cpusran;
5962 xt_sync(cpuset);
5964 if (callback && addr_count != 0) {
5965 for (a = 0; a < addr_count; ++a) {
5966 callback->hcb_start_addr = cb_start_addr[a];
5967 callback->hcb_end_addr = cb_end_addr[a];
5968 callback->hcb_function(callback);
5973 * Check TSB and TLB page sizes if the process isn't exiting.
5975 if (!sfmmup->sfmmu_free)
5976 sfmmu_check_page_sizes(sfmmup, 0);
5980 * Unload all the mappings in the range [addr..addr+len). addr and len must
5981 * be MMU_PAGESIZE aligned.
5983 void
5984 hat_unload(struct hat *sfmmup, caddr_t addr, size_t len, uint_t flags)
5986 if (sfmmup->sfmmu_xhat_provider) {
5987 XHAT_UNLOAD(sfmmup, addr, len, flags);
5988 return;
5990 hat_unload_callback(sfmmup, addr, len, flags, NULL);
5995 * Find the largest mapping size for this page.
5998 fnd_mapping_sz(page_t *pp)
6000 int sz;
6001 int p_index;
6003 p_index = PP_MAPINDEX(pp);
6005 sz = 0;
6006 p_index >>= 1; /* don't care about 8K bit */
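	/*
	 * Each remaining bit in p_index marks a larger page size this
	 * page is mapped with; the position of the highest set bit
	 * gives the largest mapping size.
	 */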
6007 for (; p_index; p_index >>= 1) {
6008 sz++;
6011 return (sz);
6015 * This function unloads a range of addresses for an hmeblk.
6016 * It returns the next address to be unloaded.
6017 * It should be called with the hash lock held.
6019 static caddr_t
6020 sfmmu_hblk_unload(struct hat *sfmmup, struct hme_blk *hmeblkp, caddr_t addr,
6021 caddr_t endaddr, demap_range_t *dmrp, uint_t flags)
6023 tte_t tte, ttemod;
6024 struct sf_hment *sfhmep;
6025 int ttesz;
6026 long ttecnt;
6027 page_t *pp;
6028 kmutex_t *pml;
6029 int ret;
6030 int use_demap_range;
6032 ASSERT(in_hblk_range(hmeblkp, addr));
6033 ASSERT(!hmeblkp->hblk_shw_bit);
6034 ASSERT(sfmmup != NULL || hmeblkp->hblk_shared);
6035 ASSERT(sfmmup == NULL || !hmeblkp->hblk_shared);
6036 ASSERT(dmrp == NULL || !hmeblkp->hblk_shared);
6038 #ifdef DEBUG
6039 if (get_hblk_ttesz(hmeblkp) != TTE8K &&
6040 (endaddr < get_hblk_endaddr(hmeblkp))) {
6041 panic("sfmmu_hblk_unload: partial unload of large page");
6043 #endif /* DEBUG */
6045 endaddr = MIN(endaddr, get_hblk_endaddr(hmeblkp));
6046 ttesz = get_hblk_ttesz(hmeblkp);
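	/*
	 * We can only batch demaps through dmrp when this hmeblk's page
	 * size matches the demap range's page size; otherwise flush what
	 * has already been accumulated and demap page by page below.
	 */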
6048 use_demap_range = ((dmrp == NULL) ||
6049 (TTEBYTES(ttesz) == DEMAP_RANGE_PGSZ(dmrp)));
6051 if (use_demap_range) {
6052 DEMAP_RANGE_CONTINUE(dmrp, addr, endaddr);
6053 } else if (dmrp != NULL) {
6054 DEMAP_RANGE_FLUSH(dmrp);
6056 ttecnt = 0;
6057 HBLKTOHME(sfhmep, hmeblkp, addr);
6059 while (addr < endaddr) {
6060 pml = NULL;
6061 sfmmu_copytte(&sfhmep->hme_tte, &tte);
6062 if (TTE_IS_VALID(&tte)) {
6063 pp = sfhmep->hme_page;
6064 if (pp != NULL) {
6065 pml = sfmmu_mlist_enter(pp);
6069 * Verify if hme still points to 'pp' now that
6070 * we have the p_mapping lock.
6072 if (sfhmep->hme_page != pp) {
6073 if (pp != NULL && sfhmep->hme_page != NULL) {
6074 ASSERT(pml != NULL);
6075 sfmmu_mlist_exit(pml);
6076 /* Re-start this iteration. */
6077 continue;
6079 ASSERT((pp != NULL) &&
6080 (sfhmep->hme_page == NULL));
6081 goto tte_unloaded;
6085 * From this point on we hold both the HASH and p_mapping
6086 * locks.
6088 ASSERT(pp == sfhmep->hme_page);
6089 ASSERT(pp == NULL || sfmmu_mlist_held(pp));
6092 * We need to loop on modifying the tte because it is
6093 * possible for pagesync to come along and
6094 * change the software bits beneath us.
6096 * Page_unload can also invalidate the tte after
6097 * we read the tte outside of the p_mapping lock.
6099 again:
6100 ttemod = tte;
6102 TTE_SET_INVALID(&ttemod);
6103 ret = sfmmu_modifytte_try(&tte, &ttemod,
6104 &sfhmep->hme_tte);
6106 if (ret <= 0) {
6107 if (TTE_IS_VALID(&tte)) {
6108 ASSERT(ret < 0);
6109 goto again;
6111 if (pp != NULL) {
6112 panic("sfmmu_hblk_unload: pp = 0x%p "
6113 "tte became invalid under mlist"
6114 " lock = 0x%p", (void *)pp,
6115 (void *)pml);
6117 continue;
6120 if (!(flags & HAT_UNLOAD_NOSYNC)) {
6121 sfmmu_ttesync(sfmmup, addr, &tte, pp);
6125 * Ok- we invalidated the tte. Do the rest of the job.
6127 ttecnt++;
6129 if (flags & HAT_UNLOAD_UNLOCK) {
6130 ASSERT(hmeblkp->hblk_lckcnt > 0);
6131 atomic_dec_32(&hmeblkp->hblk_lckcnt);
6132 HBLK_STACK_TRACE(hmeblkp, HBLK_UNLOCK);
6136 * Normally we would need to flush the page
6137 * from the virtual cache at this point in
6138 * order to prevent a potential cache alias
6139 * inconsistency.
6140 * The particular scenario we need to worry
6141 * about is:
6142 * Given: va1 and va2 are two virtual addresses
6143 * that alias and map the same physical
6144 * address.
6145 * 1. mapping exists from va1 to pa and data
6146 * has been read into the cache.
6147 * 2. unload va1.
6148 * 3. load va2 and modify data using va2.
6149 * 4. unload va2.
6150 * 5. load va1 and reference data. Unless we
6151 * flush the data cache when we unload we will
6152 * get stale data.
6153 * Fortunately, page coloring eliminates the
6154 * above scenario by remembering the color a
6155 * physical page was last or is currently
6156 * mapped to. Now, we delay the flush until
6157 * the loading of translations. Only when the
6158 * new translation is of a different color
6159 * are we forced to flush.
6161 if (use_demap_range) {
6163 * Mark this page as needing a demap.
6165 DEMAP_RANGE_MARKPG(dmrp, addr);
6166 } else {
6167 ASSERT(sfmmup != NULL);
6168 ASSERT(!hmeblkp->hblk_shared);
6169 sfmmu_tlb_demap(addr, sfmmup, hmeblkp,
6170 sfmmup->sfmmu_free, 0);
6173 if (pp) {
6175 * Remove the hment from the mapping list
6177 ASSERT(hmeblkp->hblk_hmecnt > 0);
6180 * Again, we cannot
6181 * ASSERT(hmeblkp->hblk_hmecnt <= NHMENTS);
6183 HME_SUB(sfhmep, pp);
6184 membar_stst();
6185 atomic_dec_16(&hmeblkp->hblk_hmecnt);
6188 ASSERT(hmeblkp->hblk_vcnt > 0);
6189 atomic_dec_16(&hmeblkp->hblk_vcnt);
6191 ASSERT(hmeblkp->hblk_hmecnt || hmeblkp->hblk_vcnt ||
6192 !hmeblkp->hblk_lckcnt);
6194 #ifdef VAC
6195 if (pp && (pp->p_nrm & (P_KPMC | P_KPMS | P_TNC))) {
6196 if (PP_ISTNC(pp)) {
6198 * If the page was temporarily
6199 * uncached, try to recache
6200 * it. Note that HME_SUB() was
6201 * called above so p_index and
6202 * mlist have been updated.
6204 conv_tnc(pp, ttesz);
6205 } else if (pp->p_mapping == NULL) {
6206 ASSERT(kpm_enable);
6208 * Page is marked to be in VAC conflict
6209 * to an existing kpm mapping and/or is
6210 * kpm mapped using only the regular
6211 * pagesize.
6213 sfmmu_kpm_hme_unload(pp);
6216 #endif /* VAC */
6217 } else if ((pp = sfhmep->hme_page) != NULL) {
6219 * TTE is invalid but the hme
6220 * still exists. let pageunload
6221 * complete its job.
6223 ASSERT(pml == NULL);
6224 pml = sfmmu_mlist_enter(pp);
6225 if (sfhmep->hme_page != NULL) {
6226 sfmmu_mlist_exit(pml);
6227 continue;
6229 ASSERT(sfhmep->hme_page == NULL);
6230 } else if (hmeblkp->hblk_hmecnt != 0) {
6232 * pageunload may not have finished decrementing
6233 * hblk_vcnt and hblk_hmecnt. Find page_t if any and
6234 * wait for pageunload to finish. Rely on pageunload
6235 * to decrement hblk_hmecnt after hblk_vcnt.
6237 pfn_t pfn = TTE_TO_TTEPFN(&tte);
6238 ASSERT(pml == NULL);
6239 if (pf_is_memory(pfn)) {
6240 pp = page_numtopp_nolock(pfn);
6241 if (pp != NULL) {
6242 pml = sfmmu_mlist_enter(pp);
6243 sfmmu_mlist_exit(pml);
6244 pml = NULL;
6249 tte_unloaded:
6251 * At this point, the tte we are looking at
6252 * should be unloaded, and the hme has been unlinked
6253 * from the page too. This is important because
6254 * pageunload does ttesync() and then HME_SUB.
6255 * We need to make sure HME_SUB has completed
6256 * so we know ttesync() has completed. Otherwise,
6257 * at exit time, after return from the hat layer, VM will
6258 * release the as structure which hat_setstat() (called
6259 * by ttesync()) needs.
6261 #ifdef DEBUG
6263 tte_t dtte;
6265 ASSERT(sfhmep->hme_page == NULL);
6267 sfmmu_copytte(&sfhmep->hme_tte, &dtte);
6268 ASSERT(!TTE_IS_VALID(&dtte));
6270 #endif
6272 if (pml) {
6273 sfmmu_mlist_exit(pml);
6276 addr += TTEBYTES(ttesz);
6277 sfhmep++;
6278 DEMAP_RANGE_NEXTPG(dmrp);
6281 * For shared hmeblks this routine is only called when the region is
6282 * freed and no longer referenced, so there is no need to decrement
6283 * ttecnt in the region structure here.
6285 if (ttecnt > 0 && sfmmup != NULL) {
6286 atomic_add_long(&sfmmup->sfmmu_ttecnt[ttesz], -ttecnt);
6288 return (addr);
6292 * Invalidate a virtual address range for the local CPU.
6293 * For best performance ensure that the va range is completely
6294 * mapped, otherwise the entire TLB will be flushed.
6296 void
6297 hat_flush_range(struct hat *sfmmup, caddr_t va, size_t size)
6299 ssize_t sz;
6300 caddr_t endva = va + size;
6302 while (va < endva) {
6303 sz = hat_getpagesize(sfmmup, va);
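		/*
		 * No valid mapping here means we can't tell how far to
		 * advance, so flush the entire TLB and stop.
		 */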
6304 if (sz < 0) {
6305 vtag_flushall();
6306 break;
6308 vtag_flushpage(va, (uint64_t)sfmmup);
6309 va += sz;
6314 * Synchronize all the mappings in the range [addr..addr+len).
6315 * Can be called with clearflag having two states:
6316 * HAT_SYNC_DONTZERO means just return the rm stats
6317 * HAT_SYNC_ZERORM means zero rm bits in the tte and return the stats
6319 void
6320 hat_sync(struct hat *sfmmup, caddr_t addr, size_t len, uint_t clearflag)
6322 struct hmehash_bucket *hmebp;
6323 hmeblk_tag hblktag;
6324 int hmeshift, hashno = 1;
6325 struct hme_blk *hmeblkp, *list = NULL;
6326 caddr_t endaddr;
6327 cpuset_t cpuset;
6329 ASSERT(sfmmup->sfmmu_xhat_provider == NULL);
6330 ASSERT((sfmmup == ksfmmup) || AS_LOCK_HELD(sfmmup->sfmmu_as));
6331 ASSERT((len & MMU_PAGEOFFSET) == 0);
6332 ASSERT((clearflag == HAT_SYNC_DONTZERO) ||
6333 (clearflag == HAT_SYNC_ZERORM));
6335 CPUSET_ZERO(cpuset);
6337 endaddr = addr + len;
6338 hblktag.htag_id = sfmmup;
6339 hblktag.htag_rid = SFMMU_INVALID_SHMERID;
6342 * Spitfire supports 4 page sizes.
6343 * Most pages are expected to be of the smallest page
6344 * size (8K) and these will not need to be rehashed. 64K
6345 * pages also don't need to be rehashed because an hmeblk
6346 * spans 64K of address space. 512K pages might need 1 rehash
6347 * and 4M pages 2 rehashes.
6349 while (addr < endaddr) {
6350 hmeshift = HME_HASH_SHIFT(hashno);
6351 hblktag.htag_bspage = HME_HASH_BSPAGE(addr, hmeshift);
6352 hblktag.htag_rehash = hashno;
6353 hmebp = HME_HASH_FUNCTION(sfmmup, addr, hmeshift);
6355 SFMMU_HASH_LOCK(hmebp);
6357 HME_HASH_SEARCH(hmebp, hblktag, hmeblkp, &list);
6358 if (hmeblkp != NULL) {
6359 ASSERT(!hmeblkp->hblk_shared);
6361 * We've encountered a shadow hmeblk so skip the range
6362 * of the next smaller mapping size.
6364 if (hmeblkp->hblk_shw_bit) {
6365 ASSERT(sfmmup != ksfmmup);
6366 ASSERT(hashno > 1);
6367 addr = (caddr_t)P2END((uintptr_t)addr,
6368 TTEBYTES(hashno - 1));
6369 } else {
6370 addr = sfmmu_hblk_sync(sfmmup, hmeblkp,
6371 addr, endaddr, clearflag);
6373 SFMMU_HASH_UNLOCK(hmebp);
6374 hashno = 1;
6375 continue;
6377 SFMMU_HASH_UNLOCK(hmebp);
6379 if (!HME_REHASH(sfmmup) || (hashno >= mmu_hashcnt)) {
6381 * We have traversed the whole list and rehashed
6382 * if necessary without finding the address to sync.
6383 * This is ok, so we increment the address by the
6384 * smallest hmeblk range for kernel mappings, and by the
6385 * largest hmeblk range (to account for shadow hmeblks)
6386 * for user mappings, and continue.
6388 if (sfmmup == ksfmmup)
6389 addr = (caddr_t)P2END((uintptr_t)addr,
6390 TTEBYTES(1));
6391 else
6392 addr = (caddr_t)P2END((uintptr_t)addr,
6393 TTEBYTES(hashno));
6394 hashno = 1;
6395 } else {
6396 hashno++;
6399 sfmmu_hblks_list_purge(&list, 0);
6400 cpuset = sfmmup->sfmmu_cpusran;
6401 xt_sync(cpuset);
6404 static caddr_t
6405 sfmmu_hblk_sync(struct hat *sfmmup, struct hme_blk *hmeblkp, caddr_t addr,
6406 caddr_t endaddr, int clearflag)
6408 tte_t tte, ttemod;
6409 struct sf_hment *sfhmep;
6410 int ttesz;
6411 struct page *pp;
6412 kmutex_t *pml;
6413 int ret;
6415 ASSERT(hmeblkp->hblk_shw_bit == 0);
6416 ASSERT(!hmeblkp->hblk_shared);
6418 endaddr = MIN(endaddr, get_hblk_endaddr(hmeblkp));
6420 ttesz = get_hblk_ttesz(hmeblkp);
6421 HBLKTOHME(sfhmep, hmeblkp, addr);
6423 while (addr < endaddr) {
6424 sfmmu_copytte(&sfhmep->hme_tte, &tte);
6425 if (TTE_IS_VALID(&tte)) {
6426 pml = NULL;
6427 pp = sfhmep->hme_page;
6428 if (pp) {
6429 pml = sfmmu_mlist_enter(pp);
6431 if (pp != sfhmep->hme_page) {
6433 * tte must have been unloaded
6434 * underneath us. Recheck.
6436 ASSERT(pml);
6437 sfmmu_mlist_exit(pml);
6438 continue;
6441 ASSERT(pp == NULL || sfmmu_mlist_held(pp));
6443 if (clearflag == HAT_SYNC_ZERORM) {
6444 ttemod = tte;
6445 TTE_CLR_RM(&ttemod);
6446 ret = sfmmu_modifytte_try(&tte, &ttemod,
6447 &sfhmep->hme_tte);
6448 if (ret < 0) {
6449 if (pml) {
6450 sfmmu_mlist_exit(pml);
6452 continue;
6455 if (ret > 0) {
6456 sfmmu_tlb_demap(addr, sfmmup,
6457 hmeblkp, 0, 0);
6460 sfmmu_ttesync(sfmmup, addr, &tte, pp);
6461 if (pml) {
6462 sfmmu_mlist_exit(pml);
6465 addr += TTEBYTES(ttesz);
6466 sfhmep++;
6468 return (addr);
6472 * This function will sync a tte to the page struct and it will
6473 * update the hat stats. Currently it allows us to pass a NULL pp
6474 * and we will simply update the stats. We may want to change this
6475 * so we only keep stats for pages backed by pp's.
6477 static void
6478 sfmmu_ttesync(struct hat *sfmmup, caddr_t addr, tte_t *ttep, page_t *pp)
6480 uint_t rm = 0;
6481 int sz;
6482 pgcnt_t npgs;
6484 ASSERT(TTE_IS_VALID(ttep));
6486 if (TTE_IS_NOSYNC(ttep)) {
6487 return;
6490 if (TTE_IS_REF(ttep)) {
6491 rm = P_REF;
6493 if (TTE_IS_MOD(ttep)) {
6494 rm |= P_MOD;
6497 if (rm == 0) {
6498 return;
6501 sz = TTE_CSZ(ttep);
6502 if (sfmmup != NULL && sfmmup->sfmmu_rmstat) {
6503 int i;
6504 caddr_t vaddr = addr;
6506 for (i = 0; i < TTEPAGES(sz); i++, vaddr += MMU_PAGESIZE) {
6507 hat_setstat(sfmmup->sfmmu_as, vaddr, MMU_PAGESIZE, rm);
6513 * XXX I want to use cas to update nrm bits but they
6514 * currently belong in common/vm and not in hat where
6515 * they should be.
6516 * The nrm bits are protected by the same mutex as
6517 * the one that protects the page's mapping list.
6519 if (!pp)
6520 return;
6521 ASSERT(sfmmu_mlist_held(pp));
6523 * If the tte is for a large page, we need to sync all the
6524 * pages covered by the tte.
6526 if (sz != TTE8K) {
6527 ASSERT(pp->p_szc != 0);
6528 pp = PP_GROUPLEADER(pp, sz);
6529 ASSERT(sfmmu_mlist_held(pp));
6532 /* Get number of pages from tte size. */
6533 npgs = TTEPAGES(sz);
6535 do {
6536 ASSERT(pp);
6537 ASSERT(sfmmu_mlist_held(pp));
6538 if (((rm & P_REF) != 0 && !PP_ISREF(pp)) ||
6539 ((rm & P_MOD) != 0 && !PP_ISMOD(pp)))
6540 hat_page_setattr(pp, rm);
6543 * Are we done? If not, we must have a large mapping.
6544 * For large mappings we need to sync the rest of the pages
6545 * covered by this tte; goto the next page.
6547 } while (--npgs > 0 && (pp = PP_PAGENEXT(pp)));
6551 * Execute pre-callback handler of each pa_hment linked to pp
6553 * Inputs:
6554 * flag: either HAT_PRESUSPEND or HAT_SUSPEND.
6555 * capture_cpus: pointer to return value (below)
6557 * Returns:
6558 * Propagates the subsystem callback return values back to the caller;
6559 * returns 0 on success. If capture_cpus is non-NULL, the value returned
6560 * is zero if all of the pa_hments are of a type that do not require
6561 * capturing CPUs prior to suspending the mapping, else it is 1.
6563 static int
6564 hat_pageprocess_precallbacks(struct page *pp, uint_t flag, int *capture_cpus)
6566 struct sf_hment *sfhmep;
6567 struct pa_hment *pahmep;
6568 int (*f)(caddr_t, uint_t, uint_t, void *);
6569 int ret;
6570 id_t id;
6571 int locked = 0;
6572 kmutex_t *pml;
6574 ASSERT(PAGE_EXCL(pp));
6575 if (!sfmmu_mlist_held(pp)) {
6576 pml = sfmmu_mlist_enter(pp);
6577 locked = 1;
6580 if (capture_cpus)
6581 *capture_cpus = 0;
6583 top:
6584 for (sfhmep = pp->p_mapping; sfhmep; sfhmep = sfhmep->hme_next) {
6586 * skip sf_hments corresponding to VA<->PA mappings;
6587 * for pa_hment's, hme_tte.ll is zero
6589 if (!IS_PAHME(sfhmep))
6590 continue;
6592 pahmep = sfhmep->hme_data;
6593 ASSERT(pahmep != NULL);
6596 * skip if pre-handler has been called earlier in this loop
6598 if (pahmep->flags & flag)
6599 continue;
6601 id = pahmep->cb_id;
6602 ASSERT(id >= (id_t)0 && id < sfmmu_cb_nextid);
6603 if (capture_cpus && sfmmu_cb_table[id].capture_cpus != 0)
6604 *capture_cpus = 1;
6605 if ((f = sfmmu_cb_table[id].prehandler) == NULL) {
6606 pahmep->flags |= flag;
6607 continue;
6611 * Drop the mapping list lock to avoid locking order issues.
6613 if (locked)
6614 sfmmu_mlist_exit(pml);
6616 ret = f(pahmep->addr, pahmep->len, flag, pahmep->pvt);
6617 if (ret != 0)
6618 return (ret); /* caller must do the cleanup */
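		/*
		 * The mapping list lock was dropped around the handler
		 * call, so the list may have changed; reacquire it and
		 * restart the scan from the top.
		 */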
6620 if (locked) {
6621 pml = sfmmu_mlist_enter(pp);
6622 pahmep->flags |= flag;
6623 goto top;
6626 pahmep->flags |= flag;
6629 if (locked)
6630 sfmmu_mlist_exit(pml);
6632 return (0);
6636 * Execute post-callback handler of each pa_hment linked to pp
6638 * Same overall assumptions and restrictions apply as for
6639 * hat_pageprocess_precallbacks().
6641 static void
6642 hat_pageprocess_postcallbacks(struct page *pp, uint_t flag)
6644 pfn_t pgpfn = pp->p_pagenum;
6645 pfn_t pgmask = btop(page_get_pagesize(pp->p_szc)) - 1;
6646 pfn_t newpfn;
6647 struct sf_hment *sfhmep;
6648 struct pa_hment *pahmep;
6649 int (*f)(caddr_t, uint_t, uint_t, void *, pfn_t);
6650 id_t id;
6651 int locked = 0;
6652 kmutex_t *pml;
6654 ASSERT(PAGE_EXCL(pp));
6655 if (!sfmmu_mlist_held(pp)) {
6656 pml = sfmmu_mlist_enter(pp);
6657 locked = 1;
6660 top:
6661 for (sfhmep = pp->p_mapping; sfhmep; sfhmep = sfhmep->hme_next) {
6663 * skip sf_hments corresponding to VA<->PA mappings;
6664 * for pa_hment's, hme_tte.ll is zero
6666 if (!IS_PAHME(sfhmep))
6667 continue;
6669 pahmep = sfhmep->hme_data;
6670 ASSERT(pahmep != NULL);
6672 if ((pahmep->flags & flag) == 0)
6673 continue;
6675 pahmep->flags &= ~flag;
6677 id = pahmep->cb_id;
6678 ASSERT(id >= (id_t)0 && id < sfmmu_cb_nextid);
6679 if ((f = sfmmu_cb_table[id].posthandler) == NULL)
6680 continue;
6683 * Convert the base page PFN into the constituent PFN
6684 * which is needed by the callback handler.
6686 newpfn = pgpfn | (btop((uintptr_t)pahmep->addr) & pgmask);
6689 * Drop the mapping list lock to avoid locking order issues.
6691 if (locked)
6692 sfmmu_mlist_exit(pml);
6694 if (f(pahmep->addr, pahmep->len, flag, pahmep->pvt, newpfn)
6695 != 0)
6696 panic("sfmmu: posthandler failed");
6698 if (locked) {
6699 pml = sfmmu_mlist_enter(pp);
6700 goto top;
6704 if (locked)
6705 sfmmu_mlist_exit(pml);
6709 * Suspend locked kernel mapping
6711 void
6712 hat_pagesuspend(struct page *pp)
6714 struct sf_hment *sfhmep;
6715 sfmmu_t *sfmmup;
6716 tte_t tte, ttemod;
6717 struct hme_blk *hmeblkp;
6718 caddr_t addr;
6719 int index, cons;
6720 cpuset_t cpuset;
6722 ASSERT(PAGE_EXCL(pp));
6723 ASSERT(sfmmu_mlist_held(pp));
6725 mutex_enter(&kpr_suspendlock);
6728 * We're about to suspend a kernel mapping so mark this thread as
6729 * non-traceable by DTrace. This prevents us from running into issues
6730 * with probe context trying to touch a suspended page
6731 * in the relocation codepath itself.
6733 curthread->t_flag |= T_DONTDTRACE;
6735 index = PP_MAPINDEX(pp);
6736 cons = TTE8K;
6738 retry:
6739 for (sfhmep = pp->p_mapping; sfhmep; sfhmep = sfhmep->hme_next) {
6741 if (IS_PAHME(sfhmep))
6742 continue;
6744 if (get_hblk_ttesz(sfmmu_hmetohblk(sfhmep)) != cons)
6745 continue;
6748 * Loop until we successfully set the suspend bit in
6749 * the TTE.
6751 again:
6752 sfmmu_copytte(&sfhmep->hme_tte, &tte);
6753 ASSERT(TTE_IS_VALID(&tte));
6755 ttemod = tte;
6756 TTE_SET_SUSPEND(&ttemod);
6757 if (sfmmu_modifytte_try(&tte, &ttemod,
6758 &sfhmep->hme_tte) < 0)
6759 goto again;
6762 * Invalidate TSB entry
6764 hmeblkp = sfmmu_hmetohblk(sfhmep);
6766 sfmmup = hblktosfmmu(hmeblkp);
6767 ASSERT(sfmmup == ksfmmup);
6768 ASSERT(!hmeblkp->hblk_shared);
6770 addr = tte_to_vaddr(hmeblkp, tte);
6773 * No need to make sure that the TSB for this sfmmu is
6774 * not being relocated since it is ksfmmup and thus it
6775 * will never be relocated.
6777 SFMMU_UNLOAD_TSB(addr, sfmmup, hmeblkp, 0);
6780 * Update xcall stats
6782 cpuset = cpu_ready_set;
6783 CPUSET_DEL(cpuset, CPU->cpu_id);
6785 /* LINTED: constant in conditional context */
6786 SFMMU_XCALL_STATS(ksfmmup);
6789 * Flush TLB entry on remote CPUs
6791 xt_some(cpuset, vtag_flushpage_tl1, (uint64_t)addr,
6792 (uint64_t)ksfmmup);
6793 xt_sync(cpuset);
6796 * Flush TLB entry on local CPU
6798 vtag_flushpage(addr, (uint64_t)ksfmmup);
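	/*
	 * p_index has a bit set for each additional page size this page
	 * is mapped with; walk those bits, moving to the group leader
	 * page for each size and suspending its mappings as well.
	 */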
6801 while (index != 0) {
6802 index = index >> 1;
6803 if (index != 0)
6804 cons++;
6805 if (index & 0x1) {
6806 pp = PP_GROUPLEADER(pp, cons);
6807 goto retry;
6812 #ifdef DEBUG
6814 #define N_PRLE 1024
6815 struct prle {
6816 page_t *targ;
6817 page_t *repl;
6818 int status;
6819 int pausecpus;
6820 hrtime_t whence;
6823 static struct prle page_relocate_log[N_PRLE];
6824 static int prl_entry;
6825 static kmutex_t prl_mutex;
6827 #define PAGE_RELOCATE_LOG(t, r, s, p) \
6828 mutex_enter(&prl_mutex); \
6829 page_relocate_log[prl_entry].targ = *(t); \
6830 page_relocate_log[prl_entry].repl = *(r); \
6831 page_relocate_log[prl_entry].status = (s); \
6832 page_relocate_log[prl_entry].pausecpus = (p); \
6833 page_relocate_log[prl_entry].whence = gethrtime(); \
6834 prl_entry = (prl_entry == (N_PRLE - 1))? 0 : prl_entry + 1; \
6835 mutex_exit(&prl_mutex);
6837 #else /* !DEBUG */
6838 #define PAGE_RELOCATE_LOG(t, r, s, p)
6839 #endif
6842 * Core Kernel Page Relocation Algorithm
6844 * Input:
6846 * target : constituent pages are SE_EXCL locked.
6847 * replacement: constituent pages are SE_EXCL locked.
6849 * Output:
6851 * nrelocp: number of pages relocated
6854 hat_page_relocate(page_t **target, page_t **replacement, spgcnt_t *nrelocp)
6856 page_t *targ, *repl;
6857 page_t *tpp, *rpp;
6858 kmutex_t *low, *high;
6859 spgcnt_t npages, i;
6860 page_t *pl = NULL;
6861 int old_pil;
6862 cpuset_t cpuset;
6863 int cap_cpus;
6864 int ret;
6865 #ifdef VAC
6866 int cflags = 0;
6867 #endif
6869 if (!kcage_on || PP_ISNORELOC(*target)) {
6870 PAGE_RELOCATE_LOG(target, replacement, EAGAIN, -1);
6871 return (EAGAIN);
6874 mutex_enter(&kpr_mutex);
6875 kreloc_thread = curthread;
6877 targ = *target;
6878 repl = *replacement;
6879 ASSERT(repl != NULL);
6880 ASSERT(targ->p_szc == repl->p_szc);
6882 npages = page_get_pagecnt(targ->p_szc);
6885 * unload VA<->PA mappings that are not locked
6887 tpp = targ;
6888 for (i = 0; i < npages; i++) {
6889 (void) hat_pageunload(tpp, SFMMU_KERNEL_RELOC);
6890 tpp++;
6894 * Do "presuspend" callbacks, in a context from which we can still
6895 * block as needed. Note that we don't hold the mapping list lock
6896 * of "targ" at this point due to potential locking order issues;
6897 * we assume that between the hat_pageunload() above and holding
6898 * the SE_EXCL lock that the mapping list *cannot* change at this
6899 * point.
6901 ret = hat_pageprocess_precallbacks(targ, HAT_PRESUSPEND, &cap_cpus);
6902 if (ret != 0) {
6904 * EIO translates to a fatal error; for all others, clean up
6905 * and return EAGAIN.
6907 ASSERT(ret != EIO);
6908 hat_pageprocess_postcallbacks(targ, HAT_POSTUNSUSPEND);
6909 PAGE_RELOCATE_LOG(target, replacement, ret, -1);
6910 kreloc_thread = NULL;
6911 mutex_exit(&kpr_mutex);
6912 return (EAGAIN);
6916 * acquire p_mapping list lock for both the target and replacement
6917 * root pages.
6919 * low and high refer to the need to grab the mlist locks in a
6920 * specific order in order to prevent race conditions. Thus the
6921 * lower lock must be grabbed before the higher lock.
6923 * This will block hat_unload from accessing the p_mapping list. Since
6924 * we have SE_EXCL lock, hat_memload and hat_pageunload will be
6925 * blocked. Thus, no one else will be accessing the p_mapping list
6926 * while we suspend and reload the locked mapping below.
6928 tpp = targ;
6929 rpp = repl;
6930 sfmmu_mlist_reloc_enter(tpp, rpp, &low, &high);
6932 kpreempt_disable();
6935 * We raise our PIL to 13 so that we don't get captured by
6936 * another CPU or pinned by an interrupt thread. We can't go to
6937 * PIL 14 since the nexus driver(s) may need to interrupt at
6938 * that level in the case of IOMMU pseudo mappings.
6940 cpuset = cpu_ready_set;
6941 CPUSET_DEL(cpuset, CPU->cpu_id);
6942 if (!cap_cpus || CPUSET_ISNULL(cpuset)) {
6943 old_pil = splr(XCALL_PIL);
6944 } else {
6945 old_pil = -1;
6946 xc_attention(cpuset);
6948 ASSERT(getpil() == XCALL_PIL);
6951 * Now do suspend callbacks. In the case of an IOMMU mapping
6952 * this will suspend all DMA activity to the page while it is
6953 * being relocated. Since we are well above LOCK_LEVEL and CPUs
6954 * may be captured at this point we should have acquired any needed
6955 * locks in the presuspend callback.
6957 ret = hat_pageprocess_precallbacks(targ, HAT_SUSPEND, NULL);
6958 if (ret != 0) {
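		/*
		 * Point repl at targ so the unwind path below runs the
		 * post-callbacks against the original page.
		 */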
6959 repl = targ;
6960 goto suspend_fail;
6964 * Raise the PIL yet again, this time to block all high-level
6965 * interrupts on this CPU. This is necessary to prevent an
6966 * interrupt routine from pinning the thread which holds the
6967 * mapping suspended and then touching the suspended page.
6969 * Once the page is suspended we also need to be careful to
6970 * avoid calling any functions which touch any seg_kmem memory
6971 * since that memory may be backed by the very page we are
6972 * relocating in here!
6974 hat_pagesuspend(targ);
6977 * Now that we are confident everybody has stopped using this page,
6978 * copy the page contents. Note we use a physical copy to prevent
6979 * locking issues and to avoid fpRAS because we can't handle it in
6980 * this context.
6982 for (i = 0; i < npages; i++, tpp++, rpp++) {
6983 #ifdef VAC
6985 * If the replacement has a different vcolor than
6986 * the one being replaced, we need to handle VAC
6987 * consistency for it just as if we were setting up
6988 * a new mapping to it.
6990 if ((PP_GET_VCOLOR(rpp) != NO_VCOLOR) &&
6991 (tpp->p_vcolor != rpp->p_vcolor) &&
6992 !CacheColor_IsFlushed(cflags, PP_GET_VCOLOR(rpp))) {
6993 CacheColor_SetFlushed(cflags, PP_GET_VCOLOR(rpp));
6994 sfmmu_cache_flushcolor(PP_GET_VCOLOR(rpp),
6995 rpp->p_pagenum);
6997 #endif
6999 * Copy the contents of the page.
7001 ppcopy_kernel(tpp, rpp);
7004 tpp = targ;
7005 rpp = repl;
7006 for (i = 0; i < npages; i++, tpp++, rpp++) {
7008 * Copy attributes. VAC consistency was handled above,
7009 * if required.
7011 rpp->p_nrm = tpp->p_nrm;
7012 tpp->p_nrm = 0;
7013 rpp->p_index = tpp->p_index;
7014 tpp->p_index = 0;
7015 #ifdef VAC
7016 rpp->p_vcolor = tpp->p_vcolor;
7017 #endif
7021 * First, unsuspend the page, if we set the suspend bit, and transfer
7022 * the mapping list from the target page to the replacement page.
7023 * Next process postcallbacks; since pa_hment's are linked only to the
7024 * p_mapping list of root page, we don't iterate over the constituent
7025 * pages.
7027 hat_pagereload(targ, repl);
7029 suspend_fail:
7030 hat_pageprocess_postcallbacks(repl, HAT_UNSUSPEND);
7033 * Now lower our PIL and release any captured CPUs since we
7034 * are out of the "danger zone". After this it will again be
7035 * safe to acquire adaptive mutex locks, or to drop them...
7037 if (old_pil != -1) {
7038 splx(old_pil);
7039 } else {
7040 xc_dismissed(cpuset);
7043 kpreempt_enable();
7045 sfmmu_mlist_reloc_exit(low, high);
7048 * Postsuspend callbacks should drop any locks held across
7049 * the suspend callbacks. As before, we don't hold the mapping
7050 * list lock at this point; our assumption is that the mapping
7051 * list still can't change due to our holding the SE_EXCL lock and
7052 * there being no unlocked mappings left. Hence the restriction
7053 * on the calling context of hat_delete_callback().
7055 hat_pageprocess_postcallbacks(repl, HAT_POSTUNSUSPEND);
7056 if (ret != 0) {
7058 * The second presuspend call failed: we got here through
7059 * the suspend_fail label above.
7061 ASSERT(ret != EIO);
7062 PAGE_RELOCATE_LOG(target, replacement, ret, cap_cpus);
7063 kreloc_thread = NULL;
7064 mutex_exit(&kpr_mutex);
7065 return (EAGAIN);
7069 * Now that we're out of the performance critical section we can
7070 * take care of updating the hash table. Since we still
7071 * hold all the pages locked SE_EXCL at this point, we
7072 * needn't worry about things changing out from under us.
7074 tpp = targ;
7075 rpp = repl;
7076 for (i = 0; i < npages; i++, tpp++, rpp++) {
7079 * replace targ with replacement in page_hash table
7081 targ = tpp;
7082 page_relocate_hash(rpp, targ);
7085 * concatenate target; caller of platform_page_relocate()
7086 * expects target to be concatenated after returning.
7088 ASSERT(targ->p_next == targ);
7089 ASSERT(targ->p_prev == targ);
7090 page_list_concat(&pl, &targ);
7093 ASSERT(*target == pl);
7094 *nrelocp = npages;
7095 PAGE_RELOCATE_LOG(target, replacement, 0, cap_cpus);
7096 kreloc_thread = NULL;
7097 mutex_exit(&kpr_mutex);
7098 return (0);
7102 * Called when stray pa_hments are found attached to a page which is
7103 * being freed. Notify the subsystem which attached the pa_hment of
7104 * the error if it registered a suitable handler, else panic.
7106 static void
7107 sfmmu_pahment_leaked(struct pa_hment *pahmep)
7109 id_t cb_id = pahmep->cb_id;
7111 ASSERT(cb_id >= (id_t)0 && cb_id < sfmmu_cb_nextid);
7112 if (sfmmu_cb_table[cb_id].errhandler != NULL) {
7113 if (sfmmu_cb_table[cb_id].errhandler(pahmep->addr, pahmep->len,
7114 HAT_CB_ERR_LEAKED, pahmep->pvt) == 0)
7115 return; /* non-fatal */
7117 panic("pa_hment leaked: 0x%p", (void *)pahmep);
7121 * Remove all mappings to page 'pp'.
7124 hat_pageunload(struct page *pp, uint_t forceflag)
7126 struct page *origpp = pp;
7127 struct sf_hment *sfhme, *tmphme;
7128 struct hme_blk *hmeblkp;
7129 kmutex_t *pml;
7130 #ifdef VAC
7131 kmutex_t *pmtx;
7132 #endif
7133 cpuset_t cpuset, tset;
7134 int index, cons;
7135 int xhme_blks;
7136 int pa_hments;
7138 ASSERT(PAGE_EXCL(pp));
7140 retry_xhat:
7141 tmphme = NULL;
7142 xhme_blks = 0;
7143 pa_hments = 0;
7144 CPUSET_ZERO(cpuset);
7146 pml = sfmmu_mlist_enter(pp);
7148 #ifdef VAC
7149 if (pp->p_kpmref)
7150 sfmmu_kpm_pageunload(pp);
7151 ASSERT(!PP_ISMAPPED_KPM(pp));
7152 #endif
7154 * Clear vpm reference. Since the page is exclusively locked
7155 * vpm cannot be referencing it.
7157 if (vpm_enable) {
7158 pp->p_vpmref = 0;
7161 index = PP_MAPINDEX(pp);
7162 cons = TTE8K;
7163 retry:
7164 for (sfhme = pp->p_mapping; sfhme; sfhme = tmphme) {
7165 tmphme = sfhme->hme_next;
7167 if (IS_PAHME(sfhme)) {
7168 ASSERT(sfhme->hme_data != NULL);
7169 pa_hments++;
7170 continue;
7173 hmeblkp = sfmmu_hmetohblk(sfhme);
7174 if (hmeblkp->hblk_xhat_bit) {
7175 struct xhat_hme_blk *xblk =
7176 (struct xhat_hme_blk *)hmeblkp;
7178 (void) XHAT_PAGEUNLOAD(xblk->xhat_hme_blk_hat,
7179 pp, forceflag, XBLK2PROVBLK(xblk));
7181 xhme_blks = 1;
7182 continue;
7186 * If there are kernel mappings, don't unload them; they will
7187 * be suspended.
7189 if (forceflag == SFMMU_KERNEL_RELOC && hmeblkp->hblk_lckcnt &&
7190 hmeblkp->hblk_tag.htag_id == ksfmmup)
7191 continue;
7193 tset = sfmmu_pageunload(pp, sfhme, cons);
7194 CPUSET_OR(cpuset, tset);
7197 while (index != 0) {
7198 index = index >> 1;
7199 if (index != 0)
7200 cons++;
7201 if (index & 0x1) {
7202 /* Go to leading page */
7203 pp = PP_GROUPLEADER(pp, cons);
7204 ASSERT(sfmmu_mlist_held(pp));
7205 goto retry;
7210 * cpuset may be empty if the page was only mapped by segkpm,
7211 * in which case we won't actually cross-trap.
7213 xt_sync(cpuset);
7216 * The page should have no mappings at this point, unless
7217 * we were called from hat_page_relocate() in which case we
7218 * leave the locked mappings which will be suspended later.
7220 ASSERT(!PP_ISMAPPED(origpp) || xhme_blks || pa_hments ||
7221 (forceflag == SFMMU_KERNEL_RELOC));
7223 #ifdef VAC
7224 if (PP_ISTNC(pp)) {
7225 if (cons == TTE8K) {
7226 pmtx = sfmmu_page_enter(pp);
7227 PP_CLRTNC(pp);
7228 sfmmu_page_exit(pmtx);
7229 } else {
7230 conv_tnc(pp, cons);
7233 #endif /* VAC */
7235 if (pa_hments && forceflag != SFMMU_KERNEL_RELOC) {
7237 * Unlink any pa_hments and free them, calling back
7238 * the responsible subsystem to notify it of the error.
7239 * This can occur in situations such as drivers leaking
7240 * DMA handles: naughty, but common enough that we'd like
7241 * to keep the system running rather than bringing it
7242 * down with an obscure error like "pa_hment leaked"
7243 * which doesn't aid the user in debugging their driver.
7245 for (sfhme = pp->p_mapping; sfhme; sfhme = tmphme) {
7246 tmphme = sfhme->hme_next;
7247 if (IS_PAHME(sfhme)) {
7248 struct pa_hment *pahmep = sfhme->hme_data;
7249 sfmmu_pahment_leaked(pahmep);
7250 HME_SUB(sfhme, pp);
7251 kmem_cache_free(pa_hment_cache, pahmep);
7255 ASSERT(!PP_ISMAPPED(origpp) || xhme_blks);
7258 sfmmu_mlist_exit(pml);
7261 * XHAT may not have finished unloading pages
7262 * because some other thread was waiting for
7263 * mlist lock and XHAT_PAGEUNLOAD let it do
7264 * the job.
7266 if (xhme_blks) {
7267 pp = origpp;
7268 goto retry_xhat;
7271 return (0);
7274 cpuset_t
7275 sfmmu_pageunload(page_t *pp, struct sf_hment *sfhme, int cons)
7277 struct hme_blk *hmeblkp;
7278 sfmmu_t *sfmmup;
7279 tte_t tte, ttemod;
7280 #ifdef DEBUG
7281 tte_t orig_old;
7282 #endif /* DEBUG */
7283 caddr_t addr;
7284 int ttesz;
7285 int ret;
7286 cpuset_t cpuset;
7288 ASSERT(pp != NULL);
7289 ASSERT(sfmmu_mlist_held(pp));
7290 ASSERT(!PP_ISKAS(pp));
7292 CPUSET_ZERO(cpuset);
7294 hmeblkp = sfmmu_hmetohblk(sfhme);
7296 readtte:
7297 sfmmu_copytte(&sfhme->hme_tte, &tte);
7298 if (TTE_IS_VALID(&tte)) {
7299 sfmmup = hblktosfmmu(hmeblkp);
7300 ttesz = get_hblk_ttesz(hmeblkp);
7302 * Only unload mappings of 'cons' size.
7304 if (ttesz != cons)
7305 return (cpuset);
7308 * Note that we have p_mapping lock, but no hash lock here.
7309 * hblk_unload() has to have both hash lock AND p_mapping
7310 * lock before it tries to modify tte. So, the tte could
7311 * not become invalid in the sfmmu_modifytte_try() below.
7313 ttemod = tte;
7314 #ifdef DEBUG
7315 orig_old = tte;
7316 #endif /* DEBUG */
7318 TTE_SET_INVALID(&ttemod);
7319 ret = sfmmu_modifytte_try(&tte, &ttemod, &sfhme->hme_tte);
7320 if (ret < 0) {
7321 #ifdef DEBUG
7322 /* only R/M bits can change. */
7323 chk_tte(&orig_old, &tte, &ttemod, hmeblkp);
7324 #endif /* DEBUG */
7325 goto readtte;
7328 if (ret == 0) {
7329 panic("pageunload: cas failed?");
7332 addr = tte_to_vaddr(hmeblkp, tte);
7334 if (hmeblkp->hblk_shared) {
7335 sf_srd_t *srdp = (sf_srd_t *)sfmmup;
7336 uint_t rid = hmeblkp->hblk_tag.htag_rid;
7337 sf_region_t *rgnp;
7338 ASSERT(SFMMU_IS_SHMERID_VALID(rid));
7339 ASSERT(rid < SFMMU_MAX_HME_REGIONS);
7340 ASSERT(srdp != NULL);
7341 rgnp = srdp->srd_hmergnp[rid];
7342 SFMMU_VALIDATE_SHAREDHBLK(hmeblkp, srdp, rgnp, rid);
7343 cpuset = sfmmu_rgntlb_demap(addr, rgnp, hmeblkp, 1);
7344 sfmmu_ttesync(NULL, addr, &tte, pp);
7345 ASSERT(rgnp->rgn_ttecnt[ttesz] > 0);
7346 atomic_dec_ulong(&rgnp->rgn_ttecnt[ttesz]);
7347 } else {
7348 sfmmu_ttesync(sfmmup, addr, &tte, pp);
7349 atomic_dec_ulong(&sfmmup->sfmmu_ttecnt[ttesz]);
7352 * We need to flush the page from the virtual cache
7353 * in order to prevent a virtual cache alias
7354 * inconsistency. The particular scenario we need
7355 * to worry about is:
7356 * Given: va1 and va2 are two virtual addresses that
7357 * alias and will map the same physical address.
7358 * 1. mapping exists from va1 to pa and data has
7359 * been read into the cache.
7360 * 2. unload va1.
7361 * 3. load va2 and modify data using va2.
7362 * 4. unload va2.
7363 * 5. load va1 and reference data. Unless we flush
7364 * the data cache when we unload we will get
7365 * stale data.
7366 * This scenario is taken care of by using virtual
7367 * page coloring.
7369 if (sfmmup->sfmmu_ismhat) {
7371 * Flush TSBs, TLBs and caches
7372 * of every process
7373 * sharing this ism segment.
7375 sfmmu_hat_lock_all();
7376 mutex_enter(&ism_mlist_lock);
7377 kpreempt_disable();
7378 sfmmu_ismtlbcache_demap(addr, sfmmup, hmeblkp,
7379 pp->p_pagenum, CACHE_NO_FLUSH);
7380 kpreempt_enable();
7381 mutex_exit(&ism_mlist_lock);
7382 sfmmu_hat_unlock_all();
7383 cpuset = cpu_ready_set;
7384 } else {
7385 sfmmu_tlb_demap(addr, sfmmup, hmeblkp, 0, 0);
7386 cpuset = sfmmup->sfmmu_cpusran;
7391 * Hme_sub has to run after ttesync() and a_rss update.
7392 * See hblk_unload().
7394 HME_SUB(sfhme, pp);
7395 membar_stst();
7398 * We can not make ASSERT(hmeblkp->hblk_hmecnt <= NHMENTS)
7399 * since pteload may have done a HME_ADD() right after
7400 * we did the HME_SUB() above. Hmecnt is now maintained
7401 * by cas only; no lock guarantees its value. The only
7402 * guarantee we have is that the hmecnt should not be less than
7403 * what it should be, so the hblk will not be taken away.
7404 * It's also important that we decrement the hmecnt after
7405 * we are done with hmeblkp so that this hmeblk won't be
7406 * stolen.
7408 ASSERT(hmeblkp->hblk_hmecnt > 0);
7409 ASSERT(hmeblkp->hblk_vcnt > 0);
7410 atomic_dec_16(&hmeblkp->hblk_vcnt);
7411 atomic_dec_16(&hmeblkp->hblk_hmecnt);
7413 * This is bug 4063182.
7414 * XXX: fixme
7415 * ASSERT(hmeblkp->hblk_hmecnt || hmeblkp->hblk_vcnt ||
7416 * !hmeblkp->hblk_lckcnt);
7418 } else {
7419 panic("invalid tte? pp %p &tte %p",
7420 (void *)pp, (void *)&tte);
7423 return (cpuset);
7427 * While relocating a kernel page, this function will move the mappings
7428 * from tpp to dpp and modify any data associated with these mappings.
7429 * It also unsuspends the suspended kernel mapping.
7431 static void
7432 hat_pagereload(struct page *tpp, struct page *dpp)
7434 struct sf_hment *sfhme;
7435 tte_t tte, ttemod;
7436 int index, cons;
7438 ASSERT(getpil() == PIL_MAX);
7439 ASSERT(sfmmu_mlist_held(tpp));
7440 ASSERT(sfmmu_mlist_held(dpp));
7442 index = PP_MAPINDEX(tpp);
7443 cons = TTE8K;
7445 /* Update real mappings to the page */
7446 retry:
7447 for (sfhme = tpp->p_mapping; sfhme != NULL; sfhme = sfhme->hme_next) {
7448 if (IS_PAHME(sfhme))
7449 continue;
7450 sfmmu_copytte(&sfhme->hme_tte, &tte);
7451 ttemod = tte;
7454 * replace old pfn with new pfn in TTE
7456 PFN_TO_TTE(ttemod, dpp->p_pagenum);
7459 * clear suspend bit
7461 ASSERT(TTE_IS_SUSPEND(&ttemod));
7462 TTE_CLR_SUSPEND(&ttemod);
7464 if (sfmmu_modifytte_try(&tte, &ttemod, &sfhme->hme_tte) < 0)
7465 panic("hat_pagereload(): sfmmu_modifytte_try() failed");
7468 * set hme_page to point to the new page
7470 sfhme->hme_page = dpp;
7474 * move p_mapping list from old page to new page
7476 dpp->p_mapping = tpp->p_mapping;
7477 tpp->p_mapping = NULL;
7478 dpp->p_share = tpp->p_share;
7479 tpp->p_share = 0;
7481 while (index != 0) {
7482 index = index >> 1;
7483 if (index != 0)
7484 cons++;
7485 if (index & 0x1) {
7486 tpp = PP_GROUPLEADER(tpp, cons);
7487 dpp = PP_GROUPLEADER(dpp, cons);
7488 goto retry;
7492 curthread->t_flag &= ~T_DONTDTRACE;
7493 mutex_exit(&kpr_suspendlock);
7496 uint_t
7497 hat_pagesync(struct page *pp, uint_t clearflag)
7499 struct sf_hment *sfhme, *tmphme = NULL;
7500 struct hme_blk *hmeblkp;
7501 kmutex_t *pml;
7502 cpuset_t cpuset, tset;
7503 int index, cons;
7504 extern ulong_t po_share;
7505 page_t *save_pp = pp;
7506 int stop_on_sh = 0;
7507 uint_t shcnt;
7509 CPUSET_ZERO(cpuset);
7511 if (PP_ISRO(pp) && (clearflag & HAT_SYNC_STOPON_MOD)) {
7512 return (PP_GENERIC_ATTR(pp));
7515 if ((clearflag & HAT_SYNC_ZERORM) == 0) {
7516 if ((clearflag & HAT_SYNC_STOPON_REF) && PP_ISREF(pp)) {
7517 return (PP_GENERIC_ATTR(pp));
7519 if ((clearflag & HAT_SYNC_STOPON_MOD) && PP_ISMOD(pp)) {
7520 return (PP_GENERIC_ATTR(pp));
7522 if (clearflag & HAT_SYNC_STOPON_SHARED) {
7523 if (pp->p_share > po_share) {
7524 hat_page_setattr(pp, P_REF);
7525 return (PP_GENERIC_ATTR(pp));
7527 stop_on_sh = 1;
7528 shcnt = 0;
7532 clearflag &= ~HAT_SYNC_STOPON_SHARED;
7533 pml = sfmmu_mlist_enter(pp);
7534 index = PP_MAPINDEX(pp);
7535 cons = TTE8K;
7536 retry:
7537 for (sfhme = pp->p_mapping; sfhme; sfhme = tmphme) {
7539 * We need to save the next hment on the list since
7540 * it is possible for pagesync to remove an invalid hment
7541 * from the list.
7543 tmphme = sfhme->hme_next;
7544 if (IS_PAHME(sfhme))
7545 continue;
7547 * If we are looking for large mappings and this hme doesn't
7548 * reach the range we are seeking, just ignore it.
7550 hmeblkp = sfmmu_hmetohblk(sfhme);
7551 if (hmeblkp->hblk_xhat_bit)
7552 continue;
7554 if (hme_size(sfhme) < cons)
7555 continue;
7557 if (stop_on_sh) {
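			/*
			 * A shared hmeblk stands for every process attached
			 * to its region, so count the region's refcnt rather
			 * than a single mapping.
			 */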
7558 if (hmeblkp->hblk_shared) {
7559 sf_srd_t *srdp = hblktosrd(hmeblkp);
7560 uint_t rid = hmeblkp->hblk_tag.htag_rid;
7561 sf_region_t *rgnp;
7562 ASSERT(SFMMU_IS_SHMERID_VALID(rid));
7563 ASSERT(rid < SFMMU_MAX_HME_REGIONS);
7564 ASSERT(srdp != NULL);
7565 rgnp = srdp->srd_hmergnp[rid];
7566 SFMMU_VALIDATE_SHAREDHBLK(hmeblkp, srdp,
7567 rgnp, rid);
7568 shcnt += rgnp->rgn_refcnt;
7569 } else {
7570 shcnt++;
7572 if (shcnt > po_share) {
7574 * tell the pager to spare the page this time
7575 * around.
7577 hat_page_setattr(save_pp, P_REF);
7578 index = 0;
7579 break;
7582 tset = sfmmu_pagesync(pp, sfhme,
7583 clearflag & ~HAT_SYNC_STOPON_RM);
7584 CPUSET_OR(cpuset, tset);
7587 * If clearflag is HAT_SYNC_DONTZERO, break out as soon
7588 * as the "ref" or "mod" is set or share cnt exceeds po_share.
7590 if ((clearflag & ~HAT_SYNC_STOPON_RM) == HAT_SYNC_DONTZERO &&
7591 (((clearflag & HAT_SYNC_STOPON_MOD) && PP_ISMOD(save_pp)) ||
7592 ((clearflag & HAT_SYNC_STOPON_REF) && PP_ISREF(save_pp)))) {
7593 index = 0;
7594 break;
7598 while (index) {
7599 index = index >> 1;
7600 cons++;
7601 if (index & 0x1) {
7602 /* Go to leading page */
7603 pp = PP_GROUPLEADER(pp, cons);
7604 goto retry;
7608 xt_sync(cpuset);
7609 sfmmu_mlist_exit(pml);
7610 return (PP_GENERIC_ATTR(save_pp));
7614 * Get all the hardware dependent attributes for a page struct
7616 static cpuset_t
7617 sfmmu_pagesync(struct page *pp, struct sf_hment *sfhme,
7618 uint_t clearflag)
7620 caddr_t addr;
7621 tte_t tte, ttemod;
7622 struct hme_blk *hmeblkp;
7623 int ret;
7624 sfmmu_t *sfmmup;
7625 cpuset_t cpuset;
7627 ASSERT(pp != NULL);
7628 ASSERT(sfmmu_mlist_held(pp));
7629 ASSERT((clearflag == HAT_SYNC_DONTZERO) ||
7630 (clearflag == HAT_SYNC_ZERORM));
7632 SFMMU_STAT(sf_pagesync);
7634 CPUSET_ZERO(cpuset);
7636 sfmmu_pagesync_retry:
7638 sfmmu_copytte(&sfhme->hme_tte, &tte);
7639 if (TTE_IS_VALID(&tte)) {
7640 hmeblkp = sfmmu_hmetohblk(sfhme);
7641 sfmmup = hblktosfmmu(hmeblkp);
7642 addr = tte_to_vaddr(hmeblkp, tte);
7643 if (clearflag == HAT_SYNC_ZERORM) {
7644 ttemod = tte;
7645 TTE_CLR_RM(&ttemod);
7646 ret = sfmmu_modifytte_try(&tte, &ttemod,
7647 &sfhme->hme_tte);
7648 if (ret < 0) {
7650 * cas failed and the new value is not what
7651 * we want.
7653 goto sfmmu_pagesync_retry;
7656 if (ret > 0) {
7657 /* we win the cas */
7658 if (hmeblkp->hblk_shared) {
7659 sf_srd_t *srdp = (sf_srd_t *)sfmmup;
7660 uint_t rid =
7661 hmeblkp->hblk_tag.htag_rid;
7662 sf_region_t *rgnp;
7663 ASSERT(SFMMU_IS_SHMERID_VALID(rid));
7664 ASSERT(rid < SFMMU_MAX_HME_REGIONS);
7665 ASSERT(srdp != NULL);
7666 rgnp = srdp->srd_hmergnp[rid];
7667 SFMMU_VALIDATE_SHAREDHBLK(hmeblkp,
7668 srdp, rgnp, rid);
7669 cpuset = sfmmu_rgntlb_demap(addr,
7670 rgnp, hmeblkp, 1);
7671 } else {
7672 sfmmu_tlb_demap(addr, sfmmup, hmeblkp,
7673 0, 0);
7674 cpuset = sfmmup->sfmmu_cpusran;
7678 sfmmu_ttesync(hmeblkp->hblk_shared ? NULL : sfmmup, addr,
7679 &tte, pp);
7681 return (cpuset);
7685 * Remove write permission from a mapping to a page, so that
7686 * we can detect the next modification of it. This requires modifying
7687 * the TTE then invalidating (demap) any TLB entry using that TTE.
7688 * This code is similar to sfmmu_pagesync().
7690 static cpuset_t
7691 sfmmu_pageclrwrt(struct page *pp, struct sf_hment *sfhme)
7693 caddr_t addr;
7694 tte_t tte;
7695 tte_t ttemod;
7696 struct hme_blk *hmeblkp;
7697 int ret;
7698 sfmmu_t *sfmmup;
7699 cpuset_t cpuset;
7701 ASSERT(pp != NULL);
7702 ASSERT(sfmmu_mlist_held(pp));
7704 CPUSET_ZERO(cpuset);
7705 SFMMU_STAT(sf_clrwrt);
7707 retry:
7709 sfmmu_copytte(&sfhme->hme_tte, &tte);
7710 if (TTE_IS_VALID(&tte) && TTE_IS_WRITABLE(&tte)) {
7711 hmeblkp = sfmmu_hmetohblk(sfhme);
7714 * xhat mappings should never be to a VMODSORT page.
7716 ASSERT(hmeblkp->hblk_xhat_bit == 0);
7718 sfmmup = hblktosfmmu(hmeblkp);
7719 addr = tte_to_vaddr(hmeblkp, tte);
7721 ttemod = tte;
7722 TTE_CLR_WRT(&ttemod);
7723 TTE_CLR_MOD(&ttemod);
7724 ret = sfmmu_modifytte_try(&tte, &ttemod, &sfhme->hme_tte);
7727 * If the cas failed and the new value is not what
7728 * we want, retry.
7730 if (ret < 0)
7731 goto retry;
7733 /* we win the cas */
7734 if (ret > 0) {
7735 if (hmeblkp->hblk_shared) {
7736 sf_srd_t *srdp = (sf_srd_t *)sfmmup;
7737 uint_t rid = hmeblkp->hblk_tag.htag_rid;
7738 sf_region_t *rgnp;
7739 ASSERT(SFMMU_IS_SHMERID_VALID(rid));
7740 ASSERT(rid < SFMMU_MAX_HME_REGIONS);
7741 ASSERT(srdp != NULL);
7742 rgnp = srdp->srd_hmergnp[rid];
7743 SFMMU_VALIDATE_SHAREDHBLK(hmeblkp,
7744 srdp, rgnp, rid);
7745 cpuset = sfmmu_rgntlb_demap(addr,
7746 rgnp, hmeblkp, 1);
7747 } else {
7748 sfmmu_tlb_demap(addr, sfmmup, hmeblkp, 0, 0);
7749 cpuset = sfmmup->sfmmu_cpusran;
7754 return (cpuset);
7758 * Walk all mappings of a page, removing write permission and clearing the
7759 * ref/mod bits. This code is similar to hat_pagesync()
7761 static void
7762 hat_page_clrwrt(page_t *pp)
7764 struct sf_hment *sfhme;
7765 struct sf_hment *tmphme = NULL;
7766 kmutex_t *pml;
7767 cpuset_t cpuset;
7768 cpuset_t tset;
7769 int index;
7770 int cons;
7772 CPUSET_ZERO(cpuset);
7774 pml = sfmmu_mlist_enter(pp);
7775 index = PP_MAPINDEX(pp);
7776 cons = TTE8K;
7777 retry:
7778 for (sfhme = pp->p_mapping; sfhme; sfhme = tmphme) {
7779 tmphme = sfhme->hme_next;
7782 * If we are looking for large mappings and this hme doesn't
7783 * reach the range we are seeking, just ignore it.
7786 if (hme_size(sfhme) < cons)
7787 continue;
7789 tset = sfmmu_pageclrwrt(pp, sfhme);
7790 CPUSET_OR(cpuset, tset);
7793 while (index) {
7794 index = index >> 1;
7795 cons++;
7796 if (index & 0x1) {
7797 /* Go to leading page */
7798 pp = PP_GROUPLEADER(pp, cons);
7799 goto retry;
7803 xt_sync(cpuset);
7804 sfmmu_mlist_exit(pml);
7808 * Set the given REF/MOD/RO bits for the given page.
7809 * For a vnode with a sorted v_pages list, we need to change
7810 * the attributes and the v_pages list together under page_vnode_mutex.
7812 void
7813 hat_page_setattr(page_t *pp, uint_t flag)
7815 vnode_t *vp = pp->p_vnode;
7816 page_t **listp;
7817 kmutex_t *pmtx;
7818 kmutex_t *vphm = NULL;
7819 int noshuffle;
7821 noshuffle = flag & P_NSH;
7822 flag &= ~P_NSH;
7824 ASSERT(!(flag & ~(P_MOD | P_REF | P_RO)));
7827 * nothing to do if attribute already set
7829 if ((pp->p_nrm & flag) == flag)
7830 return;
7832 if ((flag & P_MOD) != 0 && vp != NULL && IS_VMODSORT(vp) &&
7833 !noshuffle) {
7834 vphm = page_vnode_mutex(vp);
7835 mutex_enter(vphm);
7838 pmtx = sfmmu_page_enter(pp);
7839 pp->p_nrm |= flag;
7840 sfmmu_page_exit(pmtx);
7842 if (vphm != NULL) {
7844 * Some File Systems examine v_pages for NULL w/o
7845 * grabbing the vphm mutex. Must not let it become NULL when
7846 * pp is the only page on the list.
7848 if (pp->p_vpnext != pp) {
7849 page_vpsub(&vp->v_pages, pp);
7850 if (vp->v_pages != NULL)
7851 listp = &vp->v_pages->p_vpprev->p_vpnext;
7852 else
7853 listp = &vp->v_pages;
7854 page_vpadd(listp, pp);
7856 mutex_exit(vphm);
7860 void
7861 hat_page_clrattr(page_t *pp, uint_t flag)
7863 vnode_t *vp = pp->p_vnode;
7864 kmutex_t *pmtx;
7866 ASSERT(!(flag & ~(P_MOD | P_REF | P_RO)));
7868 pmtx = sfmmu_page_enter(pp);
7871 * Caller is expected to hold page's io lock for VMODSORT to work
7872 * correctly with pvn_vplist_dirty() and pvn_getdirty() when mod
7873 * bit is cleared.
7874 * We don't have an assert here to avoid tripping some existing
7875 * third party code. The dirty page is moved back to the top of the
7876 * v_pages list after IO is done in pvn_write_done().
7878 pp->p_nrm &= ~flag;
7879 sfmmu_page_exit(pmtx);
7881 if ((flag & P_MOD) != 0 && vp != NULL && IS_VMODSORT(vp)) {
7884 * VMODSORT works by removing write permissions and getting
7885 * a fault when a page is made dirty. At this point
7886 * we need to remove write permission from all mappings
7887 * to this page.
7889 hat_page_clrwrt(pp);
7893 uint_t
7894 hat_page_getattr(page_t *pp, uint_t flag)
7896 ASSERT(!(flag & ~(P_MOD | P_REF | P_RO)));
7897 return ((uint_t)(pp->p_nrm & flag));
7901 * DEBUG kernels: verify that a kernel va<->pa translation
7902 * is safe by checking the underlying page_t is in a page
7903 * relocation-safe state.
7905 #ifdef DEBUG
7906 void
7907 sfmmu_check_kpfn(pfn_t pfn)
7909 page_t *pp;
7910 int index, cons;
7912 if (hat_check_vtop == 0)
7913 return;
7915 if (kvseg.s_base == NULL || panicstr)
7916 return;
7918 pp = page_numtopp_nolock(pfn);
7919 if (!pp)
7920 return;
7922 if (PAGE_LOCKED(pp) || PP_ISNORELOC(pp))
7923 return;
7926 * Handed a large kernel page, we dig up the root page since we
7927 * know the root page might have the lock also.
7929 if (pp->p_szc != 0) {
7930 index = PP_MAPINDEX(pp);
7931 cons = TTE8K;
7932 again:
7933 while (index != 0) {
7934 index >>= 1;
7935 if (index != 0)
7936 cons++;
7937 if (index & 0x1) {
7938 pp = PP_GROUPLEADER(pp, cons);
7939 goto again;
7944 if (PAGE_LOCKED(pp) || PP_ISNORELOC(pp))
7945 return;
7948 * Pages need to be locked or allocated "permanent" (either from
7949 * static_arena or by explicitly setting PG_NORELOC when calling
7950 * page_create_va()) for VA->PA translations to be valid.
7952 if (!PP_ISNORELOC(pp))
7953 panic("Illegal VA->PA translation, pp 0x%p not permanent",
7954 (void *)pp);
7955 else
7956 panic("Illegal VA->PA translation, pp 0x%p not locked",
7957 (void *)pp);
7959 #endif /* DEBUG */
7962 * Returns a page frame number for a given virtual address.
7963 * Returns PFN_INVALID to indicate an invalid mapping
7965 pfn_t
7966 hat_getpfnum(struct hat *hat, caddr_t addr)
7968 pfn_t pfn;
7969 tte_t tte;
7972 * We would like to
7973 * ASSERT(AS_LOCK_HELD(as));
7974 * but we can't because the iommu driver will call this
7975 * routine at interrupt time and it can't grab the as lock
7976 * or it will deadlock: A thread could have the as lock
7977 * and be waiting for io. The io can't complete
7978 * because the interrupt thread is blocked trying to grab
7979 * the as lock.
7982 ASSERT(hat->sfmmu_xhat_provider == NULL);
7984 if (hat == ksfmmup) {
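		/*
		 * For VAs in the large-page kmem arena, first try to derive
		 * the pfn from segkmem's large page size; fall back to the
		 * hash lookup below if that fails.
		 */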
7985 if (IS_KMEM_VA_LARGEPAGE(addr)) {
7986 ASSERT(segkmem_lpszc > 0);
7987 pfn = sfmmu_kvaszc2pfn(addr, segkmem_lpszc);
7988 if (pfn != PFN_INVALID) {
7989 sfmmu_check_kpfn(pfn);
7990 return (pfn);
7992 } else if (segkpm && IS_KPM_ADDR(addr)) {
7993 return (sfmmu_kpm_vatopfn(addr));
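		/*
		 * A PFN_SUSPENDED translation means the page is being
		 * relocated; sfmmu_vatopfn_suspended() waits for the
		 * relocation to finish before we retry the lookup.
		 */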
7995 while ((pfn = sfmmu_vatopfn(addr, ksfmmup, &tte))
7996 == PFN_SUSPENDED) {
7997 sfmmu_vatopfn_suspended(addr, ksfmmup, &tte);
7999 sfmmu_check_kpfn(pfn);
8000 return (pfn);
8001 } else {
8002 return (sfmmu_uvatopfn(addr, hat, NULL));
8007 * This routine will return both pfn and tte for the vaddr.
8009 static pfn_t
8010 sfmmu_uvatopfn(caddr_t vaddr, struct hat *sfmmup, tte_t *ttep)
8012 struct hmehash_bucket *hmebp;
8013 hmeblk_tag hblktag;
8014 int hmeshift, hashno = 1;
8015 struct hme_blk *hmeblkp = NULL;
8016 tte_t tte;
8018 struct sf_hment *sfhmep;
8019 pfn_t pfn;
8021 /* support for ISM */
8022 ism_map_t *ism_map;
8023 ism_blk_t *ism_blkp;
8024 int i;
8025 sfmmu_t *ism_hatid = NULL;
8026 sfmmu_t *locked_hatid = NULL;
8027 sfmmu_t *sv_sfmmup = sfmmup;
8028 caddr_t sv_vaddr = vaddr;
8029 sf_srd_t *srdp;
8031 if (ttep == NULL) {
8032 ttep = &tte;
8033 } else {
8034 ttep->ll = 0;
8037 ASSERT(sfmmup != ksfmmup);
8038 SFMMU_STAT(sf_user_vtop);
8040 * Set ism_hatid if vaddr falls in an ISM segment.
8042 ism_blkp = sfmmup->sfmmu_iblk;
8043 if (ism_blkp != NULL) {
8044 sfmmu_ismhat_enter(sfmmup, 0);
8045 locked_hatid = sfmmup;
8047 while (ism_blkp != NULL && ism_hatid == NULL) {
8048 ism_map = ism_blkp->iblk_maps;
8049 for (i = 0; ism_map[i].imap_ismhat && i < ISM_MAP_SLOTS; i++) {
8050 if (vaddr >= ism_start(ism_map[i]) &&
8051 vaddr < ism_end(ism_map[i])) {
8052 sfmmup = ism_hatid = ism_map[i].imap_ismhat;
8053 vaddr = (caddr_t)(vaddr -
8054 ism_start(ism_map[i]));
8055 break;
8058 ism_blkp = ism_blkp->iblk_next;
8060 if (locked_hatid) {
8061 sfmmu_ismhat_exit(locked_hatid, 0);
8064 hblktag.htag_id = sfmmup;
8065 hblktag.htag_rid = SFMMU_INVALID_SHMERID;
8066 do {
8067 hmeshift = HME_HASH_SHIFT(hashno);
8068 hblktag.htag_bspage = HME_HASH_BSPAGE(vaddr, hmeshift);
8069 hblktag.htag_rehash = hashno;
8070 hmebp = HME_HASH_FUNCTION(sfmmup, vaddr, hmeshift);
8072 SFMMU_HASH_LOCK(hmebp);
8074 HME_HASH_FAST_SEARCH(hmebp, hblktag, hmeblkp);
8075 if (hmeblkp != NULL) {
8076 ASSERT(!hmeblkp->hblk_shared);
8077 HBLKTOHME(sfhmep, hmeblkp, vaddr);
8078 sfmmu_copytte(&sfhmep->hme_tte, ttep);
8079 SFMMU_HASH_UNLOCK(hmebp);
8080 if (TTE_IS_VALID(ttep)) {
8081 pfn = TTE_TO_PFN(vaddr, ttep);
8082 return (pfn);
8084 break;
8086 SFMMU_HASH_UNLOCK(hmebp);
8087 hashno++;
8088 } while (HME_REHASH(sfmmup) && (hashno <= mmu_hashcnt));
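	/*
	 * Not found in the process-private hash. If this process uses
	 * shared regions, search the SRD hash for a shared hmeblk that
	 * covers the address.
	 */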
8090 if (SF_HMERGNMAP_ISNULL(sv_sfmmup)) {
8091 return (PFN_INVALID);
8093 srdp = sv_sfmmup->sfmmu_srdp;
8094 ASSERT(srdp != NULL);
8095 ASSERT(srdp->srd_refcnt != 0);
8096 hblktag.htag_id = srdp;
8097 hashno = 1;
8098 do {
8099 hmeshift = HME_HASH_SHIFT(hashno);
8100 hblktag.htag_bspage = HME_HASH_BSPAGE(sv_vaddr, hmeshift);
8101 hblktag.htag_rehash = hashno;
8102 hmebp = HME_HASH_FUNCTION(srdp, sv_vaddr, hmeshift);
8104 SFMMU_HASH_LOCK(hmebp);
8105 for (hmeblkp = hmebp->hmeblkp; hmeblkp != NULL;
8106 hmeblkp = hmeblkp->hblk_next) {
8107 uint_t rid;
8108 sf_region_t *rgnp;
8109 caddr_t rsaddr;
8110 caddr_t readdr;
8112 if (!HTAGS_EQ_SHME(hmeblkp->hblk_tag, hblktag,
8113 sv_sfmmup->sfmmu_hmeregion_map)) {
8114 continue;
8116 ASSERT(hmeblkp->hblk_shared);
8117 rid = hmeblkp->hblk_tag.htag_rid;
8118 ASSERT(SFMMU_IS_SHMERID_VALID(rid));
8119 ASSERT(rid < SFMMU_MAX_HME_REGIONS);
8120 rgnp = srdp->srd_hmergnp[rid];
8121 SFMMU_VALIDATE_SHAREDHBLK(hmeblkp, srdp, rgnp, rid);
8122 HBLKTOHME(sfhmep, hmeblkp, sv_vaddr);
8123 sfmmu_copytte(&sfhmep->hme_tte, ttep);
8124 rsaddr = rgnp->rgn_saddr;
8125 readdr = rsaddr + rgnp->rgn_size;
8126 #ifdef DEBUG
8127 if (TTE_IS_VALID(ttep) ||
8128 get_hblk_ttesz(hmeblkp) > TTE8K) {
8129 caddr_t eva = tte_to_evaddr(hmeblkp, ttep);
8130 ASSERT(eva > sv_vaddr);
8131 ASSERT(sv_vaddr >= rsaddr);
8132 ASSERT(sv_vaddr < readdr);
8133 ASSERT(eva <= readdr);
8135 #endif /* DEBUG */
8137 * Continue the search if we
8138 * found an invalid 8K tte outside of the area
8139 * covered by this hmeblk's region.
8141 if (TTE_IS_VALID(ttep)) {
8142 SFMMU_HASH_UNLOCK(hmebp);
8143 pfn = TTE_TO_PFN(sv_vaddr, ttep);
8144 return (pfn);
8145 } else if (get_hblk_ttesz(hmeblkp) > TTE8K ||
8146 (sv_vaddr >= rsaddr && sv_vaddr < readdr)) {
8147 SFMMU_HASH_UNLOCK(hmebp);
8148 pfn = PFN_INVALID;
8149 return (pfn);
8152 SFMMU_HASH_UNLOCK(hmebp);
8153 hashno++;
8154 } while (hashno <= mmu_hashcnt);
8155 return (PFN_INVALID);
8160 * For compatibility with AT&T and later optimizations
8162 /* ARGSUSED */
8163 void
8164 hat_map(struct hat *hat, caddr_t addr, size_t len, uint_t flags)
8166 ASSERT(hat != NULL);
8167 ASSERT(hat->sfmmu_xhat_provider == NULL);
8171 * Return the number of mappings to a particular page. This number is an
8172 * approximation of the number of people sharing the page.
8174 * Shared hmeblks and ism hmeblks are counted as 1 mapping here.
8175 * hat_page_checkshare() can be used to compare a threshold against a
8176 * share count that reflects the number of region sharers, albeit at higher cost.
8178 ulong_t
8179 hat_page_getshare(page_t *pp)
8181 page_t *spp = pp; /* start page */
8182 kmutex_t *pml;
8183 ulong_t cnt;
8184 int index, sz = TTE64K;
8187 * We need to grab the mlist lock to make sure any outstanding
8188 * load/unloads complete. Otherwise we could return zero
8189 * even though the unload(s) haven't finished yet.
8191 pml = sfmmu_mlist_enter(spp);
8192 cnt = spp->p_share;
8194 #ifdef VAC
8195 if (kpm_enable)
8196 cnt += spp->p_kpmref;
8197 #endif
8198 if (vpm_enable && pp->p_vpmref) {
8199 cnt += 1;
8203 * If we have any large mappings, we count the number of
8204 * mappings that this large page is part of.
8206 index = PP_MAPINDEX(spp);
8207 index >>= 1;
8208 while (index) {
8209 pp = PP_GROUPLEADER(spp, sz);
8210 if ((index & 0x1) && pp != spp) {
8211 cnt += pp->p_share;
8212 spp = pp;
8214 index >>= 1;
8215 sz++;
8217 sfmmu_mlist_exit(pml);
8218 return (cnt);
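/*
 * Illustrative sketch only (hypothetical demo_ names, not used by this
 * HAT): the PP_MAPINDEX walk above can be modeled as scanning a bitmap
 * with one bit per large page size, where each set bit adds the share
 * count of the corresponding group-leader page to the 8K share count.
 */
static ulong_t
demo_getshare(ulong_t small_share, uint_t mapindex, ulong_t *leader_share,
    int nszc)
{
        ulong_t cnt = small_share;      /* 8K share count of the page */
        int sz = 1;                     /* first large page size code */

        mapindex >>= 1;                 /* same initial shift as above */
        while (mapindex != 0 && sz < nszc) {
                if (mapindex & 0x1)
                        cnt += leader_share[sz];    /* large-page sharers */
                mapindex >>= 1;
                sz++;
        }
        return (cnt);
}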
8222 * Return 1 if the number of mappings exceeds sh_thresh. Return 0
8223 * otherwise. Count shared hmeblks by region's refcnt.
8226 hat_page_checkshare(page_t *pp, ulong_t sh_thresh)
8228 kmutex_t *pml;
8229 ulong_t cnt = 0;
8230 int index, sz = TTE8K;
8231 struct sf_hment *sfhme, *tmphme = NULL;
8232 struct hme_blk *hmeblkp;
8234 pml = sfmmu_mlist_enter(pp);
8236 #ifdef VAC
8237 if (kpm_enable)
8238 cnt = pp->p_kpmref;
8239 #endif
8241 if (vpm_enable && pp->p_vpmref) {
8242 cnt += 1;
8245 if (pp->p_share + cnt > sh_thresh) {
8246 sfmmu_mlist_exit(pml);
8247 return (1);
8250 index = PP_MAPINDEX(pp);
8252 again:
8253 for (sfhme = pp->p_mapping; sfhme; sfhme = tmphme) {
8254 tmphme = sfhme->hme_next;
8255 if (IS_PAHME(sfhme)) {
8256 continue;
8259 hmeblkp = sfmmu_hmetohblk(sfhme);
8260 if (hmeblkp->hblk_xhat_bit) {
8261 cnt++;
8262 if (cnt > sh_thresh) {
8263 sfmmu_mlist_exit(pml);
8264 return (1);
8266 continue;
8268 if (hme_size(sfhme) != sz) {
8269 continue;
8272 if (hmeblkp->hblk_shared) {
8273 sf_srd_t *srdp = hblktosrd(hmeblkp);
8274 uint_t rid = hmeblkp->hblk_tag.htag_rid;
8275 sf_region_t *rgnp;
8276 ASSERT(SFMMU_IS_SHMERID_VALID(rid));
8277 ASSERT(rid < SFMMU_MAX_HME_REGIONS);
8278 ASSERT(srdp != NULL);
8279 rgnp = srdp->srd_hmergnp[rid];
8280 SFMMU_VALIDATE_SHAREDHBLK(hmeblkp, srdp,
8281 rgnp, rid);
8282 cnt += rgnp->rgn_refcnt;
8283 } else {
8284 cnt++;
8286 if (cnt > sh_thresh) {
8287 sfmmu_mlist_exit(pml);
8288 return (1);
8292 index >>= 1;
8293 sz++;
8294 while (index) {
8295 pp = PP_GROUPLEADER(pp, sz);
8296 ASSERT(sfmmu_mlist_held(pp));
8297 if (index & 0x1) {
8298 goto again;
8300 index >>= 1;
8301 sz++;
8303 sfmmu_mlist_exit(pml);
8304 return (0);
8308 * Unload all large mappings to the pp and reset the p_szc field of every
8309 * constituent page according to the remaining mappings.
8311 * pp must be locked SE_EXCL. Even though no other constituent pages are
8312 * locked it's legal to unload the large mappings to the pp because all
8313 * constituent pages of large locked mappings have to be locked SE_SHARED.
8314 * This means that if we have an SE_EXCL lock on one of the constituent pages none of the
8315 * large mappings to pp are locked.
8317 * Decrease p_szc field starting from the last constituent page and ending
8318 * with the root page. This method is used because other threads rely on the
8319 * root's p_szc to find the lock to synchronize on. After a root page_t's p_szc
8320 * is demoted, other threads will succeed in sfmmu_mlspl_enter(). This
8321 * ensures that p_szc changes of the constituent pages appear atomic for all
8322 * threads that use sfmmu_mlspl_enter() to examine the p_szc field.
8324 * This mechanism is only used for file system pages where it's not always
8325 * possible to get SE_EXCL locks on all constituent pages to demote the size
8326 * code (as is done for anonymous or kernel large pages).
8328 * See more comments in front of sfmmu_mlspl_enter().
8330 void
8331 hat_page_demote(page_t *pp)
8333 int index;
8334 int sz;
8335 cpuset_t cpuset;
8336 int sync = 0;
8337 page_t *rootpp;
8338 struct sf_hment *sfhme;
8339 struct sf_hment *tmphme = NULL;
8340 struct hme_blk *hmeblkp;
8341 uint_t pszc;
8342 page_t *lastpp;
8343 cpuset_t tset;
8344 pgcnt_t npgs;
8345 kmutex_t *pml;
8346 kmutex_t *pmtx = NULL;
8348 ASSERT(PAGE_EXCL(pp));
8349 ASSERT(!PP_ISFREE(pp));
8350 ASSERT(!PP_ISKAS(pp));
8351 ASSERT(page_szc_lock_assert(pp));
8352 pml = sfmmu_mlist_enter(pp);
8354 pszc = pp->p_szc;
8355 if (pszc == 0) {
8356 goto out;
8359 index = PP_MAPINDEX(pp) >> 1;
8361 if (index) {
8362 CPUSET_ZERO(cpuset);
8363 sz = TTE64K;
8364 sync = 1;
8367 while (index) {
8368 if (!(index & 0x1)) {
8369 index >>= 1;
8370 sz++;
8371 continue;
8373 ASSERT(sz <= pszc);
8374 rootpp = PP_GROUPLEADER(pp, sz);
8375 for (sfhme = rootpp->p_mapping; sfhme; sfhme = tmphme) {
8376 tmphme = sfhme->hme_next;
8377 ASSERT(!IS_PAHME(sfhme));
8378 hmeblkp = sfmmu_hmetohblk(sfhme);
8379 if (hme_size(sfhme) != sz) {
8380 continue;
8382 if (hmeblkp->hblk_xhat_bit) {
8383 cmn_err(CE_PANIC,
8384 "hat_page_demote: xhat hmeblk");
8386 tset = sfmmu_pageunload(rootpp, sfhme, sz);
8387 CPUSET_OR(cpuset, tset);
8389 if (index >>= 1) {
8390 sz++;
8394 ASSERT(!PP_ISMAPPED_LARGE(pp));
8396 if (sync) {
8397 xt_sync(cpuset);
8398 #ifdef VAC
8399 if (PP_ISTNC(pp)) {
8400 conv_tnc(rootpp, sz);
8402 #endif /* VAC */
8405 pmtx = sfmmu_page_enter(pp);
8407 ASSERT(pp->p_szc == pszc);
8408 rootpp = PP_PAGEROOT(pp);
8409 ASSERT(rootpp->p_szc == pszc);
8410 lastpp = PP_PAGENEXT_N(rootpp, TTEPAGES(pszc) - 1);
8412 while (lastpp != rootpp) {
8413 sz = PP_MAPINDEX(lastpp) ? fnd_mapping_sz(lastpp) : 0;
8414 ASSERT(sz < pszc);
8415 npgs = (sz == 0) ? 1 : TTEPAGES(sz);
8416 ASSERT(P2PHASE(lastpp->p_pagenum, npgs) == npgs - 1);
8417 while (--npgs > 0) {
8418 lastpp->p_szc = (uchar_t)sz;
8419 lastpp = PP_PAGEPREV(lastpp);
8421 if (sz) {
8423 * make sure all updates to the constituent
8424 * pages' p_szc fields are globally visible
8425 * before the current root's p_szc is updated.
8427 membar_producer();
8429 lastpp->p_szc = sz;
8430 ASSERT(IS_P2ALIGNED(lastpp->p_pagenum, TTEPAGES(sz)));
8431 if (lastpp != rootpp) {
8432 lastpp = PP_PAGEPREV(lastpp);
8435 if (sz == 0) {
8436 /* the loop above doesn't cover this case */
8437 rootpp->p_szc = 0;
8439 out:
8440 ASSERT(pp->p_szc == 0);
8441 if (pmtx != NULL) {
8442 sfmmu_page_exit(pmtx);
8444 sfmmu_mlist_exit(pml);
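/*
 * Illustrative sketch only (hypothetical demo_ name, not used by this
 * HAT): the ordering requirement above -- write the constituent size
 * codes from the last page toward the root and issue a store barrier
 * before publishing the root's new size code -- reduces to the pattern
 * below, so a thread that observes the new root value also observes
 * the constituent updates.
 */
static void
demo_publish_szc(volatile uchar_t *szc, pgcnt_t npgs, uchar_t newsz)
{
        pgcnt_t i;

        ASSERT(npgs > 0);
        for (i = npgs - 1; i > 0; i--)          /* non-root constituents */
                szc[i] = newsz;
        membar_producer();                      /* order the stores above */
        szc[0] = newsz;                         /* publish the root last */
}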
8448 * Refresh the HAT ismttecnt[] element for size szc.
8449 * Caller must have set ISM busy flag to prevent mapping
8450 * lists from changing while we're traversing them.
8452 pgcnt_t
8453 ism_tsb_entries(sfmmu_t *sfmmup, int szc)
8455 ism_blk_t *ism_blkp = sfmmup->sfmmu_iblk;
8456 ism_map_t *ism_map;
8457 pgcnt_t npgs = 0;
8458 pgcnt_t npgs_scd = 0;
8459 int j;
8460 sf_scd_t *scdp;
8461 uchar_t rid;
8463 ASSERT(SFMMU_FLAGS_ISSET(sfmmup, HAT_ISMBUSY));
8464 scdp = sfmmup->sfmmu_scdp;
8466 for (; ism_blkp != NULL; ism_blkp = ism_blkp->iblk_next) {
8467 ism_map = ism_blkp->iblk_maps;
8468 for (j = 0; ism_map[j].imap_ismhat && j < ISM_MAP_SLOTS; j++) {
8469 rid = ism_map[j].imap_rid;
8470 ASSERT(rid == SFMMU_INVALID_ISMRID ||
8471 rid < sfmmup->sfmmu_srdp->srd_next_ismrid);
8473 if (scdp != NULL && rid != SFMMU_INVALID_ISMRID &&
8474 SF_RGNMAP_TEST(scdp->scd_ismregion_map, rid)) {
8475 /* ISM is in sfmmup's SCD */
8476 npgs_scd +=
8477 ism_map[j].imap_ismhat->sfmmu_ttecnt[szc];
8478 } else {
8479 /* ISM is not in SCD */
8480 npgs +=
8481 ism_map[j].imap_ismhat->sfmmu_ttecnt[szc];
8485 sfmmup->sfmmu_ismttecnt[szc] = npgs;
8486 sfmmup->sfmmu_scdismttecnt[szc] = npgs_scd;
8487 return (npgs);
8491 * Yield the memory claim requirement for an address space.
8493 * This is currently implemented as the number of bytes that have active
8494 * hardware translations that have page structures. Therefore, it can
8495 * underestimate the traditional resident set size, e.g., if the
8496 * physical page is present and the hardware translation is missing;
8497 * and it can overestimate the rss, e.g., if there are active
8498 * translations to a frame buffer with page structs.
8499 * Also, it does not take sharing into account.
8501 * Note that we don't acquire locks here since this function is most often
8502 * called from the clock thread.
8504 size_t
8505 hat_get_mapped_size(struct hat *hat)
8507 size_t assize = 0;
8508 int i;
8510 if (hat == NULL)
8511 return (0);
8513 ASSERT(hat->sfmmu_xhat_provider == NULL);
8515 for (i = 0; i < mmu_page_sizes; i++)
8516 assize += ((pgcnt_t)hat->sfmmu_ttecnt[i] +
8517 (pgcnt_t)hat->sfmmu_scdrttecnt[i]) * TTEBYTES(i);
8519 if (hat->sfmmu_iblk == NULL)
8520 return (assize);
8522 for (i = 0; i < mmu_page_sizes; i++)
8523 assize += ((pgcnt_t)hat->sfmmu_ismttecnt[i] +
8524 (pgcnt_t)hat->sfmmu_scdismttecnt[i]) * TTEBYTES(i);
8526 return (assize);
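/*
 * The estimate above is simply the sum over page sizes of
 * (translation count) * (bytes mapped per translation).  A hypothetical
 * stand-alone form of the same arithmetic (demo_ name, not used here):
 */
static size_t
demo_mapped_size(const pgcnt_t *ttecnt, const size_t *ttebytes, int nszc)
{
        size_t assize = 0;
        int i;

        for (i = 0; i < nszc; i++)
                assize += ttecnt[i] * ttebytes[i];
        return (assize);
}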
8530 hat_stats_enable(struct hat *hat)
8532 hatlock_t *hatlockp;
8534 ASSERT(hat->sfmmu_xhat_provider == NULL);
8536 hatlockp = sfmmu_hat_enter(hat);
8537 hat->sfmmu_rmstat++;
8538 sfmmu_hat_exit(hatlockp);
8539 return (1);
8542 void
8543 hat_stats_disable(struct hat *hat)
8545 hatlock_t *hatlockp;
8547 ASSERT(hat->sfmmu_xhat_provider == NULL);
8549 hatlockp = sfmmu_hat_enter(hat);
8550 hat->sfmmu_rmstat--;
8551 sfmmu_hat_exit(hatlockp);
8555 * Routines for entering or removing ourselves from the
8556 * ism_hat's mapping list. This is used for both private and
8557 * SCD hats.
8559 static void
8560 iment_add(struct ism_ment *iment, struct hat *ism_hat)
8562 ASSERT(MUTEX_HELD(&ism_mlist_lock));
8564 iment->iment_prev = NULL;
8565 iment->iment_next = ism_hat->sfmmu_iment;
8566 if (ism_hat->sfmmu_iment) {
8567 ism_hat->sfmmu_iment->iment_prev = iment;
8569 ism_hat->sfmmu_iment = iment;
8572 static void
8573 iment_sub(struct ism_ment *iment, struct hat *ism_hat)
8575 ASSERT(MUTEX_HELD(&ism_mlist_lock));
8577 if (ism_hat->sfmmu_iment == NULL) {
8578 panic("ism map entry remove - no entries");
8581 if (iment->iment_prev) {
8582 ASSERT(ism_hat->sfmmu_iment != iment);
8583 iment->iment_prev->iment_next = iment->iment_next;
8584 } else {
8585 ASSERT(ism_hat->sfmmu_iment == iment);
8586 ism_hat->sfmmu_iment = iment->iment_next;
8589 if (iment->iment_next) {
8590 iment->iment_next->iment_prev = iment->iment_prev;
8594 * zero out the entry
8596 iment->iment_next = NULL;
8597 iment->iment_prev = NULL;
8598 iment->iment_hat = NULL;
8599 iment->iment_base_va = 0;
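/*
 * iment_add()/iment_sub() above implement ordinary head insertion and
 * unlinking on a doubly linked list.  A generic sketch of the same
 * pattern with hypothetical demo_ names (not used by this HAT):
 */
typedef struct demo_node {
        struct demo_node *next;
        struct demo_node *prev;
} demo_node_t;

static void
demo_add(demo_node_t **headp, demo_node_t *n)
{
        n->prev = NULL;                 /* new head has no predecessor */
        n->next = *headp;
        if (*headp != NULL)
                (*headp)->prev = n;
        *headp = n;
}

static void
demo_sub(demo_node_t **headp, demo_node_t *n)
{
        if (n->prev != NULL)
                n->prev->next = n->next;
        else
                *headp = n->next;       /* n was the head */
        if (n->next != NULL)
                n->next->prev = n->prev;
        n->next = n->prev = NULL;       /* zero out the entry */
}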
8603 * Hat_share()/unshare() return a (non-zero) error
8604 * when saddr and daddr are not properly aligned.
8606 * The top level mapping element determines the alignment
8607 * requirement for saddr and daddr, depending on different
8608 * architectures.
8610 * When hat_share()/unshare() are not supported,
8611 * HATOP_SHARE()/UNSHARE() return 0
8614 hat_share(struct hat *sfmmup, caddr_t addr,
8615 struct hat *ism_hatid, caddr_t sptaddr, size_t len, uint_t ismszc)
8617 ism_blk_t *ism_blkp;
8618 ism_blk_t *new_iblk;
8619 ism_map_t *ism_map;
8620 ism_ment_t *ism_ment;
8621 int i, added;
8622 hatlock_t *hatlockp;
8623 int reload_mmu = 0;
8624 uint_t ismshift = page_get_shift(ismszc);
8625 size_t ismpgsz = page_get_pagesize(ismszc);
8626 uint_t ismmask = (uint_t)ismpgsz - 1;
8627 size_t sh_size = ISM_SHIFT(ismshift, len);
8628 ushort_t ismhatflag;
8629 hat_region_cookie_t rcookie;
8630 sf_scd_t *old_scdp;
8632 #ifdef DEBUG
8633 caddr_t eaddr = addr + len;
8634 #endif /* DEBUG */
8636 ASSERT(ism_hatid != NULL && sfmmup != NULL);
8637 ASSERT(sptaddr == ISMID_STARTADDR);
8639 * Check the alignment.
8641 if (!ISM_ALIGNED(ismshift, addr) || !ISM_ALIGNED(ismshift, sptaddr))
8642 return (EINVAL);
8645 * Check size alignment.
8647 if (!ISM_ALIGNED(ismshift, len))
8648 return (EINVAL);
8650 ASSERT(sfmmup->sfmmu_xhat_provider == NULL);
8653 * Allocate ism_ment for the ism_hat's mapping list, and an
8654 * ism map blk in case we need one. We must do our
8655 * allocations before acquiring locks to prevent a deadlock
8656 * in the kmem allocator on the mapping list lock.
8658 new_iblk = kmem_cache_alloc(ism_blk_cache, KM_SLEEP);
8659 ism_ment = kmem_cache_alloc(ism_ment_cache, KM_SLEEP);
8662 * Serialize ISM mappings with the ISM busy flag, and also the
8663 * trap handlers.
8665 sfmmu_ismhat_enter(sfmmup, 0);
8668 * Allocate an ism map blk if necessary.
8670 if (sfmmup->sfmmu_iblk == NULL) {
8671 sfmmup->sfmmu_iblk = new_iblk;
8672 bzero(new_iblk, sizeof (*new_iblk));
8673 new_iblk->iblk_nextpa = (uint64_t)-1;
8674 membar_stst(); /* make sure next ptr visible to all CPUs */
8675 sfmmup->sfmmu_ismblkpa = va_to_pa((caddr_t)new_iblk);
8676 reload_mmu = 1;
8677 new_iblk = NULL;
8680 #ifdef DEBUG
8682 * Make sure mapping does not already exist.
8684 ism_blkp = sfmmup->sfmmu_iblk;
8685 while (ism_blkp != NULL) {
8686 ism_map = ism_blkp->iblk_maps;
8687 for (i = 0; i < ISM_MAP_SLOTS && ism_map[i].imap_ismhat; i++) {
8688 if ((addr >= ism_start(ism_map[i]) &&
8689 addr < ism_end(ism_map[i])) ||
8690 eaddr > ism_start(ism_map[i]) &&
8691 eaddr <= ism_end(ism_map[i])) {
8692 panic("sfmmu_share: Already mapped!");
8695 ism_blkp = ism_blkp->iblk_next;
8697 #endif /* DEBUG */
8699 ASSERT(ismszc >= TTE4M);
8700 if (ismszc == TTE4M) {
8701 ismhatflag = HAT_4M_FLAG;
8702 } else if (ismszc == TTE32M) {
8703 ismhatflag = HAT_32M_FLAG;
8704 } else if (ismszc == TTE256M) {
8705 ismhatflag = HAT_256M_FLAG;
8708 * Add mapping to first available mapping slot.
8710 ism_blkp = sfmmup->sfmmu_iblk;
8711 added = 0;
8712 while (!added) {
8713 ism_map = ism_blkp->iblk_maps;
8714 for (i = 0; i < ISM_MAP_SLOTS; i++) {
8715 if (ism_map[i].imap_ismhat == NULL) {
8717 ism_map[i].imap_ismhat = ism_hatid;
8718 ism_map[i].imap_vb_shift = (uchar_t)ismshift;
8719 ism_map[i].imap_rid = SFMMU_INVALID_ISMRID;
8720 ism_map[i].imap_hatflags = ismhatflag;
8721 ism_map[i].imap_sz_mask = ismmask;
8723 * imap_seg is checked in ISM_CHECK to see if it is
8724 * non-NULL; if so, the other fields are assumed valid.
8726 membar_stst();
8727 ism_map[i].imap_seg = (uintptr_t)addr | sh_size;
8728 ism_map[i].imap_ment = ism_ment;
8731 * Now add ourselves to the ism_hat's
8732 * mapping list.
8734 ism_ment->iment_hat = sfmmup;
8735 ism_ment->iment_base_va = addr;
8736 ism_hatid->sfmmu_ismhat = 1;
8737 mutex_enter(&ism_mlist_lock);
8738 iment_add(ism_ment, ism_hatid);
8739 mutex_exit(&ism_mlist_lock);
8740 added = 1;
8741 break;
8744 if (!added && ism_blkp->iblk_next == NULL) {
8745 ism_blkp->iblk_next = new_iblk;
8746 new_iblk = NULL;
8747 bzero(ism_blkp->iblk_next,
8748 sizeof (*ism_blkp->iblk_next));
8749 ism_blkp->iblk_next->iblk_nextpa = (uint64_t)-1;
8750 membar_stst();
8751 ism_blkp->iblk_nextpa =
8752 va_to_pa((caddr_t)ism_blkp->iblk_next);
8754 ism_blkp = ism_blkp->iblk_next;
8758 * After calling hat_join_region, sfmmup may join a new SCD or
8759 * move from the old scd to a new scd, in which case we want to
8760 * shrink the sfmmup's private tsb size, i.e., pass shrink to
8761 * sfmmu_check_page_sizes at the end of this routine.
8763 old_scdp = sfmmup->sfmmu_scdp;
8765 rcookie = hat_join_region(sfmmup, addr, len, (void *)ism_hatid, 0,
8766 PROT_ALL, ismszc, NULL, HAT_REGION_ISM);
8767 if (rcookie != HAT_INVALID_REGION_COOKIE) {
8768 ism_map[i].imap_rid = (uchar_t)((uint64_t)rcookie);
8771 * Update our counters for this sfmmup's ism mappings.
8773 for (i = 0; i <= ismszc; i++) {
8774 if (!(disable_ism_large_pages & (1 << i)))
8775 (void) ism_tsb_entries(sfmmup, i);
8779 * For ISM and DISM we do not support 512K pages, so we only
8780 * search the 4M and 8K/64K hashes for 4 pagesize cpus, and search the
8781 * 256M or 32M, and 4M and 8K/64K hashes for 6 pagesize cpus.
8783 * Need to set 32M/256M ISM flags to make sure
8784 * sfmmu_check_page_sizes() enables them on Panther.
8786 ASSERT((disable_ism_large_pages & (1 << TTE512K)) != 0);
8788 switch (ismszc) {
8789 case TTE256M:
8790 if (!SFMMU_FLAGS_ISSET(sfmmup, HAT_256M_ISM)) {
8791 hatlockp = sfmmu_hat_enter(sfmmup);
8792 SFMMU_FLAGS_SET(sfmmup, HAT_256M_ISM);
8793 sfmmu_hat_exit(hatlockp);
8795 break;
8796 case TTE32M:
8797 if (!SFMMU_FLAGS_ISSET(sfmmup, HAT_32M_ISM)) {
8798 hatlockp = sfmmu_hat_enter(sfmmup);
8799 SFMMU_FLAGS_SET(sfmmup, HAT_32M_ISM);
8800 sfmmu_hat_exit(hatlockp);
8802 break;
8803 default:
8804 break;
8808 * If we updated the ismblkpa for this HAT we must make
8809 * sure all CPUs running this process reload their tsbmiss area.
8810 * Otherwise they will fail to load the mappings in the tsbmiss
8811 * handler and will loop calling pagefault().
8813 if (reload_mmu) {
8814 hatlockp = sfmmu_hat_enter(sfmmup);
8815 sfmmu_sync_mmustate(sfmmup);
8816 sfmmu_hat_exit(hatlockp);
8819 sfmmu_ismhat_exit(sfmmup, 0);
8822 * Free up ismblk if we didn't use it.
8824 if (new_iblk != NULL)
8825 kmem_cache_free(ism_blk_cache, new_iblk);
8828 * Check TSB and TLB page sizes.
8830 if (sfmmup->sfmmu_scdp != NULL && old_scdp != sfmmup->sfmmu_scdp) {
8831 sfmmu_check_page_sizes(sfmmup, 0);
8832 } else {
8833 sfmmu_check_page_sizes(sfmmup, 1);
8835 return (0);
8839 * hat_unshare removes exactly one ism_map from
8840 * this process's as. It expects multiple calls
8841 * to hat_unshare for multiple shm segments.
8843 void
8844 hat_unshare(struct hat *sfmmup, caddr_t addr, size_t len, uint_t ismszc)
8846 ism_map_t *ism_map;
8847 ism_ment_t *free_ment = NULL;
8848 ism_blk_t *ism_blkp;
8849 struct hat *ism_hatid;
8850 int found, i;
8851 hatlock_t *hatlockp;
8852 struct tsb_info *tsbinfo;
8853 uint_t ismshift = page_get_shift(ismszc);
8854 size_t sh_size = ISM_SHIFT(ismshift, len);
8855 uchar_t ism_rid;
8856 sf_scd_t *old_scdp;
8858 ASSERT(ISM_ALIGNED(ismshift, addr));
8859 ASSERT(ISM_ALIGNED(ismshift, len));
8860 ASSERT(sfmmup != NULL);
8861 ASSERT(sfmmup != ksfmmup);
8863 if (sfmmup->sfmmu_xhat_provider) {
8864 XHAT_UNSHARE(sfmmup, addr, len);
8865 return;
8866 } else {
8868 * This must be a CPU HAT. If the address space has
8869 * XHATs attached, inform all XHATs that the ISM segment
8870 * is going away.
8872 ASSERT(sfmmup->sfmmu_as != NULL);
8873 if (sfmmup->sfmmu_as->a_xhat != NULL)
8874 xhat_unshare_all(sfmmup->sfmmu_as, addr, len);
8878 * Make sure that during the entire time ISM mappings are removed,
8879 * the trap handlers serialize behind us, and that no one else
8880 * can be mucking with ISM mappings. This also lets us get away
8881 * with not doing expensive cross calls to flush the TLB -- we
8882 * just discard the context, flush the entire TSB, and call it
8883 * a day.
8885 sfmmu_ismhat_enter(sfmmup, 0);
8888 * Remove the mapping.
8890 * We can't have any holes in the ism map.
8891 * The tsb miss code while searching the ism map will
8892 * stop on an empty map slot. So if removing an entry creates a
8893 * hole, we must move every entry past the hole up by one.
8895 * Also, empty ism map blks are not freed until the
8896 * process exits. This is to prevent an MT race condition
8897 * between sfmmu_unshare() and sfmmu_tsbmiss_exception().
8899 found = 0;
8900 ism_blkp = sfmmup->sfmmu_iblk;
8901 while (!found && ism_blkp != NULL) {
8902 ism_map = ism_blkp->iblk_maps;
8903 for (i = 0; i < ISM_MAP_SLOTS; i++) {
8904 if (addr == ism_start(ism_map[i]) &&
8905 sh_size == (size_t)(ism_size(ism_map[i]))) {
8906 found = 1;
8907 break;
8910 if (!found)
8911 ism_blkp = ism_blkp->iblk_next;
8914 if (found) {
8915 ism_hatid = ism_map[i].imap_ismhat;
8916 ism_rid = ism_map[i].imap_rid;
8917 ASSERT(ism_hatid != NULL);
8918 ASSERT(ism_hatid->sfmmu_ismhat == 1);
8921 * After hat_leave_region, the sfmmup may leave SCD,
8922 * in which case, we want to grow the private tsb size when
8923 * calling sfmmu_check_page_sizes at the end of the routine.
8925 old_scdp = sfmmup->sfmmu_scdp;
8927 * Then remove ourselves from the region.
8929 if (ism_rid != SFMMU_INVALID_ISMRID) {
8930 hat_leave_region(sfmmup, (void *)((uint64_t)ism_rid),
8931 HAT_REGION_ISM);
8935 * And now guarantee that any other cpu
8936 * that tries to process an ISM miss
8937 * will go to tl=0.
8939 hatlockp = sfmmu_hat_enter(sfmmup);
8940 sfmmu_invalidate_ctx(sfmmup);
8941 sfmmu_hat_exit(hatlockp);
8944 * Remove ourselves from the ism mapping list.
8946 mutex_enter(&ism_mlist_lock);
8947 iment_sub(ism_map[i].imap_ment, ism_hatid);
8948 mutex_exit(&ism_mlist_lock);
8949 free_ment = ism_map[i].imap_ment;
8952 * We delete the ism map by copying
8953 * the next map over the current one.
8954 * We will take the next one in the maps
8955 * array or from the next ism_blk.
8957 while (ism_blkp != NULL) {
8958 ism_map = ism_blkp->iblk_maps;
8959 while (i < (ISM_MAP_SLOTS - 1)) {
8960 ism_map[i] = ism_map[i + 1];
8961 i++;
8963 /* i == (ISM_MAP_SLOTS - 1) */
8964 ism_blkp = ism_blkp->iblk_next;
8965 if (ism_blkp != NULL) {
8966 ism_map[i] = ism_blkp->iblk_maps[0];
8967 i = 0;
8968 } else {
8969 ism_map[i].imap_seg = 0;
8970 ism_map[i].imap_vb_shift = 0;
8971 ism_map[i].imap_rid = SFMMU_INVALID_ISMRID;
8972 ism_map[i].imap_hatflags = 0;
8973 ism_map[i].imap_sz_mask = 0;
8974 ism_map[i].imap_ismhat = NULL;
8975 ism_map[i].imap_ment = NULL;
8980 * Now flush entire TSB for the process, since
8981 * demapping page by page can be too expensive.
8982 * We don't have to flush the TLB here anymore
8983 * since we switch to a new TLB ctx instead.
8984 * Also, there is no need to flush if the process
8985 * is exiting since the TSB will be freed later.
8987 if (!sfmmup->sfmmu_free) {
8988 hatlockp = sfmmu_hat_enter(sfmmup);
8989 for (tsbinfo = sfmmup->sfmmu_tsb; tsbinfo != NULL;
8990 tsbinfo = tsbinfo->tsb_next) {
8991 if (tsbinfo->tsb_flags & TSB_SWAPPED)
8992 continue;
8993 if (tsbinfo->tsb_flags & TSB_RELOC_FLAG) {
8994 tsbinfo->tsb_flags |=
8995 TSB_FLUSH_NEEDED;
8996 continue;
8999 sfmmu_inv_tsb(tsbinfo->tsb_va,
9000 TSB_BYTES(tsbinfo->tsb_szc));
9002 sfmmu_hat_exit(hatlockp);
9007 * Update our counters for this sfmmup's ism mappings.
9009 for (i = 0; i <= ismszc; i++) {
9010 if (!(disable_ism_large_pages & (1 << i)))
9011 (void) ism_tsb_entries(sfmmup, i);
9014 sfmmu_ismhat_exit(sfmmup, 0);
9017 * We must do our freeing here after dropping locks
9018 * to prevent a deadlock in the kmem allocator on the
9019 * mapping list lock.
9021 if (free_ment != NULL)
9022 kmem_cache_free(ism_ment_cache, free_ment);
9025 * Check TSB and TLB page sizes if the process isn't exiting.
9027 if (!sfmmup->sfmmu_free) {
9028 if (found && old_scdp != NULL && sfmmup->sfmmu_scdp == NULL) {
9029 sfmmu_check_page_sizes(sfmmup, 1);
9030 } else {
9031 sfmmu_check_page_sizes(sfmmup, 0);
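/*
 * The ism map removal in hat_unshare() above must leave no holes, since
 * the tsb miss code stops at the first empty slot.  A minimal
 * single-block sketch of the compaction with hypothetical demo_ names
 * (the real code above also pulls the first entry of the next ism block
 * into the freed last slot, and resets every field of the empty slot):
 */
typedef struct demo_slot {
        uintptr_t       seg;            /* nonzero when the slot is in use */
        void            *hat;
} demo_slot_t;

static void
demo_compact(demo_slot_t *map, int nslots, int hole)
{
        int i;

        for (i = hole; i < nslots - 1; i++)
                map[i] = map[i + 1];    /* shift everyone up one */
        map[nslots - 1].seg = 0;        /* mark the last slot empty */
        map[nslots - 1].hat = NULL;
}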
9036 /* ARGSUSED */
9037 static int
9038 sfmmu_idcache_constructor(void *buf, void *cdrarg, int kmflags)
9040 /* void *buf is sfmmu_t pointer */
9041 bzero(buf, sizeof (sfmmu_t));
9043 return (0);
9046 /* ARGSUSED */
9047 static void
9048 sfmmu_idcache_destructor(void *buf, void *cdrarg)
9050 /* void *buf is sfmmu_t pointer */
9054 * setup kmem hmeblks by bzeroing all members and initializing the nextpa
9055 * field to be the pa of this hmeblk
9057 /* ARGSUSED */
9058 static int
9059 sfmmu_hblkcache_constructor(void *buf, void *cdrarg, int kmflags)
9061 struct hme_blk *hmeblkp;
9063 bzero(buf, (size_t)cdrarg);
9064 hmeblkp = (struct hme_blk *)buf;
9065 hmeblkp->hblk_nextpa = va_to_pa((caddr_t)hmeblkp);
9067 #ifdef HBLK_TRACE
9068 mutex_init(&hmeblkp->hblk_audit_lock, NULL, MUTEX_DEFAULT, NULL);
9069 #endif /* HBLK_TRACE */
9071 return (0);
9074 /* ARGSUSED */
9075 static void
9076 sfmmu_hblkcache_destructor(void *buf, void *cdrarg)
9079 #ifdef HBLK_TRACE
9081 struct hme_blk *hmeblkp;
9083 hmeblkp = (struct hme_blk *)buf;
9084 mutex_destroy(&hmeblkp->hblk_audit_lock);
9086 #endif /* HBLK_TRACE */
9089 #define SFMMU_CACHE_RECLAIM_SCAN_RATIO 8
9090 static int sfmmu_cache_reclaim_scan_ratio = SFMMU_CACHE_RECLAIM_SCAN_RATIO;
9092 * The kmem allocator will call back into our reclaim routine when the system
9093 * is running low on memory. We traverse the hash and free up all unused but
9094 * still cached hme_blks. We also traverse the free list and free them up
9095 * as well.
9097 /*ARGSUSED*/
9098 static void
9099 sfmmu_hblkcache_reclaim(void *cdrarg)
9101 int i;
9102 struct hmehash_bucket *hmebp;
9103 struct hme_blk *hmeblkp, *nx_hblk, *pr_hblk = NULL;
9104 static struct hmehash_bucket *uhmehash_reclaim_hand;
9105 static struct hmehash_bucket *khmehash_reclaim_hand;
9106 struct hme_blk *list = NULL, *last_hmeblkp;
9107 cpuset_t cpuset = cpu_ready_set;
9108 cpu_hme_pend_t *cpuhp;
9110 /* Free up hmeblks on the cpu pending lists */
9111 for (i = 0; i < NCPU; i++) {
9112 cpuhp = &cpu_hme_pend[i];
9113 if (cpuhp->chp_listp != NULL) {
9114 mutex_enter(&cpuhp->chp_mutex);
9115 if (cpuhp->chp_listp == NULL) {
9116 mutex_exit(&cpuhp->chp_mutex);
9117 continue;
9119 for (last_hmeblkp = cpuhp->chp_listp;
9120 last_hmeblkp->hblk_next != NULL;
9121 last_hmeblkp = last_hmeblkp->hblk_next)
9123 last_hmeblkp->hblk_next = list;
9124 list = cpuhp->chp_listp;
9125 cpuhp->chp_listp = NULL;
9126 cpuhp->chp_count = 0;
9127 mutex_exit(&cpuhp->chp_mutex);
9132 if (list != NULL) {
9133 kpreempt_disable();
9134 CPUSET_DEL(cpuset, CPU->cpu_id);
9135 xt_sync(cpuset);
9136 xt_sync(cpuset);
9137 kpreempt_enable();
9138 sfmmu_hblk_free(&list);
9139 list = NULL;
9142 hmebp = uhmehash_reclaim_hand;
9143 if (hmebp == NULL || hmebp > &uhme_hash[UHMEHASH_SZ])
9144 uhmehash_reclaim_hand = hmebp = uhme_hash;
9145 uhmehash_reclaim_hand += UHMEHASH_SZ / sfmmu_cache_reclaim_scan_ratio;
9147 for (i = UHMEHASH_SZ / sfmmu_cache_reclaim_scan_ratio; i; i--) {
9148 if (SFMMU_HASH_LOCK_TRYENTER(hmebp) != 0) {
9149 hmeblkp = hmebp->hmeblkp;
9150 pr_hblk = NULL;
9151 while (hmeblkp) {
9152 nx_hblk = hmeblkp->hblk_next;
9153 if (!hmeblkp->hblk_vcnt &&
9154 !hmeblkp->hblk_hmecnt) {
9155 sfmmu_hblk_hash_rm(hmebp, hmeblkp,
9156 pr_hblk, &list, 0);
9157 } else {
9158 pr_hblk = hmeblkp;
9160 hmeblkp = nx_hblk;
9162 SFMMU_HASH_UNLOCK(hmebp);
9164 if (hmebp++ == &uhme_hash[UHMEHASH_SZ])
9165 hmebp = uhme_hash;
9168 hmebp = khmehash_reclaim_hand;
9169 if (hmebp == NULL || hmebp > &khme_hash[KHMEHASH_SZ])
9170 khmehash_reclaim_hand = hmebp = khme_hash;
9171 khmehash_reclaim_hand += KHMEHASH_SZ / sfmmu_cache_reclaim_scan_ratio;
9173 for (i = KHMEHASH_SZ / sfmmu_cache_reclaim_scan_ratio; i; i--) {
9174 if (SFMMU_HASH_LOCK_TRYENTER(hmebp) != 0) {
9175 hmeblkp = hmebp->hmeblkp;
9176 pr_hblk = NULL;
9177 while (hmeblkp) {
9178 nx_hblk = hmeblkp->hblk_next;
9179 if (!hmeblkp->hblk_vcnt &&
9180 !hmeblkp->hblk_hmecnt) {
9181 sfmmu_hblk_hash_rm(hmebp, hmeblkp,
9182 pr_hblk, &list, 0);
9183 } else {
9184 pr_hblk = hmeblkp;
9186 hmeblkp = nx_hblk;
9188 SFMMU_HASH_UNLOCK(hmebp);
9190 if (hmebp++ == &khme_hash[KHMEHASH_SZ])
9191 hmebp = khme_hash;
9193 sfmmu_hblks_list_purge(&list, 0);
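/*
 * The reclaim above only scans 1/sfmmu_cache_reclaim_scan_ratio of each
 * hash per invocation and remembers where it stopped, so repeated
 * callbacks eventually cover the whole table without stalling the
 * allocator for too long.  A sketch of that "reclaim hand" scan pattern
 * (hypothetical demo_ names, not used by this HAT):
 */
static void
demo_partial_scan(void (*visit)(size_t), size_t tblsz, int ratio,
    size_t *handp)
{
        size_t i, nscan = tblsz / ratio;

        for (i = 0; i < nscan; i++) {
                visit(*handp);                  /* examine one bucket */
                if (++(*handp) >= tblsz)        /* wrap to the start */
                        *handp = 0;
        }
}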
9197 * sfmmu_get_ppvcolor should become a vm_machdep or hatop interface.
9198 * same goes for sfmmu_get_addrvcolor().
9200 * This function will return the virtual color for the specified page. The
9201 * virtual color corresponds to this page's current mapping or its last mapping.
9202 * It is used by memory allocators to choose addresses with the correct
9203 * alignment so vac consistency is automatically maintained. If the page
9204 * has no color it returns -1.
9206 /*ARGSUSED*/
9208 sfmmu_get_ppvcolor(struct page *pp)
9210 #ifdef VAC
9211 int color;
9213 if (!(cache & CACHE_VAC) || PP_NEWPAGE(pp)) {
9214 return (-1);
9216 color = PP_GET_VCOLOR(pp);
9217 ASSERT(color < mmu_btop(shm_alignment));
9218 return (color);
9219 #else
9220 return (-1);
9221 #endif /* VAC */
9225 * This function will return the desired alignment for vac consistency
9226 * (vac color) given a virtual address. If no vac is present it returns -1.
9228 /*ARGSUSED*/
9230 sfmmu_get_addrvcolor(caddr_t vaddr)
9232 #ifdef VAC
9233 if (cache & CACHE_VAC) {
9234 return (addr_to_vcolor(vaddr));
9235 } else {
9236 return (-1);
9238 #else
9239 return (-1);
9240 #endif /* VAC */
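/*
 * A common formulation of the virtual color returned above (hypothetical
 * demo_ name and parameters; addr_to_vcolor() is the authoritative kernel
 * version): the color is which page-sized slot of the cache-alignment
 * window the address falls in, assuming the alignment is a power of two.
 */
static int
demo_addr_to_vcolor(caddr_t vaddr, size_t alignment, size_t pagesize)
{
        return ((int)(((uintptr_t)vaddr & (alignment - 1)) / pagesize));
}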
9243 #ifdef VAC
9245 * Check for conflicts.
9246 * A conflict exists if the new and existing mappings do not match in
9247 * their shm_alignment fields. If conflicts exist, the existing mappings
9248 * are flushed unless one of them is locked. If one of them is locked, then
9249 * the mappings are flushed and converted to non-cacheable mappings.
9251 static void
9252 sfmmu_vac_conflict(struct hat *hat, caddr_t addr, page_t *pp)
9254 struct hat *tmphat;
9255 struct sf_hment *sfhmep, *tmphme = NULL;
9256 struct hme_blk *hmeblkp;
9257 int vcolor;
9258 tte_t tte;
9260 ASSERT(sfmmu_mlist_held(pp));
9261 ASSERT(!PP_ISNC(pp)); /* page better be cacheable */
9263 vcolor = addr_to_vcolor(addr);
9264 if (PP_NEWPAGE(pp)) {
9265 PP_SET_VCOLOR(pp, vcolor);
9266 return;
9269 if (PP_GET_VCOLOR(pp) == vcolor) {
9270 return;
9273 if (!PP_ISMAPPED(pp) && !PP_ISMAPPED_KPM(pp)) {
9275 * Previous user of page had a different color
9276 * but since there are no current users
9277 * we just flush the cache and change the color.
9279 SFMMU_STAT(sf_pgcolor_conflict);
9280 sfmmu_cache_flush(pp->p_pagenum, PP_GET_VCOLOR(pp));
9281 PP_SET_VCOLOR(pp, vcolor);
9282 return;
9286 * If we get here we have a vac conflict with a current
9287 * mapping. VAC conflict policy is as follows.
9288 * - The default is to unload the other mappings unless:
9289 * - If we have a large mapping we uncache the page.
9290 * We need to uncache the rest of the large page too.
9291 * - If any of the mappings are locked we uncache the page.
9292 * - If the requested mapping is inconsistent
9293 * with another mapping and that mapping
9294 * is in the same address space we have to
9295 * make it non-cached. The default thing
9296 * to do is unload the inconsistent mapping
9297 * but if they are in the same address space
9298 * we run the risk of unmapping the pc or the
9299 * stack which we will use as we return to the user,
9300 * in which case we can then fault on the thing
9301 * we just unloaded and get into an infinite loop.
9303 if (PP_ISMAPPED_LARGE(pp)) {
9304 int sz;
9307 * Existing mapping is for big pages. We don't unload
9308 * existing big mappings to satisfy new mappings.
9309 * Always convert all mappings to TNC.
9311 sz = fnd_mapping_sz(pp);
9312 pp = PP_GROUPLEADER(pp, sz);
9313 SFMMU_STAT_ADD(sf_uncache_conflict, TTEPAGES(sz));
9314 sfmmu_page_cache_array(pp, HAT_TMPNC, CACHE_FLUSH,
9315 TTEPAGES(sz));
9317 return;
9321 * check if any mapping is in the same address space (as) or is locked,
9322 * since in that case we need to uncache.
9324 for (sfhmep = pp->p_mapping; sfhmep; sfhmep = tmphme) {
9325 tmphme = sfhmep->hme_next;
9326 if (IS_PAHME(sfhmep))
9327 continue;
9328 hmeblkp = sfmmu_hmetohblk(sfhmep);
9329 if (hmeblkp->hblk_xhat_bit)
9330 continue;
9331 tmphat = hblktosfmmu(hmeblkp);
9332 sfmmu_copytte(&sfhmep->hme_tte, &tte);
9333 ASSERT(TTE_IS_VALID(&tte));
9334 if (hmeblkp->hblk_shared || tmphat == hat ||
9335 hmeblkp->hblk_lckcnt) {
9337 * We have an uncache conflict
9339 SFMMU_STAT(sf_uncache_conflict);
9340 sfmmu_page_cache_array(pp, HAT_TMPNC, CACHE_FLUSH, 1);
9341 return;
9346 * We have an unload conflict
9347 * We have already checked for LARGE mappings, therefore
9348 * the remaining mapping(s) must be TTE8K.
9350 SFMMU_STAT(sf_unload_conflict);
9352 for (sfhmep = pp->p_mapping; sfhmep; sfhmep = tmphme) {
9353 tmphme = sfhmep->hme_next;
9354 if (IS_PAHME(sfhmep))
9355 continue;
9356 hmeblkp = sfmmu_hmetohblk(sfhmep);
9357 if (hmeblkp->hblk_xhat_bit)
9358 continue;
9359 ASSERT(!hmeblkp->hblk_shared);
9360 (void) sfmmu_pageunload(pp, sfhmep, TTE8K);
9363 if (PP_ISMAPPED_KPM(pp))
9364 sfmmu_kpm_vac_unload(pp, addr);
9367 * Unloads only do TLB flushes so we need to flush the
9368 * cache here.
9370 sfmmu_cache_flush(pp->p_pagenum, PP_GET_VCOLOR(pp));
9371 PP_SET_VCOLOR(pp, vcolor);
9375 * Whenever a mapping is unloaded and the page is in TNC state,
9376 * we see if the page can be made cacheable again. 'pp' is
9377 * the page that we just unloaded a mapping from; the size
9378 * of the mapping that was unloaded is 'ottesz'.
9379 * Remark:
9380 * The recache policy for mpss pages can leave a performance problem
9381 * under the following circumstances:
9382 * . A large page in uncached mode has just been unmapped.
9383 * . All constituent pages are TNC due to a conflicting small mapping.
9384 * . There are many other, non conflicting, small mappings around for
9385 * a lot of the constituent pages.
9386 * . We're called w/ the "old" groupleader page and the old ottesz,
9387 * but this is irrelevant, since we're no longer "PP_ISMAPPED_LARGE", so
9388 * we end up w/ TTE8K or npages == 1.
9389 * . We call tst_tnc w/ the old groupleader only, and if there is no
9390 * conflict, we re-cache only this page.
9391 * . All other small mappings are not checked and will be left in TNC mode.
9392 * The problem is not very serious because:
9393 * . mpss is actually only defined for heap and stack, so the probability
9394 * is not very high that a large page mapping exists in parallel to a small
9395 * one (this is possible, but seems to be bad programming style in the
9396 * appl).
9397 * . The problem gets a little bit more serious when those TNC pages
9398 * have to be mapped into kernel space, e.g. for networking.
9399 * . When VAC alias conflicts occur in applications, this is regarded
9400 * as an application bug. So if kstats show them, the appl should
9401 * be changed anyway.
9403 void
9404 conv_tnc(page_t *pp, int ottesz)
9406 int cursz, dosz;
9407 pgcnt_t curnpgs, dopgs;
9408 pgcnt_t pg64k;
9409 page_t *pp2;
9412 * Determine how big a range we check for TNC and find
9413 * leader page. cursz is the size of the biggest
9414 * mapping that still exists on 'pp'.
9416 if (PP_ISMAPPED_LARGE(pp)) {
9417 cursz = fnd_mapping_sz(pp);
9418 } else {
9419 cursz = TTE8K;
9422 if (ottesz >= cursz) {
9423 dosz = ottesz;
9424 pp2 = pp;
9425 } else {
9426 dosz = cursz;
9427 pp2 = PP_GROUPLEADER(pp, dosz);
9430 pg64k = TTEPAGES(TTE64K);
9431 dopgs = TTEPAGES(dosz);
9433 ASSERT(dopgs == 1 || ((dopgs & (pg64k - 1)) == 0));
9435 while (dopgs != 0) {
9436 curnpgs = TTEPAGES(cursz);
9437 if (tst_tnc(pp2, curnpgs)) {
9438 SFMMU_STAT_ADD(sf_recache, curnpgs);
9439 sfmmu_page_cache_array(pp2, HAT_CACHE, CACHE_NO_FLUSH,
9440 curnpgs);
9443 ASSERT(dopgs >= curnpgs);
9444 dopgs -= curnpgs;
9446 if (dopgs == 0) {
9447 break;
9450 pp2 = PP_PAGENEXT_N(pp2, curnpgs);
9451 if (((dopgs & (pg64k - 1)) == 0) && PP_ISMAPPED_LARGE(pp2)) {
9452 cursz = fnd_mapping_sz(pp2);
9453 } else {
9454 cursz = TTE8K;
9460 * Returns 1 if page(s) can be converted from TNC to cacheable setting,
9461 * returns 0 otherwise. Note that oaddr argument is valid for only
9462 * 8k pages.
9465 tst_tnc(page_t *pp, pgcnt_t npages)
9467 struct sf_hment *sfhme;
9468 struct hme_blk *hmeblkp;
9469 tte_t tte;
9470 caddr_t vaddr;
9471 int clr_valid = 0;
9472 int color, color1, bcolor;
9473 int i, ncolors;
9475 ASSERT(pp != NULL);
9476 ASSERT(!(cache & CACHE_WRITEBACK));
9478 if (npages > 1) {
9479 ncolors = CACHE_NUM_COLOR;
9482 for (i = 0; i < npages; i++) {
9483 ASSERT(sfmmu_mlist_held(pp));
9484 ASSERT(PP_ISTNC(pp));
9485 ASSERT(PP_GET_VCOLOR(pp) == NO_VCOLOR);
9487 if (PP_ISPNC(pp)) {
9488 return (0);
9491 clr_valid = 0;
9492 if (PP_ISMAPPED_KPM(pp)) {
9493 caddr_t kpmvaddr;
9495 ASSERT(kpm_enable);
9496 kpmvaddr = hat_kpm_page2va(pp, 1);
9497 ASSERT(!(npages > 1 && IS_KPM_ALIAS_RANGE(kpmvaddr)));
9498 color1 = addr_to_vcolor(kpmvaddr);
9499 clr_valid = 1;
9502 for (sfhme = pp->p_mapping; sfhme; sfhme = sfhme->hme_next) {
9503 if (IS_PAHME(sfhme))
9504 continue;
9505 hmeblkp = sfmmu_hmetohblk(sfhme);
9506 if (hmeblkp->hblk_xhat_bit)
9507 continue;
9509 sfmmu_copytte(&sfhme->hme_tte, &tte);
9510 ASSERT(TTE_IS_VALID(&tte));
9512 vaddr = tte_to_vaddr(hmeblkp, tte);
9513 color = addr_to_vcolor(vaddr);
9515 if (npages > 1) {
9517 * If there is a big mapping, make sure
9518 * 8K mapping is consistent with the big
9519 * mapping.
9521 bcolor = i % ncolors;
9522 if (color != bcolor) {
9523 return (0);
9526 if (!clr_valid) {
9527 clr_valid = 1;
9528 color1 = color;
9531 if (color1 != color) {
9532 return (0);
9536 pp = PP_PAGENEXT(pp);
9539 return (1);
9542 void
9543 sfmmu_page_cache_array(page_t *pp, int flags, int cache_flush_flag,
9544 pgcnt_t npages)
9546 kmutex_t *pmtx;
9547 int i, ncolors, bcolor;
9548 kpm_hlk_t *kpmp;
9549 cpuset_t cpuset;
9551 ASSERT(pp != NULL);
9552 ASSERT(!(cache & CACHE_WRITEBACK));
9554 kpmp = sfmmu_kpm_kpmp_enter(pp, npages);
9555 pmtx = sfmmu_page_enter(pp);
9558 * Fast path caching single unmapped page
9560 if (npages == 1 && !PP_ISMAPPED(pp) && !PP_ISMAPPED_KPM(pp) &&
9561 flags == HAT_CACHE) {
9562 PP_CLRTNC(pp);
9563 PP_CLRPNC(pp);
9564 sfmmu_page_exit(pmtx);
9565 sfmmu_kpm_kpmp_exit(kpmp);
9566 return;
9570 * We need to capture all cpus in order to change cacheability
9571 * because we can't allow one cpu to access the same physical
9572 * page using a cacheable and a non-cacheable mapping at the same
9573 * time. Since we may end up walking the ism mapping list we
9574 * have to grab its lock now, since we can't after all the
9575 * cpus have been captured.
9577 sfmmu_hat_lock_all();
9578 mutex_enter(&ism_mlist_lock);
9579 kpreempt_disable();
9580 cpuset = cpu_ready_set;
9581 xc_attention(cpuset);
9583 if (npages > 1) {
9585 * Make sure all colors are flushed since the
9586 * sfmmu_page_cache() only flushes one color;
9587 * it does not know about big pages.
9589 ncolors = CACHE_NUM_COLOR;
9590 if (flags & HAT_TMPNC) {
9591 for (i = 0; i < ncolors; i++) {
9592 sfmmu_cache_flushcolor(i, pp->p_pagenum);
9594 cache_flush_flag = CACHE_NO_FLUSH;
9598 for (i = 0; i < npages; i++) {
9600 ASSERT(sfmmu_mlist_held(pp));
9602 if (!(flags == HAT_TMPNC && PP_ISTNC(pp))) {
9604 if (npages > 1) {
9605 bcolor = i % ncolors;
9606 } else {
9607 bcolor = NO_VCOLOR;
9610 sfmmu_page_cache(pp, flags, cache_flush_flag,
9611 bcolor);
9614 pp = PP_PAGENEXT(pp);
9617 xt_sync(cpuset);
9618 xc_dismissed(cpuset);
9619 mutex_exit(&ism_mlist_lock);
9620 sfmmu_hat_unlock_all();
9621 sfmmu_page_exit(pmtx);
9622 sfmmu_kpm_kpmp_exit(kpmp);
9623 kpreempt_enable();
9627 * This function changes the virtual cacheability of all mappings to a
9628 * particular page. When changing from uncache to cacheable the mappings will
9629 * only be changed if all of them have the same virtual color.
9630 * We need to flush the cache on all cpus. It is possible that
9631 * a process referenced a page as cacheable but has since exited
9632 * and cleared the mapping list. We still need to flush it, but have
9633 * no state, so flushing on all cpus is the only alternative.
9635 static void
9636 sfmmu_page_cache(page_t *pp, int flags, int cache_flush_flag, int bcolor)
9638 struct sf_hment *sfhme;
9639 struct hme_blk *hmeblkp;
9640 sfmmu_t *sfmmup;
9641 tte_t tte, ttemod;
9642 caddr_t vaddr;
9643 int ret, color;
9644 pfn_t pfn;
9646 color = bcolor;
9647 pfn = pp->p_pagenum;
9649 for (sfhme = pp->p_mapping; sfhme; sfhme = sfhme->hme_next) {
9651 if (IS_PAHME(sfhme))
9652 continue;
9653 hmeblkp = sfmmu_hmetohblk(sfhme);
9655 if (hmeblkp->hblk_xhat_bit)
9656 continue;
9658 sfmmu_copytte(&sfhme->hme_tte, &tte);
9659 ASSERT(TTE_IS_VALID(&tte));
9660 vaddr = tte_to_vaddr(hmeblkp, tte);
9661 color = addr_to_vcolor(vaddr);
9663 #ifdef DEBUG
9664 if ((flags & HAT_CACHE) && bcolor != NO_VCOLOR) {
9665 ASSERT(color == bcolor);
9667 #endif
9669 ASSERT(flags != HAT_TMPNC || color == PP_GET_VCOLOR(pp));
9671 ttemod = tte;
9672 if (flags & (HAT_UNCACHE | HAT_TMPNC)) {
9673 TTE_CLR_VCACHEABLE(&ttemod);
9674 } else { /* flags & HAT_CACHE */
9675 TTE_SET_VCACHEABLE(&ttemod);
9677 ret = sfmmu_modifytte_try(&tte, &ttemod, &sfhme->hme_tte);
9678 if (ret < 0) {
9680 * Since all cpus are captured modifytte should not
9681 * fail.
9683 panic("sfmmu_page_cache: write to tte failed");
9686 sfmmup = hblktosfmmu(hmeblkp);
9687 if (cache_flush_flag == CACHE_FLUSH) {
9689 * Flush TSBs, TLBs and caches
9691 if (hmeblkp->hblk_shared) {
9692 sf_srd_t *srdp = (sf_srd_t *)sfmmup;
9693 uint_t rid = hmeblkp->hblk_tag.htag_rid;
9694 sf_region_t *rgnp;
9695 ASSERT(SFMMU_IS_SHMERID_VALID(rid));
9696 ASSERT(rid < SFMMU_MAX_HME_REGIONS);
9697 ASSERT(srdp != NULL);
9698 rgnp = srdp->srd_hmergnp[rid];
9699 SFMMU_VALIDATE_SHAREDHBLK(hmeblkp,
9700 srdp, rgnp, rid);
9701 (void) sfmmu_rgntlb_demap(vaddr, rgnp,
9702 hmeblkp, 0);
9703 sfmmu_cache_flush(pfn, addr_to_vcolor(vaddr));
9704 } else if (sfmmup->sfmmu_ismhat) {
9705 if (flags & HAT_CACHE) {
9706 SFMMU_STAT(sf_ism_recache);
9707 } else {
9708 SFMMU_STAT(sf_ism_uncache);
9710 sfmmu_ismtlbcache_demap(vaddr, sfmmup, hmeblkp,
9711 pfn, CACHE_FLUSH);
9712 } else {
9713 sfmmu_tlbcache_demap(vaddr, sfmmup, hmeblkp,
9714 pfn, 0, FLUSH_ALL_CPUS, CACHE_FLUSH, 1);
9718 * all cache entries belonging to this pfn are
9719 * now flushed.
9721 cache_flush_flag = CACHE_NO_FLUSH;
9722 } else {
9724 * Flush only TSBs and TLBs.
9726 if (hmeblkp->hblk_shared) {
9727 sf_srd_t *srdp = (sf_srd_t *)sfmmup;
9728 uint_t rid = hmeblkp->hblk_tag.htag_rid;
9729 sf_region_t *rgnp;
9730 ASSERT(SFMMU_IS_SHMERID_VALID(rid));
9731 ASSERT(rid < SFMMU_MAX_HME_REGIONS);
9732 ASSERT(srdp != NULL);
9733 rgnp = srdp->srd_hmergnp[rid];
9734 SFMMU_VALIDATE_SHAREDHBLK(hmeblkp,
9735 srdp, rgnp, rid);
9736 (void) sfmmu_rgntlb_demap(vaddr, rgnp,
9737 hmeblkp, 0);
9738 } else if (sfmmup->sfmmu_ismhat) {
9739 if (flags & HAT_CACHE) {
9740 SFMMU_STAT(sf_ism_recache);
9741 } else {
9742 SFMMU_STAT(sf_ism_uncache);
9744 sfmmu_ismtlbcache_demap(vaddr, sfmmup, hmeblkp,
9745 pfn, CACHE_NO_FLUSH);
9746 } else {
9747 sfmmu_tlb_demap(vaddr, sfmmup, hmeblkp, 0, 1);
9752 if (PP_ISMAPPED_KPM(pp))
9753 sfmmu_kpm_page_cache(pp, flags, cache_flush_flag);
9755 switch (flags) {
9757 default:
9758 panic("sfmmu_pagecache: unknown flags");
9759 break;
9761 case HAT_CACHE:
9762 PP_CLRTNC(pp);
9763 PP_CLRPNC(pp);
9764 PP_SET_VCOLOR(pp, color);
9765 break;
9767 case HAT_TMPNC:
9768 PP_SETTNC(pp);
9769 PP_SET_VCOLOR(pp, NO_VCOLOR);
9770 break;
9772 case HAT_UNCACHE:
9773 PP_SETPNC(pp);
9774 PP_CLRTNC(pp);
9775 PP_SET_VCOLOR(pp, NO_VCOLOR);
9776 break;
9779 #endif /* VAC */
9783 * Wrapper routine used to return a context.
9785 * It's the responsibility of the caller to guarantee that the
9786 * process serializes on calls here by taking the HAT lock for
9787 * the hat.
9790 static void
9791 sfmmu_get_ctx(sfmmu_t *sfmmup)
9793 mmu_ctx_t *mmu_ctxp;
9794 uint_t pstate_save;
9795 int ret;
9797 ASSERT(sfmmu_hat_lock_held(sfmmup));
9798 ASSERT(sfmmup != ksfmmup);
9800 if (SFMMU_FLAGS_ISSET(sfmmup, HAT_ALLCTX_INVALID)) {
9801 sfmmu_setup_tsbinfo(sfmmup);
9802 SFMMU_FLAGS_CLEAR(sfmmup, HAT_ALLCTX_INVALID);
9805 kpreempt_disable();
9807 mmu_ctxp = CPU_MMU_CTXP(CPU);
9808 ASSERT(mmu_ctxp);
9809 ASSERT(mmu_ctxp->mmu_idx < max_mmu_ctxdoms);
9810 ASSERT(mmu_ctxp == mmu_ctxs_tbl[mmu_ctxp->mmu_idx]);
9813 * Do a wrap-around if cnum reaches the max # of cnums supported by an MMU.
9815 if (mmu_ctxp->mmu_cnum == mmu_ctxp->mmu_nctxs)
9816 sfmmu_ctx_wrap_around(mmu_ctxp, B_TRUE);
9819 * Let the MMU set up the page sizes to use for
9820 * this context in the TLB. Don't program 2nd dtlb for ism hat.
9822 if ((&mmu_set_ctx_page_sizes) && (sfmmup->sfmmu_ismhat == 0)) {
9823 mmu_set_ctx_page_sizes(sfmmup);
9827 * sfmmu_alloc_ctx and sfmmu_load_mmustate will be performed with
9828 * interrupts disabled to prevent a race condition with wrap-around
9829 * ctx invalidation. In sun4v, ctx invalidation also involves
9830 * a HV call to set the number of TSBs to 0. If interrupts are not
9831 * disabled until after sfmmu_load_mmustate is complete, TSBs may
9832 * become assigned to INVALID_CONTEXT. This is not allowed.
9834 pstate_save = sfmmu_disable_intrs();
9836 if (sfmmu_alloc_ctx(sfmmup, 1, CPU, SFMMU_PRIVATE) &&
9837 sfmmup->sfmmu_scdp != NULL) {
9838 sf_scd_t *scdp = sfmmup->sfmmu_scdp;
9839 sfmmu_t *scsfmmup = scdp->scd_sfmmup;
9840 ret = sfmmu_alloc_ctx(scsfmmup, 1, CPU, SFMMU_SHARED);
9841 /* debug purpose only */
9842 ASSERT(!ret || scsfmmup->sfmmu_ctxs[CPU_MMU_IDX(CPU)].cnum
9843 != INVALID_CONTEXT);
9845 sfmmu_load_mmustate(sfmmup);
9847 sfmmu_enable_intrs(pstate_save);
9849 kpreempt_enable();
9853 * When all cnums are used up in an MMU, cnum will wrap around to the
9854 * next generation and start from 2.
9856 static void
9857 sfmmu_ctx_wrap_around(mmu_ctx_t *mmu_ctxp, boolean_t reset_cnum)
9860 /* caller must have disabled the preemption */
9861 ASSERT(curthread->t_preempt >= 1);
9862 ASSERT(mmu_ctxp != NULL);
9864 /* acquire Per-MMU (PM) spin lock */
9865 mutex_enter(&mmu_ctxp->mmu_lock);
9867 /* re-check to see if wrap-around is needed */
9868 if (mmu_ctxp->mmu_cnum < mmu_ctxp->mmu_nctxs)
9869 goto done;
9871 SFMMU_MMU_STAT(mmu_wrap_around);
9873 /* update gnum */
9874 ASSERT(mmu_ctxp->mmu_gnum != 0);
9875 mmu_ctxp->mmu_gnum++;
9876 if (mmu_ctxp->mmu_gnum == 0 ||
9877 mmu_ctxp->mmu_gnum > MAX_SFMMU_GNUM_VAL) {
9878 cmn_err(CE_PANIC, "mmu_gnum of mmu_ctx 0x%p is out of bound.",
9879 (void *)mmu_ctxp);
9882 if (mmu_ctxp->mmu_ncpus > 1) {
9883 cpuset_t cpuset;
9885 membar_enter(); /* make sure updated gnum visible */
9887 SFMMU_XCALL_STATS(NULL);
9889 /* xcall to others on the same MMU to invalidate ctx */
9890 cpuset = mmu_ctxp->mmu_cpuset;
9891 ASSERT(CPU_IN_SET(cpuset, CPU->cpu_id) || !reset_cnum);
9892 CPUSET_DEL(cpuset, CPU->cpu_id);
9893 CPUSET_AND(cpuset, cpu_ready_set);
9896 * Pass in INVALID_CONTEXT as the first parameter to
9897 * sfmmu_raise_tsb_exception, which invalidates the context
9898 * of any process running on the CPUs in the MMU.
9900 xt_some(cpuset, sfmmu_raise_tsb_exception,
9901 INVALID_CONTEXT, INVALID_CONTEXT);
9902 xt_sync(cpuset);
9904 SFMMU_MMU_STAT(mmu_tsb_raise_exception);
9907 if (sfmmu_getctx_sec() != INVALID_CONTEXT) {
9908 sfmmu_setctx_sec(INVALID_CONTEXT);
9909 sfmmu_clear_utsbinfo();
9913 * No xcall is needed here. For sun4u systems all CPUs in a context
9914 * domain share a single physical MMU, therefore it's enough to flush
9915 * the TLB on the local CPU. On sun4v systems we use 1 global context
9916 * domain and flush all remote TLBs in the sfmmu_raise_tsb_exception
9917 * handler. Note that vtag_flushall_uctxs() is called
9918 * for Ultra II machines, where the equivalent flushall functionality
9919 * is implemented in SW, and only user ctx TLB entries are flushed.
9921 if (&vtag_flushall_uctxs != NULL) {
9922 vtag_flushall_uctxs();
9923 } else {
9924 vtag_flushall();
9927 /* reset mmu cnum, skips cnum 0 and 1 */
9928 if (reset_cnum == B_TRUE)
9929 mmu_ctxp->mmu_cnum = NUM_LOCKED_CTXS;
9931 done:
9932 mutex_exit(&mmu_ctxp->mmu_lock);
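/*
 * A minimal model of the wrap-around scheme above (hypothetical demo_
 * names, single-threaded, not used by this HAT): context numbers are
 * handed out sequentially, and when they run out the generation number
 * is bumped and cnum restarts above the locked contexts, implicitly
 * invalidating every context issued under the old generation.
 */
typedef struct demo_ctxdom {
        uint64_t        gnum;           /* current generation */
        uint_t          cnum;           /* next context number to hand out */
        uint_t          nctxs;          /* total contexts supported */
        uint_t          nlocked;        /* reserved low context numbers */
} demo_ctxdom_t;

static uint64_t
demo_alloc_ctx(demo_ctxdom_t *dom)
{
        if (dom->cnum == dom->nctxs) {  /* all cnums used: wrap around */
                dom->gnum++;
                dom->cnum = dom->nlocked;
        }
        return ((dom->gnum << 32) | dom->cnum++);
}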
9937 * For a multi-threaded process, set the process context to INVALID_CONTEXT
9938 * so that it faults and reloads the MMU state from TL=0. For a single-threaded
9939 * process, we can just load the MMU state directly without having to
9940 * set context invalid. Caller must hold the hat lock since we don't
9941 * acquire it here.
9943 static void
9944 sfmmu_sync_mmustate(sfmmu_t *sfmmup)
9946 uint_t cnum;
9947 uint_t pstate_save;
9949 ASSERT(sfmmup != ksfmmup);
9950 ASSERT(sfmmu_hat_lock_held(sfmmup));
9952 kpreempt_disable();
9955 * We check whether the passed-in sfmmup is the same as the
9956 * currently running proc. This is to make sure the current proc
9957 * stays single-threaded if it already is.
9959 if ((sfmmup == curthread->t_procp->p_as->a_hat) &&
9960 (curthread->t_procp->p_lwpcnt == 1)) {
9961 /* single-thread */
9962 cnum = sfmmup->sfmmu_ctxs[CPU_MMU_IDX(CPU)].cnum;
9963 if (cnum != INVALID_CONTEXT) {
9964 uint_t curcnum;
9966 * Disable interrupts to prevent a race condition
9967 * with sfmmu_ctx_wrap_around ctx invalidation.
9968 * In sun4v, ctx invalidation involves setting
9969 * the TSB to NULL, hence interrupts should be disabled
9970 * until after sfmmu_load_mmustate is completed.
9972 pstate_save = sfmmu_disable_intrs();
9973 curcnum = sfmmu_getctx_sec();
9974 if (curcnum == cnum)
9975 sfmmu_load_mmustate(sfmmup);
9976 sfmmu_enable_intrs(pstate_save);
9977 ASSERT(curcnum == cnum || curcnum == INVALID_CONTEXT);
9979 } else {
9981 * multi-thread
9982 * or when sfmmup is not the same as the curproc.
9984 sfmmu_invalidate_ctx(sfmmup);
9987 kpreempt_enable();
9992 * Replace the specified TSB with a new TSB. This function gets called when
9993 * we grow, shrink or swapin a TSB. When swapping in a TSB (TSB_SWAPIN), the
9994 * TSB_FORCEALLOC flag may be used to force allocation of a minimum-sized TSB
9995 * (8K).
9997 * Caller must hold the HAT lock, but should assume any tsb_info
9998 * pointers it has are no longer valid after calling this function.
10000 * Return values:
10001 * TSB_ALLOCFAIL Failed to allocate a TSB, due to memory constraints
10002 * TSB_LOSTRACE HAT is busy, i.e. another thread is already doing
10003 * something to this tsbinfo/TSB
10004 * TSB_SUCCESS Operation succeeded
10006 static tsb_replace_rc_t
10007 sfmmu_replace_tsb(sfmmu_t *sfmmup, struct tsb_info *old_tsbinfo, uint_t szc,
10008 hatlock_t *hatlockp, uint_t flags)
10010 struct tsb_info *new_tsbinfo = NULL;
10011 struct tsb_info *curtsb, *prevtsb;
10012 uint_t tte_sz_mask;
10013 int i;
10015 ASSERT(sfmmup != ksfmmup);
10016 ASSERT(sfmmup->sfmmu_ismhat == 0);
10017 ASSERT(sfmmu_hat_lock_held(sfmmup));
10018 ASSERT(szc <= tsb_max_growsize);
10020 if (SFMMU_FLAGS_ISSET(sfmmup, HAT_BUSY))
10021 return (TSB_LOSTRACE);
10024 * Find the tsb_info ahead of this one in the list, and
10025 * also make sure that the tsb_info passed in really
10026 * exists!
10028 for (prevtsb = NULL, curtsb = sfmmup->sfmmu_tsb;
10029 curtsb != old_tsbinfo && curtsb != NULL;
10030 prevtsb = curtsb, curtsb = curtsb->tsb_next)
10032 ASSERT(curtsb != NULL);
10034 if (!(flags & TSB_SWAPIN) && SFMMU_FLAGS_ISSET(sfmmup, HAT_SWAPPED)) {
10036 * The process is swapped out, so just set the new size
10037 * code. When it swaps back in, we'll allocate a new one
10038 * of the new chosen size.
10040 curtsb->tsb_szc = szc;
10041 return (TSB_SUCCESS);
10043 SFMMU_FLAGS_SET(sfmmup, HAT_BUSY);
10045 tte_sz_mask = old_tsbinfo->tsb_ttesz_mask;
10048 * All initialization is done inside of sfmmu_tsbinfo_alloc().
10049 * If we fail to allocate a TSB, exit.
10051 * If the tsb is growing to a new size > 4M and the old size is < 4M,
10052 * then try a 4M slab after the initial alloc fails.
10054 * If we are swapping in a tsb of size > 4M, then try 4M after the
10055 * initial alloc fails.
10057 sfmmu_hat_exit(hatlockp);
10058 if (sfmmu_tsbinfo_alloc(&new_tsbinfo, szc,
10059 tte_sz_mask, flags, sfmmup) &&
10060 (!(flags & (TSB_GROW | TSB_SWAPIN)) || (szc <= TSB_4M_SZCODE) ||
10061 (!(flags & TSB_SWAPIN) &&
10062 (old_tsbinfo->tsb_szc >= TSB_4M_SZCODE)) ||
10063 sfmmu_tsbinfo_alloc(&new_tsbinfo, TSB_4M_SZCODE,
10064 tte_sz_mask, flags, sfmmup))) {
10065 (void) sfmmu_hat_enter(sfmmup);
10066 if (!(flags & TSB_SWAPIN))
10067 SFMMU_STAT(sf_tsb_resize_failures);
10068 SFMMU_FLAGS_CLEAR(sfmmup, HAT_BUSY);
10069 return (TSB_ALLOCFAIL);
10071 (void) sfmmu_hat_enter(sfmmup);
10074 * Re-check to make sure somebody else didn't muck with us while we
10075 * didn't hold the HAT lock. If the process swapped out, fine, just
10076 * exit; this can happen if we try to shrink the TSB from the context
10077 * of another process (such as on an ISM unmap), though it is rare.
10079 if (!(flags & TSB_SWAPIN) && SFMMU_FLAGS_ISSET(sfmmup, HAT_SWAPPED)) {
10080 SFMMU_STAT(sf_tsb_resize_failures);
10081 SFMMU_FLAGS_CLEAR(sfmmup, HAT_BUSY);
10082 sfmmu_hat_exit(hatlockp);
10083 sfmmu_tsbinfo_free(new_tsbinfo);
10084 (void) sfmmu_hat_enter(sfmmup);
10085 return (TSB_LOSTRACE);
10088 #ifdef DEBUG
10089 /* Reverify that the tsb_info still exists.. for debugging only */
10090 for (prevtsb = NULL, curtsb = sfmmup->sfmmu_tsb;
10091 curtsb != old_tsbinfo && curtsb != NULL;
10092 prevtsb = curtsb, curtsb = curtsb->tsb_next)
10094 ASSERT(curtsb != NULL);
10095 #endif /* DEBUG */
10098 * Quiesce any CPUs running this process on their next TLB miss
10099 * so they atomically see the new tsb_info. We temporarily set the
10100 * context to invalid context so new threads that come on processor
10101 * after we do the xcall to cpusran will also serialize behind the
10102 * HAT lock on TLB miss and will see the new TSB. Since this short
10103 * race with a new thread coming on processor is relatively rare,
10104 * this synchronization mechanism should be cheaper than always
10105 * pausing all CPUs for the duration of the setup, which is what
10106 * the old implementation did. This is particularly true if we are
10107 * copying a huge chunk of memory around during that window.
10109 * The memory barriers are to make sure things stay consistent
10110 * with resume() since it does not hold the HAT lock while
10111 * walking the list of tsb_info structures.
10113 if ((flags & TSB_SWAPIN) != TSB_SWAPIN) {
10114 /* The TSB is either growing or shrinking. */
10115 sfmmu_invalidate_ctx(sfmmup);
10116 } else {
10118 * It is illegal to swap in TSBs from a process other
10119 * than a process being swapped in. This in turn
10120 * implies we do not have a valid MMU context here
10121 * since a process needs one to resolve translation
10122 * misses.
10124 ASSERT(curthread->t_procp->p_as->a_hat == sfmmup);
10127 #ifdef DEBUG
10128 ASSERT(max_mmu_ctxdoms > 0);
10131 * Process should have INVALID_CONTEXT on all MMUs
10133 for (i = 0; i < max_mmu_ctxdoms; i++) {
10135 ASSERT(sfmmup->sfmmu_ctxs[i].cnum == INVALID_CONTEXT);
10137 #endif
10139 new_tsbinfo->tsb_next = old_tsbinfo->tsb_next;
10140 membar_stst(); /* strict ordering required */
10141 if (prevtsb)
10142 prevtsb->tsb_next = new_tsbinfo;
10143 else
10144 sfmmup->sfmmu_tsb = new_tsbinfo;
10145 membar_enter(); /* make sure new TSB globally visible */
10148 * We need to migrate TSB entries from the old TSB to the new TSB
10149 * if tsb_remap_ttes is set and the TSB is growing.
10151 if (tsb_remap_ttes && ((flags & TSB_GROW) == TSB_GROW))
10152 sfmmu_copy_tsb(old_tsbinfo, new_tsbinfo);
10154 SFMMU_FLAGS_CLEAR(sfmmup, HAT_BUSY);
10157 * Drop the HAT lock to free our old tsb_info.
10159 sfmmu_hat_exit(hatlockp);
10161 if ((flags & TSB_GROW) == TSB_GROW) {
10162 SFMMU_STAT(sf_tsb_grow);
10163 } else if ((flags & TSB_SHRINK) == TSB_SHRINK) {
10164 SFMMU_STAT(sf_tsb_shrink);
10167 sfmmu_tsbinfo_free(old_tsbinfo);
10169 (void) sfmmu_hat_enter(sfmmup);
10170 return (TSB_SUCCESS);
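/*
 * The tsb_info splice above follows the usual lock-free publication
 * pattern for a singly linked list with concurrent readers (resume()
 * walks the list without the HAT lock): fully initialize the new node,
 * order those stores with a barrier, then swing the predecessor's link.
 * A sketch with hypothetical demo_ names (not used by this HAT):
 */
typedef struct demo_tsbinfo {
        struct demo_tsbinfo *next;
        /* payload omitted */
} demo_tsbinfo_t;

static void
demo_replace_node(demo_tsbinfo_t **linkp, demo_tsbinfo_t *oldti,
    demo_tsbinfo_t *newti)
{
        newti->next = oldti->next;      /* new node inherits the tail */
        membar_stst();                  /* order init before publication */
        *linkp = newti;                 /* readers now see the new node */
        membar_enter();                 /* make the update globally visible */
}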
10174 * This function will re-program the hat pgsz array, and invalidate the
10175 * process' context, forcing the process to switch to another
10176 * context on the next TLB miss, and therefore start using the
10177 * TLB that is reprogrammed for the new page sizes.
10179 void
10180 sfmmu_reprog_pgsz_arr(sfmmu_t *sfmmup, uint8_t *tmp_pgsz)
10182 int i;
10183 hatlock_t *hatlockp = NULL;
10185 hatlockp = sfmmu_hat_enter(sfmmup);
10186 /* USIII+-IV+ optimization, requires hat lock */
10187 if (tmp_pgsz) {
10188 for (i = 0; i < mmu_page_sizes; i++)
10189 sfmmup->sfmmu_pgsz[i] = tmp_pgsz[i];
10191 SFMMU_STAT(sf_tlb_reprog_pgsz);
10193 sfmmu_invalidate_ctx(sfmmup);
10195 sfmmu_hat_exit(hatlockp);
10199 * The scd_rttecnt field in the SCD must be updated to account for the
10200 * regions which it contains.
10202 static void
10203 sfmmu_set_scd_rttecnt(sf_srd_t *srdp, sf_scd_t *scdp)
10205 uint_t rid;
10206 uint_t i, j;
10207 ulong_t w;
10208 sf_region_t *rgnp;
10210 ASSERT(srdp != NULL);
10212 for (i = 0; i < SFMMU_HMERGNMAP_WORDS; i++) {
10213 if ((w = scdp->scd_region_map.bitmap[i]) == 0) {
10214 continue;
10217 j = 0;
10218 while (w) {
10219 if (!(w & 0x1)) {
10220 j++;
10221 w >>= 1;
10222 continue;
10224 rid = (i << BT_ULSHIFT) | j;
10225 j++;
10226 w >>= 1;
10228 ASSERT(SFMMU_IS_SHMERID_VALID(rid));
10229 ASSERT(rid < SFMMU_MAX_HME_REGIONS);
10230 rgnp = srdp->srd_hmergnp[rid];
10231 ASSERT(rgnp->rgn_refcnt > 0);
10232 ASSERT(rgnp->rgn_id == rid);
10234 scdp->scd_rttecnt[rgnp->rgn_pgszc] +=
10235 rgnp->rgn_size >> TTE_PAGE_SHIFT(rgnp->rgn_pgszc);
10238 * Maintain the tsb0 inflation cnt for the regions
10239 * in the SCD.
10241 if (rgnp->rgn_pgszc >= TTE4M) {
10242 scdp->scd_sfmmup->sfmmu_tsb0_4minflcnt +=
10243 rgnp->rgn_size >>
10244 (TTE_PAGE_SHIFT(TTE8K) + 2);
10251 * This function assumes that there are either four or six supported page
10252 * sizes and at most two programmable TLBs, so we need to decide which
10253 * page sizes are most important and then tell the MMU layer so it
10254 * can adjust the TLB page sizes accordingly (if supported).
10256 * If these assumptions change, this function will need to be
10257 * updated to support whatever the new limits are.
10259 * The growing flag is nonzero if we are growing the address space,
10260 * and zero if it is shrinking. This allows us to decide whether
10261 * to grow or shrink our TSB, depending upon available memory
10262 * conditions.
10264 static void
10265 sfmmu_check_page_sizes(sfmmu_t *sfmmup, int growing)
10267 uint64_t ttecnt[MMU_PAGE_SIZES];
10268 uint64_t tte8k_cnt, tte4m_cnt;
10269 uint8_t i;
10270 int sectsb_thresh;
10273 * Kernel threads, processes with small address spaces not using
10274 * large pages, and dummy ISM HATs need not apply.
10276 if (sfmmup == ksfmmup || sfmmup->sfmmu_ismhat != NULL)
10277 return;
10279 if (!SFMMU_LGPGS_INUSE(sfmmup) &&
10280 sfmmup->sfmmu_ttecnt[TTE8K] <= tsb_rss_factor)
10281 return;
10283 for (i = 0; i < mmu_page_sizes; i++) {
10284 ttecnt[i] = sfmmup->sfmmu_ttecnt[i] +
10285 sfmmup->sfmmu_ismttecnt[i];
10288 /* Check pagesizes in use, and possibly reprogram DTLB. */
10289 if (&mmu_check_page_sizes)
10290 mmu_check_page_sizes(sfmmup, ttecnt);
10293 * Calculate the number of 8k ttes to represent the span of these
10294 * pages.
10296 tte8k_cnt = ttecnt[TTE8K] +
10297 (ttecnt[TTE64K] << (MMU_PAGESHIFT64K - MMU_PAGESHIFT)) +
10298 (ttecnt[TTE512K] << (MMU_PAGESHIFT512K - MMU_PAGESHIFT));
10299 if (mmu_page_sizes == max_mmu_page_sizes) {
10300 tte4m_cnt = ttecnt[TTE4M] +
10301 (ttecnt[TTE32M] << (MMU_PAGESHIFT32M - MMU_PAGESHIFT4M)) +
10302 (ttecnt[TTE256M] << (MMU_PAGESHIFT256M - MMU_PAGESHIFT4M));
10303 } else {
10304 tte4m_cnt = ttecnt[TTE4M];
10308 * Inflate tte8k_cnt to allow for region large page allocation failure.
10310 tte8k_cnt += sfmmup->sfmmu_tsb0_4minflcnt;
10313 * Inflate TSB sizes by a factor of 2 if this process
10314 * uses 4M text pages to minimize extra conflict misses
10315 * in the first TSB, since without counting text pages
10316 * the 8K TSB may become too small.
10318 * Also double the size of the second TSB to minimize
10319 * extra conflict misses due to competition between 4M text pages
10320 * and data pages.
10322 * We need to adjust the second TSB allocation threshold by the
10323 * inflation factor, since there is no point in creating a second
10324 * TSB when we know all the mappings can fit in the I/D TLBs.
10326 sectsb_thresh = tsb_sectsb_threshold;
10327 if (sfmmup->sfmmu_flags & HAT_4MTEXT_FLAG) {
10328 tte8k_cnt <<= 1;
10329 tte4m_cnt <<= 1;
10330 sectsb_thresh <<= 1;
10334 * Check to see if our TSB is the right size; we may need to
10335 * grow or shrink it. If the process is small, our work is
10336 * finished at this point.
10338 if (tte8k_cnt <= tsb_rss_factor && tte4m_cnt <= sectsb_thresh) {
10339 return;
10341 sfmmu_size_tsb(sfmmup, growing, tte8k_cnt, tte4m_cnt, sectsb_thresh);
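tte8k_cnt and tte4m_cnt fold the larger page sizes into equivalent counts of the base page size by shifting by the difference of the page shifts. A standalone sketch of that arithmetic follows; the PGSHIFT_* constants are illustrative stand-ins for the MMU_PAGESHIFT* values used above.

#include <stdio.h>
#include <stdint.h>

/* Illustrative page shifts for the 8K, 64K and 512K page sizes. */
#define PGSHIFT_8K      13
#define PGSHIFT_64K     16
#define PGSHIFT_512K    19

/*
 * Express 8K, 64K and 512K mapping counts as the total number of 8K
 * pages they span, mirroring the tte8k_cnt computation above.
 */
static uint64_t
tte8k_span(uint64_t cnt8k, uint64_t cnt64k, uint64_t cnt512k)
{
    return (cnt8k +
        (cnt64k << (PGSHIFT_64K - PGSHIFT_8K)) +
        (cnt512k << (PGSHIFT_512K - PGSHIFT_8K)));
}

int
main(void)
{
    /* 10 x 8K + 2 x 64K (16 x 8K) + 1 x 512K (64 x 8K) = 90 x 8K */
    (void) printf("%llu\n", (unsigned long long)tte8k_span(10, 2, 1));
    return (0);
}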
10344 static void
10345 sfmmu_size_tsb(sfmmu_t *sfmmup, int growing, uint64_t tte8k_cnt,
10346 uint64_t tte4m_cnt, int sectsb_thresh)
10348 int tsb_bits;
10349 uint_t tsb_szc;
10350 struct tsb_info *tsbinfop;
10351 hatlock_t *hatlockp = NULL;
10353 hatlockp = sfmmu_hat_enter(sfmmup);
10354 ASSERT(hatlockp != NULL);
10355 tsbinfop = sfmmup->sfmmu_tsb;
10356 ASSERT(tsbinfop != NULL);
10359 * If we're growing, select the size based on RSS. If we're
10360 * shrinking, leave some room so we don't have to turn around and
10361 * grow again immediately.
10363 if (growing)
10364 tsb_szc = SELECT_TSB_SIZECODE(tte8k_cnt);
10365 else
10366 tsb_szc = SELECT_TSB_SIZECODE(tte8k_cnt << 1);
10368 if (!growing && (tsb_szc < tsbinfop->tsb_szc) &&
10369 (tsb_szc >= default_tsb_size) && TSB_OK_SHRINK()) {
10370 (void) sfmmu_replace_tsb(sfmmup, tsbinfop, tsb_szc,
10371 hatlockp, TSB_SHRINK);
10372 } else if (growing && tsb_szc > tsbinfop->tsb_szc && TSB_OK_GROW()) {
10373 (void) sfmmu_replace_tsb(sfmmup, tsbinfop, tsb_szc,
10374 hatlockp, TSB_GROW);
10376 tsbinfop = sfmmup->sfmmu_tsb;
10379 * With the TLB and first TSB out of the way, we need to see if
10380 * we need a second TSB for 4M pages. If we managed to reprogram
10381 * the TLB page sizes above, the process will start using this new
10382 * TSB right away; otherwise, it will start using it on the next
10383 * context switch. Either way, it's no big deal so there's no
10384 * synchronization with the trap handlers here unless we grow the
10385 * TSB (in which case it's required to prevent using the old one
10386 * after it's freed). Note: a second TSB is required for 32M/256M
10387 * page sizes.
10389 if (tte4m_cnt > sectsb_thresh) {
10391 * If we're growing, select the size based on RSS. If we're
10392 * shrinking, leave some room so we don't have to turn
10393 * around and grow again immediately.
10395 if (growing)
10396 tsb_szc = SELECT_TSB_SIZECODE(tte4m_cnt);
10397 else
10398 tsb_szc = SELECT_TSB_SIZECODE(tte4m_cnt << 1);
10399 if (tsbinfop->tsb_next == NULL) {
10400 struct tsb_info *newtsb;
10401 int allocflags = SFMMU_FLAGS_ISSET(sfmmup, HAT_SWAPPED)?
10402 0 : TSB_ALLOC;
10404 sfmmu_hat_exit(hatlockp);
10407 * Try to allocate a TSB for 4[32|256]M pages. If we
10408 * can't get the size we want, retry w/a minimum sized
10409 * TSB. If that still didn't work, give up; we can
10410 * still run without one.
10412 tsb_bits = (mmu_page_sizes == max_mmu_page_sizes)?
10413 TSB4M|TSB32M|TSB256M:TSB4M;
10414 if ((sfmmu_tsbinfo_alloc(&newtsb, tsb_szc, tsb_bits,
10415 allocflags, sfmmup)) &&
10416 (tsb_szc <= TSB_4M_SZCODE ||
10417 sfmmu_tsbinfo_alloc(&newtsb, TSB_4M_SZCODE,
10418 tsb_bits, allocflags, sfmmup)) &&
10419 sfmmu_tsbinfo_alloc(&newtsb, TSB_MIN_SZCODE,
10420 tsb_bits, allocflags, sfmmup)) {
10421 return;
10424 hatlockp = sfmmu_hat_enter(sfmmup);
10426 sfmmu_invalidate_ctx(sfmmup);
10428 if (sfmmup->sfmmu_tsb->tsb_next == NULL) {
10429 sfmmup->sfmmu_tsb->tsb_next = newtsb;
10430 SFMMU_STAT(sf_tsb_sectsb_create);
10431 sfmmu_hat_exit(hatlockp);
10432 return;
10433 } else {
10435 * It's annoying, but possible for us
10436 * to get here... we dropped the HAT lock
10437 * because of locking order in the kmem
10438 * allocator, and while we were off getting
10439 * our memory, some other thread decided to
10440 * do us a favor and won the race to get a
10441 * second TSB for this process. Sigh.
10443 sfmmu_hat_exit(hatlockp);
10444 sfmmu_tsbinfo_free(newtsb);
10445 return;
10450 * We have a second TSB, see if it's big enough.
10452 tsbinfop = tsbinfop->tsb_next;
10455 * Check to see if our second TSB is the right size;
10456 * we may need to grow or shrink it.
10457 * To prevent thrashing (e.g. growing the TSB on a
10458 * subsequent map operation), only try to shrink if
10459 * the TSB reach exceeds twice the virtual address
10460 * space size.
10462 if (!growing && (tsb_szc < tsbinfop->tsb_szc) &&
10463 (tsb_szc >= default_tsb_size) && TSB_OK_SHRINK()) {
10464 (void) sfmmu_replace_tsb(sfmmup, tsbinfop,
10465 tsb_szc, hatlockp, TSB_SHRINK);
10466 } else if (growing && tsb_szc > tsbinfop->tsb_szc &&
10467 TSB_OK_GROW()) {
10468 (void) sfmmu_replace_tsb(sfmmup, tsbinfop,
10469 tsb_szc, hatlockp, TSB_GROW);
10473 sfmmu_hat_exit(hatlockp);
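The grow/shrink decisions above size the TSB eagerly on growth but leave slack when shrinking so a small RSS increase does not immediately force another replacement. A simplified, standalone sketch of that hysteresis follows; the size-code selection and the 512-entry base reach are invented for illustration and do not reproduce SELECT_TSB_SIZECODE().

#include <stdio.h>
#include <stdint.h>

/*
 * Invented size-code selector: size code 0 covers 512 entries and each
 * step doubles the reach. This is not SELECT_TSB_SIZECODE(), just an
 * illustration of picking the smallest size that covers the count.
 */
static int
select_szc(uint64_t tte_cnt)
{
    uint64_t reach = 512;
    int szc = 0;

    while (tte_cnt > reach && szc < 10) {
        reach <<= 1;
        szc++;
    }
    return (szc);
}

/*
 * Grow eagerly, shrink lazily: when shrinking, size for twice the current
 * working set so a small growth doesn't immediately force another resize.
 */
static int
pick_tsb_szc(uint64_t tte_cnt, int growing)
{
    return (growing ? select_szc(tte_cnt) : select_szc(tte_cnt << 1));
}

int
main(void)
{
    (void) printf("grow:   szc %d\n", pick_tsb_szc(4000, 1));   /* 3 */
    (void) printf("shrink: szc %d\n", pick_tsb_szc(4000, 0));   /* 4 */
    return (0);
}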
10477 * Free up a sfmmu
10478 * Since the sfmmu is currently embedded in the hat struct we simply zero
10479 * out our fields and free up the ism map blk list if any.
10481 static void
10482 sfmmu_free_sfmmu(sfmmu_t *sfmmup)
10484 ism_blk_t *blkp, *nx_blkp;
10485 #ifdef DEBUG
10486 ism_map_t *map;
10487 int i;
10488 #endif
10490 ASSERT(sfmmup->sfmmu_ttecnt[TTE8K] == 0);
10491 ASSERT(sfmmup->sfmmu_ttecnt[TTE64K] == 0);
10492 ASSERT(sfmmup->sfmmu_ttecnt[TTE512K] == 0);
10493 ASSERT(sfmmup->sfmmu_ttecnt[TTE4M] == 0);
10494 ASSERT(sfmmup->sfmmu_ttecnt[TTE32M] == 0);
10495 ASSERT(sfmmup->sfmmu_ttecnt[TTE256M] == 0);
10496 ASSERT(SF_RGNMAP_ISNULL(sfmmup));
10498 sfmmup->sfmmu_free = 0;
10499 sfmmup->sfmmu_ismhat = 0;
10501 blkp = sfmmup->sfmmu_iblk;
10502 sfmmup->sfmmu_iblk = NULL;
10504 while (blkp) {
10505 #ifdef DEBUG
10506 map = blkp->iblk_maps;
10507 for (i = 0; i < ISM_MAP_SLOTS; i++) {
10508 ASSERT(map[i].imap_seg == 0);
10509 ASSERT(map[i].imap_ismhat == NULL);
10510 ASSERT(map[i].imap_ment == NULL);
10512 #endif
10513 nx_blkp = blkp->iblk_next;
10514 blkp->iblk_next = NULL;
10515 blkp->iblk_nextpa = (uint64_t)-1;
10516 kmem_cache_free(ism_blk_cache, blkp);
10517 blkp = nx_blkp;
10522 * Locking primitives accessed by HATLOCK macros
10525 #define SFMMU_SPL_MTX (0x0)
10526 #define SFMMU_ML_MTX (0x1)
10528 #define SFMMU_MLSPL_MTX(type, pg) (((type) == SFMMU_SPL_MTX) ? \
10529 SPL_HASH(pg) : MLIST_HASH(pg))
10531 kmutex_t *
10532 sfmmu_page_enter(struct page *pp)
10534 return (sfmmu_mlspl_enter(pp, SFMMU_SPL_MTX));
10537 void
10538 sfmmu_page_exit(kmutex_t *spl)
10540 mutex_exit(spl);
10544 sfmmu_page_spl_held(struct page *pp)
10546 return (sfmmu_mlspl_held(pp, SFMMU_SPL_MTX));
10549 kmutex_t *
10550 sfmmu_mlist_enter(struct page *pp)
10552 return (sfmmu_mlspl_enter(pp, SFMMU_ML_MTX));
10555 void
10556 sfmmu_mlist_exit(kmutex_t *mml)
10558 mutex_exit(mml);
10562 sfmmu_mlist_held(struct page *pp)
10565 return (sfmmu_mlspl_held(pp, SFMMU_ML_MTX));
10569 * Common code for sfmmu_mlist_enter() and sfmmu_page_enter(). For
10570 * sfmmu_mlist_enter() case the mml_table lock array is used, and for
10571 * sfmmu_page_enter() the sfmmu_page_lock lock array is used.
10573 * The lock is taken on a root page so that it protects an operation on all
10574 * constituent pages of a large page pp belongs to.
10576 * The routine takes a lock from the appropriate array. The lock is determined
10577 * by hashing the root page. After taking the lock this routine checks if the
10578 * root page has the same size code that was used to determine the root (i.e.
10579 * the root hasn't changed). If the root page has the expected p_szc field we
10580 * have the right lock and it's returned to the caller. If root's p_szc
10581 * decreased we release the lock and retry from the beginning. This case can
10582 * happen due to hat_page_demote() decreasing p_szc between our load of p_szc
10583 * value and taking the lock. The number of retries due to p_szc decrease is
10584 * limited by the maximum p_szc value. If p_szc is 0 we return the lock
10585 * determined by hashing pp itself.
10587 * If our caller doesn't hold a SE_SHARED or SE_EXCL lock on pp it's also
10588 * possible that p_szc can increase. To increase p_szc a thread has to lock
10589 * all constituent pages EXCL and do hat_pageunload() on all of them. All the
10590 * callers that don't hold a page locked recheck if the hmeblk through which pp
10591 * was found still maps this pp. If it doesn't map it anymore the returned lock
10592 * is immediately dropped. Therefore if sfmmu_mlspl_enter() hits the case of
10593 * p_szc increase after taking the lock it returns this lock without further
10594 * retries because in this case the caller doesn't care about which lock was
10595 * taken. The caller will drop it right away.
10597 * After the routine returns it's guaranteed that hat_page_demote() can't
10598 * change p_szc field of any of constituent pages of a large page pp belongs
10599 * to as long as pp was either locked at least SHARED prior to this call or
10600 * the caller finds that hment that pointed to this pp still references this
10601 * pp (this also assumes that the caller holds hme hash bucket lock so that
10602 * the same pp can't be remapped into the same hmeblk after it was unmapped by
10603 * hat_pageunload()).
10605 static kmutex_t *
10606 sfmmu_mlspl_enter(struct page *pp, int type)
10608 kmutex_t *mtx;
10609 uint_t prev_rszc = UINT_MAX;
10610 page_t *rootpp;
10611 uint_t szc;
10612 uint_t rszc;
10613 uint_t pszc = pp->p_szc;
10615 ASSERT(pp != NULL);
10617 again:
10618 if (pszc == 0) {
10619 mtx = SFMMU_MLSPL_MTX(type, pp);
10620 mutex_enter(mtx);
10621 return (mtx);
10624 /* The lock lives in the root page */
10625 rootpp = PP_GROUPLEADER(pp, pszc);
10626 mtx = SFMMU_MLSPL_MTX(type, rootpp);
10627 mutex_enter(mtx);
10630 * Return mml in the following 3 cases:
10632 * 1) If pp itself is root since if its p_szc decreased before we took
10633 * the lock pp is still the root of smaller szc page. And if its p_szc
10634 * increased it doesn't matter what lock we return (see comment in
10635 * front of this routine).
10637 * 2) If pp's not root but rootpp is the root of a rootpp->p_szc size
10638 * large page we have the right lock since any previous potential
10639 * hat_page_demote() is done demoting from greater than current root's
10640 * p_szc because hat_page_demote() changes root's p_szc last. No
10641 * further hat_page_demote() can start or be in progress since it
10642 * would need the same lock we currently hold.
10644 * 3) If rootpp's p_szc increased since previous iteration it doesn't
10645 * matter what lock we return (see comment in front of this routine).
10647 if (pp == rootpp || (rszc = rootpp->p_szc) == pszc ||
10648 rszc >= prev_rszc) {
10649 return (mtx);
10653 * hat_page_demote() could have decreased root's p_szc.
10654 * In this case pp's p_szc must also be smaller than pszc.
10655 * Retry.
10657 if (rszc < pszc) {
10658 szc = pp->p_szc;
10659 if (szc < pszc) {
10660 mutex_exit(mtx);
10661 pszc = szc;
10662 goto again;
10665 * pp's p_szc increased after it was decreased.
10666 * page cannot be mapped. Return current lock. The caller
10667 * will drop it right away.
10669 return (mtx);
10673 * root's p_szc is greater than pp's p_szc.
10674 * hat_page_demote() is not done with all pages
10675 * yet. Wait for it to complete.
10677 mutex_exit(mtx);
10678 rootpp = PP_GROUPLEADER(rootpp, rszc);
10679 mtx = SFMMU_MLSPL_MTX(type, rootpp);
10680 mutex_enter(mtx);
10681 mutex_exit(mtx);
10682 prev_rszc = rszc;
10683 goto again;
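A standalone sketch of the hashed lock array underlying sfmmu_mlist_enter()/sfmmu_page_enter() is shown below. It keeps only the hash-to-a-mutex idea; the large-page root lookup and the p_szc retry protocol described above are deliberately omitted, and the table size and hash are arbitrary.

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

#define NLOCKS  64                      /* arbitrary hash table size */

static pthread_mutex_t mlist_table[NLOCKS];

/* Hash a page pointer down to one lock, in the spirit of MLIST_HASH(). */
static pthread_mutex_t *
mlist_hash(const void *pp)
{
    uintptr_t h = (uintptr_t)pp >> 6;   /* discard low (alignment) bits */

    return (&mlist_table[h % NLOCKS]);
}

static pthread_mutex_t *
mlist_enter(const void *pp)
{
    pthread_mutex_t *m = mlist_hash(pp);

    (void) pthread_mutex_lock(m);
    return (m);
}

static void
mlist_exit(pthread_mutex_t *m)
{
    (void) pthread_mutex_unlock(m);
}

int
main(void)
{
    int page;                           /* stand-in for a page_t */
    pthread_mutex_t *m;
    int i;

    for (i = 0; i < NLOCKS; i++)
        (void) pthread_mutex_init(&mlist_table[i], NULL);

    m = mlist_enter(&page);
    /* ... operate on the page's mapping list ... */
    mlist_exit(m);
    (void) printf("done\n");
    return (0);
}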
10686 static int
10687 sfmmu_mlspl_held(struct page *pp, int type)
10689 kmutex_t *mtx;
10691 ASSERT(pp != NULL);
10692 /* The lock lives in the root page */
10693 pp = PP_PAGEROOT(pp);
10694 ASSERT(pp != NULL);
10696 mtx = SFMMU_MLSPL_MTX(type, pp);
10697 return (MUTEX_HELD(mtx));
10700 static uint_t
10701 sfmmu_get_free_hblk(struct hme_blk **hmeblkpp, uint_t critical)
10703 struct hme_blk *hblkp;
10706 if (freehblkp != NULL) {
10707 mutex_enter(&freehblkp_lock);
10708 if (freehblkp != NULL) {
10710 * If the current thread owns hblk_reserve OR this is a
10711 * critical request from sfmmu_hblk_steal(),
10712 * let it succeed even if freehblkcnt is really low.
10714 if (freehblkcnt <= HBLK_RESERVE_MIN && !critical) {
10715 SFMMU_STAT(sf_get_free_throttle);
10716 mutex_exit(&freehblkp_lock);
10717 return (0);
10719 freehblkcnt--;
10720 *hmeblkpp = freehblkp;
10721 hblkp = *hmeblkpp;
10722 freehblkp = hblkp->hblk_next;
10723 mutex_exit(&freehblkp_lock);
10724 hblkp->hblk_next = NULL;
10725 SFMMU_STAT(sf_get_free_success);
10727 ASSERT(hblkp->hblk_hmecnt == 0);
10728 ASSERT(hblkp->hblk_vcnt == 0);
10729 ASSERT(hblkp->hblk_nextpa == va_to_pa((caddr_t)hblkp));
10731 return (1);
10733 mutex_exit(&freehblkp_lock);
10736 /* Check cpu hblk pending queues */
10737 if ((*hmeblkpp = sfmmu_check_pending_hblks(TTE8K)) != NULL) {
10738 hblkp = *hmeblkpp;
10739 hblkp->hblk_next = NULL;
10740 hblkp->hblk_nextpa = va_to_pa((caddr_t)hblkp);
10742 ASSERT(hblkp->hblk_hmecnt == 0);
10743 ASSERT(hblkp->hblk_vcnt == 0);
10745 return (1);
10748 SFMMU_STAT(sf_get_free_fail);
10749 return (0);
10752 static uint_t
10753 sfmmu_put_free_hblk(struct hme_blk *hmeblkp, uint_t critical)
10755 struct hme_blk *hblkp;
10757 ASSERT(hmeblkp->hblk_hmecnt == 0);
10758 ASSERT(hmeblkp->hblk_vcnt == 0);
10759 ASSERT(hmeblkp->hblk_nextpa == va_to_pa((caddr_t)hmeblkp));
10762 * If the current thread is mapping into kernel space,
10763 * let it succeed even if freehblkcnt is at its max
10764 * so that it will avoid freeing it to kmem.
10765 * This will prevent stack overflow due to
10766 * possible recursion since kmem_cache_free()
10767 * might require creation of a slab which
10768 * in turn needs an hmeblk to map that slab;
10769 * let's break this vicious chain at the first
10770 * opportunity.
10772 if (freehblkcnt < HBLK_RESERVE_CNT || critical) {
10773 mutex_enter(&freehblkp_lock);
10774 if (freehblkcnt < HBLK_RESERVE_CNT || critical) {
10775 SFMMU_STAT(sf_put_free_success);
10776 freehblkcnt++;
10777 hmeblkp->hblk_next = freehblkp;
10778 freehblkp = hmeblkp;
10779 mutex_exit(&freehblkp_lock);
10780 return (1);
10782 mutex_exit(&freehblkp_lock);
10786 * Bring down freehblkcnt to HBLK_RESERVE_CNT. We are here
10787 * only if freehblkcnt is at least HBLK_RESERVE_CNT *and*
10788 * we are not in the process of mapping into kernel space.
10790 ASSERT(!critical);
10791 while (freehblkcnt > HBLK_RESERVE_CNT) {
10792 mutex_enter(&freehblkp_lock);
10793 if (freehblkcnt > HBLK_RESERVE_CNT) {
10794 freehblkcnt--;
10795 hblkp = freehblkp;
10796 freehblkp = hblkp->hblk_next;
10797 mutex_exit(&freehblkp_lock);
10798 ASSERT(get_hblk_cache(hblkp) == sfmmu8_cache);
10799 kmem_cache_free(sfmmu8_cache, hblkp);
10800 continue;
10802 mutex_exit(&freehblkp_lock);
10804 SFMMU_STAT(sf_put_free_fail);
10805 return (0);
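sfmmu_get_free_hblk() and sfmmu_put_free_hblk() implement a small, capped reserve pool with a low-water throttle for non-critical callers. A standalone sketch of that pattern follows; the RESERVE_MIN/RESERVE_CNT values and struct blk are illustrative only.

#include <pthread.h>
#include <stdlib.h>
#include <stdio.h>

#define RESERVE_MIN     2       /* hypothetical low-water mark */
#define RESERVE_CNT     8       /* hypothetical pool cap */

struct blk {
    struct blk *next;
};

static pthread_mutex_t pool_lock = PTHREAD_MUTEX_INITIALIZER;
static struct blk *pool_head;
static int pool_cnt;

/*
 * Take a block from the reserve pool. Non-critical callers are throttled
 * once the pool is at or below the low-water mark, so a few blocks are
 * always left for critical requests.
 */
static struct blk *
pool_get(int critical)
{
    struct blk *bp = NULL;

    (void) pthread_mutex_lock(&pool_lock);
    if (pool_head != NULL && (pool_cnt > RESERVE_MIN || critical)) {
        bp = pool_head;
        pool_head = bp->next;
        pool_cnt--;
    }
    (void) pthread_mutex_unlock(&pool_lock);
    return (bp);
}

/* Return a block to the pool unless the pool is already at its cap. */
static int
pool_put(struct blk *bp, int critical)
{
    int added = 0;

    (void) pthread_mutex_lock(&pool_lock);
    if (pool_cnt < RESERVE_CNT || critical) {
        bp->next = pool_head;
        pool_head = bp;
        pool_cnt++;
        added = 1;
    }
    (void) pthread_mutex_unlock(&pool_lock);
    return (added);
}

int
main(void)
{
    struct blk *bp = malloc(sizeof (*bp));

    (void) pool_put(bp, 0);
    bp = pool_get(0);   /* NULL: pool is at the low-water mark */
    bp = pool_get(1);   /* a critical request still succeeds */
    free(bp);
    return (0);
}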
10808 static void
10809 sfmmu_hblk_swap(struct hme_blk *new)
10811 struct hme_blk *old, *hblkp, *prev;
10812 uint64_t newpa;
10813 caddr_t base, vaddr, endaddr;
10814 struct hmehash_bucket *hmebp;
10815 struct sf_hment *osfhme, *nsfhme;
10816 page_t *pp;
10817 kmutex_t *pml;
10818 tte_t tte;
10819 struct hme_blk *list = NULL;
10821 #ifdef DEBUG
10822 hmeblk_tag hblktag;
10823 struct hme_blk *found;
10824 #endif
10825 old = HBLK_RESERVE;
10826 ASSERT(!old->hblk_shared);
10829 * save pa before bcopy clobbers it
10831 newpa = new->hblk_nextpa;
10833 base = (caddr_t)get_hblk_base(old);
10834 endaddr = base + get_hblk_span(old);
10837 * acquire hash bucket lock.
10839 hmebp = sfmmu_tteload_acquire_hashbucket(ksfmmup, base, TTE8K,
10840 SFMMU_INVALID_SHMERID);
10843 * copy contents from old to new
10845 bcopy((void *)old, (void *)new, HME8BLK_SZ);
10848 * add new to hash chain
10850 sfmmu_hblk_hash_add(hmebp, new, newpa);
10853 * search hash chain for hblk_reserve; this needs to be performed
10854 * after adding new, otherwise prev won't correspond to the hblk which
10855 * is prior to old in hash chain when we call sfmmu_hblk_hash_rm to
10856 * remove old later.
10858 for (prev = NULL,
10859 hblkp = hmebp->hmeblkp; hblkp != NULL && hblkp != old;
10860 prev = hblkp, hblkp = hblkp->hblk_next)
10863 if (hblkp != old)
10864 panic("sfmmu_hblk_swap: hblk_reserve not found");
10867 * p_mapping list is still pointing to hments in hblk_reserve;
10868 * fix up p_mapping list so that they point to hments in new.
10870 * Since all these mappings are created by hblk_reserve_thread
10871 * on the way and it's using at least one of the buffers from each of
10872 * the newly minted slabs, there is no danger of any of these
10873 * mappings getting unloaded by another thread.
10875 * tsbmiss could only modify ref/mod bits of hments in old/new.
10876 * Since all of these hments hold mappings established by segkmem
10877 * and mappings in segkmem are setup with HAT_NOSYNC, ref/mod bits
10878 * have no meaning for the mappings in hblk_reserve. hments in
10879 * old and new are identical except for ref/mod bits.
10881 for (vaddr = base; vaddr < endaddr; vaddr += TTEBYTES(TTE8K)) {
10883 HBLKTOHME(osfhme, old, vaddr);
10884 sfmmu_copytte(&osfhme->hme_tte, &tte);
10886 if (TTE_IS_VALID(&tte)) {
10887 if ((pp = osfhme->hme_page) == NULL)
10888 panic("sfmmu_hblk_swap: page not mapped");
10890 pml = sfmmu_mlist_enter(pp);
10892 if (pp != osfhme->hme_page)
10893 panic("sfmmu_hblk_swap: mapping changed");
10895 HBLKTOHME(nsfhme, new, vaddr);
10897 HME_ADD(nsfhme, pp);
10898 HME_SUB(osfhme, pp);
10900 sfmmu_mlist_exit(pml);
10905 * remove old from hash chain
10907 sfmmu_hblk_hash_rm(hmebp, old, prev, &list, 1);
10909 #ifdef DEBUG
10911 hblktag.htag_id = ksfmmup;
10912 hblktag.htag_rid = SFMMU_INVALID_SHMERID;
10913 hblktag.htag_bspage = HME_HASH_BSPAGE(base, HME_HASH_SHIFT(TTE8K));
10914 hblktag.htag_rehash = HME_HASH_REHASH(TTE8K);
10915 HME_HASH_FAST_SEARCH(hmebp, hblktag, found);
10917 if (found != new)
10918 panic("sfmmu_hblk_swap: new hblk not found");
10919 #endif
10921 SFMMU_HASH_UNLOCK(hmebp);
10924 * Reset hblk_reserve
10926 bzero((void *)old, HME8BLK_SZ);
10927 old->hblk_nextpa = va_to_pa((caddr_t)old);
10931 * Grab the mlist mutex for both pages passed in.
10933 * low and high will be returned as pointers to the mutexes for these pages.
10934 * low refers to the mutex residing in the lower bin of the mlist hash, while
10935 * high refers to the mutex residing in the higher bin of the mlist hash. This
10936 * is due to the locking order restrictions on the same thread grabbing
10937 * multiple mlist mutexes. The low lock must be acquired before the high lock.
10939 * If both pages hash to the same mutex, only grab that single mutex, and
10940 * high will be returned as NULL.
10941 * If the pages hash to different bins in the hash, grab the lower addressed
10942 * lock first and then the higher addressed lock in order to follow the locking
10943 * rules involved with the same thread grabbing multiple mlist mutexes.
10944 * low and high will both have non-NULL values.
10946 static void
10947 sfmmu_mlist_reloc_enter(struct page *targ, struct page *repl,
10948 kmutex_t **low, kmutex_t **high)
10950 kmutex_t *mml_targ, *mml_repl;
10953 * no need to do the dance around szc as in sfmmu_mlist_enter()
10954 * because this routine is only called by hat_page_relocate() and all
10955 * targ and repl pages are already locked EXCL so szc can't change.
10958 mml_targ = MLIST_HASH(PP_PAGEROOT(targ));
10959 mml_repl = MLIST_HASH(PP_PAGEROOT(repl));
10961 if (mml_targ == mml_repl) {
10962 *low = mml_targ;
10963 *high = NULL;
10964 } else {
10965 if (mml_targ < mml_repl) {
10966 *low = mml_targ;
10967 *high = mml_repl;
10968 } else {
10969 *low = mml_repl;
10970 *high = mml_targ;
10974 mutex_enter(*low);
10975 if (*high)
10976 mutex_enter(*high);
10979 static void
10980 sfmmu_mlist_reloc_exit(kmutex_t *low, kmutex_t *high)
10982 if (high)
10983 mutex_exit(high);
10984 mutex_exit(low);
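A standalone sketch of the low/high ordering used by sfmmu_mlist_reloc_enter()/_exit() follows: two locks are always taken in a fixed (address) order so that no pair of threads can deadlock, and a single shared lock is taken only once. The pthread mutexes are stand-ins for the kernel mlist mutexes.

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

/*
 * Take two mutexes in a fixed (address) order so that any two threads
 * locking the same pair can never deadlock; a shared lock is taken once.
 */
static void
lock_pair(pthread_mutex_t *a, pthread_mutex_t *b,
    pthread_mutex_t **low, pthread_mutex_t **high)
{
    if (a == b) {
        *low = a;
        *high = NULL;
    } else if ((uintptr_t)a < (uintptr_t)b) {
        *low = a;
        *high = b;
    } else {
        *low = b;
        *high = a;
    }
    (void) pthread_mutex_lock(*low);
    if (*high != NULL)
        (void) pthread_mutex_lock(*high);
}

static void
unlock_pair(pthread_mutex_t *low, pthread_mutex_t *high)
{
    if (high != NULL)
        (void) pthread_mutex_unlock(high);
    (void) pthread_mutex_unlock(low);
}

int
main(void)
{
    pthread_mutex_t m1 = PTHREAD_MUTEX_INITIALIZER;
    pthread_mutex_t m2 = PTHREAD_MUTEX_INITIALIZER;
    pthread_mutex_t *low, *high;

    lock_pair(&m1, &m2, &low, &high);
    /* ... relocate the page while both mapping lists are stable ... */
    unlock_pair(low, high);
    (void) printf("done\n");
    return (0);
}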
10987 static hatlock_t *
10988 sfmmu_hat_enter(sfmmu_t *sfmmup)
10990 hatlock_t *hatlockp;
10992 if (sfmmup != ksfmmup) {
10993 hatlockp = TSB_HASH(sfmmup);
10994 mutex_enter(HATLOCK_MUTEXP(hatlockp));
10995 return (hatlockp);
10997 return (NULL);
11000 static hatlock_t *
11001 sfmmu_hat_tryenter(sfmmu_t *sfmmup)
11003 hatlock_t *hatlockp;
11005 if (sfmmup != ksfmmup) {
11006 hatlockp = TSB_HASH(sfmmup);
11007 if (mutex_tryenter(HATLOCK_MUTEXP(hatlockp)) == 0)
11008 return (NULL);
11009 return (hatlockp);
11011 return (NULL);
11014 static void
11015 sfmmu_hat_exit(hatlock_t *hatlockp)
11017 if (hatlockp != NULL)
11018 mutex_exit(HATLOCK_MUTEXP(hatlockp));
11021 static void
11022 sfmmu_hat_lock_all(void)
11024 int i;
11025 for (i = 0; i < SFMMU_NUM_LOCK; i++)
11026 mutex_enter(HATLOCK_MUTEXP(&hat_lock[i]));
11029 static void
11030 sfmmu_hat_unlock_all(void)
11032 int i;
11033 for (i = SFMMU_NUM_LOCK - 1; i >= 0; i--)
11034 mutex_exit(HATLOCK_MUTEXP(&hat_lock[i]));
11038 sfmmu_hat_lock_held(sfmmu_t *sfmmup)
11040 ASSERT(sfmmup != ksfmmup);
11041 return (MUTEX_HELD(HATLOCK_MUTEXP(TSB_HASH(sfmmup))));
11045 * Locking primitives to provide consistency between ISM unmap
11046 * and other operations. Since ISM unmap can take a long time, we
11047 * use HAT_ISMBUSY flag (protected by the hatlock) to avoid creating
11048 * contention on the hatlock buckets while ISM segments are being
11049 * unmapped. The tradeoff is that the flags don't prevent priority
11050 * inversion from occurring, so we must request kernel priority in
11051 * case we have to sleep to keep from getting buried while holding
11052 * the HAT_ISMBUSY flag set, which in turn could block other kernel
11053 * threads from running (for example, in sfmmu_uvatopfn()).
11055 static void
11056 sfmmu_ismhat_enter(sfmmu_t *sfmmup, int hatlock_held)
11058 hatlock_t *hatlockp;
11060 THREAD_KPRI_REQUEST();
11061 if (!hatlock_held)
11062 hatlockp = sfmmu_hat_enter(sfmmup);
11063 while (SFMMU_FLAGS_ISSET(sfmmup, HAT_ISMBUSY))
11064 cv_wait(&sfmmup->sfmmu_tsb_cv, HATLOCK_MUTEXP(hatlockp));
11065 SFMMU_FLAGS_SET(sfmmup, HAT_ISMBUSY);
11066 if (!hatlock_held)
11067 sfmmu_hat_exit(hatlockp);
11070 static void
11071 sfmmu_ismhat_exit(sfmmu_t *sfmmup, int hatlock_held)
11073 hatlock_t *hatlockp;
11075 if (!hatlock_held)
11076 hatlockp = sfmmu_hat_enter(sfmmup);
11077 ASSERT(SFMMU_FLAGS_ISSET(sfmmup, HAT_ISMBUSY));
11078 SFMMU_FLAGS_CLEAR(sfmmup, HAT_ISMBUSY);
11079 cv_broadcast(&sfmmup->sfmmu_tsb_cv);
11080 if (!hatlock_held)
11081 sfmmu_hat_exit(hatlockp);
11082 THREAD_KPRI_RELEASE();
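A minimal userland sketch of the HAT_ISMBUSY protocol described above follows: a busy flag guarded by a mutex, a condition variable used to wait for the flag to clear, and a broadcast on release. The kernel-priority request and the hatlock_held plumbing are omitted.

#include <pthread.h>
#include <stdio.h>

/* Sketch of the HAT_ISMBUSY protocol: a flag guarded by a mutex + condvar. */
static pthread_mutex_t hat_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t tsb_cv = PTHREAD_COND_INITIALIZER;
static int ism_busy;                    /* stands in for the HAT_ISMBUSY flag */

static void
ismhat_enter(void)
{
    (void) pthread_mutex_lock(&hat_lock);
    while (ism_busy)
        (void) pthread_cond_wait(&tsb_cv, &hat_lock);
    ism_busy = 1;
    (void) pthread_mutex_unlock(&hat_lock);
}

static void
ismhat_exit(void)
{
    (void) pthread_mutex_lock(&hat_lock);
    ism_busy = 0;
    (void) pthread_cond_broadcast(&tsb_cv);
    (void) pthread_mutex_unlock(&hat_lock);
}

int
main(void)
{
    ismhat_enter();
    /* ... long-running ISM unmap work runs without holding hat_lock ... */
    ismhat_exit();
    (void) printf("done\n");
    return (0);
}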
11087 * Algorithm:
11089 * (1) if segkmem is not ready, allocate hblk from an array of pre-alloc'ed
11090 * hblks.
11092 * (2) if we are allocating an hblk for mapping a slab in sfmmu_cache,
11094 * (a) try to return an hblk from reserve pool of free hblks;
11095 * (b) if the reserve pool is empty, acquire hblk_reserve_lock
11096 * and return hblk_reserve.
11098 * (3) call kmem_cache_alloc() to allocate hblk;
11100 * (a) if hblk_reserve_lock is held by the current thread,
11101 * atomically replace hblk_reserve by the hblk that is
11102 * returned by kmem_cache_alloc; release hblk_reserve_lock
11103 * and call kmem_cache_alloc() again.
11104 * (b) if reserve pool is not full, add the hblk that is
11105 * returned by kmem_cache_alloc to reserve pool and
11106 * call kmem_cache_alloc again.
11109 static struct hme_blk *
11110 sfmmu_hblk_alloc(sfmmu_t *sfmmup, caddr_t vaddr,
11111 struct hmehash_bucket *hmebp, uint_t size, hmeblk_tag hblktag,
11112 uint_t flags, uint_t rid)
11114 struct hme_blk *hmeblkp = NULL;
11115 struct hme_blk *newhblkp;
11116 struct hme_blk *shw_hblkp = NULL;
11117 struct kmem_cache *sfmmu_cache = NULL;
11118 uint64_t hblkpa;
11119 ulong_t index;
11120 uint_t owner; /* set to 1 if using hblk_reserve */
11121 uint_t forcefree;
11122 int sleep;
11123 sf_srd_t *srdp;
11124 sf_region_t *rgnp;
11126 ASSERT(SFMMU_HASH_LOCK_ISHELD(hmebp));
11127 ASSERT(hblktag.htag_rid == rid);
11128 SFMMU_VALIDATE_HMERID(sfmmup, rid, vaddr, TTEBYTES(size));
11129 ASSERT(!SFMMU_IS_SHMERID_VALID(rid) ||
11130 IS_P2ALIGNED(vaddr, TTEBYTES(size)));
11133 * If segkmem is not created yet, allocate from static hmeblks
11134 * created at the end of startup_modules(). See the block comment
11135 * in startup_modules() describing how we estimate the number of
11136 * static hmeblks that will be needed during re-map.
11138 if (!hblk_alloc_dynamic) {
11140 ASSERT(!SFMMU_IS_SHMERID_VALID(rid));
11142 if (size == TTE8K) {
11143 index = nucleus_hblk8.index;
11144 if (index >= nucleus_hblk8.len) {
11146 * If we panic here, see startup_modules() to
11147 * make sure that we are calculating the
11148 * number of hblk8's that we need correctly.
11150 prom_panic("no nucleus hblk8 to allocate");
11152 hmeblkp =
11153 (struct hme_blk *)&nucleus_hblk8.list[index];
11154 nucleus_hblk8.index++;
11155 SFMMU_STAT(sf_hblk8_nalloc);
11156 } else {
11157 index = nucleus_hblk1.index;
11158 if (nucleus_hblk1.index >= nucleus_hblk1.len) {
11160 * If we panic here, see startup_modules().
11161 * Most likely you need to update the
11162 * calculation of the number of hblk1 elements
11163 * that the kernel needs to boot.
11165 prom_panic("no nucleus hblk1 to allocate");
11167 hmeblkp =
11168 (struct hme_blk *)&nucleus_hblk1.list[index];
11169 nucleus_hblk1.index++;
11170 SFMMU_STAT(sf_hblk1_nalloc);
11173 goto hblk_init;
11176 SFMMU_HASH_UNLOCK(hmebp);
11178 if (sfmmup != KHATID && !SFMMU_IS_SHMERID_VALID(rid)) {
11179 if (mmu_page_sizes == max_mmu_page_sizes) {
11180 if (size < TTE256M)
11181 shw_hblkp = sfmmu_shadow_hcreate(sfmmup, vaddr,
11182 size, flags);
11183 } else {
11184 if (size < TTE4M)
11185 shw_hblkp = sfmmu_shadow_hcreate(sfmmup, vaddr,
11186 size, flags);
11188 } else if (SFMMU_IS_SHMERID_VALID(rid)) {
11190 * Shared hmes use per region bitmaps in rgn_hmeflag
11191 * rather than shadow hmeblks to keep track of the
11192 * mapping sizes which have been allocated for the region.
11193 * Here we clean up old invalid hmeblks with this rid,
11194 * which may be left around by pageunload().
11196 int ttesz;
11197 caddr_t va;
11198 caddr_t eva = vaddr + TTEBYTES(size);
11200 ASSERT(sfmmup != KHATID);
11202 srdp = sfmmup->sfmmu_srdp;
11203 ASSERT(srdp != NULL && srdp->srd_refcnt != 0);
11204 rgnp = srdp->srd_hmergnp[rid];
11205 ASSERT(rgnp != NULL && rgnp->rgn_id == rid);
11206 ASSERT(rgnp->rgn_refcnt != 0);
11207 ASSERT(size <= rgnp->rgn_pgszc);
11209 ttesz = HBLK_MIN_TTESZ;
11210 do {
11211 if (!(rgnp->rgn_hmeflags & (0x1 << ttesz))) {
11212 continue;
11215 if (ttesz > size && ttesz != HBLK_MIN_TTESZ) {
11216 sfmmu_cleanup_rhblk(srdp, vaddr, rid, ttesz);
11217 } else if (ttesz < size) {
11218 for (va = vaddr; va < eva;
11219 va += TTEBYTES(ttesz)) {
11220 sfmmu_cleanup_rhblk(srdp, va, rid,
11221 ttesz);
11224 } while (++ttesz <= rgnp->rgn_pgszc);
11227 fill_hblk:
11228 owner = (hblk_reserve_thread == curthread) ? 1 : 0;
11230 if (owner && size == TTE8K) {
11232 ASSERT(!SFMMU_IS_SHMERID_VALID(rid));
11234 * We are really in a tight spot. We already own
11235 * hblk_reserve and we need another hblk. In anticipation
11236 * of this kind of scenario, we specifically set aside
11237 * HBLK_RESERVE_MIN number of hblks to be used exclusively
11238 * by owner of hblk_reserve.
11240 SFMMU_STAT(sf_hblk_recurse_cnt);
11242 if (!sfmmu_get_free_hblk(&hmeblkp, 1))
11243 panic("sfmmu_hblk_alloc: reserve list is empty");
11245 goto hblk_verify;
11248 ASSERT(!owner);
11250 if ((flags & HAT_NO_KALLOC) == 0) {
11252 sfmmu_cache = ((size == TTE8K) ? sfmmu8_cache : sfmmu1_cache);
11253 sleep = ((sfmmup == KHATID) ? KM_NOSLEEP : KM_SLEEP);
11255 if ((hmeblkp = kmem_cache_alloc(sfmmu_cache, sleep)) == NULL) {
11256 hmeblkp = sfmmu_hblk_steal(size);
11257 } else {
11259 * if we are the owner of hblk_reserve,
11260 * swap hblk_reserve with hmeblkp and
11261 * start a fresh life. Hope things go
11262 * better this time.
11264 if (hblk_reserve_thread == curthread) {
11265 ASSERT(sfmmu_cache == sfmmu8_cache);
11266 sfmmu_hblk_swap(hmeblkp);
11267 hblk_reserve_thread = NULL;
11268 mutex_exit(&hblk_reserve_lock);
11269 goto fill_hblk;
11272 * let's donate this hblk to our reserve list if
11273 * we are not mapping kernel range
11275 if (size == TTE8K && sfmmup != KHATID) {
11276 if (sfmmu_put_free_hblk(hmeblkp, 0))
11277 goto fill_hblk;
11280 } else {
11282 * We are here to map the slab in sfmmu8_cache; let's
11283 * check if we could tap our reserve list; if successful,
11284 * this will avoid the pain of going thru sfmmu_hblk_swap
11286 SFMMU_STAT(sf_hblk_slab_cnt);
11287 if (!sfmmu_get_free_hblk(&hmeblkp, 0)) {
11289 * let's start hblk_reserve dance
11291 SFMMU_STAT(sf_hblk_reserve_cnt);
11292 owner = 1;
11293 mutex_enter(&hblk_reserve_lock);
11294 hmeblkp = HBLK_RESERVE;
11295 hblk_reserve_thread = curthread;
11299 hblk_verify:
11300 ASSERT(hmeblkp != NULL);
11301 set_hblk_sz(hmeblkp, size);
11302 ASSERT(hmeblkp->hblk_nextpa == va_to_pa((caddr_t)hmeblkp));
11303 SFMMU_HASH_LOCK(hmebp);
11304 HME_HASH_FAST_SEARCH(hmebp, hblktag, newhblkp);
11305 if (newhblkp != NULL) {
11306 SFMMU_HASH_UNLOCK(hmebp);
11307 if (hmeblkp != HBLK_RESERVE) {
11309 * This is really tricky!
11311 * vmem_alloc(vmem_seg_arena)
11312 * vmem_alloc(vmem_internal_arena)
11313 * segkmem_alloc(heap_arena)
11314 * vmem_alloc(heap_arena)
11315 * page_create()
11316 * hat_memload()
11317 * kmem_cache_free()
11318 * kmem_cache_alloc()
11319 * kmem_slab_create()
11320 * vmem_alloc(kmem_internal_arena)
11321 * segkmem_alloc(heap_arena)
11322 * vmem_alloc(heap_arena)
11323 * page_create()
11324 * hat_memload()
11325 * kmem_cache_free()
11326 * ...
11328 * Thus, hat_memload() could call kmem_cache_free
11329 * enough times that we could easily
11330 * hit the bottom of the stack or run out of the reserve
11331 * list of vmem_seg structs. So, we must donate
11332 * this hblk to reserve list if it's allocated
11333 * from sfmmu8_cache *and* mapping kernel range.
11334 * We don't need to worry about freeing hmeblk1's
11335 * to kmem since they don't map any kmem slabs.
11337 * Note: When segkmem supports largepages, we must
11338 * free hmeblk1's to reserve list as well.
11340 forcefree = (sfmmup == KHATID) ? 1 : 0;
11341 if (size == TTE8K &&
11342 sfmmu_put_free_hblk(hmeblkp, forcefree)) {
11343 goto re_verify;
11345 ASSERT(sfmmup != KHATID);
11346 kmem_cache_free(get_hblk_cache(hmeblkp), hmeblkp);
11347 } else {
11349 * Hey! we don't need hblk_reserve any more.
11351 ASSERT(owner);
11352 hblk_reserve_thread = NULL;
11353 mutex_exit(&hblk_reserve_lock);
11354 owner = 0;
11356 re_verify:
11358 * let's check if the goodies are still present
11360 SFMMU_HASH_LOCK(hmebp);
11361 HME_HASH_FAST_SEARCH(hmebp, hblktag, newhblkp);
11362 if (newhblkp != NULL) {
11364 * return newhblkp if it's not hblk_reserve;
11365 * if newhblkp is hblk_reserve, return it
11366 * _only if_ we are the owner of hblk_reserve.
11368 if (newhblkp != HBLK_RESERVE || owner) {
11369 ASSERT(!SFMMU_IS_SHMERID_VALID(rid) ||
11370 newhblkp->hblk_shared);
11371 ASSERT(SFMMU_IS_SHMERID_VALID(rid) ||
11372 !newhblkp->hblk_shared);
11373 return (newhblkp);
11374 } else {
11376 * we just hit hblk_reserve in the hash and
11377 * we are not the owner of that;
11379 * block until hblk_reserve_thread completes
11380 * swapping hblk_reserve and try the dance
11381 * once again.
11383 SFMMU_HASH_UNLOCK(hmebp);
11384 mutex_enter(&hblk_reserve_lock);
11385 mutex_exit(&hblk_reserve_lock);
11386 SFMMU_STAT(sf_hblk_reserve_hit);
11387 goto fill_hblk;
11389 } else {
11391 * it's no more! try the dance once again.
11393 SFMMU_HASH_UNLOCK(hmebp);
11394 goto fill_hblk;
11398 hblk_init:
11399 if (SFMMU_IS_SHMERID_VALID(rid)) {
11400 uint16_t tteflag = 0x1 <<
11401 ((size < HBLK_MIN_TTESZ) ? HBLK_MIN_TTESZ : size);
11403 if (!(rgnp->rgn_hmeflags & tteflag)) {
11404 atomic_or_16(&rgnp->rgn_hmeflags, tteflag);
11406 hmeblkp->hblk_shared = 1;
11407 } else {
11408 hmeblkp->hblk_shared = 0;
11410 set_hblk_sz(hmeblkp, size);
11411 ASSERT(SFMMU_HASH_LOCK_ISHELD(hmebp));
11412 hmeblkp->hblk_next = (struct hme_blk *)NULL;
11413 hmeblkp->hblk_tag = hblktag;
11414 hmeblkp->hblk_shadow = shw_hblkp;
11415 hblkpa = hmeblkp->hblk_nextpa;
11416 hmeblkp->hblk_nextpa = HMEBLK_ENDPA;
11418 ASSERT(get_hblk_ttesz(hmeblkp) == size);
11419 ASSERT(get_hblk_span(hmeblkp) == HMEBLK_SPAN(size));
11420 ASSERT(hmeblkp->hblk_hmecnt == 0);
11421 ASSERT(hmeblkp->hblk_vcnt == 0);
11422 ASSERT(hmeblkp->hblk_lckcnt == 0);
11423 ASSERT(hblkpa == va_to_pa((caddr_t)hmeblkp));
11424 sfmmu_hblk_hash_add(hmebp, hmeblkp, hblkpa);
11425 return (hmeblkp);
11429 * This function cleans up the hme_blk and returns it to the free list.
11431 /* ARGSUSED */
11432 static void
11433 sfmmu_hblk_free(struct hme_blk **listp)
11435 struct hme_blk *hmeblkp, *next_hmeblkp;
11436 int size;
11437 uint_t critical;
11438 uint64_t hblkpa;
11440 ASSERT(*listp != NULL);
11442 hmeblkp = *listp;
11443 while (hmeblkp != NULL) {
11444 next_hmeblkp = hmeblkp->hblk_next;
11445 ASSERT(!hmeblkp->hblk_hmecnt);
11446 ASSERT(!hmeblkp->hblk_vcnt);
11447 ASSERT(!hmeblkp->hblk_lckcnt);
11448 ASSERT(hmeblkp != (struct hme_blk *)hblk_reserve);
11449 ASSERT(hmeblkp->hblk_shared == 0);
11450 ASSERT(hmeblkp->hblk_shw_bit == 0);
11451 ASSERT(hmeblkp->hblk_shadow == NULL);
11453 hblkpa = va_to_pa((caddr_t)hmeblkp);
11454 ASSERT(hblkpa != (uint64_t)-1);
11455 critical = (hblktosfmmu(hmeblkp) == KHATID) ? 1 : 0;
11457 size = get_hblk_ttesz(hmeblkp);
11458 hmeblkp->hblk_next = NULL;
11459 hmeblkp->hblk_nextpa = hblkpa;
11461 if (hmeblkp->hblk_nuc_bit == 0) {
11463 if (size != TTE8K ||
11464 !sfmmu_put_free_hblk(hmeblkp, critical))
11465 kmem_cache_free(get_hblk_cache(hmeblkp),
11466 hmeblkp);
11468 hmeblkp = next_hmeblkp;
11472 #define BUCKETS_TO_SEARCH_BEFORE_UNLOAD 30
11473 #define SFMMU_HBLK_STEAL_THRESHOLD 5
11475 static uint_t sfmmu_hblk_steal_twice;
11476 static uint_t sfmmu_hblk_steal_count, sfmmu_hblk_steal_unload_count;
11479 * Steal a hmeblk from user or kernel hme hash lists.
11480 * For an 8K tte, grab one from the reserve pool (freehblkp) before proceeding
11481 * to steal, and if we fail to steal after SFMMU_HBLK_STEAL_THRESHOLD attempts,
11482 * tap into the critical reserve of freehblkp.
11483 * Note: we keep looping in this routine until we find one.
11485 static struct hme_blk *
11486 sfmmu_hblk_steal(int size)
11488 static struct hmehash_bucket *uhmehash_steal_hand = NULL;
11489 struct hmehash_bucket *hmebp;
11490 struct hme_blk *hmeblkp = NULL, *pr_hblk;
11491 uint64_t hblkpa;
11492 int i;
11493 uint_t loop_cnt = 0, critical;
11495 for (;;) {
11496 /* Check cpu hblk pending queues */
11497 if ((hmeblkp = sfmmu_check_pending_hblks(size)) != NULL) {
11498 hmeblkp->hblk_nextpa = va_to_pa((caddr_t)hmeblkp);
11499 ASSERT(hmeblkp->hblk_hmecnt == 0);
11500 ASSERT(hmeblkp->hblk_vcnt == 0);
11501 return (hmeblkp);
11504 if (size == TTE8K) {
11505 critical =
11506 (++loop_cnt > SFMMU_HBLK_STEAL_THRESHOLD) ? 1 : 0;
11507 if (sfmmu_get_free_hblk(&hmeblkp, critical))
11508 return (hmeblkp);
11511 hmebp = (uhmehash_steal_hand == NULL) ? uhme_hash :
11512 uhmehash_steal_hand;
11513 ASSERT(hmebp >= uhme_hash && hmebp <= &uhme_hash[UHMEHASH_SZ]);
11515 for (i = 0; hmeblkp == NULL && i <= UHMEHASH_SZ +
11516 BUCKETS_TO_SEARCH_BEFORE_UNLOAD; i++) {
11517 SFMMU_HASH_LOCK(hmebp);
11518 hmeblkp = hmebp->hmeblkp;
11519 hblkpa = hmebp->hmeh_nextpa;
11520 pr_hblk = NULL;
11521 while (hmeblkp) {
11523 * check if it is a hmeblk that is not locked
11524 * and not shared. Skip shadow hmeblks with
11525 * shadow_mask set, i.e. valid count non-zero.
11527 if ((get_hblk_ttesz(hmeblkp) == size) &&
11528 (hmeblkp->hblk_shw_bit == 0 ||
11529 hmeblkp->hblk_vcnt == 0) &&
11530 (hmeblkp->hblk_lckcnt == 0)) {
11532 * there is a high probability that we
11533 * will find a free one. search some
11534 * buckets for a free hmeblk initially
11535 * before unloading a valid hmeblk.
11537 if ((hmeblkp->hblk_vcnt == 0 &&
11538 hmeblkp->hblk_hmecnt == 0) || (i >=
11539 BUCKETS_TO_SEARCH_BEFORE_UNLOAD)) {
11540 if (sfmmu_steal_this_hblk(hmebp,
11541 hmeblkp, hblkpa, pr_hblk)) {
11543 * Hblk is unloaded
11544 * successfully
11546 break;
11550 pr_hblk = hmeblkp;
11551 hblkpa = hmeblkp->hblk_nextpa;
11552 hmeblkp = hmeblkp->hblk_next;
11555 SFMMU_HASH_UNLOCK(hmebp);
11556 if (hmebp++ == &uhme_hash[UHMEHASH_SZ])
11557 hmebp = uhme_hash;
11559 uhmehash_steal_hand = hmebp;
11561 if (hmeblkp != NULL)
11562 break;
11565 * in the worst case, look for a free one in the kernel
11566 * hash table.
11568 for (i = 0, hmebp = khme_hash; i <= KHMEHASH_SZ; i++) {
11569 SFMMU_HASH_LOCK(hmebp);
11570 hmeblkp = hmebp->hmeblkp;
11571 hblkpa = hmebp->hmeh_nextpa;
11572 pr_hblk = NULL;
11573 while (hmeblkp) {
11575 * check if it is free hmeblk
11577 if ((get_hblk_ttesz(hmeblkp) == size) &&
11578 (hmeblkp->hblk_lckcnt == 0) &&
11579 (hmeblkp->hblk_vcnt == 0) &&
11580 (hmeblkp->hblk_hmecnt == 0)) {
11581 if (sfmmu_steal_this_hblk(hmebp,
11582 hmeblkp, hblkpa, pr_hblk)) {
11583 break;
11584 } else {
11586 * Cannot fail since we have
11587 * hash lock.
11589 panic("fail to steal?");
11593 pr_hblk = hmeblkp;
11594 hblkpa = hmeblkp->hblk_nextpa;
11595 hmeblkp = hmeblkp->hblk_next;
11598 SFMMU_HASH_UNLOCK(hmebp);
11599 if (hmebp++ == &khme_hash[KHMEHASH_SZ])
11600 hmebp = khme_hash;
11603 if (hmeblkp != NULL)
11604 break;
11605 sfmmu_hblk_steal_twice++;
11607 return (hmeblkp);
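The user-hash walk above remembers where it stopped (uhmehash_steal_hand) and prefers free entries for the first BUCKETS_TO_SEARCH_BEFORE_UNLOAD buckets before it will unload a live one. A simplified, standalone sketch of that clock-hand scan follows; the table contents and thresholds are invented.

#include <stdio.h>

#define NBUCKETS                16      /* hypothetical table size */
#define SEARCH_BEFORE_UNLOAD    4       /* prefer free entries this long */

static int table[NBUCKETS];             /* 0 = empty, 1 = free, 2 = in use */
static int steal_hand;                  /* resume point for the next scan */

/*
 * Scan buckets starting at the remembered hand. Take a free entry right
 * away; only after SEARCH_BEFORE_UNLOAD buckets settle for an in-use
 * entry. Returns the chosen bucket index, or -1 if nothing qualifies.
 */
static int
steal_bucket(void)
{
    int i, b, victim = -1;

    for (i = 0; i < NBUCKETS + SEARCH_BEFORE_UNLOAD; i++) {
        b = (steal_hand + i) % NBUCKETS;
        if (table[b] == 1 ||
            (table[b] == 2 && i >= SEARCH_BEFORE_UNLOAD)) {
            victim = b;
            break;
        }
    }
    if (victim != -1)
        steal_hand = (victim + 1) % NBUCKETS;
    return (victim);
}

int
main(void)
{
    table[3] = 2;                       /* in use, seen before the threshold */
    table[9] = 1;                       /* free */
    (void) printf("victim %d\n", steal_bucket());       /* victim 9 */
    return (0);
}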
11611 * This routine does real work to prepare a hblk to be "stolen" by
11612 * unloading the mappings, updating shadow counts ....
11613 * It returns 1 if the block is ready to be reused (stolen), or 0
11614 * if the block cannot be stolen yet because pageunload is still working
11615 * on this hblk.
11617 static int
11618 sfmmu_steal_this_hblk(struct hmehash_bucket *hmebp, struct hme_blk *hmeblkp,
11619 uint64_t hblkpa, struct hme_blk *pr_hblk)
11621 int shw_size, vshift;
11622 struct hme_blk *shw_hblkp;
11623 caddr_t vaddr;
11624 uint_t shw_mask, newshw_mask;
11625 struct hme_blk *list = NULL;
11627 ASSERT(SFMMU_HASH_LOCK_ISHELD(hmebp));
11630 * check if the hmeblk is free, unload if necessary
11632 if (hmeblkp->hblk_vcnt || hmeblkp->hblk_hmecnt) {
11633 sfmmu_t *sfmmup;
11634 demap_range_t dmr;
11636 sfmmup = hblktosfmmu(hmeblkp);
11637 if (hmeblkp->hblk_shared || sfmmup->sfmmu_ismhat) {
11638 return (0);
11640 DEMAP_RANGE_INIT(sfmmup, &dmr);
11641 (void) sfmmu_hblk_unload(sfmmup, hmeblkp,
11642 (caddr_t)get_hblk_base(hmeblkp),
11643 get_hblk_endaddr(hmeblkp), &dmr, HAT_UNLOAD);
11644 DEMAP_RANGE_FLUSH(&dmr);
11645 if (hmeblkp->hblk_vcnt || hmeblkp->hblk_hmecnt) {
11647 * Pageunload is working on the same hblk.
11649 return (0);
11652 sfmmu_hblk_steal_unload_count++;
11655 ASSERT(hmeblkp->hblk_lckcnt == 0);
11656 ASSERT(hmeblkp->hblk_vcnt == 0 && hmeblkp->hblk_hmecnt == 0);
11658 sfmmu_hblk_hash_rm(hmebp, hmeblkp, pr_hblk, &list, 1);
11659 hmeblkp->hblk_nextpa = hblkpa;
11661 shw_hblkp = hmeblkp->hblk_shadow;
11662 if (shw_hblkp) {
11663 ASSERT(!hmeblkp->hblk_shared);
11664 shw_size = get_hblk_ttesz(shw_hblkp);
11665 vaddr = (caddr_t)get_hblk_base(hmeblkp);
11666 vshift = vaddr_to_vshift(shw_hblkp->hblk_tag, vaddr, shw_size);
11667 ASSERT(vshift < 8);
11669 * Atomically clear shadow mask bit
11671 do {
11672 shw_mask = shw_hblkp->hblk_shw_mask;
11673 ASSERT(shw_mask & (1 << vshift));
11674 newshw_mask = shw_mask & ~(1 << vshift);
11675 newshw_mask = atomic_cas_32(&shw_hblkp->hblk_shw_mask,
11676 shw_mask, newshw_mask);
11677 } while (newshw_mask != shw_mask);
11678 hmeblkp->hblk_shadow = NULL;
11682 * remove shadow bit if we are stealing an unused shadow hmeblk.
11683 * sfmmu_hblk_alloc needs it that way, will set shadow bit later if
11684 * we are indeed allocating a shadow hmeblk.
11686 hmeblkp->hblk_shw_bit = 0;
11688 if (hmeblkp->hblk_shared) {
11689 sf_srd_t *srdp;
11690 sf_region_t *rgnp;
11691 uint_t rid;
11693 srdp = hblktosrd(hmeblkp);
11694 ASSERT(srdp != NULL && srdp->srd_refcnt != 0);
11695 rid = hmeblkp->hblk_tag.htag_rid;
11696 ASSERT(SFMMU_IS_SHMERID_VALID(rid));
11697 ASSERT(rid < SFMMU_MAX_HME_REGIONS);
11698 rgnp = srdp->srd_hmergnp[rid];
11699 ASSERT(rgnp != NULL);
11700 SFMMU_VALIDATE_SHAREDHBLK(hmeblkp, srdp, rgnp, rid);
11701 hmeblkp->hblk_shared = 0;
11704 sfmmu_hblk_steal_count++;
11705 SFMMU_STAT(sf_steal_count);
11707 return (1);
11710 struct hme_blk *
11711 sfmmu_hmetohblk(struct sf_hment *sfhme)
11713 struct hme_blk *hmeblkp;
11714 struct sf_hment *sfhme0;
11715 struct hme_blk *hblk_dummy = 0;
11718 * No dummy sf_hments, please.
11720 ASSERT(sfhme->hme_tte.ll != 0);
11722 sfhme0 = sfhme - sfhme->hme_tte.tte_hmenum;
11723 hmeblkp = (struct hme_blk *)((uintptr_t)sfhme0 -
11724 (uintptr_t)&hblk_dummy->hblk_hme[0]);
11726 return (hmeblkp);
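sfmmu_hmetohblk() recovers the enclosing hme_blk from an embedded sf_hment by stepping back to hblk_hme[0] (using the index kept in the TTE) and then subtracting the offset of that array, computed via a dummy pointer. A standalone sketch of the same technique, using standard offsetof() and invented types, is shown below.

#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

/* Invented container with an embedded array, analogous to hblk_hme[]. */
struct item {
    int idx;                    /* index of this item within items[] */
};

struct container {
    int             tag;
    struct item     items[8];
};

/*
 * Recover the enclosing container from a pointer to one of its embedded
 * items: step back to items[0], then subtract the array's offset.
 */
static struct container *
item_to_container(struct item *ip)
{
    struct item *first = ip - ip->idx;

    return ((struct container *)((uintptr_t)first -
        offsetof(struct container, items)));
}

int
main(void)
{
    struct container c;
    int i;

    c.tag = 42;
    for (i = 0; i < 8; i++)
        c.items[i].idx = i;

    (void) printf("%d\n", item_to_container(&c.items[5])->tag); /* 42 */
    return (0);
}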
11730 * On swapin, get appropriately sized TSB(s) and clear the HAT_SWAPPED flag.
11731 * If we can't get appropriately sized TSB(s), try for 8K TSB(s) using
11732 * KM_SLEEP allocation.
11734 * Returns no status; the minimum-size fallback allocation is expected to succeed.
11736 static void
11737 sfmmu_tsb_swapin(sfmmu_t *sfmmup, hatlock_t *hatlockp)
11739 struct tsb_info *tsbinfop, *next;
11740 tsb_replace_rc_t rc;
11741 boolean_t gotfirst = B_FALSE;
11743 ASSERT(sfmmup != ksfmmup);
11744 ASSERT(sfmmu_hat_lock_held(sfmmup));
11746 while (SFMMU_FLAGS_ISSET(sfmmup, HAT_SWAPIN)) {
11747 cv_wait(&sfmmup->sfmmu_tsb_cv, HATLOCK_MUTEXP(hatlockp));
11750 if (SFMMU_FLAGS_ISSET(sfmmup, HAT_SWAPPED)) {
11751 SFMMU_FLAGS_SET(sfmmup, HAT_SWAPIN);
11752 } else {
11753 return;
11756 ASSERT(sfmmup->sfmmu_tsb != NULL);
11759 * Loop over all tsbinfo's replacing them with ones that actually have
11760 * a TSB. If any of the replacements ever fail, bail out of the loop.
11762 for (tsbinfop = sfmmup->sfmmu_tsb; tsbinfop != NULL; tsbinfop = next) {
11763 ASSERT(tsbinfop->tsb_flags & TSB_SWAPPED);
11764 next = tsbinfop->tsb_next;
11765 rc = sfmmu_replace_tsb(sfmmup, tsbinfop, tsbinfop->tsb_szc,
11766 hatlockp, TSB_SWAPIN);
11767 if (rc != TSB_SUCCESS) {
11768 break;
11770 gotfirst = B_TRUE;
11773 switch (rc) {
11774 case TSB_SUCCESS:
11775 SFMMU_FLAGS_CLEAR(sfmmup, HAT_SWAPPED|HAT_SWAPIN);
11776 cv_broadcast(&sfmmup->sfmmu_tsb_cv);
11777 return;
11778 case TSB_LOSTRACE:
11779 break;
11780 case TSB_ALLOCFAIL:
11781 break;
11782 default:
11783 panic("sfmmu_replace_tsb returned unrecognized failure code "
11784 "%d", rc);
11788 * In this case, we failed to get one of our TSBs. If we failed to
11789 * get the first TSB, get one of minimum size (8KB). Walk the list
11790 * and throw away the tsbinfos, starting where the allocation failed;
11791 * we can get by with just one TSB as long as we don't leave the
11792 * SWAPPED tsbinfo structures lying around.
11794 tsbinfop = sfmmup->sfmmu_tsb;
11795 next = tsbinfop->tsb_next;
11796 tsbinfop->tsb_next = NULL;
11798 sfmmu_hat_exit(hatlockp);
11799 for (tsbinfop = next; tsbinfop != NULL; tsbinfop = next) {
11800 next = tsbinfop->tsb_next;
11801 sfmmu_tsbinfo_free(tsbinfop);
11803 hatlockp = sfmmu_hat_enter(sfmmup);
11806 * If we don't have any TSBs, get a single 8K TSB for 8K, 64K and 512K
11807 * pages.
11809 if (!gotfirst) {
11810 tsbinfop = sfmmup->sfmmu_tsb;
11811 rc = sfmmu_replace_tsb(sfmmup, tsbinfop, TSB_MIN_SZCODE,
11812 hatlockp, TSB_SWAPIN | TSB_FORCEALLOC);
11813 ASSERT(rc == TSB_SUCCESS);
11816 SFMMU_FLAGS_CLEAR(sfmmup, HAT_SWAPPED|HAT_SWAPIN);
11817 cv_broadcast(&sfmmup->sfmmu_tsb_cv);
11820 static int
11821 sfmmu_is_rgnva(sf_srd_t *srdp, caddr_t addr, ulong_t w, ulong_t bmw)
11823 ulong_t bix = 0;
11824 uint_t rid;
11825 sf_region_t *rgnp;
11827 ASSERT(srdp != NULL);
11828 ASSERT(srdp->srd_refcnt != 0);
11830 w <<= BT_ULSHIFT;
11831 while (bmw) {
11832 if (!(bmw & 0x1)) {
11833 bix++;
11834 bmw >>= 1;
11835 continue;
11837 rid = w | bix;
11838 rgnp = srdp->srd_hmergnp[rid];
11839 ASSERT(rgnp->rgn_refcnt > 0);
11840 ASSERT(rgnp->rgn_id == rid);
11841 if (addr < rgnp->rgn_saddr ||
11842 addr >= (rgnp->rgn_saddr + rgnp->rgn_size)) {
11843 bix++;
11844 bmw >>= 1;
11845 } else {
11846 return (1);
11849 return (0);
11853 * Handle exceptions for low level tsb_handler.
11855 * There are many scenarios that could land us here:
11857 * If the context is invalid we land here. The context can be invalid
11858 * for 3 reasons: 1) we couldn't allocate a new context and now need to
11859 * perform a wrap around operation in order to allocate a new context.
11860 * 2) The context was invalidated to change pagesize programming. 3) The ISM or
11861 * TSB configuration is changing for this process and we are forced into
11862 * here to do a synchronization operation. If the context is valid we can
11863 * be here from the window trap handler. In this case just call trap to handle
11864 * the fault.
11866 * Note that the process will run in INVALID_CONTEXT before
11867 * faulting into here and subsequently loading the MMU registers
11868 * (including the TSB base register) associated with this process.
11869 * For this reason, the trap handlers must all test for
11870 * INVALID_CONTEXT before attempting to access any registers other
11871 * than the context registers.
11873 void
11874 sfmmu_tsbmiss_exception(struct regs *rp, uintptr_t tagaccess, uint_t traptype)
11876 sfmmu_t *sfmmup, *shsfmmup;
11877 uint_t ctxtype;
11878 klwp_id_t lwp;
11879 char lwp_save_state;
11880 hatlock_t *hatlockp, *shatlockp;
11881 struct tsb_info *tsbinfop;
11882 struct tsbmiss *tsbmp;
11883 sf_scd_t *scdp;
11885 SFMMU_STAT(sf_tsb_exceptions);
11886 SFMMU_MMU_STAT(mmu_tsb_exceptions);
11887 sfmmup = astosfmmu(curthread->t_procp->p_as);
11889 * note that in sun4u, the tagaccess register contains ctxnum
11890 * while sun4v passes ctxtype in the tagaccess register.
11892 ctxtype = tagaccess & TAGACC_CTX_MASK;
11894 ASSERT(sfmmup != ksfmmup && ctxtype != KCONTEXT);
11895 ASSERT(sfmmup->sfmmu_ismhat == 0);
11896 ASSERT(!SFMMU_FLAGS_ISSET(sfmmup, HAT_SWAPPED) ||
11897 ctxtype == INVALID_CONTEXT);
11899 if (ctxtype != INVALID_CONTEXT && traptype != T_DATA_PROT) {
11901 * We may land here because the shme bitmap and pagesize
11902 * flags are updated lazily in the tsbmiss area on other cpus.
11903 * If we detect here that the tsbmiss area is out of sync with
11904 * the sfmmu, update it and retry the trapped instruction.
11905 * Otherwise call trap().
11907 int ret = 0;
11908 uchar_t tteflag_mask = (1 << TTE64K) | (1 << TTE8K);
11909 caddr_t addr = (caddr_t)(tagaccess & TAGACC_VADDR_MASK);
11912 * Must set lwp state to LWP_SYS before
11913 * trying to acquire any adaptive lock
11915 lwp = ttolwp(curthread);
11916 ASSERT(lwp);
11917 lwp_save_state = lwp->lwp_state;
11918 lwp->lwp_state = LWP_SYS;
11920 hatlockp = sfmmu_hat_enter(sfmmup);
11921 kpreempt_disable();
11922 tsbmp = &tsbmiss_area[CPU->cpu_id];
11923 ASSERT(sfmmup == tsbmp->usfmmup);
11924 if (((tsbmp->uhat_tteflags ^ sfmmup->sfmmu_tteflags) &
11925 ~tteflag_mask) ||
11926 ((tsbmp->uhat_rtteflags ^ sfmmup->sfmmu_rtteflags) &
11927 ~tteflag_mask)) {
11928 tsbmp->uhat_tteflags = sfmmup->sfmmu_tteflags;
11929 tsbmp->uhat_rtteflags = sfmmup->sfmmu_rtteflags;
11930 ret = 1;
11932 if (sfmmup->sfmmu_srdp != NULL) {
11933 ulong_t *sm = sfmmup->sfmmu_hmeregion_map.bitmap;
11934 ulong_t *tm = tsbmp->shmermap;
11935 ulong_t i;
11936 for (i = 0; i < SFMMU_HMERGNMAP_WORDS; i++) {
11937 ulong_t d = tm[i] ^ sm[i];
11938 if (d) {
11939 if (d & sm[i]) {
11940 if (!ret && sfmmu_is_rgnva(
11941 sfmmup->sfmmu_srdp,
11942 addr, i, d & sm[i])) {
11943 ret = 1;
11946 tm[i] = sm[i];
11950 kpreempt_enable();
11951 sfmmu_hat_exit(hatlockp);
11952 lwp->lwp_state = lwp_save_state;
11953 if (ret) {
11954 return;
11956 } else if (ctxtype == INVALID_CONTEXT) {
11958 * First, make sure we come out of here with a valid ctx,
11959 * since if we don't get one we'll simply loop on the
11960 * faulting instruction.
11962 * If the ISM mappings are changing, the TSB is being relocated,
11963 * the process is being swapped, or the process is joining or
11964 * leaving an SCD or shared regions, we serialize behind the
11965 * controlling thread with the hat lock, sfmmu_flags and the
11966 * sfmmu_tsb_cv condition variable.
11970 * Must set lwp state to LWP_SYS before
11971 * trying to acquire any adaptive lock
11973 lwp = ttolwp(curthread);
11974 ASSERT(lwp);
11975 lwp_save_state = lwp->lwp_state;
11976 lwp->lwp_state = LWP_SYS;
11978 hatlockp = sfmmu_hat_enter(sfmmup);
11979 retry:
11980 if ((scdp = sfmmup->sfmmu_scdp) != NULL) {
11981 shsfmmup = scdp->scd_sfmmup;
11982 ASSERT(shsfmmup != NULL);
11984 for (tsbinfop = shsfmmup->sfmmu_tsb; tsbinfop != NULL;
11985 tsbinfop = tsbinfop->tsb_next) {
11986 if (tsbinfop->tsb_flags & TSB_RELOC_FLAG) {
11987 /* drop the private hat lock */
11988 sfmmu_hat_exit(hatlockp);
11989 /* acquire the shared hat lock */
11990 shatlockp = sfmmu_hat_enter(shsfmmup);
11992 * recheck to see if anything changed
11993 * after we drop the private hat lock.
11995 if (sfmmup->sfmmu_scdp == scdp &&
11996 shsfmmup == scdp->scd_sfmmup) {
11997 sfmmu_tsb_chk_reloc(shsfmmup,
11998 shatlockp);
12000 sfmmu_hat_exit(shatlockp);
12001 hatlockp = sfmmu_hat_enter(sfmmup);
12002 goto retry;
12007 for (tsbinfop = sfmmup->sfmmu_tsb; tsbinfop != NULL;
12008 tsbinfop = tsbinfop->tsb_next) {
12009 if (tsbinfop->tsb_flags & TSB_RELOC_FLAG) {
12010 cv_wait(&sfmmup->sfmmu_tsb_cv,
12011 HATLOCK_MUTEXP(hatlockp));
12012 goto retry;
12017 * Wait for ISM maps to be updated.
12019 if (SFMMU_FLAGS_ISSET(sfmmup, HAT_ISMBUSY)) {
12020 cv_wait(&sfmmup->sfmmu_tsb_cv,
12021 HATLOCK_MUTEXP(hatlockp));
12022 goto retry;
12025 /* Is this process joining an SCD? */
12026 if (SFMMU_FLAGS_ISSET(sfmmup, HAT_JOIN_SCD)) {
12028 * Flush private TSB and setup shared TSB.
12029 * sfmmu_finish_join_scd() does not drop the
12030 * hat lock.
12032 sfmmu_finish_join_scd(sfmmup);
12033 SFMMU_FLAGS_CLEAR(sfmmup, HAT_JOIN_SCD);
12037 * If we're swapping in, get TSB(s). Note that we must do
12038 * this before we get a ctx or load the MMU state. Once
12039 * we swap in we have to recheck to make sure the TSB(s) and
12040 * ISM mappings didn't change while we slept.
12042 if (SFMMU_FLAGS_ISSET(sfmmup, HAT_SWAPPED)) {
12043 sfmmu_tsb_swapin(sfmmup, hatlockp);
12044 goto retry;
12047 sfmmu_get_ctx(sfmmup);
12049 sfmmu_hat_exit(hatlockp);
12051 * Must restore lwp_state if not calling
12052 * trap() for further processing. Restore
12053 * it anyway.
12055 lwp->lwp_state = lwp_save_state;
12056 return;
12058 trap(rp, (caddr_t)tagaccess, traptype, 0);
12061 static void
12062 sfmmu_tsb_chk_reloc(sfmmu_t *sfmmup, hatlock_t *hatlockp)
12064 struct tsb_info *tp;
12066 ASSERT(sfmmu_hat_lock_held(sfmmup));
12068 for (tp = sfmmup->sfmmu_tsb; tp != NULL; tp = tp->tsb_next) {
12069 if (tp->tsb_flags & TSB_RELOC_FLAG) {
12070 cv_wait(&sfmmup->sfmmu_tsb_cv,
12071 HATLOCK_MUTEXP(hatlockp));
12072 break;
12078 * sfmmu_vatopfn_suspended is called from GET_TTE when TL=0 and the
12079 * TTE_SUSPENDED bit is set in the tte. We block on acquiring a page lock
12080 * rather than spinning, to avoid send-mondo timeouts with
12081 * interrupts enabled. When the lock is acquired it is immediately
12082 * released and we return back to sfmmu_vatopfn just after
12083 * the GET_TTE call.
12085 void
12086 sfmmu_vatopfn_suspended(caddr_t vaddr, sfmmu_t *sfmmu, tte_t *ttep)
12088 struct page **pp;
12090 (void) as_pagelock(sfmmu->sfmmu_as, &pp, vaddr, TTE_CSZ(ttep), S_WRITE);
12091 as_pageunlock(sfmmu->sfmmu_as, pp, vaddr, TTE_CSZ(ttep), S_WRITE);
12095 * sfmmu_tsbmiss_suspended is called from GET_TTE when TL>0 and the
12096 * TTE_SUSPENDED bit is set in the tte. We do this so that we can handle
12097 * cross traps which cannot be handled while spinning in the
12098 * trap handlers. Simply enter and exit the kpr_suspendlock spin
12099 * mutex, which is held by the holder of the suspend bit, and then
12100 * retry the trapped instruction after unwinding.
12102 /*ARGSUSED*/
12103 void
12104 sfmmu_tsbmiss_suspended(struct regs *rp, uintptr_t tagacc, uint_t traptype)
12106 ASSERT(curthread != kreloc_thread);
12107 mutex_enter(&kpr_suspendlock);
12108 mutex_exit(&kpr_suspendlock);
12112 * This routine could be optimized to reduce the number of xcalls by flushing
12113 * the entire TLB if the region reference count is above some threshold, but the
12114 * tradeoff will depend on the size of the TLB. So for now flush the specific
12115 * page one context at a time.
12117 * If uselocks is 0 then it's called after all cpus were captured and all the
12118 * hat locks were taken. In this case don't take the region lock; instead rely on
12119 * the order of region list update operations in hat_join_region(),
12120 * hat_leave_region() and hat_dup_region(). The ordering in those routines
12121 * guarantees that list is always forward walkable and reaches active sfmmus
12122 * regardless of where xc_attention() captures a cpu.
12124 cpuset_t
12125 sfmmu_rgntlb_demap(caddr_t addr, sf_region_t *rgnp,
12126 struct hme_blk *hmeblkp, int uselocks)
12128 sfmmu_t *sfmmup;
12129 cpuset_t cpuset;
12130 cpuset_t rcpuset;
12131 hatlock_t *hatlockp;
12132 uint_t rid = rgnp->rgn_id;
12133 sf_rgn_link_t *rlink;
12134 sf_scd_t *scdp;
12136 ASSERT(hmeblkp->hblk_shared);
12137 ASSERT(SFMMU_IS_SHMERID_VALID(rid));
12138 ASSERT(rid < SFMMU_MAX_HME_REGIONS);
12140 CPUSET_ZERO(rcpuset);
12141 if (uselocks) {
12142 mutex_enter(&rgnp->rgn_mutex);
12144 sfmmup = rgnp->rgn_sfmmu_head;
12145 while (sfmmup != NULL) {
12146 if (uselocks) {
12147 hatlockp = sfmmu_hat_enter(sfmmup);
12151 * When an SCD is created the SCD hat is linked on the sfmmu
12152 * region lists for each hme region which is part of the
12153 * SCD. If we find an SCD hat, when walking these lists,
12154 * then we flush the shared TSBs, if we find a private hat,
12155 * which is part of an SCD, but where the region
12156 * is not part of the SCD then we flush the private TSBs.
12158 if (!sfmmup->sfmmu_scdhat && sfmmup->sfmmu_scdp != NULL &&
12159 !SFMMU_FLAGS_ISSET(sfmmup, HAT_JOIN_SCD)) {
12160 scdp = sfmmup->sfmmu_scdp;
12161 if (SF_RGNMAP_TEST(scdp->scd_hmeregion_map, rid)) {
12162 if (uselocks) {
12163 sfmmu_hat_exit(hatlockp);
12165 goto next;
12169 SFMMU_UNLOAD_TSB(addr, sfmmup, hmeblkp, 0);
12171 kpreempt_disable();
12172 cpuset = sfmmup->sfmmu_cpusran;
12173 CPUSET_AND(cpuset, cpu_ready_set);
12174 CPUSET_DEL(cpuset, CPU->cpu_id);
12175 SFMMU_XCALL_STATS(sfmmup);
12176 xt_some(cpuset, vtag_flushpage_tl1,
12177 (uint64_t)addr, (uint64_t)sfmmup);
12178 vtag_flushpage(addr, (uint64_t)sfmmup);
12179 if (uselocks) {
12180 sfmmu_hat_exit(hatlockp);
12182 kpreempt_enable();
12183 CPUSET_OR(rcpuset, cpuset);
12185 next:
12186 /* LINTED: constant in conditional context */
12187 SFMMU_HMERID2RLINKP(sfmmup, rid, rlink, 0, 0);
12188 ASSERT(rlink != NULL);
12189 sfmmup = rlink->next;
12191 if (uselocks) {
12192 mutex_exit(&rgnp->rgn_mutex);
12194 return (rcpuset);
12198 * This routine takes an sfmmu pointer and the va of an address in an
12199 * ISM region as input and returns the corresponding region id in ism_rid.
12200 * The return value of 1 indicates that a region has been found and ism_rid
12201 * is valid, otherwise 0 is returned.
12203 static int
12204 find_ism_rid(sfmmu_t *sfmmup, sfmmu_t *ism_sfmmup, caddr_t va, uint_t *ism_rid)
12206 ism_blk_t *ism_blkp;
12207 int i;
12208 ism_map_t *ism_map;
12209 #ifdef DEBUG
12210 struct hat *ism_hatid;
12211 #endif
12212 ASSERT(sfmmu_hat_lock_held(sfmmup));
12214 ism_blkp = sfmmup->sfmmu_iblk;
12215 while (ism_blkp != NULL) {
12216 ism_map = ism_blkp->iblk_maps;
12217 for (i = 0; i < ISM_MAP_SLOTS && ism_map[i].imap_ismhat; i++) {
12218 if ((va >= ism_start(ism_map[i])) &&
12219 (va < ism_end(ism_map[i]))) {
12221 *ism_rid = ism_map[i].imap_rid;
12222 #ifdef DEBUG
12223 ism_hatid = ism_map[i].imap_ismhat;
12224 ASSERT(ism_hatid == ism_sfmmup);
12225 ASSERT(ism_hatid->sfmmu_ismhat);
12226 #endif
12227 return (1);
12230 ism_blkp = ism_blkp->iblk_next;
12232 return (0);
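find_ism_rid() is a linear scan of small map slots for the one whose [start, end) range contains the address. A standalone sketch of that lookup follows; struct seg_map and the addresses are illustrative.

#include <stdio.h>
#include <stdint.h>

#define MAP_SLOTS       4       /* hypothetical number of slots per block */

struct seg_map {
    uintptr_t       start;
    uintptr_t       end;        /* exclusive */
    unsigned int    rid;
};

/*
 * Return 1 and fill *ridp if some slot's [start, end) range contains va,
 * 0 otherwise. An empty slot (end == 0) terminates the scan.
 */
static int
lookup_rid(const struct seg_map *maps, uintptr_t va, unsigned int *ridp)
{
    int i;

    for (i = 0; i < MAP_SLOTS && maps[i].end != 0; i++) {
        if (va >= maps[i].start && va < maps[i].end) {
            *ridp = maps[i].rid;
            return (1);
        }
    }
    return (0);
}

int
main(void)
{
    struct seg_map maps[MAP_SLOTS] = {
        { 0x10000, 0x20000, 7 },
        { 0x40000, 0x80000, 9 },
        { 0, 0, 0 },
        { 0, 0, 0 }
    };
    unsigned int rid;

    if (lookup_rid(maps, 0x41000, &rid))
        (void) printf("rid %u\n", rid);     /* rid 9 */
    return (0);
}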
12236 * Special routine to flush out ism mappings- TSBs, TLBs and D-caches.
12237 * This routine may be called with all cpu's captured. Therefore, the
12238 * caller is responsible for holding all locks and disabling kernel
12239 * preemption.
12241 /* ARGSUSED */
12242 static void
12243 sfmmu_ismtlbcache_demap(caddr_t addr, sfmmu_t *ism_sfmmup,
12244 struct hme_blk *hmeblkp, pfn_t pfnum, int cache_flush_flag)
12246 cpuset_t cpuset;
12247 caddr_t va;
12248 ism_ment_t *ment;
12249 sfmmu_t *sfmmup;
12250 #ifdef VAC
12251 int vcolor;
12252 #endif
12254 sf_scd_t *scdp;
12255 uint_t ism_rid;
12257 ASSERT(!hmeblkp->hblk_shared);
12259 * Walk the ism_hat's mapping list and flush the page
12260 * from every hat sharing this ism_hat. This routine
12261 * may be called while all cpu's have been captured.
12262 * Therefore we can't attempt to grab any locks. For now
12263 * this means we will protect the ism mapping list under
12264 * a single lock which will be grabbed by the caller.
12265 * If hat_share/unshare scalability becomes a performance
12266 * problem then we may need to re-think ism mapping list locking.
12268 ASSERT(ism_sfmmup->sfmmu_ismhat);
12269 ASSERT(MUTEX_HELD(&ism_mlist_lock));
12270 addr = addr - ISMID_STARTADDR;
12272 for (ment = ism_sfmmup->sfmmu_iment; ment; ment = ment->iment_next) {
12274 sfmmup = ment->iment_hat;
12276 va = ment->iment_base_va;
12277 va = (caddr_t)((uintptr_t)va + (uintptr_t)addr);
12280 * When an SCD is created, the SCD hat is linked on the ism
12281 * mapping lists for each ISM segment which is part of the
12282 * SCD. If we find an SCD hat while walking these lists,
12283 * we flush the shared TSBs; if we find a private hat that
12284 * is part of an SCD, but where the region corresponding
12285 * to this va is not part of the SCD, we flush the
12286 * private TSBs.
12288 if (!sfmmup->sfmmu_scdhat && sfmmup->sfmmu_scdp != NULL &&
12289 !SFMMU_FLAGS_ISSET(sfmmup, HAT_JOIN_SCD) &&
12290 !SFMMU_FLAGS_ISSET(sfmmup, HAT_ISMBUSY)) {
12291 if (!find_ism_rid(sfmmup, ism_sfmmup, va,
12292 &ism_rid)) {
12293 cmn_err(CE_PANIC,
12294 "can't find matching ISM rid!");
12297 scdp = sfmmup->sfmmu_scdp;
12298 if (SFMMU_IS_ISMRID_VALID(ism_rid) &&
12299 SF_RGNMAP_TEST(scdp->scd_ismregion_map,
12300 ism_rid)) {
12301 continue;
12304 SFMMU_UNLOAD_TSB(va, sfmmup, hmeblkp, 1);
12306 cpuset = sfmmup->sfmmu_cpusran;
12307 CPUSET_AND(cpuset, cpu_ready_set);
12308 CPUSET_DEL(cpuset, CPU->cpu_id);
12309 SFMMU_XCALL_STATS(sfmmup);
12310 xt_some(cpuset, vtag_flushpage_tl1, (uint64_t)va,
12311 (uint64_t)sfmmup);
12312 vtag_flushpage(va, (uint64_t)sfmmup);
12314 #ifdef VAC
12316 * Flush D$
12317 * When flushing D$ we must flush all
12318 * cpu's. See sfmmu_cache_flush().
12320 if (cache_flush_flag == CACHE_FLUSH) {
12321 cpuset = cpu_ready_set;
12322 CPUSET_DEL(cpuset, CPU->cpu_id);
12324 SFMMU_XCALL_STATS(sfmmup);
12325 vcolor = addr_to_vcolor(va);
12326 xt_some(cpuset, vac_flushpage_tl1, pfnum, vcolor);
12327 vac_flushpage(pfnum, vcolor);
12329 #endif /* VAC */
12334 * Demaps the TSB and CPU caches, and flushes all TLBs on all CPUs,
12335 * for a particular virtual address and ctx. If noflush is set we do
12336 * not flush the TLB/TSB. This function may or may not be called with
12337 * the HAT lock held.
12339 static void
12340 sfmmu_tlbcache_demap(caddr_t addr, sfmmu_t *sfmmup, struct hme_blk *hmeblkp,
12341 pfn_t pfnum, int tlb_noflush, int cpu_flag, int cache_flush_flag,
12342 int hat_lock_held)
12344 #ifdef VAC
12345 int vcolor;
12346 #endif
12347 cpuset_t cpuset;
12348 hatlock_t *hatlockp;
12350 ASSERT(!hmeblkp->hblk_shared);
12352 #if defined(lint) && !defined(VAC)
12353 pfnum = pfnum;
12354 cpu_flag = cpu_flag;
12355 cache_flush_flag = cache_flush_flag;
12356 #endif
12359 * There is no longer a need to protect against ctx being
12360 * stolen here since we don't store the ctx in the TSB anymore.
12362 #ifdef VAC
12363 vcolor = addr_to_vcolor(addr);
12364 #endif
12367 * We must hold the hat lock during the flush of the TLB,
12368 * to avoid a race with sfmmu_invalidate_ctx(), where
12369 * sfmmu_cnum on an MMU could be set to INVALID_CONTEXT,
12370 * causing the TLB demap routine to skip the flush on that MMU.
12371 * If the context on an MMU has already been set to
12372 * INVALID_CONTEXT, we just get an extra flush on
12373 * that MMU.
12375 if (!hat_lock_held && !tlb_noflush)
12376 hatlockp = sfmmu_hat_enter(sfmmup);
12378 kpreempt_disable();
12379 if (!tlb_noflush) {
12381 * Flush the TSB and TLB.
12383 SFMMU_UNLOAD_TSB(addr, sfmmup, hmeblkp, 0);
12385 cpuset = sfmmup->sfmmu_cpusran;
12386 CPUSET_AND(cpuset, cpu_ready_set);
12387 CPUSET_DEL(cpuset, CPU->cpu_id);
12389 SFMMU_XCALL_STATS(sfmmup);
12391 xt_some(cpuset, vtag_flushpage_tl1, (uint64_t)addr,
12392 (uint64_t)sfmmup);
12394 vtag_flushpage(addr, (uint64_t)sfmmup);
12397 if (!hat_lock_held && !tlb_noflush)
12398 sfmmu_hat_exit(hatlockp);
12400 #ifdef VAC
12402 * Flush the D$
12404 * Even if the ctx is stolen, we need to flush the
12405 * cache. Our ctx stealer only flushes the TLBs.
12407 if (cache_flush_flag == CACHE_FLUSH) {
12408 if (cpu_flag & FLUSH_ALL_CPUS) {
12409 cpuset = cpu_ready_set;
12410 } else {
12411 cpuset = sfmmup->sfmmu_cpusran;
12412 CPUSET_AND(cpuset, cpu_ready_set);
12414 CPUSET_DEL(cpuset, CPU->cpu_id);
12415 SFMMU_XCALL_STATS(sfmmup);
12416 xt_some(cpuset, vac_flushpage_tl1, pfnum, vcolor);
12417 vac_flushpage(pfnum, vcolor);
12419 #endif /* VAC */
12420 kpreempt_enable();
12424 * Demaps the TSB and flushes all TLBs on all cpus for a particular virtual
12425 * address and ctx. If noflush is set we do not currently do anything.
12426 * This function may or may not be called with the HAT lock held.
12428 static void
12429 sfmmu_tlb_demap(caddr_t addr, sfmmu_t *sfmmup, struct hme_blk *hmeblkp,
12430 int tlb_noflush, int hat_lock_held)
12432 cpuset_t cpuset;
12433 hatlock_t *hatlockp;
12435 ASSERT(!hmeblkp->hblk_shared);
12438 * If the process is exiting we have nothing to do.
12440 if (tlb_noflush)
12441 return;
12444 * Flush TSB.
12446 if (!hat_lock_held)
12447 hatlockp = sfmmu_hat_enter(sfmmup);
12448 SFMMU_UNLOAD_TSB(addr, sfmmup, hmeblkp, 0);
12450 kpreempt_disable();
12452 cpuset = sfmmup->sfmmu_cpusran;
12453 CPUSET_AND(cpuset, cpu_ready_set);
12454 CPUSET_DEL(cpuset, CPU->cpu_id);
12456 SFMMU_XCALL_STATS(sfmmup);
12457 xt_some(cpuset, vtag_flushpage_tl1, (uint64_t)addr, (uint64_t)sfmmup);
12459 vtag_flushpage(addr, (uint64_t)sfmmup);
12461 if (!hat_lock_held)
12462 sfmmu_hat_exit(hatlockp);
12464 kpreempt_enable();
12469 * Special case of sfmmu_tlb_demap for MMU_PAGESIZE hblks. Use the xcall
12470 * handler that can flush a range of pages to save on xcalls.
12472 static int sfmmu_xcall_save;
12475 * This routine is never used for demapping addresses backed by SRD hmeblks.
12477 static void
12478 sfmmu_tlb_range_demap(demap_range_t *dmrp)
12480 sfmmu_t *sfmmup = dmrp->dmr_sfmmup;
12481 hatlock_t *hatlockp;
12482 cpuset_t cpuset;
12483 uint64_t sfmmu_pgcnt;
12484 pgcnt_t pgcnt = 0;
12485 int pgunload = 0;
12486 int dirtypg = 0;
12487 caddr_t addr = dmrp->dmr_addr;
12488 caddr_t eaddr;
12489 uint64_t bitvec = dmrp->dmr_bitvec;
12491 ASSERT(bitvec & 1);
12494 * Flush TSB and calculate number of pages to flush.
12496 while (bitvec != 0) {
12497 dirtypg = 0;
12499 * Find the first page to flush and then count how many
12500 * pages there are after it that also need to be flushed.
12501 * This way the number of TSB flushes is minimized.
12503 while ((bitvec & 1) == 0) {
12504 pgcnt++;
12505 addr += MMU_PAGESIZE;
12506 bitvec >>= 1;
12508 while (bitvec & 1) {
12509 dirtypg++;
12510 bitvec >>= 1;
12512 eaddr = addr + ptob(dirtypg);
12513 hatlockp = sfmmu_hat_enter(sfmmup);
12514 sfmmu_unload_tsb_range(sfmmup, addr, eaddr, TTE8K);
12515 sfmmu_hat_exit(hatlockp);
12516 pgunload += dirtypg;
12517 addr = eaddr;
12518 pgcnt += dirtypg;
12521 ASSERT((pgcnt<<MMU_PAGESHIFT) <= dmrp->dmr_endaddr - dmrp->dmr_addr);
12522 if (sfmmup->sfmmu_free == 0) {
12523 addr = dmrp->dmr_addr;
12524 bitvec = dmrp->dmr_bitvec;
12527 * make sure pgcnt fits in SFMMU_PGCNT_SHIFT bits,
12528 * as it will be used to pack the argument for xt_some
12530 ASSERT((pgcnt > 0) &&
12531 (pgcnt <= (1 << SFMMU_PGCNT_SHIFT)));
12534 * Encode pgcnt as (pgcnt - 1), and pass (pgcnt - 1) in
12535 * the low 6 bits of sfmmup. This is doable since pgcnt
12536 * is always >= 1.
12538 ASSERT(!((uint64_t)sfmmup & SFMMU_PGCNT_MASK));
12539 sfmmu_pgcnt = (uint64_t)sfmmup |
12540 ((pgcnt - 1) & SFMMU_PGCNT_MASK);
12543 * We must hold the hat lock during the flush of the TLB,
12544 * to avoid a race with sfmmu_invalidate_ctx(), where
12545 * sfmmu_cnum on an MMU could be set to INVALID_CONTEXT,
12546 * causing the TLB demap routine to skip the flush on that MMU.
12547 * If the context on an MMU has already been set to
12548 * INVALID_CONTEXT, we just get an extra flush on
12549 * that MMU.
12551 hatlockp = sfmmu_hat_enter(sfmmup);
12552 kpreempt_disable();
12554 cpuset = sfmmup->sfmmu_cpusran;
12555 CPUSET_AND(cpuset, cpu_ready_set);
12556 CPUSET_DEL(cpuset, CPU->cpu_id);
12558 SFMMU_XCALL_STATS(sfmmup);
12559 xt_some(cpuset, vtag_flush_pgcnt_tl1, (uint64_t)addr,
12560 sfmmu_pgcnt);
12562 for (; bitvec != 0; bitvec >>= 1) {
12563 if (bitvec & 1)
12564 vtag_flushpage(addr, (uint64_t)sfmmup);
12565 addr += MMU_PAGESIZE;
12567 kpreempt_enable();
12568 sfmmu_hat_exit(hatlockp);
12570 sfmmu_xcall_save += (pgunload-1);
12572 dmrp->dmr_bitvec = 0;
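/*
 * Minimal sketch (hypothetical helper, not part of the original code) of
 * how an argument packed like sfmmu_pgcnt above can be taken apart again,
 * assuming SFMMU_PGCNT_MASK == (1 << SFMMU_PGCNT_SHIFT) - 1 as the
 * ASSERTs above suggest: the hat pointer lives in the upper bits and
 * (pgcnt - 1) in the low SFMMU_PGCNT_SHIFT bits.
 */
static void
example_unpack_sfmmu_pgcnt(uint64_t sfmmu_pgcnt, sfmmu_t **sfmmupp,
    uint64_t *pgcntp)
{
	*sfmmupp = (sfmmu_t *)(sfmmu_pgcnt & ~(uint64_t)SFMMU_PGCNT_MASK);
	*pgcntp = (sfmmu_pgcnt & SFMMU_PGCNT_MASK) + 1;
}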
12576 * In cases where we need to synchronize with TLB/TSB miss trap
12577 * handlers, _and_ need to flush the TLB, it's a lot easier to
12578 * throw away the context from the process than to do a
12579 * special song and dance to keep things consistent for the
12580 * handlers.
12582 * Since the process suddenly ends up without a context and our caller
12583 * holds the hat lock, threads that fault after this function is called
12584 * will pile up on the lock. We can then do whatever we need to
12585 * atomically from the context of the caller. The first blocked thread
12586 * to resume executing will get the process a new context, and the
12587 * process will resume executing.
12589 * One added advantage of this approach is that on MMUs that
12590 * support a "flush all" operation, we will delay the flush until
12591 * cnum wrap-around, and then flush the TLB one time. This
12592 * is rather rare, so it's a lot less expensive than making 8000
12593 * x-calls to flush the TLB 8000 times.
12595 * A per-process (PP) lock is used to synchronize ctx allocations in
12596 * resume() and ctx invalidations here.
12598 static void
12599 sfmmu_invalidate_ctx(sfmmu_t *sfmmup)
12601 cpuset_t cpuset;
12602 int cnum, currcnum;
12603 mmu_ctx_t *mmu_ctxp;
12604 int i;
12605 uint_t pstate_save;
12607 SFMMU_STAT(sf_ctx_inv);
12609 ASSERT(sfmmu_hat_lock_held(sfmmup));
12610 ASSERT(sfmmup != ksfmmup);
12612 kpreempt_disable();
12614 mmu_ctxp = CPU_MMU_CTXP(CPU);
12615 ASSERT(mmu_ctxp);
12616 ASSERT(mmu_ctxp->mmu_idx < max_mmu_ctxdoms);
12617 ASSERT(mmu_ctxp == mmu_ctxs_tbl[mmu_ctxp->mmu_idx]);
12619 currcnum = sfmmup->sfmmu_ctxs[mmu_ctxp->mmu_idx].cnum;
12621 pstate_save = sfmmu_disable_intrs();
12623 lock_set(&sfmmup->sfmmu_ctx_lock); /* acquire PP lock */
12624 /* set HAT cnum invalid across all context domains. */
12625 for (i = 0; i < max_mmu_ctxdoms; i++) {
12627 cnum = sfmmup->sfmmu_ctxs[i].cnum;
12628 if (cnum == INVALID_CONTEXT) {
12629 continue;
12632 sfmmup->sfmmu_ctxs[i].cnum = INVALID_CONTEXT;
12634 membar_enter(); /* make sure globally visible to all CPUs */
12635 lock_clear(&sfmmup->sfmmu_ctx_lock); /* release PP lock */
12637 sfmmu_enable_intrs(pstate_save);
12639 cpuset = sfmmup->sfmmu_cpusran;
12640 CPUSET_DEL(cpuset, CPU->cpu_id);
12641 CPUSET_AND(cpuset, cpu_ready_set);
12642 if (!CPUSET_ISNULL(cpuset)) {
12643 SFMMU_XCALL_STATS(sfmmup);
12644 xt_some(cpuset, sfmmu_raise_tsb_exception,
12645 (uint64_t)sfmmup, INVALID_CONTEXT);
12646 xt_sync(cpuset);
12647 SFMMU_STAT(sf_tsb_raise_exception);
12648 SFMMU_MMU_STAT(mmu_tsb_raise_exception);
12652 * If the hat being invalidated belongs to the process that is
12653 * current on the local CPU, we need to invalidate this CPU's
12654 * context as well.
12656 if ((sfmmu_getctx_sec() == currcnum) &&
12657 (currcnum != INVALID_CONTEXT)) {
12658 /* sets shared context to INVALID too */
12659 sfmmu_setctx_sec(INVALID_CONTEXT);
12660 sfmmu_clear_utsbinfo();
12663 SFMMU_FLAGS_SET(sfmmup, HAT_ALLCTX_INVALID);
12665 kpreempt_enable();
12668 * we hold the hat lock, so nobody should allocate a context
12669 * for us yet
12671 ASSERT(sfmmup->sfmmu_ctxs[mmu_ctxp->mmu_idx].cnum == INVALID_CONTEXT);
12674 #ifdef VAC
12676 * We need to flush the cache on all cpus. It is possible that
12677 * a process referenced a page as cacheable but has since exited
12678 * and cleared the mapping list. We still need to flush it but have
12679 * no state, so flushing on all cpus is the only alternative.
12681 void
12682 sfmmu_cache_flush(pfn_t pfnum, int vcolor)
12684 cpuset_t cpuset;
12686 kpreempt_disable();
12687 cpuset = cpu_ready_set;
12688 CPUSET_DEL(cpuset, CPU->cpu_id);
12689 SFMMU_XCALL_STATS(NULL); /* account to any ctx */
12690 xt_some(cpuset, vac_flushpage_tl1, pfnum, vcolor);
12691 xt_sync(cpuset);
12692 vac_flushpage(pfnum, vcolor);
12693 kpreempt_enable();
12696 void
12697 sfmmu_cache_flushcolor(int vcolor, pfn_t pfnum)
12699 cpuset_t cpuset;
12701 ASSERT(vcolor >= 0);
12703 kpreempt_disable();
12704 cpuset = cpu_ready_set;
12705 CPUSET_DEL(cpuset, CPU->cpu_id);
12706 SFMMU_XCALL_STATS(NULL); /* account to any ctx */
12707 xt_some(cpuset, vac_flushcolor_tl1, vcolor, pfnum);
12708 xt_sync(cpuset);
12709 vac_flushcolor(vcolor, pfnum);
12710 kpreempt_enable();
12712 #endif /* VAC */
12715 * We need to prevent processes from accessing the TSB using a cached physical
12716 * address. It's alright if they try to access the TSB via virtual address
12717 * since they will just fault on that virtual address once the mapping has
12718 * been suspended.
12720 #pragma weak sendmondo_in_recover
12722 /* ARGSUSED */
12723 static int
12724 sfmmu_tsb_pre_relocator(caddr_t va, uint_t tsbsz, uint_t flags, void *tsbinfo)
12726 struct tsb_info *tsbinfop = (struct tsb_info *)tsbinfo;
12727 sfmmu_t *sfmmup = tsbinfop->tsb_sfmmu;
12728 hatlock_t *hatlockp;
12729 sf_scd_t *scdp;
12731 if (flags != HAT_PRESUSPEND)
12732 return (0);
12735 * If the tsb is a shared TSB with TSB_SHAREDCTX set, sfmmup must
12736 * be a shared hat; in that case set the SCD tsbinfo's flag.
12737 * If the tsb is not shared, sfmmup is a private hat; set its
12738 * private tsbinfo's flag.
12740 hatlockp = sfmmu_hat_enter(sfmmup);
12741 tsbinfop->tsb_flags |= TSB_RELOC_FLAG;
12743 if (!(tsbinfop->tsb_flags & TSB_SHAREDCTX)) {
12744 sfmmu_tsb_inv_ctx(sfmmup);
12745 sfmmu_hat_exit(hatlockp);
12746 } else {
12747 /* release lock on the shared hat */
12748 sfmmu_hat_exit(hatlockp);
12749 /* sfmmup is a shared hat */
12750 ASSERT(sfmmup->sfmmu_scdhat);
12751 scdp = sfmmup->sfmmu_scdp;
12752 ASSERT(scdp != NULL);
12753 /* get private hat from the scd list */
12754 mutex_enter(&scdp->scd_mutex);
12755 sfmmup = scdp->scd_sf_list;
12756 while (sfmmup != NULL) {
12757 hatlockp = sfmmu_hat_enter(sfmmup);
12759 * We do not call sfmmu_tsb_inv_ctx here because
12760 * sendmondo_in_recover check is only needed for
12761 * sun4u.
12763 sfmmu_invalidate_ctx(sfmmup);
12764 sfmmu_hat_exit(hatlockp);
12765 sfmmup = sfmmup->sfmmu_scd_link.next;
12768 mutex_exit(&scdp->scd_mutex);
12770 return (0);
12773 static void
12774 sfmmu_tsb_inv_ctx(sfmmu_t *sfmmup)
12776 extern uint32_t sendmondo_in_recover;
12778 ASSERT(sfmmu_hat_lock_held(sfmmup));
12781 * For Cheetah+ Erratum 25:
12782 * Wait for any active recovery to finish. We can't risk
12783 * relocating the TSB of the thread running mondo_recover_proc()
12784 * since, if we did that, we would deadlock. The scenario we are
12785 * trying to avoid is as follows:
12787 * THIS CPU RECOVER CPU
12788 * -------- -----------
12789 * Begins recovery, walking through TSB
12790 * hat_pagesuspend() TSB TTE
12791 * TLB miss on TSB TTE, spins at TL1
12792 * xt_sync()
12793 * send_mondo_timeout()
12794 * mondo_recover_proc()
12795 * ((deadlocked))
12797 * The second half of the workaround is that mondo_recover_proc()
12798 * checks to see if the tsb_info has the RELOC flag set, and if it
12799 * does, it skips over that TSB without ever touching tsbinfop->tsb_va
12800 * and hence avoiding the TLB miss that could result in a deadlock.
12802 if (&sendmondo_in_recover) {
12803 membar_enter(); /* make sure RELOC flag visible */
12804 while (sendmondo_in_recover) {
12805 drv_usecwait(1);
12806 membar_consumer();
12810 sfmmu_invalidate_ctx(sfmmup);
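/*
 * Note on the "&sendmondo_in_recover" test above (illustrative, not part
 * of the original code): a reference declared with #pragma weak resolves
 * to address 0 when nothing in the running kernel defines the symbol, so
 * taking the symbol's address is a run-time check for whether the sun4u
 * mondo recovery code is present at all.  The same pattern, with a
 * hypothetical symbol name, looks like:
 *
 *	#pragma weak optional_counter
 *	extern uint32_t optional_counter;
 *	...
 *	if (&optional_counter != NULL && optional_counter != 0)
 *		wait_for_it();		(provider is linked in and busy)
 */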
12813 /* ARGSUSED */
12814 static int
12815 sfmmu_tsb_post_relocator(caddr_t va, uint_t tsbsz, uint_t flags,
12816 void *tsbinfo, pfn_t newpfn)
12818 hatlock_t *hatlockp;
12819 struct tsb_info *tsbinfop = (struct tsb_info *)tsbinfo;
12820 sfmmu_t *sfmmup = tsbinfop->tsb_sfmmu;
12822 if (flags != HAT_POSTUNSUSPEND)
12823 return (0);
12825 hatlockp = sfmmu_hat_enter(sfmmup);
12827 SFMMU_STAT(sf_tsb_reloc);
12830 * The process may have swapped out while we were relocating one
12831 * of its TSBs. If so, don't bother doing the setup since the
12832 * process can't be using the memory anymore.
12834 if ((tsbinfop->tsb_flags & TSB_SWAPPED) == 0) {
12835 ASSERT(va == tsbinfop->tsb_va);
12836 sfmmu_tsbinfo_setup_phys(tsbinfop, newpfn);
12838 if (tsbinfop->tsb_flags & TSB_FLUSH_NEEDED) {
12839 sfmmu_inv_tsb(tsbinfop->tsb_va,
12840 TSB_BYTES(tsbinfop->tsb_szc));
12841 tsbinfop->tsb_flags &= ~TSB_FLUSH_NEEDED;
12845 membar_exit();
12846 tsbinfop->tsb_flags &= ~TSB_RELOC_FLAG;
12847 cv_broadcast(&sfmmup->sfmmu_tsb_cv);
12849 sfmmu_hat_exit(hatlockp);
12851 return (0);
12855 * Allocate and initialize a tsb_info structure. Note that we may or may not
12856 * allocate a TSB here, depending on the flags passed in.
12858 static int
12859 sfmmu_tsbinfo_alloc(struct tsb_info **tsbinfopp, int tsb_szc, int tte_sz_mask,
12860 uint_t flags, sfmmu_t *sfmmup)
12862 int err;
12864 *tsbinfopp = (struct tsb_info *)kmem_cache_alloc(
12865 sfmmu_tsbinfo_cache, KM_SLEEP);
12867 if ((err = sfmmu_init_tsbinfo(*tsbinfopp, tte_sz_mask,
12868 tsb_szc, flags, sfmmup)) != 0) {
12869 kmem_cache_free(sfmmu_tsbinfo_cache, *tsbinfopp);
12870 SFMMU_STAT(sf_tsb_allocfail);
12871 *tsbinfopp = NULL;
12872 return (err);
12874 SFMMU_STAT(sf_tsb_alloc);
12877 * Bump the TSB size counters for this TSB size.
12879 (*(((int *)&sfmmu_tsbsize_stat) + tsb_szc))++;
12880 return (0);
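/*
 * A note on the size-counter bump at the end of sfmmu_tsbinfo_alloc()
 * above: it treats struct sfmmu_tsbsize_stat as a flat array of int
 * counters indexed by TSB size code.  Under that layout assumption the
 * equivalent, more explicit form would be (array name hypothetical):
 *
 *	int tsbsize_cnt[<number of TSB size codes>];
 *	...
 *	tsbsize_cnt[tsb_szc]++;
 */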
12883 static void
12884 sfmmu_tsb_free(struct tsb_info *tsbinfo)
12886 caddr_t tsbva = tsbinfo->tsb_va;
12887 uint_t tsb_size = TSB_BYTES(tsbinfo->tsb_szc);
12888 struct kmem_cache *kmem_cachep = tsbinfo->tsb_cache;
12889 vmem_t *vmp = tsbinfo->tsb_vmp;
12892 * If we allocated this TSB from relocatable kernel memory, then we
12893 * need to uninstall the callback handler.
12895 if (tsbinfo->tsb_cache != sfmmu_tsb8k_cache) {
12896 uintptr_t slab_mask;
12897 caddr_t slab_vaddr;
12898 page_t **ppl;
12899 int ret;
12901 ASSERT(tsb_size <= MMU_PAGESIZE4M || use_bigtsb_arena);
12902 if (tsb_size > MMU_PAGESIZE4M)
12903 slab_mask = ~((uintptr_t)bigtsb_slab_mask) << PAGESHIFT;
12904 else
12905 slab_mask = ~((uintptr_t)tsb_slab_mask) << PAGESHIFT;
12906 slab_vaddr = (caddr_t)((uintptr_t)tsbva & slab_mask);
12908 ret = as_pagelock(&kas, &ppl, slab_vaddr, PAGESIZE, S_WRITE);
12909 ASSERT(ret == 0);
12910 hat_delete_callback(tsbva, (uint_t)tsb_size, (void *)tsbinfo,
12911 0, NULL);
12912 as_pageunlock(&kas, ppl, slab_vaddr, PAGESIZE, S_WRITE);
12915 if (kmem_cachep != NULL) {
12916 kmem_cache_free(kmem_cachep, tsbva);
12917 } else {
12918 vmem_xfree(vmp, (void *)tsbva, tsb_size);
12920 tsbinfo->tsb_va = (caddr_t)0xbad00bad;
12921 atomic_add_64(&tsb_alloc_bytes, -(int64_t)tsb_size);
12924 static void
12925 sfmmu_tsbinfo_free(struct tsb_info *tsbinfo)
12927 if ((tsbinfo->tsb_flags & TSB_SWAPPED) == 0) {
12928 sfmmu_tsb_free(tsbinfo);
12930 kmem_cache_free(sfmmu_tsbinfo_cache, tsbinfo);
12935 * Setup all the references to physical memory for this tsbinfo.
12936 * The underlying page(s) must be locked.
12938 static void
12939 sfmmu_tsbinfo_setup_phys(struct tsb_info *tsbinfo, pfn_t pfn)
12941 ASSERT(pfn != PFN_INVALID);
12942 ASSERT(pfn == va_to_pfn(tsbinfo->tsb_va));
12944 #ifndef sun4v
12945 if (tsbinfo->tsb_szc == 0) {
12946 sfmmu_memtte(&tsbinfo->tsb_tte, pfn,
12947 PROT_WRITE|PROT_READ, TTE8K);
12948 } else {
12950 * Round down PA and use a large mapping; the handlers will
12951 * compute the TSB pointer at the correct offset into the
12952 * big virtual page. NOTE: this assumes all TSBs larger
12953 * than 8K must come from physically contiguous slabs of
12954 * size tsb_slab_size.
12956 sfmmu_memtte(&tsbinfo->tsb_tte, pfn & ~tsb_slab_mask,
12957 PROT_WRITE|PROT_READ, tsb_slab_ttesz);
12959 tsbinfo->tsb_pa = ptob(pfn);
12961 TTE_SET_LOCKED(&tsbinfo->tsb_tte); /* lock the tte into dtlb */
12962 TTE_SET_MOD(&tsbinfo->tsb_tte); /* enable writes */
12964 ASSERT(TTE_IS_PRIVILEGED(&tsbinfo->tsb_tte));
12965 ASSERT(TTE_IS_LOCKED(&tsbinfo->tsb_tte));
12966 #else /* sun4v */
12967 tsbinfo->tsb_pa = ptob(pfn);
12968 #endif /* sun4v */
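/*
 * Worked example for the non-sun4v large-mapping case above, assuming
 * the common 4M slab configuration where tsb_slab_ttesz is TTE4M and
 * tsb_slab_mask is 0x1ff (512 8K pages per slab): a 64K TSB whose first
 * page is pfn 0x12345 records tsb_pa = ptob(0x12345), but its locked
 * TTE maps pfn 0x12345 & ~0x1ff = 0x12200, the base of the surrounding
 * 4M slab; the TSB-miss handlers add the offset within that big page.
 */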
12973 * Returns zero on success, ENOMEM if over the high water mark,
12974 * or EAGAIN if the caller needs to retry with a smaller TSB
12975 * size (or specify TSB_FORCEALLOC if the allocation can't fail).
12977 * This call cannot fail to allocate a TSB if TSB_FORCEALLOC
12978 * is specified and the TSB requested is PAGESIZE, though it
12979 * may sleep waiting for memory if sufficient memory is not
12980 * available.
12982 static int
12983 sfmmu_init_tsbinfo(struct tsb_info *tsbinfo, int tteszmask,
12984 int tsbcode, uint_t flags, sfmmu_t *sfmmup)
12986 caddr_t vaddr = NULL;
12987 caddr_t slab_vaddr;
12988 uintptr_t slab_mask;
12989 int tsbbytes = TSB_BYTES(tsbcode);
12990 int lowmem = 0;
12991 struct kmem_cache *kmem_cachep = NULL;
12992 vmem_t *vmp = NULL;
12993 lgrp_id_t lgrpid = LGRP_NONE;
12994 pfn_t pfn;
12995 uint_t cbflags = HAC_SLEEP;
12996 page_t **pplist;
12997 int ret;
12999 ASSERT(tsbbytes <= MMU_PAGESIZE4M || use_bigtsb_arena);
13000 if (tsbbytes > MMU_PAGESIZE4M)
13001 slab_mask = ~((uintptr_t)bigtsb_slab_mask) << PAGESHIFT;
13002 else
13003 slab_mask = ~((uintptr_t)tsb_slab_mask) << PAGESHIFT;
13005 if (flags & (TSB_FORCEALLOC | TSB_SWAPIN | TSB_GROW | TSB_SHRINK))
13006 flags |= TSB_ALLOC;
13008 ASSERT((flags & TSB_FORCEALLOC) == 0 || tsbcode == TSB_MIN_SZCODE);
13010 tsbinfo->tsb_sfmmu = sfmmup;
13013 * If not allocating a TSB, set up the tsbinfo, set TSB_SWAPPED, and
13014 * return.
13016 if ((flags & TSB_ALLOC) == 0) {
13017 tsbinfo->tsb_szc = tsbcode;
13018 tsbinfo->tsb_ttesz_mask = tteszmask;
13019 tsbinfo->tsb_va = (caddr_t)0xbadbadbeef;
13020 tsbinfo->tsb_pa = -1;
13021 tsbinfo->tsb_tte.ll = 0;
13022 tsbinfo->tsb_next = NULL;
13023 tsbinfo->tsb_flags = TSB_SWAPPED;
13024 tsbinfo->tsb_cache = NULL;
13025 tsbinfo->tsb_vmp = NULL;
13026 return (0);
13029 #ifdef DEBUG
13031 * For debugging:
13032 * Randomly force allocation failures every tsb_alloc_mtbf
13033 * tries if TSB_FORCEALLOC is not specified. This will
13034 * return ENOMEM if tsb_alloc_mtbf is odd, or EAGAIN if
13035 * it is even, to allow testing of both failure paths...
13037 if (tsb_alloc_mtbf && ((flags & TSB_FORCEALLOC) == 0) &&
13038 (tsb_alloc_count++ == tsb_alloc_mtbf)) {
13039 tsb_alloc_count = 0;
13040 tsb_alloc_fail_mtbf++;
13041 return ((tsb_alloc_mtbf & 1)? ENOMEM : EAGAIN);
13043 #endif /* DEBUG */
13046 * Enforce high water mark if we are not doing a forced allocation
13047 * and are not shrinking a process' TSB.
13049 if ((flags & TSB_SHRINK) == 0 &&
13050 (tsbbytes + tsb_alloc_bytes) > tsb_alloc_hiwater) {
13051 if ((flags & TSB_FORCEALLOC) == 0)
13052 return (ENOMEM);
13053 lowmem = 1;
13057 * Allocate from the correct location based upon the size of the TSB
13058 * compared to the base page size, and what memory conditions dictate.
13059 * Note we always do nonblocking allocations from the TSB arena since
13060 * we don't want memory fragmentation to cause processes to block
13061 * indefinitely waiting for memory; until the kernel algorithms that
13062 * coalesce large pages are improved this is our best option.
13064 * Algorithm:
13065 * If allocating a "large" TSB (>8K), allocate from the
13066 * appropriate kmem_tsb_default_arena vmem arena
13067 * else if low on memory or the TSB_FORCEALLOC flag is set or
13068 * tsb_forceheap is set
13069 * Allocate from kernel heap via sfmmu_tsb8k_cache with
13070 * KM_SLEEP (never fails)
13071 * else
13072 * Allocate from appropriate sfmmu_tsb_cache with
13073 * KM_NOSLEEP
13074 * endif
13076 if (tsb_lgrp_affinity)
13077 lgrpid = lgrp_home_id(curthread);
13078 if (lgrpid == LGRP_NONE)
13079 lgrpid = 0; /* use lgrp of boot CPU */
13081 if (tsbbytes > MMU_PAGESIZE) {
13082 if (tsbbytes > MMU_PAGESIZE4M) {
13083 vmp = kmem_bigtsb_default_arena[lgrpid];
13084 vaddr = (caddr_t)vmem_xalloc(vmp, tsbbytes, tsbbytes,
13085 0, 0, NULL, NULL, VM_NOSLEEP);
13086 } else {
13087 vmp = kmem_tsb_default_arena[lgrpid];
13088 vaddr = (caddr_t)vmem_xalloc(vmp, tsbbytes, tsbbytes,
13089 0, 0, NULL, NULL, VM_NOSLEEP);
13091 #ifdef DEBUG
13092 } else if (lowmem || (flags & TSB_FORCEALLOC) || tsb_forceheap) {
13093 #else /* !DEBUG */
13094 } else if (lowmem || (flags & TSB_FORCEALLOC)) {
13095 #endif /* DEBUG */
13096 kmem_cachep = sfmmu_tsb8k_cache;
13097 vaddr = (caddr_t)kmem_cache_alloc(kmem_cachep, KM_SLEEP);
13098 ASSERT(vaddr != NULL);
13099 } else {
13100 kmem_cachep = sfmmu_tsb_cache[lgrpid];
13101 vaddr = (caddr_t)kmem_cache_alloc(kmem_cachep, KM_NOSLEEP);
13104 tsbinfo->tsb_cache = kmem_cachep;
13105 tsbinfo->tsb_vmp = vmp;
13107 if (vaddr == NULL) {
13108 return (EAGAIN);
13111 atomic_add_64(&tsb_alloc_bytes, (int64_t)tsbbytes);
13112 kmem_cachep = tsbinfo->tsb_cache;
13115 * If we are allocating from outside the cage, then we need to
13116 * register a relocation callback handler. Note that for now
13117 * since pseudo mappings always hang off of the slab's root page,
13118 * we need only lock the first 8K of the TSB slab. This is a bit
13119 * hacky but it is good for performance.
13121 if (kmem_cachep != sfmmu_tsb8k_cache) {
13122 slab_vaddr = (caddr_t)((uintptr_t)vaddr & slab_mask);
13123 ret = as_pagelock(&kas, &pplist, slab_vaddr, PAGESIZE, S_WRITE);
13124 ASSERT(ret == 0);
13125 ret = hat_add_callback(sfmmu_tsb_cb_id, vaddr, (uint_t)tsbbytes,
13126 cbflags, (void *)tsbinfo, &pfn, NULL);
13129 * If we could not successfully add the callback function, free up
13130 * the resources and return an error condition.
13132 if (ret != 0) {
13133 if (kmem_cachep) {
13134 kmem_cache_free(kmem_cachep, vaddr);
13135 } else {
13136 vmem_xfree(vmp, (void *)vaddr, tsbbytes);
13138 as_pageunlock(&kas, pplist, slab_vaddr, PAGESIZE,
13139 S_WRITE);
13140 return (EAGAIN);
13142 } else {
13144 * Since allocation of 8K TSBs from heap is rare and occurs
13145 * during memory pressure we allocate them from permanent
13146 * memory rather than using callbacks to get the PFN.
13148 pfn = hat_getpfnum(kas.a_hat, vaddr);
13151 tsbinfo->tsb_va = vaddr;
13152 tsbinfo->tsb_szc = tsbcode;
13153 tsbinfo->tsb_ttesz_mask = tteszmask;
13154 tsbinfo->tsb_next = NULL;
13155 tsbinfo->tsb_flags = 0;
13157 sfmmu_tsbinfo_setup_phys(tsbinfo, pfn);
13159 sfmmu_inv_tsb(vaddr, tsbbytes);
13161 if (kmem_cachep != sfmmu_tsb8k_cache) {
13162 as_pageunlock(&kas, pplist, slab_vaddr, PAGESIZE, S_WRITE);
13165 return (0);
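/*
 * Minimal sketch (hypothetical caller, not part of the original code) of
 * the retry contract described above sfmmu_init_tsbinfo(), whose errors
 * are forwarded by sfmmu_tsbinfo_alloc(): retry with a smaller size code
 * on EAGAIN, and fall back to a forced PAGESIZE allocation, which may
 * sleep but cannot fail, if nothing larger can be had.
 */
static int
example_tsbinfo_alloc_retry(sfmmu_t *sfmmup, int szc, int tteszmask,
    struct tsb_info **tsbinfopp)
{
	int err;

	for (; szc > TSB_MIN_SZCODE; szc--) {
		err = sfmmu_tsbinfo_alloc(tsbinfopp, szc, tteszmask,
		    TSB_ALLOC, sfmmup);
		if (err == 0)
			return (0);
		if (err != EAGAIN)
			break;	/* e.g. ENOMEM: over the high water mark */
	}
	/* TSB_FORCEALLOC with TSB_MIN_SZCODE may sleep but won't fail. */
	return (sfmmu_tsbinfo_alloc(tsbinfopp, TSB_MIN_SZCODE, tteszmask,
	    TSB_FORCEALLOC, sfmmup));
}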
13169 * Initialize per cpu tsb and per cpu tsbmiss_area
13171 void
13172 sfmmu_init_tsbs(void)
13174 int i;
13175 struct tsbmiss *tsbmissp;
13176 struct kpmtsbm *kpmtsbmp;
13177 #ifndef sun4v
13178 extern int dcache_line_mask;
13179 #endif /* sun4v */
13180 extern uint_t vac_colors;
13183 * Init. tsb miss area.
13185 tsbmissp = tsbmiss_area;
13187 for (i = 0; i < NCPU; tsbmissp++, i++) {
13189 * initialize the tsbmiss area.
13190 * Do this for all possible CPUs as some may be added
13191 * while the system is running. There is no cost to this.
13193 tsbmissp->ksfmmup = ksfmmup;
13194 #ifndef sun4v
13195 tsbmissp->dcache_line_mask = (uint16_t)dcache_line_mask;
13196 #endif /* sun4v */
13197 tsbmissp->khashstart =
13198 (struct hmehash_bucket *)va_to_pa((caddr_t)khme_hash);
13199 tsbmissp->uhashstart =
13200 (struct hmehash_bucket *)va_to_pa((caddr_t)uhme_hash);
13201 tsbmissp->khashsz = khmehash_num;
13202 tsbmissp->uhashsz = uhmehash_num;
13205 sfmmu_tsb_cb_id = hat_register_callback('T'<<16 | 'S' << 8 | 'B',
13206 sfmmu_tsb_pre_relocator, sfmmu_tsb_post_relocator, NULL, 0);
13208 if (kpm_enable == 0)
13209 return;
13211 /* -- Begin KPM specific init -- */
13213 if (kpm_smallpages) {
13215 * If we're using base pagesize pages for seg_kpm
13216 * mappings, we use the kernel TSB since we can't afford
13217 * to allocate a second huge TSB for these mappings.
13219 kpm_tsbbase = ktsb_phys? ktsb_pbase : (uint64_t)ktsb_base;
13220 kpm_tsbsz = ktsb_szcode;
13221 kpmsm_tsbbase = kpm_tsbbase;
13222 kpmsm_tsbsz = kpm_tsbsz;
13223 } else {
13225 * In VAC conflict case, just put the entries in the
13226 * kernel 8K indexed TSB for now so we can find them.
13227 * This could really be changed in the future if we feel
13228 * the need...
13230 kpmsm_tsbbase = ktsb_phys? ktsb_pbase : (uint64_t)ktsb_base;
13231 kpmsm_tsbsz = ktsb_szcode;
13232 kpm_tsbbase = ktsb_phys? ktsb4m_pbase : (uint64_t)ktsb4m_base;
13233 kpm_tsbsz = ktsb4m_szcode;
13236 kpmtsbmp = kpmtsbm_area;
13237 for (i = 0; i < NCPU; kpmtsbmp++, i++) {
13239 * Initialize the kpmtsbm area.
13240 * Do this for all possible CPUs as some may be added
13241 * while the system is running. There is no cost to this.
13243 kpmtsbmp->vbase = kpm_vbase;
13244 kpmtsbmp->vend = kpm_vbase + kpm_size * vac_colors;
13245 kpmtsbmp->sz_shift = kpm_size_shift;
13246 kpmtsbmp->kpmp_shift = kpmp_shift;
13247 kpmtsbmp->kpmp2pshft = (uchar_t)kpmp2pshft;
13248 if (kpm_smallpages == 0) {
13249 kpmtsbmp->kpmp_table_sz = kpmp_table_sz;
13250 kpmtsbmp->kpmp_tablepa = va_to_pa(kpmp_table);
13251 } else {
13252 kpmtsbmp->kpmp_table_sz = kpmp_stable_sz;
13253 kpmtsbmp->kpmp_tablepa = va_to_pa(kpmp_stable);
13255 kpmtsbmp->msegphashpa = va_to_pa(memseg_phash);
13256 kpmtsbmp->flags = KPMTSBM_ENABLE_FLAG;
13257 #ifdef DEBUG
13258 kpmtsbmp->flags |= (kpm_tsbmtl) ? KPMTSBM_TLTSBM_FLAG : 0;
13259 #endif /* DEBUG */
13260 if (ktsb_phys)
13261 kpmtsbmp->flags |= KPMTSBM_TSBPHYS_FLAG;
13264 /* -- End KPM specific init -- */
13267 /* Avoid using sfmmu_tsbinfo_alloc() to avoid kmem_alloc - no real reason */
13268 struct tsb_info ktsb_info[2];
13271 * Called from hat_kern_setup() to setup the tsb_info for ksfmmup.
13273 void
13274 sfmmu_init_ktsbinfo()
13276 ASSERT(ksfmmup != NULL);
13277 ASSERT(ksfmmup->sfmmu_tsb == NULL);
13279 * Allocate tsbinfos for the kernel and copy in data
13280 * to make debugging and sun4v setup easier.
13282 ktsb_info[0].tsb_sfmmu = ksfmmup;
13283 ktsb_info[0].tsb_szc = ktsb_szcode;
13284 ktsb_info[0].tsb_ttesz_mask = TSB8K|TSB64K|TSB512K;
13285 ktsb_info[0].tsb_va = ktsb_base;
13286 ktsb_info[0].tsb_pa = ktsb_pbase;
13287 ktsb_info[0].tsb_flags = 0;
13288 ktsb_info[0].tsb_tte.ll = 0;
13289 ktsb_info[0].tsb_cache = NULL;
13291 ktsb_info[1].tsb_sfmmu = ksfmmup;
13292 ktsb_info[1].tsb_szc = ktsb4m_szcode;
13293 ktsb_info[1].tsb_ttesz_mask = TSB4M;
13294 ktsb_info[1].tsb_va = ktsb4m_base;
13295 ktsb_info[1].tsb_pa = ktsb4m_pbase;
13296 ktsb_info[1].tsb_flags = 0;
13297 ktsb_info[1].tsb_tte.ll = 0;
13298 ktsb_info[1].tsb_cache = NULL;
13300 /* Link them into ksfmmup. */
13301 ktsb_info[0].tsb_next = &ktsb_info[1];
13302 ktsb_info[1].tsb_next = NULL;
13303 ksfmmup->sfmmu_tsb = &ktsb_info[0];
13305 sfmmu_setup_tsbinfo(ksfmmup);
13309 * Cache the last value returned from va_to_pa(). If the VA specified
13310 * in the current call to cached_va_to_pa() maps to the same page as
13311 * the previous call to cached_va_to_pa(), then compute the PA using
13312 * cached info, else call va_to_pa().
13314 * Note: this function is neither MT-safe nor consistent in the presence
13315 * of multiple, interleaved threads. This function was created to enable
13316 * an optimization used during boot (at a point when there's only one thread
13317 * executing on the "boot CPU", and before startup_vm() has been called).
13319 static uint64_t
13320 cached_va_to_pa(void *vaddr)
13322 static uint64_t prev_vaddr_base = 0;
13323 static uint64_t prev_pfn = 0;
13325 if ((((uint64_t)vaddr) & MMU_PAGEMASK) == prev_vaddr_base) {
13326 return (prev_pfn | ((uint64_t)vaddr & MMU_PAGEOFFSET));
13327 } else {
13328 uint64_t pa = va_to_pa(vaddr);
13330 if (pa != ((uint64_t)-1)) {
13332 * Computed physical address is valid. Cache its
13333 * related info for the next cached_va_to_pa() call.
13335 prev_pfn = pa & MMU_PAGEMASK;
13336 prev_vaddr_base = ((uint64_t)vaddr) & MMU_PAGEMASK;
13339 return (pa);
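/*
 * Worked example (hypothetical addresses, assuming an 8K MMU_PAGESIZE):
 * a first call with vaddr 0x300001238 misses, calls va_to_pa(), and
 * caches prev_vaddr_base = 0x300000000 plus the page's physical base in
 * prev_pfn.  A second call with vaddr 0x300001f40 falls in the same 8K
 * page, so it returns prev_pfn | 0xf40 without translating again.
 * sfmmu_init_nucleus_hblks() below benefits from this because
 * consecutive nucleus hmeblks usually share a page.
 */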
13344 * Carve up our nucleus hblk region. We may allocate more hblks than
13345 * asked for due to rounding errors, but we are guaranteed to have at
13346 * least enough space for the requested number of hblk8's and hblk1's.
13348 void
13349 sfmmu_init_nucleus_hblks(caddr_t addr, size_t size, int nhblk8, int nhblk1)
13351 struct hme_blk *hmeblkp;
13352 size_t hme8blk_sz, hme1blk_sz;
13353 size_t i;
13354 size_t hblk8_bound;
13355 ulong_t j = 0, k = 0;
13357 ASSERT(addr != NULL && size != 0);
13359 /* Need to use proper structure alignment */
13360 hme8blk_sz = roundup(HME8BLK_SZ, sizeof (int64_t));
13361 hme1blk_sz = roundup(HME1BLK_SZ, sizeof (int64_t));
13363 nucleus_hblk8.list = (void *)addr;
13364 nucleus_hblk8.index = 0;
13367 * Use as much memory as possible for hblk8's since we
13368 * expect all bop_alloc'ed memory to be allocated in 8k chunks.
13369 * We need to hold back enough space for the hblk1's which
13370 * we'll allocate next.
13372 hblk8_bound = size - (nhblk1 * hme1blk_sz) - hme8blk_sz;
13373 for (i = 0; i <= hblk8_bound; i += hme8blk_sz, j++) {
13374 hmeblkp = (struct hme_blk *)addr;
13375 addr += hme8blk_sz;
13376 hmeblkp->hblk_nuc_bit = 1;
13377 hmeblkp->hblk_nextpa = cached_va_to_pa((caddr_t)hmeblkp);
13379 nucleus_hblk8.len = j;
13380 ASSERT(j >= nhblk8);
13381 SFMMU_STAT_ADD(sf_hblk8_ncreate, j);
13383 nucleus_hblk1.list = (void *)addr;
13384 nucleus_hblk1.index = 0;
13385 for (; i <= (size - hme1blk_sz); i += hme1blk_sz, k++) {
13386 hmeblkp = (struct hme_blk *)addr;
13387 addr += hme1blk_sz;
13388 hmeblkp->hblk_nuc_bit = 1;
13389 hmeblkp->hblk_nextpa = cached_va_to_pa((caddr_t)hmeblkp);
13391 ASSERT(k >= nhblk1);
13392 nucleus_hblk1.len = k;
13393 SFMMU_STAT_ADD(sf_hblk1_ncreate, k);
13397 * This function is currently not supported on this platform. For what
13398 * it's supposed to do, see hat.c and hat_srmmu.c
13400 /* ARGSUSED */
13401 faultcode_t
13402 hat_softlock(struct hat *hat, caddr_t addr, size_t *lenp, page_t **ppp,
13403 uint_t flags)
13405 ASSERT(hat->sfmmu_xhat_provider == NULL);
13406 return (FC_NOSUPPORT);
13410 * Searches the mapping list of the page for a mapping of the same size. If not
13411 * found the corresponding bit is cleared in the p_index field. When large
13412 * pages are more prevalent in the system, we can maintain the mapping list
13413 * in order and we don't have to traverse the list each time. Just check the
13414 * next and prev entries, and if both are of different size, we clear the bit.
13416 static void
13417 sfmmu_rm_large_mappings(page_t *pp, int ttesz)
13419 struct sf_hment *sfhmep;
13420 struct hme_blk *hmeblkp;
13421 int index;
13422 pgcnt_t npgs;
13424 ASSERT(ttesz > TTE8K);
13426 ASSERT(sfmmu_mlist_held(pp));
13428 ASSERT(PP_ISMAPPED_LARGE(pp));
13431 * Traverse the mapping list looking for another mapping of the same
13432 * size, since we only want to clear the index field if all mappings
13433 * of that size are gone.
13436 for (sfhmep = pp->p_mapping; sfhmep; sfhmep = sfhmep->hme_next) {
13437 if (IS_PAHME(sfhmep))
13438 continue;
13439 hmeblkp = sfmmu_hmetohblk(sfhmep);
13440 if (hmeblkp->hblk_xhat_bit)
13441 continue;
13442 if (hme_size(sfhmep) == ttesz) {
13444 * Another mapping of the same size; don't clear index.
13446 return;
13451 * Clear the p_index bit for large page.
13453 index = PAGESZ_TO_INDEX(ttesz);
13454 npgs = TTEPAGES(ttesz);
13455 while (npgs-- > 0) {
13456 ASSERT(pp->p_index & index);
13457 pp->p_index &= ~index;
13458 pp = PP_PAGENEXT(pp);
13463 * return supported features
13465 /* ARGSUSED */
13466 int
13467 hat_supported(enum hat_features feature, void *arg)
13469 switch (feature) {
13470 case HAT_SHARED_PT:
13471 case HAT_DYNAMIC_ISM_UNMAP:
13472 case HAT_VMODSORT:
13473 return (1);
13474 case HAT_SHARED_REGIONS:
13475 if (shctx_on)
13476 return (1);
13477 else
13478 return (0);
13479 default:
13480 return (0);
13484 void
13485 hat_enter(struct hat *hat)
13487 hatlock_t *hatlockp;
13489 if (hat != ksfmmup) {
13490 hatlockp = TSB_HASH(hat);
13491 mutex_enter(HATLOCK_MUTEXP(hatlockp));
13495 void
13496 hat_exit(struct hat *hat)
13498 hatlock_t *hatlockp;
13500 if (hat != ksfmmup) {
13501 hatlockp = TSB_HASH(hat);
13502 mutex_exit(HATLOCK_MUTEXP(hatlockp));
13506 /*ARGSUSED*/
13507 void
13508 hat_reserve(struct as *as, caddr_t addr, size_t len)
13512 static void
13513 hat_kstat_init(void)
13515 kstat_t *ksp;
13517 ksp = kstat_create("unix", 0, "sfmmu_global_stat", "hat",
13518 KSTAT_TYPE_RAW, sizeof (struct sfmmu_global_stat),
13519 KSTAT_FLAG_VIRTUAL);
13520 if (ksp) {
13521 ksp->ks_data = (void *) &sfmmu_global_stat;
13522 kstat_install(ksp);
13524 ksp = kstat_create("unix", 0, "sfmmu_tsbsize_stat", "hat",
13525 KSTAT_TYPE_RAW, sizeof (struct sfmmu_tsbsize_stat),
13526 KSTAT_FLAG_VIRTUAL);
13527 if (ksp) {
13528 ksp->ks_data = (void *) &sfmmu_tsbsize_stat;
13529 kstat_install(ksp);
13531 ksp = kstat_create("unix", 0, "sfmmu_percpu_stat", "hat",
13532 KSTAT_TYPE_RAW, sizeof (struct sfmmu_percpu_stat) * NCPU,
13533 KSTAT_FLAG_WRITABLE);
13534 if (ksp) {
13535 ksp->ks_update = sfmmu_kstat_percpu_update;
13536 kstat_install(ksp);
13540 /* ARGSUSED */
13541 static int
13542 sfmmu_kstat_percpu_update(kstat_t *ksp, int rw)
13544 struct sfmmu_percpu_stat *cpu_kstat = ksp->ks_data;
13545 struct tsbmiss *tsbm = tsbmiss_area;
13546 struct kpmtsbm *kpmtsbm = kpmtsbm_area;
13547 int i;
13549 ASSERT(cpu_kstat);
13550 if (rw == KSTAT_READ) {
13551 for (i = 0; i < NCPU; cpu_kstat++, tsbm++, kpmtsbm++, i++) {
13552 cpu_kstat->sf_itlb_misses = 0;
13553 cpu_kstat->sf_dtlb_misses = 0;
13554 cpu_kstat->sf_utsb_misses = tsbm->utsb_misses -
13555 tsbm->uprot_traps;
13556 cpu_kstat->sf_ktsb_misses = tsbm->ktsb_misses +
13557 kpmtsbm->kpm_tsb_misses - tsbm->kprot_traps;
13558 cpu_kstat->sf_tsb_hits = 0;
13559 cpu_kstat->sf_umod_faults = tsbm->uprot_traps;
13560 cpu_kstat->sf_kmod_faults = tsbm->kprot_traps;
13562 } else {
13563 /* KSTAT_WRITE is used to clear stats */
13564 for (i = 0; i < NCPU; tsbm++, kpmtsbm++, i++) {
13565 tsbm->utsb_misses = 0;
13566 tsbm->ktsb_misses = 0;
13567 tsbm->uprot_traps = 0;
13568 tsbm->kprot_traps = 0;
13569 kpmtsbm->kpm_dtlb_misses = 0;
13570 kpmtsbm->kpm_tsb_misses = 0;
13573 return (0);
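/*
 * Usage note (not part of the original code): these per-CPU statistics
 * are exported through the "unix:0:sfmmu_percpu_stat" kstat created in
 * hat_kstat_init() above, so something like
 *
 *	kstat -m unix -n sfmmu_percpu_stat
 *
 * from userland should display the derived TSB-miss counters, while a
 * kstat write (the KSTAT_WRITE path above, enabled by
 * KSTAT_FLAG_WRITABLE) zeroes the underlying raw counters.
 */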
13576 #ifdef DEBUG
13578 tte_t *gorig[NCPU], *gcur[NCPU], *gnew[NCPU];
13581 * A tte checker. *orig_old is the value we read before cas.
13582 * *cur is the value returned by cas.
13583 * *new is the desired value when we do the cas.
13585 * *hmeblkp is currently unused.
13588 /* ARGSUSED */
13589 void
13590 chk_tte(tte_t *orig_old, tte_t *cur, tte_t *new, struct hme_blk *hmeblkp)
13592 pfn_t i, j, k;
13593 int cpuid = CPU->cpu_id;
13595 gorig[cpuid] = orig_old;
13596 gcur[cpuid] = cur;
13597 gnew[cpuid] = new;
13599 #ifdef lint
13600 hmeblkp = hmeblkp;
13601 #endif
13603 if (TTE_IS_VALID(orig_old)) {
13604 if (TTE_IS_VALID(cur)) {
13605 i = TTE_TO_TTEPFN(orig_old);
13606 j = TTE_TO_TTEPFN(cur);
13607 k = TTE_TO_TTEPFN(new);
13608 if (i != j) {
13609 /* remap error? */
13610 panic("chk_tte: bad pfn, 0x%lx, 0x%lx", i, j);
13613 if (i != k) {
13614 /* remap error? */
13615 panic("chk_tte: bad pfn2, 0x%lx, 0x%lx", i, k);
13617 } else {
13618 if (TTE_IS_VALID(new)) {
13619 panic("chk_tte: invalid cur? ");
13622 i = TTE_TO_TTEPFN(orig_old);
13623 k = TTE_TO_TTEPFN(new);
13624 if (i != k) {
13625 panic("chk_tte: bad pfn3, 0x%lx, 0x%lx", i, k);
13628 } else {
13629 if (TTE_IS_VALID(cur)) {
13630 j = TTE_TO_TTEPFN(cur);
13631 if (TTE_IS_VALID(new)) {
13632 k = TTE_TO_TTEPFN(new);
13633 if (j != k) {
13634 panic("chk_tte: bad pfn4, 0x%lx, 0x%lx",
13635 j, k);
13637 } else {
13638 panic("chk_tte: why here?");
13640 } else {
13641 if (!TTE_IS_VALID(new)) {
13642 panic("chk_tte: why here2 ?");
13648 #endif /* DEBUG */
13650 extern void prefetch_tsbe_read(struct tsbe *);
13651 extern void prefetch_tsbe_write(struct tsbe *);
13655 * We want to prefetch 7 cache lines ahead for our read prefetch. This gives
13656 * us optimal performance on Cheetah+. You can only have 8 outstanding
13657 * prefetches at any one time, so we opted for 7 read prefetches and 1 write
13658 * prefetch to make the best use of the prefetch capability.
13660 #define TSBE_PREFETCH_STRIDE (7)
13662 void
13663 sfmmu_copy_tsb(struct tsb_info *old_tsbinfo, struct tsb_info *new_tsbinfo)
13665 int old_bytes = TSB_BYTES(old_tsbinfo->tsb_szc);
13666 int new_bytes = TSB_BYTES(new_tsbinfo->tsb_szc);
13667 int old_entries = TSB_ENTRIES(old_tsbinfo->tsb_szc);
13668 int new_entries = TSB_ENTRIES(new_tsbinfo->tsb_szc);
13669 struct tsbe *old;
13670 struct tsbe *new;
13671 struct tsbe *new_base = (struct tsbe *)new_tsbinfo->tsb_va;
13672 uint64_t va;
13673 int new_offset;
13674 int i;
13675 int vpshift;
13676 int last_prefetch;
13678 if (old_bytes == new_bytes) {
13679 bcopy(old_tsbinfo->tsb_va, new_tsbinfo->tsb_va, new_bytes);
13680 } else {
13683 * A TSBE is 16 bytes, which means there are four TSBE's per
13684 * P$ line (64 bytes), so we prefetch every 4 TSBE's.
13686 old = (struct tsbe *)old_tsbinfo->tsb_va;
13687 last_prefetch = old_entries - (4*(TSBE_PREFETCH_STRIDE+1));
13688 for (i = 0; i < old_entries; i++, old++) {
13689 if (((i & (4-1)) == 0) && (i < last_prefetch))
13690 prefetch_tsbe_read(old);
13691 if (!old->tte_tag.tag_invalid) {
13693 * We have a valid TTE to remap. Check the
13694 * size. We won't remap 64K or 512K TTEs
13695 * because they span more than one TSB entry
13696 * and are indexed using an 8K virt. page.
13697 * Ditto for 32M and 256M TTEs.
13699 if (TTE_CSZ(&old->tte_data) == TTE64K ||
13700 TTE_CSZ(&old->tte_data) == TTE512K)
13701 continue;
13702 if (mmu_page_sizes == max_mmu_page_sizes) {
13703 if (TTE_CSZ(&old->tte_data) == TTE32M ||
13704 TTE_CSZ(&old->tte_data) == TTE256M)
13705 continue;
13708 /* clear the lower 22 bits of the va */
13709 va = *(uint64_t *)old << 22;
13710 /* turn va into a virtual pfn */
13711 va >>= 22 - TSB_START_SIZE;
13713 * or in bits from the offset in the tsb
13714 * to get the real virtual pfn. These
13715 * correspond to bits [21:13] in the va
13717 vpshift =
13718 TTE_BSZS_SHIFT(TTE_CSZ(&old->tte_data)) &
13719 0x1ff;
13720 va |= (i << vpshift);
13721 va >>= vpshift;
13722 new_offset = va & (new_entries - 1);
13723 new = new_base + new_offset;
13724 prefetch_tsbe_write(new);
13725 *new = *old;
13732 * unused in sfmmu
13734 void
13735 hat_dump(void)
13740 * Called when a thread is exiting and we have switched to the kernel address
13741 * space. Perform the same VM initialization resume() uses when switching
13742 * processes.
13744 * Note that sfmmu_load_mmustate() is currently a no-op for kernel threads, but
13745 * we call it anyway in case the semantics change in the future.
13747 /*ARGSUSED*/
13748 void
13749 hat_thread_exit(kthread_t *thd)
13751 uint_t pgsz_cnum;
13752 uint_t pstate_save;
13754 ASSERT(thd->t_procp->p_as == &kas);
13756 pgsz_cnum = KCONTEXT;
13757 #ifdef sun4u
13758 pgsz_cnum |= (ksfmmup->sfmmu_cext << CTXREG_EXT_SHIFT);
13759 #endif
13762 * Note that sfmmu_load_mmustate() is currently a no-op for
13763 * kernel threads. We need to disable interrupts here,
13764 * simply because otherwise sfmmu_load_mmustate() would panic
13765 * if the caller does not disable interrupts.
13767 pstate_save = sfmmu_disable_intrs();
13769 /* Compatibility Note: hw takes care of MMU_SCONTEXT1 */
13770 sfmmu_setctx_sec(pgsz_cnum);
13771 sfmmu_load_mmustate(ksfmmup);
13772 sfmmu_enable_intrs(pstate_save);
13777 * SRD support
13779 #define SRD_HASH_FUNCTION(vp) (((((uintptr_t)(vp)) >> 4) ^ \
13780 (((uintptr_t)(vp)) >> 11)) & \
13781 srd_hashmask)
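/*
 * Worked example (hypothetical pointer value and mask): with
 * srd_hashmask == 0x3f (64 buckets) and an exec vnode at
 * 0x30012345678, SRD_HASH_FUNCTION(vp) is
 * ((0x30012345678 >> 4) ^ (0x30012345678 >> 11)) & 0x3f == 0x2d,
 * mixing the alignment-dominated low bits of the pointer with higher
 * bits to spread vnodes across the srd_buckets array.
 */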
13784 * Attach the process to the srd struct associated with the exec vnode
13785 * from which the process is started.
13787 void
13788 hat_join_srd(struct hat *sfmmup, vnode_t *evp)
13790 uint_t hash = SRD_HASH_FUNCTION(evp);
13791 sf_srd_t *srdp;
13792 sf_srd_t *newsrdp;
13794 ASSERT(sfmmup != ksfmmup);
13795 ASSERT(sfmmup->sfmmu_srdp == NULL);
13797 if (!shctx_on) {
13798 return;
13801 VN_HOLD(evp);
13803 if (srd_buckets[hash].srdb_srdp != NULL) {
13804 mutex_enter(&srd_buckets[hash].srdb_lock);
13805 for (srdp = srd_buckets[hash].srdb_srdp; srdp != NULL;
13806 srdp = srdp->srd_hash) {
13807 if (srdp->srd_evp == evp) {
13808 ASSERT(srdp->srd_refcnt >= 0);
13809 sfmmup->sfmmu_srdp = srdp;
13810 atomic_inc_32(
13811 (volatile uint_t *)&srdp->srd_refcnt);
13812 mutex_exit(&srd_buckets[hash].srdb_lock);
13813 return;
13816 mutex_exit(&srd_buckets[hash].srdb_lock);
13818 newsrdp = kmem_cache_alloc(srd_cache, KM_SLEEP);
13819 ASSERT(newsrdp->srd_next_ismrid == 0 && newsrdp->srd_next_hmerid == 0);
13821 newsrdp->srd_evp = evp;
13822 newsrdp->srd_refcnt = 1;
13823 newsrdp->srd_hmergnfree = NULL;
13824 newsrdp->srd_ismrgnfree = NULL;
13826 mutex_enter(&srd_buckets[hash].srdb_lock);
13827 for (srdp = srd_buckets[hash].srdb_srdp; srdp != NULL;
13828 srdp = srdp->srd_hash) {
13829 if (srdp->srd_evp == evp) {
13830 ASSERT(srdp->srd_refcnt >= 0);
13831 sfmmup->sfmmu_srdp = srdp;
13832 atomic_inc_32((volatile uint_t *)&srdp->srd_refcnt);
13833 mutex_exit(&srd_buckets[hash].srdb_lock);
13834 kmem_cache_free(srd_cache, newsrdp);
13835 return;
13838 newsrdp->srd_hash = srd_buckets[hash].srdb_srdp;
13839 srd_buckets[hash].srdb_srdp = newsrdp;
13840 sfmmup->sfmmu_srdp = newsrdp;
13842 mutex_exit(&srd_buckets[hash].srdb_lock);
13846 static void
13847 sfmmu_leave_srd(sfmmu_t *sfmmup)
13849 vnode_t *evp;
13850 sf_srd_t *srdp = sfmmup->sfmmu_srdp;
13851 uint_t hash;
13852 sf_srd_t **prev_srdpp;
13853 sf_region_t *rgnp;
13854 sf_region_t *nrgnp;
13855 #ifdef DEBUG
13856 int rgns = 0;
13857 #endif
13858 int i;
13860 ASSERT(sfmmup != ksfmmup);
13861 ASSERT(srdp != NULL);
13862 ASSERT(srdp->srd_refcnt > 0);
13863 ASSERT(sfmmup->sfmmu_scdp == NULL);
13864 ASSERT(sfmmup->sfmmu_free == 1);
13866 sfmmup->sfmmu_srdp = NULL;
13867 evp = srdp->srd_evp;
13868 ASSERT(evp != NULL);
13869 if (atomic_dec_32_nv((volatile uint_t *)&srdp->srd_refcnt)) {
13870 VN_RELE(evp);
13871 return;
13874 hash = SRD_HASH_FUNCTION(evp);
13875 mutex_enter(&srd_buckets[hash].srdb_lock);
13876 for (prev_srdpp = &srd_buckets[hash].srdb_srdp;
13877 (srdp = *prev_srdpp) != NULL; prev_srdpp = &srdp->srd_hash) {
13878 if (srdp->srd_evp == evp) {
13879 break;
13882 if (srdp == NULL || srdp->srd_refcnt) {
13883 mutex_exit(&srd_buckets[hash].srdb_lock);
13884 VN_RELE(evp);
13885 return;
13887 *prev_srdpp = srdp->srd_hash;
13888 mutex_exit(&srd_buckets[hash].srdb_lock);
13890 ASSERT(srdp->srd_refcnt == 0);
13891 VN_RELE(evp);
13893 #ifdef DEBUG
13894 for (i = 0; i < SFMMU_MAX_REGION_BUCKETS; i++) {
13895 ASSERT(srdp->srd_rgnhash[i] == NULL);
13897 #endif /* DEBUG */
13899 /* free each hme region in the srd */
13900 for (rgnp = srdp->srd_hmergnfree; rgnp != NULL; rgnp = nrgnp) {
13901 nrgnp = rgnp->rgn_next;
13902 ASSERT(rgnp->rgn_id < srdp->srd_next_hmerid);
13903 ASSERT(rgnp->rgn_refcnt == 0);
13904 ASSERT(rgnp->rgn_sfmmu_head == NULL);
13905 ASSERT(rgnp->rgn_flags & SFMMU_REGION_FREE);
13906 ASSERT(rgnp->rgn_hmeflags == 0);
13907 ASSERT(srdp->srd_hmergnp[rgnp->rgn_id] == rgnp);
13908 #ifdef DEBUG
13909 for (i = 0; i < MMU_PAGE_SIZES; i++) {
13910 ASSERT(rgnp->rgn_ttecnt[i] == 0);
13912 rgns++;
13913 #endif /* DEBUG */
13914 kmem_cache_free(region_cache, rgnp);
13916 ASSERT(rgns == srdp->srd_next_hmerid);
13918 #ifdef DEBUG
13919 rgns = 0;
13920 #endif
13921 /* free each ism region in the srd */
13922 for (rgnp = srdp->srd_ismrgnfree; rgnp != NULL; rgnp = nrgnp) {
13923 nrgnp = rgnp->rgn_next;
13924 ASSERT(rgnp->rgn_id < srdp->srd_next_ismrid);
13925 ASSERT(rgnp->rgn_refcnt == 0);
13926 ASSERT(rgnp->rgn_sfmmu_head == NULL);
13927 ASSERT(rgnp->rgn_flags & SFMMU_REGION_FREE);
13928 ASSERT(srdp->srd_ismrgnp[rgnp->rgn_id] == rgnp);
13929 #ifdef DEBUG
13930 for (i = 0; i < MMU_PAGE_SIZES; i++) {
13931 ASSERT(rgnp->rgn_ttecnt[i] == 0);
13933 rgns++;
13934 #endif /* DEBUG */
13935 kmem_cache_free(region_cache, rgnp);
13937 ASSERT(rgns == srdp->srd_next_ismrid);
13938 ASSERT(srdp->srd_ismbusyrgns == 0);
13939 ASSERT(srdp->srd_hmebusyrgns == 0);
13941 srdp->srd_next_ismrid = 0;
13942 srdp->srd_next_hmerid = 0;
13944 bzero((void *)srdp->srd_ismrgnp,
13945 sizeof (sf_region_t *) * SFMMU_MAX_ISM_REGIONS);
13946 bzero((void *)srdp->srd_hmergnp,
13947 sizeof (sf_region_t *) * SFMMU_MAX_HME_REGIONS);
13949 ASSERT(srdp->srd_scdp == NULL);
13950 kmem_cache_free(srd_cache, srdp);
13953 /* ARGSUSED */
13954 static int
13955 sfmmu_srdcache_constructor(void *buf, void *cdrarg, int kmflags)
13957 sf_srd_t *srdp = (sf_srd_t *)buf;
13958 bzero(buf, sizeof (*srdp));
13960 mutex_init(&srdp->srd_mutex, NULL, MUTEX_DEFAULT, NULL);
13961 mutex_init(&srdp->srd_scd_mutex, NULL, MUTEX_DEFAULT, NULL);
13962 return (0);
13965 /* ARGSUSED */
13966 static void
13967 sfmmu_srdcache_destructor(void *buf, void *cdrarg)
13969 sf_srd_t *srdp = (sf_srd_t *)buf;
13971 mutex_destroy(&srdp->srd_mutex);
13972 mutex_destroy(&srdp->srd_scd_mutex);
13976 * The caller makes sure hat_join_region()/hat_leave_region() can't be called
13977 * at the same time for the same process and address range. This is ensured by
13978 * the fact that address space is locked as writer when a process joins the
13979 * regions. Therefore there's no need to hold an srd lock during the entire
13980 * execution of hat_join_region()/hat_leave_region().
13983 #define RGN_HASH_FUNCTION(obj) (((((uintptr_t)(obj)) >> 4) ^ \
13984 (((uintptr_t)(obj)) >> 11)) & \
13985 srd_rgn_hashmask)
13987 * This routine implements the shared context functionality required when
13988 * attaching a segment to an address space. It must be called from
13989 * hat_share() for D(ISM) segments and from segvn_create() for segments
13990 * with the MAP_PRIVATE and MAP_TEXT flags set. It returns a region_cookie
13991 * which is saved in the private segment data for hme segments and
13992 * the ism_map structure for ism segments.
13994 hat_region_cookie_t
13995 hat_join_region(struct hat *sfmmup,
13996 caddr_t r_saddr,
13997 size_t r_size,
13998 void *r_obj,
13999 u_offset_t r_objoff,
14000 uchar_t r_perm,
14001 uchar_t r_pgszc,
14002 hat_rgn_cb_func_t r_cb_function,
14003 uint_t flags)
14005 sf_srd_t *srdp = sfmmup->sfmmu_srdp;
14006 uint_t rhash;
14007 uint_t rid;
14008 hatlock_t *hatlockp;
14009 sf_region_t *rgnp;
14010 sf_region_t *new_rgnp = NULL;
14011 int i;
14012 uint16_t *nextidp;
14013 sf_region_t **freelistp;
14014 int maxids;
14015 sf_region_t **rarrp;
14016 uint16_t *busyrgnsp;
14017 ulong_t rttecnt;
14018 uchar_t tteflag;
14019 uchar_t r_type = flags & HAT_REGION_TYPE_MASK;
14020 int text = (r_type == HAT_REGION_TEXT);
14022 if (srdp == NULL || r_size == 0) {
14023 return (HAT_INVALID_REGION_COOKIE);
14026 ASSERT(sfmmup->sfmmu_xhat_provider == NULL);
14027 ASSERT(sfmmup != ksfmmup);
14028 ASSERT(AS_WRITE_HELD(sfmmup->sfmmu_as));
14029 ASSERT(srdp->srd_refcnt > 0);
14030 ASSERT(!(flags & ~HAT_REGION_TYPE_MASK));
14031 ASSERT(flags == HAT_REGION_TEXT || flags == HAT_REGION_ISM);
14032 ASSERT(r_pgszc < mmu_page_sizes);
14033 if (!IS_P2ALIGNED(r_saddr, TTEBYTES(r_pgszc)) ||
14034 !IS_P2ALIGNED(r_size, TTEBYTES(r_pgszc))) {
14035 panic("hat_join_region: region addr or size is not aligned\n");
14039 r_type = (r_type == HAT_REGION_ISM) ? SFMMU_REGION_ISM :
14040 SFMMU_REGION_HME;
14042 * Currently we only support shared hmes for the read-only main text
14043 * region.
14045 if (r_type == SFMMU_REGION_HME && ((r_obj != srdp->srd_evp) ||
14046 (r_perm & PROT_WRITE))) {
14047 return (HAT_INVALID_REGION_COOKIE);
14050 rhash = RGN_HASH_FUNCTION(r_obj);
14052 if (r_type == SFMMU_REGION_ISM) {
14053 nextidp = &srdp->srd_next_ismrid;
14054 freelistp = &srdp->srd_ismrgnfree;
14055 maxids = SFMMU_MAX_ISM_REGIONS;
14056 rarrp = srdp->srd_ismrgnp;
14057 busyrgnsp = &srdp->srd_ismbusyrgns;
14058 } else {
14059 nextidp = &srdp->srd_next_hmerid;
14060 freelistp = &srdp->srd_hmergnfree;
14061 maxids = SFMMU_MAX_HME_REGIONS;
14062 rarrp = srdp->srd_hmergnp;
14063 busyrgnsp = &srdp->srd_hmebusyrgns;
14066 mutex_enter(&srdp->srd_mutex);
14068 for (rgnp = srdp->srd_rgnhash[rhash]; rgnp != NULL;
14069 rgnp = rgnp->rgn_hash) {
14070 if (rgnp->rgn_saddr == r_saddr && rgnp->rgn_size == r_size &&
14071 rgnp->rgn_obj == r_obj && rgnp->rgn_objoff == r_objoff &&
14072 rgnp->rgn_perm == r_perm && rgnp->rgn_pgszc == r_pgszc) {
14073 break;
14077 rfound:
14078 if (rgnp != NULL) {
14079 ASSERT((rgnp->rgn_flags & SFMMU_REGION_TYPE_MASK) == r_type);
14080 ASSERT(rgnp->rgn_cb_function == r_cb_function);
14081 ASSERT(rgnp->rgn_refcnt >= 0);
14082 rid = rgnp->rgn_id;
14083 ASSERT(rid < maxids);
14084 ASSERT(rarrp[rid] == rgnp);
14085 ASSERT(rid < *nextidp);
14086 atomic_inc_32((volatile uint_t *)&rgnp->rgn_refcnt);
14087 mutex_exit(&srdp->srd_mutex);
14088 if (new_rgnp != NULL) {
14089 kmem_cache_free(region_cache, new_rgnp);
14091 if (r_type == SFMMU_REGION_HME) {
14092 int myjoin =
14093 (sfmmup == astosfmmu(curthread->t_procp->p_as));
14095 sfmmu_link_to_hmeregion(sfmmup, rgnp);
14097 * The bitmap should be updated after linking the sfmmu on the
14098 * region list so that pageunload() doesn't skip the
14099 * TSB/TLB flush. As soon as the bitmap is updated, another
14100 * thread in this process can start accessing
14101 * this region.
14104 * Normally ttecnt accounting is done as part of
14105 * pagefault handling. But a process may not take any
14106 * pagefaults on shared hmeblks created by some other
14107 * process. To compensate for this assume that the
14108 * entire region will end up faulted in using
14109 * the region's pagesize.
14112 if (r_pgszc > TTE8K) {
14113 tteflag = 1 << r_pgszc;
14114 if (disable_large_pages & tteflag) {
14115 tteflag = 0;
14117 } else {
14118 tteflag = 0;
14120 if (tteflag && !(sfmmup->sfmmu_rtteflags & tteflag)) {
14121 hatlockp = sfmmu_hat_enter(sfmmup);
14122 sfmmup->sfmmu_rtteflags |= tteflag;
14123 sfmmu_hat_exit(hatlockp);
14125 hatlockp = sfmmu_hat_enter(sfmmup);
14128 * Preallocate 1/4 of ttecnt's in 8K TSB for >= 4M
14129 * region to allow for large page allocation failure.
14131 if (r_pgszc >= TTE4M) {
14132 sfmmup->sfmmu_tsb0_4minflcnt +=
14133 r_size >> (TTE_PAGE_SHIFT(TTE8K) + 2);
14136 /* update sfmmu_ttecnt with the shme rgn ttecnt */
14137 rttecnt = r_size >> TTE_PAGE_SHIFT(r_pgszc);
14138 atomic_add_long(&sfmmup->sfmmu_ttecnt[r_pgszc],
14139 rttecnt);
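/*
 * Worked example (hypothetical sizes): joining a 256M text region
 * with r_pgszc == TTE4M adds 256M >> 22 == 64 entries to
 * sfmmu_ttecnt[TTE4M] here, while the preallocation above already
 * charged 256M >> 15 == 8192 8K entries to sfmmu_tsb0_4minflcnt in
 * case the large-page mappings cannot be used.
 */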
14141 if (text && r_pgszc >= TTE4M &&
14142 (tteflag || ((disable_large_pages >> TTE4M) &
14143 ((1 << (r_pgszc - TTE4M + 1)) - 1))) &&
14144 !SFMMU_FLAGS_ISSET(sfmmup, HAT_4MTEXT_FLAG)) {
14145 SFMMU_FLAGS_SET(sfmmup, HAT_4MTEXT_FLAG);
14148 sfmmu_hat_exit(hatlockp);
14150 * On Panther we need to make sure TLB is programmed
14151 * to accept 32M/256M pages. Call
14152 * sfmmu_check_page_sizes() now to make sure TLB is
14153 * setup before making hmeregions visible to other
14154 * threads.
14156 sfmmu_check_page_sizes(sfmmup, 1);
14157 hatlockp = sfmmu_hat_enter(sfmmup);
14158 SF_RGNMAP_ADD(sfmmup->sfmmu_hmeregion_map, rid);
14161 * If the context is invalid, the tsb miss exception code
14162 * will call sfmmu_check_page_sizes() and update the tsbmiss
14163 * area later.
14165 kpreempt_disable();
14166 if (myjoin &&
14167 (sfmmup->sfmmu_ctxs[CPU_MMU_IDX(CPU)].cnum
14168 != INVALID_CONTEXT)) {
14169 struct tsbmiss *tsbmp;
14171 tsbmp = &tsbmiss_area[CPU->cpu_id];
14172 ASSERT(sfmmup == tsbmp->usfmmup);
14173 BT_SET(tsbmp->shmermap, rid);
14174 if (r_pgszc > TTE64K) {
14175 tsbmp->uhat_rtteflags |= tteflag;
14179 kpreempt_enable();
14181 sfmmu_hat_exit(hatlockp);
14182 ASSERT((hat_region_cookie_t)((uint64_t)rid) !=
14183 HAT_INVALID_REGION_COOKIE);
14184 } else {
14185 hatlockp = sfmmu_hat_enter(sfmmup);
14186 SF_RGNMAP_ADD(sfmmup->sfmmu_ismregion_map, rid);
14187 sfmmu_hat_exit(hatlockp);
14189 ASSERT(rid < maxids);
14191 if (r_type == SFMMU_REGION_ISM) {
14192 sfmmu_find_scd(sfmmup);
14194 return ((hat_region_cookie_t)((uint64_t)rid));
14197 ASSERT(new_rgnp == NULL);
14199 if (*busyrgnsp >= maxids) {
14200 mutex_exit(&srdp->srd_mutex);
14201 return (HAT_INVALID_REGION_COOKIE);
14204 ASSERT(MUTEX_HELD(&srdp->srd_mutex));
14205 if (*freelistp != NULL) {
14206 rgnp = *freelistp;
14207 *freelistp = rgnp->rgn_next;
14208 ASSERT(rgnp->rgn_id < *nextidp);
14209 ASSERT(rgnp->rgn_id < maxids);
14210 ASSERT(rgnp->rgn_flags & SFMMU_REGION_FREE);
14211 ASSERT((rgnp->rgn_flags & SFMMU_REGION_TYPE_MASK)
14212 == r_type);
14213 ASSERT(rarrp[rgnp->rgn_id] == rgnp);
14214 ASSERT(rgnp->rgn_hmeflags == 0);
14215 } else {
14217 * release local locks before memory allocation.
14219 mutex_exit(&srdp->srd_mutex);
14221 new_rgnp = kmem_cache_alloc(region_cache, KM_SLEEP);
14223 mutex_enter(&srdp->srd_mutex);
14224 for (rgnp = srdp->srd_rgnhash[rhash]; rgnp != NULL;
14225 rgnp = rgnp->rgn_hash) {
14226 if (rgnp->rgn_saddr == r_saddr &&
14227 rgnp->rgn_size == r_size &&
14228 rgnp->rgn_obj == r_obj &&
14229 rgnp->rgn_objoff == r_objoff &&
14230 rgnp->rgn_perm == r_perm &&
14231 rgnp->rgn_pgszc == r_pgszc) {
14232 break;
14235 if (rgnp != NULL) {
14236 goto rfound;
14239 if (*nextidp >= maxids) {
14240 mutex_exit(&srdp->srd_mutex);
14241 goto fail;
14243 rgnp = new_rgnp;
14244 new_rgnp = NULL;
14245 rgnp->rgn_id = (*nextidp)++;
14246 ASSERT(rgnp->rgn_id < maxids);
14247 ASSERT(rarrp[rgnp->rgn_id] == NULL);
14248 rarrp[rgnp->rgn_id] = rgnp;
14251 ASSERT(rgnp->rgn_sfmmu_head == NULL);
14252 ASSERT(rgnp->rgn_hmeflags == 0);
14253 #ifdef DEBUG
14254 for (i = 0; i < MMU_PAGE_SIZES; i++) {
14255 ASSERT(rgnp->rgn_ttecnt[i] == 0);
14257 #endif
14258 rgnp->rgn_saddr = r_saddr;
14259 rgnp->rgn_size = r_size;
14260 rgnp->rgn_obj = r_obj;
14261 rgnp->rgn_objoff = r_objoff;
14262 rgnp->rgn_perm = r_perm;
14263 rgnp->rgn_pgszc = r_pgszc;
14264 rgnp->rgn_flags = r_type;
14265 rgnp->rgn_refcnt = 0;
14266 rgnp->rgn_cb_function = r_cb_function;
14267 rgnp->rgn_hash = srdp->srd_rgnhash[rhash];
14268 srdp->srd_rgnhash[rhash] = rgnp;
14269 (*busyrgnsp)++;
14270 ASSERT(*busyrgnsp <= maxids);
14271 goto rfound;
14273 fail:
14274 ASSERT(new_rgnp != NULL);
14275 kmem_cache_free(region_cache, new_rgnp);
14276 return (HAT_INVALID_REGION_COOKIE);
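#if 0	/* illustrative sketch -- not part of the build */
/*
 * A minimal, self-contained sketch of the lookup-or-allocate pattern used
 * by hat_join_region() above: search the hash chain under the srd mutex,
 * and on a miss drop the lock around the (possibly sleeping) allocation,
 * retake it and search again before installing the new entry.  The ex_*
 * names and hash parameters are hypothetical simplifications, not illumos
 * interfaces; the real code also maintains a per-type free list, the
 * region id array and the busy-region limit.
 */
#define	EX_RGN_NHASH	64
#define	EX_RGN_HASH(va)	(((uintptr_t)(va) >> 13) & (EX_RGN_NHASH - 1))

typedef struct ex_region {
	struct ex_region *rgn_hash;	/* hash chain link */
	caddr_t rgn_saddr;
	size_t rgn_size;
	volatile uint32_t rgn_refcnt;
} ex_region_t;

typedef struct ex_srd {
	kmutex_t srd_mutex;
	ex_region_t *srd_rgnhash[EX_RGN_NHASH];
} ex_srd_t;

static ex_region_t *
ex_join_region(ex_srd_t *srdp, caddr_t saddr, size_t size)
{
	uint_t h = EX_RGN_HASH(saddr);
	ex_region_t *rgnp, *newp = NULL;

	mutex_enter(&srdp->srd_mutex);
again:
	for (rgnp = srdp->srd_rgnhash[h]; rgnp != NULL;
	    rgnp = rgnp->rgn_hash) {
		if (rgnp->rgn_saddr == saddr && rgnp->rgn_size == size)
			break;
	}
	if (rgnp != NULL) {
		atomic_inc_32(&rgnp->rgn_refcnt);	/* reuse existing region */
		mutex_exit(&srdp->srd_mutex);
		if (newp != NULL)			/* lost the race; discard */
			kmem_free(newp, sizeof (ex_region_t));
		return (rgnp);
	}
	if (newp == NULL) {
		/* drop the lock across the sleeping allocation */
		mutex_exit(&srdp->srd_mutex);
		newp = kmem_zalloc(sizeof (ex_region_t), KM_SLEEP);
		mutex_enter(&srdp->srd_mutex);
		goto again;		/* another thread may have installed it */
	}
	newp->rgn_saddr = saddr;
	newp->rgn_size = size;
	newp->rgn_refcnt = 1;
	newp->rgn_hash = srdp->srd_rgnhash[h];	/* insert at the chain head */
	srdp->srd_rgnhash[h] = newp;
	mutex_exit(&srdp->srd_mutex);
	return (newp);
}
#endif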
14280 * This function implements the shared context functionality required
14281 * when detaching a segment from an address space. It must be called
14282 * from hat_unshare() for all D(ISM) segments and from segvn_unmap(),
14283 * for segments with a valid region_cookie.
14284 * It will also be called from all seg_vn routines which change a
14285 * segment's attributes such as segvn_setprot(), segvn_setpagesize(),
14286 * segvn_clrszc() & segvn_advise(), as well as in the case of COW fault
14287 * from segvn_fault().
14289 void
14290 hat_leave_region(struct hat *sfmmup, hat_region_cookie_t rcookie, uint_t flags)
14292 sf_srd_t *srdp = sfmmup->sfmmu_srdp;
14293 sf_scd_t *scdp;
14294 uint_t rhash;
14295 uint_t rid = (uint_t)((uint64_t)rcookie);
14296 hatlock_t *hatlockp = NULL;
14297 sf_region_t *rgnp;
14298 sf_region_t **prev_rgnpp;
14299 sf_region_t *cur_rgnp;
14300 void *r_obj;
14301 int i;
14302 caddr_t r_saddr;
14303 caddr_t r_eaddr;
14304 size_t r_size;
14305 uchar_t r_pgszc;
14306 uchar_t r_type = flags & HAT_REGION_TYPE_MASK;
14308 ASSERT(sfmmup != ksfmmup);
14309 ASSERT(srdp != NULL);
14310 ASSERT(srdp->srd_refcnt > 0);
14311 ASSERT(!(flags & ~HAT_REGION_TYPE_MASK));
14312 ASSERT(flags == HAT_REGION_TEXT || flags == HAT_REGION_ISM);
14313 ASSERT(!sfmmup->sfmmu_free || sfmmup->sfmmu_scdp == NULL);
14315 r_type = (r_type == HAT_REGION_ISM) ? SFMMU_REGION_ISM :
14316 SFMMU_REGION_HME;
14318 if (r_type == SFMMU_REGION_ISM) {
14319 ASSERT(SFMMU_IS_ISMRID_VALID(rid));
14320 ASSERT(rid < SFMMU_MAX_ISM_REGIONS);
14321 rgnp = srdp->srd_ismrgnp[rid];
14322 } else {
14323 ASSERT(SFMMU_IS_SHMERID_VALID(rid));
14324 ASSERT(rid < SFMMU_MAX_HME_REGIONS);
14325 rgnp = srdp->srd_hmergnp[rid];
14327 ASSERT(rgnp != NULL);
14328 ASSERT(rgnp->rgn_id == rid);
14329 ASSERT((rgnp->rgn_flags & SFMMU_REGION_TYPE_MASK) == r_type);
14330 ASSERT(!(rgnp->rgn_flags & SFMMU_REGION_FREE));
14331 ASSERT(AS_LOCK_HELD(sfmmup->sfmmu_as));
14333 ASSERT(sfmmup->sfmmu_xhat_provider == NULL);
14334 if (r_type == SFMMU_REGION_HME && sfmmup->sfmmu_as->a_xhat != NULL) {
14335 xhat_unload_callback_all(sfmmup->sfmmu_as, rgnp->rgn_saddr,
14336 rgnp->rgn_size, 0, NULL);
14339 if (sfmmup->sfmmu_free) {
14340 ulong_t rttecnt;
14341 r_pgszc = rgnp->rgn_pgszc;
14342 r_size = rgnp->rgn_size;
14344 ASSERT(sfmmup->sfmmu_scdp == NULL);
14345 if (r_type == SFMMU_REGION_ISM) {
14346 SF_RGNMAP_DEL(sfmmup->sfmmu_ismregion_map, rid);
14347 } else {
14348 /* update shme rgns ttecnt in sfmmu_ttecnt */
14349 rttecnt = r_size >> TTE_PAGE_SHIFT(r_pgszc);
14350 ASSERT(sfmmup->sfmmu_ttecnt[r_pgszc] >= rttecnt);
14352 atomic_add_long(&sfmmup->sfmmu_ttecnt[r_pgszc],
14353 -rttecnt);
14355 SF_RGNMAP_DEL(sfmmup->sfmmu_hmeregion_map, rid);
14357 } else if (r_type == SFMMU_REGION_ISM) {
14358 hatlockp = sfmmu_hat_enter(sfmmup);
14359 ASSERT(rid < srdp->srd_next_ismrid);
14360 SF_RGNMAP_DEL(sfmmup->sfmmu_ismregion_map, rid);
14361 scdp = sfmmup->sfmmu_scdp;
14362 if (scdp != NULL &&
14363 SF_RGNMAP_TEST(scdp->scd_ismregion_map, rid)) {
14364 sfmmu_leave_scd(sfmmup, r_type);
14365 ASSERT(sfmmu_hat_lock_held(sfmmup));
14367 sfmmu_hat_exit(hatlockp);
14368 } else {
14369 ulong_t rttecnt;
14370 r_pgszc = rgnp->rgn_pgszc;
14371 r_saddr = rgnp->rgn_saddr;
14372 r_size = rgnp->rgn_size;
14373 r_eaddr = r_saddr + r_size;
14375 ASSERT(r_type == SFMMU_REGION_HME);
14376 hatlockp = sfmmu_hat_enter(sfmmup);
14377 ASSERT(rid < srdp->srd_next_hmerid);
14378 SF_RGNMAP_DEL(sfmmup->sfmmu_hmeregion_map, rid);
14381 * If the region is part of an SCD, call sfmmu_leave_scd().
14382 * Otherwise, if the process is not exiting and has a valid context,
14383 * just drop the context on the floor to lose stale TLB
14384 * entries and force the update of the tsb miss area to reflect
14385 * the new region map. After that, clean our TSB entries.
14387 scdp = sfmmup->sfmmu_scdp;
14388 if (scdp != NULL &&
14389 SF_RGNMAP_TEST(scdp->scd_hmeregion_map, rid)) {
14390 sfmmu_leave_scd(sfmmup, r_type);
14391 ASSERT(sfmmu_hat_lock_held(sfmmup));
14393 sfmmu_invalidate_ctx(sfmmup);
14395 i = TTE8K;
14396 while (i < mmu_page_sizes) {
14397 if (rgnp->rgn_ttecnt[i] != 0) {
14398 sfmmu_unload_tsb_range(sfmmup, r_saddr,
14399 r_eaddr, i);
14400 if (i < TTE4M) {
14401 i = TTE4M;
14402 continue;
14403 } else {
14404 break;
14407 i++;
14409 /* Remove the preallocated 1/4 8k ttecnt for 4M regions. */
14410 if (r_pgszc >= TTE4M) {
14411 rttecnt = r_size >> (TTE_PAGE_SHIFT(TTE8K) + 2);
14412 ASSERT(sfmmup->sfmmu_tsb0_4minflcnt >=
14413 rttecnt);
14414 sfmmup->sfmmu_tsb0_4minflcnt -= rttecnt;
14417 /* update shme rgns ttecnt in sfmmu_ttecnt */
14418 rttecnt = r_size >> TTE_PAGE_SHIFT(r_pgszc);
14419 ASSERT(sfmmup->sfmmu_ttecnt[r_pgszc] >= rttecnt);
14420 atomic_add_long(&sfmmup->sfmmu_ttecnt[r_pgszc], -rttecnt);
14422 sfmmu_hat_exit(hatlockp);
14423 if (scdp != NULL && sfmmup->sfmmu_scdp == NULL) {
14424 /* sfmmup left the scd, grow private tsb */
14425 sfmmu_check_page_sizes(sfmmup, 1);
14426 } else {
14427 sfmmu_check_page_sizes(sfmmup, 0);
14431 if (r_type == SFMMU_REGION_HME) {
14432 sfmmu_unlink_from_hmeregion(sfmmup, rgnp);
14435 r_obj = rgnp->rgn_obj;
14436 if (atomic_dec_32_nv((volatile uint_t *)&rgnp->rgn_refcnt)) {
14437 return;
14441 * Looks like nobody uses this region anymore; free it.
14443 rhash = RGN_HASH_FUNCTION(r_obj);
14444 mutex_enter(&srdp->srd_mutex);
14445 for (prev_rgnpp = &srdp->srd_rgnhash[rhash];
14446 (cur_rgnp = *prev_rgnpp) != NULL;
14447 prev_rgnpp = &cur_rgnp->rgn_hash) {
14448 if (cur_rgnp == rgnp && cur_rgnp->rgn_refcnt == 0) {
14449 break;
14453 if (cur_rgnp == NULL) {
14454 mutex_exit(&srdp->srd_mutex);
14455 return;
14458 ASSERT((rgnp->rgn_flags & SFMMU_REGION_TYPE_MASK) == r_type);
14459 *prev_rgnpp = rgnp->rgn_hash;
14460 if (r_type == SFMMU_REGION_ISM) {
14461 rgnp->rgn_flags |= SFMMU_REGION_FREE;
14462 ASSERT(rid < srdp->srd_next_ismrid);
14463 rgnp->rgn_next = srdp->srd_ismrgnfree;
14464 srdp->srd_ismrgnfree = rgnp;
14465 ASSERT(srdp->srd_ismbusyrgns > 0);
14466 srdp->srd_ismbusyrgns--;
14467 mutex_exit(&srdp->srd_mutex);
14468 return;
14470 mutex_exit(&srdp->srd_mutex);
14473 * Destroy region's hmeblks.
14475 sfmmu_unload_hmeregion(srdp, rgnp);
14477 rgnp->rgn_hmeflags = 0;
14479 ASSERT(rgnp->rgn_sfmmu_head == NULL);
14480 ASSERT(rgnp->rgn_id == rid);
14481 for (i = 0; i < MMU_PAGE_SIZES; i++) {
14482 rgnp->rgn_ttecnt[i] = 0;
14484 rgnp->rgn_flags |= SFMMU_REGION_FREE;
14485 mutex_enter(&srdp->srd_mutex);
14486 ASSERT(rid < srdp->srd_next_hmerid);
14487 rgnp->rgn_next = srdp->srd_hmergnfree;
14488 srdp->srd_hmergnfree = rgnp;
14489 ASSERT(srdp->srd_hmebusyrgns > 0);
14490 srdp->srd_hmebusyrgns--;
14491 mutex_exit(&srdp->srd_mutex);
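#if 0	/* illustrative sketch -- not part of the build */
/*
 * A sketch of the release path in hat_leave_region() above, reusing the
 * hypothetical ex_region_t/ex_srd_t types from the sketch that follows
 * hat_join_region(): drop the reference count and, only if it reaches zero
 * and the region is still dead once the srd mutex is held, unhash it and
 * push it onto a free list for reuse.  The real code additionally
 * distinguishes ISM from HME regions and unloads the region's hmeblks
 * before recycling it.
 */
static void
ex_leave_region(ex_srd_t *srdp, ex_region_t *rgnp, ex_region_t **freelistp)
{
	uint_t h = EX_RGN_HASH(rgnp->rgn_saddr);
	ex_region_t **prevpp, *curp;

	if (atomic_dec_32_nv(&rgnp->rgn_refcnt) != 0)
		return;			/* still in use by other processes */

	mutex_enter(&srdp->srd_mutex);
	for (prevpp = &srdp->srd_rgnhash[h]; (curp = *prevpp) != NULL;
	    prevpp = &curp->rgn_hash) {
		if (curp == rgnp && curp->rgn_refcnt == 0)
			break;
	}
	if (curp == NULL) {		/* revived by a racing join */
		mutex_exit(&srdp->srd_mutex);
		return;
	}
	*prevpp = rgnp->rgn_hash;	/* unlink from the hash chain */
	rgnp->rgn_hash = *freelistp;	/* recycle via the free list */
	*freelistp = rgnp;
	mutex_exit(&srdp->srd_mutex);
}
#endif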
14495 * For now only called for hmeblk regions and not for ISM regions.
14497 void
14498 hat_dup_region(struct hat *sfmmup, hat_region_cookie_t rcookie)
14500 sf_srd_t *srdp = sfmmup->sfmmu_srdp;
14501 uint_t rid = (uint_t)((uint64_t)rcookie);
14502 sf_region_t *rgnp;
14503 sf_rgn_link_t *rlink;
14504 sf_rgn_link_t *hrlink;
14505 ulong_t rttecnt;
14507 ASSERT(sfmmup != ksfmmup);
14508 ASSERT(srdp != NULL);
14509 ASSERT(srdp->srd_refcnt > 0);
14511 ASSERT(rid < srdp->srd_next_hmerid);
14512 ASSERT(SFMMU_IS_SHMERID_VALID(rid));
14513 ASSERT(rid < SFMMU_MAX_HME_REGIONS);
14515 rgnp = srdp->srd_hmergnp[rid];
14516 ASSERT(rgnp->rgn_refcnt > 0);
14517 ASSERT(rgnp->rgn_id == rid);
14518 ASSERT((rgnp->rgn_flags & SFMMU_REGION_TYPE_MASK) == SFMMU_REGION_HME);
14519 ASSERT(!(rgnp->rgn_flags & SFMMU_REGION_FREE));
14521 atomic_inc_32((volatile uint_t *)&rgnp->rgn_refcnt);
14523 /* LINTED: constant in conditional context */
14524 SFMMU_HMERID2RLINKP(sfmmup, rid, rlink, 1, 0);
14525 ASSERT(rlink != NULL);
14526 mutex_enter(&rgnp->rgn_mutex);
14527 ASSERT(rgnp->rgn_sfmmu_head != NULL);
14528 /* LINTED: constant in conditional context */
14529 SFMMU_HMERID2RLINKP(rgnp->rgn_sfmmu_head, rid, hrlink, 0, 0);
14530 ASSERT(hrlink != NULL);
14531 ASSERT(hrlink->prev == NULL);
14532 rlink->next = rgnp->rgn_sfmmu_head;
14533 rlink->prev = NULL;
14534 hrlink->prev = sfmmup;
14536 * make sure rlink's next field is correct
14537 * before making this link visible.
14539 membar_stst();
14540 rgnp->rgn_sfmmu_head = sfmmup;
14541 mutex_exit(&rgnp->rgn_mutex);
14543 /* update sfmmu_ttecnt with the shme rgn ttecnt */
14544 rttecnt = rgnp->rgn_size >> TTE_PAGE_SHIFT(rgnp->rgn_pgszc);
14545 atomic_add_long(&sfmmup->sfmmu_ttecnt[rgnp->rgn_pgszc], rttecnt);
14546 /* update tsb0 inflation count */
14547 if (rgnp->rgn_pgszc >= TTE4M) {
14548 sfmmup->sfmmu_tsb0_4minflcnt +=
14549 rgnp->rgn_size >> (TTE_PAGE_SHIFT(TTE8K) + 2);
14552 * Update regionid bitmask without hat lock since no other thread
14553 * can update this region bitmask right now.
14555 SF_RGNMAP_ADD(sfmmup->sfmmu_hmeregion_map, rid);
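#if 0	/* illustrative sketch -- not part of the build */
/*
 * A worked example of the ttecnt arithmetic performed by hat_join_region()
 * and hat_dup_region() above for a large-page region.  The region's TTEs
 * are accounted at the region's page size, and the 8K TSB is additionally
 * inflated by a quarter of the region's 8K page count so it can absorb
 * large-page allocation failures.  The function is purely illustrative;
 * the shifts are the standard sun4u values (8K -> 13, 4M -> 22).
 */
static void
ex_rgn_ttecnt_example(void)
{
	size_t r_size = 32 * 1024 * 1024;	/* a 32MB region of 4M pages */
	ulong_t tte4m_cnt, tte8k_inflation;

	/* 32MB / 4MB = 8 TTEs accounted in sfmmu_ttecnt[TTE4M] */
	tte4m_cnt = r_size >> TTE_PAGE_SHIFT(TTE4M);

	/*
	 * 32MB / 8KB = 4096 8K pages; a quarter of that (1024) is added to
	 * sfmmu_tsb0_4minflcnt so the 8K TSB can still map the region if
	 * the kernel has to fall back to 8K pages.
	 */
	tte8k_inflation = r_size >> (TTE_PAGE_SHIFT(TTE8K) + 2);

	ASSERT(tte4m_cnt == 8);
	ASSERT(tte8k_inflation == 1024);
}
#endif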
14558 /* ARGSUSED */
14559 static int
14560 sfmmu_rgncache_constructor(void *buf, void *cdrarg, int kmflags)
14562 sf_region_t *rgnp = (sf_region_t *)buf;
14563 bzero(buf, sizeof (*rgnp));
14565 mutex_init(&rgnp->rgn_mutex, NULL, MUTEX_DEFAULT, NULL);
14567 return (0);
14570 /* ARGSUSED */
14571 static void
14572 sfmmu_rgncache_destructor(void *buf, void *cdrarg)
14574 sf_region_t *rgnp = (sf_region_t *)buf;
14575 mutex_destroy(&rgnp->rgn_mutex);
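#if 0	/* illustrative sketch -- not part of the build */
/*
 * A sketch of how constructor/destructor pairs such as
 * sfmmu_rgncache_constructor()/sfmmu_rgncache_destructor() above are
 * registered with the kernel object cache allocator.  The "ex_" cache and
 * its name are hypothetical; the real region_cache used by
 * hat_join_region() is created during hat initialization.
 */
static kmem_cache_t *ex_region_cache;

static void
ex_region_cache_init(void)
{
	ex_region_cache = kmem_cache_create("ex_region_cache",
	    sizeof (sf_region_t), 0,
	    sfmmu_rgncache_constructor, sfmmu_rgncache_destructor,
	    NULL, NULL, NULL, 0);
}
#endif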
14578 static int
14579 sfrgnmap_isnull(sf_region_map_t *map)
14581 int i;
14583 for (i = 0; i < SFMMU_RGNMAP_WORDS; i++) {
14584 if (map->bitmap[i] != 0) {
14585 return (0);
14588 return (1);
14591 static int
14592 sfhmergnmap_isnull(sf_hmeregion_map_t *map)
14594 int i;
14596 for (i = 0; i < SFMMU_HMERGNMAP_WORDS; i++) {
14597 if (map->bitmap[i] != 0) {
14598 return (0);
14601 return (1);
14604 #ifdef DEBUG
14605 static void
14606 check_scd_sfmmu_list(sfmmu_t **headp, sfmmu_t *sfmmup, int onlist)
14608 sfmmu_t *sp;
14609 sf_srd_t *srdp = sfmmup->sfmmu_srdp;
14611 for (sp = *headp; sp != NULL; sp = sp->sfmmu_scd_link.next) {
14612 ASSERT(srdp == sp->sfmmu_srdp);
14613 if (sp == sfmmup) {
14614 if (onlist) {
14615 return;
14616 } else {
14617 panic("shctx: sfmmu 0x%p found on scd"
14618 "list 0x%p", (void *)sfmmup,
14619 (void *)*headp);
14623 if (onlist) {
14624 panic("shctx: sfmmu 0x%p not found on scd list 0x%p",
14625 (void *)sfmmup, (void *)*headp);
14626 } else {
14627 return;
14630 #else /* DEBUG */
14631 #define check_scd_sfmmu_list(headp, sfmmup, onlist)
14632 #endif /* DEBUG */
14635 * Removes an sfmmu from the SCD sfmmu list.
14637 static void
14638 sfmmu_from_scd_list(sfmmu_t **headp, sfmmu_t *sfmmup)
14640 ASSERT(sfmmup->sfmmu_srdp != NULL);
14641 check_scd_sfmmu_list(headp, sfmmup, 1);
14642 if (sfmmup->sfmmu_scd_link.prev != NULL) {
14643 ASSERT(*headp != sfmmup);
14644 sfmmup->sfmmu_scd_link.prev->sfmmu_scd_link.next =
14645 sfmmup->sfmmu_scd_link.next;
14646 } else {
14647 ASSERT(*headp == sfmmup);
14648 *headp = sfmmup->sfmmu_scd_link.next;
14650 if (sfmmup->sfmmu_scd_link.next != NULL) {
14651 sfmmup->sfmmu_scd_link.next->sfmmu_scd_link.prev =
14652 sfmmup->sfmmu_scd_link.prev;
14658 * Adds an sfmmu to the start of the queue.
14660 static void
14661 sfmmu_to_scd_list(sfmmu_t **headp, sfmmu_t *sfmmup)
14663 check_scd_sfmmu_list(headp, sfmmup, 0);
14664 sfmmup->sfmmu_scd_link.prev = NULL;
14665 sfmmup->sfmmu_scd_link.next = *headp;
14666 if (*headp != NULL)
14667 (*headp)->sfmmu_scd_link.prev = sfmmup;
14668 *headp = sfmmup;
14672 * Remove an scd from the queue.
14674 static void
14675 sfmmu_remove_scd(sf_scd_t **headp, sf_scd_t *scdp)
14677 if (scdp->scd_prev != NULL) {
14678 ASSERT(*headp != scdp);
14679 scdp->scd_prev->scd_next = scdp->scd_next;
14680 } else {
14681 ASSERT(*headp == scdp);
14682 *headp = scdp->scd_next;
14685 if (scdp->scd_next != NULL) {
14686 scdp->scd_next->scd_prev = scdp->scd_prev;
14691 * Add an scd to the start of the queue.
14693 static void
14694 sfmmu_add_scd(sf_scd_t **headp, sf_scd_t *scdp)
14696 scdp->scd_prev = NULL;
14697 scdp->scd_next = *headp;
14698 if (*headp != NULL) {
14699 (*headp)->scd_prev = scdp;
14701 *headp = scdp;
14704 static int
14705 sfmmu_alloc_scd_tsbs(sf_srd_t *srdp, sf_scd_t *scdp)
14707 uint_t rid;
14708 uint_t i;
14709 uint_t j;
14710 ulong_t w;
14711 sf_region_t *rgnp;
14712 ulong_t tte8k_cnt = 0;
14713 ulong_t tte4m_cnt = 0;
14714 uint_t tsb_szc;
14715 sfmmu_t *scsfmmup = scdp->scd_sfmmup;
14716 sfmmu_t *ism_hatid;
14717 struct tsb_info *newtsb;
14718 int szc;
14720 ASSERT(srdp != NULL);
14722 for (i = 0; i < SFMMU_RGNMAP_WORDS; i++) {
14723 if ((w = scdp->scd_region_map.bitmap[i]) == 0) {
14724 continue;
14726 j = 0;
14727 while (w) {
14728 if (!(w & 0x1)) {
14729 j++;
14730 w >>= 1;
14731 continue;
14733 rid = (i << BT_ULSHIFT) | j;
14734 j++;
14735 w >>= 1;
14737 if (rid < SFMMU_MAX_HME_REGIONS) {
14738 rgnp = srdp->srd_hmergnp[rid];
14739 ASSERT(rgnp->rgn_id == rid);
14740 ASSERT(rgnp->rgn_refcnt > 0);
14742 if (rgnp->rgn_pgszc < TTE4M) {
14743 tte8k_cnt += rgnp->rgn_size >>
14744 TTE_PAGE_SHIFT(TTE8K);
14745 } else {
14746 ASSERT(rgnp->rgn_pgszc >= TTE4M);
14747 tte4m_cnt += rgnp->rgn_size >>
14748 TTE_PAGE_SHIFT(TTE4M);
14750 * Inflate SCD tsb0 by preallocating
14751 * 1/4 8k ttecnt for 4M regions to
14752 * allow for lgpg alloc failure.
14754 tte8k_cnt += rgnp->rgn_size >>
14755 (TTE_PAGE_SHIFT(TTE8K) + 2);
14757 } else {
14758 rid -= SFMMU_MAX_HME_REGIONS;
14759 rgnp = srdp->srd_ismrgnp[rid];
14760 ASSERT(rgnp->rgn_id == rid);
14761 ASSERT(rgnp->rgn_refcnt > 0);
14763 ism_hatid = (sfmmu_t *)rgnp->rgn_obj;
14764 ASSERT(ism_hatid->sfmmu_ismhat);
14766 for (szc = 0; szc < TTE4M; szc++) {
14767 tte8k_cnt +=
14768 ism_hatid->sfmmu_ttecnt[szc] <<
14769 TTE_BSZS_SHIFT(szc);
14772 ASSERT(rgnp->rgn_pgszc >= TTE4M);
14773 if (rgnp->rgn_pgszc >= TTE4M) {
14774 tte4m_cnt += rgnp->rgn_size >>
14775 TTE_PAGE_SHIFT(TTE4M);
14781 tsb_szc = SELECT_TSB_SIZECODE(tte8k_cnt);
14783 /* Allocate both the SCD TSBs here. */
14784 if (sfmmu_tsbinfo_alloc(&scsfmmup->sfmmu_tsb,
14785 tsb_szc, TSB8K|TSB64K|TSB512K, TSB_ALLOC, scsfmmup) &&
14786 (tsb_szc <= TSB_4M_SZCODE ||
14787 sfmmu_tsbinfo_alloc(&scsfmmup->sfmmu_tsb,
14788 TSB_4M_SZCODE, TSB8K|TSB64K|TSB512K,
14789 TSB_ALLOC, scsfmmup))) {
14791 SFMMU_STAT(sf_scd_1sttsb_allocfail);
14792 return (TSB_ALLOCFAIL);
14793 } else {
14794 scsfmmup->sfmmu_tsb->tsb_flags |= TSB_SHAREDCTX;
14796 if (tte4m_cnt) {
14797 tsb_szc = SELECT_TSB_SIZECODE(tte4m_cnt);
14798 if (sfmmu_tsbinfo_alloc(&newtsb, tsb_szc,
14799 TSB4M|TSB32M|TSB256M, TSB_ALLOC, scsfmmup) &&
14800 (tsb_szc <= TSB_4M_SZCODE ||
14801 sfmmu_tsbinfo_alloc(&newtsb, TSB_4M_SZCODE,
14802 TSB4M|TSB32M|TSB256M,
14803 TSB_ALLOC, scsfmmup))) {
14805 * If we fail to allocate the 2nd shared tsb,
14806 * just free the 1st tsb and return failure.
14808 sfmmu_tsbinfo_free(scsfmmup->sfmmu_tsb);
14809 SFMMU_STAT(sf_scd_2ndtsb_allocfail);
14810 return (TSB_ALLOCFAIL);
14811 } else {
14812 ASSERT(scsfmmup->sfmmu_tsb->tsb_next == NULL);
14813 newtsb->tsb_flags |= TSB_SHAREDCTX;
14814 scsfmmup->sfmmu_tsb->tsb_next = newtsb;
14815 SFMMU_STAT(sf_scd_2ndtsb_alloc);
14818 SFMMU_STAT(sf_scd_1sttsb_alloc);
14820 return (TSB_SUCCESS);
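#if 0	/* illustrative sketch -- not part of the build */
/*
 * A self-contained sketch of the region-map walk used by
 * sfmmu_alloc_scd_tsbs() above and by the link/unlink routines below:
 * every set bit in the bitmap yields a region id
 * rid = (word_index << BT_ULSHIFT) | bit_index.  ex_visit_rid() is a
 * hypothetical callback standing in for the per-region work.
 */
static void
ex_walk_region_map(ulong_t *bitmap, uint_t nwords,
    void (*ex_visit_rid)(uint_t rid))
{
	uint_t i, j;
	ulong_t w;

	for (i = 0; i < nwords; i++) {
		if ((w = bitmap[i]) == 0)
			continue;
		for (j = 0; w != 0; j++, w >>= 1) {
			if (w & 0x1)
				(*ex_visit_rid)((i << BT_ULSHIFT) | j);
		}
	}
}
#endif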
14823 static void
14824 sfmmu_free_scd_tsbs(sfmmu_t *scd_sfmmu)
14826 while (scd_sfmmu->sfmmu_tsb != NULL) {
14827 struct tsb_info *next = scd_sfmmu->sfmmu_tsb->tsb_next;
14828 sfmmu_tsbinfo_free(scd_sfmmu->sfmmu_tsb);
14829 scd_sfmmu->sfmmu_tsb = next;
14834 * Link the sfmmu onto the hme region list.
14836 void
14837 sfmmu_link_to_hmeregion(sfmmu_t *sfmmup, sf_region_t *rgnp)
14839 uint_t rid;
14840 sf_rgn_link_t *rlink;
14841 sfmmu_t *head;
14842 sf_rgn_link_t *hrlink;
14844 rid = rgnp->rgn_id;
14845 ASSERT(SFMMU_IS_SHMERID_VALID(rid));
14847 /* LINTED: constant in conditional context */
14848 SFMMU_HMERID2RLINKP(sfmmup, rid, rlink, 1, 1);
14849 ASSERT(rlink != NULL);
14850 mutex_enter(&rgnp->rgn_mutex);
14851 if ((head = rgnp->rgn_sfmmu_head) == NULL) {
14852 rlink->next = NULL;
14853 rlink->prev = NULL;
14855 * make sure rlink's next field is NULL
14856 * before making this link visible.
14858 membar_stst();
14859 rgnp->rgn_sfmmu_head = sfmmup;
14860 } else {
14861 /* LINTED: constant in conditional context */
14862 SFMMU_HMERID2RLINKP(head, rid, hrlink, 0, 0);
14863 ASSERT(hrlink != NULL);
14864 ASSERT(hrlink->prev == NULL);
14865 rlink->next = head;
14866 rlink->prev = NULL;
14867 hrlink->prev = sfmmup;
14869 * make sure rlink's next field is correct
14870 * before making this link visible.
14872 membar_stst();
14873 rgnp->rgn_sfmmu_head = sfmmup;
14875 mutex_exit(&rgnp->rgn_mutex);
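#if 0	/* illustrative sketch -- not part of the build */
/*
 * A minimal sketch of the publication pattern used by
 * sfmmu_link_to_hmeregion() above: the new element's own link is stored
 * first and membar_stst() orders that store before the store of the list
 * head, so a lock-free reader that picks up the new head never sees an
 * uninitialized next pointer.  Writers still serialize on rgn_mutex; the
 * barrier is only for the lock-free readers.  ex_node_t is hypothetical.
 */
typedef struct ex_node {
	struct ex_node *next;
} ex_node_t;

static void
ex_publish_head(ex_node_t **headp, ex_node_t *newp)
{
	newp->next = *headp;	/* initialize the new element first */
	membar_stst();		/* order the store above before the one below */
	*headp = newp;		/* now make it visible to lock-free readers */
}
#endif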
14879 * Unlink the sfmmu from the hme region list.
14881 void
14882 sfmmu_unlink_from_hmeregion(sfmmu_t *sfmmup, sf_region_t *rgnp)
14884 uint_t rid;
14885 sf_rgn_link_t *rlink;
14887 rid = rgnp->rgn_id;
14888 ASSERT(SFMMU_IS_SHMERID_VALID(rid));
14890 /* LINTED: constant in conditional context */
14891 SFMMU_HMERID2RLINKP(sfmmup, rid, rlink, 0, 0);
14892 ASSERT(rlink != NULL);
14893 mutex_enter(&rgnp->rgn_mutex);
14894 if (rgnp->rgn_sfmmu_head == sfmmup) {
14895 sfmmu_t *next = rlink->next;
14896 rgnp->rgn_sfmmu_head = next;
14898 * if we are stopped by xc_attention() after this
14899 * point the forward link walking in
14900 * sfmmu_rgntlb_demap() will work correctly since the
14901 * head correctly points to the next element.
14903 membar_stst();
14904 rlink->next = NULL;
14905 ASSERT(rlink->prev == NULL);
14906 if (next != NULL) {
14907 sf_rgn_link_t *nrlink;
14908 /* LINTED: constant in conditional context */
14909 SFMMU_HMERID2RLINKP(next, rid, nrlink, 0, 0);
14910 ASSERT(nrlink != NULL);
14911 ASSERT(nrlink->prev == sfmmup);
14912 nrlink->prev = NULL;
14914 } else {
14915 sfmmu_t *next = rlink->next;
14916 sfmmu_t *prev = rlink->prev;
14917 sf_rgn_link_t *prlink;
14919 ASSERT(prev != NULL);
14920 /* LINTED: constant in conditional context */
14921 SFMMU_HMERID2RLINKP(prev, rid, prlink, 0, 0);
14922 ASSERT(prlink != NULL);
14923 ASSERT(prlink->next == sfmmup);
14924 prlink->next = next;
14926 * if we are stopped by xc_attention()
14927 * after this point the forward link walking
14928 * will work correctly since the prev element
14929 * correctly points to the next element.
14931 membar_stst();
14932 rlink->next = NULL;
14933 rlink->prev = NULL;
14934 if (next != NULL) {
14935 sf_rgn_link_t *nrlink;
14936 /* LINTED: constant in conditional context */
14937 SFMMU_HMERID2RLINKP(next, rid, nrlink, 0, 0);
14938 ASSERT(nrlink != NULL);
14939 ASSERT(nrlink->prev == sfmmup);
14940 nrlink->prev = prev;
14943 mutex_exit(&rgnp->rgn_mutex);
14947 * Link scd sfmmu onto ism or hme region list for each region in the
14948 * scd region map.
14950 void
14951 sfmmu_link_scd_to_regions(sf_srd_t *srdp, sf_scd_t *scdp)
14953 uint_t rid;
14954 uint_t i;
14955 uint_t j;
14956 ulong_t w;
14957 sf_region_t *rgnp;
14958 sfmmu_t *scsfmmup;
14960 scsfmmup = scdp->scd_sfmmup;
14961 ASSERT(scsfmmup->sfmmu_scdhat);
14962 for (i = 0; i < SFMMU_RGNMAP_WORDS; i++) {
14963 if ((w = scdp->scd_region_map.bitmap[i]) == 0) {
14964 continue;
14966 j = 0;
14967 while (w) {
14968 if (!(w & 0x1)) {
14969 j++;
14970 w >>= 1;
14971 continue;
14973 rid = (i << BT_ULSHIFT) | j;
14974 j++;
14975 w >>= 1;
14977 if (rid < SFMMU_MAX_HME_REGIONS) {
14978 rgnp = srdp->srd_hmergnp[rid];
14979 ASSERT(rgnp->rgn_id == rid);
14980 ASSERT(rgnp->rgn_refcnt > 0);
14981 sfmmu_link_to_hmeregion(scsfmmup, rgnp);
14982 } else {
14983 sfmmu_t *ism_hatid = NULL;
14984 ism_ment_t *ism_ment;
14985 rid -= SFMMU_MAX_HME_REGIONS;
14986 rgnp = srdp->srd_ismrgnp[rid];
14987 ASSERT(rgnp->rgn_id == rid);
14988 ASSERT(rgnp->rgn_refcnt > 0);
14990 ism_hatid = (sfmmu_t *)rgnp->rgn_obj;
14991 ASSERT(ism_hatid->sfmmu_ismhat);
14992 ism_ment = &scdp->scd_ism_links[rid];
14993 ism_ment->iment_hat = scsfmmup;
14994 ism_ment->iment_base_va = rgnp->rgn_saddr;
14995 mutex_enter(&ism_mlist_lock);
14996 iment_add(ism_ment, ism_hatid);
14997 mutex_exit(&ism_mlist_lock);
15004 * Unlink scd sfmmu from ism or hme region list for each region in the
15005 * scd region map.
15007 void
15008 sfmmu_unlink_scd_from_regions(sf_srd_t *srdp, sf_scd_t *scdp)
15010 uint_t rid;
15011 uint_t i;
15012 uint_t j;
15013 ulong_t w;
15014 sf_region_t *rgnp;
15015 sfmmu_t *scsfmmup;
15017 scsfmmup = scdp->scd_sfmmup;
15018 for (i = 0; i < SFMMU_RGNMAP_WORDS; i++) {
15019 if ((w = scdp->scd_region_map.bitmap[i]) == 0) {
15020 continue;
15022 j = 0;
15023 while (w) {
15024 if (!(w & 0x1)) {
15025 j++;
15026 w >>= 1;
15027 continue;
15029 rid = (i << BT_ULSHIFT) | j;
15030 j++;
15031 w >>= 1;
15033 if (rid < SFMMU_MAX_HME_REGIONS) {
15034 rgnp = srdp->srd_hmergnp[rid];
15035 ASSERT(rgnp->rgn_id == rid);
15036 ASSERT(rgnp->rgn_refcnt > 0);
15037 sfmmu_unlink_from_hmeregion(scsfmmup,
15038 rgnp);
15040 } else {
15041 sfmmu_t *ism_hatid = NULL;
15042 ism_ment_t *ism_ment;
15043 rid -= SFMMU_MAX_HME_REGIONS;
15044 rgnp = srdp->srd_ismrgnp[rid];
15045 ASSERT(rgnp->rgn_id == rid);
15046 ASSERT(rgnp->rgn_refcnt > 0);
15048 ism_hatid = (sfmmu_t *)rgnp->rgn_obj;
15049 ASSERT(ism_hatid->sfmmu_ismhat);
15050 ism_ment = &scdp->scd_ism_links[rid];
15051 ASSERT(ism_ment->iment_hat == scdp->scd_sfmmup);
15052 ASSERT(ism_ment->iment_base_va ==
15053 rgnp->rgn_saddr);
15054 mutex_enter(&ism_mlist_lock);
15055 iment_sub(ism_ment, ism_hatid);
15056 mutex_exit(&ism_mlist_lock);
15063 * Allocates and initialises a new SCD structure. It is called with
15064 * the srd_scd_mutex held and returns with the reference count
15065 * initialised to 1.
15067 static sf_scd_t *
15068 sfmmu_alloc_scd(sf_srd_t *srdp, sf_region_map_t *new_map)
15070 sf_scd_t *new_scdp;
15071 sfmmu_t *scsfmmup;
15072 int i;
15074 ASSERT(MUTEX_HELD(&srdp->srd_scd_mutex));
15075 new_scdp = kmem_cache_alloc(scd_cache, KM_SLEEP);
15077 scsfmmup = kmem_cache_alloc(sfmmuid_cache, KM_SLEEP);
15078 new_scdp->scd_sfmmup = scsfmmup;
15079 scsfmmup->sfmmu_srdp = srdp;
15080 scsfmmup->sfmmu_scdp = new_scdp;
15081 scsfmmup->sfmmu_tsb0_4minflcnt = 0;
15082 scsfmmup->sfmmu_scdhat = 1;
15083 CPUSET_ALL(scsfmmup->sfmmu_cpusran);
15084 bzero(scsfmmup->sfmmu_hmeregion_links, SFMMU_L1_HMERLINKS_SIZE);
15086 ASSERT(max_mmu_ctxdoms > 0);
15087 for (i = 0; i < max_mmu_ctxdoms; i++) {
15088 scsfmmup->sfmmu_ctxs[i].cnum = INVALID_CONTEXT;
15089 scsfmmup->sfmmu_ctxs[i].gnum = 0;
15092 for (i = 0; i < MMU_PAGE_SIZES; i++) {
15093 new_scdp->scd_rttecnt[i] = 0;
15096 new_scdp->scd_region_map = *new_map;
15097 new_scdp->scd_refcnt = 1;
15098 if (sfmmu_alloc_scd_tsbs(srdp, new_scdp) != TSB_SUCCESS) {
15099 kmem_cache_free(scd_cache, new_scdp);
15100 kmem_cache_free(sfmmuid_cache, scsfmmup);
15101 return (NULL);
15103 if (&mmu_init_scd) {
15104 mmu_init_scd(new_scdp);
15106 return (new_scdp);
15110 * The first phase of a process joining an SCD. The hat structure is
15111 * linked to the SCD queue and then the HAT_JOIN_SCD sfmmu flag is set
15112 * and a cross-call with context invalidation is used to cause the
15113 * remaining work to be carried out in the sfmmu_tsbmiss_exception()
15114 * routine.
15116 static void
15117 sfmmu_join_scd(sf_scd_t *scdp, sfmmu_t *sfmmup)
15119 hatlock_t *hatlockp;
15120 sf_srd_t *srdp = sfmmup->sfmmu_srdp;
15121 int i;
15122 sf_scd_t *old_scdp;
15124 ASSERT(srdp != NULL);
15125 ASSERT(scdp != NULL);
15126 ASSERT(scdp->scd_refcnt > 0);
15127 ASSERT(AS_WRITE_HELD(sfmmup->sfmmu_as));
15129 if ((old_scdp = sfmmup->sfmmu_scdp) != NULL) {
15130 ASSERT(old_scdp != scdp);
15132 mutex_enter(&old_scdp->scd_mutex);
15133 sfmmu_from_scd_list(&old_scdp->scd_sf_list, sfmmup);
15134 mutex_exit(&old_scdp->scd_mutex);
15136 * sfmmup leaves the old scd. Update sfmmu_ttecnt to
15137 * include the shme rgn ttecnt for rgns that
15138 * were in the old SCD
15140 for (i = 0; i < mmu_page_sizes; i++) {
15141 ASSERT(sfmmup->sfmmu_scdrttecnt[i] ==
15142 old_scdp->scd_rttecnt[i]);
15143 atomic_add_long(&sfmmup->sfmmu_ttecnt[i],
15144 sfmmup->sfmmu_scdrttecnt[i]);
15149 * Move sfmmu to the scd lists.
15151 mutex_enter(&scdp->scd_mutex);
15152 sfmmu_to_scd_list(&scdp->scd_sf_list, sfmmup);
15153 mutex_exit(&scdp->scd_mutex);
15154 SF_SCD_INCR_REF(scdp);
15156 hatlockp = sfmmu_hat_enter(sfmmup);
15158 * For a multi-thread process, we must stop
15159 * all the other threads before joining the scd.
15162 SFMMU_FLAGS_SET(sfmmup, HAT_JOIN_SCD);
15164 sfmmu_invalidate_ctx(sfmmup);
15165 sfmmup->sfmmu_scdp = scdp;
15168 * Copy scd_rttecnt into sfmmup's sfmmu_scdrttecnt, and update
15169 * sfmmu_ttecnt to not include the rgn ttecnt just joined in SCD.
15171 for (i = 0; i < mmu_page_sizes; i++) {
15172 sfmmup->sfmmu_scdrttecnt[i] = scdp->scd_rttecnt[i];
15173 ASSERT(sfmmup->sfmmu_ttecnt[i] >= scdp->scd_rttecnt[i]);
15174 atomic_add_long(&sfmmup->sfmmu_ttecnt[i],
15175 -sfmmup->sfmmu_scdrttecnt[i]);
15177 /* update tsb0 inflation count */
15178 if (old_scdp != NULL) {
15179 sfmmup->sfmmu_tsb0_4minflcnt +=
15180 old_scdp->scd_sfmmup->sfmmu_tsb0_4minflcnt;
15182 ASSERT(sfmmup->sfmmu_tsb0_4minflcnt >=
15183 scdp->scd_sfmmup->sfmmu_tsb0_4minflcnt);
15184 sfmmup->sfmmu_tsb0_4minflcnt -= scdp->scd_sfmmup->sfmmu_tsb0_4minflcnt;
15186 sfmmu_hat_exit(hatlockp);
15188 if (old_scdp != NULL) {
15189 SF_SCD_DECR_REF(srdp, old_scdp);
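#if 0	/* illustrative sketch -- not part of the build */
/*
 * A sketch of the ttecnt bookkeeping done when a hat joins or leaves an
 * SCD, as in sfmmu_join_scd() above and sfmmu_leave_scd() below: on join,
 * the shared-region TTE counts move out of the private accounting and are
 * remembered in scdrttecnt; on leave they move back.  ex_hat_t/ex_scd_t
 * are hypothetical reductions of sfmmu_t/sf_scd_t, and the real code
 * performs the updates atomically under the hat lock.
 */
typedef struct ex_hat {
	ulong_t ttecnt[MMU_PAGE_SIZES];		/* privately accounted TTEs */
	ulong_t scdrttecnt[MMU_PAGE_SIZES];	/* TTEs accounted by the SCD */
} ex_hat_t;

typedef struct ex_scd {
	ulong_t rttecnt[MMU_PAGE_SIZES];	/* shared-region TTEs */
} ex_scd_t;

static void
ex_join_scd_counts(ex_hat_t *hat, ex_scd_t *scd)
{
	int i;

	for (i = 0; i < MMU_PAGE_SIZES; i++) {
		hat->scdrttecnt[i] = scd->rttecnt[i];
		hat->ttecnt[i] -= scd->rttecnt[i];	/* now counted by the SCD */
	}
}

static void
ex_leave_scd_counts(ex_hat_t *hat)
{
	int i;

	for (i = 0; i < MMU_PAGE_SIZES; i++) {
		hat->ttecnt[i] += hat->scdrttecnt[i];	/* back to private */
		hat->scdrttecnt[i] = 0;
	}
}
#endif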
15195 * This routine is called by a process to become part of an SCD. It is called
15196 * from sfmmu_tsbmiss_exception() once most of the initial work has been
15197 * done by sfmmu_join_scd(). This routine must not drop the hat lock.
15199 static void
15200 sfmmu_finish_join_scd(sfmmu_t *sfmmup)
15202 struct tsb_info *tsbinfop;
15204 ASSERT(sfmmu_hat_lock_held(sfmmup));
15205 ASSERT(sfmmup->sfmmu_scdp != NULL);
15206 ASSERT(SFMMU_FLAGS_ISSET(sfmmup, HAT_JOIN_SCD));
15207 ASSERT(!SFMMU_FLAGS_ISSET(sfmmup, HAT_ISMBUSY));
15208 ASSERT(SFMMU_FLAGS_ISSET(sfmmup, HAT_ALLCTX_INVALID));
15210 for (tsbinfop = sfmmup->sfmmu_tsb; tsbinfop != NULL;
15211 tsbinfop = tsbinfop->tsb_next) {
15212 if (tsbinfop->tsb_flags & TSB_SWAPPED) {
15213 continue;
15215 ASSERT(!(tsbinfop->tsb_flags & TSB_RELOC_FLAG));
15217 sfmmu_inv_tsb(tsbinfop->tsb_va,
15218 TSB_BYTES(tsbinfop->tsb_szc));
15221 /* Set HAT_CTX1_FLAG for all SCD ISMs */
15222 sfmmu_ism_hatflags(sfmmup, 1);
15224 SFMMU_STAT(sf_join_scd);
15228 * This routine checks whether there is an SCD which matches the
15229 * process's region map; if not, a new SCD may be created.
15231 static void
15232 sfmmu_find_scd(sfmmu_t *sfmmup)
15234 sf_srd_t *srdp = sfmmup->sfmmu_srdp;
15235 sf_scd_t *scdp, *new_scdp;
15236 int ret;
15238 ASSERT(srdp != NULL);
15239 ASSERT(AS_WRITE_HELD(sfmmup->sfmmu_as));
15241 mutex_enter(&srdp->srd_scd_mutex);
15242 for (scdp = srdp->srd_scdp; scdp != NULL;
15243 scdp = scdp->scd_next) {
15244 SF_RGNMAP_EQUAL(&scdp->scd_region_map,
15245 &sfmmup->sfmmu_region_map, ret);
15246 if (ret == 1) {
15247 SF_SCD_INCR_REF(scdp);
15248 mutex_exit(&srdp->srd_scd_mutex);
15249 sfmmu_join_scd(scdp, sfmmup);
15250 ASSERT(scdp->scd_refcnt >= 2);
15251 atomic_dec_32((volatile uint32_t *)&scdp->scd_refcnt);
15252 return;
15253 } else {
15255 * If the sfmmu region map is a subset of the scd
15256 * region map, then the assumption is that this process
15257 * will continue attaching to ISM segments until the
15258 * region maps are equal.
15260 SF_RGNMAP_IS_SUBSET(&scdp->scd_region_map,
15261 &sfmmup->sfmmu_region_map, ret);
15262 if (ret == 1) {
15263 mutex_exit(&srdp->srd_scd_mutex);
15264 return;
15269 ASSERT(scdp == NULL);
15271 * No matching SCD has been found, create a new one.
15273 if ((new_scdp = sfmmu_alloc_scd(srdp, &sfmmup->sfmmu_region_map)) ==
15274 NULL) {
15275 mutex_exit(&srdp->srd_scd_mutex);
15276 return;
15280 * sfmmu_alloc_scd() returns with a ref count of 1 on the scd.
15283 /* Set scd_rttecnt for shme rgns in SCD */
15284 sfmmu_set_scd_rttecnt(srdp, new_scdp);
15287 * Link scd onto srd_scdp list and scd sfmmu onto region/iment lists.
15289 sfmmu_link_scd_to_regions(srdp, new_scdp);
15290 sfmmu_add_scd(&srdp->srd_scdp, new_scdp);
15291 SFMMU_STAT_ADD(sf_create_scd, 1);
15293 mutex_exit(&srdp->srd_scd_mutex);
15294 sfmmu_join_scd(new_scdp, sfmmup);
15295 ASSERT(new_scdp->scd_refcnt >= 2);
15296 atomic_dec_32((volatile uint32_t *)&new_scdp->scd_refcnt);
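#if 0	/* illustrative sketch -- not part of the build */
/*
 * A sketch of the bitmap comparisons behind the SF_RGNMAP_EQUAL and
 * SF_RGNMAP_IS_SUBSET tests used by sfmmu_find_scd() above: an SCD is
 * joined only on an exact region-map match, while a process whose map is
 * a strict subset of an existing SCD's map is left alone on the
 * assumption that it is still attaching regions.  The ex_* helpers are
 * hypothetical, not the actual macros.
 */
static int
ex_map_equal(const ulong_t *a, const ulong_t *b, uint_t nwords)
{
	uint_t i;

	for (i = 0; i < nwords; i++) {
		if (a[i] != b[i])
			return (0);
	}
	return (1);
}

static int
ex_map_subset(const ulong_t *superset, const ulong_t *subset, uint_t nwords)
{
	uint_t i;

	for (i = 0; i < nwords; i++) {
		if (subset[i] & ~superset[i])	/* bit set outside superset */
			return (0);
	}
	return (1);
}
#endif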
15300 * This routine is called by a process to remove itself from an SCD. It is
15301 * either called when the process has detached from a segment or from
15302 * hat_free_start() as a result of calling exit.
15304 static void
15305 sfmmu_leave_scd(sfmmu_t *sfmmup, uchar_t r_type)
15307 sf_scd_t *scdp = sfmmup->sfmmu_scdp;
15308 sf_srd_t *srdp = sfmmup->sfmmu_srdp;
15309 hatlock_t *hatlockp = TSB_HASH(sfmmup);
15310 int i;
15312 ASSERT(scdp != NULL);
15313 ASSERT(srdp != NULL);
15315 if (sfmmup->sfmmu_free) {
15317 * If the process is part of an SCD the sfmmu is unlinked
15318 * from scd_sf_list.
15320 mutex_enter(&scdp->scd_mutex);
15321 sfmmu_from_scd_list(&scdp->scd_sf_list, sfmmup);
15322 mutex_exit(&scdp->scd_mutex);
15324 * Update sfmmu_ttecnt to include the rgn ttecnt for rgns that
15325 * are about to leave the SCD
15327 for (i = 0; i < mmu_page_sizes; i++) {
15328 ASSERT(sfmmup->sfmmu_scdrttecnt[i] ==
15329 scdp->scd_rttecnt[i]);
15330 atomic_add_long(&sfmmup->sfmmu_ttecnt[i],
15331 sfmmup->sfmmu_scdrttecnt[i]);
15332 sfmmup->sfmmu_scdrttecnt[i] = 0;
15334 sfmmup->sfmmu_scdp = NULL;
15336 SF_SCD_DECR_REF(srdp, scdp);
15337 return;
15340 ASSERT(r_type != SFMMU_REGION_ISM ||
15341 SFMMU_FLAGS_ISSET(sfmmup, HAT_ISMBUSY));
15342 ASSERT(scdp->scd_refcnt);
15343 ASSERT(!sfmmup->sfmmu_free);
15344 ASSERT(sfmmu_hat_lock_held(sfmmup));
15345 ASSERT(AS_LOCK_HELD(sfmmup->sfmmu_as));
15348 * Wait for ISM maps to be updated.
15350 if (r_type != SFMMU_REGION_ISM) {
15351 while (SFMMU_FLAGS_ISSET(sfmmup, HAT_ISMBUSY) &&
15352 sfmmup->sfmmu_scdp != NULL) {
15353 cv_wait(&sfmmup->sfmmu_tsb_cv,
15354 HATLOCK_MUTEXP(hatlockp));
15357 if (sfmmup->sfmmu_scdp == NULL) {
15358 sfmmu_hat_exit(hatlockp);
15359 return;
15361 SFMMU_FLAGS_SET(sfmmup, HAT_ISMBUSY);
15364 if (SFMMU_FLAGS_ISSET(sfmmup, HAT_JOIN_SCD)) {
15365 SFMMU_FLAGS_CLEAR(sfmmup, HAT_JOIN_SCD);
15367 * Since HAT_JOIN_SCD was set our context
15368 * is still invalid.
15370 } else {
15372 * For a multi-thread process, we must stop
15373 * all the other threads before leaving the scd.
15376 sfmmu_invalidate_ctx(sfmmup);
15379 /* Clear all the rid's for ISM, delete flags, etc */
15380 ASSERT(SFMMU_FLAGS_ISSET(sfmmup, HAT_ISMBUSY));
15381 sfmmu_ism_hatflags(sfmmup, 0);
15384 * Update sfmmu_ttecnt to include the rgn ttecnt for rgns that
15385 * are in SCD before this sfmmup leaves the SCD.
15387 for (i = 0; i < mmu_page_sizes; i++) {
15388 ASSERT(sfmmup->sfmmu_scdrttecnt[i] ==
15389 scdp->scd_rttecnt[i]);
15390 atomic_add_long(&sfmmup->sfmmu_ttecnt[i],
15391 sfmmup->sfmmu_scdrttecnt[i]);
15392 sfmmup->sfmmu_scdrttecnt[i] = 0;
15393 /* update ismttecnt to include SCD ism before hat leaves SCD */
15394 sfmmup->sfmmu_ismttecnt[i] += sfmmup->sfmmu_scdismttecnt[i];
15395 sfmmup->sfmmu_scdismttecnt[i] = 0;
15397 /* update tsb0 inflation count */
15398 sfmmup->sfmmu_tsb0_4minflcnt += scdp->scd_sfmmup->sfmmu_tsb0_4minflcnt;
15400 if (r_type != SFMMU_REGION_ISM) {
15401 SFMMU_FLAGS_CLEAR(sfmmup, HAT_ISMBUSY);
15403 sfmmup->sfmmu_scdp = NULL;
15405 sfmmu_hat_exit(hatlockp);
15408 * Unlink sfmmu from scd_sf_list; this can be done without holding
15409 * the hat lock because we hold the sfmmu_as lock, which prevents
15410 * hat_join_region() from adding this thread to the scd again. Other
15411 * threads check whether sfmmu_scdp is NULL under the hat lock and, if it is,
15412 * they won't get here, since sfmmu_leave_scd() clears sfmmu_scdp
15413 * while holding the hat lock.
15415 mutex_enter(&scdp->scd_mutex);
15416 sfmmu_from_scd_list(&scdp->scd_sf_list, sfmmup);
15417 mutex_exit(&scdp->scd_mutex);
15418 SFMMU_STAT(sf_leave_scd);
15420 SF_SCD_DECR_REF(srdp, scdp);
15421 hatlockp = sfmmu_hat_enter(sfmmup);
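#if 0	/* illustrative sketch -- not part of the build */
/*
 * A sketch of the wait loop used in sfmmu_leave_scd() above to let
 * in-progress ISM map updates finish: the predicate is re-evaluated after
 * every wakeup because cv_wait() drops and retakes the mutex.  ex_flags,
 * EX_BUSY, ex_cv and ex_lock are hypothetical stand-ins for the hat's
 * sfmmu_flags, HAT_ISMBUSY, sfmmu_tsb_cv and hat lock.
 */
#define	EX_BUSY	0x1

static volatile uint_t ex_flags;
static kcondvar_t ex_cv;
static kmutex_t ex_lock;

static void
ex_wait_not_busy(void)
{
	mutex_enter(&ex_lock);
	while (ex_flags & EX_BUSY)
		cv_wait(&ex_cv, &ex_lock);
	/* ... perform the state change while still holding ex_lock ... */
	mutex_exit(&ex_lock);
}
#endif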
15426 * Unlink and free up an SCD structure with a reference count of 0.
15428 static void
15429 sfmmu_destroy_scd(sf_srd_t *srdp, sf_scd_t *scdp, sf_region_map_t *scd_rmap)
15431 sfmmu_t *scsfmmup;
15432 sf_scd_t *sp;
15433 hatlock_t *shatlockp;
15434 int i, ret;
15436 mutex_enter(&srdp->srd_scd_mutex);
15437 for (sp = srdp->srd_scdp; sp != NULL; sp = sp->scd_next) {
15438 if (sp == scdp)
15439 break;
15441 if (sp == NULL || sp->scd_refcnt) {
15442 mutex_exit(&srdp->srd_scd_mutex);
15443 return;
15447 * It is possible that the scd has been freed and reallocated with a
15448 * different region map while we've been waiting for the srd_scd_mutex.
15450 SF_RGNMAP_EQUAL(scd_rmap, &sp->scd_region_map, ret);
15451 if (ret != 1) {
15452 mutex_exit(&srdp->srd_scd_mutex);
15453 return;
15456 ASSERT(scdp->scd_sf_list == NULL);
15458 * Unlink scd from srd_scdp list.
15460 sfmmu_remove_scd(&srdp->srd_scdp, scdp);
15461 mutex_exit(&srdp->srd_scd_mutex);
15463 sfmmu_unlink_scd_from_regions(srdp, scdp);
15465 /* Clear shared context tsb and release ctx */
15466 scsfmmup = scdp->scd_sfmmup;
15469 * Create a barrier so that the scd will not be destroyed while
15470 * another thread still holds the same shared hat lock.
15471 * E.g., sfmmu_tsbmiss_exception() needs to acquire the
15472 * shared hat lock before checking the shared tsb reloc flag.
15474 shatlockp = sfmmu_hat_enter(scsfmmup);
15475 sfmmu_hat_exit(shatlockp);
15477 sfmmu_free_scd_tsbs(scsfmmup);
15479 for (i = 0; i < SFMMU_L1_HMERLINKS; i++) {
15480 if (scsfmmup->sfmmu_hmeregion_links[i] != NULL) {
15481 kmem_free(scsfmmup->sfmmu_hmeregion_links[i],
15482 SFMMU_L2_HMERLINKS_SIZE);
15483 scsfmmup->sfmmu_hmeregion_links[i] = NULL;
15486 kmem_cache_free(sfmmuid_cache, scsfmmup);
15487 kmem_cache_free(scd_cache, scdp);
15488 SFMMU_STAT(sf_destroy_scd);
15492 * Modifies the HAT_CTX1_FLAG for each of the ISM segments which correspond to
15493 * bits which are set in the SCD's ISM region map. This flag indicates to
15494 * the tsbmiss handler that mappings for these segments should be loaded using
15495 * the shared context.
15497 static void
15498 sfmmu_ism_hatflags(sfmmu_t *sfmmup, int addflag)
15500 sf_scd_t *scdp = sfmmup->sfmmu_scdp;
15501 ism_blk_t *ism_blkp;
15502 ism_map_t *ism_map;
15503 int i, rid;
15505 ASSERT(sfmmup->sfmmu_iblk != NULL);
15506 ASSERT(scdp != NULL);
15508 * Note that the caller either set the HAT_ISMBUSY flag or checked
15509 * under hat lock that HAT_ISMBUSY was not set by another thread.
15511 ASSERT(sfmmu_hat_lock_held(sfmmup));
15513 ism_blkp = sfmmup->sfmmu_iblk;
15514 while (ism_blkp != NULL) {
15515 ism_map = ism_blkp->iblk_maps;
15516 for (i = 0; ism_map[i].imap_ismhat && i < ISM_MAP_SLOTS; i++) {
15517 rid = ism_map[i].imap_rid;
15518 if (rid == SFMMU_INVALID_ISMRID) {
15519 continue;
15521 ASSERT(rid >= 0 && rid < SFMMU_MAX_ISM_REGIONS);
15522 if (SF_RGNMAP_TEST(scdp->scd_ismregion_map, rid) &&
15523 addflag) {
15524 ism_map[i].imap_hatflags |=
15525 HAT_CTX1_FLAG;
15526 } else {
15527 ism_map[i].imap_hatflags &=
15528 ~HAT_CTX1_FLAG;
15531 ism_blkp = ism_blkp->iblk_next;
15535 static int
15536 sfmmu_srd_lock_held(sf_srd_t *srdp)
15538 return (MUTEX_HELD(&srdp->srd_mutex));
15541 /* ARGSUSED */
15542 static int
15543 sfmmu_scdcache_constructor(void *buf, void *cdrarg, int kmflags)
15545 sf_scd_t *scdp = (sf_scd_t *)buf;
15547 bzero(buf, sizeof (sf_scd_t));
15548 mutex_init(&scdp->scd_mutex, NULL, MUTEX_DEFAULT, NULL);
15549 return (0);
15552 /* ARGSUSED */
15553 static void
15554 sfmmu_scdcache_destructor(void *buf, void *cdrarg)
15556 sf_scd_t *scdp = (sf_scd_t *)buf;
15558 mutex_destroy(&scdp->scd_mutex);
15562 * The listp parameter is a pointer to a list of hmeblks which are partially
15563 * freed as a result of calling sfmmu_hblk_hash_rm(). The last phase of the
15564 * freeing process is to cross-call all cpus to ensure that there are no
15565 * remaining cached references.
15567 * If the local generation number is less than the global then we can free
15568 * hmeblks which are already on the pending queue as another cpu has completed
15569 * the cross-call.
15571 * We cross-call to make sure that there are no threads on other cpus accessing
15572 * these hmeblks and then complete the process of freeing them under the
15573 * following conditions:
15574 * The total number of pending hmeblks is greater than the threshold
15575 * The reserve list has fewer than HBLK_RESERVE_CNT hmeblks
15576 * At least 1 second has elapsed since the last time we cross-called
15578 * Otherwise, we add the hmeblks to the per-cpu pending queue.
15580 static void
15581 sfmmu_hblks_list_purge(struct hme_blk **listp, int dontfree)
15583 struct hme_blk *hblkp, *pr_hblkp = NULL;
15584 int count = 0;
15585 cpuset_t cpuset = cpu_ready_set;
15586 cpu_hme_pend_t *cpuhp;
15587 timestruc_t now;
15588 int one_second_expired = 0;
15590 gethrestime_lasttick(&now);
15592 for (hblkp = *listp; hblkp != NULL; hblkp = hblkp->hblk_next) {
15593 ASSERT(hblkp->hblk_shw_bit == 0);
15594 ASSERT(hblkp->hblk_shared == 0);
15595 count++;
15596 pr_hblkp = hblkp;
15599 cpuhp = &cpu_hme_pend[CPU->cpu_seqid];
15600 mutex_enter(&cpuhp->chp_mutex);
15602 if ((cpuhp->chp_count + count) == 0) {
15603 mutex_exit(&cpuhp->chp_mutex);
15604 return;
15607 if ((now.tv_sec - cpuhp->chp_timestamp) > 1) {
15608 one_second_expired = 1;
15611 if (!dontfree && (freehblkcnt < HBLK_RESERVE_CNT ||
15612 (cpuhp->chp_count + count) > cpu_hme_pend_thresh ||
15613 one_second_expired)) {
15614 /* Append global list to local */
15615 if (pr_hblkp == NULL) {
15616 *listp = cpuhp->chp_listp;
15617 } else {
15618 pr_hblkp->hblk_next = cpuhp->chp_listp;
15620 cpuhp->chp_listp = NULL;
15621 cpuhp->chp_count = 0;
15622 cpuhp->chp_timestamp = now.tv_sec;
15623 mutex_exit(&cpuhp->chp_mutex);
15625 kpreempt_disable();
15626 CPUSET_DEL(cpuset, CPU->cpu_id);
15627 xt_sync(cpuset);
15628 xt_sync(cpuset);
15629 kpreempt_enable();
15632 * At this stage we know that no trap handlers on other
15633 * cpus can have references to hmeblks on the list.
15635 sfmmu_hblk_free(listp);
15636 } else if (*listp != NULL) {
15637 pr_hblkp->hblk_next = cpuhp->chp_listp;
15638 cpuhp->chp_listp = *listp;
15639 cpuhp->chp_count += count;
15640 *listp = NULL;
15641 mutex_exit(&cpuhp->chp_mutex);
15642 } else {
15643 mutex_exit(&cpuhp->chp_mutex);
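#if 0	/* illustrative sketch -- not part of the build */
/*
 * A sketch of the batching policy implemented by sfmmu_hblks_list_purge()
 * above: freed hmeblks are parked on a per-cpu pending list and only
 * reclaimed after a cross-call barrier guarantees that no other cpu still
 * holds a cached reference.  The barrier is amortized by flushing only
 * when a count threshold or a one-second age is exceeded.  ex_pend_t is a
 * hypothetical reduction of cpu_hme_pend_t.
 */
typedef struct ex_pend {
	kmutex_t chp_lock;
	int chp_count;		/* entries currently pending */
	time_t chp_timestamp;	/* seconds at the last flush */
} ex_pend_t;

static int
ex_should_flush(ex_pend_t *pp, int incoming, int threshold, time_t now_sec)
{
	/* caller holds pp->chp_lock */
	return ((pp->chp_count + incoming) > threshold ||
	    (now_sec - pp->chp_timestamp) > 1);
}

static void
ex_barrier_other_cpus(void)
{
	cpuset_t cpuset = cpu_ready_set;

	kpreempt_disable();
	CPUSET_DEL(cpuset, CPU->cpu_id);
	xt_sync(cpuset);	/* issued twice, mirroring the purge code */
	xt_sync(cpuset);	/* and sfmmu_hblk_hash_rm() below */
	kpreempt_enable();
}
#endif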
15648 * Add an hmeblk to the hash list.
15650 void
15651 sfmmu_hblk_hash_add(struct hmehash_bucket *hmebp, struct hme_blk *hmeblkp,
15652 uint64_t hblkpa)
15654 ASSERT(SFMMU_HASH_LOCK_ISHELD(hmebp));
15655 #ifdef DEBUG
15656 if (hmebp->hmeblkp == NULL) {
15657 ASSERT(hmebp->hmeh_nextpa == HMEBLK_ENDPA);
15659 #endif /* DEBUG */
15661 hmeblkp->hblk_nextpa = hmebp->hmeh_nextpa;
15663 * Since the TSB miss handler now does not lock the hash chain before
15664 * walking it, make sure that the hmeblk's nextpa is globally visible
15665 * before we make the hmeblk globally visible by updating the chain root
15666 * pointer in the hash bucket.
15668 membar_producer();
15669 hmebp->hmeh_nextpa = hblkpa;
15670 hmeblkp->hblk_next = hmebp->hmeblkp;
15671 hmebp->hmeblkp = hmeblkp;
15676 * This function is the first part of a 2-part process to remove an hmeblk
15677 * from the hash chain. In this phase we unlink the hmeblk from the hash chain
15678 * but leave the next physical pointer unchanged. The hmeblk is then linked onto
15679 * a per-cpu pending list using the virtual address pointer.
15681 * TSB miss trap handlers that start after this phase will no longer see
15682 * this hmeblk. TSB miss handlers that still cache this hmeblk in a register
15683 * can still use it for further chain traversal because we haven't yet modified
15684 * the next physical pointer or freed it.
15686 * In the second phase of hmeblk removal we'll issue a barrier xcall before
15687 * we reuse or free this hmeblk. This will make sure all lingering references to
15688 * the hmeblk after first phase disappear before we finally reclaim it.
15689 * This scheme eliminates the need for TSB miss handlers to lock hmeblk chains
15690 * during their traversal.
15692 * The hmehash_mutex must be held when calling this function.
15694 * Input:
15695 * hmebp - hme hash bucket pointer
15696 * hmeblkp - address of hmeblk to be removed
15697 * pr_hblk - virtual address of previous hmeblkp
15698 * listp - pointer to list of hmeblks linked by virtual address
15699 * free_now flag - indicates that a complete removal from the hash chains
15700 * is necessary.
15702 * It is inefficient to use the free_now flag as a cross-call is required to
15703 * remove a single hmeblk from the hash chain, but it is necessary when hmeblks
15704 * are in short supply.
15706 void
15707 sfmmu_hblk_hash_rm(struct hmehash_bucket *hmebp, struct hme_blk *hmeblkp,
15708 struct hme_blk *pr_hblk, struct hme_blk **listp,
15709 int free_now)
15711 int shw_size, vshift;
15712 struct hme_blk *shw_hblkp;
15713 uint_t shw_mask, newshw_mask;
15714 caddr_t vaddr;
15715 int size;
15716 cpuset_t cpuset = cpu_ready_set;
15718 ASSERT(SFMMU_HASH_LOCK_ISHELD(hmebp));
15720 if (hmebp->hmeblkp == hmeblkp) {
15721 hmebp->hmeh_nextpa = hmeblkp->hblk_nextpa;
15722 hmebp->hmeblkp = hmeblkp->hblk_next;
15723 } else {
15724 pr_hblk->hblk_nextpa = hmeblkp->hblk_nextpa;
15725 pr_hblk->hblk_next = hmeblkp->hblk_next;
15728 size = get_hblk_ttesz(hmeblkp);
15729 shw_hblkp = hmeblkp->hblk_shadow;
15730 if (shw_hblkp) {
15731 ASSERT(hblktosfmmu(hmeblkp) != KHATID);
15732 ASSERT(!hmeblkp->hblk_shared);
15733 #ifdef DEBUG
15734 if (mmu_page_sizes == max_mmu_page_sizes) {
15735 ASSERT(size < TTE256M);
15736 } else {
15737 ASSERT(size < TTE4M);
15739 #endif /* DEBUG */
15741 shw_size = get_hblk_ttesz(shw_hblkp);
15742 vaddr = (caddr_t)get_hblk_base(hmeblkp);
15743 vshift = vaddr_to_vshift(shw_hblkp->hblk_tag, vaddr, shw_size);
15744 ASSERT(vshift < 8);
15746 * Atomically clear shadow mask bit
15748 do {
15749 shw_mask = shw_hblkp->hblk_shw_mask;
15750 ASSERT(shw_mask & (1 << vshift));
15751 newshw_mask = shw_mask & ~(1 << vshift);
15752 newshw_mask = atomic_cas_32(&shw_hblkp->hblk_shw_mask,
15753 shw_mask, newshw_mask);
15754 } while (newshw_mask != shw_mask);
15755 hmeblkp->hblk_shadow = NULL;
15757 hmeblkp->hblk_shw_bit = 0;
15759 if (hmeblkp->hblk_shared) {
15760 #ifdef DEBUG
15761 sf_srd_t *srdp;
15762 sf_region_t *rgnp;
15763 uint_t rid;
15765 srdp = hblktosrd(hmeblkp);
15766 ASSERT(srdp != NULL && srdp->srd_refcnt != 0);
15767 rid = hmeblkp->hblk_tag.htag_rid;
15768 ASSERT(SFMMU_IS_SHMERID_VALID(rid));
15769 ASSERT(rid < SFMMU_MAX_HME_REGIONS);
15770 rgnp = srdp->srd_hmergnp[rid];
15771 ASSERT(rgnp != NULL);
15772 SFMMU_VALIDATE_SHAREDHBLK(hmeblkp, srdp, rgnp, rid);
15773 #endif /* DEBUG */
15774 hmeblkp->hblk_shared = 0;
15776 if (free_now) {
15777 kpreempt_disable();
15778 CPUSET_DEL(cpuset, CPU->cpu_id);
15779 xt_sync(cpuset);
15780 xt_sync(cpuset);
15781 kpreempt_enable();
15783 hmeblkp->hblk_nextpa = HMEBLK_ENDPA;
15784 hmeblkp->hblk_next = NULL;
15785 } else {
15786 /* Append hmeblkp to listp for processing later. */
15787 hmeblkp->hblk_next = *listp;
15788 *listp = hmeblkp;
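#if 0	/* illustrative sketch -- not part of the build */
/*
 * A sketch of the lock-free read-modify-write loop used above to clear a
 * bit in the shadow hmeblk's hblk_shw_mask: re-read the word and retry
 * until the compare-and-swap installs the update against an unchanged old
 * value.  ex_clear_bit() is hypothetical.
 */
static void
ex_clear_bit(volatile uint32_t *maskp, uint_t bit)
{
	uint32_t oldmask, newmask;

	do {
		oldmask = *maskp;
		newmask = oldmask & ~(1U << bit);
	} while (atomic_cas_32(maskp, oldmask, newmask) != oldmask);
}
#endif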
15793 * This routine is called when memory is in short supply and returns a free
15794 * hmeblk of the requested size from the cpu pending lists.
15796 static struct hme_blk *
15797 sfmmu_check_pending_hblks(int size)
15799 int i;
15800 struct hme_blk *hmeblkp = NULL, *last_hmeblkp;
15801 int found_hmeblk;
15802 cpuset_t cpuset = cpu_ready_set;
15803 cpu_hme_pend_t *cpuhp;
15805 /* Flush cpu hblk pending queues */
15806 for (i = 0; i < NCPU; i++) {
15807 cpuhp = &cpu_hme_pend[i];
15808 if (cpuhp->chp_listp != NULL) {
15809 mutex_enter(&cpuhp->chp_mutex);
15810 if (cpuhp->chp_listp == NULL) {
15811 mutex_exit(&cpuhp->chp_mutex);
15812 continue;
15814 found_hmeblk = 0;
15815 last_hmeblkp = NULL;
15816 for (hmeblkp = cpuhp->chp_listp; hmeblkp != NULL;
15817 hmeblkp = hmeblkp->hblk_next) {
15818 if (get_hblk_ttesz(hmeblkp) == size) {
15819 if (last_hmeblkp == NULL) {
15820 cpuhp->chp_listp =
15821 hmeblkp->hblk_next;
15822 } else {
15823 last_hmeblkp->hblk_next =
15824 hmeblkp->hblk_next;
15826 ASSERT(cpuhp->chp_count > 0);
15827 cpuhp->chp_count--;
15828 found_hmeblk = 1;
15829 break;
15830 } else {
15831 last_hmeblkp = hmeblkp;
15834 mutex_exit(&cpuhp->chp_mutex);
15836 if (found_hmeblk) {
15837 kpreempt_disable();
15838 CPUSET_DEL(cpuset, CPU->cpu_id);
15839 xt_sync(cpuset);
15840 xt_sync(cpuset);
15841 kpreempt_enable();
15842 return (hmeblkp);
15846 return (NULL);