/*
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 */

/*
 * Copyright (c) 1993, 2010, Oracle and/or its affiliates. All rights reserved.
 */
/*
 * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
 */

/*
 * VM - Hardware Address Translation management for Spitfire MMU.
 *
 * This file implements the machine specific hardware translation
 * needed by the VM system.  The machine independent interface is
 * described in <vm/hat.h> while the machine dependent interface
 * and data structures are described in <vm/hat_sfmmu.h>.
 *
 * The hat layer manages the address translation hardware as a cache
 * driven by calls from the higher levels in the VM system.
 */
#include <sys/types.h>
#include <sys/kstat.h>
#include <vm/hat_sfmmu.h>
#include <sys/systm.h>
#include <sys/sysmacros.h>
#include <sys/machparam.h>
#include <sys/vtrace.h>
#include <sys/cmn_err.h>
#include <sys/cpuvar.h>
#include <sys/debug.h>
#include <sys/archsystm.h>
#include <sys/machsystm.h>
#include <sys/vmsystm.h>
#include <vm/seg_kp.h>
#include <vm/seg_kmem.h>
#include <vm/seg_kpm.h>
#include <sys/t_lock.h>
#include <sys/obpdefs.h>
#include <sys/vm_machparam.h>
#include <sys/machtrap.h>
#include <sys/bitmap.h>
#include <sys/machlock.h>
#include <sys/membar.h>
#include <sys/atomic.h>
#include <sys/cpu_module.h>
#include <sys/prom_debug.h>
#include <sys/ksynch.h>
#include <sys/mem_config.h>
#include <sys/mem_cage.h>
#include <vm/vm_dep.h>
#include <vm/xhat_sfmmu.h>
#include <sys/fpu/fpusystm.h>
#include <vm/mach_kpm.h>
#include <sys/callb.h>
#ifdef	DEBUG
#define	SFMMU_VALIDATE_HMERID(hat, rid, saddr, len)			\
	if (SFMMU_IS_SHMERID_VALID(rid)) {				\
		caddr_t _eaddr = (saddr) + (len);			\
		sf_srd_t *_srdp;					\
		sf_region_t *_rgnp;					\
		ASSERT((rid) < SFMMU_MAX_HME_REGIONS);			\
		ASSERT(SF_RGNMAP_TEST(hat->sfmmu_hmeregion_map, rid));	\
		ASSERT((hat) != ksfmmup);				\
		_srdp = (hat)->sfmmu_srdp;				\
		ASSERT(_srdp != NULL);					\
		ASSERT(_srdp->srd_refcnt != 0);				\
		_rgnp = _srdp->srd_hmergnp[(rid)];			\
		ASSERT(_rgnp != NULL && _rgnp->rgn_id == rid);		\
		ASSERT(_rgnp->rgn_refcnt != 0);				\
		ASSERT(!(_rgnp->rgn_flags & SFMMU_REGION_FREE));	\
		ASSERT((_rgnp->rgn_flags & SFMMU_REGION_TYPE_MASK) ==	\
		    SFMMU_REGION_HME);					\
		ASSERT((saddr) >= _rgnp->rgn_saddr);			\
		ASSERT((saddr) < _rgnp->rgn_saddr + _rgnp->rgn_size);	\
		ASSERT(_eaddr > _rgnp->rgn_saddr);			\
		ASSERT(_eaddr <= _rgnp->rgn_saddr + _rgnp->rgn_size);	\
	}

#define	SFMMU_VALIDATE_SHAREDHBLK(hmeblkp, srdp, rgnp, rid)		\
{									\
	caddr_t _hsva;							\
	caddr_t _heva;							\
	caddr_t _rsva;							\
	caddr_t _reva;							\
	int _ttesz = get_hblk_ttesz(hmeblkp);				\
	uint_t _flagtte;						\
	ASSERT((srdp)->srd_refcnt != 0);				\
	ASSERT((rid) < SFMMU_MAX_HME_REGIONS);				\
	ASSERT((rgnp)->rgn_id == rid);					\
	ASSERT(!((rgnp)->rgn_flags & SFMMU_REGION_FREE));		\
	ASSERT(((rgnp)->rgn_flags & SFMMU_REGION_TYPE_MASK) ==		\
	    SFMMU_REGION_HME);						\
	ASSERT(_ttesz <= (rgnp)->rgn_pgszc);				\
	_hsva = (caddr_t)get_hblk_base(hmeblkp);			\
	_heva = get_hblk_endaddr(hmeblkp);				\
	_rsva = (caddr_t)P2ALIGN(					\
	    (uintptr_t)(rgnp)->rgn_saddr, HBLK_MIN_BYTES);		\
	_reva = (caddr_t)P2ROUNDUP(					\
	    (uintptr_t)((rgnp)->rgn_saddr + (rgnp)->rgn_size),		\
	    HBLK_MIN_BYTES);						\
	ASSERT(_hsva >= _rsva);						\
	ASSERT(_hsva < _reva);						\
	ASSERT(_heva > _rsva);						\
	ASSERT(_heva <= _reva);						\
	_flagtte = (_ttesz < HBLK_MIN_TTESZ) ? HBLK_MIN_TTESZ :	\
	    _ttesz;							\
	ASSERT(rgnp->rgn_hmeflags & (0x1 << _flagtte));			\
}

#else	/* DEBUG */
#define	SFMMU_VALIDATE_HMERID(hat, rid, addr, len)
#define	SFMMU_VALIDATE_SHAREDHBLK(hmeblkp, srdp, rgnp, rid)
#endif	/* DEBUG */
#if defined(SF_ERRATA_57)
extern caddr_t errata57_limit;
#endif

#define	HME8BLK_SZ_RND		((roundup(HME8BLK_SZ, sizeof (int64_t))) /  \
				(sizeof (int64_t)))

#define	HBLK_RESERVE		((struct hme_blk *)hblk_reserve)

#define	HBLK_RESERVE_CNT	128
#define	HBLK_RESERVE_MIN	20

static struct hme_blk		*freehblkp;
static kmutex_t			freehblkp_lock;
static int			freehblkcnt;

static int64_t			hblk_reserve[HME8BLK_SZ_RND];
static kmutex_t			hblk_reserve_lock;
static kthread_t		*hblk_reserve_thread;

static nucleus_hblk8_info_t	nucleus_hblk8;
static nucleus_hblk1_info_t	nucleus_hblk1;
/*
 * Data to manage per-cpu hmeblk pending queues, hmeblks are queued here
 * after the initial phase of removing an hmeblk from the hash chain, see
 * the detailed comment in sfmmu_hblk_hash_rm() for further details.
 */
static cpu_hme_pend_t		*cpu_hme_pend;
static uint_t			cpu_hme_pend_thresh;

/*
 * SFMMU specific hat functions
 */
void	hat_pagecachectl(struct page *, int);

/* flags for hat_pagecachectl */
#define	HAT_CACHE	0x1
#define	HAT_UNCACHE	0x2
#define	HAT_TMPNC	0x4

/*
 * Flag to allow the creation of non-cacheable translations
 * to system memory. It is off by default. At the moment this
 * flag is used by the ecache error injector. The error injector
 * will turn it on when creating such a translation then shut it
 * off when it's finished.
 */
int	sfmmu_allow_nc_trans = 0;
/*
 * Flag to disable large page support.
 *	value of 1 => disable all large pages.
 *	bits 1, 2, and 3 are to disable 64K, 512K and 4M pages respectively.
 *
 * For example, use the value 0x4 to disable 512K pages.
 */
#define	LARGE_PAGES_OFF		0x1

/*
 * The disable_large_pages and disable_ism_large_pages variables control
 * hat_memload_array and the page sizes to be used by ISM and the kernel.
 *
 * The disable_auto_data_large_pages and disable_auto_text_large_pages variables
 * are only used to control which OOB pages to use at upper VM segment creation
 * time, and are set in hat_init_pagesizes and used in the map_pgsz* routines.
 * Their values may come from platform or CPU specific code to disable page
 * sizes that should not be used.
 *
 * WARNING: 512K pages are currently not supported for ISM/DISM.
 */
uint_t	disable_large_pages = 0;
uint_t	disable_ism_large_pages = (1 << TTE512K);
uint_t	disable_auto_data_large_pages = 0;
uint_t	disable_auto_text_large_pages = 0;
/*
 * Private sfmmu data structures for hat management
 */
static struct kmem_cache *sfmmuid_cache;
static struct kmem_cache *mmuctxdom_cache;

/*
 * Private sfmmu data structures for tsb management
 */
static struct kmem_cache *sfmmu_tsbinfo_cache;
static struct kmem_cache *sfmmu_tsb8k_cache;
static struct kmem_cache *sfmmu_tsb_cache[NLGRPS_MAX];
static vmem_t *kmem_bigtsb_arena;
static vmem_t *kmem_tsb_arena;

/*
 * sfmmu static variables for hmeblk resource management.
 */
static vmem_t *hat_memload1_arena; /* HAT translation arena for sfmmu1_cache */
static struct kmem_cache *sfmmu8_cache;
static struct kmem_cache *sfmmu1_cache;
static struct kmem_cache *pa_hment_cache;

static kmutex_t		ism_mlist_lock;	/* mutex for ism mapping list */

/*
 * private data for ism
 */
static struct kmem_cache *ism_blk_cache;
static struct kmem_cache *ism_ment_cache;
#define	ISMID_STARTADDR	NULL
/*
 * Region management data structures and function declarations.
 */
static void	sfmmu_leave_srd(sfmmu_t *);
static int	sfmmu_srdcache_constructor(void *, void *, int);
static void	sfmmu_srdcache_destructor(void *, void *);
static int	sfmmu_rgncache_constructor(void *, void *, int);
static void	sfmmu_rgncache_destructor(void *, void *);
static int	sfrgnmap_isnull(sf_region_map_t *);
static int	sfhmergnmap_isnull(sf_hmeregion_map_t *);
static int	sfmmu_scdcache_constructor(void *, void *, int);
static void	sfmmu_scdcache_destructor(void *, void *);
static void	sfmmu_rgn_cb_noop(caddr_t, caddr_t, caddr_t,
    size_t, void *, u_offset_t);

static uint_t srd_hashmask = SFMMU_MAX_SRD_BUCKETS - 1;
static sf_srd_bucket_t *srd_buckets;
static struct kmem_cache *srd_cache;
static uint_t srd_rgn_hashmask = SFMMU_MAX_REGION_BUCKETS - 1;
static struct kmem_cache *region_cache;
static struct kmem_cache *scd_cache;
#ifdef sun4v
int use_bigtsb_arena = 1;
#else
int use_bigtsb_arena = 0;
#endif
/* External /etc/system tunable, for turning on&off the shctx support */
int disable_shctx = 0;
/* Internal variable, set by MD if the HW supports shctx feature */
int shctx_on = 0;
static void check_scd_sfmmu_list(sfmmu_t **, sfmmu_t *, int);

static void	sfmmu_to_scd_list(sfmmu_t **, sfmmu_t *);
static void	sfmmu_from_scd_list(sfmmu_t **, sfmmu_t *);
static sf_scd_t *sfmmu_alloc_scd(sf_srd_t *, sf_region_map_t *);
static void	sfmmu_find_scd(sfmmu_t *);
static void	sfmmu_join_scd(sf_scd_t *, sfmmu_t *);
static void	sfmmu_finish_join_scd(sfmmu_t *);
static void	sfmmu_leave_scd(sfmmu_t *, uchar_t);
static void	sfmmu_destroy_scd(sf_srd_t *, sf_scd_t *, sf_region_map_t *);
static int	sfmmu_alloc_scd_tsbs(sf_srd_t *, sf_scd_t *);
static void	sfmmu_free_scd_tsbs(sfmmu_t *);
static void	sfmmu_tsb_inv_ctx(sfmmu_t *);
static int	find_ism_rid(sfmmu_t *, sfmmu_t *, caddr_t, uint_t *);
static void	sfmmu_ism_hatflags(sfmmu_t *, int);
static int	sfmmu_srd_lock_held(sf_srd_t *);
static void	sfmmu_remove_scd(sf_scd_t **, sf_scd_t *);
static void	sfmmu_add_scd(sf_scd_t **headp, sf_scd_t *);
static void	sfmmu_link_scd_to_regions(sf_srd_t *, sf_scd_t *);
static void	sfmmu_unlink_scd_from_regions(sf_srd_t *, sf_scd_t *);
static void	sfmmu_link_to_hmeregion(sfmmu_t *, sf_region_t *);
static void	sfmmu_unlink_from_hmeregion(sfmmu_t *, sf_region_t *);
/*
 * ``hat_lock'' is a hashed mutex lock for protecting sfmmu TSB lists,
 * HAT flags, synchronizing TLB/TSB coherency, and context management.
 * The lock is hashed on the sfmmup since the case where we need to lock
 * all processes is rare but does occur (e.g. we need to unload a shared
 * mapping from all processes using the mapping).  We have a lot of buckets,
 * and each slab of sfmmu_t's can use about a quarter of them, giving us
 * a fairly good distribution without wasting too much space and overhead
 * when we have to grab them all.
 */
#define	SFMMU_NUM_LOCK	128		/* must be power of two */
hatlock_t	hat_lock[SFMMU_NUM_LOCK];
/*
 * Hash algorithm optimized for a small number of slabs.
 *  7 is (highbit((sizeof sfmmu_t)) - 1)
 * This hash algorithm is based upon the knowledge that sfmmu_t's come from a
 * kmem_cache, and thus they will be sequential within that cache.  In
 * addition, each new slab will have a different "color" up to cache_maxcolor
 * which will skew the hashing for each successive slab which is allocated.
 * If the size of sfmmu_t changed to a larger size, this algorithm may need
 * to be changed.
 */
#define	TSB_HASH_SHIFT_BITS (7)
#define	PTR_HASH(x) ((uintptr_t)x >> TSB_HASH_SHIFT_BITS)

#ifdef DEBUG
int tsb_hash_debug = 0;
#define	TSB_HASH(sfmmup) \
	(tsb_hash_debug ? &hat_lock[0] : \
	&hat_lock[PTR_HASH(sfmmup) & (SFMMU_NUM_LOCK-1)])
#else	/* DEBUG */
#define	TSB_HASH(sfmmup)	&hat_lock[PTR_HASH(sfmmup) & (SFMMU_NUM_LOCK-1)]
#endif	/* DEBUG */
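
/*
 * Illustrative example (hypothetical addresses, not from the original
 * comments): with the 7-bit shift, sfmmu_t's allocated back-to-back in a
 * slab at, say, 0x30001000080 and 0x30001000100 hash to consecutive
 * hat_lock buckets via (ptr >> 7) & (SFMMU_NUM_LOCK - 1), while kmem cache
 * coloring offsets successive slabs so different slabs tend to use a
 * staggered, non-overlapping range of buckets.
 */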
/* sfmmu_replace_tsb() return codes. */
typedef enum tsb_replace_rc {
/*
 * Flags for TSB allocation routines.
 */
#define	TSB_ALLOC	0x01
#define	TSB_FORCEALLOC	0x02
#define	TSB_GROW	0x04
#define	TSB_SHRINK	0x08
#define	TSB_SWAPIN	0x10
/*
 * Support for HAT callbacks.
 */
#define	SFMMU_MAX_RELOC_CALLBACKS	10
int sfmmu_max_cb_id = SFMMU_MAX_RELOC_CALLBACKS;
static id_t sfmmu_cb_nextid = 0;
static id_t sfmmu_tsb_cb_id;
struct sfmmu_callback *sfmmu_cb_table;
kmutex_t	kpr_suspendlock;
kthread_t	*kreloc_thread;
/*
 * Enable VA->PA translation sanity checking on DEBUG kernels.
 * Disabled by default.  This is incompatible with some
 * drivers (error injector, RSM) so if it breaks you get
 * to keep both pieces.
 */
int hat_check_vtop = 0;
/*
 * Private sfmmu routines (prototypes)
 */
static struct hme_blk *sfmmu_shadow_hcreate(sfmmu_t *, caddr_t, int, uint_t);
static struct hme_blk *sfmmu_hblk_alloc(sfmmu_t *, caddr_t,
			struct hmehash_bucket *, uint_t, hmeblk_tag, uint_t,
			uint_t);
static caddr_t	sfmmu_hblk_unload(struct hat *, struct hme_blk *, caddr_t,
			caddr_t, demap_range_t *, uint_t);
static caddr_t	sfmmu_hblk_sync(struct hat *, struct hme_blk *, caddr_t,
			caddr_t, int);
static void	sfmmu_hblk_free(struct hme_blk **);
static void	sfmmu_hblks_list_purge(struct hme_blk **, int);
static uint_t	sfmmu_get_free_hblk(struct hme_blk **, uint_t);
static uint_t	sfmmu_put_free_hblk(struct hme_blk *, uint_t);
static struct hme_blk *sfmmu_hblk_steal(int);
static int	sfmmu_steal_this_hblk(struct hmehash_bucket *,
			struct hme_blk *, uint64_t, struct hme_blk *);
static caddr_t	sfmmu_hblk_unlock(struct hme_blk *, caddr_t, caddr_t);

static void	hat_do_memload_array(struct hat *, caddr_t, size_t,
		    struct page **, uint_t, uint_t, uint_t);
static void	hat_do_memload(struct hat *, caddr_t, struct page *,
		    uint_t, uint_t, uint_t);
static void	sfmmu_memload_batchsmall(struct hat *, caddr_t, page_t **,
		    uint_t, uint_t, pgcnt_t, uint_t);
void		sfmmu_tteload(struct hat *, tte_t *, caddr_t, page_t *,
			uint_t);
static int	sfmmu_tteload_array(sfmmu_t *, tte_t *, caddr_t, page_t **,
			uint_t, uint_t);
static struct hmehash_bucket *sfmmu_tteload_acquire_hashbucket(sfmmu_t *,
			caddr_t, int, uint_t);
static struct hme_blk *sfmmu_tteload_find_hmeblk(sfmmu_t *,
			struct hmehash_bucket *, caddr_t, uint_t, uint_t,
			uint_t);
static int	sfmmu_tteload_addentry(sfmmu_t *, struct hme_blk *, tte_t *,
			caddr_t, page_t **, uint_t, uint_t);
static void	sfmmu_tteload_release_hashbucket(struct hmehash_bucket *);
static int	sfmmu_pagearray_setup(caddr_t, page_t **, tte_t *, int);
static pfn_t	sfmmu_uvatopfn(caddr_t, sfmmu_t *, tte_t *);
void		sfmmu_memtte(tte_t *, pfn_t, uint_t, int);
static void	sfmmu_vac_conflict(struct hat *, caddr_t, page_t *);
static int	sfmmu_vacconflict_array(caddr_t, page_t *, int *);
int	tst_tnc(page_t *pp, pgcnt_t);
void	conv_tnc(page_t *pp, int);

static void	sfmmu_get_ctx(sfmmu_t *);
static void	sfmmu_free_sfmmu(sfmmu_t *);

static void	sfmmu_ttesync(struct hat *, caddr_t, tte_t *, page_t *);
static void	sfmmu_chgattr(struct hat *, caddr_t, size_t, uint_t, int);

cpuset_t	sfmmu_pageunload(page_t *, struct sf_hment *, int);
static void	hat_pagereload(struct page *, struct page *);
static cpuset_t	sfmmu_pagesync(page_t *, struct sf_hment *, uint_t);
void	sfmmu_page_cache_array(page_t *, int, int, pgcnt_t);
static void	sfmmu_page_cache(page_t *, int, int, int);

cpuset_t	sfmmu_rgntlb_demap(caddr_t, sf_region_t *,
    struct hme_blk *, int);
static void	sfmmu_tlbcache_demap(caddr_t, sfmmu_t *, struct hme_blk *,
	pfn_t, int, int, int, int);
static void	sfmmu_ismtlbcache_demap(caddr_t, sfmmu_t *, struct hme_blk *,
	pfn_t, int);
static void	sfmmu_tlb_demap(caddr_t, sfmmu_t *, struct hme_blk *, int, int);
static void	sfmmu_tlb_range_demap(demap_range_t *);
static void	sfmmu_invalidate_ctx(sfmmu_t *);
static void	sfmmu_sync_mmustate(sfmmu_t *);

static void	sfmmu_tsbinfo_setup_phys(struct tsb_info *, pfn_t);
static int	sfmmu_tsbinfo_alloc(struct tsb_info **, int, int, uint_t,
			sfmmu_t *);
static void	sfmmu_tsb_free(struct tsb_info *);
static void	sfmmu_tsbinfo_free(struct tsb_info *);
static int	sfmmu_init_tsbinfo(struct tsb_info *, int, int, uint_t,
			sfmmu_t *);
static void	sfmmu_tsb_chk_reloc(sfmmu_t *, hatlock_t *);
static void	sfmmu_tsb_swapin(sfmmu_t *, hatlock_t *);
static int	sfmmu_select_tsb_szc(pgcnt_t);
static void	sfmmu_mod_tsb(sfmmu_t *, caddr_t, tte_t *, int);
#define		sfmmu_load_tsb(sfmmup, vaddr, tte, szc) \
	sfmmu_mod_tsb(sfmmup, vaddr, tte, szc)
#define		sfmmu_unload_tsb(sfmmup, vaddr, szc)	\
	sfmmu_mod_tsb(sfmmup, vaddr, NULL, szc)
static void	sfmmu_copy_tsb(struct tsb_info *, struct tsb_info *);
static tsb_replace_rc_t sfmmu_replace_tsb(sfmmu_t *, struct tsb_info *, uint_t,
    hatlock_t *, uint_t);
static void	sfmmu_size_tsb(sfmmu_t *, int, uint64_t, uint64_t, int);

void	sfmmu_cache_flush(pfn_t, int);
void	sfmmu_cache_flushcolor(int, pfn_t);
static caddr_t	sfmmu_hblk_chgattr(sfmmu_t *, struct hme_blk *, caddr_t,
	caddr_t, demap_range_t *, uint_t, int);

static uint64_t	sfmmu_vtop_attr(uint_t, int mode, tte_t *);
static uint_t	sfmmu_ptov_attr(tte_t *);
static caddr_t	sfmmu_hblk_chgprot(sfmmu_t *, struct hme_blk *, caddr_t,
	caddr_t, demap_range_t *, uint_t);
static uint_t	sfmmu_vtop_prot(uint_t, uint_t *);
static int	sfmmu_idcache_constructor(void *, void *, int);
static void	sfmmu_idcache_destructor(void *, void *);
static int	sfmmu_hblkcache_constructor(void *, void *, int);
static void	sfmmu_hblkcache_destructor(void *, void *);
static void	sfmmu_hblkcache_reclaim(void *);
static void	sfmmu_shadow_hcleanup(sfmmu_t *, struct hme_blk *,
			struct hmehash_bucket *);
static void	sfmmu_hblk_hash_rm(struct hmehash_bucket *, struct hme_blk *,
			struct hme_blk *, struct hme_blk **, int);
static void	sfmmu_hblk_hash_add(struct hmehash_bucket *, struct hme_blk *,
			uint64_t);
static struct hme_blk *sfmmu_check_pending_hblks(int);
static void	sfmmu_free_hblks(sfmmu_t *, caddr_t, caddr_t, int);
static void	sfmmu_cleanup_rhblk(sf_srd_t *, caddr_t, uint_t, int);
static void	sfmmu_unload_hmeregion_va(sf_srd_t *, uint_t, caddr_t, caddr_t,
			int, caddr_t *);
static void	sfmmu_unload_hmeregion(sf_srd_t *, sf_region_t *);

static void	sfmmu_rm_large_mappings(page_t *, int);

static void	hat_lock_init(void);
static void	hat_kstat_init(void);
static int	sfmmu_kstat_percpu_update(kstat_t *ksp, int rw);
static void	sfmmu_set_scd_rttecnt(sf_srd_t *, sf_scd_t *);
static int	sfmmu_is_rgnva(sf_srd_t *, caddr_t, ulong_t, ulong_t);
static void	sfmmu_check_page_sizes(sfmmu_t *, int);
int	fnd_mapping_sz(page_t *);
static void	iment_add(struct ism_ment *, struct hat *);
static void	iment_sub(struct ism_ment *, struct hat *);
static pgcnt_t	ism_tsb_entries(sfmmu_t *, int szc);
extern void	sfmmu_setup_tsbinfo(sfmmu_t *);
extern void	sfmmu_clear_utsbinfo(void);

static void	sfmmu_ctx_wrap_around(mmu_ctx_t *, boolean_t);

extern int vpm_enable;
/*
 * Enable trap level tsbmiss handling
 */

/*
 * Flush the TLB on kpm mapout. Note: Xcalls are used (again) for the
 * required TLB shootdowns in this case, so handle w/ care. Off by default.
 */

static void	*sfmmu_vmem_xalloc_aligned_wrapper(vmem_t *, size_t, int);
static void	sfmmu_check_hblk_flist();
/*
 * Semi-private sfmmu data structures.  Some of them are initialized in
 * startup or in hat_init. Some of them are private but accessed by
 * assembly code or mach_sfmmu.c
 */
struct hmehash_bucket *uhme_hash;	/* user hmeblk hash table */
struct hmehash_bucket *khme_hash;	/* kernel hmeblk hash table */

uint64_t	uhme_hash_pa;		/* PA of uhme_hash */
uint64_t	khme_hash_pa;		/* PA of khme_hash */

int		uhmehash_num;		/* # of buckets in user hash table */
int		khmehash_num;		/* # of buckets in kernel hash table */

uint_t		max_mmu_ctxdoms = 0;	/* max context domains in the system */
mmu_ctx_t	**mmu_ctxs_tbl;		/* global array of context domains */
uint64_t	mmu_saved_gnum = 0;	/* to init incoming MMUs' gnums */

#define	DEFAULT_NUM_CTXS_PER_MMU 8192
static uint_t	nctxs = DEFAULT_NUM_CTXS_PER_MMU;
int	cache;			/* describes system cache */

caddr_t		ktsb_base;	/* kernel 8k-indexed tsb base address */
uint64_t	ktsb_pbase;	/* kernel 8k-indexed tsb phys address */
int		ktsb_szcode;	/* kernel 8k-indexed tsb size code */
int		ktsb_sz;	/* kernel 8k-indexed tsb size */

caddr_t		ktsb4m_base;	/* kernel 4m-indexed tsb base address */
uint64_t	ktsb4m_pbase;	/* kernel 4m-indexed tsb phys address */
int		ktsb4m_szcode;	/* kernel 4m-indexed tsb size code */
int		ktsb4m_sz;	/* kernel 4m-indexed tsb size */

uint64_t	kpm_tsbbase;	/* kernel seg_kpm 4M TSB base address */
int		kpm_tsbsz;	/* kernel seg_kpm 4M TSB size code */
uint64_t	kpmsm_tsbbase;	/* kernel seg_kpm 8K TSB base address */
int		kpmsm_tsbsz;	/* kernel seg_kpm 8K TSB size code */

int	utsb_dtlb_ttenum = -1;		/* index in TLB for utsb locked TTE */
int	utsb4m_dtlb_ttenum = -1;	/* index in TLB for 4M TSB TTE */
int	dtlb_resv_ttenum;		/* index in TLB of first reserved TTE */
caddr_t	utsb_vabase;			/* reserved kernel virtual memory */
caddr_t	utsb4m_vabase;			/* for trap handler TSB accesses */

uint64_t	tsb_alloc_bytes = 0;	/* bytes allocated to TSBs */
vmem_t		*kmem_tsb_default_arena[NLGRPS_MAX];	/* For dynamic TSBs */
vmem_t		*kmem_bigtsb_default_arena[NLGRPS_MAX]; /* dynamic 256M TSBs */
/*
 * Size to use for TSB slabs.  Future platforms that support page sizes
 * larger than 4M may wish to change these values, and provide their own
 * assembly macros for building and decoding the TSB base register contents.
 * Note disable_large_pages will override the value set here.
 */
static	uint_t	tsb_slab_ttesz = TTE4M;
size_t	tsb_slab_size = MMU_PAGESIZE4M;
uint_t	tsb_slab_shift = MMU_PAGESHIFT4M;
/* PFN mask for TTE */
size_t	tsb_slab_mask = MMU_PAGEOFFSET4M >> MMU_PAGESHIFT;

/*
 * Size to use for TSB slabs.  These are used only when 256M tsb arenas
 * exist.
 */
static uint_t	bigtsb_slab_ttesz = TTE256M;
static size_t	bigtsb_slab_size = MMU_PAGESIZE256M;
static uint_t	bigtsb_slab_shift = MMU_PAGESHIFT256M;
/* 256M page alignment for 8K pfn */
static size_t	bigtsb_slab_mask = MMU_PAGEOFFSET256M >> MMU_PAGESHIFT;

/* largest TSB size to grow to, will be smaller on smaller memory systems */
static int	tsb_max_growsize = 0;
/*
 * Tunable parameters dealing with TSB policies.
 */

/*
 * This undocumented tunable forces all 8K TSBs to be allocated from
 * the kernel heap rather than from the kmem_tsb_default_arena arenas.
 */
int	tsb_forceheap = 0;

/*
 * Decide whether to use per-lgroup arenas, or one global set of
 * TSB arenas.  The default is not to break up per-lgroup, since
 * most platforms don't recognize any tangible benefit from it.
 */
int	tsb_lgrp_affinity = 0;
/*
 * Used for growing the TSB based on the process RSS.
 * tsb_rss_factor is based on the smallest TSB, and is
 * shifted by the TSB size to determine if we need to grow.
 * The default will grow the TSB if the number of TTEs for
 * this page size exceeds 75% of the number of TSB entries,
 * which should _almost_ eliminate all conflict misses
 * (at the expense of using up lots and lots of memory).
 */
#define	TSB_RSS_FACTOR		(TSB_ENTRIES(TSB_MIN_SZCODE) * 0.75)
#define	SFMMU_RSS_TSBSIZE(tsbszc)	(tsb_rss_factor << tsbszc)
#define	SELECT_TSB_SIZECODE(pgcnt) ( \
	(enable_tsb_rss_sizing)? sfmmu_select_tsb_szc(pgcnt) : \
	default_tsb_size)
#define	TSB_OK_SHRINK()	\
	(tsb_alloc_bytes > tsb_alloc_hiwater || freemem < desfree)
#define	TSB_OK_GROW()	\
	(tsb_alloc_bytes < tsb_alloc_hiwater && freemem > desfree)

int	enable_tsb_rss_sizing = 1;
int	tsb_rss_factor = (int)TSB_RSS_FACTOR;
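
/*
 * Worked example (illustrative; assumes, hypothetically, that the smallest
 * TSB holds 512 entries): tsb_rss_factor would then be 384, so a process
 * whose TTE count for a page size exceeds SFMMU_RSS_TSBSIZE(szc) =
 * 384 << szc for its current TSB size code becomes a candidate to grow,
 * i.e. growth triggers once the resident set passes 75% of the current
 * TSB's capacity.
 */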
/* which TSB size code to use for new address spaces or if rss sizing off */
int default_tsb_size = TSB_8K_SZCODE;

static uint64_t tsb_alloc_hiwater; /* limit TSB reserved memory */
uint64_t tsb_alloc_hiwater_factor; /* tsb_alloc_hiwater = physmem / this */
#define	TSB_ALLOC_HIWATER_FACTOR_DEFAULT	32
static int tsb_random_size = 0;	/* set to 1 to test random tsb sizes on alloc */
static int tsb_grow_stress = 0;	/* if set to 1, keep replacing TSB w/ random */
static int tsb_alloc_mtbf = 0;	/* fail allocation every n attempts */
static int tsb_alloc_fail_mtbf = 0;
static int tsb_alloc_count = 0;
/* if set to 1, will remap valid TTEs when growing TSB. */
int tsb_remap_ttes = 1;
/*
 * If we have more than this many mappings, allocate a second TSB.
 * This default is chosen because the I/D fully associative TLBs are
 * assumed to have at least 8 available entries. Platforms with a
 * larger fully-associative TLB could probably override the default.
 */
#if defined(UTSB_PHYS)
int	tsb_sectsb_threshold = 0;
#else
int	tsb_sectsb_threshold = 8;
#endif
struct sfmmu_global_stat sfmmu_global_stat;
struct sfmmu_tsbsize_stat sfmmu_tsbsize_stat;

sfmmu_t		*ksfmmup;		/* kernel's hat id */
static void	chk_tte(tte_t *, tte_t *, tte_t *, struct hme_blk *);
/* sfmmu locking operations */
static kmutex_t *sfmmu_mlspl_enter(struct page *, int);
static int	sfmmu_mlspl_held(struct page *, int);

kmutex_t *sfmmu_page_enter(page_t *);
void	sfmmu_page_exit(kmutex_t *);
int	sfmmu_page_spl_held(struct page *);

/* sfmmu internal locking operations - accessed directly */
static void	sfmmu_mlist_reloc_enter(page_t *, page_t *,
				kmutex_t **, kmutex_t **);
static void	sfmmu_mlist_reloc_exit(kmutex_t *, kmutex_t *);
static hatlock_t *sfmmu_hat_enter(sfmmu_t *);
static hatlock_t *sfmmu_hat_tryenter(sfmmu_t *);
static void	sfmmu_hat_exit(hatlock_t *);
static void	sfmmu_hat_lock_all(void);
static void	sfmmu_hat_unlock_all(void);
static void	sfmmu_ismhat_enter(sfmmu_t *, int);
static void	sfmmu_ismhat_exit(sfmmu_t *, int);
kpm_hlk_t	*kpmp_table;
uint_t		kpmp_table_sz;	/* must be a power of 2 */

kpm_shlk_t	*kpmp_stable;
uint_t		kpmp_stable_sz;	/* must be a power of 2 */
/*
 * SPL_TABLE_SIZE is 2 * NCPU, but no smaller than 128.
 * SPL_SHIFT is log2(SPL_TABLE_SIZE).
 */
#if ((2*NCPU_P2) > 128)
#define	SPL_SHIFT	((unsigned)(NCPU_LOG2 + 1))
#else
#define	SPL_SHIFT	7U
#endif
#define	SPL_TABLE_SIZE	(1U << SPL_SHIFT)
#define	SPL_MASK	(SPL_TABLE_SIZE - 1)

/*
 * We shift by PP_SHIFT to take care of the low-order 0 bits of a page_t
 * and by multiples of SPL_SHIFT to get as many varied bits as we can.
 */
#define	SPL_INDEX(pp) \
	((((uintptr_t)(pp) >> PP_SHIFT) ^ \
	((uintptr_t)(pp) >> (PP_SHIFT + SPL_SHIFT)) ^ \
	((uintptr_t)(pp) >> (PP_SHIFT + SPL_SHIFT * 2)) ^ \
	((uintptr_t)(pp) >> (PP_SHIFT + SPL_SHIFT * 3))) & \
	SPL_MASK)

#define	SPL_HASH(pp) \
	(&sfmmu_page_lock[SPL_INDEX(pp)].pad_mutex)
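
/*
 * Worked example (illustrative, not part of the original comments): with
 * SPL_SHIFT at its 128-bucket minimum of 7, SPL_INDEX() first discards the
 * low-order zero bits of the page_t address with PP_SHIFT, XORs together
 * four successive 7-bit windows of what remains, and masks the result with
 * SPL_MASK, so page_t's that differ in any of those windows usually map to
 * different pad_mutexes in sfmmu_page_lock[].
 */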
static	pad_mutex_t	sfmmu_page_lock[SPL_TABLE_SIZE];

/* Array of mutexes protecting a page's mapping list and p_nrm field. */
#define	MML_TABLE_SIZE	SPL_TABLE_SIZE
#define	MLIST_HASH(pp)	(&mml_table[SPL_INDEX(pp)].pad_mutex)

static pad_mutex_t	mml_table[MML_TABLE_SIZE];
/*
 * hat_unload_callback() will group together callbacks in order
 * to avoid xt_sync() calls.  This is the maximum size of the group.
 */
#define	MAX_CB_ADDR	32

static ulong_t sfmmu_dmr_maxbit = DMR_MAXBIT;

static char	*mmu_ctx_kstat_names[] = {
	"mmu_ctx_tsb_exceptions",
	"mmu_ctx_tsb_raise_exception",
	"mmu_ctx_wrap_around",
};
/*
 * Wrapper for vmem_xalloc since vmem_create only allows limited
 * parameters for vm_source_alloc functions.  This function allows us
 * to specify alignment consistent with the size of the object being
 * allocated.
 */
static void *
sfmmu_vmem_xalloc_aligned_wrapper(vmem_t *vmp, size_t size, int vmflag)
{
	return (vmem_xalloc(vmp, size, size, 0, 0, NULL, NULL, vmflag));
}
/* Common code for setting tsb_alloc_hiwater. */
#define	SFMMU_SET_TSB_ALLOC_HIWATER(pages)	tsb_alloc_hiwater = \
	ptob(pages) / tsb_alloc_hiwater_factor
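
/*
 * For example, with tsb_alloc_hiwater_factor left at its default of 32,
 * SFMMU_SET_TSB_ALLOC_HIWATER(physmem) caps dynamically allocated TSB
 * memory at ptob(physmem) / 32, i.e. 1/32 of physical memory.
 */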
/*
 * Set tsb_max_growsize to allow at most all of physical memory to be mapped by
 * a single TSB.  physmem is the number of physical pages so we need physmem 8K
 * TTEs to represent all those physical pages.  We round this up by using
 * 1<<highbit().  To figure out which size code to use, remember that the size
 * code is just an amount to shift the smallest TSB size to get the size of
 * this TSB.  So we subtract that size, TSB_START_SIZE, from highbit() (or
 * highbit() - 1) to get the size code for the smallest TSB that can represent
 * all of physical memory, while erring on the side of too much.
 *
 * Restrict tsb_max_growsize to make sure that:
 *	1) TSBs can't grow larger than the TSB slab size
 *	2) TSBs can't grow larger than UTSB_MAX_SZCODE.
 */
#define	SFMMU_SET_TSB_MAX_GROWSIZE(pages) {				\
	int	_i, _szc, _slabszc, _tsbszc;				\
									\
	_i = highbit(pages);						\
	if ((1 << (_i - 1)) == (pages))					\
		_i--;		/* 2^n case, round down */		\
	_szc = _i - TSB_START_SIZE;					\
	_slabszc = bigtsb_slab_shift - (TSB_START_SIZE + TSB_ENTRY_SHIFT); \
	_tsbszc = MIN(_szc, _slabszc);					\
	tsb_max_growsize = MIN(_tsbszc, UTSB_MAX_SZCODE);		\
}
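
/*
 * Worked example (illustrative): for pages = 2^20 (8 GB of 8K pages),
 * highbit(pages) returns 21; since (1 << 20) == pages, _i is rounded down
 * to 20 and _szc becomes 20 - TSB_START_SIZE.  That size code is then
 * clamped by both the slab-derived limit and UTSB_MAX_SZCODE before being
 * stored in tsb_max_growsize.
 */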
/*
 * Given a pointer to an sfmmu and a TTE size code, return a pointer to the
 * tsb_info which handles that TTE size.
 */
#define	SFMMU_GET_TSBINFO(tsbinfop, sfmmup, tte_szc) {			\
	(tsbinfop) = (sfmmup)->sfmmu_tsb;				\
	ASSERT(((tsbinfop)->tsb_flags & TSB_SHAREDCTX) ||		\
	    sfmmu_hat_lock_held(sfmmup));				\
	if ((tte_szc) >= TTE4M) {					\
		ASSERT((tsbinfop) != NULL);				\
		(tsbinfop) = (tsbinfop)->tsb_next;			\
	}								\
}
/*
 * Macro to use to unload entries from the TSB.
 * It has knowledge of which page sizes get replicated in the TSB
 * and will call the appropriate unload routine for the appropriate size.
 */
#define	SFMMU_UNLOAD_TSB(addr, sfmmup, hmeblkp, ismhat)			\
{									\
	int ttesz = get_hblk_ttesz(hmeblkp);				\
	if (ttesz == TTE8K || ttesz == TTE4M) {				\
		sfmmu_unload_tsb(sfmmup, addr, ttesz);			\
	} else {							\
		caddr_t sva = ismhat ? addr :				\
		    (caddr_t)get_hblk_base(hmeblkp);			\
		caddr_t eva = sva + get_hblk_span(hmeblkp);		\
		ASSERT(addr >= sva && addr < eva);			\
		sfmmu_unload_tsb_range(sfmmup, sva, eva, ttesz);	\
	}								\
}
/* Update tsb_alloc_hiwater after memory is configured. */
static void
sfmmu_update_post_add(void *arg, pgcnt_t delta_pages)
{
	/* Assumes physmem has already been updated. */
	SFMMU_SET_TSB_ALLOC_HIWATER(physmem);
	SFMMU_SET_TSB_MAX_GROWSIZE(physmem);
}

/*
 * Update tsb_alloc_hiwater before memory is deleted.  We'll do nothing here
 * and update tsb_alloc_hiwater and tsb_max_growsize after the memory is
 * deleted.
 */
static int
sfmmu_update_pre_del(void *arg, pgcnt_t delta_pages)
{
	return (0);
}

/* Update tsb_alloc_hiwater after memory fails to be unconfigured. */
static void
sfmmu_update_post_del(void *arg, pgcnt_t delta_pages, int cancelled)
{
	/*
	 * Whether the delete was cancelled or not, just go ahead and update
	 * tsb_alloc_hiwater and tsb_max_growsize.
	 */
	SFMMU_SET_TSB_ALLOC_HIWATER(physmem);
	SFMMU_SET_TSB_MAX_GROWSIZE(physmem);
}
static kphysm_setup_vector_t sfmmu_update_vec = {
	KPHYSM_SETUP_VECTOR_VERSION,	/* version */
	sfmmu_update_post_add,		/* post_add */
	sfmmu_update_pre_del,		/* pre_del */
	sfmmu_update_post_del		/* post_del */
};
915 * Enter a hme on the mapping list for page pp.
916 * When large pages are more prevalent in the system we might want to
917 * keep the mapping list in ascending order by the hment size. For now,
918 * small pages are more frequent, so don't slow it down.
920 #define HME_ADD(hme, pp) \
922 ASSERT(sfmmu_mlist_held(pp)); \
924 hme->hme_prev = NULL; \
925 hme->hme_next = pp->p_mapping; \
926 hme->hme_page = pp; \
927 if (pp->p_mapping) { \
928 ((struct sf_hment *)(pp->p_mapping))->hme_prev = hme;\
929 ASSERT(pp->p_share > 0); \
932 ASSERT(pp->p_share == 0); \
934 pp->p_mapping = hme; \
/*
 * Remove a hme from the mapping list for page pp.
 * If we are unmapping a large translation, we need to make sure that the
 * change is reflected in the corresponding bit of the p_index field.
 */
#define	HME_SUB(hme, pp)					\
{								\
	ASSERT(sfmmu_mlist_held(pp));				\
	ASSERT(hme->hme_page == pp || IS_PAHME(hme));		\
								\
	if (pp->p_mapping == NULL) {				\
		panic("hme_remove - no mappings");		\
	}							\
								\
	membar_stst();	/* ensure previous stores finish */	\
								\
	ASSERT(pp->p_share > 0);				\
	pp->p_share--;						\
								\
	if (hme->hme_prev) {					\
		ASSERT(pp->p_mapping != hme);			\
		ASSERT(hme->hme_prev->hme_page == pp ||		\
			IS_PAHME(hme->hme_prev));		\
		hme->hme_prev->hme_next = hme->hme_next;	\
	} else {						\
		ASSERT(pp->p_mapping == hme);			\
		pp->p_mapping = hme->hme_next;			\
		ASSERT((pp->p_mapping == NULL) ?		\
			(pp->p_share == 0) : 1);		\
	}							\
								\
	if (hme->hme_next) {					\
		ASSERT(hme->hme_next->hme_page == pp ||		\
			IS_PAHME(hme->hme_next));		\
		hme->hme_next->hme_prev = hme->hme_prev;	\
	}							\
								\
	/* zero out the entry */				\
	hme->hme_next = NULL;					\
	hme->hme_prev = NULL;					\
	hme->hme_page = NULL;					\
								\
	if (hme_size(hme) > TTE8K) {				\
		/* remove mappings for remainder of large pg */	\
		sfmmu_rm_large_mappings(pp, hme_size(hme));	\
	}							\
}
/*
 * This function returns the hment given the hme_blk and a vaddr.
 * It assumes addr has already been checked to belong to hme_blk's
 * range.
 */
#define	HBLKTOHME(hment, hmeblkp, addr)					\
{									\
	int index;							\
	HBLKTOHME_IDX(hment, hmeblkp, addr, index)			\
}

/*
 * Version of HBLKTOHME that also returns the index in hmeblkp
 * of the hment.
 */
#define	HBLKTOHME_IDX(hment, hmeblkp, addr, idx)			\
{									\
	ASSERT(in_hblk_range((hmeblkp), (addr)));			\
									\
	if (get_hblk_ttesz(hmeblkp) == TTE8K) {				\
		idx = (((uintptr_t)(addr) >> MMU_PAGESHIFT) & (NHMENTS-1)); \
	} else								\
		idx = 0;						\
									\
	(hment) = &(hmeblkp)->hblk_hme[idx];				\
}
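
/*
 * Worked example (illustrative): for an 8K hme_blk, HBLKTOHME_IDX() picks
 * which of the NHMENTS hments within the block covers addr by taking the
 * 8K page number of addr modulo NHMENTS; for any larger page size the block
 * holds a single mapping, so idx is simply 0.
 */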
/*
 * Disable any page sizes not supported by the CPU
 */
void
hat_init_pagesizes()
{
	int		i;

	mmu_exported_page_sizes = 0;
	for (i = TTE8K; i < max_mmu_page_sizes; i++) {

		szc_2_userszc[i] = (uint_t)-1;
		userszc_2_szc[i] = (uint_t)-1;

		if ((mmu_exported_pagesize_mask & (1 << i)) == 0) {
			disable_large_pages |= (1 << i);
		} else {
			szc_2_userszc[i] = mmu_exported_page_sizes;
			userszc_2_szc[mmu_exported_page_sizes] = i;
			mmu_exported_page_sizes++;
		}
	}

	disable_ism_large_pages |= disable_large_pages;
	disable_auto_data_large_pages = disable_large_pages;
	disable_auto_text_large_pages = disable_large_pages;

	/*
	 * Initialize mmu-specific large page sizes.
	 */
	if (&mmu_large_pages_disabled) {
		disable_large_pages |= mmu_large_pages_disabled(HAT_LOAD);
		disable_ism_large_pages |=
		    mmu_large_pages_disabled(HAT_LOAD_SHARE);
		disable_auto_data_large_pages |=
		    mmu_large_pages_disabled(HAT_AUTO_DATA);
		disable_auto_text_large_pages |=
		    mmu_large_pages_disabled(HAT_AUTO_TEXT);
	}
}
/*
 * Initialize the hardware address translation structures.
 */
void
hat_init(void)
{
	/*
	 * Hardware-only bits in a TTE
	 */
	MAKE_TTE_MASK(&hw_tte);

	hat_init_pagesizes();
1074 /* Initialize the hash locks */
1075 for (i
= 0; i
< khmehash_num
; i
++) {
1076 mutex_init(&khme_hash
[i
].hmehash_mutex
, NULL
,
1077 MUTEX_DEFAULT
, NULL
);
1078 khme_hash
[i
].hmeh_nextpa
= HMEBLK_ENDPA
;
1080 for (i
= 0; i
< uhmehash_num
; i
++) {
1081 mutex_init(&uhme_hash
[i
].hmehash_mutex
, NULL
,
1082 MUTEX_DEFAULT
, NULL
);
1083 uhme_hash
[i
].hmeh_nextpa
= HMEBLK_ENDPA
;
1085 khmehash_num
--; /* make sure counter starts from 0 */
1086 uhmehash_num
--; /* make sure counter starts from 0 */
1089 * Allocate context domain structures.
1091 * A platform may choose to modify max_mmu_ctxdoms in
1092 * set_platform_defaults(). If a platform does not define
1093 * a set_platform_defaults() or does not choose to modify
1094 * max_mmu_ctxdoms, it gets one MMU context domain for every CPU.
1096 * For all platforms that have CPUs sharing MMUs, this
1097 * value must be defined.
1099 if (max_mmu_ctxdoms
== 0)
1100 max_mmu_ctxdoms
= max_ncpus
;
1102 size
= max_mmu_ctxdoms
* sizeof (mmu_ctx_t
*);
1103 mmu_ctxs_tbl
= kmem_zalloc(size
, KM_SLEEP
);
1105 /* mmu_ctx_t is 64 bytes aligned */
1106 mmuctxdom_cache
= kmem_cache_create("mmuctxdom_cache",
1107 sizeof (mmu_ctx_t
), 64, NULL
, NULL
, NULL
, NULL
, NULL
, 0);
1109 * MMU context domain initialization for the Boot CPU.
1110 * This needs the context domains array allocated above.
1112 mutex_enter(&cpu_lock
);
1113 sfmmu_cpu_init(CPU
);
1114 mutex_exit(&cpu_lock
);
1117 * Intialize ism mapping list lock.
1120 mutex_init(&ism_mlist_lock
, NULL
, MUTEX_DEFAULT
, NULL
);
1123 * Each sfmmu structure carries an array of MMU context info
1124 * structures, one per context domain. The size of this array depends
1125 * on the maximum number of context domains. So, the size of the
1126 * sfmmu structure varies per platform.
1128 * sfmmu is allocated from static arena, because trap
1129 * handler at TL > 0 is not allowed to touch kernel relocatable
1130 * memory. sfmmu's alignment is changed to 64 bytes from
1131 * default 8 bytes, as the lower 6 bits will be used to pass
1132 * pgcnt to vtag_flush_pgcnt_tl1.
1134 size
= sizeof (sfmmu_t
) + sizeof (sfmmu_ctx_t
) * (max_mmu_ctxdoms
- 1);
1136 sfmmuid_cache
= kmem_cache_create("sfmmuid_cache", size
,
1137 64, sfmmu_idcache_constructor
, sfmmu_idcache_destructor
,
1138 NULL
, NULL
, static_arena
, 0);
1140 sfmmu_tsbinfo_cache
= kmem_cache_create("sfmmu_tsbinfo_cache",
1141 sizeof (struct tsb_info
), 0, NULL
, NULL
, NULL
, NULL
, NULL
, 0);
1144 * Since we only use the tsb8k cache to "borrow" pages for TSBs
1145 * from the heap when low on memory or when TSB_FORCEALLOC is
1146 * specified, don't use magazines to cache them--we want to return
1147 * them to the system as quickly as possible.
1149 sfmmu_tsb8k_cache
= kmem_cache_create("sfmmu_tsb8k_cache",
1150 MMU_PAGESIZE
, MMU_PAGESIZE
, NULL
, NULL
, NULL
, NULL
,
1151 static_arena
, KMC_NOMAGAZINE
);
1154 * Set tsb_alloc_hiwater to 1/tsb_alloc_hiwater_factor of physical
1155 * memory, which corresponds to the old static reserve for TSBs.
1156 * tsb_alloc_hiwater_factor defaults to 32. This caps the amount of
1157 * memory we'll allocate for TSB slabs; beyond this point TSB
1158 * allocations will be taken from the kernel heap (via
1159 * sfmmu_tsb8k_cache) and will be throttled as would any other kmem
1162 if (tsb_alloc_hiwater_factor
== 0) {
1163 tsb_alloc_hiwater_factor
= TSB_ALLOC_HIWATER_FACTOR_DEFAULT
;
1165 SFMMU_SET_TSB_ALLOC_HIWATER(physmem
);
1167 for (sz
= tsb_slab_ttesz
; sz
> 0; sz
--) {
1168 if (!(disable_large_pages
& (1 << sz
)))
1172 if (sz
< tsb_slab_ttesz
) {
1173 tsb_slab_ttesz
= sz
;
1174 tsb_slab_shift
= MMU_PAGESHIFT
+ (sz
<< 1) + sz
;
1175 tsb_slab_size
= 1 << tsb_slab_shift
;
1176 tsb_slab_mask
= (1 << (tsb_slab_shift
- MMU_PAGESHIFT
)) - 1;
1177 use_bigtsb_arena
= 0;
1178 } else if (use_bigtsb_arena
&&
1179 (disable_large_pages
& (1 << bigtsb_slab_ttesz
))) {
1180 use_bigtsb_arena
= 0;
1183 if (!use_bigtsb_arena
) {
1184 bigtsb_slab_shift
= tsb_slab_shift
;
1186 SFMMU_SET_TSB_MAX_GROWSIZE(physmem
);
1189 * On smaller memory systems, allocate TSB memory in smaller chunks
1190 * than the default 4M slab size. We also honor disable_large_pages
1193 * The trap handlers need to be patched with the final slab shift,
1194 * since they need to be able to construct the TSB pointer at runtime.
1196 if ((tsb_max_growsize
<= TSB_512K_SZCODE
) &&
1197 !(disable_large_pages
& (1 << TTE512K
))) {
1198 tsb_slab_ttesz
= TTE512K
;
1199 tsb_slab_shift
= MMU_PAGESHIFT512K
;
1200 tsb_slab_size
= MMU_PAGESIZE512K
;
1201 tsb_slab_mask
= MMU_PAGEOFFSET512K
>> MMU_PAGESHIFT
;
1202 use_bigtsb_arena
= 0;
1205 if (!use_bigtsb_arena
) {
1206 bigtsb_slab_ttesz
= tsb_slab_ttesz
;
1207 bigtsb_slab_shift
= tsb_slab_shift
;
1208 bigtsb_slab_size
= tsb_slab_size
;
1209 bigtsb_slab_mask
= tsb_slab_mask
;
1214 * Set up memory callback to update tsb_alloc_hiwater and
1217 i
= kphysm_setup_func_register(&sfmmu_update_vec
, (void *) 0);
1221 * kmem_tsb_arena is the source from which large TSB slabs are
1222 * drawn. The quantum of this arena corresponds to the largest
1223 * TSB size we can dynamically allocate for user processes.
1224 * Currently it must also be a supported page size since we
1225 * use exactly one translation entry to map each slab page.
1227 * The per-lgroup kmem_tsb_default_arena arenas are the arenas from
1228 * which most TSBs are allocated. Since most TSB allocations are
1229 * typically 8K we have a kmem cache we stack on top of each
1230 * kmem_tsb_default_arena to speed up those allocations.
1232 * Note the two-level scheme of arenas is required only
1233 * because vmem_create doesn't allow us to specify alignment
1234 * requirements. If this ever changes the code could be
1235 * simplified to use only one level of arenas.
1237 * If 256M page support exists on sun4v, 256MB kmem_bigtsb_arena
1238 * will be provided in addition to the 4M kmem_tsb_arena.
1240 if (use_bigtsb_arena
) {
1241 kmem_bigtsb_arena
= vmem_create("kmem_bigtsb", NULL
, 0,
1242 bigtsb_slab_size
, sfmmu_vmem_xalloc_aligned_wrapper
,
1243 vmem_xfree
, heap_arena
, 0, VM_SLEEP
);
1246 kmem_tsb_arena
= vmem_create("kmem_tsb", NULL
, 0, tsb_slab_size
,
1247 sfmmu_vmem_xalloc_aligned_wrapper
,
1248 vmem_xfree
, heap_arena
, 0, VM_SLEEP
);
1250 if (tsb_lgrp_affinity
) {
1252 for (i
= 0; i
< NLGRPS_MAX
; i
++) {
1253 if (use_bigtsb_arena
) {
1254 (void) sprintf(s
, "kmem_bigtsb_lgrp%d", i
);
1255 kmem_bigtsb_default_arena
[i
] = vmem_create(s
,
1256 NULL
, 0, 2 * tsb_slab_size
,
1257 sfmmu_tsb_segkmem_alloc
,
1258 sfmmu_tsb_segkmem_free
, kmem_bigtsb_arena
,
1259 0, VM_SLEEP
| VM_BESTFIT
);
1262 (void) sprintf(s
, "kmem_tsb_lgrp%d", i
);
1263 kmem_tsb_default_arena
[i
] = vmem_create(s
,
1264 NULL
, 0, PAGESIZE
, sfmmu_tsb_segkmem_alloc
,
1265 sfmmu_tsb_segkmem_free
, kmem_tsb_arena
, 0,
1266 VM_SLEEP
| VM_BESTFIT
);
1268 (void) sprintf(s
, "sfmmu_tsb_lgrp%d_cache", i
);
1269 sfmmu_tsb_cache
[i
] = kmem_cache_create(s
,
1270 PAGESIZE
, PAGESIZE
, NULL
, NULL
, NULL
, NULL
,
1271 kmem_tsb_default_arena
[i
], 0);
1274 if (use_bigtsb_arena
) {
1275 kmem_bigtsb_default_arena
[0] =
1276 vmem_create("kmem_bigtsb_default", NULL
, 0,
1277 2 * tsb_slab_size
, sfmmu_tsb_segkmem_alloc
,
1278 sfmmu_tsb_segkmem_free
, kmem_bigtsb_arena
, 0,
1279 VM_SLEEP
| VM_BESTFIT
);
1282 kmem_tsb_default_arena
[0] = vmem_create("kmem_tsb_default",
1283 NULL
, 0, PAGESIZE
, sfmmu_tsb_segkmem_alloc
,
1284 sfmmu_tsb_segkmem_free
, kmem_tsb_arena
, 0,
1285 VM_SLEEP
| VM_BESTFIT
);
1286 sfmmu_tsb_cache
[0] = kmem_cache_create("sfmmu_tsb_cache",
1287 PAGESIZE
, PAGESIZE
, NULL
, NULL
, NULL
, NULL
,
1288 kmem_tsb_default_arena
[0], 0);
1291 sfmmu8_cache
= kmem_cache_create("sfmmu8_cache", HME8BLK_SZ
,
1292 HMEBLK_ALIGN
, sfmmu_hblkcache_constructor
,
1293 sfmmu_hblkcache_destructor
,
1294 sfmmu_hblkcache_reclaim
, (void *)HME8BLK_SZ
,
1295 hat_memload_arena
, KMC_NOHASH
);
1297 hat_memload1_arena
= vmem_create("hat_memload1", NULL
, 0, PAGESIZE
,
1298 segkmem_alloc_permanent
, segkmem_free
, heap_arena
, 0,
1299 VMC_DUMPSAFE
| VM_SLEEP
);
1301 sfmmu1_cache
= kmem_cache_create("sfmmu1_cache", HME1BLK_SZ
,
1302 HMEBLK_ALIGN
, sfmmu_hblkcache_constructor
,
1303 sfmmu_hblkcache_destructor
,
1304 NULL
, (void *)HME1BLK_SZ
,
1305 hat_memload1_arena
, KMC_NOHASH
);
1307 pa_hment_cache
= kmem_cache_create("pa_hment_cache", PAHME_SZ
,
1308 0, NULL
, NULL
, NULL
, NULL
, static_arena
, KMC_NOHASH
);
1310 ism_blk_cache
= kmem_cache_create("ism_blk_cache",
1311 sizeof (ism_blk_t
), ecache_alignsize
, NULL
, NULL
,
1312 NULL
, NULL
, static_arena
, KMC_NOHASH
);
1314 ism_ment_cache
= kmem_cache_create("ism_ment_cache",
1315 sizeof (ism_ment_t
), 0, NULL
, NULL
,
1316 NULL
, NULL
, NULL
, 0);
1319 * We grab the first hat for the kernel,
1321 AS_LOCK_ENTER(&kas
, RW_WRITER
);
1322 kas
.a_hat
= hat_alloc(&kas
);
1326 * Initialize hblk_reserve.
1328 ((struct hme_blk
*)hblk_reserve
)->hblk_nextpa
=
1329 va_to_pa((caddr_t
)hblk_reserve
);
1333 * Reserve some kernel virtual address space for the locked TTEs
1334 * that allow us to probe the TSB from TL>0.
1336 utsb_vabase
= vmem_xalloc(heap_arena
, tsb_slab_size
, tsb_slab_size
,
1337 0, 0, NULL
, NULL
, VM_SLEEP
);
1338 utsb4m_vabase
= vmem_xalloc(heap_arena
, tsb_slab_size
, tsb_slab_size
,
1339 0, 0, NULL
, NULL
, VM_SLEEP
);
1344 * The big page VAC handling code assumes VAC
1345 * will not be bigger than the smallest big
1346 * page- which is 64K.
1348 if (TTEPAGES(TTE64K
) < CACHE_NUM_COLOR
) {
1349 cmn_err(CE_PANIC
, "VAC too big!");
1355 uhme_hash_pa
= va_to_pa(uhme_hash
);
1356 khme_hash_pa
= va_to_pa(khme_hash
);
1359 * Initialize relocation locks. kpr_suspendlock is held
1360 * at PIL_MAX to prevent interrupts from pinning the holder
1361 * of a suspended TTE which may access it leading to a
1362 * deadlock condition.
1364 mutex_init(&kpr_mutex
, NULL
, MUTEX_DEFAULT
, NULL
);
1365 mutex_init(&kpr_suspendlock
, NULL
, MUTEX_SPIN
, (void *)PIL_MAX
);
1368 * If Shared context support is disabled via /etc/system
1369 * set shctx_on to 0 here if it was set to 1 earlier in boot
1370 * sequence by cpu module initialization code.
1372 if (shctx_on
&& disable_shctx
) {
1377 srd_buckets
= kmem_zalloc(SFMMU_MAX_SRD_BUCKETS
*
1378 sizeof (srd_buckets
[0]), KM_SLEEP
);
1379 for (i
= 0; i
< SFMMU_MAX_SRD_BUCKETS
; i
++) {
1380 mutex_init(&srd_buckets
[i
].srdb_lock
, NULL
,
1381 MUTEX_DEFAULT
, NULL
);
1384 srd_cache
= kmem_cache_create("srd_cache", sizeof (sf_srd_t
),
1385 0, sfmmu_srdcache_constructor
, sfmmu_srdcache_destructor
,
1386 NULL
, NULL
, NULL
, 0);
1387 region_cache
= kmem_cache_create("region_cache",
1388 sizeof (sf_region_t
), 0, sfmmu_rgncache_constructor
,
1389 sfmmu_rgncache_destructor
, NULL
, NULL
, NULL
, 0);
1390 scd_cache
= kmem_cache_create("scd_cache", sizeof (sf_scd_t
),
1391 0, sfmmu_scdcache_constructor
, sfmmu_scdcache_destructor
,
1392 NULL
, NULL
, NULL
, 0);
1396 * Pre-allocate hrm_hashtab before enabling the collection of
1397 * refmod statistics. Allocating on the fly would mean us
1398 * running the risk of suffering recursive mutex enters or
1401 hrm_hashtab
= kmem_zalloc(HRM_HASHSIZE
* sizeof (struct hrmstat
*),
1404 /* Allocate per-cpu pending freelist of hmeblks */
1405 cpu_hme_pend
= kmem_zalloc((NCPU
* sizeof (cpu_hme_pend_t
)) + 64,
1407 cpu_hme_pend
= (cpu_hme_pend_t
*)P2ROUNDUP(
1408 (uintptr_t)cpu_hme_pend
, 64);
1410 for (i
= 0; i
< NCPU
; i
++) {
1411 mutex_init(&cpu_hme_pend
[i
].chp_mutex
, NULL
, MUTEX_DEFAULT
,
1415 if (cpu_hme_pend_thresh
== 0) {
1416 cpu_hme_pend_thresh
= CPU_HME_PEND_THRESH
;
1421 * Initialize locking for the hat layer, called early during boot.
1429 * initialize the array of mutexes protecting a page's mapping
1430 * list and p_nrm field.
1432 for (i
= 0; i
< MML_TABLE_SIZE
; i
++)
1433 mutex_init(&mml_table
[i
].pad_mutex
, NULL
, MUTEX_DEFAULT
, NULL
);
1436 for (i
= 0; i
< kpmp_table_sz
; i
++) {
1437 mutex_init(&kpmp_table
[i
].khl_mutex
, NULL
,
1438 MUTEX_DEFAULT
, NULL
);
1443 * Initialize array of mutex locks that protects sfmmu fields and
1446 for (i
= 0; i
< SFMMU_NUM_LOCK
; i
++)
1447 mutex_init(HATLOCK_MUTEXP(&hat_lock
[i
]), NULL
, MUTEX_DEFAULT
,
1451 #define SFMMU_KERNEL_MAXVA \
1452 (kmem64_base ? (uintptr_t)kmem64_end : (SYSLIMIT))
1455 * Allocate a hat structure.
1456 * Called when an address space first uses a hat.
1459 hat_alloc(struct as
*as
)
1464 extern uint_t
get_color_start(struct as
*);
1466 ASSERT(AS_WRITE_HELD(as
));
1467 sfmmup
= kmem_cache_alloc(sfmmuid_cache
, KM_SLEEP
);
1468 sfmmup
->sfmmu_as
= as
;
1469 sfmmup
->sfmmu_flags
= 0;
1470 sfmmup
->sfmmu_tteflags
= 0;
1471 sfmmup
->sfmmu_rtteflags
= 0;
1472 LOCK_INIT_CLEAR(&sfmmup
->sfmmu_ctx_lock
);
1476 sfmmup
->sfmmu_cext
= 0;
1479 sfmmup
->sfmmu_clrstart
= 0;
1480 sfmmup
->sfmmu_tsb
= NULL
;
1482 * hat_kern_setup() will call sfmmu_init_ktsbinfo()
1483 * to setup tsb_info for ksfmmup.
1488 * Just set to invalid ctx. When it faults, it will
1489 * get a valid ctx. This would avoid the situation
1490 * where we get a ctx, but it gets stolen and then
1491 * we fault when we try to run and so have to get
1494 sfmmup
->sfmmu_cext
= 0;
1495 cnum
= INVALID_CONTEXT
;
1497 /* initialize original physical page coloring bin */
1498 sfmmup
->sfmmu_clrstart
= get_color_start(as
);
1500 if (tsb_random_size
) {
1501 uint32_t randval
= (uint32_t)gettick() >> 4;
1502 int size
= randval
% (tsb_max_growsize
+ 1);
1504 /* chose a random tsb size for stress testing */
1505 (void) sfmmu_tsbinfo_alloc(&sfmmup
->sfmmu_tsb
, size
,
1506 TSB8K
|TSB64K
|TSB512K
, 0, sfmmup
);
1509 (void) sfmmu_tsbinfo_alloc(&sfmmup
->sfmmu_tsb
,
1511 TSB8K
|TSB64K
|TSB512K
, 0, sfmmup
);
1512 sfmmup
->sfmmu_flags
= HAT_SWAPPED
| HAT_ALLCTX_INVALID
;
1513 ASSERT(sfmmup
->sfmmu_tsb
!= NULL
);
1516 ASSERT(max_mmu_ctxdoms
> 0);
1517 for (i
= 0; i
< max_mmu_ctxdoms
; i
++) {
1518 sfmmup
->sfmmu_ctxs
[i
].cnum
= cnum
;
1519 sfmmup
->sfmmu_ctxs
[i
].gnum
= 0;
1522 for (i
= 0; i
< max_mmu_page_sizes
; i
++) {
1523 sfmmup
->sfmmu_ttecnt
[i
] = 0;
1524 sfmmup
->sfmmu_scdrttecnt
[i
] = 0;
1525 sfmmup
->sfmmu_ismttecnt
[i
] = 0;
1526 sfmmup
->sfmmu_scdismttecnt
[i
] = 0;
1527 sfmmup
->sfmmu_pgsz
[i
] = TTE8K
;
1529 sfmmup
->sfmmu_tsb0_4minflcnt
= 0;
1530 sfmmup
->sfmmu_iblk
= NULL
;
1531 sfmmup
->sfmmu_ismhat
= 0;
1532 sfmmup
->sfmmu_scdhat
= 0;
1533 sfmmup
->sfmmu_ismblkpa
= (uint64_t)-1;
1534 if (sfmmup
== ksfmmup
) {
1535 CPUSET_ALL(sfmmup
->sfmmu_cpusran
);
1537 CPUSET_ZERO(sfmmup
->sfmmu_cpusran
);
1539 sfmmup
->sfmmu_free
= 0;
1540 sfmmup
->sfmmu_rmstat
= 0;
1541 sfmmup
->sfmmu_clrbin
= sfmmup
->sfmmu_clrstart
;
1542 sfmmup
->sfmmu_xhat_provider
= NULL
;
1543 cv_init(&sfmmup
->sfmmu_tsb_cv
, NULL
, CV_DEFAULT
, NULL
);
1544 sfmmup
->sfmmu_srdp
= NULL
;
1545 SF_RGNMAP_ZERO(sfmmup
->sfmmu_region_map
);
1546 bzero(sfmmup
->sfmmu_hmeregion_links
, SFMMU_L1_HMERLINKS_SIZE
);
1547 sfmmup
->sfmmu_scdp
= NULL
;
1548 sfmmup
->sfmmu_scd_link
.next
= NULL
;
1549 sfmmup
->sfmmu_scd_link
.prev
= NULL
;
1554 * Create per-MMU context domain kstats for a given MMU ctx.
1557 sfmmu_mmu_kstat_create(mmu_ctx_t
*mmu_ctxp
)
1559 mmu_ctx_stat_t stat
;
1562 ASSERT(MUTEX_HELD(&cpu_lock
));
1563 ASSERT(mmu_ctxp
->mmu_kstat
== NULL
);
1565 mmu_kstat
= kstat_create("unix", mmu_ctxp
->mmu_idx
, "mmu_ctx",
1566 "hat", KSTAT_TYPE_NAMED
, MMU_CTX_NUM_STATS
, KSTAT_FLAG_VIRTUAL
);
1568 if (mmu_kstat
== NULL
) {
1569 cmn_err(CE_WARN
, "kstat_create for MMU %d failed",
1572 mmu_kstat
->ks_data
= mmu_ctxp
->mmu_kstat_data
;
1573 for (stat
= 0; stat
< MMU_CTX_NUM_STATS
; stat
++)
1574 kstat_named_init(&mmu_ctxp
->mmu_kstat_data
[stat
],
1575 mmu_ctx_kstat_names
[stat
], KSTAT_DATA_INT64
);
1576 mmu_ctxp
->mmu_kstat
= mmu_kstat
;
1577 kstat_install(mmu_kstat
);
1582 * plat_cpuid_to_mmu_ctx_info() is a platform interface that returns MMU
1583 * context domain information for a given CPU. If a platform does not
1584 * specify that interface, then the function below is used instead to return
1585 * default information. The defaults are as follows:
1587 * - The number of MMU context IDs supported on any CPU in the
1589 * - There is one MMU context domain per CPU.
1593 sfmmu_cpuid_to_mmu_ctx_info(processorid_t cpuid
, mmu_ctx_info_t
*infop
)
1595 infop
->mmu_nctxs
= nctxs
;
1596 infop
->mmu_idx
= cpu
[cpuid
]->cpu_seqid
;
1600 * Called during CPU initialization to set the MMU context-related information
1603 * cpu_lock serializes accesses to mmu_ctxs and mmu_saved_gnum.
1606 sfmmu_cpu_init(cpu_t
*cp
)
1608 mmu_ctx_info_t info
;
1609 mmu_ctx_t
*mmu_ctxp
;
1611 ASSERT(MUTEX_HELD(&cpu_lock
));
1613 if (&plat_cpuid_to_mmu_ctx_info
== NULL
)
1614 sfmmu_cpuid_to_mmu_ctx_info(cp
->cpu_id
, &info
);
1616 plat_cpuid_to_mmu_ctx_info(cp
->cpu_id
, &info
);
1618 ASSERT(info
.mmu_idx
< max_mmu_ctxdoms
);
1620 if ((mmu_ctxp
= mmu_ctxs_tbl
[info
.mmu_idx
]) == NULL
) {
1621 /* Each mmu_ctx is cacheline aligned. */
1622 mmu_ctxp
= kmem_cache_alloc(mmuctxdom_cache
, KM_SLEEP
);
1623 bzero(mmu_ctxp
, sizeof (mmu_ctx_t
));
1625 mutex_init(&mmu_ctxp
->mmu_lock
, NULL
, MUTEX_SPIN
,
1626 (void *)ipltospl(DISP_LEVEL
));
1627 mmu_ctxp
->mmu_idx
= info
.mmu_idx
;
1628 mmu_ctxp
->mmu_nctxs
= info
.mmu_nctxs
;
1630 * Globally for lifetime of a system,
1631 * gnum must always increase.
1632 * mmu_saved_gnum is protected by the cpu_lock.
1634 mmu_ctxp
->mmu_gnum
= mmu_saved_gnum
+ 1;
1635 mmu_ctxp
->mmu_cnum
= NUM_LOCKED_CTXS
;
1637 sfmmu_mmu_kstat_create(mmu_ctxp
);
1639 mmu_ctxs_tbl
[info
.mmu_idx
] = mmu_ctxp
;
1641 ASSERT(mmu_ctxp
->mmu_idx
== info
.mmu_idx
);
1642 ASSERT(mmu_ctxp
->mmu_nctxs
<= info
.mmu_nctxs
);
1646 * The mmu_lock is acquired here to prevent races with
1647 * the wrap-around code.
1649 mutex_enter(&mmu_ctxp
->mmu_lock
);
1652 mmu_ctxp
->mmu_ncpus
++;
1653 CPUSET_ADD(mmu_ctxp
->mmu_cpuset
, cp
->cpu_id
);
1654 CPU_MMU_IDX(cp
) = info
.mmu_idx
;
1655 CPU_MMU_CTXP(cp
) = mmu_ctxp
;
1657 mutex_exit(&mmu_ctxp
->mmu_lock
);
1661 sfmmu_ctxdom_free(mmu_ctx_t
*mmu_ctxp
)
1663 ASSERT(MUTEX_HELD(&cpu_lock
));
1664 ASSERT(!MUTEX_HELD(&mmu_ctxp
->mmu_lock
));
1666 mutex_destroy(&mmu_ctxp
->mmu_lock
);
1668 if (mmu_ctxp
->mmu_kstat
)
1669 kstat_delete(mmu_ctxp
->mmu_kstat
);
1671 /* mmu_saved_gnum is protected by the cpu_lock. */
1672 if (mmu_saved_gnum
< mmu_ctxp
->mmu_gnum
)
1673 mmu_saved_gnum
= mmu_ctxp
->mmu_gnum
;
1675 kmem_cache_free(mmuctxdom_cache
, mmu_ctxp
);
1679 * Called to perform MMU context-related cleanup for a CPU.
1682 sfmmu_cpu_cleanup(cpu_t
*cp
)
1684 mmu_ctx_t
*mmu_ctxp
;
1686 ASSERT(MUTEX_HELD(&cpu_lock
));
1688 mmu_ctxp
= CPU_MMU_CTXP(cp
);
1689 ASSERT(mmu_ctxp
!= NULL
);
1692 * The mmu_lock is acquired here to prevent races with
1693 * the wrap-around code.
1695 mutex_enter(&mmu_ctxp
->mmu_lock
);
1697 CPU_MMU_CTXP(cp
) = NULL
;
1699 CPUSET_DEL(mmu_ctxp
->mmu_cpuset
, cp
->cpu_id
);
1700 if (--mmu_ctxp
->mmu_ncpus
== 0) {
1701 mmu_ctxs_tbl
[mmu_ctxp
->mmu_idx
] = NULL
;
1702 mutex_exit(&mmu_ctxp
->mmu_lock
);
1703 sfmmu_ctxdom_free(mmu_ctxp
);
1707 mutex_exit(&mmu_ctxp
->mmu_lock
);
1711 sfmmu_ctxdom_nctxs(int idx
)
1713 return (mmu_ctxs_tbl
[idx
]->mmu_nctxs
);
/*
 * sfmmu_ctxdoms_* is an interface provided to help keep context domains
 * consistent after suspend/resume on systems that can resume on different
 * hardware than they were suspended on.
 *
 * sfmmu_ctxdoms_lock(void) locks all context domains and prevents new contexts
 * from being allocated. It acquires all hat_locks, which blocks most access to
 * context data, except for a few cases that are handled separately or are
 * harmless. It wraps each domain to increment gnum and invalidate on-CPU
 * contexts, and forces cnum to its max. As a result of this call all user
 * threads that are running on CPUs trap and try to perform wrap around but
 * can't because hat_locks are taken. Threads that were not on CPUs but started
 * by the scheduler go to sfmmu_alloc_ctx() to acquire a context without
 * checking hat_lock, but fail, because cnum == nctxs, and therefore also trap
 * and block on hat_lock trying to wrap. sfmmu_ctxdoms_lock() must be called
 * before CPUs are paused, else it could deadlock acquiring locks held by
 * paused CPUs.
 *
 * sfmmu_ctxdoms_remove() removes context domains from every CPU and records
 * the CPUs that had them. It must be called after CPUs have been paused. This
 * ensures that no threads are in sfmmu_alloc_ctx() accessing domain data,
 * because pause_cpus sends a mondo interrupt to every CPU, and sfmmu_alloc_ctx
 * runs with interrupts disabled. When CPUs are later resumed, they may enter
 * sfmmu_alloc_ctx, but it will check for CPU_MMU_CTXP = NULL and immediately
 * return failure. Or, they will be blocked trying to acquire hat_lock. Thus
 * after sfmmu_ctxdoms_remove returns, we are guaranteed that no one is
 * accessing the old context domains.
 *
 * sfmmu_ctxdoms_update(void) frees space used by old context domains and
 * allocates new context domains based on hardware layout. It initializes
 * every CPU that had a context domain before migration to have one again.
 * sfmmu_ctxdoms_update must be called after CPUs are resumed, else it
 * could deadlock acquiring locks held by paused CPUs.
 *
 * sfmmu_ctxdoms_unlock(void) releases all hat_locks after which user threads
 * acquire new context ids and continue execution.
 *
 * Therefore functions should be called in the following order:
 *	sfmmu_ctxdoms_lock()
 *	    if (suspend failed)
 *		sfmmu_ctxdoms_unlock()
 *	sfmmu_ctxdoms_remove()
 *	sfmmu_ctxdoms_update()
 *	sfmmu_ctxdoms_unlock()
 */
static cpuset_t sfmmu_ctxdoms_pset;
static void
sfmmu_ctxdoms_remove()
{
	processorid_t id;
	cpu_t *cp;

	/*
	 * Record the CPUs that have domains in sfmmu_ctxdoms_pset, so they can
	 * be restored post-migration. A CPU may be powered off and not have a
	 * domain, for example.
	 */
	CPUSET_ZERO(sfmmu_ctxdoms_pset);

	for (id = 0; id < NCPU; id++) {
		if ((cp = cpu[id]) != NULL && CPU_MMU_CTXP(cp) != NULL) {
			CPUSET_ADD(sfmmu_ctxdoms_pset, id);
			CPU_MMU_CTXP(cp) = NULL;
		}
	}
}
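
/*
 * Take every hat_lock and force each context domain to wrap, so that no new
 * context ids can be handed out until sfmmu_ctxdoms_unlock() is called.
 */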
static void
sfmmu_ctxdoms_lock(void)
{
	int idx;
	mmu_ctx_t *mmu_ctxp;

	sfmmu_hat_lock_all();

	/*
	 * At this point, no thread can be in sfmmu_ctx_wrap_around, because
	 * hat_lock is always taken before calling it.
	 *
	 * For each domain, set mmu_cnum to max so no more contexts can be
	 * allocated, and wrap to flush on-CPU contexts and force threads to
	 * acquire a new context when we later drop hat_lock after migration.
	 * Setting mmu_cnum may race with sfmmu_alloc_ctx which also sets cnum,
	 * but the latter uses CAS and will miscompare and not overwrite it.
	 */
	kpreempt_disable(); /* required by sfmmu_ctx_wrap_around */
	for (idx = 0; idx < max_mmu_ctxdoms; idx++) {
		if ((mmu_ctxp = mmu_ctxs_tbl[idx]) != NULL) {
			mutex_enter(&mmu_ctxp->mmu_lock);
			mmu_ctxp->mmu_cnum = mmu_ctxp->mmu_nctxs;
			/* make sure updated cnum visible */
			membar_enter();
			mutex_exit(&mmu_ctxp->mmu_lock);
			sfmmu_ctx_wrap_around(mmu_ctxp, B_FALSE);
		}
	}
	kpreempt_enable();
}
static void
sfmmu_ctxdoms_unlock(void)
{
	sfmmu_hat_unlock_all();
}
static void
sfmmu_ctxdoms_update(void)
{
	processorid_t id;
	cpu_t *cp;
	uint_t idx;
	mmu_ctx_t *mmu_ctxp;

	/*
	 * Free all context domains.  As a side effect, this increases
	 * mmu_saved_gnum to the maximum gnum over all domains, which is used
	 * to init gnum in the new domains, which therefore will be larger than
	 * the sfmmu gnum for any process, guaranteeing that every process will
	 * see a new generation and allocate a new context regardless of what
	 * new domain it runs in.
	 */
	mutex_enter(&cpu_lock);

	for (idx = 0; idx < max_mmu_ctxdoms; idx++) {
		if (mmu_ctxs_tbl[idx] != NULL) {
			mmu_ctxp = mmu_ctxs_tbl[idx];
			mmu_ctxs_tbl[idx] = NULL;
			sfmmu_ctxdom_free(mmu_ctxp);
		}
	}

	for (id = 0; id < NCPU; id++) {
		if (CPU_IN_SET(sfmmu_ctxdoms_pset, id) &&
		    (cp = cpu[id]) != NULL)
			sfmmu_cpu_init(cp);
	}
	mutex_exit(&cpu_lock);
}
/*
 * Hat_setup, makes an address space context the current active one.
 * In sfmmu this translates to setting the secondary context with the
 * corresponding context.
 */
void
hat_setup(struct hat *sfmmup, int allocflag)
{
	hatlock_t *hatlockp;

	/* Init needs some special treatment. */
	if (allocflag == HAT_INIT) {
		/*
		 * Make sure that we have a valid ctx that doesn't get stolen
		 * after this point.
		 */
		hatlockp = sfmmu_hat_enter(sfmmup);

		/*
		 * Swap in the TSB.  hat_init() allocates tsbinfos without
		 * TSBs, but we need one for init, since the kernel does some
		 * special things to set up its stack and needs the TSB to
		 * resolve page faults.
		 */
		sfmmu_tsb_swapin(sfmmup, hatlockp);

		sfmmu_get_ctx(sfmmup);

		sfmmu_hat_exit(hatlockp);
	} else {
		ASSERT(allocflag == HAT_ALLOC);

		hatlockp = sfmmu_hat_enter(sfmmup);
		kpreempt_disable();

		CPUSET_ADD(sfmmup->sfmmu_cpusran, CPU->cpu_id);
		/*
		 * sfmmu_setctx_sec takes <pgsz|cnum> as a parameter,
		 * pagesize bits don't matter in this case since we are passing
		 * INVALID_CONTEXT to it.
		 * Compatibility Note: hw takes care of MMU_SCONTEXT1
		 */
		sfmmu_setctx_sec(INVALID_CONTEXT);
		sfmmu_clear_utsbinfo();

		kpreempt_enable();
		sfmmu_hat_exit(hatlockp);
	}
}
/*
 * Free all the translation resources for the specified address space.
 * Called from as_free when an address space is being destroyed.
 */
void
hat_free_start(struct hat *sfmmup)
{
	ASSERT(AS_WRITE_HELD(sfmmup->sfmmu_as));
	ASSERT(sfmmup != ksfmmup);
	ASSERT(sfmmup->sfmmu_xhat_provider == NULL);

	sfmmup->sfmmu_free = 1;
	if (sfmmup->sfmmu_scdp != NULL) {
		sfmmu_leave_scd(sfmmup, 0);
	}

	ASSERT(sfmmup->sfmmu_scdp == NULL);
}
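
/*
 * Final stage of address space teardown: hat_free_start() has already
 * detached the hat from any SCD, so here we release the TSBs, the SRD and
 * region link arrays, and finally return the sfmmu to its kmem cache.
 */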
void
hat_free_end(struct hat *sfmmup)
{
	int i;

	ASSERT(sfmmup->sfmmu_xhat_provider == NULL);
	ASSERT(sfmmup->sfmmu_free == 1);
	ASSERT(sfmmup->sfmmu_ttecnt[TTE8K] == 0);
	ASSERT(sfmmup->sfmmu_ttecnt[TTE64K] == 0);
	ASSERT(sfmmup->sfmmu_ttecnt[TTE512K] == 0);
	ASSERT(sfmmup->sfmmu_ttecnt[TTE4M] == 0);
	ASSERT(sfmmup->sfmmu_ttecnt[TTE32M] == 0);
	ASSERT(sfmmup->sfmmu_ttecnt[TTE256M] == 0);

	if (sfmmup->sfmmu_rmstat) {
		hat_freestat(sfmmup->sfmmu_as, NULL);
	}

	while (sfmmup->sfmmu_tsb != NULL) {
		struct tsb_info *next = sfmmup->sfmmu_tsb->tsb_next;
		sfmmu_tsbinfo_free(sfmmup->sfmmu_tsb);
		sfmmup->sfmmu_tsb = next;
	}

	if (sfmmup->sfmmu_srdp != NULL) {
		sfmmu_leave_srd(sfmmup);
		ASSERT(sfmmup->sfmmu_srdp == NULL);
		for (i = 0; i < SFMMU_L1_HMERLINKS; i++) {
			if (sfmmup->sfmmu_hmeregion_links[i] != NULL) {
				kmem_free(sfmmup->sfmmu_hmeregion_links[i],
				    SFMMU_L2_HMERLINKS_SIZE);
				sfmmup->sfmmu_hmeregion_links[i] = NULL;
			}
		}
	}
	sfmmu_free_sfmmu(sfmmup);

	for (i = 0; i < SFMMU_L1_HMERLINKS; i++) {
		ASSERT(sfmmup->sfmmu_hmeregion_links[i] == NULL);
	}

	kmem_cache_free(sfmmuid_cache, sfmmup);
}
/*
 * Set up any translation structures, for the specified address space,
 * that are needed or preferred when the process is being swapped in.
 */
/* ARGSUSED */
void
hat_swapin(struct hat *hat)
{
	ASSERT(hat->sfmmu_xhat_provider == NULL);
}
/*
 * Free all of the translation resources, for the specified address space,
 * that can be freed while the process is swapped out. Called from as_swapout.
 * Also, free up the ctx that this process was using.
 */
void
hat_swapout(struct hat *sfmmup)
{
	struct hmehash_bucket *hmebp;
	struct hme_blk *hmeblkp;
	struct hme_blk *pr_hblk = NULL;
	struct hme_blk *nx_hblk;
	int i;
	struct hme_blk *list = NULL;
	hatlock_t *hatlockp;
	struct tsb_info *tsbinfop;
	struct free_tsb {
		struct free_tsb *next;
		struct tsb_info *tsbinfop;
	};			/* free list of TSBs */
	struct free_tsb *freelist, *last, *next;

	ASSERT(sfmmup->sfmmu_xhat_provider == NULL);
	SFMMU_STAT(sf_swapout);

	/*
	 * There is no way to go from an as to all its translations in sfmmu.
	 * Here is one of the times when we take the big hit and traverse
	 * the hash looking for hme_blks to free up.  Not only do we free up
	 * this as hme_blks but all those that are free.  We are obviously
	 * swapping because we need memory so let's free up as much
	 * as we can.
	 *
	 * Note that we don't flush TLB/TSB here -- it's not necessary
	 * because:
	 *  1) we free the ctx we're using and throw away the TSB(s);
	 *  2) processes aren't runnable while being swapped out.
	 */
	ASSERT(sfmmup != KHATID);
	for (i = 0; i <= UHMEHASH_SZ; i++) {
		hmebp = &uhme_hash[i];
		SFMMU_HASH_LOCK(hmebp);
		hmeblkp = hmebp->hmeblkp;
		pr_hblk = NULL;
		while (hmeblkp) {

			ASSERT(!hmeblkp->hblk_xhat_bit);

			if ((hmeblkp->hblk_tag.htag_id == sfmmup) &&
			    !hmeblkp->hblk_shw_bit && !hmeblkp->hblk_lckcnt) {
				ASSERT(!hmeblkp->hblk_shared);
				(void) sfmmu_hblk_unload(sfmmup, hmeblkp,
				    (caddr_t)get_hblk_base(hmeblkp),
				    get_hblk_endaddr(hmeblkp),
				    NULL, HAT_UNLOAD);
			}
			nx_hblk = hmeblkp->hblk_next;
			if (!hmeblkp->hblk_vcnt && !hmeblkp->hblk_hmecnt) {
				ASSERT(!hmeblkp->hblk_lckcnt);
				sfmmu_hblk_hash_rm(hmebp, hmeblkp, pr_hblk,
				    &list, 0);
			} else {
				pr_hblk = hmeblkp;
			}
			hmeblkp = nx_hblk;
		}
		SFMMU_HASH_UNLOCK(hmebp);
	}

	sfmmu_hblks_list_purge(&list, 0);

	/*
	 * Now free up the ctx so that others can reuse it.
	 */
	hatlockp = sfmmu_hat_enter(sfmmup);

	sfmmu_invalidate_ctx(sfmmup);

	/*
	 * Free TSBs, but not tsbinfos, and set SWAPPED flag.
	 * If TSBs were never swapped in, just return.
	 * This implies that we don't support partial swapping
	 * of TSBs -- either all are swapped out, or none are.
	 *
	 * We must hold the HAT lock here to prevent racing with another
	 * thread trying to unmap TTEs from the TSB or running the post-
	 * relocator after relocating the TSB's memory.  Unfortunately, we
	 * can't free memory while holding the HAT lock or we could
	 * deadlock, so we build a list of TSBs to be freed after marking
	 * the tsbinfos as swapped out and free them after dropping the
	 * lock.
	 */
	if (SFMMU_FLAGS_ISSET(sfmmup, HAT_SWAPPED)) {
		sfmmu_hat_exit(hatlockp);
		return;
	}

	SFMMU_FLAGS_SET(sfmmup, HAT_SWAPPED);
	last = freelist = NULL;
	for (tsbinfop = sfmmup->sfmmu_tsb; tsbinfop != NULL;
	    tsbinfop = tsbinfop->tsb_next) {
		ASSERT((tsbinfop->tsb_flags & TSB_SWAPPED) == 0);

		/*
		 * Cast the TSB into a struct free_tsb and put it on the free
		 * list.
		 */
		if (freelist == NULL) {
			last = freelist = (struct free_tsb *)tsbinfop->tsb_va;
		} else {
			last->next = (struct free_tsb *)tsbinfop->tsb_va;
			last = last->next;
		}
		last->next = NULL;
		last->tsbinfop = tsbinfop;
		tsbinfop->tsb_flags |= TSB_SWAPPED;
		/*
		 * Zero out the TTE to clear the valid bit.
		 * Note we can't use a value like 0xbad because we want to
		 * ensure diagnostic bits are NEVER set on TTEs that might
		 * be loaded.  The intent is to catch any invalid access
		 * to the swapped TSB, such as a thread running with a valid
		 * context without first calling sfmmu_tsb_swapin() to
		 * allocate TSB memory.
		 */
		tsbinfop->tsb_tte.ll = 0;
	}

	/* Now we can drop the lock and free the TSB memory. */
	sfmmu_hat_exit(hatlockp);
	for (; freelist != NULL; freelist = next) {
		next = freelist->next;
		sfmmu_tsb_free(freelist->tsbinfop);
	}
}
/*
 * Duplicate the translations of an as into another newas
 */
/* ARGSUSED */
int
hat_dup(struct hat *hat, struct hat *newhat, caddr_t addr, size_t len,
	uint_t flag)
{
	sf_srd_t *srdp;
	sf_scd_t *scdp;
	int i;
	extern uint_t get_color_start(struct as *);

	ASSERT(hat->sfmmu_xhat_provider == NULL);
	ASSERT((flag == 0) || (flag == HAT_DUP_ALL) || (flag == HAT_DUP_COW) ||
	    (flag == HAT_DUP_SRD));
	ASSERT(hat != ksfmmup);
	ASSERT(newhat != ksfmmup);
	ASSERT(flag != HAT_DUP_ALL || hat->sfmmu_srdp == newhat->sfmmu_srdp);

	if (flag == HAT_DUP_COW) {
		panic("hat_dup: HAT_DUP_COW not supported");
	}

	if (flag == HAT_DUP_SRD && ((srdp = hat->sfmmu_srdp) != NULL)) {
		ASSERT(srdp->srd_evp != NULL);
		VN_HOLD(srdp->srd_evp);
		ASSERT(srdp->srd_refcnt > 0);
		newhat->sfmmu_srdp = srdp;
		atomic_inc_32((volatile uint_t *)&srdp->srd_refcnt);
	}

	/*
	 * HAT_DUP_ALL flag is used after as duplication is done.
	 */
	if (flag == HAT_DUP_ALL && ((srdp = newhat->sfmmu_srdp) != NULL)) {
		ASSERT(newhat->sfmmu_srdp->srd_refcnt >= 2);
		newhat->sfmmu_rtteflags = hat->sfmmu_rtteflags;
		if (hat->sfmmu_flags & HAT_4MTEXT_FLAG) {
			newhat->sfmmu_flags |= HAT_4MTEXT_FLAG;
		}

		/* check if need to join scd */
		if ((scdp = hat->sfmmu_scdp) != NULL &&
		    newhat->sfmmu_scdp != scdp) {
			int ret;
			SF_RGNMAP_IS_SUBSET(&newhat->sfmmu_region_map,
			    &scdp->scd_region_map, ret);
			ASSERT(ret);
			sfmmu_join_scd(scdp, newhat);
			ASSERT(newhat->sfmmu_scdp == scdp &&
			    scdp->scd_refcnt >= 2);
			for (i = 0; i < max_mmu_page_sizes; i++) {
				newhat->sfmmu_ismttecnt[i] =
				    hat->sfmmu_ismttecnt[i];
				newhat->sfmmu_scdismttecnt[i] =
				    hat->sfmmu_scdismttecnt[i];
			}
		}

		sfmmu_check_page_sizes(newhat, 1);
	}

	if (flag == HAT_DUP_ALL && consistent_coloring == 0 &&
	    update_proc_pgcolorbase_after_fork != 0) {
		hat->sfmmu_clrbin = get_color_start(hat->sfmmu_as);
	}

	return (0);
}
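
/*
 * hat_memload() and hat_memload_region() are thin wrappers: the region
 * variant decodes the region cookie into an hme region id, and both end up
 * in hat_do_memload() below.
 */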
void
hat_memload(struct hat *hat, caddr_t addr, struct page *pp,
	uint_t attr, uint_t flags)
{
	hat_do_memload(hat, addr, pp, attr, flags,
	    SFMMU_INVALID_SHMERID);
}

void
hat_memload_region(struct hat *hat, caddr_t addr, struct page *pp,
	uint_t attr, uint_t flags, hat_region_cookie_t rcookie)
{
	uint_t rid;

	if (rcookie == HAT_INVALID_REGION_COOKIE ||
	    hat->sfmmu_xhat_provider != NULL) {
		hat_do_memload(hat, addr, pp, attr, flags,
		    SFMMU_INVALID_SHMERID);
		return;
	}
	rid = (uint_t)((uint64_t)rcookie);
	ASSERT(rid < SFMMU_MAX_HME_REGIONS);
	hat_do_memload(hat, addr, pp, attr, flags, rid);
}
/*
 * Set up addr to map to page pp with protection prot.
 * As an optimization we also load the TSB with the
 * corresponding tte but it is no big deal if the tte gets kicked out.
 */
static void
hat_do_memload(struct hat *hat, caddr_t addr, struct page *pp,
	uint_t attr, uint_t flags, uint_t rid)
{
	tte_t tte;

	ASSERT(hat != NULL);
	ASSERT(PAGE_LOCKED(pp));
	ASSERT(!((uintptr_t)addr & MMU_PAGEOFFSET));
	ASSERT(!(flags & ~SFMMU_LOAD_ALLFLAG));
	ASSERT(!(attr & ~SFMMU_LOAD_ALLATTR));
	SFMMU_VALIDATE_HMERID(hat, rid, addr, MMU_PAGESIZE);

	if (PP_ISFREE(pp)) {
		panic("hat_memload: loading a mapping to free page %p",
		    (void *)pp);
	}

	if (hat->sfmmu_xhat_provider) {
		/* no regions for xhats */
		ASSERT(!SFMMU_IS_SHMERID_VALID(rid));
		XHAT_MEMLOAD(hat, addr, pp, attr, flags);
		return;
	}

	ASSERT((hat == ksfmmup) || AS_LOCK_HELD(hat->sfmmu_as));

	if (flags & ~SFMMU_LOAD_ALLFLAG)
		cmn_err(CE_NOTE, "hat_memload: unsupported flags %d",
		    flags & ~SFMMU_LOAD_ALLFLAG);

	if (hat->sfmmu_rmstat)
		hat_resvstat(MMU_PAGESIZE, hat->sfmmu_as, addr);

#if defined(SF_ERRATA_57)
	if ((hat != ksfmmup) && AS_TYPE_64BIT(hat->sfmmu_as) &&
	    (addr < errata57_limit) && (attr & PROT_EXEC) &&
	    !(flags & HAT_LOAD_SHARE)) {
		cmn_err(CE_WARN, "hat_memload: illegal attempt to make user "
		    "page executable");
		attr &= ~PROT_EXEC;
	}
#endif

	sfmmu_memtte(&tte, pp->p_pagenum, attr, TTE8K);
	(void) sfmmu_tteload_array(hat, &tte, addr, &pp, flags, rid);

	/*
	 * Check TSB and TLB page sizes.
	 */
	if ((flags & HAT_LOAD_SHARE) == 0) {
		sfmmu_check_page_sizes(hat, 1);
	}
}
/*
 * hat_devload can be called to map real memory (e.g.
 * /dev/kmem) and even though hat_devload will determine pf is
 * for memory, it will be unable to get a shared lock on the
 * page (because someone else has it exclusively) and will
 * pass dp = NULL.  If tteload doesn't get a non-NULL
 * page pointer it can't cache memory.
 */
void
hat_devload(struct hat *hat, caddr_t addr, size_t len, pfn_t pfn,
	uint_t attr, int flags)
{
	tte_t tte;
	struct page *pp = NULL;
	int use_lgpg = 0;

	ASSERT(hat != NULL);

	if (hat->sfmmu_xhat_provider) {
		XHAT_DEVLOAD(hat, addr, len, pfn, attr, flags);
		return;
	}

	ASSERT(!(flags & ~SFMMU_LOAD_ALLFLAG));
	ASSERT(!(attr & ~SFMMU_LOAD_ALLATTR));
	ASSERT((hat == ksfmmup) || AS_LOCK_HELD(hat->sfmmu_as));
	if (len == 0)
		panic("hat_devload: zero len");
	if (flags & ~SFMMU_LOAD_ALLFLAG)
		cmn_err(CE_NOTE, "hat_devload: unsupported flags %d",
		    flags & ~SFMMU_LOAD_ALLFLAG);

#if defined(SF_ERRATA_57)
	if ((hat != ksfmmup) && AS_TYPE_64BIT(hat->sfmmu_as) &&
	    (addr < errata57_limit) && (attr & PROT_EXEC) &&
	    !(flags & HAT_LOAD_SHARE)) {
		cmn_err(CE_WARN, "hat_devload: illegal attempt to make user "
		    "page executable");
		attr &= ~PROT_EXEC;
	}
#endif

	/*
	 * If it's a memory page find its pp
	 */
	if (!(flags & HAT_LOAD_NOCONSIST) && pf_is_memory(pfn)) {
		pp = page_numtopp_nolock(pfn);
		if (pp == NULL) {
			flags |= HAT_LOAD_NOCONSIST;
		} else {
			if (PP_ISFREE(pp)) {
				panic("hat_memload: loading "
				    "a mapping to free page %p",
				    (void *)pp);
			}
			if (!PAGE_LOCKED(pp) && !PP_ISNORELOC(pp)) {
				panic("hat_memload: loading a mapping "
				    "to unlocked relocatable page %p",
				    (void *)pp);
			}
			ASSERT(len == MMU_PAGESIZE);
		}
	}

	if (hat->sfmmu_rmstat)
		hat_resvstat(len, hat->sfmmu_as, addr);

	if (flags & HAT_LOAD_NOCONSIST) {
		attr |= SFMMU_UNCACHEVTTE;
		use_lgpg = 1;
	}
	if (!pf_is_memory(pfn)) {
		attr |= SFMMU_UNCACHEPTTE | HAT_NOSYNC;
		use_lgpg = 1;
		switch (attr & HAT_ORDER_MASK) {
		case HAT_STRICTORDER:
		case HAT_UNORDERED_OK:
			/*
			 * we set the side effect bit for all non
			 * memory mappings unless merging is ok
			 */
			attr |= SFMMU_SIDEFFECT;
			break;
		case HAT_MERGING_OK:
		case HAT_LOADCACHING_OK:
		case HAT_STORECACHING_OK:
			break;
		default:
			panic("hat_devload: bad attr");
			break;
		}
	}
	while (len) {
		if (!use_lgpg) {
			sfmmu_memtte(&tte, pfn, attr, TTE8K);
			(void) sfmmu_tteload_array(hat, &tte, addr, &pp,
			    flags, SFMMU_INVALID_SHMERID);
			len -= MMU_PAGESIZE;
			addr += MMU_PAGESIZE;
			pfn++;
			continue;
		}
		/*
		 * try to use large pages, check va/pa alignments
		 * Note that 32M/256M page sizes are not (yet) supported.
		 */
		if ((len >= MMU_PAGESIZE4M) &&
		    !((uintptr_t)addr & MMU_PAGEOFFSET4M) &&
		    !(disable_large_pages & (1 << TTE4M)) &&
		    !(mmu_ptob(pfn) & MMU_PAGEOFFSET4M)) {
			sfmmu_memtte(&tte, pfn, attr, TTE4M);
			(void) sfmmu_tteload_array(hat, &tte, addr, &pp,
			    flags, SFMMU_INVALID_SHMERID);
			len -= MMU_PAGESIZE4M;
			addr += MMU_PAGESIZE4M;
			pfn += MMU_PAGESIZE4M / MMU_PAGESIZE;
		} else if ((len >= MMU_PAGESIZE512K) &&
		    !((uintptr_t)addr & MMU_PAGEOFFSET512K) &&
		    !(disable_large_pages & (1 << TTE512K)) &&
		    !(mmu_ptob(pfn) & MMU_PAGEOFFSET512K)) {
			sfmmu_memtte(&tte, pfn, attr, TTE512K);
			(void) sfmmu_tteload_array(hat, &tte, addr, &pp,
			    flags, SFMMU_INVALID_SHMERID);
			len -= MMU_PAGESIZE512K;
			addr += MMU_PAGESIZE512K;
			pfn += MMU_PAGESIZE512K / MMU_PAGESIZE;
		} else if ((len >= MMU_PAGESIZE64K) &&
		    !((uintptr_t)addr & MMU_PAGEOFFSET64K) &&
		    !(disable_large_pages & (1 << TTE64K)) &&
		    !(mmu_ptob(pfn) & MMU_PAGEOFFSET64K)) {
			sfmmu_memtte(&tte, pfn, attr, TTE64K);
			(void) sfmmu_tteload_array(hat, &tte, addr, &pp,
			    flags, SFMMU_INVALID_SHMERID);
			len -= MMU_PAGESIZE64K;
			addr += MMU_PAGESIZE64K;
			pfn += MMU_PAGESIZE64K / MMU_PAGESIZE;
		} else {
			sfmmu_memtte(&tte, pfn, attr, TTE8K);
			(void) sfmmu_tteload_array(hat, &tte, addr, &pp,
			    flags, SFMMU_INVALID_SHMERID);
			len -= MMU_PAGESIZE;
			addr += MMU_PAGESIZE;
			pfn++;
		}
	}

	/*
	 * Check TSB and TLB page sizes.
	 */
	if ((flags & HAT_LOAD_SHARE) == 0) {
		sfmmu_check_page_sizes(hat, 1);
	}
}
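
/*
 * Array flavors of the above: decode the region cookie (if any) and hand the
 * batch of pages off to hat_do_memload_array().
 */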
void
hat_memload_array(struct hat *hat, caddr_t addr, size_t len,
	struct page **pps, uint_t attr, uint_t flags)
{
	hat_do_memload_array(hat, addr, len, pps, attr, flags,
	    SFMMU_INVALID_SHMERID);
}

void
hat_memload_array_region(struct hat *hat, caddr_t addr, size_t len,
	struct page **pps, uint_t attr, uint_t flags,
	hat_region_cookie_t rcookie)
{
	uint_t rid;

	if (rcookie == HAT_INVALID_REGION_COOKIE ||
	    hat->sfmmu_xhat_provider != NULL) {
		hat_do_memload_array(hat, addr, len, pps, attr, flags,
		    SFMMU_INVALID_SHMERID);
		return;
	}
	rid = (uint_t)((uint64_t)rcookie);
	ASSERT(rid < SFMMU_MAX_HME_REGIONS);
	hat_do_memload_array(hat, addr, len, pps, attr, flags, rid);
}
/*
 * Map the largest extent possible out of the page array. The array may NOT
 * be in order.  The largest possible mapping a page can have
 * is specified in the p_szc field.  The p_szc field
 * cannot change as long as there are any mappings (large or small)
 * to any of the pages that make up the large page. (ie. any
 * promotion/demotion of page size is not up to the hat but up to
 * the page free list manager).  The array
 * should consist of properly aligned contiguous pages that are
 * part of a big page for a large mapping to be created.
 */
static void
hat_do_memload_array(struct hat *hat, caddr_t addr, size_t len,
	struct page **pps, uint_t attr, uint_t flags, uint_t rid)
{
	int ttesz;
	size_t mapsz;
	pgcnt_t numpg, npgs;
	tte_t tte;
	page_t *pp;
	uint_t large_pages_disable;

	ASSERT(!((uintptr_t)addr & MMU_PAGEOFFSET));
	SFMMU_VALIDATE_HMERID(hat, rid, addr, len);

	if (hat->sfmmu_xhat_provider) {
		ASSERT(!SFMMU_IS_SHMERID_VALID(rid));
		XHAT_MEMLOAD_ARRAY(hat, addr, len, pps, attr, flags);
		return;
	}

	if (hat->sfmmu_rmstat)
		hat_resvstat(len, hat->sfmmu_as, addr);

#if defined(SF_ERRATA_57)
	if ((hat != ksfmmup) && AS_TYPE_64BIT(hat->sfmmu_as) &&
	    (addr < errata57_limit) && (attr & PROT_EXEC) &&
	    !(flags & HAT_LOAD_SHARE)) {
		cmn_err(CE_WARN, "hat_memload_array: illegal attempt to make "
		    "user page executable");
		attr &= ~PROT_EXEC;
	}
#endif

	/* Get number of pages */
	npgs = len >> MMU_PAGESHIFT;

	if (flags & HAT_LOAD_SHARE) {
		large_pages_disable = disable_ism_large_pages;
	} else {
		large_pages_disable = disable_large_pages;
	}

	if (npgs < NHMENTS || large_pages_disable == LARGE_PAGES_OFF) {
		sfmmu_memload_batchsmall(hat, addr, pps, attr, flags, npgs,
		    rid);
		return;
	}

	while (npgs >= NHMENTS) {
		pp = *pps;
		for (ttesz = pp->p_szc; ttesz != TTE8K; ttesz--) {
			/*
			 * Check if this page size is disabled.
			 */
			if (large_pages_disable & (1 << ttesz))
				continue;

			numpg = TTEPAGES(ttesz);
			mapsz = numpg << MMU_PAGESHIFT;
			if ((npgs >= numpg) &&
			    IS_P2ALIGNED(addr, mapsz) &&
			    IS_P2ALIGNED(pp->p_pagenum, numpg)) {
				/*
				 * At this point we have enough pages and
				 * we know the virtual address and the pfn
				 * are properly aligned.  We still need
				 * to check for physical contiguity but since
				 * it is very likely that this is the case
				 * we will assume they are so and undo
				 * the request if necessary.  It would
				 * be great if we could get a hint flag
				 * like HAT_CONTIG which would tell us
				 * the pages are contiguous for sure.
				 */
				sfmmu_memtte(&tte, (*pps)->p_pagenum,
				    attr, ttesz);
				if (!sfmmu_tteload_array(hat, &tte, addr,
				    pps, flags, rid)) {
					break;
				}
			}
		}
		if (ttesz == TTE8K) {
			/*
			 * We were not able to map array using a large page
			 * batch a hmeblk or fraction at a time.
			 */
			numpg = ((uintptr_t)addr >> MMU_PAGESHIFT)
			    & (NHMENTS-1);
			numpg = NHMENTS - numpg;
			ASSERT(numpg <= npgs);
			mapsz = numpg * MMU_PAGESIZE;
			sfmmu_memload_batchsmall(hat, addr, pps, attr, flags,
			    numpg, rid);
		}
		addr += mapsz;
		npgs -= numpg;
		pps += numpg;
	}

	if (npgs) {
		sfmmu_memload_batchsmall(hat, addr, pps, attr, flags, npgs,
		    rid);
	}

	/*
	 * Check TSB and TLB page sizes.
	 */
	if ((flags & HAT_LOAD_SHARE) == 0) {
		sfmmu_check_page_sizes(hat, 1);
	}
}
/*
 * Function tries to batch 8K pages into the same hme blk.
 */
static void
sfmmu_memload_batchsmall(struct hat *hat, caddr_t vaddr, page_t **pps,
	uint_t attr, uint_t flags, pgcnt_t npgs, uint_t rid)
{
	tte_t tte;
	page_t *pp;
	struct hmehash_bucket *hmebp;
	struct hme_blk *hmeblkp;
	int index;

	while (npgs) {
		/*
		 * Acquire the hash bucket.
		 */
		hmebp = sfmmu_tteload_acquire_hashbucket(hat, vaddr, TTE8K,
		    rid);
		ASSERT(hmebp);

		/*
		 * Find the hment block.
		 */
		hmeblkp = sfmmu_tteload_find_hmeblk(hat, hmebp, vaddr,
		    TTE8K, flags, rid);
		ASSERT(hmeblkp);

		do {
			/*
			 * Make the tte.
			 */
			pp = *pps;
			sfmmu_memtte(&tte, pp->p_pagenum, attr, TTE8K);

			/*
			 * Add the translation.
			 */
			(void) sfmmu_tteload_addentry(hat, hmeblkp, &tte,
			    vaddr, pps, flags, rid);

			/*
			 * Goto next page.
			 */
			pps++;
			npgs--;

			/*
			 * Goto next address.
			 */
			vaddr += MMU_PAGESIZE;

			/*
			 * Don't crossover into a different hmentblk.
			 */
			index = (int)(((uintptr_t)vaddr >> MMU_PAGESHIFT) &
			    (NHMENTS-1));

		} while (index != 0 && npgs != 0);

		/*
		 * Release the hash bucket.
		 */
		sfmmu_tteload_release_hashbucket(hmebp);
	}
}
/*
 * Construct a tte for a page:
 *
 * tte_valid = 1
 * tte_size2 = size & TTE_SZ2_BITS (Panther and Olympus-C only)
 * tte_size = size & TTE_SZ_BITS
 * tte_nfo = attr & HAT_NOFAULT
 * tte_ie = attr & HAT_STRUCTURE_LE
 * tte_hmenum = hmenum
 * tte_pahi = pp->p_pagenum >> TTE_PASHIFT;
 * tte_palo = pp->p_pagenum & TTE_PALOMASK;
 * tte_ref = 1 (optimization)
 * tte_wr_perm = attr & PROT_WRITE;
 * tte_no_sync = attr & HAT_NOSYNC
 * tte_lock = attr & SFMMU_LOCKTTE
 * tte_cp = !(attr & SFMMU_UNCACHEPTTE)
 * tte_cv = !(attr & SFMMU_UNCACHEVTTE)
 * tte_e = attr & SFMMU_SIDEFFECT
 * tte_priv = !(attr & PROT_USER)
 * tte_hwwr = if nosync is set and it is writable we set the mod bit (opt)
 */
void
sfmmu_memtte(tte_t *ttep, pfn_t pfn, uint_t attr, int tte_sz)
{
	ASSERT(!(attr & ~SFMMU_LOAD_ALLATTR));

	ttep->tte_inthi = MAKE_TTE_INTHI(pfn, attr, tte_sz, 0 /* hmenum */);
	ttep->tte_intlo = MAKE_TTE_INTLO(pfn, attr, tte_sz, 0 /* hmenum */);

	if (TTE_IS_NOSYNC(ttep)) {
		TTE_SET_LOFLAGS(ttep, 0, TTE_NOSYNC_INT);
		if (TTE_IS_WRITABLE(ttep)) {
			TTE_SET_MOD(ttep);
		}
	}
	if (TTE_IS_NFO(ttep) && TTE_IS_EXECUTABLE(ttep)) {
		panic("sfmmu_memtte: can't set both NFO and EXEC bits");
	}
}
/*
 * This function will add a translation to the hme_blk and allocate the
 * hme_blk if one does not exist.
 * If a page structure is specified then it will add the
 * corresponding hment to the mapping list.
 * It will also update the hmenum field for the tte.
 *
 * Currently this function is only used for kernel mappings.
 * So pass invalid region to sfmmu_tteload_array().
 */
void
sfmmu_tteload(struct hat *sfmmup, tte_t *ttep, caddr_t vaddr, page_t *pp,
	uint_t flags)
{
	ASSERT(sfmmup == ksfmmup);
	(void) sfmmu_tteload_array(sfmmup, ttep, vaddr, &pp, flags,
	    SFMMU_INVALID_SHMERID);
}
/*
 * Load (ttep != NULL) or unload (ttep == NULL) one entry in the TSB.
 * Assumes that a particular page size may only be resident in one TSB.
 */
static void
sfmmu_mod_tsb(sfmmu_t *sfmmup, caddr_t vaddr, tte_t *ttep, int ttesz)
{
	struct tsb_info *tsbinfop = NULL;
	uint64_t tag;
	struct tsbe *tsbe_addr;
	uint64_t tsb_base;
	uint_t tsb_size;
	int vpshift = MMU_PAGESHIFT;
	int phys = 0;

	if (sfmmup == ksfmmup) { /* No support for 32/256M ksfmmu pages */
		phys = ktsb_phys;
		if (ttesz >= TTE4M) {
			ASSERT((ttesz != TTE32M) && (ttesz != TTE256M));
			tsb_base = (phys)? ktsb4m_pbase : (uint64_t)ktsb4m_base;
			tsb_size = ktsb4m_szcode;
		} else {
			tsb_base = (phys)? ktsb_pbase : (uint64_t)ktsb_base;
			tsb_size = ktsb_szcode;
		}
	} else {
		SFMMU_GET_TSBINFO(tsbinfop, sfmmup, ttesz);

		/*
		 * If there isn't a TSB for this page size, or the TSB is
		 * swapped out, there is nothing to do.  Note that the latter
		 * case seems impossible but can occur if hat_pageunload()
		 * is called on an ISM mapping while the process is swapped
		 * out.
		 */
		if (tsbinfop == NULL || (tsbinfop->tsb_flags & TSB_SWAPPED))
			return;

		/*
		 * If another thread is in the middle of relocating a TSB
		 * we can't unload the entry so set a flag so that the
		 * TSB will be flushed before it can be accessed by the
		 * process.
		 */
		if ((tsbinfop->tsb_flags & TSB_RELOC_FLAG) != 0) {
			if (ttep == NULL)
				tsbinfop->tsb_flags |= TSB_FLUSH_NEEDED;
			return;
		}
#if defined(UTSB_PHYS)
		phys = 1;
		tsb_base = (uint64_t)tsbinfop->tsb_pa;
#else
		tsb_base = (uint64_t)tsbinfop->tsb_va;
#endif
		tsb_size = tsbinfop->tsb_szc;
	}
	if (ttesz >= TTE4M) {
		vpshift = MMU_PAGESHIFT4M;
	}

	tsbe_addr = sfmmu_get_tsbe(tsb_base, vaddr, vpshift, tsb_size);
	tag = sfmmu_make_tsbtag(vaddr);

	if (ttep == NULL) {
		sfmmu_unload_tsbe(tsbe_addr, tag, phys);
	} else {
		if (ttesz >= TTE4M) {
			SFMMU_STAT(sf_tsb_load4m);
		} else {
			SFMMU_STAT(sf_tsb_load8k);
		}

		sfmmu_load_tsbe(tsbe_addr, tag, ttep, phys);
	}
}
/*
 * Unmap all entries from [start, end) matching the given page size.
 *
 * This function is used primarily to unmap replicated 64K or 512K entries
 * from the TSB that are inserted using the base page size TSB pointer, but
 * it may also be called to unmap a range of addresses from the TSB.
 */
void
sfmmu_unload_tsb_range(sfmmu_t *sfmmup, caddr_t start, caddr_t end, int ttesz)
{
	struct tsb_info *tsbinfop;
	uint64_t tag;
	struct tsbe *tsbe_addr;
	caddr_t vaddr;
	uint64_t tsb_base;
	int vpshift, vpgsz;
	uint_t tsb_size;
	int phys = 0;

	/*
	 * If ttesz == 8K, 64K or 512K, we walk through the range 8K
	 * at a time shooting down any valid entries we encounter.
	 *
	 * If ttesz >= 4M we walk the range 4M at a time shooting
	 * down any valid mappings we find.
	 */
	if (sfmmup == ksfmmup) {
		phys = ktsb_phys;
		if (ttesz >= TTE4M) {
			ASSERT((ttesz != TTE32M) && (ttesz != TTE256M));
			tsb_base = (phys)? ktsb4m_pbase : (uint64_t)ktsb4m_base;
			tsb_size = ktsb4m_szcode;
		} else {
			tsb_base = (phys)? ktsb_pbase : (uint64_t)ktsb_base;
			tsb_size = ktsb_szcode;
		}
	} else {
		SFMMU_GET_TSBINFO(tsbinfop, sfmmup, ttesz);

		/*
		 * If there isn't a TSB for this page size, or the TSB is
		 * swapped out, there is nothing to do.  Note that the latter
		 * case seems impossible but can occur if hat_pageunload()
		 * is called on an ISM mapping while the process is swapped
		 * out.
		 */
		if (tsbinfop == NULL || (tsbinfop->tsb_flags & TSB_SWAPPED))
			return;

		/*
		 * If another thread is in the middle of relocating a TSB
		 * we can't unload the entry so set a flag so that the
		 * TSB will be flushed before it can be accessed by the
		 * process.
		 */
		if ((tsbinfop->tsb_flags & TSB_RELOC_FLAG) != 0) {
			tsbinfop->tsb_flags |= TSB_FLUSH_NEEDED;
			return;
		}
#if defined(UTSB_PHYS)
		phys = 1;
		tsb_base = (uint64_t)tsbinfop->tsb_pa;
#else
		tsb_base = (uint64_t)tsbinfop->tsb_va;
#endif
		tsb_size = tsbinfop->tsb_szc;
	}
	if (ttesz >= TTE4M) {
		vpshift = MMU_PAGESHIFT4M;
		vpgsz = MMU_PAGESIZE4M;
	} else {
		vpshift = MMU_PAGESHIFT;
		vpgsz = MMU_PAGESIZE;
	}

	for (vaddr = start; vaddr < end; vaddr += vpgsz) {
		tag = sfmmu_make_tsbtag(vaddr);
		tsbe_addr = sfmmu_get_tsbe(tsb_base, vaddr, vpshift, tsb_size);
		sfmmu_unload_tsbe(tsbe_addr, tag, phys);
	}
}
/*
 * Select the optimum TSB size given the number of mappings
 * that need to be cached.
 */
static int
sfmmu_select_tsb_szc(pgcnt_t pgcnt)
{
	int szc = 0;

#ifdef DEBUG
	if (tsb_grow_stress) {
		uint32_t randval = (uint32_t)gettick() >> 4;

		return (randval % (tsb_max_growsize + 1));
	}
#endif	/* DEBUG */

	while ((szc < tsb_max_growsize) && (pgcnt > SFMMU_RSS_TSBSIZE(szc)))
		szc++;
	return (szc);
}
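
/*
 * In other words, the loop returns the smallest size code whose TSB can
 * cache pgcnt mappings; if pgcnt exceeds SFMMU_RSS_TSBSIZE(szc) for every
 * size, the result is capped at tsb_max_growsize.
 */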
/*
 * This function will add a translation to the hme_blk and allocate the
 * hme_blk if one does not exist.
 * If a page structure is specified then it will add the
 * corresponding hment to the mapping list.
 * It will also update the hmenum field for the tte.
 * Furthermore, it attempts to create a large page translation
 * for <addr,hat> at page array pps.  It assumes addr and first
 * pp is correctly aligned.  It returns 0 if successful and 1 otherwise.
 */
static int
sfmmu_tteload_array(sfmmu_t *sfmmup, tte_t *ttep, caddr_t vaddr,
	page_t **pps, uint_t flags, uint_t rid)
{
	struct hmehash_bucket *hmebp;
	struct hme_blk *hmeblkp;
	int ret;
	uint_t size;

	/*
	 * Get mapping size.
	 */
	size = TTE_CSZ(ttep);
	ASSERT(!((uintptr_t)vaddr & TTE_PAGE_OFFSET(size)));

	/*
	 * Acquire the hash bucket.
	 */
	hmebp = sfmmu_tteload_acquire_hashbucket(sfmmup, vaddr, size, rid);
	ASSERT(hmebp);

	/*
	 * Find the hment block.
	 */
	hmeblkp = sfmmu_tteload_find_hmeblk(sfmmup, hmebp, vaddr, size, flags,
	    rid);
	ASSERT(hmeblkp);

	/*
	 * Add the translation.
	 */
	ret = sfmmu_tteload_addentry(sfmmup, hmeblkp, ttep, vaddr, pps, flags,
	    rid);

	/*
	 * Release the hash bucket.
	 */
	sfmmu_tteload_release_hashbucket(hmebp);

	return (ret);
}
/*
 * Function locks and returns a pointer to the hash bucket for vaddr and size.
 */
static struct hmehash_bucket *
sfmmu_tteload_acquire_hashbucket(sfmmu_t *sfmmup, caddr_t vaddr, int size,
	uint_t rid)
{
	struct hmehash_bucket *hmebp;
	int hmeshift;
	void *htagid = sfmmutohtagid(sfmmup, rid);

	ASSERT(htagid != NULL);

	hmeshift = HME_HASH_SHIFT(size);

	hmebp = HME_HASH_FUNCTION(htagid, vaddr, hmeshift);

	SFMMU_HASH_LOCK(hmebp);

	return (hmebp);
}
/*
 * Function returns a pointer to an hmeblk in the hash bucket, hmebp. If the
 * hmeblk doesn't exist for the [sfmmup, vaddr & size] signature, a hmeblk is
 * allocated.
 */
static struct hme_blk *
sfmmu_tteload_find_hmeblk(sfmmu_t *sfmmup, struct hmehash_bucket *hmebp,
	caddr_t vaddr, uint_t size, uint_t flags, uint_t rid)
{
	hmeblk_tag hblktag;
	int hmeshift;
	struct hme_blk *hmeblkp, *pr_hblk, *list = NULL;

	SFMMU_VALIDATE_HMERID(sfmmup, rid, vaddr, TTEBYTES(size));

	hblktag.htag_id = sfmmutohtagid(sfmmup, rid);
	ASSERT(hblktag.htag_id != NULL);
	hmeshift = HME_HASH_SHIFT(size);
	hblktag.htag_bspage = HME_HASH_BSPAGE(vaddr, hmeshift);
	hblktag.htag_rehash = HME_HASH_REHASH(size);
	hblktag.htag_rid = rid;

ttearray_realloc:

	HME_HASH_SEARCH_PREV(hmebp, hblktag, hmeblkp, pr_hblk, &list);

	/*
	 * We block until hblk_reserve_lock is released; it's held by
	 * the thread, temporarily using hblk_reserve, until hblk_reserve is
	 * replaced by a hblk from sfmmu8_cache.
	 */
	if (hmeblkp == (struct hme_blk *)hblk_reserve &&
	    hblk_reserve_thread != curthread) {
		SFMMU_HASH_UNLOCK(hmebp);
		mutex_enter(&hblk_reserve_lock);
		mutex_exit(&hblk_reserve_lock);
		SFMMU_STAT(sf_hblk_reserve_hit);
		SFMMU_HASH_LOCK(hmebp);
		goto ttearray_realloc;
	}

	if (hmeblkp == NULL) {
		hmeblkp = sfmmu_hblk_alloc(sfmmup, vaddr, hmebp, size,
		    hblktag, flags, rid);
		ASSERT(!SFMMU_IS_SHMERID_VALID(rid) || hmeblkp->hblk_shared);
		ASSERT(SFMMU_IS_SHMERID_VALID(rid) || !hmeblkp->hblk_shared);
	} else {
		/*
		 * It is possible for 8k and 64k hblks to collide since they
		 * have the same rehash value.  This is because we
		 * lazily free hblks and 8K/64K blks could be lingering.
		 * If we find a size mismatch we free the block and try again.
		 */
		if (get_hblk_ttesz(hmeblkp) != size) {
			ASSERT(!hmeblkp->hblk_vcnt);
			ASSERT(!hmeblkp->hblk_hmecnt);
			sfmmu_hblk_hash_rm(hmebp, hmeblkp, pr_hblk,
			    &list, 0);
			goto ttearray_realloc;
		}
		if (hmeblkp->hblk_shw_bit) {
			/*
			 * if the hblk was previously used as a shadow hblk then
			 * we will change it to a normal hblk
			 */
			ASSERT(!hmeblkp->hblk_shared);
			if (hmeblkp->hblk_shw_mask) {
				sfmmu_shadow_hcleanup(sfmmup, hmeblkp, hmebp);
				ASSERT(SFMMU_HASH_LOCK_ISHELD(hmebp));
				goto ttearray_realloc;
			} else {
				hmeblkp->hblk_shw_bit = 0;
			}
		}
		SFMMU_STAT(sf_hblk_hit);
	}

	/*
	 * hat_memload() should never call kmem_cache_free() for kernel hmeblks;
	 * see block comment showing the stacktrace in sfmmu_hblk_alloc();
	 * set the flag parameter to 1 so that sfmmu_hblks_list_purge() will
	 * just add these hmeblks to the per-cpu pending queue.
	 */
	sfmmu_hblks_list_purge(&list, 1);

	ASSERT(get_hblk_ttesz(hmeblkp) == size);
	ASSERT(!hmeblkp->hblk_shw_bit);
	ASSERT(!SFMMU_IS_SHMERID_VALID(rid) || hmeblkp->hblk_shared);
	ASSERT(SFMMU_IS_SHMERID_VALID(rid) || !hmeblkp->hblk_shared);
	ASSERT(hmeblkp->hblk_tag.htag_rid == rid);

	return (hmeblkp);
}
/*
 * Function adds a tte entry into the hmeblk.  It returns 0 if successful and 1
 * otherwise.
 */
static int
sfmmu_tteload_addentry(sfmmu_t *sfmmup, struct hme_blk *hmeblkp, tte_t *ttep,
	caddr_t vaddr, page_t **pps, uint_t flags, uint_t rid)
{
	page_t *pp = *pps;
	int hmenum, size, remap;
	tte_t tteold, flush_tte;
#ifdef DEBUG
	tte_t orig_old;
#endif /* DEBUG */
	struct sf_hment *sfhme;
	kmutex_t *pml, *pmtx;
	hatlock_t *hatlockp;
	int myflt;

	/*
	 * remove this panic when we decide to let user virtual address
	 * space be >= USERLIMIT.
	 */
	if (!TTE_IS_PRIVILEGED(ttep) && vaddr >= (caddr_t)USERLIMIT)
		panic("user addr %p in kernel space", (void *)vaddr);
#if defined(TTE_IS_GLOBAL)
	if (TTE_IS_GLOBAL(ttep))
		panic("sfmmu_tteload: creating global tte");
#endif

	if (pf_is_memory(sfmmu_ttetopfn(ttep, vaddr)) &&
	    !TTE_IS_PCACHEABLE(ttep) && !sfmmu_allow_nc_trans)
		panic("sfmmu_tteload: non cacheable memory tte");

	/* don't simulate dirty bit for writeable ISM/DISM mappings */
	if ((flags & HAT_LOAD_SHARE) && TTE_IS_WRITABLE(ttep)) {
		TTE_SET_REF(ttep);
		TTE_SET_MOD(ttep);
	}

	if ((flags & HAT_LOAD_SHARE) || !TTE_IS_REF(ttep) ||
	    !TTE_IS_MOD(ttep)) {
		/*
		 * Don't load TSB for dummy as in ISM.  Also don't preload
		 * the TSB if the TTE isn't writable since we're likely to
		 * fault on it again -- preloading can be fairly expensive.
		 */
		flags |= SFMMU_NO_TSBLOAD;
	}

	size = TTE_CSZ(ttep);
	switch (size) {
	case TTE8K:
		SFMMU_STAT(sf_tteload8k);
		break;
	case TTE64K:
		SFMMU_STAT(sf_tteload64k);
		break;
	case TTE512K:
		SFMMU_STAT(sf_tteload512k);
		break;
	case TTE4M:
		SFMMU_STAT(sf_tteload4m);
		break;
	case TTE32M:
		SFMMU_STAT(sf_tteload32m);
		ASSERT(mmu_page_sizes == max_mmu_page_sizes);
		break;
	case TTE256M:
		SFMMU_STAT(sf_tteload256m);
		ASSERT(mmu_page_sizes == max_mmu_page_sizes);
		break;
	}

	ASSERT(!((uintptr_t)vaddr & TTE_PAGE_OFFSET(size)));
	SFMMU_VALIDATE_HMERID(sfmmup, rid, vaddr, TTEBYTES(size));
	ASSERT(!SFMMU_IS_SHMERID_VALID(rid) || hmeblkp->hblk_shared);
	ASSERT(SFMMU_IS_SHMERID_VALID(rid) || !hmeblkp->hblk_shared);

	HBLKTOHME_IDX(sfhme, hmeblkp, vaddr, hmenum);

	/*
	 * Need to grab mlist lock here so that pageunload
	 * will not change tte behind us.
	 */
	if (pp) {
		pml = sfmmu_mlist_enter(pp);
	}

	sfmmu_copytte(&sfhme->hme_tte, &tteold);
	/*
	 * Look for corresponding hment and if valid verify
	 * pfns are equal.
	 */
	remap = TTE_IS_VALID(&tteold);
	if (remap) {
		pfn_t new_pfn, old_pfn;

		old_pfn = TTE_TO_PFN(vaddr, &tteold);
		new_pfn = TTE_TO_PFN(vaddr, ttep);

		if (flags & HAT_LOAD_REMAP) {
			/* make sure we are remapping same type of pages */
			if (pf_is_memory(old_pfn) != pf_is_memory(new_pfn)) {
				panic("sfmmu_tteload - tte remap io<->memory");
			}
			if (old_pfn != new_pfn &&
			    (pp != NULL || sfhme->hme_page != NULL)) {
				panic("sfmmu_tteload - tte remap pp != NULL");
			}
		} else if (old_pfn != new_pfn) {
			panic("sfmmu_tteload - tte remap, hmeblkp 0x%p",
			    (void *)hmeblkp);
		}
		ASSERT(TTE_CSZ(&tteold) == TTE_CSZ(ttep));
	}

	if (pp) {
		if (size == TTE8K) {
			/*
			 * Handle VAC consistency
			 */
			if (!remap && (cache & CACHE_VAC) && !PP_ISNC(pp)) {
				sfmmu_vac_conflict(sfmmup, vaddr, pp);
			}

			if (TTE_IS_WRITABLE(ttep) && PP_ISRO(pp)) {
				pmtx = sfmmu_page_enter(pp);
				PP_CLRRO(pp);
				sfmmu_page_exit(pmtx);
			} else if (!PP_ISMAPPED(pp) &&
			    (!TTE_IS_WRITABLE(ttep)) && !(PP_ISMOD(pp))) {
				pmtx = sfmmu_page_enter(pp);
				if (!(PP_ISMOD(pp))) {
					PP_SETRO(pp);
				}
				sfmmu_page_exit(pmtx);
			}

		} else if (sfmmu_pagearray_setup(vaddr, pps, ttep, remap)) {
			/*
			 * sfmmu_pagearray_setup failed so return
			 */
			sfmmu_mlist_exit(pml);
			return (1);
		}
	}

	/*
	 * Make sure hment is not on a mapping list.
	 */
	ASSERT(remap || (sfhme->hme_page == NULL));

	/* if it is not a remap then hme->next better be NULL */
	ASSERT((!remap) ? sfhme->hme_next == NULL : 1);

	if (flags & HAT_LOAD_LOCK) {
		if ((hmeblkp->hblk_lckcnt + 1) >= MAX_HBLK_LCKCNT) {
			panic("too high lckcnt-hmeblk %p",
			    (void *)hmeblkp);
		}
		atomic_inc_32(&hmeblkp->hblk_lckcnt);

		HBLK_STACK_TRACE(hmeblkp, HBLK_LOCK);
	}

	if (pp && PP_ISNC(pp)) {
		/*
		 * If the physical page is marked to be uncacheable, like
		 * by a vac conflict, make sure the new mapping is also
		 * uncacheable.
		 */
		TTE_CLR_VCACHEABLE(ttep);
		ASSERT(PP_GET_VCOLOR(pp) == NO_VCOLOR);
	}
	ttep->tte_hmenum = hmenum;

#ifdef DEBUG
	orig_old = tteold;
#endif /* DEBUG */

	while (sfmmu_modifytte_try(&tteold, ttep, &sfhme->hme_tte) < 0) {
		if ((sfmmup == KHATID) &&
		    (flags & (HAT_LOAD_LOCK | HAT_LOAD_REMAP))) {
			sfmmu_copytte(&sfhme->hme_tte, &tteold);
		}
#ifdef DEBUG
		chk_tte(&orig_old, &tteold, ttep, hmeblkp);
#endif /* DEBUG */
	}
	ASSERT(TTE_IS_VALID(&sfhme->hme_tte));

	if (!TTE_IS_VALID(&tteold)) {

		atomic_inc_16(&hmeblkp->hblk_vcnt);
		if (rid == SFMMU_INVALID_SHMERID) {
			atomic_inc_ulong(&sfmmup->sfmmu_ttecnt[size]);
		} else {
			sf_srd_t *srdp = sfmmup->sfmmu_srdp;
			sf_region_t *rgnp = srdp->srd_hmergnp[rid];
			/*
			 * We already accounted for region ttecnt's in sfmmu
			 * during hat_join_region() processing. Here we
			 * only update ttecnt's in region structure.
			 */
			atomic_inc_ulong(&rgnp->rgn_ttecnt[size]);
		}
	}

	myflt = (astosfmmu(curthread->t_procp->p_as) == sfmmup);
	if (size > TTE8K && (flags & HAT_LOAD_SHARE) == 0 &&
	    sfmmup != ksfmmup) {
		uchar_t tteflag = 1 << size;
		if (rid == SFMMU_INVALID_SHMERID) {
			if (!(sfmmup->sfmmu_tteflags & tteflag)) {
				hatlockp = sfmmu_hat_enter(sfmmup);
				sfmmup->sfmmu_tteflags |= tteflag;
				sfmmu_hat_exit(hatlockp);
			}
		} else if (!(sfmmup->sfmmu_rtteflags & tteflag)) {
			hatlockp = sfmmu_hat_enter(sfmmup);
			sfmmup->sfmmu_rtteflags |= tteflag;
			sfmmu_hat_exit(hatlockp);
		}
		/*
		 * Update the current CPU tsbmiss area, so the current thread
		 * won't need to take the tsbmiss for the new pagesize.
		 * The other threads in the process will update their tsb
		 * miss area lazily in sfmmu_tsbmiss_exception() when they
		 * fail to find the translation for a newly added pagesize.
		 */
		if (size > TTE64K && myflt) {
			struct tsbmiss *tsbmp;
			kpreempt_disable();
			tsbmp = &tsbmiss_area[CPU->cpu_id];
			if (rid == SFMMU_INVALID_SHMERID) {
				if (!(tsbmp->uhat_tteflags & tteflag)) {
					tsbmp->uhat_tteflags |= tteflag;
				}
			} else {
				if (!(tsbmp->uhat_rtteflags & tteflag)) {
					tsbmp->uhat_rtteflags |= tteflag;
				}
			}
			kpreempt_enable();
		}
	}

	if (size >= TTE4M && (flags & HAT_LOAD_TEXT) &&
	    !SFMMU_FLAGS_ISSET(sfmmup, HAT_4MTEXT_FLAG)) {
		hatlockp = sfmmu_hat_enter(sfmmup);
		SFMMU_FLAGS_SET(sfmmup, HAT_4MTEXT_FLAG);
		sfmmu_hat_exit(hatlockp);
	}

	flush_tte.tte_intlo = (tteold.tte_intlo ^ ttep->tte_intlo) &
	    hw_tte.tte_intlo;
	flush_tte.tte_inthi = (tteold.tte_inthi ^ ttep->tte_inthi) &
	    hw_tte.tte_inthi;

	if (remap && (flush_tte.tte_inthi || flush_tte.tte_intlo)) {
		/*
		 * If remap and new tte differs from old tte we need
		 * to sync the mod bit and flush TLB/TSB.  We don't
		 * need to sync ref bit because we currently always set
		 * ref bit in tteload.
		 */
		ASSERT(TTE_IS_REF(ttep));
		if (TTE_IS_MOD(&tteold)) {
			sfmmu_ttesync(sfmmup, vaddr, &tteold, pp);
		}
		/*
		 * hwtte bits shouldn't change for SRD hmeblks as long as SRD
		 * hmes are only used for read only text. Adding this code for
		 * completeness and future use of shared hmeblks with writable
		 * mappings of VMODSORT vnodes.
		 */
		if (hmeblkp->hblk_shared) {
			cpuset_t cpuset = sfmmu_rgntlb_demap(vaddr,
			    sfmmup->sfmmu_srdp->srd_hmergnp[rid], hmeblkp, 1);
			xt_sync(cpuset);
			SFMMU_STAT_ADD(sf_region_remap_demap, 1);
		} else {
			sfmmu_tlb_demap(vaddr, sfmmup, hmeblkp, 0, 0);
			xt_sync(sfmmup->sfmmu_cpusran);
		}
	}

	if ((flags & SFMMU_NO_TSBLOAD) == 0) {
		/*
		 * We only preload 8K and 4M mappings into the TSB, since
		 * 64K and 512K mappings are replicated and hence don't
		 * have a single, unique TSB entry. Ditto for 32M/256M.
		 */
		if (size == TTE8K || size == TTE4M) {
			sf_scd_t *scdp;
			hatlockp = sfmmu_hat_enter(sfmmup);
			/*
			 * Don't preload private TSB if the mapping is used
			 * by the shctx in the SCD.
			 */
			scdp = sfmmup->sfmmu_scdp;
			if (rid == SFMMU_INVALID_SHMERID || scdp == NULL ||
			    !SF_RGNMAP_TEST(scdp->scd_hmeregion_map, rid)) {
				sfmmu_load_tsb(sfmmup, vaddr, &sfhme->hme_tte,
				    size);
			}
			sfmmu_hat_exit(hatlockp);
		}
	}
	if (pp) {
		if (!remap) {
			HME_ADD(sfhme, pp);
			atomic_inc_16(&hmeblkp->hblk_hmecnt);
			ASSERT(hmeblkp->hblk_hmecnt > 0);

			/*
			 * Cannot ASSERT(hmeblkp->hblk_hmecnt <= NHMENTS)
			 * see pageunload() for comment.
			 */
		}
		sfmmu_mlist_exit(pml);
	}

	return (0);
}
/*
 * Function unlocks hash bucket.
 */
static void
sfmmu_tteload_release_hashbucket(struct hmehash_bucket *hmebp)
{
	ASSERT(SFMMU_HASH_LOCK_ISHELD(hmebp));
	SFMMU_HASH_UNLOCK(hmebp);
}
/*
 * function which checks and sets up page array for a large
 * translation.  Will set p_vcolor, p_index, p_ro fields.
 * Assumes addr and pfnum of first page are properly aligned.
 * Will check for physical contiguity. If the check fails it
 * returns 1.
 */
static int
sfmmu_pagearray_setup(caddr_t addr, page_t **pps, tte_t *ttep, int remap)
{
	int i, index, ttesz;
	pfn_t pfnum;
	pgcnt_t npgs;
	page_t *pp, *pp1;
	kmutex_t *pmtx;
	int osz;
	int cflags = 0;
	int vac_err = 0;
	int newidx = 0;

	ttesz = TTE_CSZ(ttep);

	ASSERT(ttesz > TTE8K);

	npgs = TTEPAGES(ttesz);
	index = PAGESZ_TO_INDEX(ttesz);

	pfnum = (*pps)->p_pagenum;
	ASSERT(IS_P2ALIGNED(pfnum, npgs));

	/*
	 * Save the first pp so we can do HAT_TMPNC at the end.
	 */
	pp1 = *pps;
	osz = fnd_mapping_sz(pp1);

	for (i = 0; i < npgs; i++, pps++) {
		pp = *pps;
		ASSERT(PAGE_LOCKED(pp));
		ASSERT(pp->p_szc >= ttesz);
		ASSERT(pp->p_szc == pp1->p_szc);
		ASSERT(sfmmu_mlist_held(pp));

		/*
		 * XXX is it possible to maintain P_RO on the root only?
		 */
		if (TTE_IS_WRITABLE(ttep) && PP_ISRO(pp)) {
			pmtx = sfmmu_page_enter(pp);
			PP_CLRRO(pp);
			sfmmu_page_exit(pmtx);
		} else if (!PP_ISMAPPED(pp) && !TTE_IS_WRITABLE(ttep) &&
		    !PP_ISMOD(pp)) {
			pmtx = sfmmu_page_enter(pp);
			if (!(PP_ISMOD(pp))) {
				PP_SETRO(pp);
			}
			sfmmu_page_exit(pmtx);
		}

		/*
		 * If this is a remap we skip vac & contiguity checks.
		 */
		if (remap)
			continue;

		/*
		 * set p_vcolor and detect any vac conflicts.
		 */
		if (vac_err == 0) {
			vac_err = sfmmu_vacconflict_array(addr, pp, &cflags);
		}

		/*
		 * Save current index in case we need to undo it.
		 * Note: "PAGESZ_TO_INDEX(sz)	(1 << (sz))"
		 *	"SFMMU_INDEX_SHIFT	6"
		 *	"SFMMU_INDEX_MASK	((1 << SFMMU_INDEX_SHIFT) - 1)"
		 *	"PP_MAPINDEX(p_index)	(p_index & SFMMU_INDEX_MASK)"
		 *
		 * So:	index = PAGESZ_TO_INDEX(ttesz);
		 *	if ttesz == 1 then index = 0x2
		 *		    2 then index = 0x4
		 *		    3 then index = 0x8
		 *		    4 then index = 0x10
		 *		    5 then index = 0x20
		 * The code below checks if it's a new pagesize (ie, newidx)
		 * in case we need to take it back out of p_index,
		 * and then or's the new index into the existing index.
		 */
		if ((PP_MAPINDEX(pp) & index) == 0)
			newidx = 1;
		pp->p_index = (PP_MAPINDEX(pp) | index);

		/*
		 * contiguity check
		 */
		if (pp->p_pagenum != pfnum) {
			/*
			 * If we fail the contiguity test then
			 * the only thing we need to fix is the p_index field.
			 * We might get a few extra flushes but since this
			 * path is rare that is ok.  The p_ro field will
			 * get automatically fixed on the next tteload to
			 * the page.  NO TNC bit is set yet.
			 */
			while (i >= 0) {
				pp = *pps;
				if (newidx)
					pp->p_index = (PP_MAPINDEX(pp) &
					    ~index);
				pps--;
				i--;
			}
			return (1);
		}
		pfnum++;
		addr += MMU_PAGESIZE;
	}

	if (vac_err) {
		if (ttesz > osz) {
			/*
			 * There are some smaller mappings that causes vac
			 * conflicts. Convert all existing small mappings to
			 * TNC.
			 */
			SFMMU_STAT_ADD(sf_uncache_conflict, npgs);
			sfmmu_page_cache_array(pp1, HAT_TMPNC, CACHE_FLUSH,
			    npgs);
		} else {
			/*
			 * If there exists a big page mapping,
			 * that means the whole existing big page
			 * has TNC setting already. No need to convert to
			 * TNC again.
			 */
			ASSERT(PP_ISTNC(pp1));
		}
	}

	return (0);
}
/*
 * Routine that detects vac consistency for a large page.  It also
 * sets virtual color for all pp's for this big mapping.
 */
static int
sfmmu_vacconflict_array(caddr_t addr, page_t *pp, int *cflags)
{
	int vcolor, ocolor;

	ASSERT(sfmmu_mlist_held(pp));

	if (PP_ISNC(pp)) {
		return (HAT_TMPNC);
	}

	vcolor = addr_to_vcolor(addr);
	if (PP_NEWPAGE(pp)) {
		PP_SET_VCOLOR(pp, vcolor);
		return (0);
	}

	ocolor = PP_GET_VCOLOR(pp);
	if (ocolor == vcolor) {
		return (0);
	}

	if (!PP_ISMAPPED(pp) && !PP_ISMAPPED_KPM(pp)) {
		/*
		 * Previous user of page had a different color
		 * but since there are no current users
		 * we just flush the cache and change the color.
		 * As an optimization for large pages we flush the
		 * entire cache of that color and set a flag.
		 */
		SFMMU_STAT(sf_pgcolor_conflict);
		if (!CacheColor_IsFlushed(*cflags, ocolor)) {
			CacheColor_SetFlushed(*cflags, ocolor);
			sfmmu_cache_flushcolor(ocolor, pp->p_pagenum);
		}
		PP_SET_VCOLOR(pp, vcolor);
		return (0);
	}

	/*
	 * We got a real conflict with a current mapping.
	 * set flags to start uncaching all mappings
	 * and return failure so we restart looping
	 * the pp array from the beginning.
	 */
	return (HAT_TMPNC);
}
/*
 * creates a large page shadow hmeblk for a tte.
 * The purpose of this routine is to allow us to do quick unloads because
 * the vm layer can easily pass a very large but sparsely populated range.
 */
static struct hme_blk *
sfmmu_shadow_hcreate(sfmmu_t *sfmmup, caddr_t vaddr, int ttesz, uint_t flags)
{
	struct hmehash_bucket *hmebp;
	hmeblk_tag hblktag;
	int hmeshift, size, vshift;
	uint_t shw_mask, newshw_mask;
	struct hme_blk *hmeblkp;

	ASSERT(sfmmup != KHATID);
	if (mmu_page_sizes == max_mmu_page_sizes) {
		ASSERT(ttesz < TTE256M);
	} else {
		ASSERT(ttesz < TTE4M);
		ASSERT(sfmmup->sfmmu_ttecnt[TTE32M] == 0);
		ASSERT(sfmmup->sfmmu_ttecnt[TTE256M] == 0);
	}

	if (ttesz == TTE8K) {
		size = TTE512K;
	} else {
		size = ++ttesz;
	}

	hblktag.htag_id = sfmmup;
	hmeshift = HME_HASH_SHIFT(size);
	hblktag.htag_bspage = HME_HASH_BSPAGE(vaddr, hmeshift);
	hblktag.htag_rehash = HME_HASH_REHASH(size);
	hblktag.htag_rid = SFMMU_INVALID_SHMERID;
	hmebp = HME_HASH_FUNCTION(sfmmup, vaddr, hmeshift);

	SFMMU_HASH_LOCK(hmebp);

	HME_HASH_FAST_SEARCH(hmebp, hblktag, hmeblkp);
	ASSERT(hmeblkp != (struct hme_blk *)hblk_reserve);
	if (hmeblkp == NULL) {
		hmeblkp = sfmmu_hblk_alloc(sfmmup, vaddr, hmebp, size,
		    hblktag, flags, SFMMU_INVALID_SHMERID);
	}
	ASSERT(hmeblkp);
	if (!hmeblkp->hblk_shw_mask) {
		/*
		 * if this is a unused hblk it was just allocated or could
		 * potentially be a previous large page hblk so we need to
		 * set the shadow bit.
		 */
		ASSERT(!hmeblkp->hblk_vcnt && !hmeblkp->hblk_hmecnt);
		hmeblkp->hblk_shw_bit = 1;
	} else if (hmeblkp->hblk_shw_bit == 0) {
		panic("sfmmu_shadow_hcreate: shw bit not set in hmeblkp 0x%p",
		    (void *)hmeblkp);
	}
	ASSERT(hmeblkp->hblk_shw_bit == 1);
	ASSERT(!hmeblkp->hblk_shared);
	vshift = vaddr_to_vshift(hblktag, vaddr, size);
	ASSERT(vshift < 8);
	/*
	 * Atomically set shw mask bit
	 */
	do {
		shw_mask = hmeblkp->hblk_shw_mask;
		newshw_mask = shw_mask | (1 << vshift);
		newshw_mask = atomic_cas_32(&hmeblkp->hblk_shw_mask, shw_mask,
		    newshw_mask);
	} while (newshw_mask != shw_mask);

	SFMMU_HASH_UNLOCK(hmebp);

	return (hmeblkp);
}
/*
 * This routine cleans up a previous shadow hmeblk and changes it to
 * a regular hblk.  This happens rarely but it is possible
 * when a process wants to use large pages and there are hblks still
 * lying around from the previous as that used these hmeblks.
 * The alternative was to cleanup the shadow hblks at unload time
 * but since so few user processes actually use large pages, it is
 * better to be lazy and cleanup at this time.
 */
static void
sfmmu_shadow_hcleanup(sfmmu_t *sfmmup, struct hme_blk *hmeblkp,
	struct hmehash_bucket *hmebp)
{
	caddr_t addr, endaddr;
	int hashno, size;

	ASSERT(hmeblkp->hblk_shw_bit);
	ASSERT(!hmeblkp->hblk_shared);

	ASSERT(SFMMU_HASH_LOCK_ISHELD(hmebp));

	if (!hmeblkp->hblk_shw_mask) {
		hmeblkp->hblk_shw_bit = 0;
		return;
	}
	addr = (caddr_t)get_hblk_base(hmeblkp);
	endaddr = get_hblk_endaddr(hmeblkp);
	size = get_hblk_ttesz(hmeblkp);
	hashno = size - 1;
	ASSERT(hashno > 0);
	SFMMU_HASH_UNLOCK(hmebp);

	sfmmu_free_hblks(sfmmup, addr, endaddr, hashno);

	SFMMU_HASH_LOCK(hmebp);
}
static void
sfmmu_free_hblks(sfmmu_t *sfmmup, caddr_t addr, caddr_t endaddr,
	int hashno)
{
	int hmeshift, shadow = 0;
	hmeblk_tag hblktag;
	struct hmehash_bucket *hmebp;
	struct hme_blk *hmeblkp;
	struct hme_blk *nx_hblk, *pr_hblk, *list = NULL;

	ASSERT(hashno > 0);
	hblktag.htag_id = sfmmup;
	hblktag.htag_rehash = hashno;
	hblktag.htag_rid = SFMMU_INVALID_SHMERID;

	hmeshift = HME_HASH_SHIFT(hashno);

	while (addr < endaddr) {
		hblktag.htag_bspage = HME_HASH_BSPAGE(addr, hmeshift);
		hmebp = HME_HASH_FUNCTION(sfmmup, addr, hmeshift);
		SFMMU_HASH_LOCK(hmebp);
		/* inline HME_HASH_SEARCH */
		hmeblkp = hmebp->hmeblkp;
		pr_hblk = NULL;
		while (hmeblkp) {
			if (HTAGS_EQ(hmeblkp->hblk_tag, hblktag)) {
				/* found hme_blk */
				ASSERT(!hmeblkp->hblk_shared);
				if (hmeblkp->hblk_shw_bit) {
					if (hmeblkp->hblk_shw_mask) {
						shadow = 1;
						sfmmu_shadow_hcleanup(sfmmup,
						    hmeblkp, hmebp);
						break;
					} else {
						hmeblkp->hblk_shw_bit = 0;
					}
				}

				/*
				 * Hblk_hmecnt and hblk_vcnt could be non zero
				 * since hblk_unload() does not guarantee that.
				 *
				 * XXX - this could cause tteload() to spin
				 * where sfmmu_shadow_hcleanup() is called.
				 */
			}

			nx_hblk = hmeblkp->hblk_next;
			if (!hmeblkp->hblk_vcnt && !hmeblkp->hblk_hmecnt) {
				sfmmu_hblk_hash_rm(hmebp, hmeblkp, pr_hblk,
				    &list, 0);
			} else {
				pr_hblk = hmeblkp;
			}
			hmeblkp = nx_hblk;
		}

		SFMMU_HASH_UNLOCK(hmebp);

		if (shadow) {
			/*
			 * We found another shadow hblk so cleaned its
			 * children.  We need to go back and cleanup
			 * the original hblk so we don't change the
			 * addr.
			 */
			shadow = 0;
		} else {
			addr = (caddr_t)roundup((uintptr_t)addr + 1,
			    (1 << hmeshift));
		}
	}
	sfmmu_hblks_list_purge(&list, 0);
}
/*
 * This routine's job is to delete stale invalid shared hmeregions hmeblks that
 * may still linger on after pageunload.
 */
static void
sfmmu_cleanup_rhblk(sf_srd_t *srdp, caddr_t addr, uint_t rid, int ttesz)
{
	int hmeshift;
	hmeblk_tag hblktag;
	struct hmehash_bucket *hmebp;
	struct hme_blk *hmeblkp;
	struct hme_blk *pr_hblk;
	struct hme_blk *list = NULL;

	ASSERT(SFMMU_IS_SHMERID_VALID(rid));
	ASSERT(rid < SFMMU_MAX_HME_REGIONS);

	hmeshift = HME_HASH_SHIFT(ttesz);
	hblktag.htag_bspage = HME_HASH_BSPAGE(addr, hmeshift);
	hblktag.htag_rehash = ttesz;
	hblktag.htag_rid = rid;
	hblktag.htag_id = srdp;
	hmebp = HME_HASH_FUNCTION(srdp, addr, hmeshift);

	SFMMU_HASH_LOCK(hmebp);
	HME_HASH_SEARCH_PREV(hmebp, hblktag, hmeblkp, pr_hblk, &list);
	if (hmeblkp != NULL) {
		ASSERT(hmeblkp->hblk_shared);
		ASSERT(!hmeblkp->hblk_shw_bit);
		if (hmeblkp->hblk_vcnt || hmeblkp->hblk_hmecnt) {
			panic("sfmmu_cleanup_rhblk: valid hmeblk");
		}
		ASSERT(!hmeblkp->hblk_lckcnt);
		sfmmu_hblk_hash_rm(hmebp, hmeblkp, pr_hblk,
		    &list, 0);
	}
	SFMMU_HASH_UNLOCK(hmebp);
	sfmmu_hblks_list_purge(&list, 0);
}
/* ARGSUSED */
static void
sfmmu_rgn_cb_noop(caddr_t saddr, caddr_t eaddr, caddr_t r_saddr,
	size_t r_size, void *r_obj, u_offset_t r_objoff)
{
}
/*
 * Searches for an hmeblk which maps addr, then unloads this mapping
 * and updates *eaddrp, if the hmeblk is found.
 */
static void
sfmmu_unload_hmeregion_va(sf_srd_t *srdp, uint_t rid, caddr_t addr,
	caddr_t eaddr, int ttesz, caddr_t *eaddrp)
{
	int hmeshift;
	hmeblk_tag hblktag;
	struct hmehash_bucket *hmebp;
	struct hme_blk *hmeblkp;
	struct hme_blk *pr_hblk;
	struct hme_blk *list = NULL;

	ASSERT(SFMMU_IS_SHMERID_VALID(rid));
	ASSERT(rid < SFMMU_MAX_HME_REGIONS);
	ASSERT(ttesz >= HBLK_MIN_TTESZ);

	hmeshift = HME_HASH_SHIFT(ttesz);
	hblktag.htag_bspage = HME_HASH_BSPAGE(addr, hmeshift);
	hblktag.htag_rehash = ttesz;
	hblktag.htag_rid = rid;
	hblktag.htag_id = srdp;
	hmebp = HME_HASH_FUNCTION(srdp, addr, hmeshift);

	SFMMU_HASH_LOCK(hmebp);
	HME_HASH_SEARCH_PREV(hmebp, hblktag, hmeblkp, pr_hblk, &list);
	if (hmeblkp != NULL) {
		ASSERT(hmeblkp->hblk_shared);
		ASSERT(!hmeblkp->hblk_lckcnt);
		if (hmeblkp->hblk_vcnt || hmeblkp->hblk_hmecnt) {
			*eaddrp = sfmmu_hblk_unload(NULL, hmeblkp, addr,
			    eaddr, NULL, HAT_UNLOAD);
			ASSERT(*eaddrp > addr);
		}
		ASSERT(!hmeblkp->hblk_vcnt && !hmeblkp->hblk_hmecnt);
		sfmmu_hblk_hash_rm(hmebp, hmeblkp, pr_hblk,
		    &list, 0);
	}
	SFMMU_HASH_UNLOCK(hmebp);
	sfmmu_hblks_list_purge(&list, 0);
}
static void
sfmmu_unload_hmeregion(sf_srd_t *srdp, sf_region_t *rgnp)
{
	int ttesz = rgnp->rgn_pgszc;
	size_t rsz = rgnp->rgn_size;
	caddr_t rsaddr = rgnp->rgn_saddr;
	caddr_t readdr = rsaddr + rsz;
	caddr_t rhsaddr;
	caddr_t va;
	uint_t rid = rgnp->rgn_id;
	caddr_t cbsaddr;
	caddr_t cbeaddr;
	hat_rgn_cb_func_t *rcbfunc;
	ulong_t cnt;

	ASSERT(SFMMU_IS_SHMERID_VALID(rid));
	ASSERT(rid < SFMMU_MAX_HME_REGIONS);

	ASSERT(IS_P2ALIGNED(rsaddr, TTEBYTES(ttesz)));
	ASSERT(IS_P2ALIGNED(rsz, TTEBYTES(ttesz)));
	if (ttesz < HBLK_MIN_TTESZ) {
		ttesz = HBLK_MIN_TTESZ;
		rhsaddr = (caddr_t)P2ALIGN((uintptr_t)rsaddr, HBLK_MIN_BYTES);
	} else {
		rhsaddr = rsaddr;
	}

	if ((rcbfunc = rgnp->rgn_cb_function) == NULL) {
		rcbfunc = sfmmu_rgn_cb_noop;
	}

	while (ttesz >= HBLK_MIN_TTESZ) {
		cbsaddr = rsaddr;
		cbeaddr = rsaddr;
		if (!(rgnp->rgn_hmeflags & (1 << ttesz))) {
			ttesz--;
			continue;
		}
		cnt = 0;
		va = rsaddr;
		while (va < readdr) {
			ASSERT(va >= rhsaddr);
			if (va != cbeaddr) {
				if (cbeaddr != cbsaddr) {
					ASSERT(cbeaddr > cbsaddr);
					(*rcbfunc)(cbsaddr, cbeaddr,
					    rsaddr, rsz, rgnp->rgn_obj,
					    rgnp->rgn_objoff);
				}
				cbsaddr = va;
				cbeaddr = va;
			}
			sfmmu_unload_hmeregion_va(srdp, rid, va, readdr,
			    ttesz, &cbeaddr);
			cnt++;
			va = rhsaddr + (cnt << TTE_PAGE_SHIFT(ttesz));
		}
		if (cbeaddr != cbsaddr) {
			ASSERT(cbeaddr > cbsaddr);
			(*rcbfunc)(cbsaddr, cbeaddr, rsaddr,
			    rsz, rgnp->rgn_obj, rgnp->rgn_objoff);
		}
		ttesz--;
	}
}
/*
 * Release one hardware address translation lock on the given address range.
 */
void
hat_unlock(struct hat *sfmmup, caddr_t addr, size_t len)
{
	struct hmehash_bucket *hmebp;
	hmeblk_tag hblktag;
	int hmeshift, hashno = 1;
	struct hme_blk *hmeblkp, *list = NULL;
	caddr_t endaddr;

	ASSERT(sfmmup != NULL);
	ASSERT(sfmmup->sfmmu_xhat_provider == NULL);

	ASSERT((sfmmup == ksfmmup) || AS_LOCK_HELD(sfmmup->sfmmu_as));
	ASSERT((len & MMU_PAGEOFFSET) == 0);
	endaddr = addr + len;
	hblktag.htag_id = sfmmup;
	hblktag.htag_rid = SFMMU_INVALID_SHMERID;

	/*
	 * Spitfire supports 4 page sizes.
	 * Most pages are expected to be of the smallest page size (8K) and
	 * these will not need to be rehashed. 64K pages also don't need to be
	 * rehashed because an hmeblk spans 64K of address space. 512K pages
	 * might need 1 rehash and 4M pages might need 2 rehashes.
	 */
	while (addr < endaddr) {
		hmeshift = HME_HASH_SHIFT(hashno);
		hblktag.htag_bspage = HME_HASH_BSPAGE(addr, hmeshift);
		hblktag.htag_rehash = hashno;
		hmebp = HME_HASH_FUNCTION(sfmmup, addr, hmeshift);

		SFMMU_HASH_LOCK(hmebp);

		HME_HASH_SEARCH(hmebp, hblktag, hmeblkp, &list);
		if (hmeblkp != NULL) {
			ASSERT(!hmeblkp->hblk_shared);
			/*
			 * If we encounter a shadow hmeblk then
			 * we know there are no valid hmeblks mapping
			 * this address at this size or larger.
			 * Just increment address by the smallest
			 * page size.
			 */
			if (hmeblkp->hblk_shw_bit) {
				addr += MMU_PAGESIZE;
			} else {
				addr = sfmmu_hblk_unlock(hmeblkp, addr,
				    endaddr);
			}
			SFMMU_HASH_UNLOCK(hmebp);
			hashno = 1;
			continue;
		}
		SFMMU_HASH_UNLOCK(hmebp);

		if (!HME_REHASH(sfmmup) || (hashno >= mmu_hashcnt)) {
			/*
			 * We have traversed the whole list and rehashed
			 * if necessary without finding the address to unlock
			 * which should never happen.
			 */
			panic("sfmmu_unlock: addr not found. "
			    "addr %p hat %p", (void *)addr, (void *)sfmmup);
		} else {
			hashno++;
		}
	}

	sfmmu_hblks_list_purge(&list, 0);
}
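
/*
 * Region-aware variant of hat_unlock(): an invalid cookie falls back to
 * hat_unlock(); otherwise the cookie is decoded into an hme region id and
 * the shared-region hmeblks covering [addr, addr + len) are unlocked.
 */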
void
hat_unlock_region(struct hat *sfmmup, caddr_t addr, size_t len,
	hat_region_cookie_t rcookie)
{
	sf_srd_t *srdp;
	sf_region_t *rgnp;
	int ttesz;
	uint_t rid;
	caddr_t eaddr;
	caddr_t va;
	int hmeshift;
	hmeblk_tag hblktag;
	struct hmehash_bucket *hmebp;
	struct hme_blk *hmeblkp;
	struct hme_blk *pr_hblk;
	struct hme_blk *list;

	if (rcookie == HAT_INVALID_REGION_COOKIE) {
		hat_unlock(sfmmup, addr, len);
		return;
	}

	ASSERT(sfmmup != NULL);
	ASSERT(sfmmup->sfmmu_xhat_provider == NULL);
	ASSERT(sfmmup != ksfmmup);

	srdp = sfmmup->sfmmu_srdp;
	rid = (uint_t)((uint64_t)rcookie);
	VERIFY3U(rid, <, SFMMU_MAX_HME_REGIONS);
	eaddr = addr + len;
	va = addr;
	list = NULL;
	rgnp = srdp->srd_hmergnp[rid];
	SFMMU_VALIDATE_HMERID(sfmmup, rid, addr, len);

	ASSERT(IS_P2ALIGNED(addr, TTEBYTES(rgnp->rgn_pgszc)));
	ASSERT(IS_P2ALIGNED(len, TTEBYTES(rgnp->rgn_pgszc)));
	if (rgnp->rgn_pgszc < HBLK_MIN_TTESZ) {
		ttesz = HBLK_MIN_TTESZ;
	} else {
		ttesz = rgnp->rgn_pgszc;
	}
	while (va < eaddr) {
		while (ttesz < rgnp->rgn_pgszc &&
		    IS_P2ALIGNED(va, TTEBYTES(ttesz + 1))) {
			ttesz++;
		}
		while (ttesz >= HBLK_MIN_TTESZ) {
			if (!(rgnp->rgn_hmeflags & (1 << ttesz))) {
				ttesz--;
				continue;
			}
			hmeshift = HME_HASH_SHIFT(ttesz);
			hblktag.htag_bspage = HME_HASH_BSPAGE(va, hmeshift);
			hblktag.htag_rehash = ttesz;
			hblktag.htag_rid = rid;
			hblktag.htag_id = srdp;
			hmebp = HME_HASH_FUNCTION(srdp, va, hmeshift);
			SFMMU_HASH_LOCK(hmebp);
			HME_HASH_SEARCH_PREV(hmebp, hblktag, hmeblkp, pr_hblk,
			    &list);
			if (hmeblkp == NULL) {
				SFMMU_HASH_UNLOCK(hmebp);
				ttesz--;
				continue;
			}
			ASSERT(hmeblkp->hblk_shared);
			va = sfmmu_hblk_unlock(hmeblkp, va, eaddr);
			ASSERT(va >= eaddr ||
			    IS_P2ALIGNED((uintptr_t)va, TTEBYTES(ttesz)));
			SFMMU_HASH_UNLOCK(hmebp);
			break;
		}
		if (ttesz < HBLK_MIN_TTESZ) {
			panic("hat_unlock_region: addr not found "
			    "addr %p hat %p", (void *)va, (void *)sfmmup);
		}
	}
	sfmmu_hblks_list_purge(&list, 0);
}
/*
 * Function to unlock a range of addresses in an hmeblk. It returns the
 * next address that needs to be unlocked.
 * Should be called with the hash lock held.
 */
static caddr_t
sfmmu_hblk_unlock(struct hme_blk *hmeblkp, caddr_t addr, caddr_t endaddr)
{
	struct sf_hment *sfhme;
	tte_t tteold, ttemod;

	ASSERT(in_hblk_range(hmeblkp, addr));
	ASSERT(hmeblkp->hblk_shw_bit == 0);

	endaddr = MIN(endaddr, get_hblk_endaddr(hmeblkp));
	ttesz = get_hblk_ttesz(hmeblkp);

	HBLKTOHME(sfhme, hmeblkp, addr);
	while (addr < endaddr) {
		sfmmu_copytte(&sfhme->hme_tte, &tteold);
		if (TTE_IS_VALID(&tteold)) {

			ret = sfmmu_modifytte_try(&tteold, &ttemod,
			    &sfhme->hme_tte);

			if (hmeblkp->hblk_lckcnt == 0)
				panic("zero hblk lckcnt");

			if (((uintptr_t)addr + TTEBYTES(ttesz)) >
			    (uintptr_t)endaddr)
				panic("can't unlock large tte");

			ASSERT(hmeblkp->hblk_lckcnt > 0);
			atomic_dec_32(&hmeblkp->hblk_lckcnt);
			HBLK_STACK_TRACE(hmeblkp, HBLK_UNLOCK);
		} else {
			panic("sfmmu_hblk_unlock: invalid tte");
		}

		addr += TTEBYTES(ttesz);
	}

	return (addr);
}
/*
 * Physical Address Mapping Framework
 *
 * (1) Applies only to seg_kmem memory pages. To make things easier,
 *     seg_kpm addresses are also accepted by the routines, but nothing
 *     is done with them since by definition their PA mappings are static.
 * (2) hat_add_callback() may only be called while holding the page lock
 *     SE_SHARED or SE_EXCL of the underlying page (e.g., as_pagelock()),
 *     or passing HAC_PAGELOCK flag.
 * (3) prehandler() and posthandler() may not call hat_add_callback() or
 *     hat_delete_callback(), nor should they allocate memory. Post quiesce
 *     callbacks may not sleep or acquire adaptive mutex locks.
 * (4) Either prehandler() or posthandler() (but not both) may be specified
 *     as being NULL. Specifying an errhandler() is optional.
 *
 * Details of using the framework:
 *
 * registering a callback (hat_register_callback())
 *
 *	Pass prehandler, posthandler, errhandler addresses
 *	as described below. If capture_cpus argument is nonzero,
 *	suspend callback to the prehandler will occur with CPUs
 *	captured and executing xc_loop() and CPUs will remain
 *	captured until after the posthandler suspend callback
 *	occurs.
 *
 * adding a callback (hat_add_callback())
 *
 *	hat_add_callback();
 *	save returned pfn in private data structures or program registers;
 *
 * prehandler()
 *
 *	Stop all accesses by physical address to this memory page.
 *	Called twice: the first, PRESUSPEND, is a context safe to acquire
 *	adaptive locks. The second, SUSPEND, is called at high PIL with
 *	CPUs captured so adaptive locks may NOT be acquired (and all spin
 *	locks must be XCALL_PIL or higher locks).
 *
 *	May return the following errors:
 *		EIO:	A fatal error has occurred. This will result in panic.
 *		EAGAIN:	The page cannot be suspended. This will fail the
 *			relocation.
 *
 * posthandler()
 *
 *	Save new pfn in private data structures or program registers;
 *	not allowed to fail (non-zero return values will result in panic).
 *
 * errhandler()
 *
 *	called when an error occurs related to the callback.  Currently
 *	the only such error is HAT_CB_ERR_LEAKED which indicates that
 *	a page is being freed, but there are still outstanding callback(s)
 *	registered on the page.
 *
 * removing a callback (hat_delete_callback(); e.g., prior to freeing memory)
 *
 *	stop using physical address
 *	hat_delete_callback();
 */
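/*
 * Illustrative sketch only (not part of the HAT): minimal handler skeletons
 * and a registration call matching the lifecycle described above.  The names
 * demo_pre, demo_post, demo_err, demo_cb_id, demo_cb_init and DEMO_CB_KEY
 * are hypothetical placeholders for a client subsystem.
 */
#ifdef	EXAMPLE_PA_CALLBACK_CLIENT
#define	DEMO_CB_KEY	0x64656d6f	/* any module-unique key */

static id_t demo_cb_id;

static int
demo_pre(caddr_t vaddr, uint_t len, uint_t flags, void *pvt)
{
	/*
	 * flags is HAT_PRESUSPEND (adaptive locks OK) or HAT_SUSPEND
	 * (high PIL, CPUs captured): stop all physical-address accesses.
	 */
	return (0);
}

static int
demo_post(caddr_t vaddr, uint_t len, uint_t flags, void *pvt, pfn_t newpfn)
{
	/* record newpfn in private state; not allowed to fail */
	return (0);
}

static int
demo_err(caddr_t vaddr, uint_t len, uint_t flags, void *pvt)
{
	/* HAT_CB_ERR_LEAKED: page freed with callbacks still registered */
	return (0);
}

static void
demo_cb_init(void)
{
	/* done once, e.g. at module load; cache the returned id_t */
	demo_cb_id = hat_register_callback(DEMO_CB_KEY, demo_pre, demo_post,
	    demo_err, 0);
}
#endif	/* EXAMPLE_PA_CALLBACK_CLIENT */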
/*
 * Register a callback class.  Each subsystem should do this once and
 * cache the id_t returned for use in setting up and tearing down callbacks.
 *
 * There is no facility for removing callback IDs once they are created;
 * the "key" should be unique for each module, so in case a module is unloaded
 * and subsequently re-loaded, we can recycle the module's previous entry.
 */
id_t
hat_register_callback(int key,
    int (*prehandler)(caddr_t, uint_t, uint_t, void *),
    int (*posthandler)(caddr_t, uint_t, uint_t, void *, pfn_t),
    int (*errhandler)(caddr_t, uint_t, uint_t, void *),
    int capture_cpus)
{
	id_t id;

	/*
	 * Search the table for a pre-existing callback associated with
	 * the identifier "key".  If one exists, we re-use that entry in
	 * the table for this instance, otherwise we assign the next
	 * available table slot.
	 */
	for (id = 0; id < sfmmu_max_cb_id; id++) {
		if (sfmmu_cb_table[id].key == key)
			break;
	}

	if (id == sfmmu_max_cb_id) {
		id = sfmmu_cb_nextid++;
		if (id >= sfmmu_max_cb_id)
			panic("hat_register_callback: out of callback IDs");
	}

	ASSERT(prehandler != NULL || posthandler != NULL);

	sfmmu_cb_table[id].key = key;
	sfmmu_cb_table[id].prehandler = prehandler;
	sfmmu_cb_table[id].posthandler = posthandler;
	sfmmu_cb_table[id].errhandler = errhandler;
	sfmmu_cb_table[id].capture_cpus = capture_cpus;

	return (id);
}
#define	HAC_COOKIE_NONE	(void *)-1

/*
 * Add relocation callbacks to the specified addr/len which will be called
 * when relocating the associated page. See the description of pre and
 * posthandler above for more details.
 *
 * If HAC_PAGELOCK is included in flags, the underlying memory page is
 * locked internally so the caller must be able to deal with the callback
 * running even before this function has returned. If HAC_PAGELOCK is not
 * set, it is assumed that the underlying memory pages are locked.
 *
 * Since the caller must track the individual page boundaries anyway,
 * we only allow a callback to be added to a single page (large
 * or small). Thus [addr, addr + len) MUST be contained within a single
 * page.
 *
 * Registering multiple callbacks on the same [addr, addr+len) is supported,
 * _provided_that_ a unique parameter is specified for each callback.
 * If multiple callbacks are registered on the same range the callback will
 * be invoked with each unique parameter. Registering the same callback with
 * the same argument more than once will result in corrupted kernel state.
 *
 * Returns the pfn of the underlying kernel page in *rpfn
 * on success, or PFN_INVALID on failure.
 *
 * cookiep (if passed) provides storage space for an opaque cookie
 * to return later to hat_delete_callback(). This cookie makes the callback
 * deletion significantly quicker by avoiding a potentially lengthy hash
 * search.
 *
 * Returns values:
 *    ENOMEM:  memory allocation failure (e.g. flags was passed as HAC_NOSLEEP)
 *    EINVAL:  callback ID is not valid
 *    ENXIO:   ["vaddr", "vaddr" + len) is not mapped in the kernel's address
 *             space
 *    ERANGE:  ["vaddr", "vaddr" + len) crosses a page boundary
 */
int
hat_add_callback(id_t callback_id, caddr_t vaddr, uint_t len, uint_t flags,
    void *pvt, pfn_t *rpfn, void **cookiep)
{
	struct hmehash_bucket *hmebp;
	struct hme_blk *hmeblkp;
	int hmeshift, hashno;
	caddr_t saddr, eaddr, baseaddr;
	struct pa_hment *pahmep;
	struct sf_hment *sfhmep, *osfhmep;
	int kmflags = (flags & HAC_SLEEP)? KM_SLEEP : KM_NOSLEEP;

	/*
	 * For KPM mappings, just return the physical address since we
	 * don't need to register any callbacks.
	 */
	if (IS_KPM_ADDR(vaddr)) {
		SFMMU_KPM_VTOP(vaddr, paddr);
		*rpfn = btop(paddr);
		if (cookiep != NULL)
			*cookiep = HAC_COOKIE_NONE;
		return (0);
	}

	if (callback_id < (id_t)0 || callback_id >= sfmmu_cb_nextid) {
		*rpfn = PFN_INVALID;
		return (EINVAL);
	}

	if ((pahmep = kmem_cache_alloc(pa_hment_cache, kmflags)) == NULL) {
		*rpfn = PFN_INVALID;
		return (ENOMEM);
	}

	sfhmep = &pahmep->sfment;

	saddr = (caddr_t)((uintptr_t)vaddr & MMU_PAGEMASK);
	eaddr = saddr + len;

rehash:
	/* Find the mapping(s) for this page */
	for (hashno = TTE64K, hmeblkp = NULL;
	    hmeblkp == NULL && hashno <= mmu_hashcnt;
	    hashno++) {
		hmeshift = HME_HASH_SHIFT(hashno);
		hblktag.htag_id = ksfmmup;
		hblktag.htag_rid = SFMMU_INVALID_SHMERID;
		hblktag.htag_bspage = HME_HASH_BSPAGE(saddr, hmeshift);
		hblktag.htag_rehash = hashno;
		hmebp = HME_HASH_FUNCTION(ksfmmup, saddr, hmeshift);

		SFMMU_HASH_LOCK(hmebp);

		HME_HASH_FAST_SEARCH(hmebp, hblktag, hmeblkp);

		if (hmeblkp == NULL)
			SFMMU_HASH_UNLOCK(hmebp);
	}

	if (hmeblkp == NULL) {
		kmem_cache_free(pa_hment_cache, pahmep);
		*rpfn = PFN_INVALID;
		return (ENXIO);
	}

	ASSERT(!hmeblkp->hblk_shared);

	HBLKTOHME(osfhmep, hmeblkp, saddr);
	sfmmu_copytte(&osfhmep->hme_tte, &tte);

	if (!TTE_IS_VALID(&tte)) {
		SFMMU_HASH_UNLOCK(hmebp);
		kmem_cache_free(pa_hment_cache, pahmep);
		*rpfn = PFN_INVALID;
		return (ENXIO);
	}

	/*
	 * Make sure the boundaries for the callback fall within this
	 * single mapping.
	 */
	baseaddr = (caddr_t)get_hblk_base(hmeblkp);
	ASSERT(saddr >= baseaddr);
	if (eaddr > saddr + TTEBYTES(TTE_CSZ(&tte))) {
		SFMMU_HASH_UNLOCK(hmebp);
		kmem_cache_free(pa_hment_cache, pahmep);
		*rpfn = PFN_INVALID;
		return (ERANGE);
	}

	pfn = sfmmu_ttetopfn(&tte, vaddr);

	/*
	 * The pfn may not have a page_t underneath in which case we
	 * just return it. This can happen if we are doing I/O to a
	 * static portion of the kernel's address space, for instance.
	 */
	pp = osfhmep->hme_page;
	if (pp == NULL) {
		SFMMU_HASH_UNLOCK(hmebp);
		kmem_cache_free(pa_hment_cache, pahmep);
		*rpfn = pfn;
		if (cookiep)
			*cookiep = HAC_COOKIE_NONE;
		return (0);
	}
	ASSERT(pp == PP_PAGEROOT(pp));

	pml = sfmmu_mlist_enter(pp);

	if (flags & HAC_PAGELOCK) {
		if (!page_trylock(pp, SE_SHARED)) {
			/*
			 * Somebody is holding SE_EXCL lock. Might
			 * even be hat_page_relocate(). Drop all
			 * our locks, lookup the page in &kvp, and
			 * retry. If it doesn't exist in &kvp and &zvp,
			 * then we must be dealing with a kernel mapped
			 * page which doesn't actually belong to
			 * segkmem so we punt.
			 */
			sfmmu_mlist_exit(pml);
			SFMMU_HASH_UNLOCK(hmebp);
			pp = page_lookup(&kvp, (u_offset_t)saddr, SE_SHARED);

			/* check zvp before giving up */
			if (pp == NULL)
				pp = page_lookup(&zvp, (u_offset_t)saddr,
				    SE_SHARED);

			/* Okay, we didn't find it, give up */
			if (pp == NULL) {
				kmem_cache_free(pa_hment_cache, pahmep);
				*rpfn = pfn;
				if (cookiep)
					*cookiep = HAC_COOKIE_NONE;
				return (0);
			}
			goto rehash;
		}
	}

	if (!PAGE_LOCKED(pp) && !panicstr)
		panic("hat_add_callback: page 0x%p not locked", (void *)pp);

	if (osfhmep->hme_page != pp || pp->p_vnode != vp ||
	    pp->p_offset != off) {
		/*
		 * The page moved before we got our hands on it. Drop
		 * all the locks and try again.
		 */
		ASSERT((flags & HAC_PAGELOCK) != 0);
		sfmmu_mlist_exit(pml);
		SFMMU_HASH_UNLOCK(hmebp);
		goto rehash;
	}

	if (!VN_ISKAS(vp)) {
		/*
		 * This is not a segkmem page but another page which
		 * has been kernel mapped. It had better have at least
		 * a share lock on it. Return the pfn.
		 */
		sfmmu_mlist_exit(pml);
		SFMMU_HASH_UNLOCK(hmebp);
		kmem_cache_free(pa_hment_cache, pahmep);
		ASSERT(PAGE_LOCKED(pp));
		*rpfn = pfn;
		if (cookiep)
			*cookiep = HAC_COOKIE_NONE;
		return (0);
	}

	/*
	 * Setup this pa_hment and link its embedded dummy sf_hment into
	 * the mapping list.
	 */
	pahmep->cb_id = callback_id;
	pahmep->addr = vaddr;

	sfhmep->hme_tte.ll = 0;
	sfhmep->hme_data = pahmep;
	sfhmep->hme_prev = osfhmep;
	sfhmep->hme_next = osfhmep->hme_next;

	if (osfhmep->hme_next)
		osfhmep->hme_next->hme_prev = sfhmep;

	osfhmep->hme_next = sfhmep;

	sfmmu_mlist_exit(pml);
	SFMMU_HASH_UNLOCK(hmebp);

	*rpfn = pfn;
	if (cookiep)
		*cookiep = (void *)pahmep;

	return (0);
}
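/*
 * Illustrative sketch only: one way a caller might pair hat_add_callback()
 * and hat_delete_callback() around a period of physical-address use, as the
 * interface comments above describe.  demo_pa_use, demo_id, demo_va,
 * demo_len and demo_pvt are hypothetical placeholders; demo_id would have
 * come from an earlier hat_register_callback() call.
 */
#ifdef	EXAMPLE_PA_CALLBACK_USE
static int
demo_pa_use(id_t demo_id, caddr_t demo_va, uint_t demo_len, void *demo_pvt)
{
	pfn_t pfn;
	void *cookie;
	int err;

	/* HAC_PAGELOCK: the HAT takes the underlying page lock for us */
	err = hat_add_callback(demo_id, demo_va, demo_len, HAC_PAGELOCK,
	    demo_pvt, &pfn, &cookie);
	if (err != 0 || pfn == PFN_INVALID)
		return (err);

	/*
	 * ... use the cached pfn here; the registered pre/post handlers
	 * keep it coherent across any page relocation ...
	 */

	/* stop using the physical address, then remove the callback */
	hat_delete_callback(demo_va, demo_len, demo_pvt, HAC_PAGELOCK, cookie);
	return (0);
}
#endif	/* EXAMPLE_PA_CALLBACK_USE */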
/*
 * Remove the relocation callbacks from the specified addr/len.
 */
void
hat_delete_callback(caddr_t vaddr, uint_t len, void *pvt, uint_t flags,
    void *cookie)
{
	struct hmehash_bucket *hmebp;
	struct hme_blk *hmeblkp;
	int hmeshift, hashno;
	struct pa_hment *pahmep;
	struct sf_hment *sfhmep, *osfhmep;

	/*
	 * If the cookie is HAC_COOKIE_NONE then there is no pa_hment to
	 * remove so just return.
	 */
	if (cookie == HAC_COOKIE_NONE || IS_KPM_ADDR(vaddr))
		return;

	saddr = (caddr_t)((uintptr_t)vaddr & MMU_PAGEMASK);

rehash:
	/* Find the mapping(s) for this page */
	for (hashno = TTE64K, hmeblkp = NULL;
	    hmeblkp == NULL && hashno <= mmu_hashcnt;
	    hashno++) {
		hmeshift = HME_HASH_SHIFT(hashno);
		hblktag.htag_id = ksfmmup;
		hblktag.htag_rid = SFMMU_INVALID_SHMERID;
		hblktag.htag_bspage = HME_HASH_BSPAGE(saddr, hmeshift);
		hblktag.htag_rehash = hashno;
		hmebp = HME_HASH_FUNCTION(ksfmmup, saddr, hmeshift);

		SFMMU_HASH_LOCK(hmebp);

		HME_HASH_FAST_SEARCH(hmebp, hblktag, hmeblkp);

		if (hmeblkp == NULL)
			SFMMU_HASH_UNLOCK(hmebp);
	}

	if (hmeblkp == NULL)
		return;

	ASSERT(!hmeblkp->hblk_shared);

	HBLKTOHME(osfhmep, hmeblkp, saddr);

	sfmmu_copytte(&osfhmep->hme_tte, &tte);
	if (!TTE_IS_VALID(&tte)) {
		SFMMU_HASH_UNLOCK(hmebp);
		return;
	}

	pp = osfhmep->hme_page;
	if (pp == NULL) {
		SFMMU_HASH_UNLOCK(hmebp);
		ASSERT(cookie == NULL);
		return;
	}

	pml = sfmmu_mlist_enter(pp);

	if (flags & HAC_PAGELOCK) {
		if (!page_trylock(pp, SE_SHARED)) {
			/*
			 * Somebody is holding SE_EXCL lock. Might
			 * even be hat_page_relocate(). Drop all
			 * our locks, lookup the page in &kvp, and
			 * retry. If it doesn't exist in &kvp and &zvp,
			 * then we must be dealing with a kernel mapped
			 * page which doesn't actually belong to
			 * segkmem so we punt.
			 */
			sfmmu_mlist_exit(pml);
			SFMMU_HASH_UNLOCK(hmebp);
			pp = page_lookup(&kvp, (u_offset_t)saddr, SE_SHARED);
			/* check zvp before giving up */
			if (pp == NULL)
				pp = page_lookup(&zvp, (u_offset_t)saddr,
				    SE_SHARED);

			if (pp == NULL) {
				ASSERT(cookie == NULL);
				return;
			}
			goto rehash;
		}
	}

	ASSERT(PAGE_LOCKED(pp));

	if (osfhmep->hme_page != pp || pp->p_vnode != vp ||
	    pp->p_offset != off) {
		/*
		 * The page moved before we got our hands on it. Drop
		 * all the locks and try again.
		 */
		ASSERT((flags & HAC_PAGELOCK) != 0);
		sfmmu_mlist_exit(pml);
		SFMMU_HASH_UNLOCK(hmebp);
		goto rehash;
	}

	if (!VN_ISKAS(vp)) {
		/*
		 * This is not a segkmem page but another page which
		 * has been kernel mapped.
		 */
		sfmmu_mlist_exit(pml);
		SFMMU_HASH_UNLOCK(hmebp);
		ASSERT(cookie == NULL);
		return;
	}

	if (cookie != NULL) {
		pahmep = (struct pa_hment *)cookie;
		sfhmep = &pahmep->sfment;
	} else {
		for (sfhmep = pp->p_mapping; sfhmep != NULL;
		    sfhmep = sfhmep->hme_next) {

			/*
			 * skip va<->pa mappings
			 */
			if (!IS_PAHME(sfhmep))
				continue;

			pahmep = sfhmep->hme_data;
			ASSERT(pahmep != NULL);

			/*
			 * if pa_hment matches, remove it
			 */
			if ((pahmep->pvt == pvt) &&
			    (pahmep->addr == vaddr) &&
			    (pahmep->len == len)) {
				break;
			}
		}
	}

	if (sfhmep == NULL) {
		panic("hat_delete_callback: pa_hment not found, pp %p",
		    (void *)pp);
	}

	/*
	 * Note: at this point a valid kernel mapping must still be
	 * present on this page.
	 */
	if (pp->p_share <= 0)
		panic("hat_delete_callback: zero p_share");

	if (--pahmep->refcnt == 0) {
		if (pahmep->flags != 0)
			panic("hat_delete_callback: pa_hment is busy");

		/*
		 * Remove sfhmep from the mapping list for the page.
		 */
		if (sfhmep->hme_prev) {
			sfhmep->hme_prev->hme_next = sfhmep->hme_next;
		} else {
			pp->p_mapping = sfhmep->hme_next;
		}

		if (sfhmep->hme_next)
			sfhmep->hme_next->hme_prev = sfhmep->hme_prev;

		sfmmu_mlist_exit(pml);
		SFMMU_HASH_UNLOCK(hmebp);

		kmem_cache_free(pa_hment_cache, pahmep);
		return;
	}

	sfmmu_mlist_exit(pml);
	SFMMU_HASH_UNLOCK(hmebp);
}
/*
 * hat_probe returns 1 if the translation for the address 'addr' is
 * loaded, zero otherwise.
 *
 * hat_probe should be used only for advisory purposes because it may
 * occasionally return the wrong value. The implementation must guarantee that
 * returning the wrong value is a very rare event. hat_probe is used
 * to implement optimizations in the segment drivers.
 */
int
hat_probe(struct hat *sfmmup, caddr_t addr)
{
	ASSERT(sfmmup != NULL);
	ASSERT(sfmmup->sfmmu_xhat_provider == NULL);

	ASSERT((sfmmup == ksfmmup) || AS_LOCK_HELD(sfmmup->sfmmu_as));

	if (sfmmup == ksfmmup) {
		while ((pfn = sfmmu_vatopfn(addr, sfmmup, &tte))
		    == PFN_SUSPENDED) {
			sfmmu_vatopfn_suspended(addr, sfmmup, &tte);
		}
	} else {
		pfn = sfmmu_uvatopfn(addr, sfmmup, NULL);
	}

	if (pfn != PFN_INVALID)
		return (1);
	else
		return (0);
}
*sfmmup
, caddr_t addr
)
4792 ASSERT(sfmmup
->sfmmu_xhat_provider
== NULL
);
4794 if (sfmmup
== ksfmmup
) {
4795 if (sfmmu_vatopfn(addr
, sfmmup
, &tte
) == PFN_INVALID
) {
4799 if (sfmmu_uvatopfn(addr
, sfmmup
, &tte
) == PFN_INVALID
) {
4804 ASSERT(TTE_IS_VALID(&tte
));
4805 return (TTEBYTES(TTE_CSZ(&tte
)));
4809 hat_getattr(struct hat
*sfmmup
, caddr_t addr
, uint_t
*attr
)
4813 ASSERT(sfmmup
->sfmmu_xhat_provider
== NULL
);
4815 if (sfmmup
== ksfmmup
) {
4816 if (sfmmu_vatopfn(addr
, sfmmup
, &tte
) == PFN_INVALID
) {
4820 if (sfmmu_uvatopfn(addr
, sfmmup
, &tte
) == PFN_INVALID
) {
4824 if (TTE_IS_VALID(&tte
)) {
4825 *attr
= sfmmu_ptov_attr(&tte
);
4829 return ((uint_t
)0xffffffff);
4833 * Enables more attributes on specified address range (ie. logical OR)
4836 hat_setattr(struct hat
*hat
, caddr_t addr
, size_t len
, uint_t attr
)
4838 if (hat
->sfmmu_xhat_provider
) {
4839 XHAT_SETATTR(hat
, addr
, len
, attr
);
4843 * This must be a CPU HAT. If the address space has
4844 * XHATs attached, change attributes for all of them,
4847 ASSERT(hat
->sfmmu_as
!= NULL
);
4848 if (hat
->sfmmu_as
->a_xhat
!= NULL
)
4849 xhat_setattr_all(hat
->sfmmu_as
, addr
, len
, attr
);
4852 sfmmu_chgattr(hat
, addr
, len
, attr
, SFMMU_SETATTR
);
4856 * Assigns attributes to the specified address range. All the attributes
4860 hat_chgattr(struct hat
*hat
, caddr_t addr
, size_t len
, uint_t attr
)
4862 if (hat
->sfmmu_xhat_provider
) {
4863 XHAT_CHGATTR(hat
, addr
, len
, attr
);
4867 * This must be a CPU HAT. If the address space has
4868 * XHATs attached, change attributes for all of them,
4871 ASSERT(hat
->sfmmu_as
!= NULL
);
4872 if (hat
->sfmmu_as
->a_xhat
!= NULL
)
4873 xhat_chgattr_all(hat
->sfmmu_as
, addr
, len
, attr
);
4876 sfmmu_chgattr(hat
, addr
, len
, attr
, SFMMU_CHGATTR
);
4880 * Remove attributes on the specified address range (ie. loginal NAND)
4883 hat_clrattr(struct hat
*hat
, caddr_t addr
, size_t len
, uint_t attr
)
4885 if (hat
->sfmmu_xhat_provider
) {
4886 XHAT_CLRATTR(hat
, addr
, len
, attr
);
4890 * This must be a CPU HAT. If the address space has
4891 * XHATs attached, change attributes for all of them,
4894 ASSERT(hat
->sfmmu_as
!= NULL
);
4895 if (hat
->sfmmu_as
->a_xhat
!= NULL
)
4896 xhat_clrattr_all(hat
->sfmmu_as
, addr
, len
, attr
);
4899 sfmmu_chgattr(hat
, addr
, len
, attr
, SFMMU_CLRATTR
);
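/*
 * Illustrative sketch only: the three exported attribute interfaces above
 * differ only in the mode handed to sfmmu_chgattr().  demo_attr_usage,
 * demo_hat, demo_va and demo_len are hypothetical placeholders.
 */
#ifdef	EXAMPLE_HAT_ATTR_USAGE
static void
demo_attr_usage(struct hat *demo_hat, caddr_t demo_va, size_t demo_len)
{
	/* OR PROT_WRITE into whatever attributes are already present */
	hat_setattr(demo_hat, demo_va, demo_len, PROT_WRITE);

	/* replace the attributes entirely: read-only, user-accessible */
	hat_chgattr(demo_hat, demo_va, demo_len, PROT_READ | PROT_USER);

	/* NAND: take write permission away, leaving the rest untouched */
	hat_clrattr(demo_hat, demo_va, demo_len, PROT_WRITE);
}
#endif	/* EXAMPLE_HAT_ATTR_USAGE */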
/*
 * Change attributes on an address range to that specified by attr and mode.
 */
static void
sfmmu_chgattr(struct hat *sfmmup, caddr_t addr, size_t len, uint_t attr,
    int mode)
{
	struct hmehash_bucket *hmebp;
	int hmeshift, hashno = 1;
	struct hme_blk *hmeblkp, *list = NULL;

	CPUSET_ZERO(cpuset);

	ASSERT((sfmmup == ksfmmup) || AS_LOCK_HELD(sfmmup->sfmmu_as));
	ASSERT((len & MMU_PAGEOFFSET) == 0);
	ASSERT(((uintptr_t)addr & MMU_PAGEOFFSET) == 0);

	if ((attr & PROT_USER) && (mode != SFMMU_CLRATTR) &&
	    ((addr + len) > (caddr_t)USERLIMIT)) {
		panic("user addr %p in kernel space",
		    (void *)addr);
	}

	endaddr = addr + len;
	hblktag.htag_id = sfmmup;
	hblktag.htag_rid = SFMMU_INVALID_SHMERID;
	DEMAP_RANGE_INIT(sfmmup, &dmr);

	while (addr < endaddr) {
		hmeshift = HME_HASH_SHIFT(hashno);
		hblktag.htag_bspage = HME_HASH_BSPAGE(addr, hmeshift);
		hblktag.htag_rehash = hashno;
		hmebp = HME_HASH_FUNCTION(sfmmup, addr, hmeshift);

		SFMMU_HASH_LOCK(hmebp);

		HME_HASH_SEARCH(hmebp, hblktag, hmeblkp, &list);
		if (hmeblkp != NULL) {
			ASSERT(!hmeblkp->hblk_shared);
			/*
			 * We've encountered a shadow hmeblk so skip the range
			 * of the next smaller mapping size.
			 */
			if (hmeblkp->hblk_shw_bit) {
				ASSERT(sfmmup != ksfmmup);
				addr = (caddr_t)P2END((uintptr_t)addr,
				    TTEBYTES(hashno - 1));
			} else {
				addr = sfmmu_hblk_chgattr(sfmmup,
				    hmeblkp, addr, endaddr, &dmr, attr, mode);
			}
			SFMMU_HASH_UNLOCK(hmebp);
			continue;
		}
		SFMMU_HASH_UNLOCK(hmebp);

		if (!HME_REHASH(sfmmup) || (hashno >= mmu_hashcnt)) {
			/*
			 * We have traversed the whole list and rehashed
			 * if necessary without finding the address to chgattr.
			 * This is ok, so we increment the address by the
			 * smallest hmeblk range for kernel mappings or for
			 * user mappings with no large pages, and the largest
			 * hmeblk range, to account for shadow hmeblks, for
			 * user mappings with large pages and continue.
			 */
			if (sfmmup == ksfmmup)
				addr = (caddr_t)P2END((uintptr_t)addr,
				    TTEBYTES(1));
			else
				addr = (caddr_t)P2END((uintptr_t)addr,
				    TTEBYTES(hashno));
		}
	}

	sfmmu_hblks_list_purge(&list, 0);
	DEMAP_RANGE_FLUSH(&dmr);
	cpuset = sfmmup->sfmmu_cpusran;
/*
 * This function chgattr on a range of addresses in an hmeblk. It returns the
 * next address that needs to be chgattr.
 * It should be called with the hash lock held.
 * XXX It should be possible to optimize chgattr by not flushing every time but
 * on the other hand:
 * 1. do one flush crosscall.
 * 2. only flush if we are increasing permissions (make sure this will work)
 */
static caddr_t
sfmmu_hblk_chgattr(struct hat *sfmmup, struct hme_blk *hmeblkp, caddr_t addr,
    caddr_t endaddr, demap_range_t *dmrp, uint_t attr, int mode)
{
	tte_t tte, tteattr, tteflags, ttemod;
	struct sf_hment *sfhmep;
	struct page *pp = NULL;
	kmutex_t *pml, *pmtx;
	int use_demap_range;
#if defined(SF_ERRATA_57)
	int check_exec;
#endif

	ASSERT(in_hblk_range(hmeblkp, addr));
	ASSERT(hmeblkp->hblk_shw_bit == 0);
	ASSERT(!hmeblkp->hblk_shared);

	endaddr = MIN(endaddr, get_hblk_endaddr(hmeblkp));
	ttesz = get_hblk_ttesz(hmeblkp);

	/*
	 * Flush the current demap region if addresses have been
	 * skipped or the page size doesn't match.
	 */
	use_demap_range = (TTEBYTES(ttesz) == DEMAP_RANGE_PGSZ(dmrp));
	if (use_demap_range) {
		DEMAP_RANGE_CONTINUE(dmrp, addr, endaddr);
	} else if (dmrp != NULL) {
		DEMAP_RANGE_FLUSH(dmrp);
	}

	tteattr.ll = sfmmu_vtop_attr(attr, mode, &tteflags);
#if defined(SF_ERRATA_57)
	check_exec = (sfmmup != ksfmmup) &&
	    AS_TYPE_64BIT(sfmmup->sfmmu_as) &&
	    TTE_IS_EXECUTABLE(&tteattr);
#endif
	HBLKTOHME(sfhmep, hmeblkp, addr);
	while (addr < endaddr) {
		sfmmu_copytte(&sfhmep->hme_tte, &tte);
		if (TTE_IS_VALID(&tte)) {
			if ((tte.ll & tteflags.ll) == tteattr.ll) {
				/*
				 * if the new attr is the same as old
				 */
			}
			if (!TTE_IS_WRITABLE(&tteattr)) {
				/*
				 * make sure we clear hw modify bit if we
				 * are removing write protections
				 */
				tteflags.tte_intlo |= TTE_HWWR_INT;
			}

			pp = sfhmep->hme_page;

			pml = sfmmu_mlist_enter(pp);

			if (pp != sfhmep->hme_page) {
				/*
				 * tte must have been unloaded.
				 */
				sfmmu_mlist_exit(pml);
				continue;
			}

			ASSERT(pp == NULL || sfmmu_mlist_held(pp));

			ttemod.ll = (ttemod.ll & ~tteflags.ll) | tteattr.ll;
			ASSERT(TTE_TO_TTEPFN(&ttemod) == TTE_TO_TTEPFN(&tte));

#if defined(SF_ERRATA_57)
			if (check_exec && addr < errata57_limit)
				ttemod.tte_exec_perm = 0;
#endif
			ret = sfmmu_modifytte_try(&tte, &ttemod,
			    &sfhmep->hme_tte);

			if (ret < 0) {
				/* tte changed underneath us */
				sfmmu_mlist_exit(pml);
				continue;
			}

			if (tteflags.tte_intlo & TTE_HWWR_INT) {
				/*
				 * need to sync if we are clearing modify bit.
				 */
				sfmmu_ttesync(sfmmup, addr, &tte, pp);
			}

			if (pp && PP_ISRO(pp)) {
				if (tteattr.tte_intlo & TTE_WRPRM_INT) {
					pmtx = sfmmu_page_enter(pp);

					sfmmu_page_exit(pmtx);
				}
			}

			if (ret > 0 && use_demap_range) {
				DEMAP_RANGE_MARKPG(dmrp, addr);
			} else if (ret > 0) {
				sfmmu_tlb_demap(addr, sfmmup, hmeblkp, 0, 0);
			}

			sfmmu_mlist_exit(pml);
		}

		addr += TTEBYTES(ttesz);

		DEMAP_RANGE_NEXTPG(dmrp);
	}

	return (addr);
}
/*
 * This routine converts virtual attributes to physical ones. It will
 * update the tteflags field with the tte mask corresponding to the attributes
 * affected and it returns the new attributes. It will also clear the modify
 * bit if we are taking away write permission. This is necessary since the
 * modify bit is the hardware permission bit and we need to clear it in order
 * to detect write faults.
 */
static uint64_t
sfmmu_vtop_attr(uint_t attr, int mode, tte_t *ttemaskp)
{
	tte_t ttevalue;

	ASSERT(!(attr & ~SFMMU_LOAD_ALLATTR));

	switch (mode) {
	case SFMMU_CHGATTR:
		/* all attributes specified */
		ttevalue.tte_inthi = MAKE_TTEATTR_INTHI(attr);
		ttevalue.tte_intlo = MAKE_TTEATTR_INTLO(attr);
		ttemaskp->tte_inthi = TTEINTHI_ATTR;
		ttemaskp->tte_intlo = TTEINTLO_ATTR;
		break;
	case SFMMU_SETATTR:
		ASSERT(!(attr & ~HAT_PROT_MASK));
		/*
		 * a valid tte implies exec and read for sfmmu
		 * so no need to do anything about them.
		 * since privileged access implies user access
		 * PROT_USER doesn't make sense either.
		 */
		if (attr & PROT_WRITE) {
			ttemaskp->tte_intlo |= TTE_WRPRM_INT;
			ttevalue.tte_intlo |= TTE_WRPRM_INT;
		}
		break;
	case SFMMU_CLRATTR:
		/* attributes will be nand with current ones */
		if (attr & ~(PROT_WRITE | PROT_USER)) {
			panic("sfmmu: attr %x not supported", attr);
		}
		if (attr & PROT_WRITE) {
			/* clear both writable and modify bit */
			ttemaskp->tte_intlo |= TTE_WRPRM_INT | TTE_HWWR_INT;
		}
		if (attr & PROT_USER) {
			ttemaskp->tte_intlo |= TTE_PRIV_INT;
			ttevalue.tte_intlo |= TTE_PRIV_INT;
		}
		break;
	default:
		panic("sfmmu_vtop_attr: bad mode %x", mode);
	}
	ASSERT(TTE_TO_TTEPFN(&ttevalue) == 0);
	return (ttevalue.ll);
}
static uint_t
sfmmu_ptov_attr(tte_t *ttep)
{
	uint_t attr;

	ASSERT(TTE_IS_VALID(ttep));

	attr = PROT_READ;

	if (TTE_IS_WRITABLE(ttep)) {
		attr |= PROT_WRITE;
	}
	if (TTE_IS_EXECUTABLE(ttep)) {
		attr |= PROT_EXEC;
	}
	if (!TTE_IS_PRIVILEGED(ttep)) {
		attr |= PROT_USER;
	}
	if (TTE_IS_NFO(ttep)) {
		attr |= HAT_NOFAULT;
	}
	if (TTE_IS_NOSYNC(ttep)) {
		attr |= HAT_NOSYNC;
	}
	if (TTE_IS_SIDEFFECT(ttep)) {
		attr |= SFMMU_SIDEFFECT;
	}
	if (!TTE_IS_VCACHEABLE(ttep)) {
		attr |= SFMMU_UNCACHEVTTE;
	}
	if (!TTE_IS_PCACHEABLE(ttep)) {
		attr |= SFMMU_UNCACHEPTTE;
	}
	return (attr);
}
/*
 * hat_chgprot is a deprecated hat call. New segment drivers
 * should store all attributes and use hat_*attr calls.
 *
 * Change the protections in the virtual address range
 * given to the specified virtual protection. If vprot is ~PROT_WRITE,
 * then remove write permission, leaving the other
 * permissions unchanged. If vprot is ~PROT_USER, remove user permissions.
 */
void
hat_chgprot(struct hat *sfmmup, caddr_t addr, size_t len, uint_t vprot)
{
	struct hmehash_bucket *hmebp;
	int hmeshift, hashno = 1;
	struct hme_blk *hmeblkp, *list = NULL;

	ASSERT((len & MMU_PAGEOFFSET) == 0);
	ASSERT(((uintptr_t)addr & MMU_PAGEOFFSET) == 0);

	if (sfmmup->sfmmu_xhat_provider) {
		XHAT_CHGPROT(sfmmup, addr, len, vprot);
		return;
	}
	/*
	 * This must be a CPU HAT. If the address space has
	 * XHATs attached, change attributes for all of them,
	 */
	ASSERT(sfmmup->sfmmu_as != NULL);
	if (sfmmup->sfmmu_as->a_xhat != NULL)
		xhat_chgprot_all(sfmmup->sfmmu_as, addr, len, vprot);

	CPUSET_ZERO(cpuset);

	if ((vprot != (uint_t)~PROT_WRITE) && (vprot & PROT_USER) &&
	    ((addr + len) > (caddr_t)USERLIMIT)) {
		panic("user addr %p vprot %x in kernel space",
		    (void *)addr, vprot);
	}
	endaddr = addr + len;
	hblktag.htag_id = sfmmup;
	hblktag.htag_rid = SFMMU_INVALID_SHMERID;
	DEMAP_RANGE_INIT(sfmmup, &dmr);

	while (addr < endaddr) {
		hmeshift = HME_HASH_SHIFT(hashno);
		hblktag.htag_bspage = HME_HASH_BSPAGE(addr, hmeshift);
		hblktag.htag_rehash = hashno;
		hmebp = HME_HASH_FUNCTION(sfmmup, addr, hmeshift);

		SFMMU_HASH_LOCK(hmebp);

		HME_HASH_SEARCH(hmebp, hblktag, hmeblkp, &list);
		if (hmeblkp != NULL) {
			ASSERT(!hmeblkp->hblk_shared);
			/*
			 * We've encountered a shadow hmeblk so skip the range
			 * of the next smaller mapping size.
			 */
			if (hmeblkp->hblk_shw_bit) {
				ASSERT(sfmmup != ksfmmup);
				addr = (caddr_t)P2END((uintptr_t)addr,
				    TTEBYTES(hashno - 1));
			} else {
				addr = sfmmu_hblk_chgprot(sfmmup, hmeblkp,
				    addr, endaddr, &dmr, vprot);
			}
			SFMMU_HASH_UNLOCK(hmebp);
			continue;
		}
		SFMMU_HASH_UNLOCK(hmebp);

		if (!HME_REHASH(sfmmup) || (hashno >= mmu_hashcnt)) {
			/*
			 * We have traversed the whole list and rehashed
			 * if necessary without finding the address to chgprot.
			 * This is ok so we increment the address by the
			 * smallest hmeblk range for kernel mappings and the
			 * largest hmeblk range, to account for shadow hmeblks,
			 * for user mappings and continue.
			 */
			if (sfmmup == ksfmmup)
				addr = (caddr_t)P2END((uintptr_t)addr,
				    TTEBYTES(1));
			else
				addr = (caddr_t)P2END((uintptr_t)addr,
				    TTEBYTES(hashno));
		}
	}

	sfmmu_hblks_list_purge(&list, 0);
	DEMAP_RANGE_FLUSH(&dmr);
	cpuset = sfmmup->sfmmu_cpusran;
/*
 * This function chgprots a range of addresses in an hmeblk. It returns the
 * next address that needs to be chgprot.
 * It should be called with the hash lock held.
 * XXX It should be possible to optimize chgprot by not flushing every time but
 * on the other hand:
 * 1. do one flush crosscall.
 * 2. only flush if we are increasing permissions (make sure this will work)
 */
static caddr_t
sfmmu_hblk_chgprot(sfmmu_t *sfmmup, struct hme_blk *hmeblkp, caddr_t addr,
    caddr_t endaddr, demap_range_t *dmrp, uint_t vprot)
{
	struct sf_hment *sfhmep;
	struct page *pp = NULL;
	kmutex_t *pml, *pmtx;
	int use_demap_range;
#if defined(SF_ERRATA_57)
	int check_exec;
#endif

	ASSERT(in_hblk_range(hmeblkp, addr));
	ASSERT(hmeblkp->hblk_shw_bit == 0);
	ASSERT(!hmeblkp->hblk_shared);

	if (get_hblk_ttesz(hmeblkp) != TTE8K &&
	    (endaddr < get_hblk_endaddr(hmeblkp))) {
		panic("sfmmu_hblk_chgprot: partial chgprot of large page");
	}

	endaddr = MIN(endaddr, get_hblk_endaddr(hmeblkp));
	ttesz = get_hblk_ttesz(hmeblkp);

	pprot = sfmmu_vtop_prot(vprot, &tteflags);
#if defined(SF_ERRATA_57)
	check_exec = (sfmmup != ksfmmup) &&
	    AS_TYPE_64BIT(sfmmup->sfmmu_as) &&
	    ((vprot & PROT_EXEC) == PROT_EXEC);
#endif
	HBLKTOHME(sfhmep, hmeblkp, addr);

	/*
	 * Flush the current demap region if addresses have been
	 * skipped or the page size doesn't match.
	 */
	use_demap_range = (TTEBYTES(ttesz) == MMU_PAGESIZE);
	if (use_demap_range) {
		DEMAP_RANGE_CONTINUE(dmrp, addr, endaddr);
	} else if (dmrp != NULL) {
		DEMAP_RANGE_FLUSH(dmrp);
	}

	while (addr < endaddr) {
		sfmmu_copytte(&sfhmep->hme_tte, &tte);
		if (TTE_IS_VALID(&tte)) {
			if (TTE_GET_LOFLAGS(&tte, tteflags) == pprot) {
				/*
				 * if the new protection is the same as old
				 */
			}

			pp = sfhmep->hme_page;

			pml = sfmmu_mlist_enter(pp);

			if (pp != sfhmep->hme_page) {
				/*
				 * tte must have been unloaded
				 * underneath us. Recheck
				 */
				sfmmu_mlist_exit(pml);
				continue;
			}

			ASSERT(pp == NULL || sfmmu_mlist_held(pp));

			TTE_SET_LOFLAGS(&ttemod, tteflags, pprot);
#if defined(SF_ERRATA_57)
			if (check_exec && addr < errata57_limit)
				ttemod.tte_exec_perm = 0;
#endif
			ret = sfmmu_modifytte_try(&tte, &ttemod,
			    &sfhmep->hme_tte);

			if (ret < 0) {
				/* tte changed underneath us */
				sfmmu_mlist_exit(pml);
				continue;
			}

			if (tteflags & TTE_HWWR_INT) {
				/*
				 * need to sync if we are clearing modify bit.
				 */
				sfmmu_ttesync(sfmmup, addr, &tte, pp);
			}

			if (pp && PP_ISRO(pp)) {
				if (pprot & TTE_WRPRM_INT) {
					pmtx = sfmmu_page_enter(pp);

					sfmmu_page_exit(pmtx);
				}
			}

			if (ret > 0 && use_demap_range) {
				DEMAP_RANGE_MARKPG(dmrp, addr);
			} else if (ret > 0) {
				sfmmu_tlb_demap(addr, sfmmup, hmeblkp, 0, 0);
			}

			sfmmu_mlist_exit(pml);
		}

		addr += TTEBYTES(ttesz);

		DEMAP_RANGE_NEXTPG(dmrp);
	}

	return (addr);
}
/*
 * This routine is deprecated and should only be used by hat_chgprot.
 * The correct routine is sfmmu_vtop_attr.
 * This routine converts virtual page protections to physical ones. It will
 * update the tteflags field with the tte mask corresponding to the protections
 * affected and it returns the new protections. It will also clear the modify
 * bit if we are taking away write permission. This is necessary since the
 * modify bit is the hardware permission bit and we need to clear it in order
 * to detect write faults.
 * It accepts the following special protections:
 * ~PROT_WRITE = remove write permissions.
 * ~PROT_USER = remove user permissions.
 */
static uint_t
sfmmu_vtop_prot(uint_t vprot, uint_t *tteflagsp)
{
	if (vprot == (uint_t)~PROT_WRITE) {
		*tteflagsp = TTE_WRPRM_INT | TTE_HWWR_INT;
		return (0);		/* will cause wrprm to be cleared */
	}
	if (vprot == (uint_t)~PROT_USER) {
		*tteflagsp = TTE_PRIV_INT;
		return (0);		/* will cause privprm to be cleared */
	}
	if ((vprot == 0) || (vprot == PROT_USER) ||
	    ((vprot & PROT_ALL) != vprot)) {
		panic("sfmmu_vtop_prot -- bad prot %x", vprot);
	}

	switch (vprot) {
	case (PROT_EXEC | PROT_READ):
		*tteflagsp = TTE_PRIV_INT | TTE_WRPRM_INT | TTE_HWWR_INT;
		return (TTE_PRIV_INT);		/* set prv and clr wrt */
	case (PROT_WRITE | PROT_READ):
	case (PROT_EXEC | PROT_WRITE):
	case (PROT_EXEC | PROT_WRITE | PROT_READ):
		*tteflagsp = TTE_PRIV_INT | TTE_WRPRM_INT;
		return (TTE_PRIV_INT | TTE_WRPRM_INT);	/* set prv and wrt */
	case (PROT_USER | PROT_READ):
	case (PROT_USER | PROT_EXEC):
	case (PROT_USER | PROT_EXEC | PROT_READ):
		*tteflagsp = TTE_PRIV_INT | TTE_WRPRM_INT | TTE_HWWR_INT;
		return (0);			/* clr prv and wrt */
	case (PROT_USER | PROT_WRITE):
	case (PROT_USER | PROT_WRITE | PROT_READ):
	case (PROT_USER | PROT_EXEC | PROT_WRITE):
	case (PROT_USER | PROT_EXEC | PROT_WRITE | PROT_READ):
		*tteflagsp = TTE_PRIV_INT | TTE_WRPRM_INT;
		return (TTE_WRPRM_INT);		/* clr prv and set wrt */
	}
	panic("sfmmu_vtop_prot -- bad prot %x", vprot);
	/*NOTREACHED*/
}
/*
 * Alternate unload for very large virtual ranges. With a true 64 bit VA,
 * the normal algorithm would take too long for a very large VA range with
 * few real mappings. This routine just walks thru all HMEs in the global
 * hash table to find and remove mappings.
 */
static void
hat_unload_large_virtual(struct hat *sfmmup, caddr_t startaddr, size_t len,
    uint_t flags, hat_callback_t *callback)
{
	struct hmehash_bucket *hmebp;
	struct hme_blk *hmeblkp;
	struct hme_blk *pr_hblk = NULL;
	struct hme_blk *nx_hblk;
	struct hme_blk *list = NULL;
	demap_range_t dmr, *dmrp;
	caddr_t endaddr = startaddr + len;
	caddr_t cb_sa[MAX_CB_ADDR];
	caddr_t cb_ea[MAX_CB_ADDR];

	if (sfmmup->sfmmu_free) {

	DEMAP_RANGE_INIT(sfmmup, dmrp);

	/*
	 * Loop through all the hash buckets of HME blocks looking for matches.
	 */
	for (i = 0; i <= UHMEHASH_SZ; i++) {
		hmebp = &uhme_hash[i];
		SFMMU_HASH_LOCK(hmebp);
		hmeblkp = hmebp->hmeblkp;

			nx_hblk = hmeblkp->hblk_next;

			/*
			 * skip if not this context, if a shadow block or
			 * if the mapping is not in the requested range
			 */
			if (hmeblkp->hblk_tag.htag_id != sfmmup ||
			    hmeblkp->hblk_shw_bit ||
			    (sa = (caddr_t)get_hblk_base(hmeblkp)) >= endaddr ||
			    (ea = get_hblk_endaddr(hmeblkp)) <= startaddr) {

			ASSERT(!hmeblkp->hblk_shared);
			/*
			 * unload if there are any current valid mappings
			 */
			if (hmeblkp->hblk_vcnt != 0 ||
			    hmeblkp->hblk_hmecnt != 0)
				(void) sfmmu_hblk_unload(sfmmup, hmeblkp,
				    sa, ea, dmrp, flags);

			/*
			 * on unmap we also release the HME block itself, once
			 * all mappings are gone.
			 */
			if ((flags & HAT_UNLOAD_UNMAP) != 0 &&
			    !hmeblkp->hblk_vcnt &&
			    !hmeblkp->hblk_hmecnt) {
				ASSERT(!hmeblkp->hblk_lckcnt);
				sfmmu_hblk_hash_rm(hmebp, hmeblkp, pr_hblk,
				    &list, 0);
			}

			if (callback == NULL)

			/*
			 * HME blocks may span more than one page, but we may be
			 * unmapping only one page, so check for a smaller range
			 * for the callback
			 */

			cb_sa[addr_cnt] = sa;
			cb_ea[addr_cnt] = ea;
			if (++addr_cnt == MAX_CB_ADDR) {

				DEMAP_RANGE_FLUSH(dmrp);
				cpuset = sfmmup->sfmmu_cpusran;

				for (a = 0; a < MAX_CB_ADDR; ++a) {
					callback->hcb_start_addr = cb_sa[a];
					callback->hcb_end_addr = cb_ea[a];
					callback->hcb_function(callback);
				}
			}

		SFMMU_HASH_UNLOCK(hmebp);
	}

	sfmmu_hblks_list_purge(&list, 0);

	DEMAP_RANGE_FLUSH(dmrp);
	cpuset = sfmmup->sfmmu_cpusran;

	for (a = 0; a < addr_cnt; ++a) {
		callback->hcb_start_addr = cb_sa[a];
		callback->hcb_end_addr = cb_ea[a];
		callback->hcb_function(callback);
	}

	/*
	 * Check TSB and TLB page sizes if the process isn't exiting.
	 */
	if (!sfmmup->sfmmu_free)
		sfmmu_check_page_sizes(sfmmup, 0);
}
/*
 * Unload all the mappings in the range [addr..addr+len). addr and len must
 * be MMU_PAGESIZE aligned.
 */
extern struct seg *segkmap;
#define	ISSEGKMAP(sfmmup, addr) (sfmmup == ksfmmup && \
	segkmap->s_base <= (addr) && (addr) < (segkmap->s_base + segkmap->s_size))

void
hat_unload_callback(struct hat *sfmmup, caddr_t addr, size_t len, uint_t flags,
    hat_callback_t *callback)
{
	struct hmehash_bucket *hmebp;
	int hmeshift, hashno, iskernel;
	struct hme_blk *hmeblkp, *pr_hblk, *list = NULL;
	caddr_t cb_start_addr[MAX_CB_ADDR];
	caddr_t cb_end_addr[MAX_CB_ADDR];
	int issegkmap = ISSEGKMAP(sfmmup, addr);
	demap_range_t dmr, *dmrp;

	if (sfmmup->sfmmu_xhat_provider) {
		XHAT_UNLOAD_CALLBACK(sfmmup, addr, len, flags, callback);
		return;
	}
	/*
	 * This must be a CPU HAT. If the address space has
	 * XHATs attached, unload the mappings for all of them,
	 */
	ASSERT(sfmmup->sfmmu_as != NULL);
	if (sfmmup->sfmmu_as->a_xhat != NULL)
		xhat_unload_callback_all(sfmmup->sfmmu_as, addr,
		    len, flags, callback);

	ASSERT((sfmmup == ksfmmup) || (flags & HAT_UNLOAD_OTHER) || \
	    AS_LOCK_HELD(sfmmup->sfmmu_as));

	ASSERT(sfmmup != NULL);
	ASSERT((len & MMU_PAGEOFFSET) == 0);
	ASSERT(!((uintptr_t)addr & MMU_PAGEOFFSET));

	/*
	 * Probing through a large VA range (say 63 bits) will be slow, even
	 * at 4 Meg steps between the probes. So, when the virtual address range
	 * is very large, search the HME entries for what to unload.
	 *
	 *	len >> TTE_PAGE_SHIFT(TTE4M) is the # of 4Meg probes we'd need
	 *
	 *	UHMEHASH_SZ is number of hash buckets to examine
	 */
	if (sfmmup != KHATID && (len >> TTE_PAGE_SHIFT(TTE4M)) > UHMEHASH_SZ) {
		hat_unload_large_virtual(sfmmup, addr, len, flags, callback);
		return;
	}

	CPUSET_ZERO(cpuset);

	/*
	 * If the process is exiting, we can save a lot of fuss since
	 * we'll flush the TLB when we free the ctx anyway.
	 */
	if (sfmmup->sfmmu_free) {

	DEMAP_RANGE_INIT(sfmmup, dmrp);

	endaddr = addr + len;
	hblktag.htag_id = sfmmup;
	hblktag.htag_rid = SFMMU_INVALID_SHMERID;

	/*
	 * It is likely for the vm to call unload over a wide range of
	 * addresses that are actually very sparsely populated by
	 * translations.  In order to speed this up the sfmmu hat supports
	 * the concept of shadow hmeblks. Dummy large page hmeblks that
	 * correspond to actual small translations are allocated at tteload
	 * time and are referred to as shadow hmeblks.  Now, during unload
	 * time, we first check if we have a shadow hmeblk for that
	 * translation.  The absence of one means the corresponding address
	 * range is empty and can be skipped.
	 *
	 * The kernel is an exception to above statement and that is why
	 * we don't use shadow hmeblks and hash starting from the smallest
	 * page size.
	 */
	if (sfmmup == KHATID) {

	if (mmu_page_sizes == max_mmu_page_sizes) {

	while (addr < endaddr) {
		hmeshift = HME_HASH_SHIFT(hashno);
		hblktag.htag_bspage = HME_HASH_BSPAGE(addr, hmeshift);
		hblktag.htag_rehash = hashno;
		hmebp = HME_HASH_FUNCTION(sfmmup, addr, hmeshift);

		SFMMU_HASH_LOCK(hmebp);

		HME_HASH_SEARCH_PREV(hmebp, hblktag, hmeblkp, pr_hblk, &list);
		if (hmeblkp == NULL) {
			/*
			 * didn't find an hmeblk. skip the appropriate
			 * address range.
			 */
			SFMMU_HASH_UNLOCK(hmebp);

			if (hashno < mmu_hashcnt) {

				addr = (caddr_t)roundup((uintptr_t)addr
				    + 1, MMU_PAGESIZE64K);

				addr = (caddr_t)roundup((uintptr_t)addr + 1,

			if ((uintptr_t)addr & MMU_PAGEOFFSET512K) {
				ASSERT(hashno == TTE64K);

			if ((uintptr_t)addr & MMU_PAGEOFFSET4M) {

			if (mmu_page_sizes == max_mmu_page_sizes) {
				if ((uintptr_t)addr & MMU_PAGEOFFSET32M) {

				if ((uintptr_t)addr & MMU_PAGEOFFSET256M) {

		ASSERT(!hmeblkp->hblk_shared);
		if (!hmeblkp->hblk_vcnt && !hmeblkp->hblk_hmecnt) {
			/*
			 * If the valid count is zero we can skip the range
			 * mapped by this hmeblk.
			 * We free hblks in the case of HAT_UNMAP.  HAT_UNMAP
			 * is used by segment drivers as a hint
			 * that the mapping resource won't be used any longer.
			 * The best example of this is during exit().
			 */
			addr = (caddr_t)roundup((uintptr_t)addr + 1,
			    get_hblk_span(hmeblkp));
			if ((flags & HAT_UNLOAD_UNMAP) ||
			    (iskernel && !issegkmap)) {
				sfmmu_hblk_hash_rm(hmebp, hmeblkp, pr_hblk,
			}
			SFMMU_HASH_UNLOCK(hmebp);

			if ((uintptr_t)addr & MMU_PAGEOFFSET512K) {
				ASSERT(hashno == TTE64K);

			if ((uintptr_t)addr & MMU_PAGEOFFSET4M) {

			if (mmu_page_sizes == max_mmu_page_sizes) {
				if ((uintptr_t)addr & MMU_PAGEOFFSET32M) {

				if ((uintptr_t)addr & MMU_PAGEOFFSET256M) {

		if (hmeblkp->hblk_shw_bit) {
			/*
			 * If we encounter a shadow hmeblk we know there are
			 * smaller sized hmeblks mapping the same address space.
			 * Decrement the hash size and rehash.
			 */
			ASSERT(sfmmup != KHATID);

			SFMMU_HASH_UNLOCK(hmebp);

		/*
		 * track callback address ranges.
		 * only start a new range when it's not contiguous
		 */
		if (callback != NULL) {
			if (addr_count > 0 &&
			    addr == cb_end_addr[addr_count - 1])

				cb_start_addr[addr_count] = addr;
		}

		addr = sfmmu_hblk_unload(sfmmup, hmeblkp, addr, endaddr,

		if (callback != NULL)
			cb_end_addr[addr_count++] = addr;

		if (((flags & HAT_UNLOAD_UNMAP) || (iskernel && !issegkmap)) &&
		    !hmeblkp->hblk_vcnt && !hmeblkp->hblk_hmecnt) {
			sfmmu_hblk_hash_rm(hmebp, hmeblkp, pr_hblk, &list, 0);
		}
		SFMMU_HASH_UNLOCK(hmebp);

		/*
		 * Notify our caller as to exactly which pages
		 * have been unloaded. We do these in clumps,
		 * to minimize the number of xt_sync()s that need to occur.
		 */
		if (callback != NULL && addr_count == MAX_CB_ADDR) {

			DEMAP_RANGE_FLUSH(dmrp);
			cpuset = sfmmup->sfmmu_cpusran;

			for (a = 0; a < MAX_CB_ADDR; ++a) {
				callback->hcb_start_addr = cb_start_addr[a];
				callback->hcb_end_addr = cb_end_addr[a];
				callback->hcb_function(callback);
			}
		}

		if ((uintptr_t)addr & MMU_PAGEOFFSET512K) {
			ASSERT(hashno == TTE64K);

		if ((uintptr_t)addr & MMU_PAGEOFFSET4M) {

		if (mmu_page_sizes == max_mmu_page_sizes) {
			if ((uintptr_t)addr & MMU_PAGEOFFSET32M) {

			if ((uintptr_t)addr & MMU_PAGEOFFSET256M) {
	}

	sfmmu_hblks_list_purge(&list, 0);

	DEMAP_RANGE_FLUSH(dmrp);
	cpuset = sfmmup->sfmmu_cpusran;

	if (callback && addr_count != 0) {
		for (a = 0; a < addr_count; ++a) {
			callback->hcb_start_addr = cb_start_addr[a];
			callback->hcb_end_addr = cb_end_addr[a];
			callback->hcb_function(callback);
		}
	}

	/*
	 * Check TSB and TLB page sizes if the process isn't exiting.
	 */
	if (!sfmmup->sfmmu_free)
		sfmmu_check_page_sizes(sfmmup, 0);
}
/*
 * Unload all the mappings in the range [addr..addr+len). addr and len must
 * be MMU_PAGESIZE aligned.
 */
void
hat_unload(struct hat *sfmmup, caddr_t addr, size_t len, uint_t flags)
{
	if (sfmmup->sfmmu_xhat_provider) {
		XHAT_UNLOAD(sfmmup, addr, len, flags);
		return;
	}
	hat_unload_callback(sfmmup, addr, len, flags, NULL);
}

/*
 * Find the largest mapping size for this page.
 */
int
fnd_mapping_sz(page_t *pp)
{
	p_index = PP_MAPINDEX(pp);

	p_index >>= 1;	/* don't care about 8K bit */
	for (; p_index; p_index >>= 1) {
/*
 * This function unloads a range of addresses for an hmeblk.
 * It returns the next address to be unloaded.
 * It should be called with the hash lock held.
 */
static caddr_t
sfmmu_hblk_unload(struct hat *sfmmup, struct hme_blk *hmeblkp, caddr_t addr,
    caddr_t endaddr, demap_range_t *dmrp, uint_t flags)
{
	struct sf_hment *sfhmep;
	int use_demap_range;

	ASSERT(in_hblk_range(hmeblkp, addr));
	ASSERT(!hmeblkp->hblk_shw_bit);
	ASSERT(sfmmup != NULL || hmeblkp->hblk_shared);
	ASSERT(sfmmup == NULL || !hmeblkp->hblk_shared);
	ASSERT(dmrp == NULL || !hmeblkp->hblk_shared);

	if (get_hblk_ttesz(hmeblkp) != TTE8K &&
	    (endaddr < get_hblk_endaddr(hmeblkp))) {
		panic("sfmmu_hblk_unload: partial unload of large page");
	}

	endaddr = MIN(endaddr, get_hblk_endaddr(hmeblkp));
	ttesz = get_hblk_ttesz(hmeblkp);

	use_demap_range = ((dmrp == NULL) ||
	    (TTEBYTES(ttesz) == DEMAP_RANGE_PGSZ(dmrp)));

	if (use_demap_range) {
		DEMAP_RANGE_CONTINUE(dmrp, addr, endaddr);
	} else if (dmrp != NULL) {
		DEMAP_RANGE_FLUSH(dmrp);
	}

	HBLKTOHME(sfhmep, hmeblkp, addr);

	while (addr < endaddr) {
		sfmmu_copytte(&sfhmep->hme_tte, &tte);
		if (TTE_IS_VALID(&tte)) {
			pp = sfhmep->hme_page;

			pml = sfmmu_mlist_enter(pp);

			/*
			 * Verify if hme still points to 'pp' now that
			 * we have p_mapping lock.
			 */
			if (sfhmep->hme_page != pp) {
				if (pp != NULL && sfhmep->hme_page != NULL) {
					ASSERT(pml != NULL);
					sfmmu_mlist_exit(pml);
					/* Re-start this iteration. */
					continue;
				}
				ASSERT((pp != NULL) &&
				    (sfhmep->hme_page == NULL));
			}

			/*
			 * This point on we have both HASH and p_mapping
			 * lock.
			 */
			ASSERT(pp == sfhmep->hme_page);
			ASSERT(pp == NULL || sfmmu_mlist_held(pp));

			/*
			 * We need to loop on modify tte because it is
			 * possible for pagesync to come along and
			 * change the software bits beneath us.
			 *
			 * Page_unload can also invalidate the tte after
			 * we read tte outside of p_mapping lock.
			 */
			TTE_SET_INVALID(&ttemod);
			ret = sfmmu_modifytte_try(&tte, &ttemod,
			    &sfhmep->hme_tte);

				if (TTE_IS_VALID(&tte)) {

					panic("sfmmu_hblk_unload: pp = 0x%p "
					    "tte became invalid under mlist"
					    " lock = 0x%p", (void *)pp,

			if (!(flags & HAT_UNLOAD_NOSYNC)) {
				sfmmu_ttesync(sfmmup, addr, &tte, pp);
			}

			/*
			 * Ok- we invalidated the tte. Do the rest of the job.
			 */

			if (flags & HAT_UNLOAD_UNLOCK) {
				ASSERT(hmeblkp->hblk_lckcnt > 0);
				atomic_dec_32(&hmeblkp->hblk_lckcnt);
				HBLK_STACK_TRACE(hmeblkp, HBLK_UNLOCK);
			}

			/*
			 * Normally we would need to flush the page
			 * from the virtual cache at this point in
			 * order to prevent a potential cache alias
			 * inconsistency.
			 * The particular scenario we need to worry
			 * about is:
			 * Given:  va1 and va2 are two virtual address
			 * that alias and map the same physical
			 * address.
			 * 1. mapping exists from va1 to pa and data
			 * has been read into the cache.
			 * 3. load va2 and modify data using va2.
			 * 5. load va1 and reference data. Unless we
			 * flush the data cache when we unload we will
			 * get stale data.
			 * Fortunately, page coloring eliminates the
			 * above scenario by remembering the color a
			 * physical page was last or is currently
			 * mapped to. Now, we delay the flush until
			 * the loading of translations. Only when the
			 * new translation is of a different color
			 * are we forced to flush.
			 */
			if (use_demap_range) {
				/*
				 * Mark this page as needing a demap.
				 */
				DEMAP_RANGE_MARKPG(dmrp, addr);
			} else {
				ASSERT(sfmmup != NULL);
				ASSERT(!hmeblkp->hblk_shared);
				sfmmu_tlb_demap(addr, sfmmup, hmeblkp,
				    sfmmup->sfmmu_free, 0);
			}

			/*
			 * Remove the hment from the mapping list
			 */
			ASSERT(hmeblkp->hblk_hmecnt > 0);

			/*
			 * ASSERT(hmeblkp->hblk_hmecnt <= NHMENTS);
			 */
			HME_SUB(sfhmep, pp);

			atomic_dec_16(&hmeblkp->hblk_hmecnt);

			ASSERT(hmeblkp->hblk_vcnt > 0);
			atomic_dec_16(&hmeblkp->hblk_vcnt);

			ASSERT(hmeblkp->hblk_hmecnt || hmeblkp->hblk_vcnt ||
			    !hmeblkp->hblk_lckcnt);

			if (pp && (pp->p_nrm & (P_KPMC | P_KPMS | P_TNC))) {
					/*
					 * If page was temporary
					 * uncached, try to recache
					 * it. Note that HME_SUB() was
					 * called above so p_index and
					 * mlist had been updated.
					 */
					conv_tnc(pp, ttesz);
				} else if (pp->p_mapping == NULL) {
					/*
					 * Page is marked to be in VAC conflict
					 * to an existing kpm mapping and/or is
					 * kpm mapped using only the regular
					 * pagesize.
					 */
					sfmmu_kpm_hme_unload(pp);
				}
			}
		} else if ((pp = sfhmep->hme_page) != NULL) {
			/*
			 * TTE is invalid but the hme
			 * still exists. let pageunload
			 * complete its job.
			 */
			ASSERT(pml == NULL);
			pml = sfmmu_mlist_enter(pp);
			if (sfhmep->hme_page != NULL) {
				sfmmu_mlist_exit(pml);
				continue;
			}
			ASSERT(sfhmep->hme_page == NULL);
		} else if (hmeblkp->hblk_hmecnt != 0) {
			/*
			 * pageunload may have not finished decrementing
			 * hblk_vcnt and hblk_hmecnt. Find page_t if any and
			 * wait for pageunload to finish. Rely on pageunload
			 * to decrement hblk_hmecnt after hblk_vcnt.
			 */
			pfn_t pfn = TTE_TO_TTEPFN(&tte);
			ASSERT(pml == NULL);
			if (pf_is_memory(pfn)) {
				pp = page_numtopp_nolock(pfn);

				pml = sfmmu_mlist_enter(pp);
				sfmmu_mlist_exit(pml);
			}
		}

		/*
		 * At this point, the tte we are looking at
		 * should be unloaded, and hme has been unlinked
		 * from page too. This is important because in
		 * pageunload, it does ttesync() then HME_SUB.
		 * We need to make sure HME_SUB has been completed
		 * so we know ttesync() has been completed. Otherwise,
		 * at exit time, after return from hat layer, VM will
		 * release as structure which hat_setstat() (called
		 * by ttesync()) needs.
		 */
		ASSERT(sfhmep->hme_page == NULL);

		sfmmu_copytte(&sfhmep->hme_tte, &dtte);
		ASSERT(!TTE_IS_VALID(&dtte));

		sfmmu_mlist_exit(pml);

		addr += TTEBYTES(ttesz);

		DEMAP_RANGE_NEXTPG(dmrp);
	}

	/*
	 * For shared hmeblks this routine is only called when region is freed
	 * and no longer referenced.  So no need to decrement ttecnt
	 * in the region structure here.
	 */
	if (ttecnt > 0 && sfmmup != NULL) {
		atomic_add_long(&sfmmup->sfmmu_ttecnt[ttesz], -ttecnt);
	}

	return (addr);
}
/*
 * Invalidate a virtual address range for the local CPU.
 * For best performance ensure that the va range is completely
 * mapped, otherwise the entire TLB will be flushed.
 */
void
hat_flush_range(struct hat *sfmmup, caddr_t va, size_t size)
{
	caddr_t endva = va + size;

	while (va < endva) {
		sz = hat_getpagesize(sfmmup, va);

		vtag_flushpage(va, (uint64_t)sfmmup);
		va += sz;
	}
}
/*
 * Synchronize all the mappings in the range [addr..addr+len).
 * Can be called with clearflag having two states:
 * HAT_SYNC_DONTZERO means just return the rm stats
 * HAT_SYNC_ZERORM means zero rm bits in the tte and return the stats
 */
void
hat_sync(struct hat *sfmmup, caddr_t addr, size_t len, uint_t clearflag)
{
	struct hmehash_bucket *hmebp;
	int hmeshift, hashno = 1;
	struct hme_blk *hmeblkp, *list = NULL;

	ASSERT(sfmmup->sfmmu_xhat_provider == NULL);
	ASSERT((sfmmup == ksfmmup) || AS_LOCK_HELD(sfmmup->sfmmu_as));
	ASSERT((len & MMU_PAGEOFFSET) == 0);
	ASSERT((clearflag == HAT_SYNC_DONTZERO) ||
	    (clearflag == HAT_SYNC_ZERORM));

	CPUSET_ZERO(cpuset);

	endaddr = addr + len;
	hblktag.htag_id = sfmmup;
	hblktag.htag_rid = SFMMU_INVALID_SHMERID;

	/*
	 * Spitfire supports 4 page sizes.
	 * Most pages are expected to be of the smallest page
	 * size (8K) and these will not need to be rehashed. 64K
	 * pages also don't need to be rehashed because an hmeblk
	 * spans 64K of address space. 512K pages might need 1 rehash
	 * and 4M pages 2 rehashes.
	 */
	while (addr < endaddr) {
		hmeshift = HME_HASH_SHIFT(hashno);
		hblktag.htag_bspage = HME_HASH_BSPAGE(addr, hmeshift);
		hblktag.htag_rehash = hashno;
		hmebp = HME_HASH_FUNCTION(sfmmup, addr, hmeshift);

		SFMMU_HASH_LOCK(hmebp);

		HME_HASH_SEARCH(hmebp, hblktag, hmeblkp, &list);
		if (hmeblkp != NULL) {
			ASSERT(!hmeblkp->hblk_shared);
			/*
			 * We've encountered a shadow hmeblk so skip the range
			 * of the next smaller mapping size.
			 */
			if (hmeblkp->hblk_shw_bit) {
				ASSERT(sfmmup != ksfmmup);
				addr = (caddr_t)P2END((uintptr_t)addr,
				    TTEBYTES(hashno - 1));
			} else {
				addr = sfmmu_hblk_sync(sfmmup, hmeblkp,
				    addr, endaddr, clearflag);
			}
			SFMMU_HASH_UNLOCK(hmebp);
			continue;
		}
		SFMMU_HASH_UNLOCK(hmebp);

		if (!HME_REHASH(sfmmup) || (hashno >= mmu_hashcnt)) {
			/*
			 * We have traversed the whole list and rehashed
			 * if necessary without finding the address to sync.
			 * This is ok so we increment the address by the
			 * smallest hmeblk range for kernel mappings and the
			 * largest hmeblk range, to account for shadow hmeblks,
			 * for user mappings and continue.
			 */
			if (sfmmup == ksfmmup)
				addr = (caddr_t)P2END((uintptr_t)addr,
				    TTEBYTES(1));
			else
				addr = (caddr_t)P2END((uintptr_t)addr,
				    TTEBYTES(hashno));
		}
	}
	sfmmu_hblks_list_purge(&list, 0);
	cpuset = sfmmup->sfmmu_cpusran;
static caddr_t
sfmmu_hblk_sync(struct hat *sfmmup, struct hme_blk *hmeblkp, caddr_t addr,
    caddr_t endaddr, int clearflag)
{
	struct sf_hment *sfhmep;

	ASSERT(hmeblkp->hblk_shw_bit == 0);
	ASSERT(!hmeblkp->hblk_shared);

	endaddr = MIN(endaddr, get_hblk_endaddr(hmeblkp));

	ttesz = get_hblk_ttesz(hmeblkp);
	HBLKTOHME(sfhmep, hmeblkp, addr);

	while (addr < endaddr) {
		sfmmu_copytte(&sfhmep->hme_tte, &tte);
		if (TTE_IS_VALID(&tte)) {
			pp = sfhmep->hme_page;

			pml = sfmmu_mlist_enter(pp);

			if (pp != sfhmep->hme_page) {
				/*
				 * tte must have been unloaded
				 * underneath us. Recheck
				 */
				sfmmu_mlist_exit(pml);
				continue;
			}

			ASSERT(pp == NULL || sfmmu_mlist_held(pp));

			if (clearflag == HAT_SYNC_ZERORM) {
				TTE_CLR_RM(&ttemod);
				ret = sfmmu_modifytte_try(&tte, &ttemod,
				    &sfhmep->hme_tte);
				if (ret < 0) {
					sfmmu_mlist_exit(pml);
					continue;
				}

				if (ret > 0) {
					sfmmu_tlb_demap(addr, sfmmup,
					    hmeblkp, 0, 0);
				}
			}
			sfmmu_ttesync(sfmmup, addr, &tte, pp);

			sfmmu_mlist_exit(pml);
		}
		addr += TTEBYTES(ttesz);
	}

	return (addr);
}
/*
 * This function will sync a tte to the page struct and it will
 * update the hat stats. Currently it allows us to pass a NULL pp
 * and we will simply update the stats. We may want to change this
 * so we only keep stats for pages backed by pp's.
 */
static void
sfmmu_ttesync(struct hat *sfmmup, caddr_t addr, tte_t *ttep, page_t *pp)
{
	ASSERT(TTE_IS_VALID(ttep));

	if (TTE_IS_NOSYNC(ttep)) {
		return;
	}

	if (TTE_IS_REF(ttep)) {

	if (TTE_IS_MOD(ttep)) {

	if (sfmmup != NULL && sfmmup->sfmmu_rmstat) {
		int i;
		caddr_t vaddr = addr;

		for (i = 0; i < TTEPAGES(sz); i++, vaddr += MMU_PAGESIZE) {
			hat_setstat(sfmmup->sfmmu_as, vaddr, MMU_PAGESIZE, rm);
		}
	}

	/*
	 * XXX I want to use cas to update nrm bits but they
	 * currently belong in common/vm and not in hat where
	 * they should be.
	 * The nrm bits are protected by the same mutex as
	 * the one that protects the page's mapping list.
	 */
	ASSERT(sfmmu_mlist_held(pp));
	/*
	 * If the tte is for a large page, we need to sync all the
	 * pages covered by the tte.
	 */
		ASSERT(pp->p_szc != 0);
		pp = PP_GROUPLEADER(pp, sz);
		ASSERT(sfmmu_mlist_held(pp));

	/* Get number of pages from tte size. */
	npgs = TTEPAGES(sz);

	do {
		ASSERT(sfmmu_mlist_held(pp));
		if (((rm & P_REF) != 0 && !PP_ISREF(pp)) ||
		    ((rm & P_MOD) != 0 && !PP_ISMOD(pp)))
			hat_page_setattr(pp, rm);

		/*
		 * Are we done? If not, we must have a large mapping.
		 * For large mappings we need to sync the rest of the pages
		 * covered by this tte; goto the next page.
		 */
	} while (--npgs > 0 && (pp = PP_PAGENEXT(pp)));
}
/*
 * Execute pre-callback handler of each pa_hment linked to pp
 *
 * flag: either HAT_PRESUSPEND or HAT_SUSPEND.
 * capture_cpus: pointer to return value (below)
 *
 * Returns:
 *   Propagates the subsystem callback return values back to the caller;
 *   returns 0 on success.  If capture_cpus is non-NULL, the value returned
 *   is zero if all of the pa_hments are of a type that do not require
 *   capturing CPUs prior to suspending the mapping, else it is 1.
 */
static int
hat_pageprocess_precallbacks(struct page *pp, uint_t flag, int *capture_cpus)
{
	struct sf_hment *sfhmep;
	struct pa_hment *pahmep;
	int (*f)(caddr_t, uint_t, uint_t, void *);

	ASSERT(PAGE_EXCL(pp));
	if (!sfmmu_mlist_held(pp)) {
		pml = sfmmu_mlist_enter(pp);
	}

	for (sfhmep = pp->p_mapping; sfhmep; sfhmep = sfhmep->hme_next) {
		/*
		 * skip sf_hments corresponding to VA<->PA mappings;
		 * for pa_hment's, hme_tte.ll is zero
		 */
		if (!IS_PAHME(sfhmep))
			continue;

		pahmep = sfhmep->hme_data;
		ASSERT(pahmep != NULL);

		/*
		 * skip if pre-handler has been called earlier in this loop
		 */
		if (pahmep->flags & flag)
			continue;

		ASSERT(id >= (id_t)0 && id < sfmmu_cb_nextid);
		if (capture_cpus && sfmmu_cb_table[id].capture_cpus != 0)
			*capture_cpus = 1;

		if ((f = sfmmu_cb_table[id].prehandler) == NULL) {
			pahmep->flags |= flag;
			continue;
		}

		/*
		 * Drop the mapping list lock to avoid locking order issues.
		 */
		sfmmu_mlist_exit(pml);

		ret = f(pahmep->addr, pahmep->len, flag, pahmep->pvt);
		if (ret != 0)
			return (ret);	/* caller must do the cleanup */

		pml = sfmmu_mlist_enter(pp);
		pahmep->flags |= flag;
	}

	pahmep->flags |= flag;

	sfmmu_mlist_exit(pml);

	return (0);
}
/*
 * Execute post-callback handler of each pa_hment linked to pp
 *
 * Same overall assumptions and restrictions apply as for
 * hat_pageprocess_precallbacks().
 */
static void
hat_pageprocess_postcallbacks(struct page *pp, uint_t flag)
{
	pfn_t pgpfn = pp->p_pagenum;
	pfn_t pgmask = btop(page_get_pagesize(pp->p_szc)) - 1;
	pfn_t newpfn;
	kmutex_t *pml;
	struct sf_hment *sfhmep;
	struct pa_hment *pahmep;
	int (*f)(caddr_t, uint_t, uint_t, void *, pfn_t);
	id_t id;
	int locked = 0;

	ASSERT(PAGE_EXCL(pp));
	if (!sfmmu_mlist_held(pp)) {
		pml = sfmmu_mlist_enter(pp);
		locked = 1;
	}

	for (sfhmep = pp->p_mapping; sfhmep; sfhmep = sfhmep->hme_next) {
		/*
		 * skip sf_hments corresponding to VA<->PA mappings;
		 * for pa_hment's, hme_tte.ll is zero
		 */
		if (!IS_PAHME(sfhmep))
			continue;

		pahmep = sfhmep->hme_data;
		ASSERT(pahmep != NULL);

		if ((pahmep->flags & flag) == 0)
			continue;

		pahmep->flags &= ~flag;

		id = pahmep->cb_id;
		ASSERT(id >= (id_t)0 && id < sfmmu_cb_nextid);
		if ((f = sfmmu_cb_table[id].posthandler) == NULL)
			continue;

		/*
		 * Convert the base page PFN into the constituent PFN
		 * which is needed by the callback handler.
		 */
		newpfn = pgpfn | (btop((uintptr_t)pahmep->addr) & pgmask);
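		/*
		 * Worked example (illustrative only, assuming the 8K base
		 * page size used on sun4u): for a 4M root page,
		 * page_get_pagesize() is 4M, so pgmask is 511. If the
		 * pa_hment covers an address 24K into the large page,
		 * btop(addr) carries 3 in its low nine bits, so newpfn is
		 * the root PFN with those bits set to 3, i.e. the PFN of
		 * the fourth constituent page.
		 */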
		/*
		 * Drop the mapping list lock to avoid locking order issues.
		 */
		if (locked)
			sfmmu_mlist_exit(pml);

		if (f(pahmep->addr, pahmep->len, flag, pahmep->pvt, newpfn)
		    != 0)
			panic("sfmmu: posthandler failed");

		if (locked)
			pml = sfmmu_mlist_enter(pp);
	}

	if (locked)
		sfmmu_mlist_exit(pml);
}
6709 * Suspend locked kernel mapping
6712 hat_pagesuspend(struct page
*pp
)
6714 struct sf_hment
*sfhmep
;
6717 struct hme_blk
*hmeblkp
;
6722 ASSERT(PAGE_EXCL(pp
));
6723 ASSERT(sfmmu_mlist_held(pp
));
6725 mutex_enter(&kpr_suspendlock
);
6728 * We're about to suspend a kernel mapping so mark this thread as
6729 * non-traceable by DTrace. This prevents us from running into issues
6730 * with probe context trying to touch a suspended page
6731 * in the relocation codepath itself.
6733 curthread
->t_flag
|= T_DONTDTRACE
;
6735 index
= PP_MAPINDEX(pp
);
6739 for (sfhmep
= pp
->p_mapping
; sfhmep
; sfhmep
= sfhmep
->hme_next
) {
6741 if (IS_PAHME(sfhmep
))
6744 if (get_hblk_ttesz(sfmmu_hmetohblk(sfhmep
)) != cons
)
6748 * Loop until we successfully set the suspend bit in
6752 sfmmu_copytte(&sfhmep
->hme_tte
, &tte
);
6753 ASSERT(TTE_IS_VALID(&tte
));
6756 TTE_SET_SUSPEND(&ttemod
);
6757 if (sfmmu_modifytte_try(&tte
, &ttemod
,
6758 &sfhmep
->hme_tte
) < 0)
6762 * Invalidate TSB entry
6764 hmeblkp
= sfmmu_hmetohblk(sfhmep
);
6766 sfmmup
= hblktosfmmu(hmeblkp
);
6767 ASSERT(sfmmup
== ksfmmup
);
6768 ASSERT(!hmeblkp
->hblk_shared
);
6770 addr
= tte_to_vaddr(hmeblkp
, tte
);
6773 * No need to make sure that the TSB for this sfmmu is
6774 * not being relocated since it is ksfmmup and thus it
6775 * will never be relocated.
6777 SFMMU_UNLOAD_TSB(addr
, sfmmup
, hmeblkp
, 0);
6780 * Update xcall stats
6782 cpuset
= cpu_ready_set
;
6783 CPUSET_DEL(cpuset
, CPU
->cpu_id
);
6785 /* LINTED: constant in conditional context */
6786 SFMMU_XCALL_STATS(ksfmmup
);
6789 * Flush TLB entry on remote CPU's
6791 xt_some(cpuset
, vtag_flushpage_tl1
, (uint64_t)addr
,
6796 * Flush TLB entry on local CPU
6798 vtag_flushpage(addr
, (uint64_t)ksfmmup
);
6801 while (index
!= 0) {
6806 pp
= PP_GROUPLEADER(pp
, cons
);
static struct prle page_relocate_log[N_PRLE];
static int prl_entry;
static kmutex_t prl_mutex;

#define	PAGE_RELOCATE_LOG(t, r, s, p)					\
	mutex_enter(&prl_mutex);					\
	page_relocate_log[prl_entry].targ = *(t);			\
	page_relocate_log[prl_entry].repl = *(r);			\
	page_relocate_log[prl_entry].status = (s);			\
	page_relocate_log[prl_entry].pausecpus = (p);			\
	page_relocate_log[prl_entry].whence = gethrtime();		\
	prl_entry = (prl_entry == (N_PRLE - 1))? 0 : prl_entry + 1;	\
	mutex_exit(&prl_mutex);
#else
#define	PAGE_RELOCATE_LOG(t, r, s, p)
#endif
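/*
 * Illustrative note (not part of the original interface): the DEBUG-only
 * log above is a fixed-size ring. prl_entry always names the slot that
 * will be overwritten next, so once the ring has wrapped, the oldest
 * record is at prl_entry and the newest is the slot just before it, e.g.:
 *
 *	int newest = (prl_entry == 0) ? N_PRLE - 1 : prl_entry - 1;
 *
 * A consumer (a debugger walker, for instance) would read entries in
 * that order.
 */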
6842 * Core Kernel Page Relocation Algorithm
6846 * target : constituent pages are SE_EXCL locked.
6847 * replacement: constituent pages are SE_EXCL locked.
6851 * nrelocp: number of pages relocated
6854 hat_page_relocate(page_t
**target
, page_t
**replacement
, spgcnt_t
*nrelocp
)
6856 page_t
*targ
, *repl
;
6858 kmutex_t
*low
, *high
;
6869 if (!kcage_on
|| PP_ISNORELOC(*target
)) {
6870 PAGE_RELOCATE_LOG(target
, replacement
, EAGAIN
, -1);
6874 mutex_enter(&kpr_mutex
);
6875 kreloc_thread
= curthread
;
6878 repl
= *replacement
;
6879 ASSERT(repl
!= NULL
);
6880 ASSERT(targ
->p_szc
== repl
->p_szc
);
6882 npages
= page_get_pagecnt(targ
->p_szc
);
6885 * unload VA<->PA mappings that are not locked
6888 for (i
= 0; i
< npages
; i
++) {
6889 (void) hat_pageunload(tpp
, SFMMU_KERNEL_RELOC
);
6894 * Do "presuspend" callbacks, in a context from which we can still
6895 * block as needed. Note that we don't hold the mapping list lock
6896 * of "targ" at this point due to potential locking order issues;
6897 * we assume that between the hat_pageunload() above and holding
6898 * the SE_EXCL lock that the mapping list *cannot* change at this
6901 ret
= hat_pageprocess_precallbacks(targ
, HAT_PRESUSPEND
, &cap_cpus
);
6904 * EIO translates to fatal error, for all others cleanup
6905 * and return EAGAIN.
6908 hat_pageprocess_postcallbacks(targ
, HAT_POSTUNSUSPEND
);
6909 PAGE_RELOCATE_LOG(target
, replacement
, ret
, -1);
6910 kreloc_thread
= NULL
;
6911 mutex_exit(&kpr_mutex
);
6916 * acquire p_mapping list lock for both the target and replacement
6919 * low and high refer to the need to grab the mlist locks in a
6920 * specific order in order to prevent race conditions. Thus the
6921 * lower lock must be grabbed before the higher lock.
6923 * This will block hat_unload's accessing p_mapping list. Since
6924 * we have SE_EXCL lock, hat_memload and hat_pageunload will be
6925 * blocked. Thus, no one else will be accessing the p_mapping list
6926 * while we suspend and reload the locked mapping below.
6930 sfmmu_mlist_reloc_enter(tpp
, rpp
, &low
, &high
);
6935 * We raise our PIL to 13 so that we don't get captured by
6936 * another CPU or pinned by an interrupt thread. We can't go to
6937 * PIL 14 since the nexus driver(s) may need to interrupt at
6938 * that level in the case of IOMMU pseudo mappings.
6940 cpuset
= cpu_ready_set
;
6941 CPUSET_DEL(cpuset
, CPU
->cpu_id
);
6942 if (!cap_cpus
|| CPUSET_ISNULL(cpuset
)) {
6943 old_pil
= splr(XCALL_PIL
);
6946 xc_attention(cpuset
);
6948 ASSERT(getpil() == XCALL_PIL
);
6951 * Now do suspend callbacks. In the case of an IOMMU mapping
6952 * this will suspend all DMA activity to the page while it is
6953 * being relocated. Since we are well above LOCK_LEVEL and CPUs
6954 * may be captured at this point we should have acquired any needed
6955 * locks in the presuspend callback.
6957 ret
= hat_pageprocess_precallbacks(targ
, HAT_SUSPEND
, NULL
);
6964 * Raise the PIL yet again, this time to block all high-level
6965 * interrupts on this CPU. This is necessary to prevent an
6966 * interrupt routine from pinning the thread which holds the
6967 * mapping suspended and then touching the suspended page.
6969 * Once the page is suspended we also need to be careful to
6970 * avoid calling any functions which touch any seg_kmem memory
6971 * since that memory may be backed by the very page we are
6972 * relocating in here!
6974 hat_pagesuspend(targ
);
6977 * Now that we are confident everybody has stopped using this page,
6978 * copy the page contents. Note we use a physical copy to prevent
6979 * locking issues and to avoid fpRAS because we can't handle it in
6982 for (i
= 0; i
< npages
; i
++, tpp
++, rpp
++) {
6985 * If the replacement has a different vcolor than
6986 * the one being replacd, we need to handle VAC
6987 * consistency for it just as we were setting up
6988 * a new mapping to it.
6990 if ((PP_GET_VCOLOR(rpp
) != NO_VCOLOR
) &&
6991 (tpp
->p_vcolor
!= rpp
->p_vcolor
) &&
6992 !CacheColor_IsFlushed(cflags
, PP_GET_VCOLOR(rpp
))) {
6993 CacheColor_SetFlushed(cflags
, PP_GET_VCOLOR(rpp
));
6994 sfmmu_cache_flushcolor(PP_GET_VCOLOR(rpp
),
6999 * Copy the contents of the page.
7001 ppcopy_kernel(tpp
, rpp
);
7006 for (i
= 0; i
< npages
; i
++, tpp
++, rpp
++) {
7008 * Copy attributes. VAC consistency was handled above,
7011 rpp
->p_nrm
= tpp
->p_nrm
;
7013 rpp
->p_index
= tpp
->p_index
;
7016 rpp
->p_vcolor
= tpp
->p_vcolor
;
7021 * First, unsuspend the page, if we set the suspend bit, and transfer
7022 * the mapping list from the target page to the replacement page.
7023 * Next process postcallbacks; since pa_hment's are linked only to the
7024 * p_mapping list of root page, we don't iterate over the constituent
7027 hat_pagereload(targ
, repl
);
7030 hat_pageprocess_postcallbacks(repl
, HAT_UNSUSPEND
);
7033 * Now lower our PIL and release any captured CPUs since we
7034 * are out of the "danger zone". After this it will again be
7035 * safe to acquire adaptive mutex locks, or to drop them...
7037 if (old_pil
!= -1) {
7040 xc_dismissed(cpuset
);
7045 sfmmu_mlist_reloc_exit(low
, high
);
7048 * Postsuspend callbacks should drop any locks held across
7049 * the suspend callbacks. As before, we don't hold the mapping
7050 * list lock at this point.. our assumption is that the mapping
7051 * list still can't change due to our holding SE_EXCL lock and
7052 * there being no unlocked mappings left. Hence the restriction
7053 * on calling context to hat_delete_callback()
7055 hat_pageprocess_postcallbacks(repl
, HAT_POSTUNSUSPEND
);
7058 * The second presuspend call failed: we got here through
7059 * the suspend_fail label above.
7062 PAGE_RELOCATE_LOG(target
, replacement
, ret
, cap_cpus
);
7063 kreloc_thread
= NULL
;
7064 mutex_exit(&kpr_mutex
);
7069 * Now that we're out of the performance critical section we can
7070 * take care of updating the hash table, since we still
7071 * hold all the pages locked SE_EXCL at this point we
7072 * needn't worry about things changing out from under us.
7076 for (i
= 0; i
< npages
; i
++, tpp
++, rpp
++) {
7079 * replace targ with replacement in page_hash table
7082 page_relocate_hash(rpp
, targ
);
7085 * concatenate target; caller of platform_page_relocate()
7086 * expects target to be concatenated after returning.
7088 ASSERT(targ
->p_next
== targ
);
7089 ASSERT(targ
->p_prev
== targ
);
7090 page_list_concat(&pl
, &targ
);
7093 ASSERT(*target
== pl
);
7095 PAGE_RELOCATE_LOG(target
, replacement
, 0, cap_cpus
);
7096 kreloc_thread
= NULL
;
7097 mutex_exit(&kpr_mutex
);
/*
 * Called when stray pa_hments are found attached to a page which is
 * being freed. Notify the subsystem which attached the pa_hment of
 * the error if it registered a suitable handler, else panic.
 */
static void
sfmmu_pahment_leaked(struct pa_hment *pahmep)
{
	id_t cb_id = pahmep->cb_id;

	ASSERT(cb_id >= (id_t)0 && cb_id < sfmmu_cb_nextid);
	if (sfmmu_cb_table[cb_id].errhandler != NULL) {
		if (sfmmu_cb_table[cb_id].errhandler(pahmep->addr, pahmep->len,
		    HAT_CB_ERR_LEAKED, pahmep->pvt) == 0)
			return;		/* non-fatal */
	}
	panic("pa_hment leaked: 0x%p", (void *)pahmep);
}
7121 * Remove all mappings to page 'pp'.
7124 hat_pageunload(struct page
*pp
, uint_t forceflag
)
7126 struct page
*origpp
= pp
;
7127 struct sf_hment
*sfhme
, *tmphme
;
7128 struct hme_blk
*hmeblkp
;
7133 cpuset_t cpuset
, tset
;
7138 ASSERT(PAGE_EXCL(pp
));
7144 CPUSET_ZERO(cpuset
);
7146 pml
= sfmmu_mlist_enter(pp
);
7150 sfmmu_kpm_pageunload(pp
);
7151 ASSERT(!PP_ISMAPPED_KPM(pp
));
7154 * Clear vpm reference. Since the page is exclusively locked
7155 * vpm cannot be referencing it.
7161 index
= PP_MAPINDEX(pp
);
7164 for (sfhme
= pp
->p_mapping
; sfhme
; sfhme
= tmphme
) {
7165 tmphme
= sfhme
->hme_next
;
7167 if (IS_PAHME(sfhme
)) {
7168 ASSERT(sfhme
->hme_data
!= NULL
);
7173 hmeblkp
= sfmmu_hmetohblk(sfhme
);
7174 if (hmeblkp
->hblk_xhat_bit
) {
7175 struct xhat_hme_blk
*xblk
=
7176 (struct xhat_hme_blk
*)hmeblkp
;
7178 (void) XHAT_PAGEUNLOAD(xblk
->xhat_hme_blk_hat
,
7179 pp
, forceflag
, XBLK2PROVBLK(xblk
));
7186 * If there are kernel mappings don't unload them, they will
7189 if (forceflag
== SFMMU_KERNEL_RELOC
&& hmeblkp
->hblk_lckcnt
&&
7190 hmeblkp
->hblk_tag
.htag_id
== ksfmmup
)
7193 tset
= sfmmu_pageunload(pp
, sfhme
, cons
);
7194 CPUSET_OR(cpuset
, tset
);
7197 while (index
!= 0) {
7202 /* Go to leading page */
7203 pp
= PP_GROUPLEADER(pp
, cons
);
7204 ASSERT(sfmmu_mlist_held(pp
));
7210 * cpuset may be empty if the page was only mapped by segkpm,
7211 * in which case we won't actually cross-trap.
7216 * The page should have no mappings at this point, unless
7217 * we were called from hat_page_relocate() in which case we
7218 * leave the locked mappings which will be suspended later.
7220 ASSERT(!PP_ISMAPPED(origpp
) || xhme_blks
|| pa_hments
||
7221 (forceflag
== SFMMU_KERNEL_RELOC
));
7225 if (cons
== TTE8K
) {
7226 pmtx
= sfmmu_page_enter(pp
);
7228 sfmmu_page_exit(pmtx
);
7235 if (pa_hments
&& forceflag
!= SFMMU_KERNEL_RELOC
) {
7237 * Unlink any pa_hments and free them, calling back
7238 * the responsible subsystem to notify it of the error.
7239 * This can occur in situations such as drivers leaking
7240 * DMA handles: naughty, but common enough that we'd like
7241 * to keep the system running rather than bringing it
7242 * down with an obscure error like "pa_hment leaked"
7243 * which doesn't aid the user in debugging their driver.
7245 for (sfhme
= pp
->p_mapping
; sfhme
; sfhme
= tmphme
) {
7246 tmphme
= sfhme
->hme_next
;
7247 if (IS_PAHME(sfhme
)) {
7248 struct pa_hment
*pahmep
= sfhme
->hme_data
;
7249 sfmmu_pahment_leaked(pahmep
);
7251 kmem_cache_free(pa_hment_cache
, pahmep
);
7255 ASSERT(!PP_ISMAPPED(origpp
) || xhme_blks
);
7258 sfmmu_mlist_exit(pml
);
7261 * XHAT may not have finished unloading pages
7262 * because some other thread was waiting for
7263 * mlist lock and XHAT_PAGEUNLOAD let it do
7275 sfmmu_pageunload(page_t
*pp
, struct sf_hment
*sfhme
, int cons
)
7277 struct hme_blk
*hmeblkp
;
7289 ASSERT(sfmmu_mlist_held(pp
));
7290 ASSERT(!PP_ISKAS(pp
));
7292 CPUSET_ZERO(cpuset
);
7294 hmeblkp
= sfmmu_hmetohblk(sfhme
);
7297 sfmmu_copytte(&sfhme
->hme_tte
, &tte
);
7298 if (TTE_IS_VALID(&tte
)) {
7299 sfmmup
= hblktosfmmu(hmeblkp
);
7300 ttesz
= get_hblk_ttesz(hmeblkp
);
7302 * Only unload mappings of 'cons' size.
7308 * Note that we have p_mapping lock, but no hash lock here.
7309 * hblk_unload() has to have both hash lock AND p_mapping
7310 * lock before it tries to modify tte. So, the tte could
7311 * not become invalid in the sfmmu_modifytte_try() below.
7318 TTE_SET_INVALID(&ttemod
);
7319 ret
= sfmmu_modifytte_try(&tte
, &ttemod
, &sfhme
->hme_tte
);
7322 /* only R/M bits can change. */
7323 chk_tte(&orig_old
, &tte
, &ttemod
, hmeblkp
);
7329 panic("pageunload: cas failed?");
7332 addr
= tte_to_vaddr(hmeblkp
, tte
);
7334 if (hmeblkp
->hblk_shared
) {
7335 sf_srd_t
*srdp
= (sf_srd_t
*)sfmmup
;
7336 uint_t rid
= hmeblkp
->hblk_tag
.htag_rid
;
7338 ASSERT(SFMMU_IS_SHMERID_VALID(rid
));
7339 ASSERT(rid
< SFMMU_MAX_HME_REGIONS
);
7340 ASSERT(srdp
!= NULL
);
7341 rgnp
= srdp
->srd_hmergnp
[rid
];
7342 SFMMU_VALIDATE_SHAREDHBLK(hmeblkp
, srdp
, rgnp
, rid
);
7343 cpuset
= sfmmu_rgntlb_demap(addr
, rgnp
, hmeblkp
, 1);
7344 sfmmu_ttesync(NULL
, addr
, &tte
, pp
);
7345 ASSERT(rgnp
->rgn_ttecnt
[ttesz
] > 0);
7346 atomic_dec_ulong(&rgnp
->rgn_ttecnt
[ttesz
]);
7348 sfmmu_ttesync(sfmmup
, addr
, &tte
, pp
);
7349 atomic_dec_ulong(&sfmmup
->sfmmu_ttecnt
[ttesz
]);
7352 * We need to flush the page from the virtual cache
7353 * in order to prevent a virtual cache alias
7354 * inconsistency. The particular scenario we need
7355 * to worry about is:
7356 * Given: va1 and va2 are two virtual address that
7357 * alias and will map the same physical address.
7358 * 1. mapping exists from va1 to pa and data has
7359 * been read into the cache.
7361 * 3. load va2 and modify data using va2.
7363 * 5. load va1 and reference data. Unless we flush
7364 * the data cache when we unload we will get
7366 * This scenario is taken care of by using virtual
7369 if (sfmmup
->sfmmu_ismhat
) {
7371 * Flush TSBs, TLBs and caches
7373 * sharing this ism segment.
7375 sfmmu_hat_lock_all();
7376 mutex_enter(&ism_mlist_lock
);
7378 sfmmu_ismtlbcache_demap(addr
, sfmmup
, hmeblkp
,
7379 pp
->p_pagenum
, CACHE_NO_FLUSH
);
7381 mutex_exit(&ism_mlist_lock
);
7382 sfmmu_hat_unlock_all();
7383 cpuset
= cpu_ready_set
;
7385 sfmmu_tlb_demap(addr
, sfmmup
, hmeblkp
, 0, 0);
7386 cpuset
= sfmmup
->sfmmu_cpusran
;
7391 * Hme_sub has to run after ttesync() and a_rss update.
7392 * See hblk_unload().
7398 * We can not make ASSERT(hmeblkp->hblk_hmecnt <= NHMENTS)
7399 * since pteload may have done a HME_ADD() right after
7400 * we did the HME_SUB() above. Hmecnt is now maintained
7401 * by cas only. no lock guranteed its value. The only
7402 * gurantee we have is the hmecnt should not be less than
7403 * what it should be so the hblk will not be taken away.
7404 * It's also important that we decremented the hmecnt after
7405 * we are done with hmeblkp so that this hmeblk won't be
7408 ASSERT(hmeblkp
->hblk_hmecnt
> 0);
7409 ASSERT(hmeblkp
->hblk_vcnt
> 0);
7410 atomic_dec_16(&hmeblkp
->hblk_vcnt
);
7411 atomic_dec_16(&hmeblkp
->hblk_hmecnt
);
7413 * This is bug 4063182.
7415 * ASSERT(hmeblkp->hblk_hmecnt || hmeblkp->hblk_vcnt ||
7416 * !hmeblkp->hblk_lckcnt);
7419 panic("invalid tte? pp %p &tte %p",
7420 (void *)pp
, (void *)&tte
);
7427 * While relocating a kernel page, this function will move the mappings
7428 * from tpp to dpp and modify any associated data with these mappings.
7429 * It also unsuspends the suspended kernel mapping.
7432 hat_pagereload(struct page
*tpp
, struct page
*dpp
)
7434 struct sf_hment
*sfhme
;
7438 ASSERT(getpil() == PIL_MAX
);
7439 ASSERT(sfmmu_mlist_held(tpp
));
7440 ASSERT(sfmmu_mlist_held(dpp
));
7442 index
= PP_MAPINDEX(tpp
);
7445 /* Update real mappings to the page */
7447 for (sfhme
= tpp
->p_mapping
; sfhme
!= NULL
; sfhme
= sfhme
->hme_next
) {
7448 if (IS_PAHME(sfhme
))
7450 sfmmu_copytte(&sfhme
->hme_tte
, &tte
);
7454 * replace old pfn with new pfn in TTE
7456 PFN_TO_TTE(ttemod
, dpp
->p_pagenum
);
7461 ASSERT(TTE_IS_SUSPEND(&ttemod
));
7462 TTE_CLR_SUSPEND(&ttemod
);
7464 if (sfmmu_modifytte_try(&tte
, &ttemod
, &sfhme
->hme_tte
) < 0)
7465 panic("hat_pagereload(): sfmmu_modifytte_try() failed");
7468 * set hme_page point to new page
7470 sfhme
->hme_page
= dpp
;
7474 * move p_mapping list from old page to new page
7476 dpp
->p_mapping
= tpp
->p_mapping
;
7477 tpp
->p_mapping
= NULL
;
7478 dpp
->p_share
= tpp
->p_share
;
7481 while (index
!= 0) {
7486 tpp
= PP_GROUPLEADER(tpp
, cons
);
7487 dpp
= PP_GROUPLEADER(dpp
, cons
);
7492 curthread
->t_flag
&= ~T_DONTDTRACE
;
7493 mutex_exit(&kpr_suspendlock
);
7497 hat_pagesync(struct page
*pp
, uint_t clearflag
)
7499 struct sf_hment
*sfhme
, *tmphme
= NULL
;
7500 struct hme_blk
*hmeblkp
;
7502 cpuset_t cpuset
, tset
;
7504 extern ulong_t po_share
;
7505 page_t
*save_pp
= pp
;
7509 CPUSET_ZERO(cpuset
);
7511 if (PP_ISRO(pp
) && (clearflag
& HAT_SYNC_STOPON_MOD
)) {
7512 return (PP_GENERIC_ATTR(pp
));
7515 if ((clearflag
& HAT_SYNC_ZERORM
) == 0) {
7516 if ((clearflag
& HAT_SYNC_STOPON_REF
) && PP_ISREF(pp
)) {
7517 return (PP_GENERIC_ATTR(pp
));
7519 if ((clearflag
& HAT_SYNC_STOPON_MOD
) && PP_ISMOD(pp
)) {
7520 return (PP_GENERIC_ATTR(pp
));
7522 if (clearflag
& HAT_SYNC_STOPON_SHARED
) {
7523 if (pp
->p_share
> po_share
) {
7524 hat_page_setattr(pp
, P_REF
);
7525 return (PP_GENERIC_ATTR(pp
));
7532 clearflag
&= ~HAT_SYNC_STOPON_SHARED
;
7533 pml
= sfmmu_mlist_enter(pp
);
7534 index
= PP_MAPINDEX(pp
);
7537 for (sfhme
= pp
->p_mapping
; sfhme
; sfhme
= tmphme
) {
7539 * We need to save the next hment on the list since
7540 * it is possible for pagesync to remove an invalid hment
7543 tmphme
= sfhme
->hme_next
;
7544 if (IS_PAHME(sfhme
))
7547 * If we are looking for large mappings and this hme doesn't
7548 * reach the range we are seeking, just ignore it.
7550 hmeblkp
= sfmmu_hmetohblk(sfhme
);
7551 if (hmeblkp
->hblk_xhat_bit
)
7554 if (hme_size(sfhme
) < cons
)
7558 if (hmeblkp
->hblk_shared
) {
7559 sf_srd_t
*srdp
= hblktosrd(hmeblkp
);
7560 uint_t rid
= hmeblkp
->hblk_tag
.htag_rid
;
7562 ASSERT(SFMMU_IS_SHMERID_VALID(rid
));
7563 ASSERT(rid
< SFMMU_MAX_HME_REGIONS
);
7564 ASSERT(srdp
!= NULL
);
7565 rgnp
= srdp
->srd_hmergnp
[rid
];
7566 SFMMU_VALIDATE_SHAREDHBLK(hmeblkp
, srdp
,
7568 shcnt
+= rgnp
->rgn_refcnt
;
7572 if (shcnt
> po_share
) {
7574 * tell the pager to spare the page this time
7577 hat_page_setattr(save_pp
, P_REF
);
7582 tset
= sfmmu_pagesync(pp
, sfhme
,
7583 clearflag
& ~HAT_SYNC_STOPON_RM
);
7584 CPUSET_OR(cpuset
, tset
);
7587 * If clearflag is HAT_SYNC_DONTZERO, break out as soon
7588 * as the "ref" or "mod" is set or share cnt exceeds po_share.
7590 if ((clearflag
& ~HAT_SYNC_STOPON_RM
) == HAT_SYNC_DONTZERO
&&
7591 (((clearflag
& HAT_SYNC_STOPON_MOD
) && PP_ISMOD(save_pp
)) ||
7592 ((clearflag
& HAT_SYNC_STOPON_REF
) && PP_ISREF(save_pp
)))) {
7602 /* Go to leading page */
7603 pp
= PP_GROUPLEADER(pp
, cons
);
7609 sfmmu_mlist_exit(pml
);
7610 return (PP_GENERIC_ATTR(save_pp
));
7614 * Get all the hardware dependent attributes for a page struct
7617 sfmmu_pagesync(struct page
*pp
, struct sf_hment
*sfhme
,
7622 struct hme_blk
*hmeblkp
;
7628 ASSERT(sfmmu_mlist_held(pp
));
7629 ASSERT((clearflag
== HAT_SYNC_DONTZERO
) ||
7630 (clearflag
== HAT_SYNC_ZERORM
));
7632 SFMMU_STAT(sf_pagesync
);
7634 CPUSET_ZERO(cpuset
);
7636 sfmmu_pagesync_retry
:
7638 sfmmu_copytte(&sfhme
->hme_tte
, &tte
);
7639 if (TTE_IS_VALID(&tte
)) {
7640 hmeblkp
= sfmmu_hmetohblk(sfhme
);
7641 sfmmup
= hblktosfmmu(hmeblkp
);
7642 addr
= tte_to_vaddr(hmeblkp
, tte
);
7643 if (clearflag
== HAT_SYNC_ZERORM
) {
7645 TTE_CLR_RM(&ttemod
);
7646 ret
= sfmmu_modifytte_try(&tte
, &ttemod
,
7650 * cas failed and the new value is not what
7653 goto sfmmu_pagesync_retry
;
7657 /* we win the cas */
7658 if (hmeblkp
->hblk_shared
) {
7659 sf_srd_t
*srdp
= (sf_srd_t
*)sfmmup
;
7661 hmeblkp
->hblk_tag
.htag_rid
;
7663 ASSERT(SFMMU_IS_SHMERID_VALID(rid
));
7664 ASSERT(rid
< SFMMU_MAX_HME_REGIONS
);
7665 ASSERT(srdp
!= NULL
);
7666 rgnp
= srdp
->srd_hmergnp
[rid
];
7667 SFMMU_VALIDATE_SHAREDHBLK(hmeblkp
,
7669 cpuset
= sfmmu_rgntlb_demap(addr
,
7672 sfmmu_tlb_demap(addr
, sfmmup
, hmeblkp
,
7674 cpuset
= sfmmup
->sfmmu_cpusran
;
7678 sfmmu_ttesync(hmeblkp
->hblk_shared
? NULL
: sfmmup
, addr
,
7685 * Remove write permission from a mappings to a page, so that
7686 * we can detect the next modification of it. This requires modifying
7687 * the TTE then invalidating (demap) any TLB entry using that TTE.
7688 * This code is similar to sfmmu_pagesync().
7691 sfmmu_pageclrwrt(struct page
*pp
, struct sf_hment
*sfhme
)
7696 struct hme_blk
*hmeblkp
;
7702 ASSERT(sfmmu_mlist_held(pp
));
7704 CPUSET_ZERO(cpuset
);
7705 SFMMU_STAT(sf_clrwrt
);
7709 sfmmu_copytte(&sfhme
->hme_tte
, &tte
);
7710 if (TTE_IS_VALID(&tte
) && TTE_IS_WRITABLE(&tte
)) {
7711 hmeblkp
= sfmmu_hmetohblk(sfhme
);
7714 * xhat mappings should never be to a VMODSORT page.
7716 ASSERT(hmeblkp
->hblk_xhat_bit
== 0);
7718 sfmmup
= hblktosfmmu(hmeblkp
);
7719 addr
= tte_to_vaddr(hmeblkp
, tte
);
7722 TTE_CLR_WRT(&ttemod
);
7723 TTE_CLR_MOD(&ttemod
);
7724 ret
= sfmmu_modifytte_try(&tte
, &ttemod
, &sfhme
->hme_tte
);
7727 * if cas failed and the new value is not what
7733 /* we win the cas */
7735 if (hmeblkp
->hblk_shared
) {
7736 sf_srd_t
*srdp
= (sf_srd_t
*)sfmmup
;
7737 uint_t rid
= hmeblkp
->hblk_tag
.htag_rid
;
7739 ASSERT(SFMMU_IS_SHMERID_VALID(rid
));
7740 ASSERT(rid
< SFMMU_MAX_HME_REGIONS
);
7741 ASSERT(srdp
!= NULL
);
7742 rgnp
= srdp
->srd_hmergnp
[rid
];
7743 SFMMU_VALIDATE_SHAREDHBLK(hmeblkp
,
7745 cpuset
= sfmmu_rgntlb_demap(addr
,
7748 sfmmu_tlb_demap(addr
, sfmmup
, hmeblkp
, 0, 0);
7749 cpuset
= sfmmup
->sfmmu_cpusran
;
7758 * Walk all mappings of a page, removing write permission and clearing the
7759 * ref/mod bits. This code is similar to hat_pagesync()
7762 hat_page_clrwrt(page_t
*pp
)
7764 struct sf_hment
*sfhme
;
7765 struct sf_hment
*tmphme
= NULL
;
7772 CPUSET_ZERO(cpuset
);
7774 pml
= sfmmu_mlist_enter(pp
);
7775 index
= PP_MAPINDEX(pp
);
7778 for (sfhme
= pp
->p_mapping
; sfhme
; sfhme
= tmphme
) {
7779 tmphme
= sfhme
->hme_next
;
7782 * If we are looking for large mappings and this hme doesn't
7783 * reach the range we are seeking, just ignore its.
7786 if (hme_size(sfhme
) < cons
)
7789 tset
= sfmmu_pageclrwrt(pp
, sfhme
);
7790 CPUSET_OR(cpuset
, tset
);
7797 /* Go to leading page */
7798 pp
= PP_GROUPLEADER(pp
, cons
);
7804 sfmmu_mlist_exit(pml
);
/*
 * Set the given REF/MOD/RO bits for the given page.
 * For a vnode with a sorted v_pages list, we need to change
 * the attributes and the v_pages list together under page_vnode_mutex.
 */
void
hat_page_setattr(page_t *pp, uint_t flag)
{
	vnode_t *vp = pp->p_vnode;
	page_t **listp;
	kmutex_t *pmtx;
	kmutex_t *vphm = NULL;
	int noshuffle;

	noshuffle = flag & P_NSH;
	flag &= ~P_NSH;

	ASSERT(!(flag & ~(P_MOD | P_REF | P_RO)));

	/*
	 * nothing to do if attribute already set
	 */
	if ((pp->p_nrm & flag) == flag)
		return;

	if ((flag & P_MOD) != 0 && vp != NULL && IS_VMODSORT(vp) &&
	    !noshuffle) {
		vphm = page_vnode_mutex(vp);
		mutex_enter(vphm);
	}

	pmtx = sfmmu_page_enter(pp);
	pp->p_nrm |= flag;
	sfmmu_page_exit(pmtx);

	if (vphm != NULL) {
		/*
		 * Some File Systems examine v_pages for NULL w/o
		 * grabbing the vphm mutex. Must not let it become NULL when
		 * pp is the only page on the list.
		 */
		if (pp->p_vpnext != pp) {
			page_vpsub(&vp->v_pages, pp);
			if (vp->v_pages != NULL)
				listp = &vp->v_pages->p_vpprev->p_vpnext;
			else
				listp = &vp->v_pages;
			page_vpadd(listp, pp);
		}
		mutex_exit(vphm);
	}
}
void
hat_page_clrattr(page_t *pp, uint_t flag)
{
	vnode_t *vp = pp->p_vnode;
	kmutex_t *pmtx;

	ASSERT(!(flag & ~(P_MOD | P_REF | P_RO)));

	pmtx = sfmmu_page_enter(pp);

	/*
	 * Caller is expected to hold page's io lock for VMODSORT to work
	 * correctly with pvn_vplist_dirty() and pvn_getdirty() when mod
	 * bit is cleared.
	 * We don't have assert to avoid tripping some existing third party
	 * code. The dirty page is moved back to top of the v_pages list
	 * after IO is done in pvn_write_done().
	 */
	pp->p_nrm &= ~flag;
	sfmmu_page_exit(pmtx);

	if ((flag & P_MOD) != 0 && vp != NULL && IS_VMODSORT(vp)) {
		/*
		 * VMODSORT works by removing write permissions and getting
		 * a fault when a page is made dirty. At this point
		 * we need to remove write permission from all mappings
		 * to this page.
		 */
		hat_page_clrwrt(pp);
	}
}

uint_t
hat_page_getattr(page_t *pp, uint_t flag)
{
	ASSERT(!(flag & ~(P_MOD | P_REF | P_RO)));
	return ((uint_t)(pp->p_nrm & flag));
}
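/*
 * Usage sketch (illustrative, not part of this file): a file system that
 * wants to know whether a page has been dirtied through any mapping could
 * combine hat_pagesync() with the attribute accessors above, e.g.:
 *
 *	(void) hat_pagesync(pp, HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_MOD);
 *	if (hat_page_getattr(pp, P_MOD))
 *		schedule the page for writeback;
 *
 * hat_page_clrattr(pp, P_MOD) would then be used once the write completes.
 */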
/*
 * DEBUG kernels: verify that a kernel va<->pa translation
 * is safe by checking the underlying page_t is in a page
 * relocation-safe state.
 */
void
sfmmu_check_kpfn(pfn_t pfn)
{
	page_t *pp;
	int index, cons;

	if (hat_check_vtop == 0)
		return;

	if (kvseg.s_base == NULL || panicstr)
		return;

	pp = page_numtopp_nolock(pfn);
	if (!pp)
		return;

	if (PAGE_LOCKED(pp) || PP_ISNORELOC(pp))
		return;

	/*
	 * Handed a large kernel page, we dig up the root page since we
	 * know the root page might have the lock also.
	 */
	if (pp->p_szc != 0) {
		index = PP_MAPINDEX(pp);
		cons = TTE8K;
		while (index != 0) {
			index >>= 1;
			cons++;
		}
		pp = PP_GROUPLEADER(pp, cons);
		if (PAGE_LOCKED(pp) || PP_ISNORELOC(pp))
			return;
	}

	/*
	 * Pages need to be locked or allocated "permanent" (either from
	 * static_arena arena or explicitly setting PG_NORELOC when calling
	 * page_create_va()) for VA->PA translations to be valid.
	 */
	if (!PP_ISNORELOC(pp))
		panic("Illegal VA->PA translation, pp 0x%p not permanent",
		    (void *)pp);
	else
		panic("Illegal VA->PA translation, pp 0x%p not locked",
		    (void *)pp);
}
/*
 * Returns a page frame number for a given virtual address.
 * Returns PFN_INVALID to indicate an invalid mapping
 */
pfn_t
hat_getpfnum(struct hat *hat, caddr_t addr)
{
	pfn_t pfn;
	tte_t tte;

	/*
	 * We would like to
	 * ASSERT(AS_LOCK_HELD(as));
	 * but we can't because the iommu driver will call this
	 * routine at interrupt time and it can't grab the as lock
	 * or it will deadlock: A thread could have the as lock
	 * and be waiting for io. The io can't complete
	 * because the interrupt thread is blocked trying to grab
	 * the as lock.
	 */

	ASSERT(hat->sfmmu_xhat_provider == NULL);

	if (hat == ksfmmup) {
		if (IS_KMEM_VA_LARGEPAGE(addr)) {
			ASSERT(segkmem_lpszc > 0);
			pfn = sfmmu_kvaszc2pfn(addr, segkmem_lpszc);
			if (pfn != PFN_INVALID) {
				sfmmu_check_kpfn(pfn);
				return (pfn);
			}
		} else if (segkpm && IS_KPM_ADDR(addr)) {
			return (sfmmu_kpm_vatopfn(addr));
		}
		while ((pfn = sfmmu_vatopfn(addr, ksfmmup, &tte))
		    == PFN_SUSPENDED) {
			sfmmu_vatopfn_suspended(addr, ksfmmup, &tte);
		}
		sfmmu_check_kpfn(pfn);
		return (pfn);
	} else {
		return (sfmmu_uvatopfn(addr, hat, NULL));
	}
}
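/*
 * Usage sketch (illustrative only): a driver that holds a kernel virtual
 * address and needs the backing page frame would typically call
 *
 *	pfn_t pfn = hat_getpfnum(kas.a_hat, (caddr_t)kaddr);
 *	if (pfn == PFN_INVALID)
 *		return (DDI_FAILURE);
 *
 * kas.a_hat is the kernel hat (ksfmmup above); for a user address the
 * caller passes the hat of the owning address space instead.
 */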
8007 * This routine will return both pfn and tte for the vaddr.
8010 sfmmu_uvatopfn(caddr_t vaddr
, struct hat
*sfmmup
, tte_t
*ttep
)
8012 struct hmehash_bucket
*hmebp
;
8014 int hmeshift
, hashno
= 1;
8015 struct hme_blk
*hmeblkp
= NULL
;
8018 struct sf_hment
*sfhmep
;
8021 /* support for ISM */
8023 ism_blk_t
*ism_blkp
;
8025 sfmmu_t
*ism_hatid
= NULL
;
8026 sfmmu_t
*locked_hatid
= NULL
;
8027 sfmmu_t
*sv_sfmmup
= sfmmup
;
8028 caddr_t sv_vaddr
= vaddr
;
8037 ASSERT(sfmmup
!= ksfmmup
);
8038 SFMMU_STAT(sf_user_vtop
);
8040 * Set ism_hatid if vaddr falls in a ISM segment.
8042 ism_blkp
= sfmmup
->sfmmu_iblk
;
8043 if (ism_blkp
!= NULL
) {
8044 sfmmu_ismhat_enter(sfmmup
, 0);
8045 locked_hatid
= sfmmup
;
8047 while (ism_blkp
!= NULL
&& ism_hatid
== NULL
) {
8048 ism_map
= ism_blkp
->iblk_maps
;
8049 for (i
= 0; ism_map
[i
].imap_ismhat
&& i
< ISM_MAP_SLOTS
; i
++) {
8050 if (vaddr
>= ism_start(ism_map
[i
]) &&
8051 vaddr
< ism_end(ism_map
[i
])) {
8052 sfmmup
= ism_hatid
= ism_map
[i
].imap_ismhat
;
8053 vaddr
= (caddr_t
)(vaddr
-
8054 ism_start(ism_map
[i
]));
8058 ism_blkp
= ism_blkp
->iblk_next
;
8061 sfmmu_ismhat_exit(locked_hatid
, 0);
8064 hblktag
.htag_id
= sfmmup
;
8065 hblktag
.htag_rid
= SFMMU_INVALID_SHMERID
;
8067 hmeshift
= HME_HASH_SHIFT(hashno
);
8068 hblktag
.htag_bspage
= HME_HASH_BSPAGE(vaddr
, hmeshift
);
8069 hblktag
.htag_rehash
= hashno
;
8070 hmebp
= HME_HASH_FUNCTION(sfmmup
, vaddr
, hmeshift
);
8072 SFMMU_HASH_LOCK(hmebp
);
8074 HME_HASH_FAST_SEARCH(hmebp
, hblktag
, hmeblkp
);
8075 if (hmeblkp
!= NULL
) {
8076 ASSERT(!hmeblkp
->hblk_shared
);
8077 HBLKTOHME(sfhmep
, hmeblkp
, vaddr
);
8078 sfmmu_copytte(&sfhmep
->hme_tte
, ttep
);
8079 SFMMU_HASH_UNLOCK(hmebp
);
8080 if (TTE_IS_VALID(ttep
)) {
8081 pfn
= TTE_TO_PFN(vaddr
, ttep
);
8086 SFMMU_HASH_UNLOCK(hmebp
);
8088 } while (HME_REHASH(sfmmup
) && (hashno
<= mmu_hashcnt
));
8090 if (SF_HMERGNMAP_ISNULL(sv_sfmmup
)) {
8091 return (PFN_INVALID
);
8093 srdp
= sv_sfmmup
->sfmmu_srdp
;
8094 ASSERT(srdp
!= NULL
);
8095 ASSERT(srdp
->srd_refcnt
!= 0);
8096 hblktag
.htag_id
= srdp
;
8099 hmeshift
= HME_HASH_SHIFT(hashno
);
8100 hblktag
.htag_bspage
= HME_HASH_BSPAGE(sv_vaddr
, hmeshift
);
8101 hblktag
.htag_rehash
= hashno
;
8102 hmebp
= HME_HASH_FUNCTION(srdp
, sv_vaddr
, hmeshift
);
8104 SFMMU_HASH_LOCK(hmebp
);
8105 for (hmeblkp
= hmebp
->hmeblkp
; hmeblkp
!= NULL
;
8106 hmeblkp
= hmeblkp
->hblk_next
) {
8112 if (!HTAGS_EQ_SHME(hmeblkp
->hblk_tag
, hblktag
,
8113 sv_sfmmup
->sfmmu_hmeregion_map
)) {
8116 ASSERT(hmeblkp
->hblk_shared
);
8117 rid
= hmeblkp
->hblk_tag
.htag_rid
;
8118 ASSERT(SFMMU_IS_SHMERID_VALID(rid
));
8119 ASSERT(rid
< SFMMU_MAX_HME_REGIONS
);
8120 rgnp
= srdp
->srd_hmergnp
[rid
];
8121 SFMMU_VALIDATE_SHAREDHBLK(hmeblkp
, srdp
, rgnp
, rid
);
8122 HBLKTOHME(sfhmep
, hmeblkp
, sv_vaddr
);
8123 sfmmu_copytte(&sfhmep
->hme_tte
, ttep
);
8124 rsaddr
= rgnp
->rgn_saddr
;
8125 readdr
= rsaddr
+ rgnp
->rgn_size
;
8127 if (TTE_IS_VALID(ttep
) ||
8128 get_hblk_ttesz(hmeblkp
) > TTE8K
) {
8129 caddr_t eva
= tte_to_evaddr(hmeblkp
, ttep
);
8130 ASSERT(eva
> sv_vaddr
);
8131 ASSERT(sv_vaddr
>= rsaddr
);
8132 ASSERT(sv_vaddr
< readdr
);
8133 ASSERT(eva
<= readdr
);
8137 * Continue the search if we
8138 * found an invalid 8K tte outside of the area
8139 * covered by this hmeblk's region.
8141 if (TTE_IS_VALID(ttep
)) {
8142 SFMMU_HASH_UNLOCK(hmebp
);
8143 pfn
= TTE_TO_PFN(sv_vaddr
, ttep
);
8145 } else if (get_hblk_ttesz(hmeblkp
) > TTE8K
||
8146 (sv_vaddr
>= rsaddr
&& sv_vaddr
< readdr
)) {
8147 SFMMU_HASH_UNLOCK(hmebp
);
8152 SFMMU_HASH_UNLOCK(hmebp
);
8154 } while (hashno
<= mmu_hashcnt
);
8155 return (PFN_INVALID
);
/*
 * For compatibility with AT&T and later optimizations
 */
void
hat_map(struct hat *hat, caddr_t addr, size_t len, uint_t flags)
{
	ASSERT(hat != NULL);
	ASSERT(hat->sfmmu_xhat_provider == NULL);
}
8171 * Return the number of mappings to a particular page. This number is an
8172 * approximation of the number of people sharing the page.
8174 * shared hmeblks or ism hmeblks are counted as 1 mapping here.
8175 * hat_page_checkshare() can be used to compare threshold to share
8176 * count that reflects the number of region sharers albeit at higher cost.
8179 hat_page_getshare(page_t
*pp
)
8181 page_t
*spp
= pp
; /* start page */
8184 int index
, sz
= TTE64K
;
8187 * We need to grab the mlist lock to make sure any outstanding
8188 * load/unloads complete. Otherwise we could return zero
8189 * even though the unload(s) hasn't finished yet.
8191 pml
= sfmmu_mlist_enter(spp
);
8196 cnt
+= spp
->p_kpmref
;
8198 if (vpm_enable
&& pp
->p_vpmref
) {
8203 * If we have any large mappings, we count the number of
8204 * mappings that this large page is part of.
8206 index
= PP_MAPINDEX(spp
);
8209 pp
= PP_GROUPLEADER(spp
, sz
);
8210 if ((index
& 0x1) && pp
!= spp
) {
8217 sfmmu_mlist_exit(pml
);
8222 * Return 1 if the number of mappings exceeds sh_thresh. Return 0
8223 * otherwise. Count shared hmeblks by region's refcnt.
8226 hat_page_checkshare(page_t
*pp
, ulong_t sh_thresh
)
8230 int index
, sz
= TTE8K
;
8231 struct sf_hment
*sfhme
, *tmphme
= NULL
;
8232 struct hme_blk
*hmeblkp
;
8234 pml
= sfmmu_mlist_enter(pp
);
8241 if (vpm_enable
&& pp
->p_vpmref
) {
8245 if (pp
->p_share
+ cnt
> sh_thresh
) {
8246 sfmmu_mlist_exit(pml
);
8250 index
= PP_MAPINDEX(pp
);
8253 for (sfhme
= pp
->p_mapping
; sfhme
; sfhme
= tmphme
) {
8254 tmphme
= sfhme
->hme_next
;
8255 if (IS_PAHME(sfhme
)) {
8259 hmeblkp
= sfmmu_hmetohblk(sfhme
);
8260 if (hmeblkp
->hblk_xhat_bit
) {
8262 if (cnt
> sh_thresh
) {
8263 sfmmu_mlist_exit(pml
);
8268 if (hme_size(sfhme
) != sz
) {
8272 if (hmeblkp
->hblk_shared
) {
8273 sf_srd_t
*srdp
= hblktosrd(hmeblkp
);
8274 uint_t rid
= hmeblkp
->hblk_tag
.htag_rid
;
8276 ASSERT(SFMMU_IS_SHMERID_VALID(rid
));
8277 ASSERT(rid
< SFMMU_MAX_HME_REGIONS
);
8278 ASSERT(srdp
!= NULL
);
8279 rgnp
= srdp
->srd_hmergnp
[rid
];
8280 SFMMU_VALIDATE_SHAREDHBLK(hmeblkp
, srdp
,
8282 cnt
+= rgnp
->rgn_refcnt
;
8286 if (cnt
> sh_thresh
) {
8287 sfmmu_mlist_exit(pml
);
8295 pp
= PP_GROUPLEADER(pp
, sz
);
8296 ASSERT(sfmmu_mlist_held(pp
));
8303 sfmmu_mlist_exit(pml
);
8308 * Unload all large mappings to the pp and reset the p_szc field of every
8309 * constituent page according to the remaining mappings.
8311 * pp must be locked SE_EXCL. Even though no other constituent pages are
8312 * locked it's legal to unload the large mappings to the pp because all
8313 * constituent pages of large locked mappings have to be locked SE_SHARED.
8314 * This means if we have SE_EXCL lock on one of constituent pages none of the
8315 * large mappings to pp are locked.
8317 * Decrease p_szc field starting from the last constituent page and ending
8318 * with the root page. This method is used because other threads rely on the
8319 * root's p_szc to find the lock to syncronize on. After a root page_t's p_szc
8320 * is demoted then other threads will succeed in sfmmu_mlspl_enter(). This
8321 * ensures that p_szc changes of the constituent pages appears atomic for all
8322 * threads that use sfmmu_mlspl_enter() to examine p_szc field.
8324 * This mechanism is only used for file system pages where it's not always
8325 * possible to get SE_EXCL locks on all constituent pages to demote the size
8326 * code (as is done for anonymous or kernel large pages).
8328 * See more comments in front of sfmmu_mlspl_enter().
8331 hat_page_demote(page_t
*pp
)
8338 struct sf_hment
*sfhme
;
8339 struct sf_hment
*tmphme
= NULL
;
8340 struct hme_blk
*hmeblkp
;
8346 kmutex_t
*pmtx
= NULL
;
8348 ASSERT(PAGE_EXCL(pp
));
8349 ASSERT(!PP_ISFREE(pp
));
8350 ASSERT(!PP_ISKAS(pp
));
8351 ASSERT(page_szc_lock_assert(pp
));
8352 pml
= sfmmu_mlist_enter(pp
);
8359 index
= PP_MAPINDEX(pp
) >> 1;
8362 CPUSET_ZERO(cpuset
);
8368 if (!(index
& 0x1)) {
8374 rootpp
= PP_GROUPLEADER(pp
, sz
);
8375 for (sfhme
= rootpp
->p_mapping
; sfhme
; sfhme
= tmphme
) {
8376 tmphme
= sfhme
->hme_next
;
8377 ASSERT(!IS_PAHME(sfhme
));
8378 hmeblkp
= sfmmu_hmetohblk(sfhme
);
8379 if (hme_size(sfhme
) != sz
) {
8382 if (hmeblkp
->hblk_xhat_bit
) {
8384 "hat_page_demote: xhat hmeblk");
8386 tset
= sfmmu_pageunload(rootpp
, sfhme
, sz
);
8387 CPUSET_OR(cpuset
, tset
);
8394 ASSERT(!PP_ISMAPPED_LARGE(pp
));
8400 conv_tnc(rootpp
, sz
);
8405 pmtx
= sfmmu_page_enter(pp
);
8407 ASSERT(pp
->p_szc
== pszc
);
8408 rootpp
= PP_PAGEROOT(pp
);
8409 ASSERT(rootpp
->p_szc
== pszc
);
8410 lastpp
= PP_PAGENEXT_N(rootpp
, TTEPAGES(pszc
) - 1);
8412 while (lastpp
!= rootpp
) {
8413 sz
= PP_MAPINDEX(lastpp
) ? fnd_mapping_sz(lastpp
) : 0;
8415 npgs
= (sz
== 0) ? 1 : TTEPAGES(sz
);
8416 ASSERT(P2PHASE(lastpp
->p_pagenum
, npgs
) == npgs
- 1);
8417 while (--npgs
> 0) {
8418 lastpp
->p_szc
= (uchar_t
)sz
;
8419 lastpp
= PP_PAGEPREV(lastpp
);
8423 * make sure before current root's pszc
8424 * is updated all updates to constituent pages pszc
8425 * fields are globally visible.
8430 ASSERT(IS_P2ALIGNED(lastpp
->p_pagenum
, TTEPAGES(sz
)));
8431 if (lastpp
!= rootpp
) {
8432 lastpp
= PP_PAGEPREV(lastpp
);
8436 /* the loop above doesn't cover this case */
8440 ASSERT(pp
->p_szc
== 0);
8442 sfmmu_page_exit(pmtx
);
8444 sfmmu_mlist_exit(pml
);
8448 * Refresh the HAT ismttecnt[] element for size szc.
8449 * Caller must have set ISM busy flag to prevent mapping
8450 * lists from changing while we're traversing them.
8453 ism_tsb_entries(sfmmu_t
*sfmmup
, int szc
)
8455 ism_blk_t
*ism_blkp
= sfmmup
->sfmmu_iblk
;
8458 pgcnt_t npgs_scd
= 0;
8463 ASSERT(SFMMU_FLAGS_ISSET(sfmmup
, HAT_ISMBUSY
));
8464 scdp
= sfmmup
->sfmmu_scdp
;
8466 for (; ism_blkp
!= NULL
; ism_blkp
= ism_blkp
->iblk_next
) {
8467 ism_map
= ism_blkp
->iblk_maps
;
8468 for (j
= 0; ism_map
[j
].imap_ismhat
&& j
< ISM_MAP_SLOTS
; j
++) {
8469 rid
= ism_map
[j
].imap_rid
;
8470 ASSERT(rid
== SFMMU_INVALID_ISMRID
||
8471 rid
< sfmmup
->sfmmu_srdp
->srd_next_ismrid
);
8473 if (scdp
!= NULL
&& rid
!= SFMMU_INVALID_ISMRID
&&
8474 SF_RGNMAP_TEST(scdp
->scd_ismregion_map
, rid
)) {
8475 /* ISM is in sfmmup's SCD */
8477 ism_map
[j
].imap_ismhat
->sfmmu_ttecnt
[szc
];
8479 /* ISMs is not in SCD */
8481 ism_map
[j
].imap_ismhat
->sfmmu_ttecnt
[szc
];
8485 sfmmup
->sfmmu_ismttecnt
[szc
] = npgs
;
8486 sfmmup
->sfmmu_scdismttecnt
[szc
] = npgs_scd
;
8491 * Yield the memory claim requirement for an address space.
8493 * This is currently implemented as the number of bytes that have active
8494 * hardware translations that have page structures. Therefore, it can
8495 * underestimate the traditional resident set size, eg, if the
8496 * physical page is present and the hardware translation is missing;
8497 * and it can overestimate the rss, eg, if there are active
8498 * translations to a frame buffer with page structs.
8499 * Also, it does not take sharing into account.
8501 * Note that we don't acquire locks here since this function is most often
8502 * called from the clock thread.
8505 hat_get_mapped_size(struct hat
*hat
)
8513 ASSERT(hat
->sfmmu_xhat_provider
== NULL
);
8515 for (i
= 0; i
< mmu_page_sizes
; i
++)
8516 assize
+= ((pgcnt_t
)hat
->sfmmu_ttecnt
[i
] +
8517 (pgcnt_t
)hat
->sfmmu_scdrttecnt
[i
]) * TTEBYTES(i
);
8519 if (hat
->sfmmu_iblk
== NULL
)
8522 for (i
= 0; i
< mmu_page_sizes
; i
++)
8523 assize
+= ((pgcnt_t
)hat
->sfmmu_ismttecnt
[i
] +
8524 (pgcnt_t
)hat
->sfmmu_scdismttecnt
[i
]) * TTEBYTES(i
);
int
hat_stats_enable(struct hat *hat)
{
	hatlock_t *hatlockp;

	ASSERT(hat->sfmmu_xhat_provider == NULL);

	hatlockp = sfmmu_hat_enter(hat);
	hat->sfmmu_rmstat++;
	sfmmu_hat_exit(hatlockp);
	return (1);
}

void
hat_stats_disable(struct hat *hat)
{
	hatlock_t *hatlockp;

	ASSERT(hat->sfmmu_xhat_provider == NULL);

	hatlockp = sfmmu_hat_enter(hat);
	hat->sfmmu_rmstat--;
	sfmmu_hat_exit(hatlockp);
}
/*
 * Routines for entering or removing ourselves from the
 * ism_hat's mapping list. This is used for both private and
 * shared ism map blocks.
 */
static void
iment_add(struct ism_ment *iment, struct hat *ism_hat)
{
	ASSERT(MUTEX_HELD(&ism_mlist_lock));

	iment->iment_prev = NULL;
	iment->iment_next = ism_hat->sfmmu_iment;
	if (ism_hat->sfmmu_iment) {
		ism_hat->sfmmu_iment->iment_prev = iment;
	}
	ism_hat->sfmmu_iment = iment;
}

static void
iment_sub(struct ism_ment *iment, struct hat *ism_hat)
{
	ASSERT(MUTEX_HELD(&ism_mlist_lock));

	if (ism_hat->sfmmu_iment == NULL) {
		panic("ism map entry remove - no entries");
	}

	if (iment->iment_prev) {
		ASSERT(ism_hat->sfmmu_iment != iment);
		iment->iment_prev->iment_next = iment->iment_next;
	} else {
		ASSERT(ism_hat->sfmmu_iment == iment);
		ism_hat->sfmmu_iment = iment->iment_next;
	}

	if (iment->iment_next) {
		iment->iment_next->iment_prev = iment->iment_prev;
	}

	/*
	 * zero out the entry
	 */
	iment->iment_next = NULL;
	iment->iment_prev = NULL;
	iment->iment_hat = NULL;
	iment->iment_base_va = 0;
}
8603 * Hat_share()/unshare() return an (non-zero) error
8604 * when saddr and daddr are not properly aligned.
8606 * The top level mapping element determines the alignment
8607 * requirement for saddr and daddr, depending on different
8610 * When hat_share()/unshare() are not supported,
8611 * HATOP_SHARE()/UNSHARE() return 0
8614 hat_share(struct hat
*sfmmup
, caddr_t addr
,
8615 struct hat
*ism_hatid
, caddr_t sptaddr
, size_t len
, uint_t ismszc
)
8617 ism_blk_t
*ism_blkp
;
8618 ism_blk_t
*new_iblk
;
8620 ism_ment_t
*ism_ment
;
8622 hatlock_t
*hatlockp
;
8624 uint_t ismshift
= page_get_shift(ismszc
);
8625 size_t ismpgsz
= page_get_pagesize(ismszc
);
8626 uint_t ismmask
= (uint_t
)ismpgsz
- 1;
8627 size_t sh_size
= ISM_SHIFT(ismshift
, len
);
8628 ushort_t ismhatflag
;
8629 hat_region_cookie_t rcookie
;
8633 caddr_t eaddr
= addr
+ len
;
8636 ASSERT(ism_hatid
!= NULL
&& sfmmup
!= NULL
);
8637 ASSERT(sptaddr
== ISMID_STARTADDR
);
8639 * Check the alignment.
8641 if (!ISM_ALIGNED(ismshift
, addr
) || !ISM_ALIGNED(ismshift
, sptaddr
))
8645 * Check size alignment.
8647 if (!ISM_ALIGNED(ismshift
, len
))
8650 ASSERT(sfmmup
->sfmmu_xhat_provider
== NULL
);
8653 * Allocate ism_ment for the ism_hat's mapping list, and an
8654 * ism map blk in case we need one. We must do our
8655 * allocations before acquiring locks to prevent a deadlock
8656 * in the kmem allocator on the mapping list lock.
8658 new_iblk
= kmem_cache_alloc(ism_blk_cache
, KM_SLEEP
);
8659 ism_ment
= kmem_cache_alloc(ism_ment_cache
, KM_SLEEP
);
8662 * Serialize ISM mappings with the ISM busy flag, and also the
8665 sfmmu_ismhat_enter(sfmmup
, 0);
8668 * Allocate an ism map blk if necessary.
8670 if (sfmmup
->sfmmu_iblk
== NULL
) {
8671 sfmmup
->sfmmu_iblk
= new_iblk
;
8672 bzero(new_iblk
, sizeof (*new_iblk
));
8673 new_iblk
->iblk_nextpa
= (uint64_t)-1;
8674 membar_stst(); /* make sure next ptr visible to all CPUs */
8675 sfmmup
->sfmmu_ismblkpa
= va_to_pa((caddr_t
)new_iblk
);
8682 * Make sure mapping does not already exist.
8684 ism_blkp
= sfmmup
->sfmmu_iblk
;
8685 while (ism_blkp
!= NULL
) {
8686 ism_map
= ism_blkp
->iblk_maps
;
8687 for (i
= 0; i
< ISM_MAP_SLOTS
&& ism_map
[i
].imap_ismhat
; i
++) {
8688 if ((addr
>= ism_start(ism_map
[i
]) &&
8689 addr
< ism_end(ism_map
[i
])) ||
8690 eaddr
> ism_start(ism_map
[i
]) &&
8691 eaddr
<= ism_end(ism_map
[i
])) {
8692 panic("sfmmu_share: Already mapped!");
8695 ism_blkp
= ism_blkp
->iblk_next
;
8699 ASSERT(ismszc
>= TTE4M
);
8700 if (ismszc
== TTE4M
) {
8701 ismhatflag
= HAT_4M_FLAG
;
8702 } else if (ismszc
== TTE32M
) {
8703 ismhatflag
= HAT_32M_FLAG
;
8704 } else if (ismszc
== TTE256M
) {
8705 ismhatflag
= HAT_256M_FLAG
;
8708 * Add mapping to first available mapping slot.
8710 ism_blkp
= sfmmup
->sfmmu_iblk
;
8713 ism_map
= ism_blkp
->iblk_maps
;
8714 for (i
= 0; i
< ISM_MAP_SLOTS
; i
++) {
8715 if (ism_map
[i
].imap_ismhat
== NULL
) {
8717 ism_map
[i
].imap_ismhat
= ism_hatid
;
8718 ism_map
[i
].imap_vb_shift
= (uchar_t
)ismshift
;
8719 ism_map
[i
].imap_rid
= SFMMU_INVALID_ISMRID
;
8720 ism_map
[i
].imap_hatflags
= ismhatflag
;
8721 ism_map
[i
].imap_sz_mask
= ismmask
;
8723 * imap_seg is checked in ISM_CHECK to see if
8724 * non-NULL, then other info assumed valid.
8727 ism_map
[i
].imap_seg
= (uintptr_t)addr
| sh_size
;
8728 ism_map
[i
].imap_ment
= ism_ment
;
8731 * Now add ourselves to the ism_hat's
8734 ism_ment
->iment_hat
= sfmmup
;
8735 ism_ment
->iment_base_va
= addr
;
8736 ism_hatid
->sfmmu_ismhat
= 1;
8737 mutex_enter(&ism_mlist_lock
);
8738 iment_add(ism_ment
, ism_hatid
);
8739 mutex_exit(&ism_mlist_lock
);
8744 if (!added
&& ism_blkp
->iblk_next
== NULL
) {
8745 ism_blkp
->iblk_next
= new_iblk
;
8747 bzero(ism_blkp
->iblk_next
,
8748 sizeof (*ism_blkp
->iblk_next
));
8749 ism_blkp
->iblk_next
->iblk_nextpa
= (uint64_t)-1;
8751 ism_blkp
->iblk_nextpa
=
8752 va_to_pa((caddr_t
)ism_blkp
->iblk_next
);
8754 ism_blkp
= ism_blkp
->iblk_next
;
8758 * After calling hat_join_region, sfmmup may join a new SCD or
8759 * move from the old scd to a new scd, in which case, we want to
8760 * shrink the sfmmup's private tsb size, i.e., pass shrink to
8761 * sfmmu_check_page_sizes at the end of this routine.
8763 old_scdp
= sfmmup
->sfmmu_scdp
;
8765 rcookie
= hat_join_region(sfmmup
, addr
, len
, (void *)ism_hatid
, 0,
8766 PROT_ALL
, ismszc
, NULL
, HAT_REGION_ISM
);
8767 if (rcookie
!= HAT_INVALID_REGION_COOKIE
) {
8768 ism_map
[i
].imap_rid
= (uchar_t
)((uint64_t)rcookie
);
8771 * Update our counters for this sfmmup's ism mappings.
8773 for (i
= 0; i
<= ismszc
; i
++) {
8774 if (!(disable_ism_large_pages
& (1 << i
)))
8775 (void) ism_tsb_entries(sfmmup
, i
);
8779 * For ISM and DISM we do not support 512K pages, so we only only
8780 * search the 4M and 8K/64K hashes for 4 pagesize cpus, and search the
8781 * 256M or 32M, and 4M and 8K/64K hashes for 6 pagesize cpus.
8783 * Need to set 32M/256M ISM flags to make sure
8784 * sfmmu_check_page_sizes() enables them on Panther.
8786 ASSERT((disable_ism_large_pages
& (1 << TTE512K
)) != 0);
8790 if (!SFMMU_FLAGS_ISSET(sfmmup
, HAT_256M_ISM
)) {
8791 hatlockp
= sfmmu_hat_enter(sfmmup
);
8792 SFMMU_FLAGS_SET(sfmmup
, HAT_256M_ISM
);
8793 sfmmu_hat_exit(hatlockp
);
8797 if (!SFMMU_FLAGS_ISSET(sfmmup
, HAT_32M_ISM
)) {
8798 hatlockp
= sfmmu_hat_enter(sfmmup
);
8799 SFMMU_FLAGS_SET(sfmmup
, HAT_32M_ISM
);
8800 sfmmu_hat_exit(hatlockp
);
8808 * If we updated the ismblkpa for this HAT we must make
8809 * sure all CPUs running this process reload their tsbmiss area.
8810 * Otherwise they will fail to load the mappings in the tsbmiss
8811 * handler and will loop calling pagefault().
8814 hatlockp
= sfmmu_hat_enter(sfmmup
);
8815 sfmmu_sync_mmustate(sfmmup
);
8816 sfmmu_hat_exit(hatlockp
);
8819 sfmmu_ismhat_exit(sfmmup
, 0);
8822 * Free up ismblk if we didn't use it.
8824 if (new_iblk
!= NULL
)
8825 kmem_cache_free(ism_blk_cache
, new_iblk
);
8828 * Check TSB and TLB page sizes.
8830 if (sfmmup
->sfmmu_scdp
!= NULL
&& old_scdp
!= sfmmup
->sfmmu_scdp
) {
8831 sfmmu_check_page_sizes(sfmmup
, 0);
8833 sfmmu_check_page_sizes(sfmmup
, 1);
8839 * hat_unshare removes exactly one ism_map from
8840 * this process's as. It expects multiple calls
8841 * to hat_unshare for multiple shm segments.
8844 hat_unshare(struct hat
*sfmmup
, caddr_t addr
, size_t len
, uint_t ismszc
)
8847 ism_ment_t
*free_ment
= NULL
;
8848 ism_blk_t
*ism_blkp
;
8849 struct hat
*ism_hatid
;
8851 hatlock_t
*hatlockp
;
8852 struct tsb_info
*tsbinfo
;
8853 uint_t ismshift
= page_get_shift(ismszc
);
8854 size_t sh_size
= ISM_SHIFT(ismshift
, len
);
8858 ASSERT(ISM_ALIGNED(ismshift
, addr
));
8859 ASSERT(ISM_ALIGNED(ismshift
, len
));
8860 ASSERT(sfmmup
!= NULL
);
8861 ASSERT(sfmmup
!= ksfmmup
);
8863 if (sfmmup
->sfmmu_xhat_provider
) {
8864 XHAT_UNSHARE(sfmmup
, addr
, len
);
8868 * This must be a CPU HAT. If the address space has
8869 * XHATs attached, inform all XHATs that ISM segment
8872 ASSERT(sfmmup
->sfmmu_as
!= NULL
);
8873 if (sfmmup
->sfmmu_as
->a_xhat
!= NULL
)
8874 xhat_unshare_all(sfmmup
->sfmmu_as
, addr
, len
);
8878 * Make sure that during the entire time ISM mappings are removed,
8879 * the trap handlers serialize behind us, and that no one else
8880 * can be mucking with ISM mappings. This also lets us get away
8881 * with not doing expensive cross calls to flush the TLB -- we
8882 * just discard the context, flush the entire TSB, and call it
8885 sfmmu_ismhat_enter(sfmmup
, 0);
8888 * Remove the mapping.
8890 * We can't have any holes in the ism map.
8891 * The tsb miss code while searching the ism map will
8892 * stop on an empty map slot. So we must move
8893 * everyone past the hole up 1 if any.
8895 * Also empty ism map blks are not freed until the
8896 * process exits. This is to prevent a MT race condition
8897 * between sfmmu_unshare() and sfmmu_tsbmiss_exception().
8900 ism_blkp
= sfmmup
->sfmmu_iblk
;
8901 while (!found
&& ism_blkp
!= NULL
) {
8902 ism_map
= ism_blkp
->iblk_maps
;
8903 for (i
= 0; i
< ISM_MAP_SLOTS
; i
++) {
8904 if (addr
== ism_start(ism_map
[i
]) &&
8905 sh_size
== (size_t)(ism_size(ism_map
[i
]))) {
8911 ism_blkp
= ism_blkp
->iblk_next
;
8915 ism_hatid
= ism_map
[i
].imap_ismhat
;
8916 ism_rid
= ism_map
[i
].imap_rid
;
8917 ASSERT(ism_hatid
!= NULL
);
8918 ASSERT(ism_hatid
->sfmmu_ismhat
== 1);
8921 * After hat_leave_region, the sfmmup may leave SCD,
8922 * in which case, we want to grow the private tsb size when
8923 * calling sfmmu_check_page_sizes at the end of the routine.
8925 old_scdp
= sfmmup
->sfmmu_scdp
;
8927 * Then remove ourselves from the region.
8929 if (ism_rid
!= SFMMU_INVALID_ISMRID
) {
8930 hat_leave_region(sfmmup
, (void *)((uint64_t)ism_rid
),
8935 * And now guarantee that any other cpu
8936 * that tries to process an ISM miss
8939 hatlockp
= sfmmu_hat_enter(sfmmup
);
8940 sfmmu_invalidate_ctx(sfmmup
);
8941 sfmmu_hat_exit(hatlockp
);
8944 * Remove ourselves from the ism mapping list.
8946 mutex_enter(&ism_mlist_lock
);
8947 iment_sub(ism_map
[i
].imap_ment
, ism_hatid
);
8948 mutex_exit(&ism_mlist_lock
);
8949 free_ment
= ism_map
[i
].imap_ment
;
8952 * We delete the ism map by copying
8953 * the next map over the current one.
8954 * We will take the next one in the maps
8955 * array or from the next ism_blk.
8957 while (ism_blkp
!= NULL
) {
8958 ism_map
= ism_blkp
->iblk_maps
;
8959 while (i
< (ISM_MAP_SLOTS
- 1)) {
8960 ism_map
[i
] = ism_map
[i
+ 1];
8963 /* i == (ISM_MAP_SLOTS - 1) */
8964 ism_blkp
= ism_blkp
->iblk_next
;
8965 if (ism_blkp
!= NULL
) {
8966 ism_map
[i
] = ism_blkp
->iblk_maps
[0];
8969 ism_map
[i
].imap_seg
= 0;
8970 ism_map
[i
].imap_vb_shift
= 0;
8971 ism_map
[i
].imap_rid
= SFMMU_INVALID_ISMRID
;
8972 ism_map
[i
].imap_hatflags
= 0;
8973 ism_map
[i
].imap_sz_mask
= 0;
8974 ism_map
[i
].imap_ismhat
= NULL
;
8975 ism_map
[i
].imap_ment
= NULL
;
8980 * Now flush entire TSB for the process, since
8981 * demapping page by page can be too expensive.
8982 * We don't have to flush the TLB here anymore
8983 * since we switch to a new TLB ctx instead.
8984 * Also, there is no need to flush if the process
8985 * is exiting since the TSB will be freed later.
8987 if (!sfmmup
->sfmmu_free
) {
8988 hatlockp
= sfmmu_hat_enter(sfmmup
);
8989 for (tsbinfo
= sfmmup
->sfmmu_tsb
; tsbinfo
!= NULL
;
8990 tsbinfo
= tsbinfo
->tsb_next
) {
8991 if (tsbinfo
->tsb_flags
& TSB_SWAPPED
)
8993 if (tsbinfo
->tsb_flags
& TSB_RELOC_FLAG
) {
8994 tsbinfo
->tsb_flags
|=
8999 sfmmu_inv_tsb(tsbinfo
->tsb_va
,
9000 TSB_BYTES(tsbinfo
->tsb_szc
));
9002 sfmmu_hat_exit(hatlockp
);
9007 * Update our counters for this sfmmup's ism mappings.
9009 for (i
= 0; i
<= ismszc
; i
++) {
9010 if (!(disable_ism_large_pages
& (1 << i
)))
9011 (void) ism_tsb_entries(sfmmup
, i
);
9014 sfmmu_ismhat_exit(sfmmup
, 0);
9017 * We must do our freeing here after dropping locks
9018 * to prevent a deadlock in the kmem allocator on the
9019 * mapping list lock.
9021 if (free_ment
!= NULL
)
9022 kmem_cache_free(ism_ment_cache
, free_ment
);
9025 * Check TSB and TLB page sizes if the process isn't exiting.
9027 if (!sfmmup
->sfmmu_free
) {
9028 if (found
&& old_scdp
!= NULL
&& sfmmup
->sfmmu_scdp
== NULL
) {
9029 sfmmu_check_page_sizes(sfmmup
, 1);
9031 sfmmu_check_page_sizes(sfmmup
, 0);
static int
sfmmu_idcache_constructor(void *buf, void *cdrarg, int kmflags)
{
	/* void *buf is sfmmu_t pointer */
	bzero(buf, sizeof (sfmmu_t));

	return (0);
}

static void
sfmmu_idcache_destructor(void *buf, void *cdrarg)
{
	/* void *buf is sfmmu_t pointer */
}

/*
 * setup kmem hmeblks by bzeroing all members and initializing the nextpa
 * field to be the pa of this hmeblk
 */
static int
sfmmu_hblkcache_constructor(void *buf, void *cdrarg, int kmflags)
{
	struct hme_blk *hmeblkp;

	bzero(buf, (size_t)cdrarg);
	hmeblkp = (struct hme_blk *)buf;
	hmeblkp->hblk_nextpa = va_to_pa((caddr_t)hmeblkp);

#ifdef	HBLK_TRACE
	mutex_init(&hmeblkp->hblk_audit_lock, NULL, MUTEX_DEFAULT, NULL);
#endif	/* HBLK_TRACE */

	return (0);
}

static void
sfmmu_hblkcache_destructor(void *buf, void *cdrarg)
{
#ifdef	HBLK_TRACE
	struct hme_blk *hmeblkp;

	hmeblkp = (struct hme_blk *)buf;
	mutex_destroy(&hmeblkp->hblk_audit_lock);
#endif	/* HBLK_TRACE */
}
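/*
 * Illustrative sketch (the actual cache setup lives in the hat init path,
 * not here): constructors/destructors like the ones above are wired into
 * kmem by kmem_cache_create(), roughly
 *
 *	sfmmuid_cache = kmem_cache_create("sfmmuid_cache", sizeof (sfmmu_t),
 *	    0, sfmmu_idcache_constructor, sfmmu_idcache_destructor,
 *	    NULL, NULL, NULL, 0);
 *
 * The private argument passed at create time is what the hmeblk
 * constructor receives as cdrarg, i.e. the buffer size it bzeros.
 */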
9089 #define SFMMU_CACHE_RECLAIM_SCAN_RATIO 8
9090 static int sfmmu_cache_reclaim_scan_ratio
= SFMMU_CACHE_RECLAIM_SCAN_RATIO
;
9092 * The kmem allocator will callback into our reclaim routine when the system
9093 * is running low in memory. We traverse the hash and free up all unused but
9094 * still cached hme_blks. We also traverse the free list and free them up
/* ARGSUSED */
static void
sfmmu_hblkcache_reclaim(void *cdrarg)
{
	int i;
	struct hmehash_bucket *hmebp;
	struct hme_blk *hmeblkp, *nx_hblk, *pr_hblk = NULL;
	static struct hmehash_bucket *uhmehash_reclaim_hand;
	static struct hmehash_bucket *khmehash_reclaim_hand;
	struct hme_blk *list = NULL, *last_hmeblkp;
	cpuset_t cpuset = cpu_ready_set;
	cpu_hme_pend_t *cpuhp;

	/* Free up hmeblks on the cpu pending lists */
	for (i = 0; i < NCPU; i++) {
		cpuhp = &cpu_hme_pend[i];
		if (cpuhp->chp_listp != NULL)  {
			mutex_enter(&cpuhp->chp_mutex);
			if (cpuhp->chp_listp == NULL) {
				mutex_exit(&cpuhp->chp_mutex);
				continue;
			}
			for (last_hmeblkp = cpuhp->chp_listp;
			    last_hmeblkp->hblk_next != NULL;
			    last_hmeblkp = last_hmeblkp->hblk_next)
				;
			last_hmeblkp->hblk_next = list;
			list = cpuhp->chp_listp;
			cpuhp->chp_listp = NULL;
			cpuhp->chp_count = 0;
			mutex_exit(&cpuhp->chp_mutex);
		}
	}

	if (list != NULL) {
		kpreempt_disable();
		CPUSET_DEL(cpuset, CPU->cpu_id);
		xt_sync(cpuset);
		kpreempt_enable();

		sfmmu_hblk_free(&list);
		list = NULL;
	}

	hmebp = uhmehash_reclaim_hand;
	if (hmebp == NULL || hmebp > &uhme_hash[UHMEHASH_SZ])
		uhmehash_reclaim_hand = hmebp = uhme_hash;
	uhmehash_reclaim_hand += UHMEHASH_SZ / sfmmu_cache_reclaim_scan_ratio;

	for (i = UHMEHASH_SZ / sfmmu_cache_reclaim_scan_ratio; i; i--) {
		if (SFMMU_HASH_LOCK_TRYENTER(hmebp) != 0) {
			hmeblkp = hmebp->hmeblkp;
			pr_hblk = NULL;
			while (hmeblkp) {
				nx_hblk = hmeblkp->hblk_next;
				if (!hmeblkp->hblk_vcnt &&
				    !hmeblkp->hblk_hmecnt) {
					sfmmu_hblk_hash_rm(hmebp, hmeblkp,
					    pr_hblk, &list, 0);
				} else {
					pr_hblk = hmeblkp;
				}
				hmeblkp = nx_hblk;
			}
			SFMMU_HASH_UNLOCK(hmebp);
		}
		if (hmebp++ == &uhme_hash[UHMEHASH_SZ])
			hmebp = uhme_hash;
	}

	hmebp = khmehash_reclaim_hand;
	if (hmebp == NULL || hmebp > &khme_hash[KHMEHASH_SZ])
		khmehash_reclaim_hand = hmebp = khme_hash;
	khmehash_reclaim_hand += KHMEHASH_SZ / sfmmu_cache_reclaim_scan_ratio;

	for (i = KHMEHASH_SZ / sfmmu_cache_reclaim_scan_ratio; i; i--) {
		if (SFMMU_HASH_LOCK_TRYENTER(hmebp) != 0) {
			hmeblkp = hmebp->hmeblkp;
			pr_hblk = NULL;
			while (hmeblkp) {
				nx_hblk = hmeblkp->hblk_next;
				if (!hmeblkp->hblk_vcnt &&
				    !hmeblkp->hblk_hmecnt) {
					sfmmu_hblk_hash_rm(hmebp, hmeblkp,
					    pr_hblk, &list, 0);
				} else {
					pr_hblk = hmeblkp;
				}
				hmeblkp = nx_hblk;
			}
			SFMMU_HASH_UNLOCK(hmebp);
		}
		if (hmebp++ == &khme_hash[KHMEHASH_SZ])
			hmebp = khme_hash;
	}
	sfmmu_hblks_list_purge(&list, 0);
}
/*
 * sfmmu_get_ppvcolor should become a vm_machdep or hatop interface.
 * same goes for sfmmu_get_addrvcolor().
 *
 * This function will return the virtual color for the specified page. The
 * virtual color corresponds to this page current mapping or its last mapping.
 * It is used by memory allocators to choose addresses with the correct
 * alignment so vac consistency is automatically maintained.  If the page
 * has no color it returns -1.
 */
int
sfmmu_get_ppvcolor(struct page *pp)
{
	int color;

	if (!(cache & CACHE_VAC) || PP_NEWPAGE(pp)) {
		return (-1);
	}
	color = PP_GET_VCOLOR(pp);
	ASSERT(color < mmu_btop(shm_alignment));
	return (color);
}

/*
 * This function will return the desired alignment for vac consistency
 * (vac color) given a virtual address.  If no vac is present it returns -1.
 */
int
sfmmu_get_addrvcolor(caddr_t vaddr)
{
	if (cache & CACHE_VAC) {
		return (addr_to_vcolor(vaddr));
	} else {
		return (-1);
	}
}
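/*
 * Note: the number of virtual colors is mmu_btop(shm_alignment), i.e. the
 * VAC alias span divided by the base page size.  For example, a 64K
 * direct-mapped VAC with 8K base pages yields 8 colors (illustrative
 * numbers only; the actual span is established by the cpu module).
 */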
9245 * Check for conflicts.
9246 * A conflict exists if the new and existent mappings do not match in
9247 * their "shm_alignment fields. If conflicts exist, the existant mappings
9248 * are flushed unless one of them is locked. If one of them is locked, then
9249 * the mappings are flushed and converted to non-cacheable mappings.
9252 sfmmu_vac_conflict(struct hat
*hat
, caddr_t addr
, page_t
*pp
)
9255 struct sf_hment
*sfhmep
, *tmphme
= NULL
;
9256 struct hme_blk
*hmeblkp
;
9260 ASSERT(sfmmu_mlist_held(pp
));
9261 ASSERT(!PP_ISNC(pp
)); /* page better be cacheable */
9263 vcolor
= addr_to_vcolor(addr
);
9264 if (PP_NEWPAGE(pp
)) {
9265 PP_SET_VCOLOR(pp
, vcolor
);
9269 if (PP_GET_VCOLOR(pp
) == vcolor
) {
9273 if (!PP_ISMAPPED(pp
) && !PP_ISMAPPED_KPM(pp
)) {
9275 * Previous user of page had a different color
9276 * but since there are no current users
9277 * we just flush the cache and change the color.
9279 SFMMU_STAT(sf_pgcolor_conflict
);
9280 sfmmu_cache_flush(pp
->p_pagenum
, PP_GET_VCOLOR(pp
));
9281 PP_SET_VCOLOR(pp
, vcolor
);
9286 * If we get here we have a vac conflict with a current
9287 * mapping. VAC conflict policy is as follows.
9288 * - The default is to unload the other mappings unless:
9289 * - If we have a large mapping we uncache the page.
9290 * We need to uncache the rest of the large page too.
9291 * - If any of the mappings are locked we uncache the page.
9292 * - If the requested mapping is inconsistent
9293 * with another mapping and that mapping
9294 * is in the same address space we have to
9295 * make it non-cached. The default thing
9296 * to do is unload the inconsistent mapping
9297 * but if they are in the same address space
9298 * we run the risk of unmapping the pc or the
9299 * stack which we will use as we return to the user,
9300 * in which case we can then fault on the thing
9301 * we just unloaded and get into an infinite loop.
9303 if (PP_ISMAPPED_LARGE(pp
)) {
9307 * Existing mapping is for big pages. We don't unload
9308 * existing big mappings to satisfy new mappings.
9309 * Always convert all mappings to TNC.
9311 sz
= fnd_mapping_sz(pp
);
9312 pp
= PP_GROUPLEADER(pp
, sz
);
9313 SFMMU_STAT_ADD(sf_uncache_conflict
, TTEPAGES(sz
));
9314 sfmmu_page_cache_array(pp
, HAT_TMPNC
, CACHE_FLUSH
,
9321 * check if any mapping is in same as or if it is locked
9322 * since in that case we need to uncache.
9324 for (sfhmep
= pp
->p_mapping
; sfhmep
; sfhmep
= tmphme
) {
9325 tmphme
= sfhmep
->hme_next
;
9326 if (IS_PAHME(sfhmep
))
9328 hmeblkp
= sfmmu_hmetohblk(sfhmep
);
9329 if (hmeblkp
->hblk_xhat_bit
)
9331 tmphat
= hblktosfmmu(hmeblkp
);
9332 sfmmu_copytte(&sfhmep
->hme_tte
, &tte
);
9333 ASSERT(TTE_IS_VALID(&tte
));
9334 if (hmeblkp
->hblk_shared
|| tmphat
== hat
||
9335 hmeblkp
->hblk_lckcnt
) {
9337 * We have an uncache conflict
9339 SFMMU_STAT(sf_uncache_conflict
);
9340 sfmmu_page_cache_array(pp
, HAT_TMPNC
, CACHE_FLUSH
, 1);
9346 * We have an unload conflict
9347 * We have already checked for LARGE mappings, therefore
9348 * the remaining mapping(s) must be TTE8K.
9350 SFMMU_STAT(sf_unload_conflict
);
9352 for (sfhmep
= pp
->p_mapping
; sfhmep
; sfhmep
= tmphme
) {
9353 tmphme
= sfhmep
->hme_next
;
9354 if (IS_PAHME(sfhmep
))
9356 hmeblkp
= sfmmu_hmetohblk(sfhmep
);
9357 if (hmeblkp
->hblk_xhat_bit
)
9359 ASSERT(!hmeblkp
->hblk_shared
);
9360 (void) sfmmu_pageunload(pp
, sfhmep
, TTE8K
);
9363 if (PP_ISMAPPED_KPM(pp
))
9364 sfmmu_kpm_vac_unload(pp
, addr
);
9367 * Unloads only do TLB flushes so we need to flush the
9370 sfmmu_cache_flush(pp
->p_pagenum
, PP_GET_VCOLOR(pp
));
9371 PP_SET_VCOLOR(pp
, vcolor
);
/*
 * Whenever a mapping is unloaded and the page is in TNC state,
 * we see if the page can be made cacheable again. 'pp' is
 * the page that we just unloaded a mapping from, the size
 * of mapping that was unloaded is 'ottesz'.
 *
 * The recache policy for mpss pages can leave a performance problem
 * under the following circumstances:
 * . A large page in uncached mode has just been unmapped.
 * . All constituent pages are TNC due to a conflicting small mapping.
 * . There are many other, non conflicting, small mappings around for
 *   a lot of the constituent pages.
 * . We're called w/ the "old" groupleader page and the old ottesz,
 *   but this is irrelevant, since we're no more "PP_ISMAPPED_LARGE", so
 *   we end up w/ TTE8K or npages == 1.
 * . We call tst_tnc w/ the old groupleader only, and if there is no
 *   conflict, we re-cache only this page.
 * . All other small mappings are not checked and will be left in TNC mode.
 * The problem is not very serious because:
 * . mpss is actually only defined for heap and stack, so the probability
 *   is not very high that a large page mapping exists in parallel to a small
 *   one (this is possible, but seems to be bad programming style in the
 *   applications).
 * . The problem gets a little bit more serious, when those TNC pages
 *   have to be mapped into kernel space, e.g. for networking.
 * . When VAC alias conflicts occur in applications, this is regarded
 *   as an application bug. So if kstat's show them, the appl should
 *   be changed anyway.
 */
void
conv_tnc(page_t *pp, int ottesz)
{
	int cursz, dosz;
	pgcnt_t curnpgs, dopgs;
	pgcnt_t pg64k;
	page_t *pp2;

	/*
	 * Determine how big a range we check for TNC and find
	 * leader page. cursz is the size of the biggest
	 * mapping that still exist on 'pp'.
	 */
	if (PP_ISMAPPED_LARGE(pp)) {
		cursz = fnd_mapping_sz(pp);
	} else {
		cursz = TTE8K;
	}

	if (ottesz >= cursz) {
		dosz = ottesz;
		pp2 = pp;
	} else {
		dosz = cursz;
		pp2 = PP_GROUPLEADER(pp, dosz);
	}

	pg64k = TTEPAGES(TTE64K);
	dopgs = TTEPAGES(dosz);

	ASSERT(dopgs == 1 || ((dopgs & (pg64k - 1)) == 0));

	while (dopgs != 0) {
		curnpgs = TTEPAGES(cursz);
		if (tst_tnc(pp2, curnpgs)) {
			SFMMU_STAT_ADD(sf_recache, curnpgs);
			sfmmu_page_cache_array(pp2, HAT_CACHE, CACHE_NO_FLUSH,
			    curnpgs);
		}

		ASSERT(dopgs >= curnpgs);
		dopgs -= curnpgs;

		if (dopgs == 0) {
			break;
		}

		pp2 = PP_PAGENEXT_N(pp2, curnpgs);
		if (((dopgs & (pg64k - 1)) == 0) && PP_ISMAPPED_LARGE(pp2)) {
			cursz = fnd_mapping_sz(pp2);
		} else {
			cursz = TTE8K;
		}
	}
}
9460 * Returns 1 if page(s) can be converted from TNC to cacheable setting,
9461 * returns 0 otherwise. Note that oaddr argument is valid for only
9465 tst_tnc(page_t
*pp
, pgcnt_t npages
)
9467 struct sf_hment
*sfhme
;
9468 struct hme_blk
*hmeblkp
;
9472 int color
, color1
, bcolor
;
9476 ASSERT(!(cache
& CACHE_WRITEBACK
));
9479 ncolors
= CACHE_NUM_COLOR
;
9482 for (i
= 0; i
< npages
; i
++) {
9483 ASSERT(sfmmu_mlist_held(pp
));
9484 ASSERT(PP_ISTNC(pp
));
9485 ASSERT(PP_GET_VCOLOR(pp
) == NO_VCOLOR
);
9492 if (PP_ISMAPPED_KPM(pp
)) {
9496 kpmvaddr
= hat_kpm_page2va(pp
, 1);
9497 ASSERT(!(npages
> 1 && IS_KPM_ALIAS_RANGE(kpmvaddr
)));
9498 color1
= addr_to_vcolor(kpmvaddr
);
9502 for (sfhme
= pp
->p_mapping
; sfhme
; sfhme
= sfhme
->hme_next
) {
9503 if (IS_PAHME(sfhme
))
9505 hmeblkp
= sfmmu_hmetohblk(sfhme
);
9506 if (hmeblkp
->hblk_xhat_bit
)
9509 sfmmu_copytte(&sfhme
->hme_tte
, &tte
);
9510 ASSERT(TTE_IS_VALID(&tte
));
9512 vaddr
= tte_to_vaddr(hmeblkp
, tte
);
9513 color
= addr_to_vcolor(vaddr
);
9517 * If there is a big mapping, make sure
9518 * 8K mapping is consistent with the big
9521 bcolor
= i
% ncolors
;
9522 if (color
!= bcolor
) {
9531 if (color1
!= color
) {
9536 pp
= PP_PAGENEXT(pp
);
9543 sfmmu_page_cache_array(page_t
*pp
, int flags
, int cache_flush_flag
,
9547 int i
, ncolors
, bcolor
;
9552 ASSERT(!(cache
& CACHE_WRITEBACK
));
9554 kpmp
= sfmmu_kpm_kpmp_enter(pp
, npages
);
9555 pmtx
= sfmmu_page_enter(pp
);
9558 * Fast path caching single unmapped page
9560 if (npages
== 1 && !PP_ISMAPPED(pp
) && !PP_ISMAPPED_KPM(pp
) &&
9561 flags
== HAT_CACHE
) {
9564 sfmmu_page_exit(pmtx
);
9565 sfmmu_kpm_kpmp_exit(kpmp
);
9570 * We need to capture all cpus in order to change cacheability
9571 * because we can't allow one cpu to access the same physical
9572 * page using a cacheable and a non-cachebale mapping at the same
9573 * time. Since we may end up walking the ism mapping list
9574 * have to grab it's lock now since we can't after all the
9575 * cpus have been captured.
9577 sfmmu_hat_lock_all();
9578 mutex_enter(&ism_mlist_lock
);
9580 cpuset
= cpu_ready_set
;
9581 xc_attention(cpuset
);
9585 * Make sure all colors are flushed since the
9586 * sfmmu_page_cache() only flushes one color-
9587 * it does not know big pages.
9589 ncolors
= CACHE_NUM_COLOR
;
9590 if (flags
& HAT_TMPNC
) {
9591 for (i
= 0; i
< ncolors
; i
++) {
9592 sfmmu_cache_flushcolor(i
, pp
->p_pagenum
);
9594 cache_flush_flag
= CACHE_NO_FLUSH
;
9598 for (i
= 0; i
< npages
; i
++) {
9600 ASSERT(sfmmu_mlist_held(pp
));
9602 if (!(flags
== HAT_TMPNC
&& PP_ISTNC(pp
))) {
9605 bcolor
= i
% ncolors
;
9610 sfmmu_page_cache(pp
, flags
, cache_flush_flag
,
9614 pp
= PP_PAGENEXT(pp
);
9618 xc_dismissed(cpuset
);
9619 mutex_exit(&ism_mlist_lock
);
9620 sfmmu_hat_unlock_all();
9621 sfmmu_page_exit(pmtx
);
9622 sfmmu_kpm_kpmp_exit(kpmp
);
9627 * This function changes the virtual cacheability of all mappings to a
9628 * particular page. When changing from uncache to cacheable the mappings will
9629 * only be changed if all of them have the same virtual color.
9630 * We need to flush the cache in all cpus. It is possible that
9631 * a process referenced a page as cacheable but has sinced exited
9632 * and cleared the mapping list. We still to flush it but have no
9633 * state so all cpus is the only alternative.
9636 sfmmu_page_cache(page_t
*pp
, int flags
, int cache_flush_flag
, int bcolor
)
9638 struct sf_hment
*sfhme
;
9639 struct hme_blk
*hmeblkp
;
9647 pfn
= pp
->p_pagenum
;
9649 for (sfhme
= pp
->p_mapping
; sfhme
; sfhme
= sfhme
->hme_next
) {
9651 if (IS_PAHME(sfhme
))
9653 hmeblkp
= sfmmu_hmetohblk(sfhme
);
9655 if (hmeblkp
->hblk_xhat_bit
)
9658 sfmmu_copytte(&sfhme
->hme_tte
, &tte
);
9659 ASSERT(TTE_IS_VALID(&tte
));
9660 vaddr
= tte_to_vaddr(hmeblkp
, tte
);
9661 color
= addr_to_vcolor(vaddr
);
9664 if ((flags
& HAT_CACHE
) && bcolor
!= NO_VCOLOR
) {
9665 ASSERT(color
== bcolor
);
9669 ASSERT(flags
!= HAT_TMPNC
|| color
== PP_GET_VCOLOR(pp
));
9672 if (flags
& (HAT_UNCACHE
| HAT_TMPNC
)) {
9673 TTE_CLR_VCACHEABLE(&ttemod
);
9674 } else { /* flags & HAT_CACHE */
9675 TTE_SET_VCACHEABLE(&ttemod
);
9677 ret
= sfmmu_modifytte_try(&tte
, &ttemod
, &sfhme
->hme_tte
);
9680 * Since all cpus are captured modifytte should not
9683 panic("sfmmu_page_cache: write to tte failed");
9686 sfmmup
= hblktosfmmu(hmeblkp
);
9687 if (cache_flush_flag
== CACHE_FLUSH
) {
9689 * Flush TSBs, TLBs and caches
9691 if (hmeblkp
->hblk_shared
) {
9692 sf_srd_t
*srdp
= (sf_srd_t
*)sfmmup
;
9693 uint_t rid
= hmeblkp
->hblk_tag
.htag_rid
;
9695 ASSERT(SFMMU_IS_SHMERID_VALID(rid
));
9696 ASSERT(rid
< SFMMU_MAX_HME_REGIONS
);
9697 ASSERT(srdp
!= NULL
);
9698 rgnp
= srdp
->srd_hmergnp
[rid
];
9699 SFMMU_VALIDATE_SHAREDHBLK(hmeblkp
,
9701 (void) sfmmu_rgntlb_demap(vaddr
, rgnp
,
9703 sfmmu_cache_flush(pfn
, addr_to_vcolor(vaddr
));
9704 } else if (sfmmup
->sfmmu_ismhat
) {
9705 if (flags
& HAT_CACHE
) {
9706 SFMMU_STAT(sf_ism_recache
);
9708 SFMMU_STAT(sf_ism_uncache
);
9710 sfmmu_ismtlbcache_demap(vaddr
, sfmmup
, hmeblkp
,
9713 sfmmu_tlbcache_demap(vaddr
, sfmmup
, hmeblkp
,
9714 pfn
, 0, FLUSH_ALL_CPUS
, CACHE_FLUSH
, 1);
9718 * all cache entries belonging to this pfn are
9721 cache_flush_flag
= CACHE_NO_FLUSH
;
9724 * Flush only TSBs and TLBs.
9726 if (hmeblkp
->hblk_shared
) {
9727 sf_srd_t
*srdp
= (sf_srd_t
*)sfmmup
;
9728 uint_t rid
= hmeblkp
->hblk_tag
.htag_rid
;
9730 ASSERT(SFMMU_IS_SHMERID_VALID(rid
));
9731 ASSERT(rid
< SFMMU_MAX_HME_REGIONS
);
9732 ASSERT(srdp
!= NULL
);
9733 rgnp
= srdp
->srd_hmergnp
[rid
];
9734 SFMMU_VALIDATE_SHAREDHBLK(hmeblkp
,
9736 (void) sfmmu_rgntlb_demap(vaddr
, rgnp
,
9738 } else if (sfmmup
->sfmmu_ismhat
) {
9739 if (flags
& HAT_CACHE
) {
9740 SFMMU_STAT(sf_ism_recache
);
9742 SFMMU_STAT(sf_ism_uncache
);
9744 sfmmu_ismtlbcache_demap(vaddr
, sfmmup
, hmeblkp
,
9745 pfn
, CACHE_NO_FLUSH
);
9747 sfmmu_tlb_demap(vaddr
, sfmmup
, hmeblkp
, 0, 1);
9752 if (PP_ISMAPPED_KPM(pp
))
9753 sfmmu_kpm_page_cache(pp
, flags
, cache_flush_flag
);
9758 panic("sfmmu_pagecache: unknown flags");
9764 PP_SET_VCOLOR(pp
, color
);
9769 PP_SET_VCOLOR(pp
, NO_VCOLOR
);
9775 PP_SET_VCOLOR(pp
, NO_VCOLOR
);
/*
 * Wrapper routine used to return a context.
 *
 * It's the responsibility of the caller to guarantee that the
 * process serializes on calls here by taking the HAT lock for
 * the sfmmu.
 */
static void
sfmmu_get_ctx(sfmmu_t *sfmmup)
{
	mmu_ctx_t *mmu_ctxp;
	uint_t pstate_save;
	int ret;

	ASSERT(sfmmu_hat_lock_held(sfmmup));
	ASSERT(sfmmup != ksfmmup);

	if (SFMMU_FLAGS_ISSET(sfmmup, HAT_ALLCTX_INVALID)) {
		sfmmu_setup_tsbinfo(sfmmup);
		SFMMU_FLAGS_CLEAR(sfmmup, HAT_ALLCTX_INVALID);
	}

	kpreempt_disable();

	mmu_ctxp = CPU_MMU_CTXP(CPU);
	ASSERT(mmu_ctxp);
	ASSERT(mmu_ctxp->mmu_idx < max_mmu_ctxdoms);
	ASSERT(mmu_ctxp == mmu_ctxs_tbl[mmu_ctxp->mmu_idx]);

	/*
	 * Do a wrap-around if cnum reaches the max # cnum supported by a MMU.
	 */
	if (mmu_ctxp->mmu_cnum == mmu_ctxp->mmu_nctxs)
		sfmmu_ctx_wrap_around(mmu_ctxp, B_TRUE);

	/*
	 * Let the MMU set up the page sizes to use for
	 * this context in the TLB. Don't program 2nd dtlb for ism hat.
	 */
	if ((&mmu_set_ctx_page_sizes) && (sfmmup->sfmmu_ismhat == 0)) {
		mmu_set_ctx_page_sizes(sfmmup);
	}

	/*
	 * sfmmu_alloc_ctx and sfmmu_load_mmustate will be performed with
	 * interrupts disabled to prevent race condition with wrap-around
	 * ctx invalidatation. In sun4v, ctx invalidation also involves
	 * a HV call to set the number of TSBs to 0. If interrupts are not
	 * disabled until after sfmmu_load_mmustate is complete TSBs may
	 * become assigned to INVALID_CONTEXT. This is not allowed.
	 */
	pstate_save = sfmmu_disable_intrs();

	if (sfmmu_alloc_ctx(sfmmup, 1, CPU, SFMMU_PRIVATE) &&
	    sfmmup->sfmmu_scdp != NULL) {
		sf_scd_t *scdp = sfmmup->sfmmu_scdp;
		sfmmu_t *scsfmmup = scdp->scd_sfmmup;
		ret = sfmmu_alloc_ctx(scsfmmup, 1, CPU, SFMMU_SHARED);
		/* debug purpose only */
		ASSERT(!ret || scsfmmup->sfmmu_ctxs[CPU_MMU_IDX(CPU)].cnum
		    != INVALID_CONTEXT);
	}
	sfmmu_load_mmustate(sfmmup);

	sfmmu_enable_intrs(pstate_save);

	kpreempt_enable();
}
/*
 * When all cnums are used up in a MMU, cnum will wrap around to the
 * next generation and start from 2.
 */
static void
sfmmu_ctx_wrap_around(mmu_ctx_t *mmu_ctxp, boolean_t reset_cnum)
{
	/* caller must have disabled the preemption */
	ASSERT(curthread->t_preempt >= 1);
	ASSERT(mmu_ctxp != NULL);

	/* acquire Per-MMU (PM) spin lock */
	mutex_enter(&mmu_ctxp->mmu_lock);

	/* re-check to see if wrap-around is needed */
	if (mmu_ctxp->mmu_cnum < mmu_ctxp->mmu_nctxs)
		goto done;

	SFMMU_MMU_STAT(mmu_wrap_around);

	/* update gnum */
	ASSERT(mmu_ctxp->mmu_gnum != 0);
	mmu_ctxp->mmu_gnum++;
	if (mmu_ctxp->mmu_gnum == 0 ||
	    mmu_ctxp->mmu_gnum > MAX_SFMMU_GNUM_VAL) {
		cmn_err(CE_PANIC, "mmu_gnum of mmu_ctx 0x%p is out of bound.",
		    (void *)mmu_ctxp);
	}

	if (mmu_ctxp->mmu_ncpus > 1) {
		cpuset_t cpuset;

		membar_enter(); /* make sure updated gnum visible */

		SFMMU_XCALL_STATS(NULL);

		/* xcall to others on the same MMU to invalidate ctx */
		cpuset = mmu_ctxp->mmu_cpuset;
		ASSERT(CPU_IN_SET(cpuset, CPU->cpu_id) || !reset_cnum);
		CPUSET_DEL(cpuset, CPU->cpu_id);
		CPUSET_AND(cpuset, cpu_ready_set);

		/*
		 * Pass in INVALID_CONTEXT as the first parameter to
		 * sfmmu_raise_tsb_exception, which invalidates the context
		 * of any process running on the CPUs in the MMU.
		 */
		xt_some(cpuset, sfmmu_raise_tsb_exception,
		    INVALID_CONTEXT, INVALID_CONTEXT);
		xt_sync(cpuset);

		SFMMU_MMU_STAT(mmu_tsb_raise_exception);
	}

	if (sfmmu_getctx_sec() != INVALID_CONTEXT) {
		sfmmu_setctx_sec(INVALID_CONTEXT);
		sfmmu_clear_utsbinfo();
	}

	/*
	 * No xcall is needed here. For sun4u systems all CPUs in context
	 * domain share a single physical MMU therefore it's enough to flush
	 * TLB on local CPU. On sun4v systems we use 1 global context
	 * domain and flush all remote TLBs in sfmmu_raise_tsb_exception
	 * handler. Note that vtag_flushall_uctxs() is called
	 * for Ultra II machine, where the equivalent flushall functionality
	 * is implemented in SW, and only user ctx TLB entries are flushed.
	 */
	if (&vtag_flushall_uctxs != NULL) {
		vtag_flushall_uctxs();
	} else {
		vtag_flushall();
	}

	/* reset mmu cnum, skips cnum 0 and 1 */
	if (reset_cnum == B_TRUE)
		mmu_ctxp->mmu_cnum = NUM_LOCKED_CTXS;

done:
	mutex_exit(&mmu_ctxp->mmu_lock);
}
/*
 * For multi-threaded process, set the process context to INVALID_CONTEXT
 * so that it faults and reloads the MMU state from TL=0. For single-threaded
 * process, we can just load the MMU state directly without having to
 * set context invalid. Caller must hold the hat lock since we don't
 * acquire it here.
 */
static void
sfmmu_sync_mmustate(sfmmu_t *sfmmup)
{
	uint_t cnum;
	uint_t pstate_save;

	ASSERT(sfmmup != ksfmmup);
	ASSERT(sfmmu_hat_lock_held(sfmmup));

	kpreempt_disable();

	/*
	 * We check whether the pass'ed-in sfmmup is the same as the
	 * current running proc. This is to makes sure the current proc
	 * stays single-threaded if it already is.
	 */
	if ((sfmmup == curthread->t_procp->p_as->a_hat) &&
	    (curthread->t_procp->p_lwpcnt == 1)) {
		/* single-thread */
		cnum = sfmmup->sfmmu_ctxs[CPU_MMU_IDX(CPU)].cnum;
		if (cnum != INVALID_CONTEXT) {
			uint_t curcnum;
			/*
			 * Disable interrupts to prevent race condition
			 * with sfmmu_ctx_wrap_around ctx invalidation.
			 * In sun4v, ctx invalidation involves setting
			 * TSB to NULL, hence, interrupts should be disabled
			 * untill after sfmmu_load_mmustate is completed.
			 */
			pstate_save = sfmmu_disable_intrs();
			curcnum = sfmmu_getctx_sec();
			if (curcnum == cnum)
				sfmmu_load_mmustate(sfmmup);
			sfmmu_enable_intrs(pstate_save);
			ASSERT(curcnum == cnum || curcnum == INVALID_CONTEXT);
		}
	} else {
		/*
		 * multi-thread,
		 * or when sfmmup is not the same as the curproc.
		 */
		sfmmu_invalidate_ctx(sfmmup);
	}

	kpreempt_enable();
}
9992 * Replace the specified TSB with a new TSB. This function gets called when
9993 * we grow, shrink or swapin a TSB. When swapping in a TSB (TSB_SWAPIN), the
9994 * TSB_FORCEALLOC flag may be used to force allocation of a minimum-sized TSB
9997 * Caller must hold the HAT lock, but should assume any tsb_info
9998 * pointers it has are no longer valid after calling this function.
10001 * TSB_ALLOCFAIL Failed to allocate a TSB, due to memory constraints
10002 * TSB_LOSTRACE HAT is busy, i.e. another thread is already doing
10003 * something to this tsbinfo/TSB
10004 * TSB_SUCCESS Operation succeeded
10006 static tsb_replace_rc_t
10007 sfmmu_replace_tsb(sfmmu_t
*sfmmup
, struct tsb_info
*old_tsbinfo
, uint_t szc
,
10008 hatlock_t
*hatlockp
, uint_t flags
)
10010 struct tsb_info
*new_tsbinfo
= NULL
;
10011 struct tsb_info
*curtsb
, *prevtsb
;
10012 uint_t tte_sz_mask
;
10015 ASSERT(sfmmup
!= ksfmmup
);
10016 ASSERT(sfmmup
->sfmmu_ismhat
== 0);
10017 ASSERT(sfmmu_hat_lock_held(sfmmup
));
10018 ASSERT(szc
<= tsb_max_growsize
);
10020 if (SFMMU_FLAGS_ISSET(sfmmup
, HAT_BUSY
))
10021 return (TSB_LOSTRACE
);
10024 * Find the tsb_info ahead of this one in the list, and
10025 * also make sure that the tsb_info passed in really
10028 for (prevtsb
= NULL
, curtsb
= sfmmup
->sfmmu_tsb
;
10029 curtsb
!= old_tsbinfo
&& curtsb
!= NULL
;
10030 prevtsb
= curtsb
, curtsb
= curtsb
->tsb_next
)
10032 ASSERT(curtsb
!= NULL
);
10034 if (!(flags
& TSB_SWAPIN
) && SFMMU_FLAGS_ISSET(sfmmup
, HAT_SWAPPED
)) {
10036 * The process is swapped out, so just set the new size
10037 * code. When it swaps back in, we'll allocate a new one
10038 * of the new chosen size.
10040 curtsb
->tsb_szc
= szc
;
10041 return (TSB_SUCCESS
);
10043 SFMMU_FLAGS_SET(sfmmup
, HAT_BUSY
);
10045 tte_sz_mask
= old_tsbinfo
->tsb_ttesz_mask
;
10048 * All initialization is done inside of sfmmu_tsbinfo_alloc().
10049 * If we fail to allocate a TSB, exit.
10051 * If tsb grows with new tsb size > 4M and old tsb size < 4M,
10052 * then try 4M slab after the initial alloc fails.
10054 * If tsb swapin with tsb size > 4M, then try 4M after the
10055 * initial alloc fails.
10057 sfmmu_hat_exit(hatlockp
);
10058 if (sfmmu_tsbinfo_alloc(&new_tsbinfo
, szc
,
10059 tte_sz_mask
, flags
, sfmmup
) &&
10060 (!(flags
& (TSB_GROW
| TSB_SWAPIN
)) || (szc
<= TSB_4M_SZCODE
) ||
10061 (!(flags
& TSB_SWAPIN
) &&
10062 (old_tsbinfo
->tsb_szc
>= TSB_4M_SZCODE
)) ||
10063 sfmmu_tsbinfo_alloc(&new_tsbinfo
, TSB_4M_SZCODE
,
10064 tte_sz_mask
, flags
, sfmmup
))) {
10065 (void) sfmmu_hat_enter(sfmmup
);
10066 if (!(flags
& TSB_SWAPIN
))
10067 SFMMU_STAT(sf_tsb_resize_failures
);
10068 SFMMU_FLAGS_CLEAR(sfmmup
, HAT_BUSY
);
10069 return (TSB_ALLOCFAIL
);
10071 (void) sfmmu_hat_enter(sfmmup
);
10074 * Re-check to make sure somebody else didn't muck with us while we
10075 * didn't hold the HAT lock. If the process swapped out, fine, just
10076 * exit; this can happen if we try to shrink the TSB from the context
10077 * of another process (such as on an ISM unmap), though it is rare.
10079 if (!(flags
& TSB_SWAPIN
) && SFMMU_FLAGS_ISSET(sfmmup
, HAT_SWAPPED
)) {
10080 SFMMU_STAT(sf_tsb_resize_failures
);
10081 SFMMU_FLAGS_CLEAR(sfmmup
, HAT_BUSY
);
10082 sfmmu_hat_exit(hatlockp
);
10083 sfmmu_tsbinfo_free(new_tsbinfo
);
10084 (void) sfmmu_hat_enter(sfmmup
);
10085 return (TSB_LOSTRACE
);
10089 /* Reverify that the tsb_info still exists.. for debugging only */
10090 for (prevtsb
= NULL
, curtsb
= sfmmup
->sfmmu_tsb
;
10091 curtsb
!= old_tsbinfo
&& curtsb
!= NULL
;
10092 prevtsb
= curtsb
, curtsb
= curtsb
->tsb_next
)
10094 ASSERT(curtsb
!= NULL
);
10098 * Quiesce any CPUs running this process on their next TLB miss
10099 * so they atomically see the new tsb_info. We temporarily set the
10100 * context to invalid context so new threads that come on processor
10101 * after we do the xcall to cpusran will also serialize behind the
10102 * HAT lock on TLB miss and will see the new TSB. Since this short
10103 * race with a new thread coming on processor is relatively rare,
10104 * this synchronization mechanism should be cheaper than always
10105 * pausing all CPUs for the duration of the setup, which is what
10106 * the old implementation did. This is particuarly true if we are
10107 * copying a huge chunk of memory around during that window.
10109 * The memory barriers are to make sure things stay consistent
10110 * with resume() since it does not hold the HAT lock while
10111 * walking the list of tsb_info structures.
10113 if ((flags
& TSB_SWAPIN
) != TSB_SWAPIN
) {
10114 /* The TSB is either growing or shrinking. */
10115 sfmmu_invalidate_ctx(sfmmup
);
10118 * It is illegal to swap in TSBs from a process other
10119 * than a process being swapped in. This in turn
10120 * implies we do not have a valid MMU context here
10121 * since a process needs one to resolve translation
10124 ASSERT(curthread
->t_procp
->p_as
->a_hat
== sfmmup
);
10128 ASSERT(max_mmu_ctxdoms
> 0);
10131 * Process should have INVALID_CONTEXT on all MMUs
10133 for (i
= 0; i
< max_mmu_ctxdoms
; i
++) {
10135 ASSERT(sfmmup
->sfmmu_ctxs
[i
].cnum
== INVALID_CONTEXT
);
10139 new_tsbinfo
->tsb_next
= old_tsbinfo
->tsb_next
;
10140 membar_stst(); /* strict ordering required */
10142 prevtsb
->tsb_next
= new_tsbinfo
;
10144 sfmmup
->sfmmu_tsb
= new_tsbinfo
;
10145 membar_enter(); /* make sure new TSB globally visible */
10148 * We need to migrate TSB entries from the old TSB to the new TSB
10149 * if tsb_remap_ttes is set and the TSB is growing.
10151 if (tsb_remap_ttes
&& ((flags
& TSB_GROW
) == TSB_GROW
))
10152 sfmmu_copy_tsb(old_tsbinfo
, new_tsbinfo
);
10154 SFMMU_FLAGS_CLEAR(sfmmup
, HAT_BUSY
);
10157 * Drop the HAT lock to free our old tsb_info.
10159 sfmmu_hat_exit(hatlockp
);
10161 if ((flags
& TSB_GROW
) == TSB_GROW
) {
10162 SFMMU_STAT(sf_tsb_grow
);
10163 } else if ((flags
& TSB_SHRINK
) == TSB_SHRINK
) {
10164 SFMMU_STAT(sf_tsb_shrink
);
10167 sfmmu_tsbinfo_free(old_tsbinfo
);
10169 (void) sfmmu_hat_enter(sfmmup
);
10170 return (TSB_SUCCESS
);
/*
 * This function will re-program hat pgsz array, and invalidate the
 * process' context, forcing the process to switch to another
 * context on the next TLB miss, and therefore start using the
 * TLB that is reprogrammed for the new page sizes.
 */
void
sfmmu_reprog_pgsz_arr(sfmmu_t *sfmmup, uint8_t *tmp_pgsz)
{
	int i;
	hatlock_t *hatlockp = NULL;

	hatlockp = sfmmu_hat_enter(sfmmup);
	/* USIII+-IV+ optimization, requires hat lock */
	if (tmp_pgsz) {
		for (i = 0; i < mmu_page_sizes; i++)
			sfmmup->sfmmu_pgsz[i] = tmp_pgsz[i];
	}
	SFMMU_STAT(sf_tlb_reprog_pgsz);

	sfmmu_invalidate_ctx(sfmmup);

	sfmmu_hat_exit(hatlockp);
}
/*
 * The scd_rttecnt field in the SCD must be updated to take account of the
 * regions which it contains.
 */
static void
sfmmu_set_scd_rttecnt(sf_srd_t *srdp, sf_scd_t *scdp)
{
	uint_t rid;
	uint_t i, j;
	ulong_t w;
	sf_region_t *rgnp;

	ASSERT(srdp != NULL);

	for (i = 0; i < SFMMU_HMERGNMAP_WORDS; i++) {
		if ((w = scdp->scd_region_map.bitmap[i]) == 0) {
			continue;
		}

		j = 0;
		while (w) {
			if (!(w & 0x1)) {
				j++;
				w >>= 1;
				continue;
			}
			rid = (i << BT_ULSHIFT) | j;
			j++;
			w >>= 1;

			ASSERT(SFMMU_IS_SHMERID_VALID(rid));
			ASSERT(rid < SFMMU_MAX_HME_REGIONS);
			rgnp = srdp->srd_hmergnp[rid];
			ASSERT(rgnp->rgn_refcnt > 0);
			ASSERT(rgnp->rgn_id == rid);

			scdp->scd_rttecnt[rgnp->rgn_pgszc] +=
			    rgnp->rgn_size >> TTE_PAGE_SHIFT(rgnp->rgn_pgszc);

			/*
			 * Maintain the tsb0 inflation cnt for the regions
			 * which are 4M and above.
			 */
			if (rgnp->rgn_pgszc >= TTE4M) {
				scdp->scd_sfmmup->sfmmu_tsb0_4minflcnt +=
				    rgnp->rgn_size >>
				    (TTE_PAGE_SHIFT(TTE8K) + 2);
			}
		}
	}
}
/*
 * This function assumes that there are either four or six supported page
 * sizes and at most two programmable TLBs, so we need to decide which
 * page sizes are most important and then tell the MMU layer so it
 * can adjust the TLB page sizes accordingly (if supported).
 *
 * If these assumptions change, this function will need to be
 * updated to support whatever the new limits are.
 *
 * The growing flag is nonzero if we are growing the address space,
 * and zero if it is shrinking.  This allows us to decide whether
 * to grow or shrink our TSB, depending upon available memory
 * conditions.
 */
static void
sfmmu_check_page_sizes(sfmmu_t *sfmmup, int growing)
{
	uint64_t ttecnt[MMU_PAGE_SIZES];
	uint64_t tte8k_cnt, tte4m_cnt;
	int i;
	int sectsb_thresh;

	/*
	 * Kernel threads, processes with small address spaces not using
	 * large pages, and dummy ISM HATs need not apply.
	 */
	if (sfmmup == ksfmmup || sfmmup->sfmmu_ismhat != NULL)
		return;

	if (!SFMMU_LGPGS_INUSE(sfmmup) &&
	    sfmmup->sfmmu_ttecnt[TTE8K] <= tsb_rss_factor)
		return;

	for (i = 0; i < mmu_page_sizes; i++) {
		ttecnt[i] = sfmmup->sfmmu_ttecnt[i] +
		    sfmmup->sfmmu_ismttecnt[i];
	}

	/* Check pagesizes in use, and possibly reprogram DTLB. */
	if (&mmu_check_page_sizes)
		mmu_check_page_sizes(sfmmup, ttecnt);

	/*
	 * Calculate the number of 8k ttes to represent the span of these
	 * pages.
	 */
	tte8k_cnt = ttecnt[TTE8K] +
	    (ttecnt[TTE64K] << (MMU_PAGESHIFT64K - MMU_PAGESHIFT)) +
	    (ttecnt[TTE512K] << (MMU_PAGESHIFT512K - MMU_PAGESHIFT));
	if (mmu_page_sizes == max_mmu_page_sizes) {
		tte4m_cnt = ttecnt[TTE4M] +
		    (ttecnt[TTE32M] << (MMU_PAGESHIFT32M - MMU_PAGESHIFT4M)) +
		    (ttecnt[TTE256M] << (MMU_PAGESHIFT256M - MMU_PAGESHIFT4M));
	} else {
		tte4m_cnt = ttecnt[TTE4M];
	}

	/*
	 * Inflate tte8k_cnt to allow for region large page allocation failure.
	 */
	tte8k_cnt += sfmmup->sfmmu_tsb0_4minflcnt;

	/*
	 * Inflate TSB sizes by a factor of 2 if this process
	 * uses 4M text pages to minimize extra conflict misses
	 * in the first TSB since without counting text pages
	 * 8K TSB may become too small.
	 *
	 * Also double the size of the second TSB to minimize
	 * extra conflict misses due to competition between 4M text pages
	 * and data pages.
	 *
	 * We need to adjust the second TSB allocation threshold by the
	 * inflation factor, since there is no point in creating a second
	 * TSB when we know all the mappings can fit in the I/D TLBs.
	 */
	sectsb_thresh = tsb_sectsb_threshold;
	if (sfmmup->sfmmu_flags & HAT_4MTEXT_FLAG) {
		tte8k_cnt <<= 1;
		tte4m_cnt <<= 1;
		sectsb_thresh <<= 1;
	}

	/*
	 * Check to see if our TSB is the right size; we may need to
	 * grow or shrink it.  If the process is small, our work is
	 * finished at this point.
	 */
	if (tte8k_cnt <= tsb_rss_factor && tte4m_cnt <= sectsb_thresh) {
		return;
	}
	sfmmu_size_tsb(sfmmup, growing, tte8k_cnt, tte4m_cnt, sectsb_thresh);
}
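/*
 * For example, in the tte8k_cnt computation above each 64K tte counts as
 * 8 8K ttes (shift of MMU_PAGESHIFT64K - MMU_PAGESHIFT == 3) and each
 * 512K tte as 64; likewise each 32M tte counts as 8 4M ttes and each
 * 256M tte as 64 when computing tte4m_cnt.
 */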
10345 sfmmu_size_tsb(sfmmu_t
*sfmmup
, int growing
, uint64_t tte8k_cnt
,
10346 uint64_t tte4m_cnt
, int sectsb_thresh
)
10350 struct tsb_info
*tsbinfop
;
10351 hatlock_t
*hatlockp
= NULL
;
10353 hatlockp
= sfmmu_hat_enter(sfmmup
);
10354 ASSERT(hatlockp
!= NULL
);
10355 tsbinfop
= sfmmup
->sfmmu_tsb
;
10356 ASSERT(tsbinfop
!= NULL
);
10359 * If we're growing, select the size based on RSS. If we're
10360 * shrinking, leave some room so we don't have to turn around and
10361 * grow again immediately.
10364 tsb_szc
= SELECT_TSB_SIZECODE(tte8k_cnt
);
10366 tsb_szc
= SELECT_TSB_SIZECODE(tte8k_cnt
<< 1);
10368 if (!growing
&& (tsb_szc
< tsbinfop
->tsb_szc
) &&
10369 (tsb_szc
>= default_tsb_size
) && TSB_OK_SHRINK()) {
10370 (void) sfmmu_replace_tsb(sfmmup
, tsbinfop
, tsb_szc
,
10371 hatlockp
, TSB_SHRINK
);
10372 } else if (growing
&& tsb_szc
> tsbinfop
->tsb_szc
&& TSB_OK_GROW()) {
10373 (void) sfmmu_replace_tsb(sfmmup
, tsbinfop
, tsb_szc
,
10374 hatlockp
, TSB_GROW
);
10376 tsbinfop
= sfmmup
->sfmmu_tsb
;
10379 * With the TLB and first TSB out of the way, we need to see if
10380 * we need a second TSB for 4M pages. If we managed to reprogram
10381 * the TLB page sizes above, the process will start using this new
10382 * TSB right away; otherwise, it will start using it on the next
10383 * context switch. Either way, it's no big deal so there's no
10384 * synchronization with the trap handlers here unless we grow the
10385 * TSB (in which case it's required to prevent using the old one
10386 * after it's freed). Note: second tsb is required for 32M/256M
10389 if (tte4m_cnt
> sectsb_thresh
) {
10391 * If we're growing, select the size based on RSS. If we're
10392 * shrinking, leave some room so we don't have to turn
10393 * around and grow again immediately.
10396 tsb_szc
= SELECT_TSB_SIZECODE(tte4m_cnt
);
10398 tsb_szc
= SELECT_TSB_SIZECODE(tte4m_cnt
<< 1);
10399 if (tsbinfop
->tsb_next
== NULL
) {
10400 struct tsb_info
*newtsb
;
10401 int allocflags
= SFMMU_FLAGS_ISSET(sfmmup
, HAT_SWAPPED
)?
10404 sfmmu_hat_exit(hatlockp
);
10407 * Try to allocate a TSB for 4[32|256]M pages. If we
10408 * can't get the size we want, retry w/a minimum sized
10409 * TSB. If that still didn't work, give up; we can
10410 * still run without one.
10412 tsb_bits
= (mmu_page_sizes
== max_mmu_page_sizes
)?
10413 TSB4M
|TSB32M
|TSB256M
:TSB4M
;
10414 if ((sfmmu_tsbinfo_alloc(&newtsb
, tsb_szc
, tsb_bits
,
10415 allocflags
, sfmmup
)) &&
10416 (tsb_szc
<= TSB_4M_SZCODE
||
10417 sfmmu_tsbinfo_alloc(&newtsb
, TSB_4M_SZCODE
,
10418 tsb_bits
, allocflags
, sfmmup
)) &&
10419 sfmmu_tsbinfo_alloc(&newtsb
, TSB_MIN_SZCODE
,
10420 tsb_bits
, allocflags
, sfmmup
)) {
10424 hatlockp
= sfmmu_hat_enter(sfmmup
);
10426 sfmmu_invalidate_ctx(sfmmup
);
10428 if (sfmmup
->sfmmu_tsb
->tsb_next
== NULL
) {
10429 sfmmup
->sfmmu_tsb
->tsb_next
= newtsb
;
10430 SFMMU_STAT(sf_tsb_sectsb_create
);
10431 sfmmu_hat_exit(hatlockp
);
10435 * It's annoying, but possible for us
10436 * to get here.. we dropped the HAT lock
10437 * because of locking order in the kmem
10438 * allocator, and while we were off getting
10439 * our memory, some other thread decided to
10440 * do us a favor and won the race to get a
10441 * second TSB for this process. Sigh.
10443 sfmmu_hat_exit(hatlockp
);
10444 sfmmu_tsbinfo_free(newtsb
);
10450 * We have a second TSB, see if it's big enough.
10452 tsbinfop
= tsbinfop
->tsb_next
;
10455 * Check to see if our second TSB is the right size;
10456 * we may need to grow or shrink it.
10457 * To prevent thrashing (e.g. growing the TSB on a
10458 * subsequent map operation), only try to shrink if
10459 * the TSB reach exceeds twice the virtual address
10462 if (!growing
&& (tsb_szc
< tsbinfop
->tsb_szc
) &&
10463 (tsb_szc
>= default_tsb_size
) && TSB_OK_SHRINK()) {
10464 (void) sfmmu_replace_tsb(sfmmup
, tsbinfop
,
10465 tsb_szc
, hatlockp
, TSB_SHRINK
);
10466 } else if (growing
&& tsb_szc
> tsbinfop
->tsb_szc
&&
10468 (void) sfmmu_replace_tsb(sfmmup
, tsbinfop
,
10469 tsb_szc
, hatlockp
, TSB_GROW
);
10473 sfmmu_hat_exit(hatlockp
);
/*
 * Since the sfmmu is currently embedded in the hat struct we simply zero
 * out our fields and free up the ism map blk list if any.
 */
static void
sfmmu_free_sfmmu(sfmmu_t *sfmmup)
{
	ism_blk_t	*blkp, *nx_blkp;
	ism_map_t	*map;
	int		i;

	ASSERT(sfmmup->sfmmu_ttecnt[TTE8K] == 0);
	ASSERT(sfmmup->sfmmu_ttecnt[TTE64K] == 0);
	ASSERT(sfmmup->sfmmu_ttecnt[TTE512K] == 0);
	ASSERT(sfmmup->sfmmu_ttecnt[TTE4M] == 0);
	ASSERT(sfmmup->sfmmu_ttecnt[TTE32M] == 0);
	ASSERT(sfmmup->sfmmu_ttecnt[TTE256M] == 0);
	ASSERT(SF_RGNMAP_ISNULL(sfmmup));

	sfmmup->sfmmu_free = 0;
	sfmmup->sfmmu_ismhat = 0;

	blkp = sfmmup->sfmmu_iblk;
	sfmmup->sfmmu_iblk = NULL;

	while (blkp) {
		map = blkp->iblk_maps;
		for (i = 0; i < ISM_MAP_SLOTS; i++) {
			ASSERT(map[i].imap_seg == 0);
			ASSERT(map[i].imap_ismhat == NULL);
			ASSERT(map[i].imap_ment == NULL);
		}

		nx_blkp = blkp->iblk_next;
		blkp->iblk_next = NULL;
		blkp->iblk_nextpa = (uint64_t)-1;
		kmem_cache_free(ism_blk_cache, blkp);
		blkp = nx_blkp;
	}
}
/*
 * Locking primitves accessed by HATLOCK macros
 */

#define	SFMMU_SPL_MTX	(0x0)
#define	SFMMU_ML_MTX	(0x1)

#define	SFMMU_MLSPL_MTX(type, pg)	(((type) == SFMMU_SPL_MTX) ? \
					    SPL_HASH(pg) : MLIST_HASH(pg))

kmutex_t *
sfmmu_page_enter(struct page *pp)
{
	return (sfmmu_mlspl_enter(pp, SFMMU_SPL_MTX));
}

void
sfmmu_page_exit(kmutex_t *spl)
{
	mutex_exit(spl);
}

int
sfmmu_page_spl_held(struct page *pp)
{
	return (sfmmu_mlspl_held(pp, SFMMU_SPL_MTX));
}

kmutex_t *
sfmmu_mlist_enter(struct page *pp)
{
	return (sfmmu_mlspl_enter(pp, SFMMU_ML_MTX));
}

void
sfmmu_mlist_exit(kmutex_t *mml)
{
	mutex_exit(mml);
}

int
sfmmu_mlist_held(struct page *pp)
{
	return (sfmmu_mlspl_held(pp, SFMMU_ML_MTX));
}
10569 * Common code for sfmmu_mlist_enter() and sfmmu_page_enter(). For
10570 * sfmmu_mlist_enter() case mml_table lock array is used and for
10571 * sfmmu_page_enter() sfmmu_page_lock lock array is used.
10573 * The lock is taken on a root page so that it protects an operation on all
10574 * constituent pages of a large page pp belongs to.
10576 * The routine takes a lock from the appropriate array. The lock is determined
10577 * by hashing the root page. After taking the lock this routine checks if the
10578 * root page has the same size code that was used to determine the root (i.e
10579 * that root hasn't changed). If root page has the expected p_szc field we
10580 * have the right lock and it's returned to the caller. If root's p_szc
10581 * decreased we release the lock and retry from the beginning. This case can
10582 * happen due to hat_page_demote() decreasing p_szc between our load of p_szc
10583 * value and taking the lock. The number of retries due to p_szc decrease is
10584 * limited by the maximum p_szc value. If p_szc is 0 we return the lock
10585 * determined by hashing pp itself.
10587 * If our caller doesn't hold a SE_SHARED or SE_EXCL lock on pp it's also
10588 * possible that p_szc can increase. To increase p_szc a thread has to lock
10589 * all constituent pages EXCL and do hat_pageunload() on all of them. All the
10590 * callers that don't hold a page locked recheck if hmeblk through which pp
10591 * was found still maps this pp. If it doesn't map it anymore returned lock
10592 * is immediately dropped. Therefore if sfmmu_mlspl_enter() hits the case of
10593 * p_szc increase after taking the lock it returns this lock without further
10594 * retries because in this case the caller doesn't care about which lock was
10595 * taken. The caller will drop it right away.
10597 * After the routine returns it's guaranteed that hat_page_demote() can't
10598 * change p_szc field of any of constituent pages of a large page pp belongs
10599 * to as long as pp was either locked at least SHARED prior to this call or
10600 * the caller finds that hment that pointed to this pp still references this
10601 * pp (this also assumes that the caller holds hme hash bucket lock so that
10602 * the same pp can't be remapped into the same hmeblk after it was unmapped by
10603 * hat_pageunload()).
10606 sfmmu_mlspl_enter(struct page
*pp
, int type
)
10609 uint_t prev_rszc
= UINT_MAX
;
10613 uint_t pszc
= pp
->p_szc
;
10615 ASSERT(pp
!= NULL
);
10619 mtx
= SFMMU_MLSPL_MTX(type
, pp
);
10624 /* The lock lives in the root page */
10625 rootpp
= PP_GROUPLEADER(pp
, pszc
);
10626 mtx
= SFMMU_MLSPL_MTX(type
, rootpp
);
10630 * Return mml in the following 3 cases:
10632 * 1) If pp itself is root since if its p_szc decreased before we took
10633 * the lock pp is still the root of smaller szc page. And if its p_szc
10634 * increased it doesn't matter what lock we return (see comment in
10635 * front of this routine).
10637 * 2) If pp's not root but rootpp is the root of a rootpp->p_szc size
10638 * large page we have the right lock since any previous potential
10639 * hat_page_demote() is done demoting from greater than current root's
10640 * p_szc because hat_page_demote() changes root's p_szc last. No
10641 * further hat_page_demote() can start or be in progress since it
10642 * would need the same lock we currently hold.
10644 * 3) If rootpp's p_szc increased since previous iteration it doesn't
10645 * matter what lock we return (see comment in front of this routine).
10647 if (pp
== rootpp
|| (rszc
= rootpp
->p_szc
) == pszc
||
10648 rszc
>= prev_rszc
) {
10653 * hat_page_demote() could have decreased root's p_szc.
10654 * In this case pp's p_szc must also be smaller than pszc.
10665 * pp's p_szc increased after it was decreased.
10666 * page cannot be mapped. Return current lock. The caller
10667 * will drop it right away.
10673 * root's p_szc is greater than pp's p_szc.
10674 * hat_page_demote() is not done with all pages
10675 * yet. Wait for it to complete.
10678 rootpp
= PP_GROUPLEADER(rootpp
, rszc
);
10679 mtx
= SFMMU_MLSPL_MTX(type
, rootpp
);
10687 sfmmu_mlspl_held(struct page
*pp
, int type
)
10691 ASSERT(pp
!= NULL
);
10692 /* The lock lives in the root page */
10693 pp
= PP_PAGEROOT(pp
);
10694 ASSERT(pp
!= NULL
);
10696 mtx
= SFMMU_MLSPL_MTX(type
, pp
);
10697 return (MUTEX_HELD(mtx
));
static uint_t
sfmmu_get_free_hblk(struct hme_blk **hmeblkpp, uint_t critical)
{
	struct hme_blk *hblkp;

	if (freehblkp != NULL) {
		mutex_enter(&freehblkp_lock);
		if (freehblkp != NULL) {
			/*
			 * If the current thread is owning hblk_reserve OR
			 * critical request from sfmmu_hblk_steal()
			 * let it succeed even if freehblkcnt is really low.
			 */
			if (freehblkcnt <= HBLK_RESERVE_MIN && !critical) {
				SFMMU_STAT(sf_get_free_throttle);
				mutex_exit(&freehblkp_lock);
				return (0);
			}
			freehblkcnt--;
			*hmeblkpp = freehblkp;
			hblkp = *hmeblkpp;
			freehblkp = hblkp->hblk_next;
			mutex_exit(&freehblkp_lock);
			hblkp->hblk_next = NULL;
			SFMMU_STAT(sf_get_free_success);

			ASSERT(hblkp->hblk_hmecnt == 0);
			ASSERT(hblkp->hblk_vcnt == 0);
			ASSERT(hblkp->hblk_nextpa == va_to_pa((caddr_t)hblkp));

			return (1);
		}
		mutex_exit(&freehblkp_lock);
	}

	/* Check cpu hblk pending queues */
	if ((*hmeblkpp = sfmmu_check_pending_hblks(TTE8K)) != NULL) {
		hblkp = *hmeblkpp;
		hblkp->hblk_next = NULL;
		hblkp->hblk_nextpa = va_to_pa((caddr_t)hblkp);

		ASSERT(hblkp->hblk_hmecnt == 0);
		ASSERT(hblkp->hblk_vcnt == 0);

		return (1);
	}

	SFMMU_STAT(sf_get_free_fail);
	return (0);
}
static uint_t
sfmmu_put_free_hblk(struct hme_blk *hmeblkp, uint_t critical)
{
	struct hme_blk *hblkp;

	ASSERT(hmeblkp->hblk_hmecnt == 0);
	ASSERT(hmeblkp->hblk_vcnt == 0);
	ASSERT(hmeblkp->hblk_nextpa == va_to_pa((caddr_t)hmeblkp));

	/*
	 * If the current thread is mapping into kernel space,
	 * let it succede even if freehblkcnt is max
	 * so that it will avoid freeing it to kmem.
	 * This will prevent stack overflow due to
	 * possible recursion since kmem_cache_free()
	 * might require creation of a slab which
	 * in turn needs an hmeblk to map that slab;
	 * let's break this vicious chain at the first
	 * opportunity.
	 */
	if (freehblkcnt < HBLK_RESERVE_CNT || critical) {
		mutex_enter(&freehblkp_lock);
		if (freehblkcnt < HBLK_RESERVE_CNT || critical) {
			SFMMU_STAT(sf_put_free_success);
			freehblkcnt++;
			hmeblkp->hblk_next = freehblkp;
			freehblkp = hmeblkp;
			mutex_exit(&freehblkp_lock);
			return (1);
		}
		mutex_exit(&freehblkp_lock);
	}

	/*
	 * Bring down freehblkcnt to HBLK_RESERVE_CNT. We are here
	 * only if freehblkcnt is at least HBLK_RESERVE_CNT *and*
	 * we are not in the process of mapping into kernel space.
	 */
	ASSERT(!critical);
	while (freehblkcnt > HBLK_RESERVE_CNT) {
		mutex_enter(&freehblkp_lock);
		if (freehblkcnt > HBLK_RESERVE_CNT) {
			freehblkcnt--;
			hblkp = freehblkp;
			freehblkp = hblkp->hblk_next;
			mutex_exit(&freehblkp_lock);
			ASSERT(get_hblk_cache(hblkp) == sfmmu8_cache);
			kmem_cache_free(sfmmu8_cache, hblkp);
		} else {
			mutex_exit(&freehblkp_lock);
			break;
		}
	}
	SFMMU_STAT(sf_put_free_fail);
	return (0);
}
10809 sfmmu_hblk_swap(struct hme_blk
*new)
10811 struct hme_blk
*old
, *hblkp
, *prev
;
10813 caddr_t base
, vaddr
, endaddr
;
10814 struct hmehash_bucket
*hmebp
;
10815 struct sf_hment
*osfhme
, *nsfhme
;
10819 struct hme_blk
*list
= NULL
;
10822 hmeblk_tag hblktag
;
10823 struct hme_blk
*found
;
10825 old
= HBLK_RESERVE
;
10826 ASSERT(!old
->hblk_shared
);
10829 * save pa before bcopy clobbers it
10831 newpa
= new->hblk_nextpa
;
10833 base
= (caddr_t
)get_hblk_base(old
);
10834 endaddr
= base
+ get_hblk_span(old
);
10837 * acquire hash bucket lock.
10839 hmebp
= sfmmu_tteload_acquire_hashbucket(ksfmmup
, base
, TTE8K
,
10840 SFMMU_INVALID_SHMERID
);
10843 * copy contents from old to new
10845 bcopy((void *)old
, (void *)new, HME8BLK_SZ
);
10848 * add new to hash chain
10850 sfmmu_hblk_hash_add(hmebp
, new, newpa
);
10853 * search hash chain for hblk_reserve; this needs to be performed
10854 * after adding new, otherwise prev won't correspond to the hblk which
10855 * is prior to old in hash chain when we call sfmmu_hblk_hash_rm to
10856 * remove old later.
10859 hblkp
= hmebp
->hmeblkp
; hblkp
!= NULL
&& hblkp
!= old
;
10860 prev
= hblkp
, hblkp
= hblkp
->hblk_next
)
10864 panic("sfmmu_hblk_swap: hblk_reserve not found");
10867 * p_mapping list is still pointing to hments in hblk_reserve;
10868 * fix up p_mapping list so that they point to hments in new.
10870 * Since all these mappings are created by hblk_reserve_thread
10871 * on the way and it's using at least one of the buffers from each of
10872 * the newly minted slabs, there is no danger of any of these
10873 * mappings getting unloaded by another thread.
10875 * tsbmiss could only modify ref/mod bits of hments in old/new.
10876 * Since all of these hments hold mappings established by segkmem
10877 * and mappings in segkmem are setup with HAT_NOSYNC, ref/mod bits
10878 * have no meaning for the mappings in hblk_reserve. hments in
10879 * old and new are identical except for ref/mod bits.
10881 for (vaddr
= base
; vaddr
< endaddr
; vaddr
+= TTEBYTES(TTE8K
)) {
10883 HBLKTOHME(osfhme
, old
, vaddr
);
10884 sfmmu_copytte(&osfhme
->hme_tte
, &tte
);
10886 if (TTE_IS_VALID(&tte
)) {
10887 if ((pp
= osfhme
->hme_page
) == NULL
)
10888 panic("sfmmu_hblk_swap: page not mapped");
10890 pml
= sfmmu_mlist_enter(pp
);
10892 if (pp
!= osfhme
->hme_page
)
10893 panic("sfmmu_hblk_swap: mapping changed");
10895 HBLKTOHME(nsfhme
, new, vaddr
);
10897 HME_ADD(nsfhme
, pp
);
10898 HME_SUB(osfhme
, pp
);
10900 sfmmu_mlist_exit(pml
);
10905 * remove old from hash chain
10907 sfmmu_hblk_hash_rm(hmebp
, old
, prev
, &list
, 1);
10911 hblktag
.htag_id
= ksfmmup
;
10912 hblktag
.htag_rid
= SFMMU_INVALID_SHMERID
;
10913 hblktag
.htag_bspage
= HME_HASH_BSPAGE(base
, HME_HASH_SHIFT(TTE8K
));
10914 hblktag
.htag_rehash
= HME_HASH_REHASH(TTE8K
);
10915 HME_HASH_FAST_SEARCH(hmebp
, hblktag
, found
);
10918 panic("sfmmu_hblk_swap: new hblk not found");
10921 SFMMU_HASH_UNLOCK(hmebp
);
10924 * Reset hblk_reserve
10926 bzero((void *)old
, HME8BLK_SZ
);
10927 old
->hblk_nextpa
= va_to_pa((caddr_t
)old
);
/*
 * Grab the mlist mutex for both pages passed in.
 *
 * low and high will be returned as pointers to the mutexes for these pages.
 * low refers to the mutex residing in the lower bin of the mlist hash, while
 * high refers to the mutex residing in the higher bin of the mlist hash.  This
 * is due to the locking order restrictions on the same thread grabbing
 * multiple mlist mutexes.  The low lock must be acquired before the high lock.
 *
 * If both pages hash to the same mutex, only grab that single mutex, and
 * high will be returned as NULL
 * If the pages hash to different bins in the hash, grab the lower addressed
 * lock first and then the higher addressed lock in order to follow the locking
 * rules involved with the same thread grabbing multiple mlist mutexes.
 * low and high will both have non-NULL values.
 */
static void
sfmmu_mlist_reloc_enter(struct page *targ, struct page *repl,
    kmutex_t **low, kmutex_t **high)
{
	kmutex_t	*mml_targ, *mml_repl;

	/*
	 * no need to do the dance around szc as in sfmmu_mlist_enter()
	 * because this routine is only called by hat_page_relocate() and all
	 * targ and repl pages are already locked EXCL so szc can't change.
	 */

	mml_targ = MLIST_HASH(PP_PAGEROOT(targ));
	mml_repl = MLIST_HASH(PP_PAGEROOT(repl));

	if (mml_targ == mml_repl) {
		*low = mml_targ;
		*high = NULL;
	} else {
		if (mml_targ < mml_repl) {
			*low = mml_targ;
			*high = mml_repl;
		} else {
			*low = mml_repl;
			*high = mml_targ;
		}
	}

	mutex_enter(*low);
	if (*high)
		mutex_enter(*high);
}

static void
sfmmu_mlist_reloc_exit(kmutex_t *low, kmutex_t *high)
{
	mutex_exit(low);
	if (high)
		mutex_exit(high);
}
hatlock_t *
sfmmu_hat_enter(sfmmu_t *sfmmup)
{
	hatlock_t	*hatlockp;

	if (sfmmup != ksfmmup) {
		hatlockp = TSB_HASH(sfmmup);
		mutex_enter(HATLOCK_MUTEXP(hatlockp));
		return (hatlockp);
	}
	return (NULL);
}

static hatlock_t *
sfmmu_hat_tryenter(sfmmu_t *sfmmup)
{
	hatlock_t	*hatlockp;

	if (sfmmup != ksfmmup) {
		hatlockp = TSB_HASH(sfmmup);
		if (mutex_tryenter(HATLOCK_MUTEXP(hatlockp)) == 0)
			return (NULL);
		return (hatlockp);
	}
	return (NULL);
}

void
sfmmu_hat_exit(hatlock_t *hatlockp)
{
	if (hatlockp != NULL)
		mutex_exit(HATLOCK_MUTEXP(hatlockp));
}

static void
sfmmu_hat_lock_all(void)
{
	int i;

	for (i = 0; i < SFMMU_NUM_LOCK; i++)
		mutex_enter(HATLOCK_MUTEXP(&hat_lock[i]));
}

static void
sfmmu_hat_unlock_all(void)
{
	int i;

	for (i = SFMMU_NUM_LOCK - 1; i >= 0; i--)
		mutex_exit(HATLOCK_MUTEXP(&hat_lock[i]));
}

int
sfmmu_hat_lock_held(sfmmu_t *sfmmup)
{
	ASSERT(sfmmup != ksfmmup);
	return (MUTEX_HELD(HATLOCK_MUTEXP(TSB_HASH(sfmmup))));
}
/*
 * Locking primitives to provide consistency between ISM unmap
 * and other operations.  Since ISM unmap can take a long time, we
 * use HAT_ISMBUSY flag (protected by the hatlock) to avoid creating
 * contention on the hatlock buckets while ISM segments are being
 * unmapped.  The tradeoff is that the flags don't prevent priority
 * inversion from occurring, so we must request kernel priority in
 * case we have to sleep to keep from getting buried while holding
 * the HAT_ISMBUSY flag set, which in turn could block other kernel
 * threads from running (for example, in sfmmu_uvatopfn()).
 */
static void
sfmmu_ismhat_enter(sfmmu_t *sfmmup, int hatlock_held)
{
	hatlock_t *hatlockp;

	THREAD_KPRI_REQUEST();
	if (!hatlock_held)
		hatlockp = sfmmu_hat_enter(sfmmup);
	while (SFMMU_FLAGS_ISSET(sfmmup, HAT_ISMBUSY))
		cv_wait(&sfmmup->sfmmu_tsb_cv, HATLOCK_MUTEXP(hatlockp));
	SFMMU_FLAGS_SET(sfmmup, HAT_ISMBUSY);
	if (!hatlock_held)
		sfmmu_hat_exit(hatlockp);
}

static void
sfmmu_ismhat_exit(sfmmu_t *sfmmup, int hatlock_held)
{
	hatlock_t *hatlockp;

	if (!hatlock_held)
		hatlockp = sfmmu_hat_enter(sfmmup);
	ASSERT(SFMMU_FLAGS_ISSET(sfmmup, HAT_ISMBUSY));
	SFMMU_FLAGS_CLEAR(sfmmup, HAT_ISMBUSY);
	cv_broadcast(&sfmmup->sfmmu_tsb_cv);
	if (!hatlock_held)
		sfmmu_hat_exit(hatlockp);
	THREAD_KPRI_RELEASE();
}
11089 * (1) if segkmem is not ready, allocate hblk from an array of pre-alloc'ed
11092 * (2) if we are allocating an hblk for mapping a slab in sfmmu_cache,
11094 * (a) try to return an hblk from reserve pool of free hblks;
11095 * (b) if the reserve pool is empty, acquire hblk_reserve_lock
11096 * and return hblk_reserve.
11098 * (3) call kmem_cache_alloc() to allocate hblk;
11100 * (a) if hblk_reserve_lock is held by the current thread,
11101 * atomically replace hblk_reserve by the hblk that is
11102 * returned by kmem_cache_alloc; release hblk_reserve_lock
11103 * and call kmem_cache_alloc() again.
11104 * (b) if reserve pool is not full, add the hblk that is
11105 * returned by kmem_cache_alloc to reserve pool and
11106 * call kmem_cache_alloc again.
11109 static struct hme_blk
*
11110 sfmmu_hblk_alloc(sfmmu_t
*sfmmup
, caddr_t vaddr
,
11111 struct hmehash_bucket
*hmebp
, uint_t size
, hmeblk_tag hblktag
,
11112 uint_t flags
, uint_t rid
)
11114 struct hme_blk
*hmeblkp
= NULL
;
11115 struct hme_blk
*newhblkp
;
11116 struct hme_blk
*shw_hblkp
= NULL
;
11117 struct kmem_cache
*sfmmu_cache
= NULL
;
11120 uint_t owner
; /* set to 1 if using hblk_reserve */
11126 ASSERT(SFMMU_HASH_LOCK_ISHELD(hmebp
));
11127 ASSERT(hblktag
.htag_rid
== rid
);
11128 SFMMU_VALIDATE_HMERID(sfmmup
, rid
, vaddr
, TTEBYTES(size
));
11129 ASSERT(!SFMMU_IS_SHMERID_VALID(rid
) ||
11130 IS_P2ALIGNED(vaddr
, TTEBYTES(size
)));
11133 * If segkmem is not created yet, allocate from static hmeblks
11134 * created at the end of startup_modules(). See the block comment
11135 * in startup_modules() describing how we estimate the number of
11136 * static hmeblks that will be needed during re-map.
11138 if (!hblk_alloc_dynamic
) {
11140 ASSERT(!SFMMU_IS_SHMERID_VALID(rid
));
11142 if (size
== TTE8K
) {
11143 index
= nucleus_hblk8
.index
;
11144 if (index
>= nucleus_hblk8
.len
) {
11146 * If we panic here, see startup_modules() to
11147 * make sure that we are calculating the
11148 * number of hblk8's that we need correctly.
11150 prom_panic("no nucleus hblk8 to allocate");
11153 (struct hme_blk
*)&nucleus_hblk8
.list
[index
];
11154 nucleus_hblk8
.index
++;
11155 SFMMU_STAT(sf_hblk8_nalloc
);
11157 index
= nucleus_hblk1
.index
;
11158 if (nucleus_hblk1
.index
>= nucleus_hblk1
.len
) {
11160 * If we panic here, see startup_modules().
11161 * Most likely you need to update the
11162 * calculation of the number of hblk1 elements
11163 * that the kernel needs to boot.
11165 prom_panic("no nucleus hblk1 to allocate");
11168 (struct hme_blk
*)&nucleus_hblk1
.list
[index
];
11169 nucleus_hblk1
.index
++;
11170 SFMMU_STAT(sf_hblk1_nalloc
);
11176 SFMMU_HASH_UNLOCK(hmebp
);
11178 if (sfmmup
!= KHATID
&& !SFMMU_IS_SHMERID_VALID(rid
)) {
11179 if (mmu_page_sizes
== max_mmu_page_sizes
) {
11180 if (size
< TTE256M
)
11181 shw_hblkp
= sfmmu_shadow_hcreate(sfmmup
, vaddr
,
11185 shw_hblkp
= sfmmu_shadow_hcreate(sfmmup
, vaddr
,
11188 } else if (SFMMU_IS_SHMERID_VALID(rid
)) {
11190 * Shared hmes use per region bitmaps in rgn_hmeflag
11191 * rather than shadow hmeblks to keep track of the
11192 * mapping sizes which have been allocated for the region.
11193 * Here we cleanup old invalid hmeblks with this rid,
11194 * which may be left around by pageunload().
11198 caddr_t eva
= vaddr
+ TTEBYTES(size
);
11200 ASSERT(sfmmup
!= KHATID
);
11202 srdp
= sfmmup
->sfmmu_srdp
;
11203 ASSERT(srdp
!= NULL
&& srdp
->srd_refcnt
!= 0);
11204 rgnp
= srdp
->srd_hmergnp
[rid
];
11205 ASSERT(rgnp
!= NULL
&& rgnp
->rgn_id
== rid
);
11206 ASSERT(rgnp
->rgn_refcnt
!= 0);
11207 ASSERT(size
<= rgnp
->rgn_pgszc
);
11209 ttesz
= HBLK_MIN_TTESZ
;
11211 if (!(rgnp
->rgn_hmeflags
& (0x1 << ttesz
))) {
11215 if (ttesz
> size
&& ttesz
!= HBLK_MIN_TTESZ
) {
11216 sfmmu_cleanup_rhblk(srdp
, vaddr
, rid
, ttesz
);
11217 } else if (ttesz
< size
) {
11218 for (va
= vaddr
; va
< eva
;
11219 va
+= TTEBYTES(ttesz
)) {
11220 sfmmu_cleanup_rhblk(srdp
, va
, rid
,
11224 } while (++ttesz
<= rgnp
->rgn_pgszc
);
fill_hblk:
	owner = (hblk_reserve_thread == curthread) ? 1 : 0;

	if (owner && size == TTE8K) {

		ASSERT(!SFMMU_IS_SHMERID_VALID(rid));
		/*
		 * We are really in a tight spot. We already own
		 * hblk_reserve and we need another hblk.  In anticipation
		 * of this kind of scenario, we specifically set aside
		 * HBLK_RESERVE_MIN number of hblks to be used exclusively
		 * by the owner of hblk_reserve.
		 */
		SFMMU_STAT(sf_hblk_recurse_cnt);

		if (!sfmmu_get_free_hblk(&hmeblkp, 1))
			panic("sfmmu_hblk_alloc: reserve list is empty");

		goto hblk_verify;
	}

	ASSERT(!owner);

	if ((flags & HAT_NO_KALLOC) == 0) {

		sfmmu_cache = ((size == TTE8K) ? sfmmu8_cache : sfmmu1_cache);
		sleep = ((sfmmup == KHATID) ? KM_NOSLEEP : KM_SLEEP);

		if ((hmeblkp = kmem_cache_alloc(sfmmu_cache, sleep)) == NULL) {
			hmeblkp = sfmmu_hblk_steal(size);
		} else {
			/*
			 * if we are the owner of hblk_reserve,
			 * swap hblk_reserve with hmeblkp and
			 * start a fresh life.  Hope things go
			 * better this time.
			 */
			if (hblk_reserve_thread == curthread) {
				ASSERT(sfmmu_cache == sfmmu8_cache);
				sfmmu_hblk_swap(hmeblkp);
				hblk_reserve_thread = NULL;
				mutex_exit(&hblk_reserve_lock);
				goto fill_hblk;
			}
			/*
			 * let's donate this hblk to our reserve list if
			 * we are not mapping kernel range
			 */
			if (size == TTE8K && sfmmup != KHATID) {
				if (sfmmu_put_free_hblk(hmeblkp, 0))
					goto fill_hblk;
			}
		}
	} else {
		/*
		 * We are here to map the slab in sfmmu8_cache; let's
		 * check if we could tap our reserve list; if successful,
		 * this will avoid the pain of going thru sfmmu_hblk_swap
		 */
		SFMMU_STAT(sf_hblk_slab_cnt);
		if (!sfmmu_get_free_hblk(&hmeblkp, 0)) {
			/*
			 * let's start hblk_reserve dance
			 */
			SFMMU_STAT(sf_hblk_reserve_cnt);
			owner = 1;
			mutex_enter(&hblk_reserve_lock);
			hmeblkp = HBLK_RESERVE;
			hblk_reserve_thread = curthread;
		}
	}

hblk_verify:
	ASSERT(hmeblkp != NULL);
	set_hblk_sz(hmeblkp, size);
	ASSERT(hmeblkp->hblk_nextpa == va_to_pa((caddr_t)hmeblkp));
	SFMMU_HASH_LOCK(hmebp);
	HME_HASH_FAST_SEARCH(hmebp, hblktag, newhblkp);
	if (newhblkp != NULL) {
		SFMMU_HASH_UNLOCK(hmebp);
		if (hmeblkp != HBLK_RESERVE) {
			/*
			 * This is really tricky!
			 *
			 * vmem_alloc(vmem_seg_arena)
			 *	vmem_alloc(vmem_internal_arena)
			 *		segkmem_alloc(heap_arena)
			 *			vmem_alloc(heap_arena)
			 *	kmem_cache_free()
			 *	kmem_cache_alloc()
			 *	kmem_slab_create()
			 *		vmem_alloc(kmem_internal_arena)
			 *			segkmem_alloc(heap_arena)
			 *				vmem_alloc(heap_arena)
			 *	kmem_cache_free()
			 *
			 * Thus, hat_memload() could call kmem_cache_free
			 * for enough number of times that we could easily
			 * hit the bottom of the stack or run out of reserve
			 * list of vmem_seg structs.  So, we must donate
			 * this hblk to reserve list if it's allocated
			 * from sfmmu8_cache *and* mapping kernel range.
			 * We don't need to worry about freeing hmeblk1's
			 * to kmem since they don't map any kmem slabs.
			 *
			 * Note: When segkmem supports largepages, we must
			 * free hmeblk1's to reserve list as well.
			 */
			forcefree = (sfmmup == KHATID) ? 1 : 0;
			if (size == TTE8K &&
			    sfmmu_put_free_hblk(hmeblkp, forcefree)) {
				goto re_verify;
			}
			ASSERT(sfmmup != KHATID);
			kmem_cache_free(get_hblk_cache(hmeblkp), hmeblkp);
		} else {
			/*
			 * Hey! we don't need hblk_reserve any more.
			 */
			ASSERT(owner);
			hblk_reserve_thread = NULL;
			mutex_exit(&hblk_reserve_lock);
			owner = 0;
		}
re_verify:
		/*
		 * let's check if the goodies are still present
		 */
		SFMMU_HASH_LOCK(hmebp);
		HME_HASH_FAST_SEARCH(hmebp, hblktag, newhblkp);
		if (newhblkp != NULL) {
			/*
			 * return newhblkp if it's not hblk_reserve;
			 * if newhblkp is hblk_reserve, return it
			 * _only if_ we are the owner of hblk_reserve.
			 */
			if (newhblkp != HBLK_RESERVE || owner) {
				ASSERT(!SFMMU_IS_SHMERID_VALID(rid) ||
				    newhblkp->hblk_shared);
				ASSERT(SFMMU_IS_SHMERID_VALID(rid) ||
				    !newhblkp->hblk_shared);
				return (newhblkp);
			} else {
				/*
				 * we just hit hblk_reserve in the hash and
				 * we are not the owner of that;
				 *
				 * block until hblk_reserve_thread completes
				 * swapping hblk_reserve and try the dance
				 * once again.
				 */
				SFMMU_HASH_UNLOCK(hmebp);
				mutex_enter(&hblk_reserve_lock);
				mutex_exit(&hblk_reserve_lock);
				SFMMU_STAT(sf_hblk_reserve_hit);
				goto fill_hblk;
			}
		} else {
			/*
			 * it's no more! try the dance once again.
			 */
			SFMMU_HASH_UNLOCK(hmebp);
			goto fill_hblk;
		}
	}

	if (SFMMU_IS_SHMERID_VALID(rid)) {
		uint16_t tteflag = 0x1 <<
		    ((size < HBLK_MIN_TTESZ) ? HBLK_MIN_TTESZ : size);

		if (!(rgnp->rgn_hmeflags & tteflag)) {
			atomic_or_16(&rgnp->rgn_hmeflags, tteflag);
		}
		hmeblkp->hblk_shared = 1;
	} else {
		hmeblkp->hblk_shared = 0;
	}
	set_hblk_sz(hmeblkp, size);
	ASSERT(SFMMU_HASH_LOCK_ISHELD(hmebp));
	hmeblkp->hblk_next = (struct hme_blk *)NULL;
	hmeblkp->hblk_tag = hblktag;
	hmeblkp->hblk_shadow = shw_hblkp;
	hblkpa = hmeblkp->hblk_nextpa;
	hmeblkp->hblk_nextpa = HMEBLK_ENDPA;

	ASSERT(get_hblk_ttesz(hmeblkp) == size);
	ASSERT(get_hblk_span(hmeblkp) == HMEBLK_SPAN(size));
	ASSERT(hmeblkp->hblk_hmecnt == 0);
	ASSERT(hmeblkp->hblk_vcnt == 0);
	ASSERT(hmeblkp->hblk_lckcnt == 0);
	ASSERT(hblkpa == va_to_pa((caddr_t)hmeblkp));
	sfmmu_hblk_hash_add(hmebp, hmeblkp, hblkpa);
	return (hmeblkp);
}
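/*
 * Illustrative sketch (not part of the original source): the hblk_reserve
 * handling above depends on a small pool of pre-allocated objects that the
 * allocator can tap when a normal allocation would recurse into itself.
 * The fragment below shows the general pattern with hypothetical names
 * (reserve_pool, reserve_cnt, RESERVE_MIN); a real pool would be protected
 * by a lock the way freehblkp is. It is excluded from the build.
 */
#if 0
struct obj { struct obj *next; };

static struct obj *reserve_pool;	/* LIFO list of spare objects */
static int reserve_cnt;			/* current depth of the pool */
#define	RESERVE_MIN	8		/* held back for the recursing owner */

/* Take one object; only the recursing owner may dip below RESERVE_MIN. */
static struct obj *
reserve_get(int owner)
{
	struct obj *op;

	if (reserve_pool == NULL ||
	    (!owner && reserve_cnt <= RESERVE_MIN))
		return (NULL);
	op = reserve_pool;
	reserve_pool = op->next;
	reserve_cnt--;
	return (op);
}

/* Donate an object back so a later recursive allocation cannot run dry. */
static void
reserve_put(struct obj *op)
{
	op->next = reserve_pool;
	reserve_pool = op;
	reserve_cnt++;
}
#endif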
/*
 * This function cleans up the hme_blk and returns it to the free list.
 */
static void
sfmmu_hblk_free(struct hme_blk **listp)
{
	struct hme_blk *hmeblkp, *next_hmeblkp;
	int		size;
	uint_t		critical;
	uint64_t	hblkpa;

	ASSERT(*listp != NULL);

	hmeblkp = *listp;
	while (hmeblkp != NULL) {
		next_hmeblkp = hmeblkp->hblk_next;
		ASSERT(!hmeblkp->hblk_hmecnt);
		ASSERT(!hmeblkp->hblk_vcnt);
		ASSERT(!hmeblkp->hblk_lckcnt);
		ASSERT(hmeblkp != (struct hme_blk *)hblk_reserve);
		ASSERT(hmeblkp->hblk_shared == 0);
		ASSERT(hmeblkp->hblk_shw_bit == 0);
		ASSERT(hmeblkp->hblk_shadow == NULL);

		hblkpa = va_to_pa((caddr_t)hmeblkp);
		ASSERT(hblkpa != (uint64_t)-1);
		critical = (hblktosfmmu(hmeblkp) == KHATID) ? 1 : 0;

		size = get_hblk_ttesz(hmeblkp);
		hmeblkp->hblk_next = NULL;
		hmeblkp->hblk_nextpa = hblkpa;

		if (hmeblkp->hblk_nuc_bit == 0) {

			if (size != TTE8K ||
			    !sfmmu_put_free_hblk(hmeblkp, critical))
				kmem_cache_free(get_hblk_cache(hmeblkp),
				    hmeblkp);
		}
		hmeblkp = next_hmeblkp;
	}
}
#define	BUCKETS_TO_SEARCH_BEFORE_UNLOAD	30
#define	SFMMU_HBLK_STEAL_THRESHOLD 5

static uint_t sfmmu_hblk_steal_twice;
static uint_t sfmmu_hblk_steal_count, sfmmu_hblk_steal_unload_count;

/*
 * Steal a hmeblk from user or kernel hme hash lists.
 * For 8K tte grab one from reserve pool (freehblkp) before proceeding to
 * steal and if we fail to steal after SFMMU_HBLK_STEAL_THRESHOLD attempts
 * tap into critical reserve of freehblkp.
 * Note: We remain looping in this routine until we find one.
 */
static struct hme_blk *
sfmmu_hblk_steal(int size)
{
	static struct hmehash_bucket *uhmehash_steal_hand = NULL;
	struct hmehash_bucket *hmebp;
	struct hme_blk *hmeblkp = NULL, *pr_hblk;
	uint64_t hblkpa;
	int i;
	uint_t loop_cnt = 0, critical;

	for (;;) {
		/* Check cpu hblk pending queues */
		if ((hmeblkp = sfmmu_check_pending_hblks(size)) != NULL) {
			hmeblkp->hblk_nextpa = va_to_pa((caddr_t)hmeblkp);
			ASSERT(hmeblkp->hblk_hmecnt == 0);
			ASSERT(hmeblkp->hblk_vcnt == 0);
			return (hmeblkp);
		}

		if (size == TTE8K) {
			critical =
			    (++loop_cnt > SFMMU_HBLK_STEAL_THRESHOLD) ? 1 : 0;
			if (sfmmu_get_free_hblk(&hmeblkp, critical))
				return (hmeblkp);
		}

		hmebp = (uhmehash_steal_hand == NULL) ? uhme_hash :
		    uhmehash_steal_hand;
		ASSERT(hmebp >= uhme_hash && hmebp <= &uhme_hash[UHMEHASH_SZ]);

		for (i = 0; hmeblkp == NULL && i <= UHMEHASH_SZ +
		    BUCKETS_TO_SEARCH_BEFORE_UNLOAD; i++) {
			SFMMU_HASH_LOCK(hmebp);
			hmeblkp = hmebp->hmeblkp;
			hblkpa = hmebp->hmeh_nextpa;
			pr_hblk = NULL;
			while (hmeblkp) {
				/*
				 * check if it is a hmeblk that is not locked
				 * and not shared. skip shadow hmeblks with
				 * shadow_mask set i.e valid count non zero.
				 */
				if ((get_hblk_ttesz(hmeblkp) == size) &&
				    (hmeblkp->hblk_shw_bit == 0 ||
				    hmeblkp->hblk_vcnt == 0) &&
				    (hmeblkp->hblk_lckcnt == 0)) {
					/*
					 * there is a high probability that we
					 * will find a free one. search some
					 * buckets for a free hmeblk initially
					 * before unloading a valid hmeblk.
					 */
					if ((hmeblkp->hblk_vcnt == 0 &&
					    hmeblkp->hblk_hmecnt == 0) || (i >=
					    BUCKETS_TO_SEARCH_BEFORE_UNLOAD)) {
						if (sfmmu_steal_this_hblk(hmebp,
						    hmeblkp, hblkpa, pr_hblk)) {
							break;
						}
					}
				}
				pr_hblk = hmeblkp;
				hblkpa = hmeblkp->hblk_nextpa;
				hmeblkp = hmeblkp->hblk_next;
			}

			SFMMU_HASH_UNLOCK(hmebp);
			if (hmebp++ == &uhme_hash[UHMEHASH_SZ])
				hmebp = uhme_hash;
		}
		uhmehash_steal_hand = hmebp;

		if (hmeblkp != NULL)
			break;

		/*
		 * in the worst case, look for a free one in the kernel
		 * hash table.
		 */
		for (i = 0, hmebp = khme_hash; i <= KHMEHASH_SZ; i++) {
			SFMMU_HASH_LOCK(hmebp);
			hmeblkp = hmebp->hmeblkp;
			hblkpa = hmebp->hmeh_nextpa;
			pr_hblk = NULL;
			while (hmeblkp) {
				/*
				 * check if it is free hmeblk
				 */
				if ((get_hblk_ttesz(hmeblkp) == size) &&
				    (hmeblkp->hblk_lckcnt == 0) &&
				    (hmeblkp->hblk_vcnt == 0) &&
				    (hmeblkp->hblk_hmecnt == 0)) {
					if (sfmmu_steal_this_hblk(hmebp,
					    hmeblkp, hblkpa, pr_hblk)) {
						break;
					} else {
						/*
						 * Cannot fail since we have
						 * hash lock.
						 */
						panic("fail to steal?");
					}
				}

				pr_hblk = hmeblkp;
				hblkpa = hmeblkp->hblk_nextpa;
				hmeblkp = hmeblkp->hblk_next;
			}

			SFMMU_HASH_UNLOCK(hmebp);
			if (hmebp++ == &khme_hash[KHMEHASH_SZ])
				hmebp = khme_hash;
		}

		if (hmeblkp != NULL)
			break;
		sfmmu_hblk_steal_twice++;
	}
	return (hmeblkp);
}
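/*
 * Illustrative sketch (not part of the original source): the user-hash scan
 * above resumes from a remembered "steal hand" so that successive callers
 * spread their work across the table instead of always starting at bucket 0.
 * A minimal version of that wrap-around walk with hypothetical names
 * (NBUCKETS, steal_hand, next_victim) is shown below; it is excluded from
 * the build.
 */
#if 0
#define	NBUCKETS	1024

static int steal_hand;		/* bucket where the previous scan stopped */

/* Visit every bucket once, starting where we left off last time. */
static int
next_victim(int (*is_victim)(int bucket))
{
	int i, b = steal_hand;

	for (i = 0; i < NBUCKETS; i++) {
		if (is_victim(b)) {
			steal_hand = b;	/* remember where to resume */
			return (b);
		}
		if (++b == NBUCKETS)
			b = 0;		/* wrap around, like hmebp++ above */
	}
	steal_hand = b;
	return (-1);			/* nothing found this pass */
}
#endif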
/*
 * This routine does real work to prepare a hblk to be "stolen" by
 * unloading the mappings, updating shadow counts ....
 * It returns 1 if the block is ready to be reused (stolen), or 0
 * means the block cannot be stolen yet - pageunload is still working
 * on this hblk.
 */
static int
sfmmu_steal_this_hblk(struct hmehash_bucket *hmebp, struct hme_blk *hmeblkp,
	uint64_t hblkpa, struct hme_blk *pr_hblk)
{
	int shw_size, vshift;
	struct hme_blk *shw_hblkp;
	caddr_t vaddr;
	uint_t shw_mask, newshw_mask;
	struct hme_blk *list = NULL;

	ASSERT(SFMMU_HASH_LOCK_ISHELD(hmebp));

	/*
	 * check if the hmeblk is free, unload if necessary
	 */
	if (hmeblkp->hblk_vcnt || hmeblkp->hblk_hmecnt) {
		sfmmu_t *sfmmup;
		demap_range_t dmr;

		sfmmup = hblktosfmmu(hmeblkp);
		if (hmeblkp->hblk_shared || sfmmup->sfmmu_ismhat) {
			return (0);
		}
		DEMAP_RANGE_INIT(sfmmup, &dmr);
		(void) sfmmu_hblk_unload(sfmmup, hmeblkp,
		    (caddr_t)get_hblk_base(hmeblkp),
		    get_hblk_endaddr(hmeblkp), &dmr, HAT_UNLOAD);
		DEMAP_RANGE_FLUSH(&dmr);
		if (hmeblkp->hblk_vcnt || hmeblkp->hblk_hmecnt) {
			/*
			 * Pageunload is working on the same hblk.
			 */
			return (0);
		}

		sfmmu_hblk_steal_unload_count++;
	}

	ASSERT(hmeblkp->hblk_lckcnt == 0);
	ASSERT(hmeblkp->hblk_vcnt == 0 && hmeblkp->hblk_hmecnt == 0);

	sfmmu_hblk_hash_rm(hmebp, hmeblkp, pr_hblk, &list, 1);
	hmeblkp->hblk_nextpa = hblkpa;

	shw_hblkp = hmeblkp->hblk_shadow;
	if (shw_hblkp) {
		ASSERT(!hmeblkp->hblk_shared);
		shw_size = get_hblk_ttesz(shw_hblkp);
		vaddr = (caddr_t)get_hblk_base(hmeblkp);
		vshift = vaddr_to_vshift(shw_hblkp->hblk_tag, vaddr, shw_size);
		ASSERT(vshift < 8);
		/*
		 * Atomically clear shadow mask bit
		 */
		do {
			shw_mask = shw_hblkp->hblk_shw_mask;
			ASSERT(shw_mask & (1 << vshift));
			newshw_mask = shw_mask & ~(1 << vshift);
			newshw_mask = atomic_cas_32(&shw_hblkp->hblk_shw_mask,
			    shw_mask, newshw_mask);
		} while (newshw_mask != shw_mask);
		hmeblkp->hblk_shadow = NULL;
	}

	/*
	 * remove shadow bit if we are stealing an unused shadow hmeblk.
	 * sfmmu_hblk_alloc needs it that way, will set shadow bit later if
	 * we are indeed allocating a shadow hmeblk.
	 */
	hmeblkp->hblk_shw_bit = 0;

	if (hmeblkp->hblk_shared) {
		sf_srd_t	*srdp;
		sf_region_t	*rgnp;
		uint_t		rid;

		srdp = hblktosrd(hmeblkp);
		ASSERT(srdp != NULL && srdp->srd_refcnt != 0);
		rid = hmeblkp->hblk_tag.htag_rid;
		ASSERT(SFMMU_IS_SHMERID_VALID(rid));
		ASSERT(rid < SFMMU_MAX_HME_REGIONS);
		rgnp = srdp->srd_hmergnp[rid];
		ASSERT(rgnp != NULL);
		SFMMU_VALIDATE_SHAREDHBLK(hmeblkp, srdp, rgnp, rid);
		hmeblkp->hblk_shared = 0;
	}

	sfmmu_hblk_steal_count++;
	SFMMU_STAT(sf_steal_count);

	return (1);
}
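/*
 * Illustrative sketch (not part of the original source): the shadow-mask
 * update above is a classic compare-and-swap retry loop - read the word,
 * compute a new value with one bit cleared, and retry if another CPU
 * changed the word in between. A standalone version using C11 atomics and
 * a hypothetical helper name is shown below; it is excluded from the build.
 */
#if 0
#include <stdatomic.h>
#include <stdint.h>

/* Atomically clear bit `bit` in *maskp, tolerating concurrent updates. */
static void
clear_mask_bit(_Atomic uint32_t *maskp, int bit)
{
	uint32_t old, new;

	do {
		old = atomic_load(maskp);
		new = old & ~(1u << bit);
		/* on failure, the loop re-reads and recomputes */
	} while (!atomic_compare_exchange_weak(maskp, &old, new));
}
#endif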
struct hme_blk *
sfmmu_hmetohblk(struct sf_hment *sfhme)
{
	struct hme_blk *hmeblkp;
	struct sf_hment *sfhme0;
	struct hme_blk *hblk_dummy = 0;

	/*
	 * No dummy sf_hments, please.
	 */
	ASSERT(sfhme->hme_tte.ll != 0);

	sfhme0 = sfhme - sfhme->hme_tte.tte_hmenum;
	hmeblkp = (struct hme_blk *)((uintptr_t)sfhme0 -
	    (uintptr_t)&hblk_dummy->hblk_hme[0]);

	return (hmeblkp);
}
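/*
 * Illustrative sketch (not part of the original source): sfmmu_hmetohblk()
 * recovers the enclosing hme_blk from a pointer to one of its embedded
 * sf_hment entries by subtracting the member offset - the same arithmetic a
 * generic "container_of" helper performs. A minimal standalone version with
 * hypothetical type names is shown below; it is excluded from the build.
 */
#if 0
#include <stddef.h>

struct element { int data; };

struct container {
	int		hdr;
	struct element	elems[8];	/* array embedded in the container */
};

/* Given &c->elems[n] and n, recover c. */
static struct container *
element_to_container(struct element *ep, int n)
{
	return ((struct container *)
	    ((char *)(ep - n) - offsetof(struct container, elems)));
}
#endif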
/*
 * On swapin, get appropriately sized TSB(s) and clear the HAT_SWAPPED flag.
 * If we can't get appropriately sized TSB(s), try for 8K TSB(s) using
 * KM_SLEEP allocation.
 *
 * Return 0 on success, -1 otherwise.
 */
static int
sfmmu_tsb_swapin(sfmmu_t *sfmmup, hatlock_t *hatlockp)
{
	struct tsb_info *tsbinfop, *next;
	tsb_replace_rc_t rc;
	boolean_t gotfirst = B_FALSE;

	ASSERT(sfmmup != ksfmmup);
	ASSERT(sfmmu_hat_lock_held(sfmmup));

	while (SFMMU_FLAGS_ISSET(sfmmup, HAT_SWAPIN)) {
		cv_wait(&sfmmup->sfmmu_tsb_cv, HATLOCK_MUTEXP(hatlockp));
	}

	if (SFMMU_FLAGS_ISSET(sfmmup, HAT_SWAPPED)) {
		SFMMU_FLAGS_SET(sfmmup, HAT_SWAPIN);
	} else {
		return (0);
	}

	ASSERT(sfmmup->sfmmu_tsb != NULL);

	/*
	 * Loop over all tsbinfo's replacing them with ones that actually have
	 * a TSB.  If any of the replacements ever fail, bail out of the loop.
	 */
	for (tsbinfop = sfmmup->sfmmu_tsb; tsbinfop != NULL; tsbinfop = next) {
		ASSERT(tsbinfop->tsb_flags & TSB_SWAPPED);
		next = tsbinfop->tsb_next;
		rc = sfmmu_replace_tsb(sfmmup, tsbinfop, tsbinfop->tsb_szc,
		    hatlockp, TSB_SWAPIN);
		if (rc != TSB_SUCCESS) {
			break;
		}
		gotfirst = B_TRUE;
	}

	switch (rc) {
	case TSB_SUCCESS:
		SFMMU_FLAGS_CLEAR(sfmmup, HAT_SWAPPED|HAT_SWAPIN);
		cv_broadcast(&sfmmup->sfmmu_tsb_cv);
		return (0);
	case TSB_ALLOCFAIL:
		break;
	default:
		panic("sfmmu_replace_tsb returned unrecognized failure code "
		    "%d", rc);
	}

	/*
	 * In this case, we failed to get one of our TSBs.  If we failed to
	 * get the first TSB, get one of minimum size (8KB).  Walk the list
	 * and throw away the tsbinfos, starting where the allocation failed;
	 * we can get by with just one TSB as long as we don't leave the
	 * SWAPPED tsbinfo structures lying around.
	 */
	tsbinfop = sfmmup->sfmmu_tsb;
	next = tsbinfop->tsb_next;
	tsbinfop->tsb_next = NULL;

	sfmmu_hat_exit(hatlockp);
	for (tsbinfop = next; tsbinfop != NULL; tsbinfop = next) {
		next = tsbinfop->tsb_next;
		sfmmu_tsbinfo_free(tsbinfop);
	}
	hatlockp = sfmmu_hat_enter(sfmmup);

	/*
	 * If we don't have any TSBs, get a single 8K TSB for 8K, 64K and 512K
	 * pages.
	 */
	if (!gotfirst) {
		tsbinfop = sfmmup->sfmmu_tsb;
		rc = sfmmu_replace_tsb(sfmmup, tsbinfop, TSB_MIN_SZCODE,
		    hatlockp, TSB_SWAPIN | TSB_FORCEALLOC);
		ASSERT(rc == TSB_SUCCESS);
	}

	SFMMU_FLAGS_CLEAR(sfmmup, HAT_SWAPPED|HAT_SWAPIN);
	cv_broadcast(&sfmmup->sfmmu_tsb_cv);

	return (0);
}
static int
sfmmu_is_rgnva(sf_srd_t *srdp, caddr_t addr, ulong_t w, ulong_t bmw)
{
	uint_t rid;
	sf_region_t *rgnp;

	ASSERT(srdp != NULL);
	ASSERT(srdp->srd_refcnt != 0);

	w <<= BT_ULSHIFT;
	while (bmw) {
		if (!(bmw & 0x1)) {
			bmw >>= 1;
			w++;
			continue;
		}
		rid = w;
		rgnp = srdp->srd_hmergnp[rid];
		ASSERT(rgnp->rgn_refcnt > 0);
		ASSERT(rgnp->rgn_id == rid);
		if (addr < rgnp->rgn_saddr ||
		    addr >= (rgnp->rgn_saddr + rgnp->rgn_size)) {
			bmw >>= 1;
			w++;
		} else {
			return (1);
		}
	}
	return (0);
}
/*
 * Handle exceptions for low level tsb_handler.
 *
 * There are many scenarios that could land us here:
 *
 * If the context is invalid we land here. The context can be invalid
 * for 3 reasons: 1) we couldn't allocate a new context and now need to
 * perform a wrap around operation in order to allocate a new context.
 * 2) Context was invalidated to change pagesize programming. 3) ISMs or
 * TSBs configuration is changing for this process and we are forced into
 * here to do a synchronization operation. If the context is valid we can
 * be here from window trap handler. In this case just call trap to handle
 * the fault.
 *
 * Note that the process will run in INVALID_CONTEXT before
 * faulting into here and subsequently loading the MMU registers
 * (including the TSB base register) associated with this process.
 * For this reason, the trap handlers must all test for
 * INVALID_CONTEXT before attempting to access any registers other
 * than the context registers.
 */
void
sfmmu_tsbmiss_exception(struct regs *rp, uintptr_t tagaccess, uint_t traptype)
{
	sfmmu_t *sfmmup, *shsfmmup;
	uint_t ctxtype;
	klwp_id_t lwp;
	char lwp_save_state;
	hatlock_t *hatlockp, *shatlockp;
	struct tsb_info *tsbinfop;
	struct tsbmiss *tsbmp;
	sf_scd_t *scdp;

	SFMMU_STAT(sf_tsb_exceptions);
	SFMMU_MMU_STAT(mmu_tsb_exceptions);
	sfmmup = astosfmmu(curthread->t_procp->p_as);
	/*
	 * note that in sun4u, tagaccess register contains ctxnum
	 * while sun4v passes ctxtype in the tagaccess register.
	 */
	ctxtype = tagaccess & TAGACC_CTX_MASK;

	ASSERT(sfmmup != ksfmmup && ctxtype != KCONTEXT);
	ASSERT(sfmmup->sfmmu_ismhat == 0);
	ASSERT(!SFMMU_FLAGS_ISSET(sfmmup, HAT_SWAPPED) ||
	    ctxtype == INVALID_CONTEXT);

	if (ctxtype != INVALID_CONTEXT && traptype != T_DATA_PROT) {
		/*
		 * We may land here because shme bitmap and pagesize
		 * flags are updated lazily in tsbmiss area on other cpus.
		 * If we detect here that tsbmiss area is out of sync with
		 * sfmmu update it and retry the trapped instruction.
		 * Otherwise call trap().
		 */
		int ret = 0;
		uchar_t tteflag_mask = (1 << TTE64K) | (1 << TTE8K);
		caddr_t addr = (caddr_t)(tagaccess & TAGACC_VADDR_MASK);

		/*
		 * Must set lwp state to LWP_SYS before
		 * trying to acquire any adaptive lock
		 */
		lwp = ttolwp(curthread);
		ASSERT(lwp);
		lwp_save_state = lwp->lwp_state;
		lwp->lwp_state = LWP_SYS;

		hatlockp = sfmmu_hat_enter(sfmmup);
		kpreempt_disable();
		tsbmp = &tsbmiss_area[CPU->cpu_id];
		ASSERT(sfmmup == tsbmp->usfmmup);
		if (((tsbmp->uhat_tteflags ^ sfmmup->sfmmu_tteflags) &
		    ~tteflag_mask) ||
		    ((tsbmp->uhat_rtteflags ^ sfmmup->sfmmu_rtteflags) &
		    ~tteflag_mask)) {
			tsbmp->uhat_tteflags = sfmmup->sfmmu_tteflags;
			tsbmp->uhat_rtteflags = sfmmup->sfmmu_rtteflags;
			ret = 1;
		}
		if (sfmmup->sfmmu_srdp != NULL) {
			ulong_t *sm = sfmmup->sfmmu_hmeregion_map.bitmap;
			ulong_t *tm = tsbmp->shmermap;
			ulong_t i;

			for (i = 0; i < SFMMU_HMERGNMAP_WORDS; i++) {
				ulong_t d = tm[i] ^ sm[i];
				if (d) {
					if (d & sm[i]) {
						if (!ret && sfmmu_is_rgnva(
						    sfmmup->sfmmu_srdp,
						    addr, i, d & sm[i])) {
							ret = 1;
						}
					}
					tm[i] = sm[i];
				}
			}
		}
		kpreempt_enable();
		sfmmu_hat_exit(hatlockp);
		lwp->lwp_state = lwp_save_state;
		if (ret)
			return;
	} else if (ctxtype == INVALID_CONTEXT) {
		/*
		 * First, make sure we come out of here with a valid ctx,
		 * since if we don't get one we'll simply loop on the
		 * faulting instruction.
		 *
		 * If the ISM mappings are changing, the TSB is relocated,
		 * the process is swapped, the process is joining SCD or
		 * leaving SCD or shared regions we serialize behind the
		 * controlling thread with hat lock, sfmmu_flags and
		 * sfmmu_tsb_cv condition variable.
		 */

		/*
		 * Must set lwp state to LWP_SYS before
		 * trying to acquire any adaptive lock
		 */
		lwp = ttolwp(curthread);
		ASSERT(lwp);
		lwp_save_state = lwp->lwp_state;
		lwp->lwp_state = LWP_SYS;

		hatlockp = sfmmu_hat_enter(sfmmup);
retry:
		if ((scdp = sfmmup->sfmmu_scdp) != NULL) {
			shsfmmup = scdp->scd_sfmmup;
			ASSERT(shsfmmup != NULL);

			for (tsbinfop = shsfmmup->sfmmu_tsb; tsbinfop != NULL;
			    tsbinfop = tsbinfop->tsb_next) {
				if (tsbinfop->tsb_flags & TSB_RELOC_FLAG) {
					/* drop the private hat lock */
					sfmmu_hat_exit(hatlockp);
					/* acquire the shared hat lock */
					shatlockp = sfmmu_hat_enter(shsfmmup);
					/*
					 * recheck to see if anything changed
					 * after we drop the private hat lock.
					 */
					if (sfmmup->sfmmu_scdp == scdp &&
					    shsfmmup == scdp->scd_sfmmup) {
						sfmmu_tsb_chk_reloc(shsfmmup,
						    shatlockp);
					}
					sfmmu_hat_exit(shatlockp);
					hatlockp = sfmmu_hat_enter(sfmmup);
					goto retry;
				}
			}
		}

		for (tsbinfop = sfmmup->sfmmu_tsb; tsbinfop != NULL;
		    tsbinfop = tsbinfop->tsb_next) {
			if (tsbinfop->tsb_flags & TSB_RELOC_FLAG) {
				cv_wait(&sfmmup->sfmmu_tsb_cv,
				    HATLOCK_MUTEXP(hatlockp));
				goto retry;
			}
		}

		/*
		 * Wait for ISM maps to be updated.
		 */
		if (SFMMU_FLAGS_ISSET(sfmmup, HAT_ISMBUSY)) {
			cv_wait(&sfmmup->sfmmu_tsb_cv,
			    HATLOCK_MUTEXP(hatlockp));
			goto retry;
		}

		/* Is this process joining an SCD? */
		if (SFMMU_FLAGS_ISSET(sfmmup, HAT_JOIN_SCD)) {
			/*
			 * Flush private TSB and setup shared TSB.
			 * sfmmu_finish_join_scd() does not drop the
			 * hat lock.
			 */
			sfmmu_finish_join_scd(sfmmup);
			SFMMU_FLAGS_CLEAR(sfmmup, HAT_JOIN_SCD);
		}

		/*
		 * If we're swapping in, get TSB(s).  Note that we must do
		 * this before we get a ctx or load the MMU state.  Once
		 * we swap in we have to recheck to make sure the TSB(s) and
		 * ISM mappings didn't change while we slept.
		 */
		if (SFMMU_FLAGS_ISSET(sfmmup, HAT_SWAPPED)) {
			sfmmu_tsb_swapin(sfmmup, hatlockp);
			goto retry;
		}

		sfmmu_get_ctx(sfmmup);

		sfmmu_hat_exit(hatlockp);
		/*
		 * Must restore lwp_state if not calling
		 * trap() for further processing. Restore
		 * it anyway.
		 */
		lwp->lwp_state = lwp_save_state;
		return;
	}
	trap(rp, (caddr_t)tagaccess, traptype, 0);
}
static void
sfmmu_tsb_chk_reloc(sfmmu_t *sfmmup, hatlock_t *hatlockp)
{
	struct tsb_info *tp;

	ASSERT(sfmmu_hat_lock_held(sfmmup));

	for (tp = sfmmup->sfmmu_tsb; tp != NULL; tp = tp->tsb_next) {
		if (tp->tsb_flags & TSB_RELOC_FLAG) {
			cv_wait(&sfmmup->sfmmu_tsb_cv,
			    HATLOCK_MUTEXP(hatlockp));
		}
	}
}
/*
 * sfmmu_vatopfn_suspended is called from GET_TTE when TL=0 and
 * TTE_SUSPENDED bit set in tte. We block on acquiring a page lock
 * rather than spinning to avoid send mondo timeouts with
 * interrupts enabled. When the lock is acquired it is immediately
 * released and we return back to sfmmu_vatopfn just after
 * the GET_TTE call.
 */
void
sfmmu_vatopfn_suspended(caddr_t vaddr, sfmmu_t *sfmmu, tte_t *ttep)
{
	struct page **pp;

	(void) as_pagelock(sfmmu->sfmmu_as, &pp, vaddr, TTE_CSZ(ttep), S_WRITE);
	as_pageunlock(sfmmu->sfmmu_as, pp, vaddr, TTE_CSZ(ttep), S_WRITE);
}
/*
 * sfmmu_tsbmiss_suspended is called from GET_TTE when TL>0 and
 * TTE_SUSPENDED bit set in tte. We do this so that we can handle
 * cross traps which cannot be handled while spinning in the
 * trap handlers. Simply enter and exit the kpr_suspendlock spin
 * mutex, which is held by the holder of the suspend bit, and then
 * retry the trapped instruction after unwinding.
 */
/*ARGSUSED*/
void
sfmmu_tsbmiss_suspended(struct regs *rp, uintptr_t tagacc, uint_t traptype)
{
	ASSERT(curthread != kreloc_thread);
	mutex_enter(&kpr_suspendlock);
	mutex_exit(&kpr_suspendlock);
}
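/*
 * Illustrative sketch (not part of the original source): entering and
 * immediately exiting kpr_suspendlock above is used purely as a barrier -
 * the caller blocks until whoever set the suspend bit (and holds the lock)
 * finishes, then retries the faulting instruction. The same idiom with a
 * POSIX mutex and hypothetical names is shown below; it is excluded from
 * the build.
 */
#if 0
#include <pthread.h>

static pthread_mutex_t suspend_lock = PTHREAD_MUTEX_INITIALIZER;

/* Wait for the current holder of suspend_lock to finish; nothing more. */
static void
wait_for_unsuspend(void)
{
	(void) pthread_mutex_lock(&suspend_lock);
	(void) pthread_mutex_unlock(&suspend_lock);
}
#endif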
/*
 * This routine could be optimized to reduce the number of xcalls by flushing
 * the entire TLBs if region reference count is above some threshold but the
 * tradeoff will depend on the size of the TLB. So for now flush the specific
 * page a context at a time.
 *
 * If uselocks is 0 then it's called after all cpus were captured and all the
 * hat locks were taken. In this case don't take the region lock by relying on
 * the order of list region update operations in hat_join_region(),
 * hat_leave_region() and hat_dup_region(). The ordering in those routines
 * guarantees that the list is always forward walkable and reaches active
 * sfmmus regardless of where xc_attention() captures a cpu.
 */
static cpuset_t
sfmmu_rgntlb_demap(caddr_t addr, sf_region_t *rgnp,
    struct hme_blk *hmeblkp, int uselocks)
{
	sfmmu_t	*sfmmup;
	cpuset_t cpuset;
	cpuset_t rcpuset;
	hatlock_t *hatlockp;
	uint_t rid = rgnp->rgn_id;
	sf_rgn_link_t *rlink;
	sf_scd_t *scdp;

	ASSERT(hmeblkp->hblk_shared);
	ASSERT(SFMMU_IS_SHMERID_VALID(rid));
	ASSERT(rid < SFMMU_MAX_HME_REGIONS);

	CPUSET_ZERO(rcpuset);
	if (uselocks) {
		mutex_enter(&rgnp->rgn_mutex);
	}
	sfmmup = rgnp->rgn_sfmmu_head;
	while (sfmmup != NULL) {
		if (uselocks) {
			hatlockp = sfmmu_hat_enter(sfmmup);
		}

		/*
		 * When an SCD is created the SCD hat is linked on the sfmmu
		 * region lists for each hme region which is part of the
		 * SCD. If we find an SCD hat, when walking these lists,
		 * then we flush the shared TSBs, if we find a private hat,
		 * which is part of an SCD, but where the region
		 * is not part of the SCD then we flush the private TSBs.
		 */
		if (!sfmmup->sfmmu_scdhat && sfmmup->sfmmu_scdp != NULL &&
		    !SFMMU_FLAGS_ISSET(sfmmup, HAT_JOIN_SCD)) {
			scdp = sfmmup->sfmmu_scdp;
			if (SF_RGNMAP_TEST(scdp->scd_hmeregion_map, rid)) {
				if (uselocks) {
					sfmmu_hat_exit(hatlockp);
				}
				goto next;
			}
		}

		SFMMU_UNLOAD_TSB(addr, sfmmup, hmeblkp, 0);

		kpreempt_disable();
		cpuset = sfmmup->sfmmu_cpusran;
		CPUSET_AND(cpuset, cpu_ready_set);
		CPUSET_DEL(cpuset, CPU->cpu_id);
		SFMMU_XCALL_STATS(sfmmup);
		xt_some(cpuset, vtag_flushpage_tl1,
		    (uint64_t)addr, (uint64_t)sfmmup);
		vtag_flushpage(addr, (uint64_t)sfmmup);
		if (uselocks) {
			sfmmu_hat_exit(hatlockp);
		}
		kpreempt_enable();
		CPUSET_OR(rcpuset, cpuset);

next:
		/* LINTED: constant in conditional context */
		SFMMU_HMERID2RLINKP(sfmmup, rid, rlink, 0, 0);
		ASSERT(rlink != NULL);
		sfmmup = rlink->next;
	}
	if (uselocks) {
		mutex_exit(&rgnp->rgn_mutex);
	}
	return (rcpuset);
}
/*
 * This routine takes an sfmmu pointer and the va for an address in an
 * ISM region as input and returns the corresponding region id in ism_rid.
 * The return value of 1 indicates that a region has been found and ism_rid
 * is valid, otherwise 0 is returned.
 */
static int
find_ism_rid(sfmmu_t *sfmmup, sfmmu_t *ism_sfmmup, caddr_t va, uint_t *ism_rid)
{
	ism_blk_t	*ism_blkp;
	int		i;
	ism_map_t	*ism_map;
	struct hat	*ism_hatid;

	ASSERT(sfmmu_hat_lock_held(sfmmup));

	ism_blkp = sfmmup->sfmmu_iblk;
	while (ism_blkp != NULL) {
		ism_map = ism_blkp->iblk_maps;
		for (i = 0; i < ISM_MAP_SLOTS && ism_map[i].imap_ismhat; i++) {
			if ((va >= ism_start(ism_map[i])) &&
			    (va < ism_end(ism_map[i]))) {

				*ism_rid = ism_map[i].imap_rid;

				ism_hatid = ism_map[i].imap_ismhat;
				ASSERT(ism_hatid == ism_sfmmup);
				ASSERT(ism_hatid->sfmmu_ismhat);

				return (1);
			}
		}
		ism_blkp = ism_blkp->iblk_next;
	}

	return (0);
}
/*
 * Special routine to flush out ism mappings - TSBs, TLBs and D-caches.
 * This routine may be called with all cpu's captured. Therefore, the
 * caller is responsible for holding all locks and disabling kernel
 * preemption.
 */
static void
sfmmu_ismtlbcache_demap(caddr_t addr, sfmmu_t *ism_sfmmup,
	struct hme_blk *hmeblkp, pfn_t pfnum, int cache_flush_flag)
{
	cpuset_t	cpuset;
	caddr_t		va;
	ism_ment_t	*ment;
	sfmmu_t		*sfmmup;
	int		vcolor;
	sf_scd_t	*scdp;
	uint_t		ism_rid;

	ASSERT(!hmeblkp->hblk_shared);
	/*
	 * Walk the ism_hat's mapping list and flush the page
	 * from every hat sharing this ism_hat. This routine
	 * may be called while all cpu's have been captured.
	 * Therefore we can't attempt to grab any locks. For now
	 * this means we will protect the ism mapping list under
	 * a single lock which will be grabbed by the caller.
	 * If hat_share/unshare scalability becomes a performance
	 * problem then we may need to re-think ism mapping list locking.
	 */
	ASSERT(ism_sfmmup->sfmmu_ismhat);
	ASSERT(MUTEX_HELD(&ism_mlist_lock));
	addr = addr - ISMID_STARTADDR;

	for (ment = ism_sfmmup->sfmmu_iment; ment; ment = ment->iment_next) {

		sfmmup = ment->iment_hat;

		va = ment->iment_base_va;
		va = (caddr_t)((uintptr_t)va + (uintptr_t)addr);

		/*
		 * When an SCD is created the SCD hat is linked on the ism
		 * mapping lists for each ISM segment which is part of the
		 * SCD. If we find an SCD hat, when walking these lists,
		 * then we flush the shared TSBs, if we find a private hat,
		 * which is part of an SCD, but where the region
		 * corresponding to this va is not part of the SCD then we
		 * flush the private TSBs.
		 */
		if (!sfmmup->sfmmu_scdhat && sfmmup->sfmmu_scdp != NULL &&
		    !SFMMU_FLAGS_ISSET(sfmmup, HAT_JOIN_SCD) &&
		    !SFMMU_FLAGS_ISSET(sfmmup, HAT_ISMBUSY)) {
			if (!find_ism_rid(sfmmup, ism_sfmmup, va,
			    &ism_rid)) {
				cmn_err(CE_PANIC,
				    "can't find matching ISM rid!");
			}

			scdp = sfmmup->sfmmu_scdp;
			if (SFMMU_IS_ISMRID_VALID(ism_rid) &&
			    SF_RGNMAP_TEST(scdp->scd_ismregion_map,
			    ism_rid)) {
				continue;
			}
		}
		SFMMU_UNLOAD_TSB(va, sfmmup, hmeblkp, 1);

		cpuset = sfmmup->sfmmu_cpusran;
		CPUSET_AND(cpuset, cpu_ready_set);
		CPUSET_DEL(cpuset, CPU->cpu_id);
		SFMMU_XCALL_STATS(sfmmup);
		xt_some(cpuset, vtag_flushpage_tl1, (uint64_t)va,
		    (uint64_t)sfmmup);
		vtag_flushpage(va, (uint64_t)sfmmup);

		/*
		 * Flush D$
		 * When flushing D$ we must flush all
		 * cpu's. See sfmmu_cache_flush().
		 */
		if (cache_flush_flag == CACHE_FLUSH) {
			cpuset = cpu_ready_set;
			CPUSET_DEL(cpuset, CPU->cpu_id);

			SFMMU_XCALL_STATS(sfmmup);
			vcolor = addr_to_vcolor(va);
			xt_some(cpuset, vac_flushpage_tl1, pfnum, vcolor);
			vac_flushpage(pfnum, vcolor);
		}
	}
}
/*
 * Demaps the TSB, CPU caches, and flushes all TLBs on all CPUs of
 * a particular virtual address and ctx.  If noflush is set we do not
 * flush the TLB/TSB.  This function may or may not be called with the
 * HAT lock held.
 */
static void
sfmmu_tlbcache_demap(caddr_t addr, sfmmu_t *sfmmup, struct hme_blk *hmeblkp,
	pfn_t pfnum, int tlb_noflush, int cpu_flag, int cache_flush_flag,
	int hat_lock_held)
{
	int vcolor;
	cpuset_t cpuset;
	hatlock_t *hatlockp;

	ASSERT(!hmeblkp->hblk_shared);

#if defined(lint) && !defined(VAC)
	pfnum = pfnum;
	cpu_flag = cpu_flag;
	cache_flush_flag = cache_flush_flag;
#endif

	/*
	 * There is no longer a need to protect against ctx being
	 * stolen here since we don't store the ctx in the TSB anymore.
	 */
	vcolor = addr_to_vcolor(addr);

	/*
	 * We must hold the hat lock during the flush of TLB,
	 * to avoid a race with sfmmu_invalidate_ctx(), where
	 * sfmmu_cnum on a MMU could be set to INVALID_CONTEXT,
	 * causing TLB demap routine to skip flush on that MMU.
	 * If the context on a MMU has already been set to
	 * INVALID_CONTEXT, we just get an extra flush on
	 * that MMU.
	 */
	if (!hat_lock_held && !tlb_noflush)
		hatlockp = sfmmu_hat_enter(sfmmup);

	kpreempt_disable();
	if (!tlb_noflush) {
		/*
		 * Flush the TSB and TLB.
		 */
		SFMMU_UNLOAD_TSB(addr, sfmmup, hmeblkp, 0);

		cpuset = sfmmup->sfmmu_cpusran;
		CPUSET_AND(cpuset, cpu_ready_set);
		CPUSET_DEL(cpuset, CPU->cpu_id);

		SFMMU_XCALL_STATS(sfmmup);

		xt_some(cpuset, vtag_flushpage_tl1, (uint64_t)addr,
		    (uint64_t)sfmmup);

		vtag_flushpage(addr, (uint64_t)sfmmup);
	}

	if (!hat_lock_held && !tlb_noflush)
		sfmmu_hat_exit(hatlockp);

	/*
	 * Flush the D$
	 *
	 * Even if the ctx is stolen, we need to flush the
	 * cache. Our ctx stealer only flushes the TLBs.
	 */
	if (cache_flush_flag == CACHE_FLUSH) {
		if (cpu_flag & FLUSH_ALL_CPUS) {
			cpuset = cpu_ready_set;
		} else {
			cpuset = sfmmup->sfmmu_cpusran;
			CPUSET_AND(cpuset, cpu_ready_set);
		}
		CPUSET_DEL(cpuset, CPU->cpu_id);
		SFMMU_XCALL_STATS(sfmmup);
		xt_some(cpuset, vac_flushpage_tl1, pfnum, vcolor);
		vac_flushpage(pfnum, vcolor);
	}
	kpreempt_enable();
}
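/*
 * Illustrative sketch (not part of the original source): the demap routines
 * in this file follow one pattern - compute the set of CPUs that may cache
 * the translation, drop the local CPU from it, cross-call the remainder,
 * and perform the same flush locally by direct call. A schematic version
 * with hypothetical names (cpu_mask_t, cpu_online_mask, this_cpu,
 * send_crosscall) is shown below; it is excluded from the build.
 */
#if 0
typedef unsigned long cpu_mask_t;	/* one bit per CPU */

extern cpu_mask_t cpu_online_mask;
extern int this_cpu(void);
extern void send_crosscall(cpu_mask_t targets, void (*fn)(void *), void *arg);

static void
demap_everywhere(void (*flush_local)(void *), void *arg, cpu_mask_t users)
{
	cpu_mask_t targets;

	targets = users & cpu_online_mask;		/* only ready CPUs */
	targets &= ~(1UL << this_cpu());		/* skip ourselves */
	send_crosscall(targets, flush_local, arg);	/* remote flushes */
	flush_local(arg);				/* local flush, no xcall */
}
#endif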
/*
 * Demaps the TSB and flushes all TLBs on all cpus for a particular virtual
 * address and ctx.  If noflush is set we do not currently do anything.
 * This function may or may not be called with the HAT lock held.
 */
static void
sfmmu_tlb_demap(caddr_t addr, sfmmu_t *sfmmup, struct hme_blk *hmeblkp,
	int tlb_noflush, int hat_lock_held)
{
	cpuset_t cpuset;
	hatlock_t *hatlockp;

	ASSERT(!hmeblkp->hblk_shared);

	/*
	 * If the process is exiting we have nothing to do.
	 */
	if (tlb_noflush)
		return;

	/*
	 * Flush TSB.
	 */
	if (!hat_lock_held)
		hatlockp = sfmmu_hat_enter(sfmmup);
	SFMMU_UNLOAD_TSB(addr, sfmmup, hmeblkp, 0);

	kpreempt_disable();

	cpuset = sfmmup->sfmmu_cpusran;
	CPUSET_AND(cpuset, cpu_ready_set);
	CPUSET_DEL(cpuset, CPU->cpu_id);

	SFMMU_XCALL_STATS(sfmmup);
	xt_some(cpuset, vtag_flushpage_tl1, (uint64_t)addr, (uint64_t)sfmmup);

	vtag_flushpage(addr, (uint64_t)sfmmup);

	if (!hat_lock_held)
		sfmmu_hat_exit(hatlockp);

	kpreempt_enable();
}
/*
 * Special case of sfmmu_tlb_demap for MMU_PAGESIZE hblks. Use the xcall
 * call handler that can flush a range of pages to save on xcalls.
 */
static int sfmmu_xcall_save;

/*
 * this routine is never used for demaping addresses backed by SRD hmeblks.
 */
static void
sfmmu_tlb_range_demap(demap_range_t *dmrp)
{
	sfmmu_t *sfmmup = dmrp->dmr_sfmmup;
	hatlock_t *hatlockp;
	cpuset_t cpuset;
	uint64_t sfmmu_pgcnt;
	pgcnt_t pgcnt = 0;
	int pgunload = 0;
	int dirtypg = 0;
	caddr_t addr = dmrp->dmr_addr;
	caddr_t eaddr;
	uint64_t bitvec = dmrp->dmr_bitvec;

	ASSERT(bitvec & 1);

	/*
	 * Flush TSB and calculate number of pages to flush.
	 */
	while (bitvec != 0) {
		dirtypg = 0;
		/*
		 * Find the first page to flush and then count how many
		 * pages there are after it that also need to be flushed.
		 * This way the number of TSB flushes is minimized.
		 */
		while ((bitvec & 1) == 0) {
			pgcnt++;
			addr += MMU_PAGESIZE;
			bitvec >>= 1;
		}
		while (bitvec & 1) {
			dirtypg++;
			bitvec >>= 1;
		}
		eaddr = addr + ptob(dirtypg);
		hatlockp = sfmmu_hat_enter(sfmmup);
		sfmmu_unload_tsb_range(sfmmup, addr, eaddr, TTE8K);
		sfmmu_hat_exit(hatlockp);
		pgunload += dirtypg;
		addr += ptob(dirtypg);
		pgcnt += dirtypg;
	}

	ASSERT((pgcnt << MMU_PAGESHIFT) <= dmrp->dmr_endaddr - dmrp->dmr_addr);
	if (sfmmup->sfmmu_free == 0) {
		addr = dmrp->dmr_addr;
		bitvec = dmrp->dmr_bitvec;

		/*
		 * make sure it has SFMMU_PGCNT_SHIFT bits only,
		 * as it will be used to pack argument for xt_some
		 */
		ASSERT((pgcnt > 0) &&
		    (pgcnt <= (1 << SFMMU_PGCNT_SHIFT)));

		/*
		 * Encode pgcnt as (pgcnt - 1), and pass (pgcnt - 1) in
		 * the low 6 bits of sfmmup. This is doable since pgcnt
		 * always >= 1.
		 */
		ASSERT(!((uint64_t)sfmmup & SFMMU_PGCNT_MASK));
		sfmmu_pgcnt = (uint64_t)sfmmup |
		    ((pgcnt - 1) & SFMMU_PGCNT_MASK);

		/*
		 * We must hold the hat lock during the flush of TLB,
		 * to avoid a race with sfmmu_invalidate_ctx(), where
		 * sfmmu_cnum on a MMU could be set to INVALID_CONTEXT,
		 * causing TLB demap routine to skip flush on that MMU.
		 * If the context on a MMU has already been set to
		 * INVALID_CONTEXT, we just get an extra flush on
		 * that MMU.
		 */
		hatlockp = sfmmu_hat_enter(sfmmup);
		kpreempt_disable();

		cpuset = sfmmup->sfmmu_cpusran;
		CPUSET_AND(cpuset, cpu_ready_set);
		CPUSET_DEL(cpuset, CPU->cpu_id);

		SFMMU_XCALL_STATS(sfmmup);
		xt_some(cpuset, vtag_flush_pgcnt_tl1, (uint64_t)addr,
		    sfmmu_pgcnt);

		for (; bitvec != 0; bitvec >>= 1) {
			if (bitvec & 1)
				vtag_flushpage(addr, (uint64_t)sfmmup);
			addr += MMU_PAGESIZE;
		}

		kpreempt_enable();
		sfmmu_hat_exit(hatlockp);

		sfmmu_xcall_save += (pgunload - 1);
	}
	dmrp->dmr_bitvec = 0;
}
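/*
 * Illustrative sketch (not part of the original source): the routine above
 * packs (pgcnt - 1) into the low bits of the sfmmu pointer because the
 * structure is aligned strongly enough that those bits are known to be
 * zero. A standalone encode/decode pair with hypothetical names is shown
 * below; it is excluded from the build.
 */
#if 0
#include <stdint.h>

#define	CNT_BITS	6
#define	CNT_MASK	((1u << CNT_BITS) - 1)	/* low 6 bits carry the count */

/* ptr must be aligned to at least (1 << CNT_BITS) bytes; cnt is 1..64. */
static uint64_t
pack_ptr_cnt(void *ptr, unsigned int cnt)
{
	return ((uint64_t)(uintptr_t)ptr | ((cnt - 1) & CNT_MASK));
}

static void *
unpack_ptr(uint64_t word)
{
	return ((void *)(uintptr_t)(word & ~(uint64_t)CNT_MASK));
}

static unsigned int
unpack_cnt(uint64_t word)
{
	return ((unsigned int)(word & CNT_MASK) + 1);
}
#endif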
/*
 * In cases where we need to synchronize with TLB/TSB miss trap
 * handlers, _and_ need to flush the TLB, it's a lot easier to
 * throw away the context from the process than to do a
 * special song and dance to keep things consistent for the
 * handlers.
 *
 * Since the process suddenly ends up without a context and our caller
 * holds the hat lock, threads that fault after this function is called
 * will pile up on the lock. We can then do whatever we need to
 * atomically from the context of the caller. The first blocked thread
 * to resume executing will get the process a new context, and the
 * process will resume executing.
 *
 * One added advantage of this approach is that on MMUs that
 * support a "flush all" operation, we will delay the flush until
 * cnum wrap-around, and then flush the TLB one time.  This
 * is rather rare, so it's a lot less expensive than making 8000
 * x-calls to flush the TLB 8000 times.
 *
 * A per-process (PP) lock is used to synchronize ctx allocations in
 * resume() and ctx invalidations here.
 */
static void
sfmmu_invalidate_ctx(sfmmu_t *sfmmup)
{
	cpuset_t cpuset;
	int cnum, currcnum;
	mmu_ctx_t *mmu_ctxp;
	int i;
	uint_t pstate_save;

	SFMMU_STAT(sf_ctx_inv);

	ASSERT(sfmmu_hat_lock_held(sfmmup));
	ASSERT(sfmmup != ksfmmup);

	kpreempt_disable();

	mmu_ctxp = CPU_MMU_CTXP(CPU);
	ASSERT(mmu_ctxp);
	ASSERT(mmu_ctxp->mmu_idx < max_mmu_ctxdoms);
	ASSERT(mmu_ctxp == mmu_ctxs_tbl[mmu_ctxp->mmu_idx]);

	currcnum = sfmmup->sfmmu_ctxs[mmu_ctxp->mmu_idx].cnum;

	pstate_save = sfmmu_disable_intrs();

	lock_set(&sfmmup->sfmmu_ctx_lock);	/* acquire PP lock */
	/* set HAT cnum invalid across all context domains. */
	for (i = 0; i < max_mmu_ctxdoms; i++) {

		cnum = sfmmup->sfmmu_ctxs[i].cnum;
		if (cnum == INVALID_CONTEXT) {
			continue;
		}

		sfmmup->sfmmu_ctxs[i].cnum = INVALID_CONTEXT;
	}
	membar_enter();	/* make sure globally visible to all CPUs */
	lock_clear(&sfmmup->sfmmu_ctx_lock);	/* release PP lock */

	sfmmu_enable_intrs(pstate_save);

	cpuset = sfmmup->sfmmu_cpusran;
	CPUSET_DEL(cpuset, CPU->cpu_id);
	CPUSET_AND(cpuset, cpu_ready_set);
	if (!CPUSET_ISNULL(cpuset)) {
		SFMMU_XCALL_STATS(sfmmup);
		xt_some(cpuset, sfmmu_raise_tsb_exception,
		    (uint64_t)sfmmup, INVALID_CONTEXT);
		SFMMU_STAT(sf_tsb_raise_exception);
		SFMMU_MMU_STAT(mmu_tsb_raise_exception);
	}

	/*
	 * If the hat to-be-invalidated is the same as the current
	 * process on local CPU we need to invalidate
	 * this CPU context as well.
	 */
	if ((sfmmu_getctx_sec() == currcnum) &&
	    (currcnum != INVALID_CONTEXT)) {
		/* sets shared context to INVALID too */
		sfmmu_setctx_sec(INVALID_CONTEXT);
		sfmmu_clear_utsbinfo();
	}

	SFMMU_FLAGS_SET(sfmmup, HAT_ALLCTX_INVALID);

	kpreempt_enable();

	/*
	 * we hold the hat lock, so nobody should allocate a context
	 * for us yet
	 */
	ASSERT(sfmmup->sfmmu_ctxs[mmu_ctxp->mmu_idx].cnum == INVALID_CONTEXT);
}
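/*
 * Illustrative sketch (not part of the original source): the PP-lock section
 * above marks every per-domain context slot invalid while holding a small
 * spin lock, then issues a memory barrier so other CPUs observe the change
 * before the cross-calls arrive. A schematic standalone version using C11
 * atomics and hypothetical names is shown below; it is excluded from the
 * build.
 */
#if 0
#include <stdatomic.h>

#define	MAX_DOMS	16
#define	INVALID_CTX	(-1)

struct proc_ctx {
	atomic_flag	pp_lock;	/* tiny per-process lock */
	int		cnum[MAX_DOMS];	/* context number per MMU domain */
};

static void
invalidate_all_ctxs(struct proc_ctx *p)
{
	int i;

	while (atomic_flag_test_and_set_explicit(&p->pp_lock,
	    memory_order_acquire))
		;				/* spin: acquire PP lock */

	for (i = 0; i < MAX_DOMS; i++) {
		if (p->cnum[i] != INVALID_CTX)
			p->cnum[i] = INVALID_CTX;
	}

	atomic_thread_fence(memory_order_seq_cst); /* publish to all CPUs */
	atomic_flag_clear_explicit(&p->pp_lock, memory_order_release);
}
#endif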
/*
 * We need to flush the cache in all cpus.  It is possible that
 * a process referenced a page as cacheable but has since exited
 * and cleared the mapping list.  We still have to flush it but have no
 * state so all cpus is the only alternative.
 */
void
sfmmu_cache_flush(pfn_t pfnum, int vcolor)
{
	cpuset_t cpuset;

	kpreempt_disable();
	cpuset = cpu_ready_set;
	CPUSET_DEL(cpuset, CPU->cpu_id);
	SFMMU_XCALL_STATS(NULL);	/* account to any ctx */
	xt_some(cpuset, vac_flushpage_tl1, pfnum, vcolor);
	vac_flushpage(pfnum, vcolor);
	kpreempt_enable();
}

void
sfmmu_cache_flushcolor(int vcolor, pfn_t pfnum)
{
	cpuset_t cpuset;

	ASSERT(vcolor >= 0);

	kpreempt_disable();
	cpuset = cpu_ready_set;
	CPUSET_DEL(cpuset, CPU->cpu_id);
	SFMMU_XCALL_STATS(NULL);	/* account to any ctx */
	xt_some(cpuset, vac_flushcolor_tl1, vcolor, pfnum);
	vac_flushcolor(vcolor, pfnum);
	kpreempt_enable();
}
/*
 * We need to prevent processes from accessing the TSB using a cached physical
 * address.  It's alright if they try to access the TSB via virtual address
 * since they will just fault on that virtual address once the mapping has
 * been suspended.
 */
#pragma weak sendmondo_in_recover

/* ARGSUSED */
static int
sfmmu_tsb_pre_relocator(caddr_t va, uint_t tsbsz, uint_t flags, void *tsbinfo)
{
	struct tsb_info *tsbinfop = (struct tsb_info *)tsbinfo;
	sfmmu_t *sfmmup = tsbinfop->tsb_sfmmu;
	hatlock_t *hatlockp;
	sf_scd_t *scdp;

	if (flags != HAT_PRESUSPEND)
		return (0);

	/*
	 * If tsb is a shared TSB with TSB_SHAREDCTX set, sfmmup must
	 * be a shared hat, then set SCD's tsbinfo's flag.
	 * If tsb is not shared, sfmmup is a private hat, then set
	 * its private tsbinfo's flag.
	 */
	hatlockp = sfmmu_hat_enter(sfmmup);
	tsbinfop->tsb_flags |= TSB_RELOC_FLAG;

	if (!(tsbinfop->tsb_flags & TSB_SHAREDCTX)) {
		sfmmu_tsb_inv_ctx(sfmmup);
		sfmmu_hat_exit(hatlockp);
	} else {
		/* release lock on the shared hat */
		sfmmu_hat_exit(hatlockp);
		/* sfmmup is a shared hat */
		ASSERT(sfmmup->sfmmu_scdhat);
		scdp = sfmmup->sfmmu_scdp;
		ASSERT(scdp != NULL);
		/* get private hat from the scd list */
		mutex_enter(&scdp->scd_mutex);
		sfmmup = scdp->scd_sf_list;
		while (sfmmup != NULL) {
			hatlockp = sfmmu_hat_enter(sfmmup);
			/*
			 * We do not call sfmmu_tsb_inv_ctx here because
			 * sendmondo_in_recover check is only needed for
			 * sun4u.
			 */
			sfmmu_invalidate_ctx(sfmmup);
			sfmmu_hat_exit(hatlockp);
			sfmmup = sfmmup->sfmmu_scd_link.next;
		}
		mutex_exit(&scdp->scd_mutex);
	}
	return (0);
}
static void
sfmmu_tsb_inv_ctx(sfmmu_t *sfmmup)
{
	extern uint32_t sendmondo_in_recover;

	ASSERT(sfmmu_hat_lock_held(sfmmup));

	/*
	 * For Cheetah+ Erratum 25:
	 * Wait for any active recovery to finish.  We can't risk
	 * relocating the TSB of the thread running mondo_recover_proc()
	 * since, if we did that, we would deadlock.  The scenario we are
	 * trying to avoid is as follows:
	 *
	 * THIS CPU			RECOVER CPU
	 * --------			-----------
	 *				Begins recovery, walking through TSB
	 * hat_pagesuspend() TSB TTE
	 *				TLB miss on TSB TTE, spins at TL1
	 *	send_mondo_timeout()
	 *	mondo_recover_proc()
	 *
	 * The second half of the workaround is that mondo_recover_proc()
	 * checks to see if the tsb_info has the RELOC flag set, and if it
	 * does, it skips over that TSB without ever touching tsbinfop->tsb_va
	 * and hence avoiding the TLB miss that could result in a deadlock.
	 */
	if (&sendmondo_in_recover) {
		membar_enter();	/* make sure RELOC flag visible */
		while (sendmondo_in_recover) {
			drv_usecwait(1);
			membar_consumer();
		}
	}

	sfmmu_invalidate_ctx(sfmmup);
}
/* ARGSUSED */
static int
sfmmu_tsb_post_relocator(caddr_t va, uint_t tsbsz, uint_t flags,
	void *tsbinfo, pfn_t newpfn)
{
	hatlock_t *hatlockp;
	struct tsb_info *tsbinfop = (struct tsb_info *)tsbinfo;
	sfmmu_t	*sfmmup = tsbinfop->tsb_sfmmu;

	if (flags != HAT_POSTUNSUSPEND)
		return (0);

	hatlockp = sfmmu_hat_enter(sfmmup);

	SFMMU_STAT(sf_tsb_reloc);

	/*
	 * The process may have swapped out while we were relocating one
	 * of its TSBs.  If so, don't bother doing the setup since the
	 * process can't be using the memory anymore.
	 */
	if ((tsbinfop->tsb_flags & TSB_SWAPPED) == 0) {
		ASSERT(va == tsbinfop->tsb_va);
		sfmmu_tsbinfo_setup_phys(tsbinfop, newpfn);

		if (tsbinfop->tsb_flags & TSB_FLUSH_NEEDED) {
			sfmmu_inv_tsb(tsbinfop->tsb_va,
			    TSB_BYTES(tsbinfop->tsb_szc));
			tsbinfop->tsb_flags &= ~TSB_FLUSH_NEEDED;
		}
	}

	tsbinfop->tsb_flags &= ~TSB_RELOC_FLAG;
	cv_broadcast(&sfmmup->sfmmu_tsb_cv);

	sfmmu_hat_exit(hatlockp);

	return (0);
}
/*
 * Allocate and initialize a tsb_info structure.  Note that we may or may not
 * allocate a TSB here, depending on the flags passed in.
 */
static int
sfmmu_tsbinfo_alloc(struct tsb_info **tsbinfopp, int tsb_szc, int tte_sz_mask,
	uint_t flags, sfmmu_t *sfmmup)
{
	int err;

	*tsbinfopp = (struct tsb_info *)kmem_cache_alloc(
	    sfmmu_tsbinfo_cache, KM_SLEEP);

	if ((err = sfmmu_init_tsbinfo(*tsbinfopp, tte_sz_mask,
	    tsb_szc, flags, sfmmup)) != 0) {
		kmem_cache_free(sfmmu_tsbinfo_cache, *tsbinfopp);
		SFMMU_STAT(sf_tsb_allocfail);
		*tsbinfopp = NULL;
		return (err);
	}
	SFMMU_STAT(sf_tsb_alloc);

	/*
	 * Bump the TSB size counters for this TSB size.
	 */
	(*(((int *)&sfmmu_tsbsize_stat) + tsb_szc))++;
	return (0);
}
static void
sfmmu_tsb_free(struct tsb_info *tsbinfo)
{
	caddr_t tsbva = tsbinfo->tsb_va;
	uint_t tsb_size = TSB_BYTES(tsbinfo->tsb_szc);
	struct kmem_cache *kmem_cachep = tsbinfo->tsb_cache;
	vmem_t	*vmp = tsbinfo->tsb_vmp;

	/*
	 * If we allocated this TSB from relocatable kernel memory, then we
	 * need to uninstall the callback handler.
	 */
	if (tsbinfo->tsb_cache != sfmmu_tsb8k_cache) {
		uintptr_t slab_mask;
		caddr_t slab_vaddr;
		page_t **ppl;
		int ret;

		ASSERT(tsb_size <= MMU_PAGESIZE4M || use_bigtsb_arena);
		if (tsb_size > MMU_PAGESIZE4M)
			slab_mask = ~((uintptr_t)bigtsb_slab_mask) << PAGESHIFT;
		else
			slab_mask = ~((uintptr_t)tsb_slab_mask) << PAGESHIFT;
		slab_vaddr = (caddr_t)((uintptr_t)tsbva & slab_mask);

		ret = as_pagelock(&kas, &ppl, slab_vaddr, PAGESIZE, S_WRITE);
		ASSERT(ret == 0);
		hat_delete_callback(tsbva, (uint_t)tsb_size, (void *)tsbinfo,
		    0, NULL);
		as_pageunlock(&kas, ppl, slab_vaddr, PAGESIZE, S_WRITE);
	}

	if (kmem_cachep != NULL) {
		kmem_cache_free(kmem_cachep, tsbva);
	} else {
		vmem_xfree(vmp, (void *)tsbva, tsb_size);
	}
	tsbinfo->tsb_va = (caddr_t)0xbad00bad;
	atomic_add_64(&tsb_alloc_bytes, -(int64_t)tsb_size);
}

static void
sfmmu_tsbinfo_free(struct tsb_info *tsbinfo)
{
	if ((tsbinfo->tsb_flags & TSB_SWAPPED) == 0) {
		sfmmu_tsb_free(tsbinfo);
	}
	kmem_cache_free(sfmmu_tsbinfo_cache, tsbinfo);
}
/*
 * Setup all the references to physical memory for this tsbinfo.
 * The underlying page(s) must be locked.
 */
static void
sfmmu_tsbinfo_setup_phys(struct tsb_info *tsbinfo, pfn_t pfn)
{
	ASSERT(pfn != PFN_INVALID);
	ASSERT(pfn == va_to_pfn(tsbinfo->tsb_va));

#ifndef sun4v
	if (tsbinfo->tsb_szc == 0) {
		sfmmu_memtte(&tsbinfo->tsb_tte, pfn,
		    PROT_WRITE|PROT_READ, TTE8K);
	} else {
		/*
		 * Round down PA and use a large mapping; the handlers will
		 * compute the TSB pointer at the correct offset into the
		 * big virtual page.  NOTE: this assumes all TSBs larger
		 * than 8K must come from physically contiguous slabs of
		 * size tsb_slab_size.
		 */
		sfmmu_memtte(&tsbinfo->tsb_tte, pfn & ~tsb_slab_mask,
		    PROT_WRITE|PROT_READ, tsb_slab_ttesz);
	}
	tsbinfo->tsb_pa = ptob(pfn);

	TTE_SET_LOCKED(&tsbinfo->tsb_tte); /* lock the tte into dtlb */
	TTE_SET_MOD(&tsbinfo->tsb_tte);    /* enable writes */

	ASSERT(TTE_IS_PRIVILEGED(&tsbinfo->tsb_tte));
	ASSERT(TTE_IS_LOCKED(&tsbinfo->tsb_tte));
#else /* sun4v */
	tsbinfo->tsb_pa = ptob(pfn);
#endif /* sun4v */
}
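/*
 * Illustrative sketch (not part of the original source): both this routine
 * and the TSB allocation code below rely on a TSB larger than 8K living
 * inside a naturally aligned, physically contiguous slab, so the slab base
 * can be recovered by masking off the low bits. A minimal helper with
 * hypothetical names is shown below; it is excluded from the build.
 */
#if 0
#include <stdint.h>

/* slab_size must be a power of two; returns the base of the slab holding pa. */
static uint64_t
slab_base(uint64_t pa, uint64_t slab_size)
{
	return (pa & ~(slab_size - 1));
}

/* Example: a 4M slab. slab_base(0x12345678, 4 * 1024 * 1024) == 0x12000000 */
#endif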
/*
 * Returns zero on success, ENOMEM if over the high water mark,
 * or EAGAIN if the caller needs to retry with a smaller TSB
 * size (or specify TSB_FORCEALLOC if the allocation can't fail).
 *
 * This call cannot fail to allocate a TSB if TSB_FORCEALLOC
 * is specified and the TSB requested is PAGESIZE, though it
 * may sleep waiting for memory if sufficient memory is not
 * available.
 */
static int
sfmmu_init_tsbinfo(struct tsb_info *tsbinfo, int tteszmask,
	int tsbcode, uint_t flags, sfmmu_t *sfmmup)
{
	caddr_t vaddr = NULL;
	caddr_t slab_vaddr;
	uintptr_t slab_mask;
	int tsbbytes = TSB_BYTES(tsbcode);
	int lowmem = 0;
	struct kmem_cache *kmem_cachep = NULL;
	vmem_t *vmp = NULL;
	lgrp_id_t lgrpid = LGRP_NONE;
	pfn_t pfn;
	uint_t cbflags = HAC_SLEEP;
	page_t **pplist;
	int ret;

	ASSERT(tsbbytes <= MMU_PAGESIZE4M || use_bigtsb_arena);
	if (tsbbytes > MMU_PAGESIZE4M)
		slab_mask = ~((uintptr_t)bigtsb_slab_mask) << PAGESHIFT;
	else
		slab_mask = ~((uintptr_t)tsb_slab_mask) << PAGESHIFT;

	if (flags & (TSB_FORCEALLOC | TSB_SWAPIN | TSB_GROW | TSB_SHRINK))
		flags |= TSB_ALLOC;

	ASSERT((flags & TSB_FORCEALLOC) == 0 || tsbcode == TSB_MIN_SZCODE);

	tsbinfo->tsb_sfmmu = sfmmup;

	/*
	 * If not allocating a TSB, set up the tsbinfo, set TSB_SWAPPED, and
	 * return.
	 */
	if ((flags & TSB_ALLOC) == 0) {
		tsbinfo->tsb_szc = tsbcode;
		tsbinfo->tsb_ttesz_mask = tteszmask;
		tsbinfo->tsb_va = (caddr_t)0xbadbadbeef;
		tsbinfo->tsb_pa = -1;
		tsbinfo->tsb_tte.ll = 0;
		tsbinfo->tsb_next = NULL;
		tsbinfo->tsb_flags = TSB_SWAPPED;
		tsbinfo->tsb_cache = NULL;
		tsbinfo->tsb_vmp = NULL;
		return (0);
	}

#ifdef DEBUG
	/*
	 * For debugging:
	 * Randomly force allocation failures every tsb_alloc_mtbf
	 * tries if TSB_FORCEALLOC is not specified.  This will
	 * return ENOMEM if tsb_alloc_mtbf is odd, or EAGAIN if
	 * it is even, to allow testing of both failure paths...
	 */
	if (tsb_alloc_mtbf && ((flags & TSB_FORCEALLOC) == 0) &&
	    (tsb_alloc_count++ == tsb_alloc_mtbf)) {
		tsb_alloc_count = 0;
		tsb_alloc_fail_mtbf++;
		return ((tsb_alloc_mtbf & 1)? ENOMEM : EAGAIN);
	}
#endif	/* DEBUG */

	/*
	 * Enforce high water mark if we are not doing a forced allocation
	 * and are not shrinking a process' TSB.
	 */
	if ((flags & TSB_SHRINK) == 0 &&
	    (tsbbytes + tsb_alloc_bytes) > tsb_alloc_hiwater) {
		if ((flags & TSB_FORCEALLOC) == 0)
			return (ENOMEM);
		lowmem = 1;
	}

	/*
	 * Allocate from the correct location based upon the size of the TSB
	 * compared to the base page size, and what memory conditions dictate.
	 * Note we always do nonblocking allocations from the TSB arena since
	 * we don't want memory fragmentation to cause processes to block
	 * indefinitely waiting for memory; until the kernel algorithms that
	 * coalesce large pages are improved this is our best option.
	 *
	 * Algorithm:
	 *	If allocating a "large" TSB (>8K), allocate from the
	 *		appropriate kmem_tsb_default_arena vmem arena
	 *	else if low on memory or the TSB_FORCEALLOC flag is set or
	 *	tsb_forceheap is set
	 *		Allocate from kernel heap via sfmmu_tsb8k_cache with
	 *		KM_SLEEP (never fails)
	 *	else
	 *		Allocate from appropriate sfmmu_tsb_cache with
	 *		KM_NOSLEEP
	 */
	if (tsb_lgrp_affinity)
		lgrpid = lgrp_home_id(curthread);
	if (lgrpid == LGRP_NONE)
		lgrpid = 0;	/* use lgrp of boot CPU */

	if (tsbbytes > MMU_PAGESIZE) {
		if (tsbbytes > MMU_PAGESIZE4M) {
			vmp = kmem_bigtsb_default_arena[lgrpid];
			vaddr = (caddr_t)vmem_xalloc(vmp, tsbbytes, tsbbytes,
			    0, 0, NULL, NULL, VM_NOSLEEP);
		} else {
			vmp = kmem_tsb_default_arena[lgrpid];
			vaddr = (caddr_t)vmem_xalloc(vmp, tsbbytes, tsbbytes,
			    0, 0, NULL, NULL, VM_NOSLEEP);
		}
#ifdef	DEBUG
	} else if (lowmem || (flags & TSB_FORCEALLOC) || tsb_forceheap) {
#else	/* !DEBUG */
	} else if (lowmem || (flags & TSB_FORCEALLOC)) {
#endif	/* DEBUG */
		kmem_cachep = sfmmu_tsb8k_cache;
		vaddr = (caddr_t)kmem_cache_alloc(kmem_cachep, KM_SLEEP);
		ASSERT(vaddr != NULL);
	} else {
		kmem_cachep = sfmmu_tsb_cache[lgrpid];
		vaddr = (caddr_t)kmem_cache_alloc(kmem_cachep, KM_NOSLEEP);
	}

	tsbinfo->tsb_cache = kmem_cachep;
	tsbinfo->tsb_vmp = vmp;

	if (vaddr == NULL) {
		return (EAGAIN);
	}

	atomic_add_64(&tsb_alloc_bytes, (int64_t)tsbbytes);
	kmem_cachep = tsbinfo->tsb_cache;

	/*
	 * If we are allocating from outside the cage, then we need to
	 * register a relocation callback handler.  Note that for now
	 * since pseudo mappings always hang off of the slab's root page,
	 * we need only lock the first 8K of the TSB slab.  This is a bit
	 * hacky but it is good for performance.
	 */
	if (kmem_cachep != sfmmu_tsb8k_cache) {
		slab_vaddr = (caddr_t)((uintptr_t)vaddr & slab_mask);
		ret = as_pagelock(&kas, &pplist, slab_vaddr, PAGESIZE, S_WRITE);
		ASSERT(ret == 0);
		ret = hat_add_callback(sfmmu_tsb_cb_id, vaddr, (uint_t)tsbbytes,
		    cbflags, (void *)tsbinfo, &pfn, NULL);

		/*
		 * Need to free up resources if we could not successfully
		 * add the callback function and return an error condition.
		 */
		if (ret != 0) {
			if (kmem_cachep) {
				kmem_cache_free(kmem_cachep, vaddr);
			} else {
				vmem_xfree(vmp, (void *)vaddr, tsbbytes);
			}
			as_pageunlock(&kas, pplist, slab_vaddr, PAGESIZE,
			    S_WRITE);
			return (EAGAIN);
		}
	} else {
		/*
		 * Since allocation of 8K TSBs from heap is rare and occurs
		 * during memory pressure we allocate them from permanent
		 * memory rather than using callbacks to get the PFN.
		 */
		pfn = hat_getpfnum(kas.a_hat, vaddr);
	}

	tsbinfo->tsb_va = vaddr;
	tsbinfo->tsb_szc = tsbcode;
	tsbinfo->tsb_ttesz_mask = tteszmask;
	tsbinfo->tsb_next = NULL;
	tsbinfo->tsb_flags = 0;

	sfmmu_tsbinfo_setup_phys(tsbinfo, pfn);

	sfmmu_inv_tsb(vaddr, tsbbytes);

	if (kmem_cachep != sfmmu_tsb8k_cache) {
		as_pageunlock(&kas, pplist, slab_vaddr, PAGESIZE, S_WRITE);
	}

	return (0);
}
/*
 * Initialize per cpu tsb and per cpu tsbmiss_area
 */
void
sfmmu_init_tsbs(void)
{
	int i;
	struct tsbmiss	*tsbmissp;
	struct kpmtsbm	*kpmtsbmp;
	extern int	dcache_line_mask;
	extern uint_t	vac_colors;

	/*
	 * Init. tsb miss area.
	 */
	tsbmissp = tsbmiss_area;

	for (i = 0; i < NCPU; tsbmissp++, i++) {
		/*
		 * initialize the tsbmiss area.
		 * Do this for all possible CPUs as some may be added
		 * while the system is running. There is no cost to this.
		 */
		tsbmissp->ksfmmup = ksfmmup;
		tsbmissp->dcache_line_mask = (uint16_t)dcache_line_mask;
		tsbmissp->khashstart =
		    (struct hmehash_bucket *)va_to_pa((caddr_t)khme_hash);
		tsbmissp->uhashstart =
		    (struct hmehash_bucket *)va_to_pa((caddr_t)uhme_hash);
		tsbmissp->khashsz = khmehash_num;
		tsbmissp->uhashsz = uhmehash_num;
	}

	sfmmu_tsb_cb_id = hat_register_callback('T'<<16 | 'S' << 8 | 'B',
	    sfmmu_tsb_pre_relocator, sfmmu_tsb_post_relocator, NULL, 0);

	if (kpm_enable == 0)
		return;

	/* -- Begin KPM specific init -- */

	if (kpm_smallpages) {
		/*
		 * If we're using base pagesize pages for seg_kpm
		 * mappings, we use the kernel TSB since we can't afford
		 * to allocate a second huge TSB for these mappings.
		 */
		kpm_tsbbase = ktsb_phys ? ktsb_pbase : (uint64_t)ktsb_base;
		kpm_tsbsz = ktsb_szcode;
		kpmsm_tsbbase = kpm_tsbbase;
		kpmsm_tsbsz = kpm_tsbsz;
	} else {
		/*
		 * In VAC conflict case, just put the entries in the
		 * kernel 8K indexed TSB for now so we can find them.
		 * This could really be changed in the future if we feel
		 * the need...
		 */
		kpmsm_tsbbase = ktsb_phys ? ktsb_pbase : (uint64_t)ktsb_base;
		kpmsm_tsbsz = ktsb_szcode;
		kpm_tsbbase = ktsb_phys ? ktsb4m_pbase : (uint64_t)ktsb4m_base;
		kpm_tsbsz = ktsb4m_szcode;
	}

	kpmtsbmp = kpmtsbm_area;
	for (i = 0; i < NCPU; kpmtsbmp++, i++) {
		/*
		 * Initialize the kpmtsbm area.
		 * Do this for all possible CPUs as some may be added
		 * while the system is running. There is no cost to this.
		 */
		kpmtsbmp->vbase = kpm_vbase;
		kpmtsbmp->vend = kpm_vbase + kpm_size * vac_colors;
		kpmtsbmp->sz_shift = kpm_size_shift;
		kpmtsbmp->kpmp_shift = kpmp_shift;
		kpmtsbmp->kpmp2pshft = (uchar_t)kpmp2pshft;
		if (kpm_smallpages == 0) {
			kpmtsbmp->kpmp_table_sz = kpmp_table_sz;
			kpmtsbmp->kpmp_tablepa = va_to_pa(kpmp_table);
		} else {
			kpmtsbmp->kpmp_table_sz = kpmp_stable_sz;
			kpmtsbmp->kpmp_tablepa = va_to_pa(kpmp_stable);
		}
		kpmtsbmp->msegphashpa = va_to_pa(memseg_phash);
		kpmtsbmp->flags = KPMTSBM_ENABLE_FLAG;
		kpmtsbmp->flags |= (kpm_tsbmtl) ? KPMTSBM_TLTSBM_FLAG : 0;
		if (ktsb_phys)
			kpmtsbmp->flags |= KPMTSBM_TSBPHYS_FLAG;
	}

	/* -- End KPM specific init -- */
}
/* Avoid using sfmmu_tsbinfo_alloc() to avoid kmem_alloc - no real reason */
struct tsb_info ktsb_info[2];

/*
 * Called from hat_kern_setup() to setup the tsb_info for ksfmmup.
 */
void
sfmmu_init_ktsbinfo()
{
	ASSERT(ksfmmup != NULL);
	ASSERT(ksfmmup->sfmmu_tsb == NULL);
	/*
	 * Allocate tsbinfos for kernel and copy in data
	 * to make debug easier and sun4v setup easier.
	 */
	ktsb_info[0].tsb_sfmmu = ksfmmup;
	ktsb_info[0].tsb_szc = ktsb_szcode;
	ktsb_info[0].tsb_ttesz_mask = TSB8K|TSB64K|TSB512K;
	ktsb_info[0].tsb_va = ktsb_base;
	ktsb_info[0].tsb_pa = ktsb_pbase;
	ktsb_info[0].tsb_flags = 0;
	ktsb_info[0].tsb_tte.ll = 0;
	ktsb_info[0].tsb_cache = NULL;

	ktsb_info[1].tsb_sfmmu = ksfmmup;
	ktsb_info[1].tsb_szc = ktsb4m_szcode;
	ktsb_info[1].tsb_ttesz_mask = TSB4M;
	ktsb_info[1].tsb_va = ktsb4m_base;
	ktsb_info[1].tsb_pa = ktsb4m_pbase;
	ktsb_info[1].tsb_flags = 0;
	ktsb_info[1].tsb_tte.ll = 0;
	ktsb_info[1].tsb_cache = NULL;

	/* Link them into ksfmmup. */
	ktsb_info[0].tsb_next = &ktsb_info[1];
	ktsb_info[1].tsb_next = NULL;
	ksfmmup->sfmmu_tsb = &ktsb_info[0];

	sfmmu_setup_tsbinfo(ksfmmup);
}
/*
 * Cache the last value returned from va_to_pa().  If the VA specified
 * in the current call to cached_va_to_pa() maps to the same Page (as the
 * previous call to cached_va_to_pa()), then compute the PA using
 * cached info, else call va_to_pa().
 *
 * Note: this function is neither MT-safe nor consistent in the presence
 * of multiple, interleaved threads.  This function was created to enable
 * an optimization used during boot (at a point when there's only one thread
 * executing on the "boot CPU", and before startup_vm() has been called).
 */
static uint64_t
cached_va_to_pa(void *vaddr)
{
	static uint64_t prev_vaddr_base = 0;
	static uint64_t prev_pfn = 0;

	if ((((uint64_t)vaddr) & MMU_PAGEMASK) == prev_vaddr_base) {
		return (prev_pfn | ((uint64_t)vaddr & MMU_PAGEOFFSET));
	} else {
		uint64_t pa = va_to_pa(vaddr);

		if (pa != ((uint64_t)-1)) {
			/*
			 * Computed physical address is valid.  Cache its
			 * related info for the next cached_va_to_pa() call.
			 */
			prev_pfn = pa & MMU_PAGEMASK;
			prev_vaddr_base = ((uint64_t)vaddr) & MMU_PAGEMASK;
		}

		return (pa);
	}
}
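/*
 * Illustrative sketch (not part of the original source): the one-entry cache
 * above pays off because sfmmu_init_nucleus_hblks() below translates many
 * consecutive addresses that land on the same 8K page, so most calls are
 * answered from the cached page translation. The toy version below counts
 * hits versus misses for a stream of addresses (hypothetical names,
 * slow_translate stands in for va_to_pa()); it is excluded from the build.
 */
#if 0
#include <stdint.h>

#define	PAGE_SHIFT	13			/* 8K pages */
#define	PAGE_MASK	(~((uint64_t)(1 << PAGE_SHIFT) - 1))

extern uint64_t slow_translate(uint64_t va);	/* stand-in for va_to_pa() */

static uint64_t
counted_translate(uint64_t va, uint64_t *hits, uint64_t *misses)
{
	static uint64_t last_va_base, last_pa_base;
	static int valid;

	if (valid && (va & PAGE_MASK) == last_va_base) {
		(*hits)++;
		return (last_pa_base | (va & ~PAGE_MASK));
	}
	(*misses)++;
	last_pa_base = slow_translate(va) & PAGE_MASK;
	last_va_base = va & PAGE_MASK;
	valid = 1;
	return (last_pa_base | (va & ~PAGE_MASK));
}
#endif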
/*
 * Carve up our nucleus hblk region.  We may allocate more hblks than
 * asked due to rounding errors but we are guaranteed to have at least
 * enough space to allocate the requested number of hblk8's and hblk1's.
 */
void
sfmmu_init_nucleus_hblks(caddr_t addr, size_t size, int nhblk8, int nhblk1)
{
	struct hme_blk *hmeblkp;
	size_t hme8blk_sz, hme1blk_sz;
	size_t i;
	size_t hblk8_bound;
	ulong_t j = 0, k = 0;

	ASSERT(addr != NULL && size != 0);

	/* Need to use proper structure alignment */
	hme8blk_sz = roundup(HME8BLK_SZ, sizeof (int64_t));
	hme1blk_sz = roundup(HME1BLK_SZ, sizeof (int64_t));

	nucleus_hblk8.list = (void *)addr;
	nucleus_hblk8.index = 0;

	/*
	 * Use as much memory as possible for hblk8's since we
	 * expect all bop_alloc'ed memory to be allocated in 8k chunks.
	 * We need to hold back enough space for the hblk1's which
	 * we'll allocate next.
	 */
	hblk8_bound = size - (nhblk1 * hme1blk_sz) - hme8blk_sz;
	for (i = 0; i <= hblk8_bound; i += hme8blk_sz, j++) {
		hmeblkp = (struct hme_blk *)addr;
		addr += hme8blk_sz;
		hmeblkp->hblk_nuc_bit = 1;
		hmeblkp->hblk_nextpa = cached_va_to_pa((caddr_t)hmeblkp);
	}
	nucleus_hblk8.len = j;
	ASSERT(j >= nhblk8);
	SFMMU_STAT_ADD(sf_hblk8_ncreate, j);

	nucleus_hblk1.list = (void *)addr;
	nucleus_hblk1.index = 0;
	for (; i <= (size - hme1blk_sz); i += hme1blk_sz, k++) {
		hmeblkp = (struct hme_blk *)addr;
		addr += hme1blk_sz;
		hmeblkp->hblk_nuc_bit = 1;
		hmeblkp->hblk_nextpa = cached_va_to_pa((caddr_t)hmeblkp);
	}
	ASSERT(k >= nhblk1);
	nucleus_hblk1.len = k;
	SFMMU_STAT_ADD(sf_hblk1_ncreate, k);
}
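/*
 * Illustrative sketch (not part of the original source): the nucleus carving
 * above is the usual "walk a raw buffer in object-sized strides" pattern,
 * with the larger objects taken first and the remainder used for the small
 * ones. A standalone version with hypothetical names is shown below; it is
 * excluded from the build.
 */
#if 0
#include <stddef.h>

/* Carve as many objsz-byte objects as fit in [buf, buf+len); return count. */
static size_t
carve(char *buf, size_t len, size_t objsz, void (*init)(void *))
{
	size_t i, n = 0;

	for (i = 0; i + objsz <= len; i += objsz, n++) {
		init(buf + i);		/* initialize each object in place */
	}
	return (n);
}
#endif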
/*
 * This function is currently not supported on this platform. For what
 * it's supposed to do, see hat.c and hat_srmmu.c
 */
/* ARGSUSED */
faultcode_t
hat_softlock(struct hat *hat, caddr_t addr, size_t *lenp, page_t **ppp,
    uint_t flags)
{
	ASSERT(hat->sfmmu_xhat_provider == NULL);
	return (FC_NOSUPPORT);
}
/*
 * Searches the mapping list of the page for a mapping of the same size. If
 * not found the corresponding bit is cleared in the p_index field. When large
 * pages are more prevalent in the system, we can maintain the mapping list
 * in order and we don't have to traverse the list each time. Just check the
 * next and prev entries, and if both are of different size, we clear the bit.
 */
static void
sfmmu_rm_large_mappings(page_t *pp, int ttesz)
{
	struct sf_hment *sfhmep;
	struct hme_blk *hmeblkp;
	int	index;
	pgcnt_t	npgs;

	ASSERT(ttesz > TTE8K);

	ASSERT(sfmmu_mlist_held(pp));

	ASSERT(PP_ISMAPPED_LARGE(pp));

	/*
	 * Traverse mapping list looking for another mapping of same size.
	 * since we only want to clear index field if all mappings of
	 * that size are gone.
	 */

	for (sfhmep = pp->p_mapping; sfhmep; sfhmep = sfhmep->hme_next) {
		if (IS_PAHME(sfhmep))
			continue;
		hmeblkp = sfmmu_hmetohblk(sfhmep);
		if (hmeblkp->hblk_xhat_bit)
			continue;
		if (hme_size(sfhmep) == ttesz) {
			/*
			 * another mapping of the same size. don't clear index.
			 */
			return;
		}
	}

	/*
	 * Clear the p_index bit for large page.
	 */
	index = PAGESZ_TO_INDEX(ttesz);
	npgs = TTEPAGES(ttesz);
	while (npgs-- > 0) {
		ASSERT(pp->p_index & index);
		pp->p_index &= ~index;
		pp = PP_PAGENEXT(pp);
	}
}
/*
 * return supported features
 */
/* ARGSUSED */
int
hat_supported(enum hat_features feature, void *arg)
{
	switch (feature) {
	case	HAT_SHARED_PT:
	case	HAT_DYNAMIC_ISM_UNMAP:
	case	HAT_VMODSORT:
		return (1);
	case	HAT_SHARED_REGIONS:
		if (shctx_on)
			return (1);
		else
			return (0);
	default:
		return (0);
	}
}
void
hat_enter(struct hat *hat)
{
	hatlock_t	*hatlockp;

	if (hat != ksfmmup) {
		hatlockp = TSB_HASH(hat);
		mutex_enter(HATLOCK_MUTEXP(hatlockp));
	}
}

void
hat_exit(struct hat *hat)
{
	hatlock_t	*hatlockp;

	if (hat != ksfmmup) {
		hatlockp = TSB_HASH(hat);
		mutex_exit(HATLOCK_MUTEXP(hatlockp));
	}
}

/*ARGSUSED*/
void
hat_reserve(struct as *as, caddr_t addr, size_t len)
{
}
void
hat_kstat_init(void)
{
	kstat_t *ksp;

	ksp = kstat_create("unix", 0, "sfmmu_global_stat", "hat",
	    KSTAT_TYPE_RAW, sizeof (struct sfmmu_global_stat),
	    KSTAT_FLAG_VIRTUAL);
	if (ksp) {
		ksp->ks_data = (void *) &sfmmu_global_stat;
		kstat_install(ksp);
	}
	ksp = kstat_create("unix", 0, "sfmmu_tsbsize_stat", "hat",
	    KSTAT_TYPE_RAW, sizeof (struct sfmmu_tsbsize_stat),
	    KSTAT_FLAG_VIRTUAL);
	if (ksp) {
		ksp->ks_data = (void *) &sfmmu_tsbsize_stat;
		kstat_install(ksp);
	}
	ksp = kstat_create("unix", 0, "sfmmu_percpu_stat", "hat",
	    KSTAT_TYPE_RAW, sizeof (struct sfmmu_percpu_stat) * NCPU,
	    KSTAT_FLAG_WRITABLE);
	if (ksp) {
		ksp->ks_update = sfmmu_kstat_percpu_update;
		kstat_install(ksp);
	}
}
/* ARGSUSED */
static int
sfmmu_kstat_percpu_update(kstat_t *ksp, int rw)
{
	struct sfmmu_percpu_stat *cpu_kstat = ksp->ks_data;
	struct tsbmiss *tsbm = tsbmiss_area;
	struct kpmtsbm *kpmtsbm = kpmtsbm_area;
	int i;

	if (rw == KSTAT_READ) {
		for (i = 0; i < NCPU; cpu_kstat++, tsbm++, kpmtsbm++, i++) {
			cpu_kstat->sf_itlb_misses = 0;
			cpu_kstat->sf_dtlb_misses = 0;
			cpu_kstat->sf_utsb_misses = tsbm->utsb_misses -
			    tsbm->uprot_traps;
			cpu_kstat->sf_ktsb_misses = tsbm->ktsb_misses +
			    kpmtsbm->kpm_tsb_misses - tsbm->kprot_traps;
			cpu_kstat->sf_tsb_hits = 0;
			cpu_kstat->sf_umod_faults = tsbm->uprot_traps;
			cpu_kstat->sf_kmod_faults = tsbm->kprot_traps;
		}
	} else {
		/* KSTAT_WRITE is used to clear stats */
		for (i = 0; i < NCPU; tsbm++, kpmtsbm++, i++) {
			tsbm->utsb_misses = 0;
			tsbm->ktsb_misses = 0;
			tsbm->uprot_traps = 0;
			tsbm->kprot_traps = 0;
			kpmtsbm->kpm_dtlb_misses = 0;
			kpmtsbm->kpm_tsb_misses = 0;
		}
	}
	return (0);
}
#ifdef	DEBUG
tte_t	*gorig[NCPU], *gcur[NCPU], *gnew[NCPU];

/*
 * A tte checker. *orig_old is the value we read before cas.
 * *cur is the value returned by cas.
 * *new is the desired value when we do the cas.
 *
 * *hmeblkp is currently unused.
 */
/* ARGSUSED */
static void
chk_tte(tte_t *orig_old, tte_t *cur, tte_t *new, struct hme_blk *hmeblkp)
{
	pfn_t i, j, k;
	int cpuid = CPU->cpu_id;

	gorig[cpuid] = orig_old;
	gcur[cpuid] = cur;
	gnew[cpuid] = new;

	if (TTE_IS_VALID(orig_old)) {
		if (TTE_IS_VALID(cur)) {
			i = TTE_TO_TTEPFN(orig_old);
			j = TTE_TO_TTEPFN(cur);
			k = TTE_TO_TTEPFN(new);
			if (i != j)
				panic("chk_tte: bad pfn, 0x%lx, 0x%lx", i, j);
			if (i != k)
				panic("chk_tte: bad pfn2, 0x%lx, 0x%lx", i, k);
		} else {
			if (TTE_IS_VALID(new)) {
				panic("chk_tte: invalid cur? ");
			}
			i = TTE_TO_TTEPFN(orig_old);
			k = TTE_TO_TTEPFN(new);
			if (i != k)
				panic("chk_tte: bad pfn3, 0x%lx, 0x%lx", i, k);
		}
	} else {
		if (TTE_IS_VALID(cur)) {
			j = TTE_TO_TTEPFN(cur);
			if (TTE_IS_VALID(new)) {
				k = TTE_TO_TTEPFN(new);
				if (j != k)
					panic("chk_tte: bad pfn4, 0x%lx, 0x%lx",
					    j, k);
			}
		} else {
			panic("chk_tte: why here?");
		}
	}

	if (!TTE_IS_VALID(new)) {
		panic("chk_tte: why here2 ?");
	}
}
#endif /* DEBUG */
extern void	prefetch_tsbe_read(struct tsbe *);
extern void	prefetch_tsbe_write(struct tsbe *);

/*
 * We want to prefetch 7 cache lines ahead for our read prefetch.  This gives
 * us optimal performance on Cheetah+.  You can only have 8 outstanding
 * prefetches at any one time, so we opted for 7 read prefetches and 1 write
 * prefetch to make the best use of the prefetch capability.
 */
#define	TSBE_PREFETCH_STRIDE (7)
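/*
 * A TSBE is 16 bytes, so four TSBEs share one 64-byte P$ line; a stride of
 * 7 lines therefore corresponds to 4 * (TSBE_PREFETCH_STRIDE + 1) = 32
 * entries, which is the margin sfmmu_copy_tsb() leaves at the end of the
 * old TSB before it stops issuing read prefetches (its last_prefetch cutoff).
 */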
static void
sfmmu_copy_tsb(struct tsb_info *old_tsbinfo, struct tsb_info *new_tsbinfo)
{
	int old_bytes = TSB_BYTES(old_tsbinfo->tsb_szc);
	int new_bytes = TSB_BYTES(new_tsbinfo->tsb_szc);
	int old_entries = TSB_ENTRIES(old_tsbinfo->tsb_szc);
	int new_entries = TSB_ENTRIES(new_tsbinfo->tsb_szc);
	int i, last_prefetch, new_offset, vpshift;
	struct tsbe *new_base = (struct tsbe *)new_tsbinfo->tsb_va;
	struct tsbe *old, *new;
	uint64_t va;

	if (old_bytes == new_bytes) {
		bcopy(old_tsbinfo->tsb_va, new_tsbinfo->tsb_va, new_bytes);
		return;
	}

	/*
	 * A TSBE is 16 bytes which means there are four TSBE's per
	 * P$ line (64 bytes), thus every 4 TSBE's we prefetch.
	 */
	old = (struct tsbe *)old_tsbinfo->tsb_va;
	last_prefetch = old_entries - (4*(TSBE_PREFETCH_STRIDE+1));
	for (i = 0; i < old_entries; i++, old++) {
		if (((i & (4-1)) == 0) && (i < last_prefetch))
			prefetch_tsbe_read(old);
		if (!old->tte_tag.tag_invalid) {
			/*
			 * We have a valid TTE to remap.  Check the
			 * size.  We won't remap 64K or 512K TTEs
			 * because they span more than one TSB entry
			 * and are indexed using an 8K virt. page.
			 * Ditto for 32M and 256M TTEs.
			 */
			if (TTE_CSZ(&old->tte_data) == TTE64K ||
			    TTE_CSZ(&old->tte_data) == TTE512K)
				continue;
			if (mmu_page_sizes == max_mmu_page_sizes) {
				if (TTE_CSZ(&old->tte_data) == TTE32M ||
				    TTE_CSZ(&old->tte_data) == TTE256M)
					continue;
			}

			/* clear the lower 22 bits of the va */
			va = *(uint64_t *)old << 22;
			/* turn va into a virtual pfn */
			va >>= 22 - TSB_START_SIZE;
			/*
			 * or in bits from the offset in the tsb
			 * to get the real virtual pfn. These
			 * correspond to bits [21:13] in the va
			 */
			vpshift =
			    TTE_BSZS_SHIFT(TTE_CSZ(&old->tte_data)) &
			va |= (i << vpshift);

			new_offset = va & (new_entries - 1);
			new = new_base + new_offset;
			prefetch_tsbe_write(new);
			*new = *old;
		}
	}
}
/*
 * Called when a thread is exiting and we have switched to the kernel address
 * space.  Perform the same VM initialization resume() uses when switching
 * processes.
 *
 * Note that sfmmu_load_mmustate() is currently a no-op for kernel threads, but
 * we call it anyway in case the semantics change in the future.
 */
/*ARGSUSED*/
void
hat_thread_exit(kthread_t *thd)
{
	uint_t pgsz_cnum;
	uint_t pstate_save;

	ASSERT(thd->t_procp->p_as == &kas);

	pgsz_cnum = KCONTEXT;
#ifdef sun4u
	pgsz_cnum |= (ksfmmup->sfmmu_cext << CTXREG_EXT_SHIFT);
#endif

	/*
	 * Note that sfmmu_load_mmustate() is currently a no-op for
	 * kernel threads. We need to disable interrupts here,
	 * simply because otherwise sfmmu_load_mmustate() would panic
	 * if the caller does not disable interrupts.
	 */
	pstate_save = sfmmu_disable_intrs();

	/* Compatibility Note: hw takes care of MMU_SCONTEXT1 */
	sfmmu_setctx_sec(pgsz_cnum);
	sfmmu_load_mmustate(ksfmmup);
	sfmmu_enable_intrs(pstate_save);
}
#define	SRD_HASH_FUNCTION(vp)	(((((uintptr_t)(vp)) >> 4) ^ \
				    (((uintptr_t)(vp)) >> 11)) & \
				    (SFMMU_MAX_SRD_BUCKETS - 1))
/*
 * Attach the process to the srd struct associated with the exec vnode
 * from which the process is started.
 */
void
hat_join_srd(struct hat *sfmmup, vnode_t *evp)
{
	uint_t hash = SRD_HASH_FUNCTION(evp);
	sf_srd_t *srdp;
	sf_srd_t *newsrdp;

	ASSERT(sfmmup != ksfmmup);
	ASSERT(sfmmup->sfmmu_srdp == NULL);

	if (srd_buckets[hash].srdb_srdp != NULL) {
		mutex_enter(&srd_buckets[hash].srdb_lock);
		for (srdp = srd_buckets[hash].srdb_srdp; srdp != NULL;
		    srdp = srdp->srd_hash) {
			if (srdp->srd_evp == evp) {
				ASSERT(srdp->srd_refcnt >= 0);
				sfmmup->sfmmu_srdp = srdp;
				atomic_inc_32(
				    (volatile uint_t *)&srdp->srd_refcnt);
				mutex_exit(&srd_buckets[hash].srdb_lock);
				return;
			}
		}
		mutex_exit(&srd_buckets[hash].srdb_lock);
	}
	newsrdp = kmem_cache_alloc(srd_cache, KM_SLEEP);
	ASSERT(newsrdp->srd_next_ismrid == 0 && newsrdp->srd_next_hmerid == 0);

	newsrdp->srd_evp = evp;
	newsrdp->srd_refcnt = 1;
	newsrdp->srd_hmergnfree = NULL;
	newsrdp->srd_ismrgnfree = NULL;

	mutex_enter(&srd_buckets[hash].srdb_lock);
	for (srdp = srd_buckets[hash].srdb_srdp; srdp != NULL;
	    srdp = srdp->srd_hash) {
		if (srdp->srd_evp == evp) {
			ASSERT(srdp->srd_refcnt >= 0);
			sfmmup->sfmmu_srdp = srdp;
			atomic_inc_32((volatile uint_t *)&srdp->srd_refcnt);
			mutex_exit(&srd_buckets[hash].srdb_lock);
			kmem_cache_free(srd_cache, newsrdp);
			return;
		}
	}
	newsrdp->srd_hash = srd_buckets[hash].srdb_srdp;
	srd_buckets[hash].srdb_srdp = newsrdp;
	sfmmup->sfmmu_srdp = newsrdp;

	mutex_exit(&srd_buckets[hash].srdb_lock);
}
static void
sfmmu_leave_srd(sfmmu_t *sfmmup)
{
	vnode_t *evp;
	sf_srd_t *srdp = sfmmup->sfmmu_srdp;
	uint_t hash;
	sf_srd_t **prev_srdpp;
	sf_region_t *rgnp;
	sf_region_t *nrgnp;
#ifdef DEBUG
	int rgns = 0;
#endif
	int i;

	ASSERT(sfmmup != ksfmmup);
	ASSERT(srdp != NULL);
	ASSERT(srdp->srd_refcnt > 0);
	ASSERT(sfmmup->sfmmu_scdp == NULL);
	ASSERT(sfmmup->sfmmu_free == 1);

	sfmmup->sfmmu_srdp = NULL;
	evp = srdp->srd_evp;
	ASSERT(evp != NULL);
	if (atomic_dec_32_nv((volatile uint_t *)&srdp->srd_refcnt)) {
		return;
	}

	hash = SRD_HASH_FUNCTION(evp);
	mutex_enter(&srd_buckets[hash].srdb_lock);
	for (prev_srdpp = &srd_buckets[hash].srdb_srdp;
	    (srdp = *prev_srdpp) != NULL; prev_srdpp = &srdp->srd_hash) {
		if (srdp->srd_evp == evp) {
			break;
		}
	}
	if (srdp == NULL || srdp->srd_refcnt) {
		mutex_exit(&srd_buckets[hash].srdb_lock);
		return;
	}
	*prev_srdpp = srdp->srd_hash;
	mutex_exit(&srd_buckets[hash].srdb_lock);

	ASSERT(srdp->srd_refcnt == 0);

#ifdef DEBUG
	for (i = 0; i < SFMMU_MAX_REGION_BUCKETS; i++) {
		ASSERT(srdp->srd_rgnhash[i] == NULL);
	}
#endif /* DEBUG */

	/* free each hme region in the srd */
	for (rgnp = srdp->srd_hmergnfree; rgnp != NULL; rgnp = nrgnp) {
		nrgnp = rgnp->rgn_next;
		ASSERT(rgnp->rgn_id < srdp->srd_next_hmerid);
		ASSERT(rgnp->rgn_refcnt == 0);
		ASSERT(rgnp->rgn_sfmmu_head == NULL);
		ASSERT(rgnp->rgn_flags & SFMMU_REGION_FREE);
		ASSERT(rgnp->rgn_hmeflags == 0);
		ASSERT(srdp->srd_hmergnp[rgnp->rgn_id] == rgnp);
#ifdef DEBUG
		for (i = 0; i < MMU_PAGE_SIZES; i++) {
			ASSERT(rgnp->rgn_ttecnt[i] == 0);
		}
		rgns++;
#endif /* DEBUG */
		kmem_cache_free(region_cache, rgnp);
	}
	ASSERT(rgns == srdp->srd_next_hmerid);

#ifdef DEBUG
	rgns = 0;
#endif
	/* free each ism region in the srd */
	for (rgnp = srdp->srd_ismrgnfree; rgnp != NULL; rgnp = nrgnp) {
		nrgnp = rgnp->rgn_next;
		ASSERT(rgnp->rgn_id < srdp->srd_next_ismrid);
		ASSERT(rgnp->rgn_refcnt == 0);
		ASSERT(rgnp->rgn_sfmmu_head == NULL);
		ASSERT(rgnp->rgn_flags & SFMMU_REGION_FREE);
		ASSERT(srdp->srd_ismrgnp[rgnp->rgn_id] == rgnp);
#ifdef DEBUG
		for (i = 0; i < MMU_PAGE_SIZES; i++) {
			ASSERT(rgnp->rgn_ttecnt[i] == 0);
		}
		rgns++;
#endif /* DEBUG */
		kmem_cache_free(region_cache, rgnp);
	}
	ASSERT(rgns == srdp->srd_next_ismrid);
	ASSERT(srdp->srd_ismbusyrgns == 0);
	ASSERT(srdp->srd_hmebusyrgns == 0);

	srdp->srd_next_ismrid = 0;
	srdp->srd_next_hmerid = 0;

	bzero((void *)srdp->srd_ismrgnp,
	    sizeof (sf_region_t *) * SFMMU_MAX_ISM_REGIONS);
	bzero((void *)srdp->srd_hmergnp,
	    sizeof (sf_region_t *) * SFMMU_MAX_HME_REGIONS);

	ASSERT(srdp->srd_scdp == NULL);
	kmem_cache_free(srd_cache, srdp);
}
/* ARGSUSED */
static int
sfmmu_srdcache_constructor(void *buf, void *cdrarg, int kmflags)
{
	sf_srd_t *srdp = (sf_srd_t *)buf;
	bzero(buf, sizeof (*srdp));

	mutex_init(&srdp->srd_mutex, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&srdp->srd_scd_mutex, NULL, MUTEX_DEFAULT, NULL);
	return (0);
}

/* ARGSUSED */
static void
sfmmu_srdcache_destructor(void *buf, void *cdrarg)
{
	sf_srd_t *srdp = (sf_srd_t *)buf;

	mutex_destroy(&srdp->srd_mutex);
	mutex_destroy(&srdp->srd_scd_mutex);
}
/*
 * The caller makes sure hat_join_region()/hat_leave_region() can't be called
 * at the same time for the same process and address range. This is ensured by
 * the fact that the address space is locked as writer when a process joins the
 * regions. Therefore there's no need to hold an srd lock during the entire
 * execution of hat_join_region()/hat_leave_region().
 */

#define	RGN_HASH_FUNCTION(obj)	(((((uintptr_t)(obj)) >> 4) ^ \
				    (((uintptr_t)(obj)) >> 11)) & \
				    (SFMMU_MAX_REGION_BUCKETS - 1))
/*
 * This routine implements the shared context functionality required when
 * attaching a segment to an address space. It must be called from
 * hat_share() for D(ISM) segments and from segvn_create() for segments
 * with the MAP_PRIVATE and MAP_TEXT flags set. It returns a region_cookie
 * which is saved in the private segment data for hme segments and
 * the ism_map structure for ism segments.
 */
13994 hat_region_cookie_t
13995 hat_join_region(struct hat
*sfmmup
,
13999 u_offset_t r_objoff
,
14002 hat_rgn_cb_func_t r_cb_function
,
14005 sf_srd_t
*srdp
= sfmmup
->sfmmu_srdp
;
14008 hatlock_t
*hatlockp
;
14010 sf_region_t
*new_rgnp
= NULL
;
14013 sf_region_t
**freelistp
;
14015 sf_region_t
**rarrp
;
14016 uint16_t *busyrgnsp
;
14019 uchar_t r_type
= flags
& HAT_REGION_TYPE_MASK
;
14020 int text
= (r_type
== HAT_REGION_TEXT
);
14022 if (srdp
== NULL
|| r_size
== 0) {
14023 return (HAT_INVALID_REGION_COOKIE
);
14026 ASSERT(sfmmup
->sfmmu_xhat_provider
== NULL
);
14027 ASSERT(sfmmup
!= ksfmmup
);
14028 ASSERT(AS_WRITE_HELD(sfmmup
->sfmmu_as
));
14029 ASSERT(srdp
->srd_refcnt
> 0);
14030 ASSERT(!(flags
& ~HAT_REGION_TYPE_MASK
));
14031 ASSERT(flags
== HAT_REGION_TEXT
|| flags
== HAT_REGION_ISM
);
14032 ASSERT(r_pgszc
< mmu_page_sizes
);
14033 if (!IS_P2ALIGNED(r_saddr
, TTEBYTES(r_pgszc
)) ||
14034 !IS_P2ALIGNED(r_size
, TTEBYTES(r_pgszc
))) {
14035 panic("hat_join_region: region addr or size is not aligned\n");
14039 r_type
= (r_type
== HAT_REGION_ISM
) ? SFMMU_REGION_ISM
:
14042 * Currently only support shared hmes for the read only main text
14045 if (r_type
== SFMMU_REGION_HME
&& ((r_obj
!= srdp
->srd_evp
) ||
14046 (r_perm
& PROT_WRITE
))) {
14047 return (HAT_INVALID_REGION_COOKIE
);
14050 rhash
= RGN_HASH_FUNCTION(r_obj
);
14052 if (r_type
== SFMMU_REGION_ISM
) {
14053 nextidp
= &srdp
->srd_next_ismrid
;
14054 freelistp
= &srdp
->srd_ismrgnfree
;
14055 maxids
= SFMMU_MAX_ISM_REGIONS
;
14056 rarrp
= srdp
->srd_ismrgnp
;
14057 busyrgnsp
= &srdp
->srd_ismbusyrgns
;
14059 nextidp
= &srdp
->srd_next_hmerid
;
14060 freelistp
= &srdp
->srd_hmergnfree
;
14061 maxids
= SFMMU_MAX_HME_REGIONS
;
14062 rarrp
= srdp
->srd_hmergnp
;
14063 busyrgnsp
= &srdp
->srd_hmebusyrgns
;
14066 mutex_enter(&srdp
->srd_mutex
);
14068 for (rgnp
= srdp
->srd_rgnhash
[rhash
]; rgnp
!= NULL
;
14069 rgnp
= rgnp
->rgn_hash
) {
14070 if (rgnp
->rgn_saddr
== r_saddr
&& rgnp
->rgn_size
== r_size
&&
14071 rgnp
->rgn_obj
== r_obj
&& rgnp
->rgn_objoff
== r_objoff
&&
14072 rgnp
->rgn_perm
== r_perm
&& rgnp
->rgn_pgszc
== r_pgszc
) {
14078 if (rgnp
!= NULL
) {
14079 ASSERT((rgnp
->rgn_flags
& SFMMU_REGION_TYPE_MASK
) == r_type
);
14080 ASSERT(rgnp
->rgn_cb_function
== r_cb_function
);
14081 ASSERT(rgnp
->rgn_refcnt
>= 0);
14082 rid
= rgnp
->rgn_id
;
14083 ASSERT(rid
< maxids
);
14084 ASSERT(rarrp
[rid
] == rgnp
);
14085 ASSERT(rid
< *nextidp
);
14086 atomic_inc_32((volatile uint_t
*)&rgnp
->rgn_refcnt
);
14087 mutex_exit(&srdp
->srd_mutex
);
14088 if (new_rgnp
!= NULL
) {
14089 kmem_cache_free(region_cache
, new_rgnp
);
14091 if (r_type
== SFMMU_REGION_HME
) {
14093 (sfmmup
== astosfmmu(curthread
->t_procp
->p_as
));
14095 sfmmu_link_to_hmeregion(sfmmup
, rgnp
);
14097 * bitmap should be updated after linking sfmmu on
14098 * region list so that pageunload() doesn't skip
14099 * TSB/TLB flush. As soon as bitmap is updated another
14100 * thread in this process can already start accessing
14104 * Normally ttecnt accounting is done as part of
14105 * pagefault handling. But a process may not take any
14106 * pagefaults on shared hmeblks created by some other
14107 * process. To compensate for this assume that the
14108 * entire region will end up faulted in using
14109 * the region's pagesize.
14112 if (r_pgszc
> TTE8K
) {
14113 tteflag
= 1 << r_pgszc
;
14114 if (disable_large_pages
& tteflag
) {
14120 if (tteflag
&& !(sfmmup
->sfmmu_rtteflags
& tteflag
)) {
14121 hatlockp
= sfmmu_hat_enter(sfmmup
);
14122 sfmmup
->sfmmu_rtteflags
|= tteflag
;
14123 sfmmu_hat_exit(hatlockp
);
14125 hatlockp
= sfmmu_hat_enter(sfmmup
);
14128 * Preallocate 1/4 of ttecnt's in 8K TSB for >= 4M
14129 * region to allow for large page allocation failure.
14131 if (r_pgszc
>= TTE4M
) {
14132 sfmmup
->sfmmu_tsb0_4minflcnt
+=
14133 r_size
>> (TTE_PAGE_SHIFT(TTE8K
) + 2);
14136 /* update sfmmu_ttecnt with the shme rgn ttecnt */
14137 rttecnt
= r_size
>> TTE_PAGE_SHIFT(r_pgszc
);
14138 atomic_add_long(&sfmmup
->sfmmu_ttecnt
[r_pgszc
],
14141 if (text
&& r_pgszc
>= TTE4M
&&
14142 (tteflag
|| ((disable_large_pages
>> TTE4M
) &
14143 ((1 << (r_pgszc
- TTE4M
+ 1)) - 1))) &&
14144 !SFMMU_FLAGS_ISSET(sfmmup
, HAT_4MTEXT_FLAG
)) {
14145 SFMMU_FLAGS_SET(sfmmup
, HAT_4MTEXT_FLAG
);
14148 sfmmu_hat_exit(hatlockp
);
14150 * On Panther we need to make sure TLB is programmed
14151 * to accept 32M/256M pages. Call
14152 * sfmmu_check_page_sizes() now to make sure TLB is
14153 * setup before making hmeregions visible to other
14156 sfmmu_check_page_sizes(sfmmup
, 1);
14157 hatlockp
= sfmmu_hat_enter(sfmmup
);
14158 SF_RGNMAP_ADD(sfmmup
->sfmmu_hmeregion_map
, rid
);
14161 * if context is invalid tsb miss exception code will
14162 * call sfmmu_check_page_sizes() and update tsbmiss
14165 kpreempt_disable();
14167 (sfmmup
->sfmmu_ctxs
[CPU_MMU_IDX(CPU
)].cnum
14168 != INVALID_CONTEXT
)) {
14169 struct tsbmiss
*tsbmp
;
14171 tsbmp
= &tsbmiss_area
[CPU
->cpu_id
];
14172 ASSERT(sfmmup
== tsbmp
->usfmmup
);
14173 BT_SET(tsbmp
->shmermap
, rid
);
14174 if (r_pgszc
> TTE64K
) {
14175 tsbmp
->uhat_rtteflags
|= tteflag
;
14181 sfmmu_hat_exit(hatlockp
);
14182 ASSERT((hat_region_cookie_t
)((uint64_t)rid
) !=
14183 HAT_INVALID_REGION_COOKIE
);
14185 hatlockp
= sfmmu_hat_enter(sfmmup
);
14186 SF_RGNMAP_ADD(sfmmup
->sfmmu_ismregion_map
, rid
);
14187 sfmmu_hat_exit(hatlockp
);
14189 ASSERT(rid
< maxids
);
14191 if (r_type
== SFMMU_REGION_ISM
) {
14192 sfmmu_find_scd(sfmmup
);
14194 return ((hat_region_cookie_t
)((uint64_t)rid
));
14197 ASSERT(new_rgnp
== NULL
);
14199 if (*busyrgnsp
>= maxids
) {
14200 mutex_exit(&srdp
->srd_mutex
);
14201 return (HAT_INVALID_REGION_COOKIE
);
14204 ASSERT(MUTEX_HELD(&srdp
->srd_mutex
));
14205 if (*freelistp
!= NULL
) {
14207 *freelistp
= rgnp
->rgn_next
;
14208 ASSERT(rgnp
->rgn_id
< *nextidp
);
14209 ASSERT(rgnp
->rgn_id
< maxids
);
14210 ASSERT(rgnp
->rgn_flags
& SFMMU_REGION_FREE
);
14211 ASSERT((rgnp
->rgn_flags
& SFMMU_REGION_TYPE_MASK
)
14213 ASSERT(rarrp
[rgnp
->rgn_id
] == rgnp
);
14214 ASSERT(rgnp
->rgn_hmeflags
== 0);
14217 * release local locks before memory allocation.
14219 mutex_exit(&srdp
->srd_mutex
);
14221 new_rgnp
= kmem_cache_alloc(region_cache
, KM_SLEEP
);
14223 mutex_enter(&srdp
->srd_mutex
);
14224 for (rgnp
= srdp
->srd_rgnhash
[rhash
]; rgnp
!= NULL
;
14225 rgnp
= rgnp
->rgn_hash
) {
14226 if (rgnp
->rgn_saddr
== r_saddr
&&
14227 rgnp
->rgn_size
== r_size
&&
14228 rgnp
->rgn_obj
== r_obj
&&
14229 rgnp
->rgn_objoff
== r_objoff
&&
14230 rgnp
->rgn_perm
== r_perm
&&
14231 rgnp
->rgn_pgszc
== r_pgszc
) {
14235 if (rgnp
!= NULL
) {
14239 if (*nextidp
>= maxids
) {
14240 mutex_exit(&srdp
->srd_mutex
);
14245 rgnp
->rgn_id
= (*nextidp
)++;
14246 ASSERT(rgnp
->rgn_id
< maxids
);
14247 ASSERT(rarrp
[rgnp
->rgn_id
] == NULL
);
14248 rarrp
[rgnp
->rgn_id
] = rgnp
;
14251 ASSERT(rgnp
->rgn_sfmmu_head
== NULL
);
14252 ASSERT(rgnp
->rgn_hmeflags
== 0);
14254 for (i
= 0; i
< MMU_PAGE_SIZES
; i
++) {
14255 ASSERT(rgnp
->rgn_ttecnt
[i
] == 0);
14258 rgnp
->rgn_saddr
= r_saddr
;
14259 rgnp
->rgn_size
= r_size
;
14260 rgnp
->rgn_obj
= r_obj
;
14261 rgnp
->rgn_objoff
= r_objoff
;
14262 rgnp
->rgn_perm
= r_perm
;
14263 rgnp
->rgn_pgszc
= r_pgszc
;
14264 rgnp
->rgn_flags
= r_type
;
14265 rgnp
->rgn_refcnt
= 0;
14266 rgnp
->rgn_cb_function
= r_cb_function
;
14267 rgnp
->rgn_hash
= srdp
->srd_rgnhash
[rhash
];
14268 srdp
->srd_rgnhash
[rhash
] = rgnp
;
14270 ASSERT(*busyrgnsp
<= maxids
);
14274 ASSERT(new_rgnp
!= NULL
);
14275 kmem_cache_free(region_cache
, new_rgnp
);
14276 return (HAT_INVALID_REGION_COOKIE
);
/*
 * This function implements the shared context functionality required
 * when detaching a segment from an address space. It must be called
 * from hat_unshare() for all D(ISM) segments and from segvn_unmap(),
 * for segments with a valid region_cookie.
 * It will also be called from all seg_vn routines which change a
 * segment's attributes such as segvn_setprot(), segvn_setpagesize(),
 * segvn_clrszc() & segvn_advise(), as well as in the case of COW fault
 * from segvn_fault().
 */
*sfmmup
, hat_region_cookie_t rcookie
, uint_t flags
)
14292 sf_srd_t
*srdp
= sfmmup
->sfmmu_srdp
;
14295 uint_t rid
= (uint_t
)((uint64_t)rcookie
);
14296 hatlock_t
*hatlockp
= NULL
;
14298 sf_region_t
**prev_rgnpp
;
14299 sf_region_t
*cur_rgnp
;
14306 uchar_t r_type
= flags
& HAT_REGION_TYPE_MASK
;
14308 ASSERT(sfmmup
!= ksfmmup
);
14309 ASSERT(srdp
!= NULL
);
14310 ASSERT(srdp
->srd_refcnt
> 0);
14311 ASSERT(!(flags
& ~HAT_REGION_TYPE_MASK
));
14312 ASSERT(flags
== HAT_REGION_TEXT
|| flags
== HAT_REGION_ISM
);
14313 ASSERT(!sfmmup
->sfmmu_free
|| sfmmup
->sfmmu_scdp
== NULL
);
14315 r_type
= (r_type
== HAT_REGION_ISM
) ? SFMMU_REGION_ISM
:
14318 if (r_type
== SFMMU_REGION_ISM
) {
14319 ASSERT(SFMMU_IS_ISMRID_VALID(rid
));
14320 ASSERT(rid
< SFMMU_MAX_ISM_REGIONS
);
14321 rgnp
= srdp
->srd_ismrgnp
[rid
];
14323 ASSERT(SFMMU_IS_SHMERID_VALID(rid
));
14324 ASSERT(rid
< SFMMU_MAX_HME_REGIONS
);
14325 rgnp
= srdp
->srd_hmergnp
[rid
];
14327 ASSERT(rgnp
!= NULL
);
14328 ASSERT(rgnp
->rgn_id
== rid
);
14329 ASSERT((rgnp
->rgn_flags
& SFMMU_REGION_TYPE_MASK
) == r_type
);
14330 ASSERT(!(rgnp
->rgn_flags
& SFMMU_REGION_FREE
));
14331 ASSERT(AS_LOCK_HELD(sfmmup
->sfmmu_as
));
14333 ASSERT(sfmmup
->sfmmu_xhat_provider
== NULL
);
14334 if (r_type
== SFMMU_REGION_HME
&& sfmmup
->sfmmu_as
->a_xhat
!= NULL
) {
14335 xhat_unload_callback_all(sfmmup
->sfmmu_as
, rgnp
->rgn_saddr
,
14336 rgnp
->rgn_size
, 0, NULL
);
14339 if (sfmmup
->sfmmu_free
) {
14341 r_pgszc
= rgnp
->rgn_pgszc
;
14342 r_size
= rgnp
->rgn_size
;
14344 ASSERT(sfmmup
->sfmmu_scdp
== NULL
);
14345 if (r_type
== SFMMU_REGION_ISM
) {
14346 SF_RGNMAP_DEL(sfmmup
->sfmmu_ismregion_map
, rid
);
14348 /* update shme rgns ttecnt in sfmmu_ttecnt */
14349 rttecnt
= r_size
>> TTE_PAGE_SHIFT(r_pgszc
);
14350 ASSERT(sfmmup
->sfmmu_ttecnt
[r_pgszc
] >= rttecnt
);
14352 atomic_add_long(&sfmmup
->sfmmu_ttecnt
[r_pgszc
],
14355 SF_RGNMAP_DEL(sfmmup
->sfmmu_hmeregion_map
, rid
);
14357 } else if (r_type
== SFMMU_REGION_ISM
) {
14358 hatlockp
= sfmmu_hat_enter(sfmmup
);
14359 ASSERT(rid
< srdp
->srd_next_ismrid
);
14360 SF_RGNMAP_DEL(sfmmup
->sfmmu_ismregion_map
, rid
);
14361 scdp
= sfmmup
->sfmmu_scdp
;
14362 if (scdp
!= NULL
&&
14363 SF_RGNMAP_TEST(scdp
->scd_ismregion_map
, rid
)) {
14364 sfmmu_leave_scd(sfmmup
, r_type
);
14365 ASSERT(sfmmu_hat_lock_held(sfmmup
));
14367 sfmmu_hat_exit(hatlockp
);
14370 r_pgszc
= rgnp
->rgn_pgszc
;
14371 r_saddr
= rgnp
->rgn_saddr
;
14372 r_size
= rgnp
->rgn_size
;
14373 r_eaddr
= r_saddr
+ r_size
;
14375 ASSERT(r_type
== SFMMU_REGION_HME
);
14376 hatlockp
= sfmmu_hat_enter(sfmmup
);
14377 ASSERT(rid
< srdp
->srd_next_hmerid
);
14378 SF_RGNMAP_DEL(sfmmup
->sfmmu_hmeregion_map
, rid
);
14381 * If region is part of an SCD call sfmmu_leave_scd().
14382 * Otherwise if process is not exiting and has valid context
14383 * just drop the context on the floor to lose stale TLB
14384 * entries and force the update of tsb miss area to reflect
14385 * the new region map. After that clean our TSB entries.
14387 scdp
= sfmmup
->sfmmu_scdp
;
14388 if (scdp
!= NULL
&&
14389 SF_RGNMAP_TEST(scdp
->scd_hmeregion_map
, rid
)) {
14390 sfmmu_leave_scd(sfmmup
, r_type
);
14391 ASSERT(sfmmu_hat_lock_held(sfmmup
));
14393 sfmmu_invalidate_ctx(sfmmup
);
14396 while (i
< mmu_page_sizes
) {
14397 if (rgnp
->rgn_ttecnt
[i
] != 0) {
14398 sfmmu_unload_tsb_range(sfmmup
, r_saddr
,
14409 /* Remove the preallocated 1/4 8k ttecnt for 4M regions. */
14410 if (r_pgszc
>= TTE4M
) {
14411 rttecnt
= r_size
>> (TTE_PAGE_SHIFT(TTE8K
) + 2);
14412 ASSERT(sfmmup
->sfmmu_tsb0_4minflcnt
>=
14414 sfmmup
->sfmmu_tsb0_4minflcnt
-= rttecnt
;
14417 /* update shme rgns ttecnt in sfmmu_ttecnt */
14418 rttecnt
= r_size
>> TTE_PAGE_SHIFT(r_pgszc
);
14419 ASSERT(sfmmup
->sfmmu_ttecnt
[r_pgszc
] >= rttecnt
);
14420 atomic_add_long(&sfmmup
->sfmmu_ttecnt
[r_pgszc
], -rttecnt
);
14422 sfmmu_hat_exit(hatlockp
);
14423 if (scdp
!= NULL
&& sfmmup
->sfmmu_scdp
== NULL
) {
14424 /* sfmmup left the scd, grow private tsb */
14425 sfmmu_check_page_sizes(sfmmup
, 1);
14427 sfmmu_check_page_sizes(sfmmup
, 0);
14431 if (r_type
== SFMMU_REGION_HME
) {
14432 sfmmu_unlink_from_hmeregion(sfmmup
, rgnp
);
14435 r_obj
= rgnp
->rgn_obj
;
14436 if (atomic_dec_32_nv((volatile uint_t
*)&rgnp
->rgn_refcnt
)) {
14441 * looks like nobody uses this region anymore. Free it.
14443 rhash
= RGN_HASH_FUNCTION(r_obj
);
14444 mutex_enter(&srdp
->srd_mutex
);
14445 for (prev_rgnpp
= &srdp
->srd_rgnhash
[rhash
];
14446 (cur_rgnp
= *prev_rgnpp
) != NULL
;
14447 prev_rgnpp
= &cur_rgnp
->rgn_hash
) {
14448 if (cur_rgnp
== rgnp
&& cur_rgnp
->rgn_refcnt
== 0) {
14453 if (cur_rgnp
== NULL
) {
14454 mutex_exit(&srdp
->srd_mutex
);
14458 ASSERT((rgnp
->rgn_flags
& SFMMU_REGION_TYPE_MASK
) == r_type
);
14459 *prev_rgnpp
= rgnp
->rgn_hash
;
14460 if (r_type
== SFMMU_REGION_ISM
) {
14461 rgnp
->rgn_flags
|= SFMMU_REGION_FREE
;
14462 ASSERT(rid
< srdp
->srd_next_ismrid
);
14463 rgnp
->rgn_next
= srdp
->srd_ismrgnfree
;
14464 srdp
->srd_ismrgnfree
= rgnp
;
14465 ASSERT(srdp
->srd_ismbusyrgns
> 0);
14466 srdp
->srd_ismbusyrgns
--;
14467 mutex_exit(&srdp
->srd_mutex
);
14470 mutex_exit(&srdp
->srd_mutex
);
14473 * Destroy region's hmeblks.
14475 sfmmu_unload_hmeregion(srdp
, rgnp
);
14477 rgnp
->rgn_hmeflags
= 0;
14479 ASSERT(rgnp
->rgn_sfmmu_head
== NULL
);
14480 ASSERT(rgnp
->rgn_id
== rid
);
14481 for (i
= 0; i
< MMU_PAGE_SIZES
; i
++) {
14482 rgnp
->rgn_ttecnt
[i
] = 0;
14484 rgnp
->rgn_flags
|= SFMMU_REGION_FREE
;
14485 mutex_enter(&srdp
->srd_mutex
);
14486 ASSERT(rid
< srdp
->srd_next_hmerid
);
14487 rgnp
->rgn_next
= srdp
->srd_hmergnfree
;
14488 srdp
->srd_hmergnfree
= rgnp
;
14489 ASSERT(srdp
->srd_hmebusyrgns
> 0);
14490 srdp
->srd_hmebusyrgns
--;
14491 mutex_exit(&srdp
->srd_mutex
);
/*
 * For now only called for hmeblk regions and not for ISM regions.
 */
14498 hat_dup_region(struct hat
*sfmmup
, hat_region_cookie_t rcookie
)
14500 sf_srd_t
*srdp
= sfmmup
->sfmmu_srdp
;
14501 uint_t rid
= (uint_t
)((uint64_t)rcookie
);
14503 sf_rgn_link_t
*rlink
;
14504 sf_rgn_link_t
*hrlink
;
14507 ASSERT(sfmmup
!= ksfmmup
);
14508 ASSERT(srdp
!= NULL
);
14509 ASSERT(srdp
->srd_refcnt
> 0);
14511 ASSERT(rid
< srdp
->srd_next_hmerid
);
14512 ASSERT(SFMMU_IS_SHMERID_VALID(rid
));
14513 ASSERT(rid
< SFMMU_MAX_HME_REGIONS
);
14515 rgnp
= srdp
->srd_hmergnp
[rid
];
14516 ASSERT(rgnp
->rgn_refcnt
> 0);
14517 ASSERT(rgnp
->rgn_id
== rid
);
14518 ASSERT((rgnp
->rgn_flags
& SFMMU_REGION_TYPE_MASK
) == SFMMU_REGION_HME
);
14519 ASSERT(!(rgnp
->rgn_flags
& SFMMU_REGION_FREE
));
14521 atomic_inc_32((volatile uint_t
*)&rgnp
->rgn_refcnt
);
14523 /* LINTED: constant in conditional context */
14524 SFMMU_HMERID2RLINKP(sfmmup
, rid
, rlink
, 1, 0);
14525 ASSERT(rlink
!= NULL
);
14526 mutex_enter(&rgnp
->rgn_mutex
);
14527 ASSERT(rgnp
->rgn_sfmmu_head
!= NULL
);
14528 /* LINTED: constant in conditional context */
14529 SFMMU_HMERID2RLINKP(rgnp
->rgn_sfmmu_head
, rid
, hrlink
, 0, 0);
14530 ASSERT(hrlink
!= NULL
);
14531 ASSERT(hrlink
->prev
== NULL
);
14532 rlink
->next
= rgnp
->rgn_sfmmu_head
;
14533 rlink
->prev
= NULL
;
14534 hrlink
->prev
= sfmmup
;
14536 * make sure rlink's next field is correct
14537 * before making this link visible.
14540 rgnp
->rgn_sfmmu_head
= sfmmup
;
14541 mutex_exit(&rgnp
->rgn_mutex
);
14543 /* update sfmmu_ttecnt with the shme rgn ttecnt */
14544 rttecnt
= rgnp
->rgn_size
>> TTE_PAGE_SHIFT(rgnp
->rgn_pgszc
);
14545 atomic_add_long(&sfmmup
->sfmmu_ttecnt
[rgnp
->rgn_pgszc
], rttecnt
);
14546 /* update tsb0 inflation count */
14547 if (rgnp
->rgn_pgszc
>= TTE4M
) {
14548 sfmmup
->sfmmu_tsb0_4minflcnt
+=
14549 rgnp
->rgn_size
>> (TTE_PAGE_SHIFT(TTE8K
) + 2);
14552 * Update regionid bitmask without hat lock since no other thread
14553 * can update this region bitmask right now.
14555 SF_RGNMAP_ADD(sfmmup
->sfmmu_hmeregion_map
, rid
);
/* ARGSUSED */
static int
sfmmu_rgncache_constructor(void *buf, void *cdrarg, int kmflags)
{
	sf_region_t *rgnp = (sf_region_t *)buf;
	bzero(buf, sizeof (*rgnp));

	mutex_init(&rgnp->rgn_mutex, NULL, MUTEX_DEFAULT, NULL);
	return (0);
}

/* ARGSUSED */
static void
sfmmu_rgncache_destructor(void *buf, void *cdrarg)
{
	sf_region_t *rgnp = (sf_region_t *)buf;

	mutex_destroy(&rgnp->rgn_mutex);
}
static int
sfrgnmap_isnull(sf_region_map_t *map)
{
	int i;

	for (i = 0; i < SFMMU_RGNMAP_WORDS; i++) {
		if (map->bitmap[i] != 0) {
			return (0);
		}
	}
	return (1);
}

static int
sfhmergnmap_isnull(sf_hmeregion_map_t *map)
{
	int i;

	for (i = 0; i < SFMMU_HMERGNMAP_WORDS; i++) {
		if (map->bitmap[i] != 0) {
			return (0);
		}
	}
	return (1);
}
#ifdef DEBUG
static void
check_scd_sfmmu_list(sfmmu_t **headp, sfmmu_t *sfmmup, int onlist)
{
	sfmmu_t *sp;
	sf_srd_t *srdp = sfmmup->sfmmu_srdp;

	for (sp = *headp; sp != NULL; sp = sp->sfmmu_scd_link.next) {
		ASSERT(srdp == sp->sfmmu_srdp);
		if (sp == sfmmup) {
			if (onlist) {
				return;
			} else {
				panic("shctx: sfmmu 0x%p found on scd"
				    "list 0x%p", (void *)sfmmup,
				    (void *)*headp);
			}
		}
	}
	if (onlist) {
		panic("shctx: sfmmu 0x%p not found on scd list 0x%p",
		    (void *)sfmmup, (void *)*headp);
	}
}
#else /* DEBUG */
#define	check_scd_sfmmu_list(headp, sfmmup, onlist)
#endif /* DEBUG */
/*
 * Removes an sfmmu from the SCD sfmmu list.
 */
static void
sfmmu_from_scd_list(sfmmu_t **headp, sfmmu_t *sfmmup)
{
	ASSERT(sfmmup->sfmmu_srdp != NULL);
	check_scd_sfmmu_list(headp, sfmmup, 1);
	if (sfmmup->sfmmu_scd_link.prev != NULL) {
		ASSERT(*headp != sfmmup);
		sfmmup->sfmmu_scd_link.prev->sfmmu_scd_link.next =
		    sfmmup->sfmmu_scd_link.next;
	} else {
		ASSERT(*headp == sfmmup);
		*headp = sfmmup->sfmmu_scd_link.next;
	}
	if (sfmmup->sfmmu_scd_link.next != NULL) {
		sfmmup->sfmmu_scd_link.next->sfmmu_scd_link.prev =
		    sfmmup->sfmmu_scd_link.prev;
	}
}

/*
 * Adds an sfmmu to the start of the queue.
 */
static void
sfmmu_to_scd_list(sfmmu_t **headp, sfmmu_t *sfmmup)
{
	check_scd_sfmmu_list(headp, sfmmup, 0);
	sfmmup->sfmmu_scd_link.prev = NULL;
	sfmmup->sfmmu_scd_link.next = *headp;
	if (*headp != NULL)
		(*headp)->sfmmu_scd_link.prev = sfmmup;
	*headp = sfmmup;
}
/*
 * Remove an scd from the queue.
 */
static void
sfmmu_remove_scd(sf_scd_t **headp, sf_scd_t *scdp)
{
	if (scdp->scd_prev != NULL) {
		ASSERT(*headp != scdp);
		scdp->scd_prev->scd_next = scdp->scd_next;
	} else {
		ASSERT(*headp == scdp);
		*headp = scdp->scd_next;
	}

	if (scdp->scd_next != NULL) {
		scdp->scd_next->scd_prev = scdp->scd_prev;
	}
}

/*
 * Add an scd to the start of the queue.
 */
static void
sfmmu_add_scd(sf_scd_t **headp, sf_scd_t *scdp)
{
	scdp->scd_prev = NULL;
	scdp->scd_next = *headp;
	if (*headp != NULL) {
		(*headp)->scd_prev = scdp;
	}
	*headp = scdp;
}
14705 sfmmu_alloc_scd_tsbs(sf_srd_t
*srdp
, sf_scd_t
*scdp
)
14712 ulong_t tte8k_cnt
= 0;
14713 ulong_t tte4m_cnt
= 0;
14715 sfmmu_t
*scsfmmup
= scdp
->scd_sfmmup
;
14716 sfmmu_t
*ism_hatid
;
14717 struct tsb_info
*newtsb
;
14720 ASSERT(srdp
!= NULL
);
14722 for (i
= 0; i
< SFMMU_RGNMAP_WORDS
; i
++) {
14723 if ((w
= scdp
->scd_region_map
.bitmap
[i
]) == 0) {
14733 rid
= (i
<< BT_ULSHIFT
) | j
;
14737 if (rid
< SFMMU_MAX_HME_REGIONS
) {
14738 rgnp
= srdp
->srd_hmergnp
[rid
];
14739 ASSERT(rgnp
->rgn_id
== rid
);
14740 ASSERT(rgnp
->rgn_refcnt
> 0);
14742 if (rgnp
->rgn_pgszc
< TTE4M
) {
14743 tte8k_cnt
+= rgnp
->rgn_size
>>
14744 TTE_PAGE_SHIFT(TTE8K
);
14746 ASSERT(rgnp
->rgn_pgszc
>= TTE4M
);
14747 tte4m_cnt
+= rgnp
->rgn_size
>>
14748 TTE_PAGE_SHIFT(TTE4M
);
14750 * Inflate SCD tsb0 by preallocating
14751 * 1/4 8k ttecnt for 4M regions to
14752 * allow for lgpg alloc failure.
14754 tte8k_cnt
+= rgnp
->rgn_size
>>
14755 (TTE_PAGE_SHIFT(TTE8K
) + 2);
14758 rid
-= SFMMU_MAX_HME_REGIONS
;
14759 rgnp
= srdp
->srd_ismrgnp
[rid
];
14760 ASSERT(rgnp
->rgn_id
== rid
);
14761 ASSERT(rgnp
->rgn_refcnt
> 0);
14763 ism_hatid
= (sfmmu_t
*)rgnp
->rgn_obj
;
14764 ASSERT(ism_hatid
->sfmmu_ismhat
);
14766 for (szc
= 0; szc
< TTE4M
; szc
++) {
14768 ism_hatid
->sfmmu_ttecnt
[szc
] <<
14769 TTE_BSZS_SHIFT(szc
);
14772 ASSERT(rgnp
->rgn_pgszc
>= TTE4M
);
14773 if (rgnp
->rgn_pgszc
>= TTE4M
) {
14774 tte4m_cnt
+= rgnp
->rgn_size
>>
14775 TTE_PAGE_SHIFT(TTE4M
);
14781 tsb_szc
= SELECT_TSB_SIZECODE(tte8k_cnt
);
14783 /* Allocate both the SCD TSBs here. */
14784 if (sfmmu_tsbinfo_alloc(&scsfmmup
->sfmmu_tsb
,
14785 tsb_szc
, TSB8K
|TSB64K
|TSB512K
, TSB_ALLOC
, scsfmmup
) &&
14786 (tsb_szc
<= TSB_4M_SZCODE
||
14787 sfmmu_tsbinfo_alloc(&scsfmmup
->sfmmu_tsb
,
14788 TSB_4M_SZCODE
, TSB8K
|TSB64K
|TSB512K
,
14789 TSB_ALLOC
, scsfmmup
))) {
14791 SFMMU_STAT(sf_scd_1sttsb_allocfail
);
14792 return (TSB_ALLOCFAIL
);
14794 scsfmmup
->sfmmu_tsb
->tsb_flags
|= TSB_SHAREDCTX
;
14797 tsb_szc
= SELECT_TSB_SIZECODE(tte4m_cnt
);
14798 if (sfmmu_tsbinfo_alloc(&newtsb
, tsb_szc
,
14799 TSB4M
|TSB32M
|TSB256M
, TSB_ALLOC
, scsfmmup
) &&
14800 (tsb_szc
<= TSB_4M_SZCODE
||
14801 sfmmu_tsbinfo_alloc(&newtsb
, TSB_4M_SZCODE
,
14802 TSB4M
|TSB32M
|TSB256M
,
14803 TSB_ALLOC
, scsfmmup
))) {
14805 * If we fail to allocate the 2nd shared tsb,
14806 * just free the 1st tsb, return failure.
14808 sfmmu_tsbinfo_free(scsfmmup
->sfmmu_tsb
);
14809 SFMMU_STAT(sf_scd_2ndtsb_allocfail
);
14810 return (TSB_ALLOCFAIL
);
14812 ASSERT(scsfmmup
->sfmmu_tsb
->tsb_next
== NULL
);
14813 newtsb
->tsb_flags
|= TSB_SHAREDCTX
;
14814 scsfmmup
->sfmmu_tsb
->tsb_next
= newtsb
;
14815 SFMMU_STAT(sf_scd_2ndtsb_alloc
);
14818 SFMMU_STAT(sf_scd_1sttsb_alloc
);
14820 return (TSB_SUCCESS
);
static void
sfmmu_free_scd_tsbs(sfmmu_t *scd_sfmmu)
{
	while (scd_sfmmu->sfmmu_tsb != NULL) {
		struct tsb_info *next = scd_sfmmu->sfmmu_tsb->tsb_next;
		sfmmu_tsbinfo_free(scd_sfmmu->sfmmu_tsb);
		scd_sfmmu->sfmmu_tsb = next;
	}
}
/*
 * Link the sfmmu onto the hme region list.
 */
14837 sfmmu_link_to_hmeregion(sfmmu_t
*sfmmup
, sf_region_t
*rgnp
)
14840 sf_rgn_link_t
*rlink
;
14842 sf_rgn_link_t
*hrlink
;
14844 rid
= rgnp
->rgn_id
;
14845 ASSERT(SFMMU_IS_SHMERID_VALID(rid
));
14847 /* LINTED: constant in conditional context */
14848 SFMMU_HMERID2RLINKP(sfmmup
, rid
, rlink
, 1, 1);
14849 ASSERT(rlink
!= NULL
);
14850 mutex_enter(&rgnp
->rgn_mutex
);
14851 if ((head
= rgnp
->rgn_sfmmu_head
) == NULL
) {
14852 rlink
->next
= NULL
;
14853 rlink
->prev
= NULL
;
14855 * make sure rlink's next field is NULL
14856 * before making this link visible.
14859 rgnp
->rgn_sfmmu_head
= sfmmup
;
14861 /* LINTED: constant in conditional context */
14862 SFMMU_HMERID2RLINKP(head
, rid
, hrlink
, 0, 0);
14863 ASSERT(hrlink
!= NULL
);
14864 ASSERT(hrlink
->prev
== NULL
);
14865 rlink
->next
= head
;
14866 rlink
->prev
= NULL
;
14867 hrlink
->prev
= sfmmup
;
14869 * make sure rlink's next field is correct
14870 * before making this link visible.
14873 rgnp
->rgn_sfmmu_head
= sfmmup
;
14875 mutex_exit(&rgnp
->rgn_mutex
);
/*
 * Unlink the sfmmu from the hme region list.
 */
14882 sfmmu_unlink_from_hmeregion(sfmmu_t
*sfmmup
, sf_region_t
*rgnp
)
14885 sf_rgn_link_t
*rlink
;
14887 rid
= rgnp
->rgn_id
;
14888 ASSERT(SFMMU_IS_SHMERID_VALID(rid
));
14890 /* LINTED: constant in conditional context */
14891 SFMMU_HMERID2RLINKP(sfmmup
, rid
, rlink
, 0, 0);
14892 ASSERT(rlink
!= NULL
);
14893 mutex_enter(&rgnp
->rgn_mutex
);
14894 if (rgnp
->rgn_sfmmu_head
== sfmmup
) {
14895 sfmmu_t
*next
= rlink
->next
;
14896 rgnp
->rgn_sfmmu_head
= next
;
14898 * if we are stopped by xc_attention() after this
14899 * point the forward link walking in
14900 * sfmmu_rgntlb_demap() will work correctly since the
14901 * head correctly points to the next element.
14904 rlink
->next
= NULL
;
14905 ASSERT(rlink
->prev
== NULL
);
14906 if (next
!= NULL
) {
14907 sf_rgn_link_t
*nrlink
;
14908 /* LINTED: constant in conditional context */
14909 SFMMU_HMERID2RLINKP(next
, rid
, nrlink
, 0, 0);
14910 ASSERT(nrlink
!= NULL
);
14911 ASSERT(nrlink
->prev
== sfmmup
);
14912 nrlink
->prev
= NULL
;
14915 sfmmu_t
*next
= rlink
->next
;
14916 sfmmu_t
*prev
= rlink
->prev
;
14917 sf_rgn_link_t
*prlink
;
14919 ASSERT(prev
!= NULL
);
14920 /* LINTED: constant in conditional context */
14921 SFMMU_HMERID2RLINKP(prev
, rid
, prlink
, 0, 0);
14922 ASSERT(prlink
!= NULL
);
14923 ASSERT(prlink
->next
== sfmmup
);
14924 prlink
->next
= next
;
14926 * if we are stopped by xc_attention()
14927 * after this point the forward link walking
14928 * will work correctly since the prev element
14929 * correctly points to the next element.
14932 rlink
->next
= NULL
;
14933 rlink
->prev
= NULL
;
14934 if (next
!= NULL
) {
14935 sf_rgn_link_t
*nrlink
;
14936 /* LINTED: constant in conditional context */
14937 SFMMU_HMERID2RLINKP(next
, rid
, nrlink
, 0, 0);
14938 ASSERT(nrlink
!= NULL
);
14939 ASSERT(nrlink
->prev
== sfmmup
);
14940 nrlink
->prev
= prev
;
14943 mutex_exit(&rgnp
->rgn_mutex
);
14947 * Link scd sfmmu onto ism or hme region list for each region in the
14951 sfmmu_link_scd_to_regions(sf_srd_t
*srdp
, sf_scd_t
*scdp
)
14960 scsfmmup
= scdp
->scd_sfmmup
;
14961 ASSERT(scsfmmup
->sfmmu_scdhat
);
14962 for (i
= 0; i
< SFMMU_RGNMAP_WORDS
; i
++) {
14963 if ((w
= scdp
->scd_region_map
.bitmap
[i
]) == 0) {
14973 rid
= (i
<< BT_ULSHIFT
) | j
;
14977 if (rid
< SFMMU_MAX_HME_REGIONS
) {
14978 rgnp
= srdp
->srd_hmergnp
[rid
];
14979 ASSERT(rgnp
->rgn_id
== rid
);
14980 ASSERT(rgnp
->rgn_refcnt
> 0);
14981 sfmmu_link_to_hmeregion(scsfmmup
, rgnp
);
14983 sfmmu_t
*ism_hatid
= NULL
;
14984 ism_ment_t
*ism_ment
;
14985 rid
-= SFMMU_MAX_HME_REGIONS
;
14986 rgnp
= srdp
->srd_ismrgnp
[rid
];
14987 ASSERT(rgnp
->rgn_id
== rid
);
14988 ASSERT(rgnp
->rgn_refcnt
> 0);
14990 ism_hatid
= (sfmmu_t
*)rgnp
->rgn_obj
;
14991 ASSERT(ism_hatid
->sfmmu_ismhat
);
14992 ism_ment
= &scdp
->scd_ism_links
[rid
];
14993 ism_ment
->iment_hat
= scsfmmup
;
14994 ism_ment
->iment_base_va
= rgnp
->rgn_saddr
;
14995 mutex_enter(&ism_mlist_lock
);
14996 iment_add(ism_ment
, ism_hatid
);
14997 mutex_exit(&ism_mlist_lock
);
15004 * Unlink scd sfmmu from ism or hme region list for each region in the
15008 sfmmu_unlink_scd_from_regions(sf_srd_t
*srdp
, sf_scd_t
*scdp
)
15017 scsfmmup
= scdp
->scd_sfmmup
;
15018 for (i
= 0; i
< SFMMU_RGNMAP_WORDS
; i
++) {
15019 if ((w
= scdp
->scd_region_map
.bitmap
[i
]) == 0) {
15029 rid
= (i
<< BT_ULSHIFT
) | j
;
15033 if (rid
< SFMMU_MAX_HME_REGIONS
) {
15034 rgnp
= srdp
->srd_hmergnp
[rid
];
15035 ASSERT(rgnp
->rgn_id
== rid
);
15036 ASSERT(rgnp
->rgn_refcnt
> 0);
15037 sfmmu_unlink_from_hmeregion(scsfmmup
,
15041 sfmmu_t
*ism_hatid
= NULL
;
15042 ism_ment_t
*ism_ment
;
15043 rid
-= SFMMU_MAX_HME_REGIONS
;
15044 rgnp
= srdp
->srd_ismrgnp
[rid
];
15045 ASSERT(rgnp
->rgn_id
== rid
);
15046 ASSERT(rgnp
->rgn_refcnt
> 0);
15048 ism_hatid
= (sfmmu_t
*)rgnp
->rgn_obj
;
15049 ASSERT(ism_hatid
->sfmmu_ismhat
);
15050 ism_ment
= &scdp
->scd_ism_links
[rid
];
15051 ASSERT(ism_ment
->iment_hat
== scdp
->scd_sfmmup
);
15052 ASSERT(ism_ment
->iment_base_va
==
15054 mutex_enter(&ism_mlist_lock
);
15055 iment_sub(ism_ment
, ism_hatid
);
15056 mutex_exit(&ism_mlist_lock
);
/*
 * Allocates and initialises a new SCD structure. It is called with
 * the srd_scd_mutex held and returns with the reference count
 * initialised to 1.
 */
15068 sfmmu_alloc_scd(sf_srd_t
*srdp
, sf_region_map_t
*new_map
)
15070 sf_scd_t
*new_scdp
;
15074 ASSERT(MUTEX_HELD(&srdp
->srd_scd_mutex
));
15075 new_scdp
= kmem_cache_alloc(scd_cache
, KM_SLEEP
);
15077 scsfmmup
= kmem_cache_alloc(sfmmuid_cache
, KM_SLEEP
);
15078 new_scdp
->scd_sfmmup
= scsfmmup
;
15079 scsfmmup
->sfmmu_srdp
= srdp
;
15080 scsfmmup
->sfmmu_scdp
= new_scdp
;
15081 scsfmmup
->sfmmu_tsb0_4minflcnt
= 0;
15082 scsfmmup
->sfmmu_scdhat
= 1;
15083 CPUSET_ALL(scsfmmup
->sfmmu_cpusran
);
15084 bzero(scsfmmup
->sfmmu_hmeregion_links
, SFMMU_L1_HMERLINKS_SIZE
);
15086 ASSERT(max_mmu_ctxdoms
> 0);
15087 for (i
= 0; i
< max_mmu_ctxdoms
; i
++) {
15088 scsfmmup
->sfmmu_ctxs
[i
].cnum
= INVALID_CONTEXT
;
15089 scsfmmup
->sfmmu_ctxs
[i
].gnum
= 0;
15092 for (i
= 0; i
< MMU_PAGE_SIZES
; i
++) {
15093 new_scdp
->scd_rttecnt
[i
] = 0;
15096 new_scdp
->scd_region_map
= *new_map
;
15097 new_scdp
->scd_refcnt
= 1;
15098 if (sfmmu_alloc_scd_tsbs(srdp
, new_scdp
) != TSB_SUCCESS
) {
15099 kmem_cache_free(scd_cache
, new_scdp
);
15100 kmem_cache_free(sfmmuid_cache
, scsfmmup
);
15103 if (&mmu_init_scd
) {
15104 mmu_init_scd(new_scdp
);
/*
 * The first phase of a process joining an SCD. The hat structure is
 * linked to the SCD queue and then the HAT_JOIN_SCD sfmmu flag is set
 * and a cross-call with context invalidation is used to cause the
 * remaining work to be carried out in the sfmmu_tsbmiss_exception()
 * handler.
 */
15117 sfmmu_join_scd(sf_scd_t
*scdp
, sfmmu_t
*sfmmup
)
15119 hatlock_t
*hatlockp
;
15120 sf_srd_t
*srdp
= sfmmup
->sfmmu_srdp
;
15122 sf_scd_t
*old_scdp
;
15124 ASSERT(srdp
!= NULL
);
15125 ASSERT(scdp
!= NULL
);
15126 ASSERT(scdp
->scd_refcnt
> 0);
15127 ASSERT(AS_WRITE_HELD(sfmmup
->sfmmu_as
));
15129 if ((old_scdp
= sfmmup
->sfmmu_scdp
) != NULL
) {
15130 ASSERT(old_scdp
!= scdp
);
15132 mutex_enter(&old_scdp
->scd_mutex
);
15133 sfmmu_from_scd_list(&old_scdp
->scd_sf_list
, sfmmup
);
15134 mutex_exit(&old_scdp
->scd_mutex
);
15136 * sfmmup leaves the old scd. Update sfmmu_ttecnt to
15137 * include the shme rgn ttecnt for rgns that
15138 * were in the old SCD
15140 for (i
= 0; i
< mmu_page_sizes
; i
++) {
15141 ASSERT(sfmmup
->sfmmu_scdrttecnt
[i
] ==
15142 old_scdp
->scd_rttecnt
[i
]);
15143 atomic_add_long(&sfmmup
->sfmmu_ttecnt
[i
],
15144 sfmmup
->sfmmu_scdrttecnt
[i
]);
15149 * Move sfmmu to the scd lists.
15151 mutex_enter(&scdp
->scd_mutex
);
15152 sfmmu_to_scd_list(&scdp
->scd_sf_list
, sfmmup
);
15153 mutex_exit(&scdp
->scd_mutex
);
15154 SF_SCD_INCR_REF(scdp
);
15156 hatlockp
= sfmmu_hat_enter(sfmmup
);
15158 * For a multi-thread process, we must stop
15159 * all the other threads before joining the scd.
15162 SFMMU_FLAGS_SET(sfmmup
, HAT_JOIN_SCD
);
15164 sfmmu_invalidate_ctx(sfmmup
);
15165 sfmmup
->sfmmu_scdp
= scdp
;
15168 * Copy scd_rttecnt into sfmmup's sfmmu_scdrttecnt, and update
15169 * sfmmu_ttecnt to not include the rgn ttecnt just joined in SCD.
15171 for (i
= 0; i
< mmu_page_sizes
; i
++) {
15172 sfmmup
->sfmmu_scdrttecnt
[i
] = scdp
->scd_rttecnt
[i
];
15173 ASSERT(sfmmup
->sfmmu_ttecnt
[i
] >= scdp
->scd_rttecnt
[i
]);
15174 atomic_add_long(&sfmmup
->sfmmu_ttecnt
[i
],
15175 -sfmmup
->sfmmu_scdrttecnt
[i
]);
15177 /* update tsb0 inflation count */
15178 if (old_scdp
!= NULL
) {
15179 sfmmup
->sfmmu_tsb0_4minflcnt
+=
15180 old_scdp
->scd_sfmmup
->sfmmu_tsb0_4minflcnt
;
15182 ASSERT(sfmmup
->sfmmu_tsb0_4minflcnt
>=
15183 scdp
->scd_sfmmup
->sfmmu_tsb0_4minflcnt
);
15184 sfmmup
->sfmmu_tsb0_4minflcnt
-= scdp
->scd_sfmmup
->sfmmu_tsb0_4minflcnt
;
15186 sfmmu_hat_exit(hatlockp
);
15188 if (old_scdp
!= NULL
) {
15189 SF_SCD_DECR_REF(srdp
, old_scdp
);
/*
 * This routine is called by a process to become part of an SCD. It is called
 * from sfmmu_tsbmiss_exception() once most of the initial work has been
 * done by sfmmu_join_scd(). This routine must not drop the hat lock.
 */
15200 sfmmu_finish_join_scd(sfmmu_t
*sfmmup
)
15202 struct tsb_info
*tsbinfop
;
15204 ASSERT(sfmmu_hat_lock_held(sfmmup
));
15205 ASSERT(sfmmup
->sfmmu_scdp
!= NULL
);
15206 ASSERT(SFMMU_FLAGS_ISSET(sfmmup
, HAT_JOIN_SCD
));
15207 ASSERT(!SFMMU_FLAGS_ISSET(sfmmup
, HAT_ISMBUSY
));
15208 ASSERT(SFMMU_FLAGS_ISSET(sfmmup
, HAT_ALLCTX_INVALID
));
15210 for (tsbinfop
= sfmmup
->sfmmu_tsb
; tsbinfop
!= NULL
;
15211 tsbinfop
= tsbinfop
->tsb_next
) {
15212 if (tsbinfop
->tsb_flags
& TSB_SWAPPED
) {
15215 ASSERT(!(tsbinfop
->tsb_flags
& TSB_RELOC_FLAG
));
15217 sfmmu_inv_tsb(tsbinfop
->tsb_va
,
15218 TSB_BYTES(tsbinfop
->tsb_szc
));
15221 /* Set HAT_CTX1_FLAG for all SCD ISMs */
15222 sfmmu_ism_hatflags(sfmmup
, 1);
15224 SFMMU_STAT(sf_join_scd
);
/*
 * This routine is called in order to check if there is an SCD which matches
 * the process's region map; if not, a new SCD may be created.
 */
15232 sfmmu_find_scd(sfmmu_t
*sfmmup
)
15234 sf_srd_t
*srdp
= sfmmup
->sfmmu_srdp
;
15235 sf_scd_t
*scdp
, *new_scdp
;
15238 ASSERT(srdp
!= NULL
);
15239 ASSERT(AS_WRITE_HELD(sfmmup
->sfmmu_as
));
15241 mutex_enter(&srdp
->srd_scd_mutex
);
15242 for (scdp
= srdp
->srd_scdp
; scdp
!= NULL
;
15243 scdp
= scdp
->scd_next
) {
15244 SF_RGNMAP_EQUAL(&scdp
->scd_region_map
,
15245 &sfmmup
->sfmmu_region_map
, ret
);
15247 SF_SCD_INCR_REF(scdp
);
15248 mutex_exit(&srdp
->srd_scd_mutex
);
15249 sfmmu_join_scd(scdp
, sfmmup
);
15250 ASSERT(scdp
->scd_refcnt
>= 2);
15251 atomic_dec_32((volatile uint32_t *)&scdp
->scd_refcnt
);
15255 * If the sfmmu region map is a subset of the scd
15256 * region map, then the assumption is that this process
15257 * will continue attaching to ISM segments until the
15258 * region maps are equal.
15260 SF_RGNMAP_IS_SUBSET(&scdp
->scd_region_map
,
15261 &sfmmup
->sfmmu_region_map
, ret
);
15263 mutex_exit(&srdp
->srd_scd_mutex
);
15269 ASSERT(scdp
== NULL
);
15271 * No matching SCD has been found, create a new one.
15273 if ((new_scdp
= sfmmu_alloc_scd(srdp
, &sfmmup
->sfmmu_region_map
)) ==
15275 mutex_exit(&srdp
->srd_scd_mutex
);
15280 * sfmmu_alloc_scd() returns with a ref count of 1 on the scd.
15283 /* Set scd_rttecnt for shme rgns in SCD */
15284 sfmmu_set_scd_rttecnt(srdp
, new_scdp
);
15287 * Link scd onto srd_scdp list and scd sfmmu onto region/iment lists.
15289 sfmmu_link_scd_to_regions(srdp
, new_scdp
);
15290 sfmmu_add_scd(&srdp
->srd_scdp
, new_scdp
);
15291 SFMMU_STAT_ADD(sf_create_scd
, 1);
15293 mutex_exit(&srdp
->srd_scd_mutex
);
15294 sfmmu_join_scd(new_scdp
, sfmmup
);
15295 ASSERT(new_scdp
->scd_refcnt
>= 2);
15296 atomic_dec_32((volatile uint32_t *)&new_scdp
->scd_refcnt
);
/*
 * This routine is called by a process to remove itself from an SCD. It is
 * called either when the process has detached from a segment or from
 * hat_free_start() as a result of calling exit.
 */
15305 sfmmu_leave_scd(sfmmu_t
*sfmmup
, uchar_t r_type
)
15307 sf_scd_t
*scdp
= sfmmup
->sfmmu_scdp
;
15308 sf_srd_t
*srdp
= sfmmup
->sfmmu_srdp
;
15309 hatlock_t
*hatlockp
= TSB_HASH(sfmmup
);
15312 ASSERT(scdp
!= NULL
);
15313 ASSERT(srdp
!= NULL
);
15315 if (sfmmup
->sfmmu_free
) {
15317 * If the process is part of an SCD the sfmmu is unlinked
15318 * from scd_sf_list.
15320 mutex_enter(&scdp
->scd_mutex
);
15321 sfmmu_from_scd_list(&scdp
->scd_sf_list
, sfmmup
);
15322 mutex_exit(&scdp
->scd_mutex
);
15324 * Update sfmmu_ttecnt to include the rgn ttecnt for rgns that
15325 * are about to leave the SCD
15327 for (i
= 0; i
< mmu_page_sizes
; i
++) {
15328 ASSERT(sfmmup
->sfmmu_scdrttecnt
[i
] ==
15329 scdp
->scd_rttecnt
[i
]);
15330 atomic_add_long(&sfmmup
->sfmmu_ttecnt
[i
],
15331 sfmmup
->sfmmu_scdrttecnt
[i
]);
15332 sfmmup
->sfmmu_scdrttecnt
[i
] = 0;
15334 sfmmup
->sfmmu_scdp
= NULL
;
15336 SF_SCD_DECR_REF(srdp
, scdp
);
15340 ASSERT(r_type
!= SFMMU_REGION_ISM
||
15341 SFMMU_FLAGS_ISSET(sfmmup
, HAT_ISMBUSY
));
15342 ASSERT(scdp
->scd_refcnt
);
15343 ASSERT(!sfmmup
->sfmmu_free
);
15344 ASSERT(sfmmu_hat_lock_held(sfmmup
));
15345 ASSERT(AS_LOCK_HELD(sfmmup
->sfmmu_as
));
15348 * Wait for ISM maps to be updated.
15350 if (r_type
!= SFMMU_REGION_ISM
) {
15351 while (SFMMU_FLAGS_ISSET(sfmmup
, HAT_ISMBUSY
) &&
15352 sfmmup
->sfmmu_scdp
!= NULL
) {
15353 cv_wait(&sfmmup
->sfmmu_tsb_cv
,
15354 HATLOCK_MUTEXP(hatlockp
));
15357 if (sfmmup
->sfmmu_scdp
== NULL
) {
15358 sfmmu_hat_exit(hatlockp
);
15361 SFMMU_FLAGS_SET(sfmmup
, HAT_ISMBUSY
);
15364 if (SFMMU_FLAGS_ISSET(sfmmup
, HAT_JOIN_SCD
)) {
15365 SFMMU_FLAGS_CLEAR(sfmmup
, HAT_JOIN_SCD
);
15367 * Since HAT_JOIN_SCD was set our context
15368 * is still invalid.
15372 * For a multi-thread process, we must stop
15373 * all the other threads before leaving the scd.
15376 sfmmu_invalidate_ctx(sfmmup
);
15379 /* Clear all the rid's for ISM, delete flags, etc */
15380 ASSERT(SFMMU_FLAGS_ISSET(sfmmup
, HAT_ISMBUSY
));
15381 sfmmu_ism_hatflags(sfmmup
, 0);
15384 * Update sfmmu_ttecnt to include the rgn ttecnt for rgns that
15385 * are in SCD before this sfmmup leaves the SCD.
15387 for (i
= 0; i
< mmu_page_sizes
; i
++) {
15388 ASSERT(sfmmup
->sfmmu_scdrttecnt
[i
] ==
15389 scdp
->scd_rttecnt
[i
]);
15390 atomic_add_long(&sfmmup
->sfmmu_ttecnt
[i
],
15391 sfmmup
->sfmmu_scdrttecnt
[i
]);
15392 sfmmup
->sfmmu_scdrttecnt
[i
] = 0;
15393 /* update ismttecnt to include SCD ism before hat leaves SCD */
15394 sfmmup
->sfmmu_ismttecnt
[i
] += sfmmup
->sfmmu_scdismttecnt
[i
];
15395 sfmmup
->sfmmu_scdismttecnt
[i
] = 0;
15397 /* update tsb0 inflation count */
15398 sfmmup
->sfmmu_tsb0_4minflcnt
+= scdp
->scd_sfmmup
->sfmmu_tsb0_4minflcnt
;
15400 if (r_type
!= SFMMU_REGION_ISM
) {
15401 SFMMU_FLAGS_CLEAR(sfmmup
, HAT_ISMBUSY
);
15403 sfmmup
->sfmmu_scdp
= NULL
;
15405 sfmmu_hat_exit(hatlockp
);
15408 * Unlink sfmmu from scd_sf_list this can be done without holding
15409 * the hat lock as we hold the sfmmu_as lock which prevents
15410 * hat_join_region from adding this thread to the scd again. Other
15411 * threads check if sfmmu_scdp is NULL under hat lock and if it's NULL
15412 * they won't get here, since sfmmu_leave_scd() clears sfmmu_scdp
15413 * while holding the hat lock.
15415 mutex_enter(&scdp
->scd_mutex
);
15416 sfmmu_from_scd_list(&scdp
->scd_sf_list
, sfmmup
);
15417 mutex_exit(&scdp
->scd_mutex
);
15418 SFMMU_STAT(sf_leave_scd
);
15420 SF_SCD_DECR_REF(srdp
, scdp
);
15421 hatlockp
= sfmmu_hat_enter(sfmmup
);
/*
 * Unlink and free up an SCD structure with a reference count of 0.
 */
15429 sfmmu_destroy_scd(sf_srd_t
*srdp
, sf_scd_t
*scdp
, sf_region_map_t
*scd_rmap
)
15433 hatlock_t
*shatlockp
;
15436 mutex_enter(&srdp
->srd_scd_mutex
);
15437 for (sp
= srdp
->srd_scdp
; sp
!= NULL
; sp
= sp
->scd_next
) {
15441 if (sp
== NULL
|| sp
->scd_refcnt
) {
15442 mutex_exit(&srdp
->srd_scd_mutex
);
15447 * It is possible that the scd has been freed and reallocated with a
15448 * different region map while we've been waiting for the srd_scd_mutex.
15450 SF_RGNMAP_EQUAL(scd_rmap
, &sp
->scd_region_map
, ret
);
15452 mutex_exit(&srdp
->srd_scd_mutex
);
15456 ASSERT(scdp
->scd_sf_list
== NULL
);
15458 * Unlink scd from srd_scdp list.
15460 sfmmu_remove_scd(&srdp
->srd_scdp
, scdp
);
15461 mutex_exit(&srdp
->srd_scd_mutex
);
15463 sfmmu_unlink_scd_from_regions(srdp
, scdp
);
15465 /* Clear shared context tsb and release ctx */
15466 scsfmmup
= scdp
->scd_sfmmup
;
15469 * create a barrier so that scd will not be destroyed
15470 * if other thread still holds the same shared hat lock.
15471 * E.g., sfmmu_tsbmiss_exception() needs to acquire the
15472 * shared hat lock before checking the shared tsb reloc flag.
15474 shatlockp
= sfmmu_hat_enter(scsfmmup
);
15475 sfmmu_hat_exit(shatlockp
);
15477 sfmmu_free_scd_tsbs(scsfmmup
);
15479 for (i
= 0; i
< SFMMU_L1_HMERLINKS
; i
++) {
15480 if (scsfmmup
->sfmmu_hmeregion_links
[i
] != NULL
) {
15481 kmem_free(scsfmmup
->sfmmu_hmeregion_links
[i
],
15482 SFMMU_L2_HMERLINKS_SIZE
);
15483 scsfmmup
->sfmmu_hmeregion_links
[i
] = NULL
;
15486 kmem_cache_free(sfmmuid_cache
, scsfmmup
);
15487 kmem_cache_free(scd_cache
, scdp
);
15488 SFMMU_STAT(sf_destroy_scd
);
/*
 * Modifies the HAT_CTX1_FLAG for each of the ISM segments which correspond to
 * bits which are set in the ism_region_map parameter. This flag indicates to
 * the tsbmiss handler that mapping for these segments should be loaded using
 * the shared context.
 */
15498 sfmmu_ism_hatflags(sfmmu_t
*sfmmup
, int addflag
)
15500 sf_scd_t
*scdp
= sfmmup
->sfmmu_scdp
;
15501 ism_blk_t
*ism_blkp
;
15502 ism_map_t
*ism_map
;
15505 ASSERT(sfmmup
->sfmmu_iblk
!= NULL
);
15506 ASSERT(scdp
!= NULL
);
15508 * Note that the caller either set HAT_ISMBUSY flag or checked
15509 * under hat lock that HAT_ISMBUSY was not set by another thread.
15511 ASSERT(sfmmu_hat_lock_held(sfmmup
));
15513 ism_blkp
= sfmmup
->sfmmu_iblk
;
15514 while (ism_blkp
!= NULL
) {
15515 ism_map
= ism_blkp
->iblk_maps
;
15516 for (i
= 0; ism_map
[i
].imap_ismhat
&& i
< ISM_MAP_SLOTS
; i
++) {
15517 rid
= ism_map
[i
].imap_rid
;
15518 if (rid
== SFMMU_INVALID_ISMRID
) {
15521 ASSERT(rid
>= 0 && rid
< SFMMU_MAX_ISM_REGIONS
);
15522 if (SF_RGNMAP_TEST(scdp
->scd_ismregion_map
, rid
) &&
15524 ism_map
[i
].imap_hatflags
|=
15527 ism_map
[i
].imap_hatflags
&=
15531 ism_blkp
= ism_blkp
->iblk_next
;
int
sfmmu_srd_lock_held(sf_srd_t *srdp)
{
	return (MUTEX_HELD(&srdp->srd_mutex));
}

/* ARGSUSED */
static int
sfmmu_scdcache_constructor(void *buf, void *cdrarg, int kmflags)
{
	sf_scd_t *scdp = (sf_scd_t *)buf;

	bzero(buf, sizeof (sf_scd_t));
	mutex_init(&scdp->scd_mutex, NULL, MUTEX_DEFAULT, NULL);
	return (0);
}

/* ARGSUSED */
static void
sfmmu_scdcache_destructor(void *buf, void *cdrarg)
{
	sf_scd_t *scdp = (sf_scd_t *)buf;

	mutex_destroy(&scdp->scd_mutex);
}
/*
 * The listp parameter is a pointer to a list of hmeblks which are partially
 * freed as a result of calling sfmmu_hblk_hash_rm(). The last phase of the
 * freeing process is to cross-call all cpus to ensure that there are no
 * remaining cached references.
 *
 * If the local generation number is less than the global one then we can free
 * hmeblks which are already on the pending queue as another cpu has completed
 * the cross-call.
 *
 * We cross-call to make sure that there are no threads on other cpus accessing
 * these hmeblks and then complete the process of freeing them under the
 * following conditions:
 *	The total number of pending hmeblks is greater than the threshold
 *	The reserve list has fewer than HBLK_RESERVE_CNT hmeblks
 *	It is at least 1 second since the last time we cross-called
 *
 * Otherwise, we add the hmeblks to the per-cpu pending queue.
 */
15581 sfmmu_hblks_list_purge(struct hme_blk
**listp
, int dontfree
)
15583 struct hme_blk
*hblkp
, *pr_hblkp
= NULL
;
15585 cpuset_t cpuset
= cpu_ready_set
;
15586 cpu_hme_pend_t
*cpuhp
;
15588 int one_second_expired
= 0;
15590 gethrestime_lasttick(&now
);
15592 for (hblkp
= *listp
; hblkp
!= NULL
; hblkp
= hblkp
->hblk_next
) {
15593 ASSERT(hblkp
->hblk_shw_bit
== 0);
15594 ASSERT(hblkp
->hblk_shared
== 0);
15599 cpuhp
= &cpu_hme_pend
[CPU
->cpu_seqid
];
15600 mutex_enter(&cpuhp
->chp_mutex
);
15602 if ((cpuhp
->chp_count
+ count
) == 0) {
15603 mutex_exit(&cpuhp
->chp_mutex
);
15607 if ((now
.tv_sec
- cpuhp
->chp_timestamp
) > 1) {
15608 one_second_expired
= 1;
15611 if (!dontfree
&& (freehblkcnt
< HBLK_RESERVE_CNT
||
15612 (cpuhp
->chp_count
+ count
) > cpu_hme_pend_thresh
||
15613 one_second_expired
)) {
15614 /* Append global list to local */
15615 if (pr_hblkp
== NULL
) {
15616 *listp
= cpuhp
->chp_listp
;
15618 pr_hblkp
->hblk_next
= cpuhp
->chp_listp
;
15620 cpuhp
->chp_listp
= NULL
;
15621 cpuhp
->chp_count
= 0;
15622 cpuhp
->chp_timestamp
= now
.tv_sec
;
15623 mutex_exit(&cpuhp
->chp_mutex
);
15625 kpreempt_disable();
15626 CPUSET_DEL(cpuset
, CPU
->cpu_id
);
15632 * At this stage we know that no trap handlers on other
15633 * cpus can have references to hmeblks on the list.
15635 sfmmu_hblk_free(listp
);
15636 } else if (*listp
!= NULL
) {
15637 pr_hblkp
->hblk_next
= cpuhp
->chp_listp
;
15638 cpuhp
->chp_listp
= *listp
;
15639 cpuhp
->chp_count
+= count
;
15641 mutex_exit(&cpuhp
->chp_mutex
);
15643 mutex_exit(&cpuhp
->chp_mutex
);
/*
 * Add an hmeblk to the hash list.
 */
static void
sfmmu_hblk_hash_add(struct hmehash_bucket *hmebp, struct hme_blk *hmeblkp,
	uint64_t hblkpa)
{
	ASSERT(SFMMU_HASH_LOCK_ISHELD(hmebp));
#ifdef	DEBUG
	if (hmebp->hmeblkp == NULL) {
		ASSERT(hmebp->hmeh_nextpa == HMEBLK_ENDPA);
	}
#endif /* DEBUG */

	hmeblkp->hblk_nextpa = hmebp->hmeh_nextpa;
	/*
	 * Since the TSB miss handler now does not lock the hash chain before
	 * walking it, make sure that the hmeblk's nextpa is globally visible
	 * before we make the hmeblk itself visible by updating the chain root
	 * pointer in the hash bucket.
	 */
	membar_producer();

	hmebp->hmeh_nextpa = hblkpa;
	hmeblkp->hblk_next = hmebp->hmeblkp;
	hmebp->hmeblkp = hmeblkp;
}
/*
 * This function is the first part of a 2 part process to remove an hmeblk
 * from the hash chain. In this phase we unlink the hmeblk from the hash chain
 * but leave the next physical pointer unchanged. The hmeblk is then linked onto
 * a per-cpu pending list using the virtual address pointer.
 *
 * TSB miss trap handlers that start after this phase will no longer see
 * this hmeblk. TSB miss handlers that still cache this hmeblk in a register
 * can still use it for further chain traversal because we haven't yet modified
 * the next physical pointer or freed it.
 *
 * In the second phase of hmeblk removal we'll issue a barrier xcall before
 * we reuse or free this hmeblk. This will make sure all lingering references to
 * the hmeblk after the first phase disappear before we finally reclaim it.
 * This scheme eliminates the need for TSB miss handlers to lock hmeblk chains
 * during their traversal.
 *
 * The hmehash_mutex must be held when calling this function.
 *
 *	 hmebp - hme hash bucket pointer
 *	 hmeblkp - address of hmeblk to be removed
 *	 pr_hblk - virtual address of previous hmeblkp
 *	 listp - pointer to list of hmeblks linked by virtual address
 *	 free_now flag - indicates that a complete removal from the hash chains
 *	 is required.
 *
 * It is inefficient to use the free_now flag as a cross-call is required to
 * remove a single hmeblk from the hash chain but is necessary when hmeblks are
 */
void
sfmmu_hblk_hash_rm(struct hmehash_bucket *hmebp, struct hme_blk *hmeblkp,
    struct hme_blk *pr_hblk, struct hme_blk **listp,
    int free_now)
{
	int		shw_size, vshift;
	struct hme_blk	*shw_hblkp;
	uint_t		shw_mask, newshw_mask;
	caddr_t		vaddr;
	int		size;
	cpuset_t	cpuset = cpu_ready_set;

	ASSERT(SFMMU_HASH_LOCK_ISHELD(hmebp));

	/* Unlink from the hash chain by virtual address. */
	if (hmebp->hmeblkp == hmeblkp) {
		hmebp->hmeh_nextpa = hmeblkp->hblk_nextpa;
		hmebp->hmeblkp = hmeblkp->hblk_next;
	} else {
		pr_hblk->hblk_nextpa = hmeblkp->hblk_nextpa;
		pr_hblk->hblk_next = hmeblkp->hblk_next;
	}

	size = get_hblk_ttesz(hmeblkp);
	shw_hblkp = hmeblkp->hblk_shadow;
	if (shw_hblkp) {
		ASSERT(hblktosfmmu(hmeblkp) != KHATID);
		ASSERT(!hmeblkp->hblk_shared);
		if (mmu_page_sizes == max_mmu_page_sizes) {
			ASSERT(size < TTE256M);
		} else {
			ASSERT(size < TTE4M);
		}

		shw_size = get_hblk_ttesz(shw_hblkp);
		vaddr = (caddr_t)get_hblk_base(hmeblkp);
		vshift = vaddr_to_vshift(shw_hblkp->hblk_tag, vaddr, shw_size);
		ASSERT(vshift < 8);
		/*
		 * Atomically clear shadow mask bit
		 */
		do {
			shw_mask = shw_hblkp->hblk_shw_mask;
			ASSERT(shw_mask & (1 << vshift));
			newshw_mask = shw_mask & ~(1 << vshift);
			newshw_mask = atomic_cas_32(&shw_hblkp->hblk_shw_mask,
			    shw_mask, newshw_mask);
		} while (newshw_mask != shw_mask);
		hmeblkp->hblk_shadow = NULL;
	}
	hmeblkp->hblk_shw_bit = 0;

	if (hmeblkp->hblk_shared) {
#ifdef	DEBUG
		sf_srd_t	*srdp;
		sf_region_t	*rgnp;
		uint_t		rid;

		srdp = hblktosrd(hmeblkp);
		ASSERT(srdp != NULL && srdp->srd_refcnt != 0);
		rid = hmeblkp->hblk_tag.htag_rid;
		ASSERT(SFMMU_IS_SHMERID_VALID(rid));
		ASSERT(rid < SFMMU_MAX_HME_REGIONS);
		rgnp = srdp->srd_hmergnp[rid];
		ASSERT(rgnp != NULL);
		SFMMU_VALIDATE_SHAREDHBLK(hmeblkp, srdp, rgnp, rid);
#endif /* DEBUG */
		hmeblkp->hblk_shared = 0;
	}
	if (free_now) {
		/*
		 * Cross-call so that no other cpu still holds a cached
		 * reference, then terminate the chain pointers.
		 */
		kpreempt_disable();
		CPUSET_DEL(cpuset, CPU->cpu_id);
		xt_sync(cpuset);
		kpreempt_enable();

		hmeblkp->hblk_nextpa = HMEBLK_ENDPA;
		hmeblkp->hblk_next = NULL;
	} else {
		/* Append hmeblkp to listp for processing later. */
		hmeblkp->hblk_next = *listp;
		*listp = hmeblkp;
	}
}
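/*
 * Illustrative sketch, not from the original source: the intended two-phase
 * usage is to unlink a batch of hmeblks onto a local list while the bucket
 * lock is held and then pay for the cross-call once via
 * sfmmu_hblks_list_purge(). Variable names below are hypothetical:
 *
 *	struct hme_blk *list = NULL;
 *
 *	SFMMU_HASH_LOCK(hmebp);
 *	... locate hmeblkp and its predecessor pr_hblk on the chain ...
 *	sfmmu_hblk_hash_rm(hmebp, hmeblkp, pr_hblk, &list, 0);
 *	SFMMU_HASH_UNLOCK(hmebp);
 *
 *	sfmmu_hblks_list_purge(&list, 0);
 *
 * Passing free_now as 1 instead performs the xcall immediately for that
 * single hmeblk, which the comment above notes is the expensive path.
 */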
/*
 * This routine is called when memory is in short supply and returns a free
 * hmeblk of the requested size from the cpu pending lists.
 */
static struct hme_blk *
sfmmu_check_pending_hblks(int size)
{
	int i;
	struct hme_blk *hmeblkp = NULL, *last_hmeblkp;
	int found_hmeblk = 0;
	cpuset_t cpuset = cpu_ready_set;
	cpu_hme_pend_t *cpuhp;

	/* Flush cpu hblk pending queues */
	for (i = 0; i < NCPU; i++) {
		cpuhp = &cpu_hme_pend[i];
		if (cpuhp->chp_listp != NULL) {
			mutex_enter(&cpuhp->chp_mutex);
			if (cpuhp->chp_listp == NULL) {
				mutex_exit(&cpuhp->chp_mutex);
				continue;
			}
			found_hmeblk = 0;
			last_hmeblkp = NULL;
			for (hmeblkp = cpuhp->chp_listp; hmeblkp != NULL;
			    hmeblkp = hmeblkp->hblk_next) {
				if (get_hblk_ttesz(hmeblkp) == size) {
					/*
					 * Unlink the match from this cpu's
					 * pending list and stop searching.
					 */
					if (last_hmeblkp == NULL) {
						cpuhp->chp_listp =
						    hmeblkp->hblk_next;
					} else {
						last_hmeblkp->hblk_next =
						    hmeblkp->hblk_next;
					}
					ASSERT(cpuhp->chp_count > 0);
					cpuhp->chp_count--;
					found_hmeblk = 1;
					break;
				} else {
					last_hmeblkp = hmeblkp;
				}
			}
			mutex_exit(&cpuhp->chp_mutex);

			if (found_hmeblk) {
				kpreempt_disable();
				CPUSET_DEL(cpuset, CPU->cpu_id);