/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 1993, 2010, Oracle and/or its affiliates. All rights reserved.
 */
/*
 * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
 */
/*
 * VM - Hardware Address Translation management for Spitfire MMU.
 *
 * This file implements the machine specific hardware translation
 * needed by the VM system.  The machine independent interface is
 * described in <vm/hat.h> while the machine dependent interface
 * and data structures are described in <vm/hat_sfmmu.h>.
 *
 * The hat layer manages the address translation hardware as a cache
 * driven by calls from the higher levels in the VM system.
 */
#include <sys/types.h>
#include <sys/kstat.h>
#include <vm/hat_sfmmu.h>
#include <sys/systm.h>
#include <sys/sysmacros.h>
#include <sys/machparam.h>
#include <sys/vtrace.h>
#include <sys/cmn_err.h>
#include <sys/cpuvar.h>
#include <sys/debug.h>
#include <sys/archsystm.h>
#include <sys/machsystm.h>
#include <sys/vmsystm.h>
#include <vm/seg_kp.h>
#include <vm/seg_kmem.h>
#include <vm/seg_kpm.h>
#include <sys/t_lock.h>
#include <sys/obpdefs.h>
#include <sys/vm_machparam.h>
#include <sys/machtrap.h>
#include <sys/bitmap.h>
#include <sys/machlock.h>
#include <sys/membar.h>
#include <sys/atomic.h>
#include <sys/cpu_module.h>
#include <sys/prom_debug.h>
#include <sys/ksynch.h>
#include <sys/mem_config.h>
#include <sys/mem_cage.h>
#include <vm/vm_dep.h>
#include <sys/fpu/fpusystm.h>
#include <vm/mach_kpm.h>
#include <sys/callb.h>
#ifdef	DEBUG
#define	SFMMU_VALIDATE_HMERID(hat, rid, saddr, len)			\
	if (SFMMU_IS_SHMERID_VALID(rid)) {				\
		caddr_t _eaddr = (saddr) + (len);			\
		sf_srd_t *_srdp;					\
		sf_region_t *_rgnp;					\
		ASSERT((rid) < SFMMU_MAX_HME_REGIONS);			\
		ASSERT(SF_RGNMAP_TEST(hat->sfmmu_hmeregion_map, rid));	\
		ASSERT((hat) != ksfmmup);				\
		_srdp = (hat)->sfmmu_srdp;				\
		ASSERT(_srdp != NULL);					\
		ASSERT(_srdp->srd_refcnt != 0);				\
		_rgnp = _srdp->srd_hmergnp[(rid)];			\
		ASSERT(_rgnp != NULL && _rgnp->rgn_id == rid);		\
		ASSERT(_rgnp->rgn_refcnt != 0);				\
		ASSERT(!(_rgnp->rgn_flags & SFMMU_REGION_FREE));	\
		ASSERT((_rgnp->rgn_flags & SFMMU_REGION_TYPE_MASK) ==	\
		    SFMMU_REGION_HME);					\
		ASSERT((saddr) >= _rgnp->rgn_saddr);			\
		ASSERT((saddr) < _rgnp->rgn_saddr + _rgnp->rgn_size);	\
		ASSERT(_eaddr > _rgnp->rgn_saddr);			\
		ASSERT(_eaddr <= _rgnp->rgn_saddr + _rgnp->rgn_size);	\
	}
#define	SFMMU_VALIDATE_SHAREDHBLK(hmeblkp, srdp, rgnp, rid)		\
{									\
	caddr_t _hsva;							\
	caddr_t _heva;							\
	caddr_t _rsva;							\
	caddr_t _reva;							\
	int	_ttesz = get_hblk_ttesz(hmeblkp);			\
	int	_flagtte;						\
	ASSERT((srdp)->srd_refcnt != 0);				\
	ASSERT((rid) < SFMMU_MAX_HME_REGIONS);				\
	ASSERT((rgnp)->rgn_id == rid);					\
	ASSERT(!((rgnp)->rgn_flags & SFMMU_REGION_FREE));		\
	ASSERT(((rgnp)->rgn_flags & SFMMU_REGION_TYPE_MASK) ==		\
	    SFMMU_REGION_HME);						\
	ASSERT(_ttesz <= (rgnp)->rgn_pgszc);				\
	_hsva = (caddr_t)get_hblk_base(hmeblkp);			\
	_heva = get_hblk_endaddr(hmeblkp);				\
	_rsva = (caddr_t)P2ALIGN(					\
	    (uintptr_t)(rgnp)->rgn_saddr, HBLK_MIN_BYTES);		\
	_reva = (caddr_t)P2ROUNDUP(					\
	    (uintptr_t)((rgnp)->rgn_saddr + (rgnp)->rgn_size),		\
	    HBLK_MIN_BYTES);						\
	ASSERT(_hsva >= _rsva);						\
	ASSERT(_hsva < _reva);						\
	ASSERT(_heva > _rsva);						\
	ASSERT(_heva <= _reva);						\
	_flagtte = (_ttesz < HBLK_MIN_TTESZ) ? HBLK_MIN_TTESZ :	\
	    _ttesz;							\
	ASSERT(rgnp->rgn_hmeflags & (0x1 << _flagtte));			\
}
#else /* DEBUG */
#define	SFMMU_VALIDATE_HMERID(hat, rid, addr, len)
#define	SFMMU_VALIDATE_SHAREDHBLK(hmeblkp, srdp, rgnp, rid)
#endif /* DEBUG */
#if defined(SF_ERRATA_57)
extern caddr_t errata57_limit;
#endif
#define	HME8BLK_SZ_RND		((roundup(HME8BLK_SZ, sizeof (int64_t))) / \
				(sizeof (int64_t)))

#define	HBLK_RESERVE		((struct hme_blk *)hblk_reserve)

#define	HBLK_RESERVE_CNT	128
#define	HBLK_RESERVE_MIN	20
static struct hme_blk		*freehblkp;
static kmutex_t			freehblkp_lock;
static int			freehblkcnt;

static int64_t			hblk_reserve[HME8BLK_SZ_RND];
static kmutex_t			hblk_reserve_lock;
static kthread_t		*hblk_reserve_thread;

static nucleus_hblk8_info_t	nucleus_hblk8;
static nucleus_hblk1_info_t	nucleus_hblk1;
/*
 * Data to manage per-cpu hmeblk pending queues, hmeblks are queued here
 * after the initial phase of removing an hmeblk from the hash chain, see
 * the detailed comment in sfmmu_hblk_hash_rm() for further details.
 */
static cpu_hme_pend_t		*cpu_hme_pend;
static uint_t			cpu_hme_pend_thresh;
/*
 * SFMMU specific hat functions
 */
void	hat_pagecachectl(struct page *, int);

/* flags for hat_pagecachectl */
#define	HAT_CACHE	0x1
#define	HAT_UNCACHE	0x2
#define	HAT_TMPNC	0x4
/*
 * Flag to allow the creation of non-cacheable translations
 * to system memory. It is off by default. At the moment this
 * flag is used by the ecache error injector. The error injector
 * will turn it on when creating such a translation then shut it
 * off when it's finished.
 */
int	sfmmu_allow_nc_trans = 0;
/*
 * Flag to disable large page support.
 *	value of 1 => disable all large pages.
 *	bits 1, 2, and 3 are to disable 64K, 512K and 4M pages respectively.
 *
 * For example, use the value 0x4 to disable 512K pages.
 */
#define	LARGE_PAGES_OFF		0x1

/*
 * The disable_large_pages and disable_ism_large_pages variables control
 * hat_memload_array and the page sizes to be used by ISM and the kernel.
 *
 * The disable_auto_data_large_pages and disable_auto_text_large_pages
 * variables are only used to control which OOB pages to use at upper VM
 * segment creation time, and are set in hat_init_pagesizes and used in the
 * map_pgsz* routines.  Their values may come from platform or CPU specific
 * code to disable page sizes that should not be used.
 *
 * WARNING: 512K pages are currently not supported for ISM/DISM.
 */
uint_t	disable_large_pages = 0;
uint_t	disable_ism_large_pages = (1 << TTE512K);
uint_t	disable_auto_data_large_pages = 0;
uint_t	disable_auto_text_large_pages = 0;
/*
 * Private sfmmu data structures for hat management
 */
static struct kmem_cache *sfmmuid_cache;
static struct kmem_cache *mmuctxdom_cache;

/*
 * Private sfmmu data structures for tsb management
 */
static struct kmem_cache *sfmmu_tsbinfo_cache;
static struct kmem_cache *sfmmu_tsb8k_cache;
static struct kmem_cache *sfmmu_tsb_cache[NLGRPS_MAX];
static vmem_t *kmem_bigtsb_arena;
static vmem_t *kmem_tsb_arena;
/*
 * sfmmu static variables for hmeblk resource management.
 */
static vmem_t *hat_memload1_arena; /* HAT translation arena for sfmmu1_cache */
static struct kmem_cache *sfmmu8_cache;
static struct kmem_cache *sfmmu1_cache;
static struct kmem_cache *pa_hment_cache;

static kmutex_t		ism_mlist_lock;	/* mutex for ism mapping list */
/*
 * private data for ism
 */
static struct kmem_cache *ism_blk_cache;
static struct kmem_cache *ism_ment_cache;
#define	ISMID_STARTADDR	NULL
/*
 * Region management data structures and function declarations.
 */
static void	sfmmu_leave_srd(sfmmu_t *);
static int	sfmmu_srdcache_constructor(void *, void *, int);
static void	sfmmu_srdcache_destructor(void *, void *);
static int	sfmmu_rgncache_constructor(void *, void *, int);
static void	sfmmu_rgncache_destructor(void *, void *);
static int	sfrgnmap_isnull(sf_region_map_t *);
static int	sfhmergnmap_isnull(sf_hmeregion_map_t *);
static int	sfmmu_scdcache_constructor(void *, void *, int);
static void	sfmmu_scdcache_destructor(void *, void *);
static void	sfmmu_rgn_cb_noop(caddr_t, caddr_t, caddr_t,
    size_t, void *, u_offset_t);
static uint_t srd_hashmask = SFMMU_MAX_SRD_BUCKETS - 1;
static sf_srd_bucket_t *srd_buckets;
static struct kmem_cache *srd_cache;
static uint_t srd_rgn_hashmask = SFMMU_MAX_REGION_BUCKETS - 1;
static struct kmem_cache *region_cache;
static struct kmem_cache *scd_cache;

#ifdef sun4v
int use_bigtsb_arena = 1;
#else
int use_bigtsb_arena = 0;
#endif

/* External /etc/system tunable, for turning on&off the shctx support */
int disable_shctx = 0;
/* Internal variable, set by MD if the HW supports shctx feature */
int shctx_on = 0;
#ifdef DEBUG
static void check_scd_sfmmu_list(sfmmu_t **, sfmmu_t *, int);
#endif
static void sfmmu_to_scd_list(sfmmu_t **, sfmmu_t *);
static void sfmmu_from_scd_list(sfmmu_t **, sfmmu_t *);

static sf_scd_t *sfmmu_alloc_scd(sf_srd_t *, sf_region_map_t *);
static void sfmmu_find_scd(sfmmu_t *);
static void sfmmu_join_scd(sf_scd_t *, sfmmu_t *);
static void sfmmu_finish_join_scd(sfmmu_t *);
static void sfmmu_leave_scd(sfmmu_t *, uchar_t);
static void sfmmu_destroy_scd(sf_srd_t *, sf_scd_t *, sf_region_map_t *);
static int sfmmu_alloc_scd_tsbs(sf_srd_t *, sf_scd_t *);
static void sfmmu_free_scd_tsbs(sfmmu_t *);
static void sfmmu_tsb_inv_ctx(sfmmu_t *);
static int find_ism_rid(sfmmu_t *, sfmmu_t *, caddr_t, uint_t *);
static void sfmmu_ism_hatflags(sfmmu_t *, int);
static int sfmmu_srd_lock_held(sf_srd_t *);
static void sfmmu_remove_scd(sf_scd_t **, sf_scd_t *);
static void sfmmu_add_scd(sf_scd_t **headp, sf_scd_t *);
static void sfmmu_link_scd_to_regions(sf_srd_t *, sf_scd_t *);
static void sfmmu_unlink_scd_from_regions(sf_srd_t *, sf_scd_t *);
static void sfmmu_link_to_hmeregion(sfmmu_t *, sf_region_t *);
static void sfmmu_unlink_from_hmeregion(sfmmu_t *, sf_region_t *);
/*
 * ``hat_lock'' is a hashed mutex lock for protecting sfmmu TSB lists,
 * HAT flags, synchronizing TLB/TSB coherency, and context management.
 * The lock is hashed on the sfmmup since the case where we need to lock
 * all processes is rare but does occur (e.g. we need to unload a shared
 * mapping from all processes using the mapping).  We have a lot of buckets,
 * and each slab of sfmmu_t's can use about a quarter of them, giving us
 * a fairly good distribution without wasting too much space and overhead
 * when we have to grab them all.
 */
#define	SFMMU_NUM_LOCK	128		/* must be power of two */
hatlock_t	hat_lock[SFMMU_NUM_LOCK];
/*
 * Hash algorithm optimized for a small number of slabs.
 *  7 is (highbit((sizeof sfmmu_t)) - 1)
 * This hash algorithm is based upon the knowledge that sfmmu_t's come from a
 * kmem_cache, and thus they will be sequential within that cache.  In
 * addition, each new slab will have a different "color" up to cache_maxcolor
 * which will skew the hashing for each successive slab which is allocated.
 * If the size of sfmmu_t changed to a larger size, this algorithm may need
 * to be changed.
 */
#define	TSB_HASH_SHIFT_BITS (7)
#define	PTR_HASH(x) ((uintptr_t)x >> TSB_HASH_SHIFT_BITS)

#ifdef DEBUG
int tsb_hash_debug = 0;
#define	TSB_HASH(sfmmup)	\
	(tsb_hash_debug ? &hat_lock[0] : \
	&hat_lock[PTR_HASH(sfmmup) & (SFMMU_NUM_LOCK-1)])
#else	/* DEBUG */
#define	TSB_HASH(sfmmup)	&hat_lock[PTR_HASH(sfmmup) & (SFMMU_NUM_LOCK-1)]
#endif	/* DEBUG */
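/*
 * Illustrative sketch (not from the original source): with SFMMU_NUM_LOCK at
 * 128 and TSB_HASH_SHIFT_BITS at 7, the bucket index is simply bits <13:7> of
 * the sfmmu_t address.  Assuming sizeof (sfmmu_t) is on the order of 0x100
 * bytes, neighbors in the same kmem slab land in nearby but distinct buckets:
 *
 *	PTR_HASH(0x30001234000) & (SFMMU_NUM_LOCK - 1) == 0x00
 *	PTR_HASH(0x30001234100) & (SFMMU_NUM_LOCK - 1) == 0x02
 *
 * so each slab of sfmmu_t's spreads over a contiguous run of hat_lock entries.
 */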
/* sfmmu_replace_tsb() return codes. */
typedef enum tsb_replace_rc {
	TSB_SUCCESS,
	TSB_ALLOCFAIL,
	TSB_LOSTRACE,
	TSB_ALREADY_SWAPPED,
	TSB_CANT_SWAPIN
} tsb_replace_rc_t;
/*
 * Flags for TSB allocation routines.
 */
#define	TSB_ALLOC	0x01
#define	TSB_FORCEALLOC	0x02
#define	TSB_GROW	0x04
#define	TSB_SHRINK	0x08
#define	TSB_SWAPIN	0x10

/*
 * Support for HAT callbacks.
 */
#define	SFMMU_MAX_RELOC_CALLBACKS	10
int sfmmu_max_cb_id = SFMMU_MAX_RELOC_CALLBACKS;
static id_t sfmmu_cb_nextid = 0;
static id_t sfmmu_tsb_cb_id;
struct sfmmu_callback *sfmmu_cb_table;
kmutex_t	kpr_suspendlock;
kthread_t	*kreloc_thread;
/*
 * Enable VA->PA translation sanity checking on DEBUG kernels.
 * Disabled by default.  This is incompatible with some
 * drivers (error injector, RSM) so if it breaks you get
 * to keep both pieces.
 */
int hat_check_vtop = 0;
390 * Private sfmmu routines (prototypes)
392 static struct hme_blk
*sfmmu_shadow_hcreate(sfmmu_t
*, caddr_t
, int, uint_t
);
393 static struct hme_blk
*sfmmu_hblk_alloc(sfmmu_t
*, caddr_t
,
394 struct hmehash_bucket
*, uint_t
, hmeblk_tag
, uint_t
,
396 static caddr_t
sfmmu_hblk_unload(struct hat
*, struct hme_blk
*, caddr_t
,
397 caddr_t
, demap_range_t
*, uint_t
);
398 static caddr_t
sfmmu_hblk_sync(struct hat
*, struct hme_blk
*, caddr_t
,
400 static void sfmmu_hblk_free(struct hme_blk
**);
401 static void sfmmu_hblks_list_purge(struct hme_blk
**, int);
402 static uint_t
sfmmu_get_free_hblk(struct hme_blk
**, uint_t
);
403 static uint_t
sfmmu_put_free_hblk(struct hme_blk
*, uint_t
);
404 static struct hme_blk
*sfmmu_hblk_steal(int);
405 static int sfmmu_steal_this_hblk(struct hmehash_bucket
*,
406 struct hme_blk
*, uint64_t, struct hme_blk
*);
407 static caddr_t
sfmmu_hblk_unlock(struct hme_blk
*, caddr_t
, caddr_t
);
409 static void hat_do_memload_array(struct hat
*, caddr_t
, size_t,
410 struct page
**, uint_t
, uint_t
, uint_t
);
411 static void hat_do_memload(struct hat
*, caddr_t
, struct page
*,
412 uint_t
, uint_t
, uint_t
);
413 static void sfmmu_memload_batchsmall(struct hat
*, caddr_t
, page_t
**,
414 uint_t
, uint_t
, pgcnt_t
, uint_t
);
415 void sfmmu_tteload(struct hat
*, tte_t
*, caddr_t
, page_t
*,
417 static int sfmmu_tteload_array(sfmmu_t
*, tte_t
*, caddr_t
, page_t
**,
419 static struct hmehash_bucket
*sfmmu_tteload_acquire_hashbucket(sfmmu_t
*,
420 caddr_t
, int, uint_t
);
421 static struct hme_blk
*sfmmu_tteload_find_hmeblk(sfmmu_t
*,
422 struct hmehash_bucket
*, caddr_t
, uint_t
, uint_t
,
424 static int sfmmu_tteload_addentry(sfmmu_t
*, struct hme_blk
*, tte_t
*,
425 caddr_t
, page_t
**, uint_t
, uint_t
);
426 static void sfmmu_tteload_release_hashbucket(struct hmehash_bucket
*);
428 static int sfmmu_pagearray_setup(caddr_t
, page_t
**, tte_t
*, int);
429 static pfn_t
sfmmu_uvatopfn(caddr_t
, sfmmu_t
*, tte_t
*);
430 void sfmmu_memtte(tte_t
*, pfn_t
, uint_t
, int);
432 static void sfmmu_vac_conflict(struct hat
*, caddr_t
, page_t
*);
433 static int sfmmu_vacconflict_array(caddr_t
, page_t
*, int *);
434 int tst_tnc(page_t
*pp
, pgcnt_t
);
435 void conv_tnc(page_t
*pp
, int);
438 static void sfmmu_get_ctx(sfmmu_t
*);
439 static void sfmmu_free_sfmmu(sfmmu_t
*);
441 static void sfmmu_ttesync(struct hat
*, caddr_t
, tte_t
*, page_t
*);
442 static void sfmmu_chgattr(struct hat
*, caddr_t
, size_t, uint_t
, int);
444 cpuset_t
sfmmu_pageunload(page_t
*, struct sf_hment
*, int);
445 static void hat_pagereload(struct page
*, struct page
*);
446 static cpuset_t
sfmmu_pagesync(page_t
*, struct sf_hment
*, uint_t
);
448 void sfmmu_page_cache_array(page_t
*, int, int, pgcnt_t
);
449 static void sfmmu_page_cache(page_t
*, int, int, int);
452 cpuset_t
sfmmu_rgntlb_demap(caddr_t
, sf_region_t
*,
453 struct hme_blk
*, int);
454 static void sfmmu_tlbcache_demap(caddr_t
, sfmmu_t
*, struct hme_blk
*,
455 pfn_t
, int, int, int, int);
456 static void sfmmu_ismtlbcache_demap(caddr_t
, sfmmu_t
*, struct hme_blk
*,
458 static void sfmmu_tlb_demap(caddr_t
, sfmmu_t
*, struct hme_blk
*, int, int);
459 static void sfmmu_tlb_range_demap(demap_range_t
*);
460 static void sfmmu_invalidate_ctx(sfmmu_t
*);
461 static void sfmmu_sync_mmustate(sfmmu_t
*);
463 static void sfmmu_tsbinfo_setup_phys(struct tsb_info
*, pfn_t
);
464 static int sfmmu_tsbinfo_alloc(struct tsb_info
**, int, int, uint_t
,
466 static void sfmmu_tsb_free(struct tsb_info
*);
467 static void sfmmu_tsbinfo_free(struct tsb_info
*);
468 static int sfmmu_init_tsbinfo(struct tsb_info
*, int, int, uint_t
,
470 static void sfmmu_tsb_chk_reloc(sfmmu_t
*, hatlock_t
*);
471 static void sfmmu_tsb_swapin(sfmmu_t
*, hatlock_t
*);
472 static int sfmmu_select_tsb_szc(pgcnt_t
);
473 static void sfmmu_mod_tsb(sfmmu_t
*, caddr_t
, tte_t
*, int);
474 #define sfmmu_load_tsb(sfmmup, vaddr, tte, szc) \
475 sfmmu_mod_tsb(sfmmup, vaddr, tte, szc)
476 #define sfmmu_unload_tsb(sfmmup, vaddr, szc) \
477 sfmmu_mod_tsb(sfmmup, vaddr, NULL, szc)
478 static void sfmmu_copy_tsb(struct tsb_info
*, struct tsb_info
*);
479 static tsb_replace_rc_t
sfmmu_replace_tsb(sfmmu_t
*, struct tsb_info
*, uint_t
,
480 hatlock_t
*, uint_t
);
481 static void sfmmu_size_tsb(sfmmu_t
*, int, uint64_t, uint64_t, int);
484 void sfmmu_cache_flush(pfn_t
, int);
485 void sfmmu_cache_flushcolor(int, pfn_t
);
487 static caddr_t
sfmmu_hblk_chgattr(sfmmu_t
*, struct hme_blk
*, caddr_t
,
488 caddr_t
, demap_range_t
*, uint_t
, int);
490 static uint64_t sfmmu_vtop_attr(uint_t
, int mode
, tte_t
*);
491 static uint_t
sfmmu_ptov_attr(tte_t
*);
492 static caddr_t
sfmmu_hblk_chgprot(sfmmu_t
*, struct hme_blk
*, caddr_t
,
493 caddr_t
, demap_range_t
*, uint_t
);
494 static uint_t
sfmmu_vtop_prot(uint_t
, uint_t
*);
495 static int sfmmu_idcache_constructor(void *, void *, int);
496 static void sfmmu_idcache_destructor(void *, void *);
497 static int sfmmu_hblkcache_constructor(void *, void *, int);
498 static void sfmmu_hblkcache_destructor(void *, void *);
499 static void sfmmu_hblkcache_reclaim(void *);
500 static void sfmmu_shadow_hcleanup(sfmmu_t
*, struct hme_blk
*,
501 struct hmehash_bucket
*);
502 static void sfmmu_hblk_hash_rm(struct hmehash_bucket
*, struct hme_blk
*,
503 struct hme_blk
*, struct hme_blk
**, int);
504 static void sfmmu_hblk_hash_add(struct hmehash_bucket
*, struct hme_blk
*,
506 static struct hme_blk
*sfmmu_check_pending_hblks(int);
507 static void sfmmu_free_hblks(sfmmu_t
*, caddr_t
, caddr_t
, int);
508 static void sfmmu_cleanup_rhblk(sf_srd_t
*, caddr_t
, uint_t
, int);
509 static void sfmmu_unload_hmeregion_va(sf_srd_t
*, uint_t
, caddr_t
, caddr_t
,
511 static void sfmmu_unload_hmeregion(sf_srd_t
*, sf_region_t
*);
513 static void sfmmu_rm_large_mappings(page_t
*, int);
static void	hat_lock_init(void);
static void	hat_kstat_init(void);
static int	sfmmu_kstat_percpu_update(kstat_t *ksp, int rw);
static void	sfmmu_set_scd_rttecnt(sf_srd_t *, sf_scd_t *);
static int	sfmmu_is_rgnva(sf_srd_t *, caddr_t, ulong_t, ulong_t);
static void	sfmmu_check_page_sizes(sfmmu_t *, int);
int	fnd_mapping_sz(page_t *);
static void	iment_add(struct ism_ment *, struct hat *);
static void	iment_sub(struct ism_ment *, struct hat *);
static pgcnt_t	ism_tsb_entries(sfmmu_t *, int szc);
extern void	sfmmu_setup_tsbinfo(sfmmu_t *);
extern void	sfmmu_clear_utsbinfo(void);

static void	sfmmu_ctx_wrap_around(mmu_ctx_t *, boolean_t);

extern int vpm_enable;

/*
 * Enable trap level tsbmiss handling
 */

/*
 * Flush the TLB on kpm mapout. Note: Xcalls are used (again) for the
 * required TLB shootdowns in this case, so handle w/ care. Off by default.
 */

static void	*sfmmu_vmem_xalloc_aligned_wrapper(vmem_t *, size_t, int);
static void	sfmmu_check_hblk_flist();
/*
 * Semi-private sfmmu data structures.  Some of them are initialized in
 * startup or in hat_init.  Some of them are private but accessed by
 * assembly code or mach_sfmmu.c
 */
struct hmehash_bucket *uhme_hash;	/* user hmeblk hash table */
struct hmehash_bucket *khme_hash;	/* kernel hmeblk hash table */
uint64_t	uhme_hash_pa;		/* PA of uhme_hash */
uint64_t	khme_hash_pa;		/* PA of khme_hash */
int		uhmehash_num;		/* # of buckets in user hash table */
int		khmehash_num;		/* # of buckets in kernel hash table */

uint_t		max_mmu_ctxdoms = 0;	/* max context domains in the system */
mmu_ctx_t	**mmu_ctxs_tbl;		/* global array of context domains */
uint64_t	mmu_saved_gnum = 0;	/* to init incoming MMUs' gnums */

#define	DEFAULT_NUM_CTXS_PER_MMU 8192
static uint_t	nctxs = DEFAULT_NUM_CTXS_PER_MMU;
int		cache;			/* describes system cache */

caddr_t		ktsb_base;		/* kernel 8k-indexed tsb base address */
uint64_t	ktsb_pbase;		/* kernel 8k-indexed tsb phys address */
int		ktsb_szcode;		/* kernel 8k-indexed tsb size code */
int		ktsb_sz;		/* kernel 8k-indexed tsb size */

caddr_t		ktsb4m_base;		/* kernel 4m-indexed tsb base address */
uint64_t	ktsb4m_pbase;		/* kernel 4m-indexed tsb phys address */
int		ktsb4m_szcode;		/* kernel 4m-indexed tsb size code */
int		ktsb4m_sz;		/* kernel 4m-indexed tsb size */

uint64_t	kpm_tsbbase;		/* kernel seg_kpm 4M TSB base address */
int		kpm_tsbsz;		/* kernel seg_kpm 4M TSB size code */
uint64_t	kpmsm_tsbbase;		/* kernel seg_kpm 8K TSB base address */
int		kpmsm_tsbsz;		/* kernel seg_kpm 8K TSB size code */

int		utsb_dtlb_ttenum = -1;	/* index in TLB for utsb locked TTE */
int		utsb4m_dtlb_ttenum = -1; /* index in TLB for 4M TSB TTE */
int		dtlb_resv_ttenum;	/* index in TLB of first reserved TTE */
caddr_t		utsb_vabase;		/* reserved kernel virtual memory */
caddr_t		utsb4m_vabase;		/* for trap handler TSB accesses */

uint64_t	tsb_alloc_bytes = 0;	/* bytes allocated to TSBs */
vmem_t		*kmem_tsb_default_arena[NLGRPS_MAX];	/* For dynamic TSBs */
vmem_t		*kmem_bigtsb_default_arena[NLGRPS_MAX];	/* dynamic 256M TSBs */
/*
 * Size to use for TSB slabs.  Future platforms that support page sizes
 * larger than 4M may wish to change these values, and provide their own
 * assembly macros for building and decoding the TSB base register contents.
 * Note disable_large_pages will override the value set here.
 */
static	uint_t tsb_slab_ttesz = TTE4M;
size_t	tsb_slab_size = MMU_PAGESIZE4M;
uint_t	tsb_slab_shift = MMU_PAGESHIFT4M;
/* PFN mask for TTE */
size_t	tsb_slab_mask = MMU_PAGEOFFSET4M >> MMU_PAGESHIFT;

/*
 * Size to use for TSB slabs.  These are used only when 256M tsb arenas
 * exist.
 */
static uint_t	bigtsb_slab_ttesz = TTE256M;
static size_t	bigtsb_slab_size = MMU_PAGESIZE256M;
static uint_t	bigtsb_slab_shift = MMU_PAGESHIFT256M;
/* 256M page alignment for 8K pfn */
static size_t	bigtsb_slab_mask = MMU_PAGEOFFSET256M >> MMU_PAGESHIFT;
/* largest TSB size to grow to, will be smaller on smaller memory systems */
static int	tsb_max_growsize = 0;

/*
 * Tunable parameters dealing with TSB policies.
 */

/*
 * This undocumented tunable forces all 8K TSBs to be allocated from
 * the kernel heap rather than from the kmem_tsb_default_arena arenas.
 */
int	tsb_forceheap = 0;

/*
 * Decide whether to use per-lgroup arenas, or one global set of
 * TSB arenas.  The default is not to break up per-lgroup, since
 * most platforms don't recognize any tangible benefit from it.
 */
int	tsb_lgrp_affinity = 0;
/*
 * Used for growing the TSB based on the process RSS.
 * tsb_rss_factor is based on the smallest TSB, and is
 * shifted by the TSB size to determine if we need to grow.
 * The default will grow the TSB if the number of TTEs for
 * this page size exceeds 75% of the number of TSB entries,
 * which should _almost_ eliminate all conflict misses
 * (at the expense of using up lots and lots of memory).
 */
#define	TSB_RSS_FACTOR		(TSB_ENTRIES(TSB_MIN_SZCODE) * 0.75)
#define	SFMMU_RSS_TSBSIZE(tsbszc)	(tsb_rss_factor << tsbszc)
#define	SELECT_TSB_SIZECODE(pgcnt) ( \
	(enable_tsb_rss_sizing)? sfmmu_select_tsb_szc(pgcnt) : \
	default_tsb_size)
#define	TSB_OK_SHRINK()	\
	(tsb_alloc_bytes > tsb_alloc_hiwater || freemem < desfree)
#define	TSB_OK_GROW()	\
	(tsb_alloc_bytes < tsb_alloc_hiwater && freemem > desfree)

int	enable_tsb_rss_sizing = 1;
int	tsb_rss_factor	= (int)TSB_RSS_FACTOR;
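/*
 * Worked example (illustrative; assumes the minimum TSB is 8K with 512
 * 16-byte entries): tsb_rss_factor then defaults to 512 * 0.75 == 384, so
 * SFMMU_RSS_TSBSIZE(0) == 384 and SFMMU_RSS_TSBSIZE(1) == 768.  A process
 * whose TTE count for a page size passes 384 becomes a candidate to grow
 * from the smallest TSB to the next size code, at which point the grow
 * threshold doubles, and so on up to tsb_max_growsize.
 */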
/* which TSB size code to use for new address spaces or if rss sizing off */
int default_tsb_size = TSB_8K_SZCODE;

static uint64_t tsb_alloc_hiwater; /* limit TSB reserved memory */
uint64_t tsb_alloc_hiwater_factor; /* tsb_alloc_hiwater = physmem / this */
#define	TSB_ALLOC_HIWATER_FACTOR_DEFAULT	32

static int tsb_random_size = 0;	/* set to 1 to test random tsb sizes on alloc */
static int tsb_grow_stress = 0;	/* if set to 1, keep replacing TSB w/ random */
static int tsb_alloc_mtbf = 0;	/* fail allocation every n attempts */
static int tsb_alloc_fail_mtbf = 0;
static int tsb_alloc_count = 0;
/* if set to 1, will remap valid TTEs when growing TSB. */
int tsb_remap_ttes = 1;

/*
 * If we have more than this many mappings, allocate a second TSB.
 * This default is chosen because the I/D fully associative TLBs are
 * assumed to have at least 8 available entries. Platforms with a
 * larger fully-associative TLB could probably override the default.
 */
#if defined(sun4v)
int tsb_sectsb_threshold = 0;
#else
int tsb_sectsb_threshold = 8;
#endif
struct sfmmu_global_stat sfmmu_global_stat;
struct sfmmu_tsbsize_stat sfmmu_tsbsize_stat;

sfmmu_t		*ksfmmup;		/* kernel's hat id */

#ifdef DEBUG
static void	chk_tte(tte_t *, tte_t *, tte_t *, struct hme_blk *);
#endif
/* sfmmu locking operations */
static kmutex_t *sfmmu_mlspl_enter(struct page *, int);
static int	sfmmu_mlspl_held(struct page *, int);

kmutex_t *sfmmu_page_enter(page_t *);
void	sfmmu_page_exit(kmutex_t *);
int	sfmmu_page_spl_held(struct page *);

/* sfmmu internal locking operations - accessed directly */
static void	sfmmu_mlist_reloc_enter(page_t *, page_t *,
				kmutex_t **, kmutex_t **);
static void	sfmmu_mlist_reloc_exit(kmutex_t *, kmutex_t *);
static hatlock_t *
		sfmmu_hat_enter(sfmmu_t *);
static hatlock_t *
		sfmmu_hat_tryenter(sfmmu_t *);
static void	sfmmu_hat_exit(hatlock_t *);
static void	sfmmu_hat_lock_all(void);
static void	sfmmu_hat_unlock_all(void);
static void	sfmmu_ismhat_enter(sfmmu_t *, int);
static void	sfmmu_ismhat_exit(sfmmu_t *, int);
kpm_hlk_t	*kpmp_table;
uint_t		kpmp_table_sz;	/* must be a power of 2 */

kpm_shlk_t	*kpmp_stable;
uint_t		kpmp_stable_sz;	/* must be a power of 2 */

/*
 * SPL_TABLE_SIZE is 2 * NCPU, but no smaller than 128.
 * SPL_SHIFT is log2(SPL_TABLE_SIZE).
 */
#if ((2*NCPU_P2) > 128)
#define	SPL_SHIFT	((unsigned)(NCPU_LOG2 + 1))
#else
#define	SPL_SHIFT	7U
#endif
#define	SPL_TABLE_SIZE	(1U << SPL_SHIFT)
#define	SPL_MASK	(SPL_TABLE_SIZE - 1)

/*
 * We shift by PP_SHIFT to take care of the low-order 0 bits of a page_t
 * and by multiples of SPL_SHIFT to get as many varied bits as we can.
 */
#define	SPL_INDEX(pp) \
	((((uintptr_t)(pp) >> PP_SHIFT) ^ \
	((uintptr_t)(pp) >> (PP_SHIFT + SPL_SHIFT)) ^ \
	((uintptr_t)(pp) >> (PP_SHIFT + SPL_SHIFT * 2)) ^ \
	((uintptr_t)(pp) >> (PP_SHIFT + SPL_SHIFT * 3))) & \
	SPL_MASK)

#define	SPL_HASH(pp) \
	(&sfmmu_page_lock[SPL_INDEX(pp)].pad_mutex)

static	pad_mutex_t	sfmmu_page_lock[SPL_TABLE_SIZE];
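/*
 * Sizing sketch (illustrative): with NCPU_P2 == 64, 2 * 64 is not greater
 * than 128, so SPL_SHIFT stays 7 and SPL_TABLE_SIZE == 128; with
 * NCPU_P2 == 256 (NCPU_LOG2 == 8), SPL_SHIFT becomes 9 and SPL_TABLE_SIZE
 * == 512.  SPL_INDEX() then xor-folds four SPL_SHIFT-wide windows of the
 * page_t address (after discarding the PP_SHIFT low-order zero bits) so
 * that page_t's which are close together in memory still scatter across
 * the whole sfmmu_page_lock[] table.
 */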
/* Array of mutexes protecting a page's mapping list and p_nrm field. */
#define	MML_TABLE_SIZE	SPL_TABLE_SIZE
#define	MLIST_HASH(pp)	(&mml_table[SPL_INDEX(pp)].pad_mutex)
static	pad_mutex_t	mml_table[MML_TABLE_SIZE];
/*
 * hat_unload_callback() will group together callbacks in order
 * to avoid xt_sync() calls.  This is the maximum size of the group.
 */
#define	MAX_CB_ADDR	32

static ulong_t sfmmu_dmr_maxbit = DMR_MAXBIT;

static char	*mmu_ctx_kstat_names[] = {
	"mmu_ctx_tsb_exceptions",
	"mmu_ctx_tsb_raise_exception",
	"mmu_ctx_wrap_around",
};
/*
 * Wrapper for vmem_xalloc since vmem_create only allows limited
 * parameters for vm_source_alloc functions.  This function allows us
 * to specify alignment consistent with the size of the object being
 * allocated.
 */
static void *
sfmmu_vmem_xalloc_aligned_wrapper(vmem_t *vmp, size_t size, int vmflag)
{
	return (vmem_xalloc(vmp, size, size, 0, 0, NULL, NULL, vmflag));
}
/* Common code for setting tsb_alloc_hiwater. */
#define	SFMMU_SET_TSB_ALLOC_HIWATER(pages)	tsb_alloc_hiwater = \
	ptob(pages) / tsb_alloc_hiwater_factor
/*
 * Set tsb_max_growsize to allow at most all of physical memory to be mapped by
 * a single TSB.  physmem is the number of physical pages so we need physmem 8K
 * TTEs to represent all those physical pages.  We round this up by using
 * 1<<highbit().  To figure out which size code to use, remember that the size
 * code is just an amount to shift the smallest TSB size to get the size of
 * this TSB.  So we subtract that size, TSB_START_SIZE, from highbit() (or
 * highbit() - 1) to get the size code for the smallest TSB that can represent
 * all of physical memory, while erring on the side of too much.
 *
 * Restrict tsb_max_growsize to make sure that:
 *	1) TSBs can't grow larger than the TSB slab size
 *	2) TSBs can't grow larger than UTSB_MAX_SZCODE.
 */
#define	SFMMU_SET_TSB_MAX_GROWSIZE(pages) {				\
	int	_i, _szc, _slabszc, _tsbszc;				\
									\
	_i = highbit(pages);						\
	if ((1 << (_i - 1)) == (pages))					\
		_i--;		/* 2^n case, round down */		\
	_szc = _i - TSB_START_SIZE;					\
	_slabszc = bigtsb_slab_shift - (TSB_START_SIZE + TSB_ENTRY_SHIFT); \
	_tsbszc = MIN(_szc, _slabszc);					\
	tsb_max_growsize = MIN(_tsbszc, UTSB_MAX_SZCODE);		\
}
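/*
 * Worked example (illustrative; assumes TSB_START_SIZE == 9, i.e. the
 * smallest TSB holds 2^9 entries, and TSB_ENTRY_SHIFT == 4 for 16-byte
 * entries): with 4GB of memory, pages == 2^19, highbit() returns 20 and the
 * exact power-of-two test rounds _i down to 19, giving _szc == 10, i.e. an
 * 8MB TSB whose 2^19 entries cover every 8K page of physical memory.  With
 * 256M slabs, bigtsb_slab_shift == 28 and _slabszc == 15, so the slab size
 * does not constrain this case and tsb_max_growsize is
 * MIN(10, UTSB_MAX_SZCODE).
 */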
/*
 * Given a pointer to an sfmmu and a TTE size code, return a pointer to the
 * tsb_info which handles that TTE size.
 */
#define	SFMMU_GET_TSBINFO(tsbinfop, sfmmup, tte_szc) {			\
	(tsbinfop) = (sfmmup)->sfmmu_tsb;				\
	ASSERT(((tsbinfop)->tsb_flags & TSB_SHAREDCTX) ||		\
	    sfmmu_hat_lock_held(sfmmup));				\
	if ((tte_szc) >= TTE4M)	{					\
		ASSERT((tsbinfop) != NULL);				\
		(tsbinfop) = (tsbinfop)->tsb_next;			\
	}								\
}
/*
 * Macro to use to unload entries from the TSB.
 * It has knowledge of which page sizes get replicated in the TSB
 * and will call the appropriate unload routine for the appropriate size.
 */
#define	SFMMU_UNLOAD_TSB(addr, sfmmup, hmeblkp, ismhat)			\
{									\
	int ttesz = get_hblk_ttesz(hmeblkp);				\
									\
	if (ttesz == TTE8K || ttesz == TTE4M) {				\
		sfmmu_unload_tsb(sfmmup, addr, ttesz);			\
	} else {							\
		caddr_t sva = ismhat ? addr :				\
		    (caddr_t)get_hblk_base(hmeblkp);			\
		caddr_t eva = sva + get_hblk_span(hmeblkp);		\
		ASSERT(addr >= sva && addr < eva);			\
		sfmmu_unload_tsb_range(sfmmup, sva, eva, ttesz);	\
	}								\
}
/* Update tsb_alloc_hiwater after memory is configured. */
/*ARGSUSED*/
static void
sfmmu_update_post_add(void *arg, pgcnt_t delta_pages)
{
	/* Assumes physmem has already been updated. */
	SFMMU_SET_TSB_ALLOC_HIWATER(physmem);
	SFMMU_SET_TSB_MAX_GROWSIZE(physmem);
}
/*
 * Update tsb_alloc_hiwater before memory is deleted.  We'll do nothing here
 * and update tsb_alloc_hiwater and tsb_max_growsize after the memory is
 * deleted.
 */
/*ARGSUSED*/
static int
sfmmu_update_pre_del(void *arg, pgcnt_t delta_pages)
{
	return (0);
}
/* Update tsb_alloc_hiwater after memory fails to be unconfigured. */
/*ARGSUSED*/
static void
sfmmu_update_post_del(void *arg, pgcnt_t delta_pages, int cancelled)
{
	/*
	 * Whether the delete was cancelled or not, just go ahead and update
	 * tsb_alloc_hiwater and tsb_max_growsize.
	 */
	SFMMU_SET_TSB_ALLOC_HIWATER(physmem);
	SFMMU_SET_TSB_MAX_GROWSIZE(physmem);
}
static kphysm_setup_vector_t sfmmu_update_vec = {
	KPHYSM_SETUP_VECTOR_VERSION,	/* version */
	sfmmu_update_post_add,		/* post_add */
	sfmmu_update_pre_del,		/* pre_del */
	sfmmu_update_post_del,		/* post_del */
};
/*
 * HME_BLK HASH PRIMITIVES
 */

/*
 * Enter a hme on the mapping list for page pp.
 * When large pages are more prevalent in the system we might want to
 * keep the mapping list in ascending order by the hment size. For now,
 * small pages are more frequent, so don't slow it down.
 */
#define	HME_ADD(hme, pp)					\
{								\
	ASSERT(sfmmu_mlist_held(pp));				\
								\
	hme->hme_prev = NULL;					\
	hme->hme_next = pp->p_mapping;				\
	hme->hme_page = pp;					\
	if (pp->p_mapping) {					\
		((struct sf_hment *)(pp->p_mapping))->hme_prev = hme;\
		ASSERT(pp->p_share > 0);			\
	} else	{						\
		/* EMPTY */					\
		ASSERT(pp->p_share == 0);			\
	}							\
	pp->p_mapping = hme;					\
	pp->p_share++;						\
}
/*
 * Remove a hme from the mapping list for page pp.
 * If we are unmapping a large translation, we need to make sure that the
 * change is reflected in the corresponding bit of the p_index field.
 */
#define	HME_SUB(hme, pp)					\
{								\
	ASSERT(sfmmu_mlist_held(pp));				\
	ASSERT(hme->hme_page == pp || IS_PAHME(hme));		\
								\
	if (pp->p_mapping == NULL) {				\
		panic("hme_remove - no mappings");		\
	}							\
								\
	membar_stst();	/* ensure previous stores finish */	\
								\
	ASSERT(pp->p_share > 0);				\
	pp->p_share--;						\
								\
	if (hme->hme_prev) {					\
		ASSERT(pp->p_mapping != hme);			\
		ASSERT(hme->hme_prev->hme_page == pp ||		\
			IS_PAHME(hme->hme_prev));		\
		hme->hme_prev->hme_next = hme->hme_next;	\
	} else {						\
		ASSERT(pp->p_mapping == hme);			\
		pp->p_mapping = hme->hme_next;			\
		ASSERT((pp->p_mapping == NULL) ?		\
			(pp->p_share == 0) : 1);		\
	}							\
								\
	if (hme->hme_next) {					\
		ASSERT(hme->hme_next->hme_page == pp ||		\
			IS_PAHME(hme->hme_next));		\
		hme->hme_next->hme_prev = hme->hme_prev;	\
	}							\
								\
	/* zero out the entry */				\
	hme->hme_next = NULL;					\
	hme->hme_prev = NULL;					\
	hme->hme_page = NULL;					\
								\
	if (hme_size(hme) > TTE8K) {				\
		/* remove mappings for remainder of large pg */	\
		sfmmu_rm_large_mappings(pp, hme_size(hme));	\
	}							\
}
/*
 * This function returns the hment given the hme_blk and a vaddr.
 * It assumes addr has already been checked to belong to hme_blk's
 * range.
 */
#define	HBLKTOHME(hment, hmeblkp, addr)					\
{									\
	int index;							\
	HBLKTOHME_IDX(hment, hmeblkp, addr, index)			\
}

/*
 * Version of HBLKTOHME that also returns the index in hmeblkp
 * of the hment.
 */
#define	HBLKTOHME_IDX(hment, hmeblkp, addr, idx)			\
{									\
	ASSERT(in_hblk_range((hmeblkp), (addr)));			\
									\
	if (get_hblk_ttesz(hmeblkp) == TTE8K) {				\
		idx = (((uintptr_t)(addr) >> MMU_PAGESHIFT) & (NHMENTS-1)); \
	} else								\
		idx = 0;						\
									\
	(hment) = &(hmeblkp)->hblk_hme[idx];				\
}
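/*
 * Illustrative sketch: for an 8K hme_blk the NHMENTS hments cover NHMENTS
 * consecutive 8K pages, so with NHMENTS == 8 an address 0x5000 past an
 * aligned block base gives idx == ((0x5000 >> 13) & 7) == 2.  A larger
 * page size hme_blk carries a single translation, which is why the non-8K
 * case always uses hblk_hme[0].
 */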
/*
 * Disable any page sizes not supported by the CPU
 */
void
hat_init_pagesizes()
{
	int		i;

	mmu_exported_page_sizes = 0;
	for (i = TTE8K; i < max_mmu_page_sizes; i++) {

		szc_2_userszc[i] = (uint_t)-1;
		userszc_2_szc[i] = (uint_t)-1;

		if ((mmu_exported_pagesize_mask & (1 << i)) == 0) {
			disable_large_pages |= (1 << i);
		} else {
			szc_2_userszc[i] = mmu_exported_page_sizes;
			userszc_2_szc[mmu_exported_page_sizes] = i;
			mmu_exported_page_sizes++;
		}
	}

	disable_ism_large_pages |= disable_large_pages;
	disable_auto_data_large_pages = disable_large_pages;
	disable_auto_text_large_pages = disable_large_pages;

	/*
	 * Initialize mmu-specific large page sizes.
	 */
	if (&mmu_large_pages_disabled) {
		disable_large_pages |= mmu_large_pages_disabled(HAT_LOAD);
		disable_ism_large_pages |=
		    mmu_large_pages_disabled(HAT_LOAD_SHARE);
		disable_auto_data_large_pages |=
		    mmu_large_pages_disabled(HAT_AUTO_DATA);
		disable_auto_text_large_pages |=
		    mmu_large_pages_disabled(HAT_AUTO_TEXT);
	}
}
1054 * Initialize the hardware address translation structures.
1067 * Hardware-only bits in a TTE
1069 MAKE_TTE_MASK(&hw_tte
);
1071 hat_init_pagesizes();
1073 /* Initialize the hash locks */
1074 for (i
= 0; i
< khmehash_num
; i
++) {
1075 mutex_init(&khme_hash
[i
].hmehash_mutex
, NULL
,
1076 MUTEX_DEFAULT
, NULL
);
1077 khme_hash
[i
].hmeh_nextpa
= HMEBLK_ENDPA
;
1079 for (i
= 0; i
< uhmehash_num
; i
++) {
1080 mutex_init(&uhme_hash
[i
].hmehash_mutex
, NULL
,
1081 MUTEX_DEFAULT
, NULL
);
1082 uhme_hash
[i
].hmeh_nextpa
= HMEBLK_ENDPA
;
1084 khmehash_num
--; /* make sure counter starts from 0 */
1085 uhmehash_num
--; /* make sure counter starts from 0 */
1088 * Allocate context domain structures.
1090 * A platform may choose to modify max_mmu_ctxdoms in
1091 * set_platform_defaults(). If a platform does not define
1092 * a set_platform_defaults() or does not choose to modify
1093 * max_mmu_ctxdoms, it gets one MMU context domain for every CPU.
1095 * For all platforms that have CPUs sharing MMUs, this
1096 * value must be defined.
1098 if (max_mmu_ctxdoms
== 0)
1099 max_mmu_ctxdoms
= max_ncpus
;
1101 size
= max_mmu_ctxdoms
* sizeof (mmu_ctx_t
*);
1102 mmu_ctxs_tbl
= kmem_zalloc(size
, KM_SLEEP
);
1104 /* mmu_ctx_t is 64 bytes aligned */
1105 mmuctxdom_cache
= kmem_cache_create("mmuctxdom_cache",
1106 sizeof (mmu_ctx_t
), 64, NULL
, NULL
, NULL
, NULL
, NULL
, 0);
1108 * MMU context domain initialization for the Boot CPU.
1109 * This needs the context domains array allocated above.
1111 mutex_enter(&cpu_lock
);
1112 sfmmu_cpu_init(CPU
);
1113 mutex_exit(&cpu_lock
);
 * Initialize ism mapping list lock.
1119 mutex_init(&ism_mlist_lock
, NULL
, MUTEX_DEFAULT
, NULL
);
1122 * Each sfmmu structure carries an array of MMU context info
1123 * structures, one per context domain. The size of this array depends
1124 * on the maximum number of context domains. So, the size of the
1125 * sfmmu structure varies per platform.
1127 * sfmmu is allocated from static arena, because trap
1128 * handler at TL > 0 is not allowed to touch kernel relocatable
1129 * memory. sfmmu's alignment is changed to 64 bytes from
1130 * default 8 bytes, as the lower 6 bits will be used to pass
1131 * pgcnt to vtag_flush_pgcnt_tl1.
1133 size
= sizeof (sfmmu_t
) + sizeof (sfmmu_ctx_t
) * (max_mmu_ctxdoms
- 1);
1135 sfmmuid_cache
= kmem_cache_create("sfmmuid_cache", size
,
1136 64, sfmmu_idcache_constructor
, sfmmu_idcache_destructor
,
1137 NULL
, NULL
, static_arena
, 0);
1139 sfmmu_tsbinfo_cache
= kmem_cache_create("sfmmu_tsbinfo_cache",
1140 sizeof (struct tsb_info
), 0, NULL
, NULL
, NULL
, NULL
, NULL
, 0);
1143 * Since we only use the tsb8k cache to "borrow" pages for TSBs
1144 * from the heap when low on memory or when TSB_FORCEALLOC is
1145 * specified, don't use magazines to cache them--we want to return
1146 * them to the system as quickly as possible.
1148 sfmmu_tsb8k_cache
= kmem_cache_create("sfmmu_tsb8k_cache",
1149 MMU_PAGESIZE
, MMU_PAGESIZE
, NULL
, NULL
, NULL
, NULL
,
1150 static_arena
, KMC_NOMAGAZINE
);
1153 * Set tsb_alloc_hiwater to 1/tsb_alloc_hiwater_factor of physical
1154 * memory, which corresponds to the old static reserve for TSBs.
1155 * tsb_alloc_hiwater_factor defaults to 32. This caps the amount of
1156 * memory we'll allocate for TSB slabs; beyond this point TSB
1157 * allocations will be taken from the kernel heap (via
1158 * sfmmu_tsb8k_cache) and will be throttled as would any other kmem
1161 if (tsb_alloc_hiwater_factor
== 0) {
1162 tsb_alloc_hiwater_factor
= TSB_ALLOC_HIWATER_FACTOR_DEFAULT
;
1164 SFMMU_SET_TSB_ALLOC_HIWATER(physmem
);
1166 for (sz
= tsb_slab_ttesz
; sz
> 0; sz
--) {
1167 if (!(disable_large_pages
& (1 << sz
)))
1171 if (sz
< tsb_slab_ttesz
) {
1172 tsb_slab_ttesz
= sz
;
1173 tsb_slab_shift
= MMU_PAGESHIFT
+ (sz
<< 1) + sz
;
1174 tsb_slab_size
= 1 << tsb_slab_shift
;
1175 tsb_slab_mask
= (1 << (tsb_slab_shift
- MMU_PAGESHIFT
)) - 1;
1176 use_bigtsb_arena
= 0;
1177 } else if (use_bigtsb_arena
&&
1178 (disable_large_pages
& (1 << bigtsb_slab_ttesz
))) {
1179 use_bigtsb_arena
= 0;
1182 if (!use_bigtsb_arena
) {
1183 bigtsb_slab_shift
= tsb_slab_shift
;
1185 SFMMU_SET_TSB_MAX_GROWSIZE(physmem
);
1188 * On smaller memory systems, allocate TSB memory in smaller chunks
1189 * than the default 4M slab size. We also honor disable_large_pages
1192 * The trap handlers need to be patched with the final slab shift,
1193 * since they need to be able to construct the TSB pointer at runtime.
1195 if ((tsb_max_growsize
<= TSB_512K_SZCODE
) &&
1196 !(disable_large_pages
& (1 << TTE512K
))) {
1197 tsb_slab_ttesz
= TTE512K
;
1198 tsb_slab_shift
= MMU_PAGESHIFT512K
;
1199 tsb_slab_size
= MMU_PAGESIZE512K
;
1200 tsb_slab_mask
= MMU_PAGEOFFSET512K
>> MMU_PAGESHIFT
;
1201 use_bigtsb_arena
= 0;
1204 if (!use_bigtsb_arena
) {
1205 bigtsb_slab_ttesz
= tsb_slab_ttesz
;
1206 bigtsb_slab_shift
= tsb_slab_shift
;
1207 bigtsb_slab_size
= tsb_slab_size
;
1208 bigtsb_slab_mask
= tsb_slab_mask
;
1213 * Set up memory callback to update tsb_alloc_hiwater and
1216 i
= kphysm_setup_func_register(&sfmmu_update_vec
, (void *) 0);
1220 * kmem_tsb_arena is the source from which large TSB slabs are
1221 * drawn. The quantum of this arena corresponds to the largest
1222 * TSB size we can dynamically allocate for user processes.
1223 * Currently it must also be a supported page size since we
1224 * use exactly one translation entry to map each slab page.
1226 * The per-lgroup kmem_tsb_default_arena arenas are the arenas from
1227 * which most TSBs are allocated. Since most TSB allocations are
1228 * typically 8K we have a kmem cache we stack on top of each
1229 * kmem_tsb_default_arena to speed up those allocations.
1231 * Note the two-level scheme of arenas is required only
1232 * because vmem_create doesn't allow us to specify alignment
1233 * requirements. If this ever changes the code could be
1234 * simplified to use only one level of arenas.
1236 * If 256M page support exists on sun4v, 256MB kmem_bigtsb_arena
1237 * will be provided in addition to the 4M kmem_tsb_arena.
1239 if (use_bigtsb_arena
) {
1240 kmem_bigtsb_arena
= vmem_create("kmem_bigtsb", NULL
, 0,
1241 bigtsb_slab_size
, sfmmu_vmem_xalloc_aligned_wrapper
,
1242 vmem_xfree
, heap_arena
, 0, VM_SLEEP
);
1245 kmem_tsb_arena
= vmem_create("kmem_tsb", NULL
, 0, tsb_slab_size
,
1246 sfmmu_vmem_xalloc_aligned_wrapper
,
1247 vmem_xfree
, heap_arena
, 0, VM_SLEEP
);
1249 if (tsb_lgrp_affinity
) {
1251 for (i
= 0; i
< NLGRPS_MAX
; i
++) {
1252 if (use_bigtsb_arena
) {
1253 (void) sprintf(s
, "kmem_bigtsb_lgrp%d", i
);
1254 kmem_bigtsb_default_arena
[i
] = vmem_create(s
,
1255 NULL
, 0, 2 * tsb_slab_size
,
1256 sfmmu_tsb_segkmem_alloc
,
1257 sfmmu_tsb_segkmem_free
, kmem_bigtsb_arena
,
1258 0, VM_SLEEP
| VM_BESTFIT
);
1261 (void) sprintf(s
, "kmem_tsb_lgrp%d", i
);
1262 kmem_tsb_default_arena
[i
] = vmem_create(s
,
1263 NULL
, 0, PAGESIZE
, sfmmu_tsb_segkmem_alloc
,
1264 sfmmu_tsb_segkmem_free
, kmem_tsb_arena
, 0,
1265 VM_SLEEP
| VM_BESTFIT
);
1267 (void) sprintf(s
, "sfmmu_tsb_lgrp%d_cache", i
);
1268 sfmmu_tsb_cache
[i
] = kmem_cache_create(s
,
1269 PAGESIZE
, PAGESIZE
, NULL
, NULL
, NULL
, NULL
,
1270 kmem_tsb_default_arena
[i
], 0);
1273 if (use_bigtsb_arena
) {
1274 kmem_bigtsb_default_arena
[0] =
1275 vmem_create("kmem_bigtsb_default", NULL
, 0,
1276 2 * tsb_slab_size
, sfmmu_tsb_segkmem_alloc
,
1277 sfmmu_tsb_segkmem_free
, kmem_bigtsb_arena
, 0,
1278 VM_SLEEP
| VM_BESTFIT
);
1281 kmem_tsb_default_arena
[0] = vmem_create("kmem_tsb_default",
1282 NULL
, 0, PAGESIZE
, sfmmu_tsb_segkmem_alloc
,
1283 sfmmu_tsb_segkmem_free
, kmem_tsb_arena
, 0,
1284 VM_SLEEP
| VM_BESTFIT
);
1285 sfmmu_tsb_cache
[0] = kmem_cache_create("sfmmu_tsb_cache",
1286 PAGESIZE
, PAGESIZE
, NULL
, NULL
, NULL
, NULL
,
1287 kmem_tsb_default_arena
[0], 0);
1290 sfmmu8_cache
= kmem_cache_create("sfmmu8_cache", HME8BLK_SZ
,
1291 HMEBLK_ALIGN
, sfmmu_hblkcache_constructor
,
1292 sfmmu_hblkcache_destructor
,
1293 sfmmu_hblkcache_reclaim
, (void *)HME8BLK_SZ
,
1294 hat_memload_arena
, KMC_NOHASH
);
1296 hat_memload1_arena
= vmem_create("hat_memload1", NULL
, 0, PAGESIZE
,
1297 segkmem_alloc_permanent
, segkmem_free
, heap_arena
, 0,
1298 VMC_DUMPSAFE
| VM_SLEEP
);
1300 sfmmu1_cache
= kmem_cache_create("sfmmu1_cache", HME1BLK_SZ
,
1301 HMEBLK_ALIGN
, sfmmu_hblkcache_constructor
,
1302 sfmmu_hblkcache_destructor
,
1303 NULL
, (void *)HME1BLK_SZ
,
1304 hat_memload1_arena
, KMC_NOHASH
);
1306 pa_hment_cache
= kmem_cache_create("pa_hment_cache", PAHME_SZ
,
1307 0, NULL
, NULL
, NULL
, NULL
, static_arena
, KMC_NOHASH
);
1309 ism_blk_cache
= kmem_cache_create("ism_blk_cache",
1310 sizeof (ism_blk_t
), ecache_alignsize
, NULL
, NULL
,
1311 NULL
, NULL
, static_arena
, KMC_NOHASH
);
1313 ism_ment_cache
= kmem_cache_create("ism_ment_cache",
1314 sizeof (ism_ment_t
), 0, NULL
, NULL
,
1315 NULL
, NULL
, NULL
, 0);
1318 * We grab the first hat for the kernel,
1320 AS_LOCK_ENTER(&kas
, RW_WRITER
);
1321 kas
.a_hat
= hat_alloc(&kas
);
1325 * Initialize hblk_reserve.
1327 ((struct hme_blk
*)hblk_reserve
)->hblk_nextpa
=
1328 va_to_pa((caddr_t
)hblk_reserve
);
1332 * Reserve some kernel virtual address space for the locked TTEs
1333 * that allow us to probe the TSB from TL>0.
1335 utsb_vabase
= vmem_xalloc(heap_arena
, tsb_slab_size
, tsb_slab_size
,
1336 0, 0, NULL
, NULL
, VM_SLEEP
);
1337 utsb4m_vabase
= vmem_xalloc(heap_arena
, tsb_slab_size
, tsb_slab_size
,
1338 0, 0, NULL
, NULL
, VM_SLEEP
);
1343 * The big page VAC handling code assumes VAC
1344 * will not be bigger than the smallest big
1345 * page- which is 64K.
1347 if (TTEPAGES(TTE64K
) < CACHE_NUM_COLOR
) {
1348 cmn_err(CE_PANIC
, "VAC too big!");
1352 uhme_hash_pa
= va_to_pa(uhme_hash
);
1353 khme_hash_pa
= va_to_pa(khme_hash
);
1356 * Initialize relocation locks. kpr_suspendlock is held
1357 * at PIL_MAX to prevent interrupts from pinning the holder
1358 * of a suspended TTE which may access it leading to a
1359 * deadlock condition.
1361 mutex_init(&kpr_mutex
, NULL
, MUTEX_DEFAULT
, NULL
);
1362 mutex_init(&kpr_suspendlock
, NULL
, MUTEX_SPIN
, (void *)PIL_MAX
);
1365 * If Shared context support is disabled via /etc/system
1366 * set shctx_on to 0 here if it was set to 1 earlier in boot
1367 * sequence by cpu module initialization code.
1369 if (shctx_on
&& disable_shctx
) {
1374 srd_buckets
= kmem_zalloc(SFMMU_MAX_SRD_BUCKETS
*
1375 sizeof (srd_buckets
[0]), KM_SLEEP
);
1376 for (i
= 0; i
< SFMMU_MAX_SRD_BUCKETS
; i
++) {
1377 mutex_init(&srd_buckets
[i
].srdb_lock
, NULL
,
1378 MUTEX_DEFAULT
, NULL
);
1381 srd_cache
= kmem_cache_create("srd_cache", sizeof (sf_srd_t
),
1382 0, sfmmu_srdcache_constructor
, sfmmu_srdcache_destructor
,
1383 NULL
, NULL
, NULL
, 0);
1384 region_cache
= kmem_cache_create("region_cache",
1385 sizeof (sf_region_t
), 0, sfmmu_rgncache_constructor
,
1386 sfmmu_rgncache_destructor
, NULL
, NULL
, NULL
, 0);
1387 scd_cache
= kmem_cache_create("scd_cache", sizeof (sf_scd_t
),
1388 0, sfmmu_scdcache_constructor
, sfmmu_scdcache_destructor
,
1389 NULL
, NULL
, NULL
, 0);
1393 * Pre-allocate hrm_hashtab before enabling the collection of
1394 * refmod statistics. Allocating on the fly would mean us
1395 * running the risk of suffering recursive mutex enters or
1398 hrm_hashtab
= kmem_zalloc(HRM_HASHSIZE
* sizeof (struct hrmstat
*),
1401 /* Allocate per-cpu pending freelist of hmeblks */
1402 cpu_hme_pend
= kmem_zalloc((NCPU
* sizeof (cpu_hme_pend_t
)) + 64,
1404 cpu_hme_pend
= (cpu_hme_pend_t
*)P2ROUNDUP(
1405 (uintptr_t)cpu_hme_pend
, 64);
1407 for (i
= 0; i
< NCPU
; i
++) {
1408 mutex_init(&cpu_hme_pend
[i
].chp_mutex
, NULL
, MUTEX_DEFAULT
,
1412 if (cpu_hme_pend_thresh
== 0) {
1413 cpu_hme_pend_thresh
= CPU_HME_PEND_THRESH
;
1418 * Initialize locking for the hat layer, called early during boot.
1426 * initialize the array of mutexes protecting a page's mapping
1427 * list and p_nrm field.
1429 for (i
= 0; i
< MML_TABLE_SIZE
; i
++)
1430 mutex_init(&mml_table
[i
].pad_mutex
, NULL
, MUTEX_DEFAULT
, NULL
);
1433 for (i
= 0; i
< kpmp_table_sz
; i
++) {
1434 mutex_init(&kpmp_table
[i
].khl_mutex
, NULL
,
1435 MUTEX_DEFAULT
, NULL
);
1440 * Initialize array of mutex locks that protects sfmmu fields and
1443 for (i
= 0; i
< SFMMU_NUM_LOCK
; i
++)
1444 mutex_init(HATLOCK_MUTEXP(&hat_lock
[i
]), NULL
, MUTEX_DEFAULT
,
1448 #define SFMMU_KERNEL_MAXVA \
1449 (kmem64_base ? (uintptr_t)kmem64_end : (SYSLIMIT))
1452 * Allocate a hat structure.
1453 * Called when an address space first uses a hat.
1456 hat_alloc(struct as
*as
)
1461 extern uint_t
get_color_start(struct as
*);
1463 ASSERT(AS_WRITE_HELD(as
));
1464 sfmmup
= kmem_cache_alloc(sfmmuid_cache
, KM_SLEEP
);
1465 sfmmup
->sfmmu_as
= as
;
1466 sfmmup
->sfmmu_flags
= 0;
1467 sfmmup
->sfmmu_tteflags
= 0;
1468 sfmmup
->sfmmu_rtteflags
= 0;
1469 LOCK_INIT_CLEAR(&sfmmup
->sfmmu_ctx_lock
);
1473 sfmmup
->sfmmu_cext
= 0;
1476 sfmmup
->sfmmu_clrstart
= 0;
1477 sfmmup
->sfmmu_tsb
= NULL
;
1479 * hat_kern_setup() will call sfmmu_init_ktsbinfo()
1480 * to setup tsb_info for ksfmmup.
1485 * Just set to invalid ctx. When it faults, it will
1486 * get a valid ctx. This would avoid the situation
1487 * where we get a ctx, but it gets stolen and then
1488 * we fault when we try to run and so have to get
1491 sfmmup
->sfmmu_cext
= 0;
1492 cnum
= INVALID_CONTEXT
;
1494 /* initialize original physical page coloring bin */
1495 sfmmup
->sfmmu_clrstart
= get_color_start(as
);
1497 if (tsb_random_size
) {
1498 uint32_t randval
= (uint32_t)gettick() >> 4;
1499 int size
= randval
% (tsb_max_growsize
+ 1);
1501 /* chose a random tsb size for stress testing */
1502 (void) sfmmu_tsbinfo_alloc(&sfmmup
->sfmmu_tsb
, size
,
1503 TSB8K
|TSB64K
|TSB512K
, 0, sfmmup
);
1506 (void) sfmmu_tsbinfo_alloc(&sfmmup
->sfmmu_tsb
,
1508 TSB8K
|TSB64K
|TSB512K
, 0, sfmmup
);
1509 sfmmup
->sfmmu_flags
= HAT_SWAPPED
| HAT_ALLCTX_INVALID
;
1510 ASSERT(sfmmup
->sfmmu_tsb
!= NULL
);
1513 ASSERT(max_mmu_ctxdoms
> 0);
1514 for (i
= 0; i
< max_mmu_ctxdoms
; i
++) {
1515 sfmmup
->sfmmu_ctxs
[i
].cnum
= cnum
;
1516 sfmmup
->sfmmu_ctxs
[i
].gnum
= 0;
1519 for (i
= 0; i
< max_mmu_page_sizes
; i
++) {
1520 sfmmup
->sfmmu_ttecnt
[i
] = 0;
1521 sfmmup
->sfmmu_scdrttecnt
[i
] = 0;
1522 sfmmup
->sfmmu_ismttecnt
[i
] = 0;
1523 sfmmup
->sfmmu_scdismttecnt
[i
] = 0;
1524 sfmmup
->sfmmu_pgsz
[i
] = TTE8K
;
1526 sfmmup
->sfmmu_tsb0_4minflcnt
= 0;
1527 sfmmup
->sfmmu_iblk
= NULL
;
1528 sfmmup
->sfmmu_ismhat
= 0;
1529 sfmmup
->sfmmu_scdhat
= 0;
1530 sfmmup
->sfmmu_ismblkpa
= (uint64_t)-1;
1531 if (sfmmup
== ksfmmup
) {
1532 CPUSET_ALL(sfmmup
->sfmmu_cpusran
);
1534 CPUSET_ZERO(sfmmup
->sfmmu_cpusran
);
1536 sfmmup
->sfmmu_free
= 0;
1537 sfmmup
->sfmmu_rmstat
= 0;
1538 sfmmup
->sfmmu_clrbin
= sfmmup
->sfmmu_clrstart
;
1539 cv_init(&sfmmup
->sfmmu_tsb_cv
, NULL
, CV_DEFAULT
, NULL
);
1540 sfmmup
->sfmmu_srdp
= NULL
;
1541 SF_RGNMAP_ZERO(sfmmup
->sfmmu_region_map
);
1542 bzero(sfmmup
->sfmmu_hmeregion_links
, SFMMU_L1_HMERLINKS_SIZE
);
1543 sfmmup
->sfmmu_scdp
= NULL
;
1544 sfmmup
->sfmmu_scd_link
.next
= NULL
;
1545 sfmmup
->sfmmu_scd_link
.prev
= NULL
;
1550 * Create per-MMU context domain kstats for a given MMU ctx.
1553 sfmmu_mmu_kstat_create(mmu_ctx_t
*mmu_ctxp
)
1555 mmu_ctx_stat_t stat
;
1558 ASSERT(MUTEX_HELD(&cpu_lock
));
1559 ASSERT(mmu_ctxp
->mmu_kstat
== NULL
);
1561 mmu_kstat
= kstat_create("unix", mmu_ctxp
->mmu_idx
, "mmu_ctx",
1562 "hat", KSTAT_TYPE_NAMED
, MMU_CTX_NUM_STATS
, KSTAT_FLAG_VIRTUAL
);
1564 if (mmu_kstat
== NULL
) {
1565 cmn_err(CE_WARN
, "kstat_create for MMU %d failed",
1568 mmu_kstat
->ks_data
= mmu_ctxp
->mmu_kstat_data
;
1569 for (stat
= 0; stat
< MMU_CTX_NUM_STATS
; stat
++)
1570 kstat_named_init(&mmu_ctxp
->mmu_kstat_data
[stat
],
1571 mmu_ctx_kstat_names
[stat
], KSTAT_DATA_INT64
);
1572 mmu_ctxp
->mmu_kstat
= mmu_kstat
;
1573 kstat_install(mmu_kstat
);
1578 * plat_cpuid_to_mmu_ctx_info() is a platform interface that returns MMU
1579 * context domain information for a given CPU. If a platform does not
1580 * specify that interface, then the function below is used instead to return
1581 * default information. The defaults are as follows:
1583 * - The number of MMU context IDs supported on any CPU in the
1585 * - There is one MMU context domain per CPU.
1589 sfmmu_cpuid_to_mmu_ctx_info(processorid_t cpuid
, mmu_ctx_info_t
*infop
)
1591 infop
->mmu_nctxs
= nctxs
;
1592 infop
->mmu_idx
= cpu
[cpuid
]->cpu_seqid
;
1596 * Called during CPU initialization to set the MMU context-related information
1599 * cpu_lock serializes accesses to mmu_ctxs and mmu_saved_gnum.
1602 sfmmu_cpu_init(cpu_t
*cp
)
1604 mmu_ctx_info_t info
;
1605 mmu_ctx_t
*mmu_ctxp
;
1607 ASSERT(MUTEX_HELD(&cpu_lock
));
1609 if (&plat_cpuid_to_mmu_ctx_info
== NULL
)
1610 sfmmu_cpuid_to_mmu_ctx_info(cp
->cpu_id
, &info
);
1612 plat_cpuid_to_mmu_ctx_info(cp
->cpu_id
, &info
);
1614 ASSERT(info
.mmu_idx
< max_mmu_ctxdoms
);
1616 if ((mmu_ctxp
= mmu_ctxs_tbl
[info
.mmu_idx
]) == NULL
) {
1617 /* Each mmu_ctx is cacheline aligned. */
1618 mmu_ctxp
= kmem_cache_alloc(mmuctxdom_cache
, KM_SLEEP
);
1619 bzero(mmu_ctxp
, sizeof (mmu_ctx_t
));
1621 mutex_init(&mmu_ctxp
->mmu_lock
, NULL
, MUTEX_SPIN
,
1622 (void *)ipltospl(DISP_LEVEL
));
1623 mmu_ctxp
->mmu_idx
= info
.mmu_idx
;
1624 mmu_ctxp
->mmu_nctxs
= info
.mmu_nctxs
;
1626 * Globally for lifetime of a system,
1627 * gnum must always increase.
1628 * mmu_saved_gnum is protected by the cpu_lock.
1630 mmu_ctxp
->mmu_gnum
= mmu_saved_gnum
+ 1;
1631 mmu_ctxp
->mmu_cnum
= NUM_LOCKED_CTXS
;
1633 sfmmu_mmu_kstat_create(mmu_ctxp
);
1635 mmu_ctxs_tbl
[info
.mmu_idx
] = mmu_ctxp
;
1637 ASSERT(mmu_ctxp
->mmu_idx
== info
.mmu_idx
);
1638 ASSERT(mmu_ctxp
->mmu_nctxs
<= info
.mmu_nctxs
);
1642 * The mmu_lock is acquired here to prevent races with
1643 * the wrap-around code.
1645 mutex_enter(&mmu_ctxp
->mmu_lock
);
1648 mmu_ctxp
->mmu_ncpus
++;
1649 CPUSET_ADD(mmu_ctxp
->mmu_cpuset
, cp
->cpu_id
);
1650 CPU_MMU_IDX(cp
) = info
.mmu_idx
;
1651 CPU_MMU_CTXP(cp
) = mmu_ctxp
;
1653 mutex_exit(&mmu_ctxp
->mmu_lock
);
1657 sfmmu_ctxdom_free(mmu_ctx_t
*mmu_ctxp
)
1659 ASSERT(MUTEX_HELD(&cpu_lock
));
1660 ASSERT(!MUTEX_HELD(&mmu_ctxp
->mmu_lock
));
1662 mutex_destroy(&mmu_ctxp
->mmu_lock
);
1664 if (mmu_ctxp
->mmu_kstat
)
1665 kstat_delete(mmu_ctxp
->mmu_kstat
);
1667 /* mmu_saved_gnum is protected by the cpu_lock. */
1668 if (mmu_saved_gnum
< mmu_ctxp
->mmu_gnum
)
1669 mmu_saved_gnum
= mmu_ctxp
->mmu_gnum
;
1671 kmem_cache_free(mmuctxdom_cache
, mmu_ctxp
);
1675 * Called to perform MMU context-related cleanup for a CPU.
1678 sfmmu_cpu_cleanup(cpu_t
*cp
)
1680 mmu_ctx_t
*mmu_ctxp
;
1682 ASSERT(MUTEX_HELD(&cpu_lock
));
1684 mmu_ctxp
= CPU_MMU_CTXP(cp
);
1685 ASSERT(mmu_ctxp
!= NULL
);
1688 * The mmu_lock is acquired here to prevent races with
1689 * the wrap-around code.
1691 mutex_enter(&mmu_ctxp
->mmu_lock
);
1693 CPU_MMU_CTXP(cp
) = NULL
;
1695 CPUSET_DEL(mmu_ctxp
->mmu_cpuset
, cp
->cpu_id
);
1696 if (--mmu_ctxp
->mmu_ncpus
== 0) {
1697 mmu_ctxs_tbl
[mmu_ctxp
->mmu_idx
] = NULL
;
1698 mutex_exit(&mmu_ctxp
->mmu_lock
);
1699 sfmmu_ctxdom_free(mmu_ctxp
);
1703 mutex_exit(&mmu_ctxp
->mmu_lock
);
1707 sfmmu_ctxdom_nctxs(int idx
)
1709 return (mmu_ctxs_tbl
[idx
]->mmu_nctxs
);
/*
 * sfmmu_ctxdoms_* is an interface provided to help keep context domains
 * consistent after suspend/resume on systems that can resume on different
 * hardware than they were suspended on.
 *
 * sfmmu_ctxdom_lock(void) locks all context domains and prevents new contexts
 * from being allocated. It acquires all hat_locks, which blocks most access to
 * context data, except for a few cases that are handled separately or are
 * harmless. It wraps each domain to increment gnum and invalidate on-CPU
 * contexts, and forces cnum to its max. As a result of this call all user
 * threads that are running on CPUs trap and try to perform wrap around but
 * can't because hat_locks are taken. Threads that were not on CPUs but started
 * by scheduler go to sfmmu_alloc_ctx() to acquire context without checking
 * hat_lock, but fail, because cnum == nctxs, and therefore also trap and block
 * on hat_lock trying to wrap. sfmmu_ctxdom_lock() must be called before CPUs
 * are paused, else it could deadlock acquiring locks held by paused CPUs.
 *
 * sfmmu_ctxdoms_remove() removes context domains from every CPU and records
 * the CPUs that had them. It must be called after CPUs have been paused. This
 * ensures that no threads are in sfmmu_alloc_ctx() accessing domain data,
 * because pause_cpus sends a mondo interrupt to every CPU, and sfmmu_alloc_ctx
 * runs with interrupts disabled. When CPUs are later resumed, they may enter
 * sfmmu_alloc_ctx, but it will check for CPU_MMU_CTXP = NULL and immediately
 * return failure. Or, they will be blocked trying to acquire hat_lock. Thus
 * after sfmmu_ctxdoms_remove returns, we are guaranteed that no one is
 * accessing the old context domains.
 *
 * sfmmu_ctxdoms_update(void) frees space used by old context domains and
 * allocates new context domains based on hardware layout. It initializes
 * every CPU that had a context domain before migration to have one again.
 * sfmmu_ctxdoms_update must be called after CPUs are resumed, else it
 * could deadlock acquiring locks held by paused CPUs.
 *
 * sfmmu_ctxdoms_unlock(void) releases all hat_locks after which user threads
 * acquire new context ids and continue execution.
 *
 * Therefore functions should be called in the following order:
 *       suspend_routine()
 *		sfmmu_ctxdom_lock()
 *		pause_cpus()
 *		suspend()
 *			if (suspend failed)
 *				sfmmu_ctxdom_unlock()
 *		...
 *		sfmmu_ctxdom_remove()
 *		resume_cpus()
 *		sfmmu_ctxdom_update()
 *		sfmmu_ctxdom_unlock()
 */
static cpuset_t sfmmu_ctxdoms_pset;
1765 sfmmu_ctxdoms_remove()
1771 * Record the CPUs that have domains in sfmmu_ctxdoms_pset, so they can
1772 * be restored post-migration. A CPU may be powered off and not have a
1773 * domain, for example.
1775 CPUSET_ZERO(sfmmu_ctxdoms_pset
);
1777 for (id
= 0; id
< NCPU
; id
++) {
1778 if ((cp
= cpu
[id
]) != NULL
&& CPU_MMU_CTXP(cp
) != NULL
) {
1779 CPUSET_ADD(sfmmu_ctxdoms_pset
, id
);
1780 CPU_MMU_CTXP(cp
) = NULL
;
static void
sfmmu_ctxdoms_lock(void)
{
	int idx;
	mmu_ctx_t *mmu_ctxp;

	sfmmu_hat_lock_all();

	/*
	 * At this point, no thread can be in sfmmu_ctx_wrap_around, because
	 * hat_lock is always taken before calling it.
	 *
	 * For each domain, set mmu_cnum to max so no more contexts can be
	 * allocated, and wrap to flush on-CPU contexts and force threads to
	 * acquire a new context when we later drop hat_lock after migration.
	 * Setting mmu_cnum may race with sfmmu_alloc_ctx which also sets cnum,
	 * but the latter uses CAS and will miscompare and not overwrite it.
	 */
	kpreempt_disable();	/* required by sfmmu_ctx_wrap_around */
	for (idx = 0; idx < max_mmu_ctxdoms; idx++) {
		if ((mmu_ctxp = mmu_ctxs_tbl[idx]) != NULL) {
			mutex_enter(&mmu_ctxp->mmu_lock);
			mmu_ctxp->mmu_cnum = mmu_ctxp->mmu_nctxs;
			/* make sure updated cnum visible */
			membar_enter();
			mutex_exit(&mmu_ctxp->mmu_lock);
			sfmmu_ctx_wrap_around(mmu_ctxp, B_FALSE);
		}
	}
	kpreempt_enable();
}
static void
sfmmu_ctxdoms_unlock(void)
{
	sfmmu_hat_unlock_all();
}
static void
sfmmu_ctxdoms_update(void)
{
	int idx;
	int id;
	cpu_t *cp;
	mmu_ctx_t *mmu_ctxp;

	/*
	 * Free all context domains.  As a side effect, this increases
	 * mmu_saved_gnum to the maximum gnum over all domains, which is used to
	 * init gnum in the new domains, which therefore will be larger than the
	 * sfmmu gnum for any process, guaranteeing that every process will see
	 * a new generation and allocate a new context regardless of what new
	 * domain it runs in.
	 */
	mutex_enter(&cpu_lock);

	for (idx = 0; idx < max_mmu_ctxdoms; idx++) {
		if (mmu_ctxs_tbl[idx] != NULL) {
			mmu_ctxp = mmu_ctxs_tbl[idx];
			mmu_ctxs_tbl[idx] = NULL;
			sfmmu_ctxdom_free(mmu_ctxp);
		}
	}

	for (id = 0; id < NCPU; id++) {
		if (CPU_IN_SET(sfmmu_ctxdoms_pset, id) &&
		    (cp = cpu[id]) != NULL)
			sfmmu_cpu_init(cp);
	}
	mutex_exit(&cpu_lock);
}
/*
 * hat_setup() makes an address space context the current active one.
 * In sfmmu this translates to setting the secondary context with the
 * corresponding context.
 */
void
hat_setup(struct hat *sfmmup, int allocflag)
{
	hatlock_t *hatlockp;

	/* Init needs some special treatment. */
	if (allocflag == HAT_INIT) {
		/*
		 * Make sure that we have
		 * 2. a valid ctx that doesn't get stolen after this point.
		 */
		hatlockp = sfmmu_hat_enter(sfmmup);

		/*
		 * Swap in the TSB.  hat_init() allocates tsbinfos without
		 * TSBs, but we need one for init, since the kernel does some
		 * special things to set up its stack and needs the TSB to
		 * resolve page faults.
		 */
		sfmmu_tsb_swapin(sfmmup, hatlockp);

		sfmmu_get_ctx(sfmmup);

		sfmmu_hat_exit(hatlockp);
	} else {
		ASSERT(allocflag == HAT_ALLOC);

		hatlockp = sfmmu_hat_enter(sfmmup);
		kpreempt_disable();

		CPUSET_ADD(sfmmup->sfmmu_cpusran, CPU->cpu_id);
		/*
		 * sfmmu_setctx_sec takes <pgsz|cnum> as a parameter,
		 * pagesize bits don't matter in this case since we are passing
		 * INVALID_CONTEXT to it.
		 * Compatibility Note: hw takes care of MMU_SCONTEXT1
		 */
		sfmmu_setctx_sec(INVALID_CONTEXT);
		sfmmu_clear_utsbinfo();

		kpreempt_enable();
		sfmmu_hat_exit(hatlockp);
	}
}
/*
 * Free all the translation resources for the specified address space.
 * Called from as_free when an address space is being destroyed.
 */
void
hat_free_start(struct hat *sfmmup)
{
	ASSERT(AS_WRITE_HELD(sfmmup->sfmmu_as));
	ASSERT(sfmmup != ksfmmup);

	sfmmup->sfmmu_free = 1;
	if (sfmmup->sfmmu_scdp != NULL) {
		sfmmu_leave_scd(sfmmup, 0);
	}

	ASSERT(sfmmup->sfmmu_scdp == NULL);
}
void
hat_free_end(struct hat *sfmmup)
{
	int i;

	ASSERT(sfmmup->sfmmu_free == 1);
	ASSERT(sfmmup->sfmmu_ttecnt[TTE8K] == 0);
	ASSERT(sfmmup->sfmmu_ttecnt[TTE64K] == 0);
	ASSERT(sfmmup->sfmmu_ttecnt[TTE512K] == 0);
	ASSERT(sfmmup->sfmmu_ttecnt[TTE4M] == 0);
	ASSERT(sfmmup->sfmmu_ttecnt[TTE32M] == 0);
	ASSERT(sfmmup->sfmmu_ttecnt[TTE256M] == 0);

	if (sfmmup->sfmmu_rmstat) {
		hat_freestat(sfmmup->sfmmu_as, NULL);
	}

	while (sfmmup->sfmmu_tsb != NULL) {
		struct tsb_info *next = sfmmup->sfmmu_tsb->tsb_next;
		sfmmu_tsbinfo_free(sfmmup->sfmmu_tsb);
		sfmmup->sfmmu_tsb = next;
	}

	if (sfmmup->sfmmu_srdp != NULL) {
		sfmmu_leave_srd(sfmmup);
		ASSERT(sfmmup->sfmmu_srdp == NULL);
		for (i = 0; i < SFMMU_L1_HMERLINKS; i++) {
			if (sfmmup->sfmmu_hmeregion_links[i] != NULL) {
				kmem_free(sfmmup->sfmmu_hmeregion_links[i],
				    SFMMU_L2_HMERLINKS_SIZE);
				sfmmup->sfmmu_hmeregion_links[i] = NULL;
			}
		}
	}
	sfmmu_free_sfmmu(sfmmup);

#ifdef DEBUG
	for (i = 0; i < SFMMU_L1_HMERLINKS; i++) {
		ASSERT(sfmmup->sfmmu_hmeregion_links[i] == NULL);
	}
#endif

	kmem_cache_free(sfmmuid_cache, sfmmup);
}
/*
 * Set up any translation structures, for the specified address space,
 * that are needed or preferred when the process is being swapped in.
 */
/* ARGSUSED */
void
hat_swapin(struct hat *hat)
{
}
/*
 * Free all of the translation resources, for the specified address space,
 * that can be freed while the process is swapped out. Called from as_swapout.
 * Also, free up the ctx that this process was using.
 */
void
hat_swapout(struct hat *sfmmup)
{
	struct hmehash_bucket *hmebp;
	struct hme_blk *hmeblkp;
	struct hme_blk *pr_hblk = NULL;
	struct hme_blk *nx_hblk;
	int i;
	struct hme_blk *list = NULL;
	hatlock_t *hatlockp;
	struct tsb_info *tsbinfop;
	struct free_tsb {
		struct free_tsb *next;
		struct tsb_info *tsbinfop;
	};			/* free list of TSBs */
	struct free_tsb *freelist, *last, *next;
	SFMMU_STAT(sf_swapout);

	/*
	 * There is no way to go from an as to all its translations in sfmmu.
	 * Here is one of the times when we take the big hit and traverse
	 * the hash looking for hme_blks to free up.  Not only do we free up
	 * this as's hme_blks but all those that are free.  We are obviously
	 * swapping because we need memory so let's free up as much as we can.
	 *
	 * Note that we don't flush TLB/TSB here -- it's not necessary
	 * because:
	 *  1) we free the ctx we're using and throw away the TSB(s);
	 *  2) processes aren't runnable while being swapped out.
	 */
	ASSERT(sfmmup != KHATID);
	for (i = 0; i <= UHMEHASH_SZ; i++) {
		hmebp = &uhme_hash[i];
		SFMMU_HASH_LOCK(hmebp);
		hmeblkp = hmebp->hmeblkp;
		pr_hblk = NULL;
		while (hmeblkp) {

			if ((hmeblkp->hblk_tag.htag_id == sfmmup) &&
			    !hmeblkp->hblk_shw_bit && !hmeblkp->hblk_lckcnt) {
				ASSERT(!hmeblkp->hblk_shared);
				(void) sfmmu_hblk_unload(sfmmup, hmeblkp,
				    (caddr_t)get_hblk_base(hmeblkp),
				    get_hblk_endaddr(hmeblkp),
				    NULL, HAT_UNLOAD);
			}
			nx_hblk = hmeblkp->hblk_next;
			if (!hmeblkp->hblk_vcnt && !hmeblkp->hblk_hmecnt) {
				ASSERT(!hmeblkp->hblk_lckcnt);
				sfmmu_hblk_hash_rm(hmebp, hmeblkp, pr_hblk,
				    &list, 0);
			} else {
				pr_hblk = hmeblkp;
			}
			hmeblkp = nx_hblk;
		}
		SFMMU_HASH_UNLOCK(hmebp);
	}

	sfmmu_hblks_list_purge(&list, 0);
	/*
	 * Now free up the ctx so that others can reuse it.
	 */
	hatlockp = sfmmu_hat_enter(sfmmup);

	sfmmu_invalidate_ctx(sfmmup);

	/*
	 * Free TSBs, but not tsbinfos, and set SWAPPED flag.
	 * If TSBs were never swapped in, just return.
	 * This implies that we don't support partial swapping
	 * of TSBs -- either all are swapped out, or none are.
	 *
	 * We must hold the HAT lock here to prevent racing with another
	 * thread trying to unmap TTEs from the TSB or running the post-
	 * relocator after relocating the TSB's memory.  Unfortunately, we
	 * can't free memory while holding the HAT lock or we could
	 * deadlock, so we build a list of TSBs to be freed after marking
	 * the tsbinfos as swapped out and free them after dropping the
	 * lock.
	 */
	if (SFMMU_FLAGS_ISSET(sfmmup, HAT_SWAPPED)) {
		sfmmu_hat_exit(hatlockp);
		return;
	}
	SFMMU_FLAGS_SET(sfmmup, HAT_SWAPPED);
	last = freelist = NULL;
	for (tsbinfop = sfmmup->sfmmu_tsb; tsbinfop != NULL;
	    tsbinfop = tsbinfop->tsb_next) {
		ASSERT((tsbinfop->tsb_flags & TSB_SWAPPED) == 0);

		/*
		 * Cast the TSB into a struct free_tsb and put it on the free
		 * list.
		 */
		if (freelist == NULL) {
			last = freelist = (struct free_tsb *)tsbinfop->tsb_va;
		} else {
			last->next = (struct free_tsb *)tsbinfop->tsb_va;
			last = last->next;
		}
		last->next = NULL;
		last->tsbinfop = tsbinfop;
		tsbinfop->tsb_flags |= TSB_SWAPPED;
		/*
		 * Zero out the TTE to clear the valid bit.
		 * Note we can't use a value like 0xbad because we want to
		 * ensure diagnostic bits are NEVER set on TTEs that might
		 * be loaded.  The intent is to catch any invalid access
		 * to the swapped TSB, such as a thread running with a valid
		 * context without first calling sfmmu_tsb_swapin() to
		 * allocate TSB memory.
		 */
		tsbinfop->tsb_tte.ll = 0;
	}

	/* Now we can drop the lock and free the TSB memory. */
	sfmmu_hat_exit(hatlockp);
	for (; freelist != NULL; freelist = next) {
		next = freelist->next;
		sfmmu_tsb_free(freelist->tsbinfop);
	}
}
2116 * Duplicate the translations of an as into another newas
2120 hat_dup(struct hat
*hat
, struct hat
*newhat
, caddr_t addr
, size_t len
,
2126 extern uint_t
get_color_start(struct as
*);
2128 ASSERT((flag
== 0) || (flag
== HAT_DUP_ALL
) || (flag
== HAT_DUP_COW
) ||
2129 (flag
== HAT_DUP_SRD
));
2130 ASSERT(hat
!= ksfmmup
);
2131 ASSERT(newhat
!= ksfmmup
);
2132 ASSERT(flag
!= HAT_DUP_ALL
|| hat
->sfmmu_srdp
== newhat
->sfmmu_srdp
);
2134 if (flag
== HAT_DUP_COW
) {
2135 panic("hat_dup: HAT_DUP_COW not supported");
2138 if (flag
== HAT_DUP_SRD
&& ((srdp
= hat
->sfmmu_srdp
) != NULL
)) {
2139 ASSERT(srdp
->srd_evp
!= NULL
);
2140 VN_HOLD(srdp
->srd_evp
);
2141 ASSERT(srdp
->srd_refcnt
> 0);
2142 newhat
->sfmmu_srdp
= srdp
;
2143 atomic_inc_32((volatile uint_t
*)&srdp
->srd_refcnt
);
2147 * HAT_DUP_ALL flag is used after as duplication is done.
2149 if (flag
== HAT_DUP_ALL
&& ((srdp
= newhat
->sfmmu_srdp
) != NULL
)) {
2150 ASSERT(newhat
->sfmmu_srdp
->srd_refcnt
>= 2);
2151 newhat
->sfmmu_rtteflags
= hat
->sfmmu_rtteflags
;
2152 if (hat
->sfmmu_flags
& HAT_4MTEXT_FLAG
) {
2153 newhat
->sfmmu_flags
|= HAT_4MTEXT_FLAG
;
2156 /* check if need to join scd */
2157 if ((scdp
= hat
->sfmmu_scdp
) != NULL
&&
2158 newhat
->sfmmu_scdp
!= scdp
) {
2160 SF_RGNMAP_IS_SUBSET(&newhat
->sfmmu_region_map
,
2161 &scdp
->scd_region_map
, ret
);
2163 sfmmu_join_scd(scdp
, newhat
);
2164 ASSERT(newhat
->sfmmu_scdp
== scdp
&&
2165 scdp
->scd_refcnt
>= 2);
2166 for (i
= 0; i
< max_mmu_page_sizes
; i
++) {
2167 newhat
->sfmmu_ismttecnt
[i
] =
2168 hat
->sfmmu_ismttecnt
[i
];
2169 newhat
->sfmmu_scdismttecnt
[i
] =
2170 hat
->sfmmu_scdismttecnt
[i
];
2174 sfmmu_check_page_sizes(newhat
, 1);
2177 if (flag
== HAT_DUP_ALL
&& consistent_coloring
== 0 &&
2178 update_proc_pgcolorbase_after_fork
!= 0) {
2179 hat
->sfmmu_clrbin
= get_color_start(hat
->sfmmu_as
);
void
hat_memload(struct hat *hat, caddr_t addr, struct page *pp,
	uint_t attr, uint_t flags)
{
	hat_do_memload(hat, addr, pp, attr, flags,
	    SFMMU_INVALID_SHMERID);
}
void
hat_memload_region(struct hat *hat, caddr_t addr, struct page *pp,
	uint_t attr, uint_t flags, hat_region_cookie_t rcookie)
{
	uint_t rid;

	if (rcookie == HAT_INVALID_REGION_COOKIE) {
		hat_do_memload(hat, addr, pp, attr, flags,
		    SFMMU_INVALID_SHMERID);
		return;
	}
	rid = (uint_t)((uint64_t)rcookie);
	ASSERT(rid < SFMMU_MAX_HME_REGIONS);
	hat_do_memload(hat, addr, pp, attr, flags, rid);
}
/*
 * Set up addr to map to page pp with protection prot.
 * As an optimization we also load the TSB with the
 * corresponding tte but it is no big deal if the tte gets kicked out.
 */
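/*
 * Typical use (illustrative sketch only; the caller, page, and address are
 * hypothetical): the VM layer resolves a fault on a page it has locked and
 * asks the hat to establish an 8K translation for it.
 *
 *	struct hat *hat = as->a_hat;	(caller's address space hat)
 *	caddr_t va;			(page-aligned fault address)
 *	page_t *pp;			(locked page, PAGE_LOCKED(pp))
 *
 *	hat_memload(hat, va, pp, PROT_READ | PROT_WRITE, HAT_LOAD);
 *
 * Per the assertions below, addr must be MMU_PAGESIZE aligned, pp must be
 * locked, and only SFMMU_LOAD_ALLATTR/SFMMU_LOAD_ALLFLAG bits may be set
 * in attr and flags.
 */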
2213 hat_do_memload(struct hat
*hat
, caddr_t addr
, struct page
*pp
,
2214 uint_t attr
, uint_t flags
, uint_t rid
)
2219 ASSERT(hat
!= NULL
);
2220 ASSERT(PAGE_LOCKED(pp
));
2221 ASSERT(!((uintptr_t)addr
& MMU_PAGEOFFSET
));
2222 ASSERT(!(flags
& ~SFMMU_LOAD_ALLFLAG
));
2223 ASSERT(!(attr
& ~SFMMU_LOAD_ALLATTR
));
2224 SFMMU_VALIDATE_HMERID(hat
, rid
, addr
, MMU_PAGESIZE
);
2226 if (PP_ISFREE(pp
)) {
2227 panic("hat_memload: loading a mapping to free page %p",
2231 ASSERT((hat
== ksfmmup
) || AS_LOCK_HELD(hat
->sfmmu_as
));
2233 if (flags
& ~SFMMU_LOAD_ALLFLAG
)
2234 cmn_err(CE_NOTE
, "hat_memload: unsupported flags %d",
2235 flags
& ~SFMMU_LOAD_ALLFLAG
);
2237 if (hat
->sfmmu_rmstat
)
2238 hat_resvstat(MMU_PAGESIZE
, hat
->sfmmu_as
, addr
);
2240 #if defined(SF_ERRATA_57)
2241 if ((hat
!= ksfmmup
) && AS_TYPE_64BIT(hat
->sfmmu_as
) &&
2242 (addr
< errata57_limit
) && (attr
& PROT_EXEC
) &&
2243 !(flags
& HAT_LOAD_SHARE
)) {
2244 cmn_err(CE_WARN
, "hat_memload: illegal attempt to make user "
2245 " page executable");
2250 sfmmu_memtte(&tte
, pp
->p_pagenum
, attr
, TTE8K
);
2251 (void) sfmmu_tteload_array(hat
, &tte
, addr
, &pp
, flags
, rid
);
2254 * Check TSB and TLB page sizes.
2256 if ((flags
& HAT_LOAD_SHARE
) == 0) {
2257 sfmmu_check_page_sizes(hat
, 1);
2262 * hat_devload can be called to map real memory (e.g.
2263 * /dev/kmem) and even though hat_devload will determine pf is
2264 * for memory, it will be unable to get a shared lock on the
2265 * page (because someone else has it exclusively) and will
2266 * pass dp = NULL. If tteload doesn't get a non-NULL
2267 * page pointer it can't cache memory.
2270 hat_devload(struct hat
*hat
, caddr_t addr
, size_t len
, pfn_t pfn
,
2271 uint_t attr
, int flags
)
2274 struct page
*pp
= NULL
;
2277 ASSERT(hat
!= NULL
);
2279 ASSERT(!(flags
& ~SFMMU_LOAD_ALLFLAG
));
2280 ASSERT(!(attr
& ~SFMMU_LOAD_ALLATTR
));
2281 ASSERT((hat
== ksfmmup
) || AS_LOCK_HELD(hat
->sfmmu_as
));
2283 panic("hat_devload: zero len");
2284 if (flags
& ~SFMMU_LOAD_ALLFLAG
)
2285 cmn_err(CE_NOTE
, "hat_devload: unsupported flags %d",
2286 flags
& ~SFMMU_LOAD_ALLFLAG
);
2288 #if defined(SF_ERRATA_57)
2289 if ((hat
!= ksfmmup
) && AS_TYPE_64BIT(hat
->sfmmu_as
) &&
2290 (addr
< errata57_limit
) && (attr
& PROT_EXEC
) &&
2291 !(flags
& HAT_LOAD_SHARE
)) {
2292 cmn_err(CE_WARN
, "hat_devload: illegal attempt to make user "
2293 " page executable");
2299 * If it's a memory page find its pp
2301 if (!(flags
& HAT_LOAD_NOCONSIST
) && pf_is_memory(pfn
)) {
2302 pp
= page_numtopp_nolock(pfn
);
2304 flags
|= HAT_LOAD_NOCONSIST
;
2306 if (PP_ISFREE(pp
)) {
2307 panic("hat_memload: loading "
2308 "a mapping to free page %p",
2311 if (!PAGE_LOCKED(pp
) && !PP_ISNORELOC(pp
)) {
2312 panic("hat_memload: loading a mapping "
2313 "to unlocked relocatable page %p",
2316 ASSERT(len
== MMU_PAGESIZE
);
2320 if (hat
->sfmmu_rmstat
)
2321 hat_resvstat(len
, hat
->sfmmu_as
, addr
);
2323 if (flags
& HAT_LOAD_NOCONSIST
) {
2324 attr
|= SFMMU_UNCACHEVTTE
;
2327 if (!pf_is_memory(pfn
)) {
2328 attr
|= SFMMU_UNCACHEPTTE
| HAT_NOSYNC
;
2330 switch (attr
& HAT_ORDER_MASK
) {
2331 case HAT_STRICTORDER
:
2332 case HAT_UNORDERED_OK
:
2334 * we set the side effect bit for all non
2335 * memory mappings unless merging is ok
2337 attr
|= SFMMU_SIDEFFECT
;
2339 case HAT_MERGING_OK
:
2340 case HAT_LOADCACHING_OK
:
2341 case HAT_STORECACHING_OK
:
2344 panic("hat_devload: bad attr");
2350 sfmmu_memtte(&tte
, pfn
, attr
, TTE8K
);
2351 (void) sfmmu_tteload_array(hat
, &tte
, addr
, &pp
,
2352 flags
, SFMMU_INVALID_SHMERID
);
2353 len
-= MMU_PAGESIZE
;
2354 addr
+= MMU_PAGESIZE
;
2359 * try to use large pages, check va/pa alignments
2360 * Note that 32M/256M page sizes are not (yet) supported.
2362 if ((len
>= MMU_PAGESIZE4M
) &&
2363 !((uintptr_t)addr
& MMU_PAGEOFFSET4M
) &&
2364 !(disable_large_pages
& (1 << TTE4M
)) &&
2365 !(mmu_ptob(pfn
) & MMU_PAGEOFFSET4M
)) {
2366 sfmmu_memtte(&tte
, pfn
, attr
, TTE4M
);
2367 (void) sfmmu_tteload_array(hat
, &tte
, addr
, &pp
,
2368 flags
, SFMMU_INVALID_SHMERID
);
2369 len
-= MMU_PAGESIZE4M
;
2370 addr
+= MMU_PAGESIZE4M
;
2371 pfn
+= MMU_PAGESIZE4M
/ MMU_PAGESIZE
;
2372 } else if ((len
>= MMU_PAGESIZE512K
) &&
2373 !((uintptr_t)addr
& MMU_PAGEOFFSET512K
) &&
2374 !(disable_large_pages
& (1 << TTE512K
)) &&
2375 !(mmu_ptob(pfn
) & MMU_PAGEOFFSET512K
)) {
2376 sfmmu_memtte(&tte
, pfn
, attr
, TTE512K
);
2377 (void) sfmmu_tteload_array(hat
, &tte
, addr
, &pp
,
2378 flags
, SFMMU_INVALID_SHMERID
);
2379 len
-= MMU_PAGESIZE512K
;
2380 addr
+= MMU_PAGESIZE512K
;
2381 pfn
+= MMU_PAGESIZE512K
/ MMU_PAGESIZE
;
2382 } else if ((len
>= MMU_PAGESIZE64K
) &&
2383 !((uintptr_t)addr
& MMU_PAGEOFFSET64K
) &&
2384 !(disable_large_pages
& (1 << TTE64K
)) &&
2385 !(mmu_ptob(pfn
) & MMU_PAGEOFFSET64K
)) {
2386 sfmmu_memtte(&tte
, pfn
, attr
, TTE64K
);
2387 (void) sfmmu_tteload_array(hat
, &tte
, addr
, &pp
,
2388 flags
, SFMMU_INVALID_SHMERID
);
2389 len
-= MMU_PAGESIZE64K
;
2390 addr
+= MMU_PAGESIZE64K
;
2391 pfn
+= MMU_PAGESIZE64K
/ MMU_PAGESIZE
;
2393 sfmmu_memtte(&tte
, pfn
, attr
, TTE8K
);
2394 (void) sfmmu_tteload_array(hat
, &tte
, addr
, &pp
,
2395 flags
, SFMMU_INVALID_SHMERID
);
2396 len
-= MMU_PAGESIZE
;
2397 addr
+= MMU_PAGESIZE
;
2403 * Check TSB and TLB page sizes.
2405 if ((flags
& HAT_LOAD_SHARE
) == 0) {
2406 sfmmu_check_page_sizes(hat
, 1);
void
hat_memload_array(struct hat *hat, caddr_t addr, size_t len,
	struct page **pps, uint_t attr, uint_t flags)
{
	hat_do_memload_array(hat, addr, len, pps, attr, flags,
	    SFMMU_INVALID_SHMERID);
}

void
hat_memload_array_region(struct hat *hat, caddr_t addr, size_t len,
	struct page **pps, uint_t attr, uint_t flags,
	hat_region_cookie_t rcookie)
{
	uint_t rid;

	if (rcookie == HAT_INVALID_REGION_COOKIE) {
		hat_do_memload_array(hat, addr, len, pps, attr, flags,
		    SFMMU_INVALID_SHMERID);
		return;
	}
	rid = (uint_t)((uint64_t)rcookie);
	ASSERT(rid < SFMMU_MAX_HME_REGIONS);
	hat_do_memload_array(hat, addr, len, pps, attr, flags, rid);
}
/*
 * Map the largest extent possible out of the page array. The array may NOT
 * be in order.  The largest possible mapping a page can have
 * is specified in the p_szc field.  The p_szc field
 * cannot change as long as there are any mappings (large or small)
 * to any of the pages that make up the large page. (i.e. any
 * promotion/demotion of page size is not up to the hat but up to
 * the page free list manager).  The array
 * should consist of properly aligned contiguous pages that are
 * part of a big page for a large mapping to be created.
 */
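/*
 * Illustrative example (hypothetical values): to map 512K worth of 8K pages
 * in one call, the caller passes an array of 64 page_t pointers.  If the
 * pages are physically contiguous, properly aligned, and each has a p_szc
 * large enough for the 512K size, a single 512K translation is loaded;
 * otherwise the code below falls back to batching 8K translations an
 * hmeblk at a time.
 *
 *	page_t *pps[64];		(64 * 8K == 512K, filled in and
 *					 locked by the caller)
 *
 *	hat_memload_array(hat, va, 512 * 1024, pps, PROT_READ, HAT_LOAD);
 */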
2446 hat_do_memload_array(struct hat
*hat
, caddr_t addr
, size_t len
,
2447 struct page
**pps
, uint_t attr
, uint_t flags
, uint_t rid
)
2451 pgcnt_t numpg
, npgs
;
2454 uint_t large_pages_disable
;
2456 ASSERT(!((uintptr_t)addr
& MMU_PAGEOFFSET
));
2457 SFMMU_VALIDATE_HMERID(hat
, rid
, addr
, len
);
2459 if (hat
->sfmmu_rmstat
)
2460 hat_resvstat(len
, hat
->sfmmu_as
, addr
);
2462 #if defined(SF_ERRATA_57)
2463 if ((hat
!= ksfmmup
) && AS_TYPE_64BIT(hat
->sfmmu_as
) &&
2464 (addr
< errata57_limit
) && (attr
& PROT_EXEC
) &&
2465 !(flags
& HAT_LOAD_SHARE
)) {
2466 cmn_err(CE_WARN
, "hat_memload_array: illegal attempt to make "
2467 "user page executable");
2472 /* Get number of pages */
2473 npgs
= len
>> MMU_PAGESHIFT
;
2475 if (flags
& HAT_LOAD_SHARE
) {
2476 large_pages_disable
= disable_ism_large_pages
;
2478 large_pages_disable
= disable_large_pages
;
2481 if (npgs
< NHMENTS
|| large_pages_disable
== LARGE_PAGES_OFF
) {
2482 sfmmu_memload_batchsmall(hat
, addr
, pps
, attr
, flags
, npgs
,
2487 while (npgs
>= NHMENTS
) {
2489 for (ttesz
= pp
->p_szc
; ttesz
!= TTE8K
; ttesz
--) {
2491 * Check if this page size is disabled.
2493 if (large_pages_disable
& (1 << ttesz
))
2496 numpg
= TTEPAGES(ttesz
);
2497 mapsz
= numpg
<< MMU_PAGESHIFT
;
2498 if ((npgs
>= numpg
) &&
2499 IS_P2ALIGNED(addr
, mapsz
) &&
2500 IS_P2ALIGNED(pp
->p_pagenum
, numpg
)) {
				/*
				 * At this point we have enough pages and
				 * we know the virtual address and the pfn
				 * are properly aligned.  We still need
				 * to check for physical contiguity but since
				 * it is very likely that this is the case
				 * we will assume they are so and undo
				 * the request if necessary.  It would
				 * be great if we could get a hint flag
				 * like HAT_CONTIG which would tell us
				 * the pages are contiguous for sure.
				 */
2513 sfmmu_memtte(&tte
, (*pps
)->p_pagenum
,
2515 if (!sfmmu_tteload_array(hat
, &tte
, addr
,
2521 if (ttesz
== TTE8K
) {
			/*
			 * We were not able to map the array using a large
			 * page; batch an hmeblk, or a fraction of one, at a
			 * time.
			 */
2526 numpg
= ((uintptr_t)addr
>> MMU_PAGESHIFT
)
2528 numpg
= NHMENTS
- numpg
;
2529 ASSERT(numpg
<= npgs
);
2530 mapsz
= numpg
* MMU_PAGESIZE
;
2531 sfmmu_memload_batchsmall(hat
, addr
, pps
, attr
, flags
,
2540 sfmmu_memload_batchsmall(hat
, addr
, pps
, attr
, flags
, npgs
,
2545 * Check TSB and TLB page sizes.
2547 if ((flags
& HAT_LOAD_SHARE
) == 0) {
2548 sfmmu_check_page_sizes(hat
, 1);
2553 * Function tries to batch 8K pages into the same hme blk.
2556 sfmmu_memload_batchsmall(struct hat
*hat
, caddr_t vaddr
, page_t
**pps
,
2557 uint_t attr
, uint_t flags
, pgcnt_t npgs
, uint_t rid
)
2561 struct hmehash_bucket
*hmebp
;
2562 struct hme_blk
*hmeblkp
;
2567 * Acquire the hash bucket.
2569 hmebp
= sfmmu_tteload_acquire_hashbucket(hat
, vaddr
, TTE8K
,
2574 * Find the hment block.
2576 hmeblkp
= sfmmu_tteload_find_hmeblk(hat
, hmebp
, vaddr
,
2585 sfmmu_memtte(&tte
, pp
->p_pagenum
, attr
, TTE8K
);
2588 * Add the translation.
2590 (void) sfmmu_tteload_addentry(hat
, hmeblkp
, &tte
,
2591 vaddr
, pps
, flags
, rid
);
2600 * Goto next address.
2602 vaddr
+= MMU_PAGESIZE
;
2605 * Don't crossover into a different hmentblk.
2607 index
= (int)(((uintptr_t)vaddr
>> MMU_PAGESHIFT
) &
2610 } while (index
!= 0 && npgs
!= 0);
2613 * Release the hash bucket.
2616 sfmmu_tteload_release_hashbucket(hmebp
);
2621 * Construct a tte for a page:
2624 * tte_size2 = size & TTE_SZ2_BITS (Panther and Olympus-C only)
2626 * tte_nfo = attr & HAT_NOFAULT
2627 * tte_ie = attr & HAT_STRUCTURE_LE
2628 * tte_hmenum = hmenum
2629 * tte_pahi = pp->p_pagenum >> TTE_PASHIFT;
2630 * tte_palo = pp->p_pagenum & TTE_PALOMASK;
2631 * tte_ref = 1 (optimization)
2632 * tte_wr_perm = attr & PROT_WRITE;
2633 * tte_no_sync = attr & HAT_NOSYNC
2634 * tte_lock = attr & SFMMU_LOCKTTE
2635 * tte_cp = !(attr & SFMMU_UNCACHEPTTE)
2636 * tte_cv = !(attr & SFMMU_UNCACHEVTTE)
2637 * tte_e = attr & SFMMU_SIDEFFECT
2638 * tte_priv = !(attr & PROT_USER)
2639 * tte_hwwr = if nosync is set and it is writable we set the mod bit (opt)
void
sfmmu_memtte(tte_t *ttep, pfn_t pfn, uint_t attr, int tte_sz)
{
	ASSERT(!(attr & ~SFMMU_LOAD_ALLATTR));

	ttep->tte_inthi = MAKE_TTE_INTHI(pfn, attr, tte_sz, 0 /* hmenum */);
	ttep->tte_intlo = MAKE_TTE_INTLO(pfn, attr, tte_sz, 0 /* hmenum */);

	if (TTE_IS_NOSYNC(ttep)) {
		/* preset ref (and mod if writable); see tte_ref/tte_hwwr above */
		TTE_SET_REF(ttep);
		if (TTE_IS_WRITABLE(ttep)) {
			TTE_SET_MOD(ttep);
		}
	}
	if (TTE_IS_NFO(ttep) && TTE_IS_EXECUTABLE(ttep)) {
		panic("sfmmu_memtte: can't set both NFO and EXEC bits");
	}
}
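/*
 * Worked example (illustrative): sfmmu_memtte(&tte, pfn, PROT_READ |
 * PROT_WRITE | HAT_NOSYNC, TTE8K) builds an 8K TTE for pfn with tte_wr_perm
 * set (from PROT_WRITE) and tte_no_sync set (from HAT_NOSYNC).  Per the
 * TTE_IS_NOSYNC() case above, the ref bit, and the hw mod bit because the
 * mapping is writable, are preset so ref/mod tracking traps are avoided for
 * this mapping.
 */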
2662 * This function will add a translation to the hme_blk and allocate the
2663 * hme_blk if one does not exist.
2664 * If a page structure is specified then it will add the
2665 * corresponding hment to the mapping list.
2666 * It will also update the hmenum field for the tte.
2668 * Currently this function is only used for kernel mappings.
2669 * So pass invalid region to sfmmu_tteload_array().
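 *
 * Typical kernel use (illustrative sketch, hypothetical values): build a
 * TTE for a physical page and load it into the kernel HAT at a kernel
 * virtual address.
 *
 *	tte_t tte;
 *
 *	sfmmu_memtte(&tte, pfn, PROT_READ | PROT_WRITE, TTE8K);
 *	sfmmu_tteload(ksfmmup, &tte, kva, pp, HAT_LOAD_LOCK);
 *
 * The ASSERT below enforces that only ksfmmup may be passed here; user
 * mappings go through sfmmu_tteload_array() with a region id instead.
 */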
2672 sfmmu_tteload(struct hat
*sfmmup
, tte_t
*ttep
, caddr_t vaddr
, page_t
*pp
,
2675 ASSERT(sfmmup
== ksfmmup
);
2676 (void) sfmmu_tteload_array(sfmmup
, ttep
, vaddr
, &pp
, flags
,
2677 SFMMU_INVALID_SHMERID
);
2681 * Load (ttep != NULL) or unload (ttep == NULL) one entry in the TSB.
2682 * Assumes that a particular page size may only be resident in one TSB.
2685 sfmmu_mod_tsb(sfmmu_t
*sfmmup
, caddr_t vaddr
, tte_t
*ttep
, int ttesz
)
2687 struct tsb_info
*tsbinfop
= NULL
;
2689 struct tsbe
*tsbe_addr
;
2692 int vpshift
= MMU_PAGESHIFT
;
2695 if (sfmmup
== ksfmmup
) { /* No support for 32/256M ksfmmu pages */
2697 if (ttesz
>= TTE4M
) {
2699 ASSERT((ttesz
!= TTE32M
) && (ttesz
!= TTE256M
));
2701 tsb_base
= (phys
)? ktsb4m_pbase
: (uint64_t)ktsb4m_base
;
2702 tsb_size
= ktsb4m_szcode
;
2704 tsb_base
= (phys
)? ktsb_pbase
: (uint64_t)ktsb_base
;
2705 tsb_size
= ktsb_szcode
;
2708 SFMMU_GET_TSBINFO(tsbinfop
, sfmmup
, ttesz
);
2711 * If there isn't a TSB for this page size, or the TSB is
2712 * swapped out, there is nothing to do. Note that the latter
2713 * case seems impossible but can occur if hat_pageunload()
2714 * is called on an ISM mapping while the process is swapped
2717 if (tsbinfop
== NULL
|| (tsbinfop
->tsb_flags
& TSB_SWAPPED
))
2721 * If another thread is in the middle of relocating a TSB
2722 * we can't unload the entry so set a flag so that the
2723 * TSB will be flushed before it can be accessed by the
2726 if ((tsbinfop
->tsb_flags
& TSB_RELOC_FLAG
) != 0) {
2728 tsbinfop
->tsb_flags
|= TSB_FLUSH_NEEDED
;
2731 #if defined(UTSB_PHYS)
2733 tsb_base
= (uint64_t)tsbinfop
->tsb_pa
;
2735 tsb_base
= (uint64_t)tsbinfop
->tsb_va
;
2737 tsb_size
= tsbinfop
->tsb_szc
;
2740 vpshift
= MMU_PAGESHIFT4M
;
2742 tsbe_addr
= sfmmu_get_tsbe(tsb_base
, vaddr
, vpshift
, tsb_size
);
2743 tag
= sfmmu_make_tsbtag(vaddr
);
2746 sfmmu_unload_tsbe(tsbe_addr
, tag
, phys
);
2748 if (ttesz
>= TTE4M
) {
2749 SFMMU_STAT(sf_tsb_load4m
);
2751 SFMMU_STAT(sf_tsb_load8k
);
2754 sfmmu_load_tsbe(tsbe_addr
, tag
, ttep
, phys
);
2759 * Unmap all entries from [start, end) matching the given page size.
2761 * This function is used primarily to unmap replicated 64K or 512K entries
2762 * from the TSB that are inserted using the base page size TSB pointer, but
2763 * it may also be called to unmap a range of addresses from the TSB.
2766 sfmmu_unload_tsb_range(sfmmu_t
*sfmmup
, caddr_t start
, caddr_t end
, int ttesz
)
2768 struct tsb_info
*tsbinfop
;
2770 struct tsbe
*tsbe_addr
;
2779 * If ttesz == 8K, 64K or 512K, we walk through the range 8K
2780 * at a time shooting down any valid entries we encounter.
2782 * If ttesz >= 4M we walk the range 4M at a time shooting
2783 * down any valid mappings we find.
2785 if (sfmmup
== ksfmmup
) {
2787 if (ttesz
>= TTE4M
) {
2789 ASSERT((ttesz
!= TTE32M
) && (ttesz
!= TTE256M
));
2791 tsb_base
= (phys
)? ktsb4m_pbase
: (uint64_t)ktsb4m_base
;
2792 tsb_size
= ktsb4m_szcode
;
2794 tsb_base
= (phys
)? ktsb_pbase
: (uint64_t)ktsb_base
;
2795 tsb_size
= ktsb_szcode
;
2798 SFMMU_GET_TSBINFO(tsbinfop
, sfmmup
, ttesz
);
2801 * If there isn't a TSB for this page size, or the TSB is
2802 * swapped out, there is nothing to do. Note that the latter
2803 * case seems impossible but can occur if hat_pageunload()
2804 * is called on an ISM mapping while the process is swapped
2807 if (tsbinfop
== NULL
|| (tsbinfop
->tsb_flags
& TSB_SWAPPED
))
2811 * If another thread is in the middle of relocating a TSB
2812 * we can't unload the entry so set a flag so that the
2813 * TSB will be flushed before it can be accessed by the
2816 if ((tsbinfop
->tsb_flags
& TSB_RELOC_FLAG
) != 0) {
2817 tsbinfop
->tsb_flags
|= TSB_FLUSH_NEEDED
;
2820 #if defined(UTSB_PHYS)
2822 tsb_base
= (uint64_t)tsbinfop
->tsb_pa
;
2824 tsb_base
= (uint64_t)tsbinfop
->tsb_va
;
2826 tsb_size
= tsbinfop
->tsb_szc
;
2828 if (ttesz
>= TTE4M
) {
2829 vpshift
= MMU_PAGESHIFT4M
;
2830 vpgsz
= MMU_PAGESIZE4M
;
2832 vpshift
= MMU_PAGESHIFT
;
2833 vpgsz
= MMU_PAGESIZE
;
2836 for (vaddr
= start
; vaddr
< end
; vaddr
+= vpgsz
) {
2837 tag
= sfmmu_make_tsbtag(vaddr
);
2838 tsbe_addr
= sfmmu_get_tsbe(tsb_base
, vaddr
, vpshift
, tsb_size
);
2839 sfmmu_unload_tsbe(tsbe_addr
, tag
, phys
);
2844 * Select the optimum TSB size given the number of mappings
2845 * that need to be cached.
2848 sfmmu_select_tsb_szc(pgcnt_t pgcnt
)
2853 if (tsb_grow_stress
) {
2854 uint32_t randval
= (uint32_t)gettick() >> 4;
2855 return (randval
% (tsb_max_growsize
+ 1));
2859 while ((szc
< tsb_max_growsize
) && (pgcnt
> SFMMU_RSS_TSBSIZE(szc
)))
2865 * This function will add a translation to the hme_blk and allocate the
2866 * hme_blk if one does not exist.
2867 * If a page structure is specified then it will add the
2868 * corresponding hment to the mapping list.
2869 * It will also update the hmenum field for the tte.
2870 * Furthermore, it attempts to create a large page translation
2871 * for <addr,hat> at page array pps. It assumes addr and first
2872 * pp is correctly aligned. It returns 0 if successful and 1 otherwise.
2875 sfmmu_tteload_array(sfmmu_t
*sfmmup
, tte_t
*ttep
, caddr_t vaddr
,
2876 page_t
**pps
, uint_t flags
, uint_t rid
)
2878 struct hmehash_bucket
*hmebp
;
2879 struct hme_blk
*hmeblkp
;
2886 size
= TTE_CSZ(ttep
);
2887 ASSERT(!((uintptr_t)vaddr
& TTE_PAGE_OFFSET(size
)));
2890 * Acquire the hash bucket.
2892 hmebp
= sfmmu_tteload_acquire_hashbucket(sfmmup
, vaddr
, size
, rid
);
2896 * Find the hment block.
2898 hmeblkp
= sfmmu_tteload_find_hmeblk(sfmmup
, hmebp
, vaddr
, size
, flags
,
2903 * Add the translation.
2905 ret
= sfmmu_tteload_addentry(sfmmup
, hmeblkp
, ttep
, vaddr
, pps
, flags
,
2909 * Release the hash bucket.
2911 sfmmu_tteload_release_hashbucket(hmebp
);
2917 * Function locks and returns a pointer to the hash bucket for vaddr and size.
2919 static struct hmehash_bucket
*
2920 sfmmu_tteload_acquire_hashbucket(sfmmu_t
*sfmmup
, caddr_t vaddr
, int size
,
2923 struct hmehash_bucket
*hmebp
;
2925 void *htagid
= sfmmutohtagid(sfmmup
, rid
);
2927 ASSERT(htagid
!= NULL
);
2929 hmeshift
= HME_HASH_SHIFT(size
);
2931 hmebp
= HME_HASH_FUNCTION(htagid
, vaddr
, hmeshift
);
2933 SFMMU_HASH_LOCK(hmebp
);
/*
 * Function returns a pointer to an hmeblk in the hash bucket, hmebp. If the
 * hmeblk doesn't exist for the [sfmmup, vaddr & size] signature, a hmeblk is
 * allocated.
 */
2943 static struct hme_blk
*
2944 sfmmu_tteload_find_hmeblk(sfmmu_t
*sfmmup
, struct hmehash_bucket
*hmebp
,
2945 caddr_t vaddr
, uint_t size
, uint_t flags
, uint_t rid
)
2949 struct hme_blk
*hmeblkp
, *pr_hblk
, *list
= NULL
;
2951 SFMMU_VALIDATE_HMERID(sfmmup
, rid
, vaddr
, TTEBYTES(size
));
2953 hblktag
.htag_id
= sfmmutohtagid(sfmmup
, rid
);
2954 ASSERT(hblktag
.htag_id
!= NULL
);
2955 hmeshift
= HME_HASH_SHIFT(size
);
2956 hblktag
.htag_bspage
= HME_HASH_BSPAGE(vaddr
, hmeshift
);
2957 hblktag
.htag_rehash
= HME_HASH_REHASH(size
);
2958 hblktag
.htag_rid
= rid
;
2962 HME_HASH_SEARCH_PREV(hmebp
, hblktag
, hmeblkp
, pr_hblk
, &list
);
2965 * We block until hblk_reserve_lock is released; it's held by
2966 * the thread, temporarily using hblk_reserve, until hblk_reserve is
2967 * replaced by a hblk from sfmmu8_cache.
2969 if (hmeblkp
== (struct hme_blk
*)hblk_reserve
&&
2970 hblk_reserve_thread
!= curthread
) {
2971 SFMMU_HASH_UNLOCK(hmebp
);
2972 mutex_enter(&hblk_reserve_lock
);
2973 mutex_exit(&hblk_reserve_lock
);
2974 SFMMU_STAT(sf_hblk_reserve_hit
);
2975 SFMMU_HASH_LOCK(hmebp
);
2976 goto ttearray_realloc
;
2979 if (hmeblkp
== NULL
) {
2980 hmeblkp
= sfmmu_hblk_alloc(sfmmup
, vaddr
, hmebp
, size
,
2981 hblktag
, flags
, rid
);
2982 ASSERT(!SFMMU_IS_SHMERID_VALID(rid
) || hmeblkp
->hblk_shared
);
2983 ASSERT(SFMMU_IS_SHMERID_VALID(rid
) || !hmeblkp
->hblk_shared
);
	/*
	 * It is possible for 8k and 64k hblks to collide since they
	 * have the same rehash value.  This is because we
	 * lazily free hblks and 8K/64K blks could be lingering.
	 * If we find a size mismatch we free the block and try again.
	 */
2991 if (get_hblk_ttesz(hmeblkp
) != size
) {
2992 ASSERT(!hmeblkp
->hblk_vcnt
);
2993 ASSERT(!hmeblkp
->hblk_hmecnt
);
2994 sfmmu_hblk_hash_rm(hmebp
, hmeblkp
, pr_hblk
,
2996 goto ttearray_realloc
;
2998 if (hmeblkp
->hblk_shw_bit
) {
3000 * if the hblk was previously used as a shadow hblk then
3001 * we will change it to a normal hblk
3003 ASSERT(!hmeblkp
->hblk_shared
);
3004 if (hmeblkp
->hblk_shw_mask
) {
3005 sfmmu_shadow_hcleanup(sfmmup
, hmeblkp
, hmebp
);
3006 ASSERT(SFMMU_HASH_LOCK_ISHELD(hmebp
));
3007 goto ttearray_realloc
;
3009 hmeblkp
->hblk_shw_bit
= 0;
3012 SFMMU_STAT(sf_hblk_hit
);
3016 * hat_memload() should never call kmem_cache_free() for kernel hmeblks;
3017 * see block comment showing the stacktrace in sfmmu_hblk_alloc();
3018 * set the flag parameter to 1 so that sfmmu_hblks_list_purge() will
3019 * just add these hmeblks to the per-cpu pending queue.
3021 sfmmu_hblks_list_purge(&list
, 1);
3023 ASSERT(get_hblk_ttesz(hmeblkp
) == size
);
3024 ASSERT(!hmeblkp
->hblk_shw_bit
);
3025 ASSERT(!SFMMU_IS_SHMERID_VALID(rid
) || hmeblkp
->hblk_shared
);
3026 ASSERT(SFMMU_IS_SHMERID_VALID(rid
) || !hmeblkp
->hblk_shared
);
3027 ASSERT(hmeblkp
->hblk_tag
.htag_rid
== rid
);
3033 * Function adds a tte entry into the hmeblk. It returns 0 if successful and 1
3037 sfmmu_tteload_addentry(sfmmu_t
*sfmmup
, struct hme_blk
*hmeblkp
, tte_t
*ttep
,
3038 caddr_t vaddr
, page_t
**pps
, uint_t flags
, uint_t rid
)
3041 int hmenum
, size
, remap
;
3042 tte_t tteold
, flush_tte
;
3046 struct sf_hment
*sfhme
;
3047 kmutex_t
*pml
, *pmtx
;
3048 hatlock_t
*hatlockp
;
3052 * remove this panic when we decide to let user virtual address
3053 * space be >= USERLIMIT.
3055 if (!TTE_IS_PRIVILEGED(ttep
) && vaddr
>= (caddr_t
)USERLIMIT
)
3056 panic("user addr %p in kernel space", (void *)vaddr
);
3057 #if defined(TTE_IS_GLOBAL)
3058 if (TTE_IS_GLOBAL(ttep
))
3059 panic("sfmmu_tteload: creating global tte");
3063 if (pf_is_memory(sfmmu_ttetopfn(ttep
, vaddr
)) &&
3064 !TTE_IS_PCACHEABLE(ttep
) && !sfmmu_allow_nc_trans
)
3065 panic("sfmmu_tteload: non cacheable memory tte");
3068 /* don't simulate dirty bit for writeable ISM/DISM mappings */
3069 if ((flags
& HAT_LOAD_SHARE
) && TTE_IS_WRITABLE(ttep
)) {
3074 if ((flags
& HAT_LOAD_SHARE
) || !TTE_IS_REF(ttep
) ||
3075 !TTE_IS_MOD(ttep
)) {
3077 * Don't load TSB for dummy as in ISM. Also don't preload
3078 * the TSB if the TTE isn't writable since we're likely to
3079 * fault on it again -- preloading can be fairly expensive.
3081 flags
|= SFMMU_NO_TSBLOAD
;
3084 size
= TTE_CSZ(ttep
);
3087 SFMMU_STAT(sf_tteload8k
);
3090 SFMMU_STAT(sf_tteload64k
);
3093 SFMMU_STAT(sf_tteload512k
);
3096 SFMMU_STAT(sf_tteload4m
);
3099 SFMMU_STAT(sf_tteload32m
);
3100 ASSERT(mmu_page_sizes
== max_mmu_page_sizes
);
3103 SFMMU_STAT(sf_tteload256m
);
3104 ASSERT(mmu_page_sizes
== max_mmu_page_sizes
);
3108 ASSERT(!((uintptr_t)vaddr
& TTE_PAGE_OFFSET(size
)));
3109 SFMMU_VALIDATE_HMERID(sfmmup
, rid
, vaddr
, TTEBYTES(size
));
3110 ASSERT(!SFMMU_IS_SHMERID_VALID(rid
) || hmeblkp
->hblk_shared
);
3111 ASSERT(SFMMU_IS_SHMERID_VALID(rid
) || !hmeblkp
->hblk_shared
);
3113 HBLKTOHME_IDX(sfhme
, hmeblkp
, vaddr
, hmenum
);
3116 * Need to grab mlist lock here so that pageunload
3117 * will not change tte behind us.
3120 pml
= sfmmu_mlist_enter(pp
);
3123 sfmmu_copytte(&sfhme
->hme_tte
, &tteold
);
3125 * Look for corresponding hment and if valid verify
3128 remap
= TTE_IS_VALID(&tteold
);
3130 pfn_t new_pfn
, old_pfn
;
3132 old_pfn
= TTE_TO_PFN(vaddr
, &tteold
);
3133 new_pfn
= TTE_TO_PFN(vaddr
, ttep
);
3135 if (flags
& HAT_LOAD_REMAP
) {
3136 /* make sure we are remapping same type of pages */
3137 if (pf_is_memory(old_pfn
) != pf_is_memory(new_pfn
)) {
3138 panic("sfmmu_tteload - tte remap io<->memory");
3140 if (old_pfn
!= new_pfn
&&
3141 (pp
!= NULL
|| sfhme
->hme_page
!= NULL
)) {
3142 panic("sfmmu_tteload - tte remap pp != NULL");
3144 } else if (old_pfn
!= new_pfn
) {
3145 panic("sfmmu_tteload - tte remap, hmeblkp 0x%p",
3148 ASSERT(TTE_CSZ(&tteold
) == TTE_CSZ(ttep
));
3152 if (size
== TTE8K
) {
3155 * Handle VAC consistency
3157 if (!remap
&& (cache
& CACHE_VAC
) && !PP_ISNC(pp
)) {
3158 sfmmu_vac_conflict(sfmmup
, vaddr
, pp
);
3162 if (TTE_IS_WRITABLE(ttep
) && PP_ISRO(pp
)) {
3163 pmtx
= sfmmu_page_enter(pp
);
3165 sfmmu_page_exit(pmtx
);
3166 } else if (!PP_ISMAPPED(pp
) &&
3167 (!TTE_IS_WRITABLE(ttep
)) && !(PP_ISMOD(pp
))) {
3168 pmtx
= sfmmu_page_enter(pp
);
3169 if (!(PP_ISMOD(pp
))) {
3172 sfmmu_page_exit(pmtx
);
3175 } else if (sfmmu_pagearray_setup(vaddr
, pps
, ttep
, remap
)) {
3177 * sfmmu_pagearray_setup failed so return
3179 sfmmu_mlist_exit(pml
);
3185 * Make sure hment is not on a mapping list.
3187 ASSERT(remap
|| (sfhme
->hme_page
== NULL
));
3189 /* if it is not a remap then hme->next better be NULL */
3190 ASSERT((!remap
) ? sfhme
->hme_next
== NULL
: 1);
3192 if (flags
& HAT_LOAD_LOCK
) {
3193 if ((hmeblkp
->hblk_lckcnt
+ 1) >= MAX_HBLK_LCKCNT
) {
3194 panic("too high lckcnt-hmeblk %p",
3197 atomic_inc_32(&hmeblkp
->hblk_lckcnt
);
3199 HBLK_STACK_TRACE(hmeblkp
, HBLK_LOCK
);
3203 if (pp
&& PP_ISNC(pp
)) {
3205 * If the physical page is marked to be uncacheable, like
3206 * by a vac conflict, make sure the new mapping is also
3209 TTE_CLR_VCACHEABLE(ttep
);
3210 ASSERT(PP_GET_VCOLOR(pp
) == NO_VCOLOR
);
3213 ttep
->tte_hmenum
= hmenum
;
3219 while (sfmmu_modifytte_try(&tteold
, ttep
, &sfhme
->hme_tte
) < 0) {
3220 if ((sfmmup
== KHATID
) &&
3221 (flags
& (HAT_LOAD_LOCK
| HAT_LOAD_REMAP
))) {
3222 sfmmu_copytte(&sfhme
->hme_tte
, &tteold
);
3225 chk_tte(&orig_old
, &tteold
, ttep
, hmeblkp
);
3228 ASSERT(TTE_IS_VALID(&sfhme
->hme_tte
));
3230 if (!TTE_IS_VALID(&tteold
)) {
3232 atomic_inc_16(&hmeblkp
->hblk_vcnt
);
3233 if (rid
== SFMMU_INVALID_SHMERID
) {
3234 atomic_inc_ulong(&sfmmup
->sfmmu_ttecnt
[size
]);
3236 sf_srd_t
*srdp
= sfmmup
->sfmmu_srdp
;
3237 sf_region_t
*rgnp
= srdp
->srd_hmergnp
[rid
];
			/*
			 * We already accounted for region ttecnt's in sfmmu
			 * during hat_join_region() processing. Here we
			 * only update ttecnt's in the region structure.
			 */
3243 atomic_inc_ulong(&rgnp
->rgn_ttecnt
[size
]);
3247 myflt
= (astosfmmu(curthread
->t_procp
->p_as
) == sfmmup
);
3248 if (size
> TTE8K
&& (flags
& HAT_LOAD_SHARE
) == 0 &&
3249 sfmmup
!= ksfmmup
) {
3250 uchar_t tteflag
= 1 << size
;
3251 if (rid
== SFMMU_INVALID_SHMERID
) {
3252 if (!(sfmmup
->sfmmu_tteflags
& tteflag
)) {
3253 hatlockp
= sfmmu_hat_enter(sfmmup
);
3254 sfmmup
->sfmmu_tteflags
|= tteflag
;
3255 sfmmu_hat_exit(hatlockp
);
3257 } else if (!(sfmmup
->sfmmu_rtteflags
& tteflag
)) {
3258 hatlockp
= sfmmu_hat_enter(sfmmup
);
3259 sfmmup
->sfmmu_rtteflags
|= tteflag
;
3260 sfmmu_hat_exit(hatlockp
);
3263 * Update the current CPU tsbmiss area, so the current thread
3264 * won't need to take the tsbmiss for the new pagesize.
3265 * The other threads in the process will update their tsb
3266 * miss area lazily in sfmmu_tsbmiss_exception() when they
3267 * fail to find the translation for a newly added pagesize.
3269 if (size
> TTE64K
&& myflt
) {
3270 struct tsbmiss
*tsbmp
;
3272 tsbmp
= &tsbmiss_area
[CPU
->cpu_id
];
3273 if (rid
== SFMMU_INVALID_SHMERID
) {
3274 if (!(tsbmp
->uhat_tteflags
& tteflag
)) {
3275 tsbmp
->uhat_tteflags
|= tteflag
;
3278 if (!(tsbmp
->uhat_rtteflags
& tteflag
)) {
3279 tsbmp
->uhat_rtteflags
|= tteflag
;
3286 if (size
>= TTE4M
&& (flags
& HAT_LOAD_TEXT
) &&
3287 !SFMMU_FLAGS_ISSET(sfmmup
, HAT_4MTEXT_FLAG
)) {
3288 hatlockp
= sfmmu_hat_enter(sfmmup
);
3289 SFMMU_FLAGS_SET(sfmmup
, HAT_4MTEXT_FLAG
);
3290 sfmmu_hat_exit(hatlockp
);
3293 flush_tte
.tte_intlo
= (tteold
.tte_intlo
^ ttep
->tte_intlo
) &
3295 flush_tte
.tte_inthi
= (tteold
.tte_inthi
^ ttep
->tte_inthi
) &
3298 if (remap
&& (flush_tte
.tte_inthi
|| flush_tte
.tte_intlo
)) {
3300 * If remap and new tte differs from old tte we need
3301 * to sync the mod bit and flush TLB/TSB. We don't
3302 * need to sync ref bit because we currently always set
3303 * ref bit in tteload.
3305 ASSERT(TTE_IS_REF(ttep
));
3306 if (TTE_IS_MOD(&tteold
)) {
3307 sfmmu_ttesync(sfmmup
, vaddr
, &tteold
, pp
);
3310 * hwtte bits shouldn't change for SRD hmeblks as long as SRD
3311 * hmes are only used for read only text. Adding this code for
3312 * completeness and future use of shared hmeblks with writable
3313 * mappings of VMODSORT vnodes.
3315 if (hmeblkp
->hblk_shared
) {
3316 cpuset_t cpuset
= sfmmu_rgntlb_demap(vaddr
,
3317 sfmmup
->sfmmu_srdp
->srd_hmergnp
[rid
], hmeblkp
, 1);
3319 SFMMU_STAT_ADD(sf_region_remap_demap
, 1);
3321 sfmmu_tlb_demap(vaddr
, sfmmup
, hmeblkp
, 0, 0);
3322 xt_sync(sfmmup
->sfmmu_cpusran
);
3326 if ((flags
& SFMMU_NO_TSBLOAD
) == 0) {
3328 * We only preload 8K and 4M mappings into the TSB, since
3329 * 64K and 512K mappings are replicated and hence don't
3330 * have a single, unique TSB entry. Ditto for 32M/256M.
3332 if (size
== TTE8K
|| size
== TTE4M
) {
3334 hatlockp
= sfmmu_hat_enter(sfmmup
);
3336 * Don't preload private TSB if the mapping is used
3337 * by the shctx in the SCD.
3339 scdp
= sfmmup
->sfmmu_scdp
;
3340 if (rid
== SFMMU_INVALID_SHMERID
|| scdp
== NULL
||
3341 !SF_RGNMAP_TEST(scdp
->scd_hmeregion_map
, rid
)) {
3342 sfmmu_load_tsb(sfmmup
, vaddr
, &sfhme
->hme_tte
,
3345 sfmmu_hat_exit(hatlockp
);
3351 atomic_inc_16(&hmeblkp
->hblk_hmecnt
);
3352 ASSERT(hmeblkp
->hblk_hmecnt
> 0);
3355 * Cannot ASSERT(hmeblkp->hblk_hmecnt <= NHMENTS)
3356 * see pageunload() for comment.
3359 sfmmu_mlist_exit(pml
);
3365 * Function unlocks hash bucket.
3368 sfmmu_tteload_release_hashbucket(struct hmehash_bucket
*hmebp
)
3370 ASSERT(SFMMU_HASH_LOCK_ISHELD(hmebp
));
3371 SFMMU_HASH_UNLOCK(hmebp
);
/*
 * Function which checks and sets up page array for a large
 * translation.  Will set p_vcolor, p_index, p_ro fields.
 * Assumes addr and pfnum of first page are properly aligned.
 * Will check for physical contiguity. If the check fails it returns
 * nonzero.
 */
3382 sfmmu_pagearray_setup(caddr_t addr
, page_t
**pps
, tte_t
*ttep
, int remap
)
3384 int i
, index
, ttesz
;
3396 ttesz
= TTE_CSZ(ttep
);
3398 ASSERT(ttesz
> TTE8K
);
3400 npgs
= TTEPAGES(ttesz
);
3401 index
= PAGESZ_TO_INDEX(ttesz
);
3403 pfnum
= (*pps
)->p_pagenum
;
3404 ASSERT(IS_P2ALIGNED(pfnum
, npgs
));
3407 * Save the first pp so we can do HAT_TMPNC at the end.
3411 osz
= fnd_mapping_sz(pp1
);
3414 for (i
= 0; i
< npgs
; i
++, pps
++) {
3416 ASSERT(PAGE_LOCKED(pp
));
3417 ASSERT(pp
->p_szc
>= ttesz
);
3418 ASSERT(pp
->p_szc
== pp1
->p_szc
);
3419 ASSERT(sfmmu_mlist_held(pp
));
3422 * XXX is it possible to maintain P_RO on the root only?
3424 if (TTE_IS_WRITABLE(ttep
) && PP_ISRO(pp
)) {
3425 pmtx
= sfmmu_page_enter(pp
);
3427 sfmmu_page_exit(pmtx
);
3428 } else if (!PP_ISMAPPED(pp
) && !TTE_IS_WRITABLE(ttep
) &&
3430 pmtx
= sfmmu_page_enter(pp
);
3431 if (!(PP_ISMOD(pp
))) {
3434 sfmmu_page_exit(pmtx
);
3438 * If this is a remap we skip vac & contiguity checks.
3444 * set p_vcolor and detect any vac conflicts.
3448 vac_err
= sfmmu_vacconflict_array(addr
, pp
, &cflags
);
3454 * Save current index in case we need to undo it.
3455 * Note: "PAGESZ_TO_INDEX(sz) (1 << (sz))"
3456 * "SFMMU_INDEX_SHIFT 6"
3457 * "SFMMU_INDEX_MASK ((1 << SFMMU_INDEX_SHIFT) - 1)"
3458 * "PP_MAPINDEX(p_index) (p_index & SFMMU_INDEX_MASK)"
3460 * So: index = PAGESZ_TO_INDEX(ttesz);
3461 * if ttesz == 1 then index = 0x2
3462 * 2 then index = 0x4
3463 * 3 then index = 0x8
3464 * 4 then index = 0x10
3465 * 5 then index = 0x20
3466 * The code below checks if it's a new pagesize (ie, newidx)
3467 * in case we need to take it back out of p_index,
3468 * and then or's the new index into the existing index.
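		 *
		 * Worked example (illustrative): for a 4M mapping, ttesz == 3,
		 * so index = PAGESZ_TO_INDEX(3) == 0x8.  If the page
		 * previously had only a 64K mapping (p_index == 0x2), then
		 * after this step p_index == (0x2 | 0x8) == 0xa, and
		 * PP_MAPINDEX() reports both mapping sizes.
		 */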
3470 if ((PP_MAPINDEX(pp
) & index
) == 0)
3472 pp
->p_index
= (PP_MAPINDEX(pp
) | index
);
3477 if (pp
->p_pagenum
!= pfnum
) {
3479 * If we fail the contiguity test then
3480 * the only thing we need to fix is the p_index field.
3481 * We might get a few extra flushes but since this
3482 * path is rare that is ok. The p_ro field will
3483 * get automatically fixed on the next tteload to
3484 * the page. NO TNC bit is set yet.
3489 pp
->p_index
= (PP_MAPINDEX(pp
) &
3497 addr
+= MMU_PAGESIZE
;
		/*
		 * There are some smaller mappings that cause vac
		 * conflicts.  Convert all existing small mappings to TNC.
		 */
		SFMMU_STAT_ADD(sf_uncache_conflict, npgs);
3509 sfmmu_page_cache_array(pp1
, HAT_TMPNC
, CACHE_FLUSH
,
		/*
		 * If there exists a big page mapping,
		 * that means the whole existing big page
		 * has TNC setting already.  No need to convert to TNC again.
		 */
		ASSERT(PP_ISTNC(pp1));
3529 * Routine that detects vac consistency for a large page. It also
3530 * sets virtual color for all pp's for this big mapping.
3533 sfmmu_vacconflict_array(caddr_t addr
, page_t
*pp
, int *cflags
)
3537 ASSERT(sfmmu_mlist_held(pp
));
3543 vcolor
= addr_to_vcolor(addr
);
3544 if (PP_NEWPAGE(pp
)) {
3545 PP_SET_VCOLOR(pp
, vcolor
);
3549 ocolor
= PP_GET_VCOLOR(pp
);
3550 if (ocolor
== vcolor
) {
3554 if (!PP_ISMAPPED(pp
) && !PP_ISMAPPED_KPM(pp
)) {
		/*
		 * Previous user of page had a different color
		 * but since there are no current users
		 * we just flush the cache and change the color.
		 * As an optimization for large pages we flush the
		 * entire cache of that color and set a flag.
		 */
3562 SFMMU_STAT(sf_pgcolor_conflict
);
3563 if (!CacheColor_IsFlushed(*cflags
, ocolor
)) {
3564 CacheColor_SetFlushed(*cflags
, ocolor
);
3565 sfmmu_cache_flushcolor(ocolor
, pp
->p_pagenum
);
3567 PP_SET_VCOLOR(pp
, vcolor
);
	/*
	 * We got a real conflict with a current mapping.
	 * Set flags to start uncaching all mappings
	 * and return failure so we restart looping
	 * the pp array from the beginning.
	 */
3582 * creates a large page shadow hmeblk for a tte.
3583 * The purpose of this routine is to allow us to do quick unloads because
3584 * the vm layer can easily pass a very large but sparsely populated range.
3586 static struct hme_blk
*
3587 sfmmu_shadow_hcreate(sfmmu_t
*sfmmup
, caddr_t vaddr
, int ttesz
, uint_t flags
)
3589 struct hmehash_bucket
*hmebp
;
3591 int hmeshift
, size
, vshift
;
3592 uint_t shw_mask
, newshw_mask
;
3593 struct hme_blk
*hmeblkp
;
3595 ASSERT(sfmmup
!= KHATID
);
3596 if (mmu_page_sizes
== max_mmu_page_sizes
) {
3597 ASSERT(ttesz
< TTE256M
);
3599 ASSERT(ttesz
< TTE4M
);
3600 ASSERT(sfmmup
->sfmmu_ttecnt
[TTE32M
] == 0);
3601 ASSERT(sfmmup
->sfmmu_ttecnt
[TTE256M
] == 0);
3604 if (ttesz
== TTE8K
) {
3610 hblktag
.htag_id
= sfmmup
;
3611 hmeshift
= HME_HASH_SHIFT(size
);
3612 hblktag
.htag_bspage
= HME_HASH_BSPAGE(vaddr
, hmeshift
);
3613 hblktag
.htag_rehash
= HME_HASH_REHASH(size
);
3614 hblktag
.htag_rid
= SFMMU_INVALID_SHMERID
;
3615 hmebp
= HME_HASH_FUNCTION(sfmmup
, vaddr
, hmeshift
);
3617 SFMMU_HASH_LOCK(hmebp
);
3619 HME_HASH_FAST_SEARCH(hmebp
, hblktag
, hmeblkp
);
3620 ASSERT(hmeblkp
!= (struct hme_blk
*)hblk_reserve
);
3621 if (hmeblkp
== NULL
) {
3622 hmeblkp
= sfmmu_hblk_alloc(sfmmup
, vaddr
, hmebp
, size
,
3623 hblktag
, flags
, SFMMU_INVALID_SHMERID
);
3626 if (!hmeblkp
->hblk_shw_mask
) {
	/*
	 * If this is an unused hblk it was just allocated or could
	 * potentially be a previous large page hblk so we need to
	 * set the shadow bit.
	 */
3632 ASSERT(!hmeblkp
->hblk_vcnt
&& !hmeblkp
->hblk_hmecnt
);
3633 hmeblkp
->hblk_shw_bit
= 1;
3634 } else if (hmeblkp
->hblk_shw_bit
== 0) {
3635 panic("sfmmu_shadow_hcreate: shw bit not set in hmeblkp 0x%p",
3638 ASSERT(hmeblkp
->hblk_shw_bit
== 1);
3639 ASSERT(!hmeblkp
->hblk_shared
);
3640 vshift
= vaddr_to_vshift(hblktag
, vaddr
, size
);
3643 * Atomically set shw mask bit
3646 shw_mask
= hmeblkp
->hblk_shw_mask
;
3647 newshw_mask
= shw_mask
| (1 << vshift
);
3648 newshw_mask
= atomic_cas_32(&hmeblkp
->hblk_shw_mask
, shw_mask
,
3650 } while (newshw_mask
!= shw_mask
);
3652 SFMMU_HASH_UNLOCK(hmebp
);
/*
 * This routine cleans up a previous shadow hmeblk and changes it to
 * a regular hblk.  This happens rarely but it is possible
 * when a process wants to use large pages and there are hblks still
 * lying around from the previous as that used these hmeblks.
 * The alternative was to clean up the shadow hblks at unload time
 * but since so few user processes actually use large pages, it is
 * better to be lazy and clean up at this time.
 */
3667 sfmmu_shadow_hcleanup(sfmmu_t
*sfmmup
, struct hme_blk
*hmeblkp
,
3668 struct hmehash_bucket
*hmebp
)
3670 caddr_t addr
, endaddr
;
3673 ASSERT(hmeblkp
->hblk_shw_bit
);
3674 ASSERT(!hmeblkp
->hblk_shared
);
3676 ASSERT(SFMMU_HASH_LOCK_ISHELD(hmebp
));
3678 if (!hmeblkp
->hblk_shw_mask
) {
3679 hmeblkp
->hblk_shw_bit
= 0;
3682 addr
= (caddr_t
)get_hblk_base(hmeblkp
);
3683 endaddr
= get_hblk_endaddr(hmeblkp
);
3684 size
= get_hblk_ttesz(hmeblkp
);
3687 SFMMU_HASH_UNLOCK(hmebp
);
3689 sfmmu_free_hblks(sfmmup
, addr
, endaddr
, hashno
);
3691 SFMMU_HASH_LOCK(hmebp
);
3695 sfmmu_free_hblks(sfmmu_t
*sfmmup
, caddr_t addr
, caddr_t endaddr
,
3698 int hmeshift
, shadow
= 0;
3700 struct hmehash_bucket
*hmebp
;
3701 struct hme_blk
*hmeblkp
;
3702 struct hme_blk
*nx_hblk
, *pr_hblk
, *list
= NULL
;
3705 hblktag
.htag_id
= sfmmup
;
3706 hblktag
.htag_rehash
= hashno
;
3707 hblktag
.htag_rid
= SFMMU_INVALID_SHMERID
;
3709 hmeshift
= HME_HASH_SHIFT(hashno
);
3711 while (addr
< endaddr
) {
3712 hblktag
.htag_bspage
= HME_HASH_BSPAGE(addr
, hmeshift
);
3713 hmebp
= HME_HASH_FUNCTION(sfmmup
, addr
, hmeshift
);
3714 SFMMU_HASH_LOCK(hmebp
);
3715 /* inline HME_HASH_SEARCH */
3716 hmeblkp
= hmebp
->hmeblkp
;
3719 if (HTAGS_EQ(hmeblkp
->hblk_tag
, hblktag
)) {
3721 ASSERT(!hmeblkp
->hblk_shared
);
3722 if (hmeblkp
->hblk_shw_bit
) {
3723 if (hmeblkp
->hblk_shw_mask
) {
3725 sfmmu_shadow_hcleanup(sfmmup
,
3729 hmeblkp
->hblk_shw_bit
= 0;
			 * Hblk_hmecnt and hblk_vcnt could be non zero
			 * since hblk_unload() does not guarantee that.
			 *
			 * XXX - this could cause tteload() to spin
			 * where sfmmu_shadow_hcleanup() is called.
3742 nx_hblk
= hmeblkp
->hblk_next
;
3743 if (!hmeblkp
->hblk_vcnt
&& !hmeblkp
->hblk_hmecnt
) {
3744 sfmmu_hblk_hash_rm(hmebp
, hmeblkp
, pr_hblk
,
3752 SFMMU_HASH_UNLOCK(hmebp
);
			 * We found another shadow hblk so cleaned its
			 * children.  We need to go back and clean up
			 * the original hblk so we don't change the
3763 addr
= (caddr_t
)roundup((uintptr_t)addr
+ 1,
3767 sfmmu_hblks_list_purge(&list
, 0);
/*
 * This routine's job is to delete stale invalid shared hmeregion hmeblks that
 * may still linger on after pageunload.
 */
3775 sfmmu_cleanup_rhblk(sf_srd_t
*srdp
, caddr_t addr
, uint_t rid
, int ttesz
)
3779 struct hmehash_bucket
*hmebp
;
3780 struct hme_blk
*hmeblkp
;
3781 struct hme_blk
*pr_hblk
;
3782 struct hme_blk
*list
= NULL
;
3784 ASSERT(SFMMU_IS_SHMERID_VALID(rid
));
3785 ASSERT(rid
< SFMMU_MAX_HME_REGIONS
);
3787 hmeshift
= HME_HASH_SHIFT(ttesz
);
3788 hblktag
.htag_bspage
= HME_HASH_BSPAGE(addr
, hmeshift
);
3789 hblktag
.htag_rehash
= ttesz
;
3790 hblktag
.htag_rid
= rid
;
3791 hblktag
.htag_id
= srdp
;
3792 hmebp
= HME_HASH_FUNCTION(srdp
, addr
, hmeshift
);
3794 SFMMU_HASH_LOCK(hmebp
);
3795 HME_HASH_SEARCH_PREV(hmebp
, hblktag
, hmeblkp
, pr_hblk
, &list
);
3796 if (hmeblkp
!= NULL
) {
3797 ASSERT(hmeblkp
->hblk_shared
);
3798 ASSERT(!hmeblkp
->hblk_shw_bit
);
3799 if (hmeblkp
->hblk_vcnt
|| hmeblkp
->hblk_hmecnt
) {
3800 panic("sfmmu_cleanup_rhblk: valid hmeblk");
3802 ASSERT(!hmeblkp
->hblk_lckcnt
);
3803 sfmmu_hblk_hash_rm(hmebp
, hmeblkp
, pr_hblk
,
3806 SFMMU_HASH_UNLOCK(hmebp
);
3807 sfmmu_hblks_list_purge(&list
, 0);
3812 sfmmu_rgn_cb_noop(caddr_t saddr
, caddr_t eaddr
, caddr_t r_saddr
,
3813 size_t r_size
, void *r_obj
, u_offset_t r_objoff
)
3818 * Searches for an hmeblk which maps addr, then unloads this mapping
3819 * and updates *eaddrp, if the hmeblk is found.
3822 sfmmu_unload_hmeregion_va(sf_srd_t
*srdp
, uint_t rid
, caddr_t addr
,
3823 caddr_t eaddr
, int ttesz
, caddr_t
*eaddrp
)
3827 struct hmehash_bucket
*hmebp
;
3828 struct hme_blk
*hmeblkp
;
3829 struct hme_blk
*pr_hblk
;
3830 struct hme_blk
*list
= NULL
;
3832 ASSERT(SFMMU_IS_SHMERID_VALID(rid
));
3833 ASSERT(rid
< SFMMU_MAX_HME_REGIONS
);
3834 ASSERT(ttesz
>= HBLK_MIN_TTESZ
);
3836 hmeshift
= HME_HASH_SHIFT(ttesz
);
3837 hblktag
.htag_bspage
= HME_HASH_BSPAGE(addr
, hmeshift
);
3838 hblktag
.htag_rehash
= ttesz
;
3839 hblktag
.htag_rid
= rid
;
3840 hblktag
.htag_id
= srdp
;
3841 hmebp
= HME_HASH_FUNCTION(srdp
, addr
, hmeshift
);
3843 SFMMU_HASH_LOCK(hmebp
);
3844 HME_HASH_SEARCH_PREV(hmebp
, hblktag
, hmeblkp
, pr_hblk
, &list
);
3845 if (hmeblkp
!= NULL
) {
3846 ASSERT(hmeblkp
->hblk_shared
);
3847 ASSERT(!hmeblkp
->hblk_lckcnt
);
3848 if (hmeblkp
->hblk_vcnt
|| hmeblkp
->hblk_hmecnt
) {
3849 *eaddrp
= sfmmu_hblk_unload(NULL
, hmeblkp
, addr
,
3850 eaddr
, NULL
, HAT_UNLOAD
);
3851 ASSERT(*eaddrp
> addr
);
3853 ASSERT(!hmeblkp
->hblk_vcnt
&& !hmeblkp
->hblk_hmecnt
);
3854 sfmmu_hblk_hash_rm(hmebp
, hmeblkp
, pr_hblk
,
3857 SFMMU_HASH_UNLOCK(hmebp
);
3858 sfmmu_hblks_list_purge(&list
, 0);
3862 sfmmu_unload_hmeregion(sf_srd_t
*srdp
, sf_region_t
*rgnp
)
3864 int ttesz
= rgnp
->rgn_pgszc
;
3865 size_t rsz
= rgnp
->rgn_size
;
3866 caddr_t rsaddr
= rgnp
->rgn_saddr
;
3867 caddr_t readdr
= rsaddr
+ rsz
;
3870 uint_t rid
= rgnp
->rgn_id
;
3873 hat_rgn_cb_func_t rcbfunc
;
3876 ASSERT(SFMMU_IS_SHMERID_VALID(rid
));
3877 ASSERT(rid
< SFMMU_MAX_HME_REGIONS
);
3879 ASSERT(IS_P2ALIGNED(rsaddr
, TTEBYTES(ttesz
)));
3880 ASSERT(IS_P2ALIGNED(rsz
, TTEBYTES(ttesz
)));
3881 if (ttesz
< HBLK_MIN_TTESZ
) {
3882 ttesz
= HBLK_MIN_TTESZ
;
3883 rhsaddr
= (caddr_t
)P2ALIGN((uintptr_t)rsaddr
, HBLK_MIN_BYTES
);
3888 if ((rcbfunc
= rgnp
->rgn_cb_function
) == NULL
) {
3889 rcbfunc
= sfmmu_rgn_cb_noop
;
3892 while (ttesz
>= HBLK_MIN_TTESZ
) {
3895 if (!(rgnp
->rgn_hmeflags
& (1 << ttesz
))) {
3901 while (va
< readdr
) {
3902 ASSERT(va
>= rhsaddr
);
3903 if (va
!= cbeaddr
) {
3904 if (cbeaddr
!= cbsaddr
) {
3905 ASSERT(cbeaddr
> cbsaddr
);
3906 (*rcbfunc
)(cbsaddr
, cbeaddr
,
3907 rsaddr
, rsz
, rgnp
->rgn_obj
,
3913 sfmmu_unload_hmeregion_va(srdp
, rid
, va
, readdr
,
3916 va
= rhsaddr
+ (cnt
<< TTE_PAGE_SHIFT(ttesz
));
3918 if (cbeaddr
!= cbsaddr
) {
3919 ASSERT(cbeaddr
> cbsaddr
);
3920 (*rcbfunc
)(cbsaddr
, cbeaddr
, rsaddr
,
3929 * Release one hardware address translation lock on the given address range.
3932 hat_unlock(struct hat
*sfmmup
, caddr_t addr
, size_t len
)
3934 struct hmehash_bucket
*hmebp
;
3936 int hmeshift
, hashno
= 1;
3937 struct hme_blk
*hmeblkp
, *list
= NULL
;
3940 ASSERT(sfmmup
!= NULL
);
3942 ASSERT((sfmmup
== ksfmmup
) || AS_LOCK_HELD(sfmmup
->sfmmu_as
));
3943 ASSERT((len
& MMU_PAGEOFFSET
) == 0);
3944 endaddr
= addr
+ len
;
3945 hblktag
.htag_id
= sfmmup
;
3946 hblktag
.htag_rid
= SFMMU_INVALID_SHMERID
;
	/*
	 * Spitfire supports 4 page sizes.
	 * Most pages are expected to be of the smallest page size (8K) and
	 * these will not need to be rehashed. 64K pages also don't need to be
	 * rehashed because an hmeblk spans 64K of address space. 512K pages
	 * might need 1 rehash and 4M pages might need 2 rehashes.
	 */
3955 while (addr
< endaddr
) {
3956 hmeshift
= HME_HASH_SHIFT(hashno
);
3957 hblktag
.htag_bspage
= HME_HASH_BSPAGE(addr
, hmeshift
);
3958 hblktag
.htag_rehash
= hashno
;
3959 hmebp
= HME_HASH_FUNCTION(sfmmup
, addr
, hmeshift
);
3961 SFMMU_HASH_LOCK(hmebp
);
3963 HME_HASH_SEARCH(hmebp
, hblktag
, hmeblkp
, &list
);
3964 if (hmeblkp
!= NULL
) {
3965 ASSERT(!hmeblkp
->hblk_shared
);
			/*
			 * If we encounter a shadow hmeblk then
			 * we know there are no valid hmeblks mapping
			 * this address at this size or larger.
			 * Just increment address by the smallest
			 * page size.
			 */
3973 if (hmeblkp
->hblk_shw_bit
) {
3974 addr
+= MMU_PAGESIZE
;
3976 addr
= sfmmu_hblk_unlock(hmeblkp
, addr
,
3979 SFMMU_HASH_UNLOCK(hmebp
);
3983 SFMMU_HASH_UNLOCK(hmebp
);
3985 if (!HME_REHASH(sfmmup
) || (hashno
>= mmu_hashcnt
)) {
3987 * We have traversed the whole list and rehashed
3988 * if necessary without finding the address to unlock
3989 * which should never happen.
3991 panic("sfmmu_unlock: addr not found. "
3992 "addr %p hat %p", (void *)addr
, (void *)sfmmup
);
3998 sfmmu_hblks_list_purge(&list
, 0);

hat_unlock_region(struct hat *sfmmup, caddr_t addr, size_t len,
    hat_region_cookie_t rcookie)
{
    struct hmehash_bucket *hmebp;
    struct hme_blk *hmeblkp;
    struct hme_blk *pr_hblk;
    struct hme_blk *list;

    if (rcookie == HAT_INVALID_REGION_COOKIE) {
        hat_unlock(sfmmup, addr, len);
        return;
    }

    ASSERT(sfmmup != NULL);
    ASSERT(sfmmup != ksfmmup);

    srdp = sfmmup->sfmmu_srdp;
    rid = (uint_t)((uint64_t)rcookie);
    VERIFY3U(rid, <, SFMMU_MAX_HME_REGIONS);

    rgnp = srdp->srd_hmergnp[rid];
    SFMMU_VALIDATE_HMERID(sfmmup, rid, addr, len);

    ASSERT(IS_P2ALIGNED(addr, TTEBYTES(rgnp->rgn_pgszc)));
    ASSERT(IS_P2ALIGNED(len, TTEBYTES(rgnp->rgn_pgszc)));
    if (rgnp->rgn_pgszc < HBLK_MIN_TTESZ) {
        ttesz = HBLK_MIN_TTESZ;
    } else {
        ttesz = rgnp->rgn_pgszc;
    }

    while (va < eaddr) {
        while (ttesz < rgnp->rgn_pgszc &&
            IS_P2ALIGNED(va, TTEBYTES(ttesz + 1))) {
            ttesz++;
        }
        while (ttesz >= HBLK_MIN_TTESZ) {
            if (!(rgnp->rgn_hmeflags & (1 << ttesz))) {
                ttesz--;
                continue;
            }
            hmeshift = HME_HASH_SHIFT(ttesz);
            hblktag.htag_bspage = HME_HASH_BSPAGE(va, hmeshift);
            hblktag.htag_rehash = ttesz;
            hblktag.htag_rid = rid;
            hblktag.htag_id = srdp;
            hmebp = HME_HASH_FUNCTION(srdp, va, hmeshift);
            SFMMU_HASH_LOCK(hmebp);
            HME_HASH_SEARCH_PREV(hmebp, hblktag, hmeblkp, pr_hblk,
                &list);
            if (hmeblkp == NULL) {
                SFMMU_HASH_UNLOCK(hmebp);
                ttesz--;
                continue;
            }
            ASSERT(hmeblkp->hblk_shared);
            va = sfmmu_hblk_unlock(hmeblkp, va, eaddr);
            ASSERT(va >= eaddr ||
                IS_P2ALIGNED((uintptr_t)va, TTEBYTES(ttesz)));
            SFMMU_HASH_UNLOCK(hmebp);
            break;
        }
        if (ttesz < HBLK_MIN_TTESZ) {
            panic("hat_unlock_region: addr not found "
                "addr %p hat %p", (void *)va, (void *)sfmmup);
        }
    }
    sfmmu_hblks_list_purge(&list, 0);

/*
 * Function to unlock a range of addresses in an hmeblk. It returns the
 * next address that needs to be unlocked.
 * Should be called with the hash lock held.
 */
sfmmu_hblk_unlock(struct hme_blk *hmeblkp, caddr_t addr, caddr_t endaddr)
{
    struct sf_hment *sfhme;
    tte_t tteold, ttemod;

    ASSERT(in_hblk_range(hmeblkp, addr));
    ASSERT(hmeblkp->hblk_shw_bit == 0);

    endaddr = MIN(endaddr, get_hblk_endaddr(hmeblkp));
    ttesz = get_hblk_ttesz(hmeblkp);

    HBLKTOHME(sfhme, hmeblkp, addr);
    while (addr < endaddr) {
        sfmmu_copytte(&sfhme->hme_tte, &tteold);
        if (TTE_IS_VALID(&tteold)) {
            ret = sfmmu_modifytte_try(&tteold, &ttemod,
                &sfhme->hme_tte);

            if (hmeblkp->hblk_lckcnt == 0)
                panic("zero hblk lckcnt");

            if (((uintptr_t)addr + TTEBYTES(ttesz)) >
                (uintptr_t)endaddr)
                panic("can't unlock large tte");

            ASSERT(hmeblkp->hblk_lckcnt > 0);
            atomic_dec_32(&hmeblkp->hblk_lckcnt);
            HBLK_STACK_TRACE(hmeblkp, HBLK_UNLOCK);
        } else {
            panic("sfmmu_hblk_unlock: invalid tte");
        }
        addr += TTEBYTES(ttesz);
    }

/*
 * Physical Address Mapping Framework
 *
 * (1) Applies only to seg_kmem memory pages. To make things easier,
 *     seg_kpm addresses are also accepted by the routines, but nothing
 *     is done with them since by definition their PA mappings are static.
 * (2) hat_add_callback() may only be called while holding the page lock
 *     SE_SHARED or SE_EXCL of the underlying page (e.g., as_pagelock()),
 *     or passing the HAC_PAGELOCK flag.
 * (3) prehandler() and posthandler() may not call hat_add_callback() or
 *     hat_delete_callback(), nor should they allocate memory. Post quiesce
 *     callbacks may not sleep or acquire adaptive mutex locks.
 * (4) Either prehandler() or posthandler() (but not both) may be specified
 *     as being NULL. Specifying an errhandler() is optional.
 *
 * Details of using the framework:
 *
 * registering a callback (hat_register_callback())
 *
 *	Pass prehandler, posthandler, errhandler addresses
 *	as described below. If the capture_cpus argument is nonzero,
 *	the suspend callback to the prehandler will occur with CPUs
 *	captured and executing xc_loop(), and CPUs will remain
 *	captured until after the posthandler suspend callback.
 *
 * adding a callback (hat_add_callback())
 *
 *	hat_add_callback();
 *	save returned pfn in private data structures or program registers;
 *
 * prehandler()
 *
 *	Stop all accesses by physical address to this memory page.
 *	Called twice: the first, PRESUSPEND, is a context safe to acquire
 *	adaptive locks. The second, SUSPEND, is called at high PIL with
 *	CPUs captured so adaptive locks may NOT be acquired (and all spin
 *	locks must be XCALL_PIL or higher locks).
 *
 *	May return the following errors:
 *		EIO: A fatal error has occurred. This will result in panic.
 *		EAGAIN: The page cannot be suspended. This will fail the
 *			suspend operation.
 *
 * posthandler()
 *
 *	Save new pfn in private data structures or program registers;
 *	not allowed to fail (non-zero return values will result in panic).
 *
 * errhandler()
 *
 *	Called when an error occurs related to the callback. Currently
 *	the only such error is HAT_CB_ERR_LEAKED which indicates that
 *	a page is being freed, but there are still outstanding callback(s)
 *	registered on the page.
 *
 * removing a callback (hat_delete_callback(); e.g., prior to freeing memory)
 *
 *	stop using physical address
 *	hat_delete_callback();
 */

/*
 * Register a callback class. Each subsystem should do this once and
 * cache the id_t returned for use in setting up and tearing down callbacks.
 *
 * There is no facility for removing callback IDs once they are created;
 * the "key" should be unique for each module, so in case a module is unloaded
 * and subsequently re-loaded, we can recycle the module's previous entry.
 */
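
/*
 * Illustrative sketch (not part of the original file) of the registration /
 * add / delete sequence described above.  The my_* names and handler bodies
 * are hypothetical; only hat_register_callback(), hat_add_callback() and
 * hat_delete_callback() are interfaces defined in this file, and the exact
 * argument types are as declared below.
 *
 *	static id_t my_cb_id;
 *
 *	void
 *	my_subsystem_init(void)
 *	{
 *		// 'MY_CB_KEY' is a hypothetical unique key for this module.
 *		my_cb_id = hat_register_callback(MY_CB_KEY,
 *		    my_prehandler, my_posthandler, NULL, 0);
 *	}
 *
 *	int
 *	my_map(caddr_t vaddr, uint_t len, void *pvt, pfn_t *pfnp,
 *	    void **cookiep)
 *	{
 *		// Page must be SE_SHARED/SE_EXCL locked, or HAC_PAGELOCK set.
 *		return (hat_add_callback(my_cb_id, vaddr, len,
 *		    HAC_SLEEP | HAC_PAGELOCK, pvt, pfnp, cookiep));
 *	}
 *
 *	void
 *	my_unmap(caddr_t vaddr, uint_t len, void *pvt, void *cookie)
 *	{
 *		// Stop all physical-address accesses before deleting.
 *		hat_delete_callback(vaddr, len, pvt, HAC_PAGELOCK, cookie);
 *	}
 */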
hat_register_callback(int key,
    int (*prehandler)(caddr_t, uint_t, uint_t, void *),
    int (*posthandler)(caddr_t, uint_t, uint_t, void *, pfn_t),
    int (*errhandler)(caddr_t, uint_t, uint_t, void *),
    int capture_cpus)
{
    /*
     * Search the table for a pre-existing callback associated with
     * the identifier "key". If one exists, we re-use that entry in
     * the table for this instance, otherwise we assign the next
     * available table slot.
     */
    for (id = 0; id < sfmmu_max_cb_id; id++) {
        if (sfmmu_cb_table[id].key == key)
            break;
    }

    if (id == sfmmu_max_cb_id) {
        id = sfmmu_cb_nextid++;
        if (id >= sfmmu_max_cb_id)
            panic("hat_register_callback: out of callback IDs");
    }

    ASSERT(prehandler != NULL || posthandler != NULL);

    sfmmu_cb_table[id].key = key;
    sfmmu_cb_table[id].prehandler = prehandler;
    sfmmu_cb_table[id].posthandler = posthandler;
    sfmmu_cb_table[id].errhandler = errhandler;
    sfmmu_cb_table[id].capture_cpus = capture_cpus;

#define	HAC_COOKIE_NONE	(void *)-1

/*
 * Add relocation callbacks to the specified addr/len which will be called
 * when relocating the associated page. See the description of pre and
 * posthandler above for more details.
 *
 * If HAC_PAGELOCK is included in flags, the underlying memory page is
 * locked internally so the caller must be able to deal with the callback
 * running even before this function has returned. If HAC_PAGELOCK is not
 * set, it is assumed that the underlying memory pages are locked.
 *
 * Since the caller must track the individual page boundaries anyway,
 * we only allow a callback to be added to a single page (large
 * or small). Thus [addr, addr + len) MUST be contained within a single
 * page.
 *
 * Registering multiple callbacks on the same [addr, addr+len) is supported,
 * _provided_that_ a unique parameter is specified for each callback.
 * If multiple callbacks are registered on the same range the callback will
 * be invoked with each unique parameter. Registering the same callback with
 * the same argument more than once will result in corrupted kernel state.
 *
 * Returns the pfn of the underlying kernel page in *rpfn
 * on success, or PFN_INVALID on failure.
 *
 * cookiep (if passed) provides storage space for an opaque cookie
 * to return later to hat_delete_callback(). This cookie makes the callback
 * deletion significantly quicker by avoiding a potentially lengthy hash
 * search.
 *
 * Returns:
 *	ENOMEM:	memory allocation failure (e.g. flags was passed as HAC_NOSLEEP)
 *	EINVAL:	callback ID is not valid
 *	ENXIO:	["vaddr", "vaddr" + len) is not mapped in the kernel's address
 *		space
 *	ERANGE:	["vaddr", "vaddr" + len) crosses a page boundary
 */
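
/*
 * Minimal illustrative call (hypothetical caller state: my_cb_id, pvt and
 * vaddr are assumed to exist), showing how the documented return values are
 * typically handled.  Errno-style values come back directly and *rpfn is
 * PFN_INVALID on every failure path:
 *
 *	pfn_t pfn;
 *	void *cookie;
 *	int err;
 *
 *	err = hat_add_callback(my_cb_id, vaddr, MMU_PAGESIZE, HAC_SLEEP,
 *	    pvt, &pfn, &cookie);
 *	if (err != 0) {
 *		ASSERT(pfn == PFN_INVALID);
 *		return (err);		// ENOMEM, EINVAL, ENXIO or ERANGE
 *	}
 *	// use pfn for physical-address accesses; keep cookie so the later
 *	// hat_delete_callback() can skip the hash search.
 */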
hat_add_callback(id_t callback_id, caddr_t vaddr, uint_t len, uint_t flags,
    void *pvt, pfn_t *rpfn, void **cookiep)
{
    struct hmehash_bucket *hmebp;
    struct hme_blk *hmeblkp;
    int hmeshift, hashno;
    caddr_t saddr, eaddr, baseaddr;
    struct pa_hment *pahmep;
    struct sf_hment *sfhmep, *osfhmep;
    int kmflags = (flags & HAC_SLEEP)? KM_SLEEP : KM_NOSLEEP;

    /*
     * For KPM mappings, just return the physical address since we
     * don't need to register any callbacks.
     */
    if (IS_KPM_ADDR(vaddr)) {
        SFMMU_KPM_VTOP(vaddr, paddr);
        *rpfn = btop(paddr);
        if (cookiep != NULL)
            *cookiep = HAC_COOKIE_NONE;
    }

    if (callback_id < (id_t)0 || callback_id >= sfmmu_cb_nextid) {
        *rpfn = PFN_INVALID;
        return (EINVAL);
    }

    if ((pahmep = kmem_cache_alloc(pa_hment_cache, kmflags)) == NULL) {
        *rpfn = PFN_INVALID;
        return (ENOMEM);
    }

    sfhmep = &pahmep->sfment;

    saddr = (caddr_t)((uintptr_t)vaddr & MMU_PAGEMASK);
    eaddr = saddr + len;

    /* Find the mapping(s) for this page */
    for (hashno = TTE64K, hmeblkp = NULL;
        hmeblkp == NULL && hashno <= mmu_hashcnt;
        hashno++) {
        hmeshift = HME_HASH_SHIFT(hashno);
        hblktag.htag_id = ksfmmup;
        hblktag.htag_rid = SFMMU_INVALID_SHMERID;
        hblktag.htag_bspage = HME_HASH_BSPAGE(saddr, hmeshift);
        hblktag.htag_rehash = hashno;
        hmebp = HME_HASH_FUNCTION(ksfmmup, saddr, hmeshift);

        SFMMU_HASH_LOCK(hmebp);

        HME_HASH_FAST_SEARCH(hmebp, hblktag, hmeblkp);

        if (hmeblkp == NULL)
            SFMMU_HASH_UNLOCK(hmebp);
    }

    if (hmeblkp == NULL) {
        kmem_cache_free(pa_hment_cache, pahmep);
        *rpfn = PFN_INVALID;
        return (ENXIO);
    }

    ASSERT(!hmeblkp->hblk_shared);

    HBLKTOHME(osfhmep, hmeblkp, saddr);
    sfmmu_copytte(&osfhmep->hme_tte, &tte);

    if (!TTE_IS_VALID(&tte)) {
        SFMMU_HASH_UNLOCK(hmebp);
        kmem_cache_free(pa_hment_cache, pahmep);
        *rpfn = PFN_INVALID;
        return (ENXIO);
    }

    /*
     * Make sure the boundaries for the callback fall within this
     * single mapping.
     */
    baseaddr = (caddr_t)get_hblk_base(hmeblkp);
    ASSERT(saddr >= baseaddr);
    if (eaddr > saddr + TTEBYTES(TTE_CSZ(&tte))) {
        SFMMU_HASH_UNLOCK(hmebp);
        kmem_cache_free(pa_hment_cache, pahmep);
        *rpfn = PFN_INVALID;
        return (ERANGE);
    }

    pfn = sfmmu_ttetopfn(&tte, vaddr);

    /*
     * The pfn may not have a page_t underneath in which case we
     * just return it. This can happen if we are doing I/O to a
     * static portion of the kernel's address space, for instance.
     */
    pp = osfhmep->hme_page;
    if (pp == NULL) {
        SFMMU_HASH_UNLOCK(hmebp);
        kmem_cache_free(pa_hment_cache, pahmep);
        *cookiep = HAC_COOKIE_NONE;
    }
    ASSERT(pp == PP_PAGEROOT(pp));

    pml = sfmmu_mlist_enter(pp);

    if (flags & HAC_PAGELOCK) {
        if (!page_trylock(pp, SE_SHARED)) {
            /*
             * Somebody is holding SE_EXCL lock. Might
             * even be hat_page_relocate(). Drop all
             * our locks, lookup the page in &kvp, and
             * retry. If it doesn't exist in &kvp and &zvp,
             * then we must be dealing with a kernel mapped
             * page which doesn't actually belong to
             * segkmem so we punt.
             */
            sfmmu_mlist_exit(pml);
            SFMMU_HASH_UNLOCK(hmebp);
            pp = page_lookup(&kvp, (u_offset_t)saddr, SE_SHARED);

            /* check zvp before giving up */
            if (pp == NULL)
                pp = page_lookup(&zvp, (u_offset_t)saddr,
                    SE_SHARED);

            /* Okay, we didn't find it, give up */
            if (pp == NULL) {
                kmem_cache_free(pa_hment_cache, pahmep);
                *cookiep = HAC_COOKIE_NONE;
            }
        }
    }

    if (!PAGE_LOCKED(pp) && !panicstr)
        panic("hat_add_callback: page 0x%p not locked", (void *)pp);

    if (osfhmep->hme_page != pp || pp->p_vnode != vp ||
        pp->p_offset != off) {
        /*
         * The page moved before we got our hands on it. Drop
         * all the locks and try again.
         */
        ASSERT((flags & HAC_PAGELOCK) != 0);
        sfmmu_mlist_exit(pml);
        SFMMU_HASH_UNLOCK(hmebp);
    }

    if (!VN_ISKAS(vp)) {
        /*
         * This is not a segkmem page but another page which
         * has been kernel mapped. It had better have at least
         * a share lock on it. Return the pfn.
         */
        sfmmu_mlist_exit(pml);
        SFMMU_HASH_UNLOCK(hmebp);
        kmem_cache_free(pa_hment_cache, pahmep);
        ASSERT(PAGE_LOCKED(pp));
        *cookiep = HAC_COOKIE_NONE;
    }

    /*
     * Setup this pa_hment and link its embedded dummy sf_hment into
     * the mapping list.
     */
    pahmep->cb_id = callback_id;
    pahmep->addr = vaddr;

    sfhmep->hme_tte.ll = 0;
    sfhmep->hme_data = pahmep;
    sfhmep->hme_prev = osfhmep;
    sfhmep->hme_next = osfhmep->hme_next;

    if (osfhmep->hme_next)
        osfhmep->hme_next->hme_prev = sfhmep;

    osfhmep->hme_next = sfhmep;

    sfmmu_mlist_exit(pml);
    SFMMU_HASH_UNLOCK(hmebp);

    *cookiep = (void *)pahmep;

/*
 * Remove the relocation callbacks from the specified addr/len.
 */
hat_delete_callback(caddr_t vaddr, uint_t len, void *pvt, uint_t flags,
    void *cookie)
{
    struct hmehash_bucket *hmebp;
    struct hme_blk *hmeblkp;
    int hmeshift, hashno;
    struct pa_hment *pahmep;
    struct sf_hment *sfhmep, *osfhmep;

    /*
     * If the cookie is HAC_COOKIE_NONE then there is no pa_hment to
     * remove so just return.
     */
    if (cookie == HAC_COOKIE_NONE || IS_KPM_ADDR(vaddr))
        return;

    saddr = (caddr_t)((uintptr_t)vaddr & MMU_PAGEMASK);

    /* Find the mapping(s) for this page */
    for (hashno = TTE64K, hmeblkp = NULL;
        hmeblkp == NULL && hashno <= mmu_hashcnt;
        hashno++) {
        hmeshift = HME_HASH_SHIFT(hashno);
        hblktag.htag_id = ksfmmup;
        hblktag.htag_rid = SFMMU_INVALID_SHMERID;
        hblktag.htag_bspage = HME_HASH_BSPAGE(saddr, hmeshift);
        hblktag.htag_rehash = hashno;
        hmebp = HME_HASH_FUNCTION(ksfmmup, saddr, hmeshift);

        SFMMU_HASH_LOCK(hmebp);

        HME_HASH_FAST_SEARCH(hmebp, hblktag, hmeblkp);

        if (hmeblkp == NULL)
            SFMMU_HASH_UNLOCK(hmebp);
    }

    if (hmeblkp == NULL)
        return;

    ASSERT(!hmeblkp->hblk_shared);

    HBLKTOHME(osfhmep, hmeblkp, saddr);

    sfmmu_copytte(&osfhmep->hme_tte, &tte);
    if (!TTE_IS_VALID(&tte)) {
        SFMMU_HASH_UNLOCK(hmebp);
    }

    pp = osfhmep->hme_page;
    if (pp == NULL) {
        SFMMU_HASH_UNLOCK(hmebp);
        ASSERT(cookie == NULL);
    }

    pml = sfmmu_mlist_enter(pp);

    if (flags & HAC_PAGELOCK) {
        if (!page_trylock(pp, SE_SHARED)) {
            /*
             * Somebody is holding SE_EXCL lock. Might
             * even be hat_page_relocate(). Drop all
             * our locks, lookup the page in &kvp, and
             * retry. If it doesn't exist in &kvp and &zvp,
             * then we must be dealing with a kernel mapped
             * page which doesn't actually belong to
             * segkmem so we punt.
             */
            sfmmu_mlist_exit(pml);
            SFMMU_HASH_UNLOCK(hmebp);
            pp = page_lookup(&kvp, (u_offset_t)saddr, SE_SHARED);
            /* check zvp before giving up */
            if (pp == NULL)
                pp = page_lookup(&zvp, (u_offset_t)saddr,
                    SE_SHARED);
            if (pp == NULL) {
                ASSERT(cookie == NULL);
                return;
            }
        }
    }

    ASSERT(PAGE_LOCKED(pp));

    if (osfhmep->hme_page != pp || pp->p_vnode != vp ||
        pp->p_offset != off) {
        /*
         * The page moved before we got our hands on it. Drop
         * all the locks and try again.
         */
        ASSERT((flags & HAC_PAGELOCK) != 0);
        sfmmu_mlist_exit(pml);
        SFMMU_HASH_UNLOCK(hmebp);
    }

    if (!VN_ISKAS(vp)) {
        /*
         * This is not a segkmem page but another page which
         * has been kernel mapped.
         */
        sfmmu_mlist_exit(pml);
        SFMMU_HASH_UNLOCK(hmebp);
        ASSERT(cookie == NULL);
    }

    if (cookie != NULL) {
        pahmep = (struct pa_hment *)cookie;
        sfhmep = &pahmep->sfment;
    } else {
        for (sfhmep = pp->p_mapping; sfhmep != NULL;
            sfhmep = sfhmep->hme_next) {

            /*
             * skip va<->pa mappings
             */
            if (!IS_PAHME(sfhmep))
                continue;

            pahmep = sfhmep->hme_data;
            ASSERT(pahmep != NULL);

            /*
             * if pa_hment matches, remove it
             */
            if ((pahmep->pvt == pvt) &&
                (pahmep->addr == vaddr) &&
                (pahmep->len == len)) {
                break;
            }
        }
    }

    if (sfhmep == NULL) {
        panic("hat_delete_callback: pa_hment not found, pp %p",
            (void *)pp);
    }

    /*
     * Note: at this point a valid kernel mapping must still be
     * present on this page.
     */
    if (pp->p_share <= 0)
        panic("hat_delete_callback: zero p_share");

    if (--pahmep->refcnt == 0) {
        if (pahmep->flags != 0)
            panic("hat_delete_callback: pa_hment is busy");

        /*
         * Remove sfhmep from the mapping list for the page.
         */
        if (sfhmep->hme_prev) {
            sfhmep->hme_prev->hme_next = sfhmep->hme_next;
        } else {
            pp->p_mapping = sfhmep->hme_next;
        }

        if (sfhmep->hme_next)
            sfhmep->hme_next->hme_prev = sfhmep->hme_prev;

        sfmmu_mlist_exit(pml);
        SFMMU_HASH_UNLOCK(hmebp);

        kmem_cache_free(pa_hment_cache, pahmep);
        return;
    }

    sfmmu_mlist_exit(pml);
    SFMMU_HASH_UNLOCK(hmebp);

/*
 * hat_probe returns 1 if the translation for the address 'addr' is
 * loaded, zero otherwise.
 *
 * hat_probe should be used only for advisory purposes because it may
 * occasionally return the wrong value. The implementation must guarantee that
 * returning the wrong value is a very rare event. hat_probe is used
 * to implement optimizations in the segment drivers.
 */
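
/*
 * Illustrative (hypothetical) segment-driver use of hat_probe().  Because the
 * result is advisory, a stale answer merely costs, or saves, one extra fault;
 * my_faulta() stands in for whatever fault-ahead routine the driver uses and
 * is not a real interface.
 *
 *	caddr_t a;
 *
 *	for (a = addr; a < addr + len; a += MMU_PAGESIZE) {
 *		if (hat_probe(as->a_hat, a))
 *			continue;	// translation already loaded
 *		(void) my_faulta(seg, a);
 *	}
 */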
hat_probe(struct hat *sfmmup, caddr_t addr)
{
    ASSERT(sfmmup != NULL);

    ASSERT((sfmmup == ksfmmup) || AS_LOCK_HELD(sfmmup->sfmmu_as));

    if (sfmmup == ksfmmup) {
        while ((pfn = sfmmu_vatopfn(addr, sfmmup, &tte))
            == PFN_SUSPENDED) {
            sfmmu_vatopfn_suspended(addr, sfmmup, &tte);
        }
    } else {
        pfn = sfmmu_uvatopfn(addr, sfmmup, NULL);
    }

    if (pfn != PFN_INVALID)
        return (1);
    else
        return (0);

hat_getpagesize(struct hat *sfmmup, caddr_t addr)
{
    if (sfmmup == ksfmmup) {
        if (sfmmu_vatopfn(addr, sfmmup, &tte) == PFN_INVALID) {
            return (-1);
        }
    } else {
        if (sfmmu_uvatopfn(addr, sfmmup, &tte) == PFN_INVALID) {
            return (-1);
        }
    }

    ASSERT(TTE_IS_VALID(&tte));
    return (TTEBYTES(TTE_CSZ(&tte)));

hat_getattr(struct hat *sfmmup, caddr_t addr, uint_t *attr)
{
    if (sfmmup == ksfmmup) {
        if (sfmmu_vatopfn(addr, sfmmup, &tte) == PFN_INVALID) {
            return ((uint_t)0xffffffff);
        }
    } else {
        if (sfmmu_uvatopfn(addr, sfmmup, &tte) == PFN_INVALID) {
            return ((uint_t)0xffffffff);
        }
    }

    if (TTE_IS_VALID(&tte)) {
        *attr = sfmmu_ptov_attr(&tte);
        return (0);
    }
    return ((uint_t)0xffffffff);

/*
 * Enables more attributes on specified address range (i.e. logical OR)
 */
hat_setattr(struct hat *hat, caddr_t addr, size_t len, uint_t attr)
{
    ASSERT(hat->sfmmu_as != NULL);

    sfmmu_chgattr(hat, addr, len, attr, SFMMU_SETATTR);
}

/*
 * Assigns attributes to the specified address range. All the attributes
 * are specified.
 */
hat_chgattr(struct hat *hat, caddr_t addr, size_t len, uint_t attr)
{
    ASSERT(hat->sfmmu_as != NULL);

    sfmmu_chgattr(hat, addr, len, attr, SFMMU_CHGATTR);
}

/*
 * Remove attributes on the specified address range (i.e. logical NAND)
 */
hat_clrattr(struct hat *hat, caddr_t addr, size_t len, uint_t attr)
{
    ASSERT(hat->sfmmu_as != NULL);

    sfmmu_chgattr(hat, addr, len, attr, SFMMU_CLRATTR);
}

/*
 * Change attributes on an address range to that specified by attr and mode.
 */
sfmmu_chgattr(struct hat *sfmmup, caddr_t addr, size_t len, uint_t attr,
    int mode)
{
    struct hmehash_bucket *hmebp;
    int hmeshift, hashno = 1;
    struct hme_blk *hmeblkp, *list = NULL;

    CPUSET_ZERO(cpuset);

    ASSERT((sfmmup == ksfmmup) || AS_LOCK_HELD(sfmmup->sfmmu_as));
    ASSERT((len & MMU_PAGEOFFSET) == 0);
    ASSERT(((uintptr_t)addr & MMU_PAGEOFFSET) == 0);

    if ((attr & PROT_USER) && (mode != SFMMU_CLRATTR) &&
        ((addr + len) > (caddr_t)USERLIMIT)) {
        panic("user addr %p in kernel space",
            (void *)addr);
    }

    endaddr = addr + len;
    hblktag.htag_id = sfmmup;
    hblktag.htag_rid = SFMMU_INVALID_SHMERID;
    DEMAP_RANGE_INIT(sfmmup, &dmr);

    while (addr < endaddr) {
        hmeshift = HME_HASH_SHIFT(hashno);
        hblktag.htag_bspage = HME_HASH_BSPAGE(addr, hmeshift);
        hblktag.htag_rehash = hashno;
        hmebp = HME_HASH_FUNCTION(sfmmup, addr, hmeshift);

        SFMMU_HASH_LOCK(hmebp);

        HME_HASH_SEARCH(hmebp, hblktag, hmeblkp, &list);
        if (hmeblkp != NULL) {
            ASSERT(!hmeblkp->hblk_shared);
            /*
             * We've encountered a shadow hmeblk so skip the range
             * of the next smaller mapping size.
             */
            if (hmeblkp->hblk_shw_bit) {
                ASSERT(sfmmup != ksfmmup);
                addr = (caddr_t)P2END((uintptr_t)addr,
                    TTEBYTES(hashno - 1));
            } else {
                addr = sfmmu_hblk_chgattr(sfmmup,
                    hmeblkp, addr, endaddr, &dmr, attr, mode);
            }
            SFMMU_HASH_UNLOCK(hmebp);
        }
        SFMMU_HASH_UNLOCK(hmebp);

        if (!HME_REHASH(sfmmup) || (hashno >= mmu_hashcnt)) {
            /*
             * We have traversed the whole list and rehashed
             * if necessary without finding the address to chgattr.
             * This is ok, so we increment the address by the
             * smallest hmeblk range for kernel mappings or for
             * user mappings with no large pages, and the largest
             * hmeblk range, to account for shadow hmeblks, for
             * user mappings with large pages and continue.
             */
            if (sfmmup == ksfmmup)
                addr = (caddr_t)P2END((uintptr_t)addr,
                    TTEBYTES(1));
            else
                addr = (caddr_t)P2END((uintptr_t)addr,
                    TTEBYTES(hashno));
        }
    }

    sfmmu_hblks_list_purge(&list, 0);
    DEMAP_RANGE_FLUSH(&dmr);
    cpuset = sfmmup->sfmmu_cpusran;

/*
 * This function changes attributes on a range of addresses in an hmeblk. It
 * returns the next address that needs to be changed.
 * It should be called with the hash lock held.
 * XXX It should be possible to optimize chgattr by not flushing every time but
 * on the other hand:
 * 1. do one flush crosscall.
 * 2. only flush if we are increasing permissions (make sure this will work)
 */
sfmmu_hblk_chgattr(struct hat *sfmmup, struct hme_blk *hmeblkp, caddr_t addr,
    caddr_t endaddr, demap_range_t *dmrp, uint_t attr, int mode)
{
    tte_t tte, tteattr, tteflags, ttemod;
    struct sf_hment *sfhmep;
    struct page *pp = NULL;
    kmutex_t *pml, *pmtx;
    int use_demap_range;
#if defined(SF_ERRATA_57)
    int check_exec;
#endif

    ASSERT(in_hblk_range(hmeblkp, addr));
    ASSERT(hmeblkp->hblk_shw_bit == 0);
    ASSERT(!hmeblkp->hblk_shared);

    endaddr = MIN(endaddr, get_hblk_endaddr(hmeblkp));
    ttesz = get_hblk_ttesz(hmeblkp);

    /*
     * Flush the current demap region if addresses have been
     * skipped or the page size doesn't match.
     */
    use_demap_range = (TTEBYTES(ttesz) == DEMAP_RANGE_PGSZ(dmrp));
    if (use_demap_range) {
        DEMAP_RANGE_CONTINUE(dmrp, addr, endaddr);
    } else if (dmrp != NULL) {
        DEMAP_RANGE_FLUSH(dmrp);
    }

    tteattr.ll = sfmmu_vtop_attr(attr, mode, &tteflags);
#if defined(SF_ERRATA_57)
    check_exec = (sfmmup != ksfmmup) &&
        AS_TYPE_64BIT(sfmmup->sfmmu_as) &&
        TTE_IS_EXECUTABLE(&tteattr);
#endif
    HBLKTOHME(sfhmep, hmeblkp, addr);
    while (addr < endaddr) {
        sfmmu_copytte(&sfhmep->hme_tte, &tte);
        if (TTE_IS_VALID(&tte)) {
            if ((tte.ll & tteflags.ll) == tteattr.ll) {
                /*
                 * if the new attr is the same as old
                 * attr, there is nothing to do.
                 */
            }
            if (!TTE_IS_WRITABLE(&tteattr)) {
                /*
                 * make sure we clear hw modify bit if we are
                 * removing write protections
                 */
                tteflags.tte_intlo |= TTE_HWWR_INT;
            }

            pp = sfhmep->hme_page;
            if (pp) {
                pml = sfmmu_mlist_enter(pp);

                if (pp != sfhmep->hme_page) {
                    /*
                     * tte must have been unloaded.
                     */
                    sfmmu_mlist_exit(pml);
                }
            }

            ASSERT(pp == NULL || sfmmu_mlist_held(pp));

            ttemod.ll = (ttemod.ll & ~tteflags.ll) | tteattr.ll;
            ASSERT(TTE_TO_TTEPFN(&ttemod) == TTE_TO_TTEPFN(&tte));

#if defined(SF_ERRATA_57)
            if (check_exec && addr < errata57_limit)
                ttemod.tte_exec_perm = 0;
#endif
            ret = sfmmu_modifytte_try(&tte, &ttemod,
                &sfhmep->hme_tte);

            if (ret < 0) {
                /* tte changed underneath us */
                if (pml)
                    sfmmu_mlist_exit(pml);
            }

            if (tteflags.tte_intlo & TTE_HWWR_INT) {
                /*
                 * need to sync if we are clearing modify bit.
                 */
                sfmmu_ttesync(sfmmup, addr, &tte, pp);
            }

            if (pp && PP_ISRO(pp)) {
                if (tteattr.tte_intlo & TTE_WRPRM_INT) {
                    pmtx = sfmmu_page_enter(pp);
                    sfmmu_page_exit(pmtx);
                }
            }

            if (ret > 0 && use_demap_range) {
                DEMAP_RANGE_MARKPG(dmrp, addr);
            } else if (ret > 0) {
                sfmmu_tlb_demap(addr, sfmmup, hmeblkp, 0, 0);
            }

            if (pml)
                sfmmu_mlist_exit(pml);
        }
        addr += TTEBYTES(ttesz);
        DEMAP_RANGE_NEXTPG(dmrp);
    }

/*
 * This routine converts virtual attributes to physical ones. It will
 * update the tteflags field with the tte mask corresponding to the attributes
 * affected and it returns the new attributes. It will also clear the modify
 * bit if we are taking away write permission. This is necessary since the
 * modify bit is the hardware permission bit and we need to clear it in order
 * to detect write faults.
 */
sfmmu_vtop_attr(uint_t attr, int mode, tte_t *ttemaskp)
{
    switch (mode) {
    case SFMMU_CHGATTR:
        ASSERT(!(attr & ~SFMMU_LOAD_ALLATTR));
        /* all attributes specified */
        ttevalue.tte_inthi = MAKE_TTEATTR_INTHI(attr);
        ttevalue.tte_intlo = MAKE_TTEATTR_INTLO(attr);
        ttemaskp->tte_inthi = TTEINTHI_ATTR;
        ttemaskp->tte_intlo = TTEINTLO_ATTR;
        break;
    case SFMMU_SETATTR:
        ASSERT(!(attr & ~HAT_PROT_MASK));
        /*
         * a valid tte implies exec and read for sfmmu
         * so no need to do anything about them.
         * since privileged access implies user access
         * PROT_USER doesn't make sense either.
         */
        if (attr & PROT_WRITE) {
            ttemaskp->tte_intlo |= TTE_WRPRM_INT;
            ttevalue.tte_intlo |= TTE_WRPRM_INT;
        }
        break;
    case SFMMU_CLRATTR:
        /* attributes will be nand with current ones */
        if (attr & ~(PROT_WRITE | PROT_USER)) {
            panic("sfmmu: attr %x not supported", attr);
        }
        if (attr & PROT_WRITE) {
            /* clear both writable and modify bit */
            ttemaskp->tte_intlo |= TTE_WRPRM_INT | TTE_HWWR_INT;
        }
        if (attr & PROT_USER) {
            ttemaskp->tte_intlo |= TTE_PRIV_INT;
            ttevalue.tte_intlo |= TTE_PRIV_INT;
        }
        break;
    default:
        panic("sfmmu_vtop_attr: bad mode %x", mode);
    }
    ASSERT(TTE_TO_TTEPFN(&ttevalue) == 0);
    return (ttevalue.ll);

sfmmu_ptov_attr(tte_t *ttep)
{
    ASSERT(TTE_IS_VALID(ttep));

    if (TTE_IS_WRITABLE(ttep)) {
        attr |= PROT_WRITE;
    }
    if (TTE_IS_EXECUTABLE(ttep)) {
        attr |= PROT_EXEC;
    }
    if (!TTE_IS_PRIVILEGED(ttep)) {
        attr |= PROT_USER;
    }
    if (TTE_IS_NFO(ttep)) {
        attr |= HAT_NOFAULT;
    }
    if (TTE_IS_NOSYNC(ttep)) {
        attr |= HAT_NOSYNC;
    }
    if (TTE_IS_SIDEFFECT(ttep)) {
        attr |= SFMMU_SIDEFFECT;
    }
    if (!TTE_IS_VCACHEABLE(ttep)) {
        attr |= SFMMU_UNCACHEVTTE;
    }
    if (!TTE_IS_PCACHEABLE(ttep)) {
        attr |= SFMMU_UNCACHEPTTE;
    }
    return (attr);

/*
 * hat_chgprot is a deprecated hat call. New segment drivers
 * should store all attributes and use hat_*attr calls.
 *
 * Change the protections in the virtual address range
 * given to the specified virtual protection. If vprot is ~PROT_WRITE,
 * then remove write permission, leaving the other
 * permissions unchanged. If vprot is ~PROT_USER, remove user permissions.
 */
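
/*
 * Illustrative calls (not part of the original file; 'as', 'addr' and 'len'
 * are a hypothetical caller's address space and page-aligned range):
 *
 *	// Deprecated interface: revoke write permission on the range,
 *	// leaving the other permissions unchanged.
 *	hat_chgprot(as->a_hat, addr, len, (uint_t)~PROT_WRITE);
 *
 *	// New drivers should track attributes and use the hat_*attr calls
 *	// defined above instead, e.g. to remove write permission:
 *	hat_clrattr(as->a_hat, addr, len, PROT_WRITE);
 */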
hat_chgprot(struct hat *sfmmup, caddr_t addr, size_t len, uint_t vprot)
{
    struct hmehash_bucket *hmebp;
    int hmeshift, hashno = 1;
    struct hme_blk *hmeblkp, *list = NULL;

    ASSERT((len & MMU_PAGEOFFSET) == 0);
    ASSERT(((uintptr_t)addr & MMU_PAGEOFFSET) == 0);

    ASSERT(sfmmup->sfmmu_as != NULL);

    CPUSET_ZERO(cpuset);

    if ((vprot != (uint_t)~PROT_WRITE) && (vprot & PROT_USER) &&
        ((addr + len) > (caddr_t)USERLIMIT)) {
        panic("user addr %p vprot %x in kernel space",
            (void *)addr, vprot);
    }
    endaddr = addr + len;
    hblktag.htag_id = sfmmup;
    hblktag.htag_rid = SFMMU_INVALID_SHMERID;
    DEMAP_RANGE_INIT(sfmmup, &dmr);

    while (addr < endaddr) {
        hmeshift = HME_HASH_SHIFT(hashno);
        hblktag.htag_bspage = HME_HASH_BSPAGE(addr, hmeshift);
        hblktag.htag_rehash = hashno;
        hmebp = HME_HASH_FUNCTION(sfmmup, addr, hmeshift);

        SFMMU_HASH_LOCK(hmebp);

        HME_HASH_SEARCH(hmebp, hblktag, hmeblkp, &list);
        if (hmeblkp != NULL) {
            ASSERT(!hmeblkp->hblk_shared);
            /*
             * We've encountered a shadow hmeblk so skip the range
             * of the next smaller mapping size.
             */
            if (hmeblkp->hblk_shw_bit) {
                ASSERT(sfmmup != ksfmmup);
                addr = (caddr_t)P2END((uintptr_t)addr,
                    TTEBYTES(hashno - 1));
            } else {
                addr = sfmmu_hblk_chgprot(sfmmup, hmeblkp,
                    addr, endaddr, &dmr, vprot);
            }
            SFMMU_HASH_UNLOCK(hmebp);
        }
        SFMMU_HASH_UNLOCK(hmebp);

        if (!HME_REHASH(sfmmup) || (hashno >= mmu_hashcnt)) {
            /*
             * We have traversed the whole list and rehashed
             * if necessary without finding the address to chgprot.
             * This is ok so we increment the address by the
             * smallest hmeblk range for kernel mappings and the
             * largest hmeblk range, to account for shadow hmeblks,
             * for user mappings and continue.
             */
            if (sfmmup == ksfmmup)
                addr = (caddr_t)P2END((uintptr_t)addr,
                    TTEBYTES(1));
            else
                addr = (caddr_t)P2END((uintptr_t)addr,
                    TTEBYTES(hashno));
        }
    }

    sfmmu_hblks_list_purge(&list, 0);
    DEMAP_RANGE_FLUSH(&dmr);
    cpuset = sfmmup->sfmmu_cpusran;

/*
 * This function chgprots a range of addresses in an hmeblk. It returns the
 * next address that needs to be chgprot.
 * It should be called with the hash lock held.
 * XXX It should be possible to optimize chgprot by not flushing every time but
 * on the other hand:
 * 1. do one flush crosscall.
 * 2. only flush if we are increasing permissions (make sure this will work)
 */
sfmmu_hblk_chgprot(sfmmu_t *sfmmup, struct hme_blk *hmeblkp, caddr_t addr,
    caddr_t endaddr, demap_range_t *dmrp, uint_t vprot)
{
    struct sf_hment *sfhmep;
    struct page *pp = NULL;
    kmutex_t *pml, *pmtx;
    int use_demap_range;
#if defined(SF_ERRATA_57)
    int check_exec;
#endif

    ASSERT(in_hblk_range(hmeblkp, addr));
    ASSERT(hmeblkp->hblk_shw_bit == 0);
    ASSERT(!hmeblkp->hblk_shared);

    if (get_hblk_ttesz(hmeblkp) != TTE8K &&
        (endaddr < get_hblk_endaddr(hmeblkp))) {
        panic("sfmmu_hblk_chgprot: partial chgprot of large page");
    }

    endaddr = MIN(endaddr, get_hblk_endaddr(hmeblkp));
    ttesz = get_hblk_ttesz(hmeblkp);

    pprot = sfmmu_vtop_prot(vprot, &tteflags);
#if defined(SF_ERRATA_57)
    check_exec = (sfmmup != ksfmmup) &&
        AS_TYPE_64BIT(sfmmup->sfmmu_as) &&
        ((vprot & PROT_EXEC) == PROT_EXEC);
#endif
    HBLKTOHME(sfhmep, hmeblkp, addr);

    /*
     * Flush the current demap region if addresses have been
     * skipped or the page size doesn't match.
     */
    use_demap_range = (TTEBYTES(ttesz) == MMU_PAGESIZE);
    if (use_demap_range) {
        DEMAP_RANGE_CONTINUE(dmrp, addr, endaddr);
    } else if (dmrp != NULL) {
        DEMAP_RANGE_FLUSH(dmrp);
    }

    while (addr < endaddr) {
        sfmmu_copytte(&sfhmep->hme_tte, &tte);
        if (TTE_IS_VALID(&tte)) {
            if (TTE_GET_LOFLAGS(&tte, tteflags) == pprot) {
                /*
                 * if the new protection is the same as old
                 * protection, there is nothing to do.
                 */
            }

            pp = sfhmep->hme_page;
            if (pp) {
                pml = sfmmu_mlist_enter(pp);

                if (pp != sfhmep->hme_page) {
                    /*
                     * tte must have been unloaded
                     * underneath us. Recheck
                     */
                    sfmmu_mlist_exit(pml);
                }
            }

            ASSERT(pp == NULL || sfmmu_mlist_held(pp));

            TTE_SET_LOFLAGS(&ttemod, tteflags, pprot);
#if defined(SF_ERRATA_57)
            if (check_exec && addr < errata57_limit)
                ttemod.tte_exec_perm = 0;
#endif
            ret = sfmmu_modifytte_try(&tte, &ttemod,
                &sfhmep->hme_tte);

            if (ret < 0) {
                /* tte changed underneath us */
                if (pml)
                    sfmmu_mlist_exit(pml);
            }

            if (tteflags & TTE_HWWR_INT) {
                /*
                 * need to sync if we are clearing modify bit.
                 */
                sfmmu_ttesync(sfmmup, addr, &tte, pp);
            }

            if (pp && PP_ISRO(pp)) {
                if (pprot & TTE_WRPRM_INT) {
                    pmtx = sfmmu_page_enter(pp);
                    sfmmu_page_exit(pmtx);
                }
            }

            if (ret > 0 && use_demap_range) {
                DEMAP_RANGE_MARKPG(dmrp, addr);
            } else if (ret > 0) {
                sfmmu_tlb_demap(addr, sfmmup, hmeblkp, 0, 0);
            }

            if (pml)
                sfmmu_mlist_exit(pml);
        }
        addr += TTEBYTES(ttesz);
        DEMAP_RANGE_NEXTPG(dmrp);
    }

/*
 * This routine is deprecated and should only be used by hat_chgprot.
 * The correct routine is sfmmu_vtop_attr.
 * This routine converts virtual page protections to physical ones. It will
 * update the tteflags field with the tte mask corresponding to the protections
 * affected and it returns the new protections. It will also clear the modify
 * bit if we are taking away write permission. This is necessary since the
 * modify bit is the hardware permission bit and we need to clear it in order
 * to detect write faults.
 * It accepts the following special protections:
 * ~PROT_WRITE = remove write permissions.
 * ~PROT_USER = remove user permissions.
 */
sfmmu_vtop_prot(uint_t vprot, uint_t *tteflagsp)
{
    if (vprot == (uint_t)~PROT_WRITE) {
        *tteflagsp = TTE_WRPRM_INT | TTE_HWWR_INT;
        return (0);		/* will cause wrprm to be cleared */
    }
    if (vprot == (uint_t)~PROT_USER) {
        *tteflagsp = TTE_PRIV_INT;
        return (0);		/* will cause privprm to be cleared */
    }
    if ((vprot == 0) || (vprot == PROT_USER) ||
        ((vprot & PROT_ALL) != vprot)) {
        panic("sfmmu_vtop_prot -- bad prot %x", vprot);
    }

    switch (vprot) {
    case (PROT_EXEC | PROT_READ):
        *tteflagsp = TTE_PRIV_INT | TTE_WRPRM_INT | TTE_HWWR_INT;
        return (TTE_PRIV_INT);			/* set prv and clr wrt */
    case (PROT_WRITE | PROT_READ):
    case (PROT_EXEC | PROT_WRITE):
    case (PROT_EXEC | PROT_WRITE | PROT_READ):
        *tteflagsp = TTE_PRIV_INT | TTE_WRPRM_INT;
        return (TTE_PRIV_INT | TTE_WRPRM_INT);	/* set prv and wrt */
    case (PROT_USER | PROT_READ):
    case (PROT_USER | PROT_EXEC):
    case (PROT_USER | PROT_EXEC | PROT_READ):
        *tteflagsp = TTE_PRIV_INT | TTE_WRPRM_INT | TTE_HWWR_INT;
        return (0);				/* clr prv and wrt */
    case (PROT_USER | PROT_WRITE):
    case (PROT_USER | PROT_WRITE | PROT_READ):
    case (PROT_USER | PROT_EXEC | PROT_WRITE):
    case (PROT_USER | PROT_EXEC | PROT_WRITE | PROT_READ):
        *tteflagsp = TTE_PRIV_INT | TTE_WRPRM_INT;
        return (TTE_WRPRM_INT);			/* clr prv and set wrt */
    default:
        panic("sfmmu_vtop_prot -- bad prot %x", vprot);
    }

/*
 * Alternate unload for very large virtual ranges. With a true 64 bit VA,
 * the normal algorithm would take too long for a very large VA range with
 * few real mappings. This routine just walks through all HMEs in the global
 * hash table to find and remove mappings.
 */
hat_unload_large_virtual(
    struct hat *sfmmup,
    caddr_t startaddr,
    size_t len,
    uint_t flags,
    hat_callback_t *callback)
{
    struct hmehash_bucket *hmebp;
    struct hme_blk *hmeblkp;
    struct hme_blk *pr_hblk = NULL;
    struct hme_blk *nx_hblk;
    struct hme_blk *list = NULL;
    demap_range_t dmr, *dmrp;
    caddr_t endaddr = startaddr + len;
    caddr_t cb_sa[MAX_CB_ADDR];
    caddr_t cb_ea[MAX_CB_ADDR];

    if (sfmmup->sfmmu_free) {
        dmrp = NULL;
    } else {
        dmrp = &dmr;
        DEMAP_RANGE_INIT(sfmmup, dmrp);
    }

    /*
     * Loop through all the hash buckets of HME blocks looking for matches.
     */
    for (i = 0; i <= UHMEHASH_SZ; i++) {
        hmebp = &uhme_hash[i];
        SFMMU_HASH_LOCK(hmebp);
        hmeblkp = hmebp->hmeblkp;
        while (hmeblkp) {
            nx_hblk = hmeblkp->hblk_next;

            /*
             * skip if not this context, if a shadow block or
             * if the mapping is not in the requested range
             */
            if (hmeblkp->hblk_tag.htag_id != sfmmup ||
                hmeblkp->hblk_shw_bit ||
                (sa = (caddr_t)get_hblk_base(hmeblkp)) >= endaddr ||
                (ea = get_hblk_endaddr(hmeblkp)) <= startaddr) {
                goto next_block;
            }

            ASSERT(!hmeblkp->hblk_shared);
            /*
             * unload if there are any current valid mappings
             */
            if (hmeblkp->hblk_vcnt != 0 ||
                hmeblkp->hblk_hmecnt != 0)
                (void) sfmmu_hblk_unload(sfmmup, hmeblkp,
                    sa, ea, dmrp, flags);

            /*
             * on unmap we also release the HME block itself, once
             * all mappings are gone.
             */
            if ((flags & HAT_UNLOAD_UNMAP) != 0 &&
                !hmeblkp->hblk_vcnt &&
                !hmeblkp->hblk_hmecnt) {
                ASSERT(!hmeblkp->hblk_lckcnt);
                sfmmu_hblk_hash_rm(hmebp, hmeblkp, pr_hblk,
                    &list, 0);
            }

            if (callback == NULL)
                goto next_block;

            /*
             * HME blocks may span more than one page, but we may be
             * unmapping only one page, so check for a smaller range
             * for the callback
             */
            cb_sa[addr_cnt] = sa;
            cb_ea[addr_cnt] = ea;
            if (++addr_cnt == MAX_CB_ADDR) {
                if (dmrp != NULL) {
                    DEMAP_RANGE_FLUSH(dmrp);
                    cpuset = sfmmup->sfmmu_cpusran;
                }

                for (a = 0; a < MAX_CB_ADDR; ++a) {
                    callback->hcb_start_addr = cb_sa[a];
                    callback->hcb_end_addr = cb_ea[a];
                    callback->hcb_function(callback);
                }
                addr_cnt = 0;
            }
        }
        SFMMU_HASH_UNLOCK(hmebp);
    }

    sfmmu_hblks_list_purge(&list, 0);
    if (dmrp != NULL) {
        DEMAP_RANGE_FLUSH(dmrp);
        cpuset = sfmmup->sfmmu_cpusran;
    }

    for (a = 0; a < addr_cnt; ++a) {
        callback->hcb_start_addr = cb_sa[a];
        callback->hcb_end_addr = cb_ea[a];
        callback->hcb_function(callback);
    }

    /*
     * Check TSB and TLB page sizes if the process isn't exiting.
     */
    if (!sfmmup->sfmmu_free)
        sfmmu_check_page_sizes(sfmmup, 0);

/*
 * Unload all the mappings in the range [addr..addr+len). addr and len must
 * be MMU_PAGESIZE aligned.
 */
extern struct seg *segkmap;
#define	ISSEGKMAP(sfmmup, addr) (sfmmup == ksfmmup && \
	segkmap->s_base <= (addr) && (addr) < (segkmap->s_base + segkmap->s_size))

hat_unload_callback(
    struct hat *sfmmup,
    caddr_t addr,
    size_t len,
    uint_t flags,
    hat_callback_t *callback)
{
    struct hmehash_bucket *hmebp;
    int hmeshift, hashno, iskernel;
    struct hme_blk *hmeblkp, *pr_hblk, *list = NULL;
    caddr_t cb_start_addr[MAX_CB_ADDR];
    caddr_t cb_end_addr[MAX_CB_ADDR];
    int issegkmap = ISSEGKMAP(sfmmup, addr);
    demap_range_t dmr, *dmrp;

    ASSERT(sfmmup->sfmmu_as != NULL);

    ASSERT((sfmmup == ksfmmup) || (flags & HAT_UNLOAD_OTHER) || \
        AS_LOCK_HELD(sfmmup->sfmmu_as));

    ASSERT(sfmmup != NULL);
    ASSERT((len & MMU_PAGEOFFSET) == 0);
    ASSERT(!((uintptr_t)addr & MMU_PAGEOFFSET));

    /*
     * Probing through a large VA range (say 63 bits) will be slow, even
     * at 4 Meg steps between the probes. So, when the virtual address range
     * is very large, search the HME entries for what to unload.
     *
     *	len >> TTE_PAGE_SHIFT(TTE4M) is the # of 4Meg probes we'd need
     *
     *	UHMEHASH_SZ is number of hash buckets to examine
     */
    if (sfmmup != KHATID && (len >> TTE_PAGE_SHIFT(TTE4M)) > UHMEHASH_SZ) {
        hat_unload_large_virtual(sfmmup, addr, len, flags, callback);
        return;
    }

    CPUSET_ZERO(cpuset);

    /*
     * If the process is exiting, we can save a lot of fuss since
     * we'll flush the TLB when we free the ctx anyway.
     */
    if (sfmmup->sfmmu_free) {
        dmrp = NULL;
    } else {
        dmrp = &dmr;
        DEMAP_RANGE_INIT(sfmmup, dmrp);
    }

    endaddr = addr + len;
    hblktag.htag_id = sfmmup;
    hblktag.htag_rid = SFMMU_INVALID_SHMERID;

    /*
     * It is likely for the vm to call unload over a wide range of
     * addresses that are actually very sparsely populated by
     * translations. In order to speed this up the sfmmu hat supports
     * the concept of shadow hmeblks. Dummy large page hmeblks that
     * correspond to actual small translations are allocated at tteload
     * time and are referred to as shadow hmeblks. Now, during unload
     * time, we first check if we have a shadow hmeblk for that
     * translation. The absence of one means the corresponding address
     * range is empty and can be skipped.
     *
     * The kernel is an exception to above statement and that is why
     * we don't use shadow hmeblks and hash starting from the smallest
     * page size.
     */
    if (sfmmup == KHATID) {
        iskernel = 1;
        hashno = TTE64K;
    } else {
        iskernel = 0;
        if (mmu_page_sizes == max_mmu_page_sizes) {
            hashno = TTE256M;
        } else {
            hashno = TTE4M;
        }
    }
    while (addr < endaddr) {
        hmeshift = HME_HASH_SHIFT(hashno);
        hblktag.htag_bspage = HME_HASH_BSPAGE(addr, hmeshift);
        hblktag.htag_rehash = hashno;
        hmebp = HME_HASH_FUNCTION(sfmmup, addr, hmeshift);

        SFMMU_HASH_LOCK(hmebp);

        HME_HASH_SEARCH_PREV(hmebp, hblktag, hmeblkp, pr_hblk, &list);
        if (hmeblkp == NULL) {
            /*
             * didn't find an hmeblk. skip the appropriate
             * address range.
             */
            SFMMU_HASH_UNLOCK(hmebp);
            if (iskernel) {
                if (hashno < mmu_hashcnt) {
                    hashno++;
                    continue;
                } else {
                    addr = (caddr_t)roundup((uintptr_t)addr
                        + 1, MMU_PAGESIZE64K);
                    continue;
                }
            }
            addr = (caddr_t)roundup((uintptr_t)addr + 1,
                (1 << hmeshift));
            if ((uintptr_t)addr & MMU_PAGEOFFSET512K) {
                ASSERT(hashno == TTE64K);
                continue;
            }
            if ((uintptr_t)addr & MMU_PAGEOFFSET4M) {
                hashno = TTE512K;
                continue;
            }
            if (mmu_page_sizes == max_mmu_page_sizes) {
                if ((uintptr_t)addr & MMU_PAGEOFFSET32M) {
                    hashno = TTE4M;
                    continue;
                }
                if ((uintptr_t)addr & MMU_PAGEOFFSET256M) {
                    hashno = TTE32M;
                    continue;
                }
            }
            continue;
        }
        ASSERT(!hmeblkp->hblk_shared);
        if (!hmeblkp->hblk_vcnt && !hmeblkp->hblk_hmecnt) {
            /*
             * If the valid count is zero we can skip the range
             * mapped by this hmeblk.
             * We free hblks in the case of HAT_UNMAP. HAT_UNMAP
             * is used by segment drivers as a hint
             * that the mapping resource won't be used any longer.
             * The best example of this is during exit().
             */
            addr = (caddr_t)roundup((uintptr_t)addr + 1,
                get_hblk_span(hmeblkp));
            if ((flags & HAT_UNLOAD_UNMAP) ||
                (iskernel && !issegkmap)) {
                sfmmu_hblk_hash_rm(hmebp, hmeblkp, pr_hblk,
                    &list, 0);
            }
            SFMMU_HASH_UNLOCK(hmebp);

            if ((uintptr_t)addr & MMU_PAGEOFFSET512K) {
                ASSERT(hashno == TTE64K);
                continue;
            }
            if ((uintptr_t)addr & MMU_PAGEOFFSET4M) {
                hashno = TTE512K;
                continue;
            }
            if (mmu_page_sizes == max_mmu_page_sizes) {
                if ((uintptr_t)addr & MMU_PAGEOFFSET32M) {
                    hashno = TTE4M;
                    continue;
                }
                if ((uintptr_t)addr & MMU_PAGEOFFSET256M) {
                    hashno = TTE32M;
                    continue;
                }
            }
            continue;
        }
        if (hmeblkp->hblk_shw_bit) {
            /*
             * If we encounter a shadow hmeblk we know there are
             * smaller sized hmeblks mapping the same address space.
             * Decrement the hash size and rehash.
             */
            ASSERT(sfmmup != KHATID);
            hashno--;
            SFMMU_HASH_UNLOCK(hmebp);
            continue;
        }

        /*
         * track callback address ranges.
         * only start a new range when it's not contiguous
         */
        if (callback != NULL) {
            if (addr_count > 0 &&
                addr == cb_end_addr[addr_count - 1])
                --addr_count;
            else
                cb_start_addr[addr_count] = addr;
        }

        addr = sfmmu_hblk_unload(sfmmup, hmeblkp, addr, endaddr,
            dmrp, flags);

        if (callback != NULL)
            cb_end_addr[addr_count++] = addr;

        if (((flags & HAT_UNLOAD_UNMAP) || (iskernel && !issegkmap)) &&
            !hmeblkp->hblk_vcnt && !hmeblkp->hblk_hmecnt) {
            sfmmu_hblk_hash_rm(hmebp, hmeblkp, pr_hblk, &list, 0);
        }
        SFMMU_HASH_UNLOCK(hmebp);

        /*
         * Notify our caller as to exactly which pages
         * have been unloaded. We do these in clumps,
         * to minimize the number of xt_sync()s that need to occur.
         */
        if (callback != NULL && addr_count == MAX_CB_ADDR) {
            if (dmrp != NULL) {
                DEMAP_RANGE_FLUSH(dmrp);
                cpuset = sfmmup->sfmmu_cpusran;
            }

            for (a = 0; a < MAX_CB_ADDR; ++a) {
                callback->hcb_start_addr = cb_start_addr[a];
                callback->hcb_end_addr = cb_end_addr[a];
                callback->hcb_function(callback);
            }
            addr_count = 0;
        }
        if ((uintptr_t)addr & MMU_PAGEOFFSET512K) {
            ASSERT(hashno == TTE64K);
            continue;
        }
        if ((uintptr_t)addr & MMU_PAGEOFFSET4M) {
            hashno = TTE512K;
            continue;
        }
        if (mmu_page_sizes == max_mmu_page_sizes) {
            if ((uintptr_t)addr & MMU_PAGEOFFSET32M) {
                hashno = TTE4M;
                continue;
            }
            if ((uintptr_t)addr & MMU_PAGEOFFSET256M) {
                hashno = TTE32M;
                continue;
            }
        }
    }

    sfmmu_hblks_list_purge(&list, 0);
    if (dmrp != NULL) {
        DEMAP_RANGE_FLUSH(dmrp);
        cpuset = sfmmup->sfmmu_cpusran;
    }
    if (callback && addr_count != 0) {
        for (a = 0; a < addr_count; ++a) {
            callback->hcb_start_addr = cb_start_addr[a];
            callback->hcb_end_addr = cb_end_addr[a];
            callback->hcb_function(callback);
        }
    }

    /*
     * Check TSB and TLB page sizes if the process isn't exiting.
     */
    if (!sfmmup->sfmmu_free)
        sfmmu_check_page_sizes(sfmmup, 0);

/*
 * Unload all the mappings in the range [addr..addr+len). addr and len must
 * be MMU_PAGESIZE aligned.
 */
hat_unload(struct hat *sfmmup, caddr_t addr, size_t len, uint_t flags)
{
    hat_unload_callback(sfmmup, addr, len, flags, NULL);
}

/*
 * Find the largest mapping size for this page.
 */
fnd_mapping_sz(page_t *pp)
{
    p_index = PP_MAPINDEX(pp);

    p_index >>= 1;	/* don't care about 8K bit */
    for (; p_index; p_index >>= 1) {

/*
 * This function unloads a range of addresses for an hmeblk.
 * It returns the next address to be unloaded.
 * It should be called with the hash lock held.
 */
sfmmu_hblk_unload(struct hat *sfmmup, struct hme_blk *hmeblkp, caddr_t addr,
    caddr_t endaddr, demap_range_t *dmrp, uint_t flags)
{
    struct sf_hment *sfhmep;
    int use_demap_range;

    ASSERT(in_hblk_range(hmeblkp, addr));
    ASSERT(!hmeblkp->hblk_shw_bit);
    ASSERT(sfmmup != NULL || hmeblkp->hblk_shared);
    ASSERT(sfmmup == NULL || !hmeblkp->hblk_shared);
    ASSERT(dmrp == NULL || !hmeblkp->hblk_shared);

    if (get_hblk_ttesz(hmeblkp) != TTE8K &&
        (endaddr < get_hblk_endaddr(hmeblkp))) {
        panic("sfmmu_hblk_unload: partial unload of large page");
    }

    endaddr = MIN(endaddr, get_hblk_endaddr(hmeblkp));
    ttesz = get_hblk_ttesz(hmeblkp);

    use_demap_range = ((dmrp == NULL) ||
        (TTEBYTES(ttesz) == DEMAP_RANGE_PGSZ(dmrp)));

    if (use_demap_range) {
        DEMAP_RANGE_CONTINUE(dmrp, addr, endaddr);
    } else if (dmrp != NULL) {
        DEMAP_RANGE_FLUSH(dmrp);
    }

    HBLKTOHME(sfhmep, hmeblkp, addr);

    while (addr < endaddr) {
        sfmmu_copytte(&sfhmep->hme_tte, &tte);
        if (TTE_IS_VALID(&tte)) {
            pp = sfhmep->hme_page;
            if (pp != NULL) {
                pml = sfmmu_mlist_enter(pp);
            }

            /*
             * Verify if hme still points to 'pp' now that
             * we have p_mapping lock.
             */
            if (sfhmep->hme_page != pp) {
                if (pp != NULL && sfhmep->hme_page != NULL) {
                    ASSERT(pml != NULL);
                    sfmmu_mlist_exit(pml);
                    /* Re-start this iteration. */
                    continue;
                }
                ASSERT((pp != NULL) &&
                    (sfhmep->hme_page == NULL));
            }

            /*
             * This point on we have both HASH and p_mapping
             * lock.
             */
            ASSERT(pp == sfhmep->hme_page);
            ASSERT(pp == NULL || sfmmu_mlist_held(pp));

            /*
             * We need to loop on modify tte because it is
             * possible for pagesync to come along and
             * change the software bits beneath us.
             *
             * Page_unload can also invalidate the tte after
             * we read tte outside of p_mapping lock.
             */
            TTE_SET_INVALID(&ttemod);
            ret = sfmmu_modifytte_try(&tte, &ttemod,
                &sfhmep->hme_tte);

            if (ret <= 0) {
                if (TTE_IS_VALID(&tte)) {
                    continue;
                } else {
                    panic("sfmmu_hblk_unload: pp = 0x%p "
                        "tte became invalid under mlist"
                        " lock = 0x%p", (void *)pp,
                        (void *)pml);
                }
            }

            if (!(flags & HAT_UNLOAD_NOSYNC)) {
                sfmmu_ttesync(sfmmup, addr, &tte, pp);
            }

            /*
             * Ok- we invalidated the tte. Do the rest of the job.
             */

            if (flags & HAT_UNLOAD_UNLOCK) {
                ASSERT(hmeblkp->hblk_lckcnt > 0);
                atomic_dec_32(&hmeblkp->hblk_lckcnt);
                HBLK_STACK_TRACE(hmeblkp, HBLK_UNLOCK);
            }

            /*
             * Normally we would need to flush the page
             * from the virtual cache at this point in
             * order to prevent a potential cache alias
             * inconsistency.
             * The particular scenario we need to worry
             * about is:
             * Given:  va1 and va2 are two virtual address
             * that alias and map the same physical
             * address.
             * 1.   mapping exists from va1 to pa and data
             *      has been read into the cache.
             * 2.   unload va1.
             * 3.   load va2 and modify data using va2.
             * 4.   unload va2.
             * 5.   load va1 and reference data. Unless we
             *      flush the data cache when we unload we will
             *      get stale data.
             * Fortunately, page coloring eliminates the
             * above scenario by remembering the color a
             * physical page was last or is currently
             * mapped to. Now, we delay the flush until
             * the loading of translations. Only when the
             * new translation is of a different color
             * are we forced to flush.
             */
            if (use_demap_range) {
                /*
                 * Mark this page as needing a demap.
                 */
                DEMAP_RANGE_MARKPG(dmrp, addr);
            } else {
                ASSERT(sfmmup != NULL);
                ASSERT(!hmeblkp->hblk_shared);
                sfmmu_tlb_demap(addr, sfmmup, hmeblkp,
                    sfmmup->sfmmu_free, 0);
            }

            if (pp) {
                /*
                 * Remove the hment from the mapping list
                 */
                ASSERT(hmeblkp->hblk_hmecnt > 0);

                /*
                 * ASSERT(hmeblkp->hblk_hmecnt <= NHMENTS);
                 */
                HME_SUB(sfhmep, pp);
                atomic_dec_16(&hmeblkp->hblk_hmecnt);
            }

            ASSERT(hmeblkp->hblk_vcnt > 0);
            atomic_dec_16(&hmeblkp->hblk_vcnt);

            ASSERT(hmeblkp->hblk_hmecnt || hmeblkp->hblk_vcnt ||
                !hmeblkp->hblk_lckcnt);

            if (pp && (pp->p_nrm & (P_KPMC | P_KPMS | P_TNC))) {
                if (PP_ISTNC(pp)) {
                    /*
                     * If page was temporary
                     * uncached, try to recache
                     * it. Note that HME_SUB() was
                     * called above so p_index and
                     * mlist had been updated.
                     */
                    conv_tnc(pp, ttesz);
                } else if (pp->p_mapping == NULL) {
                    /*
                     * Page is marked to be in VAC conflict
                     * to an existing kpm mapping and/or is
                     * kpm mapped using only the regular
                     * pagesize.
                     */
                    sfmmu_kpm_hme_unload(pp);
                }
            }
        } else if ((pp = sfhmep->hme_page) != NULL) {
            /*
             * TTE is invalid but the hme
             * still exists. let pageunload
             * complete its job.
             */
            ASSERT(pml == NULL);
            pml = sfmmu_mlist_enter(pp);
            if (sfhmep->hme_page != NULL) {
                sfmmu_mlist_exit(pml);
                continue;
            }
            ASSERT(sfhmep->hme_page == NULL);
        } else if (hmeblkp->hblk_hmecnt != 0) {
            /*
             * pageunload may have not finished decrementing
             * hblk_vcnt and hblk_hmecnt. Find page_t if any and
             * wait for pageunload to finish. Rely on pageunload
             * to decrement hblk_hmecnt after hblk_vcnt.
             */
            pfn_t pfn = TTE_TO_TTEPFN(&tte);
            ASSERT(pml == NULL);
            if (pf_is_memory(pfn)) {
                pp = page_numtopp_nolock(pfn);
                if (pp != NULL) {
                    pml = sfmmu_mlist_enter(pp);
                    sfmmu_mlist_exit(pml);
                    pml = NULL;
                }
            }
        }

        /*
         * At this point, the tte we are looking at
         * should be unloaded, and hme has been unlinked
         * from page too. This is important because in
         * pageunload, it does ttesync() then HME_SUB.
         * We need to make sure HME_SUB has been completed
         * so we know ttesync() has been completed. Otherwise,
         * at exit time, after return from hat layer, VM will
         * release as structure which hat_setstat() (called
         * by ttesync()) needs.
         */
        ASSERT(sfhmep->hme_page == NULL);
        sfmmu_copytte(&sfhmep->hme_tte, &dtte);
        ASSERT(!TTE_IS_VALID(&dtte));

        if (pml) {
            sfmmu_mlist_exit(pml);
        }

        addr += TTEBYTES(ttesz);
        DEMAP_RANGE_NEXTPG(dmrp);
    }
    /*
     * For shared hmeblks this routine is only called when region is freed
     * and no longer referenced.  So no need to decrement ttecnt
     * in the region structure here.
     */
    if (ttecnt > 0 && sfmmup != NULL) {
        atomic_add_long(&sfmmup->sfmmu_ttecnt[ttesz], -ttecnt);
    }

/*
 * Invalidate a virtual address range for the local CPU.
 * For best performance ensure that the va range is completely
 * mapped, otherwise the entire TLB will be flushed.
 */
hat_flush_range(struct hat *sfmmup, caddr_t va, size_t size)
{
    caddr_t endva = va + size;

    while (va < endva) {
        sz = hat_getpagesize(sfmmup, va);
        vtag_flushpage(va, (uint64_t)sfmmup);
        va += sz;
    }

/*
 * Synchronize all the mappings in the range [addr..addr+len).
 * Can be called with clearflag having two states:
 * HAT_SYNC_DONTZERO means just return the rm stats
 * HAT_SYNC_ZERORM means zero rm bits in the tte and return the stats
 */
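
/*
 * Illustrative calls (hypothetical caller; 'as', 'addr' and 'len' are a
 * page-aligned range in some address space), showing the two clearflag
 * modes described above:
 *
 *	// harvest ref/mod bits and clear them, e.g. for a working-set scan
 *	hat_sync(as->a_hat, addr, len, HAT_SYNC_ZERORM);
 *
 *	// harvest ref/mod bits without disturbing them
 *	hat_sync(as->a_hat, addr, len, HAT_SYNC_DONTZERO);
 */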
hat_sync(struct hat *sfmmup, caddr_t addr, size_t len, uint_t clearflag)
{
    struct hmehash_bucket *hmebp;
    int hmeshift, hashno = 1;
    struct hme_blk *hmeblkp, *list = NULL;

    ASSERT((sfmmup == ksfmmup) || AS_LOCK_HELD(sfmmup->sfmmu_as));
    ASSERT((len & MMU_PAGEOFFSET) == 0);
    ASSERT((clearflag == HAT_SYNC_DONTZERO) ||
        (clearflag == HAT_SYNC_ZERORM));

    CPUSET_ZERO(cpuset);

    endaddr = addr + len;
    hblktag.htag_id = sfmmup;
    hblktag.htag_rid = SFMMU_INVALID_SHMERID;

    /*
     * Spitfire supports 4 page sizes.
     * Most pages are expected to be of the smallest page
     * size (8K) and these will not need to be rehashed. 64K
     * pages also don't need to be rehashed because an hmeblk
     * spans 64K of address space. 512K pages might need 1 rehash
     * and 4M pages 2 rehashes.
     */
    while (addr < endaddr) {
        hmeshift = HME_HASH_SHIFT(hashno);
        hblktag.htag_bspage = HME_HASH_BSPAGE(addr, hmeshift);
        hblktag.htag_rehash = hashno;
        hmebp = HME_HASH_FUNCTION(sfmmup, addr, hmeshift);

        SFMMU_HASH_LOCK(hmebp);

        HME_HASH_SEARCH(hmebp, hblktag, hmeblkp, &list);
        if (hmeblkp != NULL) {
            ASSERT(!hmeblkp->hblk_shared);
            /*
             * We've encountered a shadow hmeblk so skip the range
             * of the next smaller mapping size.
             */
            if (hmeblkp->hblk_shw_bit) {
                ASSERT(sfmmup != ksfmmup);
                addr = (caddr_t)P2END((uintptr_t)addr,
                    TTEBYTES(hashno - 1));
            } else {
                addr = sfmmu_hblk_sync(sfmmup, hmeblkp,
                    addr, endaddr, clearflag);
            }
            SFMMU_HASH_UNLOCK(hmebp);
        }
        SFMMU_HASH_UNLOCK(hmebp);

        if (!HME_REHASH(sfmmup) || (hashno >= mmu_hashcnt)) {
            /*
             * We have traversed the whole list and rehashed
             * if necessary without finding the address to sync.
             * This is ok so we increment the address by the
             * smallest hmeblk range for kernel mappings and the
             * largest hmeblk range, to account for shadow hmeblks,
             * for user mappings and continue.
             */
            if (sfmmup == ksfmmup)
                addr = (caddr_t)P2END((uintptr_t)addr,
                    TTEBYTES(1));
            else
                addr = (caddr_t)P2END((uintptr_t)addr,
                    TTEBYTES(hashno));
        }
    }
    sfmmu_hblks_list_purge(&list, 0);
    cpuset = sfmmup->sfmmu_cpusran;

sfmmu_hblk_sync(struct hat *sfmmup, struct hme_blk *hmeblkp, caddr_t addr,
    caddr_t endaddr, int clearflag)
{
    struct sf_hment *sfhmep;

    ASSERT(hmeblkp->hblk_shw_bit == 0);
    ASSERT(!hmeblkp->hblk_shared);

    endaddr = MIN(endaddr, get_hblk_endaddr(hmeblkp));

    ttesz = get_hblk_ttesz(hmeblkp);
    HBLKTOHME(sfhmep, hmeblkp, addr);

    while (addr < endaddr) {
        sfmmu_copytte(&sfhmep->hme_tte, &tte);
        if (TTE_IS_VALID(&tte)) {
            pp = sfhmep->hme_page;
            if (pp) {
                pml = sfmmu_mlist_enter(pp);
                if (pp != sfhmep->hme_page) {
                    /*
                     * tte must have been unloaded
                     * underneath us. Recheck
                     */
                    sfmmu_mlist_exit(pml);
                    continue;
                }
            }

            ASSERT(pp == NULL || sfmmu_mlist_held(pp));

            if (clearflag == HAT_SYNC_ZERORM) {
                TTE_CLR_RM(&ttemod);
                ret = sfmmu_modifytte_try(&tte, &ttemod,
                    &sfhmep->hme_tte);
                if (ret < 0) {
                    if (pml)
                        sfmmu_mlist_exit(pml);
                    continue;
                }

                if (ret > 0) {
                    sfmmu_tlb_demap(addr, sfmmup,
                        hmeblkp, 0, 0);
                }
            }
            sfmmu_ttesync(sfmmup, addr, &tte, pp);
            if (pml)
                sfmmu_mlist_exit(pml);
        }
        addr += TTEBYTES(ttesz);
    }

/*
 * This function will sync a tte to the page struct and it will
 * update the hat stats. Currently it allows us to pass a NULL pp
 * and we will simply update the stats. We may want to change this
 * so we only keep stats for pages backed by pp's.
 */
sfmmu_ttesync(struct hat *sfmmup, caddr_t addr, tte_t *ttep, page_t *pp)
{
    ASSERT(TTE_IS_VALID(ttep));

    if (TTE_IS_NOSYNC(ttep)) {
        return;
    }

    if (TTE_IS_REF(ttep)) {
        rm = P_REF;
    }
    if (TTE_IS_MOD(ttep)) {
        rm |= P_MOD;
    }

    if (sfmmup != NULL && sfmmup->sfmmu_rmstat) {
        int i;
        caddr_t vaddr = addr;

        for (i = 0; i < TTEPAGES(sz); i++, vaddr += MMU_PAGESIZE) {
            hat_setstat(sfmmup->sfmmu_as, vaddr, MMU_PAGESIZE, rm);
        }
    }

    /*
     * XXX I want to use cas to update nrm bits but they
     * currently belong in common/vm and not in hat where
     * they should be.
     * The nrm bits are protected by the same mutex as
     * the one that protects the page's mapping list.
     */
    ASSERT(sfmmu_mlist_held(pp));
    /*
     * If the tte is for a large page, we need to sync all the
     * pages covered by the tte.
     */
    if (sz != TTE8K) {
        ASSERT(pp->p_szc != 0);
        pp = PP_GROUPLEADER(pp, sz);
        ASSERT(sfmmu_mlist_held(pp));
    }

    /* Get number of pages from tte size. */
    npgs = TTEPAGES(sz);

    do {
        ASSERT(sfmmu_mlist_held(pp));
        if (((rm & P_REF) != 0 && !PP_ISREF(pp)) ||
            ((rm & P_MOD) != 0 && !PP_ISMOD(pp)))
            hat_page_setattr(pp, rm);

        /*
         * Are we done? If not, we must have a large mapping.
         * For large mappings we need to sync the rest of the pages
         * covered by this tte; goto the next page.
         */
    } while (--npgs > 0 && (pp = PP_PAGENEXT(pp)));

/*
 * Execute pre-callback handler of each pa_hment linked to pp
 *
 * Inputs:
 *   flag: either HAT_PRESUSPEND or HAT_SUSPEND.
 *   capture_cpus: pointer to return value (below)
 *
 * Returns:
 *   Propagates the subsystem callback return values back to the caller;
 *   returns 0 on success. If capture_cpus is non-NULL, the value returned
 *   is zero if all of the pa_hments are of a type that do not require
 *   capturing CPUs prior to suspending the mapping, else it is 1.
 */
hat_pageprocess_precallbacks(struct page *pp, uint_t flag, int *capture_cpus)
{
    struct sf_hment *sfhmep;
    struct pa_hment *pahmep;
    int (*f)(caddr_t, uint_t, uint_t, void *);

    ASSERT(PAGE_EXCL(pp));
    if (!sfmmu_mlist_held(pp)) {
        pml = sfmmu_mlist_enter(pp);
    }

    for (sfhmep = pp->p_mapping; sfhmep; sfhmep = sfhmep->hme_next) {
        /*
         * skip sf_hments corresponding to VA<->PA mappings;
         * for pa_hment's, hme_tte.ll is zero
         */
        if (!IS_PAHME(sfhmep))
            continue;

        pahmep = sfhmep->hme_data;
        ASSERT(pahmep != NULL);

        /*
         * skip if pre-handler has been called earlier in this loop
         */
        if (pahmep->flags & flag)
            continue;

        ASSERT(id >= (id_t)0 && id < sfmmu_cb_nextid);
        if (capture_cpus && sfmmu_cb_table[id].capture_cpus != 0)
            *capture_cpus = 1;
        if ((f = sfmmu_cb_table[id].prehandler) == NULL) {
            pahmep->flags |= flag;
            continue;
        }

        /*
         * Drop the mapping list lock to avoid locking order issues.
         */
        if (pml)
            sfmmu_mlist_exit(pml);

        ret = f(pahmep->addr, pahmep->len, flag, pahmep->pvt);
        if (ret != 0)
            return (ret);	/* caller must do the cleanup */

        if (pml) {
            pml = sfmmu_mlist_enter(pp);
            pahmep->flags |= flag;
            continue;
        }

        pahmep->flags |= flag;
    }

    if (pml)
        sfmmu_mlist_exit(pml);

/*
 * Execute post-callback handler of each pa_hment linked to pp
 *
 * Same overall assumptions and restrictions apply as for
 * hat_pageprocess_precallbacks().
 */
hat_pageprocess_postcallbacks(struct page *pp, uint_t flag)
{
    pfn_t pgpfn = pp->p_pagenum;
    pfn_t pgmask = btop(page_get_pagesize(pp->p_szc)) - 1;
    struct sf_hment *sfhmep;
    struct pa_hment *pahmep;
    int (*f)(caddr_t, uint_t, uint_t, void *, pfn_t);

    ASSERT(PAGE_EXCL(pp));
    if (!sfmmu_mlist_held(pp)) {
        pml = sfmmu_mlist_enter(pp);
    }

    for (sfhmep = pp->p_mapping; sfhmep; sfhmep = sfhmep->hme_next) {
        /*
         * skip sf_hments corresponding to VA<->PA mappings;
         * for pa_hment's, hme_tte.ll is zero
         */
        if (!IS_PAHME(sfhmep))
            continue;

        pahmep = sfhmep->hme_data;
        ASSERT(pahmep != NULL);

        if ((pahmep->flags & flag) == 0)
            continue;

        pahmep->flags &= ~flag;

        ASSERT(id >= (id_t)0 && id < sfmmu_cb_nextid);
        if ((f = sfmmu_cb_table[id].posthandler) == NULL)
            continue;

        /*
         * Convert the base page PFN into the constituent PFN
         * which is needed by the callback handler.
         */
        newpfn = pgpfn | (btop((uintptr_t)pahmep->addr) & pgmask);

        /*
         * Drop the mapping list lock to avoid locking order issues.
         */
        if (pml)
            sfmmu_mlist_exit(pml);

        if (f(pahmep->addr, pahmep->len, flag, pahmep->pvt, newpfn)
            != 0)
            panic("sfmmu: posthandler failed");

        if (pml) {
            pml = sfmmu_mlist_enter(pp);
        }
    }

    if (pml)
        sfmmu_mlist_exit(pml);
/*
 * Suspend locked kernel mapping
 */
void
hat_pagesuspend(struct page *pp)
{
	struct sf_hment *sfhmep;
	sfmmu_t *sfmmup;
	tte_t tte, ttemod;
	struct hme_blk *hmeblkp;
	caddr_t addr;
	int index, cons;
	cpuset_t cpuset;

	ASSERT(PAGE_EXCL(pp));
	ASSERT(sfmmu_mlist_held(pp));

	mutex_enter(&kpr_suspendlock);

	/*
	 * We're about to suspend a kernel mapping so mark this thread as
	 * non-traceable by DTrace. This prevents us from running into issues
	 * with probe context trying to touch a suspended page
	 * in the relocation codepath itself.
	 */
	curthread->t_flag |= T_DONTDTRACE;

	index = PP_MAPINDEX(pp);
	cons = TTE8K;

retry:
	for (sfhmep = pp->p_mapping; sfhmep; sfhmep = sfhmep->hme_next) {

		if (IS_PAHME(sfhmep))
			continue;

		if (get_hblk_ttesz(sfmmu_hmetohblk(sfhmep)) != cons)
			continue;

		/*
		 * Loop until we successfully set the suspend bit in
		 * the TTE.
		 */
again:
		sfmmu_copytte(&sfhmep->hme_tte, &tte);
		ASSERT(TTE_IS_VALID(&tte));

		ttemod = tte;
		TTE_SET_SUSPEND(&ttemod);
		if (sfmmu_modifytte_try(&tte, &ttemod,
		    &sfhmep->hme_tte) < 0)
			goto again;

		/*
		 * Invalidate TSB entry
		 */
		hmeblkp = sfmmu_hmetohblk(sfhmep);

		sfmmup = hblktosfmmu(hmeblkp);
		ASSERT(sfmmup == ksfmmup);
		ASSERT(!hmeblkp->hblk_shared);

		addr = tte_to_vaddr(hmeblkp, tte);

		/*
		 * No need to make sure that the TSB for this sfmmu is
		 * not being relocated since it is ksfmmup and thus it
		 * will never be relocated.
		 */
		SFMMU_UNLOAD_TSB(addr, sfmmup, hmeblkp, 0);

		/*
		 * Update xcall stats
		 */
		cpuset = cpu_ready_set;
		CPUSET_DEL(cpuset, CPU->cpu_id);

		/* LINTED: constant in conditional context */
		SFMMU_XCALL_STATS(ksfmmup);

		/*
		 * Flush TLB entry on remote CPU's
		 */
		xt_some(cpuset, vtag_flushpage_tl1, (uint64_t)addr,
		    (uint64_t)ksfmmup);

		/*
		 * Flush TLB entry on local CPU
		 */
		vtag_flushpage(addr, (uint64_t)ksfmmup);
	}

	while (index != 0) {
		index = index >> 1;
		if (index != 0)
			cons++;
		if (index & 0x1) {
			pp = PP_GROUPLEADER(pp, cons);
			goto retry;
		}
	}
}
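/*
 * DEBUG kernels keep a small ring buffer of recent kernel page relocation
 * attempts.  PAGE_RELOCATE_LOG() records the target and replacement pages,
 * the status, whether CPUs were paused, and a high-resolution timestamp,
 * wrapping around once N_PRLE entries have been used.
 */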
#ifdef	DEBUG

static struct prle page_relocate_log[N_PRLE];
static int prl_entry;
static kmutex_t prl_mutex;

#define	PAGE_RELOCATE_LOG(t, r, s, p)					\
	mutex_enter(&prl_mutex);					\
	page_relocate_log[prl_entry].targ = *(t);			\
	page_relocate_log[prl_entry].repl = *(r);			\
	page_relocate_log[prl_entry].status = (s);			\
	page_relocate_log[prl_entry].pausecpus = (p);			\
	page_relocate_log[prl_entry].whence = gethrtime();		\
	prl_entry = (prl_entry == (N_PRLE - 1))? 0 : prl_entry + 1;	\
	mutex_exit(&prl_mutex);

#else	/* !DEBUG */
#define	PAGE_RELOCATE_LOG(t, r, s, p)
#endif
6738 * Core Kernel Page Relocation Algorithm
6742 * target : constituent pages are SE_EXCL locked.
6743 * replacement: constituent pages are SE_EXCL locked.
6747 * nrelocp: number of pages relocated
6750 hat_page_relocate(page_t
**target
, page_t
**replacement
, spgcnt_t
*nrelocp
)
6752 page_t
*targ
, *repl
;
6754 kmutex_t
*low
, *high
;
6765 if (!kcage_on
|| PP_ISNORELOC(*target
)) {
6766 PAGE_RELOCATE_LOG(target
, replacement
, EAGAIN
, -1);
6770 mutex_enter(&kpr_mutex
);
6771 kreloc_thread
= curthread
;
6774 repl
= *replacement
;
6775 ASSERT(repl
!= NULL
);
6776 ASSERT(targ
->p_szc
== repl
->p_szc
);
6778 npages
= page_get_pagecnt(targ
->p_szc
);
6781 * unload VA<->PA mappings that are not locked
6784 for (i
= 0; i
< npages
; i
++) {
6785 (void) hat_pageunload(tpp
, SFMMU_KERNEL_RELOC
);
6790 * Do "presuspend" callbacks, in a context from which we can still
6791 * block as needed. Note that we don't hold the mapping list lock
6792 * of "targ" at this point due to potential locking order issues;
6793 * we assume that between the hat_pageunload() above and holding
6794 * the SE_EXCL lock that the mapping list *cannot* change at this
6797 ret
= hat_pageprocess_precallbacks(targ
, HAT_PRESUSPEND
, &cap_cpus
);
6800 * EIO translates to fatal error, for all others cleanup
6801 * and return EAGAIN.
6804 hat_pageprocess_postcallbacks(targ
, HAT_POSTUNSUSPEND
);
6805 PAGE_RELOCATE_LOG(target
, replacement
, ret
, -1);
6806 kreloc_thread
= NULL
;
6807 mutex_exit(&kpr_mutex
);
6812 * acquire p_mapping list lock for both the target and replacement
6815 * low and high refer to the need to grab the mlist locks in a
6816 * specific order in order to prevent race conditions. Thus the
6817 * lower lock must be grabbed before the higher lock.
6819 * This will block hat_unload's accessing p_mapping list. Since
6820 * we have SE_EXCL lock, hat_memload and hat_pageunload will be
6821 * blocked. Thus, no one else will be accessing the p_mapping list
6822 * while we suspend and reload the locked mapping below.
6826 sfmmu_mlist_reloc_enter(tpp
, rpp
, &low
, &high
);
6831 * We raise our PIL to 13 so that we don't get captured by
6832 * another CPU or pinned by an interrupt thread. We can't go to
6833 * PIL 14 since the nexus driver(s) may need to interrupt at
6834 * that level in the case of IOMMU pseudo mappings.
6836 cpuset
= cpu_ready_set
;
6837 CPUSET_DEL(cpuset
, CPU
->cpu_id
);
6838 if (!cap_cpus
|| CPUSET_ISNULL(cpuset
)) {
6839 old_pil
= splr(XCALL_PIL
);
6842 xc_attention(cpuset
);
6844 ASSERT(getpil() == XCALL_PIL
);
6847 * Now do suspend callbacks. In the case of an IOMMU mapping
6848 * this will suspend all DMA activity to the page while it is
6849 * being relocated. Since we are well above LOCK_LEVEL and CPUs
6850 * may be captured at this point we should have acquired any needed
6851 * locks in the presuspend callback.
6853 ret
= hat_pageprocess_precallbacks(targ
, HAT_SUSPEND
, NULL
);
6860 * Raise the PIL yet again, this time to block all high-level
6861 * interrupts on this CPU. This is necessary to prevent an
6862 * interrupt routine from pinning the thread which holds the
6863 * mapping suspended and then touching the suspended page.
6865 * Once the page is suspended we also need to be careful to
6866 * avoid calling any functions which touch any seg_kmem memory
6867 * since that memory may be backed by the very page we are
6868 * relocating in here!
6870 hat_pagesuspend(targ
);
6873 * Now that we are confident everybody has stopped using this page,
6874 * copy the page contents. Note we use a physical copy to prevent
6875 * locking issues and to avoid fpRAS because we can't handle it in
6878 for (i
= 0; i
< npages
; i
++, tpp
++, rpp
++) {
6881 * If the replacement has a different vcolor than
6882 * the one being replacd, we need to handle VAC
6883 * consistency for it just as we were setting up
6884 * a new mapping to it.
6886 if ((PP_GET_VCOLOR(rpp
) != NO_VCOLOR
) &&
6887 (tpp
->p_vcolor
!= rpp
->p_vcolor
) &&
6888 !CacheColor_IsFlushed(cflags
, PP_GET_VCOLOR(rpp
))) {
6889 CacheColor_SetFlushed(cflags
, PP_GET_VCOLOR(rpp
));
6890 sfmmu_cache_flushcolor(PP_GET_VCOLOR(rpp
),
6895 * Copy the contents of the page.
6897 ppcopy_kernel(tpp
, rpp
);
6902 for (i
= 0; i
< npages
; i
++, tpp
++, rpp
++) {
6904 * Copy attributes. VAC consistency was handled above,
6907 rpp
->p_nrm
= tpp
->p_nrm
;
6909 rpp
->p_index
= tpp
->p_index
;
6912 rpp
->p_vcolor
= tpp
->p_vcolor
;
6917 * First, unsuspend the page, if we set the suspend bit, and transfer
6918 * the mapping list from the target page to the replacement page.
6919 * Next process postcallbacks; since pa_hment's are linked only to the
6920 * p_mapping list of root page, we don't iterate over the constituent
6923 hat_pagereload(targ
, repl
);
6926 hat_pageprocess_postcallbacks(repl
, HAT_UNSUSPEND
);
6929 * Now lower our PIL and release any captured CPUs since we
6930 * are out of the "danger zone". After this it will again be
6931 * safe to acquire adaptive mutex locks, or to drop them...
6933 if (old_pil
!= -1) {
6936 xc_dismissed(cpuset
);
6941 sfmmu_mlist_reloc_exit(low
, high
);
6944 * Postsuspend callbacks should drop any locks held across
6945 * the suspend callbacks. As before, we don't hold the mapping
6946 * list lock at this point.. our assumption is that the mapping
6947 * list still can't change due to our holding SE_EXCL lock and
6948 * there being no unlocked mappings left. Hence the restriction
6949 * on calling context to hat_delete_callback()
6951 hat_pageprocess_postcallbacks(repl
, HAT_POSTUNSUSPEND
);
6954 * The second presuspend call failed: we got here through
6955 * the suspend_fail label above.
6958 PAGE_RELOCATE_LOG(target
, replacement
, ret
, cap_cpus
);
6959 kreloc_thread
= NULL
;
6960 mutex_exit(&kpr_mutex
);
6965 * Now that we're out of the performance critical section we can
6966 * take care of updating the hash table, since we still
6967 * hold all the pages locked SE_EXCL at this point we
6968 * needn't worry about things changing out from under us.
6972 for (i
= 0; i
< npages
; i
++, tpp
++, rpp
++) {
6975 * replace targ with replacement in page_hash table
6978 page_relocate_hash(rpp
, targ
);
6981 * concatenate target; caller of platform_page_relocate()
6982 * expects target to be concatenated after returning.
6984 ASSERT(targ
->p_next
== targ
);
6985 ASSERT(targ
->p_prev
== targ
);
6986 page_list_concat(&pl
, &targ
);
6989 ASSERT(*target
== pl
);
6991 PAGE_RELOCATE_LOG(target
, replacement
, 0, cap_cpus
);
6992 kreloc_thread
= NULL
;
6993 mutex_exit(&kpr_mutex
);
/*
 * Called when stray pa_hments are found attached to a page which is
 * being freed.  Notify the subsystem which attached the pa_hment of
 * the error if it registered a suitable handler, else panic.
 */
static void
sfmmu_pahment_leaked(struct pa_hment *pahmep)
{
	id_t cb_id = pahmep->cb_id;

	ASSERT(cb_id >= (id_t)0 && cb_id < sfmmu_cb_nextid);
	if (sfmmu_cb_table[cb_id].errhandler != NULL) {
		if (sfmmu_cb_table[cb_id].errhandler(pahmep->addr, pahmep->len,
		    HAT_CB_ERR_LEAKED, pahmep->pvt) == 0)
			return;		/* non-fatal */
	}
	panic("pa_hment leaked: 0x%p", (void *)pahmep);
}
7017 * Remove all mappings to page 'pp'.
7020 hat_pageunload(struct page
*pp
, uint_t forceflag
)
7022 struct page
*origpp
= pp
;
7023 struct sf_hment
*sfhme
, *tmphme
;
7024 struct hme_blk
*hmeblkp
;
7029 cpuset_t cpuset
, tset
;
7033 ASSERT(PAGE_EXCL(pp
));
7037 CPUSET_ZERO(cpuset
);
7039 pml
= sfmmu_mlist_enter(pp
);
7043 sfmmu_kpm_pageunload(pp
);
7044 ASSERT(!PP_ISMAPPED_KPM(pp
));
7047 * Clear vpm reference. Since the page is exclusively locked
7048 * vpm cannot be referencing it.
7054 index
= PP_MAPINDEX(pp
);
7057 for (sfhme
= pp
->p_mapping
; sfhme
; sfhme
= tmphme
) {
7058 tmphme
= sfhme
->hme_next
;
7060 if (IS_PAHME(sfhme
)) {
7061 ASSERT(sfhme
->hme_data
!= NULL
);
7066 hmeblkp
= sfmmu_hmetohblk(sfhme
);
7069 * If there are kernel mappings don't unload them, they will
7072 if (forceflag
== SFMMU_KERNEL_RELOC
&& hmeblkp
->hblk_lckcnt
&&
7073 hmeblkp
->hblk_tag
.htag_id
== ksfmmup
)
7076 tset
= sfmmu_pageunload(pp
, sfhme
, cons
);
7077 CPUSET_OR(cpuset
, tset
);
7080 while (index
!= 0) {
7085 /* Go to leading page */
7086 pp
= PP_GROUPLEADER(pp
, cons
);
7087 ASSERT(sfmmu_mlist_held(pp
));
7093 * cpuset may be empty if the page was only mapped by segkpm,
7094 * in which case we won't actually cross-trap.
7099 * The page should have no mappings at this point, unless
7100 * we were called from hat_page_relocate() in which case we
7101 * leave the locked mappings which will be suspended later.
7103 ASSERT(!PP_ISMAPPED(origpp
) || pa_hments
||
7104 (forceflag
== SFMMU_KERNEL_RELOC
));
7108 if (cons
== TTE8K
) {
7109 pmtx
= sfmmu_page_enter(pp
);
7111 sfmmu_page_exit(pmtx
);
7118 if (pa_hments
&& forceflag
!= SFMMU_KERNEL_RELOC
) {
7120 * Unlink any pa_hments and free them, calling back
7121 * the responsible subsystem to notify it of the error.
7122 * This can occur in situations such as drivers leaking
7123 * DMA handles: naughty, but common enough that we'd like
7124 * to keep the system running rather than bringing it
7125 * down with an obscure error like "pa_hment leaked"
7126 * which doesn't aid the user in debugging their driver.
7128 for (sfhme
= pp
->p_mapping
; sfhme
; sfhme
= tmphme
) {
7129 tmphme
= sfhme
->hme_next
;
7130 if (IS_PAHME(sfhme
)) {
7131 struct pa_hment
*pahmep
= sfhme
->hme_data
;
7132 sfmmu_pahment_leaked(pahmep
);
7134 kmem_cache_free(pa_hment_cache
, pahmep
);
7138 ASSERT(!PP_ISMAPPED(origpp
));
7141 sfmmu_mlist_exit(pml
);
7147 sfmmu_pageunload(page_t
*pp
, struct sf_hment
*sfhme
, int cons
)
7149 struct hme_blk
*hmeblkp
;
7161 ASSERT(sfmmu_mlist_held(pp
));
7162 ASSERT(!PP_ISKAS(pp
));
7164 CPUSET_ZERO(cpuset
);
7166 hmeblkp
= sfmmu_hmetohblk(sfhme
);
7169 sfmmu_copytte(&sfhme
->hme_tte
, &tte
);
7170 if (TTE_IS_VALID(&tte
)) {
7171 sfmmup
= hblktosfmmu(hmeblkp
);
7172 ttesz
= get_hblk_ttesz(hmeblkp
);
7174 * Only unload mappings of 'cons' size.
7180 * Note that we have p_mapping lock, but no hash lock here.
7181 * hblk_unload() has to have both hash lock AND p_mapping
7182 * lock before it tries to modify tte. So, the tte could
7183 * not become invalid in the sfmmu_modifytte_try() below.
7190 TTE_SET_INVALID(&ttemod
);
7191 ret
= sfmmu_modifytte_try(&tte
, &ttemod
, &sfhme
->hme_tte
);
7194 /* only R/M bits can change. */
7195 chk_tte(&orig_old
, &tte
, &ttemod
, hmeblkp
);
7201 panic("pageunload: cas failed?");
7204 addr
= tte_to_vaddr(hmeblkp
, tte
);
7206 if (hmeblkp
->hblk_shared
) {
7207 sf_srd_t
*srdp
= (sf_srd_t
*)sfmmup
;
7208 uint_t rid
= hmeblkp
->hblk_tag
.htag_rid
;
7210 ASSERT(SFMMU_IS_SHMERID_VALID(rid
));
7211 ASSERT(rid
< SFMMU_MAX_HME_REGIONS
);
7212 ASSERT(srdp
!= NULL
);
7213 rgnp
= srdp
->srd_hmergnp
[rid
];
7214 SFMMU_VALIDATE_SHAREDHBLK(hmeblkp
, srdp
, rgnp
, rid
);
7215 cpuset
= sfmmu_rgntlb_demap(addr
, rgnp
, hmeblkp
, 1);
7216 sfmmu_ttesync(NULL
, addr
, &tte
, pp
);
7217 ASSERT(rgnp
->rgn_ttecnt
[ttesz
] > 0);
7218 atomic_dec_ulong(&rgnp
->rgn_ttecnt
[ttesz
]);
7220 sfmmu_ttesync(sfmmup
, addr
, &tte
, pp
);
7221 atomic_dec_ulong(&sfmmup
->sfmmu_ttecnt
[ttesz
]);
7224 * We need to flush the page from the virtual cache
7225 * in order to prevent a virtual cache alias
7226 * inconsistency. The particular scenario we need
7227 * to worry about is:
7228 * Given: va1 and va2 are two virtual address that
7229 * alias and will map the same physical address.
7230 * 1. mapping exists from va1 to pa and data has
7231 * been read into the cache.
7233 * 3. load va2 and modify data using va2.
7235 * 5. load va1 and reference data. Unless we flush
7236 * the data cache when we unload we will get
7238 * This scenario is taken care of by using virtual
7241 if (sfmmup
->sfmmu_ismhat
) {
7243 * Flush TSBs, TLBs and caches
7245 * sharing this ism segment.
7247 sfmmu_hat_lock_all();
7248 mutex_enter(&ism_mlist_lock
);
7250 sfmmu_ismtlbcache_demap(addr
, sfmmup
, hmeblkp
,
7251 pp
->p_pagenum
, CACHE_NO_FLUSH
);
7253 mutex_exit(&ism_mlist_lock
);
7254 sfmmu_hat_unlock_all();
7255 cpuset
= cpu_ready_set
;
7257 sfmmu_tlb_demap(addr
, sfmmup
, hmeblkp
, 0, 0);
7258 cpuset
= sfmmup
->sfmmu_cpusran
;
7263 * Hme_sub has to run after ttesync() and a_rss update.
7264 * See hblk_unload().
7270 * We can not make ASSERT(hmeblkp->hblk_hmecnt <= NHMENTS)
7271 * since pteload may have done a HME_ADD() right after
7272 * we did the HME_SUB() above. Hmecnt is now maintained
7273 * by cas only. no lock guranteed its value. The only
7274 * gurantee we have is the hmecnt should not be less than
7275 * what it should be so the hblk will not be taken away.
7276 * It's also important that we decremented the hmecnt after
7277 * we are done with hmeblkp so that this hmeblk won't be
7280 ASSERT(hmeblkp
->hblk_hmecnt
> 0);
7281 ASSERT(hmeblkp
->hblk_vcnt
> 0);
7282 atomic_dec_16(&hmeblkp
->hblk_vcnt
);
7283 atomic_dec_16(&hmeblkp
->hblk_hmecnt
);
7285 * This is bug 4063182.
7287 * ASSERT(hmeblkp->hblk_hmecnt || hmeblkp->hblk_vcnt ||
7288 * !hmeblkp->hblk_lckcnt);
7291 panic("invalid tte? pp %p &tte %p",
7292 (void *)pp
, (void *)&tte
);
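/*
 * Note that hat_pagereload() runs at PIL_MAX with kpr_suspendlock held
 * (taken in hat_pagesuspend()); it transfers the mapping list from the
 * target to the replacement page, drops the lock and clears T_DONTDTRACE
 * once the relocation is finished.
 */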
/*
 * While relocating a kernel page, this function will move the mappings
 * from tpp to dpp and modify any associated data with these mappings.
 * It also unsuspends the suspended kernel mapping.
 */
static void
hat_pagereload(struct page *tpp, struct page *dpp)
{
	int index;
	int cons;
	struct sf_hment *sfhme;
	tte_t tte, ttemod;

	ASSERT(getpil() == PIL_MAX);
	ASSERT(sfmmu_mlist_held(tpp));
	ASSERT(sfmmu_mlist_held(dpp));

	index = PP_MAPINDEX(tpp);
	cons = TTE8K;

	/* Update real mappings to the page */
retry:
	for (sfhme = tpp->p_mapping; sfhme != NULL; sfhme = sfhme->hme_next) {
		if (IS_PAHME(sfhme))
			continue;
		sfmmu_copytte(&sfhme->hme_tte, &tte);
		ttemod = tte;

		/*
		 * replace old pfn with new pfn in TTE
		 */
		PFN_TO_TTE(ttemod, dpp->p_pagenum);

		ASSERT(TTE_IS_SUSPEND(&ttemod));
		TTE_CLR_SUSPEND(&ttemod);

		if (sfmmu_modifytte_try(&tte, &ttemod, &sfhme->hme_tte) < 0)
			panic("hat_pagereload(): sfmmu_modifytte_try() failed");

		/*
		 * set hme_page point to new page
		 */
		sfhme->hme_page = dpp;
	}

	/*
	 * move p_mapping list from old page to new page
	 */
	dpp->p_mapping = tpp->p_mapping;
	tpp->p_mapping = NULL;
	dpp->p_share = tpp->p_share;
	tpp->p_share = 0;

	while (index != 0) {
		index = index >> 1;
		if (index != 0)
			cons++;
		if (index & 0x1) {
			tpp = PP_GROUPLEADER(tpp, cons);
			dpp = PP_GROUPLEADER(dpp, cons);
			goto retry;
		}
	}

	curthread->t_flag &= ~T_DONTDTRACE;
	mutex_exit(&kpr_suspendlock);
}
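/*
 * hat_pagesync() below gathers, and optionally clears, the hardware
 * ref/mod bits from every mapping of the page, honouring the
 * HAT_SYNC_STOPON_* flags that let the caller short-circuit the walk.
 */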
7369 hat_pagesync(struct page
*pp
, uint_t clearflag
)
7371 struct sf_hment
*sfhme
, *tmphme
= NULL
;
7372 struct hme_blk
*hmeblkp
;
7374 cpuset_t cpuset
, tset
;
7376 extern ulong_t po_share
;
7377 page_t
*save_pp
= pp
;
7381 CPUSET_ZERO(cpuset
);
7383 if (PP_ISRO(pp
) && (clearflag
& HAT_SYNC_STOPON_MOD
)) {
7384 return (PP_GENERIC_ATTR(pp
));
7387 if ((clearflag
& HAT_SYNC_ZERORM
) == 0) {
7388 if ((clearflag
& HAT_SYNC_STOPON_REF
) && PP_ISREF(pp
)) {
7389 return (PP_GENERIC_ATTR(pp
));
7391 if ((clearflag
& HAT_SYNC_STOPON_MOD
) && PP_ISMOD(pp
)) {
7392 return (PP_GENERIC_ATTR(pp
));
7394 if (clearflag
& HAT_SYNC_STOPON_SHARED
) {
7395 if (pp
->p_share
> po_share
) {
7396 hat_page_setattr(pp
, P_REF
);
7397 return (PP_GENERIC_ATTR(pp
));
7404 clearflag
&= ~HAT_SYNC_STOPON_SHARED
;
7405 pml
= sfmmu_mlist_enter(pp
);
7406 index
= PP_MAPINDEX(pp
);
7409 for (sfhme
= pp
->p_mapping
; sfhme
; sfhme
= tmphme
) {
7411 * We need to save the next hment on the list since
7412 * it is possible for pagesync to remove an invalid hment
7415 tmphme
= sfhme
->hme_next
;
7416 if (IS_PAHME(sfhme
))
7419 * If we are looking for large mappings and this hme doesn't
7420 * reach the range we are seeking, just ignore it.
7422 hmeblkp
= sfmmu_hmetohblk(sfhme
);
7424 if (hme_size(sfhme
) < cons
)
7428 if (hmeblkp
->hblk_shared
) {
7429 sf_srd_t
*srdp
= hblktosrd(hmeblkp
);
7430 uint_t rid
= hmeblkp
->hblk_tag
.htag_rid
;
7432 ASSERT(SFMMU_IS_SHMERID_VALID(rid
));
7433 ASSERT(rid
< SFMMU_MAX_HME_REGIONS
);
7434 ASSERT(srdp
!= NULL
);
7435 rgnp
= srdp
->srd_hmergnp
[rid
];
7436 SFMMU_VALIDATE_SHAREDHBLK(hmeblkp
, srdp
,
7438 shcnt
+= rgnp
->rgn_refcnt
;
7442 if (shcnt
> po_share
) {
7444 * tell the pager to spare the page this time
7447 hat_page_setattr(save_pp
, P_REF
);
7452 tset
= sfmmu_pagesync(pp
, sfhme
,
7453 clearflag
& ~HAT_SYNC_STOPON_RM
);
7454 CPUSET_OR(cpuset
, tset
);
7457 * If clearflag is HAT_SYNC_DONTZERO, break out as soon
7458 * as the "ref" or "mod" is set or share cnt exceeds po_share.
7460 if ((clearflag
& ~HAT_SYNC_STOPON_RM
) == HAT_SYNC_DONTZERO
&&
7461 (((clearflag
& HAT_SYNC_STOPON_MOD
) && PP_ISMOD(save_pp
)) ||
7462 ((clearflag
& HAT_SYNC_STOPON_REF
) && PP_ISREF(save_pp
)))) {
7472 /* Go to leading page */
7473 pp
= PP_GROUPLEADER(pp
, cons
);
7479 sfmmu_mlist_exit(pml
);
7480 return (PP_GENERIC_ATTR(save_pp
));
7484 * Get all the hardware dependent attributes for a page struct
7487 sfmmu_pagesync(struct page
*pp
, struct sf_hment
*sfhme
,
7492 struct hme_blk
*hmeblkp
;
7498 ASSERT(sfmmu_mlist_held(pp
));
7499 ASSERT((clearflag
== HAT_SYNC_DONTZERO
) ||
7500 (clearflag
== HAT_SYNC_ZERORM
));
7502 SFMMU_STAT(sf_pagesync
);
7504 CPUSET_ZERO(cpuset
);
7506 sfmmu_pagesync_retry
:
7508 sfmmu_copytte(&sfhme
->hme_tte
, &tte
);
7509 if (TTE_IS_VALID(&tte
)) {
7510 hmeblkp
= sfmmu_hmetohblk(sfhme
);
7511 sfmmup
= hblktosfmmu(hmeblkp
);
7512 addr
= tte_to_vaddr(hmeblkp
, tte
);
7513 if (clearflag
== HAT_SYNC_ZERORM
) {
7515 TTE_CLR_RM(&ttemod
);
7516 ret
= sfmmu_modifytte_try(&tte
, &ttemod
,
7520 * cas failed and the new value is not what
7523 goto sfmmu_pagesync_retry
;
7527 /* we win the cas */
7528 if (hmeblkp
->hblk_shared
) {
7529 sf_srd_t
*srdp
= (sf_srd_t
*)sfmmup
;
7531 hmeblkp
->hblk_tag
.htag_rid
;
7533 ASSERT(SFMMU_IS_SHMERID_VALID(rid
));
7534 ASSERT(rid
< SFMMU_MAX_HME_REGIONS
);
7535 ASSERT(srdp
!= NULL
);
7536 rgnp
= srdp
->srd_hmergnp
[rid
];
7537 SFMMU_VALIDATE_SHAREDHBLK(hmeblkp
,
7539 cpuset
= sfmmu_rgntlb_demap(addr
,
7542 sfmmu_tlb_demap(addr
, sfmmup
, hmeblkp
,
7544 cpuset
= sfmmup
->sfmmu_cpusran
;
7548 sfmmu_ttesync(hmeblkp
->hblk_shared
? NULL
: sfmmup
, addr
,
7555 * Remove write permission from a mappings to a page, so that
7556 * we can detect the next modification of it. This requires modifying
7557 * the TTE then invalidating (demap) any TLB entry using that TTE.
7558 * This code is similar to sfmmu_pagesync().
7561 sfmmu_pageclrwrt(struct page
*pp
, struct sf_hment
*sfhme
)
7566 struct hme_blk
*hmeblkp
;
7572 ASSERT(sfmmu_mlist_held(pp
));
7574 CPUSET_ZERO(cpuset
);
7575 SFMMU_STAT(sf_clrwrt
);
7579 sfmmu_copytte(&sfhme
->hme_tte
, &tte
);
7580 if (TTE_IS_VALID(&tte
) && TTE_IS_WRITABLE(&tte
)) {
7581 hmeblkp
= sfmmu_hmetohblk(sfhme
);
7582 sfmmup
= hblktosfmmu(hmeblkp
);
7583 addr
= tte_to_vaddr(hmeblkp
, tte
);
7586 TTE_CLR_WRT(&ttemod
);
7587 TTE_CLR_MOD(&ttemod
);
7588 ret
= sfmmu_modifytte_try(&tte
, &ttemod
, &sfhme
->hme_tte
);
7591 * if cas failed and the new value is not what
7597 /* we win the cas */
7599 if (hmeblkp
->hblk_shared
) {
7600 sf_srd_t
*srdp
= (sf_srd_t
*)sfmmup
;
7601 uint_t rid
= hmeblkp
->hblk_tag
.htag_rid
;
7603 ASSERT(SFMMU_IS_SHMERID_VALID(rid
));
7604 ASSERT(rid
< SFMMU_MAX_HME_REGIONS
);
7605 ASSERT(srdp
!= NULL
);
7606 rgnp
= srdp
->srd_hmergnp
[rid
];
7607 SFMMU_VALIDATE_SHAREDHBLK(hmeblkp
,
7609 cpuset
= sfmmu_rgntlb_demap(addr
,
7612 sfmmu_tlb_demap(addr
, sfmmup
, hmeblkp
, 0, 0);
7613 cpuset
= sfmmup
->sfmmu_cpusran
;
/*
 * Walk all mappings of a page, removing write permission and clearing the
 * ref/mod bits. This code is similar to hat_pagesync()
 */
static void
hat_page_clrwrt(page_t *pp)
{
	struct sf_hment *sfhme;
	struct sf_hment *tmphme = NULL;
	kmutex_t *pml;
	uint_t index;
	int	cons;
	cpuset_t cpuset;
	cpuset_t tset;

	CPUSET_ZERO(cpuset);

	pml = sfmmu_mlist_enter(pp);
	index = PP_MAPINDEX(pp);
	cons = TTE8K;
retry:
	for (sfhme = pp->p_mapping; sfhme; sfhme = tmphme) {
		tmphme = sfhme->hme_next;

		/*
		 * If we are looking for large mappings and this hme doesn't
		 * reach the range we are seeking, just ignore it.
		 */

		if (hme_size(sfhme) < cons)
			continue;

		tset = sfmmu_pageclrwrt(pp, sfhme);
		CPUSET_OR(cpuset, tset);
	}

	while (index) {
		index = index >> 1;
		if (index)
			cons++;
		if (index & 0x1) {
			/* Go to leading page */
			pp = PP_GROUPLEADER(pp, cons);
			goto retry;
		}
	}

	xt_sync(cpuset);
	sfmmu_mlist_exit(pml);
}
/*
 * Set the given REF/MOD/RO bits for the given page.
 * For a vnode with a sorted v_pages list, we need to change
 * the attributes and the v_pages list together under page_vnode_mutex.
 */
void
hat_page_setattr(page_t *pp, uint_t flag)
{
	vnode_t *vp = pp->p_vnode;
	page_t **listp;
	kmutex_t *pmtx;
	kmutex_t *vphm = NULL;
	int noshuffle;

	noshuffle = flag & P_NSH;
	flag &= ~P_NSH;

	ASSERT(!(flag & ~(P_MOD | P_REF | P_RO)));

	/*
	 * nothing to do if attribute already set
	 */
	if ((pp->p_nrm & flag) == flag)
		return;

	if ((flag & P_MOD) != 0 && vp != NULL && IS_VMODSORT(vp) &&
	    !noshuffle) {
		vphm = page_vnode_mutex(vp);
		mutex_enter(vphm);
	}

	pmtx = sfmmu_page_enter(pp);
	pp->p_nrm |= flag;
	sfmmu_page_exit(pmtx);

	if (vphm != NULL) {
		/*
		 * Some File Systems examine v_pages for NULL w/o
		 * grabbing the vphm mutex. Must not let it become NULL when
		 * pp is the only page on the list.
		 */
		if (pp->p_vpnext != pp) {
			page_vpsub(&vp->v_pages, pp);
			if (vp->v_pages != NULL)
				listp = &vp->v_pages->p_vpprev->p_vpnext;
			else
				listp = &vp->v_pages;
			page_vpadd(listp, pp);
		}
		mutex_exit(vphm);
	}
}
void
hat_page_clrattr(page_t *pp, uint_t flag)
{
	vnode_t *vp = pp->p_vnode;
	kmutex_t *pmtx;

	ASSERT(!(flag & ~(P_MOD | P_REF | P_RO)));

	pmtx = sfmmu_page_enter(pp);

	/*
	 * Caller is expected to hold page's io lock for VMODSORT to work
	 * correctly with pvn_vplist_dirty() and pvn_getdirty() when mod
	 * bit is cleared.
	 * We don't have assert to avoid tripping some existing third party
	 * code. The dirty page is moved back to top of the v_page list
	 * after IO is done in pvn_write_done().
	 */
	pp->p_nrm &= ~flag;
	sfmmu_page_exit(pmtx);

	if ((flag & P_MOD) != 0 && vp != NULL && IS_VMODSORT(vp)) {

		/*
		 * VMODSORT works by removing write permissions and getting
		 * a fault when a page is made dirty. At this point
		 * we need to remove write permission from all mappings
		 * to this page.
		 */
		hat_page_clrwrt(pp);
	}
}
uint_t
hat_page_getattr(page_t *pp, uint_t flag)
{
	ASSERT(!(flag & ~(P_MOD | P_REF | P_RO)));
	return ((uint_t)(pp->p_nrm & flag));
}
/*
 * DEBUG kernels: verify that a kernel va<->pa translation
 * is safe by checking the underlying page_t is in a page
 * relocation-safe state.
 */
void
sfmmu_check_kpfn(pfn_t pfn)
{
	page_t *pp;
	int index, cons;

	if (hat_check_vtop == 0)
		return;

	if (kvseg.s_base == NULL || panicstr)
		return;

	pp = page_numtopp_nolock(pfn);
	if (pp == NULL)
		return;

	if (PAGE_LOCKED(pp) || PP_ISNORELOC(pp))
		return;

	/*
	 * Handed a large kernel page, we dig up the root page since we
	 * know the root page might have the lock also.
	 */
	if (pp->p_szc != 0) {
		index = PP_MAPINDEX(pp);
		cons = TTE8K;
		while (index != 0) {
			index >>= 1;
			if (index != 0)
				cons++;
		}
		pp = PP_GROUPLEADER(pp, cons);
		if (PAGE_LOCKED(pp) || PP_ISNORELOC(pp))
			return;
	}

	/*
	 * Pages need to be locked or allocated "permanent" (either from
	 * static_arena arena or explicitly setting PG_NORELOC when calling
	 * page_create_va()) for VA->PA translations to be valid.
	 */
	if (!PP_ISNORELOC(pp))
		panic("Illegal VA->PA translation, pp 0x%p not permanent",
		    (void *)pp);
	else
		panic("Illegal VA->PA translation, pp 0x%p not locked",
		    (void *)pp);
}
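/*
 * hat_getpfnum() below special-cases three classes of kernel addresses:
 * large-page segkmem VAs, segkpm VAs, and ordinary kernel VAs (where it
 * retries while a mapping is suspended for relocation).  User addresses
 * are resolved through sfmmu_uvatopfn().
 */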
/*
 * Returns a page frame number for a given virtual address.
 * Returns PFN_INVALID to indicate an invalid mapping
 */
pfn_t
hat_getpfnum(struct hat *hat, caddr_t addr)
{
	pfn_t pfn;
	tte_t tte;

	/*
	 * We would like to
	 * ASSERT(AS_LOCK_HELD(as));
	 * but we can't because the iommu driver will call this
	 * routine at interrupt time and it can't grab the as lock
	 * or it will deadlock: A thread could have the as lock
	 * and be waiting for io.  The io can't complete
	 * because the interrupt thread is blocked trying to grab
	 * the as lock.
	 */

	if (hat == ksfmmup) {
		if (IS_KMEM_VA_LARGEPAGE(addr)) {
			ASSERT(segkmem_lpszc > 0);
			pfn = sfmmu_kvaszc2pfn(addr, segkmem_lpszc);
			if (pfn != PFN_INVALID) {
				sfmmu_check_kpfn(pfn);
				return (pfn);
			}
		} else if (segkpm && IS_KPM_ADDR(addr)) {
			return (sfmmu_kpm_vatopfn(addr));
		}
		while ((pfn = sfmmu_vatopfn(addr, ksfmmup, &tte))
		    == PFN_SUSPENDED) {
			sfmmu_vatopfn_suspended(addr, ksfmmup, &tte);
		}
		sfmmu_check_kpfn(pfn);
		return (pfn);
	} else {
		return (sfmmu_uvatopfn(addr, hat, NULL));
	}
}
7869 * This routine will return both pfn and tte for the vaddr.
7872 sfmmu_uvatopfn(caddr_t vaddr
, struct hat
*sfmmup
, tte_t
*ttep
)
7874 struct hmehash_bucket
*hmebp
;
7876 int hmeshift
, hashno
= 1;
7877 struct hme_blk
*hmeblkp
= NULL
;
7880 struct sf_hment
*sfhmep
;
7883 /* support for ISM */
7885 ism_blk_t
*ism_blkp
;
7887 sfmmu_t
*ism_hatid
= NULL
;
7888 sfmmu_t
*locked_hatid
= NULL
;
7889 sfmmu_t
*sv_sfmmup
= sfmmup
;
7890 caddr_t sv_vaddr
= vaddr
;
7899 ASSERT(sfmmup
!= ksfmmup
);
7900 SFMMU_STAT(sf_user_vtop
);
7902 * Set ism_hatid if vaddr falls in a ISM segment.
7904 ism_blkp
= sfmmup
->sfmmu_iblk
;
7905 if (ism_blkp
!= NULL
) {
7906 sfmmu_ismhat_enter(sfmmup
, 0);
7907 locked_hatid
= sfmmup
;
7909 while (ism_blkp
!= NULL
&& ism_hatid
== NULL
) {
7910 ism_map
= ism_blkp
->iblk_maps
;
7911 for (i
= 0; ism_map
[i
].imap_ismhat
&& i
< ISM_MAP_SLOTS
; i
++) {
7912 if (vaddr
>= ism_start(ism_map
[i
]) &&
7913 vaddr
< ism_end(ism_map
[i
])) {
7914 sfmmup
= ism_hatid
= ism_map
[i
].imap_ismhat
;
7915 vaddr
= (caddr_t
)(vaddr
-
7916 ism_start(ism_map
[i
]));
7920 ism_blkp
= ism_blkp
->iblk_next
;
7923 sfmmu_ismhat_exit(locked_hatid
, 0);
7926 hblktag
.htag_id
= sfmmup
;
7927 hblktag
.htag_rid
= SFMMU_INVALID_SHMERID
;
7929 hmeshift
= HME_HASH_SHIFT(hashno
);
7930 hblktag
.htag_bspage
= HME_HASH_BSPAGE(vaddr
, hmeshift
);
7931 hblktag
.htag_rehash
= hashno
;
7932 hmebp
= HME_HASH_FUNCTION(sfmmup
, vaddr
, hmeshift
);
7934 SFMMU_HASH_LOCK(hmebp
);
7936 HME_HASH_FAST_SEARCH(hmebp
, hblktag
, hmeblkp
);
7937 if (hmeblkp
!= NULL
) {
7938 ASSERT(!hmeblkp
->hblk_shared
);
7939 HBLKTOHME(sfhmep
, hmeblkp
, vaddr
);
7940 sfmmu_copytte(&sfhmep
->hme_tte
, ttep
);
7941 SFMMU_HASH_UNLOCK(hmebp
);
7942 if (TTE_IS_VALID(ttep
)) {
7943 pfn
= TTE_TO_PFN(vaddr
, ttep
);
7948 SFMMU_HASH_UNLOCK(hmebp
);
7950 } while (HME_REHASH(sfmmup
) && (hashno
<= mmu_hashcnt
));
7952 if (SF_HMERGNMAP_ISNULL(sv_sfmmup
)) {
7953 return (PFN_INVALID
);
7955 srdp
= sv_sfmmup
->sfmmu_srdp
;
7956 ASSERT(srdp
!= NULL
);
7957 ASSERT(srdp
->srd_refcnt
!= 0);
7958 hblktag
.htag_id
= srdp
;
7961 hmeshift
= HME_HASH_SHIFT(hashno
);
7962 hblktag
.htag_bspage
= HME_HASH_BSPAGE(sv_vaddr
, hmeshift
);
7963 hblktag
.htag_rehash
= hashno
;
7964 hmebp
= HME_HASH_FUNCTION(srdp
, sv_vaddr
, hmeshift
);
7966 SFMMU_HASH_LOCK(hmebp
);
7967 for (hmeblkp
= hmebp
->hmeblkp
; hmeblkp
!= NULL
;
7968 hmeblkp
= hmeblkp
->hblk_next
) {
7974 if (!HTAGS_EQ_SHME(hmeblkp
->hblk_tag
, hblktag
,
7975 sv_sfmmup
->sfmmu_hmeregion_map
)) {
7978 ASSERT(hmeblkp
->hblk_shared
);
7979 rid
= hmeblkp
->hblk_tag
.htag_rid
;
7980 ASSERT(SFMMU_IS_SHMERID_VALID(rid
));
7981 ASSERT(rid
< SFMMU_MAX_HME_REGIONS
);
7982 rgnp
= srdp
->srd_hmergnp
[rid
];
7983 SFMMU_VALIDATE_SHAREDHBLK(hmeblkp
, srdp
, rgnp
, rid
);
7984 HBLKTOHME(sfhmep
, hmeblkp
, sv_vaddr
);
7985 sfmmu_copytte(&sfhmep
->hme_tte
, ttep
);
7986 rsaddr
= rgnp
->rgn_saddr
;
7987 readdr
= rsaddr
+ rgnp
->rgn_size
;
7989 if (TTE_IS_VALID(ttep
) ||
7990 get_hblk_ttesz(hmeblkp
) > TTE8K
) {
7991 caddr_t eva
= tte_to_evaddr(hmeblkp
, ttep
);
7992 ASSERT(eva
> sv_vaddr
);
7993 ASSERT(sv_vaddr
>= rsaddr
);
7994 ASSERT(sv_vaddr
< readdr
);
7995 ASSERT(eva
<= readdr
);
7999 * Continue the search if we
8000 * found an invalid 8K tte outside of the area
8001 * covered by this hmeblk's region.
8003 if (TTE_IS_VALID(ttep
)) {
8004 SFMMU_HASH_UNLOCK(hmebp
);
8005 pfn
= TTE_TO_PFN(sv_vaddr
, ttep
);
8007 } else if (get_hblk_ttesz(hmeblkp
) > TTE8K
||
8008 (sv_vaddr
>= rsaddr
&& sv_vaddr
< readdr
)) {
8009 SFMMU_HASH_UNLOCK(hmebp
);
8014 SFMMU_HASH_UNLOCK(hmebp
);
8016 } while (hashno
<= mmu_hashcnt
);
8017 return (PFN_INVALID
);
/*
 * For compatibility with AT&T and later optimizations
 */
/* ARGSUSED */
void
hat_map(struct hat *hat, caddr_t addr, size_t len, uint_t flags)
{
	ASSERT(hat != NULL);
}
/*
 * Return the number of mappings to a particular page. This number is an
 * approximation of the number of people sharing the page.
 *
 * shared hmeblks or ism hmeblks are counted as 1 mapping here.
 * hat_page_checkshare() can be used to compare threshold to share
 * count that reflects the number of region sharers albeit at higher cost.
 */
ulong_t
hat_page_getshare(page_t *pp)
{
	page_t *spp = pp;	/* start page */
	kmutex_t *pml;
	ulong_t	cnt;
	int index, sz = TTE64K;

	/*
	 * We need to grab the mlist lock to make sure any outstanding
	 * load/unloads complete.  Otherwise we could return zero
	 * even though the unload(s) hasn't finished yet.
	 */
	pml = sfmmu_mlist_enter(spp);
	cnt = spp->p_share;

	if (kpm_enable)
		cnt += spp->p_kpmref;

	if (vpm_enable && pp->p_vpmref) {
		cnt += 1;
	}

	/*
	 * If we have any large mappings, we count the number of
	 * mappings that this large page is part of.
	 */
	index = PP_MAPINDEX(spp);
	index >>= 1;
	while (index) {
		pp = PP_GROUPLEADER(spp, sz);
		if ((index & 0x1) && pp != spp) {
			cnt += pp->p_share;
			spp = pp;
		}
		index >>= 1;
		sz++;
	}
	sfmmu_mlist_exit(pml);

	return (cnt);
}
8083 * Return 1 if the number of mappings exceeds sh_thresh. Return 0
8084 * otherwise. Count shared hmeblks by region's refcnt.
8087 hat_page_checkshare(page_t
*pp
, ulong_t sh_thresh
)
8091 int index
, sz
= TTE8K
;
8092 struct sf_hment
*sfhme
, *tmphme
= NULL
;
8093 struct hme_blk
*hmeblkp
;
8095 pml
= sfmmu_mlist_enter(pp
);
8102 if (vpm_enable
&& pp
->p_vpmref
) {
8106 if (pp
->p_share
+ cnt
> sh_thresh
) {
8107 sfmmu_mlist_exit(pml
);
8111 index
= PP_MAPINDEX(pp
);
8114 for (sfhme
= pp
->p_mapping
; sfhme
; sfhme
= tmphme
) {
8115 tmphme
= sfhme
->hme_next
;
8116 if (IS_PAHME(sfhme
)) {
8120 hmeblkp
= sfmmu_hmetohblk(sfhme
);
8121 if (hme_size(sfhme
) != sz
) {
8125 if (hmeblkp
->hblk_shared
) {
8126 sf_srd_t
*srdp
= hblktosrd(hmeblkp
);
8127 uint_t rid
= hmeblkp
->hblk_tag
.htag_rid
;
8129 ASSERT(SFMMU_IS_SHMERID_VALID(rid
));
8130 ASSERT(rid
< SFMMU_MAX_HME_REGIONS
);
8131 ASSERT(srdp
!= NULL
);
8132 rgnp
= srdp
->srd_hmergnp
[rid
];
8133 SFMMU_VALIDATE_SHAREDHBLK(hmeblkp
, srdp
,
8135 cnt
+= rgnp
->rgn_refcnt
;
8139 if (cnt
> sh_thresh
) {
8140 sfmmu_mlist_exit(pml
);
8148 pp
= PP_GROUPLEADER(pp
, sz
);
8149 ASSERT(sfmmu_mlist_held(pp
));
8156 sfmmu_mlist_exit(pml
);
8161 * Unload all large mappings to the pp and reset the p_szc field of every
8162 * constituent page according to the remaining mappings.
8164 * pp must be locked SE_EXCL. Even though no other constituent pages are
8165 * locked it's legal to unload the large mappings to the pp because all
8166 * constituent pages of large locked mappings have to be locked SE_SHARED.
8167 * This means if we have SE_EXCL lock on one of constituent pages none of the
8168 * large mappings to pp are locked.
8170 * Decrease p_szc field starting from the last constituent page and ending
8171 * with the root page. This method is used because other threads rely on the
8172 * root's p_szc to find the lock to syncronize on. After a root page_t's p_szc
8173 * is demoted then other threads will succeed in sfmmu_mlspl_enter(). This
8174 * ensures that p_szc changes of the constituent pages appears atomic for all
8175 * threads that use sfmmu_mlspl_enter() to examine p_szc field.
8177 * This mechanism is only used for file system pages where it's not always
8178 * possible to get SE_EXCL locks on all constituent pages to demote the size
8179 * code (as is done for anonymous or kernel large pages).
8181 * See more comments in front of sfmmu_mlspl_enter().
8184 hat_page_demote(page_t
*pp
)
8191 struct sf_hment
*sfhme
;
8192 struct sf_hment
*tmphme
= NULL
;
8193 struct hme_blk
*hmeblkp
;
8199 kmutex_t
*pmtx
= NULL
;
8201 ASSERT(PAGE_EXCL(pp
));
8202 ASSERT(!PP_ISFREE(pp
));
8203 ASSERT(!PP_ISKAS(pp
));
8204 ASSERT(page_szc_lock_assert(pp
));
8205 pml
= sfmmu_mlist_enter(pp
);
8212 index
= PP_MAPINDEX(pp
) >> 1;
8215 CPUSET_ZERO(cpuset
);
8221 if (!(index
& 0x1)) {
8227 rootpp
= PP_GROUPLEADER(pp
, sz
);
8228 for (sfhme
= rootpp
->p_mapping
; sfhme
; sfhme
= tmphme
) {
8229 tmphme
= sfhme
->hme_next
;
8230 ASSERT(!IS_PAHME(sfhme
));
8231 hmeblkp
= sfmmu_hmetohblk(sfhme
);
8232 if (hme_size(sfhme
) != sz
) {
8235 tset
= sfmmu_pageunload(rootpp
, sfhme
, sz
);
8236 CPUSET_OR(cpuset
, tset
);
8243 ASSERT(!PP_ISMAPPED_LARGE(pp
));
8249 conv_tnc(rootpp
, sz
);
8254 pmtx
= sfmmu_page_enter(pp
);
8256 ASSERT(pp
->p_szc
== pszc
);
8257 rootpp
= PP_PAGEROOT(pp
);
8258 ASSERT(rootpp
->p_szc
== pszc
);
8259 lastpp
= PP_PAGENEXT_N(rootpp
, TTEPAGES(pszc
) - 1);
8261 while (lastpp
!= rootpp
) {
8262 sz
= PP_MAPINDEX(lastpp
) ? fnd_mapping_sz(lastpp
) : 0;
8264 npgs
= (sz
== 0) ? 1 : TTEPAGES(sz
);
8265 ASSERT(P2PHASE(lastpp
->p_pagenum
, npgs
) == npgs
- 1);
8266 while (--npgs
> 0) {
8267 lastpp
->p_szc
= (uchar_t
)sz
;
8268 lastpp
= PP_PAGEPREV(lastpp
);
8272 * make sure before current root's pszc
8273 * is updated all updates to constituent pages pszc
8274 * fields are globally visible.
8279 ASSERT(IS_P2ALIGNED(lastpp
->p_pagenum
, TTEPAGES(sz
)));
8280 if (lastpp
!= rootpp
) {
8281 lastpp
= PP_PAGEPREV(lastpp
);
8285 /* the loop above doesn't cover this case */
8289 ASSERT(pp
->p_szc
== 0);
8291 sfmmu_page_exit(pmtx
);
8293 sfmmu_mlist_exit(pml
);
/*
 * Refresh the HAT ismttecnt[] element for size szc.
 * Caller must have set ISM busy flag to prevent mapping
 * lists from changing while we're traversing them.
 */
static pgcnt_t
ism_tsb_entries(sfmmu_t *sfmmup, int szc)
{
	ism_blk_t	*ism_blkp = sfmmup->sfmmu_iblk;
	ism_map_t	*ism_map;
	pgcnt_t		npgs = 0;
	pgcnt_t		npgs_scd = 0;
	int		j;
	sf_scd_t	*scdp;
	uchar_t		rid;

	ASSERT(SFMMU_FLAGS_ISSET(sfmmup, HAT_ISMBUSY));
	scdp = sfmmup->sfmmu_scdp;

	for (; ism_blkp != NULL; ism_blkp = ism_blkp->iblk_next) {
		ism_map = ism_blkp->iblk_maps;
		for (j = 0; ism_map[j].imap_ismhat && j < ISM_MAP_SLOTS; j++) {
			rid = ism_map[j].imap_rid;
			ASSERT(rid == SFMMU_INVALID_ISMRID ||
			    rid < sfmmup->sfmmu_srdp->srd_next_ismrid);

			if (scdp != NULL && rid != SFMMU_INVALID_ISMRID &&
			    SF_RGNMAP_TEST(scdp->scd_ismregion_map, rid)) {
				/* ISM is in sfmmup's SCD */
				npgs_scd +=
				    ism_map[j].imap_ismhat->sfmmu_ttecnt[szc];
			} else {
				/* ISM is not in SCD */
				npgs +=
				    ism_map[j].imap_ismhat->sfmmu_ttecnt[szc];
			}
		}
	}
	sfmmup->sfmmu_ismttecnt[szc] = npgs;
	sfmmup->sfmmu_scdismttecnt[szc] = npgs_scd;
	return (npgs);
}
/*
 * Yield the memory claim requirement for an address space.
 *
 * This is currently implemented as the number of bytes that have active
 * hardware translations that have page structures.  Therefore, it can
 * underestimate the traditional resident set size, eg, if the
 * physical page is present and the hardware translation is missing;
 * and it can overestimate the rss, eg, if there are active
 * translations to a frame buffer with page structs.
 * Also, it does not take sharing into account.
 *
 * Note that we don't acquire locks here since this function is most often
 * called from the clock thread.
 */
size_t
hat_get_mapped_size(struct hat *hat)
{
	size_t		assize = 0;
	int		i;

	if (hat == NULL)
		return (0);

	for (i = 0; i < mmu_page_sizes; i++)
		assize += ((pgcnt_t)hat->sfmmu_ttecnt[i] +
		    (pgcnt_t)hat->sfmmu_scdrttecnt[i]) * TTEBYTES(i);

	if (hat->sfmmu_iblk == NULL)
		return (assize);

	for (i = 0; i < mmu_page_sizes; i++)
		assize += ((pgcnt_t)hat->sfmmu_ismttecnt[i] +
		    (pgcnt_t)hat->sfmmu_scdismttecnt[i]) * TTEBYTES(i);

	return (assize);
}
int
hat_stats_enable(struct hat *hat)
{
	hatlock_t	*hatlockp;

	hatlockp = sfmmu_hat_enter(hat);
	hat->sfmmu_rmstat++;
	sfmmu_hat_exit(hatlockp);
	return (1);
}

void
hat_stats_disable(struct hat *hat)
{
	hatlock_t	*hatlockp;

	hatlockp = sfmmu_hat_enter(hat);
	hat->sfmmu_rmstat--;
	sfmmu_hat_exit(hatlockp);
}
/*
 * Routines for entering or removing ourselves from the
 * ism_hat's mapping list. This is used for both private and
 * shared hats.
 */
static void
iment_add(struct ism_ment *iment, struct hat *ism_hat)
{
	ASSERT(MUTEX_HELD(&ism_mlist_lock));

	iment->iment_prev = NULL;
	iment->iment_next = ism_hat->sfmmu_iment;
	if (ism_hat->sfmmu_iment) {
		ism_hat->sfmmu_iment->iment_prev = iment;
	}
	ism_hat->sfmmu_iment = iment;
}

static void
iment_sub(struct ism_ment *iment, struct hat *ism_hat)
{
	ASSERT(MUTEX_HELD(&ism_mlist_lock));

	if (ism_hat->sfmmu_iment == NULL) {
		panic("ism map entry remove - no entries");
	}

	if (iment->iment_prev) {
		ASSERT(ism_hat->sfmmu_iment != iment);
		iment->iment_prev->iment_next = iment->iment_next;
	} else {
		ASSERT(ism_hat->sfmmu_iment == iment);
		ism_hat->sfmmu_iment = iment->iment_next;
	}

	if (iment->iment_next) {
		iment->iment_next->iment_prev = iment->iment_prev;
	}

	/*
	 * zero out the entry
	 */
	iment->iment_next = NULL;
	iment->iment_prev = NULL;
	iment->iment_hat = NULL;
	iment->iment_base_va = 0;
}
8446 * Hat_share()/unshare() return an (non-zero) error
8447 * when saddr and daddr are not properly aligned.
8449 * The top level mapping element determines the alignment
8450 * requirement for saddr and daddr, depending on different
8453 * When hat_share()/unshare() are not supported,
8454 * HATOP_SHARE()/UNSHARE() return 0
8457 hat_share(struct hat
*sfmmup
, caddr_t addr
,
8458 struct hat
*ism_hatid
, caddr_t sptaddr
, size_t len
, uint_t ismszc
)
8460 ism_blk_t
*ism_blkp
;
8461 ism_blk_t
*new_iblk
;
8463 ism_ment_t
*ism_ment
;
8465 hatlock_t
*hatlockp
;
8467 uint_t ismshift
= page_get_shift(ismszc
);
8468 size_t ismpgsz
= page_get_pagesize(ismszc
);
8469 uint_t ismmask
= (uint_t
)ismpgsz
- 1;
8470 size_t sh_size
= ISM_SHIFT(ismshift
, len
);
8471 ushort_t ismhatflag
;
8472 hat_region_cookie_t rcookie
;
8476 caddr_t eaddr
= addr
+ len
;
8479 ASSERT(ism_hatid
!= NULL
&& sfmmup
!= NULL
);
8480 ASSERT(sptaddr
== ISMID_STARTADDR
);
8482 * Check the alignment.
8484 if (!ISM_ALIGNED(ismshift
, addr
) || !ISM_ALIGNED(ismshift
, sptaddr
))
8488 * Check size alignment.
8490 if (!ISM_ALIGNED(ismshift
, len
))
8494 * Allocate ism_ment for the ism_hat's mapping list, and an
8495 * ism map blk in case we need one. We must do our
8496 * allocations before acquiring locks to prevent a deadlock
8497 * in the kmem allocator on the mapping list lock.
8499 new_iblk
= kmem_cache_alloc(ism_blk_cache
, KM_SLEEP
);
8500 ism_ment
= kmem_cache_alloc(ism_ment_cache
, KM_SLEEP
);
8503 * Serialize ISM mappings with the ISM busy flag, and also the
8506 sfmmu_ismhat_enter(sfmmup
, 0);
8509 * Allocate an ism map blk if necessary.
8511 if (sfmmup
->sfmmu_iblk
== NULL
) {
8512 sfmmup
->sfmmu_iblk
= new_iblk
;
8513 bzero(new_iblk
, sizeof (*new_iblk
));
8514 new_iblk
->iblk_nextpa
= (uint64_t)-1;
8515 membar_stst(); /* make sure next ptr visible to all CPUs */
8516 sfmmup
->sfmmu_ismblkpa
= va_to_pa((caddr_t
)new_iblk
);
8523 * Make sure mapping does not already exist.
8525 ism_blkp
= sfmmup
->sfmmu_iblk
;
8526 while (ism_blkp
!= NULL
) {
8527 ism_map
= ism_blkp
->iblk_maps
;
8528 for (i
= 0; i
< ISM_MAP_SLOTS
&& ism_map
[i
].imap_ismhat
; i
++) {
8529 if ((addr
>= ism_start(ism_map
[i
]) &&
8530 addr
< ism_end(ism_map
[i
])) ||
8531 eaddr
> ism_start(ism_map
[i
]) &&
8532 eaddr
<= ism_end(ism_map
[i
])) {
8533 panic("sfmmu_share: Already mapped!");
8536 ism_blkp
= ism_blkp
->iblk_next
;
8540 ASSERT(ismszc
>= TTE4M
);
8541 if (ismszc
== TTE4M
) {
8542 ismhatflag
= HAT_4M_FLAG
;
8543 } else if (ismszc
== TTE32M
) {
8544 ismhatflag
= HAT_32M_FLAG
;
8545 } else if (ismszc
== TTE256M
) {
8546 ismhatflag
= HAT_256M_FLAG
;
8549 * Add mapping to first available mapping slot.
8551 ism_blkp
= sfmmup
->sfmmu_iblk
;
8554 ism_map
= ism_blkp
->iblk_maps
;
8555 for (i
= 0; i
< ISM_MAP_SLOTS
; i
++) {
8556 if (ism_map
[i
].imap_ismhat
== NULL
) {
8558 ism_map
[i
].imap_ismhat
= ism_hatid
;
8559 ism_map
[i
].imap_vb_shift
= (uchar_t
)ismshift
;
8560 ism_map
[i
].imap_rid
= SFMMU_INVALID_ISMRID
;
8561 ism_map
[i
].imap_hatflags
= ismhatflag
;
8562 ism_map
[i
].imap_sz_mask
= ismmask
;
8564 * imap_seg is checked in ISM_CHECK to see if
8565 * non-NULL, then other info assumed valid.
8568 ism_map
[i
].imap_seg
= (uintptr_t)addr
| sh_size
;
8569 ism_map
[i
].imap_ment
= ism_ment
;
8572 * Now add ourselves to the ism_hat's
8575 ism_ment
->iment_hat
= sfmmup
;
8576 ism_ment
->iment_base_va
= addr
;
8577 ism_hatid
->sfmmu_ismhat
= 1;
8578 mutex_enter(&ism_mlist_lock
);
8579 iment_add(ism_ment
, ism_hatid
);
8580 mutex_exit(&ism_mlist_lock
);
8585 if (!added
&& ism_blkp
->iblk_next
== NULL
) {
8586 ism_blkp
->iblk_next
= new_iblk
;
8588 bzero(ism_blkp
->iblk_next
,
8589 sizeof (*ism_blkp
->iblk_next
));
8590 ism_blkp
->iblk_next
->iblk_nextpa
= (uint64_t)-1;
8592 ism_blkp
->iblk_nextpa
=
8593 va_to_pa((caddr_t
)ism_blkp
->iblk_next
);
8595 ism_blkp
= ism_blkp
->iblk_next
;
8599 * After calling hat_join_region, sfmmup may join a new SCD or
8600 * move from the old scd to a new scd, in which case, we want to
8601 * shrink the sfmmup's private tsb size, i.e., pass shrink to
8602 * sfmmu_check_page_sizes at the end of this routine.
8604 old_scdp
= sfmmup
->sfmmu_scdp
;
8606 rcookie
= hat_join_region(sfmmup
, addr
, len
, (void *)ism_hatid
, 0,
8607 PROT_ALL
, ismszc
, NULL
, HAT_REGION_ISM
);
8608 if (rcookie
!= HAT_INVALID_REGION_COOKIE
) {
8609 ism_map
[i
].imap_rid
= (uchar_t
)((uint64_t)rcookie
);
8612 * Update our counters for this sfmmup's ism mappings.
8614 for (i
= 0; i
<= ismszc
; i
++) {
8615 if (!(disable_ism_large_pages
& (1 << i
)))
8616 (void) ism_tsb_entries(sfmmup
, i
);
8620 * For ISM and DISM we do not support 512K pages, so we only only
8621 * search the 4M and 8K/64K hashes for 4 pagesize cpus, and search the
8622 * 256M or 32M, and 4M and 8K/64K hashes for 6 pagesize cpus.
8624 * Need to set 32M/256M ISM flags to make sure
8625 * sfmmu_check_page_sizes() enables them on Panther.
8627 ASSERT((disable_ism_large_pages
& (1 << TTE512K
)) != 0);
8631 if (!SFMMU_FLAGS_ISSET(sfmmup
, HAT_256M_ISM
)) {
8632 hatlockp
= sfmmu_hat_enter(sfmmup
);
8633 SFMMU_FLAGS_SET(sfmmup
, HAT_256M_ISM
);
8634 sfmmu_hat_exit(hatlockp
);
8638 if (!SFMMU_FLAGS_ISSET(sfmmup
, HAT_32M_ISM
)) {
8639 hatlockp
= sfmmu_hat_enter(sfmmup
);
8640 SFMMU_FLAGS_SET(sfmmup
, HAT_32M_ISM
);
8641 sfmmu_hat_exit(hatlockp
);
8649 * If we updated the ismblkpa for this HAT we must make
8650 * sure all CPUs running this process reload their tsbmiss area.
8651 * Otherwise they will fail to load the mappings in the tsbmiss
8652 * handler and will loop calling pagefault().
8655 hatlockp
= sfmmu_hat_enter(sfmmup
);
8656 sfmmu_sync_mmustate(sfmmup
);
8657 sfmmu_hat_exit(hatlockp
);
8660 sfmmu_ismhat_exit(sfmmup
, 0);
8663 * Free up ismblk if we didn't use it.
8665 if (new_iblk
!= NULL
)
8666 kmem_cache_free(ism_blk_cache
, new_iblk
);
8669 * Check TSB and TLB page sizes.
8671 if (sfmmup
->sfmmu_scdp
!= NULL
&& old_scdp
!= sfmmup
->sfmmu_scdp
) {
8672 sfmmu_check_page_sizes(sfmmup
, 0);
8674 sfmmu_check_page_sizes(sfmmup
, 1);
8680 * hat_unshare removes exactly one ism_map from
8681 * this process's as. It expects multiple calls
8682 * to hat_unshare for multiple shm segments.
8685 hat_unshare(struct hat
*sfmmup
, caddr_t addr
, size_t len
, uint_t ismszc
)
8688 ism_ment_t
*free_ment
= NULL
;
8689 ism_blk_t
*ism_blkp
;
8690 struct hat
*ism_hatid
;
8692 hatlock_t
*hatlockp
;
8693 struct tsb_info
*tsbinfo
;
8694 uint_t ismshift
= page_get_shift(ismszc
);
8695 size_t sh_size
= ISM_SHIFT(ismshift
, len
);
8699 ASSERT(ISM_ALIGNED(ismshift
, addr
));
8700 ASSERT(ISM_ALIGNED(ismshift
, len
));
8701 ASSERT(sfmmup
!= NULL
);
8702 ASSERT(sfmmup
!= ksfmmup
);
8704 ASSERT(sfmmup
->sfmmu_as
!= NULL
);
8707 * Make sure that during the entire time ISM mappings are removed,
8708 * the trap handlers serialize behind us, and that no one else
8709 * can be mucking with ISM mappings. This also lets us get away
8710 * with not doing expensive cross calls to flush the TLB -- we
8711 * just discard the context, flush the entire TSB, and call it
8714 sfmmu_ismhat_enter(sfmmup
, 0);
8717 * Remove the mapping.
8719 * We can't have any holes in the ism map.
8720 * The tsb miss code while searching the ism map will
8721 * stop on an empty map slot. So we must move
8722 * everyone past the hole up 1 if any.
8724 * Also empty ism map blks are not freed until the
8725 * process exits. This is to prevent a MT race condition
8726 * between sfmmu_unshare() and sfmmu_tsbmiss_exception().
8729 ism_blkp
= sfmmup
->sfmmu_iblk
;
8730 while (!found
&& ism_blkp
!= NULL
) {
8731 ism_map
= ism_blkp
->iblk_maps
;
8732 for (i
= 0; i
< ISM_MAP_SLOTS
; i
++) {
8733 if (addr
== ism_start(ism_map
[i
]) &&
8734 sh_size
== (size_t)(ism_size(ism_map
[i
]))) {
8740 ism_blkp
= ism_blkp
->iblk_next
;
8744 ism_hatid
= ism_map
[i
].imap_ismhat
;
8745 ism_rid
= ism_map
[i
].imap_rid
;
8746 ASSERT(ism_hatid
!= NULL
);
8747 ASSERT(ism_hatid
->sfmmu_ismhat
== 1);
8750 * After hat_leave_region, the sfmmup may leave SCD,
8751 * in which case, we want to grow the private tsb size when
8752 * calling sfmmu_check_page_sizes at the end of the routine.
8754 old_scdp
= sfmmup
->sfmmu_scdp
;
8756 * Then remove ourselves from the region.
8758 if (ism_rid
!= SFMMU_INVALID_ISMRID
) {
8759 hat_leave_region(sfmmup
, (void *)((uint64_t)ism_rid
),
8764 * And now guarantee that any other cpu
8765 * that tries to process an ISM miss
8768 hatlockp
= sfmmu_hat_enter(sfmmup
);
8769 sfmmu_invalidate_ctx(sfmmup
);
8770 sfmmu_hat_exit(hatlockp
);
8773 * Remove ourselves from the ism mapping list.
8775 mutex_enter(&ism_mlist_lock
);
8776 iment_sub(ism_map
[i
].imap_ment
, ism_hatid
);
8777 mutex_exit(&ism_mlist_lock
);
8778 free_ment
= ism_map
[i
].imap_ment
;
8781 * We delete the ism map by copying
8782 * the next map over the current one.
8783 * We will take the next one in the maps
8784 * array or from the next ism_blk.
8786 while (ism_blkp
!= NULL
) {
8787 ism_map
= ism_blkp
->iblk_maps
;
8788 while (i
< (ISM_MAP_SLOTS
- 1)) {
8789 ism_map
[i
] = ism_map
[i
+ 1];
8792 /* i == (ISM_MAP_SLOTS - 1) */
8793 ism_blkp
= ism_blkp
->iblk_next
;
8794 if (ism_blkp
!= NULL
) {
8795 ism_map
[i
] = ism_blkp
->iblk_maps
[0];
8798 ism_map
[i
].imap_seg
= 0;
8799 ism_map
[i
].imap_vb_shift
= 0;
8800 ism_map
[i
].imap_rid
= SFMMU_INVALID_ISMRID
;
8801 ism_map
[i
].imap_hatflags
= 0;
8802 ism_map
[i
].imap_sz_mask
= 0;
8803 ism_map
[i
].imap_ismhat
= NULL
;
8804 ism_map
[i
].imap_ment
= NULL
;
8809 * Now flush entire TSB for the process, since
8810 * demapping page by page can be too expensive.
8811 * We don't have to flush the TLB here anymore
8812 * since we switch to a new TLB ctx instead.
8813 * Also, there is no need to flush if the process
8814 * is exiting since the TSB will be freed later.
8816 if (!sfmmup
->sfmmu_free
) {
8817 hatlockp
= sfmmu_hat_enter(sfmmup
);
8818 for (tsbinfo
= sfmmup
->sfmmu_tsb
; tsbinfo
!= NULL
;
8819 tsbinfo
= tsbinfo
->tsb_next
) {
8820 if (tsbinfo
->tsb_flags
& TSB_SWAPPED
)
8822 if (tsbinfo
->tsb_flags
& TSB_RELOC_FLAG
) {
8823 tsbinfo
->tsb_flags
|=
8828 sfmmu_inv_tsb(tsbinfo
->tsb_va
,
8829 TSB_BYTES(tsbinfo
->tsb_szc
));
8831 sfmmu_hat_exit(hatlockp
);
8836 * Update our counters for this sfmmup's ism mappings.
8838 for (i
= 0; i
<= ismszc
; i
++) {
8839 if (!(disable_ism_large_pages
& (1 << i
)))
8840 (void) ism_tsb_entries(sfmmup
, i
);
8843 sfmmu_ismhat_exit(sfmmup
, 0);
8846 * We must do our freeing here after dropping locks
8847 * to prevent a deadlock in the kmem allocator on the
8848 * mapping list lock.
8850 if (free_ment
!= NULL
)
8851 kmem_cache_free(ism_ment_cache
, free_ment
);
8854 * Check TSB and TLB page sizes if the process isn't exiting.
8856 if (!sfmmup
->sfmmu_free
) {
8857 if (found
&& old_scdp
!= NULL
&& sfmmup
->sfmmu_scdp
== NULL
) {
8858 sfmmu_check_page_sizes(sfmmup
, 1);
8860 sfmmu_check_page_sizes(sfmmup
, 0);
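/*
 * kmem cache constructor/destructor routines for the sfmmu_t (hat) and
 * hme_blk caches follow.  kmem runs a constructor when it creates a new
 * object for a cache, not on every allocation, so these only establish
 * the object's invariant state.
 */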
/* ARGSUSED */
static int
sfmmu_idcache_constructor(void *buf, void *cdrarg, int kmflags)
{
	/* void *buf is sfmmu_t pointer */
	bzero(buf, sizeof (sfmmu_t));

	return (0);
}

/* ARGSUSED */
static void
sfmmu_idcache_destructor(void *buf, void *cdrarg)
{
	/* void *buf is sfmmu_t pointer */
}

/*
 * setup kmem hmeblks by bzeroing all members and initializing the nextpa
 * field to be the pa of this hmeblk
 */
/* ARGSUSED */
static int
sfmmu_hblkcache_constructor(void *buf, void *cdrarg, int kmflags)
{
	struct hme_blk *hmeblkp;

	bzero(buf, (size_t)cdrarg);
	hmeblkp = (struct hme_blk *)buf;
	hmeblkp->hblk_nextpa = va_to_pa((caddr_t)hmeblkp);

#ifdef	HBLK_TRACE
	mutex_init(&hmeblkp->hblk_audit_lock, NULL, MUTEX_DEFAULT, NULL);
#endif	/* HBLK_TRACE */

	return (0);
}

/* ARGSUSED */
static void
sfmmu_hblkcache_destructor(void *buf, void *cdrarg)
{
#ifdef	HBLK_TRACE

	struct hme_blk *hmeblkp;

	hmeblkp = (struct hme_blk *)buf;
	mutex_destroy(&hmeblkp->hblk_audit_lock);

#endif	/* HBLK_TRACE */
}
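/*
 * Each invocation of the kmem reclaim callback below scans
 * 1/sfmmu_cache_reclaim_scan_ratio of the user and kernel hme hash
 * buckets, resuming where the previous scan left off.
 */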
8918 #define SFMMU_CACHE_RECLAIM_SCAN_RATIO 8
8919 static int sfmmu_cache_reclaim_scan_ratio
= SFMMU_CACHE_RECLAIM_SCAN_RATIO
;
8921 * The kmem allocator will callback into our reclaim routine when the system
8922 * is running low in memory. We traverse the hash and free up all unused but
8923 * still cached hme_blks. We also traverse the free list and free them up
8928 sfmmu_hblkcache_reclaim(void *cdrarg
)
8931 struct hmehash_bucket
*hmebp
;
8932 struct hme_blk
*hmeblkp
, *nx_hblk
, *pr_hblk
= NULL
;
8933 static struct hmehash_bucket
*uhmehash_reclaim_hand
;
8934 static struct hmehash_bucket
*khmehash_reclaim_hand
;
8935 struct hme_blk
*list
= NULL
, *last_hmeblkp
;
8936 cpuset_t cpuset
= cpu_ready_set
;
8937 cpu_hme_pend_t
*cpuhp
;
8939 /* Free up hmeblks on the cpu pending lists */
8940 for (i
= 0; i
< NCPU
; i
++) {
8941 cpuhp
= &cpu_hme_pend
[i
];
8942 if (cpuhp
->chp_listp
!= NULL
) {
8943 mutex_enter(&cpuhp
->chp_mutex
);
8944 if (cpuhp
->chp_listp
== NULL
) {
8945 mutex_exit(&cpuhp
->chp_mutex
);
8948 for (last_hmeblkp
= cpuhp
->chp_listp
;
8949 last_hmeblkp
->hblk_next
!= NULL
;
8950 last_hmeblkp
= last_hmeblkp
->hblk_next
)
8952 last_hmeblkp
->hblk_next
= list
;
8953 list
= cpuhp
->chp_listp
;
8954 cpuhp
->chp_listp
= NULL
;
8955 cpuhp
->chp_count
= 0;
8956 mutex_exit(&cpuhp
->chp_mutex
);
8963 CPUSET_DEL(cpuset
, CPU
->cpu_id
);
8967 sfmmu_hblk_free(&list
);
8971 hmebp
= uhmehash_reclaim_hand
;
8972 if (hmebp
== NULL
|| hmebp
> &uhme_hash
[UHMEHASH_SZ
])
8973 uhmehash_reclaim_hand
= hmebp
= uhme_hash
;
8974 uhmehash_reclaim_hand
+= UHMEHASH_SZ
/ sfmmu_cache_reclaim_scan_ratio
;
8976 for (i
= UHMEHASH_SZ
/ sfmmu_cache_reclaim_scan_ratio
; i
; i
--) {
8977 if (SFMMU_HASH_LOCK_TRYENTER(hmebp
) != 0) {
8978 hmeblkp
= hmebp
->hmeblkp
;
8981 nx_hblk
= hmeblkp
->hblk_next
;
8982 if (!hmeblkp
->hblk_vcnt
&&
8983 !hmeblkp
->hblk_hmecnt
) {
8984 sfmmu_hblk_hash_rm(hmebp
, hmeblkp
,
8991 SFMMU_HASH_UNLOCK(hmebp
);
8993 if (hmebp
++ == &uhme_hash
[UHMEHASH_SZ
])
8997 hmebp
= khmehash_reclaim_hand
;
8998 if (hmebp
== NULL
|| hmebp
> &khme_hash
[KHMEHASH_SZ
])
8999 khmehash_reclaim_hand
= hmebp
= khme_hash
;
9000 khmehash_reclaim_hand
+= KHMEHASH_SZ
/ sfmmu_cache_reclaim_scan_ratio
;
9002 for (i
= KHMEHASH_SZ
/ sfmmu_cache_reclaim_scan_ratio
; i
; i
--) {
9003 if (SFMMU_HASH_LOCK_TRYENTER(hmebp
) != 0) {
9004 hmeblkp
= hmebp
->hmeblkp
;
9007 nx_hblk
= hmeblkp
->hblk_next
;
9008 if (!hmeblkp
->hblk_vcnt
&&
9009 !hmeblkp
->hblk_hmecnt
) {
9010 sfmmu_hblk_hash_rm(hmebp
, hmeblkp
,
9017 SFMMU_HASH_UNLOCK(hmebp
);
9019 if (hmebp
++ == &khme_hash
[KHMEHASH_SZ
])
9022 sfmmu_hblks_list_purge(&list
, 0);
/*
 * sfmmu_get_ppvcolor should become a vm_machdep or hatop interface.
 * same goes for sfmmu_get_addrvcolor().
 *
 * This function will return the virtual color for the specified page. The
 * virtual color corresponds to this page current mapping or its last mapping.
 * It is used by memory allocators to choose addresses with the correct
 * alignment so vac consistency is automatically maintained.  If the page
 * has no color it returns -1.
 */
int
sfmmu_get_ppvcolor(struct page *pp)
{
	int color;

	if (!(cache & CACHE_VAC) || PP_NEWPAGE(pp)) {
		return (-1);
	}
	color = PP_GET_VCOLOR(pp);
	ASSERT(color < mmu_btop(shm_alignment));
	return (color);
}

/*
 * This function will return the desired alignment for vac consistency
 * (vac color) given a virtual address.  If no vac is present it returns -1.
 */
int
sfmmu_get_addrvcolor(caddr_t vaddr)
{
	if (cache & CACHE_VAC) {
		return (addr_to_vcolor(vaddr));
	}
	return (-1);
}
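/*
 * sfmmu_vac_conflict() below resolves virtual address cache alias
 * conflicts: conflicting mappings are normally unloaded, but if any of
 * them is large or locked, or belongs to the same address space as the
 * new mapping, the page is converted to a temporarily non-cacheable
 * (TNC) state instead.
 */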
/*
 * Check for conflicts.
 * A conflict exists if the new and existing mappings do not match in
 * their "shm_alignment" fields.  If conflicts exist, the existing mappings
 * are flushed unless one of them is locked.  If one of them is locked, then
 * the mappings are flushed and converted to non-cacheable mappings.
 */
static void
sfmmu_vac_conflict(struct hat *hat, caddr_t addr, page_t *pp)
{
	struct sf_hment *sfhmep, *tmphme = NULL;
	struct hme_blk *hmeblkp;
	sfmmu_t *tmphat;
	tte_t tte;
	int vcolor;
	int sz;

	ASSERT(sfmmu_mlist_held(pp));
	ASSERT(!PP_ISNC(pp));		/* page better be cacheable */

	vcolor = addr_to_vcolor(addr);
	if (PP_NEWPAGE(pp)) {
		PP_SET_VCOLOR(pp, vcolor);
		return;
	}

	if (PP_GET_VCOLOR(pp) == vcolor) {
		return;
	}

	if (!PP_ISMAPPED(pp) && !PP_ISMAPPED_KPM(pp)) {
		/*
		 * Previous user of page had a different color
		 * but since there are no current users
		 * we just flush the cache and change the color.
		 */
		SFMMU_STAT(sf_pgcolor_conflict);
		sfmmu_cache_flush(pp->p_pagenum, PP_GET_VCOLOR(pp));
		PP_SET_VCOLOR(pp, vcolor);
		return;
	}

	/*
	 * If we get here we have a vac conflict with a current
	 * mapping.  VAC conflict policy is as follows.
	 * - The default is to unload the other mappings unless:
	 * - If we have a large mapping we uncache the page.
	 *   We need to uncache the rest of the large page too.
	 * - If any of the mappings are locked we uncache the page.
	 * - If the requested mapping is inconsistent
	 *   with another mapping and that mapping
	 *   is in the same address space we have to
	 *   make it non-cached.  The default thing
	 *   to do is unload the inconsistent mapping
	 *   but if they are in the same address space
	 *   we run the risk of unmapping the pc or the
	 *   stack which we will use as we return to the user,
	 *   in which case we can then fault on the thing
	 *   we just unloaded and get into an infinite loop.
	 */
	if (PP_ISMAPPED_LARGE(pp)) {
		/*
		 * Existing mapping is for big pages. We don't unload
		 * existing big mappings to satisfy new mappings.
		 * Always convert all mappings to TNC.
		 */
		sz = fnd_mapping_sz(pp);
		pp = PP_GROUPLEADER(pp, sz);
		SFMMU_STAT_ADD(sf_uncache_conflict, TTEPAGES(sz));
		sfmmu_page_cache_array(pp, HAT_TMPNC, CACHE_FLUSH,
		    TTEPAGES(sz));
		return;
	}

	/*
	 * Check whether any mapping is in the same address space as the
	 * new one, or is locked, since in either case we need to uncache.
	 */
	for (sfhmep = pp->p_mapping; sfhmep; sfhmep = tmphme) {
		tmphme = sfhmep->hme_next;
		if (IS_PAHME(sfhmep))
			continue;
		hmeblkp = sfmmu_hmetohblk(sfhmep);
		tmphat = hblktosfmmu(hmeblkp);
		sfmmu_copytte(&sfhmep->hme_tte, &tte);
		ASSERT(TTE_IS_VALID(&tte));
		if (hmeblkp->hblk_shared || tmphat == hat ||
		    hmeblkp->hblk_lckcnt) {
			/*
			 * We have an uncache conflict
			 */
			SFMMU_STAT(sf_uncache_conflict);
			sfmmu_page_cache_array(pp, HAT_TMPNC, CACHE_FLUSH, 1);
			return;
		}
	}

	/*
	 * We have an unload conflict
	 * We have already checked for LARGE mappings, therefore
	 * the remaining mapping(s) must be TTE8K.
	 */
	SFMMU_STAT(sf_unload_conflict);

	for (sfhmep = pp->p_mapping; sfhmep; sfhmep = tmphme) {
		tmphme = sfhmep->hme_next;
		if (IS_PAHME(sfhmep))
			continue;
		hmeblkp = sfmmu_hmetohblk(sfhmep);
		ASSERT(!hmeblkp->hblk_shared);
		(void) sfmmu_pageunload(pp, sfhmep, TTE8K);
	}

	if (PP_ISMAPPED_KPM(pp))
		sfmmu_kpm_vac_unload(pp, addr);

	/*
	 * Unloads only do TLB flushes so we need to flush the
	 * cache here.
	 */
	sfmmu_cache_flush(pp->p_pagenum, PP_GET_VCOLOR(pp));
	PP_SET_VCOLOR(pp, vcolor);
}
/*
 * Whenever a mapping is unloaded and the page is in TNC state,
 * we see if the page can be made cacheable again. 'pp' is
 * the page that we just unloaded a mapping from, the size
 * of mapping that was unloaded is 'ottesz'.
 *
 * The recache policy for mpss pages can leave a performance problem
 * under the following circumstances:
 * . A large page in uncached mode has just been unmapped.
 * . All constituent pages are TNC due to a conflicting small mapping.
 * . There are many other, non conflicting, small mappings around for
 *   a lot of the constituent pages.
 * . We're called w/ the "old" groupleader page and the old ottesz,
 *   but this is irrelevant, since we're no longer "PP_ISMAPPED_LARGE", so
 *   we end up w/ TTE8K or npages == 1.
 * . We call tst_tnc w/ the old groupleader only, and if there is no
 *   conflict, we re-cache only this page.
 * . All other small mappings are not checked and will be left in TNC mode.
 * The problem is not very serious because:
 * . mpss is actually only defined for heap and stack, so the probability
 *   is not very high that a large page mapping exists in parallel to a small
 *   one (this is possible, but seems to be bad programming style in the
 *   application).
 * . The problem gets a little bit more serious, when those TNC pages
 *   have to be mapped into kernel space, e.g. for networking.
 * . When VAC alias conflicts occur in applications, this is regarded
 *   as an application bug. So if kstats show them, the application should
 *   be changed anyway.
 */
void
conv_tnc(page_t *pp, int ottesz)
{
	int cursz, dosz;
	pgcnt_t curnpgs, dopgs;
	pgcnt_t pg64k;
	page_t *pp2;

	/*
	 * Determine how big a range we check for TNC and find
	 * leader page. cursz is the size of the biggest
	 * mapping that still exists on 'pp'.
	 */
	if (PP_ISMAPPED_LARGE(pp)) {
		cursz = fnd_mapping_sz(pp);
	} else {
		cursz = TTE8K;
	}

	if (ottesz >= cursz) {
		dosz = ottesz;
	} else {
		dosz = cursz;
	}
	pp2 = PP_GROUPLEADER(pp, dosz);

	pg64k = TTEPAGES(TTE64K);
	dopgs = TTEPAGES(dosz);

	ASSERT(dopgs == 1 || ((dopgs & (pg64k - 1)) == 0));

	while (dopgs != 0) {
		curnpgs = TTEPAGES(cursz);
		if (tst_tnc(pp2, curnpgs)) {
			SFMMU_STAT_ADD(sf_recache, curnpgs);
			sfmmu_page_cache_array(pp2, HAT_CACHE, CACHE_NO_FLUSH,
			    curnpgs);
		}

		ASSERT(dopgs >= curnpgs);
		dopgs -= curnpgs;

		if (dopgs == 0) {
			break;
		}

		pp2 = PP_PAGENEXT_N(pp2, curnpgs);
		if (((dopgs & (pg64k - 1)) == 0) && PP_ISMAPPED_LARGE(pp2)) {
			cursz = fnd_mapping_sz(pp2);
		} else {
			cursz = TTE8K;
		}
	}
}
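/*
 * Illustrative walk-through of conv_tnc() (hypothetical sizes, not derived
 * from a particular caller): if a TTE4M mapping was just unloaded
 * (ottesz == TTE4M) and the largest mapping still on 'pp' is TTE64K, then
 * dosz == TTE4M, the walk starts at the 4M group leader, and
 * dopgs == TTEPAGES(TTE4M) == 512 with an 8K base page.  Each iteration
 * tests TTEPAGES(TTE64K) == 8 constituent pages with tst_tnc() and
 * re-caches only those runs whose remaining mappings all share one color.
 */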
/*
 * Returns 1 if page(s) can be converted from TNC to cacheable setting,
 * returns 0 otherwise. Note that oaddr argument is valid for only
 * 8k pages.
 */
int
tst_tnc(page_t *pp, pgcnt_t npages)
{
	struct sf_hment *sfhme;
	struct hme_blk *hmeblkp;
	tte_t tte;
	caddr_t vaddr;
	int clr_valid = 0;
	int color, color1, bcolor;
	int i, ncolors;

	ASSERT(!(cache & CACHE_WRITEBACK));

	ncolors = CACHE_NUM_COLOR;

	for (i = 0; i < npages; i++) {
		ASSERT(sfmmu_mlist_held(pp));
		ASSERT(PP_ISTNC(pp));
		ASSERT(PP_GET_VCOLOR(pp) == NO_VCOLOR);

		clr_valid = 0;
		if (PP_ISMAPPED_KPM(pp)) {
			caddr_t kpmvaddr;

			kpmvaddr = hat_kpm_page2va(pp, 1);
			ASSERT(!(npages > 1 && IS_KPM_ALIAS_RANGE(kpmvaddr)));
			color1 = addr_to_vcolor(kpmvaddr);
			clr_valid = 1;
		}

		for (sfhme = pp->p_mapping; sfhme; sfhme = sfhme->hme_next) {
			if (IS_PAHME(sfhme))
				continue;
			hmeblkp = sfmmu_hmetohblk(sfhme);

			sfmmu_copytte(&sfhme->hme_tte, &tte);
			ASSERT(TTE_IS_VALID(&tte));

			vaddr = tte_to_vaddr(hmeblkp, tte);
			color = addr_to_vcolor(vaddr);

			if (npages > 1) {
				/*
				 * If there is a big mapping, make sure
				 * 8K mapping is consistent with the big
				 * mapping.
				 */
				bcolor = i % ncolors;
				if (color != bcolor) {
					return (0);
				}
			}
			if (!clr_valid) {
				clr_valid = 1;
				color1 = color;
			}

			if (color1 != color) {
				return (0);
			}
		}

		pp = PP_PAGENEXT(pp);
	}

	return (1);
}
void
sfmmu_page_cache_array(page_t *pp, int flags, int cache_flush_flag,
    pgcnt_t npages)
{
	int i, ncolors, bcolor;
	kmutex_t *pmtx;
	kpm_hlk_t *kpmp;
	cpuset_t cpuset;

	ASSERT(!(cache & CACHE_WRITEBACK));

	kpmp = sfmmu_kpm_kpmp_enter(pp, npages);
	pmtx = sfmmu_page_enter(pp);

	/*
	 * Fast path caching single unmapped page
	 */
	if (npages == 1 && !PP_ISMAPPED(pp) && !PP_ISMAPPED_KPM(pp) &&
	    flags == HAT_CACHE) {
		PP_CLRTNC(pp);
		PP_CLRPNC(pp);
		sfmmu_page_exit(pmtx);
		sfmmu_kpm_kpmp_exit(kpmp);
		return;
	}

	/*
	 * We need to capture all cpus in order to change cacheability
	 * because we can't allow one cpu to access the same physical
	 * page using a cacheable and a non-cacheable mapping at the same
	 * time.  Since we may end up walking the ism mapping list, we
	 * have to grab its lock now since we can't after all the
	 * cpus have been captured.
	 */
	sfmmu_hat_lock_all();
	mutex_enter(&ism_mlist_lock);
	cpuset = cpu_ready_set;
	xc_attention(cpuset);

	/*
	 * Make sure all colors are flushed since the
	 * sfmmu_page_cache() only flushes one color -
	 * it does not know big pages.
	 */
	ncolors = CACHE_NUM_COLOR;
	if (flags & HAT_TMPNC) {
		for (i = 0; i < ncolors; i++) {
			sfmmu_cache_flushcolor(i, pp->p_pagenum);
		}
		cache_flush_flag = CACHE_NO_FLUSH;
	}

	for (i = 0; i < npages; i++) {

		ASSERT(sfmmu_mlist_held(pp));

		if (!(flags == HAT_TMPNC && PP_ISTNC(pp))) {

			if (npages > 1) {
				bcolor = i % ncolors;
			} else {
				bcolor = NO_VCOLOR;
			}

			sfmmu_page_cache(pp, flags, cache_flush_flag,
			    bcolor);
		}

		pp = PP_PAGENEXT(pp);
	}

	xc_dismissed(cpuset);
	mutex_exit(&ism_mlist_lock);
	sfmmu_hat_unlock_all();
	sfmmu_page_exit(pmtx);
	sfmmu_kpm_kpmp_exit(kpmp);
}
/*
 * This function changes the virtual cacheability of all mappings to a
 * particular page.  When changing from uncache to cacheable the mappings will
 * only be changed if all of them have the same virtual color.
 * We need to flush the cache in all cpus.  It is possible that
 * a process referenced a page as cacheable but has since exited
 * and cleared the mapping list.  We still need to flush it but have no
 * state so all cpus is the only alternative.
 */
static void
sfmmu_page_cache(page_t *pp, int flags, int cache_flush_flag, int bcolor)
{
	struct sf_hment *sfhme;
	struct hme_blk *hmeblkp;
	sfmmu_t *sfmmup;
	tte_t tte, ttemod;
	caddr_t vaddr;
	int ret, color;
	pfn_t pfn;

	color = bcolor;
	pfn = pp->p_pagenum;

	for (sfhme = pp->p_mapping; sfhme; sfhme = sfhme->hme_next) {

		if (IS_PAHME(sfhme))
			continue;
		hmeblkp = sfmmu_hmetohblk(sfhme);

		sfmmu_copytte(&sfhme->hme_tte, &tte);
		ASSERT(TTE_IS_VALID(&tte));
		vaddr = tte_to_vaddr(hmeblkp, tte);
		color = addr_to_vcolor(vaddr);

		if ((flags & HAT_CACHE) && bcolor != NO_VCOLOR) {
			ASSERT(color == bcolor);
		}

		ASSERT(flags != HAT_TMPNC || color == PP_GET_VCOLOR(pp));

		ttemod = tte;
		if (flags & (HAT_UNCACHE | HAT_TMPNC)) {
			TTE_CLR_VCACHEABLE(&ttemod);
		} else {	/* flags & HAT_CACHE */
			TTE_SET_VCACHEABLE(&ttemod);
		}
		ret = sfmmu_modifytte_try(&tte, &ttemod, &sfhme->hme_tte);
		if (ret < 0) {
			/*
			 * Since all cpus are captured modifytte should not
			 * fail.
			 */
			panic("sfmmu_page_cache: write to tte failed");
		}

		sfmmup = hblktosfmmu(hmeblkp);
		if (cache_flush_flag == CACHE_FLUSH) {
			/*
			 * Flush TSBs, TLBs and caches
			 */
			if (hmeblkp->hblk_shared) {
				sf_srd_t *srdp = (sf_srd_t *)sfmmup;
				uint_t rid = hmeblkp->hblk_tag.htag_rid;
				sf_region_t *rgnp;
				ASSERT(SFMMU_IS_SHMERID_VALID(rid));
				ASSERT(rid < SFMMU_MAX_HME_REGIONS);
				ASSERT(srdp != NULL);
				rgnp = srdp->srd_hmergnp[rid];
				SFMMU_VALIDATE_SHAREDHBLK(hmeblkp,
				    srdp, rgnp, rid);
				(void) sfmmu_rgntlb_demap(vaddr, rgnp,
				    hmeblkp, 0);
				sfmmu_cache_flush(pfn, addr_to_vcolor(vaddr));
			} else if (sfmmup->sfmmu_ismhat) {
				if (flags & HAT_CACHE) {
					SFMMU_STAT(sf_ism_recache);
				} else {
					SFMMU_STAT(sf_ism_uncache);
				}
				sfmmu_ismtlbcache_demap(vaddr, sfmmup, hmeblkp,
				    pfn, CACHE_FLUSH);
			} else {
				sfmmu_tlbcache_demap(vaddr, sfmmup, hmeblkp,
				    pfn, 0, FLUSH_ALL_CPUS, CACHE_FLUSH, 1);
			}

			/*
			 * all cache entries belonging to this pfn are
			 * now flushed.
			 */
			cache_flush_flag = CACHE_NO_FLUSH;
		} else {
			/*
			 * Flush only TSBs and TLBs.
			 */
			if (hmeblkp->hblk_shared) {
				sf_srd_t *srdp = (sf_srd_t *)sfmmup;
				uint_t rid = hmeblkp->hblk_tag.htag_rid;
				sf_region_t *rgnp;
				ASSERT(SFMMU_IS_SHMERID_VALID(rid));
				ASSERT(rid < SFMMU_MAX_HME_REGIONS);
				ASSERT(srdp != NULL);
				rgnp = srdp->srd_hmergnp[rid];
				SFMMU_VALIDATE_SHAREDHBLK(hmeblkp,
				    srdp, rgnp, rid);
				(void) sfmmu_rgntlb_demap(vaddr, rgnp,
				    hmeblkp, 0);
			} else if (sfmmup->sfmmu_ismhat) {
				if (flags & HAT_CACHE) {
					SFMMU_STAT(sf_ism_recache);
				} else {
					SFMMU_STAT(sf_ism_uncache);
				}
				sfmmu_ismtlbcache_demap(vaddr, sfmmup, hmeblkp,
				    pfn, CACHE_NO_FLUSH);
			} else {
				sfmmu_tlb_demap(vaddr, sfmmup, hmeblkp, 0, 1);
			}
		}
	}

	if (PP_ISMAPPED_KPM(pp))
		sfmmu_kpm_page_cache(pp, flags, cache_flush_flag);

	switch (flags) {

	default:
		panic("sfmmu_pagecache: unknown flags");
		break;

	case HAT_CACHE:
		PP_CLRTNC(pp);
		PP_CLRPNC(pp);
		PP_SET_VCOLOR(pp, color);
		break;

	case HAT_TMPNC:
		PP_SETTNC(pp);
		PP_SET_VCOLOR(pp, NO_VCOLOR);
		break;

	case HAT_UNCACHE:
		PP_SETPNC(pp);
		PP_CLRTNC(pp);
		PP_SET_VCOLOR(pp, NO_VCOLOR);
		break;
	}
}
/*
 * Wrapper routine used to return a context.
 *
 * It's the responsibility of the caller to guarantee that the
 * process serializes on calls here by taking the HAT lock for
 * the hat.
 */
static void
sfmmu_get_ctx(sfmmu_t *sfmmup)
{
	mmu_ctx_t *mmu_ctxp;
	uint_t pstate_save;
	int ret;

	ASSERT(sfmmu_hat_lock_held(sfmmup));
	ASSERT(sfmmup != ksfmmup);

	if (SFMMU_FLAGS_ISSET(sfmmup, HAT_ALLCTX_INVALID)) {
		sfmmu_setup_tsbinfo(sfmmup);
		SFMMU_FLAGS_CLEAR(sfmmup, HAT_ALLCTX_INVALID);
	}

	kpreempt_disable();

	mmu_ctxp = CPU_MMU_CTXP(CPU);
	ASSERT(mmu_ctxp);
	ASSERT(mmu_ctxp->mmu_idx < max_mmu_ctxdoms);
	ASSERT(mmu_ctxp == mmu_ctxs_tbl[mmu_ctxp->mmu_idx]);

	/*
	 * Do a wrap-around if cnum reaches the max # cnum supported by a MMU.
	 */
	if (mmu_ctxp->mmu_cnum == mmu_ctxp->mmu_nctxs)
		sfmmu_ctx_wrap_around(mmu_ctxp, B_TRUE);

	/*
	 * Let the MMU set up the page sizes to use for
	 * this context in the TLB. Don't program 2nd dtlb for ism hat.
	 */
	if ((&mmu_set_ctx_page_sizes) && (sfmmup->sfmmu_ismhat == 0)) {
		mmu_set_ctx_page_sizes(sfmmup);
	}

	/*
	 * sfmmu_alloc_ctx and sfmmu_load_mmustate will be performed with
	 * interrupts disabled to prevent race condition with wrap-around
	 * ctx invalidation.  In sun4v, ctx invalidation also involves
	 * a HV call to set the number of TSBs to 0.  If interrupts are not
	 * disabled until after sfmmu_load_mmustate is complete TSBs may
	 * become assigned to INVALID_CONTEXT.  This is not allowed.
	 */
	pstate_save = sfmmu_disable_intrs();

	if (sfmmu_alloc_ctx(sfmmup, 1, CPU, SFMMU_PRIVATE) &&
	    sfmmup->sfmmu_scdp != NULL) {
		sf_scd_t *scdp = sfmmup->sfmmu_scdp;
		sfmmu_t *scsfmmup = scdp->scd_sfmmup;
		ret = sfmmu_alloc_ctx(scsfmmup, 1, CPU, SFMMU_SHARED);
		/* debug purpose only */
		ASSERT(!ret || scsfmmup->sfmmu_ctxs[CPU_MMU_IDX(CPU)].cnum
		    != INVALID_CONTEXT);
	}
	sfmmu_load_mmustate(sfmmup);

	sfmmu_enable_intrs(pstate_save);

	kpreempt_enable();
}
/*
 * When all cnums are used up in a MMU, cnum will wrap around to the
 * next generation and start from 2.
 */
static void
sfmmu_ctx_wrap_around(mmu_ctx_t *mmu_ctxp, boolean_t reset_cnum)
{
	/* caller must have disabled the preemption */
	ASSERT(curthread->t_preempt >= 1);
	ASSERT(mmu_ctxp != NULL);

	/* acquire Per-MMU (PM) spin lock */
	mutex_enter(&mmu_ctxp->mmu_lock);

	/* re-check to see if wrap-around is needed */
	if (mmu_ctxp->mmu_cnum < mmu_ctxp->mmu_nctxs)
		goto done;

	SFMMU_MMU_STAT(mmu_wrap_around);

	ASSERT(mmu_ctxp->mmu_gnum != 0);
	mmu_ctxp->mmu_gnum++;
	if (mmu_ctxp->mmu_gnum == 0 ||
	    mmu_ctxp->mmu_gnum > MAX_SFMMU_GNUM_VAL) {
		cmn_err(CE_PANIC, "mmu_gnum of mmu_ctx 0x%p is out of bound.",
		    (void *)mmu_ctxp);
	}

	if (mmu_ctxp->mmu_ncpus > 1) {
		cpuset_t cpuset;

		membar_enter(); /* make sure updated gnum visible */

		SFMMU_XCALL_STATS(NULL);

		/* xcall to others on the same MMU to invalidate ctx */
		cpuset = mmu_ctxp->mmu_cpuset;
		ASSERT(CPU_IN_SET(cpuset, CPU->cpu_id) || !reset_cnum);
		CPUSET_DEL(cpuset, CPU->cpu_id);
		CPUSET_AND(cpuset, cpu_ready_set);

		/*
		 * Pass in INVALID_CONTEXT as the first parameter to
		 * sfmmu_raise_tsb_exception, which invalidates the context
		 * of any process running on the CPUs in the MMU.
		 */
		xt_some(cpuset, sfmmu_raise_tsb_exception,
		    INVALID_CONTEXT, INVALID_CONTEXT);

		SFMMU_MMU_STAT(mmu_tsb_raise_exception);
	}

	if (sfmmu_getctx_sec() != INVALID_CONTEXT) {
		sfmmu_setctx_sec(INVALID_CONTEXT);
		sfmmu_clear_utsbinfo();
	}

	/*
	 * No xcall is needed here. For sun4u systems all CPUs in context
	 * domain share a single physical MMU therefore it's enough to flush
	 * TLB on local CPU. On sun4v systems we use 1 global context
	 * domain and flush all remote TLBs in sfmmu_raise_tsb_exception
	 * handler. Note that vtag_flushall_uctxs() is called
	 * for Ultra II machine, where the equivalent flushall functionality
	 * is implemented in SW, and only user ctx TLB entries are flushed.
	 */
	if (&vtag_flushall_uctxs != NULL) {
		vtag_flushall_uctxs();
	} else {
		vtag_flushall();
	}

	/* reset mmu cnum, skips cnum 0 and 1 */
	if (reset_cnum == B_TRUE)
		mmu_ctxp->mmu_cnum = NUM_LOCKED_CTXS;

done:
	mutex_exit(&mmu_ctxp->mmu_lock);
}
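/*
 * Numerical sketch of the wrap-around (illustrative values): with
 * NUM_LOCKED_CTXS == 2 and an MMU that supports mmu_nctxs == 8192 contexts,
 * cnums 2..8191 are handed out in order.  When mmu_cnum reaches mmu_nctxs
 * the next allocation wraps: mmu_gnum is bumped (say from 3 to 4), the other
 * cpus sharing the MMU are cross-called down to INVALID_CONTEXT, user TLB
 * entries are flushed, and cnum restarts at NUM_LOCKED_CTXS.  Software
 * records each allocation as a (gnum, cnum) pair, so a hat still holding
 * (3, n) is recognized as stale and must re-allocate rather than reuse n.
 */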
/*
 * For multi-threaded process, set the process context to INVALID_CONTEXT
 * so that it faults and reloads the MMU state from TL=0.  For single-threaded
 * process, we can just load the MMU state directly without having to
 * set context invalid.  Caller must hold the hat lock since we don't
 * acquire it here.
 */
static void
sfmmu_sync_mmustate(sfmmu_t *sfmmup)
{
	uint_t cnum;
	uint_t pstate_save;

	ASSERT(sfmmup != ksfmmup);
	ASSERT(sfmmu_hat_lock_held(sfmmup));

	kpreempt_disable();

	/*
	 * We check whether the passed-in sfmmup is the same as the
	 * current running proc.  This is to make sure the current proc
	 * stays single-threaded if it already is.
	 */
	if ((sfmmup == curthread->t_procp->p_as->a_hat) &&
	    (curthread->t_procp->p_lwpcnt == 1)) {
		/* single-thread */
		cnum = sfmmup->sfmmu_ctxs[CPU_MMU_IDX(CPU)].cnum;
		if (cnum != INVALID_CONTEXT) {
			uint_t curcnum;
			/*
			 * Disable interrupts to prevent race condition
			 * with sfmmu_ctx_wrap_around ctx invalidation.
			 * In sun4v, ctx invalidation involves setting
			 * TSB to NULL, hence, interrupts should be disabled
			 * until after sfmmu_load_mmustate is complete.
			 */
			pstate_save = sfmmu_disable_intrs();
			curcnum = sfmmu_getctx_sec();
			if (curcnum == cnum)
				sfmmu_load_mmustate(sfmmup);
			sfmmu_enable_intrs(pstate_save);
			ASSERT(curcnum == cnum || curcnum == INVALID_CONTEXT);
		}
	} else {
		/*
		 * multi-thread,
		 * or when sfmmup is not the same as the curproc.
		 */
		sfmmu_invalidate_ctx(sfmmup);
	}

	kpreempt_enable();
}
9812 * Replace the specified TSB with a new TSB. This function gets called when
9813 * we grow, shrink or swapin a TSB. When swapping in a TSB (TSB_SWAPIN), the
9814 * TSB_FORCEALLOC flag may be used to force allocation of a minimum-sized TSB
9817 * Caller must hold the HAT lock, but should assume any tsb_info
9818 * pointers it has are no longer valid after calling this function.
9821 * TSB_ALLOCFAIL Failed to allocate a TSB, due to memory constraints
9822 * TSB_LOSTRACE HAT is busy, i.e. another thread is already doing
9823 * something to this tsbinfo/TSB
9824 * TSB_SUCCESS Operation succeeded
9826 static tsb_replace_rc_t
9827 sfmmu_replace_tsb(sfmmu_t
*sfmmup
, struct tsb_info
*old_tsbinfo
, uint_t szc
,
9828 hatlock_t
*hatlockp
, uint_t flags
)
9830 struct tsb_info
*new_tsbinfo
= NULL
;
9831 struct tsb_info
*curtsb
, *prevtsb
;
9835 ASSERT(sfmmup
!= ksfmmup
);
9836 ASSERT(sfmmup
->sfmmu_ismhat
== 0);
9837 ASSERT(sfmmu_hat_lock_held(sfmmup
));
9838 ASSERT(szc
<= tsb_max_growsize
);
9840 if (SFMMU_FLAGS_ISSET(sfmmup
, HAT_BUSY
))
9841 return (TSB_LOSTRACE
);
9844 * Find the tsb_info ahead of this one in the list, and
9845 * also make sure that the tsb_info passed in really
9848 for (prevtsb
= NULL
, curtsb
= sfmmup
->sfmmu_tsb
;
9849 curtsb
!= old_tsbinfo
&& curtsb
!= NULL
;
9850 prevtsb
= curtsb
, curtsb
= curtsb
->tsb_next
)
9852 ASSERT(curtsb
!= NULL
);
9854 if (!(flags
& TSB_SWAPIN
) && SFMMU_FLAGS_ISSET(sfmmup
, HAT_SWAPPED
)) {
9856 * The process is swapped out, so just set the new size
9857 * code. When it swaps back in, we'll allocate a new one
9858 * of the new chosen size.
9860 curtsb
->tsb_szc
= szc
;
9861 return (TSB_SUCCESS
);
9863 SFMMU_FLAGS_SET(sfmmup
, HAT_BUSY
);
9865 tte_sz_mask
= old_tsbinfo
->tsb_ttesz_mask
;
9868 * All initialization is done inside of sfmmu_tsbinfo_alloc().
9869 * If we fail to allocate a TSB, exit.
9871 * If tsb grows with new tsb size > 4M and old tsb size < 4M,
9872 * then try 4M slab after the initial alloc fails.
9874 * If tsb swapin with tsb size > 4M, then try 4M after the
9875 * initial alloc fails.
9877 sfmmu_hat_exit(hatlockp
);
9878 if (sfmmu_tsbinfo_alloc(&new_tsbinfo
, szc
,
9879 tte_sz_mask
, flags
, sfmmup
) &&
9880 (!(flags
& (TSB_GROW
| TSB_SWAPIN
)) || (szc
<= TSB_4M_SZCODE
) ||
9881 (!(flags
& TSB_SWAPIN
) &&
9882 (old_tsbinfo
->tsb_szc
>= TSB_4M_SZCODE
)) ||
9883 sfmmu_tsbinfo_alloc(&new_tsbinfo
, TSB_4M_SZCODE
,
9884 tte_sz_mask
, flags
, sfmmup
))) {
9885 (void) sfmmu_hat_enter(sfmmup
);
9886 if (!(flags
& TSB_SWAPIN
))
9887 SFMMU_STAT(sf_tsb_resize_failures
);
9888 SFMMU_FLAGS_CLEAR(sfmmup
, HAT_BUSY
);
9889 return (TSB_ALLOCFAIL
);
9891 (void) sfmmu_hat_enter(sfmmup
);
9894 * Re-check to make sure somebody else didn't muck with us while we
9895 * didn't hold the HAT lock. If the process swapped out, fine, just
9896 * exit; this can happen if we try to shrink the TSB from the context
9897 * of another process (such as on an ISM unmap), though it is rare.
9899 if (!(flags
& TSB_SWAPIN
) && SFMMU_FLAGS_ISSET(sfmmup
, HAT_SWAPPED
)) {
9900 SFMMU_STAT(sf_tsb_resize_failures
);
9901 SFMMU_FLAGS_CLEAR(sfmmup
, HAT_BUSY
);
9902 sfmmu_hat_exit(hatlockp
);
9903 sfmmu_tsbinfo_free(new_tsbinfo
);
9904 (void) sfmmu_hat_enter(sfmmup
);
9905 return (TSB_LOSTRACE
);
9909 /* Reverify that the tsb_info still exists.. for debugging only */
9910 for (prevtsb
= NULL
, curtsb
= sfmmup
->sfmmu_tsb
;
9911 curtsb
!= old_tsbinfo
&& curtsb
!= NULL
;
9912 prevtsb
= curtsb
, curtsb
= curtsb
->tsb_next
)
9914 ASSERT(curtsb
!= NULL
);
9918 * Quiesce any CPUs running this process on their next TLB miss
9919 * so they atomically see the new tsb_info. We temporarily set the
9920 * context to invalid context so new threads that come on processor
9921 * after we do the xcall to cpusran will also serialize behind the
9922 * HAT lock on TLB miss and will see the new TSB. Since this short
9923 * race with a new thread coming on processor is relatively rare,
9924 * this synchronization mechanism should be cheaper than always
9925 * pausing all CPUs for the duration of the setup, which is what
9926 * the old implementation did. This is particuarly true if we are
9927 * copying a huge chunk of memory around during that window.
9929 * The memory barriers are to make sure things stay consistent
9930 * with resume() since it does not hold the HAT lock while
9931 * walking the list of tsb_info structures.
9933 if ((flags
& TSB_SWAPIN
) != TSB_SWAPIN
) {
9934 /* The TSB is either growing or shrinking. */
9935 sfmmu_invalidate_ctx(sfmmup
);
9938 * It is illegal to swap in TSBs from a process other
9939 * than a process being swapped in. This in turn
9940 * implies we do not have a valid MMU context here
9941 * since a process needs one to resolve translation
9944 ASSERT(curthread
->t_procp
->p_as
->a_hat
== sfmmup
);
9948 ASSERT(max_mmu_ctxdoms
> 0);
9951 * Process should have INVALID_CONTEXT on all MMUs
9953 for (i
= 0; i
< max_mmu_ctxdoms
; i
++) {
9955 ASSERT(sfmmup
->sfmmu_ctxs
[i
].cnum
== INVALID_CONTEXT
);
9959 new_tsbinfo
->tsb_next
= old_tsbinfo
->tsb_next
;
9960 membar_stst(); /* strict ordering required */
9962 prevtsb
->tsb_next
= new_tsbinfo
;
9964 sfmmup
->sfmmu_tsb
= new_tsbinfo
;
9965 membar_enter(); /* make sure new TSB globally visible */
9968 * We need to migrate TSB entries from the old TSB to the new TSB
9969 * if tsb_remap_ttes is set and the TSB is growing.
9971 if (tsb_remap_ttes
&& ((flags
& TSB_GROW
) == TSB_GROW
))
9972 sfmmu_copy_tsb(old_tsbinfo
, new_tsbinfo
);
9974 SFMMU_FLAGS_CLEAR(sfmmup
, HAT_BUSY
);
9977 * Drop the HAT lock to free our old tsb_info.
9979 sfmmu_hat_exit(hatlockp
);
9981 if ((flags
& TSB_GROW
) == TSB_GROW
) {
9982 SFMMU_STAT(sf_tsb_grow
);
9983 } else if ((flags
& TSB_SHRINK
) == TSB_SHRINK
) {
9984 SFMMU_STAT(sf_tsb_shrink
);
9987 sfmmu_tsbinfo_free(old_tsbinfo
);
9989 (void) sfmmu_hat_enter(sfmmup
);
9990 return (TSB_SUCCESS
);
/*
 * This function will re-program hat pgsz array, and invalidate the
 * process' context, forcing the process to switch to another
 * context on the next TLB miss, and therefore start using the
 * TLB that is reprogrammed for the new page sizes.
 */
void
sfmmu_reprog_pgsz_arr(sfmmu_t *sfmmup, uint8_t *tmp_pgsz)
{
	int i;
	hatlock_t *hatlockp = NULL;

	hatlockp = sfmmu_hat_enter(sfmmup);
	/* USIII+-IV+ optimization, requires hat lock */
	if (tmp_pgsz) {
		for (i = 0; i < mmu_page_sizes; i++)
			sfmmup->sfmmu_pgsz[i] = tmp_pgsz[i];
	}
	SFMMU_STAT(sf_tlb_reprog_pgsz);

	sfmmu_invalidate_ctx(sfmmup);

	sfmmu_hat_exit(hatlockp);
}
/*
 * The scd_rttecnt field in the SCD must be updated to take account of the
 * regions which it contains.
 */
static void
sfmmu_set_scd_rttecnt(sf_srd_t *srdp, sf_scd_t *scdp)
{
	uint_t rid;
	uint_t i, j;
	ulong_t w;
	sf_region_t *rgnp;

	ASSERT(srdp != NULL);

	for (i = 0; i < SFMMU_HMERGNMAP_WORDS; i++) {
		if ((w = scdp->scd_region_map.bitmap[i]) == 0) {
			continue;
		}

		j = 0;
		while (w) {
			if (!(w & 0x1)) {
				j++;
				w >>= 1;
				continue;
			}
			rid = (i << BT_ULSHIFT) | j;
			j++;
			w >>= 1;

			ASSERT(SFMMU_IS_SHMERID_VALID(rid));
			ASSERT(rid < SFMMU_MAX_HME_REGIONS);
			rgnp = srdp->srd_hmergnp[rid];
			ASSERT(rgnp->rgn_refcnt > 0);
			ASSERT(rgnp->rgn_id == rid);

			scdp->scd_rttecnt[rgnp->rgn_pgszc] +=
			    rgnp->rgn_size >> TTE_PAGE_SHIFT(rgnp->rgn_pgszc);

			/*
			 * Maintain the tsb0 inflation cnt for the regions
			 * in the SCD.
			 */
			if (rgnp->rgn_pgszc >= TTE4M) {
				scdp->scd_sfmmup->sfmmu_tsb0_4minflcnt +=
				    rgnp->rgn_size >>
				    (TTE_PAGE_SHIFT(TTE8K) + 2);
			}
		}
	}
}
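/*
 * Arithmetic sketch for the update above (hypothetical region, 8K base
 * page): a 256MB region with rgn_pgszc == TTE4M adds
 * 256MB >> TTE_PAGE_SHIFT(TTE4M) == 64 ttes to scd_rttecnt[TTE4M]; being
 * >= TTE4M it also adds 256MB >> (TTE_PAGE_SHIFT(TTE8K) + 2) == 8192 to
 * sfmmu_tsb0_4minflcnt, i.e. one 8K tte per four 8K pages of the region,
 * reserving first-TSB headroom in case large page allocation for the
 * region falls back to small pages.
 */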
/*
 * This function assumes that there are either four or six supported page
 * sizes and at most two programmable TLBs, so we need to decide which
 * page sizes are most important and then tell the MMU layer so it
 * can adjust the TLB page sizes accordingly (if supported).
 *
 * If these assumptions change, this function will need to be
 * updated to support whatever the new limits are.
 *
 * The growing flag is nonzero if we are growing the address space,
 * and zero if it is shrinking.  This allows us to decide whether
 * to grow or shrink our TSB, depending upon available memory
 * conditions.
 */
static void
sfmmu_check_page_sizes(sfmmu_t *sfmmup, int growing)
{
	uint64_t ttecnt[MMU_PAGE_SIZES];
	uint64_t tte8k_cnt, tte4m_cnt;
	uint8_t i;
	int sectsb_thresh;

	/*
	 * Kernel threads, processes with small address spaces not using
	 * large pages, and dummy ISM HATs need not apply.
	 */
	if (sfmmup == ksfmmup || sfmmup->sfmmu_ismhat != NULL)
		return;

	if (!SFMMU_LGPGS_INUSE(sfmmup) &&
	    sfmmup->sfmmu_ttecnt[TTE8K] <= tsb_rss_factor)
		return;

	for (i = 0; i < mmu_page_sizes; i++) {
		ttecnt[i] = sfmmup->sfmmu_ttecnt[i] +
		    sfmmup->sfmmu_ismttecnt[i];
	}

	/* Check pagesizes in use, and possibly reprogram DTLB. */
	if (&mmu_check_page_sizes)
		mmu_check_page_sizes(sfmmup, ttecnt);

	/*
	 * Calculate the number of 8k ttes to represent the span of these
	 * pages.
	 */
	tte8k_cnt = ttecnt[TTE8K] +
	    (ttecnt[TTE64K] << (MMU_PAGESHIFT64K - MMU_PAGESHIFT)) +
	    (ttecnt[TTE512K] << (MMU_PAGESHIFT512K - MMU_PAGESHIFT));
	if (mmu_page_sizes == max_mmu_page_sizes) {
		tte4m_cnt = ttecnt[TTE4M] +
		    (ttecnt[TTE32M] << (MMU_PAGESHIFT32M - MMU_PAGESHIFT4M)) +
		    (ttecnt[TTE256M] << (MMU_PAGESHIFT256M - MMU_PAGESHIFT4M));
	} else {
		tte4m_cnt = ttecnt[TTE4M];
	}

	/*
	 * Inflate tte8k_cnt to allow for region large page allocation failure.
	 */
	tte8k_cnt += sfmmup->sfmmu_tsb0_4minflcnt;

	/*
	 * Inflate TSB sizes by a factor of 2 if this process
	 * uses 4M text pages to minimize extra conflict misses
	 * in the first TSB since without counting text pages
	 * 8K TSB may become too small.
	 *
	 * Also double the size of the second TSB to minimize
	 * extra conflict misses due to competition between 4M text pages
	 * and data pages.
	 *
	 * We need to adjust the second TSB allocation threshold by the
	 * inflation factor, since there is no point in creating a second
	 * TSB when we know all the mappings can fit in the I/D TLBs.
	 */
	sectsb_thresh = tsb_sectsb_threshold;
	if (sfmmup->sfmmu_flags & HAT_4MTEXT_FLAG) {
		tte8k_cnt <<= 1;
		tte4m_cnt <<= 1;
		sectsb_thresh <<= 1;
	}

	/*
	 * Check to see if our TSB is the right size; we may need to
	 * grow or shrink it.  If the process is small, our work is
	 * finished at this point.
	 */
	if (tte8k_cnt <= tsb_rss_factor && tte4m_cnt <= sectsb_thresh) {
		return;
	}
	sfmmu_size_tsb(sfmmup, growing, tte8k_cnt, tte4m_cnt, sectsb_thresh);
}
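/*
 * Worked example of the counts used above (hypothetical numbers): a process
 * with 1000 8K ttes, 10 64K ttes and 2 512K ttes has
 * tte8k_cnt = 1000 + (10 << 3) + (2 << 6) = 1208, i.e. its span expressed
 * in 8K pages.  With 3 4M ttes and one 32M tte on a platform supporting all
 * page sizes, tte4m_cnt = 3 + (1 << 3) = 11.  These counts, after the
 * inflation adjustments, drive the grow/shrink decision in sfmmu_size_tsb().
 */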
10165 sfmmu_size_tsb(sfmmu_t
*sfmmup
, int growing
, uint64_t tte8k_cnt
,
10166 uint64_t tte4m_cnt
, int sectsb_thresh
)
10170 struct tsb_info
*tsbinfop
;
10171 hatlock_t
*hatlockp
= NULL
;
10173 hatlockp
= sfmmu_hat_enter(sfmmup
);
10174 ASSERT(hatlockp
!= NULL
);
10175 tsbinfop
= sfmmup
->sfmmu_tsb
;
10176 ASSERT(tsbinfop
!= NULL
);
10179 * If we're growing, select the size based on RSS. If we're
10180 * shrinking, leave some room so we don't have to turn around and
10181 * grow again immediately.
10184 tsb_szc
= SELECT_TSB_SIZECODE(tte8k_cnt
);
10186 tsb_szc
= SELECT_TSB_SIZECODE(tte8k_cnt
<< 1);
10188 if (!growing
&& (tsb_szc
< tsbinfop
->tsb_szc
) &&
10189 (tsb_szc
>= default_tsb_size
) && TSB_OK_SHRINK()) {
10190 (void) sfmmu_replace_tsb(sfmmup
, tsbinfop
, tsb_szc
,
10191 hatlockp
, TSB_SHRINK
);
10192 } else if (growing
&& tsb_szc
> tsbinfop
->tsb_szc
&& TSB_OK_GROW()) {
10193 (void) sfmmu_replace_tsb(sfmmup
, tsbinfop
, tsb_szc
,
10194 hatlockp
, TSB_GROW
);
10196 tsbinfop
= sfmmup
->sfmmu_tsb
;
10199 * With the TLB and first TSB out of the way, we need to see if
10200 * we need a second TSB for 4M pages. If we managed to reprogram
10201 * the TLB page sizes above, the process will start using this new
10202 * TSB right away; otherwise, it will start using it on the next
10203 * context switch. Either way, it's no big deal so there's no
10204 * synchronization with the trap handlers here unless we grow the
10205 * TSB (in which case it's required to prevent using the old one
10206 * after it's freed). Note: second tsb is required for 32M/256M
10209 if (tte4m_cnt
> sectsb_thresh
) {
10211 * If we're growing, select the size based on RSS. If we're
10212 * shrinking, leave some room so we don't have to turn
10213 * around and grow again immediately.
10216 tsb_szc
= SELECT_TSB_SIZECODE(tte4m_cnt
);
10218 tsb_szc
= SELECT_TSB_SIZECODE(tte4m_cnt
<< 1);
10219 if (tsbinfop
->tsb_next
== NULL
) {
10220 struct tsb_info
*newtsb
;
10221 int allocflags
= SFMMU_FLAGS_ISSET(sfmmup
, HAT_SWAPPED
)?
10224 sfmmu_hat_exit(hatlockp
);
10227 * Try to allocate a TSB for 4[32|256]M pages. If we
10228 * can't get the size we want, retry w/a minimum sized
10229 * TSB. If that still didn't work, give up; we can
10230 * still run without one.
10232 tsb_bits
= (mmu_page_sizes
== max_mmu_page_sizes
)?
10233 TSB4M
|TSB32M
|TSB256M
:TSB4M
;
10234 if ((sfmmu_tsbinfo_alloc(&newtsb
, tsb_szc
, tsb_bits
,
10235 allocflags
, sfmmup
)) &&
10236 (tsb_szc
<= TSB_4M_SZCODE
||
10237 sfmmu_tsbinfo_alloc(&newtsb
, TSB_4M_SZCODE
,
10238 tsb_bits
, allocflags
, sfmmup
)) &&
10239 sfmmu_tsbinfo_alloc(&newtsb
, TSB_MIN_SZCODE
,
10240 tsb_bits
, allocflags
, sfmmup
)) {
10244 hatlockp
= sfmmu_hat_enter(sfmmup
);
10246 sfmmu_invalidate_ctx(sfmmup
);
10248 if (sfmmup
->sfmmu_tsb
->tsb_next
== NULL
) {
10249 sfmmup
->sfmmu_tsb
->tsb_next
= newtsb
;
10250 SFMMU_STAT(sf_tsb_sectsb_create
);
10251 sfmmu_hat_exit(hatlockp
);
10255 * It's annoying, but possible for us
10256 * to get here.. we dropped the HAT lock
10257 * because of locking order in the kmem
10258 * allocator, and while we were off getting
10259 * our memory, some other thread decided to
10260 * do us a favor and won the race to get a
10261 * second TSB for this process. Sigh.
10263 sfmmu_hat_exit(hatlockp
);
10264 sfmmu_tsbinfo_free(newtsb
);
10270 * We have a second TSB, see if it's big enough.
10272 tsbinfop
= tsbinfop
->tsb_next
;
10275 * Check to see if our second TSB is the right size;
10276 * we may need to grow or shrink it.
10277 * To prevent thrashing (e.g. growing the TSB on a
10278 * subsequent map operation), only try to shrink if
10279 * the TSB reach exceeds twice the virtual address
10282 if (!growing
&& (tsb_szc
< tsbinfop
->tsb_szc
) &&
10283 (tsb_szc
>= default_tsb_size
) && TSB_OK_SHRINK()) {
10284 (void) sfmmu_replace_tsb(sfmmup
, tsbinfop
,
10285 tsb_szc
, hatlockp
, TSB_SHRINK
);
10286 } else if (growing
&& tsb_szc
> tsbinfop
->tsb_szc
&&
10288 (void) sfmmu_replace_tsb(sfmmup
, tsbinfop
,
10289 tsb_szc
, hatlockp
, TSB_GROW
);
10293 sfmmu_hat_exit(hatlockp
);
10298 * Since the sfmmu is currently embedded in the hat struct we simply zero
10299 * out our fields and free up the ism map blk list if any.
10302 sfmmu_free_sfmmu(sfmmu_t
*sfmmup
)
10304 ism_blk_t
*blkp
, *nx_blkp
;
10310 ASSERT(sfmmup
->sfmmu_ttecnt
[TTE8K
] == 0);
10311 ASSERT(sfmmup
->sfmmu_ttecnt
[TTE64K
] == 0);
10312 ASSERT(sfmmup
->sfmmu_ttecnt
[TTE512K
] == 0);
10313 ASSERT(sfmmup
->sfmmu_ttecnt
[TTE4M
] == 0);
10314 ASSERT(sfmmup
->sfmmu_ttecnt
[TTE32M
] == 0);
10315 ASSERT(sfmmup
->sfmmu_ttecnt
[TTE256M
] == 0);
10316 ASSERT(SF_RGNMAP_ISNULL(sfmmup
));
10318 sfmmup
->sfmmu_free
= 0;
10319 sfmmup
->sfmmu_ismhat
= 0;
10321 blkp
= sfmmup
->sfmmu_iblk
;
10322 sfmmup
->sfmmu_iblk
= NULL
;
10326 map
= blkp
->iblk_maps
;
10327 for (i
= 0; i
< ISM_MAP_SLOTS
; i
++) {
10328 ASSERT(map
[i
].imap_seg
== 0);
10329 ASSERT(map
[i
].imap_ismhat
== NULL
);
10330 ASSERT(map
[i
].imap_ment
== NULL
);
10333 nx_blkp
= blkp
->iblk_next
;
10334 blkp
->iblk_next
= NULL
;
10335 blkp
->iblk_nextpa
= (uint64_t)-1;
10336 kmem_cache_free(ism_blk_cache
, blkp
);
10342 * Locking primitves accessed by HATLOCK macros
10345 #define SFMMU_SPL_MTX (0x0)
10346 #define SFMMU_ML_MTX (0x1)
10348 #define SFMMU_MLSPL_MTX(type, pg) (((type) == SFMMU_SPL_MTX) ? \
10349 SPL_HASH(pg) : MLIST_HASH(pg))
10352 sfmmu_page_enter(struct page
*pp
)
10354 return (sfmmu_mlspl_enter(pp
, SFMMU_SPL_MTX
));
10358 sfmmu_page_exit(kmutex_t
*spl
)
10364 sfmmu_page_spl_held(struct page
*pp
)
10366 return (sfmmu_mlspl_held(pp
, SFMMU_SPL_MTX
));
10370 sfmmu_mlist_enter(struct page
*pp
)
10372 return (sfmmu_mlspl_enter(pp
, SFMMU_ML_MTX
));
10376 sfmmu_mlist_exit(kmutex_t
*mml
)
10382 sfmmu_mlist_held(struct page
*pp
)
10385 return (sfmmu_mlspl_held(pp
, SFMMU_ML_MTX
));
10389 * Common code for sfmmu_mlist_enter() and sfmmu_page_enter(). For
10390 * sfmmu_mlist_enter() case mml_table lock array is used and for
10391 * sfmmu_page_enter() sfmmu_page_lock lock array is used.
10393 * The lock is taken on a root page so that it protects an operation on all
10394 * constituent pages of a large page pp belongs to.
10396 * The routine takes a lock from the appropriate array. The lock is determined
10397 * by hashing the root page. After taking the lock this routine checks if the
10398 * root page has the same size code that was used to determine the root (i.e
10399 * that root hasn't changed). If root page has the expected p_szc field we
10400 * have the right lock and it's returned to the caller. If root's p_szc
10401 * decreased we release the lock and retry from the beginning. This case can
10402 * happen due to hat_page_demote() decreasing p_szc between our load of p_szc
10403 * value and taking the lock. The number of retries due to p_szc decrease is
10404 * limited by the maximum p_szc value. If p_szc is 0 we return the lock
10405 * determined by hashing pp itself.
10407 * If our caller doesn't hold a SE_SHARED or SE_EXCL lock on pp it's also
10408 * possible that p_szc can increase. To increase p_szc a thread has to lock
10409 * all constituent pages EXCL and do hat_pageunload() on all of them. All the
10410 * callers that don't hold a page locked recheck if hmeblk through which pp
10411 * was found still maps this pp. If it doesn't map it anymore returned lock
10412 * is immediately dropped. Therefore if sfmmu_mlspl_enter() hits the case of
10413 * p_szc increase after taking the lock it returns this lock without further
10414 * retries because in this case the caller doesn't care about which lock was
10415 * taken. The caller will drop it right away.
10417 * After the routine returns it's guaranteed that hat_page_demote() can't
10418 * change p_szc field of any of constituent pages of a large page pp belongs
10419 * to as long as pp was either locked at least SHARED prior to this call or
10420 * the caller finds that hment that pointed to this pp still references this
10421 * pp (this also assumes that the caller holds hme hash bucket lock so that
10422 * the same pp can't be remapped into the same hmeblk after it was unmapped by
10423 * hat_pageunload()).
10426 sfmmu_mlspl_enter(struct page
*pp
, int type
)
10429 uint_t prev_rszc
= UINT_MAX
;
10433 uint_t pszc
= pp
->p_szc
;
10435 ASSERT(pp
!= NULL
);
10439 mtx
= SFMMU_MLSPL_MTX(type
, pp
);
10444 /* The lock lives in the root page */
10445 rootpp
= PP_GROUPLEADER(pp
, pszc
);
10446 mtx
= SFMMU_MLSPL_MTX(type
, rootpp
);
10450 * Return mml in the following 3 cases:
10452 * 1) If pp itself is root since if its p_szc decreased before we took
10453 * the lock pp is still the root of smaller szc page. And if its p_szc
10454 * increased it doesn't matter what lock we return (see comment in
10455 * front of this routine).
10457 * 2) If pp's not root but rootpp is the root of a rootpp->p_szc size
10458 * large page we have the right lock since any previous potential
10459 * hat_page_demote() is done demoting from greater than current root's
10460 * p_szc because hat_page_demote() changes root's p_szc last. No
10461 * further hat_page_demote() can start or be in progress since it
10462 * would need the same lock we currently hold.
10464 * 3) If rootpp's p_szc increased since previous iteration it doesn't
10465 * matter what lock we return (see comment in front of this routine).
10467 if (pp
== rootpp
|| (rszc
= rootpp
->p_szc
) == pszc
||
10468 rszc
>= prev_rszc
) {
10473 * hat_page_demote() could have decreased root's p_szc.
10474 * In this case pp's p_szc must also be smaller than pszc.
10485 * pp's p_szc increased after it was decreased.
10486 * page cannot be mapped. Return current lock. The caller
10487 * will drop it right away.
10493 * root's p_szc is greater than pp's p_szc.
10494 * hat_page_demote() is not done with all pages
10495 * yet. Wait for it to complete.
10498 rootpp
= PP_GROUPLEADER(rootpp
, rszc
);
10499 mtx
= SFMMU_MLSPL_MTX(type
, rootpp
);
10507 sfmmu_mlspl_held(struct page
*pp
, int type
)
10511 ASSERT(pp
!= NULL
);
10512 /* The lock lives in the root page */
10513 pp
= PP_PAGEROOT(pp
);
10514 ASSERT(pp
!= NULL
);
10516 mtx
= SFMMU_MLSPL_MTX(type
, pp
);
10517 return (MUTEX_HELD(mtx
));
10521 sfmmu_get_free_hblk(struct hme_blk
**hmeblkpp
, uint_t critical
)
10523 struct hme_blk
*hblkp
;
10526 if (freehblkp
!= NULL
) {
10527 mutex_enter(&freehblkp_lock
);
10528 if (freehblkp
!= NULL
) {
10530 * If the current thread is owning hblk_reserve OR
10531 * critical request from sfmmu_hblk_steal()
10532 * let it succeed even if freehblkcnt is really low.
10534 if (freehblkcnt
<= HBLK_RESERVE_MIN
&& !critical
) {
10535 SFMMU_STAT(sf_get_free_throttle
);
10536 mutex_exit(&freehblkp_lock
);
10540 *hmeblkpp
= freehblkp
;
10542 freehblkp
= hblkp
->hblk_next
;
10543 mutex_exit(&freehblkp_lock
);
10544 hblkp
->hblk_next
= NULL
;
10545 SFMMU_STAT(sf_get_free_success
);
10547 ASSERT(hblkp
->hblk_hmecnt
== 0);
10548 ASSERT(hblkp
->hblk_vcnt
== 0);
10549 ASSERT(hblkp
->hblk_nextpa
== va_to_pa((caddr_t
)hblkp
));
10553 mutex_exit(&freehblkp_lock
);
10556 /* Check cpu hblk pending queues */
10557 if ((*hmeblkpp
= sfmmu_check_pending_hblks(TTE8K
)) != NULL
) {
10559 hblkp
->hblk_next
= NULL
;
10560 hblkp
->hblk_nextpa
= va_to_pa((caddr_t
)hblkp
);
10562 ASSERT(hblkp
->hblk_hmecnt
== 0);
10563 ASSERT(hblkp
->hblk_vcnt
== 0);
10568 SFMMU_STAT(sf_get_free_fail
);
10573 sfmmu_put_free_hblk(struct hme_blk
*hmeblkp
, uint_t critical
)
10575 struct hme_blk
*hblkp
;
10577 ASSERT(hmeblkp
->hblk_hmecnt
== 0);
10578 ASSERT(hmeblkp
->hblk_vcnt
== 0);
10579 ASSERT(hmeblkp
->hblk_nextpa
== va_to_pa((caddr_t
)hmeblkp
));
10582 * If the current thread is mapping into kernel space,
10583 * let it succede even if freehblkcnt is max
10584 * so that it will avoid freeing it to kmem.
10585 * This will prevent stack overflow due to
10586 * possible recursion since kmem_cache_free()
10587 * might require creation of a slab which
10588 * in turn needs an hmeblk to map that slab;
10589 * let's break this vicious chain at the first
10592 if (freehblkcnt
< HBLK_RESERVE_CNT
|| critical
) {
10593 mutex_enter(&freehblkp_lock
);
10594 if (freehblkcnt
< HBLK_RESERVE_CNT
|| critical
) {
10595 SFMMU_STAT(sf_put_free_success
);
10597 hmeblkp
->hblk_next
= freehblkp
;
10598 freehblkp
= hmeblkp
;
10599 mutex_exit(&freehblkp_lock
);
10602 mutex_exit(&freehblkp_lock
);
10606 * Bring down freehblkcnt to HBLK_RESERVE_CNT. We are here
10607 * only if freehblkcnt is at least HBLK_RESERVE_CNT *and*
10608 * we are not in the process of mapping into kernel space.
10611 while (freehblkcnt
> HBLK_RESERVE_CNT
) {
10612 mutex_enter(&freehblkp_lock
);
10613 if (freehblkcnt
> HBLK_RESERVE_CNT
) {
10616 freehblkp
= hblkp
->hblk_next
;
10617 mutex_exit(&freehblkp_lock
);
10618 ASSERT(get_hblk_cache(hblkp
) == sfmmu8_cache
);
10619 kmem_cache_free(sfmmu8_cache
, hblkp
);
10622 mutex_exit(&freehblkp_lock
);
10624 SFMMU_STAT(sf_put_free_fail
);
10629 sfmmu_hblk_swap(struct hme_blk
*new)
10631 struct hme_blk
*old
, *hblkp
, *prev
;
10633 caddr_t base
, vaddr
, endaddr
;
10634 struct hmehash_bucket
*hmebp
;
10635 struct sf_hment
*osfhme
, *nsfhme
;
10639 struct hme_blk
*list
= NULL
;
10642 hmeblk_tag hblktag
;
10643 struct hme_blk
*found
;
10645 old
= HBLK_RESERVE
;
10646 ASSERT(!old
->hblk_shared
);
10649 * save pa before bcopy clobbers it
10651 newpa
= new->hblk_nextpa
;
10653 base
= (caddr_t
)get_hblk_base(old
);
10654 endaddr
= base
+ get_hblk_span(old
);
10657 * acquire hash bucket lock.
10659 hmebp
= sfmmu_tteload_acquire_hashbucket(ksfmmup
, base
, TTE8K
,
10660 SFMMU_INVALID_SHMERID
);
10663 * copy contents from old to new
10665 bcopy((void *)old
, (void *)new, HME8BLK_SZ
);
10668 * add new to hash chain
10670 sfmmu_hblk_hash_add(hmebp
, new, newpa
);
10673 * search hash chain for hblk_reserve; this needs to be performed
10674 * after adding new, otherwise prev won't correspond to the hblk which
10675 * is prior to old in hash chain when we call sfmmu_hblk_hash_rm to
10676 * remove old later.
10679 hblkp
= hmebp
->hmeblkp
; hblkp
!= NULL
&& hblkp
!= old
;
10680 prev
= hblkp
, hblkp
= hblkp
->hblk_next
)
10684 panic("sfmmu_hblk_swap: hblk_reserve not found");
10687 * p_mapping list is still pointing to hments in hblk_reserve;
10688 * fix up p_mapping list so that they point to hments in new.
10690 * Since all these mappings are created by hblk_reserve_thread
10691 * on the way and it's using at least one of the buffers from each of
10692 * the newly minted slabs, there is no danger of any of these
10693 * mappings getting unloaded by another thread.
10695 * tsbmiss could only modify ref/mod bits of hments in old/new.
10696 * Since all of these hments hold mappings established by segkmem
10697 * and mappings in segkmem are setup with HAT_NOSYNC, ref/mod bits
10698 * have no meaning for the mappings in hblk_reserve. hments in
10699 * old and new are identical except for ref/mod bits.
10701 for (vaddr
= base
; vaddr
< endaddr
; vaddr
+= TTEBYTES(TTE8K
)) {
10703 HBLKTOHME(osfhme
, old
, vaddr
);
10704 sfmmu_copytte(&osfhme
->hme_tte
, &tte
);
10706 if (TTE_IS_VALID(&tte
)) {
10707 if ((pp
= osfhme
->hme_page
) == NULL
)
10708 panic("sfmmu_hblk_swap: page not mapped");
10710 pml
= sfmmu_mlist_enter(pp
);
10712 if (pp
!= osfhme
->hme_page
)
10713 panic("sfmmu_hblk_swap: mapping changed");
10715 HBLKTOHME(nsfhme
, new, vaddr
);
10717 HME_ADD(nsfhme
, pp
);
10718 HME_SUB(osfhme
, pp
);
10720 sfmmu_mlist_exit(pml
);
10725 * remove old from hash chain
10727 sfmmu_hblk_hash_rm(hmebp
, old
, prev
, &list
, 1);
10731 hblktag
.htag_id
= ksfmmup
;
10732 hblktag
.htag_rid
= SFMMU_INVALID_SHMERID
;
10733 hblktag
.htag_bspage
= HME_HASH_BSPAGE(base
, HME_HASH_SHIFT(TTE8K
));
10734 hblktag
.htag_rehash
= HME_HASH_REHASH(TTE8K
);
10735 HME_HASH_FAST_SEARCH(hmebp
, hblktag
, found
);
10738 panic("sfmmu_hblk_swap: new hblk not found");
10741 SFMMU_HASH_UNLOCK(hmebp
);
10744 * Reset hblk_reserve
10746 bzero((void *)old
, HME8BLK_SZ
);
10747 old
->hblk_nextpa
= va_to_pa((caddr_t
)old
);
/*
 * Grab the mlist mutex for both pages passed in.
 *
 * low and high will be returned as pointers to the mutexes for these pages.
 * low refers to the mutex residing in the lower bin of the mlist hash, while
 * high refers to the mutex residing in the higher bin of the mlist hash.  This
 * is due to the locking order restrictions on the same thread grabbing
 * multiple mlist mutexes.  The low lock must be acquired before the high lock.
 *
 * If both pages hash to the same mutex, only grab that single mutex, and
 * high will be returned as NULL.
 * If the pages hash to different bins in the hash, grab the lower addressed
 * lock first and then the higher addressed lock in order to follow the locking
 * rules involved with the same thread grabbing multiple mlist mutexes.
 * low and high will both have non-NULL values.
 */
static void
sfmmu_mlist_reloc_enter(struct page *targ, struct page *repl,
    kmutex_t **low, kmutex_t **high)
{
	kmutex_t *mml_targ, *mml_repl;

	/*
	 * no need to do the dance around szc as in sfmmu_mlist_enter()
	 * because this routine is only called by hat_page_relocate() and all
	 * targ and repl pages are already locked EXCL so szc can't change.
	 */
	mml_targ = MLIST_HASH(PP_PAGEROOT(targ));
	mml_repl = MLIST_HASH(PP_PAGEROOT(repl));

	if (mml_targ == mml_repl) {
		*low = mml_targ;
		*high = NULL;
	} else {
		if (mml_targ < mml_repl) {
			*low = mml_targ;
			*high = mml_repl;
		} else {
			*low = mml_repl;
			*high = mml_targ;
		}
	}

	mutex_enter(*low);
	if (*high)
		mutex_enter(*high);
}

static void
sfmmu_mlist_reloc_exit(kmutex_t *low, kmutex_t *high)
{
	if (high)
		mutex_exit(high);
	mutex_exit(low);
}
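/*
 * Ordering sketch for the routines above (hypothetical hash results): if
 * MLIST_HASH() maps targ's root page to the lower-addressed bucket mutex
 * and repl's root to a higher-addressed bucket, the caller gets *low and
 * *high set accordingly and the locks are taken low-then-high.  A second
 * thread relocating the same pair in the opposite direction computes the
 * same low/high assignment, so both threads acquire the two mutexes in the
 * same order and cannot deadlock.  When both roots hash to one bucket,
 * *high is NULL and only a single mutex is taken.
 */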
10808 sfmmu_hat_enter(sfmmu_t
*sfmmup
)
10810 hatlock_t
*hatlockp
;
10812 if (sfmmup
!= ksfmmup
) {
10813 hatlockp
= TSB_HASH(sfmmup
);
10814 mutex_enter(HATLOCK_MUTEXP(hatlockp
));
10821 sfmmu_hat_tryenter(sfmmu_t
*sfmmup
)
10823 hatlock_t
*hatlockp
;
10825 if (sfmmup
!= ksfmmup
) {
10826 hatlockp
= TSB_HASH(sfmmup
);
10827 if (mutex_tryenter(HATLOCK_MUTEXP(hatlockp
)) == 0)
10835 sfmmu_hat_exit(hatlock_t
*hatlockp
)
10837 if (hatlockp
!= NULL
)
10838 mutex_exit(HATLOCK_MUTEXP(hatlockp
));
10842 sfmmu_hat_lock_all(void)
10845 for (i
= 0; i
< SFMMU_NUM_LOCK
; i
++)
10846 mutex_enter(HATLOCK_MUTEXP(&hat_lock
[i
]));
10850 sfmmu_hat_unlock_all(void)
10853 for (i
= SFMMU_NUM_LOCK
- 1; i
>= 0; i
--)
10854 mutex_exit(HATLOCK_MUTEXP(&hat_lock
[i
]));
10858 sfmmu_hat_lock_held(sfmmu_t
*sfmmup
)
10860 ASSERT(sfmmup
!= ksfmmup
);
10861 return (MUTEX_HELD(HATLOCK_MUTEXP(TSB_HASH(sfmmup
))));
/*
 * Locking primitives to provide consistency between ISM unmap
 * and other operations.  Since ISM unmap can take a long time, we
 * use HAT_ISMBUSY flag (protected by the hatlock) to avoid creating
 * contention on the hatlock buckets while ISM segments are being
 * unmapped.  The tradeoff is that the flags don't prevent priority
 * inversion from occurring, so we must request kernel priority in
 * case we have to sleep to keep from getting buried while holding
 * the HAT_ISMBUSY flag set, which in turn could block other kernel
 * threads from running (for example, in sfmmu_uvatopfn()).
 */
static void
sfmmu_ismhat_enter(sfmmu_t *sfmmup, int hatlock_held)
{
	hatlock_t *hatlockp;

	THREAD_KPRI_REQUEST();
	if (!hatlock_held)
		hatlockp = sfmmu_hat_enter(sfmmup);
	while (SFMMU_FLAGS_ISSET(sfmmup, HAT_ISMBUSY))
		cv_wait(&sfmmup->sfmmu_tsb_cv, HATLOCK_MUTEXP(hatlockp));
	SFMMU_FLAGS_SET(sfmmup, HAT_ISMBUSY);
	if (!hatlock_held)
		sfmmu_hat_exit(hatlockp);
}

static void
sfmmu_ismhat_exit(sfmmu_t *sfmmup, int hatlock_held)
{
	hatlock_t *hatlockp;

	if (!hatlock_held)
		hatlockp = sfmmu_hat_enter(sfmmup);
	ASSERT(SFMMU_FLAGS_ISSET(sfmmup, HAT_ISMBUSY));
	SFMMU_FLAGS_CLEAR(sfmmup, HAT_ISMBUSY);
	cv_broadcast(&sfmmup->sfmmu_tsb_cv);
	if (!hatlock_held)
		sfmmu_hat_exit(hatlockp);
	THREAD_KPRI_RELEASE();
}
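/*
 * Typical pairing (a sketch, not a verbatim caller): an ISM unmap path does
 *
 *	sfmmu_ismhat_enter(sfmmup, 0);	sets HAT_ISMBUSY, may sleep
 *	... long-running ISM teardown without holding the hat lock ...
 *	sfmmu_ismhat_exit(sfmmup, 0);	clears the flag, wakes waiters
 *
 * Passing hatlock_held != 0 skips the internal sfmmu_hat_enter/exit when
 * the caller already holds the hat lock.
 */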
10909 * (1) if segkmem is not ready, allocate hblk from an array of pre-alloc'ed
10912 * (2) if we are allocating an hblk for mapping a slab in sfmmu_cache,
10914 * (a) try to return an hblk from reserve pool of free hblks;
10915 * (b) if the reserve pool is empty, acquire hblk_reserve_lock
10916 * and return hblk_reserve.
10918 * (3) call kmem_cache_alloc() to allocate hblk;
10920 * (a) if hblk_reserve_lock is held by the current thread,
10921 * atomically replace hblk_reserve by the hblk that is
10922 * returned by kmem_cache_alloc; release hblk_reserve_lock
10923 * and call kmem_cache_alloc() again.
10924 * (b) if reserve pool is not full, add the hblk that is
10925 * returned by kmem_cache_alloc to reserve pool and
10926 * call kmem_cache_alloc again.
10929 static struct hme_blk
*
10930 sfmmu_hblk_alloc(sfmmu_t
*sfmmup
, caddr_t vaddr
,
10931 struct hmehash_bucket
*hmebp
, uint_t size
, hmeblk_tag hblktag
,
10932 uint_t flags
, uint_t rid
)
10934 struct hme_blk
*hmeblkp
= NULL
;
10935 struct hme_blk
*newhblkp
;
10936 struct hme_blk
*shw_hblkp
= NULL
;
10937 struct kmem_cache
*sfmmu_cache
= NULL
;
10940 uint_t owner
; /* set to 1 if using hblk_reserve */
10946 ASSERT(SFMMU_HASH_LOCK_ISHELD(hmebp
));
10947 ASSERT(hblktag
.htag_rid
== rid
);
10948 SFMMU_VALIDATE_HMERID(sfmmup
, rid
, vaddr
, TTEBYTES(size
));
10949 ASSERT(!SFMMU_IS_SHMERID_VALID(rid
) ||
10950 IS_P2ALIGNED(vaddr
, TTEBYTES(size
)));
10953 * If segkmem is not created yet, allocate from static hmeblks
10954 * created at the end of startup_modules(). See the block comment
10955 * in startup_modules() describing how we estimate the number of
10956 * static hmeblks that will be needed during re-map.
10958 if (!hblk_alloc_dynamic
) {
10960 ASSERT(!SFMMU_IS_SHMERID_VALID(rid
));
10962 if (size
== TTE8K
) {
10963 index
= nucleus_hblk8
.index
;
10964 if (index
>= nucleus_hblk8
.len
) {
10966 * If we panic here, see startup_modules() to
10967 * make sure that we are calculating the
10968 * number of hblk8's that we need correctly.
10970 prom_panic("no nucleus hblk8 to allocate");
10973 (struct hme_blk
*)&nucleus_hblk8
.list
[index
];
10974 nucleus_hblk8
.index
++;
10975 SFMMU_STAT(sf_hblk8_nalloc
);
10977 index
= nucleus_hblk1
.index
;
10978 if (nucleus_hblk1
.index
>= nucleus_hblk1
.len
) {
10980 * If we panic here, see startup_modules().
10981 * Most likely you need to update the
10982 * calculation of the number of hblk1 elements
10983 * that the kernel needs to boot.
10985 prom_panic("no nucleus hblk1 to allocate");
10988 (struct hme_blk
*)&nucleus_hblk1
.list
[index
];
10989 nucleus_hblk1
.index
++;
10990 SFMMU_STAT(sf_hblk1_nalloc
);
10996 SFMMU_HASH_UNLOCK(hmebp
);
10998 if (sfmmup
!= KHATID
&& !SFMMU_IS_SHMERID_VALID(rid
)) {
10999 if (mmu_page_sizes
== max_mmu_page_sizes
) {
11000 if (size
< TTE256M
)
11001 shw_hblkp
= sfmmu_shadow_hcreate(sfmmup
, vaddr
,
11005 shw_hblkp
= sfmmu_shadow_hcreate(sfmmup
, vaddr
,
11008 } else if (SFMMU_IS_SHMERID_VALID(rid
)) {
11010 * Shared hmes use per region bitmaps in rgn_hmeflag
11011 * rather than shadow hmeblks to keep track of the
11012 * mapping sizes which have been allocated for the region.
11013 * Here we cleanup old invalid hmeblks with this rid,
11014 * which may be left around by pageunload().
11018 caddr_t eva
= vaddr
+ TTEBYTES(size
);
11020 ASSERT(sfmmup
!= KHATID
);
11022 srdp
= sfmmup
->sfmmu_srdp
;
11023 ASSERT(srdp
!= NULL
&& srdp
->srd_refcnt
!= 0);
11024 rgnp
= srdp
->srd_hmergnp
[rid
];
11025 ASSERT(rgnp
!= NULL
&& rgnp
->rgn_id
== rid
);
11026 ASSERT(rgnp
->rgn_refcnt
!= 0);
11027 ASSERT(size
<= rgnp
->rgn_pgszc
);
11029 ttesz
= HBLK_MIN_TTESZ
;
11031 if (!(rgnp
->rgn_hmeflags
& (0x1 << ttesz
))) {
11035 if (ttesz
> size
&& ttesz
!= HBLK_MIN_TTESZ
) {
11036 sfmmu_cleanup_rhblk(srdp
, vaddr
, rid
, ttesz
);
11037 } else if (ttesz
< size
) {
11038 for (va
= vaddr
; va
< eva
;
11039 va
+= TTEBYTES(ttesz
)) {
11040 sfmmu_cleanup_rhblk(srdp
, va
, rid
,
11044 } while (++ttesz
<= rgnp
->rgn_pgszc
);
11048 owner
= (hblk_reserve_thread
== curthread
) ? 1 : 0;
11050 if (owner
&& size
== TTE8K
) {
11052 ASSERT(!SFMMU_IS_SHMERID_VALID(rid
));
11054 * We are really in a tight spot. We already own
11055 * hblk_reserve and we need another hblk. In anticipation
11056 * of this kind of scenario, we specifically set aside
11057 * HBLK_RESERVE_MIN number of hblks to be used exclusively
11058 * by owner of hblk_reserve.
11060 SFMMU_STAT(sf_hblk_recurse_cnt
);
11062 if (!sfmmu_get_free_hblk(&hmeblkp
, 1))
11063 panic("sfmmu_hblk_alloc: reserve list is empty");
11070 if ((flags
& HAT_NO_KALLOC
) == 0) {
11072 sfmmu_cache
= ((size
== TTE8K
) ? sfmmu8_cache
: sfmmu1_cache
);
11073 sleep
= ((sfmmup
== KHATID
) ? KM_NOSLEEP
: KM_SLEEP
);
11075 if ((hmeblkp
= kmem_cache_alloc(sfmmu_cache
, sleep
)) == NULL
) {
11076 hmeblkp
= sfmmu_hblk_steal(size
);
11079 * if we are the owner of hblk_reserve,
11080 * swap hblk_reserve with hmeblkp and
11081 * start a fresh life. Hope things go
11082 * better this time.
11084 if (hblk_reserve_thread
== curthread
) {
11085 ASSERT(sfmmu_cache
== sfmmu8_cache
);
11086 sfmmu_hblk_swap(hmeblkp
);
11087 hblk_reserve_thread
= NULL
;
11088 mutex_exit(&hblk_reserve_lock
);
11092 * let's donate this hblk to our reserve list if
11093 * we are not mapping kernel range
11095 if (size
== TTE8K
&& sfmmup
!= KHATID
) {
11096 if (sfmmu_put_free_hblk(hmeblkp
, 0))
11102 * We are here to map the slab in sfmmu8_cache; let's
11103 * check if we could tap our reserve list; if successful,
11104 * this will avoid the pain of going thru sfmmu_hblk_swap
11106 SFMMU_STAT(sf_hblk_slab_cnt
);
11107 if (!sfmmu_get_free_hblk(&hmeblkp
, 0)) {
11109 * let's start hblk_reserve dance
11111 SFMMU_STAT(sf_hblk_reserve_cnt
);
11113 mutex_enter(&hblk_reserve_lock
);
11114 hmeblkp
= HBLK_RESERVE
;
11115 hblk_reserve_thread
= curthread
;
11120 ASSERT(hmeblkp
!= NULL
);
11121 set_hblk_sz(hmeblkp
, size
);
11122 ASSERT(hmeblkp
->hblk_nextpa
== va_to_pa((caddr_t
)hmeblkp
));
11123 SFMMU_HASH_LOCK(hmebp
);
11124 HME_HASH_FAST_SEARCH(hmebp
, hblktag
, newhblkp
);
11125 if (newhblkp
!= NULL
) {
11126 SFMMU_HASH_UNLOCK(hmebp
);
11127 if (hmeblkp
!= HBLK_RESERVE
) {
11129 * This is really tricky!
11131 * vmem_alloc(vmem_seg_arena)
11132 * vmem_alloc(vmem_internal_arena)
11133 * segkmem_alloc(heap_arena)
11134 * vmem_alloc(heap_arena)
11137 * kmem_cache_free()
11138 * kmem_cache_alloc()
11139 * kmem_slab_create()
11140 * vmem_alloc(kmem_internal_arena)
11141 * segkmem_alloc(heap_arena)
11142 * vmem_alloc(heap_arena)
11145 * kmem_cache_free()
11148 * Thus, hat_memload() could call kmem_cache_free
11149 * for enough number of times that we could easily
11150 * hit the bottom of the stack or run out of reserve
11151 * list of vmem_seg structs. So, we must donate
11152 * this hblk to reserve list if it's allocated
11153 * from sfmmu8_cache *and* mapping kernel range.
11154 * We don't need to worry about freeing hmeblk1's
11155 * to kmem since they don't map any kmem slabs.
11157 * Note: When segkmem supports largepages, we must
11158 * free hmeblk1's to reserve list as well.
11160 forcefree
= (sfmmup
== KHATID
) ? 1 : 0;
11161 if (size
== TTE8K
&&
11162 sfmmu_put_free_hblk(hmeblkp
, forcefree
)) {
11165 ASSERT(sfmmup
!= KHATID
);
11166 kmem_cache_free(get_hblk_cache(hmeblkp
), hmeblkp
);
11169 * Hey! we don't need hblk_reserve any more.
11172 hblk_reserve_thread
= NULL
;
11173 mutex_exit(&hblk_reserve_lock
);
11178 * let's check if the goodies are still present
11180 SFMMU_HASH_LOCK(hmebp
);
11181 HME_HASH_FAST_SEARCH(hmebp
, hblktag
, newhblkp
);
11182 if (newhblkp
!= NULL
) {
11184 * return newhblkp if it's not hblk_reserve;
11185 * if newhblkp is hblk_reserve, return it
11186 * _only if_ we are the owner of hblk_reserve.
11188 if (newhblkp
!= HBLK_RESERVE
|| owner
) {
11189 ASSERT(!SFMMU_IS_SHMERID_VALID(rid
) ||
11190 newhblkp
->hblk_shared
);
11191 ASSERT(SFMMU_IS_SHMERID_VALID(rid
) ||
11192 !newhblkp
->hblk_shared
);
11196 * we just hit hblk_reserve in the hash and
11197 * we are not the owner of that;
11199 * block until hblk_reserve_thread completes
11200 * swapping hblk_reserve and try the dance
11203 SFMMU_HASH_UNLOCK(hmebp
);
11204 mutex_enter(&hblk_reserve_lock
);
11205 mutex_exit(&hblk_reserve_lock
);
11206 SFMMU_STAT(sf_hblk_reserve_hit
);
11211 * it's no more! try the dance once again.
11213 SFMMU_HASH_UNLOCK(hmebp
);
11219 if (SFMMU_IS_SHMERID_VALID(rid
)) {
11220 uint16_t tteflag
= 0x1 <<
11221 ((size
< HBLK_MIN_TTESZ
) ? HBLK_MIN_TTESZ
: size
);
11223 if (!(rgnp
->rgn_hmeflags
& tteflag
)) {
11224 atomic_or_16(&rgnp
->rgn_hmeflags
, tteflag
);
11226 hmeblkp
->hblk_shared
= 1;
11228 hmeblkp
->hblk_shared
= 0;
11230 set_hblk_sz(hmeblkp
, size
);
11231 ASSERT(SFMMU_HASH_LOCK_ISHELD(hmebp
));
11232 hmeblkp
->hblk_next
= (struct hme_blk
*)NULL
;
11233 hmeblkp
->hblk_tag
= hblktag
;
11234 hmeblkp
->hblk_shadow
= shw_hblkp
;
11235 hblkpa
= hmeblkp
->hblk_nextpa
;
11236 hmeblkp
->hblk_nextpa
= HMEBLK_ENDPA
;
11238 ASSERT(get_hblk_ttesz(hmeblkp
) == size
);
11239 ASSERT(get_hblk_span(hmeblkp
) == HMEBLK_SPAN(size
));
11240 ASSERT(hmeblkp
->hblk_hmecnt
== 0);
11241 ASSERT(hmeblkp
->hblk_vcnt
== 0);
11242 ASSERT(hmeblkp
->hblk_lckcnt
== 0);
11243 ASSERT(hblkpa
== va_to_pa((caddr_t
)hmeblkp
));
11244 sfmmu_hblk_hash_add(hmebp
, hmeblkp
, hblkpa
);
11249 * This function cleans up the hme_blk and returns it to the free list.
11253 sfmmu_hblk_free(struct hme_blk
**listp
)
11255 struct hme_blk
*hmeblkp
, *next_hmeblkp
;
11260 ASSERT(*listp
!= NULL
);
11263 while (hmeblkp
!= NULL
) {
11264 next_hmeblkp
= hmeblkp
->hblk_next
;
11265 ASSERT(!hmeblkp
->hblk_hmecnt
);
11266 ASSERT(!hmeblkp
->hblk_vcnt
);
11267 ASSERT(!hmeblkp
->hblk_lckcnt
);
11268 ASSERT(hmeblkp
!= (struct hme_blk
*)hblk_reserve
);
11269 ASSERT(hmeblkp
->hblk_shared
== 0);
11270 ASSERT(hmeblkp
->hblk_shw_bit
== 0);
11271 ASSERT(hmeblkp
->hblk_shadow
== NULL
);
11273 hblkpa
= va_to_pa((caddr_t
)hmeblkp
);
11274 ASSERT(hblkpa
!= (uint64_t)-1);
11275 critical
= (hblktosfmmu(hmeblkp
) == KHATID
) ? 1 : 0;
11277 size
= get_hblk_ttesz(hmeblkp
);
11278 hmeblkp
->hblk_next
= NULL
;
11279 hmeblkp
->hblk_nextpa
= hblkpa
;
11281 if (hmeblkp
->hblk_nuc_bit
== 0) {
11283 if (size
!= TTE8K
||
11284 !sfmmu_put_free_hblk(hmeblkp
, critical
))
11285 kmem_cache_free(get_hblk_cache(hmeblkp
),
11288 hmeblkp
= next_hmeblkp
;
11292 #define BUCKETS_TO_SEARCH_BEFORE_UNLOAD 30
11293 #define SFMMU_HBLK_STEAL_THRESHOLD 5
11295 static uint_t sfmmu_hblk_steal_twice
;
11296 static uint_t sfmmu_hblk_steal_count
, sfmmu_hblk_steal_unload_count
;
11299 * Steal a hmeblk from user or kernel hme hash lists.
11300 * For 8K tte grab one from reserve pool (freehblkp) before proceeding to
11301 * steal and if we fail to steal after SFMMU_HBLK_STEAL_THRESHOLD attempts
11302 * tap into critical reserve of freehblkp.
11303 * Note: We remain looping in this routine until we find one.
11305 static struct hme_blk
*
11306 sfmmu_hblk_steal(int size
)
11308 static struct hmehash_bucket
*uhmehash_steal_hand
= NULL
;
11309 struct hmehash_bucket
*hmebp
;
11310 struct hme_blk
*hmeblkp
= NULL
, *pr_hblk
;
11313 uint_t loop_cnt
= 0, critical
;
11316 /* Check cpu hblk pending queues */
11317 if ((hmeblkp
= sfmmu_check_pending_hblks(size
)) != NULL
) {
11318 hmeblkp
->hblk_nextpa
= va_to_pa((caddr_t
)hmeblkp
);
11319 ASSERT(hmeblkp
->hblk_hmecnt
== 0);
11320 ASSERT(hmeblkp
->hblk_vcnt
== 0);
11324 if (size
== TTE8K
) {
11326 (++loop_cnt
> SFMMU_HBLK_STEAL_THRESHOLD
) ? 1 : 0;
11327 if (sfmmu_get_free_hblk(&hmeblkp
, critical
))
11331 hmebp
= (uhmehash_steal_hand
== NULL
) ? uhme_hash
:
11332 uhmehash_steal_hand
;
11333 ASSERT(hmebp
>= uhme_hash
&& hmebp
<= &uhme_hash
[UHMEHASH_SZ
]);
11335 for (i
= 0; hmeblkp
== NULL
&& i
<= UHMEHASH_SZ
+
11336 BUCKETS_TO_SEARCH_BEFORE_UNLOAD
; i
++) {
11337 SFMMU_HASH_LOCK(hmebp
);
11338 hmeblkp
= hmebp
->hmeblkp
;
11339 hblkpa
= hmebp
->hmeh_nextpa
;
11343 * check if it is a hmeblk that is not locked
11344 * and not shared. skip shadow hmeblks with
11345 * shadow_mask set i.e valid count non zero.
11347 if ((get_hblk_ttesz(hmeblkp
) == size
) &&
11348 (hmeblkp
->hblk_shw_bit
== 0 ||
11349 hmeblkp
->hblk_vcnt
== 0) &&
11350 (hmeblkp
->hblk_lckcnt
== 0)) {
11352 * there is a high probability that we
11353 * will find a free one. search some
11354 * buckets for a free hmeblk initially
11355 * before unloading a valid hmeblk.
11357 if ((hmeblkp
->hblk_vcnt
== 0 &&
11358 hmeblkp
->hblk_hmecnt
== 0) || (i
>=
11359 BUCKETS_TO_SEARCH_BEFORE_UNLOAD
)) {
11360 if (sfmmu_steal_this_hblk(hmebp
,
11361 hmeblkp
, hblkpa
, pr_hblk
)) {
11371 hblkpa
= hmeblkp
->hblk_nextpa
;
11372 hmeblkp
= hmeblkp
->hblk_next
;
11375 SFMMU_HASH_UNLOCK(hmebp
);
11376 if (hmebp
++ == &uhme_hash
[UHMEHASH_SZ
])
11379 uhmehash_steal_hand
= hmebp
;
11381 if (hmeblkp
!= NULL
)
11385 * in the worst case, look for a free one in the kernel
11388 for (i
= 0, hmebp
= khme_hash
; i
<= KHMEHASH_SZ
; i
++) {
11389 SFMMU_HASH_LOCK(hmebp
);
11390 hmeblkp
= hmebp
->hmeblkp
;
11391 hblkpa
= hmebp
->hmeh_nextpa
;
11395 * check if it is free hmeblk
11397 if ((get_hblk_ttesz(hmeblkp
) == size
) &&
11398 (hmeblkp
->hblk_lckcnt
== 0) &&
11399 (hmeblkp
->hblk_vcnt
== 0) &&
11400 (hmeblkp
->hblk_hmecnt
== 0)) {
11401 if (sfmmu_steal_this_hblk(hmebp
,
11402 hmeblkp
, hblkpa
, pr_hblk
)) {
11406 * Cannot fail since we have
11409 panic("fail to steal?");
11414 hblkpa
= hmeblkp
->hblk_nextpa
;
11415 hmeblkp
= hmeblkp
->hblk_next
;
11418 SFMMU_HASH_UNLOCK(hmebp
);
11419 if (hmebp
++ == &khme_hash
[KHMEHASH_SZ
])
11423 if (hmeblkp
!= NULL
)
11425 sfmmu_hblk_steal_twice
++;
11431 * This routine does real work to prepare a hblk to be "stolen" by
11432 * unloading the mappings, updating shadow counts ....
11433 * It returns 1 if the block is ready to be reused (stolen), or 0
11434 * means the block cannot be stolen yet- pageunload is still working
11438 sfmmu_steal_this_hblk(struct hmehash_bucket
*hmebp
, struct hme_blk
*hmeblkp
,
11439 uint64_t hblkpa
, struct hme_blk
*pr_hblk
)
11441 int shw_size
, vshift
;
11442 struct hme_blk
*shw_hblkp
;
11444 uint_t shw_mask
, newshw_mask
;
11445 struct hme_blk
*list
= NULL
;
11447 ASSERT(SFMMU_HASH_LOCK_ISHELD(hmebp
));
11450 * check if the hmeblk is free, unload if necessary
11452 if (hmeblkp
->hblk_vcnt
|| hmeblkp
->hblk_hmecnt
) {
11456 sfmmup
= hblktosfmmu(hmeblkp
);
11457 if (hmeblkp
->hblk_shared
|| sfmmup
->sfmmu_ismhat
) {
11460 DEMAP_RANGE_INIT(sfmmup
, &dmr
);
11461 (void) sfmmu_hblk_unload(sfmmup
, hmeblkp
,
11462 (caddr_t
)get_hblk_base(hmeblkp
),
11463 get_hblk_endaddr(hmeblkp
), &dmr
, HAT_UNLOAD
);
11464 DEMAP_RANGE_FLUSH(&dmr
);
11465 if (hmeblkp
->hblk_vcnt
|| hmeblkp
->hblk_hmecnt
) {
11467 * Pageunload is working on the same hblk.
11472 sfmmu_hblk_steal_unload_count
++;
11475 ASSERT(hmeblkp
->hblk_lckcnt
== 0);
11476 ASSERT(hmeblkp
->hblk_vcnt
== 0 && hmeblkp
->hblk_hmecnt
== 0);
11478 sfmmu_hblk_hash_rm(hmebp
, hmeblkp
, pr_hblk
, &list
, 1);
11479 hmeblkp
->hblk_nextpa
= hblkpa
;
11481 shw_hblkp
= hmeblkp
->hblk_shadow
;
11483 ASSERT(!hmeblkp
->hblk_shared
);
11484 shw_size
= get_hblk_ttesz(shw_hblkp
);
11485 vaddr
= (caddr_t
)get_hblk_base(hmeblkp
);
11486 vshift
= vaddr_to_vshift(shw_hblkp
->hblk_tag
, vaddr
, shw_size
);
11487 ASSERT(vshift
< 8);
11489 * Atomically clear shadow mask bit
11492 shw_mask
= shw_hblkp
->hblk_shw_mask
;
11493 ASSERT(shw_mask
& (1 << vshift
));
11494 newshw_mask
= shw_mask
& ~(1 << vshift
);
11495 newshw_mask
= atomic_cas_32(&shw_hblkp
->hblk_shw_mask
,
11496 shw_mask
, newshw_mask
);
11497 } while (newshw_mask
!= shw_mask
);
11498 hmeblkp
->hblk_shadow
= NULL
;
11502 * remove shadow bit if we are stealing an unused shadow hmeblk.
11503 * sfmmu_hblk_alloc needs it that way, will set shadow bit later if
11504 * we are indeed allocating a shadow hmeblk.
11506 hmeblkp
->hblk_shw_bit
= 0;
11508 if (hmeblkp
->hblk_shared
) {
11513 srdp
= hblktosrd(hmeblkp
);
11514 ASSERT(srdp
!= NULL
&& srdp
->srd_refcnt
!= 0);
11515 rid
= hmeblkp
->hblk_tag
.htag_rid
;
11516 ASSERT(SFMMU_IS_SHMERID_VALID(rid
));
11517 ASSERT(rid
< SFMMU_MAX_HME_REGIONS
);
11518 rgnp
= srdp
->srd_hmergnp
[rid
];
11519 ASSERT(rgnp
!= NULL
);
11520 SFMMU_VALIDATE_SHAREDHBLK(hmeblkp
, srdp
, rgnp
, rid
);
11521 hmeblkp
->hblk_shared
= 0;
11524 sfmmu_hblk_steal_count
++;
11525 SFMMU_STAT(sf_steal_count
);
struct hme_blk *
sfmmu_hmetohblk(struct sf_hment *sfhme)
{
	struct hme_blk *hmeblkp;
	struct sf_hment *sfhme0;
	struct hme_blk *hblk_dummy = 0;

	/*
	 * No dummy sf_hments, please.
	 */
	ASSERT(sfhme->hme_tte.ll != 0);

	sfhme0 = sfhme - sfhme->hme_tte.tte_hmenum;
	hmeblkp = (struct hme_blk *)((uintptr_t)sfhme0 -
	    (uintptr_t)&hblk_dummy->hblk_hme[0]);

	return (hmeblkp);
}
/*
 * On swapin, get appropriately sized TSB(s) and clear the HAT_SWAPPED flag.
 * If we can't get appropriately sized TSB(s), try for 8K TSB(s) using
 * KM_SLEEP allocation.
 *
 * Return 0 on success, -1 otherwise.
 */
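/*
 * In brief: wait for any concurrent HAT_SWAPIN, set HAT_SWAPIN, then replace
 * each TSB_SWAPPED tsbinfo with a real TSB of its recorded size code. If any
 * replacement fails, the remaining swapped tsbinfos are freed and a single
 * TSB_MIN_SZCODE TSB is forced in with TSB_FORCEALLOC so the process always
 * resumes with at least one TSB.
 */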
11557 sfmmu_tsb_swapin(sfmmu_t
*sfmmup
, hatlock_t
*hatlockp
)
11559 struct tsb_info
*tsbinfop
, *next
;
11560 tsb_replace_rc_t rc
;
11561 boolean_t gotfirst
= B_FALSE
;
11563 ASSERT(sfmmup
!= ksfmmup
);
11564 ASSERT(sfmmu_hat_lock_held(sfmmup
));
11566 while (SFMMU_FLAGS_ISSET(sfmmup
, HAT_SWAPIN
)) {
11567 cv_wait(&sfmmup
->sfmmu_tsb_cv
, HATLOCK_MUTEXP(hatlockp
));
11570 if (SFMMU_FLAGS_ISSET(sfmmup
, HAT_SWAPPED
)) {
11571 SFMMU_FLAGS_SET(sfmmup
, HAT_SWAPIN
);
11576 ASSERT(sfmmup
->sfmmu_tsb
!= NULL
);
11579 * Loop over all tsbinfo's replacing them with ones that actually have
11580 * a TSB. If any of the replacements ever fail, bail out of the loop.
11582 for (tsbinfop
= sfmmup
->sfmmu_tsb
; tsbinfop
!= NULL
; tsbinfop
= next
) {
11583 ASSERT(tsbinfop
->tsb_flags
& TSB_SWAPPED
);
11584 next
= tsbinfop
->tsb_next
;
11585 rc
= sfmmu_replace_tsb(sfmmup
, tsbinfop
, tsbinfop
->tsb_szc
,
11586 hatlockp
, TSB_SWAPIN
);
11587 if (rc
!= TSB_SUCCESS
) {
11595 SFMMU_FLAGS_CLEAR(sfmmup
, HAT_SWAPPED
|HAT_SWAPIN
);
11596 cv_broadcast(&sfmmup
->sfmmu_tsb_cv
);
11600 case TSB_ALLOCFAIL
:
11603 panic("sfmmu_replace_tsb returned unrecognized failure code "
11608 * In this case, we failed to get one of our TSBs. If we failed to
11609 * get the first TSB, get one of minimum size (8KB). Walk the list
11610 * and throw away the tsbinfos, starting where the allocation failed;
11611 * we can get by with just one TSB as long as we don't leave the
11612 * SWAPPED tsbinfo structures lying around.
11614 tsbinfop
= sfmmup
->sfmmu_tsb
;
11615 next
= tsbinfop
->tsb_next
;
11616 tsbinfop
->tsb_next
= NULL
;
11618 sfmmu_hat_exit(hatlockp
);
11619 for (tsbinfop
= next
; tsbinfop
!= NULL
; tsbinfop
= next
) {
11620 next
= tsbinfop
->tsb_next
;
11621 sfmmu_tsbinfo_free(tsbinfop
);
11623 hatlockp
= sfmmu_hat_enter(sfmmup
);
11626 * If we don't have any TSBs, get a single 8K TSB for 8K, 64K and 512K
11630 tsbinfop
= sfmmup
->sfmmu_tsb
;
11631 rc
= sfmmu_replace_tsb(sfmmup
, tsbinfop
, TSB_MIN_SZCODE
,
11632 hatlockp
, TSB_SWAPIN
| TSB_FORCEALLOC
);
11633 ASSERT(rc
== TSB_SUCCESS
);
11636 SFMMU_FLAGS_CLEAR(sfmmup
, HAT_SWAPPED
|HAT_SWAPIN
);
11637 cv_broadcast(&sfmmup
->sfmmu_tsb_cv
);
11641 sfmmu_is_rgnva(sf_srd_t
*srdp
, caddr_t addr
, ulong_t w
, ulong_t bmw
)
11647 ASSERT(srdp
!= NULL
);
11648 ASSERT(srdp
->srd_refcnt
!= 0);
11652 if (!(bmw
& 0x1)) {
11658 rgnp
= srdp
->srd_hmergnp
[rid
];
11659 ASSERT(rgnp
->rgn_refcnt
> 0);
11660 ASSERT(rgnp
->rgn_id
== rid
);
11661 if (addr
< rgnp
->rgn_saddr
||
11662 addr
>= (rgnp
->rgn_saddr
+ rgnp
->rgn_size
)) {
/*
 * Handle exceptions for low level tsb_handler.
 *
 * There are many scenarios that could land us here:
 *
 * If the context is invalid we land here. The context can be invalid
 * for 3 reasons: 1) we couldn't allocate a new context and now need to
 * perform a wrap around operation in order to allocate a new context.
 * 2) Context was invalidated to change pagesize programming. 3) ISM or
 * TSB configuration is changing for this process and we are forced into
 * here to do a synchronization operation. If the context is valid we can
 * be here from the window trap handler. In this case just call trap to
 * handle the fault.
 *
 * Note that the process will run in INVALID_CONTEXT before
 * faulting into here and subsequently loading the MMU registers
 * (including the TSB base register) associated with this process.
 * For this reason, the trap handlers must all test for
 * INVALID_CONTEXT before attempting to access any registers other
 * than the context registers.
 */
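/*
 * In outline, the handler below covers three cases:
 * 1) valid context, not a protection trap: the per-cpu tsbmiss area may
 *    simply be stale with respect to sfmmu_tteflags/sfmmu_rtteflags or the
 *    hme region bitmap; resync it and retry the instruction, otherwise
 *    fall through to trap().
 * 2) INVALID_CONTEXT: serialize behind TSB relocation, ISM updates, SCD
 *    join and swapin using the hat lock and sfmmu_tsb_cv, then allocate a
 *    context via sfmmu_get_ctx() before retrying.
 * 3) anything else is handed straight to trap() for normal processing.
 */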
11694 sfmmu_tsbmiss_exception(struct regs
*rp
, uintptr_t tagaccess
, uint_t traptype
)
11696 sfmmu_t
*sfmmup
, *shsfmmup
;
11699 char lwp_save_state
;
11700 hatlock_t
*hatlockp
, *shatlockp
;
11701 struct tsb_info
*tsbinfop
;
11702 struct tsbmiss
*tsbmp
;
11705 SFMMU_STAT(sf_tsb_exceptions
);
11706 SFMMU_MMU_STAT(mmu_tsb_exceptions
);
11707 sfmmup
= astosfmmu(curthread
->t_procp
->p_as
);
	/*
	 * note that in sun4u, the tagaccess register contains ctxnum
	 * while sun4v passes ctxtype in the tagaccess register.
	 */
11712 ctxtype
= tagaccess
& TAGACC_CTX_MASK
;
11714 ASSERT(sfmmup
!= ksfmmup
&& ctxtype
!= KCONTEXT
);
11715 ASSERT(sfmmup
->sfmmu_ismhat
== 0);
11716 ASSERT(!SFMMU_FLAGS_ISSET(sfmmup
, HAT_SWAPPED
) ||
11717 ctxtype
== INVALID_CONTEXT
);
11719 if (ctxtype
!= INVALID_CONTEXT
&& traptype
!= T_DATA_PROT
) {
11721 * We may land here because shme bitmap and pagesize
11722 * flags are updated lazily in tsbmiss area on other cpus.
11723 * If we detect here that tsbmiss area is out of sync with
11724 * sfmmu update it and retry the trapped instruction.
11725 * Otherwise call trap().
11728 uchar_t tteflag_mask
= (1 << TTE64K
) | (1 << TTE8K
);
11729 caddr_t addr
= (caddr_t
)(tagaccess
& TAGACC_VADDR_MASK
);
11732 * Must set lwp state to LWP_SYS before
11733 * trying to acquire any adaptive lock
11735 lwp
= ttolwp(curthread
);
11737 lwp_save_state
= lwp
->lwp_state
;
11738 lwp
->lwp_state
= LWP_SYS
;
11740 hatlockp
= sfmmu_hat_enter(sfmmup
);
11741 kpreempt_disable();
11742 tsbmp
= &tsbmiss_area
[CPU
->cpu_id
];
11743 ASSERT(sfmmup
== tsbmp
->usfmmup
);
11744 if (((tsbmp
->uhat_tteflags
^ sfmmup
->sfmmu_tteflags
) &
11746 ((tsbmp
->uhat_rtteflags
^ sfmmup
->sfmmu_rtteflags
) &
11748 tsbmp
->uhat_tteflags
= sfmmup
->sfmmu_tteflags
;
11749 tsbmp
->uhat_rtteflags
= sfmmup
->sfmmu_rtteflags
;
11752 if (sfmmup
->sfmmu_srdp
!= NULL
) {
11753 ulong_t
*sm
= sfmmup
->sfmmu_hmeregion_map
.bitmap
;
11754 ulong_t
*tm
= tsbmp
->shmermap
;
11756 for (i
= 0; i
< SFMMU_HMERGNMAP_WORDS
; i
++) {
11757 ulong_t d
= tm
[i
] ^ sm
[i
];
11760 if (!ret
&& sfmmu_is_rgnva(
11761 sfmmup
->sfmmu_srdp
,
11762 addr
, i
, d
& sm
[i
])) {
11771 sfmmu_hat_exit(hatlockp
);
11772 lwp
->lwp_state
= lwp_save_state
;
11776 } else if (ctxtype
== INVALID_CONTEXT
) {
11778 * First, make sure we come out of here with a valid ctx,
11779 * since if we don't get one we'll simply loop on the
11780 * faulting instruction.
11782 * If the ISM mappings are changing, the TSB is relocated,
11783 * the process is swapped, the process is joining SCD or
11784 * leaving SCD or shared regions we serialize behind the
11785 * controlling thread with hat lock, sfmmu_flags and
11786 * sfmmu_tsb_cv condition variable.
11790 * Must set lwp state to LWP_SYS before
11791 * trying to acquire any adaptive lock
11793 lwp
= ttolwp(curthread
);
11795 lwp_save_state
= lwp
->lwp_state
;
11796 lwp
->lwp_state
= LWP_SYS
;
11798 hatlockp
= sfmmu_hat_enter(sfmmup
);
11800 if ((scdp
= sfmmup
->sfmmu_scdp
) != NULL
) {
11801 shsfmmup
= scdp
->scd_sfmmup
;
11802 ASSERT(shsfmmup
!= NULL
);
11804 for (tsbinfop
= shsfmmup
->sfmmu_tsb
; tsbinfop
!= NULL
;
11805 tsbinfop
= tsbinfop
->tsb_next
) {
11806 if (tsbinfop
->tsb_flags
& TSB_RELOC_FLAG
) {
11807 /* drop the private hat lock */
11808 sfmmu_hat_exit(hatlockp
);
11809 /* acquire the shared hat lock */
11810 shatlockp
= sfmmu_hat_enter(shsfmmup
);
11812 * recheck to see if anything changed
11813 * after we drop the private hat lock.
11815 if (sfmmup
->sfmmu_scdp
== scdp
&&
11816 shsfmmup
== scdp
->scd_sfmmup
) {
11817 sfmmu_tsb_chk_reloc(shsfmmup
,
11820 sfmmu_hat_exit(shatlockp
);
11821 hatlockp
= sfmmu_hat_enter(sfmmup
);
11827 for (tsbinfop
= sfmmup
->sfmmu_tsb
; tsbinfop
!= NULL
;
11828 tsbinfop
= tsbinfop
->tsb_next
) {
11829 if (tsbinfop
->tsb_flags
& TSB_RELOC_FLAG
) {
11830 cv_wait(&sfmmup
->sfmmu_tsb_cv
,
11831 HATLOCK_MUTEXP(hatlockp
));
11837 * Wait for ISM maps to be updated.
11839 if (SFMMU_FLAGS_ISSET(sfmmup
, HAT_ISMBUSY
)) {
11840 cv_wait(&sfmmup
->sfmmu_tsb_cv
,
11841 HATLOCK_MUTEXP(hatlockp
));
11845 /* Is this process joining an SCD? */
11846 if (SFMMU_FLAGS_ISSET(sfmmup
, HAT_JOIN_SCD
)) {
11848 * Flush private TSB and setup shared TSB.
11849 * sfmmu_finish_join_scd() does not drop the
11852 sfmmu_finish_join_scd(sfmmup
);
11853 SFMMU_FLAGS_CLEAR(sfmmup
, HAT_JOIN_SCD
);
11857 * If we're swapping in, get TSB(s). Note that we must do
11858 * this before we get a ctx or load the MMU state. Once
11859 * we swap in we have to recheck to make sure the TSB(s) and
11860 * ISM mappings didn't change while we slept.
11862 if (SFMMU_FLAGS_ISSET(sfmmup
, HAT_SWAPPED
)) {
11863 sfmmu_tsb_swapin(sfmmup
, hatlockp
);
11867 sfmmu_get_ctx(sfmmup
);
11869 sfmmu_hat_exit(hatlockp
);
11871 * Must restore lwp_state if not calling
11872 * trap() for further processing. Restore
11875 lwp
->lwp_state
= lwp_save_state
;
11878 trap(rp
, (caddr_t
)tagaccess
, traptype
, 0);
static void
sfmmu_tsb_chk_reloc(sfmmu_t *sfmmup, hatlock_t *hatlockp)
{
	struct tsb_info *tp;

	ASSERT(sfmmu_hat_lock_held(sfmmup));

	for (tp = sfmmup->sfmmu_tsb; tp != NULL; tp = tp->tsb_next) {
		if (tp->tsb_flags & TSB_RELOC_FLAG) {
			cv_wait(&sfmmup->sfmmu_tsb_cv,
			    HATLOCK_MUTEXP(hatlockp));
		}
	}
}
/*
 * sfmmu_vatopfn_suspended is called from GET_TTE when TL=0 and the
 * TTE_SUSPENDED bit is set in the tte. We block on acquiring a page lock
 * rather than spinning, to avoid send mondo timeouts with
 * interrupts enabled. When the lock is acquired it is immediately
 * released and we return back to sfmmu_vatopfn just after
 * the GET_TTE call.
 */
void
sfmmu_vatopfn_suspended(caddr_t vaddr, sfmmu_t *sfmmu, tte_t *ttep)
{
	page_t *pp;

	(void) as_pagelock(sfmmu->sfmmu_as, &pp, vaddr, TTE_CSZ(ttep), S_WRITE);
	as_pageunlock(sfmmu->sfmmu_as, pp, vaddr, TTE_CSZ(ttep), S_WRITE);
}
/*
 * sfmmu_tsbmiss_suspended is called from GET_TTE when TL>0 and the
 * TTE_SUSPENDED bit is set in the tte. We do this so that we can handle
 * cross traps which cannot be handled while spinning in the
 * trap handlers. Simply enter and exit the kpr_suspendlock spin
 * mutex, which is held by the holder of the suspend bit, and then
 * retry the trapped instruction after unwinding.
 */
void
sfmmu_tsbmiss_suspended(struct regs *rp, uintptr_t tagacc, uint_t traptype)
{
	ASSERT(curthread != kreloc_thread);
	mutex_enter(&kpr_suspendlock);
	mutex_exit(&kpr_suspendlock);
}
/*
 * This routine could be optimized to reduce the number of xcalls by flushing
 * the entire TLB if the region reference count is above some threshold, but
 * the tradeoff will depend on the size of the TLB. So for now flush the
 * specific page a context at a time.
 *
 * If uselocks is 0 then it's called after all cpus were captured and all the
 * hat locks were taken. In this case don't take the region lock, relying on
 * the order of list region update operations in hat_join_region(),
 * hat_leave_region() and hat_dup_region(). The ordering in those routines
 * guarantees that the list is always forward walkable and reaches active
 * sfmmus regardless of where xc_attention() captures a cpu.
 */
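/*
 * In outline, the routine below walks rgnp->rgn_sfmmu_head; for each hat it
 * unloads the TSB entry for 'addr' (shared TSBs for SCD hats, private TSBs
 * otherwise), cross-calls vtag_flushpage_tl1 on the CPUs that hat has run
 * on, flushes the local TLB, and accumulates the targeted CPUs in rcpuset.
 */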
11945 sfmmu_rgntlb_demap(caddr_t addr
, sf_region_t
*rgnp
,
11946 struct hme_blk
*hmeblkp
, int uselocks
)
11951 hatlock_t
*hatlockp
;
11952 uint_t rid
= rgnp
->rgn_id
;
11953 sf_rgn_link_t
*rlink
;
11956 ASSERT(hmeblkp
->hblk_shared
);
11957 ASSERT(SFMMU_IS_SHMERID_VALID(rid
));
11958 ASSERT(rid
< SFMMU_MAX_HME_REGIONS
);
11960 CPUSET_ZERO(rcpuset
);
11962 mutex_enter(&rgnp
->rgn_mutex
);
11964 sfmmup
= rgnp
->rgn_sfmmu_head
;
11965 while (sfmmup
!= NULL
) {
11967 hatlockp
= sfmmu_hat_enter(sfmmup
);
11971 * When an SCD is created the SCD hat is linked on the sfmmu
11972 * region lists for each hme region which is part of the
11973 * SCD. If we find an SCD hat, when walking these lists,
11974 * then we flush the shared TSBs, if we find a private hat,
11975 * which is part of an SCD, but where the region
11976 * is not part of the SCD then we flush the private TSBs.
11978 if (!sfmmup
->sfmmu_scdhat
&& sfmmup
->sfmmu_scdp
!= NULL
&&
11979 !SFMMU_FLAGS_ISSET(sfmmup
, HAT_JOIN_SCD
)) {
11980 scdp
= sfmmup
->sfmmu_scdp
;
11981 if (SF_RGNMAP_TEST(scdp
->scd_hmeregion_map
, rid
)) {
11983 sfmmu_hat_exit(hatlockp
);
11989 SFMMU_UNLOAD_TSB(addr
, sfmmup
, hmeblkp
, 0);
11991 kpreempt_disable();
11992 cpuset
= sfmmup
->sfmmu_cpusran
;
11993 CPUSET_AND(cpuset
, cpu_ready_set
);
11994 CPUSET_DEL(cpuset
, CPU
->cpu_id
);
11995 SFMMU_XCALL_STATS(sfmmup
);
11996 xt_some(cpuset
, vtag_flushpage_tl1
,
11997 (uint64_t)addr
, (uint64_t)sfmmup
);
11998 vtag_flushpage(addr
, (uint64_t)sfmmup
);
12000 sfmmu_hat_exit(hatlockp
);
12003 CPUSET_OR(rcpuset
, cpuset
);
12006 /* LINTED: constant in conditional context */
12007 SFMMU_HMERID2RLINKP(sfmmup
, rid
, rlink
, 0, 0);
12008 ASSERT(rlink
!= NULL
);
12009 sfmmup
= rlink
->next
;
12012 mutex_exit(&rgnp
->rgn_mutex
);
/*
 * This routine takes an sfmmu pointer and the va for an address in an
 * ISM region as input and returns the corresponding region id in ism_rid.
 * The return value of 1 indicates that a region has been found and ism_rid
 * is valid, otherwise 0 is returned.
 */
12024 find_ism_rid(sfmmu_t
*sfmmup
, sfmmu_t
*ism_sfmmup
, caddr_t va
, uint_t
*ism_rid
)
12026 ism_blk_t
*ism_blkp
;
12028 ism_map_t
*ism_map
;
12030 struct hat
*ism_hatid
;
12032 ASSERT(sfmmu_hat_lock_held(sfmmup
));
12034 ism_blkp
= sfmmup
->sfmmu_iblk
;
12035 while (ism_blkp
!= NULL
) {
12036 ism_map
= ism_blkp
->iblk_maps
;
12037 for (i
= 0; i
< ISM_MAP_SLOTS
&& ism_map
[i
].imap_ismhat
; i
++) {
12038 if ((va
>= ism_start(ism_map
[i
])) &&
12039 (va
< ism_end(ism_map
[i
]))) {
12041 *ism_rid
= ism_map
[i
].imap_rid
;
12043 ism_hatid
= ism_map
[i
].imap_ismhat
;
12044 ASSERT(ism_hatid
== ism_sfmmup
);
12045 ASSERT(ism_hatid
->sfmmu_ismhat
);
12050 ism_blkp
= ism_blkp
->iblk_next
;
12056 * Special routine to flush out ism mappings- TSBs, TLBs and D-caches.
12057 * This routine may be called with all cpu's captured. Therefore, the
12058 * caller is responsible for holding all locks and disabling kernel
12063 sfmmu_ismtlbcache_demap(caddr_t addr
, sfmmu_t
*ism_sfmmup
,
12064 struct hme_blk
*hmeblkp
, pfn_t pfnum
, int cache_flush_flag
)
12077 ASSERT(!hmeblkp
->hblk_shared
);
	/*
	 * Walk the ism_hat's mapping list and flush the page
	 * from every hat sharing this ism_hat. This routine
	 * may be called while all cpu's have been captured.
	 * Therefore we can't attempt to grab any locks. For now
	 * this means we will protect the ism mapping list under
	 * a single lock which will be grabbed by the caller.
	 * If hat_share/unshare scalability becomes a performance
	 * problem then we may need to re-think ism mapping list locking.
	 */
12088 ASSERT(ism_sfmmup
->sfmmu_ismhat
);
12089 ASSERT(MUTEX_HELD(&ism_mlist_lock
));
12090 addr
= addr
- ISMID_STARTADDR
;
12092 for (ment
= ism_sfmmup
->sfmmu_iment
; ment
; ment
= ment
->iment_next
) {
12094 sfmmup
= ment
->iment_hat
;
12096 va
= ment
->iment_base_va
;
12097 va
= (caddr_t
)((uintptr_t)va
+ (uintptr_t)addr
);
12100 * When an SCD is created the SCD hat is linked on the ism
12101 * mapping lists for each ISM segment which is part of the
12102 * SCD. If we find an SCD hat, when walking these lists,
12103 * then we flush the shared TSBs, if we find a private hat,
12104 * which is part of an SCD, but where the region
12105 * corresponding to this va is not part of the SCD then we
12106 * flush the private TSBs.
12108 if (!sfmmup
->sfmmu_scdhat
&& sfmmup
->sfmmu_scdp
!= NULL
&&
12109 !SFMMU_FLAGS_ISSET(sfmmup
, HAT_JOIN_SCD
) &&
12110 !SFMMU_FLAGS_ISSET(sfmmup
, HAT_ISMBUSY
)) {
12111 if (!find_ism_rid(sfmmup
, ism_sfmmup
, va
,
12114 "can't find matching ISM rid!");
12117 scdp
= sfmmup
->sfmmu_scdp
;
12118 if (SFMMU_IS_ISMRID_VALID(ism_rid
) &&
12119 SF_RGNMAP_TEST(scdp
->scd_ismregion_map
,
12124 SFMMU_UNLOAD_TSB(va
, sfmmup
, hmeblkp
, 1);
12126 cpuset
= sfmmup
->sfmmu_cpusran
;
12127 CPUSET_AND(cpuset
, cpu_ready_set
);
12128 CPUSET_DEL(cpuset
, CPU
->cpu_id
);
12129 SFMMU_XCALL_STATS(sfmmup
);
12130 xt_some(cpuset
, vtag_flushpage_tl1
, (uint64_t)va
,
12132 vtag_flushpage(va
, (uint64_t)sfmmup
);
12137 * When flushing D$ we must flush all
12138 * cpu's. See sfmmu_cache_flush().
12140 if (cache_flush_flag
== CACHE_FLUSH
) {
12141 cpuset
= cpu_ready_set
;
12142 CPUSET_DEL(cpuset
, CPU
->cpu_id
);
12144 SFMMU_XCALL_STATS(sfmmup
);
12145 vcolor
= addr_to_vcolor(va
);
12146 xt_some(cpuset
, vac_flushpage_tl1
, pfnum
, vcolor
);
12147 vac_flushpage(pfnum
, vcolor
);
12154 * Demaps the TSB, CPU caches, and flushes all TLBs on all CPUs of
12155 * a particular virtual address and ctx. If noflush is set we do not
12156 * flush the TLB/TSB. This function may or may not be called with the
12160 sfmmu_tlbcache_demap(caddr_t addr
, sfmmu_t
*sfmmup
, struct hme_blk
*hmeblkp
,
12161 pfn_t pfnum
, int tlb_noflush
, int cpu_flag
, int cache_flush_flag
,
12168 hatlock_t
*hatlockp
;
12170 ASSERT(!hmeblkp
->hblk_shared
);
12172 #if defined(lint) && !defined(VAC)
12174 cpu_flag
= cpu_flag
;
12175 cache_flush_flag
= cache_flush_flag
;
12179 * There is no longer a need to protect against ctx being
12180 * stolen here since we don't store the ctx in the TSB anymore.
12183 vcolor
= addr_to_vcolor(addr
);
12187 * We must hold the hat lock during the flush of TLB,
12188 * to avoid a race with sfmmu_invalidate_ctx(), where
12189 * sfmmu_cnum on a MMU could be set to INVALID_CONTEXT,
12190 * causing TLB demap routine to skip flush on that MMU.
12191 * If the context on a MMU has already been set to
12192 * INVALID_CONTEXT, we just get an extra flush on
12195 if (!hat_lock_held
&& !tlb_noflush
)
12196 hatlockp
= sfmmu_hat_enter(sfmmup
);
12198 kpreempt_disable();
12199 if (!tlb_noflush
) {
12201 * Flush the TSB and TLB.
12203 SFMMU_UNLOAD_TSB(addr
, sfmmup
, hmeblkp
, 0);
12205 cpuset
= sfmmup
->sfmmu_cpusran
;
12206 CPUSET_AND(cpuset
, cpu_ready_set
);
12207 CPUSET_DEL(cpuset
, CPU
->cpu_id
);
12209 SFMMU_XCALL_STATS(sfmmup
);
12211 xt_some(cpuset
, vtag_flushpage_tl1
, (uint64_t)addr
,
12214 vtag_flushpage(addr
, (uint64_t)sfmmup
);
12217 if (!hat_lock_held
&& !tlb_noflush
)
12218 sfmmu_hat_exit(hatlockp
);
12224 * Even if the ctx is stolen, we need to flush the
12225 * cache. Our ctx stealer only flushes the TLBs.
12227 if (cache_flush_flag
== CACHE_FLUSH
) {
12228 if (cpu_flag
& FLUSH_ALL_CPUS
) {
12229 cpuset
= cpu_ready_set
;
12231 cpuset
= sfmmup
->sfmmu_cpusran
;
12232 CPUSET_AND(cpuset
, cpu_ready_set
);
12234 CPUSET_DEL(cpuset
, CPU
->cpu_id
);
12235 SFMMU_XCALL_STATS(sfmmup
);
12236 xt_some(cpuset
, vac_flushpage_tl1
, pfnum
, vcolor
);
12237 vac_flushpage(pfnum
, vcolor
);
12244 * Demaps the TSB and flushes all TLBs on all cpus for a particular virtual
12245 * address and ctx. If noflush is set we do not currently do anything.
12246 * This function may or may not be called with the HAT lock held.
12249 sfmmu_tlb_demap(caddr_t addr
, sfmmu_t
*sfmmup
, struct hme_blk
*hmeblkp
,
12250 int tlb_noflush
, int hat_lock_held
)
12253 hatlock_t
*hatlockp
;
12255 ASSERT(!hmeblkp
->hblk_shared
);
12258 * If the process is exiting we have nothing to do.
12266 if (!hat_lock_held
)
12267 hatlockp
= sfmmu_hat_enter(sfmmup
);
12268 SFMMU_UNLOAD_TSB(addr
, sfmmup
, hmeblkp
, 0);
12270 kpreempt_disable();
12272 cpuset
= sfmmup
->sfmmu_cpusran
;
12273 CPUSET_AND(cpuset
, cpu_ready_set
);
12274 CPUSET_DEL(cpuset
, CPU
->cpu_id
);
12276 SFMMU_XCALL_STATS(sfmmup
);
12277 xt_some(cpuset
, vtag_flushpage_tl1
, (uint64_t)addr
, (uint64_t)sfmmup
);
12279 vtag_flushpage(addr
, (uint64_t)sfmmup
);
12281 if (!hat_lock_held
)
12282 sfmmu_hat_exit(hatlockp
);
/*
 * Special case of sfmmu_tlb_demap for MMU_PAGESIZE hblks. Use the xcall
 * call handler that can flush a range of pages to save on xcalls.
 */
static int sfmmu_xcall_save;

/*
 * this routine is never used for demapping addresses backed by SRD hmeblks.
 */
12298 sfmmu_tlb_range_demap(demap_range_t
*dmrp
)
12300 sfmmu_t
*sfmmup
= dmrp
->dmr_sfmmup
;
12301 hatlock_t
*hatlockp
;
12303 uint64_t sfmmu_pgcnt
;
12307 caddr_t addr
= dmrp
->dmr_addr
;
12309 uint64_t bitvec
= dmrp
->dmr_bitvec
;
12311 ASSERT(bitvec
& 1);
12314 * Flush TSB and calculate number of pages to flush.
12316 while (bitvec
!= 0) {
12319 * Find the first page to flush and then count how many
12320 * pages there are after it that also need to be flushed.
12321 * This way the number of TSB flushes is minimized.
12323 while ((bitvec
& 1) == 0) {
12325 addr
+= MMU_PAGESIZE
;
12328 while (bitvec
& 1) {
12332 eaddr
= addr
+ ptob(dirtypg
);
12333 hatlockp
= sfmmu_hat_enter(sfmmup
);
12334 sfmmu_unload_tsb_range(sfmmup
, addr
, eaddr
, TTE8K
);
12335 sfmmu_hat_exit(hatlockp
);
12336 pgunload
+= dirtypg
;
12341 ASSERT((pgcnt
<<MMU_PAGESHIFT
) <= dmrp
->dmr_endaddr
- dmrp
->dmr_addr
);
12342 if (sfmmup
->sfmmu_free
== 0) {
12343 addr
= dmrp
->dmr_addr
;
12344 bitvec
= dmrp
->dmr_bitvec
;
12347 * make sure it has SFMMU_PGCNT_SHIFT bits only,
12348 * as it will be used to pack argument for xt_some
12350 ASSERT((pgcnt
> 0) &&
12351 (pgcnt
<= (1 << SFMMU_PGCNT_SHIFT
)));
12354 * Encode pgcnt as (pgcnt -1 ), and pass (pgcnt - 1) in
12355 * the low 6 bits of sfmmup. This is doable since pgcnt
12358 ASSERT(!((uint64_t)sfmmup
& SFMMU_PGCNT_MASK
));
12359 sfmmu_pgcnt
= (uint64_t)sfmmup
|
12360 ((pgcnt
- 1) & SFMMU_PGCNT_MASK
);
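		/*
		 * For example, with the 6-bit field described above a run of
		 * 64 pages is passed as the value 63 in the low bits of
		 * sfmmu_pgcnt; the receiving handler can recover the page
		 * count as ((sfmmu_pgcnt & SFMMU_PGCNT_MASK) + 1) and the
		 * hat pointer by clearing those same low bits.
		 */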
12363 * We must hold the hat lock during the flush of TLB,
12364 * to avoid a race with sfmmu_invalidate_ctx(), where
12365 * sfmmu_cnum on a MMU could be set to INVALID_CONTEXT,
12366 * causing TLB demap routine to skip flush on that MMU.
12367 * If the context on a MMU has already been set to
12368 * INVALID_CONTEXT, we just get an extra flush on
12371 hatlockp
= sfmmu_hat_enter(sfmmup
);
12372 kpreempt_disable();
12374 cpuset
= sfmmup
->sfmmu_cpusran
;
12375 CPUSET_AND(cpuset
, cpu_ready_set
);
12376 CPUSET_DEL(cpuset
, CPU
->cpu_id
);
12378 SFMMU_XCALL_STATS(sfmmup
);
12379 xt_some(cpuset
, vtag_flush_pgcnt_tl1
, (uint64_t)addr
,
12382 for (; bitvec
!= 0; bitvec
>>= 1) {
12384 vtag_flushpage(addr
, (uint64_t)sfmmup
);
12385 addr
+= MMU_PAGESIZE
;
12388 sfmmu_hat_exit(hatlockp
);
12390 sfmmu_xcall_save
+= (pgunload
-1);
12392 dmrp
->dmr_bitvec
= 0;
/*
 * In cases where we need to synchronize with TLB/TSB miss trap
 * handlers, _and_ need to flush the TLB, it's a lot easier to
 * throw away the context from the process than to do a
 * special song and dance to keep things consistent for the
 * handlers.
 *
 * Since the process suddenly ends up without a context and our caller
 * holds the hat lock, threads that fault after this function is called
 * will pile up on the lock. We can then do whatever we need to
 * atomically from the context of the caller. The first blocked thread
 * to resume executing will get the process a new context, and the
 * process will resume executing.
 *
 * One added advantage of this approach is that on MMUs that
 * support a "flush all" operation, we will delay the flush until
 * cnum wrap-around, and then flush the TLB one time. This
 * is rather rare, so it's a lot less expensive than making 8000
 * x-calls to flush the TLB 8000 times.
 *
 * A per-process (PP) lock is used to synchronize ctx allocations in
 * resume() and ctx invalidations here.
 */
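/*
 * The sequence below: grab the PP lock with interrupts disabled, mark the
 * hat's cnum INVALID_CONTEXT in every context domain, release the lock,
 * cross-call sfmmu_raise_tsb_exception to the CPUs this hat has run on,
 * and, if the hat being invalidated is the one installed on this CPU,
 * drop the local secondary context to INVALID_CONTEXT as well.
 */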
12419 sfmmu_invalidate_ctx(sfmmu_t
*sfmmup
)
12422 int cnum
, currcnum
;
12423 mmu_ctx_t
*mmu_ctxp
;
12425 uint_t pstate_save
;
12427 SFMMU_STAT(sf_ctx_inv
);
12429 ASSERT(sfmmu_hat_lock_held(sfmmup
));
12430 ASSERT(sfmmup
!= ksfmmup
);
12432 kpreempt_disable();
12434 mmu_ctxp
= CPU_MMU_CTXP(CPU
);
12436 ASSERT(mmu_ctxp
->mmu_idx
< max_mmu_ctxdoms
);
12437 ASSERT(mmu_ctxp
== mmu_ctxs_tbl
[mmu_ctxp
->mmu_idx
]);
12439 currcnum
= sfmmup
->sfmmu_ctxs
[mmu_ctxp
->mmu_idx
].cnum
;
12441 pstate_save
= sfmmu_disable_intrs();
12443 lock_set(&sfmmup
->sfmmu_ctx_lock
); /* acquire PP lock */
12444 /* set HAT cnum invalid across all context domains. */
12445 for (i
= 0; i
< max_mmu_ctxdoms
; i
++) {
12447 cnum
= sfmmup
->sfmmu_ctxs
[i
].cnum
;
12448 if (cnum
== INVALID_CONTEXT
) {
12452 sfmmup
->sfmmu_ctxs
[i
].cnum
= INVALID_CONTEXT
;
12454 membar_enter(); /* make sure globally visible to all CPUs */
12455 lock_clear(&sfmmup
->sfmmu_ctx_lock
); /* release PP lock */
12457 sfmmu_enable_intrs(pstate_save
);
12459 cpuset
= sfmmup
->sfmmu_cpusran
;
12460 CPUSET_DEL(cpuset
, CPU
->cpu_id
);
12461 CPUSET_AND(cpuset
, cpu_ready_set
);
12462 if (!CPUSET_ISNULL(cpuset
)) {
12463 SFMMU_XCALL_STATS(sfmmup
);
12464 xt_some(cpuset
, sfmmu_raise_tsb_exception
,
12465 (uint64_t)sfmmup
, INVALID_CONTEXT
);
12467 SFMMU_STAT(sf_tsb_raise_exception
);
12468 SFMMU_MMU_STAT(mmu_tsb_raise_exception
);
12472 * If the hat to-be-invalidated is the same as the current
12473 * process on local CPU we need to invalidate
12474 * this CPU context as well.
12476 if ((sfmmu_getctx_sec() == currcnum
) &&
12477 (currcnum
!= INVALID_CONTEXT
)) {
12478 /* sets shared context to INVALID too */
12479 sfmmu_setctx_sec(INVALID_CONTEXT
);
12480 sfmmu_clear_utsbinfo();
12483 SFMMU_FLAGS_SET(sfmmup
, HAT_ALLCTX_INVALID
);
12488 * we hold the hat lock, so nobody should allocate a context
12491 ASSERT(sfmmup
->sfmmu_ctxs
[mmu_ctxp
->mmu_idx
].cnum
== INVALID_CONTEXT
);
/*
 * We need to flush the cache in all cpus. It is possible that
 * a process referenced a page as cacheable but has since exited
 * and cleared the mapping list. We still need to flush it but have no
 * state, so flushing on all cpus is the only alternative.
 */
12502 sfmmu_cache_flush(pfn_t pfnum
, int vcolor
)
12506 kpreempt_disable();
12507 cpuset
= cpu_ready_set
;
12508 CPUSET_DEL(cpuset
, CPU
->cpu_id
);
12509 SFMMU_XCALL_STATS(NULL
); /* account to any ctx */
12510 xt_some(cpuset
, vac_flushpage_tl1
, pfnum
, vcolor
);
12512 vac_flushpage(pfnum
, vcolor
);
12517 sfmmu_cache_flushcolor(int vcolor
, pfn_t pfnum
)
12521 ASSERT(vcolor
>= 0);
12523 kpreempt_disable();
12524 cpuset
= cpu_ready_set
;
12525 CPUSET_DEL(cpuset
, CPU
->cpu_id
);
12526 SFMMU_XCALL_STATS(NULL
); /* account to any ctx */
12527 xt_some(cpuset
, vac_flushcolor_tl1
, vcolor
, pfnum
);
12529 vac_flushcolor(vcolor
, pfnum
);
12535 * We need to prevent processes from accessing the TSB using a cached physical
12536 * address. It's alright if they try to access the TSB via virtual address
12537 * since they will just fault on that virtual address once the mapping has
12540 #pragma weak sendmondo_in_recover
12544 sfmmu_tsb_pre_relocator(caddr_t va
, uint_t tsbsz
, uint_t flags
, void *tsbinfo
)
12546 struct tsb_info
*tsbinfop
= (struct tsb_info
*)tsbinfo
;
12547 sfmmu_t
*sfmmup
= tsbinfop
->tsb_sfmmu
;
12548 hatlock_t
*hatlockp
;
12551 if (flags
!= HAT_PRESUSPEND
)
12555 * If tsb is a shared TSB with TSB_SHAREDCTX set, sfmmup must
12556 * be a shared hat, then set SCD's tsbinfo's flag.
12557 * If tsb is not shared, sfmmup is a private hat, then set
12558 * its private tsbinfo's flag.
12560 hatlockp
= sfmmu_hat_enter(sfmmup
);
12561 tsbinfop
->tsb_flags
|= TSB_RELOC_FLAG
;
12563 if (!(tsbinfop
->tsb_flags
& TSB_SHAREDCTX
)) {
12564 sfmmu_tsb_inv_ctx(sfmmup
);
12565 sfmmu_hat_exit(hatlockp
);
12567 /* release lock on the shared hat */
12568 sfmmu_hat_exit(hatlockp
);
12569 /* sfmmup is a shared hat */
12570 ASSERT(sfmmup
->sfmmu_scdhat
);
12571 scdp
= sfmmup
->sfmmu_scdp
;
12572 ASSERT(scdp
!= NULL
);
12573 /* get private hat from the scd list */
12574 mutex_enter(&scdp
->scd_mutex
);
12575 sfmmup
= scdp
->scd_sf_list
;
12576 while (sfmmup
!= NULL
) {
12577 hatlockp
= sfmmu_hat_enter(sfmmup
);
12579 * We do not call sfmmu_tsb_inv_ctx here because
12580 * sendmondo_in_recover check is only needed for
12583 sfmmu_invalidate_ctx(sfmmup
);
12584 sfmmu_hat_exit(hatlockp
);
12585 sfmmup
= sfmmup
->sfmmu_scd_link
.next
;
12588 mutex_exit(&scdp
->scd_mutex
);
12594 sfmmu_tsb_inv_ctx(sfmmu_t
*sfmmup
)
12596 extern uint32_t sendmondo_in_recover
;
12598 ASSERT(sfmmu_hat_lock_held(sfmmup
));
12601 * For Cheetah+ Erratum 25:
12602 * Wait for any active recovery to finish. We can't risk
12603 * relocating the TSB of the thread running mondo_recover_proc()
12604 * since, if we did that, we would deadlock. The scenario we are
12605 * trying to avoid is as follows:
12607 * THIS CPU RECOVER CPU
12608 * -------- -----------
12609 * Begins recovery, walking through TSB
12610 * hat_pagesuspend() TSB TTE
12611 * TLB miss on TSB TTE, spins at TL1
12613 * send_mondo_timeout()
12614 * mondo_recover_proc()
12617 * The second half of the workaround is that mondo_recover_proc()
12618 * checks to see if the tsb_info has the RELOC flag set, and if it
12619 * does, it skips over that TSB without ever touching tsbinfop->tsb_va
12620 * and hence avoiding the TLB miss that could result in a deadlock.
12622 if (&sendmondo_in_recover
) {
12623 membar_enter(); /* make sure RELOC flag visible */
12624 while (sendmondo_in_recover
) {
12630 sfmmu_invalidate_ctx(sfmmup
);
12635 sfmmu_tsb_post_relocator(caddr_t va
, uint_t tsbsz
, uint_t flags
,
12636 void *tsbinfo
, pfn_t newpfn
)
12638 hatlock_t
*hatlockp
;
12639 struct tsb_info
*tsbinfop
= (struct tsb_info
*)tsbinfo
;
12640 sfmmu_t
*sfmmup
= tsbinfop
->tsb_sfmmu
;
12642 if (flags
!= HAT_POSTUNSUSPEND
)
12645 hatlockp
= sfmmu_hat_enter(sfmmup
);
12647 SFMMU_STAT(sf_tsb_reloc
);
12650 * The process may have swapped out while we were relocating one
12651 * of its TSBs. If so, don't bother doing the setup since the
12652 * process can't be using the memory anymore.
12654 if ((tsbinfop
->tsb_flags
& TSB_SWAPPED
) == 0) {
12655 ASSERT(va
== tsbinfop
->tsb_va
);
12656 sfmmu_tsbinfo_setup_phys(tsbinfop
, newpfn
);
12658 if (tsbinfop
->tsb_flags
& TSB_FLUSH_NEEDED
) {
12659 sfmmu_inv_tsb(tsbinfop
->tsb_va
,
12660 TSB_BYTES(tsbinfop
->tsb_szc
));
12661 tsbinfop
->tsb_flags
&= ~TSB_FLUSH_NEEDED
;
12666 tsbinfop
->tsb_flags
&= ~TSB_RELOC_FLAG
;
12667 cv_broadcast(&sfmmup
->sfmmu_tsb_cv
);
12669 sfmmu_hat_exit(hatlockp
);
12675 * Allocate and initialize a tsb_info structure. Note that we may or may not
12676 * allocate a TSB here, depending on the flags passed in.
12679 sfmmu_tsbinfo_alloc(struct tsb_info
**tsbinfopp
, int tsb_szc
, int tte_sz_mask
,
12680 uint_t flags
, sfmmu_t
*sfmmup
)
12684 *tsbinfopp
= (struct tsb_info
*)kmem_cache_alloc(
12685 sfmmu_tsbinfo_cache
, KM_SLEEP
);
12687 if ((err
= sfmmu_init_tsbinfo(*tsbinfopp
, tte_sz_mask
,
12688 tsb_szc
, flags
, sfmmup
)) != 0) {
12689 kmem_cache_free(sfmmu_tsbinfo_cache
, *tsbinfopp
);
12690 SFMMU_STAT(sf_tsb_allocfail
);
12694 SFMMU_STAT(sf_tsb_alloc
);
12697 * Bump the TSB size counters for this TSB size.
12699 (*(((int *)&sfmmu_tsbsize_stat
) + tsb_szc
))++;
12704 sfmmu_tsb_free(struct tsb_info
*tsbinfo
)
12706 caddr_t tsbva
= tsbinfo
->tsb_va
;
12707 uint_t tsb_size
= TSB_BYTES(tsbinfo
->tsb_szc
);
12708 struct kmem_cache
*kmem_cachep
= tsbinfo
->tsb_cache
;
12709 vmem_t
*vmp
= tsbinfo
->tsb_vmp
;
12712 * If we allocated this TSB from relocatable kernel memory, then we
12713 * need to uninstall the callback handler.
12715 if (tsbinfo
->tsb_cache
!= sfmmu_tsb8k_cache
) {
12716 uintptr_t slab_mask
;
12717 caddr_t slab_vaddr
;
12721 ASSERT(tsb_size
<= MMU_PAGESIZE4M
|| use_bigtsb_arena
);
12722 if (tsb_size
> MMU_PAGESIZE4M
)
12723 slab_mask
= ~((uintptr_t)bigtsb_slab_mask
) << PAGESHIFT
;
12725 slab_mask
= ~((uintptr_t)tsb_slab_mask
) << PAGESHIFT
;
12726 slab_vaddr
= (caddr_t
)((uintptr_t)tsbva
& slab_mask
);
12728 ret
= as_pagelock(&kas
, &ppl
, slab_vaddr
, PAGESIZE
, S_WRITE
);
12730 hat_delete_callback(tsbva
, (uint_t
)tsb_size
, (void *)tsbinfo
,
12732 as_pageunlock(&kas
, ppl
, slab_vaddr
, PAGESIZE
, S_WRITE
);
12735 if (kmem_cachep
!= NULL
) {
12736 kmem_cache_free(kmem_cachep
, tsbva
);
12738 vmem_xfree(vmp
, (void *)tsbva
, tsb_size
);
12740 tsbinfo
->tsb_va
= (caddr_t
)0xbad00bad;
12741 atomic_add_64(&tsb_alloc_bytes
, -(int64_t)tsb_size
);
12745 sfmmu_tsbinfo_free(struct tsb_info
*tsbinfo
)
12747 if ((tsbinfo
->tsb_flags
& TSB_SWAPPED
) == 0) {
12748 sfmmu_tsb_free(tsbinfo
);
12750 kmem_cache_free(sfmmu_tsbinfo_cache
, tsbinfo
);
12755 * Setup all the references to physical memory for this tsbinfo.
12756 * The underlying page(s) must be locked.
12759 sfmmu_tsbinfo_setup_phys(struct tsb_info
*tsbinfo
, pfn_t pfn
)
12761 ASSERT(pfn
!= PFN_INVALID
);
12762 ASSERT(pfn
== va_to_pfn(tsbinfo
->tsb_va
));
12765 if (tsbinfo
->tsb_szc
== 0) {
12766 sfmmu_memtte(&tsbinfo
->tsb_tte
, pfn
,
12767 PROT_WRITE
|PROT_READ
, TTE8K
);
12770 * Round down PA and use a large mapping; the handlers will
12771 * compute the TSB pointer at the correct offset into the
12772 * big virtual page. NOTE: this assumes all TSBs larger
12773 * than 8K must come from physically contiguous slabs of
12774 * size tsb_slab_size.
12776 sfmmu_memtte(&tsbinfo
->tsb_tte
, pfn
& ~tsb_slab_mask
,
12777 PROT_WRITE
|PROT_READ
, tsb_slab_ttesz
);
12779 tsbinfo
->tsb_pa
= ptob(pfn
);
12781 TTE_SET_LOCKED(&tsbinfo
->tsb_tte
); /* lock the tte into dtlb */
12782 TTE_SET_MOD(&tsbinfo
->tsb_tte
); /* enable writes */
12784 ASSERT(TTE_IS_PRIVILEGED(&tsbinfo
->tsb_tte
));
12785 ASSERT(TTE_IS_LOCKED(&tsbinfo
->tsb_tte
));
12787 tsbinfo
->tsb_pa
= ptob(pfn
);
/*
 * Returns zero on success, ENOMEM if over the high water mark,
 * or EAGAIN if the caller needs to retry with a smaller TSB
 * size (or specify TSB_FORCEALLOC if the allocation can't fail).
 *
 * This call cannot fail to allocate a TSB if TSB_FORCEALLOC
 * is specified and the TSB requested is PAGESIZE, though it
 * may sleep waiting for memory if sufficient memory is not
 * available.
 */
12803 sfmmu_init_tsbinfo(struct tsb_info
*tsbinfo
, int tteszmask
,
12804 int tsbcode
, uint_t flags
, sfmmu_t
*sfmmup
)
12806 caddr_t vaddr
= NULL
;
12807 caddr_t slab_vaddr
;
12808 uintptr_t slab_mask
;
12809 int tsbbytes
= TSB_BYTES(tsbcode
);
12811 struct kmem_cache
*kmem_cachep
= NULL
;
12812 vmem_t
*vmp
= NULL
;
12813 lgrp_id_t lgrpid
= LGRP_NONE
;
12815 uint_t cbflags
= HAC_SLEEP
;
12819 ASSERT(tsbbytes
<= MMU_PAGESIZE4M
|| use_bigtsb_arena
);
12820 if (tsbbytes
> MMU_PAGESIZE4M
)
12821 slab_mask
= ~((uintptr_t)bigtsb_slab_mask
) << PAGESHIFT
;
12823 slab_mask
= ~((uintptr_t)tsb_slab_mask
) << PAGESHIFT
;
12825 if (flags
& (TSB_FORCEALLOC
| TSB_SWAPIN
| TSB_GROW
| TSB_SHRINK
))
12826 flags
|= TSB_ALLOC
;
12828 ASSERT((flags
& TSB_FORCEALLOC
) == 0 || tsbcode
== TSB_MIN_SZCODE
);
12830 tsbinfo
->tsb_sfmmu
= sfmmup
;
12833 * If not allocating a TSB, set up the tsbinfo, set TSB_SWAPPED, and
12836 if ((flags
& TSB_ALLOC
) == 0) {
12837 tsbinfo
->tsb_szc
= tsbcode
;
12838 tsbinfo
->tsb_ttesz_mask
= tteszmask
;
12839 tsbinfo
->tsb_va
= (caddr_t
)0xbadbadbeef;
12840 tsbinfo
->tsb_pa
= -1;
12841 tsbinfo
->tsb_tte
.ll
= 0;
12842 tsbinfo
->tsb_next
= NULL
;
12843 tsbinfo
->tsb_flags
= TSB_SWAPPED
;
12844 tsbinfo
->tsb_cache
= NULL
;
12845 tsbinfo
->tsb_vmp
= NULL
;
12852 * Randomly force allocation failures every tsb_alloc_mtbf
12853 * tries if TSB_FORCEALLOC is not specified. This will
12854 * return ENOMEM if tsb_alloc_mtbf is odd, or EAGAIN if
12855 * it is even, to allow testing of both failure paths...
12857 if (tsb_alloc_mtbf
&& ((flags
& TSB_FORCEALLOC
) == 0) &&
12858 (tsb_alloc_count
++ == tsb_alloc_mtbf
)) {
12859 tsb_alloc_count
= 0;
12860 tsb_alloc_fail_mtbf
++;
12861 return ((tsb_alloc_mtbf
& 1)? ENOMEM
: EAGAIN
);
12866 * Enforce high water mark if we are not doing a forced allocation
12867 * and are not shrinking a process' TSB.
12869 if ((flags
& TSB_SHRINK
) == 0 &&
12870 (tsbbytes
+ tsb_alloc_bytes
) > tsb_alloc_hiwater
) {
12871 if ((flags
& TSB_FORCEALLOC
) == 0)
	/*
	 * Allocate from the correct location based upon the size of the TSB
	 * compared to the base page size, and what memory conditions dictate.
	 * Note we always do nonblocking allocations from the TSB arena since
	 * we don't want memory fragmentation to cause processes to block
	 * indefinitely waiting for memory; until the kernel algorithms that
	 * coalesce large pages are improved this is our best option.
	 *
	 * If allocating a "large" TSB (>8K), allocate from the
	 *	appropriate kmem_tsb_default_arena vmem arena
	 * else if low on memory or the TSB_FORCEALLOC flag is set or
	 * tsb_forceheap is set
	 *	Allocate from kernel heap via sfmmu_tsb8k_cache with
	 *	KM_SLEEP (never fails)
	 * else
	 *	Allocate from appropriate sfmmu_tsb_cache with
	 *	KM_NOSLEEP
	 */
12896 if (tsb_lgrp_affinity
)
12897 lgrpid
= lgrp_home_id(curthread
);
12898 if (lgrpid
== LGRP_NONE
)
12899 lgrpid
= 0; /* use lgrp of boot CPU */
12901 if (tsbbytes
> MMU_PAGESIZE
) {
12902 if (tsbbytes
> MMU_PAGESIZE4M
) {
12903 vmp
= kmem_bigtsb_default_arena
[lgrpid
];
12904 vaddr
= (caddr_t
)vmem_xalloc(vmp
, tsbbytes
, tsbbytes
,
12905 0, 0, NULL
, NULL
, VM_NOSLEEP
);
12907 vmp
= kmem_tsb_default_arena
[lgrpid
];
12908 vaddr
= (caddr_t
)vmem_xalloc(vmp
, tsbbytes
, tsbbytes
,
12909 0, 0, NULL
, NULL
, VM_NOSLEEP
);
12912 } else if (lowmem
|| (flags
& TSB_FORCEALLOC
) || tsb_forceheap
) {
12914 } else if (lowmem
|| (flags
& TSB_FORCEALLOC
)) {
12916 kmem_cachep
= sfmmu_tsb8k_cache
;
12917 vaddr
= (caddr_t
)kmem_cache_alloc(kmem_cachep
, KM_SLEEP
);
12918 ASSERT(vaddr
!= NULL
);
12920 kmem_cachep
= sfmmu_tsb_cache
[lgrpid
];
12921 vaddr
= (caddr_t
)kmem_cache_alloc(kmem_cachep
, KM_NOSLEEP
);
12924 tsbinfo
->tsb_cache
= kmem_cachep
;
12925 tsbinfo
->tsb_vmp
= vmp
;
12927 if (vaddr
== NULL
) {
12931 atomic_add_64(&tsb_alloc_bytes
, (int64_t)tsbbytes
);
12932 kmem_cachep
= tsbinfo
->tsb_cache
;
12935 * If we are allocating from outside the cage, then we need to
12936 * register a relocation callback handler. Note that for now
12937 * since pseudo mappings always hang off of the slab's root page,
12938 * we need only lock the first 8K of the TSB slab. This is a bit
12939 * hacky but it is good for performance.
12941 if (kmem_cachep
!= sfmmu_tsb8k_cache
) {
12942 slab_vaddr
= (caddr_t
)((uintptr_t)vaddr
& slab_mask
);
12943 ret
= as_pagelock(&kas
, &pplist
, slab_vaddr
, PAGESIZE
, S_WRITE
);
12945 ret
= hat_add_callback(sfmmu_tsb_cb_id
, vaddr
, (uint_t
)tsbbytes
,
12946 cbflags
, (void *)tsbinfo
, &pfn
, NULL
);
12949 * Need to free up resources if we could not successfully
12950 * add the callback function and return an error condition.
12954 kmem_cache_free(kmem_cachep
, vaddr
);
12956 vmem_xfree(vmp
, (void *)vaddr
, tsbbytes
);
12958 as_pageunlock(&kas
, pplist
, slab_vaddr
, PAGESIZE
,
12964 * Since allocation of 8K TSBs from heap is rare and occurs
12965 * during memory pressure we allocate them from permanent
12966 * memory rather than using callbacks to get the PFN.
12968 pfn
= hat_getpfnum(kas
.a_hat
, vaddr
);
12971 tsbinfo
->tsb_va
= vaddr
;
12972 tsbinfo
->tsb_szc
= tsbcode
;
12973 tsbinfo
->tsb_ttesz_mask
= tteszmask
;
12974 tsbinfo
->tsb_next
= NULL
;
12975 tsbinfo
->tsb_flags
= 0;
12977 sfmmu_tsbinfo_setup_phys(tsbinfo
, pfn
);
12979 sfmmu_inv_tsb(vaddr
, tsbbytes
);
12981 if (kmem_cachep
!= sfmmu_tsb8k_cache
) {
12982 as_pageunlock(&kas
, pplist
, slab_vaddr
, PAGESIZE
, S_WRITE
);
12989 * Initialize per cpu tsb and per cpu tsbmiss_area
12992 sfmmu_init_tsbs(void)
12995 struct tsbmiss
*tsbmissp
;
12996 struct kpmtsbm
*kpmtsbmp
;
12998 extern int dcache_line_mask
;
13000 extern uint_t vac_colors
;
13003 * Init. tsb miss area.
13005 tsbmissp
= tsbmiss_area
;
13007 for (i
= 0; i
< NCPU
; tsbmissp
++, i
++) {
13009 * initialize the tsbmiss area.
13010 * Do this for all possible CPUs as some may be added
13011 * while the system is running. There is no cost to this.
13013 tsbmissp
->ksfmmup
= ksfmmup
;
13015 tsbmissp
->dcache_line_mask
= (uint16_t)dcache_line_mask
;
13017 tsbmissp
->khashstart
=
13018 (struct hmehash_bucket
*)va_to_pa((caddr_t
)khme_hash
);
13019 tsbmissp
->uhashstart
=
13020 (struct hmehash_bucket
*)va_to_pa((caddr_t
)uhme_hash
);
13021 tsbmissp
->khashsz
= khmehash_num
;
13022 tsbmissp
->uhashsz
= uhmehash_num
;
13025 sfmmu_tsb_cb_id
= hat_register_callback('T'<<16 | 'S' << 8 | 'B',
13026 sfmmu_tsb_pre_relocator
, sfmmu_tsb_post_relocator
, NULL
, 0);
13028 if (kpm_enable
== 0)
13031 /* -- Begin KPM specific init -- */
13033 if (kpm_smallpages
) {
13035 * If we're using base pagesize pages for seg_kpm
13036 * mappings, we use the kernel TSB since we can't afford
13037 * to allocate a second huge TSB for these mappings.
13039 kpm_tsbbase
= ktsb_phys
? ktsb_pbase
: (uint64_t)ktsb_base
;
13040 kpm_tsbsz
= ktsb_szcode
;
13041 kpmsm_tsbbase
= kpm_tsbbase
;
13042 kpmsm_tsbsz
= kpm_tsbsz
;
13045 * In VAC conflict case, just put the entries in the
13046 * kernel 8K indexed TSB for now so we can find them.
13047 * This could really be changed in the future if we feel
13050 kpmsm_tsbbase
= ktsb_phys
? ktsb_pbase
: (uint64_t)ktsb_base
;
13051 kpmsm_tsbsz
= ktsb_szcode
;
13052 kpm_tsbbase
= ktsb_phys
? ktsb4m_pbase
: (uint64_t)ktsb4m_base
;
13053 kpm_tsbsz
= ktsb4m_szcode
;
13056 kpmtsbmp
= kpmtsbm_area
;
13057 for (i
= 0; i
< NCPU
; kpmtsbmp
++, i
++) {
13059 * Initialize the kpmtsbm area.
13060 * Do this for all possible CPUs as some may be added
13061 * while the system is running. There is no cost to this.
13063 kpmtsbmp
->vbase
= kpm_vbase
;
13064 kpmtsbmp
->vend
= kpm_vbase
+ kpm_size
* vac_colors
;
13065 kpmtsbmp
->sz_shift
= kpm_size_shift
;
13066 kpmtsbmp
->kpmp_shift
= kpmp_shift
;
13067 kpmtsbmp
->kpmp2pshft
= (uchar_t
)kpmp2pshft
;
13068 if (kpm_smallpages
== 0) {
13069 kpmtsbmp
->kpmp_table_sz
= kpmp_table_sz
;
13070 kpmtsbmp
->kpmp_tablepa
= va_to_pa(kpmp_table
);
13072 kpmtsbmp
->kpmp_table_sz
= kpmp_stable_sz
;
13073 kpmtsbmp
->kpmp_tablepa
= va_to_pa(kpmp_stable
);
13075 kpmtsbmp
->msegphashpa
= va_to_pa(memseg_phash
);
13076 kpmtsbmp
->flags
= KPMTSBM_ENABLE_FLAG
;
13078 kpmtsbmp
->flags
|= (kpm_tsbmtl
) ? KPMTSBM_TLTSBM_FLAG
: 0;
13081 kpmtsbmp
->flags
|= KPMTSBM_TSBPHYS_FLAG
;
13084 /* -- End KPM specific init -- */
13087 /* Avoid using sfmmu_tsbinfo_alloc() to avoid kmem_alloc - no real reason */
13088 struct tsb_info ktsb_info
[2];
13091 * Called from hat_kern_setup() to setup the tsb_info for ksfmmup.
13094 sfmmu_init_ktsbinfo()
13096 ASSERT(ksfmmup
!= NULL
);
13097 ASSERT(ksfmmup
->sfmmu_tsb
== NULL
);
13099 * Allocate tsbinfos for kernel and copy in data
13100 * to make debug easier and sun4v setup easier.
13102 ktsb_info
[0].tsb_sfmmu
= ksfmmup
;
13103 ktsb_info
[0].tsb_szc
= ktsb_szcode
;
13104 ktsb_info
[0].tsb_ttesz_mask
= TSB8K
|TSB64K
|TSB512K
;
13105 ktsb_info
[0].tsb_va
= ktsb_base
;
13106 ktsb_info
[0].tsb_pa
= ktsb_pbase
;
13107 ktsb_info
[0].tsb_flags
= 0;
13108 ktsb_info
[0].tsb_tte
.ll
= 0;
13109 ktsb_info
[0].tsb_cache
= NULL
;
13111 ktsb_info
[1].tsb_sfmmu
= ksfmmup
;
13112 ktsb_info
[1].tsb_szc
= ktsb4m_szcode
;
13113 ktsb_info
[1].tsb_ttesz_mask
= TSB4M
;
13114 ktsb_info
[1].tsb_va
= ktsb4m_base
;
13115 ktsb_info
[1].tsb_pa
= ktsb4m_pbase
;
13116 ktsb_info
[1].tsb_flags
= 0;
13117 ktsb_info
[1].tsb_tte
.ll
= 0;
13118 ktsb_info
[1].tsb_cache
= NULL
;
13120 /* Link them into ksfmmup. */
13121 ktsb_info
[0].tsb_next
= &ktsb_info
[1];
13122 ktsb_info
[1].tsb_next
= NULL
;
13123 ksfmmup
->sfmmu_tsb
= &ktsb_info
[0];
13125 sfmmu_setup_tsbinfo(ksfmmup
);
/*
 * Cache the last value returned from va_to_pa(). If the VA specified
 * in the current call to cached_va_to_pa() maps to the same Page (as the
 * previous call to cached_va_to_pa()), then compute the PA using
 * cached info, else call va_to_pa().
 *
 * Note: this function is neither MT-safe nor consistent in the presence
 * of multiple, interleaved threads. This function was created to enable
 * an optimization used during boot (at a point when there's only one thread
 * executing on the "boot CPU", and before startup_vm() has been called).
 */
uint64_t
cached_va_to_pa(void *vaddr)
{
	static uint64_t prev_vaddr_base = 0;
	static uint64_t prev_pfn = 0;

	if ((((uint64_t)vaddr) & MMU_PAGEMASK) == prev_vaddr_base) {
		return (prev_pfn | ((uint64_t)vaddr & MMU_PAGEOFFSET));
	} else {
		uint64_t pa = va_to_pa(vaddr);

		if (pa != ((uint64_t)-1)) {
			/*
			 * Computed physical address is valid. Cache its
			 * related info for the next cached_va_to_pa() call.
			 */
			prev_pfn = pa & MMU_PAGEMASK;
			prev_vaddr_base = ((uint64_t)vaddr) & MMU_PAGEMASK;
		}

		return (pa);
	}
}
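/*
 * For example, a series of translations that all fall within one MMU page
 * (as when the nucleus hmeblks below are carved from a contiguous chunk)
 * pays for a single real va_to_pa() call; subsequent lookups are served
 * from prev_pfn plus the page offset.
 */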
13164 * Carve up our nucleus hblk region. We may allocate more hblks than
13165 * asked due to rounding errors but we are guaranteed to have at least
13166 * enough space to allocate the requested number of hblk8's and hblk1's.
13169 sfmmu_init_nucleus_hblks(caddr_t addr
, size_t size
, int nhblk8
, int nhblk1
)
13171 struct hme_blk
*hmeblkp
;
13172 size_t hme8blk_sz
, hme1blk_sz
;
13174 size_t hblk8_bound
;
13175 ulong_t j
= 0, k
= 0;
13177 ASSERT(addr
!= NULL
&& size
!= 0);
13179 /* Need to use proper structure alignment */
13180 hme8blk_sz
= roundup(HME8BLK_SZ
, sizeof (int64_t));
13181 hme1blk_sz
= roundup(HME1BLK_SZ
, sizeof (int64_t));
13183 nucleus_hblk8
.list
= (void *)addr
;
13184 nucleus_hblk8
.index
= 0;
13187 * Use as much memory as possible for hblk8's since we
13188 * expect all bop_alloc'ed memory to be allocated in 8k chunks.
13189 * We need to hold back enough space for the hblk1's which
13190 * we'll allocate next.
13192 hblk8_bound
= size
- (nhblk1
* hme1blk_sz
) - hme8blk_sz
;
13193 for (i
= 0; i
<= hblk8_bound
; i
+= hme8blk_sz
, j
++) {
13194 hmeblkp
= (struct hme_blk
*)addr
;
13195 addr
+= hme8blk_sz
;
13196 hmeblkp
->hblk_nuc_bit
= 1;
13197 hmeblkp
->hblk_nextpa
= cached_va_to_pa((caddr_t
)hmeblkp
);
13199 nucleus_hblk8
.len
= j
;
13200 ASSERT(j
>= nhblk8
);
13201 SFMMU_STAT_ADD(sf_hblk8_ncreate
, j
);
13203 nucleus_hblk1
.list
= (void *)addr
;
13204 nucleus_hblk1
.index
= 0;
13205 for (; i
<= (size
- hme1blk_sz
); i
+= hme1blk_sz
, k
++) {
13206 hmeblkp
= (struct hme_blk
*)addr
;
13207 addr
+= hme1blk_sz
;
13208 hmeblkp
->hblk_nuc_bit
= 1;
13209 hmeblkp
->hblk_nextpa
= cached_va_to_pa((caddr_t
)hmeblkp
);
13211 ASSERT(k
>= nhblk1
);
13212 nucleus_hblk1
.len
= k
;
13213 SFMMU_STAT_ADD(sf_hblk1_ncreate
, k
);
13217 * This function is currently not supported on this platform. For what
13218 * it's supposed to do, see hat.c and hat_srmmu.c
13222 hat_softlock(struct hat
*hat
, caddr_t addr
, size_t *lenp
, page_t
**ppp
,
13225 return (FC_NOSUPPORT
);
/*
 * Searches the mapping list of the page for a mapping of the same size. If
 * not found, the corresponding bit is cleared in the p_index field. When large
 * pages are more prevalent in the system, we can maintain the mapping list
 * in order and we don't have to traverse the list each time. Just check the
 * next and prev entries, and if both are of different size, we clear the bit.
 */
13236 sfmmu_rm_large_mappings(page_t
*pp
, int ttesz
)
13238 struct sf_hment
*sfhmep
;
13239 struct hme_blk
*hmeblkp
;
13243 ASSERT(ttesz
> TTE8K
);
13245 ASSERT(sfmmu_mlist_held(pp
));
13247 ASSERT(PP_ISMAPPED_LARGE(pp
));
13250 * Traverse mapping list looking for another mapping of same size.
13251 * since we only want to clear index field if all mappings of
13252 * that size are gone.
13255 for (sfhmep
= pp
->p_mapping
; sfhmep
; sfhmep
= sfhmep
->hme_next
) {
13256 if (IS_PAHME(sfhmep
))
13258 hmeblkp
= sfmmu_hmetohblk(sfhmep
);
13259 if (hme_size(sfhmep
) == ttesz
) {
13261 * another mapping of the same size. don't clear index.
13268 * Clear the p_index bit for large page.
13270 index
= PAGESZ_TO_INDEX(ttesz
);
13271 npgs
= TTEPAGES(ttesz
);
13272 while (npgs
-- > 0) {
13273 ASSERT(pp
->p_index
& index
);
13274 pp
->p_index
&= ~index
;
13275 pp
= PP_PAGENEXT(pp
);
13280 * return supported features
13284 hat_supported(enum hat_features feature
, void *arg
)
13287 case HAT_SHARED_PT
:
13288 case HAT_DYNAMIC_ISM_UNMAP
:
13291 case HAT_SHARED_REGIONS
:
13302 hat_enter(struct hat
*hat
)
13304 hatlock_t
*hatlockp
;
13306 if (hat
!= ksfmmup
) {
13307 hatlockp
= TSB_HASH(hat
);
13308 mutex_enter(HATLOCK_MUTEXP(hatlockp
));
13313 hat_exit(struct hat
*hat
)
13315 hatlock_t
*hatlockp
;
13317 if (hat
!= ksfmmup
) {
13318 hatlockp
= TSB_HASH(hat
);
13319 mutex_exit(HATLOCK_MUTEXP(hatlockp
));
13325 hat_reserve(struct as
*as
, caddr_t addr
, size_t len
)
13330 hat_kstat_init(void)
13334 ksp
= kstat_create("unix", 0, "sfmmu_global_stat", "hat",
13335 KSTAT_TYPE_RAW
, sizeof (struct sfmmu_global_stat
),
13336 KSTAT_FLAG_VIRTUAL
);
13338 ksp
->ks_data
= (void *) &sfmmu_global_stat
;
13339 kstat_install(ksp
);
13341 ksp
= kstat_create("unix", 0, "sfmmu_tsbsize_stat", "hat",
13342 KSTAT_TYPE_RAW
, sizeof (struct sfmmu_tsbsize_stat
),
13343 KSTAT_FLAG_VIRTUAL
);
13345 ksp
->ks_data
= (void *) &sfmmu_tsbsize_stat
;
13346 kstat_install(ksp
);
13348 ksp
= kstat_create("unix", 0, "sfmmu_percpu_stat", "hat",
13349 KSTAT_TYPE_RAW
, sizeof (struct sfmmu_percpu_stat
) * NCPU
,
13350 KSTAT_FLAG_WRITABLE
);
13352 ksp
->ks_update
= sfmmu_kstat_percpu_update
;
13353 kstat_install(ksp
);
13359 sfmmu_kstat_percpu_update(kstat_t
*ksp
, int rw
)
13361 struct sfmmu_percpu_stat
*cpu_kstat
= ksp
->ks_data
;
13362 struct tsbmiss
*tsbm
= tsbmiss_area
;
13363 struct kpmtsbm
*kpmtsbm
= kpmtsbm_area
;
13367 if (rw
== KSTAT_READ
) {
13368 for (i
= 0; i
< NCPU
; cpu_kstat
++, tsbm
++, kpmtsbm
++, i
++) {
13369 cpu_kstat
->sf_itlb_misses
= 0;
13370 cpu_kstat
->sf_dtlb_misses
= 0;
13371 cpu_kstat
->sf_utsb_misses
= tsbm
->utsb_misses
-
13373 cpu_kstat
->sf_ktsb_misses
= tsbm
->ktsb_misses
+
13374 kpmtsbm
->kpm_tsb_misses
- tsbm
->kprot_traps
;
13375 cpu_kstat
->sf_tsb_hits
= 0;
13376 cpu_kstat
->sf_umod_faults
= tsbm
->uprot_traps
;
13377 cpu_kstat
->sf_kmod_faults
= tsbm
->kprot_traps
;
13380 /* KSTAT_WRITE is used to clear stats */
13381 for (i
= 0; i
< NCPU
; tsbm
++, kpmtsbm
++, i
++) {
13382 tsbm
->utsb_misses
= 0;
13383 tsbm
->ktsb_misses
= 0;
13384 tsbm
->uprot_traps
= 0;
13385 tsbm
->kprot_traps
= 0;
13386 kpmtsbm
->kpm_dtlb_misses
= 0;
13387 kpmtsbm
->kpm_tsb_misses
= 0;
13395 tte_t
*gorig
[NCPU
], *gcur
[NCPU
], *gnew
[NCPU
];
13398 * A tte checker. *orig_old is the value we read before cas.
13399 * *cur is the value returned by cas.
13400 * *new is the desired value when we do the cas.
13402 * *hmeblkp is currently unused.
13407 chk_tte(tte_t
*orig_old
, tte_t
*cur
, tte_t
*new, struct hme_blk
*hmeblkp
)
13410 int cpuid
= CPU
->cpu_id
;
13412 gorig
[cpuid
] = orig_old
;
13420 if (TTE_IS_VALID(orig_old
)) {
13421 if (TTE_IS_VALID(cur
)) {
13422 i
= TTE_TO_TTEPFN(orig_old
);
13423 j
= TTE_TO_TTEPFN(cur
);
13424 k
= TTE_TO_TTEPFN(new);
13427 panic("chk_tte: bad pfn, 0x%lx, 0x%lx", i
, j
);
13432 panic("chk_tte: bad pfn2, 0x%lx, 0x%lx", i
, k
);
13435 if (TTE_IS_VALID(new)) {
13436 panic("chk_tte: invalid cur? ");
13439 i
= TTE_TO_TTEPFN(orig_old
);
13440 k
= TTE_TO_TTEPFN(new);
13442 panic("chk_tte: bad pfn3, 0x%lx, 0x%lx", i
, k
);
13446 if (TTE_IS_VALID(cur
)) {
13447 j
= TTE_TO_TTEPFN(cur
);
13448 if (TTE_IS_VALID(new)) {
13449 k
= TTE_TO_TTEPFN(new);
13451 panic("chk_tte: bad pfn4, 0x%lx, 0x%lx",
13455 panic("chk_tte: why here?");
13458 if (!TTE_IS_VALID(new)) {
13459 panic("chk_tte: why here2 ?");
extern void prefetch_tsbe_read(struct tsbe *);
extern void prefetch_tsbe_write(struct tsbe *);

/*
 * We want to prefetch 7 cache lines ahead for our read prefetch. This gives
 * us optimal performance on Cheetah+. You can only have 8 outstanding
 * prefetches at any one time, so we opted for 7 read prefetches and 1 write
 * prefetch to make the best use of the prefetch capability.
 */
#define	TSBE_PREFETCH_STRIDE (7)
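/*
 * With 16-byte TSB entries and 64-byte P$ lines this stride spans
 * 4 * (TSBE_PREFETCH_STRIDE + 1) = 32 entries, which is why the copy loop
 * below stops issuing read prefetches 32 entries before the end of the old
 * TSB (see last_prefetch in sfmmu_copy_tsb()).
 */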
void
sfmmu_copy_tsb(struct tsb_info *old_tsbinfo, struct tsb_info *new_tsbinfo)
{
	int old_bytes = TSB_BYTES(old_tsbinfo->tsb_szc);
	int new_bytes = TSB_BYTES(new_tsbinfo->tsb_szc);
	int old_entries = TSB_ENTRIES(old_tsbinfo->tsb_szc);
	int new_entries = TSB_ENTRIES(new_tsbinfo->tsb_szc);
	struct tsbe *old, *new;
	struct tsbe *new_base = (struct tsbe *)new_tsbinfo->tsb_va;
	int i, last_prefetch, new_offset, vpshift;
	uint64_t va;

	if (old_bytes == new_bytes) {
		bcopy(old_tsbinfo->tsb_va, new_tsbinfo->tsb_va, new_bytes);
	} else {
		/*
		 * A TSBE is 16 bytes which means there are four TSBE's per
		 * P$ line (64 bytes), thus every 4 TSBE's we prefetch.
		 */
		old = (struct tsbe *)old_tsbinfo->tsb_va;
		last_prefetch = old_entries - (4*(TSBE_PREFETCH_STRIDE+1));
		for (i = 0; i < old_entries; i++, old++) {
			if (((i & (4-1)) == 0) && (i < last_prefetch))
				prefetch_tsbe_read(old);
			if (!old->tte_tag.tag_invalid) {
				/*
				 * We have a valid TTE to remap. Check the
				 * size. We won't remap 64K or 512K TTEs
				 * because they span more than one TSB entry
				 * and are indexed using an 8K virt. page.
				 * Ditto for 32M and 256M TTEs.
				 */
				if (TTE_CSZ(&old->tte_data) == TTE64K ||
				    TTE_CSZ(&old->tte_data) == TTE512K)
					continue;
				if (mmu_page_sizes == max_mmu_page_sizes) {
					if (TTE_CSZ(&old->tte_data) == TTE32M ||
					    TTE_CSZ(&old->tte_data) == TTE256M)
						continue;
				}

				/* clear the lower 22 bits of the va */
				va = *(uint64_t *)old << 22;
				/* turn va into a virtual pfn */
				va >>= 22 - TSB_START_SIZE;
				/*
				 * or in bits from the offset in the tsb
				 * to get the real virtual pfn. These
				 * correspond to bits [21:13] in the va
				 */
				vpshift =
				    TTE_BSZS_SHIFT(TTE_CSZ(&old->tte_data)) &
				    0x1ff;
				va |= (i << vpshift);

				new_offset = va & (new_entries - 1);
				new = new_base + new_offset;
				prefetch_tsbe_write(new);
				*new = *old;
			}
		}
	}
}
/*
 * Called when a thread is exiting and we have switched to the kernel address
 * space. Perform the same VM initialization resume() uses when switching
 * processes.
 *
 * Note that sfmmu_load_mmustate() is currently a no-op for kernel threads, but
 * we call it anyway in case the semantics change in the future.
 */
/*ARGSUSED*/
void
hat_thread_exit(kthread_t *thd)
{
	uint_t pgsz_cnum;
	uint_t pstate_save;

	ASSERT(thd->t_procp->p_as == &kas);

	pgsz_cnum = KCONTEXT;
#ifdef sun4u
	pgsz_cnum |= (ksfmmup->sfmmu_cext << CTXREG_EXT_SHIFT);
#endif

	/*
	 * Note that sfmmu_load_mmustate() is currently a no-op for
	 * kernel threads. We need to disable interrupts here,
	 * simply because otherwise sfmmu_load_mmustate() would panic
	 * if the caller does not disable interrupts.
	 */
	pstate_save = sfmmu_disable_intrs();

	/* Compatibility Note: hw takes care of MMU_SCONTEXT1 */
	sfmmu_setctx_sec(pgsz_cnum);
	sfmmu_load_mmustate(ksfmmup);
	sfmmu_enable_intrs(pstate_save);
}
#define	SRD_HASH_FUNCTION(vp)	(((((uintptr_t)(vp)) >> 4) ^ \
				(((uintptr_t)(vp)) >> 11)) & \
				srd_hashmask)
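
/*
 * SRDs (shared region descriptors) are hashed by the exec vnode pointer.
 * hat_join_srd() below looks the vnode up under the bucket lock and takes a
 * reference if a matching SRD already exists; otherwise it allocates a new
 * SRD, re-checks the bucket under the lock, and either installs the new SRD
 * or frees it if another thread installed one first.
 */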
/*
 * Attach the process to the srd struct associated with the exec vnode
 * from which the process is started.
 */
void
hat_join_srd(struct hat *sfmmup, vnode_t *evp)
{
	uint_t hash = SRD_HASH_FUNCTION(evp);
	sf_srd_t *srdp;
	sf_srd_t *newsrdp;

	ASSERT(sfmmup != ksfmmup);
	ASSERT(sfmmup->sfmmu_srdp == NULL);

	if (!shctx_on) {
		return;
	}

	VN_HOLD(evp);

	if (srd_buckets[hash].srdb_srdp != NULL) {
		mutex_enter(&srd_buckets[hash].srdb_lock);
		for (srdp = srd_buckets[hash].srdb_srdp; srdp != NULL;
		    srdp = srdp->srd_hash) {
			if (srdp->srd_evp == evp) {
				ASSERT(srdp->srd_refcnt >= 0);
				sfmmup->sfmmu_srdp = srdp;
				atomic_inc_32(
				    (volatile uint_t *)&srdp->srd_refcnt);
				mutex_exit(&srd_buckets[hash].srdb_lock);
				return;
			}
		}
		mutex_exit(&srd_buckets[hash].srdb_lock);
	}
	newsrdp = kmem_cache_alloc(srd_cache, KM_SLEEP);
	ASSERT(newsrdp->srd_next_ismrid == 0 && newsrdp->srd_next_hmerid == 0);

	newsrdp->srd_evp = evp;
	newsrdp->srd_refcnt = 1;
	newsrdp->srd_hmergnfree = NULL;
	newsrdp->srd_ismrgnfree = NULL;

	mutex_enter(&srd_buckets[hash].srdb_lock);
	for (srdp = srd_buckets[hash].srdb_srdp; srdp != NULL;
	    srdp = srdp->srd_hash) {
		if (srdp->srd_evp == evp) {
			ASSERT(srdp->srd_refcnt >= 0);
			sfmmup->sfmmu_srdp = srdp;
			atomic_inc_32((volatile uint_t *)&srdp->srd_refcnt);
			mutex_exit(&srd_buckets[hash].srdb_lock);
			kmem_cache_free(srd_cache, newsrdp);
			return;
		}
	}
	newsrdp->srd_hash = srd_buckets[hash].srdb_srdp;
	srd_buckets[hash].srdb_srdp = newsrdp;
	sfmmup->sfmmu_srdp = newsrdp;

	mutex_exit(&srd_buckets[hash].srdb_lock);
}
static void
sfmmu_leave_srd(sfmmu_t *sfmmup)
{
	vnode_t *evp;
	sf_srd_t *srdp = sfmmup->sfmmu_srdp;
	uint_t hash;
	sf_srd_t **prev_srdpp;
	sf_region_t *rgnp;
	sf_region_t *nrgnp;
	int rgns = 0;
	int i;

	ASSERT(sfmmup != ksfmmup);
	ASSERT(srdp != NULL);
	ASSERT(srdp->srd_refcnt > 0);
	ASSERT(sfmmup->sfmmu_scdp == NULL);
	ASSERT(sfmmup->sfmmu_free == 1);

	sfmmup->sfmmu_srdp = NULL;
	evp = srdp->srd_evp;
	ASSERT(evp != NULL);
	if (atomic_dec_32_nv((volatile uint_t *)&srdp->srd_refcnt)) {
		VN_RELE(evp);
		return;
	}

	hash = SRD_HASH_FUNCTION(evp);
	mutex_enter(&srd_buckets[hash].srdb_lock);
	for (prev_srdpp = &srd_buckets[hash].srdb_srdp;
	    (srdp = *prev_srdpp) != NULL; prev_srdpp = &srdp->srd_hash) {
		if (srdp->srd_evp == evp) {
			break;
		}
	}
	if (srdp == NULL || srdp->srd_refcnt) {
		mutex_exit(&srd_buckets[hash].srdb_lock);
		VN_RELE(evp);
		return;
	}
	*prev_srdpp = srdp->srd_hash;
	mutex_exit(&srd_buckets[hash].srdb_lock);

	ASSERT(srdp->srd_refcnt == 0);
	VN_RELE(evp);

	for (i = 0; i < SFMMU_MAX_REGION_BUCKETS; i++) {
		ASSERT(srdp->srd_rgnhash[i] == NULL);
	}

	/* free each hme region in the srd */
	for (rgnp = srdp->srd_hmergnfree; rgnp != NULL; rgnp = nrgnp) {
		nrgnp = rgnp->rgn_next;
		ASSERT(rgnp->rgn_id < srdp->srd_next_hmerid);
		ASSERT(rgnp->rgn_refcnt == 0);
		ASSERT(rgnp->rgn_sfmmu_head == NULL);
		ASSERT(rgnp->rgn_flags & SFMMU_REGION_FREE);
		ASSERT(rgnp->rgn_hmeflags == 0);
		ASSERT(srdp->srd_hmergnp[rgnp->rgn_id] == rgnp);
		for (i = 0; i < MMU_PAGE_SIZES; i++) {
			ASSERT(rgnp->rgn_ttecnt[i] == 0);
		}
		rgns++;
		kmem_cache_free(region_cache, rgnp);
	}
	ASSERT(rgns == srdp->srd_next_hmerid);

	rgns = 0;
	/* free each ism region in the srd */
	for (rgnp = srdp->srd_ismrgnfree; rgnp != NULL; rgnp = nrgnp) {
		nrgnp = rgnp->rgn_next;
		ASSERT(rgnp->rgn_id < srdp->srd_next_ismrid);
		ASSERT(rgnp->rgn_refcnt == 0);
		ASSERT(rgnp->rgn_sfmmu_head == NULL);
		ASSERT(rgnp->rgn_flags & SFMMU_REGION_FREE);
		ASSERT(srdp->srd_ismrgnp[rgnp->rgn_id] == rgnp);
		for (i = 0; i < MMU_PAGE_SIZES; i++) {
			ASSERT(rgnp->rgn_ttecnt[i] == 0);
		}
		rgns++;
		kmem_cache_free(region_cache, rgnp);
	}
	ASSERT(rgns == srdp->srd_next_ismrid);
	ASSERT(srdp->srd_ismbusyrgns == 0);
	ASSERT(srdp->srd_hmebusyrgns == 0);

	srdp->srd_next_ismrid = 0;
	srdp->srd_next_hmerid = 0;

	bzero((void *)srdp->srd_ismrgnp,
	    sizeof (sf_region_t *) * SFMMU_MAX_ISM_REGIONS);
	bzero((void *)srdp->srd_hmergnp,
	    sizeof (sf_region_t *) * SFMMU_MAX_HME_REGIONS);

	ASSERT(srdp->srd_scdp == NULL);
	kmem_cache_free(srd_cache, srdp);
}
/* ARGSUSED */
static int
sfmmu_srdcache_constructor(void *buf, void *cdrarg, int kmflags)
{
	sf_srd_t *srdp = (sf_srd_t *)buf;
	bzero(buf, sizeof (*srdp));

	mutex_init(&srdp->srd_mutex, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&srdp->srd_scd_mutex, NULL, MUTEX_DEFAULT, NULL);
	return (0);
}

/* ARGSUSED */
static void
sfmmu_srdcache_destructor(void *buf, void *cdrarg)
{
	sf_srd_t *srdp = (sf_srd_t *)buf;

	mutex_destroy(&srdp->srd_mutex);
	mutex_destroy(&srdp->srd_scd_mutex);
}
/*
 * The caller makes sure hat_join_region()/hat_leave_region() can't be called
 * at the same time for the same process and address range. This is ensured by
 * the fact that address space is locked as writer when a process joins the
 * regions. Therefore there's no need to hold an srd lock during the entire
 * execution of hat_join_region()/hat_leave_region().
 */

#define	RGN_HASH_FUNCTION(obj)	(((((uintptr_t)(obj)) >> 4) ^ \
				(((uintptr_t)(obj)) >> 11)) & \
				srd_rgn_hashmask)
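
/*
 * Regions are hashed on the backing object pointer. The region id (rid)
 * assigned in hat_join_region() below is what the caller gets back, cast to
 * a hat_region_cookie_t.
 */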
/*
 * This routine implements the shared context functionality required when
 * attaching a segment to an address space. It must be called from
 * hat_share() for D(ISM) segments and from segvn_create() for segments
 * with the MAP_PRIVATE and MAP_TEXT flags set. It returns a region_cookie
 * which is saved in the private segment data for hme segments and
 * the ism_map structure for ism segments.
 */
hat_region_cookie_t
hat_join_region(struct hat *sfmmup,
	caddr_t r_saddr,
	size_t r_size,
	void *r_obj,
	u_offset_t r_objoff,
	uchar_t r_perm,
	uchar_t r_pgszc,
	hat_rgn_cb_func_t r_cb_function,
	uint_t flags)
{
	sf_srd_t *srdp = sfmmup->sfmmu_srdp;
	uint_t rhash;
	uint_t rid;
	hatlock_t *hatlockp;
	sf_region_t *rgnp;
	sf_region_t *new_rgnp = NULL;
	int i;
	uint16_t *nextidp;
	sf_region_t **freelistp;
	int maxids;
	sf_region_t **rarrp;
	uint16_t *busyrgnsp;
	ulong_t rttecnt;
	uchar_t tteflag = 0;
	uchar_t r_type = flags & HAT_REGION_TYPE_MASK;
	int text = (r_type == HAT_REGION_TEXT);

	if (srdp == NULL || r_size == 0) {
		return (HAT_INVALID_REGION_COOKIE);
	}

	ASSERT(sfmmup != ksfmmup);
	ASSERT(AS_WRITE_HELD(sfmmup->sfmmu_as));
	ASSERT(srdp->srd_refcnt > 0);
	ASSERT(!(flags & ~HAT_REGION_TYPE_MASK));
	ASSERT(flags == HAT_REGION_TEXT || flags == HAT_REGION_ISM);
	ASSERT(r_pgszc < mmu_page_sizes);
	if (!IS_P2ALIGNED(r_saddr, TTEBYTES(r_pgszc)) ||
	    !IS_P2ALIGNED(r_size, TTEBYTES(r_pgszc))) {
		panic("hat_join_region: region addr or size is not aligned\n");
	}

	r_type = (r_type == HAT_REGION_ISM) ? SFMMU_REGION_ISM :
	    SFMMU_REGION_HME;
	/*
	 * Currently only support shared hmes for the read only main text
	 * region.
	 */
	if (r_type == SFMMU_REGION_HME && ((r_obj != srdp->srd_evp) ||
	    (r_perm & PROT_WRITE))) {
		return (HAT_INVALID_REGION_COOKIE);
	}

	rhash = RGN_HASH_FUNCTION(r_obj);

	if (r_type == SFMMU_REGION_ISM) {
		nextidp = &srdp->srd_next_ismrid;
		freelistp = &srdp->srd_ismrgnfree;
		maxids = SFMMU_MAX_ISM_REGIONS;
		rarrp = srdp->srd_ismrgnp;
		busyrgnsp = &srdp->srd_ismbusyrgns;
	} else {
		nextidp = &srdp->srd_next_hmerid;
		freelistp = &srdp->srd_hmergnfree;
		maxids = SFMMU_MAX_HME_REGIONS;
		rarrp = srdp->srd_hmergnp;
		busyrgnsp = &srdp->srd_hmebusyrgns;
	}

	mutex_enter(&srdp->srd_mutex);

	for (rgnp = srdp->srd_rgnhash[rhash]; rgnp != NULL;
	    rgnp = rgnp->rgn_hash) {
		if (rgnp->rgn_saddr == r_saddr && rgnp->rgn_size == r_size &&
		    rgnp->rgn_obj == r_obj && rgnp->rgn_objoff == r_objoff &&
		    rgnp->rgn_perm == r_perm && rgnp->rgn_pgszc == r_pgszc) {
			break;
		}
	}

rfound:
	if (rgnp != NULL) {
		ASSERT((rgnp->rgn_flags & SFMMU_REGION_TYPE_MASK) == r_type);
		ASSERT(rgnp->rgn_cb_function == r_cb_function);
		ASSERT(rgnp->rgn_refcnt >= 0);
		rid = rgnp->rgn_id;
		ASSERT(rid < maxids);
		ASSERT(rarrp[rid] == rgnp);
		ASSERT(rid < *nextidp);
		atomic_inc_32((volatile uint_t *)&rgnp->rgn_refcnt);
		mutex_exit(&srdp->srd_mutex);
		if (new_rgnp != NULL) {
			kmem_cache_free(region_cache, new_rgnp);
		}
		if (r_type == SFMMU_REGION_HME) {
			int myjoin =
			    (sfmmup == astosfmmu(curthread->t_procp->p_as));

			sfmmu_link_to_hmeregion(sfmmup, rgnp);
			/*
			 * bitmap should be updated after linking sfmmu on
			 * region list so that pageunload() doesn't skip
			 * TSB/TLB flush. As soon as bitmap is updated another
			 * thread in this process can already start accessing
			 * region data.
			 *
			 * Normally ttecnt accounting is done as part of
			 * pagefault handling. But a process may not take any
			 * pagefaults on shared hmeblks created by some other
			 * process. To compensate for this assume that the
			 * entire region will end up faulted in using
			 * the region's pagesize.
			 */
			if (r_pgszc > TTE8K) {
				tteflag = 1 << r_pgszc;
				if (disable_large_pages & tteflag) {
					tteflag = 0;
				}
			}
			if (tteflag && !(sfmmup->sfmmu_rtteflags & tteflag)) {
				hatlockp = sfmmu_hat_enter(sfmmup);
				sfmmup->sfmmu_rtteflags |= tteflag;
				sfmmu_hat_exit(hatlockp);
			}
			hatlockp = sfmmu_hat_enter(sfmmup);

			/*
			 * Preallocate 1/4 of ttecnt's in 8K TSB for >= 4M
			 * region to allow for large page allocation failure.
			 */
			if (r_pgszc >= TTE4M) {
				sfmmup->sfmmu_tsb0_4minflcnt +=
				    r_size >> (TTE_PAGE_SHIFT(TTE8K) + 2);
			}

			/* update sfmmu_ttecnt with the shme rgn ttecnt */
			rttecnt = r_size >> TTE_PAGE_SHIFT(r_pgszc);
			atomic_add_long(&sfmmup->sfmmu_ttecnt[r_pgszc],
			    rttecnt);

			if (text && r_pgszc >= TTE4M &&
			    (tteflag || ((disable_large_pages >> TTE4M) &
			    ((1 << (r_pgszc - TTE4M + 1)) - 1))) &&
			    !SFMMU_FLAGS_ISSET(sfmmup, HAT_4MTEXT_FLAG)) {
				SFMMU_FLAGS_SET(sfmmup, HAT_4MTEXT_FLAG);
			}

			sfmmu_hat_exit(hatlockp);
			/*
			 * On Panther we need to make sure TLB is programmed
			 * to accept 32M/256M pages. Call
			 * sfmmu_check_page_sizes() now to make sure TLB is
			 * setup before making hmeregions visible to other
			 * threads.
			 */
			sfmmu_check_page_sizes(sfmmup, 1);
			hatlockp = sfmmu_hat_enter(sfmmup);
			SF_RGNMAP_ADD(sfmmup->sfmmu_hmeregion_map, rid);

			/*
			 * if context is invalid tsb miss exception code will
			 * call sfmmu_check_page_sizes() and update tsbmiss
			 * area later.
			 */
			kpreempt_disable();
			if (myjoin &&
			    (sfmmup->sfmmu_ctxs[CPU_MMU_IDX(CPU)].cnum
			    != INVALID_CONTEXT)) {
				struct tsbmiss *tsbmp;

				tsbmp = &tsbmiss_area[CPU->cpu_id];
				ASSERT(sfmmup == tsbmp->usfmmup);
				BT_SET(tsbmp->shmermap, rid);
				if (r_pgszc > TTE64K) {
					tsbmp->uhat_rtteflags |= tteflag;
				}
			}
			kpreempt_enable();

			sfmmu_hat_exit(hatlockp);
			ASSERT((hat_region_cookie_t)((uint64_t)rid) !=
			    HAT_INVALID_REGION_COOKIE);
		} else {
			hatlockp = sfmmu_hat_enter(sfmmup);
			SF_RGNMAP_ADD(sfmmup->sfmmu_ismregion_map, rid);
			sfmmu_hat_exit(hatlockp);
		}
		ASSERT(rid < maxids);

		if (r_type == SFMMU_REGION_ISM) {
			sfmmu_find_scd(sfmmup);
		}
		return ((hat_region_cookie_t)((uint64_t)rid));
	}

	ASSERT(new_rgnp == NULL);

	if (*busyrgnsp >= maxids) {
		mutex_exit(&srdp->srd_mutex);
		return (HAT_INVALID_REGION_COOKIE);
	}

	ASSERT(MUTEX_HELD(&srdp->srd_mutex));
	if (*freelistp != NULL) {
		rgnp = *freelistp;
		*freelistp = rgnp->rgn_next;
		ASSERT(rgnp->rgn_id < *nextidp);
		ASSERT(rgnp->rgn_id < maxids);
		ASSERT(rgnp->rgn_flags & SFMMU_REGION_FREE);
		ASSERT((rgnp->rgn_flags & SFMMU_REGION_TYPE_MASK)
		    == r_type);
		ASSERT(rarrp[rgnp->rgn_id] == rgnp);
		ASSERT(rgnp->rgn_hmeflags == 0);
	} else {
		/*
		 * release local locks before memory allocation.
		 */
		mutex_exit(&srdp->srd_mutex);

		new_rgnp = kmem_cache_alloc(region_cache, KM_SLEEP);

		mutex_enter(&srdp->srd_mutex);
		for (rgnp = srdp->srd_rgnhash[rhash]; rgnp != NULL;
		    rgnp = rgnp->rgn_hash) {
			if (rgnp->rgn_saddr == r_saddr &&
			    rgnp->rgn_size == r_size &&
			    rgnp->rgn_obj == r_obj &&
			    rgnp->rgn_objoff == r_objoff &&
			    rgnp->rgn_perm == r_perm &&
			    rgnp->rgn_pgszc == r_pgszc) {
				break;
			}
		}
		if (rgnp != NULL) {
			goto rfound;
		}

		if (*nextidp >= maxids) {
			mutex_exit(&srdp->srd_mutex);
			goto fail;
		}
		rgnp = new_rgnp;
		new_rgnp = NULL;
		rgnp->rgn_id = (*nextidp)++;
		ASSERT(rgnp->rgn_id < maxids);
		ASSERT(rarrp[rgnp->rgn_id] == NULL);
		rarrp[rgnp->rgn_id] = rgnp;
	}

	ASSERT(rgnp->rgn_sfmmu_head == NULL);
	ASSERT(rgnp->rgn_hmeflags == 0);
	for (i = 0; i < MMU_PAGE_SIZES; i++) {
		ASSERT(rgnp->rgn_ttecnt[i] == 0);
	}

	rgnp->rgn_saddr = r_saddr;
	rgnp->rgn_size = r_size;
	rgnp->rgn_obj = r_obj;
	rgnp->rgn_objoff = r_objoff;
	rgnp->rgn_perm = r_perm;
	rgnp->rgn_pgszc = r_pgszc;
	rgnp->rgn_flags = r_type;
	rgnp->rgn_refcnt = 0;
	rgnp->rgn_cb_function = r_cb_function;
	rgnp->rgn_hash = srdp->srd_rgnhash[rhash];
	srdp->srd_rgnhash[rhash] = rgnp;
	(*busyrgnsp)++;
	ASSERT(*busyrgnsp <= maxids);
	goto rfound;

fail:
	ASSERT(new_rgnp != NULL);
	kmem_cache_free(region_cache, new_rgnp);
	return (HAT_INVALID_REGION_COOKIE);
}
/*
 * This function implements the shared context functionality required
 * when detaching a segment from an address space. It must be called
 * from hat_unshare() for all D(ISM) segments and from segvn_unmap(),
 * for segments with a valid region_cookie.
 * It will also be called from all seg_vn routines which change a
 * segment's attributes such as segvn_setprot(), segvn_setpagesize(),
 * segvn_clrszc() & segvn_advise(), as well as in the case of COW fault
 * from segvn_fault().
 */
void
hat_leave_region(struct hat *sfmmup, hat_region_cookie_t rcookie, uint_t flags)
{
	sf_srd_t *srdp = sfmmup->sfmmu_srdp;
	sf_scd_t *scdp;
	uint_t rhash;
	uint_t rid = (uint_t)((uint64_t)rcookie);
	hatlock_t *hatlockp = NULL;
	sf_region_t *rgnp;
	sf_region_t **prev_rgnpp;
	sf_region_t *cur_rgnp;
	void *r_obj;
	int i;
	caddr_t r_saddr;
	caddr_t r_eaddr;
	size_t r_size;
	uchar_t r_pgszc;
	ulong_t rttecnt;
	uchar_t r_type = flags & HAT_REGION_TYPE_MASK;

	ASSERT(sfmmup != ksfmmup);
	ASSERT(srdp != NULL);
	ASSERT(srdp->srd_refcnt > 0);
	ASSERT(!(flags & ~HAT_REGION_TYPE_MASK));
	ASSERT(flags == HAT_REGION_TEXT || flags == HAT_REGION_ISM);
	ASSERT(!sfmmup->sfmmu_free || sfmmup->sfmmu_scdp == NULL);

	r_type = (r_type == HAT_REGION_ISM) ? SFMMU_REGION_ISM :
	    SFMMU_REGION_HME;

	if (r_type == SFMMU_REGION_ISM) {
		ASSERT(SFMMU_IS_ISMRID_VALID(rid));
		ASSERT(rid < SFMMU_MAX_ISM_REGIONS);
		rgnp = srdp->srd_ismrgnp[rid];
	} else {
		ASSERT(SFMMU_IS_SHMERID_VALID(rid));
		ASSERT(rid < SFMMU_MAX_HME_REGIONS);
		rgnp = srdp->srd_hmergnp[rid];
	}
	ASSERT(rgnp != NULL);
	ASSERT(rgnp->rgn_id == rid);
	ASSERT((rgnp->rgn_flags & SFMMU_REGION_TYPE_MASK) == r_type);
	ASSERT(!(rgnp->rgn_flags & SFMMU_REGION_FREE));
	ASSERT(AS_LOCK_HELD(sfmmup->sfmmu_as));

	if (sfmmup->sfmmu_free) {
		r_pgszc = rgnp->rgn_pgszc;
		r_size = rgnp->rgn_size;

		ASSERT(sfmmup->sfmmu_scdp == NULL);
		if (r_type == SFMMU_REGION_ISM) {
			SF_RGNMAP_DEL(sfmmup->sfmmu_ismregion_map, rid);
		} else {
			/* update shme rgns ttecnt in sfmmu_ttecnt */
			rttecnt = r_size >> TTE_PAGE_SHIFT(r_pgszc);
			ASSERT(sfmmup->sfmmu_ttecnt[r_pgszc] >= rttecnt);
			atomic_add_long(&sfmmup->sfmmu_ttecnt[r_pgszc],
			    -rttecnt);

			SF_RGNMAP_DEL(sfmmup->sfmmu_hmeregion_map, rid);
		}
	} else if (r_type == SFMMU_REGION_ISM) {
		hatlockp = sfmmu_hat_enter(sfmmup);
		ASSERT(rid < srdp->srd_next_ismrid);
		SF_RGNMAP_DEL(sfmmup->sfmmu_ismregion_map, rid);
		scdp = sfmmup->sfmmu_scdp;
		if (scdp != NULL &&
		    SF_RGNMAP_TEST(scdp->scd_ismregion_map, rid)) {
			sfmmu_leave_scd(sfmmup, r_type);
			ASSERT(sfmmu_hat_lock_held(sfmmup));
		}
		sfmmu_hat_exit(hatlockp);
	} else {
		r_pgszc = rgnp->rgn_pgszc;
		r_saddr = rgnp->rgn_saddr;
		r_size = rgnp->rgn_size;
		r_eaddr = r_saddr + r_size;

		ASSERT(r_type == SFMMU_REGION_HME);
		hatlockp = sfmmu_hat_enter(sfmmup);
		ASSERT(rid < srdp->srd_next_hmerid);
		SF_RGNMAP_DEL(sfmmup->sfmmu_hmeregion_map, rid);

		/*
		 * If region is part of an SCD call sfmmu_leave_scd().
		 * Otherwise if process is not exiting and has valid context
		 * just drop the context on the floor to lose stale TLB
		 * entries and force the update of tsb miss area to reflect
		 * the new region map. After that clean our TSB entries.
		 */
		scdp = sfmmup->sfmmu_scdp;
		if (scdp != NULL &&
		    SF_RGNMAP_TEST(scdp->scd_hmeregion_map, rid)) {
			sfmmu_leave_scd(sfmmup, r_type);
			ASSERT(sfmmu_hat_lock_held(sfmmup));
		}
		sfmmu_invalidate_ctx(sfmmup);

		i = TTE8K;
		while (i < mmu_page_sizes) {
			if (rgnp->rgn_ttecnt[i] != 0) {
				sfmmu_unload_tsb_range(sfmmup, r_saddr,
				    r_eaddr, i);
			}
			i++;
		}

		/* Remove the preallocated 1/4 8k ttecnt for 4M regions. */
		if (r_pgszc >= TTE4M) {
			rttecnt = r_size >> (TTE_PAGE_SHIFT(TTE8K) + 2);
			ASSERT(sfmmup->sfmmu_tsb0_4minflcnt >=
			    rttecnt);
			sfmmup->sfmmu_tsb0_4minflcnt -= rttecnt;
		}

		/* update shme rgns ttecnt in sfmmu_ttecnt */
		rttecnt = r_size >> TTE_PAGE_SHIFT(r_pgszc);
		ASSERT(sfmmup->sfmmu_ttecnt[r_pgszc] >= rttecnt);
		atomic_add_long(&sfmmup->sfmmu_ttecnt[r_pgszc], -rttecnt);

		sfmmu_hat_exit(hatlockp);
		if (scdp != NULL && sfmmup->sfmmu_scdp == NULL) {
			/* sfmmup left the scd, grow private tsb */
			sfmmu_check_page_sizes(sfmmup, 1);
		} else {
			sfmmu_check_page_sizes(sfmmup, 0);
		}
	}

	if (r_type == SFMMU_REGION_HME) {
		sfmmu_unlink_from_hmeregion(sfmmup, rgnp);
	}

	r_obj = rgnp->rgn_obj;
	if (atomic_dec_32_nv((volatile uint_t *)&rgnp->rgn_refcnt)) {
		return;
	}

	/*
	 * looks like nobody uses this region anymore. Free it.
	 */
	rhash = RGN_HASH_FUNCTION(r_obj);
	mutex_enter(&srdp->srd_mutex);
	for (prev_rgnpp = &srdp->srd_rgnhash[rhash];
	    (cur_rgnp = *prev_rgnpp) != NULL;
	    prev_rgnpp = &cur_rgnp->rgn_hash) {
		if (cur_rgnp == rgnp && cur_rgnp->rgn_refcnt == 0) {
			break;
		}
	}

	if (cur_rgnp == NULL) {
		mutex_exit(&srdp->srd_mutex);
		return;
	}

	ASSERT((rgnp->rgn_flags & SFMMU_REGION_TYPE_MASK) == r_type);
	*prev_rgnpp = rgnp->rgn_hash;
	if (r_type == SFMMU_REGION_ISM) {
		rgnp->rgn_flags |= SFMMU_REGION_FREE;
		ASSERT(rid < srdp->srd_next_ismrid);
		rgnp->rgn_next = srdp->srd_ismrgnfree;
		srdp->srd_ismrgnfree = rgnp;
		ASSERT(srdp->srd_ismbusyrgns > 0);
		srdp->srd_ismbusyrgns--;
		mutex_exit(&srdp->srd_mutex);
		return;
	}
	mutex_exit(&srdp->srd_mutex);

	/*
	 * Destroy region's hmeblks.
	 */
	sfmmu_unload_hmeregion(srdp, rgnp);

	rgnp->rgn_hmeflags = 0;

	ASSERT(rgnp->rgn_sfmmu_head == NULL);
	ASSERT(rgnp->rgn_id == rid);
	for (i = 0; i < MMU_PAGE_SIZES; i++) {
		rgnp->rgn_ttecnt[i] = 0;
	}
	rgnp->rgn_flags |= SFMMU_REGION_FREE;
	mutex_enter(&srdp->srd_mutex);
	ASSERT(rid < srdp->srd_next_hmerid);
	rgnp->rgn_next = srdp->srd_hmergnfree;
	srdp->srd_hmergnfree = rgnp;
	ASSERT(srdp->srd_hmebusyrgns > 0);
	srdp->srd_hmebusyrgns--;
	mutex_exit(&srdp->srd_mutex);
}
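
/*
 * hat_dup_region() is used when an address space is duplicated (via
 * hat_dup()) so that the child hat inherits a reference to an existing
 * hmeblk region: the region refcnt is bumped, the child is linked onto the
 * region's sfmmu list and its ttecnt/tsb0 inflation counts are adjusted.
 */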
/*
 * For now only called for hmeblk regions and not for ISM regions.
 */
void
hat_dup_region(struct hat *sfmmup, hat_region_cookie_t rcookie)
{
	sf_srd_t *srdp = sfmmup->sfmmu_srdp;
	uint_t rid = (uint_t)((uint64_t)rcookie);
	sf_region_t *rgnp;
	sf_rgn_link_t *rlink;
	sf_rgn_link_t *hrlink;
	ulong_t rttecnt;

	ASSERT(sfmmup != ksfmmup);
	ASSERT(srdp != NULL);
	ASSERT(srdp->srd_refcnt > 0);

	ASSERT(rid < srdp->srd_next_hmerid);
	ASSERT(SFMMU_IS_SHMERID_VALID(rid));
	ASSERT(rid < SFMMU_MAX_HME_REGIONS);

	rgnp = srdp->srd_hmergnp[rid];
	ASSERT(rgnp->rgn_refcnt > 0);
	ASSERT(rgnp->rgn_id == rid);
	ASSERT((rgnp->rgn_flags & SFMMU_REGION_TYPE_MASK) == SFMMU_REGION_HME);
	ASSERT(!(rgnp->rgn_flags & SFMMU_REGION_FREE));

	atomic_inc_32((volatile uint_t *)&rgnp->rgn_refcnt);

	/* LINTED: constant in conditional context */
	SFMMU_HMERID2RLINKP(sfmmup, rid, rlink, 1, 0);
	ASSERT(rlink != NULL);
	mutex_enter(&rgnp->rgn_mutex);
	ASSERT(rgnp->rgn_sfmmu_head != NULL);
	/* LINTED: constant in conditional context */
	SFMMU_HMERID2RLINKP(rgnp->rgn_sfmmu_head, rid, hrlink, 0, 0);
	ASSERT(hrlink != NULL);
	ASSERT(hrlink->prev == NULL);
	rlink->next = rgnp->rgn_sfmmu_head;
	rlink->prev = NULL;
	hrlink->prev = sfmmup;
	/*
	 * make sure rlink's next field is correct
	 * before making this link visible.
	 */
	membar_stst();
	rgnp->rgn_sfmmu_head = sfmmup;
	mutex_exit(&rgnp->rgn_mutex);

	/* update sfmmu_ttecnt with the shme rgn ttecnt */
	rttecnt = rgnp->rgn_size >> TTE_PAGE_SHIFT(rgnp->rgn_pgszc);
	atomic_add_long(&sfmmup->sfmmu_ttecnt[rgnp->rgn_pgszc], rttecnt);
	/* update tsb0 inflation count */
	if (rgnp->rgn_pgszc >= TTE4M) {
		sfmmup->sfmmu_tsb0_4minflcnt +=
		    rgnp->rgn_size >> (TTE_PAGE_SHIFT(TTE8K) + 2);
	}
	/*
	 * Update regionid bitmask without hat lock since no other thread
	 * can update this region bitmask right now.
	 */
	SF_RGNMAP_ADD(sfmmup->sfmmu_hmeregion_map, rid);
}
/* ARGSUSED */
static int
sfmmu_rgncache_constructor(void *buf, void *cdrarg, int kmflags)
{
	sf_region_t *rgnp = (sf_region_t *)buf;
	bzero(buf, sizeof (*rgnp));

	mutex_init(&rgnp->rgn_mutex, NULL, MUTEX_DEFAULT, NULL);

	return (0);
}

/* ARGSUSED */
static void
sfmmu_rgncache_destructor(void *buf, void *cdrarg)
{
	sf_region_t *rgnp = (sf_region_t *)buf;

	mutex_destroy(&rgnp->rgn_mutex);
}
static int
sfrgnmap_isnull(sf_region_map_t *map)
{
	int i;

	for (i = 0; i < SFMMU_RGNMAP_WORDS; i++) {
		if (map->bitmap[i] != 0) {
			return (0);
		}
	}
	return (1);
}

static int
sfhmergnmap_isnull(sf_hmeregion_map_t *map)
{
	int i;

	for (i = 0; i < SFMMU_HMERGNMAP_WORDS; i++) {
		if (map->bitmap[i] != 0) {
			return (0);
		}
	}
	return (1);
}
#ifdef DEBUG
static void
check_scd_sfmmu_list(sfmmu_t **headp, sfmmu_t *sfmmup, int onlist)
{
	sf_srd_t *srdp = sfmmup->sfmmu_srdp;
	sfmmu_t *sp;

	for (sp = *headp; sp != NULL; sp = sp->sfmmu_scd_link.next) {
		ASSERT(srdp == sp->sfmmu_srdp);
		if (sp == sfmmup) {
			if (onlist) {
				return;
			} else {
				panic("shctx: sfmmu 0x%p found on scd"
				    "list 0x%p", (void *)sfmmup,
				    (void *)*headp);
			}
		}
	}
	if (onlist) {
		panic("shctx: sfmmu 0x%p not found on scd list 0x%p",
		    (void *)sfmmup, (void *)*headp);
	}
}
#else /* DEBUG */
#define	check_scd_sfmmu_list(headp, sfmmup, onlist)
#endif /* DEBUG */
/*
 * Removes an sfmmu from the SCD sfmmu list.
 */
static void
sfmmu_from_scd_list(sfmmu_t **headp, sfmmu_t *sfmmup)
{
	ASSERT(sfmmup->sfmmu_srdp != NULL);
	check_scd_sfmmu_list(headp, sfmmup, 1);
	if (sfmmup->sfmmu_scd_link.prev != NULL) {
		ASSERT(*headp != sfmmup);
		sfmmup->sfmmu_scd_link.prev->sfmmu_scd_link.next =
		    sfmmup->sfmmu_scd_link.next;
	} else {
		ASSERT(*headp == sfmmup);
		*headp = sfmmup->sfmmu_scd_link.next;
	}
	if (sfmmup->sfmmu_scd_link.next != NULL) {
		sfmmup->sfmmu_scd_link.next->sfmmu_scd_link.prev =
		    sfmmup->sfmmu_scd_link.prev;
	}
}

/*
 * Adds an sfmmu to the start of the queue.
 */
static void
sfmmu_to_scd_list(sfmmu_t **headp, sfmmu_t *sfmmup)
{
	check_scd_sfmmu_list(headp, sfmmup, 0);
	sfmmup->sfmmu_scd_link.prev = NULL;
	sfmmup->sfmmu_scd_link.next = *headp;
	if (*headp != NULL)
		(*headp)->sfmmu_scd_link.prev = sfmmup;
	*headp = sfmmup;
}
/*
 * Remove an scd from the start of the queue.
 */
static void
sfmmu_remove_scd(sf_scd_t **headp, sf_scd_t *scdp)
{
	if (scdp->scd_prev != NULL) {
		ASSERT(*headp != scdp);
		scdp->scd_prev->scd_next = scdp->scd_next;
	} else {
		ASSERT(*headp == scdp);
		*headp = scdp->scd_next;
	}

	if (scdp->scd_next != NULL) {
		scdp->scd_next->scd_prev = scdp->scd_prev;
	}
}

/*
 * Add an scd to the start of the queue.
 */
static void
sfmmu_add_scd(sf_scd_t **headp, sf_scd_t *scdp)
{
	scdp->scd_prev = NULL;
	scdp->scd_next = *headp;
	if (*headp != NULL) {
		(*headp)->scd_prev = scdp;
	}
	*headp = scdp;
}
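
/*
 * sfmmu_alloc_scd_tsbs() walks the SCD region bitmap, accumulates the
 * expected 8K and 4M translation counts for hme and ISM regions, and then
 * allocates the two shared TSBs (8K/64K/512K and 4M/32M/256M), falling back
 * to TSB_4M_SZCODE when the preferred size code cannot be allocated.
 */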
static int
sfmmu_alloc_scd_tsbs(sf_srd_t *srdp, sf_scd_t *scdp)
{
	int i, j, szc;
	uint_t rid;
	ulong_t w;
	sf_region_t *rgnp;
	ulong_t tte8k_cnt = 0;
	ulong_t tte4m_cnt = 0;
	int tsb_szc;
	sfmmu_t *scsfmmup = scdp->scd_sfmmup;
	sfmmu_t *ism_hatid;
	struct tsb_info *newtsb;

	ASSERT(srdp != NULL);

	for (i = 0; i < SFMMU_RGNMAP_WORDS; i++) {
		if ((w = scdp->scd_region_map.bitmap[i]) == 0) {
			continue;
		}
		j = 0;
		while (w) {
			if (!(w & 0x1)) {
				j++;
				w >>= 1;
				continue;
			}
			rid = (i << BT_ULSHIFT) | j;
			j++;
			w >>= 1;

			if (rid < SFMMU_MAX_HME_REGIONS) {
				rgnp = srdp->srd_hmergnp[rid];
				ASSERT(rgnp->rgn_id == rid);
				ASSERT(rgnp->rgn_refcnt > 0);

				if (rgnp->rgn_pgszc < TTE4M) {
					tte8k_cnt += rgnp->rgn_size >>
					    TTE_PAGE_SHIFT(TTE8K);
				} else {
					ASSERT(rgnp->rgn_pgszc >= TTE4M);
					tte4m_cnt += rgnp->rgn_size >>
					    TTE_PAGE_SHIFT(TTE4M);
					/*
					 * Inflate SCD tsb0 by preallocating
					 * 1/4 8k ttecnt for 4M regions to
					 * allow for lgpg alloc failure.
					 */
					tte8k_cnt += rgnp->rgn_size >>
					    (TTE_PAGE_SHIFT(TTE8K) + 2);
				}
			} else {
				rid -= SFMMU_MAX_HME_REGIONS;
				rgnp = srdp->srd_ismrgnp[rid];
				ASSERT(rgnp->rgn_id == rid);
				ASSERT(rgnp->rgn_refcnt > 0);

				ism_hatid = (sfmmu_t *)rgnp->rgn_obj;
				ASSERT(ism_hatid->sfmmu_ismhat);

				for (szc = 0; szc < TTE4M; szc++) {
					tte8k_cnt +=
					    ism_hatid->sfmmu_ttecnt[szc] <<
					    TTE_BSZS_SHIFT(szc);
				}

				ASSERT(rgnp->rgn_pgszc >= TTE4M);
				if (rgnp->rgn_pgszc >= TTE4M) {
					tte4m_cnt += rgnp->rgn_size >>
					    TTE_PAGE_SHIFT(TTE4M);
				}
			}
		}
	}

	tsb_szc = SELECT_TSB_SIZECODE(tte8k_cnt);

	/* Allocate both the SCD TSBs here. */
	if (sfmmu_tsbinfo_alloc(&scsfmmup->sfmmu_tsb,
	    tsb_szc, TSB8K|TSB64K|TSB512K, TSB_ALLOC, scsfmmup) &&
	    (tsb_szc <= TSB_4M_SZCODE ||
	    sfmmu_tsbinfo_alloc(&scsfmmup->sfmmu_tsb,
	    TSB_4M_SZCODE, TSB8K|TSB64K|TSB512K,
	    TSB_ALLOC, scsfmmup))) {

		SFMMU_STAT(sf_scd_1sttsb_allocfail);
		return (TSB_ALLOCFAIL);
	} else {
		scsfmmup->sfmmu_tsb->tsb_flags |= TSB_SHAREDCTX;

		if (tte4m_cnt) {
			tsb_szc = SELECT_TSB_SIZECODE(tte4m_cnt);
			if (sfmmu_tsbinfo_alloc(&newtsb, tsb_szc,
			    TSB4M|TSB32M|TSB256M, TSB_ALLOC, scsfmmup) &&
			    (tsb_szc <= TSB_4M_SZCODE ||
			    sfmmu_tsbinfo_alloc(&newtsb, TSB_4M_SZCODE,
			    TSB4M|TSB32M|TSB256M,
			    TSB_ALLOC, scsfmmup))) {
				/*
				 * If we fail to allocate the 2nd shared tsb,
				 * just free the 1st tsb, return failure.
				 */
				sfmmu_tsbinfo_free(scsfmmup->sfmmu_tsb);
				SFMMU_STAT(sf_scd_2ndtsb_allocfail);
				return (TSB_ALLOCFAIL);
			} else {
				ASSERT(scsfmmup->sfmmu_tsb->tsb_next == NULL);
				newtsb->tsb_flags |= TSB_SHAREDCTX;
				scsfmmup->sfmmu_tsb->tsb_next = newtsb;
				SFMMU_STAT(sf_scd_2ndtsb_alloc);
			}
		}
		SFMMU_STAT(sf_scd_1sttsb_alloc);
	}
	return (TSB_SUCCESS);
}
static void
sfmmu_free_scd_tsbs(sfmmu_t *scd_sfmmu)
{
	while (scd_sfmmu->sfmmu_tsb != NULL) {
		struct tsb_info *next = scd_sfmmu->sfmmu_tsb->tsb_next;
		sfmmu_tsbinfo_free(scd_sfmmu->sfmmu_tsb);
		scd_sfmmu->sfmmu_tsb = next;
	}
}
/*
 * Link the sfmmu onto the hme region list.
 */
void
sfmmu_link_to_hmeregion(sfmmu_t *sfmmup, sf_region_t *rgnp)
{
	uint_t rid;
	sf_rgn_link_t *rlink;
	sfmmu_t *head;
	sf_rgn_link_t *hrlink;

	rid = rgnp->rgn_id;
	ASSERT(SFMMU_IS_SHMERID_VALID(rid));

	/* LINTED: constant in conditional context */
	SFMMU_HMERID2RLINKP(sfmmup, rid, rlink, 1, 1);
	ASSERT(rlink != NULL);
	mutex_enter(&rgnp->rgn_mutex);
	if ((head = rgnp->rgn_sfmmu_head) == NULL) {
		rlink->next = NULL;
		rlink->prev = NULL;
		/*
		 * make sure rlink's next field is NULL
		 * before making this link visible.
		 */
		membar_stst();
		rgnp->rgn_sfmmu_head = sfmmup;
	} else {
		/* LINTED: constant in conditional context */
		SFMMU_HMERID2RLINKP(head, rid, hrlink, 0, 0);
		ASSERT(hrlink != NULL);
		ASSERT(hrlink->prev == NULL);
		rlink->next = head;
		rlink->prev = NULL;
		hrlink->prev = sfmmup;
		/*
		 * make sure rlink's next field is correct
		 * before making this link visible.
		 */
		membar_stst();
		rgnp->rgn_sfmmu_head = sfmmup;
	}
	mutex_exit(&rgnp->rgn_mutex);
}
/*
 * Unlink the sfmmu from the hme region list.
 */
void
sfmmu_unlink_from_hmeregion(sfmmu_t *sfmmup, sf_region_t *rgnp)
{
	uint_t rid;
	sf_rgn_link_t *rlink;

	rid = rgnp->rgn_id;
	ASSERT(SFMMU_IS_SHMERID_VALID(rid));

	/* LINTED: constant in conditional context */
	SFMMU_HMERID2RLINKP(sfmmup, rid, rlink, 0, 0);
	ASSERT(rlink != NULL);
	mutex_enter(&rgnp->rgn_mutex);
	if (rgnp->rgn_sfmmu_head == sfmmup) {
		sfmmu_t *next = rlink->next;
		rgnp->rgn_sfmmu_head = next;
		/*
		 * if we are stopped by xc_attention() after this
		 * point the forward link walking in
		 * sfmmu_rgntlb_demap() will work correctly since the
		 * head correctly points to the next element.
		 */
		membar_stst();
		rlink->next = NULL;
		ASSERT(rlink->prev == NULL);
		if (next != NULL) {
			sf_rgn_link_t *nrlink;
			/* LINTED: constant in conditional context */
			SFMMU_HMERID2RLINKP(next, rid, nrlink, 0, 0);
			ASSERT(nrlink != NULL);
			ASSERT(nrlink->prev == sfmmup);
			nrlink->prev = NULL;
		}
	} else {
		sfmmu_t *next = rlink->next;
		sfmmu_t *prev = rlink->prev;
		sf_rgn_link_t *prlink;

		ASSERT(prev != NULL);
		/* LINTED: constant in conditional context */
		SFMMU_HMERID2RLINKP(prev, rid, prlink, 0, 0);
		ASSERT(prlink != NULL);
		ASSERT(prlink->next == sfmmup);
		prlink->next = next;
		/*
		 * if we are stopped by xc_attention()
		 * after this point the forward link walking
		 * will work correctly since the prev element
		 * correctly points to the next element.
		 */
		membar_stst();
		rlink->next = NULL;
		rlink->prev = NULL;
		if (next != NULL) {
			sf_rgn_link_t *nrlink;
			/* LINTED: constant in conditional context */
			SFMMU_HMERID2RLINKP(next, rid, nrlink, 0, 0);
			ASSERT(nrlink != NULL);
			ASSERT(nrlink->prev == sfmmup);
			nrlink->prev = prev;
		}
	}
	mutex_exit(&rgnp->rgn_mutex);
}
/*
 * Link scd sfmmu onto ism or hme region list for each region in the
 * scd region map.
 */
static void
sfmmu_link_scd_to_regions(sf_srd_t *srdp, sf_scd_t *scdp)
{
	uint_t rid;
	uint_t i;
	uint_t j;
	ulong_t w;
	sf_region_t *rgnp;
	sfmmu_t *scsfmmup;

	scsfmmup = scdp->scd_sfmmup;
	ASSERT(scsfmmup->sfmmu_scdhat);
	for (i = 0; i < SFMMU_RGNMAP_WORDS; i++) {
		if ((w = scdp->scd_region_map.bitmap[i]) == 0) {
			continue;
		}
		j = 0;
		while (w) {
			if (!(w & 0x1)) {
				j++;
				w >>= 1;
				continue;
			}
			rid = (i << BT_ULSHIFT) | j;
			j++;
			w >>= 1;

			if (rid < SFMMU_MAX_HME_REGIONS) {
				rgnp = srdp->srd_hmergnp[rid];
				ASSERT(rgnp->rgn_id == rid);
				ASSERT(rgnp->rgn_refcnt > 0);
				sfmmu_link_to_hmeregion(scsfmmup, rgnp);
			} else {
				sfmmu_t *ism_hatid = NULL;
				ism_ment_t *ism_ment;
				rid -= SFMMU_MAX_HME_REGIONS;
				rgnp = srdp->srd_ismrgnp[rid];
				ASSERT(rgnp->rgn_id == rid);
				ASSERT(rgnp->rgn_refcnt > 0);

				ism_hatid = (sfmmu_t *)rgnp->rgn_obj;
				ASSERT(ism_hatid->sfmmu_ismhat);
				ism_ment = &scdp->scd_ism_links[rid];
				ism_ment->iment_hat = scsfmmup;
				ism_ment->iment_base_va = rgnp->rgn_saddr;
				mutex_enter(&ism_mlist_lock);
				iment_add(ism_ment, ism_hatid);
				mutex_exit(&ism_mlist_lock);
			}
		}
	}
}
/*
 * Unlink scd sfmmu from ism or hme region list for each region in the
 * scd region map.
 */
static void
sfmmu_unlink_scd_from_regions(sf_srd_t *srdp, sf_scd_t *scdp)
{
	uint_t rid;
	uint_t i;
	uint_t j;
	ulong_t w;
	sf_region_t *rgnp;
	sfmmu_t *scsfmmup;

	scsfmmup = scdp->scd_sfmmup;
	for (i = 0; i < SFMMU_RGNMAP_WORDS; i++) {
		if ((w = scdp->scd_region_map.bitmap[i]) == 0) {
			continue;
		}
		j = 0;
		while (w) {
			if (!(w & 0x1)) {
				j++;
				w >>= 1;
				continue;
			}
			rid = (i << BT_ULSHIFT) | j;
			j++;
			w >>= 1;

			if (rid < SFMMU_MAX_HME_REGIONS) {
				rgnp = srdp->srd_hmergnp[rid];
				ASSERT(rgnp->rgn_id == rid);
				ASSERT(rgnp->rgn_refcnt > 0);
				sfmmu_unlink_from_hmeregion(scsfmmup,
				    rgnp);
			} else {
				sfmmu_t *ism_hatid = NULL;
				ism_ment_t *ism_ment;
				rid -= SFMMU_MAX_HME_REGIONS;
				rgnp = srdp->srd_ismrgnp[rid];
				ASSERT(rgnp->rgn_id == rid);
				ASSERT(rgnp->rgn_refcnt > 0);

				ism_hatid = (sfmmu_t *)rgnp->rgn_obj;
				ASSERT(ism_hatid->sfmmu_ismhat);
				ism_ment = &scdp->scd_ism_links[rid];
				ASSERT(ism_ment->iment_hat == scdp->scd_sfmmup);
				ASSERT(ism_ment->iment_base_va ==
				    rgnp->rgn_saddr);
				mutex_enter(&ism_mlist_lock);
				iment_sub(ism_ment, ism_hatid);
				mutex_exit(&ism_mlist_lock);
			}
		}
	}
}
/*
 * Allocates and initialises a new SCD structure, this is called with
 * the srd_scd_mutex held and returns with the reference count
 * initialised to 1.
 */
static sf_scd_t *
sfmmu_alloc_scd(sf_srd_t *srdp, sf_region_map_t *new_map)
{
	sf_scd_t *new_scdp;
	sfmmu_t *scsfmmup;
	int i;

	ASSERT(MUTEX_HELD(&srdp->srd_scd_mutex));
	new_scdp = kmem_cache_alloc(scd_cache, KM_SLEEP);

	scsfmmup = kmem_cache_alloc(sfmmuid_cache, KM_SLEEP);
	new_scdp->scd_sfmmup = scsfmmup;
	scsfmmup->sfmmu_srdp = srdp;
	scsfmmup->sfmmu_scdp = new_scdp;
	scsfmmup->sfmmu_tsb0_4minflcnt = 0;
	scsfmmup->sfmmu_scdhat = 1;
	CPUSET_ALL(scsfmmup->sfmmu_cpusran);
	bzero(scsfmmup->sfmmu_hmeregion_links, SFMMU_L1_HMERLINKS_SIZE);

	ASSERT(max_mmu_ctxdoms > 0);
	for (i = 0; i < max_mmu_ctxdoms; i++) {
		scsfmmup->sfmmu_ctxs[i].cnum = INVALID_CONTEXT;
		scsfmmup->sfmmu_ctxs[i].gnum = 0;
	}

	for (i = 0; i < MMU_PAGE_SIZES; i++) {
		new_scdp->scd_rttecnt[i] = 0;
	}

	new_scdp->scd_region_map = *new_map;
	new_scdp->scd_refcnt = 1;
	if (sfmmu_alloc_scd_tsbs(srdp, new_scdp) != TSB_SUCCESS) {
		kmem_cache_free(scd_cache, new_scdp);
		kmem_cache_free(sfmmuid_cache, scsfmmup);
		return (NULL);
	}
	if (&mmu_init_scd) {
		mmu_init_scd(new_scdp);
	}
	return (new_scdp);
}
/*
 * The first phase of a process joining an SCD. The hat structure is
 * linked to the SCD queue and then the HAT_JOIN_SCD sfmmu flag is set
 * and a cross-call with context invalidation is used to cause the
 * remaining work to be carried out in the sfmmu_tsbmiss_exception()
 * routine.
 */
static void
sfmmu_join_scd(sf_scd_t *scdp, sfmmu_t *sfmmup)
{
	hatlock_t *hatlockp;
	sf_srd_t *srdp = sfmmup->sfmmu_srdp;
	int i;
	sf_scd_t *old_scdp;

	ASSERT(srdp != NULL);
	ASSERT(scdp != NULL);
	ASSERT(scdp->scd_refcnt > 0);
	ASSERT(AS_WRITE_HELD(sfmmup->sfmmu_as));

	if ((old_scdp = sfmmup->sfmmu_scdp) != NULL) {
		ASSERT(old_scdp != scdp);

		mutex_enter(&old_scdp->scd_mutex);
		sfmmu_from_scd_list(&old_scdp->scd_sf_list, sfmmup);
		mutex_exit(&old_scdp->scd_mutex);
		/*
		 * sfmmup leaves the old scd. Update sfmmu_ttecnt to
		 * include the shme rgn ttecnt for rgns that
		 * were in the old SCD
		 */
		for (i = 0; i < mmu_page_sizes; i++) {
			ASSERT(sfmmup->sfmmu_scdrttecnt[i] ==
			    old_scdp->scd_rttecnt[i]);
			atomic_add_long(&sfmmup->sfmmu_ttecnt[i],
			    sfmmup->sfmmu_scdrttecnt[i]);
		}
	}

	/*
	 * Move sfmmu to the scd lists.
	 */
	mutex_enter(&scdp->scd_mutex);
	sfmmu_to_scd_list(&scdp->scd_sf_list, sfmmup);
	mutex_exit(&scdp->scd_mutex);
	SF_SCD_INCR_REF(scdp);

	hatlockp = sfmmu_hat_enter(sfmmup);
	/*
	 * For a multi-thread process, we must stop
	 * all the other threads before joining the scd.
	 */

	SFMMU_FLAGS_SET(sfmmup, HAT_JOIN_SCD);

	sfmmu_invalidate_ctx(sfmmup);
	sfmmup->sfmmu_scdp = scdp;

	/*
	 * Copy scd_rttecnt into sfmmup's sfmmu_scdrttecnt, and update
	 * sfmmu_ttecnt to not include the rgn ttecnt just joined in SCD.
	 */
	for (i = 0; i < mmu_page_sizes; i++) {
		sfmmup->sfmmu_scdrttecnt[i] = scdp->scd_rttecnt[i];
		ASSERT(sfmmup->sfmmu_ttecnt[i] >= scdp->scd_rttecnt[i]);
		atomic_add_long(&sfmmup->sfmmu_ttecnt[i],
		    -sfmmup->sfmmu_scdrttecnt[i]);
	}
	/* update tsb0 inflation count */
	if (old_scdp != NULL) {
		sfmmup->sfmmu_tsb0_4minflcnt +=
		    old_scdp->scd_sfmmup->sfmmu_tsb0_4minflcnt;
	}
	ASSERT(sfmmup->sfmmu_tsb0_4minflcnt >=
	    scdp->scd_sfmmup->sfmmu_tsb0_4minflcnt);
	sfmmup->sfmmu_tsb0_4minflcnt -= scdp->scd_sfmmup->sfmmu_tsb0_4minflcnt;

	sfmmu_hat_exit(hatlockp);

	if (old_scdp != NULL) {
		SF_SCD_DECR_REF(srdp, old_scdp);
	}
}
/*
 * This routine is called by a process to become part of an SCD. It is called
 * from sfmmu_tsbmiss_exception() once most of the initial work has been
 * done by sfmmu_join_scd(). This routine must not drop the hat lock.
 */
static void
sfmmu_finish_join_scd(sfmmu_t *sfmmup)
{
	struct tsb_info *tsbinfop;

	ASSERT(sfmmu_hat_lock_held(sfmmup));
	ASSERT(sfmmup->sfmmu_scdp != NULL);
	ASSERT(SFMMU_FLAGS_ISSET(sfmmup, HAT_JOIN_SCD));
	ASSERT(!SFMMU_FLAGS_ISSET(sfmmup, HAT_ISMBUSY));
	ASSERT(SFMMU_FLAGS_ISSET(sfmmup, HAT_ALLCTX_INVALID));

	for (tsbinfop = sfmmup->sfmmu_tsb; tsbinfop != NULL;
	    tsbinfop = tsbinfop->tsb_next) {
		if (tsbinfop->tsb_flags & TSB_SWAPPED) {
			continue;
		}
		ASSERT(!(tsbinfop->tsb_flags & TSB_RELOC_FLAG));

		sfmmu_inv_tsb(tsbinfop->tsb_va,
		    TSB_BYTES(tsbinfop->tsb_szc));
	}

	/* Set HAT_CTX1_FLAG for all SCD ISMs */
	sfmmu_ism_hatflags(sfmmup, 1);

	SFMMU_STAT(sf_join_scd);
}
/*
 * This routine is called in order to check if there is an SCD which matches
 * the process's region map; if not, a new SCD may be created.
 */
static void
sfmmu_find_scd(sfmmu_t *sfmmup)
{
	sf_srd_t *srdp = sfmmup->sfmmu_srdp;
	sf_scd_t *scdp, *new_scdp;
	int ret;

	ASSERT(srdp != NULL);
	ASSERT(AS_WRITE_HELD(sfmmup->sfmmu_as));

	mutex_enter(&srdp->srd_scd_mutex);
	for (scdp = srdp->srd_scdp; scdp != NULL;
	    scdp = scdp->scd_next) {
		SF_RGNMAP_EQUAL(&scdp->scd_region_map,
		    &sfmmup->sfmmu_region_map, ret);
		if (ret == 1) {
			SF_SCD_INCR_REF(scdp);
			mutex_exit(&srdp->srd_scd_mutex);
			sfmmu_join_scd(scdp, sfmmup);
			ASSERT(scdp->scd_refcnt >= 2);
			atomic_dec_32((volatile uint32_t *)&scdp->scd_refcnt);
			return;
		} else {
			/*
			 * If the sfmmu region map is a subset of the scd
			 * region map, then the assumption is that this process
			 * will continue attaching to ISM segments until the
			 * region maps are equal.
			 */
			SF_RGNMAP_IS_SUBSET(&scdp->scd_region_map,
			    &sfmmup->sfmmu_region_map, ret);
			if (ret == 1) {
				mutex_exit(&srdp->srd_scd_mutex);
				return;
			}
		}
	}

	ASSERT(scdp == NULL);
	/*
	 * No matching SCD has been found, create a new one.
	 */
	if ((new_scdp = sfmmu_alloc_scd(srdp, &sfmmup->sfmmu_region_map)) ==
	    NULL) {
		mutex_exit(&srdp->srd_scd_mutex);
		return;
	}

	/*
	 * sfmmu_alloc_scd() returns with a ref count of 1 on the scd.
	 */

	/* Set scd_rttecnt for shme rgns in SCD */
	sfmmu_set_scd_rttecnt(srdp, new_scdp);

	/*
	 * Link scd onto srd_scdp list and scd sfmmu onto region/iment lists.
	 */
	sfmmu_link_scd_to_regions(srdp, new_scdp);
	sfmmu_add_scd(&srdp->srd_scdp, new_scdp);
	SFMMU_STAT_ADD(sf_create_scd, 1);

	mutex_exit(&srdp->srd_scd_mutex);
	sfmmu_join_scd(new_scdp, sfmmup);
	ASSERT(new_scdp->scd_refcnt >= 2);
	atomic_dec_32((volatile uint32_t *)&new_scdp->scd_refcnt);
}
/*
 * This routine is called by a process to remove itself from an SCD. It is
 * either called when the process has detached from a segment or from
 * hat_free_start() as a result of calling exit.
 */
static void
sfmmu_leave_scd(sfmmu_t *sfmmup, uchar_t r_type)
{
	sf_scd_t *scdp = sfmmup->sfmmu_scdp;
	sf_srd_t *srdp = sfmmup->sfmmu_srdp;
	hatlock_t *hatlockp = TSB_HASH(sfmmup);
	int i;

	ASSERT(scdp != NULL);
	ASSERT(srdp != NULL);

	if (sfmmup->sfmmu_free) {
		/*
		 * If the process is part of an SCD the sfmmu is unlinked
		 * from scd_sf_list.
		 */
		mutex_enter(&scdp->scd_mutex);
		sfmmu_from_scd_list(&scdp->scd_sf_list, sfmmup);
		mutex_exit(&scdp->scd_mutex);
		/*
		 * Update sfmmu_ttecnt to include the rgn ttecnt for rgns that
		 * are about to leave the SCD
		 */
		for (i = 0; i < mmu_page_sizes; i++) {
			ASSERT(sfmmup->sfmmu_scdrttecnt[i] ==
			    scdp->scd_rttecnt[i]);
			atomic_add_long(&sfmmup->sfmmu_ttecnt[i],
			    sfmmup->sfmmu_scdrttecnt[i]);
			sfmmup->sfmmu_scdrttecnt[i] = 0;
		}
		sfmmup->sfmmu_scdp = NULL;

		SF_SCD_DECR_REF(srdp, scdp);
		return;
	}

	ASSERT(r_type != SFMMU_REGION_ISM ||
	    SFMMU_FLAGS_ISSET(sfmmup, HAT_ISMBUSY));
	ASSERT(scdp->scd_refcnt);
	ASSERT(!sfmmup->sfmmu_free);
	ASSERT(sfmmu_hat_lock_held(sfmmup));
	ASSERT(AS_LOCK_HELD(sfmmup->sfmmu_as));

	/*
	 * Wait for ISM maps to be updated.
	 */
	if (r_type != SFMMU_REGION_ISM) {
		while (SFMMU_FLAGS_ISSET(sfmmup, HAT_ISMBUSY) &&
		    sfmmup->sfmmu_scdp != NULL) {
			cv_wait(&sfmmup->sfmmu_tsb_cv,
			    HATLOCK_MUTEXP(hatlockp));
		}

		if (sfmmup->sfmmu_scdp == NULL) {
			sfmmu_hat_exit(hatlockp);
			return;
		}
		SFMMU_FLAGS_SET(sfmmup, HAT_ISMBUSY);
	}

	if (SFMMU_FLAGS_ISSET(sfmmup, HAT_JOIN_SCD)) {
		SFMMU_FLAGS_CLEAR(sfmmup, HAT_JOIN_SCD);
		/*
		 * Since HAT_JOIN_SCD was set our context
		 * is still invalid.
		 */
	} else {
		/*
		 * For a multi-thread process, we must stop
		 * all the other threads before leaving the scd.
		 */

		sfmmu_invalidate_ctx(sfmmup);
	}

	/* Clear all the rid's for ISM, delete flags, etc */
	ASSERT(SFMMU_FLAGS_ISSET(sfmmup, HAT_ISMBUSY));
	sfmmu_ism_hatflags(sfmmup, 0);

	/*
	 * Update sfmmu_ttecnt to include the rgn ttecnt for rgns that
	 * are in SCD before this sfmmup leaves the SCD.
	 */
	for (i = 0; i < mmu_page_sizes; i++) {
		ASSERT(sfmmup->sfmmu_scdrttecnt[i] ==
		    scdp->scd_rttecnt[i]);
		atomic_add_long(&sfmmup->sfmmu_ttecnt[i],
		    sfmmup->sfmmu_scdrttecnt[i]);
		sfmmup->sfmmu_scdrttecnt[i] = 0;
		/* update ismttecnt to include SCD ism before hat leaves SCD */
		sfmmup->sfmmu_ismttecnt[i] += sfmmup->sfmmu_scdismttecnt[i];
		sfmmup->sfmmu_scdismttecnt[i] = 0;
	}
	/* update tsb0 inflation count */
	sfmmup->sfmmu_tsb0_4minflcnt += scdp->scd_sfmmup->sfmmu_tsb0_4minflcnt;

	if (r_type != SFMMU_REGION_ISM) {
		SFMMU_FLAGS_CLEAR(sfmmup, HAT_ISMBUSY);
	}
	sfmmup->sfmmu_scdp = NULL;

	sfmmu_hat_exit(hatlockp);

	/*
	 * Unlink sfmmu from scd_sf_list this can be done without holding
	 * the hat lock as we hold the sfmmu_as lock which prevents
	 * hat_join_region from adding this thread to the scd again. Other
	 * threads check if sfmmu_scdp is NULL under hat lock and if it's NULL
	 * they won't get here, since sfmmu_leave_scd() clears sfmmu_scdp
	 * while holding the hat lock.
	 */
	mutex_enter(&scdp->scd_mutex);
	sfmmu_from_scd_list(&scdp->scd_sf_list, sfmmup);
	mutex_exit(&scdp->scd_mutex);
	SFMMU_STAT(sf_leave_scd);

	SF_SCD_DECR_REF(srdp, scdp);
	hatlockp = sfmmu_hat_enter(sfmmup);
}
/*
 * Unlink and free up an SCD structure with a reference count of 0.
 */
static void
sfmmu_destroy_scd(sf_srd_t *srdp, sf_scd_t *scdp, sf_region_map_t *scd_rmap)
{
	sfmmu_t *scsfmmup;
	sf_scd_t *sp;
	hatlock_t *shatlockp;
	int i, ret;

	mutex_enter(&srdp->srd_scd_mutex);
	for (sp = srdp->srd_scdp; sp != NULL; sp = sp->scd_next) {
		if (sp == scdp)
			break;
	}
	if (sp == NULL || sp->scd_refcnt) {
		mutex_exit(&srdp->srd_scd_mutex);
		return;
	}

	/*
	 * It is possible that the scd has been freed and reallocated with a
	 * different region map while we've been waiting for the srd_scd_mutex.
	 */
	SF_RGNMAP_EQUAL(scd_rmap, &sp->scd_region_map, ret);
	if (ret != 1) {
		mutex_exit(&srdp->srd_scd_mutex);
		return;
	}

	ASSERT(scdp->scd_sf_list == NULL);
	/*
	 * Unlink scd from srd_scdp list.
	 */
	sfmmu_remove_scd(&srdp->srd_scdp, scdp);
	mutex_exit(&srdp->srd_scd_mutex);

	sfmmu_unlink_scd_from_regions(srdp, scdp);

	/* Clear shared context tsb and release ctx */
	scsfmmup = scdp->scd_sfmmup;

	/*
	 * create a barrier so that scd will not be destroyed
	 * if other thread still holds the same shared hat lock.
	 * E.g., sfmmu_tsbmiss_exception() needs to acquire the
	 * shared hat lock before checking the shared tsb reloc flag.
	 */
	shatlockp = sfmmu_hat_enter(scsfmmup);
	sfmmu_hat_exit(shatlockp);

	sfmmu_free_scd_tsbs(scsfmmup);

	for (i = 0; i < SFMMU_L1_HMERLINKS; i++) {
		if (scsfmmup->sfmmu_hmeregion_links[i] != NULL) {
			kmem_free(scsfmmup->sfmmu_hmeregion_links[i],
			    SFMMU_L2_HMERLINKS_SIZE);
			scsfmmup->sfmmu_hmeregion_links[i] = NULL;
		}
	}
	kmem_cache_free(sfmmuid_cache, scsfmmup);
	kmem_cache_free(scd_cache, scdp);
	SFMMU_STAT(sf_destroy_scd);
}
/*
 * Modifies the HAT_CTX1_FLAG for each of the ISM segments which correspond to
 * bits which are set in the ism_region_map parameter. This flag indicates to
 * the tsbmiss handler that mapping for these segments should be loaded using
 * the shared context.
 */
static void
sfmmu_ism_hatflags(sfmmu_t *sfmmup, int addflag)
{
	sf_scd_t *scdp = sfmmup->sfmmu_scdp;
	ism_blk_t *ism_blkp;
	ism_map_t *ism_map;
	int i, rid;

	ASSERT(sfmmup->sfmmu_iblk != NULL);
	ASSERT(scdp != NULL);
	/*
	 * Note that the caller either set HAT_ISMBUSY flag or checked
	 * under hat lock that HAT_ISMBUSY was not set by another thread.
	 */
	ASSERT(sfmmu_hat_lock_held(sfmmup));

	ism_blkp = sfmmup->sfmmu_iblk;
	while (ism_blkp != NULL) {
		ism_map = ism_blkp->iblk_maps;
		for (i = 0; ism_map[i].imap_ismhat && i < ISM_MAP_SLOTS; i++) {
			rid = ism_map[i].imap_rid;
			if (rid == SFMMU_INVALID_ISMRID) {
				continue;
			}
			ASSERT(rid >= 0 && rid < SFMMU_MAX_ISM_REGIONS);
			if (SF_RGNMAP_TEST(scdp->scd_ismregion_map, rid) &&
			    addflag) {
				ism_map[i].imap_hatflags |=
				    HAT_CTX1_FLAG;
			} else {
				ism_map[i].imap_hatflags &=
				    ~HAT_CTX1_FLAG;
			}
		}
		ism_blkp = ism_blkp->iblk_next;
	}
}
int
sfmmu_srd_lock_held(sf_srd_t *srdp)
{
	return (MUTEX_HELD(&srdp->srd_mutex));
}
/* ARGSUSED */
static int
sfmmu_scdcache_constructor(void *buf, void *cdrarg, int kmflags)
{
	sf_scd_t *scdp = (sf_scd_t *)buf;

	bzero(buf, sizeof (sf_scd_t));
	mutex_init(&scdp->scd_mutex, NULL, MUTEX_DEFAULT, NULL);
	return (0);
}

/* ARGSUSED */
static void
sfmmu_scdcache_destructor(void *buf, void *cdrarg)
{
	sf_scd_t *scdp = (sf_scd_t *)buf;

	mutex_destroy(&scdp->scd_mutex);
}
/*
 * The listp parameter is a pointer to a list of hmeblks which are partially
 * freed as result of calling sfmmu_hblk_hash_rm(), the last phase of the
 * freeing process is to cross-call all cpus to ensure that there are no
 * remaining cached references.
 *
 * If the local generation number is less than the global then we can free
 * hmeblks which are already on the pending queue as another cpu has completed
 * the cross-call.
 *
 * We cross-call to make sure that there are no threads on other cpus accessing
 * these hmeblks and then complete the process of freeing them under the
 * following conditions:
 *	The total number of pending hmeblks is greater than the threshold
 *	The reserve list has fewer than HBLK_RESERVE_CNT hmeblks
 *	It is at least 1 second since the last time we cross-called
 *
 * Otherwise, we add the hmeblks to the per-cpu pending queue.
 */
static void
sfmmu_hblks_list_purge(struct hme_blk **listp, int dontfree)
{
	struct hme_blk *hblkp, *pr_hblkp = NULL;
	int count = 0;
	cpuset_t cpuset = cpu_ready_set;
	cpu_hme_pend_t *cpuhp;
	timestruc_t now;
	int one_second_expired = 0;

	gethrestime_lasttick(&now);

	for (hblkp = *listp; hblkp != NULL; hblkp = hblkp->hblk_next) {
		ASSERT(hblkp->hblk_shw_bit == 0);
		ASSERT(hblkp->hblk_shared == 0);
		count++;
		pr_hblkp = hblkp;
	}

	cpuhp = &cpu_hme_pend[CPU->cpu_seqid];
	mutex_enter(&cpuhp->chp_mutex);

	if ((cpuhp->chp_count + count) == 0) {
		mutex_exit(&cpuhp->chp_mutex);
		return;
	}

	if ((now.tv_sec - cpuhp->chp_timestamp) > 1) {
		one_second_expired = 1;
	}

	if (!dontfree && (freehblkcnt < HBLK_RESERVE_CNT ||
	    (cpuhp->chp_count + count) > cpu_hme_pend_thresh ||
	    one_second_expired)) {
		/* Append global list to local */
		if (pr_hblkp == NULL) {
			*listp = cpuhp->chp_listp;
		} else {
			pr_hblkp->hblk_next = cpuhp->chp_listp;
		}
		cpuhp->chp_listp = NULL;
		cpuhp->chp_count = 0;
		cpuhp->chp_timestamp = now.tv_sec;
		mutex_exit(&cpuhp->chp_mutex);

		kpreempt_disable();
		CPUSET_DEL(cpuset, CPU->cpu_id);
		xt_sync(cpuset);
		kpreempt_enable();

		/*
		 * At this stage we know that no trap handlers on other
		 * cpus can have references to hmeblks on the list.
		 */
		sfmmu_hblk_free(listp);
	} else if (*listp != NULL) {
		pr_hblkp->hblk_next = cpuhp->chp_listp;
		cpuhp->chp_listp = *listp;
		cpuhp->chp_count += count;
		*listp = NULL;
		mutex_exit(&cpuhp->chp_mutex);
	} else {
		mutex_exit(&cpuhp->chp_mutex);
	}
}
/*
 * Add an hmeblk to the hash list.
 */
void
sfmmu_hblk_hash_add(struct hmehash_bucket *hmebp, struct hme_blk *hmeblkp,
    uint64_t hblkpa)
{
	ASSERT(SFMMU_HASH_LOCK_ISHELD(hmebp));

	if (hmebp->hmeblkp == NULL) {
		ASSERT(hmebp->hmeh_nextpa == HMEBLK_ENDPA);
	}
	hmeblkp->hblk_nextpa = hmebp->hmeh_nextpa;
	/*
	 * Since the TSB miss handler now does not lock the hash chain before
	 * walking it, make sure that the hmeblk's nextpa is globally visible
	 * before we make the hmeblk itself globally visible by updating the
	 * chain root pointer in the hash bucket.
	 */
	membar_producer();
	hmebp->hmeh_nextpa = hblkpa;
	hmeblkp->hblk_next = hmebp->hmeblkp;
	hmebp->hmeblkp = hmeblkp;
}
/*
 * This function is the first part of a 2 part process to remove an hmeblk
 * from the hash chain. In this phase we unlink the hmeblk from the hash chain
 * but leave the next physical pointer unchanged. The hmeblk is then linked
 * onto a per-cpu pending list using the virtual address pointer.
 *
 * TSB miss trap handlers that start after this phase will no longer see
 * this hmeblk. TSB miss handlers that still cache this hmeblk in a register
 * can still use it for further chain traversal because we haven't yet modified
 * the next physical pointer or freed it.
 *
 * In the second phase of hmeblk removal we'll issue a barrier xcall before
 * we reuse or free this hmeblk. This will make sure all lingering references
 * to the hmeblk after first phase disappear before we finally reclaim it.
 * This scheme eliminates the need for TSB miss handlers to lock hmeblk chains
 * during their traversal.
 *
 * The hmehash_mutex must be held when calling this function.
 *
 * Input:
 *	 hmebp - hme hash bucket pointer
 *	 hmeblkp - address of hmeblk to be removed
 *	 pr_hblk - virtual address of previous hmeblkp
 *	 listp - pointer to list of hmeblks linked by virtual address
 *	 free_now flag - indicates that a complete removal from the hash chains
 *			 is necessary.
 *
 * It is inefficient to use the free_now flag as a cross-call is required to
 * remove a single hmeblk from the hash chain but is necessary when hmeblks are
 * in short supply.
 */
void
sfmmu_hblk_hash_rm(struct hmehash_bucket *hmebp, struct hme_blk *hmeblkp,
    struct hme_blk *pr_hblk, struct hme_blk **listp,
    int free_now)
{
	int shw_size, vshift;
	struct hme_blk *shw_hblkp;
	uint_t shw_mask, newshw_mask;
	caddr_t vaddr;
	int size;
	cpuset_t cpuset = cpu_ready_set;

	ASSERT(SFMMU_HASH_LOCK_ISHELD(hmebp));

	if (hmebp->hmeblkp == hmeblkp) {
		hmebp->hmeh_nextpa = hmeblkp->hblk_nextpa;
		hmebp->hmeblkp = hmeblkp->hblk_next;
	} else {
		pr_hblk->hblk_nextpa = hmeblkp->hblk_nextpa;
		pr_hblk->hblk_next = hmeblkp->hblk_next;
	}

	size = get_hblk_ttesz(hmeblkp);
	shw_hblkp = hmeblkp->hblk_shadow;
	if (shw_hblkp) {
		ASSERT(hblktosfmmu(hmeblkp) != KHATID);
		ASSERT(!hmeblkp->hblk_shared);
#ifdef	DEBUG
		if (mmu_page_sizes == max_mmu_page_sizes) {
			ASSERT(size < TTE256M);
		} else {
			ASSERT(size < TTE4M);
		}
#endif /* DEBUG */

		shw_size = get_hblk_ttesz(shw_hblkp);
		vaddr = (caddr_t)get_hblk_base(hmeblkp);
		vshift = vaddr_to_vshift(shw_hblkp->hblk_tag, vaddr, shw_size);
		ASSERT(vshift < 8);
		/*
		 * Atomically clear shadow mask bit
		 */
		do {
			shw_mask = shw_hblkp->hblk_shw_mask;
			ASSERT(shw_mask & (1 << vshift));
			newshw_mask = shw_mask & ~(1 << vshift);
			newshw_mask = atomic_cas_32(&shw_hblkp->hblk_shw_mask,
			    shw_mask, newshw_mask);
		} while (newshw_mask != shw_mask);
		hmeblkp->hblk_shadow = NULL;
	}
	hmeblkp->hblk_shw_bit = 0;

	if (hmeblkp->hblk_shared) {
#ifdef	DEBUG
		sf_srd_t	*srdp;
		sf_region_t	*rgnp;
		uint_t		rid;

		srdp = hblktosrd(hmeblkp);
		ASSERT(srdp != NULL && srdp->srd_refcnt != 0);
		rid = hmeblkp->hblk_tag.htag_rid;
		ASSERT(SFMMU_IS_SHMERID_VALID(rid));
		ASSERT(rid < SFMMU_MAX_HME_REGIONS);
		rgnp = srdp->srd_hmergnp[rid];
		ASSERT(rgnp != NULL);
		SFMMU_VALIDATE_SHAREDHBLK(hmeblkp, srdp, rgnp, rid);
#endif /* DEBUG */
		hmeblkp->hblk_shared = 0;
	}
	if (free_now) {
		kpreempt_disable();
		CPUSET_DEL(cpuset, CPU->cpu_id);
		xt_sync(cpuset);
		kpreempt_enable();

		hmeblkp->hblk_nextpa = HMEBLK_ENDPA;
		hmeblkp->hblk_next = NULL;
	} else {
		/* Append hmeblkp to listp for processing later. */
		hmeblkp->hblk_next = *listp;
		*listp = hmeblkp;
	}
}
/*
 * This routine is called when memory is in short supply and returns a free
 * hmeblk of the requested size from the cpu pending lists.
 */
static struct hme_blk *
sfmmu_check_pending_hblks(int size)
{
	int i;
	struct hme_blk *hmeblkp = NULL, *last_hmeblkp;
	int found_hmeblk;
	cpuset_t cpuset = cpu_ready_set;
	cpu_hme_pend_t *cpuhp;

	/* Flush cpu hblk pending queues */
	for (i = 0; i < NCPU; i++) {
		cpuhp = &cpu_hme_pend[i];
		if (cpuhp->chp_listp != NULL) {
			mutex_enter(&cpuhp->chp_mutex);
			if (cpuhp->chp_listp == NULL) {
				mutex_exit(&cpuhp->chp_mutex);
				continue;
			}
			found_hmeblk = 0;
			last_hmeblkp = NULL;
			for (hmeblkp = cpuhp->chp_listp; hmeblkp != NULL;
			    hmeblkp = hmeblkp->hblk_next) {
				if (get_hblk_ttesz(hmeblkp) == size) {
					if (last_hmeblkp == NULL) {
						cpuhp->chp_listp =
						    hmeblkp->hblk_next;
					} else {
						last_hmeblkp->hblk_next =
						    hmeblkp->hblk_next;
					}
					ASSERT(cpuhp->chp_count > 0);
					cpuhp->chp_count--;
					found_hmeblk = 1;
					break;
				} else {
					last_hmeblkp = hmeblkp;
				}
			}
			mutex_exit(&cpuhp->chp_mutex);

			if (found_hmeblk) {
				kpreempt_disable();
				CPUSET_DEL(cpuset, CPU->cpu_id