/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright 2018 Joyent, Inc. All rights reserved.
 */

/*
 * Copyright (c) 1992 Terrence R. Lambert.
 * Copyright (c) 1990 The Regents of the University of California.
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * William Jolitz.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: @(#)machdep.c	7.4 (Berkeley) 6/3/91
 */

#include <sys/types.h>
#include <sys/sysmacros.h>
#include <sys/tss.h>
#include <sys/segments.h>
#include <sys/trap.h>
#include <sys/cpuvar.h>
#include <sys/bootconf.h>
#include <sys/x86_archext.h>
#include <sys/controlregs.h>
#include <sys/archsystm.h>
#include <sys/machsystm.h>
#include <sys/kobj.h>
#include <sys/cmn_err.h>
#include <sys/reboot.h>
#include <sys/kdi.h>
#include <sys/mach_mmu.h>
#include <sys/systm.h>
#include <sys/note.h>
#include <sys/promif.h>
#include <sys/bootinfo.h>
#include <vm/kboot_mmu.h>
#include <vm/hat_pte.h>

/*
 * cpu0 and default tables and structures.
 */
user_desc_t	*gdt0;
desctbr_t	gdt0_default_r;

gate_desc_t	*idt0;		/* interrupt descriptor table */
#if defined(__i386)
desctbr_t	idt0_default_r;	/* describes idt0 in IDTR format */
#endif

tss_t		*ktss0;		/* kernel task state structure */

#if defined(__i386)
tss_t		*dftss0;	/* #DF double-fault exception */
#endif	/* __i386 */

user_desc_t	zero_udesc;	/* base zero user desc native procs */
user_desc_t	null_udesc;	/* null user descriptor */
system_desc_t	null_sdesc;	/* null system descriptor */

#if defined(__amd64)
user_desc_t	zero_u32desc;	/* 32-bit compatibility procs */
#endif	/* __amd64 */

#if defined(__amd64)
user_desc_t	ucs_on;
user_desc_t	ucs_off;
user_desc_t	ucs32_on;
user_desc_t	ucs32_off;
#endif	/* __amd64 */

/*
 * If the size of this is changed, you must update hat_pcp_setup() and the
 * definitions in exception.s
 */
extern char dblfault_stack0[DEFAULTSTKSZ];
extern char nmi_stack0[DEFAULTSTKSZ];
extern char mce_stack0[DEFAULTSTKSZ];

extern void	fast_null(void);
extern hrtime_t	get_hrtime(void);
extern hrtime_t	gethrvtime(void);
extern hrtime_t	get_hrestime(void);
extern uint64_t	getlgrp(void);

void (*(fasttable[]))(void) = {
	fast_null,			/* T_FNULL routine */
	fast_null,			/* T_FGETFP routine (initially null) */
	fast_null,			/* T_FSETFP routine (initially null) */
	(void (*)())get_hrtime,		/* T_GETHRTIME */
	(void (*)())gethrvtime,		/* T_GETHRVTIME */
	(void (*)())get_hrestime,	/* T_GETHRESTIME */
	(void (*)())getlgrp		/* T_GETLGRP */
};
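
/*
 * Illustrative sketch (not part of the original source, and assuming the
 * usual illumos fast-trap convention): userland reaches one of the entries
 * above via "int $T_FASTTRAP" with the table index in %eax, roughly:
 *
 *	movl	$T_GETHRTIME, %eax
 *	int	$T_FASTTRAP
 *
 * The fast-trap entry code bounds-checks the index and dispatches through
 * fasttable[].
 */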

/*
 * Structure containing pre-computed descriptors to allow us to temporarily
 * interpose on a standard handler.
 */
struct interposing_handler {
	int ih_inum;
	gate_desc_t ih_interp_desc;
	gate_desc_t ih_default_desc;
};

/*
 * The brand infrastructure interposes on two handlers, and we use one as a
 * NULL signpost.
 */
static struct interposing_handler brand_tbl[2];

/*
 * software prototypes for default local descriptor table
 */

/*
 * Routines for loading segment descriptors in format the hardware
 * can understand.
 */

/*
 * In long mode we have the new L or long mode attribute bit
 * for code segments. Only the conforming bit in type is used along
 * with descriptor privilege and present bits. Default operand size must
 * be zero when in long mode. In 32-bit compatibility mode all fields
 * are treated as in legacy mode. For data segments while in long mode
 * only the present bit is loaded.
 */
void
set_usegd(user_desc_t *dp, uint_t lmode, void *base, size_t size,
    uint_t type, uint_t dpl, uint_t gran, uint_t defopsz)
{
	ASSERT(lmode == SDP_SHORT || lmode == SDP_LONG);
	/* This should never be a "system" segment. */
	ASSERT3U(type & SDT_S, !=, 0);

	/*
	 * 64-bit long mode.
	 */
	if (lmode == SDP_LONG)
		dp->usd_def32 = 0;		/* 32-bit operands only */
	else
		/*
		 * 32-bit compatibility mode.
		 */
		dp->usd_def32 = defopsz;	/* 0 = 16, 1 = 32-bit ops */

	/*
	 * We should always set the "accessed" bit (SDT_A), otherwise the CPU
	 * will write to the GDT whenever we change segment registers around.
	 * With KPTI on, the GDT is read-only in the user page table, which
	 * causes crashes if we don't set this.
	 */
	ASSERT3U(type & SDT_A, !=, 0);

	dp->usd_long = lmode;		/* 64-bit mode */
	dp->usd_type = type;
	dp->usd_dpl = dpl;
	dp->usd_p = 1;
	dp->usd_gran = gran;		/* 0 = bytes, 1 = pages */

	dp->usd_lobase = (uintptr_t)base;
	dp->usd_midbase = (uintptr_t)base >> 16;
	dp->usd_hibase = (uintptr_t)base >> (16 + 8);
	dp->usd_lolimit = size;
	dp->usd_hilimit = (uintptr_t)size >> 16;
}
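
/*
 * Worked example (illustrative only): for a hypothetical base of 0x12345678,
 * the split performed above yields
 *
 *	usd_lobase  = 0x5678	(bits  0..15)
 *	usd_midbase = 0x34	(bits 16..23)
 *	usd_hibase  = 0x12	(bits 24..31)
 *
 * and a size of 0xABCDE splits into usd_lolimit = 0xBCDE and
 * usd_hilimit = 0xA. The bit-field widths in user_desc_t truncate each
 * shifted value to the descriptor's layout.
 */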

/*
 * Install system segment descriptor for LDT and TSS segments.
 */

void
set_syssegd(system_desc_t *dp, void *base, size_t size, uint_t type,
    uint_t dpl)
{
	dp->ssd_lolimit = size;
	dp->ssd_hilimit = (uintptr_t)size >> 16;

	dp->ssd_lobase = (uintptr_t)base;
	dp->ssd_midbase = (uintptr_t)base >> 16;
	dp->ssd_hibase = (uintptr_t)base >> (16 + 8);
	dp->ssd_hi64base = (uintptr_t)base >> (16 + 8 + 8);

	dp->ssd_type = type;
	dp->ssd_zero1 = 0;	/* must be zero */
	dp->ssd_zero2 = 0;
	dp->ssd_dpl = dpl;
	dp->ssd_p = 1;
	dp->ssd_gran = 0;	/* force byte units */
}

void *
get_ssd_base(system_desc_t *dp)
{
	uintptr_t	base;

	base = (uintptr_t)dp->ssd_lobase |
	    (uintptr_t)dp->ssd_midbase << 16 |
	    (uintptr_t)dp->ssd_hibase << (16 + 8) |
	    (uintptr_t)dp->ssd_hi64base << (16 + 8 + 8);
	return ((void *)base);
}
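
/*
 * Note (for reference): this simply inverts the base split performed in
 * set_syssegd() above, reassembling the four fields as
 * lobase | midbase << 16 | hibase << 24 | hi64base << 32.
 */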

/*
 * Install gate segment descriptor for interrupt, trap, call and task gates.
 *
 * For 64 bit native if we have KPTI enabled, we use the IST stack mechanism on
 * all interrupts. We have different ISTs for each class of exceptions that are
 * most likely to occur while handling an existing exception; while many of
 * these are just going to panic, it's nice not to trample on the existing
 * exception state for debugging purposes.
 *
 * Normal interrupts are all redirected unconditionally to the KPTI trampoline
 * stack space. This unifies the trampoline handling between user and kernel
 * space (and avoids the need to touch %gs).
 *
 * The KDI IDT *all* uses the DBG IST: consider single stepping tr_pftrap, when
 * a read from KMDB causes another #PF. Without its own IST, this would stomp
 * on the kernel's mcpu_kpti_flt frame.
 */
uint_t
idt_vector_to_ist(uint_t vector)
{
#if defined(__xpv)
	_NOTE(ARGUNUSED(vector));
	return (IST_NONE);
#else
	switch (vector) {
	/* These should always use IST even without KPTI enabled. */
	case T_DBLFLT:
		return (IST_DF);
	case T_NMIFLT:
		return (IST_NMI);
	case T_MCE:
		return (IST_MCE);

	case T_BPTFLT:
	case T_SGLSTP:
		if (kpti_enable == 1) {
			return (IST_DBG);
		}
		return (IST_NONE);
	case T_STKFLT:
	case T_GPFLT:
	case T_PGFLT:
		if (kpti_enable == 1) {
			return (IST_NESTABLE);
		}
		return (IST_NONE);
	default:
		if (kpti_enable == 1) {
			return (IST_DEFAULT);
		}
		return (IST_NONE);
	}
#endif
}
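
/*
 * Summary of the mapping above (derived from the code, for reference):
 *
 *	vector			KPTI on		KPTI off
 *	#DF, NMI, #MC		dedicated IST	dedicated IST
 *	#BP, #DB		IST_DBG		IST_NONE
 *	#SS, #GP, #PF		IST_NESTABLE	IST_NONE
 *	everything else		IST_DEFAULT	IST_NONE
 */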

void
set_gatesegd(gate_desc_t *dp, void (*func)(void), selector_t sel,
    uint_t type, uint_t dpl, uint_t ist)
{
	dp->sgd_looffset = (uintptr_t)func;
	dp->sgd_hioffset = (uintptr_t)func >> 16;
	dp->sgd_hi64offset = (uintptr_t)func >> (16 + 16);
	dp->sgd_selector = (uint16_t)sel;
	dp->sgd_ist = ist;
	dp->sgd_type = type;
	dp->sgd_dpl = dpl;
	dp->sgd_p = 1;
}
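
/*
 * Illustrative usage sketch (a simplified form of the calls made in
 * init_idt_common() below, omitting the KPTI trampoline selection):
 *
 *	set_gatesegd(&idt[T_PGFLT], &pftrap, KCS_SEL, SDT_SYSIGT, TRP_KPL,
 *	    idt_vector_to_ist(T_PGFLT));
 *
 * The 64-bit handler offset is split across sgd_looffset, sgd_hioffset and
 * sgd_hi64offset in the same way set_usegd() splits a segment base.
 */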

/*
 * Updates a single user descriptor in the GDT of the current cpu.
 * Caller is responsible for preventing cpu migration.
 */

void
gdt_update_usegd(uint_t sidx, user_desc_t *udp)
{
#if defined(DEBUG)
	/* This should never be a "system" segment, but it might be null. */
	if (udp->usd_p != 0 || udp->usd_type != 0) {
		ASSERT3U(udp->usd_type & SDT_S, !=, 0);
	}

	/*
	 * We should always set the "accessed" bit (SDT_A), otherwise the CPU
	 * will write to the GDT whenever we change segment registers around.
	 * With KPTI on, the GDT is read-only in the user page table, which
	 * causes crashes if we don't set this.
	 */
	if (udp->usd_p != 0 || udp->usd_type != 0) {
		ASSERT3U(udp->usd_type & SDT_A, !=, 0);
	}
#endif

	CPU->cpu_gdt[sidx] = *udp;
}

/*
 * Writes the single descriptor pointed to by udp into a process's
 * LDT entry pointed to by ldp.
 */
int
ldt_update_segd(user_desc_t *ldp, user_desc_t *udp)
{
#if defined(DEBUG)
	/* This should never be a "system" segment, but it might be null. */
	if (udp->usd_p != 0 || udp->usd_type != 0) {
		ASSERT3U(udp->usd_type & SDT_S, !=, 0);
	}

	/*
	 * We should always set the "accessed" bit (SDT_A), otherwise the CPU
	 * will write to the LDT whenever we change segment registers around.
	 * With KPTI on, the LDT is read-only in the user page table, which
	 * causes crashes if we don't set this.
	 */
	if (udp->usd_p != 0 || udp->usd_type != 0) {
		ASSERT3U(udp->usd_type & SDT_A, !=, 0);
	}
#endif

	*ldp = *udp;

	return (0);
}

#if defined(__amd64)

/*
 * Build kernel GDT.
 */

static void
init_gdt_common(user_desc_t *gdt)
{
	int i;

	/*
	 * 64-bit kernel code segment.
	 */
	set_usegd(&gdt[GDT_KCODE], SDP_LONG, NULL, 0, SDT_MEMERA, SEL_KPL,
	    SDP_PAGES, SDP_OP32);

	/*
	 * 64-bit kernel data segment. The limit attribute is ignored in
	 * 64-bit mode, but we set it here to 0xFFFF so that we can use the
	 * SYSRET instruction to return from system calls back to 32-bit
	 * applications. SYSRET doesn't update the base, limit, or attributes
	 * of %ss or %ds descriptors. We therefore must ensure that the kernel
	 * uses something, though it will be ignored by hardware, that is
	 * compatible with 32-bit apps. For the same reason we must set the
	 * default op size of this descriptor to 32-bit operands.
	 */
	set_usegd(&gdt[GDT_KDATA], SDP_LONG, NULL, -1, SDT_MEMRWA,
	    SEL_KPL, SDP_PAGES, SDP_OP32);
	gdt[GDT_KDATA].usd_def32 = 1;

	/*
	 * 64-bit user code segment.
	 */
	set_usegd(&gdt[GDT_UCODE], SDP_LONG, NULL, 0, SDT_MEMERA, SEL_UPL,
	    SDP_PAGES, SDP_OP32);

	/*
	 * 32-bit user code segment.
	 */
	set_usegd(&gdt[GDT_U32CODE], SDP_SHORT, NULL, -1, SDT_MEMERA,
	    SEL_UPL, SDP_PAGES, SDP_OP32);

	/*
	 * See gdt_ucode32() and gdt_ucode_native().
	 */
	ucs_on = ucs_off = gdt[GDT_UCODE];
	ucs_off.usd_p = 0;	/* forces #np fault */

	ucs32_on = ucs32_off = gdt[GDT_U32CODE];
	ucs32_off.usd_p = 0;	/* forces #np fault */

	/*
	 * 32 and 64 bit data segments can actually share the same descriptor.
	 * In long mode only the present bit is checked but all other fields
	 * are loaded. But in compatibility mode all fields are interpreted
	 * as in legacy mode so they must be set correctly for a 32-bit data
	 * segment.
	 */
	set_usegd(&gdt[GDT_UDATA], SDP_SHORT, NULL, -1, SDT_MEMRWA, SEL_UPL,
	    SDP_PAGES, SDP_OP32);

#if !defined(__xpv)

	/*
	 * The 64-bit kernel has no default LDT. By default, the LDT descriptor
	 * in the GDT is 0.
	 */

	/*
	 * Kernel TSS
	 */
	set_syssegd((system_desc_t *)&gdt[GDT_KTSS], ktss0,
	    sizeof (*ktss0) - 1, SDT_SYSTSS, SEL_KPL);

#endif	/* !__xpv */

	/*
	 * Initialize fs and gs descriptors for 32 bit processes.
	 * Only attributes and limits are initialized, the effective
	 * base address is programmed via fsbase/gsbase.
	 */
	set_usegd(&gdt[GDT_LWPFS], SDP_SHORT, NULL, -1, SDT_MEMRWA,
	    SEL_UPL, SDP_PAGES, SDP_OP32);
	set_usegd(&gdt[GDT_LWPGS], SDP_SHORT, NULL, -1, SDT_MEMRWA,
	    SEL_UPL, SDP_PAGES, SDP_OP32);

	/*
	 * Initialize the descriptors set aside for brand usage.
	 * Only attributes and limits are initialized.
	 */
	for (i = GDT_BRANDMIN; i <= GDT_BRANDMAX; i++)
		set_usegd(&gdt0[i], SDP_SHORT, NULL, -1, SDT_MEMRWA,
		    SEL_UPL, SDP_PAGES, SDP_OP32);

	/*
	 * Initialize convenient zero base user descriptors for clearing
	 * lwp private %fs and %gs descriptors in GDT. See setregs() for
	 * an example.
	 */
	set_usegd(&zero_udesc, SDP_LONG, 0, 0, SDT_MEMRWA, SEL_UPL,
	    SDP_BYTES, SDP_OP32);
	set_usegd(&zero_u32desc, SDP_SHORT, 0, -1, SDT_MEMRWA, SEL_UPL,
	    SDP_PAGES, SDP_OP32);
}
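
/*
 * Resulting 64-bit GDT layout, for reference (selector indices defined in
 * segments.h; entries not listed are left null):
 *
 *	GDT_KCODE	64-bit kernel code
 *	GDT_KDATA	kernel data
 *	GDT_UCODE	64-bit user code
 *	GDT_U32CODE	32-bit user code
 *	GDT_UDATA	user data (shared by 32- and 64-bit)
 *	GDT_KTSS	kernel TSS (non-xpv only)
 *	GDT_LWPFS/GS	per-lwp %fs/%gs for 32-bit processes
 *	GDT_BRANDMIN..GDT_BRANDMAX	reserved for brand modules
 */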

#if defined(__xpv)

static user_desc_t *
init_gdt(void)
{
	uint64_t gdtpa;
	ulong_t ma[1];		/* XXPV should be a memory_t */
	ulong_t addr;

	/*
	 * Our gdt is never larger than a single page.
	 */
	ASSERT((sizeof (*gdt0) * NGDT) <= PAGESIZE);
	gdt0 = (user_desc_t *)BOP_ALLOC(bootops, (caddr_t)GDT_VA,
	    PAGESIZE, PAGESIZE);
	bzero(gdt0, PAGESIZE);

	init_gdt_common(gdt0);

	/*
	 * XXX Since we never invoke kmdb until after the kernel takes
	 * over the descriptor tables why not have it use the kernel's
	 * selectors?
	 */
	if (boothowto & RB_DEBUG) {
		set_usegd(&gdt0[GDT_B32DATA], SDP_LONG, NULL, -1, SDT_MEMRWA,
		    SEL_KPL, SDP_PAGES, SDP_OP32);
		set_usegd(&gdt0[GDT_B64CODE], SDP_LONG, NULL, -1, SDT_MEMERA,
		    SEL_KPL, SDP_PAGES, SDP_OP32);
	}

	/*
	 * Clear write permission for page containing the gdt and install it.
	 */
	gdtpa = pfn_to_pa(va_to_pfn(gdt0));
	ma[0] = (ulong_t)(pa_to_ma(gdtpa) >> PAGESHIFT);
	kbm_read_only((uintptr_t)gdt0, gdtpa);
	xen_set_gdt(ma, NGDT);

	/*
	 * Reload the segment registers to use the new GDT.
	 * On 64-bit, fixup KCS_SEL to be in ring 3.
	 * See KCS_SEL in segments.h.
	 */
	load_segment_registers((KCS_SEL | SEL_KPL), KFS_SEL, KGS_SEL, KDS_SEL);

	/*
	 * setup %gs for kernel
	 */
	xen_set_segment_base(SEGBASE_GS_KERNEL, (ulong_t)&cpus[0]);

	/*
	 * XX64 We should never dereference off "other gsbase" or
	 * "fsbase". So, we should arrange to point FSBASE and
	 * KGSBASE somewhere truly awful e.g. point it at the last
	 * valid address below the hole so that any attempts to index
	 * off them cause an exception.
	 *
	 * For now, point it at 8G -- at least it should be unmapped
	 * until some 64-bit processes run.
	 */
	addr = 0x200000000ul;
	xen_set_segment_base(SEGBASE_FS, addr);
	xen_set_segment_base(SEGBASE_GS_USER, addr);
	xen_set_segment_base(SEGBASE_GS_USER_SEL, 0);

	return (gdt0);
}

#else	/* __xpv */

static user_desc_t *
init_gdt(void)
{
	desctbr_t	r_bgdt, r_gdt;
	user_desc_t	*bgdt;

	/*
	 * Our gdt is never larger than a single page.
	 */
	ASSERT((sizeof (*gdt0) * NGDT) <= PAGESIZE);
	gdt0 = (user_desc_t *)BOP_ALLOC(bootops, (caddr_t)GDT_VA,
	    PAGESIZE, PAGESIZE);
	bzero(gdt0, PAGESIZE);

	init_gdt_common(gdt0);

	/*
	 * Copy in from boot's gdt to our gdt.
	 * Entry 0 is the null descriptor by definition.
	 */
	rd_gdtr(&r_bgdt);
	bgdt = (user_desc_t *)r_bgdt.dtr_base;
	if (bgdt == NULL)
		panic("null boot gdt");

	gdt0[GDT_B32DATA] = bgdt[GDT_B32DATA];
	gdt0[GDT_B32CODE] = bgdt[GDT_B32CODE];
	gdt0[GDT_B16CODE] = bgdt[GDT_B16CODE];
	gdt0[GDT_B16DATA] = bgdt[GDT_B16DATA];
	gdt0[GDT_B64CODE] = bgdt[GDT_B64CODE];

	/*
	 * Install our new GDT
	 */
	r_gdt.dtr_limit = (sizeof (*gdt0) * NGDT) - 1;
	r_gdt.dtr_base = (uintptr_t)gdt0;
	wr_gdtr(&r_gdt);

	/*
	 * Reload the segment registers to use the new GDT
	 */
	load_segment_registers(KCS_SEL, KFS_SEL, KGS_SEL, KDS_SEL);

	/*
	 * setup %gs for kernel
	 */
	wrmsr(MSR_AMD_GSBASE, (uint64_t)&cpus[0]);

	/*
	 * XX64 We should never dereference off "other gsbase" or
	 * "fsbase". So, we should arrange to point FSBASE and
	 * KGSBASE somewhere truly awful e.g. point it at the last
	 * valid address below the hole so that any attempts to index
	 * off them cause an exception.
	 *
	 * For now, point it at 8G -- at least it should be unmapped
	 * until some 64-bit processes run.
	 */
	wrmsr(MSR_AMD_FSBASE, 0x200000000ul);
	wrmsr(MSR_AMD_KGSBASE, 0x200000000ul);
	return (gdt0);
}

#endif	/* __xpv */

#elif defined(__i386)

static void
init_gdt_common(user_desc_t *gdt)
{
	int i;

	/*
	 * Text and data for both kernel and user span entire 32 bit
	 * address space.
	 */

	/*
	 * kernel code segment.
	 */
	set_usegd(&gdt[GDT_KCODE], NULL, -1, SDT_MEMERA, SEL_KPL, SDP_PAGES,
	    SDP_OP32);

	/*
	 * kernel data segment.
	 */
	set_usegd(&gdt[GDT_KDATA], NULL, -1, SDT_MEMRWA, SEL_KPL, SDP_PAGES,
	    SDP_OP32);

	/*
	 * user code segment.
	 */
	set_usegd(&gdt[GDT_UCODE], NULL, -1, SDT_MEMERA, SEL_UPL, SDP_PAGES,
	    SDP_OP32);

	/*
	 * user data segment.
	 */
	set_usegd(&gdt[GDT_UDATA], NULL, -1, SDT_MEMRWA, SEL_UPL, SDP_PAGES,
	    SDP_OP32);

#if !defined(__xpv)

	/*
	 * TSS for T_DBLFLT (double fault) handler
	 */
	set_syssegd((system_desc_t *)&gdt[GDT_DBFLT], dftss0,
	    sizeof (*dftss0) - 1, SDT_SYSTSS, SEL_KPL);

	/*
	 * TSS for kernel
	 */
	set_syssegd((system_desc_t *)&gdt[GDT_KTSS], ktss0,
	    sizeof (*ktss0) - 1, SDT_SYSTSS, SEL_KPL);

#endif	/* !__xpv */

	/*
	 * %gs selector for kernel
	 */
	set_usegd(&gdt[GDT_GS], &cpus[0], sizeof (struct cpu) - 1, SDT_MEMRWA,
	    SEL_KPL, SDP_BYTES, SDP_OP32);

	/*
	 * Initialize lwp private descriptors.
	 * Only attributes and limits are initialized, the effective
	 * base address is programmed via fsbase/gsbase.
	 */
	set_usegd(&gdt[GDT_LWPFS], NULL, (size_t)-1, SDT_MEMRWA, SEL_UPL,
	    SDP_PAGES, SDP_OP32);
	set_usegd(&gdt[GDT_LWPGS], NULL, (size_t)-1, SDT_MEMRWA, SEL_UPL,
	    SDP_PAGES, SDP_OP32);

	/*
	 * Initialize the descriptors set aside for brand usage.
	 * Only attributes and limits are initialized.
	 */
	for (i = GDT_BRANDMIN; i <= GDT_BRANDMAX; i++)
		set_usegd(&gdt0[i], NULL, (size_t)-1, SDT_MEMRWA, SEL_UPL,
		    SDP_PAGES, SDP_OP32);

	/*
	 * Initialize convenient zero base user descriptor for clearing
	 * lwp private %fs and %gs descriptors in GDT. See setregs() for
	 * an example.
	 */
	set_usegd(&zero_udesc, NULL, -1, SDT_MEMRWA, SEL_UPL,
	    SDP_BYTES, SDP_OP32);
}

#if defined(__xpv)

static user_desc_t *
init_gdt(void)
{
	uint64_t gdtpa;
	ulong_t ma[1];		/* XXPV should be a memory_t */

	/*
	 * Our gdt is never larger than a single page.
	 */
	ASSERT((sizeof (*gdt0) * NGDT) <= PAGESIZE);
	gdt0 = (user_desc_t *)BOP_ALLOC(bootops, (caddr_t)GDT_VA,
	    PAGESIZE, PAGESIZE);
	bzero(gdt0, PAGESIZE);

	init_gdt_common(gdt0);
	gdtpa = pfn_to_pa(va_to_pfn(gdt0));

	/*
	 * XXX Since we never invoke kmdb until after the kernel takes
	 * over the descriptor tables why not have it use the kernel's
	 * selectors?
	 */
	if (boothowto & RB_DEBUG) {
		set_usegd(&gdt0[GDT_B32DATA], NULL, -1, SDT_MEMRWA, SEL_KPL,
		    SDP_PAGES, SDP_OP32);
		set_usegd(&gdt0[GDT_B32CODE], NULL, -1, SDT_MEMERA, SEL_KPL,
		    SDP_PAGES, SDP_OP32);
	}

	/*
	 * Clear write permission for page containing the gdt and install it.
	 */
	ma[0] = (ulong_t)(pa_to_ma(gdtpa) >> PAGESHIFT);
	kbm_read_only((uintptr_t)gdt0, gdtpa);
	xen_set_gdt(ma, NGDT);

	/*
	 * Reload the segment registers to use the new GDT
	 */
	load_segment_registers(
	    KCS_SEL, KDS_SEL, KDS_SEL, KFS_SEL, KGS_SEL, KDS_SEL);

	return (gdt0);
}

#else	/* __xpv */

static user_desc_t *
init_gdt(void)
{
	desctbr_t	r_bgdt, r_gdt;
	user_desc_t	*bgdt;

	/*
	 * Our gdt is never larger than a single page.
	 */
	ASSERT((sizeof (*gdt0) * NGDT) <= PAGESIZE);

	/*
	 * XXX this allocation belongs in our caller, not here.
	 */
	gdt0 = (user_desc_t *)BOP_ALLOC(bootops, (caddr_t)GDT_VA,
	    PAGESIZE, PAGESIZE);
	bzero(gdt0, PAGESIZE);

	init_gdt_common(gdt0);

	/*
	 * Copy in from boot's gdt to our gdt entries.
	 * Entry 0 is null descriptor by definition.
	 */
	rd_gdtr(&r_bgdt);
	bgdt = (user_desc_t *)r_bgdt.dtr_base;
	if (bgdt == NULL)
		panic("null boot gdt");

	gdt0[GDT_B32DATA] = bgdt[GDT_B32DATA];
	gdt0[GDT_B32CODE] = bgdt[GDT_B32CODE];
	gdt0[GDT_B16CODE] = bgdt[GDT_B16CODE];
	gdt0[GDT_B16DATA] = bgdt[GDT_B16DATA];

	/*
	 * Install our new GDT
	 */
	r_gdt.dtr_limit = (sizeof (*gdt0) * NGDT) - 1;
	r_gdt.dtr_base = (uintptr_t)gdt0;
	wr_gdtr(&r_gdt);

	/*
	 * Reload the segment registers to use the new GDT
	 */
	load_segment_registers(
	    KCS_SEL, KDS_SEL, KDS_SEL, KFS_SEL, KGS_SEL, KDS_SEL);

	return (gdt0);
}

#endif	/* __xpv */
#endif	/* __i386 */

/*
 * Build kernel IDT.
 *
 * Note that for amd64 we pretty much require every gate to be an interrupt
 * gate which blocks interrupts atomically on entry; that's because of our
 * dependency on using 'swapgs' every time we come into the kernel to find
 * the cpu structure. If we get interrupted just before doing that, %cs could
 * be in kernel mode (so that the trap prolog doesn't do a swapgs), but
 * %gsbase is really still pointing at something in userland. Bad things will
 * ensue. We also use interrupt gates for i386, even though this is not
 * required for some traps.
 *
 * Perhaps they should have invented a trap gate that does an atomic swapgs?
 */
static void
init_idt_common(gate_desc_t *idt)
{
	set_gatesegd(&idt[T_ZERODIV],
	    (kpti_enable == 1) ? &tr_div0trap : &div0trap,
	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_ZERODIV));
	set_gatesegd(&idt[T_SGLSTP],
	    (kpti_enable == 1) ? &tr_dbgtrap : &dbgtrap,
	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_SGLSTP));
	set_gatesegd(&idt[T_NMIFLT],
	    (kpti_enable == 1) ? &tr_nmiint : &nmiint,
	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_NMIFLT));
	set_gatesegd(&idt[T_BPTFLT],
	    (kpti_enable == 1) ? &tr_brktrap : &brktrap,
	    KCS_SEL, SDT_SYSIGT, TRP_UPL, idt_vector_to_ist(T_BPTFLT));
	set_gatesegd(&idt[T_OVFLW],
	    (kpti_enable == 1) ? &tr_ovflotrap : &ovflotrap,
	    KCS_SEL, SDT_SYSIGT, TRP_UPL, idt_vector_to_ist(T_OVFLW));
	set_gatesegd(&idt[T_BOUNDFLT],
	    (kpti_enable == 1) ? &tr_boundstrap : &boundstrap,
	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_BOUNDFLT));
	set_gatesegd(&idt[T_ILLINST],
	    (kpti_enable == 1) ? &tr_invoptrap : &invoptrap,
	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_ILLINST));
	set_gatesegd(&idt[T_NOEXTFLT],
	    (kpti_enable == 1) ? &tr_ndptrap : &ndptrap,
	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_NOEXTFLT));

	/*
	 * double fault handler.
	 *
	 * Note that on the hypervisor a guest does not receive #df faults.
	 * Instead a failsafe event is injected into the guest if its selectors
	 * and/or stack is in a broken state. See xen_failsafe_callback.
	 */
	set_gatesegd(&idt[T_DBLFLT],
	    (kpti_enable == 1) ? &tr_syserrtrap : &syserrtrap,
	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_DBLFLT));

	/*
	 * T_EXTOVRFLT coprocessor-segment-overrun not supported.
	 */

	set_gatesegd(&idt[T_TSSFLT],
	    (kpti_enable == 1) ? &tr_invtsstrap : &invtsstrap,
	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_TSSFLT));
	set_gatesegd(&idt[T_SEGFLT],
	    (kpti_enable == 1) ? &tr_segnptrap : &segnptrap,
	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_SEGFLT));
	set_gatesegd(&idt[T_STKFLT],
	    (kpti_enable == 1) ? &tr_stktrap : &stktrap,
	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_STKFLT));
	set_gatesegd(&idt[T_GPFLT],
	    (kpti_enable == 1) ? &tr_gptrap : &gptrap,
	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_GPFLT));
	set_gatesegd(&idt[T_PGFLT],
	    (kpti_enable == 1) ? &tr_pftrap : &pftrap,
	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_PGFLT));
	set_gatesegd(&idt[T_EXTERRFLT],
	    (kpti_enable == 1) ? &tr_ndperr : &ndperr,
	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_EXTERRFLT));
	set_gatesegd(&idt[T_ALIGNMENT],
	    (kpti_enable == 1) ? &tr_achktrap : &achktrap,
	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_ALIGNMENT));
	set_gatesegd(&idt[T_MCE],
	    (kpti_enable == 1) ? &tr_mcetrap : &mcetrap,
	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_MCE));
	set_gatesegd(&idt[T_SIMDFPE],
	    (kpti_enable == 1) ? &tr_xmtrap : &xmtrap,
	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_SIMDFPE));

	/*
	 * install fast trap handler at 210.
	 */
	set_gatesegd(&idt[T_FASTTRAP],
	    (kpti_enable == 1) ? &tr_fasttrap : &fasttrap,
	    KCS_SEL, SDT_SYSIGT, TRP_UPL, idt_vector_to_ist(T_FASTTRAP));

	/*
	 * System call handler.
	 */
	set_gatesegd(&idt[T_SYSCALLINT],
	    (kpti_enable == 1) ? &tr_sys_syscall_int : &sys_syscall_int,
	    KCS_SEL, SDT_SYSIGT, TRP_UPL, idt_vector_to_ist(T_SYSCALLINT));

	/*
	 * Install the DTrace interrupt handler for the pid provider.
	 */
	set_gatesegd(&idt[T_DTRACE_RET],
	    (kpti_enable == 1) ? &tr_dtrace_ret : &dtrace_ret,
	    KCS_SEL, SDT_SYSIGT, TRP_UPL, idt_vector_to_ist(T_DTRACE_RET));

	/*
	 * Prepare interposing descriptor for the syscall handler
	 * and cache copy of the default descriptor.
	 */
	brand_tbl[0].ih_inum = T_SYSCALLINT;
	brand_tbl[0].ih_default_desc = idt0[T_SYSCALLINT];

	set_gatesegd(&(brand_tbl[0].ih_interp_desc),
	    (kpti_enable == 1) ? &tr_brand_sys_syscall_int :
	    &brand_sys_syscall_int, KCS_SEL, SDT_SYSIGT, TRP_UPL,
	    idt_vector_to_ist(T_SYSCALLINT));

	brand_tbl[1].ih_inum = 0;
}
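
/*
 * brand_tbl[1].ih_inum is left as 0 above; it is the NULL signpost that
 * terminates the loops in brand_interpositioning_enable()/_disable() below.
 */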

static void
init_idt(gate_desc_t *idt)
{
	char	ivctname[80];
	void	(*ivctptr)(void);
	int	i;

	/*
	 * Initialize entire table with 'reserved' trap and then overwrite
	 * specific entries. T_EXTOVRFLT (9) is unsupported and reserved
	 * since it can only be generated on a 386 processor. 15 is also
	 * unsupported and reserved.
	 */
#if !defined(__xpv)
	for (i = 0; i < NIDT; i++) {
		set_gatesegd(&idt[i],
		    (kpti_enable == 1) ? &tr_resvtrap : &resvtrap,
		    KCS_SEL, SDT_SYSIGT, TRP_KPL,
		    idt_vector_to_ist(T_RESVTRAP));
	}
#else
	for (i = 0; i < NIDT; i++) {
		set_gatesegd(&idt[i], &resvtrap, KCS_SEL, SDT_SYSIGT, TRP_KPL,
		    IST_NONE);
	}
#endif

	/*
	 * 20-31 reserved
	 */
#if !defined(__xpv)
	for (i = 20; i < 32; i++) {
		set_gatesegd(&idt[i],
		    (kpti_enable == 1) ? &tr_invaltrap : &invaltrap,
		    KCS_SEL, SDT_SYSIGT, TRP_KPL,
		    idt_vector_to_ist(T_INVALTRAP));
	}
#else
	for (i = 20; i < 32; i++) {
		set_gatesegd(&idt[i], &invaltrap, KCS_SEL, SDT_SYSIGT, TRP_KPL,
		    IST_NONE);
	}
#endif

	/*
	 * interrupts 32 - 255
	 */
	for (i = 32; i < 256; i++) {
#if !defined(__xpv)
		(void) snprintf(ivctname, sizeof (ivctname),
		    (kpti_enable == 1) ? "tr_ivct%d" : "ivct%d", i);
#else
		(void) snprintf(ivctname, sizeof (ivctname), "ivct%d", i);
#endif
		ivctptr = (void (*)(void))kobj_getsymvalue(ivctname, 0);
		if (ivctptr == NULL)
			panic("kobj_getsymvalue(%s) failed", ivctname);

		set_gatesegd(&idt[i], ivctptr, KCS_SEL, SDT_SYSIGT, TRP_KPL,
		    idt_vector_to_ist(i));
	}

	/*
	 * Now install the common ones. Note that this overlays some of the
	 * entries installed above, such as T_SYSCALLINT and T_FASTTRAP.
	 */
	init_idt_common(idt);
}

/*
 * The kernel does not deal with LDTs unless a user explicitly creates
 * one. Under normal circumstances, the LDTR contains 0. Any process attempting
 * to reference the LDT will therefore cause a #gp. System calls made via the
 * obsolete lcall mechanism are emulated by the #gp fault handler.
 */
static void
init_ldt(void)
{
	wr_ldtr(0);
}

static void
init_tss(void)
{
	extern struct cpu cpus[];

	/*
	 * tss_rsp0 is dynamically filled in by resume() (in swtch.s) on each
	 * context switch but it'll be overwritten with this same value anyway.
	 */
	if (kpti_enable == 1) {
		ktss0->tss_rsp0 = (uint64_t)&cpus->cpu_m.mcpu_kpti.kf_tr_rsp;
	}

	/* Set up the IST stacks for double fault, NMI, MCE. */
	ktss0->tss_ist1 = (uintptr_t)&dblfault_stack0[sizeof (dblfault_stack0)];
	ktss0->tss_ist2 = (uintptr_t)&nmi_stack0[sizeof (nmi_stack0)];
	ktss0->tss_ist3 = (uintptr_t)&mce_stack0[sizeof (mce_stack0)];

	/*
	 * This IST stack is used for #DB,#BP (debug) interrupts (when KPTI is
	 * enabled), and also for KDI (always).
	 */
	ktss0->tss_ist4 = (uint64_t)&cpus->cpu_m.mcpu_kpti_dbg.kf_tr_rsp;

	if (kpti_enable == 1) {
		/* This IST stack is used for #GP,#PF,#SS (fault) interrupts. */
		ktss0->tss_ist5 =
		    (uint64_t)&cpus->cpu_m.mcpu_kpti_flt.kf_tr_rsp;

		/* This IST stack is used for all other intrs (for KPTI). */
		ktss0->tss_ist6 = (uint64_t)&cpus->cpu_m.mcpu_kpti.kf_tr_rsp;
	}

	/*
	 * Set I/O bit map offset equal to size of TSS segment limit
	 * for no I/O permission map. This will force all user I/O
	 * instructions to generate #gp fault.
	 */
	ktss0->tss_bitmapbase = sizeof (*ktss0);

	/*
	 * Point %tr to descriptor for ktss0 in gdt.
	 */
	wr_tsr(KTSS_SEL);
}
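
/*
 * IST slot assignments made above, for reference:
 *
 *	IST1	#DF			dblfault_stack0
 *	IST2	NMI			nmi_stack0
 *	IST3	#MC			mce_stack0
 *	IST4	#DB/#BP and KDI		mcpu_kpti_dbg trampoline frame
 *	IST5	#GP/#PF/#SS (KPTI)	mcpu_kpti_flt trampoline frame
 *	IST6	all other intrs (KPTI)	mcpu_kpti trampoline frame
 */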

void
init_desctbls(void)
{
	user_desc_t *gdt;
	desctbr_t idtr;

	/*
	 * Allocate IDT and TSS structures on unique pages for better
	 * performance in virtual machines.
	 */
	ASSERT(NIDT * sizeof (*idt0) <= PAGESIZE);
	idt0 = (gate_desc_t *)BOP_ALLOC(bootops, (caddr_t)IDT_VA,
	    PAGESIZE, PAGESIZE);
	bzero(idt0, PAGESIZE);
	ASSERT(sizeof (*ktss0) <= PAGESIZE);
	ktss0 = (tss_t *)BOP_ALLOC(bootops, (caddr_t)KTSS_VA,
	    PAGESIZE, PAGESIZE);
	bzero(ktss0, PAGESIZE);

#if defined(__i386)
	ASSERT(sizeof (*dftss0) <= PAGESIZE);
	dftss0 = (tss_t *)BOP_ALLOC(bootops, (caddr_t)DFTSS_VA,
	    PAGESIZE, PAGESIZE);
	bzero(dftss0, PAGESIZE);
#endif

	/*
	 * Setup and install our GDT.
	 */
	gdt = init_gdt();
	ASSERT(IS_P2ALIGNED((uintptr_t)gdt, PAGESIZE));
	CPU->cpu_gdt = gdt;

	/*
	 * Initialize this CPU's LDT.
	 */
	CPU->cpu_m.mcpu_ldt = BOP_ALLOC(bootops, (caddr_t)LDT_VA,
	    LDT_CPU_SIZE, PAGESIZE);
	bzero(CPU->cpu_m.mcpu_ldt, LDT_CPU_SIZE);
	CPU->cpu_m.mcpu_ldt_len = 0;

	/*
	 * Setup and install our IDT.
	 */
	init_idt(idt0);

	idtr.dtr_base = (uintptr_t)idt0;
	idtr.dtr_limit = (NIDT * sizeof (*idt0)) - 1;
	wr_idtr(&idtr);
	CPU->cpu_idt = idt0;

#if defined(__i386)
	/*
	 * We maintain a description of idt0 in convenient IDTR format
	 * for #pf's on some older pentium processors. See pentium_pftrap().
	 */
	idt0_default_r = idtr;
#endif	/* __i386 */

	init_tss();
	CPU->cpu_tss = ktss0;

	init_ldt();

	/* Stash this so that the NMI,MCE,#DF and KDI handlers can use it. */
	kpti_safe_cr3 = (uint64_t)getcr3();
}

#ifndef	__xpv
/*
 * As per Intel Vol 3 27.5.2, the GDTR limit is reset to 64Kb on a VM exit, so
 * we have to manually fix it up ourselves.
 *
 * The caller may still need to make sure that it can't go off-CPU with the
 * incorrect limit, before calling this (such as disabling preemption).
 */
void
reset_gdtr_limit(void)
{
	ulong_t flags = intr_clear();
	desctbr_t gdtr;

	rd_gdtr(&gdtr);
	gdtr.dtr_limit = (sizeof (user_desc_t) * NGDT) - 1;
	wr_gdtr(&gdtr);

	intr_restore(flags);
}
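
/*
 * Illustrative caller sketch (assuming the usual kpreempt interfaces): keep
 * the thread on-CPU across the fix-up, per the comment above, e.g.
 *
 *	kpreempt_disable();
 *	reset_gdtr_limit();
 *	kpreempt_enable();
 */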
#endif	/* !__xpv */

/*
 * In the early kernel, we need to set up a simple GDT to run on.
 *
 * XXPV	Can dboot use this too?  See dboot_gdt.s
 */
void
init_boot_gdt(user_desc_t *bgdt)
{
#if defined(__amd64)
	set_usegd(&bgdt[GDT_B32DATA], SDP_LONG, NULL, -1, SDT_MEMRWA, SEL_KPL,
	    SDP_PAGES, SDP_OP32);
	set_usegd(&bgdt[GDT_B64CODE], SDP_LONG, NULL, -1, SDT_MEMERA, SEL_KPL,
	    SDP_PAGES, SDP_OP32);
#elif defined(__i386)
	set_usegd(&bgdt[GDT_B32DATA], NULL, -1, SDT_MEMRWA, SEL_KPL,
	    SDP_PAGES, SDP_OP32);
	set_usegd(&bgdt[GDT_B32CODE], NULL, -1, SDT_MEMERA, SEL_KPL,
	    SDP_PAGES, SDP_OP32);
#endif	/* __i386 */
}

/*
 * Enable interpositioning on the system call path by rewriting the
 * sys{call|enter} MSRs and the syscall-related entries in the IDT to use
 * the branded entry points.
 */
void
brand_interpositioning_enable(void)
{
	gate_desc_t	*idt = CPU->cpu_idt;
	int		i;

	ASSERT(curthread->t_preempt != 0 || getpil() >= DISP_LEVEL);

	for (i = 0; brand_tbl[i].ih_inum; i++) {
		idt[brand_tbl[i].ih_inum] = brand_tbl[i].ih_interp_desc;
	}

#if defined(__amd64)
#if defined(__xpv)

	/*
	 * Currently the hypervisor only supports 64-bit syscalls via
	 * syscall instruction. The 32-bit syscalls are handled by
	 * interrupt gate above.
	 */
	xen_set_callback(brand_sys_syscall, CALLBACKTYPE_syscall,
	    CALLBACKF_mask_events);

#else

	if (is_x86_feature(x86_featureset, X86FSET_ASYSC)) {
		if (kpti_enable == 1) {
			wrmsr(MSR_AMD_LSTAR, (uintptr_t)tr_brand_sys_syscall);
			wrmsr(MSR_AMD_CSTAR, (uintptr_t)tr_brand_sys_syscall32);
		} else {
			wrmsr(MSR_AMD_LSTAR, (uintptr_t)brand_sys_syscall);
			wrmsr(MSR_AMD_CSTAR, (uintptr_t)brand_sys_syscall32);
		}
	}

#endif
#endif	/* __amd64 */

	if (is_x86_feature(x86_featureset, X86FSET_SEP)) {
		if (kpti_enable == 1) {
			wrmsr(MSR_INTC_SEP_EIP,
			    (uintptr_t)tr_brand_sys_sysenter);
		} else {
			wrmsr(MSR_INTC_SEP_EIP, (uintptr_t)brand_sys_sysenter);
		}
	}
}

/*
 * Disable interpositioning on the system call path by rewriting the
 * sys{call|enter} MSRs and the syscall-related entries in the IDT to use
 * the standard entry points, which bypass the interpositioning hooks.
 */
void
brand_interpositioning_disable(void)
{
	gate_desc_t	*idt = CPU->cpu_idt;
	int		i;

	ASSERT(curthread->t_preempt != 0 || getpil() >= DISP_LEVEL);

	for (i = 0; brand_tbl[i].ih_inum; i++) {
		idt[brand_tbl[i].ih_inum] = brand_tbl[i].ih_default_desc;
	}

#if defined(__amd64)
#if defined(__xpv)

	/*
	 * See comment above in brand_interpositioning_enable.
	 */
	xen_set_callback(sys_syscall, CALLBACKTYPE_syscall,
	    CALLBACKF_mask_events);

#else

	if (is_x86_feature(x86_featureset, X86FSET_ASYSC)) {
		if (kpti_enable == 1) {
			wrmsr(MSR_AMD_LSTAR, (uintptr_t)tr_sys_syscall);
			wrmsr(MSR_AMD_CSTAR, (uintptr_t)tr_sys_syscall32);
		} else {
			wrmsr(MSR_AMD_LSTAR, (uintptr_t)sys_syscall);
			wrmsr(MSR_AMD_CSTAR, (uintptr_t)sys_syscall32);
		}
	}

#endif
#endif	/* __amd64 */

	if (is_x86_feature(x86_featureset, X86FSET_SEP)) {
		if (kpti_enable == 1) {
			wrmsr(MSR_INTC_SEP_EIP, (uintptr_t)tr_sys_sysenter);
		} else {
			wrmsr(MSR_INTC_SEP_EIP, (uintptr_t)sys_sysenter);
		}
	}
}