/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
 */

/*
 * Copyright 2011 Joyent, Inc. All rights reserved.
 */

/*
 * Copyright (c) 1992 Terrence R. Lambert.
 * Copyright (c) 1990 The Regents of the University of California.
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * William Jolitz.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	from: @(#)machdep.c	7.4 (Berkeley) 6/3/91
 */
#include <sys/types.h>
#include <sys/sysmacros.h>
#include <sys/tss.h>
#include <sys/segments.h>
#include <sys/trap.h>
#include <sys/cpuvar.h>
#include <sys/bootconf.h>
#include <sys/x86_archext.h>
#include <sys/controlregs.h>
#include <sys/archsystm.h>
#include <sys/machsystm.h>
#include <sys/kobj.h>
#include <sys/cmn_err.h>
#include <sys/reboot.h>
#include <sys/kdi.h>
#include <sys/mach_mmu.h>
#include <sys/systm.h>

#include <sys/promif.h>
#include <sys/bootinfo.h>
#include <vm/kboot_mmu.h>
#include <vm/hat_pte.h>
/*
 * cpu0 and default tables and structures.
 */
user_desc_t	*gdt0;
desctbr_t	gdt0_default_r;

gate_desc_t	*idt0;		/* interrupt descriptor table */
#if defined(__i386)
desctbr_t	idt0_default_r;	/* describes idt0 in IDTR format */
#endif

tss_t		*ktss0;		/* kernel task state structure */

#if defined(__i386)
tss_t		*dftss0;	/* #DF double-fault exception */
#endif	/* __i386 */

user_desc_t	zero_udesc;	/* base zero user desc native procs */
user_desc_t	null_udesc;	/* null user descriptor */
system_desc_t	null_sdesc;	/* null system descriptor */

#if defined(__amd64)
user_desc_t	zero_u32desc;	/* 32-bit compatibility procs */
#endif	/* __amd64 */

#if defined(__amd64)
user_desc_t	ucs_on;
user_desc_t	ucs_off;
user_desc_t	ucs32_on;
user_desc_t	ucs32_off;
#endif	/* __amd64 */

#pragma	align	16(dblfault_stack0)
char		dblfault_stack0[DEFAULTSTKSZ];
extern void	fast_null(void);
extern hrtime_t	get_hrtime(void);
extern hrtime_t	gethrvtime(void);
extern hrtime_t	get_hrestime(void);
extern uint64_t	getlgrp(void);

void (*(fasttable[]))(void) = {
	fast_null,			/* T_FNULL routine */
	fast_null,			/* T_FGETFP routine (initially null) */
	fast_null,			/* T_FSETFP routine (initially null) */
	(void (*)())get_hrtime,		/* T_GETHRTIME */
	(void (*)())gethrvtime,		/* T_GETHRVTIME */
	(void (*)())get_hrestime,	/* T_GETHRESTIME */
	(void (*)())getlgrp		/* T_GETLGRP */
};
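/*
 * Illustrative note (assumed calling convention, not from this file):
 * user code reaches fasttable[] through the T_FASTTRAP gate that
 * init_idt_common() installs at vector 210, with the table index in
 * %eax, roughly:
 *
 *	movl	$T_GETHRTIME, %eax
 *	int	$T_FASTTRAP
 *
 * The fast trap prologue then dispatches through fasttable[%eax].
 */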
/*
 * Structure containing pre-computed descriptors to allow us to temporarily
 * interpose on a standard handler.
 */
struct interposing_handler {
	int ih_inum;
	gate_desc_t ih_interp_desc;
	gate_desc_t ih_default_desc;
};

/*
 * The brand infrastructure interposes on two handlers, and we use one as a
 * NULL signpost.
 */
static struct interposing_handler brand_tbl[2];

/*
 * software prototypes for default local descriptor table
 */

/*
 * Routines for loading segment descriptors in format the hardware
 * can understand.
 */
#if defined(__amd64)

/*
 * In long mode we have the new L or long mode attribute bit
 * for code segments. Only the conforming bit in type is used along
 * with descriptor priority and present bits. Default operand size must
 * be zero when in long mode. In 32-bit compatibility mode all fields
 * are treated as in legacy mode. For data segments while in long mode
 * only the present bit is loaded.
 */
void
set_usegd(user_desc_t *dp, uint_t lmode, void *base, size_t size,
    uint_t type, uint_t dpl, uint_t gran, uint_t defopsz)
{
	ASSERT(lmode == SDP_SHORT || lmode == SDP_LONG);

	/*
	 * 64-bit long mode.
	 */
	if (lmode == SDP_LONG)
		dp->usd_def32 = 0;		/* 32-bit operands only */
	else
		/*
		 * 32-bit compatibility mode.
		 */
		dp->usd_def32 = defopsz;	/* 0 = 16, 1 = 32-bit ops */

	dp->usd_long = lmode;	/* 64-bit mode */
	dp->usd_type = type;
	dp->usd_dpl = dpl;
	dp->usd_p = 1;
	dp->usd_gran = gran;	/* 0 = bytes, 1 = pages */

	dp->usd_lobase = (uintptr_t)base;
	dp->usd_midbase = (uintptr_t)base >> 16;
	dp->usd_hibase = (uintptr_t)base >> (16 + 8);
	dp->usd_lolimit = size;
	dp->usd_hilimit = (uintptr_t)size >> 16;
}
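
/*
 * Worked example (hypothetical values, for illustration only): a call
 * such as
 *
 *	set_usegd(&desc, SDP_SHORT, (void *)0x12345678, 0xfffff,
 *	    SDT_MEMRWA, SEL_UPL, SDP_PAGES, SDP_OP32);
 *
 * scatters the 32-bit base across the descriptor as
 * usd_lobase = 0x5678, usd_midbase = 0x34 and usd_hibase = 0x12, and
 * the 20-bit limit as usd_lolimit = 0xffff, usd_hilimit = 0xf. With
 * usd_gran = SDP_PAGES the limit counts 4K pages, i.e. the full 4G.
 */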
#elif defined(__i386)

/*
 * Install user segment descriptor for code and data.
 */
void
set_usegd(user_desc_t *dp, void *base, size_t size, uint_t type,
    uint_t dpl, uint_t gran, uint_t defopsz)
{
	dp->usd_lolimit = size;
	dp->usd_hilimit = (uintptr_t)size >> 16;

	dp->usd_lobase = (uintptr_t)base;
	dp->usd_midbase = (uintptr_t)base >> 16;
	dp->usd_hibase = (uintptr_t)base >> (16 + 8);

	dp->usd_type = type;
	dp->usd_dpl = dpl;
	dp->usd_p = 1;
	dp->usd_def32 = defopsz;	/* 0 = 16, 1 = 32 bit operands */
	dp->usd_gran = gran;		/* 0 = bytes, 1 = pages */
}

#endif	/* __i386 */
/*
 * Install system segment descriptor for LDT and TSS segments.
 */

#if defined(__amd64)

void
set_syssegd(system_desc_t *dp, void *base, size_t size, uint_t type,
    uint_t dpl)
{
	dp->ssd_lolimit = size;
	dp->ssd_hilimit = (uintptr_t)size >> 16;

	dp->ssd_lobase = (uintptr_t)base;
	dp->ssd_midbase = (uintptr_t)base >> 16;
	dp->ssd_hibase = (uintptr_t)base >> (16 + 8);
	dp->ssd_hi64base = (uintptr_t)base >> (16 + 8 + 8);

	dp->ssd_type = type;
	dp->ssd_zero1 = 0;	/* must be zero */
	dp->ssd_zero2 = 0;
	dp->ssd_dpl = dpl;
	dp->ssd_p = 1;
	dp->ssd_gran = 0;	/* force byte units */
}

void *
get_ssd_base(system_desc_t *dp)
{
	uintptr_t base;

	base = (uintptr_t)dp->ssd_lobase |
	    (uintptr_t)dp->ssd_midbase << 16 |
	    (uintptr_t)dp->ssd_hibase << (16 + 8) |
	    (uintptr_t)dp->ssd_hi64base << (16 + 8 + 8);
	return ((void *)base);
}
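
/*
 * Note: get_ssd_base() simply inverts the base split performed by
 * set_syssegd() above, so for any descriptor built there,
 * get_ssd_base(dp) returns the original base. The extra ssd_hi64base
 * bits exist because long mode system segment descriptors (TSS, LDT)
 * are 16 bytes wide and carry a full 64-bit base.
 */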
#elif defined(__i386)

void
set_syssegd(system_desc_t *dp, void *base, size_t size, uint_t type,
    uint_t dpl)
{
	dp->ssd_lolimit = size;
	dp->ssd_hilimit = (uintptr_t)size >> 16;

	dp->ssd_lobase = (uintptr_t)base;
	dp->ssd_midbase = (uintptr_t)base >> 16;
	dp->ssd_hibase = (uintptr_t)base >> (16 + 8);

	dp->ssd_type = type;
	dp->ssd_zero = 0;	/* must be zero */
	dp->ssd_dpl = dpl;
	dp->ssd_p = 1;
	dp->ssd_gran = 0;	/* force byte units */
}

void *
get_ssd_base(system_desc_t *dp)
{
	uintptr_t base;

	base = (uintptr_t)dp->ssd_lobase |
	    (uintptr_t)dp->ssd_midbase << 16 |
	    (uintptr_t)dp->ssd_hibase << (16 + 8);
	return ((void *)base);
}

#endif	/* __i386 */
/*
 * Install gate segment descriptor for interrupt, trap, call and task gates.
 */

#if defined(__amd64)

/*ARGSUSED*/
void
set_gatesegd(gate_desc_t *dp, void (*func)(void), selector_t sel,
    uint_t type, uint_t dpl, uint_t vector)
{
	dp->sgd_looffset = (uintptr_t)func;
	dp->sgd_hioffset = (uintptr_t)func >> 16;
	dp->sgd_hi64offset = (uintptr_t)func >> (16 + 16);

	dp->sgd_selector = (uint16_t)sel;

	/*
	 * For 64 bit native we use the IST stack mechanism
	 * for double faults. All other traps use the CPL = 0
	 * (tss_rsp0) stack.
	 */
#if !defined(__xpv)
	if (vector == T_DBLFLT)
		dp->sgd_ist = 1;
	else
#endif
		dp->sgd_ist = 0;

	dp->sgd_type = type;
	dp->sgd_dpl = dpl;
	dp->sgd_p = 1;
}
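
/*
 * Example from later in this file: the #DF entry built by
 * init_idt_common(),
 *
 *	set_gatesegd(&idt[T_DBLFLT], &syserrtrap, KCS_SEL, SDT_SYSIGT,
 *	    TRP_KPL, T_DBLFLT);
 *
 * is the one gate that takes the sgd_ist = 1 path above, so a double
 * fault switches to the dedicated IST stack (ktss0->tss_ist1) instead
 * of trusting the possibly corrupt current stack.
 */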
#elif defined(__i386)

/*ARGSUSED*/
void
set_gatesegd(gate_desc_t *dp, void (*func)(void), selector_t sel,
    uint_t type, uint_t dpl, uint_t unused)
{
	dp->sgd_looffset = (uintptr_t)func;
	dp->sgd_hioffset = (uintptr_t)func >> 16;

	dp->sgd_selector = (uint16_t)sel;
	dp->sgd_stkcpy = 0;	/* always zero bytes */
	dp->sgd_type = type;
	dp->sgd_dpl = dpl;
	dp->sgd_p = 1;
}

#endif	/* __i386 */
/*
 * Updates a single user descriptor in the GDT of the current cpu.
 * Caller is responsible for preventing cpu migration.
 */
void
gdt_update_usegd(uint_t sidx, user_desc_t *udp)
{
	CPU->cpu_gdt[sidx] = *udp;
}

/*
 * Writes the single descriptor pointed to by udp into a process's
 * LDT entry pointed to by ldp.
 */
int
ldt_update_segd(user_desc_t *ldp, user_desc_t *udp)
{
	*ldp = *udp;

	return (0);
}
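
/*
 * Usage sketch (hypothetical caller, for illustration only): a routine
 * updating an lwp's private %fs descriptor would build the descriptor
 * with set_usegd() and then, pinned to the current cpu, do
 *
 *	kpreempt_disable();
 *	gdt_update_usegd(GDT_LWPFS, &udesc);
 *	kpreempt_enable();
 *
 * since the store above only touches the current cpu's GDT.
 */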
#if defined(__amd64)

/*
 * Build kernel GDT.
 */
static void
init_gdt_common(user_desc_t *gdt)
{
	int i;

	/*
	 * 64-bit kernel code segment.
	 */
	set_usegd(&gdt[GDT_KCODE], SDP_LONG, NULL, 0, SDT_MEMERA, SEL_KPL,
	    SDP_PAGES, SDP_OP32);

	/*
	 * 64-bit kernel data segment. The limit attribute is ignored in
	 * 64-bit mode, but we set it here to 0xFFFF so that we can use the
	 * SYSRET instruction to return from system calls back to 32-bit
	 * applications. SYSRET doesn't update the base, limit, or attributes
	 * of %ss or %ds descriptors. We therefore must ensure that the kernel
	 * uses something, though it will be ignored by hardware, that is
	 * compatible with 32-bit apps. For the same reason we must set the
	 * default op size of this descriptor to 32-bit operands.
	 */
	set_usegd(&gdt[GDT_KDATA], SDP_LONG, NULL, -1, SDT_MEMRWA,
	    SEL_KPL, SDP_PAGES, SDP_OP32);
	gdt[GDT_KDATA].usd_def32 = 1;

	/*
	 * 64-bit user code segment.
	 */
	set_usegd(&gdt[GDT_UCODE], SDP_LONG, NULL, 0, SDT_MEMERA, SEL_UPL,
	    SDP_PAGES, SDP_OP32);

	/*
	 * 32-bit user code segment.
	 */
	set_usegd(&gdt[GDT_U32CODE], SDP_SHORT, NULL, -1, SDT_MEMERA,
	    SEL_UPL, SDP_PAGES, SDP_OP32);

	/*
	 * See gdt_ucode32() and gdt_ucode_native().
	 */
	ucs_on = ucs_off = gdt[GDT_UCODE];
	ucs_off.usd_p = 0;	/* forces #np fault */

	ucs32_on = ucs32_off = gdt[GDT_U32CODE];
	ucs32_off.usd_p = 0;	/* forces #np fault */

	/*
	 * 32 and 64 bit data segments can actually share the same descriptor.
	 * In long mode only the present bit is checked but all other fields
	 * are loaded. But in compatibility mode all fields are interpreted
	 * as in legacy mode so they must be set correctly for a 32-bit data
	 * segment.
	 */
	set_usegd(&gdt[GDT_UDATA], SDP_SHORT, NULL, -1, SDT_MEMRWA, SEL_UPL,
	    SDP_PAGES, SDP_OP32);

#if !defined(__xpv)

	/*
	 * The 64-bit kernel has no default LDT. By default, the LDT descriptor
	 * in the GDT is 0.
	 */

	/*
	 * Kernel TSS
	 */
	set_syssegd((system_desc_t *)&gdt[GDT_KTSS], ktss0,
	    sizeof (*ktss0) - 1, SDT_SYSTSS, SEL_KPL);

#endif	/* !__xpv */

	/*
	 * Initialize fs and gs descriptors for 32 bit processes.
	 * Only attributes and limits are initialized, the effective
	 * base address is programmed via fsbase/gsbase.
	 */
	set_usegd(&gdt[GDT_LWPFS], SDP_SHORT, NULL, -1, SDT_MEMRWA,
	    SEL_UPL, SDP_PAGES, SDP_OP32);
	set_usegd(&gdt[GDT_LWPGS], SDP_SHORT, NULL, -1, SDT_MEMRWA,
	    SEL_UPL, SDP_PAGES, SDP_OP32);

	/*
	 * Initialize the descriptors set aside for brand usage.
	 * Only attributes and limits are initialized.
	 */
	for (i = GDT_BRANDMIN; i <= GDT_BRANDMAX; i++)
		set_usegd(&gdt[i], SDP_SHORT, NULL, -1, SDT_MEMRWA,
		    SEL_UPL, SDP_PAGES, SDP_OP32);
	/*
	 * Initialize convenient zero base user descriptors for clearing
	 * lwp private %fs and %gs descriptors in GDT. See setregs() for
	 * an example.
	 */
	set_usegd(&zero_udesc, SDP_LONG, 0, 0, SDT_MEMRWA, SEL_UPL,
	    SDP_BYTES, SDP_OP32);
	set_usegd(&zero_u32desc, SDP_SHORT, 0, -1, SDT_MEMRWA, SEL_UPL,
	    SDP_PAGES, SDP_OP32);
}
#if defined(__xpv)

static user_desc_t *
init_gdt(void)
{
	uint64_t gdtpa;
	ulong_t ma[1];		/* XXPV should be a memory_t */
	ulong_t addr;

	/*
	 * Our gdt is never larger than a single page.
	 */
	ASSERT((sizeof (*gdt0) * NGDT) <= PAGESIZE);
	gdt0 = (user_desc_t *)BOP_ALLOC(bootops, (caddr_t)GDT_VA,
	    PAGESIZE, PAGESIZE);
	bzero(gdt0, PAGESIZE);

	init_gdt_common(gdt0);

	/*
	 * XXX Since we never invoke kmdb until after the kernel takes
	 * over the descriptor tables why not have it use the kernel's
	 * selectors?
	 */
	if (boothowto & RB_DEBUG) {
		set_usegd(&gdt0[GDT_B32DATA], SDP_LONG, NULL, -1, SDT_MEMRWA,
		    SEL_KPL, SDP_PAGES, SDP_OP32);
		set_usegd(&gdt0[GDT_B64CODE], SDP_LONG, NULL, -1, SDT_MEMERA,
		    SEL_KPL, SDP_PAGES, SDP_OP32);
	}

	/*
	 * Clear write permission for page containing the gdt and install it.
	 */
	gdtpa = pfn_to_pa(va_to_pfn(gdt0));
	ma[0] = (ulong_t)(pa_to_ma(gdtpa) >> PAGESHIFT);
	kbm_read_only((uintptr_t)gdt0, gdtpa);
	xen_set_gdt(ma, NGDT);

	/*
	 * Reload the segment registers to use the new GDT.
	 * On 64-bit, fixup KCS_SEL to be in ring 3.
	 * See KCS_SEL in segments.h.
	 */
	load_segment_registers((KCS_SEL | SEL_KPL), KFS_SEL, KGS_SEL, KDS_SEL);

	/*
	 * setup %gs for kernel
	 */
	xen_set_segment_base(SEGBASE_GS_KERNEL, (ulong_t)&cpus[0]);

	/*
	 * XX64 We should never dereference off "other gsbase" or
	 * "fsbase". So, we should arrange to point FSBASE and
	 * KGSBASE somewhere truly awful e.g. point it at the last
	 * valid address below the hole so that any attempts to index
	 * off them cause an exception.
	 *
	 * For now, point it at 8G -- at least it should be unmapped
	 * until some 64-bit processes run.
	 */
	addr = 0x200000000ul;
	xen_set_segment_base(SEGBASE_FS, addr);
	xen_set_segment_base(SEGBASE_GS_USER, addr);
	xen_set_segment_base(SEGBASE_GS_USER_SEL, 0);

	return (gdt0);
}
#else	/* __xpv */

static user_desc_t *
init_gdt(void)
{
	desctbr_t	r_bgdt, r_gdt;
	user_desc_t	*bgdt;

	/*
	 * Our gdt is never larger than a single page.
	 */
	ASSERT((sizeof (*gdt0) * NGDT) <= PAGESIZE);
	gdt0 = (user_desc_t *)BOP_ALLOC(bootops, (caddr_t)GDT_VA,
	    PAGESIZE, PAGESIZE);
	bzero(gdt0, PAGESIZE);

	init_gdt_common(gdt0);

	/*
	 * Copy in from boot's gdt to our gdt.
	 * Entry 0 is the null descriptor by definition.
	 */
	rd_gdtr(&r_bgdt);
	bgdt = (user_desc_t *)r_bgdt.dtr_base;
	if (bgdt == NULL)
		panic("null boot gdt");

	gdt0[GDT_B32DATA] = bgdt[GDT_B32DATA];
	gdt0[GDT_B32CODE] = bgdt[GDT_B32CODE];
	gdt0[GDT_B16CODE] = bgdt[GDT_B16CODE];
	gdt0[GDT_B16DATA] = bgdt[GDT_B16DATA];
	gdt0[GDT_B64CODE] = bgdt[GDT_B64CODE];

	/*
	 * Install our new GDT
	 */
	r_gdt.dtr_limit = (sizeof (*gdt0) * NGDT) - 1;
	r_gdt.dtr_base = (uintptr_t)gdt0;
	wr_gdtr(&r_gdt);

	/*
	 * Reload the segment registers to use the new GDT
	 */
	load_segment_registers(KCS_SEL, KFS_SEL, KGS_SEL, KDS_SEL);

	/*
	 * setup %gs for kernel
	 */
	wrmsr(MSR_AMD_GSBASE, (uint64_t)&cpus[0]);

	/*
	 * XX64 We should never dereference off "other gsbase" or
	 * "fsbase". So, we should arrange to point FSBASE and
	 * KGSBASE somewhere truly awful e.g. point it at the last
	 * valid address below the hole so that any attempts to index
	 * off them cause an exception.
	 *
	 * For now, point it at 8G -- at least it should be unmapped
	 * until some 64-bit processes run.
	 */
	wrmsr(MSR_AMD_FSBASE, 0x200000000ul);
	wrmsr(MSR_AMD_KGSBASE, 0x200000000ul);
	return (gdt0);
}
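
/*
 * Note (illustrative): with MSR_AMD_GSBASE pointing at cpus[0] as
 * above, the kernel's %gs-relative accesses (e.g. the CPU macro)
 * resolve to the boot cpu's cpu_t.
 */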

#endif	/* __xpv */
#elif defined(__i386)

static void
init_gdt_common(user_desc_t *gdt)
{
	int i;

	/*
	 * Text and data for both kernel and user span entire 32 bit
	 * address space.
	 */

	/*
	 * kernel code segment.
	 */
	set_usegd(&gdt[GDT_KCODE], NULL, -1, SDT_MEMERA, SEL_KPL, SDP_PAGES,
	    SDP_OP32);

	/*
	 * kernel data segment.
	 */
	set_usegd(&gdt[GDT_KDATA], NULL, -1, SDT_MEMRWA, SEL_KPL, SDP_PAGES,
	    SDP_OP32);

	/*
	 * user code segment.
	 */
	set_usegd(&gdt[GDT_UCODE], NULL, -1, SDT_MEMERA, SEL_UPL, SDP_PAGES,
	    SDP_OP32);

	/*
	 * user data segment.
	 */
	set_usegd(&gdt[GDT_UDATA], NULL, -1, SDT_MEMRWA, SEL_UPL, SDP_PAGES,
	    SDP_OP32);

#if !defined(__xpv)

	/*
	 * TSS for T_DBLFLT (double fault) handler
	 */
	set_syssegd((system_desc_t *)&gdt[GDT_DBFLT], dftss0,
	    sizeof (*dftss0) - 1, SDT_SYSTSS, SEL_KPL);

	/*
	 * TSS for kernel
	 */
	set_syssegd((system_desc_t *)&gdt[GDT_KTSS], ktss0,
	    sizeof (*ktss0) - 1, SDT_SYSTSS, SEL_KPL);

#endif	/* !__xpv */

	/*
	 * %gs selector for kernel
	 */
	set_usegd(&gdt[GDT_GS], &cpus[0], sizeof (struct cpu) - 1, SDT_MEMRWA,
	    SEL_KPL, SDP_BYTES, SDP_OP32);

	/*
	 * Initialize lwp private descriptors.
	 * Only attributes and limits are initialized, the effective
	 * base address is programmed via fsbase/gsbase.
	 */
	set_usegd(&gdt[GDT_LWPFS], NULL, (size_t)-1, SDT_MEMRWA, SEL_UPL,
	    SDP_PAGES, SDP_OP32);
	set_usegd(&gdt[GDT_LWPGS], NULL, (size_t)-1, SDT_MEMRWA, SEL_UPL,
	    SDP_PAGES, SDP_OP32);

	/*
	 * Initialize the descriptors set aside for brand usage.
	 * Only attributes and limits are initialized.
	 */
	for (i = GDT_BRANDMIN; i <= GDT_BRANDMAX; i++)
		set_usegd(&gdt[i], NULL, (size_t)-1, SDT_MEMRWA, SEL_UPL,
		    SDP_PAGES, SDP_OP32);
	/*
	 * Initialize convenient zero base user descriptor for clearing
	 * lwp private %fs and %gs descriptors in GDT. See setregs() for
	 * an example.
	 */
	set_usegd(&zero_udesc, NULL, -1, SDT_MEMRWA, SEL_UPL,
	    SDP_BYTES, SDP_OP32);
}
#if defined(__xpv)

static user_desc_t *
init_gdt(void)
{
	uint64_t gdtpa;
	ulong_t ma[1];		/* XXPV should be a memory_t */

	/*
	 * Our gdt is never larger than a single page.
	 */
	ASSERT((sizeof (*gdt0) * NGDT) <= PAGESIZE);
	gdt0 = (user_desc_t *)BOP_ALLOC(bootops, (caddr_t)GDT_VA,
	    PAGESIZE, PAGESIZE);
	bzero(gdt0, PAGESIZE);

	init_gdt_common(gdt0);
	gdtpa = pfn_to_pa(va_to_pfn(gdt0));

	/*
	 * XXX Since we never invoke kmdb until after the kernel takes
	 * over the descriptor tables why not have it use the kernel's
	 * selectors?
	 */
	if (boothowto & RB_DEBUG) {
		set_usegd(&gdt0[GDT_B32DATA], NULL, -1, SDT_MEMRWA, SEL_KPL,
		    SDP_PAGES, SDP_OP32);
		set_usegd(&gdt0[GDT_B32CODE], NULL, -1, SDT_MEMERA, SEL_KPL,
		    SDP_PAGES, SDP_OP32);
	}

	/*
	 * Clear write permission for page containing the gdt and install it.
	 */
	ma[0] = (ulong_t)(pa_to_ma(gdtpa) >> PAGESHIFT);
	kbm_read_only((uintptr_t)gdt0, gdtpa);
	xen_set_gdt(ma, NGDT);

	/*
	 * Reload the segment registers to use the new GDT
	 */
	load_segment_registers(
	    KCS_SEL, KDS_SEL, KDS_SEL, KFS_SEL, KGS_SEL, KDS_SEL);

	return (gdt0);
}
#else	/* __xpv */

static user_desc_t *
init_gdt(void)
{
	desctbr_t	r_bgdt, r_gdt;
	user_desc_t	*bgdt;

	/*
	 * Our gdt is never larger than a single page.
	 */
	ASSERT((sizeof (*gdt0) * NGDT) <= PAGESIZE);
	/*
	 * XXX this allocation belongs in our caller, not here.
	 */
	gdt0 = (user_desc_t *)BOP_ALLOC(bootops, (caddr_t)GDT_VA,
	    PAGESIZE, PAGESIZE);
	bzero(gdt0, PAGESIZE);

	init_gdt_common(gdt0);

	/*
	 * Copy in from boot's gdt to our gdt entries.
	 * Entry 0 is null descriptor by definition.
	 */
	rd_gdtr(&r_bgdt);
	bgdt = (user_desc_t *)r_bgdt.dtr_base;
	if (bgdt == NULL)
		panic("null boot gdt");

	gdt0[GDT_B32DATA] = bgdt[GDT_B32DATA];
	gdt0[GDT_B32CODE] = bgdt[GDT_B32CODE];
	gdt0[GDT_B16CODE] = bgdt[GDT_B16CODE];
	gdt0[GDT_B16DATA] = bgdt[GDT_B16DATA];

	/*
	 * Install our new GDT
	 */
	r_gdt.dtr_limit = (sizeof (*gdt0) * NGDT) - 1;
	r_gdt.dtr_base = (uintptr_t)gdt0;
	wr_gdtr(&r_gdt);

	/*
	 * Reload the segment registers to use the new GDT
	 */
	load_segment_registers(
	    KCS_SEL, KDS_SEL, KDS_SEL, KFS_SEL, KGS_SEL, KDS_SEL);

	return (gdt0);
}

#endif	/* __xpv */
#endif	/* __i386 */
/*
 * Build kernel IDT.
 *
 * Note that for amd64 we pretty much require every gate to be an interrupt
 * gate which blocks interrupts atomically on entry; that's because of our
 * dependency on using 'swapgs' every time we come into the kernel to find
 * the cpu structure. If we get interrupted just before doing that, %cs could
 * be in kernel mode (so that the trap prolog doesn't do a swapgs), but
 * %gsbase is really still pointing at something in userland. Bad things will
 * ensue. We use interrupt gates for i386 as well, even though this is not
 * required for some traps.
 *
 * Perhaps they should have invented a trap gate that does an atomic swapgs?
 */
static void
init_idt_common(gate_desc_t *idt)
{
	set_gatesegd(&idt[T_ZERODIV], &div0trap, KCS_SEL, SDT_SYSIGT, TRP_KPL,
	    0);
	set_gatesegd(&idt[T_SGLSTP], &dbgtrap, KCS_SEL, SDT_SYSIGT, TRP_KPL,
	    0);
	set_gatesegd(&idt[T_NMIFLT], &nmiint, KCS_SEL, SDT_SYSIGT, TRP_KPL,
	    0);
	set_gatesegd(&idt[T_BPTFLT], &brktrap, KCS_SEL, SDT_SYSIGT, TRP_UPL,
	    0);
	set_gatesegd(&idt[T_OVFLW], &ovflotrap, KCS_SEL, SDT_SYSIGT, TRP_UPL,
	    0);
	set_gatesegd(&idt[T_BOUNDFLT], &boundstrap, KCS_SEL, SDT_SYSIGT,
	    TRP_KPL, 0);
	set_gatesegd(&idt[T_ILLINST], &invoptrap, KCS_SEL, SDT_SYSIGT, TRP_KPL,
	    0);
	set_gatesegd(&idt[T_NOEXTFLT], &ndptrap, KCS_SEL, SDT_SYSIGT, TRP_KPL,
	    0);

	/*
	 * double fault handler.
	 *
	 * Note that on the hypervisor a guest does not receive #df faults.
	 * Instead a failsafe event is injected into the guest if its selectors
	 * and/or stack is in a broken state. See xen_failsafe_callback.
	 */
#if defined(__amd64)

	set_gatesegd(&idt[T_DBLFLT], &syserrtrap, KCS_SEL, SDT_SYSIGT, TRP_KPL,
	    T_DBLFLT);

#elif defined(__i386)

	/*
	 * task gate required.
	 */
	set_gatesegd(&idt[T_DBLFLT], NULL, DFTSS_SEL, SDT_SYSTASKGT, TRP_KPL,
	    0);

#endif	/* __i386 */

	/*
	 * T_EXTOVRFLT coprocessor-segment-overrun not supported.
	 */

	set_gatesegd(&idt[T_TSSFLT], &invtsstrap, KCS_SEL, SDT_SYSIGT, TRP_KPL,
	    0);
	set_gatesegd(&idt[T_SEGFLT], &segnptrap, KCS_SEL, SDT_SYSIGT, TRP_KPL,
	    0);
	set_gatesegd(&idt[T_STKFLT], &stktrap, KCS_SEL, SDT_SYSIGT, TRP_KPL, 0);
	set_gatesegd(&idt[T_GPFLT], &gptrap, KCS_SEL, SDT_SYSIGT, TRP_KPL, 0);
	set_gatesegd(&idt[T_PGFLT], &pftrap, KCS_SEL, SDT_SYSIGT, TRP_KPL, 0);
	set_gatesegd(&idt[T_EXTERRFLT], &ndperr, KCS_SEL, SDT_SYSIGT, TRP_KPL,
	    0);
	set_gatesegd(&idt[T_ALIGNMENT], &achktrap, KCS_SEL, SDT_SYSIGT,
	    TRP_KPL, 0);
	set_gatesegd(&idt[T_MCE], &mcetrap, KCS_SEL, SDT_SYSIGT, TRP_KPL, 0);
	set_gatesegd(&idt[T_SIMDFPE], &xmtrap, KCS_SEL, SDT_SYSIGT, TRP_KPL, 0);

	/*
	 * install fast trap handler at 210.
	 */
	set_gatesegd(&idt[T_FASTTRAP], &fasttrap, KCS_SEL, SDT_SYSIGT, TRP_UPL,
	    0);

	/*
	 * System call handler.
	 */
#if defined(__amd64)
	set_gatesegd(&idt[T_SYSCALLINT], &sys_syscall_int, KCS_SEL, SDT_SYSIGT,
	    TRP_UPL, 0);

#elif defined(__i386)
	set_gatesegd(&idt[T_SYSCALLINT], &sys_call, KCS_SEL, SDT_SYSIGT,
	    TRP_UPL, 0);
#endif	/* __i386 */

	/*
	 * Install the DTrace interrupt handler for the pid provider.
	 */
	set_gatesegd(&idt[T_DTRACE_RET], &dtrace_ret, KCS_SEL,
	    SDT_SYSIGT, TRP_UPL, 0);

	/*
	 * Prepare the interposing descriptor for the syscall handler
	 * and cache a copy of the default descriptor.
	 */
	brand_tbl[0].ih_inum = T_SYSCALLINT;
	brand_tbl[0].ih_default_desc = idt0[T_SYSCALLINT];

#if defined(__amd64)
	set_gatesegd(&(brand_tbl[0].ih_interp_desc), &brand_sys_syscall_int,
	    KCS_SEL, SDT_SYSIGT, TRP_UPL, 0);
#elif defined(__i386)
	set_gatesegd(&(brand_tbl[0].ih_interp_desc), &brand_sys_call,
	    KCS_SEL, SDT_SYSIGT, TRP_UPL, 0);
#endif	/* __i386 */

	brand_tbl[1].ih_inum = 0;
}
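
/*
 * The zero ih_inum stored above is the NULL signpost mentioned at the
 * brand_tbl[] definition: brand_interpositioning_{enable,disable}()
 * walk the table until ih_inum == 0, so only the T_SYSCALLINT entry is
 * ever swapped.
 */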
static void
init_idt(gate_desc_t *idt)
{
	char	ivctname[80];
	void	(*ivctptr)(void);
	int	i;

	/*
	 * Initialize entire table with 'reserved' trap and then overwrite
	 * specific entries. T_EXTOVRFLT (9) is unsupported and reserved
	 * since it can only be generated on a 386 processor. 15 is also
	 * unsupported and reserved.
	 */
	for (i = 0; i < NIDT; i++)
		set_gatesegd(&idt[i], &resvtrap, KCS_SEL, SDT_SYSIGT, TRP_KPL,
		    0);

	/*
	 * 20-31 reserved
	 */
	for (i = 20; i < 32; i++)
		set_gatesegd(&idt[i], &invaltrap, KCS_SEL, SDT_SYSIGT, TRP_KPL,
		    0);

	/*
	 * interrupts 32 - 255
	 */
	for (i = 32; i < 256; i++) {
		(void) snprintf(ivctname, sizeof (ivctname), "ivct%d", i);
		ivctptr = (void (*)(void))kobj_getsymvalue(ivctname, 0);
		if (ivctptr == NULL)
			panic("kobj_getsymvalue(%s) failed", ivctname);

		set_gatesegd(&idt[i], ivctptr, KCS_SEL, SDT_SYSIGT, TRP_KPL, 0);
	}

	/*
	 * Now install the common ones. Note that it will overlay some
	 * entries installed above like T_SYSCALLINT, T_FASTTRAP etc.
	 */
	init_idt_common(idt);
}
/*
 * The kernel does not deal with LDTs unless a user explicitly creates
 * one. Under normal circumstances, the LDTR contains 0. Any process attempting
 * to reference the LDT will therefore cause a #gp. System calls made via the
 * obsolete lcall mechanism are emulated by the #gp fault handler.
 */
static void
init_ldt(void)
{
	wr_ldtr(0);
}
#if defined(__amd64)

static void
init_tss(void)
{
	/*
	 * tss_rsp0 is dynamically filled in by resume() on each context
	 * switch. All exceptions but #DF will run on the thread stack.
	 * Set up the double fault stack here.
	 */
	ktss0->tss_ist1 =
	    (uint64_t)&dblfault_stack0[sizeof (dblfault_stack0)];

	/*
	 * Set I/O bit map offset equal to size of TSS segment limit
	 * for no I/O permission map. This will force all user I/O
	 * instructions to generate #gp fault.
	 */
	ktss0->tss_bitmapbase = sizeof (*ktss0);

	/*
	 * Point %tr to descriptor for ktss0 in gdt.
	 */
	wr_tsr(KTSS_SEL);
}
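
/*
 * Note (illustrative): tss_ist1 points one element past the end of
 * dblfault_stack0 because the stack grows downward; the #DF handler's
 * first push lands at the top of the reserved 16-byte-aligned buffer.
 */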
#elif defined(__i386)

static void
init_tss(void)
{
	/*
	 * ktss0->tss_esp dynamically filled in by resume() on each
	 * context switch.
	 */
	ktss0->tss_ss0 = KDS_SEL;
	ktss0->tss_eip = (uint32_t)_start;
	ktss0->tss_ds = ktss0->tss_es = ktss0->tss_ss = KDS_SEL;
	ktss0->tss_cs = KCS_SEL;
	ktss0->tss_fs = KFS_SEL;
	ktss0->tss_gs = KGS_SEL;
	ktss0->tss_ldt = ULDT_SEL;

	/*
	 * Initialize double fault tss.
	 */
	dftss0->tss_esp0 = (uint32_t)&dblfault_stack0[sizeof (dblfault_stack0)];
	dftss0->tss_ss0 = KDS_SEL;

	/*
	 * tss_cr3 will get initialized in hat_kern_setup() once our page
	 * tables have been setup.
	 */
	dftss0->tss_eip = (uint32_t)syserrtrap;
	dftss0->tss_esp = (uint32_t)&dblfault_stack0[sizeof (dblfault_stack0)];
	dftss0->tss_cs = KCS_SEL;
	dftss0->tss_ds = KDS_SEL;
	dftss0->tss_es = KDS_SEL;
	dftss0->tss_ss = KDS_SEL;
	dftss0->tss_fs = KFS_SEL;
	dftss0->tss_gs = KGS_SEL;

	/*
	 * Set I/O bit map offset equal to size of TSS segment limit
	 * for no I/O permission map. This will force all user I/O
	 * instructions to generate #gp fault.
	 */
	ktss0->tss_bitmapbase = sizeof (*ktss0);

	/*
	 * Point %tr to descriptor for ktss0 in gdt.
	 */
	wr_tsr(KTSS_SEL);
}

#endif	/* __i386 */
void
init_desctbls(void)
{
	user_desc_t *gdt;
	desctbr_t idtr;

	/*
	 * Allocate IDT and TSS structures on unique pages for better
	 * performance in virtual machines.
	 */
	ASSERT(NIDT * sizeof (*idt0) <= PAGESIZE);
	idt0 = (gate_desc_t *)BOP_ALLOC(bootops, (caddr_t)IDT_VA,
	    PAGESIZE, PAGESIZE);
	bzero(idt0, PAGESIZE);
	ASSERT(sizeof (*ktss0) <= PAGESIZE);
	ktss0 = (tss_t *)BOP_ALLOC(bootops, (caddr_t)KTSS_VA,
	    PAGESIZE, PAGESIZE);
	bzero(ktss0, PAGESIZE);

#if defined(__i386)
	ASSERT(sizeof (*dftss0) <= PAGESIZE);
	dftss0 = (tss_t *)BOP_ALLOC(bootops, (caddr_t)DFTSS_VA,
	    PAGESIZE, PAGESIZE);
	bzero(dftss0, PAGESIZE);
#endif

	/*
	 * Setup and install our GDT.
	 */
	gdt = init_gdt();
	ASSERT(IS_P2ALIGNED((uintptr_t)gdt, PAGESIZE));
	CPU->cpu_gdt = gdt;

	/*
	 * Setup and install our IDT.
	 */
	init_idt(idt0);

	idtr.dtr_base = (uintptr_t)idt0;
	idtr.dtr_limit = (NIDT * sizeof (*idt0)) - 1;
	wr_idtr(&idtr);
	CPU->cpu_idt = idt0;

#if defined(__i386)
	/*
	 * We maintain a description of idt0 in convenient IDTR format
	 * for #pf's on some older pentium processors. See pentium_pftrap().
	 */
	idt0_default_r = idtr;
#endif	/* __i386 */

	init_tss();
	CPU->cpu_tss = ktss0;

	init_ldt();
}
/*
 * In the early kernel, we need to set up a simple GDT to run on.
 *
 * XXPV	Can dboot use this too? See dboot_gdt.s
 */
void
init_boot_gdt(user_desc_t *bgdt)
{
#if defined(__amd64)
	set_usegd(&bgdt[GDT_B32DATA], SDP_LONG, NULL, -1, SDT_MEMRWA, SEL_KPL,
	    SDP_PAGES, SDP_OP32);
	set_usegd(&bgdt[GDT_B64CODE], SDP_LONG, NULL, -1, SDT_MEMERA, SEL_KPL,
	    SDP_PAGES, SDP_OP32);
#elif defined(__i386)
	set_usegd(&bgdt[GDT_B32DATA], NULL, -1, SDT_MEMRWA, SEL_KPL,
	    SDP_PAGES, SDP_OP32);
	set_usegd(&bgdt[GDT_B32CODE], NULL, -1, SDT_MEMERA, SEL_KPL,
	    SDP_PAGES, SDP_OP32);
#endif	/* __i386 */
}
/*
 * Enable interpositioning on the system call path by rewriting the
 * sys{call|enter} MSRs and the syscall-related entries in the IDT to use
 * the branded entry points.
 */
void
brand_interpositioning_enable(void)
{
	gate_desc_t	*idt = CPU->cpu_idt;
	int		i;

	ASSERT(curthread->t_preempt != 0 || getpil() >= DISP_LEVEL);

	for (i = 0; brand_tbl[i].ih_inum; i++) {
		idt[brand_tbl[i].ih_inum] = brand_tbl[i].ih_interp_desc;
	}

#if defined(__amd64)
#if defined(__xpv)

	/*
	 * Currently the hypervisor only supports 64-bit syscalls via
	 * the syscall instruction. The 32-bit syscalls are handled by
	 * the interrupt gate above.
	 */
	xen_set_callback(brand_sys_syscall, CALLBACKTYPE_syscall,
	    CALLBACKF_mask_events);

#else

	if (is_x86_feature(x86_featureset, X86FSET_ASYSC)) {
		wrmsr(MSR_AMD_LSTAR, (uintptr_t)brand_sys_syscall);
		wrmsr(MSR_AMD_CSTAR, (uintptr_t)brand_sys_syscall32);
	}

#endif
#endif	/* __amd64 */

	if (is_x86_feature(x86_featureset, X86FSET_SEP))
		wrmsr(MSR_INTC_SEP_EIP, (uintptr_t)brand_sys_sysenter);
}
/*
 * Disable interpositioning on the system call path by rewriting the
 * sys{call|enter} MSRs and the syscall-related entries in the IDT to use
 * the standard entry points, which bypass the interpositioning hooks.
 */
void
brand_interpositioning_disable(void)
{
	gate_desc_t	*idt = CPU->cpu_idt;
	int		i;

	ASSERT(curthread->t_preempt != 0 || getpil() >= DISP_LEVEL);

	for (i = 0; brand_tbl[i].ih_inum; i++) {
		idt[brand_tbl[i].ih_inum] = brand_tbl[i].ih_default_desc;
	}

#if defined(__amd64)
#if defined(__xpv)

	/*
	 * See comment above in brand_interpositioning_enable.
	 */
	xen_set_callback(sys_syscall, CALLBACKTYPE_syscall,
	    CALLBACKF_mask_events);

#else

	if (is_x86_feature(x86_featureset, X86FSET_ASYSC)) {
		wrmsr(MSR_AMD_LSTAR, (uintptr_t)sys_syscall);
		wrmsr(MSR_AMD_CSTAR, (uintptr_t)sys_syscall32);
	}

#endif
#endif	/* __amd64 */

	if (is_x86_feature(x86_featureset, X86FSET_SEP))
		wrmsr(MSR_INTC_SEP_EIP, (uintptr_t)sys_sysenter);
}