/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
 */
/*
 * Copyright (c) 2010, Intel Corporation.
 * All rights reserved.
 */
/*
 * Copyright 2018 Joyent, Inc.
 */
/*
 * Welcome to the world of the "real mode platter".
 * See also startup.c, mpcore.s and apic.c for related routines.
 */

#include <sys/types.h>
#include <sys/systm.h>
#include <sys/cpuvar.h>
#include <sys/cpu_module.h>
#include <sys/kmem.h>
#include <sys/archsystm.h>
#include <sys/machsystm.h>
#include <sys/controlregs.h>
#include <sys/x86_archext.h>
#include <sys/smp_impldefs.h>
#include <sys/sysmacros.h>
#include <sys/mach_mmu.h>
#include <sys/promif.h>
#include <sys/cpu.h>
#include <sys/cpu_event.h>
#include <sys/sunndi.h>
#include <sys/fs/dv_node.h>
#include <vm/hat_i86.h>
extern cpuset_t cpu_ready_set;

extern int mp_start_cpu_common(cpu_t *cp, boolean_t boot);
extern void real_mode_start_cpu(void);
extern void real_mode_start_cpu_end(void);
extern void real_mode_stop_cpu_stage1(void);
extern void real_mode_stop_cpu_stage1_end(void);
extern void real_mode_stop_cpu_stage2(void);
extern void real_mode_stop_cpu_stage2_end(void);

void rmp_gdt_init(rm_platter_t *);
/*
 * Fill up the real mode platter to make it easy for real mode code to
 * kick it off. This area should really be one passed by boot to kernel
 * and guaranteed to be below 1MB and aligned to 16 bytes. Should also
 * have identical physical and virtual address in paged mode.
 */
static ushort_t *warm_reset_vector = NULL;
int
mach_cpucontext_init(void)
{
	ushort_t *vec;
	ulong_t addr;
	struct rm_platter *rm = (struct rm_platter *)rm_platter_va;

	if (!(vec = (ushort_t *)psm_map_phys(WARM_RESET_VECTOR,
	    sizeof (vec), PROT_READ | PROT_WRITE)))
		return (-1);
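
	/*
	 * WARM_RESET_VECTOR is the BIOS data area warm-reset far pointer
	 * at 0040:0067. After an INIT with the CMOS shutdown code set for
	 * a warm start, the BIOS jumps through this vector, which we aim
	 * at the startup code in the platter below.
	 */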

	/*
	 * Set up the secondary cpu BIOS warm reset vector: write the page
	 * offset to 0x467 and the real-mode segment to 0x469.
	 */
	addr = (ulong_t)((caddr_t)rm->rm_code - (caddr_t)rm) + rm_platter_pa;
	vec[0] = (ushort_t)(addr & PAGEOFFSET);
	vec[1] = (ushort_t)((addr & (0xfffff & PAGEMASK)) >> 4);
	warm_reset_vector = vec;
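
	/*
	 * Together, vec[0] and vec[1] form a real-mode offset:segment far
	 * pointer. For example, with the platter at physical 0x3000 and
	 * rm_code at offset 0, addr is 0x3000 and the vector becomes
	 * 0300:0000.
	 */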

	/* Map the real mode platter into kas so the kernel can access it. */
	hat_devload(kas.a_hat,
	    (caddr_t)(uintptr_t)rm_platter_pa, MMU_PAGESIZE,
	    btop(rm_platter_pa), PROT_READ | PROT_WRITE | PROT_EXEC,
	    HAT_LOAD_NOCONSIST);

	/* Copy the CPU startup code to the rm_platter if still during boot. */
	if (!plat_dr_enabled()) {
		ASSERT((size_t)real_mode_start_cpu_end -
		    (size_t)real_mode_start_cpu <= RM_PLATTER_CODE_SIZE);
		bcopy((caddr_t)real_mode_start_cpu, (caddr_t)rm->rm_code,
		    (size_t)real_mode_start_cpu_end -
		    (size_t)real_mode_start_cpu);
	}

	return (0);
}
void
mach_cpucontext_fini(void)
{
	if (warm_reset_vector)
		psm_unmap_phys((caddr_t)warm_reset_vector,
		    sizeof (warm_reset_vector));
	hat_unload(kas.a_hat, (caddr_t)(uintptr_t)rm_platter_pa, MMU_PAGESIZE,
	    HAT_UNLOAD);
}

extern void *long_mode_64(void);

void
rmp_gdt_init(rm_platter_t *rm)
{
	/* Use the kas address space for the CPU startup thread. */
	if (mmu_ptob(kas.a_hat->hat_htable->ht_pfn) > 0xffffffffUL) {
		panic("Cannot initialize CPUs; kernel's 64-bit page tables\n"
		    "located above 4G in physical memory (@ 0x%lx)",
		    mmu_ptob(kas.a_hat->hat_htable->ht_pfn));
	}

	/*
	 * Setup pseudo-descriptors for temporary GDT and IDT for use ONLY
	 * by code in real_mode_start_cpu():
	 *
	 * GDT[0]:	NULL selector
	 * GDT[1]:	64-bit CS: Long = 1, Present = 1, bits 12, 11 = 1
	 *
	 * Clear the IDT as interrupts will be off and a limit of 0 will cause
	 * the CPU to triple fault and reset on an NMI, seemingly as reasonable
	 * a course of action as any other, though it may cause the entire
	 * platform to reset in some cases...
	 */
	rm->rm_temp_gdt[0] = 0;
	rm->rm_temp_gdt[TEMPGDT_KCODE64] = 0x20980000000000ULL;
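
	/*
	 * Decoding the descriptor above: byte 5 is 0x98 (P = 1, DPL = 0,
	 * S = 1, type = 1000b: execute-only, non-conforming code) and
	 * byte 6 is 0x20 (L = 1, D = 0, G = 0). Base and limit are zero,
	 * as both are ignored for a long mode code segment.
	 */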

	rm->rm_temp_gdt_lim = (ushort_t)(sizeof (rm->rm_temp_gdt) - 1);
	rm->rm_temp_gdt_base = rm_platter_pa +
	    (uint32_t)offsetof(rm_platter_t, rm_temp_gdt);
	rm->rm_temp_idt_lim = 0;
	rm->rm_temp_idt_base = 0;

	/*
	 * Since the CPU needs to jump to protected mode using an identity
	 * mapped address, we need to calculate it here.
	 */
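	/*
	 * (long_mode_64 lies within the real_mode_start_cpu text that was
	 * copied to rm_code at the base of the platter, so its offset from
	 * real_mode_start_cpu plus rm_platter_pa is exactly its
	 * identity-mapped physical address.)
	 */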
	rm->rm_longmode64_addr = rm_platter_pa +
	    (uint32_t)((uintptr_t)long_mode_64 -
	    (uintptr_t)real_mode_start_cpu);
}

void *
mach_cpucontext_alloc_tables(struct cpu *cp)
{
	tss_t *ntss;
	struct cpu_tables *ct;
	size_t ctsize;

	/*
	 * Allocate space for stack, tss, gdt and idt. We round the size
	 * allotted for cpu_tables up, so that the TSS is on a unique page.
	 * This is more efficient when running in virtual machines.
	 */
	ctsize = P2ROUNDUP(sizeof (*ct), PAGESIZE);
	ct = kmem_zalloc(ctsize, KM_SLEEP);
	if ((uintptr_t)ct & PAGEOFFSET)
		panic("mach_cpucontext_alloc_tables: cpu%d misaligned tables",
		    cp->cpu_id);

	ntss = cp->cpu_tss = &ct->ct_tss;
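
	/*
	 * On amd64, each tss_istN slot holds the stack pointer the hardware
	 * loads into RSP when taking the corresponding interrupt, so each
	 * points one past the end of its stack buffer; pushes then grow
	 * downward into the buffer.
	 */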
#if defined(__amd64)
	uintptr_t va;
	size_t len;

	/*
	 * #DF (double fault).
	 */
	ntss->tss_ist1 = (uintptr_t)&ct->ct_stack1[sizeof (ct->ct_stack1)];

	/*
	 * #NMI (non-maskable interrupt)
	 */
	ntss->tss_ist2 = (uintptr_t)&ct->ct_stack2[sizeof (ct->ct_stack2)];

	/*
	 * #MC (machine check exception / hardware error)
	 */
	ntss->tss_ist3 = (uintptr_t)&ct->ct_stack3[sizeof (ct->ct_stack3)];

	/*
	 * #DB, #BP debug interrupts and KDI/kmdb
	 */
	ntss->tss_ist4 = (uintptr_t)&cp->cpu_m.mcpu_kpti_dbg.kf_tr_rsp;

	if (kpti_enable == 1) {
		/*
		 * #GP, #PF, #SS fault interrupts
		 */
		ntss->tss_ist5 = (uintptr_t)&cp->cpu_m.mcpu_kpti_flt.kf_tr_rsp;

		/*
		 * Used by all other interrupts
		 */
		ntss->tss_ist6 = (uint64_t)&cp->cpu_m.mcpu_kpti.kf_tr_rsp;

		/*
		 * On AMD64 we need to make sure that all of the pages of the
		 * struct cpu_tables are punched through onto the user CPU
		 * for kpti.
		 *
		 * The final page will always be the TSS, so treat that one
		 * separately.
		 */
		for (va = (uintptr_t)ct, len = ctsize - MMU_PAGESIZE;
		    len >= MMU_PAGESIZE;
		    len -= MMU_PAGESIZE, va += MMU_PAGESIZE) {
			/* The doublefault stack must be RW */
			hati_cpu_punchin(cp, va, PROT_READ | PROT_WRITE);
		}
		ASSERT3U((uintptr_t)ntss, ==, va);
		hati_cpu_punchin(cp, (uintptr_t)ntss, PROT_READ);
	}
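
	/*
	 * Punching the TSS page read-only is sufficient: in long mode the
	 * hardware never writes the TSS (the busy bit lives in the GDT
	 * descriptor, not in the TSS itself); it only reads the RSP/IST
	 * values out of it.
	 */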

#elif defined(__i386)

	ntss->tss_esp0 = ntss->tss_esp1 = ntss->tss_esp2 = ntss->tss_esp =
	    (uint32_t)&ct->ct_stack1[sizeof (ct->ct_stack1)];

	ntss->tss_ss0 = ntss->tss_ss1 = ntss->tss_ss2 = ntss->tss_ss = KDS_SEL;

	ntss->tss_eip = (uint32_t)cp->cpu_thread->t_pc;

	ntss->tss_cs = KCS_SEL;
	ntss->tss_ds = ntss->tss_es = KDS_SEL;
	ntss->tss_fs = KFS_SEL;
	ntss->tss_gs = KGS_SEL;

#endif	/* __i386 */

	/*
	 * Set the I/O bit map offset equal to the TSS segment limit to
	 * indicate that there is no I/O permission map. This causes all
	 * user I/O instructions to generate a #GP fault.
	 */
	ntss->tss_bitmapbase = sizeof (*ntss);

	/*
	 * Setup kernel tss.
	 */
	set_syssegd((system_desc_t *)&cp->cpu_gdt[GDT_KTSS], cp->cpu_tss,
	    sizeof (*cp->cpu_tss) - 1, SDT_SYSTSS, SEL_KPL);

	return (ct);
}

void *
mach_cpucontext_xalloc(struct cpu *cp, int optype)
{
	size_t len;
	struct cpu_tables *ct;
	rm_platter_t *rm = (rm_platter_t *)rm_platter_va;
	static int cpu_halt_code_ready;

	if (optype == MACH_CPUCONTEXT_OP_STOP) {
		ASSERT(plat_dr_enabled());

		/*
		 * The WARM_RESET_VECTOR has a limitation that the physical
		 * address written to it must be page-aligned. To work around
		 * this limitation, the CPU stop code has been split into
		 * two stages.
		 *
		 * The stage 2 code, which implements the real logic to halt
		 * CPUs, is copied to the rm_cpu_halt_code field in the real
		 * mode platter. The stage 1 code, which simply jumps to the
		 * stage 2 code in the rm_cpu_halt_code field, is copied to
		 * the rm_code field in the real mode platter and it may be
		 * overwritten after the CPU has been stopped.
		 */
		if (!cpu_halt_code_ready) {
			/*
			 * The rm_cpu_halt_code field in the real mode platter
			 * is used by the CPU stop code only, so copy the CPU
			 * stop stage 2 code into the rm_cpu_halt_code field
			 * on the first call only.
			 */
			len = (size_t)real_mode_stop_cpu_stage2_end -
			    (size_t)real_mode_stop_cpu_stage2;
			ASSERT(len <= RM_PLATTER_CPU_HALT_CODE_SIZE);
			bcopy((caddr_t)real_mode_stop_cpu_stage2,
			    (caddr_t)rm->rm_cpu_halt_code, len);
			cpu_halt_code_ready = 1;
		}

		/*
		 * The rm_code field in the real mode platter is shared by
		 * the CPU start, CPU stop, CPR and fast reboot code. So copy
		 * the CPU stop stage 1 code into the rm_code field every time.
		 */
		len = (size_t)real_mode_stop_cpu_stage1_end -
		    (size_t)real_mode_stop_cpu_stage1;
		ASSERT(len <= RM_PLATTER_CODE_SIZE);
		bcopy((caddr_t)real_mode_stop_cpu_stage1,
		    (caddr_t)rm->rm_code, len);
		rm->rm_cpu_halted = 0;
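
		/*
		 * rm_cpu_halted is the handshake word: the stage 2 halt code
		 * (see mpcore.s) sets it to 0xdead once the CPU is parked,
		 * and mp_cpu_poweroff() below polls for that value.
		 */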

		return (cp->cpu_m.mcpu_mach_ctx_ptr);
	} else if (optype != MACH_CPUCONTEXT_OP_START) {
		return (NULL);
	}

	/*
	 * Only need to allocate tables when starting CPU.
	 * Tables allocated when starting CPU will be reused when stopping CPU.
	 */
	ct = mach_cpucontext_alloc_tables(cp);

	/* Copy CPU startup code to rm_platter for CPU hot-add operations. */
	if (plat_dr_enabled()) {
		bcopy((caddr_t)real_mode_start_cpu, (caddr_t)rm->rm_code,
		    (size_t)real_mode_start_cpu_end -
		    (size_t)real_mode_start_cpu);
	}

	/*
	 * Now copy all that we've set up onto the real mode platter
	 * for the real mode code to digest as part of starting the cpu.
	 */
	rm->rm_idt_base = cp->cpu_idt;
	rm->rm_idt_lim = sizeof (*cp->cpu_idt) * NIDT - 1;
	rm->rm_gdt_base = cp->cpu_gdt;
	rm->rm_gdt_lim = sizeof (*cp->cpu_gdt) * NGDT - 1;

	/*
	 * CPU needs to access kernel address space after powering on.
	 */
	rm->rm_pdbr = MAKECR3(kas.a_hat->hat_htable->ht_pfn, PCID_NONE);
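	/*
	 * (rm_pdbr hands the new CPU the kernel's top-level page table so
	 * it can enable paging directly into kas; PCID_NONE matches
	 * CR4_PCIDE being cleared below.)
	 */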
	rm->rm_cpu = cp->cpu_id;

	/*
	 * We need to mask off any bits set on our boot CPU that can't apply
	 * while the subject CPU is initializing. If appropriate, they are
	 * enabled later on.
	 */
	rm->rm_cr4 = getcr4();
	rm->rm_cr4 &= ~(CR4_MCE | CR4_PCE | CR4_PCIDE);

	rmp_gdt_init(rm);

	return (ct);
}

void
mach_cpucontext_xfree(struct cpu *cp, void *arg, int err, int optype)
{
	struct cpu_tables *ct = arg;

	ASSERT(&ct->ct_tss == cp->cpu_tss);
	if (optype == MACH_CPUCONTEXT_OP_START) {
		switch (err) {
		case 0:
			/*
			 * Save pointer for reuse when stopping CPU.
			 */
			cp->cpu_m.mcpu_mach_ctx_ptr = arg;
			break;
		case EAGAIN:
			/*
			 * The processor was poked, but failed to start before
			 * we gave up waiting for it. In case it starts later,
			 * don't free anything.
			 */
			cp->cpu_m.mcpu_mach_ctx_ptr = arg;
			break;
		default:
			/*
			 * Some other, passive, error occurred.
			 */
			kmem_free(ct, P2ROUNDUP(sizeof (*ct), PAGESIZE));
			cp->cpu_tss = NULL;
			break;
		}
	} else if (optype == MACH_CPUCONTEXT_OP_STOP) {
		switch (err) {
		case 0:
			/*
			 * Free resources allocated when starting CPU.
			 */
			kmem_free(ct, P2ROUNDUP(sizeof (*ct), PAGESIZE));
			cp->cpu_tss = NULL;
			cp->cpu_m.mcpu_mach_ctx_ptr = NULL;
			break;
		default:
			/*
			 * Don't touch table pointer in case of failure.
			 */
			break;
		}
	}
}

void *
mach_cpucontext_alloc(struct cpu *cp)
{
	return (mach_cpucontext_xalloc(cp, MACH_CPUCONTEXT_OP_START));
}

void
mach_cpucontext_free(struct cpu *cp, void *arg, int err)
{
	mach_cpucontext_xfree(cp, arg, err, MACH_CPUCONTEXT_OP_START);
}
441 * "Enter monitor." Called via cross-call from stop_other_cpus().
444 mach_cpu_halt(char *msg
)
447 prom_printf("%s\n", msg
);
449 /*CONSTANTCONDITION*/

void
mach_cpu_pause(volatile char *safe)
{
	/*
	 * This cpu is now safe.
	 */
	*safe = PAUSE_WAIT;
	membar_enter(); /* make sure stores are flushed */

	/*
	 * Now we wait. When we are allowed to continue, safe
	 * will be set to PAUSE_IDLE.
	 */
	while (*safe != PAUSE_IDLE)
		SMT_PAUSE();
}

/*
 * Power on the target CPU.
 */
int
mp_cpu_poweron(struct cpu *cp)
{
	int error = 0;
	cpuset_t tempset;
	processorid_t cpuid = cp->cpu_id;

	if (use_mp == 0 || plat_dr_support_cpu() == 0) {
		return (ENOTSUP);
	} else if (cpuid < 0 || cpuid >= max_ncpus) {
		return (EINVAL);
	}

	/*
	 * The current x86 implementation of mp_cpu_configure() and
	 * mp_cpu_poweron() has a limitation: mp_cpu_poweron() may only be
	 * called once after calling mp_cpu_configure() for a specific CPU,
	 * because mp_cpu_poweron() destroys the data structures created by
	 * mp_cpu_configure(). So reject the request if the CPU has already
	 * been powered on once after calling mp_cpu_configure().
	 * This limitation only affects the p_online syscall; the DR driver
	 * is unaffected because it always invokes the public CPU management
	 * interfaces in the predefined order:
	 * cpu_configure()->cpu_poweron()...->cpu_poweroff()->cpu_unconfigure()
	 */
	if (cpuid_checkpass(cp, 4) || cp->cpu_thread == cp->cpu_idle_thread) {
		return (ENOTSUP);
	}

	/*
	 * Check if there's at least a Mbyte of kmem available
	 * before attempting to start the cpu.
	 */
	if (kmem_avail() < 1024 * 1024) {
		/*
		 * Kick off a reap in case that helps us with
		 * later attempts ..
		 */
		kmem_reap();
		return (ENOMEM);
	}

	affinity_set(CPU->cpu_id);

	/*
	 * Start the target CPU. No need to call mach_cpucontext_fini()
	 * if mach_cpucontext_init() fails.
	 */
	if ((error = mach_cpucontext_init()) == 0) {
		error = mp_start_cpu_common(cp, B_FALSE);
		mach_cpucontext_fini();
	}
	if (error != 0) {
		affinity_clear();
		return (error);
	}

	/* Wait for the target cpu to reach READY state. */
	tempset = cpu_ready_set;
	while (!CPU_IN_SET(tempset, cpuid)) {
		delay(1);
		tempset = *((volatile cpuset_t *)&cpu_ready_set);
	}

	/* Mark the target CPU as available for mp operation. */
	CPUSET_ATOMIC_ADD(mp_cpus, cpuid);

	/* Free the space allocated to hold the microcode file. */
	ucode_cleanup();

	affinity_clear();

	return (0);
}

#define	MP_CPU_DETACH_MAX_TRIES		5
#define	MP_CPU_DETACH_DELAY		100

static int
mp_cpu_detach_driver(dev_info_t *dip)
{
	int i;
	int rv = EBUSY;
	dev_info_t *pdip;

	pdip = ddi_get_parent(dip);
	ASSERT(pdip != NULL);
	/*
	 * Check if caller holds pdip busy - can cause deadlocks in
	 * e_ddi_branch_unconfigure(), which calls devfs_clean().
	 */
	if (DEVI_BUSY_OWNED(pdip)) {
		return (EDEADLOCK);
	}
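
	/*
	 * e_ddi_branch_unconfigure() can fail transiently (e.g. while devfs
	 * references drain), so retry a bounded number of times with a short
	 * delay rather than failing on the first attempt.
	 */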
	for (i = 0; i < MP_CPU_DETACH_MAX_TRIES; i++) {
		if (e_ddi_branch_unconfigure(dip, NULL, 0) == 0) {
			rv = 0;
			break;
		}
		DELAY(MP_CPU_DETACH_DELAY);
	}

	return (rv);
}

/*
 * Power off the target CPU.
 * Note: cpu_lock will be released and then reacquired.
 */
int
mp_cpu_poweroff(struct cpu *cp)
{
	int rv = 0;
	void *ctx;
	dev_info_t *dip = NULL;
	rm_platter_t *rm = (rm_platter_t *)rm_platter_va;
	extern void cpupm_start(cpu_t *);
	extern void cpupm_stop(cpu_t *);

	ASSERT(cp != NULL);
	ASSERT((cp->cpu_flags & CPU_OFFLINE) != 0);
	ASSERT((cp->cpu_flags & CPU_QUIESCED) != 0);

	if (use_mp == 0 || plat_dr_support_cpu() == 0) {
		return (ENOTSUP);
	}

	/*
	 * There is no support for powering off cpu0 yet.
	 * There are many pieces of code which have a hard dependency on cpu0.
	 */
	if (cp->cpu_id == 0) {
		return (ENOTSUP);
	}

	if (mach_cpu_get_device_node(cp, &dip) != PSM_SUCCESS) {
		return (ENXIO);
	}
	ASSERT(dip != NULL);
	if (mp_cpu_detach_driver(dip) != 0) {
		rv = EBUSY;
		goto out_online;
	}

	/* Allocate CPU context for stopping. */
	if (mach_cpucontext_init() != 0) {
		rv = ENXIO;
		goto out_online;
	}
	ctx = mach_cpucontext_xalloc(cp, MACH_CPUCONTEXT_OP_STOP);
	if (ctx == NULL) {
		rv = ENXIO;
		goto out_context_fini;
	}

	cpupm_stop(cp);
	cpu_event_fini_cpu(cp);

	if (cp->cpu_m.mcpu_cmi_hdl != NULL) {
		cmi_fini(cp->cpu_m.mcpu_cmi_hdl);
		cp->cpu_m.mcpu_cmi_hdl = NULL;
	}

	rv = mach_cpu_stop(cp, ctx);
	if (rv != 0) {
		goto out_enable_cmi;
	}

	/* Wait until the target CPU has been halted. */
	while (*(volatile ushort_t *)&(rm->rm_cpu_halted) != 0xdead) {
		delay(1);
	}
	rm->rm_cpu_halted = 0xffff;

	/* CPU_READY has been cleared by mach_cpu_stop. */
	ASSERT((cp->cpu_flags & CPU_READY) == 0);
	ASSERT((cp->cpu_flags & CPU_RUNNING) == 0);
	cp->cpu_flags = CPU_OFFLINE | CPU_QUIESCED | CPU_POWEROFF;
	CPUSET_ATOMIC_DEL(mp_cpus, cp->cpu_id);

	mach_cpucontext_xfree(cp, ctx, 0, MACH_CPUCONTEXT_OP_STOP);
	mach_cpucontext_fini();

	goto out_online;

out_enable_cmi:
	{
		cmi_hdl_t hdl;

		if ((hdl = cmi_init(CMI_HDL_NATIVE, cmi_ntv_hwchipid(cp),
		    cmi_ntv_hwcoreid(cp), cmi_ntv_hwstrandid(cp))) != NULL) {
			if (is_x86_feature(x86_featureset, X86FSET_MCA))
				cmi_mca_init(hdl);
			cp->cpu_m.mcpu_cmi_hdl = hdl;
		}
	}
	cpu_event_init_cpu(cp);
	cpupm_start(cp);
	mach_cpucontext_xfree(cp, ctx, rv, MACH_CPUCONTEXT_OP_STOP);

out_context_fini:
	mach_cpucontext_fini();

out_online:
	(void) e_ddi_branch_configure(dip, NULL, 0);

	if (rv != EAGAIN && rv != ETIME) {
		/* If we report success, the CPU really is powered off. */
		ASSERT(rv != 0 || (cp->cpu_flags & CPU_POWEROFF) != 0);
	}

	return (rv);
}

/*
 * Return vcpu state. Since this could be a virtual environment that we
 * are unaware of, return "unknown".
 */
/*ARGSUSED*/
int
vcpu_on_pcpu(processorid_t cpu)
{
	return (VCPU_STATE_UNKNOWN);
}