sys/platform/pc64/x86_64/machdep.c

   1 /*-
   2  * Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
   3  * Copyright (c) 1992 Terrence R. Lambert.
   4  * Copyright (c) 2003 Peter Wemm.
   5  * Copyright (c) 2008 The DragonFly Project.
   6  * All rights reserved.
   7  *
   8  * This code is derived from software contributed to Berkeley by
   9  * William Jolitz.
  10  *
  11  * Redistribution and use in source and binary forms, with or without
  12  * modification, are permitted provided that the following conditions
  13  * are met:
  14  * 1. Redistributions of source code must retain the above copyright
  15  *    notice, this list of conditions and the following disclaimer.
  16  * 2. Redistributions in binary form must reproduce the above copyright
  17  *    notice, this list of conditions and the following disclaimer in the
  18  *    documentation and/or other materials provided with the distribution.
  19  * 3. All advertising materials mentioning features or use of this software
  20  *    must display the following acknowledgement:
  21  *      This product includes software developed by the University of
  22  *      California, Berkeley and its contributors.
  23  * 4. Neither the name of the University nor the names of its contributors
  24  *    may be used to endorse or promote products derived from this software
  25  *    without specific prior written permission.
  26  *
  27  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  28  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  29  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  30  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  31  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  32  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  33  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  34  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  35  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  36  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  37  * SUCH DAMAGE.
  38  *
  39  * from: @(#)machdep.c  7.4 (Berkeley) 6/3/91
  40  * $FreeBSD: src/sys/i386/i386/machdep.c,v 1.385.2.30 2003/05/31 08:48:05 alc Exp $
  41  */
  42
  43 //#include "use_npx.h"
  44 #include "use_isa.h"
  45 #include "opt_cpu.h"
  46 #include "opt_ddb.h"
  47 #include "opt_inet.h"
  48 #include "opt_msgbuf.h"
  49 #include "opt_swap.h"
  50
  51 #include <sys/param.h>
  52 #include <sys/systm.h>
  53 #include <sys/sysproto.h>
  54 #include <sys/signalvar.h>
  55 #include <sys/kernel.h>
  56 #include <sys/linker.h>
  57 #include <sys/malloc.h>
  58 #include <sys/proc.h>
  59 #include <sys/priv.h>
  60 #include <sys/buf.h>
  61 #include <sys/reboot.h>
  62 #include <sys/mbuf.h>
  63 #include <sys/msgbuf.h>
  64 #include <sys/sysent.h>
  65 #include <sys/sysctl.h>
  66 #include <sys/vmmeter.h>
  67 #include <sys/bus.h>
  68 #include <sys/usched.h>
  69 #include <sys/reg.h>
  70 #include <sys/sbuf.h>
  71 #include <sys/ctype.h>
  72 #include <sys/serialize.h>
  73 #include <sys/systimer.h>
  74
  75 #include <vm/vm.h>
  76 #include <vm/vm_param.h>
  77 #include <sys/lock.h>
  78 #include <vm/vm_kern.h>
  79 #include <vm/vm_object.h>
  80 #include <vm/vm_page.h>
  81 #include <vm/vm_map.h>
  82 #include <vm/vm_pager.h>
  83 #include <vm/vm_extern.h>
  84
  85 #include <sys/thread2.h>
  86 #include <sys/mplock2.h>
  87 #include <sys/mutex2.h>
  88
  89 #include <sys/user.h>
  90 #include <sys/exec.h>
  91 #include <sys/cons.h>
  92
  93 #include <sys/efi.h>
  94
  95 #include <ddb/ddb.h>
  96
  97 #include <machine/cpu.h>
  98 #include <machine/clock.h>
  99 #include <machine/specialreg.h>
 100 #if 0 /* JG */
 101 #include <machine/bootinfo.h>
 102 #endif
 103 #include <machine/md_var.h>
 104 #include <machine/metadata.h>
 105 #include <machine/pc/bios.h>
 106 #include <machine/pcb_ext.h>            /* pcb.h included via sys/user.h */
 107 #include <machine/globaldata.h>         /* CPU_prvspace */
 108 #include <machine/smp.h>
 109 #include <machine/cputypes.h>
 110 #include <machine/intr_machdep.h>
 111 #include <machine/framebuffer.h>
 112
 113 #ifdef OLD_BUS_ARCH
 114 #include <bus/isa/isa_device.h>
 115 #endif
 116 #include <machine_base/isa/isa_intr.h>
 117 #include <bus/isa/rtc.h>
 118 #include <sys/random.h>
 119 #include <sys/ptrace.h>
 120 #include <machine/sigframe.h>
 121
 122 #include <sys/machintr.h>
 123 #include <machine_base/icu/icu_abi.h>
 124 #include <machine_base/icu/elcr_var.h>
 125 #include <machine_base/apic/lapic.h>
 126 #include <machine_base/apic/ioapic.h>
 127 #include <machine_base/apic/ioapic_abi.h>
 128 #include <machine/mptable.h>
 129
 130 #define PHYSMAP_ENTRIES         10
 131
 132 extern u_int64_t hammer_time(u_int64_t, u_int64_t);
 133
 134 extern void printcpuinfo(void); /* XXX header file */
 135 extern void identify_cpu(void);
 136 #if 0 /* JG */
 137 extern void finishidentcpu(void);
 138 #endif
 139 extern void panicifcpuunsupported(void);
 140
 141 static void cpu_startup(void *);
 142 static void pic_finish(void *);
 143 static void cpu_finish(void *);
 144
 145 static void set_fpregs_xmm(struct save87 *, struct savexmm *);
 146 static void fill_fpregs_xmm(struct savexmm *, struct save87 *);
 147 static void init_locks(void);
 148
 149 extern void pcpu_timer_always(struct intrframe *);
 150
 151 SYSINIT(cpu, SI_BOOT2_START_CPU, SI_ORDER_FIRST, cpu_startup, NULL);
 152 SYSINIT(pic_finish, SI_BOOT2_FINISH_PIC, SI_ORDER_FIRST, pic_finish, NULL);
 153 SYSINIT(cpu_finish, SI_BOOT2_FINISH_CPU, SI_ORDER_FIRST, cpu_finish, NULL);
 154
 155 #ifdef DDB
 156 extern vm_offset_t ksym_start, ksym_end;
 157 #endif
 158
 159 struct privatespace CPU_prvspace_bsp __aligned(4096);
 160 struct privatespace *CPU_prvspace[MAXCPU] = { &CPU_prvspace_bsp };
 161
 162 vm_paddr_t efi_systbl_phys;
 163 int     _udatasel, _ucodesel, _ucode32sel;
 164 u_long  atdevbase;
 165 int64_t tsc_offsets[MAXCPU];
 166 cpumask_t smp_idleinvl_mask;
 167 cpumask_t smp_idleinvl_reqs;
 168
 169 static int cpu_mwait_halt_global; /* MWAIT hint (EAX) or CPU_MWAIT_HINT_ */
 170
 171 #if defined(SWTCH_OPTIM_STATS)
 172 extern int swtch_optim_stats;
 173 SYSCTL_INT(_debug, OID_AUTO, swtch_optim_stats,
 174         CTLFLAG_RD, &swtch_optim_stats, 0, "");
 175 SYSCTL_INT(_debug, OID_AUTO, tlb_flush_count,
 176         CTLFLAG_RD, &tlb_flush_count, 0, "");
 177 #endif
 178 SYSCTL_INT(_hw, OID_AUTO, cpu_mwait_halt,
 179         CTLFLAG_RD, &cpu_mwait_halt_global, 0, "");
 180 SYSCTL_INT(_hw, OID_AUTO, cpu_mwait_spin, CTLFLAG_RD, &cpu_mwait_spin, 0,
 181     "monitor/mwait target state");
 182
 183 #define CPU_MWAIT_HAS_CX        \
 184         ((cpu_feature2 & CPUID2_MON) && \
 185          (cpu_mwait_feature & CPUID_MWAIT_EXT))
 186
 187 #define CPU_MWAIT_CX_NAMELEN    16
 188
 189 #define CPU_MWAIT_C1            1
 190 #define CPU_MWAIT_C2            2
 191 #define CPU_MWAIT_C3            3
 192 #define CPU_MWAIT_CX_MAX        8
 193
 194 #define CPU_MWAIT_HINT_AUTO     -1      /* C1 and C2 */
 195 #define CPU_MWAIT_HINT_AUTODEEP -2      /* C3+ */
 196
 197 SYSCTL_NODE(_machdep, OID_AUTO, mwait, CTLFLAG_RW, 0, "MWAIT features");
 198 SYSCTL_NODE(_machdep_mwait, OID_AUTO, CX, CTLFLAG_RW, 0, "MWAIT Cx settings");
 199
 200 struct cpu_mwait_cx {	/* per-C-state MWAIT info; one element of cpu_mwait_cx_info[] */
 201         int                     subcnt;	/* sub-state count, from CPUID_MWAIT_CX_SUBCNT(); exported as "subcnt" */
 202         char                    name[4];	/* "C%d" node name; index is < CPU_MWAIT_CX_MAX, so it fits */
 203         struct sysctl_ctx_list  sysctl_ctx;	/* context owning the dynamic sysctl nodes of this C-state */
 204         struct sysctl_oid       *sysctl_tree;	/* node under machdep.mwait; NULL if SYSCTL_ADD_NODE failed */
 205 };
 206 static struct cpu_mwait_cx      cpu_mwait_cx_info[CPU_MWAIT_CX_MAX];
 207 static char                     cpu_mwait_cx_supported[256];
 208
 209 static int                      cpu_mwait_c1_hints_cnt;
 210 static int                      cpu_mwait_hints_cnt;
 211 static int                      *cpu_mwait_hints;
 212
 213 static int                      cpu_mwait_deep_hints_cnt;
 214 static int                      *cpu_mwait_deep_hints;
 215
 216 #define CPU_IDLE_REPEAT_DEFAULT 750
 217
 218 static u_int                    cpu_idle_repeat = CPU_IDLE_REPEAT_DEFAULT;
 219 static u_long                   cpu_idle_repeat_max = CPU_IDLE_REPEAT_DEFAULT;
 220 static u_int                    cpu_mwait_repeat_shift = 1;
 221
 222 #define CPU_MWAIT_C3_PREAMBLE_BM_ARB    0x1
 223 #define CPU_MWAIT_C3_PREAMBLE_BM_STS    0x2
 224
 225 static int                      cpu_mwait_c3_preamble =
 226                                     CPU_MWAIT_C3_PREAMBLE_BM_ARB |
 227                                     CPU_MWAIT_C3_PREAMBLE_BM_STS;
 228
 229 SYSCTL_STRING(_machdep_mwait_CX, OID_AUTO, supported, CTLFLAG_RD,
 230     cpu_mwait_cx_supported, 0, "MWAIT supported C states");
 231 SYSCTL_INT(_machdep_mwait_CX, OID_AUTO, c3_preamble, CTLFLAG_RD,
 232     &cpu_mwait_c3_preamble, 0, "C3+ preamble mask");
 233
 234 static int      cpu_mwait_cx_select_sysctl(SYSCTL_HANDLER_ARGS,
 235                     int *, boolean_t);
 236 static int      cpu_mwait_cx_idle_sysctl(SYSCTL_HANDLER_ARGS);
 237 static int      cpu_mwait_cx_pcpu_idle_sysctl(SYSCTL_HANDLER_ARGS);
 238 static int      cpu_mwait_cx_spin_sysctl(SYSCTL_HANDLER_ARGS);
 239
 240 SYSCTL_PROC(_machdep_mwait_CX, OID_AUTO, idle, CTLTYPE_STRING|CTLFLAG_RW,
 241     NULL, 0, cpu_mwait_cx_idle_sysctl, "A", "");
 242 SYSCTL_PROC(_machdep_mwait_CX, OID_AUTO, spin, CTLTYPE_STRING|CTLFLAG_RW,
 243     NULL, 0, cpu_mwait_cx_spin_sysctl, "A", "");
 244 SYSCTL_UINT(_machdep_mwait_CX, OID_AUTO, repeat_shift, CTLFLAG_RW,
 245     &cpu_mwait_repeat_shift, 0, "");
 246
 247 long physmem = 0;
 248
 249 u_long ebda_addr = 0;
 250
 251 int imcr_present = 0;
 252
 253 int naps = 0; /* # of Applications processors */
 254
 255 u_int base_memory;
 256 struct mtx dt_lock;             /* lock for GDT and LDT */
 257
 258 static int
 259 sysctl_hw_physmem(SYSCTL_HANDLER_ARGS)
 260 {
 261         u_long pmem = ctob(physmem);
 262
 263         int error = sysctl_handle_long(oidp, &pmem, 0, req);
 264         return (error);
 265 }
 266
 267 SYSCTL_PROC(_hw, HW_PHYSMEM, physmem, CTLTYPE_ULONG|CTLFLAG_RD,
 268         0, 0, sysctl_hw_physmem, "LU", "Total system memory in bytes (number of pages * page size)");
 269
 270 static int
 271 sysctl_hw_usermem(SYSCTL_HANDLER_ARGS)
 272 {
 273         int error = sysctl_handle_int(oidp, 0,
 274                 ctob(physmem - vmstats.v_wire_count), req);
 275         return (error);
 276 }
 277
 278 SYSCTL_PROC(_hw, HW_USERMEM, usermem, CTLTYPE_INT|CTLFLAG_RD,
 279         0, 0, sysctl_hw_usermem, "IU", "");
 280
 281 static int
 282 sysctl_hw_availpages(SYSCTL_HANDLER_ARGS)
 283 {
 284         int error = sysctl_handle_int(oidp, 0,
 285                 x86_64_btop(avail_end - avail_start), req);
 286         return (error);
 287 }
 288
 289 SYSCTL_PROC(_hw, OID_AUTO, availpages, CTLTYPE_INT|CTLFLAG_RD,
 290         0, 0, sysctl_hw_availpages, "I", "");
 291
 292 vm_paddr_t Maxmem;
 293 vm_paddr_t Realmem;
 294
 295 /*
 296  * The number of PHYSMAP entries must be one less than the number of
 297  * PHYSSEG entries because the PHYSMAP entry that spans the largest
 298  * physical address that is accessible by ISA DMA is split into two
 299  * PHYSSEG entries.
 300  */
 301 vm_phystable_t phys_avail[VM_PHYSSEG_MAX + 1];
 302 vm_phystable_t dump_avail[VM_PHYSSEG_MAX + 1];
 303
 304 /* must be 1 less so 0 0 can signal end of chunks */
 305 #define PHYS_AVAIL_ARRAY_END (NELEM(phys_avail) - 1)
 306 #define DUMP_AVAIL_ARRAY_END (NELEM(dump_avail) - 1)
 307
 308 static vm_offset_t buffer_sva, buffer_eva;
 309 vm_offset_t clean_sva, clean_eva;
 310 static vm_offset_t pager_sva, pager_eva;
 311 static struct trapframe proc0_tf;
 312
 313 static void
 314 cpu_startup(void *dummy)
 315 {
 316         caddr_t v;
 317         vm_size_t size = 0;
 318         vm_offset_t firstaddr;
 319
 320         /*
 321          * Good {morning,afternoon,evening,night}.
 322          */
 323         kprintf("%s", version);
 324         startrtclock();
 325         printcpuinfo();
 326         panicifcpuunsupported();
 327         kprintf("real memory  = %ju (%ju MB)\n",
 328                 (intmax_t)Realmem,
 329                 (intmax_t)Realmem / 1024 / 1024);
 330         /*
 331          * Display any holes after the first chunk of extended memory.
 332          */
 333         if (bootverbose) {
 334                 int indx;
 335
 336                 kprintf("Physical memory chunk(s):\n");
 337                 for (indx = 0; phys_avail[indx].phys_end != 0; ++indx) {
 338                         vm_paddr_t size1;
 339
 340                         size1 = phys_avail[indx].phys_end -
 341                                 phys_avail[indx].phys_beg;
 342
 343                         kprintf("0x%08jx - 0x%08jx, %ju bytes (%ju pages)\n",
 344                                 (intmax_t)phys_avail[indx].phys_beg,
 345                                 (intmax_t)phys_avail[indx].phys_end - 1,
 346                                 (intmax_t)size1,
 347                                 (intmax_t)(size1 / PAGE_SIZE));
 348                 }
 349         }
 350
 351         /*
 352          * Allocate space for system data structures.
 353          * The first available kernel virtual address is in "v".
 354          * As pages of kernel virtual memory are allocated, "v" is incremented.
 355          * As pages of memory are allocated and cleared,
 356          * "firstaddr" is incremented.
 357          * An index into the kernel page table corresponding to the
 358          * virtual memory address maintained in "v" is kept in "mapaddr".
 359          */
 360
 361         /*
 362          * Make two passes.  The first pass calculates how much memory is
 363          * needed and allocates it.  The second pass assigns virtual
 364          * addresses to the various data structures.
 365          */
 366         firstaddr = 0;
 367 again:
 368         v = (caddr_t)firstaddr;
 369
 370 #define valloc(name, type, num) \
 371             (name) = (type *)v; v = (caddr_t)((name)+(num))
 372 #define valloclim(name, type, num, lim) \
 373             (name) = (type *)v; v = (caddr_t)((lim) = ((name)+(num)))
 374
 375         /*
 376          * The nominal buffer size (and minimum KVA allocation) is MAXBSIZE.
 377          * For the first 64MB of ram nominally allocate sufficient buffers to
 378          * cover 1/4 of our ram.  Beyond the first 64MB allocate additional
 379          * buffers to cover 1/20 of our ram over 64MB.  When auto-sizing
 380          * the buffer cache we limit the eventual kva reservation to
 381          * maxbcache bytes.
 382          *
 383          * factor represents the 1/4 x ram conversion.
 384          */
 385         if (nbuf == 0) {
 386                 long factor = 4 * NBUFCALCSIZE / 1024;
 387                 long kbytes = physmem * (PAGE_SIZE / 1024);
 388
 389                 nbuf = 50;
 390                 if (kbytes > 4096)
 391                         nbuf += min((kbytes - 4096) / factor, 65536 / factor);
 392                 if (kbytes > 65536)
 393                         nbuf += (kbytes - 65536) * 2 / (factor * 5);
 394                 if (maxbcache && nbuf > maxbcache / NBUFCALCSIZE)
 395                         nbuf = maxbcache / NBUFCALCSIZE;
 396         }
 397
 398         /*
 399          * Do not allow the buffer_map to be more then 1/2 the size of the
 400          * kernel_map.
 401          */
 402         if (nbuf > (virtual_end - virtual_start +
 403                     virtual2_end - virtual2_start) / (MAXBSIZE * 2)) {
 404                 nbuf = (virtual_end - virtual_start +
 405                         virtual2_end - virtual2_start) / (MAXBSIZE * 2);
 406                 kprintf("Warning: nbufs capped at %ld due to kvm\n", nbuf);
 407         }
 408
 409         /*
 410          * Do not allow the buffer_map to use more than 50% of available
 411          * physical-equivalent memory.  Since the VM pages which back
 412          * individual buffers are typically wired, having too many bufs
 413          * can prevent the system from paging properly.
 414          */
 415         if (nbuf > physmem * PAGE_SIZE / (NBUFCALCSIZE * 2)) {
 416                 nbuf = physmem * PAGE_SIZE / (NBUFCALCSIZE * 2);
 417                 kprintf("Warning: nbufs capped at %ld due to physmem\n", nbuf);
 418         }
 419
 420         /*
 421          * Do not allow the sizeof(struct buf) * nbuf to exceed half of
 422          * the valloc space which is just the virtual_end - virtual_start
 423          * section.  We use valloc() to allocate the buf header array.
 424          */
 425         if (nbuf > (virtual_end - virtual_start) / sizeof(struct buf) / 2) {
 426                 nbuf = (virtual_end - virtual_start) /
 427                        sizeof(struct buf) / 2;
 428                 kprintf("Warning: nbufs capped at %ld due to valloc "
 429                         "considerations\n", nbuf);
 430         }
 431
 432         nswbuf_mem = lmax(lmin(nbuf / 32, 512), 8);
 433 #ifdef NSWBUF_MIN
 434         if (nswbuf_mem < NSWBUF_MIN)
 435                 nswbuf_mem = NSWBUF_MIN;
 436 #endif
 437         nswbuf_kva = lmax(lmin(nbuf / 4, 512), 16);
 438 #ifdef NSWBUF_MIN
 439         if (nswbuf_kva < NSWBUF_MIN)
 440                 nswbuf_kva = NSWBUF_MIN;
 441 #endif
 442
 443         valloc(swbuf_mem, struct buf, nswbuf_mem);
 444         valloc(swbuf_kva, struct buf, nswbuf_kva);
 445         valloc(buf, struct buf, nbuf);
 446
 447         /*
 448          * End of first pass, size has been calculated so allocate memory
 449          */
 450         if (firstaddr == 0) {
 451                 size = (vm_size_t)(v - firstaddr);
 452                 firstaddr = kmem_alloc(&kernel_map, round_page(size),
 453                                        VM_SUBSYS_BUF);
 454                 if (firstaddr == 0)
 455                         panic("startup: no room for tables");
 456                 goto again;
 457         }
 458
 459         /*
 460          * End of second pass, addresses have been assigned
 461          *
 462          * nbuf is an int, make sure we don't overflow the field.
 463          *
 464          * On 64-bit systems we always reserve maximal allocations for
 465          * buffer cache buffers and there are no fragmentation issues,
 466          * so the KVA segment does not have to be excessively oversized.
 467          */
 468         if ((vm_size_t)(v - firstaddr) != size)
 469                 panic("startup: table size inconsistency");
 470
 471         kmem_suballoc(&kernel_map, &clean_map, &clean_sva, &clean_eva,
 472                       ((vm_offset_t)(nbuf + 16) * MAXBSIZE) +
 473                       ((nswbuf_mem + nswbuf_kva) * MAXPHYS) + pager_map_size);
 474         kmem_suballoc(&clean_map, &buffer_map, &buffer_sva, &buffer_eva,
 475                       ((vm_offset_t)(nbuf + 16) * MAXBSIZE));
 476         buffer_map.system_map = 1;
 477         kmem_suballoc(&clean_map, &pager_map, &pager_sva, &pager_eva,
 478                       ((vm_offset_t)(nswbuf_mem + nswbuf_kva) * MAXPHYS) +
 479                       pager_map_size);
 480         pager_map.system_map = 1;
 481         kprintf("avail memory = %ju (%ju MB)\n",
 482                 (uintmax_t)ptoa(vmstats.v_free_count + vmstats.v_dma_pages),
 483                 (uintmax_t)ptoa(vmstats.v_free_count + vmstats.v_dma_pages) /
 484                 1024 / 1024);
 485 }
 486
 487 struct cpu_idle_stat {	/* per-cpu idle statistics; one element of cpu_idle_stats[MAXCPU] */
 488         int     hint;
 489         int     reserved;
 490         u_long  halt;	/* summed over all cpus by sysctl_cpu_idle_cnt() for CPU_IDLE_STAT_HALT */
 491         u_long  spin;	/* summed over all cpus by sysctl_cpu_idle_cnt() for CPU_IDLE_STAT_SPIN */
 492         u_long  repeat;
 493         u_long  repeat_last;
 494         u_long  repeat_delta;
 495         u_long  mwait_cx[CPU_MWAIT_CX_MAX];	/* per-C-state counter, exported as machdep.mwait.C<n>.entered */
 496 } __cachealign;
 497
 498 #define CPU_IDLE_STAT_HALT      -1
 499 #define CPU_IDLE_STAT_SPIN      -2
 500
 501 static struct cpu_idle_stat     cpu_idle_stats[MAXCPU];
 502
 503 static int
 504 sysctl_cpu_idle_cnt(SYSCTL_HANDLER_ARGS)
 505 {
 506         int idx = arg2, cpu, error;
 507         u_long val = 0;
 508
 509         if (idx == CPU_IDLE_STAT_HALT) {
 510                 for (cpu = 0; cpu < ncpus; ++cpu)
 511                         val += cpu_idle_stats[cpu].halt;
 512         } else if (idx == CPU_IDLE_STAT_SPIN) {
 513                 for (cpu = 0; cpu < ncpus; ++cpu)
 514                         val += cpu_idle_stats[cpu].spin;
 515         } else {
 516                 KASSERT(idx >= 0 && idx < CPU_MWAIT_CX_MAX,
 517                     ("invalid index %d", idx));
 518                 for (cpu = 0; cpu < ncpus; ++cpu)
 519                         val += cpu_idle_stats[cpu].mwait_cx[idx];
 520         }
 521
 522         error = sysctl_handle_quad(oidp, &val, 0, req);
 523         if (error || req->newptr == NULL)
 524                 return error;
 525
 526         if (idx == CPU_IDLE_STAT_HALT) {
 527                 for (cpu = 0; cpu < ncpus; ++cpu)
 528                         cpu_idle_stats[cpu].halt = 0;
 529                 cpu_idle_stats[0].halt = val;
 530         } else if (idx == CPU_IDLE_STAT_SPIN) {
 531                 for (cpu = 0; cpu < ncpus; ++cpu)
 532                         cpu_idle_stats[cpu].spin = 0;
 533                 cpu_idle_stats[0].spin = val;
 534         } else {
 535                 KASSERT(idx >= 0 && idx < CPU_MWAIT_CX_MAX,
 536                     ("invalid index %d", idx));
 537                 for (cpu = 0; cpu < ncpus; ++cpu)
 538                         cpu_idle_stats[cpu].mwait_cx[idx] = 0;
 539                 cpu_idle_stats[0].mwait_cx[idx] = val;
 540         }
 541         return 0;
 542 }
 543
 544 static void
 545 cpu_mwait_attach(void)
 546 {
 547         struct sbuf sb;
 548         int hint_idx, i;
 549
 550         if (!CPU_MWAIT_HAS_CX)
 551                 return;
 552
 553         if (cpu_vendor_id == CPU_VENDOR_INTEL &&
 554             (CPUID_TO_FAMILY(cpu_id) > 0xf ||
 555              (CPUID_TO_FAMILY(cpu_id) == 0x6 &&
 556               CPUID_TO_MODEL(cpu_id) >= 0xf))) {
 557                 int bm_sts = 1;
 558
 559                 /*
 560                  * Pentium dual-core, Core 2 and beyond do not need any
 561                  * additional activities to enter deep C-state, i.e. C3(+).
 562                  */
 563                 cpu_mwait_cx_no_bmarb();
 564
 565                 TUNABLE_INT_FETCH("machdep.cpu.mwait.bm_sts", &bm_sts);
 566                 if (!bm_sts)
 567                         cpu_mwait_cx_no_bmsts();
 568         }
 569
 570         sbuf_new(&sb, cpu_mwait_cx_supported,
 571             sizeof(cpu_mwait_cx_supported), SBUF_FIXEDLEN);
 572
 573         for (i = 0; i < CPU_MWAIT_CX_MAX; ++i) {
 574                 struct cpu_mwait_cx *cx = &cpu_mwait_cx_info[i];
 575                 int sub;
 576
 577                 ksnprintf(cx->name, sizeof(cx->name), "C%d", i);
 578
 579                 sysctl_ctx_init(&cx->sysctl_ctx);
 580                 cx->sysctl_tree = SYSCTL_ADD_NODE(&cx->sysctl_ctx,
 581                     SYSCTL_STATIC_CHILDREN(_machdep_mwait), OID_AUTO,
 582                     cx->name, CTLFLAG_RW, NULL, "Cx control/info");
 583                 if (cx->sysctl_tree == NULL)
 584                         continue;
 585
 586                 cx->subcnt = CPUID_MWAIT_CX_SUBCNT(cpu_mwait_extemu, i);
 587                 SYSCTL_ADD_INT(&cx->sysctl_ctx,
 588                     SYSCTL_CHILDREN(cx->sysctl_tree), OID_AUTO,
 589                     "subcnt", CTLFLAG_RD, &cx->subcnt, 0,
 590                     "sub-state count");
 591                 SYSCTL_ADD_PROC(&cx->sysctl_ctx,
 592                     SYSCTL_CHILDREN(cx->sysctl_tree), OID_AUTO,
 593                     "entered", (CTLTYPE_QUAD | CTLFLAG_RW), 0,
 594                     i, sysctl_cpu_idle_cnt, "Q", "# of times entered");
 595
 596                 for (sub = 0; sub < cx->subcnt; ++sub)
 597                         sbuf_printf(&sb, "C%d/%d ", i, sub);
 598         }
 599         sbuf_trim(&sb);
 600         sbuf_finish(&sb);
 601
 602         /*
 603          * Non-deep C-states
 604          */
 605         cpu_mwait_c1_hints_cnt = cpu_mwait_cx_info[CPU_MWAIT_C1].subcnt;
 606         for (i = CPU_MWAIT_C1; i < CPU_MWAIT_C3; ++i)
 607                 cpu_mwait_hints_cnt += cpu_mwait_cx_info[i].subcnt;
 608         cpu_mwait_hints = kmalloc(sizeof(int) * cpu_mwait_hints_cnt,
 609             M_DEVBUF, M_WAITOK);
 610
 611         hint_idx = 0;
 612         for (i = CPU_MWAIT_C1; i < CPU_MWAIT_C3; ++i) {
 613                 int j, subcnt;
 614
 615                 subcnt = cpu_mwait_cx_info[i].subcnt;
 616                 for (j = 0; j < subcnt; ++j) {
 617                         KASSERT(hint_idx < cpu_mwait_hints_cnt,
 618                             ("invalid mwait hint index %d", hint_idx));
 619                         cpu_mwait_hints[hint_idx] = MWAIT_EAX_HINT(i, j);
 620                         ++hint_idx;
 621                 }
 622         }
 623         KASSERT(hint_idx == cpu_mwait_hints_cnt,
 624             ("mwait hint count %d != index %d",
 625              cpu_mwait_hints_cnt, hint_idx));
 626
 627         if (bootverbose) {
 628                 kprintf("MWAIT hints (%d C1 hints):\n", cpu_mwait_c1_hints_cnt);
 629                 for (i = 0; i < cpu_mwait_hints_cnt; ++i) {
 630                         int hint = cpu_mwait_hints[i];
 631
 632                         kprintf("  C%d/%d hint 0x%04x\n",
 633                             MWAIT_EAX_TO_CX(hint), MWAIT_EAX_TO_CX_SUB(hint),
 634                             hint);
 635                 }
 636         }
 637
 638         /*
 639          * Deep C-states
 640          */
 641         for (i = CPU_MWAIT_C1; i < CPU_MWAIT_CX_MAX; ++i)
 642                 cpu_mwait_deep_hints_cnt += cpu_mwait_cx_info[i].subcnt;
 643         cpu_mwait_deep_hints = kmalloc(sizeof(int) * cpu_mwait_deep_hints_cnt,
 644             M_DEVBUF, M_WAITOK);
 645
 646         hint_idx = 0;
 647         for (i = CPU_MWAIT_C1; i < CPU_MWAIT_CX_MAX; ++i) {
 648                 int j, subcnt;
 649
 650                 subcnt = cpu_mwait_cx_info[i].subcnt;
 651                 for (j = 0; j < subcnt; ++j) {
 652                         KASSERT(hint_idx < cpu_mwait_deep_hints_cnt,
 653                             ("invalid mwait deep hint index %d", hint_idx));
 654                         cpu_mwait_deep_hints[hint_idx] = MWAIT_EAX_HINT(i, j);
 655                         ++hint_idx;
 656                 }
 657         }
 658         KASSERT(hint_idx == cpu_mwait_deep_hints_cnt,
 659             ("mwait deep hint count %d != index %d",
 660              cpu_mwait_deep_hints_cnt, hint_idx));
 661
 662         if (bootverbose) {
 663                 kprintf("MWAIT deep hints:\n");
 664                 for (i = 0; i < cpu_mwait_deep_hints_cnt; ++i) {
 665                         int hint = cpu_mwait_deep_hints[i];
 666
 667                         kprintf("  C%d/%d hint 0x%04x\n",
 668                             MWAIT_EAX_TO_CX(hint), MWAIT_EAX_TO_CX_SUB(hint),
 669                             hint);
 670                 }
 671         }
 672         cpu_idle_repeat_max = 256 * cpu_mwait_deep_hints_cnt;
 673
 674         for (i = 0; i < ncpus; ++i) {
 675                 char name[16];
 676
 677                 ksnprintf(name, sizeof(name), "idle%d", i);
 678                 SYSCTL_ADD_PROC(NULL,
 679                     SYSCTL_STATIC_CHILDREN(_machdep_mwait_CX), OID_AUTO,
 680                     name, (CTLTYPE_STRING | CTLFLAG_RW), &cpu_idle_stats[i],
 681                     0, cpu_mwait_cx_pcpu_idle_sysctl, "A", "");
 682         }
 683 }
 684
 685 static void	/* SYSINIT hook, registered at SI_BOOT2_FINISH_CPU; argument unused */
 686 cpu_finish(void *dummy __unused)
 687 {
 688         cpu_setregs();
 689         cpu_mwait_attach();	/* returns early unless CPU_MWAIT_HAS_CX */
 690 }
 691
 692 static void	/* SYSINIT hook, registered at SI_BOOT2_FINISH_PIC; argument unused */
 693 pic_finish(void *dummy __unused)
 694 {
 695         /* Log ELCR information */
 696         elcr_dump();
 697
 698         /* Log MPTABLE information */
 699         mptable_pci_int_dump();
 700
 701         /* Finalize interrupt setup through MachIntrABI (presumably the PIC, not PCI) */
 702         MachIntrABI.finalize();
 703 }
 704
 705 /*
 706  * Send an interrupt to process.
 707  *
 708  * Stack is set up to allow sigcode stored
 709  * at top to call routine, followed by kcall
 710  * to sigreturn routine below.  After sigreturn
 711  * resets the signal mask, the stack, and the
 712  * frame pointer, it returns to the user
 713  * specified pc, psl.
 714  */
 715 void
 716 sendsig(sig_t catcher, int sig, sigset_t *mask, u_long code)
 717 {
 718         struct lwp *lp = curthread->td_lwp;
 719         struct proc *p = lp->lwp_proc;
 720         struct trapframe *regs;
 721         struct sigacts *psp = p->p_sigacts;
 722         struct sigframe sf, *sfp;
 723         int oonstack;
 724         char *sp;
 725
 726         regs = lp->lwp_md.md_regs;
 727         oonstack = (lp->lwp_sigstk.ss_flags & SS_ONSTACK) ? 1 : 0;
 728
 729         /* Save user context */
 730         bzero(&sf, sizeof(struct sigframe));
 731         sf.sf_uc.uc_sigmask = *mask;
 732         sf.sf_uc.uc_stack = lp->lwp_sigstk;
 733         sf.sf_uc.uc_mcontext.mc_onstack = oonstack;
 734         KKASSERT(__offsetof(struct trapframe, tf_rdi) == 0);
 735         bcopy(regs, &sf.sf_uc.uc_mcontext.mc_rdi, sizeof(struct trapframe));
 736
 737         /* Make the size of the saved context visible to userland */
 738         sf.sf_uc.uc_mcontext.mc_len = sizeof(sf.sf_uc.uc_mcontext);
 739
 740         /* Allocate and validate space for the signal handler context. */
 741         if ((lp->lwp_flags & LWP_ALTSTACK) != 0 && !oonstack &&
 742             SIGISMEMBER(psp->ps_sigonstack, sig)) {
 743                 sp = (char *)(lp->lwp_sigstk.ss_sp + lp->lwp_sigstk.ss_size -
 744                               sizeof(struct sigframe));
 745                 lp->lwp_sigstk.ss_flags |= SS_ONSTACK;
 746         } else {
 747                 /* We take red zone into account */
 748                 sp = (char *)regs->tf_rsp - sizeof(struct sigframe) - 128;
 749         }
 750
 751         /*
 752          * XXX AVX needs 64-byte alignment but sigframe has other fields and
 753          * the embedded ucontext is not at the front, so aligning this won't
 754          * help us.  Fortunately we bcopy in/out of the sigframe, so the
 755          * kernel is ok.
 756          *
 757          * The problem though is if userland winds up trying to use the
 758          * context directly.
 759          */
 760         sfp = (struct sigframe *)((intptr_t)sp & ~(intptr_t)0xF);
 761
 762         /* Translate the signal if appropriate */
 763         if (p->p_sysent->sv_sigtbl) {
 764                 if (sig <= p->p_sysent->sv_sigsize)
 765                         sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
 766         }
 767
 768         /*
 769          * Build the argument list for the signal handler.
 770          *
 771          * Arguments are in registers (%rdi, %rsi, %rdx, %rcx)
 772          */
 773         regs->tf_rdi = sig;                             /* argument 1 */
 774         regs->tf_rdx = (register_t)&sfp->sf_uc;         /* argument 3 */
 775
 776         if (SIGISMEMBER(psp->ps_siginfo, sig)) {
 777                 /*
 778                  * Signal handler installed with SA_SIGINFO.
 779                  *
 780                  * action(signo, siginfo, ucontext)
 781                  */
 782                 regs->tf_rsi = (register_t)&sfp->sf_si; /* argument 2 */
 783                 regs->tf_rcx = (register_t)regs->tf_addr; /* argument 4 */
 784                 sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher;
 785
 786                 /* fill siginfo structure */
 787                 sf.sf_si.si_signo = sig;
 788                 sf.sf_si.si_code = code;
 789                 sf.sf_si.si_addr = (void *)regs->tf_addr;
 790         } else {
 791                 /*
 792                  * Old FreeBSD-style arguments.
 793                  *
 794                  * handler (signo, code, [uc], addr)
 795                  */
 796                 regs->tf_rsi = (register_t)code;        /* argument 2 */
 797                 regs->tf_rcx = (register_t)regs->tf_addr; /* argument 4 */
 798                 sf.sf_ahu.sf_handler = catcher;
 799         }
 800
 801         /*
 802          * If we're a vm86 process, we want to save the segment registers.
 803          * We also change eflags to be our emulated eflags, not the actual
 804          * eflags.
 805          */
 806 #if 0 /* JG */
 807         if (regs->tf_eflags & PSL_VM) {
 808                 struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
 809                 struct vm86_kernel *vm86 = &lp->lwp_thread->td_pcb->pcb_ext->ext_vm86;
 810
 811                 sf.sf_uc.uc_mcontext.mc_gs = tf->tf_vm86_gs;
 812                 sf.sf_uc.uc_mcontext.mc_fs = tf->tf_vm86_fs;
 813                 sf.sf_uc.uc_mcontext.mc_es = tf->tf_vm86_es;
 814                 sf.sf_uc.uc_mcontext.mc_ds = tf->tf_vm86_ds;
 815
 816                 if (vm86->vm86_has_vme == 0)
 817                         sf.sf_uc.uc_mcontext.mc_eflags =
 818                             (tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) |
 819                             (vm86->vm86_eflags & (PSL_VIF | PSL_VIP));
 820
 821                 /*
 822                  * Clear PSL_NT to inhibit T_TSSFLT faults on return from
 823                  * syscalls made by the signal handler.  This just avoids
 824                  * wasting time for our lazy fixup of such faults.  PSL_NT
 825                  * does nothing in vm86 mode, but vm86 programs can set it
 826                  * almost legitimately in probes for old cpu types.
 827                  */
 828                 tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_VIF | PSL_VIP);
 829         }
 830 #endif
 831
 832         /*
 833          * Save the FPU state and reinit the FP unit
 834          */
 835         npxpush(&sf.sf_uc.uc_mcontext);
 836
 837         /*
 838          * Copy the sigframe out to the user's stack.
 839          */
 840         if (copyout(&sf, sfp, sizeof(struct sigframe)) != 0) {
 841                 /*
 842                  * Something is wrong with the stack pointer.
 843                  * ...Kill the process.
 844                  */
 845                 sigexit(lp, SIGILL);
 846         }
 847
 848         regs->tf_rsp = (register_t)sfp;
 849         regs->tf_rip = PS_STRINGS - *(p->p_sysent->sv_szsigcode);
 850
 851         /*
 852          * i386 abi specifies that the direction flag must be cleared
 853          * on function entry
 854          */
 855         regs->tf_rflags &= ~(PSL_T|PSL_D);
 856
 857         /*
 858          * 64 bit mode has a code and stack selector but
 859          * no data or extra selector.  %fs and %gs are not
 860          * stored in-context.
 861          */
 862         regs->tf_cs = _ucodesel;
 863         regs->tf_ss = _udatasel;
 864         clear_quickret();
 865 }
 866
 867 /*
 868  * Sanitize the trapframe for a virtual kernel passing control to a custom
 869  * VM context.  Remove any items that would otherwise create a privilage
 870  * issue.
 871  *
 872  * XXX at the moment we allow userland to set the resume flag.  Is this a
 873  * bad idea?
 874  */
 875 int
 876 cpu_sanitize_frame(struct trapframe *frame)
 877 {
 878         frame->tf_cs = _ucodesel;
 879         frame->tf_ss = _udatasel;
 880         /* XXX VM (8086) mode not supported? */
 881         frame->tf_rflags &= (PSL_RF | PSL_USERCHANGE | PSL_VM_UNSUPP);
 882         frame->tf_rflags |= PSL_RESERVED_DEFAULT | PSL_I;
 883
 884         return(0);
 885 }
 886
 887 /*
 888  * Sanitize the tls so loading the descriptor does not blow up
 889  * on us.  For x86_64 we don't have to do anything.
 890  */
 891 int
 892 cpu_sanitize_tls(struct savetls *tls)
 893 {
 894         return(0);
 895 }
 896
/*
 * sigreturn(ucontext_t *sigcntxp)
 *
 * System call to cleanup state after a signal
 * has been taken.  Reset signal mask and
 * stack state from context left by sendsig (above).
 * Return to previous pc and psl as specified by
 * context left by sendsig. Check carefully to
 * make sure that the user has not modified the
 * state to gain improper privileges.
 *
 * Returns the copyin() error if the context cannot be read, EINVAL if
 * the rflags or %cs checks fail, and EJUSTRETURN once the trapframe,
 * FPU state, signal stack flag and signal mask have been restored.
 *
 * MPSAFE
 */

/*
 * EFL_SECURE(ef, oef) is true when ef and oef differ only in bits that
 * are part of PSL_USERCHANGE.
 *
 * CS_SECURE(cs) is true when ISPL(cs) equals SEL_UPL.
 */
#define EFL_SECURE(ef, oef)     ((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
#define CS_SECURE(cs)           (ISPL(cs) == SEL_UPL)

int
sys_sigreturn(struct sigreturn_args *uap)
{
        struct lwp *lp = curthread->td_lwp;
        struct trapframe *regs;
        ucontext_t uc;
        ucontext_t *ucp;
        register_t rflags;
        int cs;
        int error;

        /*
         * We have to copy the information into kernel space so userland
         * can't modify it while we are sniffing it.
         */
        regs = lp->lwp_md.md_regs;
        error = copyin(uap->sigcntxp, &uc, sizeof(uc));
        if (error)
                return (error);
        ucp = &uc;
        rflags = ucp->uc_mcontext.mc_rflags;

        /*
         * VM (8086) mode not supported.
         *
         * NOTE(review): this masked copy is only used for the EFL_SECURE
         * check and the diagnostic below.  The trapframe itself is later
         * overwritten by the bcopy() from the copied-in mcontext, so
         * tf_rflags presumably receives the unmasked mc_rflags - confirm
         * that this is intended.
         */
        rflags &= ~PSL_VM_UNSUPP;

#if 0 /* JG */
        if (eflags & PSL_VM) {
                struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
                struct vm86_kernel *vm86;

                /*
                 * if pcb_ext == 0 or vm86_inited == 0, the user hasn't
                 * set up the vm86 area, and we can't enter vm86 mode.
                 */
                if (lp->lwp_thread->td_pcb->pcb_ext == 0)
                        return (EINVAL);
                vm86 = &lp->lwp_thread->td_pcb->pcb_ext->ext_vm86;
                if (vm86->vm86_inited == 0)
                        return (EINVAL);

                /* go back to user mode if both flags are set */
                if ((eflags & PSL_VIP) && (eflags & PSL_VIF))
                        trapsignal(lp, SIGBUS, 0);

                if (vm86->vm86_has_vme) {
                        eflags = (tf->tf_eflags & ~VME_USERCHANGE) |
                            (eflags & VME_USERCHANGE) | PSL_VM;
                } else {
                        vm86->vm86_eflags = eflags;     /* save VIF, VIP */
                        eflags = (tf->tf_eflags & ~VM_USERCHANGE) |
                            (eflags & VM_USERCHANGE) | PSL_VM;
                }
                bcopy(&ucp->uc_mcontext.mc_gs, tf, sizeof(struct trapframe));
                tf->tf_eflags = eflags;
                tf->tf_vm86_ds = tf->tf_ds;
                tf->tf_vm86_es = tf->tf_es;
                tf->tf_vm86_fs = tf->tf_fs;
                tf->tf_vm86_gs = tf->tf_gs;
                tf->tf_ds = _udatasel;
                tf->tf_es = _udatasel;
                tf->tf_fs = _udatasel;
                tf->tf_gs = _udatasel;
        } else
#endif
        {
                /*
                 * Don't allow users to change privileged or reserved flags.
                 */
                /*
                 * XXX do allow users to change the privileged flag PSL_RF.
                 * The cpu sets PSL_RF in tf_eflags for faults.  Debuggers
                 * should sometimes set it there too.  tf_eflags is kept in
                 * the signal context during signal handling and there is no
                 * other place to remember it, so the PSL_RF bit may be
                 * corrupted by the signal handler without us knowing.
                 * Corruption of the PSL_RF bit at worst causes one more or
                 * one less debugger trap, so allowing it is fairly harmless.
                 */
                if (!EFL_SECURE(rflags & ~PSL_RF, regs->tf_rflags & ~PSL_RF)) {
                        kprintf("sigreturn: rflags = 0x%lx\n", (long)rflags);
                        return(EINVAL);
                }

                /*
                 * Don't allow users to load a valid privileged %cs.  Let the
                 * hardware check for invalid selectors, excess privilege in
                 * other selectors, invalid %eip's and invalid %esp's.
                 */
                cs = ucp->uc_mcontext.mc_cs;
                if (!CS_SECURE(cs)) {
                        kprintf("sigreturn: cs = 0x%x\n", cs);
                        trapsignal(lp, SIGBUS, T_PROTFLT);
                        return(EINVAL);
                }

                /*
                 * Both checks passed: replace the whole trapframe with the
                 * register image stored in the context, starting at mc_rdi.
                 */
                bcopy(&ucp->uc_mcontext.mc_rdi, regs, sizeof(struct trapframe));
        }

        /*
         * Restore the FPU state from the frame.  The FPU restore and the
         * signal stack / signal mask updates below are all done inside
         * one critical section.
         */
        crit_enter();
        npxpop(&ucp->uc_mcontext);

        /* Bit 0 of mc_onstack records whether the signal stack is in use. */
        if (ucp->uc_mcontext.mc_onstack & 1)
                lp->lwp_sigstk.ss_flags |= SS_ONSTACK;
        else
                lp->lwp_sigstk.ss_flags &= ~SS_ONSTACK;

        /* Install the saved mask, then strip it with SIG_CANTMASK(). */
        lp->lwp_sigmask = ucp->uc_sigmask;
        SIG_CANTMASK(lp->lwp_sigmask);
        clear_quickret();
        crit_exit();
        return(EJUSTRETURN);
}
1027
/*
 * Machine dependent boot() routine.
 *
 * Intentionally empty: nothing has needed to go here yet and the howto
 * argument is ignored.  Possibly some stuff might be grafted back here
 * from boot().
 */
void
cpu_boot(int howto)
{
}
1038
/*
 * Shutdown the CPU as much as possible.
 *
 * Executes HLT in an endless loop, so this function never returns to
 * its caller.
 */
void
cpu_halt(void)
{
        while (1) {
                __asm__ __volatile("hlt");
        }
}
1048
1049 /*
1050  * cpu_idle() represents the idle LWKT.  You cannot return from this function
1051  * (unless you want to blow things up!).  Instead we look for runnable threads
1052  * and loop or halt as appropriate.  Giant is not held on entry to the thread.
1053  *
1054  * The main loop is entered with a critical section held, we must release
1055  * the critical section before doing anything else.  lwkt_switch() will
1056  * check for pending interrupts due to entering and exiting its own
1057  * critical section.
1058  *
1059  * NOTE: On an SMP system we rely on a scheduler IPI to wake a HLTed cpu up.
1060  *       However, there are cases where the idlethread will be entered with
1061  *       the possibility that no IPI will occur and in such cases
1062  *       lwkt_switch() sets TDF_IDLE_NOHLT.
1063  *
1064  * NOTE: cpu_idle_repeat determines how many entries into the idle thread
1065  *       must occur before it starts using ACPI halt.
1066  *
1067  * NOTE: Value overridden in hammer_time().
1068  */
/*
 * Idle-loop mode selector read by cpu_idle(); the meaning of each value
 * is listed in the comment inside cpu_idle()'s main loop.  Defaults to 2.
 */
static int      cpu_idle_hlt = 2;
SYSCTL_INT(_machdep, OID_AUTO, cpu_idle_hlt, CTLFLAG_RW,
    &cpu_idle_hlt, 0, "Idle loop HLT enable");
/* cpu_idle_repeat is defined elsewhere in this file; exported here. */
SYSCTL_INT(_machdep, OID_AUTO, cpu_idle_repeat, CTLFLAG_RW,
    &cpu_idle_repeat, 0, "Idle entries before acpi hlt");

/*
 * Idle loop counters.  Both nodes share sysctl_cpu_idle_cnt as their
 * handler and pass CPU_IDLE_STAT_HALT or CPU_IDLE_STAT_SPIN to select
 * which counter is meant.
 */
SYSCTL_PROC(_machdep, OID_AUTO, cpu_idle_hltcnt, (CTLTYPE_QUAD | CTLFLAG_RW),
    0, CPU_IDLE_STAT_HALT, sysctl_cpu_idle_cnt, "Q", "Idle loop entry halts");
SYSCTL_PROC(_machdep, OID_AUTO, cpu_idle_spincnt, (CTLTYPE_QUAD | CTLFLAG_RW),
    0, CPU_IDLE_STAT_SPIN, sysctl_cpu_idle_cnt, "Q", "Idle loop entry spins");
1079
/*
 * Default idle hook: enable interrupts and halt until the next one.
 * cpu_idle() calls this with interrupts disabled (after a cli).
 */
static void
cpu_idle_default_hook(void)
{
        /*
         * We must guarantee that hlt is exactly the instruction
         * following the sti, so both are emitted from a single asm
         * statement.
         */
        __asm __volatile("sti; hlt");
}
1089
/*
 * Idle hook used by cpu_idle() when it is not taking the quick halt path.
 * Starts out as cpu_idle_default_hook; other subsystems (e.g., ACPI) can
 * hook this later.
 */
void (*cpu_idle_hook)(void) = cpu_idle_default_hook;
1092
1093 static __inline int
1094 cpu_mwait_cx_hint(struct cpu_idle_stat *stat)
1095 {
1096         int hint, cx_idx;
1097         u_int idx;
1098
1099         hint = stat->hint;
1100         if (hint >= 0)
1101                 goto done;
1102
1103         idx = (stat->repeat + stat->repeat_last + stat->repeat_delta) >>
1104             cpu_mwait_repeat_shift;
1105         if (idx >= cpu_mwait_c1_hints_cnt) {
1106                 /* Step up faster, once we walked through all C1 states */
1107                 stat->repeat_delta += 1 << (cpu_mwait_repeat_shift + 1);
1108         }
1109         if (hint == CPU_MWAIT_HINT_AUTODEEP) {
1110                 if (idx >= cpu_mwait_deep_hints_cnt)
1111                         idx = cpu_mwait_deep_hints_cnt - 1;
1112                 hint = cpu_mwait_deep_hints[idx];
1113         } else {
1114                 if (idx >= cpu_mwait_hints_cnt)
1115                         idx = cpu_mwait_hints_cnt - 1;
1116                 hint = cpu_mwait_hints[idx];
1117         }
1118 done:
1119         cx_idx = MWAIT_EAX_TO_CX(hint);
1120         if (cx_idx >= 0 && cx_idx < CPU_MWAIT_CX_MAX)
1121                 stat->mwait_cx[cx_idx]++;
1122         return hint;
1123 }
1124
/*
 * Body of the idle thread.  Never returns: each loop iteration first
 * calls lwkt_switch() and then waits by one of three means -
 * cpu_mmw_pause_int() (MONITOR/MWAIT), one of the halt hooks under cli,
 * or simply looping - depending on cpu_idle_hlt, cpu_idle_repeat,
 * CPU_MI_MONITOR and the pending request flags.
 */
void
cpu_idle(void)
{
        globaldata_t gd = mycpu;
        struct cpu_idle_stat *stat = &cpu_idle_stats[gd->gd_cpuid];
        struct thread *td __debugvar = gd->gd_curthread;
        int reqflags;
        int quick;

        stat->repeat = stat->repeat_last = cpu_idle_repeat_max;

        /* Drop the critical section we were entered with. */
        crit_exit();
        KKASSERT(td->td_critcount == 0);

        for (;;) {
                /*
                 * See if there are any LWKTs ready to go.
                 */
                lwkt_switch();

                /*
                 * When halting inside a cli we must check for reqflags
                 * races, particularly [re]schedule requests.  Running
                 * splz() does the job.
                 *
                 * cpu_idle_hlt:
                 *      0       Never halt, just spin
                 *
                 *      1       Always use HLT (or MONITOR/MWAIT if avail).
                 *
                 *              Better default for modern (Haswell+) Intel
                 *              cpus.
                 *
                 *      2       Use HLT/MONITOR/MWAIT up to a point and then
                 *              use the ACPI halt (default).  This is a hybrid
                 *              approach.  See machdep.cpu_idle_repeat.
                 *
                 *              Better default for modern AMD cpus and older
                 *              Intel cpus.
                 *
                 *      3       Always use the ACPI halt.  This typically
                 *              eats the least amount of power but the cpu
                 *              will be slow waking up.  Slows down e.g.
                 *              compiles and other pipe/event oriented stuff.
                 *
                 *      4       Always use HLT.
                 *
                 * NOTE: Interrupts are enabled and we are not in a critical
                 *       section.
                 *
                 * NOTE: Preemptions do not reset gd_idle_repeat.   Also we
                 *       don't bother capping gd_idle_repeat, it is ok if
                 *       it overflows.
                 *
                 * Implement optimized invltlb operations when halted
                 * in idle.  By setting the bit in smp_idleinvl_mask
                 * we inform other cpus that they can set _reqs to
                 * request an invltlb.  Currently the code to do that
                 * sets the bits in _reqs anyway, but then checks _mask
                 * to determine if they can assume the invltlb will execute.
                 *
                 * A critical section is required to ensure that interrupts
                 * do not fully run until after we've had a chance to execute
                 * the request.
                 */
                if (gd->gd_idle_repeat == 0) {
                        /*
                         * gd_idle_repeat was reset: fold the previous run
                         * length into stat->repeat (average of the two,
                         * capped at cpu_idle_repeat_max) and restart the
                         * run counters.
                         */
                        stat->repeat = (stat->repeat + stat->repeat_last) >> 1;
                        if (stat->repeat > cpu_idle_repeat_max)
                                stat->repeat = cpu_idle_repeat_max;
                        stat->repeat_last = 0;
                        stat->repeat_delta = 0;
                }
                ++stat->repeat_last;

                ++gd->gd_idle_repeat;
                reqflags = gd->gd_reqflags;

                /*
                 * quick is set for mode 1, and for modes below 3 while
                 * gd_idle_repeat is still under cpu_idle_repeat.
                 */
                quick = (cpu_idle_hlt == 1) ||
                        (cpu_idle_hlt < 3 &&
                         gd->gd_idle_repeat < cpu_idle_repeat);

                if (quick && (cpu_mi_feature & CPU_MI_MONITOR) &&
                    (reqflags & RQF_IDLECHECK_WK_MASK) == 0) {
                        /*
                         * MONITOR/MWAIT path: wait on gd_reqflags changing
                         * from the value sampled above.
                         */
                        splz(); /* XXX */
                        crit_enter_gd(gd);
                        ATOMIC_CPUMASK_ORBIT(smp_idleinvl_mask, gd->gd_cpuid);
                        cpu_mmw_pause_int(&gd->gd_reqflags, reqflags,
                            cpu_mwait_cx_hint(stat), 0);
                        stat->halt++;
                        ATOMIC_CPUMASK_NANDBIT(smp_idleinvl_mask, gd->gd_cpuid);
                        /* Honor an invltlb requested while we were idle. */
                        if (ATOMIC_CPUMASK_TESTANDCLR(smp_idleinvl_reqs,
                                                      gd->gd_cpuid)) {
                                cpu_invltlb();
                                cpu_mfence();
                        }
                        crit_exit_gd(gd);
                } else if (cpu_idle_hlt) {
                        /*
                         * Halt path: with interrupts disabled, re-check the
                         * wakeup flags and only then call a halt hook (the
                         * default sti;hlt hook when quick, otherwise
                         * whatever cpu_idle_hook points at).
                         */
                        __asm __volatile("cli");
                        splz();
                        crit_enter_gd(gd);
                        ATOMIC_CPUMASK_ORBIT(smp_idleinvl_mask, gd->gd_cpuid);
                        if ((gd->gd_reqflags & RQF_IDLECHECK_WK_MASK) == 0) {
                                if (quick)
                                        cpu_idle_default_hook();
                                else
                                        cpu_idle_hook();
                        }
                        /* Re-enable interrupts in case no hook ran. */
                        __asm __volatile("sti");
                        /* Counted even when the hook was skipped above. */
                        stat->halt++;
                        ATOMIC_CPUMASK_NANDBIT(smp_idleinvl_mask, gd->gd_cpuid);
                        /* Honor an invltlb requested while we were idle. */
                        if (ATOMIC_CPUMASK_TESTANDCLR(smp_idleinvl_reqs,
                                                      gd->gd_cpuid)) {
                                cpu_invltlb();
                                cpu_mfence();
                        }
                        crit_exit_gd(gd);
                } else {
                        /* cpu_idle_hlt == 0: never halt, just spin. */
                        splz();
                        __asm __volatile("sti");
                        stat->spin++;
                }
        }
}
1247
1248 /*
1249  * Called in a loop indirectly via Xcpustop
1250  */
1251 void
1252 cpu_smp_stopped(void)
1253 {
1254         globaldata_t gd = mycpu;
1255         volatile __uint64_t *ptr;
1256         __uint64_t ovalue;
1257
1258         ptr = CPUMASK_ADDR(started_cpus, gd->gd_cpuid);
1259         ovalue = *ptr;
1260         if ((ovalue & CPUMASK_SIMPLE(gd->gd_cpuid & 63)) == 0) {
1261                 if (cpu_mi_feature & CPU_MI_MONITOR) {
1262                         cpu_mmw_pause_long(__DEVOLATILE(void *, ptr), ovalue,
1263                                            cpu_mwait_hints[CPU_MWAIT_C1], 0);
1264                 } else {
1265                         cpu_halt();     /* depend on lapic timer */
1266                 }
1267         }
1268 }
1269
/*
 * This routine is called if a spinlock has been held through the
 * exponential backoff period and is seriously contested.  On a real cpu
 * we let it spin: the only action taken is a single cpu_pause().
 */
void
cpu_spinlock_contested(void)
{
        cpu_pause();
}
1280
1281 /*
1282  * Clear registers on exec
1283  */
1284 void
1285 exec_setregs(u_long entry, u_long stack, u_long ps_strings)
1286 {
1287         struct thread *td = curthread;
1288         struct lwp *lp = td->td_lwp;
1289         struct pcb *pcb = td->td_pcb;
1290         struct trapframe *regs = lp->lwp_md.md_regs;
1291
1292         /* was i386_user_cleanup() in NetBSD */
1293         user_ldt_free(pcb);
1294
1295         clear_quickret();
1296         bzero((char *)regs, sizeof(struct trapframe));
1297         regs->tf_rip = entry;
1298         regs->tf_rsp = ((stack - 8) & ~0xFul) + 8; /* align the stack */
1299         regs->tf_rdi = stack;           /* argv */
1300         regs->tf_rflags = PSL_USER | (regs->tf_rflags & PSL_T);
1301         regs->tf_ss = _udatasel;
1302         regs->tf_cs = _ucodesel;
1303         regs->tf_rbx = ps_strings;
1304
1305         /*
1306          * Reset the hardware debug registers if they were in use.
1307          * They won't have any meaning for the newly exec'd process.
1308          */
1309         if (pcb->pcb_flags & PCB_DBREGS) {
1310                 pcb->pcb_dr0 = 0;
1311                 pcb->pcb_dr1 = 0;
1312                 pcb->pcb_dr2 = 0;
1313                 pcb->pcb_dr3 = 0;
1314                 pcb->pcb_dr6 = 0;
1315                 pcb->pcb_dr7 = 0; /* JG set bit 10? */
1316                 if (pcb == td->td_pcb) {
1317                         /*
1318                          * Clear the debug registers on the running
1319                          * CPU, otherwise they will end up affecting
1320                          * the next process we switch to.
1321                          */
1322                         reset_dbregs();
1323                 }
1324                 pcb->pcb_flags &= ~PCB_DBREGS;
1325         }
1326
1327         /*
1328          * Initialize the math emulator (if any) for the current process.
1329          * Actually, just clear the bit that says that the emulator has
1330          * been initialized.  Initialization is delayed until the process
1331          * traps to the emulator (if it is done at all) mainly because
1332          * emulators don't provide an entry point for initialization.
1333          */
1334         pcb->pcb_flags &= ~FP_SOFTFP;
1335
1336         /*
1337          * NOTE: do not set CR0_TS here.  npxinit() must do it after clearing
1338          *       gd_npxthread.  Otherwise a preemptive interrupt thread
1339          *       may panic in npxdna().
1340          */
1341         crit_enter();
1342         load_cr0(rcr0() | CR0_MP);
1343
1344         /*
1345          * NOTE: The MSR values must be correct so we can return to
1346          *       userland.  gd_user_fs/gs must be correct so the switch
1347          *       code knows what the current MSR values are.
1348          */
1349         pcb->pcb_fsbase = 0;    /* Values loaded from PCB on switch */
1350         pcb->pcb_gsbase = 0;
1351         mdcpu->gd_user_fs = 0;  /* Cache of current MSR values */
1352         mdcpu->gd_user_gs = 0;
1353         wrmsr(MSR_FSBASE, 0);   /* Set MSR values for return to userland */
1354         wrmsr(MSR_KGSBASE, 0);
1355
1356         /* Initialize the npx (if any) for the current process. */
1357         npxinit();
1358         crit_exit();
1359
1360         pcb->pcb_ds = _udatasel;
1361         pcb->pcb_es = _udatasel;
1362         pcb->pcb_fs = _udatasel;
1363         pcb->pcb_gs = _udatasel;
1364 }
1365
1366 void
1367 cpu_setregs(void)
1368 {
1369         register_t cr0;
1370
1371         cr0 = rcr0();
1372         cr0 |= CR0_NE;                  /* Done by npxinit() */
1373         cr0 |= CR0_MP | CR0_TS;         /* Done at every execve() too. */
1374         cr0 |= CR0_WP | CR0_AM;
1375         load_cr0(cr0);
1376         load_gs(_udatasel);
1377 }
1378
1379 static int
1380 sysctl_machdep_adjkerntz(SYSCTL_HANDLER_ARGS)
1381 {
1382         int error;
1383         error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2,
1384                 req);
1385         if (!error && req->newptr)
1386                 resettodr();
1387         return (error);
1388 }
1389
/*
 * machdep.adjkerntz: integer backed by adjkerntz, routed through
 * sysctl_machdep_adjkerntz so that a successful write also calls
 * resettodr().
 */
SYSCTL_PROC(_machdep, CPU_ADJKERNTZ, adjkerntz, CTLTYPE_INT|CTLFLAG_RW,
        &adjkerntz, 0, sysctl_machdep_adjkerntz, "I", "");

/* machdep.disable_rtc_set: plain read/write integer, no description. */
SYSCTL_INT(_machdep, CPU_DISRTCSET, disable_rtc_set,
        CTLFLAG_RW, &disable_rtc_set, 0, "");

/* The bootinfo node is compiled out. */
#if 0 /* JG */
SYSCTL_STRUCT(_machdep, CPU_BOOTINFO, bootinfo,
        CTLFLAG_RD, &bootinfo, bootinfo, "");
#endif

/* machdep.wall_cmos_clock: plain read/write integer, no description. */
SYSCTL_INT(_machdep, CPU_WALLCLOCK, wall_cmos_clock,
        CTLFLAG_RW, &wall_cmos_clock, 0, "");

/* machdep.guessed_bootdev: read-only view of bootdev. */
extern u_long bootdev;          /* not a cdev_t - encoding is different */
SYSCTL_ULONG(_machdep, OID_AUTO, guessed_bootdev,
        CTLFLAG_RD, &bootdev, 0, "Boot device (not in cdev_t format)");
1407
1408 static int
1409 efi_map_sysctl_handler(SYSCTL_HANDLER_ARGS)
1410 {
1411         struct efi_map_header *efihdr;
1412         caddr_t kmdp;
1413         uint32_t efisize;
1414
1415         kmdp = preload_search_by_type("elf kernel");
1416         if (kmdp == NULL)
1417                 kmdp = preload_search_by_type("elf64 kernel");
1418         efihdr = (struct efi_map_header *)preload_search_info(kmdp,
1419             MODINFO_METADATA | MODINFOMD_EFI_MAP);
1420         if (efihdr == NULL)
1421                 return (0);
1422         efisize = *((uint32_t *)efihdr - 1);
1423         return (SYSCTL_OUT(req, efihdr, efisize));
1424 }
1425 SYSCTL_PROC(_machdep, OID_AUTO, efi_map, CTLTYPE_OPAQUE|CTLFLAG_RD, NULL, 0,
1426     efi_map_sysctl_handler, "S,efi_map_header", "Raw EFI Memory Map");
1427
1428 /*
1429  * Initialize 386 and configure to run kernel
1430  */
1431
1432 /*
1433  * Initialize segments & interrupt table
1434  */
1435
int _default_ldt;
/* One array holding NGDT descriptor slots for each of MAXCPU cpus. */
struct user_segment_descriptor gdt[NGDT * MAXCPU];      /* global descriptor table */
/* Per-cpu interrupt descriptor tables, indexed [cpu][vector]. */
struct gate_descriptor idt_arr[MAXCPU][NIDT];
#if 0 /* JG */
union descriptor ldt[NLDT];             /* local descriptor table */
#endif

/* table descriptors - used to load tables by cpu */
struct region_descriptor r_gdt;
struct region_descriptor r_idt_arr[MAXCPU];

/* JG proc0paddr is a virtual address */
void *proc0paddr;
/* JG alignment? */
/* Statically reserved buffer of LWKT_THREAD_STACK bytes. */
char proc0paddr_buff[LWKT_THREAD_STACK];
1451
1452
/*
 * software prototypes -- in more palatable form
 *
 * One initializer per GDT slot, in slot order.  The fields of each entry
 * are positional: base, limit, type, privilege level, present, long,
 * default operand size, granularity.  The TSS descriptor is double size,
 * so slot 6 is followed by an all-zero placeholder entry for slot 7.
 */
struct soft_segment_descriptor gdt_segs[] = {
/* GNULL_SEL    0 Null Descriptor */
{       0x0,                    /* segment base address  */
        0x0,                    /* length */
        0,                      /* segment type */
        0,                      /* segment descriptor privilege level */
        0,                      /* segment descriptor present */
        0,                      /* long */
        0,                      /* default 32 vs 16 bit size */
        0                       /* limit granularity (byte/page units)*/ },
/* GCODE_SEL    1 Code Descriptor for kernel */
{       0x0,                    /* segment base address  */
        0xfffff,                /* length - all address space */
        SDT_MEMERA,             /* segment type */
        SEL_KPL,                /* segment descriptor privilege level */
        1,                      /* segment descriptor present */
        1,                      /* long */
        0,                      /* default 32 vs 16 bit size */
        1                       /* limit granularity (byte/page units)*/ },
/* GDATA_SEL    2 Data Descriptor for kernel */
{       0x0,                    /* segment base address  */
        0xfffff,                /* length - all address space */
        SDT_MEMRWA,             /* segment type */
        SEL_KPL,                /* segment descriptor privilege level */
        1,                      /* segment descriptor present */
        1,                      /* long */
        0,                      /* default 32 vs 16 bit size */
        1                       /* limit granularity (byte/page units)*/ },
/* GUCODE32_SEL 3 32 bit Code Descriptor for user */
{       0x0,                    /* segment base address  */
        0xfffff,                /* length - all address space */
        SDT_MEMERA,             /* segment type */
        SEL_UPL,                /* segment descriptor privilege level */
        1,                      /* segment descriptor present */
        0,                      /* long */
        1,                      /* default 32 vs 16 bit size */
        1                       /* limit granularity (byte/page units)*/ },
/* GUDATA_SEL   4 32/64 bit Data Descriptor for user */
{       0x0,                    /* segment base address  */
        0xfffff,                /* length - all address space */
        SDT_MEMRWA,             /* segment type */
        SEL_UPL,                /* segment descriptor privilege level */
        1,                      /* segment descriptor present */
        0,                      /* long */
        1,                      /* default 32 vs 16 bit size */
        1                       /* limit granularity (byte/page units)*/ },
/* GUCODE_SEL   5 64 bit Code Descriptor for user */
{       0x0,                    /* segment base address  */
        0xfffff,                /* length - all address space */
        SDT_MEMERA,             /* segment type */
        SEL_UPL,                /* segment descriptor privilege level */
        1,                      /* segment descriptor present */
        1,                      /* long */
        0,                      /* default 32 vs 16 bit size */
        1                       /* limit granularity (byte/page units)*/ },
/* GPROC0_SEL   6 Proc 0 Tss Descriptor */
{
        0x0,                    /* segment base address */
        sizeof(struct x86_64tss)-1,/* length - size of the TSS minus one */
        SDT_SYSTSS,             /* segment type */
        SEL_KPL,                /* segment descriptor privilege level */
        1,                      /* segment descriptor present */
        0,                      /* long */
        0,                      /* unused - default 32 vs 16 bit size */
        0                       /* limit granularity (byte/page units)*/ },
/* Actually, the TSS is a system descriptor which is double size */
{       0x0,                    /* segment base address  */
        0x0,                    /* length */
        0,                      /* segment type */
        0,                      /* segment descriptor privilege level */
        0,                      /* segment descriptor present */
        0,                      /* long */
        0,                      /* default 32 vs 16 bit size */
        0                       /* limit granularity (byte/page units)*/ },
/* GUGS32_SEL   8 32 bit GS Descriptor for user */
{       0x0,                    /* segment base address  */
        0xfffff,                /* length - all address space */
        SDT_MEMRWA,             /* segment type */
        SEL_UPL,                /* segment descriptor privilege level */
        1,                      /* segment descriptor present */
        0,                      /* long */
        1,                      /* default 32 vs 16 bit size */
        1                       /* limit granularity (byte/page units)*/ },
};
1538
1539 /*
1540  * Program vector 'idx' of every per-cpu IDT in idt_arr[] (all MAXCPU of
1541  * them) to dispatch to 'func' through the kernel code selector.
1542  *
1543  * typ - value stored in the gate's gd_type field
1544  * dpl - value stored in the gate's gd_dpl field
1545  * ist - value stored in the gate's gd_ist field
1546  */
1547 void
1548 setidt_global(int idx, inthand_t *func, int typ, int dpl, int ist)
1549 {
1550         const uintptr_t entry = (uintptr_t)func;
1551         struct gate_descriptor *gate;
1552         int n;
1553
1554         for (n = 0; n < MAXCPU; ++n) {
1555                 gate = &idt_arr[n][idx];
1556
1557                 /* the entry address is split across two fields */
1558                 gate->gd_looffset = entry;
1559                 gate->gd_selector = GSEL(GCODE_SEL, SEL_KPL);
1560                 gate->gd_ist = ist;
1561                 gate->gd_xx = 0;
1562                 gate->gd_type = typ;
1563                 gate->gd_dpl = dpl;
1564                 gate->gd_p = 1;
1565                 gate->gd_hioffset = entry >> 16;
1566         }
1567 }
1557
1558 /*
1559  * Program vector 'idx' in the IDT of a single cpu.  'cpu' must lie in
1560  * [0, ncpus); the KASSERT rejects anything else.  The remaining
1561  * arguments are stored exactly as setidt_global() stores them.
1562  */
1563 void
1564 setidt(int idx, inthand_t *func, int typ, int dpl, int ist, int cpu)
1565 {
1566         const uintptr_t entry = (uintptr_t)func;
1567         struct gate_descriptor *gate;
1568
1569         KASSERT(cpu >= 0 && cpu < ncpus, ("invalid cpu %d", cpu));
1570         gate = &idt_arr[cpu][idx];
1571
1572         /* the entry address is split across two fields */
1573         gate->gd_looffset = entry;
1574         gate->gd_selector = GSEL(GCODE_SEL, SEL_KPL);
1575         gate->gd_ist = ist;
1576         gate->gd_xx = 0;
1577         gate->gd_type = typ;
1578         gate->gd_dpl = dpl;
1579         gate->gd_p = 1;
1580         gate->gd_hioffset = entry >> 16;
1581 }
1575
1576 /* IDTVEC(foo) names the entry point symbol formed from X and foo. */
1577 #define IDTVEC(name)    __CONCAT(X,name)
1578
1579 /* Entry points referenced through IDTVEC(); only declared here. */
1580 extern inthand_t IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt);
1581 extern inthand_t IDTVEC(ofl), IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna);
1582 extern inthand_t IDTVEC(fpusegm), IDTVEC(tss), IDTVEC(missing), IDTVEC(stk);
1583 extern inthand_t IDTVEC(prot), IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd);
1584 extern inthand_t IDTVEC(fpu), IDTVEC(align), IDTVEC(xmm), IDTVEC(dblfault);
1585 extern inthand_t IDTVEC(fast_syscall), IDTVEC(fast_syscall32);
1585
1586 /*
1587  * Unpack the user segment descriptor *sd into the software descriptor
1588  * *ssd.  This is the inverse of ssdtosd(): every field that ssdtosd()
1589  * packs, including the long flag, is copied back.
1590  */
1591 void
1592 sdtossd(struct user_segment_descriptor *sd, struct soft_segment_descriptor *ssd)
1593 {
1594         /*
1595          * Widen sd_hibase before shifting so the shift is not evaluated
1596          * in (signed) int should the bit-field promote to int.
1597          */
1598         ssd->ssd_base  = ((uint64_t)sd->sd_hibase << 24) | sd->sd_lobase;
1599         ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit;
1600         ssd->ssd_type  = sd->sd_type;
1601         ssd->ssd_dpl   = sd->sd_dpl;
1602         ssd->ssd_p     = sd->sd_p;
1603         /* the long flag has to survive the round trip as well */
1604         ssd->ssd_long  = sd->sd_long;
1605         ssd->ssd_def32 = sd->sd_def32;
1606         ssd->ssd_gran  = sd->sd_gran;
1607 }
1597
1598 /*
1599  * Pack the software descriptor *ssd into the user segment descriptor
1600  * *sd.  The base is split into a 24 bit low and an 8 bit high part, the
1601  * limit into a 16 bit low and a 4 bit high part; all other fields are
1602  * copied one for one.
1603  */
1604 void
1605 ssdtosd(struct soft_segment_descriptor *ssd, struct user_segment_descriptor *sd)
1606 {
1607         const uint64_t base = ssd->ssd_base;
1608         const uint64_t limit = ssd->ssd_limit;
1609
1610         sd->sd_lobase = base & 0xffffff;
1611         sd->sd_hibase = (base >> 24) & 0xff;
1612         sd->sd_lolimit = limit & 0xffff;
1613         sd->sd_hilimit = (limit >> 16) & 0xf;
1614
1615         sd->sd_type = ssd->ssd_type;
1616         sd->sd_dpl = ssd->ssd_dpl;
1617         sd->sd_p = ssd->ssd_p;
1618         sd->sd_long = ssd->ssd_long;
1619         sd->sd_def32 = ssd->ssd_def32;
1620         sd->sd_gran = ssd->ssd_gran;
1621 }
1613
1614 /*
1615  * Pack the software descriptor *ssd into the system segment descriptor
1616  * *sd.  The low 24 bits of the base go to sd_lobase and the next 40 bits
1617  * to sd_hibase; the limit is split into a 16 bit low and a 4 bit high
1618  * part.  Type, dpl, present and granularity are copied one for one.
1619  */
1620 void
1621 ssdtosyssd(struct soft_segment_descriptor *ssd,
1622     struct system_segment_descriptor *sd)
1623 {
1624         const uint64_t base = ssd->ssd_base;
1625         const uint64_t limit = ssd->ssd_limit;
1626
1627         sd->sd_lobase = base & 0xffffff;
1628         sd->sd_hibase = (base >> 24) & 0xfffffffffful;
1629         sd->sd_lolimit = limit & 0xffff;
1630         sd->sd_hilimit = (limit >> 16) & 0xf;
1631
1632         sd->sd_type = ssd->ssd_type;
1633         sd->sd_dpl = ssd->ssd_dpl;
1634         sd->sd_p = ssd->ssd_p;
1635         sd->sd_gran = ssd->ssd_gran;
1636 }
1628
1629 /*
1630  * Populate the (physmap) array with base/bound pairs describing the
1631  * available physical memory in the system, then test this memory and
1632  * build the phys_avail array describing the actually-available memory.
1633  *
1634  * The map is supplied by the loader, either as an EFI memory map or as
1635  * a BIOS SMAP (INT 15:E820); if neither is present the kernel panics.
1636  *
1637  * Total memory size may be set by the kernel environment variable
1638  * hw.physmem or the compile-time define MAXMEM.
1639  *
1640  * Memory is aligned to PHYSMAP_ALIGN which must be a multiple
1641  * of PAGE_SIZE.  This also greatly reduces the memory test time
1642  * which would otherwise be excessive on machines with > 8G of ram.
1643  *
1644  * XXX first should be vm_paddr_t.
1645  */
1646
1647 /*
1648  * Alignment applied to every physmap[] range; also the step of the
1649  * memory test in getmemsize().  The replacement lists are fully
1650  * parenthesized so the macros behave as single operands in any
1651  * expression.
1652  */
1653 #define PHYSMAP_ALIGN           ((vm_paddr_t)(128 * 1024))
1654 #define PHYSMAP_ALIGN_MASK      ((vm_paddr_t)(PHYSMAP_ALIGN - 1))
1655 /* number of vm_paddr_t slots in physmap[] (two per range) */
1656 #define PHYSMAP_SIZE            VM_PHYSSEG_MAX
1657
1658 /* base/bound pairs: physmap[i] is the start, physmap[i + 1] the end */
1659 vm_paddr_t physmap[PHYSMAP_SIZE];
1660 /* loader-supplied BIOS SMAP: first entry, scan cursor, end of the map */
1661 struct bios_smap *smapbase, *smap, *smapend;
1662 /* loader-supplied EFI memory map header, NULL if there is none */
1663 struct efi_map_header *efihdrbase;
1664 /* size of the SMAP in bytes, read from the word preceding smapbase */
1665 u_int32_t smapsize;
1666
1667 /*
1668  * Step used by the memory test in getmemsize() to skip over the middle
1669  * of a range instead of testing every PHYSMAP_ALIGN block.
1670  */
1671 #define PHYSMAP_HANDWAVE        ((vm_paddr_t)(2 * 1024 * 1024))
1672 #define PHYSMAP_HANDWAVE_MASK   (PHYSMAP_HANDWAVE - 1)
1658
1659 /*
1660  * Append the usable ranges of the loader-supplied BIOS SMAP to physmap[].
1661  *
1662  * On entry *physmap_idx is the index of the last base/bound pair in
1663  * physmap[]; it is advanced by two for every new pair.  Only entries of
1664  * type SMAP_TYPE_MEMORY with a non-zero length are used.  An entry that
1665  * starts below the end of an already recorded range is ignored, and an
1666  * entry that starts exactly at the end of the last range is merged
1667  * into it.  Realmem is increased by the length of every entry that
1668  * passes these checks.
1669  *
1670  * When physmap[] is full the remaining entries are dropped and
1671  * *physmap_idx is left at the last valid pair, so callers can keep
1672  * indexing physmap[*physmap_idx + 1].
1673  */
1674 static void
1675 add_smap_entries(int *physmap_idx)
1676 {
1677         int i;
1678
1679         /* the byte size of the map is stored right in front of it */
1680         smapsize = *((u_int32_t *)smapbase - 1);
1681         smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize);
1682
1683         for (smap = smapbase; smap < smapend; smap++) {
1684                 if (boothowto & RB_VERBOSE)
1685                         kprintf("SMAP type=%02x base=%016lx len=%016lx\n",
1686                             smap->type, smap->base, smap->length);
1687
1688                 if (smap->type != SMAP_TYPE_MEMORY)
1689                         continue;
1690
1691                 if (smap->length == 0)
1692                         continue;
1693
1694                 /* reject entries starting inside or below a known range */
1695                 for (i = 0; i <= *physmap_idx; i += 2) {
1696                         if (smap->base < physmap[i + 1]) {
1697                                 if (boothowto & RB_VERBOSE) {
1698                                         kprintf("Overlapping or non-monotonic "
1699                                                 "memory region, ignoring "
1700                                                 "second region\n");
1701                                 }
1702                                 break;
1703                         }
1704                 }
1705                 if (i <= *physmap_idx)
1706                         continue;
1707
1708                 Realmem += smap->length;
1709
1710                 /* contiguous with the last range: just extend it */
1711                 if (smap->base == physmap[*physmap_idx + 1]) {
1712                         physmap[*physmap_idx + 1] += smap->length;
1713                         continue;
1714                 }
1715
1716                 /*
1717                  * A new pair needs slots [idx + 2] and [idx + 3].  Check
1718                  * before advancing so that *physmap_idx never ends up
1719                  * pointing past the end of physmap[].
1720                  */
1721                 if (*physmap_idx + 3 >= PHYSMAP_SIZE) {
1722                         kprintf("Too many segments in the physical "
1723                                 "address map, giving up\n");
1724                         break;
1725                 }
1726                 *physmap_idx += 2;
1727                 physmap[*physmap_idx] = smap->base;
1728                 physmap[*physmap_idx + 1] = smap->base + smap->length;
1729         }
1730 }
1708
1709 /*
1710  * Append the usable ranges of the loader-supplied EFI memory map to
1711  * physmap[].
1712  *
1713  * On entry *physmap_idx is the index of the last base/bound pair in
1714  * physmap[]; it is advanced by two for every new pair.  Descriptors of
1715  * type CODE, DATA, BS_CODE, BS_DATA and FREE are used, all others are
1716  * skipped.  A descriptor that starts exactly at the end of the last
1717  * range is merged into it.  Realmem is increased by the size of every
1718  * usable descriptor.  With RB_VERBOSE every descriptor is printed.
1719  *
1720  * When physmap[] is full the remaining descriptors are dropped and
1721  * *physmap_idx is left at the last valid pair, so callers can keep
1722  * indexing physmap[*physmap_idx + 1].
1723  */
1724 static void
1725 add_efi_map_entries(int *physmap_idx)
1726 {
1727         struct efi_md *map, *p;
1728         const char *type;
1729         size_t efisz;
1730         int i, ndesc;
1731
1732         /* printable names, indexed by md_type up to EFI_MD_TYPE_PALCODE */
1733         static const char *types[] = {
1734                 "Reserved",
1735                 "LoaderCode",
1736                 "LoaderData",
1737                 "BootServicesCode",
1738                 "BootServicesData",
1739                 "RuntimeServicesCode",
1740                 "RuntimeServicesData",
1741                 "ConventionalMemory",
1742                 "UnusableMemory",
1743                 "ACPIReclaimMemory",
1744                 "ACPIMemoryNVS",
1745                 "MemoryMappedIO",
1746                 "MemoryMappedIOPortSpace",
1747                 "PalCode"
1748         };
1749
1750         /*
1751          * Memory map data provided by UEFI via the GetMemoryMap
1752          * Boot Services API.  The descriptors follow the header, whose
1753          * size is rounded up to a multiple of 16 bytes.
1754          */
1755         efisz = (sizeof(struct efi_map_header) + 0xf) & ~0xf;
1756         map = (struct efi_md *)((uint8_t *)efihdrbase + efisz);
1757
1758         /* a zero descriptor size would make the division below trap */
1759         if (efihdrbase->descriptor_size == 0)
1760                 return;
1761         ndesc = efihdrbase->memory_size / efihdrbase->descriptor_size;
1762
1763         if (boothowto & RB_VERBOSE)
1764                 kprintf("%23s %12s %12s %8s %4s\n",
1765                     "Type", "Physical", "Virtual", "#Pages", "Attr");
1766
1767         for (i = 0, p = map; i < ndesc; i++,
1768             p = efi_next_descriptor(p, efihdrbase->descriptor_size)) {
1769                 if (boothowto & RB_VERBOSE) {
1770                         if (p->md_type <= EFI_MD_TYPE_PALCODE)
1771                                 type = types[p->md_type];
1772                         else
1773                                 type = "<INVALID>";
1774                         kprintf("%23s %012lx %12p %08lx ", type, p->md_phys,
1775                             p->md_virt, p->md_pages);
1776                         if (p->md_attr & EFI_MD_ATTR_UC)
1777                                 kprintf("UC ");
1778                         if (p->md_attr & EFI_MD_ATTR_WC)
1779                                 kprintf("WC ");
1780                         if (p->md_attr & EFI_MD_ATTR_WT)
1781                                 kprintf("WT ");
1782                         if (p->md_attr & EFI_MD_ATTR_WB)
1783                                 kprintf("WB ");
1784                         if (p->md_attr & EFI_MD_ATTR_UCE)
1785                                 kprintf("UCE ");
1786                         if (p->md_attr & EFI_MD_ATTR_WP)
1787                                 kprintf("WP ");
1788                         if (p->md_attr & EFI_MD_ATTR_RP)
1789                                 kprintf("RP ");
1790                         if (p->md_attr & EFI_MD_ATTR_XP)
1791                                 kprintf("XP ");
1792                         if (p->md_attr & EFI_MD_ATTR_RT)
1793                                 kprintf("RUNTIME");
1794                         kprintf("\n");
1795                 }
1796
1797                 switch (p->md_type) {
1798                 case EFI_MD_TYPE_CODE:
1799                 case EFI_MD_TYPE_DATA:
1800                 case EFI_MD_TYPE_BS_CODE:
1801                 case EFI_MD_TYPE_BS_DATA:
1802                 case EFI_MD_TYPE_FREE:
1803                         /*
1804                          * We're allowed to use any entry with these types.
1805                          */
1806                         break;
1807                 default:
1808                         continue;
1809                 }
1810
1811                 Realmem += p->md_pages * PAGE_SIZE;
1812
1813                 /* contiguous with the last range: just extend it */
1814                 if (p->md_phys == physmap[*physmap_idx + 1]) {
1815                         physmap[*physmap_idx + 1] += p->md_pages * PAGE_SIZE;
1816                         continue;
1817                 }
1818
1819                 /*
1820                  * A new pair needs slots [idx + 2] and [idx + 3].  Check
1821                  * before advancing so that *physmap_idx never ends up
1822                  * pointing past the end of physmap[].
1823                  */
1824                 if (*physmap_idx + 3 >= PHYSMAP_SIZE) {
1825                         kprintf("Too many segments in the physical "
1826                                 "address map, giving up\n");
1827                         break;
1828                 }
1829                 *physmap_idx += 2;
1830                 physmap[*physmap_idx] = p->md_phys;
1831                 physmap[*physmap_idx + 1] = p->md_phys + p->md_pages * PAGE_SIZE;
1832         }
1833 }
1810
1811 /* EFI framebuffer description; filled in by probe_efi_fb(). */
1812 struct fb_info efi_fb_info;
1813 /* Non-zero once probe_efi_fb() has found framebuffer metadata. */
1814 static int have_efi_framebuffer;
1813
1814 /*
1815  * Choose the kernel virtual address of the EFI framebuffer.
1816  *
1817  * direct_map != 0: use the direct-map address of efi_fb_info.paddr, but
1818  *                  only if the whole framebuffer lies inside the DMAP
1819  *                  window; otherwise efi_fb_info.vaddr is left untouched.
1820  * direct_map == 0: map the framebuffer with pmap_mapdev_attr() using
1821  *                  PAT_WRITE_COMBINING.
1822  */
1823 static void
1824 efi_fb_init_vaddr(int direct_map)
1825 {
1826         uint64_t sz;
1827         vm_offset_t addr;
1828
1829         /*
1830          * Widen before multiplying so the product is computed in 64 bits
1831          * whatever the (possibly narrower) types of stride and height.
1832          */
1833         sz = (uint64_t)efi_fb_info.stride * efi_fb_info.height;
1834
1835         if (direct_map) {
1836                 addr = PHYS_TO_DMAP(efi_fb_info.paddr);
1837                 if (addr >= DMAP_MIN_ADDRESS && addr + sz < DMAP_MAX_ADDRESS)
1838                         efi_fb_info.vaddr = addr;
1839         } else {
1840                 efi_fb_info.vaddr = (vm_offset_t)pmap_mapdev_attr(
1841                     efi_fb_info.paddr, sz, PAT_WRITE_COMBINING);
1842         }
1843 }
1832
1833 /*
1834  * Look up the EFI framebuffer metadata passed in by the loader and
1835  * record it in efi_fb_info.
1836  *
1837  * early - if non-zero no mapping is set up and vaddr is left at 0 on the
1838  *         first probe; otherwise the framebuffer is mapped through
1839  *         efi_fb_init_vaddr(0).
1840  *
1841  * Once the framebuffer is known, further calls only (re)map it, and only
1842  * when 'early' is zero and vaddr is still 0 or the direct-map address.
1843  *
1844  * Returns 0 if a framebuffer is known, 1 if the loader supplied none.
1845  */
1846 int
1847 probe_efi_fb(int early)
1848 {
1849         struct fb_info *fb = &efi_fb_info;
1850         struct efi_fb *meta;
1851         caddr_t kern;
1852
1853         if (have_efi_framebuffer) {
1854                 /* already probed: at most replace the mapping */
1855                 if (early)
1856                         return 0;
1857                 if (fb->vaddr == 0 ||
1858                     fb->vaddr == PHYS_TO_DMAP(fb->paddr))
1859                         efi_fb_init_vaddr(0);
1860                 return 0;
1861         }
1862
1863         kern = preload_search_by_type("elf kernel");
1864         if (kern == NULL)
1865                 kern = preload_search_by_type("elf64 kernel");
1866         meta = (struct efi_fb *)preload_search_info(kern,
1867             MODINFO_METADATA | MODINFOMD_EFI_FB);
1868         if (meta == NULL)
1869                 return 1;
1870
1871         have_efi_framebuffer = 1;
1872
1873         fb->is_vga_boot_display = 1;
1874         fb->width = meta->fb_width;
1875         fb->height = meta->fb_height;
1876         /* presumably fb_stride counts 4-byte pixels - TODO confirm */
1877         fb->stride = meta->fb_stride * 4;
1878         fb->depth = 32;
1879         fb->paddr = meta->fb_addr;
1880         if (early)
1881                 fb->vaddr = 0;
1882         else
1883                 efi_fb_init_vaddr(0);
1884         fb->fbops.fb_set_par = NULL;
1885         fb->fbops.fb_blank = NULL;
1886         fb->fbops.fb_debug_enter = NULL;
1887         fb->device = NULL;
1888
1889         return 0;
1890 }
1875
1876 /*
1877  * SYSINIT hook: run the non-early EFI framebuffer probe.  The result of
1878  * probe_efi_fb() is ignored and 'arg' is not used.
1879  */
1880 static void
1881 efifb_startup(void *arg)
1882 {
1883         (void)probe_efi_fb(0);
1884 }
1885
1886 SYSINIT(efi_fb_info, SI_BOOT1_POST, SI_ORDER_FIRST, efifb_startup, NULL);
1883
/*
 * Build physmap[] from the loader-supplied memory map, clip it to Maxmem,
 * test the memory and fill in phys_avail[] and dump_avail[].
 *
 * kmdp  - preload metadata that is searched for the EFI map and the BIOS
 *         SMAP.  Panics if neither is present.
 * first - presumably the first physical address past the memory already
 *         used by the kernel (TODO confirm): it is handed to
 *         pmap_bootstrap() by reference, rounded up to PHYSMAP_ALIGN, and
 *         blocks in [0x200000, first) are kept out of phys_avail[].
 *
 * Also sets base_memory, ebda_addr, Maxmem, physmem and avail_end, maps
 * the message buffer and, if an EFI framebuffer is known, tries to point
 * it at its direct-map address.
 */
1884 static void
1885 getmemsize(caddr_t kmdp, u_int64_t first)
1886 {
1887         int off, physmap_idx, pa_indx, da_indx;
1888         int i, j;
1889         vm_paddr_t pa;
1890         vm_paddr_t msgbuf_size;
1891         u_long physmem_tunable;
1892         pt_entry_t *pte;
1893         quad_t dcons_addr, dcons_size;
1894
1895         bzero(physmap, sizeof(physmap));
1896         physmap_idx = 0;
1897
1898         /*
1899          * Get the memory map kindly supplied by the loader: the EFI map
1900          * if present, otherwise the INT 15:E820 (SMAP) data.  For the latter
1901          * subr_module.c says:
1902          * "Consumer may safely assume that size value precedes data."
1903          * ie: an int32_t immediately precedes smap.
1904          */
1905         efihdrbase = (struct efi_map_header *)preload_search_info(kmdp,
1906                      MODINFO_METADATA | MODINFOMD_EFI_MAP);
1907         smapbase = (struct bios_smap *)preload_search_info(kmdp,
1908                    MODINFO_METADATA | MODINFOMD_SMAP);
1909         if (smapbase == NULL && efihdrbase == NULL)
1910                 panic("No BIOS smap or EFI map info from loader!");
1911
1912         if (efihdrbase == NULL)
1913                 add_smap_entries(&physmap_idx);
1914         else
1915                 add_efi_map_entries(&physmap_idx);
1916
1917         base_memory = physmap[1] / 1024;
1918         /* make hole for AP bootstrap code */
1919         physmap[1] = mp_bootaddress(base_memory);
1920
1921         /* Save EBDA address, if any */
1922         ebda_addr = (u_long)(*(u_short *)(KERNBASE + 0x40e));
1923         ebda_addr <<= 4;
1924
1925         /*
1926          * Maxmem isn't the "maximum memory", it's one larger than the
1927          * highest page of the physical address space.  It should be
1928          * called something like "Maxphyspage".  We may adjust this
1929          * based on ``hw.physmem'' and the results of the memory test.
1930          */
1931         Maxmem = atop(physmap[physmap_idx + 1]);
1932
1933 #ifdef MAXMEM
1934         Maxmem = MAXMEM / 4;    /* presumably MAXMEM is in KB - TODO confirm */
1935 #endif
1936
1937         if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable))
1938                 Maxmem = atop(physmem_tunable);
1939
1940         /*
1941          * Don't allow MAXMEM or hw.physmem to extend the amount of memory
1942          * in the system.
1943          */
1944         if (Maxmem > atop(physmap[physmap_idx + 1]))
1945                 Maxmem = atop(physmap[physmap_idx + 1]);
1946
1947         /*
1948          * Blowing out the DMAP will blow up the system.
1949          */
1950         if (Maxmem > atop(DMAP_MAX_ADDRESS - DMAP_MIN_ADDRESS)) {
1951                 kprintf("Limiting Maxmem due to DMAP size\n");
1952                 Maxmem = atop(DMAP_MAX_ADDRESS - DMAP_MIN_ADDRESS);
1953         }
1954
1955         if (atop(physmap[physmap_idx + 1]) != Maxmem &&
1956             (boothowto & RB_VERBOSE)) {
1957                 kprintf("Physical memory use set to %ldK\n", Maxmem * 4);
1958         }
1959
1960         /*
1961          * Call pmap initialization to make new kernel address space
1962          *
1963          * Mask off page 0.
1964          */
1965         pmap_bootstrap(&first);
1966         physmap[0] = PAGE_SIZE;
1967
1968         /*
1969          * Align the physmap to PHYSMAP_ALIGN and cut out anything
1970          * exceeding Maxmem.  Ranges that end up empty are dropped by
1971          * compacting the array in place (j trails i).
1972          */
1972         for (i = j = 0; i <= physmap_idx; i += 2) {
1973                 if (physmap[i+1] > ptoa(Maxmem))
1974                         physmap[i+1] = ptoa(Maxmem);
1975                 physmap[i] = (physmap[i] + PHYSMAP_ALIGN_MASK) &
1976                              ~PHYSMAP_ALIGN_MASK;
1977                 physmap[i+1] = physmap[i+1] & ~PHYSMAP_ALIGN_MASK;
1978
1979                 physmap[j] = physmap[i];
1980                 physmap[j+1] = physmap[i+1];
1981
1982                 if (physmap[i] < physmap[i+1])
1983                         j += 2;
1984         }
1985         physmap_idx = j - 2;
1986
1987         /*
1988          * Align anything else used in the validation loop.
1989          */
1990         first = (first + PHYSMAP_ALIGN_MASK) & ~PHYSMAP_ALIGN_MASK;
1991
1992         /*
1993          * Size up each available chunk of physical memory.
1994          */
1995         pa_indx = 0;
1996         da_indx = 0;
1997         phys_avail[pa_indx].phys_beg = physmap[0];
1998         phys_avail[pa_indx].phys_end = physmap[0];
1999         dump_avail[da_indx].phys_beg = 0;
2000         dump_avail[da_indx].phys_end = physmap[0];
2001         pte = CMAP1;    /* PTE used below to map each tested block at CADDR1 */
2002
2003         /*
2004          * Get dcons buffer address
2005          */
2006         if (kgetenv_quad("dcons.addr", &dcons_addr) == 0 ||
2007             kgetenv_quad("dcons.size", &dcons_size) == 0)
2008                 dcons_addr = 0;
2009
2010         /*
2011          * Validate the physical memory.  The physical memory segments
2012          * have already been aligned to PHYSMAP_ALIGN which is a multiple
2013          * of PAGE_SIZE.
2014          */
2015         for (i = 0; i <= physmap_idx; i += 2) {
2016                 vm_paddr_t end;
2017                 vm_paddr_t incr = PHYSMAP_ALIGN;
2018
2019                 end = physmap[i + 1];
2020
2021                 for (pa = physmap[i]; pa < end; pa += incr) {
2022                         int page_bad, full;
2023                         volatile uint64_t *ptr = (uint64_t *)CADDR1;
2024                         uint64_t tmp;
2025
2026                         incr = PHYSMAP_ALIGN;
2027                         full = FALSE;
2028
2029                         /*
2030                          * block out kernel memory as not available.
2031                          */
2032                         if (pa >= 0x200000 && pa < first)
2033                                 goto do_dump_avail;
2034
2035                         /*
2036                          * block out dcons buffer
2037                          */
2038                         if (dcons_addr > 0
2039                             && pa >= trunc_page(dcons_addr)
2040                             && pa < dcons_addr + dcons_size) {
2041                                 goto do_dump_avail;
2042                         }
2043
2044                         page_bad = FALSE;
2045
2046                         /*
2047                          * Always test the first and last block supplied in
2048                          * the map entry, but it just takes too long to run
2049                          * the test these days and we already have to skip
2050                          * pages.  Handwave it on PHYSMAP_HANDWAVE boundaries.
2051                          */
2052                         if (pa != physmap[i]) {
2053                                 vm_paddr_t bytes = end - pa;
2054                                 if ((pa & PHYSMAP_HANDWAVE_MASK) == 0 &&
2055                                     bytes >= PHYSMAP_HANDWAVE + PHYSMAP_ALIGN) {
2056                                         incr = PHYSMAP_HANDWAVE;
2057                                         goto handwaved;
2058                                 }
2059                         }
2060
2061                         /*
2062                          * map page into kernel: valid, read/write,non-cacheable
2063                          */
2064                         *pte = pa |
2065                             kernel_pmap.pmap_bits[PG_V_IDX] |
2066                             kernel_pmap.pmap_bits[PG_RW_IDX] |
2067                             kernel_pmap.pmap_bits[PG_N_IDX];
2068                         cpu_invlpg(__DEVOLATILE(void *, ptr));
2069                         cpu_mfence();
2070
2071                         tmp = *ptr;
2072                         /*
2073                          * Test for alternating 1's and 0's
2074                          */
2075                         *ptr = 0xaaaaaaaaaaaaaaaaLLU;
2076                         cpu_mfence();
2077                         if (*ptr != 0xaaaaaaaaaaaaaaaaLLU)
2078                                 page_bad = TRUE;
2079                         /*
2080                          * Test for alternating 0's and 1's
2081                          */
2082                         *ptr = 0x5555555555555555LLU;
2083                         cpu_mfence();
2084                         if (*ptr != 0x5555555555555555LLU)
2085                                 page_bad = TRUE;
2086                         /*
2087                          * Test for all 1's
2088                          */
2089                         *ptr = 0xffffffffffffffffLLU;
2090                         cpu_mfence();
2091                         if (*ptr != 0xffffffffffffffffLLU)
2092                                 page_bad = TRUE;
2093                         /*
2094                          * Test for all 0's
2095                          */
2096                         *ptr = 0x0;
2097                         cpu_mfence();
2098                         if (*ptr != 0x0)
2099                                 page_bad = TRUE;
2100                         /*
2101                          * Restore original value.
2102                          */
2103                         *ptr = tmp;
2104 handwaved:     /* untested blocks arrive here with page_bad == FALSE */
2105
2106                         /*
2107                          * Adjust array of valid/good pages.
2108                          */
2109                         if (page_bad == TRUE)
2110                                 continue;
2111
2112                         /*
2113                          * If this good page is a continuation of the
2114                          * previous set of good pages, then just increase
2115                          * the end pointer. Otherwise start a new chunk.
2116                          * Note that "end" points one higher than end,
2117                          * making the range >= start and < end.
2118                          * A bad block is simply skipped (see above), so
2119                          * the next good block starts a new chunk.  If
2120                          * phys_avail[] is full, the scan of this map entry
2121                          * stops once dump_avail[] has been handled.
2122                          */
2123                         if (phys_avail[pa_indx].phys_end == pa) {
2124                                 phys_avail[pa_indx].phys_end += incr;
2125                         } else {
2126                                 ++pa_indx;
2127                                 if (pa_indx == PHYS_AVAIL_ARRAY_END) {
2128                                         kprintf(
2129                 "Too many holes in the physical address space, giving up\n");
2130                                         --pa_indx;
2131                                         full = TRUE;
2132                                         goto do_dump_avail;
2133                                 }
2134                                 phys_avail[pa_indx].phys_beg = pa;
2135                                 phys_avail[pa_indx].phys_end = pa + incr;
2136                         }
2137                         physmem += incr / PAGE_SIZE;
2138 do_dump_avail: /* also reached for blocks kept out of phys_avail[] */
2139                         if (dump_avail[da_indx].phys_end == pa) {
2140                                 dump_avail[da_indx].phys_end += incr;
2141                         } else {
2142                                 ++da_indx;
2143                                 if (da_indx == DUMP_AVAIL_ARRAY_END) {
2144                                         --da_indx;
2145                                         goto do_next;
2146                                 }
2147                                 dump_avail[da_indx].phys_beg = pa;
2148                                 dump_avail[da_indx].phys_end = pa + incr;
2149                         }
2150 do_next:
2151                         if (full)
2152                                 break;
2153                 }
2154         }
2155         *pte = 0;
2156         cpu_invltlb();
2157         cpu_mfence();
2158
2159         /*
2160          * The last chunk must contain at least one page plus the message
2161          * buffer to avoid complicating other code (message buffer address
2162          * calculation, etc.).
2163          */
2164         msgbuf_size = (MSGBUF_SIZE + PHYSMAP_ALIGN_MASK) & ~PHYSMAP_ALIGN_MASK;
2165
2166         while (phys_avail[pa_indx].phys_beg + PHYSMAP_ALIGN + msgbuf_size >=
2167                phys_avail[pa_indx].phys_end) {
2168                 physmem -= atop(phys_avail[pa_indx].phys_end -
2169                                 phys_avail[pa_indx].phys_beg);
2170                 phys_avail[pa_indx].phys_beg = 0;
2171                 phys_avail[pa_indx].phys_end = 0;
2172                 --pa_indx;
2173         }
2174
2175         Maxmem = atop(phys_avail[pa_indx].phys_end);
2176
2177         /* Trim off space for the message buffer. */
2178         phys_avail[pa_indx].phys_end -= msgbuf_size;
2179
2180         avail_end = phys_avail[pa_indx].phys_end;
2181
2182         /* Map the message buffer. */
2183         for (off = 0; off < msgbuf_size; off += PAGE_SIZE) {
2184                 pmap_kenter((vm_offset_t)msgbufp + off, avail_end + off);
2185         }
2186         /* Try to get EFI framebuffer working as early as possible */
2187         if (have_efi_framebuffer)
2188                 efi_fb_init_vaddr(1);
2189 }
2190
/* Interrupt ABI in use; hammer_time() initially sets it to MachIntrABI_ICU. */
2191 struct machintr_abi MachIntrABI;
2192
2193 /*
2194  * IDT VECTORS:
2195  *      0       Divide by zero
2196  *      1       Debug
2197  *      2       NMI
2198  *      3       BreakPoint
2199  *      4       OverFlow
2200  *      5       Bound-Range
2201  *      6       Invalid OpCode
2202  *      7       Device Not Available (x87)
2203  *      8       Double-Fault
2204  *      9       Coprocessor Segment overrun (unsupported, reserved)
2205  *      10      Invalid-TSS
2206  *      11      Segment not present
2207  *      12      Stack
2208  *      13      General Protection
2209  *      14      Page Fault
2210  *      15      Reserved
2211  *      16      x87 FP Exception pending
2212  *      17      Alignment Check
2213  *      18      Machine Check
2214  *      19      SIMD floating point
2215  *      20-31   reserved
2216  *      32-255  INTn/external sources
2217  */
2218 u_int64_t
2219 hammer_time(u_int64_t modulep, u_int64_t physfree)
2220 {
2221         caddr_t kmdp;
2222         int gsel_tss, x, cpu;
2223 #if 0 /* JG */
2224         int metadata_missing, off;
2225 #endif
2226         struct mdglobaldata *gd;
2227         u_int64_t msr;
2228
2229         /*
2230          * Prevent lowering of the ipl if we call tsleep() early.
2231          */
2232         gd = &CPU_prvspace[0]->mdglobaldata;
2233         bzero(gd, sizeof(*gd));
2234
2235         /*
2236          * Note: on both UP and SMP curthread must be set non-NULL
2237          * early in the boot sequence because the system assumes
2238          * that 'curthread' is never NULL.
2239          */
2240
2241         gd->mi.gd_curthread = &thread0;
2242         thread0.td_gd = &gd->mi;
2243
2244         atdevbase = ISA_HOLE_START + PTOV_OFFSET;
2245
2246 #if 0 /* JG */
2247         metadata_missing = 0;
2248         if (bootinfo.bi_modulep) {
2249                 preload_metadata = (caddr_t)bootinfo.bi_modulep + KERNBASE;
2250                 preload_bootstrap_relocate(KERNBASE);
2251         } else {
2252                 metadata_missing = 1;
2253         }
2254         if (bootinfo.bi_envp)
2255                 kern_envp = (caddr_t)bootinfo.bi_envp + KERNBASE;
2256 #endif
2257
2258         preload_metadata = (caddr_t)(uintptr_t)(modulep + PTOV_OFFSET);
2259         preload_bootstrap_relocate(PTOV_OFFSET);
2260         kmdp = preload_search_by_type("elf kernel");
2261         if (kmdp == NULL)
2262                 kmdp = preload_search_by_type("elf64 kernel");
2263         boothowto = MD_FETCH(kmdp, MODINFOMD_HOWTO, int);
2264         kern_envp = MD_FETCH(kmdp, MODINFOMD_ENVP, char *) + PTOV_OFFSET;
2265 #ifdef DDB
2266         ksym_start = MD_FETCH(kmdp, MODINFOMD_SSYM, uintptr_t);
2267         ksym_end = MD_FETCH(kmdp, MODINFOMD_ESYM, uintptr_t);
2268 #endif
2269         efi_systbl_phys = MD_FETCH(kmdp, MODINFOMD_FW_HANDLE, vm_paddr_t);
2270
2271         if (boothowto & RB_VERBOSE)
2272                 bootverbose++;
2273
2274         /*
2275          * Default MachIntrABI to ICU
2276          */
2277         MachIntrABI = MachIntrABI_ICU;
2278
2279         /*
2280          * start with one cpu.  Note: with one cpu, ncpus2_shift, ncpus2_mask,
2281          * and ncpus_fit_mask remain 0.
2282          */
2283         ncpus = 1;
2284         ncpus2 = 1;
2285         ncpus_fit = 1;
2286         /* Init basic tunables, hz etc */
2287         init_param1();
2288
2289         /*
2290          * make gdt memory segments
2291          */
2292         gdt_segs[GPROC0_SEL].ssd_base =
2293                 (uintptr_t) &CPU_prvspace[0]->mdglobaldata.gd_common_tss;
2294
2295         gd->mi.gd_prvspace = CPU_prvspace[0];
2296
2297         for (x = 0; x < NGDT; x++) {
2298                 if (x != GPROC0_SEL && x != (GPROC0_SEL + 1))
2299                         ssdtosd(&gdt_segs[x], &gdt[x]);
2300         }
2301         ssdtosyssd(&gdt_segs[GPROC0_SEL],
2302             (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);
2303
2304         r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
2305         r_gdt.rd_base =  (long) gdt;
2306         lgdt(&r_gdt);
2307
2308         wrmsr(MSR_FSBASE, 0);           /* User value */
2309         wrmsr(MSR_GSBASE, (u_int64_t)&gd->mi);
2310         wrmsr(MSR_KGSBASE, 0);          /* User value while in the kernel */
2311
2312         mi_gdinit(&gd->mi, 0);
2313         cpu_gdinit(gd, 0);
2314         proc0paddr = proc0paddr_buff;
2315         mi_proc0init(&gd->mi, proc0paddr);
2316         safepri = TDPRI_MAX;
2317
2318         /* spinlocks and the BGL */
2319         init_locks();
2320
2321         /* exceptions */
2322         for (x = 0; x < NIDT; x++)
2323                 setidt_global(x, &IDTVEC(rsvd), SDT_SYSIGT, SEL_KPL, 0);
2324         setidt_global(IDT_DE, &IDTVEC(div),  SDT_SYSIGT, SEL_KPL, 0);
2325         setidt_global(IDT_DB, &IDTVEC(dbg),  SDT_SYSIGT, SEL_KPL, 0);
2326         setidt_global(IDT_NMI, &IDTVEC(nmi),  SDT_SYSIGT, SEL_KPL, 1);
2327         setidt_global(IDT_BP, &IDTVEC(bpt),  SDT_SYSIGT, SEL_UPL, 0);
2328         setidt_global(IDT_OF, &IDTVEC(ofl),  SDT_SYSIGT, SEL_KPL, 0);
2329         setidt_global(IDT_BR, &IDTVEC(bnd),  SDT_SYSIGT, SEL_KPL, 0);
2330         setidt_global(IDT_UD, &IDTVEC(ill),  SDT_SYSIGT, SEL_KPL, 0);
2331         setidt_global(IDT_NM, &IDTVEC(dna),  SDT_SYSIGT, SEL_KPL, 0);
2332         setidt_global(IDT_DF, &IDTVEC(dblfault), SDT_SYSIGT, SEL_KPL, 1);
2333         setidt_global(IDT_FPUGP, &IDTVEC(fpusegm),  SDT_SYSIGT, SEL_KPL, 0);
2334         setidt_global(IDT_TS, &IDTVEC(tss),  SDT_SYSIGT, SEL_KPL, 0);
2335         setidt_global(IDT_NP, &IDTVEC(missing),  SDT_SYSIGT, SEL_KPL, 0);
2336         setidt_global(IDT_SS, &IDTVEC(stk),  SDT_SYSIGT, SEL_KPL, 0);
2337         setidt_global(IDT_GP, &IDTVEC(prot),  SDT_SYSIGT, SEL_KPL, 0);
2338         setidt_global(IDT_PF, &IDTVEC(page),  SDT_SYSIGT, SEL_KPL, 0);
2339         setidt_global(IDT_MF, &IDTVEC(fpu),  SDT_SYSIGT, SEL_KPL, 0);
2340         setidt_global(IDT_AC, &IDTVEC(align), SDT_SYSIGT, SEL_KPL, 0);
2341         setidt_global(IDT_MC, &IDTVEC(mchk),  SDT_SYSIGT, SEL_KPL, 0);
2342         setidt_global(IDT_XF, &IDTVEC(xmm), SDT_SYSIGT, SEL_KPL, 0);
2343
2344         for (cpu = 0; cpu < MAXCPU; ++cpu) {
2345                 r_idt_arr[cpu].rd_limit = sizeof(idt_arr[cpu]) - 1;
2346                 r_idt_arr[cpu].rd_base = (long) &idt_arr[cpu][0];
2347         }
2348
2349         lidt(&r_idt_arr[0]);
2350
2351         /*
2352          * Initialize the console before we print anything out.
2353          */
2354         cninit();
2355
2356 #if 0 /* JG */
2357         if (metadata_missing)
2358                 kprintf("WARNING: loader(8) metadata is missing!\n");
2359 #endif
2360
2361 #if     NISA >0
2362         elcr_probe();
2363         isa_defaultirq();
2364 #endif
2365         rand_initialize();
2366
2367         /*
2368          * Initialize IRQ mapping
2369          *
2370          * NOTE:
2371          * SHOULD be after elcr_probe()
2372          */
2373         MachIntrABI_ICU.initmap();
2374         MachIntrABI_IOAPIC.initmap();
2375
2376 #ifdef DDB
2377         kdb_init();
2378         if (boothowto & RB_KDB)
2379                 Debugger("Boot flags requested debugger");
2380 #endif
2381
2382 #if 0 /* JG */
2383         finishidentcpu();       /* Final stage of CPU initialization */
2384         setidt(6, &IDTVEC(ill),  SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
2385         setidt(13, &IDTVEC(prot),  SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
2386 #endif
2387         identify_cpu();         /* Final stage of CPU initialization */
2388         initializecpu(0);       /* Initialize CPU registers */
2389
2390         /*
2391          * On modern intel cpus, haswell or later, cpu_idle_hlt=1 is better
2392          * because the cpu does significant power management in MWAIT
2393          * (also suggested is to set sysctl machdep.mwait.CX.idle=AUTODEEP).
2394          *
2395          * On modern amd cpus cpu_idle_hlt=3 is better, because the cpu does
2396          * significant power management in HLT or ACPI (but cpu_idle_hlt=1
2397          * would try to use MWAIT).
2398          *
2399          * On older amd or intel cpus, cpu_idle_hlt=2 is better because ACPI
2400          * is needed to reduce power consumption, but wakeup times are often
2401          * longer.
2402          */
2403         if (cpu_vendor_id == CPU_VENDOR_INTEL &&
2404             CPUID_TO_MODEL(cpu_id) >= 0x3C) {   /* Haswell or later */
2405                 cpu_idle_hlt = 1;
2406         }
2407         if (cpu_vendor_id == CPU_VENDOR_AMD &&
2408             CPUID_TO_FAMILY(cpu_id) >= 0x14) {  /* Bobcat or later */
2409                 cpu_idle_hlt = 3;
2410         }
2411
2412         TUNABLE_INT_FETCH("hw.apic_io_enable", &ioapic_enable); /* for compat */
2413         TUNABLE_INT_FETCH("hw.ioapic_enable", &ioapic_enable);
2414         TUNABLE_INT_FETCH("hw.lapic_enable", &lapic_enable);
2415         TUNABLE_INT_FETCH("machdep.cpu_idle_hlt", &cpu_idle_hlt);
2416
2417         /*
2418          * Some of the virtual machines do not work w/ I/O APIC
2419          * enabled.  If the user does not explicitly enable or
2420          * disable the I/O APIC (ioapic_enable < 0), then we
2421          * disable I/O APIC on all virtual machines.
2422          *
2423          * NOTE:
2424          * This must be done after identify_cpu(), which sets
2425          * 'cpu_feature2'
2426          */
2427         if (ioapic_enable < 0) {
2428                 if (cpu_feature2 & CPUID2_VMM)
2429                         ioapic_enable = 0;
2430                 else
2431                         ioapic_enable = 1;
2432         }
2433
2434         /* make an initial tss so cpu can get interrupt stack on syscall! */
2435         gd->gd_common_tss.tss_rsp0 =
2436                 (register_t)(thread0.td_kstack +
2437                              KSTACK_PAGES * PAGE_SIZE - sizeof(struct pcb));
2438         /* Ensure the stack is aligned to 16 bytes */
2439         gd->gd_common_tss.tss_rsp0 &= ~(register_t)0xF;
2440
2441         /* double fault stack */
2442         gd->gd_common_tss.tss_ist1 =
2443                 (long)&gd->mi.gd_prvspace->idlestack[
2444                         sizeof(gd->mi.gd_prvspace->idlestack)];
2445
2446         /* Set the IO permission bitmap (empty due to tss seg limit) */
2447         gd->gd_common_tss.tss_iobase = sizeof(struct x86_64tss);
2448
2449         gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
2450         gd->gd_tss_gdt = &gdt[GPROC0_SEL];
2451         gd->gd_common_tssd = *gd->gd_tss_gdt;
2452         ltr(gsel_tss);
2453
2454         /* Set up the fast syscall stuff */
2455         msr = rdmsr(MSR_EFER) | EFER_SCE;
2456         wrmsr(MSR_EFER, msr);
2457         wrmsr(MSR_LSTAR, (u_int64_t)IDTVEC(fast_syscall));
2458         wrmsr(MSR_CSTAR, (u_int64_t)IDTVEC(fast_syscall32));
2459         msr = ((u_int64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) |
2460               ((u_int64_t)GSEL(GUCODE32_SEL, SEL_UPL) << 48);
2461         wrmsr(MSR_STAR, msr);
2462         wrmsr(MSR_SF_MASK, PSL_NT|PSL_T|PSL_I|PSL_C|PSL_D|PSL_IOPL);
2463
2464         getmemsize(kmdp, physfree);
2465         init_param2(physmem);
2466
2467         /* now running on new page tables, configured,and u/iom is accessible */
2468
2469         /* Map the message buffer. */
2470 #if 0 /* JG */
2471         for (off = 0; off < round_page(MSGBUF_SIZE); off += PAGE_SIZE)
2472                 pmap_kenter((vm_offset_t)msgbufp + off, avail_end + off);
2473 #endif
2474
2475         msgbufinit(msgbufp, MSGBUF_SIZE);
2476
2477
2478         /* transfer to user mode */
2479
2480         _ucodesel = GSEL(GUCODE_SEL, SEL_UPL);
2481         _udatasel = GSEL(GUDATA_SEL, SEL_UPL);
2482         _ucode32sel = GSEL(GUCODE32_SEL, SEL_UPL);
2483
2484         load_ds(_udatasel);
2485         load_es(_udatasel);
2486         load_fs(_udatasel);
2487
2488         /* setup proc 0's pcb */
2489         thread0.td_pcb->pcb_flags = 0;
2490         thread0.td_pcb->pcb_cr3 = KPML4phys;
2491         thread0.td_pcb->pcb_ext = NULL;
2492         lwp0.lwp_md.md_regs = &proc0_tf;        /* XXX needed? */
2493
2494         /* Location of kernel stack for locore */
2495         return ((u_int64_t)thread0.td_pcb);
2496 }
2497
2498 /*
2499  * Initialize machine-dependant portions of the global data structure.
2500  * Note that the global data area and cpu0's idlestack in the private
2501  * data space were allocated in locore.
2502  *
2503  * Note: the idlethread's cpl is 0
2504  *
2505  * WARNING!  Called from early boot, 'mycpu' may not work yet.
2506  */
2507 void
2508 cpu_gdinit(struct mdglobaldata *gd, int cpu)
2509 {
2510         if (cpu)
2511                 gd->mi.gd_curthread = &gd->mi.gd_idlethread;
2512
2513         lwkt_init_thread(&gd->mi.gd_idlethread,
2514                         gd->mi.gd_prvspace->idlestack,
2515                         sizeof(gd->mi.gd_prvspace->idlestack),
2516                         0, &gd->mi);
2517         lwkt_set_comm(&gd->mi.gd_idlethread, "idle_%d", cpu);
2518         gd->mi.gd_idlethread.td_switch = cpu_lwkt_switch;
2519         gd->mi.gd_idlethread.td_sp -= sizeof(void *);
2520         *(void **)gd->mi.gd_idlethread.td_sp = cpu_idle_restore;
2521 }
2522
2523 /*
2524  * We only have to check for DMAP bounds, the globaldata space is
2525  * actually part of the kernel_map so we don't have to waste time
2526  * checking CPU_prvspace[*].
2527  */
2528 int
2529 is_globaldata_space(vm_offset_t saddr, vm_offset_t eaddr)
2530 {
2531 #if 0
2532         if (saddr >= (vm_offset_t)&CPU_prvspace[0] &&
2533             eaddr <= (vm_offset_t)&CPU_prvspace[MAXCPU]) {
2534                 return (TRUE);
2535         }
2536 #endif
2537         if (saddr >= DMAP_MIN_ADDRESS && eaddr <= DMAP_MAX_ADDRESS)
2538                 return (TRUE);
2539         return (FALSE);
2540 }
2541
2542 struct globaldata *
2543 globaldata_find(int cpu)
2544 {
2545         KKASSERT(cpu >= 0 && cpu < ncpus);
2546         return(&CPU_prvspace[cpu]->mdglobaldata.mi);
2547 }
2548
2549 /*
2550  * This path should be safe from the SYSRET issue because only stopped threads
2551  * can have their %rip adjusted this way (and all heavy weight thread switches
2552  * clear QUICKREF and thus do not use SYSRET).  However, the code path is
2553  * convoluted so add a safety by forcing %rip to be cannonical.
2554  */
2555 int
2556 ptrace_set_pc(struct lwp *lp, unsigned long addr)
2557 {
2558         if (addr & 0x0000800000000000LLU)
2559                 lp->lwp_md.md_regs->tf_rip = addr | 0xFFFF000000000000LLU;
2560         else
2561                 lp->lwp_md.md_regs->tf_rip = addr & 0x0000FFFFFFFFFFFFLLU;
2562         return (0);
2563 }
2564
2565 int
2566 ptrace_single_step(struct lwp *lp)
2567 {
2568         lp->lwp_md.md_regs->tf_rflags |= PSL_T;
2569         return (0);
2570 }
2571
2572 int
2573 fill_regs(struct lwp *lp, struct reg *regs)
2574 {
2575         struct trapframe *tp;
2576
2577         if ((tp = lp->lwp_md.md_regs) == NULL)
2578                 return EINVAL;
2579         bcopy(&tp->tf_rdi, &regs->r_rdi, sizeof(*regs));
2580         return (0);
2581 }
2582
2583 int
2584 set_regs(struct lwp *lp, struct reg *regs)
2585 {
2586         struct trapframe *tp;
2587
2588         tp = lp->lwp_md.md_regs;
2589         if (!EFL_SECURE(regs->r_rflags, tp->tf_rflags) ||
2590             !CS_SECURE(regs->r_cs))
2591                 return (EINVAL);
2592         bcopy(&regs->r_rdi, &tp->tf_rdi, sizeof(*regs));
2593         clear_quickret();
2594         return (0);
2595 }
2596
2597 static void
2598 fill_fpregs_xmm(struct savexmm *sv_xmm, struct save87 *sv_87)
2599 {
2600         struct env87 *penv_87 = &sv_87->sv_env;
2601         struct envxmm *penv_xmm = &sv_xmm->sv_env;
2602         int i;
2603
2604         /* FPU control/status */
2605         penv_87->en_cw = penv_xmm->en_cw;
2606         penv_87->en_sw = penv_xmm->en_sw;
2607         penv_87->en_tw = penv_xmm->en_tw;
2608         penv_87->en_fip = penv_xmm->en_fip;
2609         penv_87->en_fcs = penv_xmm->en_fcs;
2610         penv_87->en_opcode = penv_xmm->en_opcode;
2611         penv_87->en_foo = penv_xmm->en_foo;
2612         penv_87->en_fos = penv_xmm->en_fos;
2613
2614         /* FPU registers */
2615         for (i = 0; i < 8; ++i)
2616                 sv_87->sv_ac[i] = sv_xmm->sv_fp[i].fp_acc;
2617 }
2618
2619 static void
2620 set_fpregs_xmm(struct save87 *sv_87, struct savexmm *sv_xmm)
2621 {
2622         struct env87 *penv_87 = &sv_87->sv_env;
2623         struct envxmm *penv_xmm = &sv_xmm->sv_env;
2624         int i;
2625
2626         /* FPU control/status */
2627         penv_xmm->en_cw = penv_87->en_cw;
2628         penv_xmm->en_sw = penv_87->en_sw;
2629         penv_xmm->en_tw = penv_87->en_tw;
2630         penv_xmm->en_fip = penv_87->en_fip;
2631         penv_xmm->en_fcs = penv_87->en_fcs;
2632         penv_xmm->en_opcode = penv_87->en_opcode;
2633         penv_xmm->en_foo = penv_87->en_foo;
2634         penv_xmm->en_fos = penv_87->en_fos;
2635
2636         /* FPU registers */
2637         for (i = 0; i < 8; ++i)
2638                 sv_xmm->sv_fp[i].fp_acc = sv_87->sv_ac[i];
2639 }
2640
2641 int
2642 fill_fpregs(struct lwp *lp, struct fpreg *fpregs)
2643 {
2644         if (lp->lwp_thread == NULL || lp->lwp_thread->td_pcb == NULL)
2645                 return EINVAL;
2646         if (cpu_fxsr) {
2647                 fill_fpregs_xmm(&lp->lwp_thread->td_pcb->pcb_save.sv_xmm,
2648                                 (struct save87 *)fpregs);
2649                 return (0);
2650         }
2651         bcopy(&lp->lwp_thread->td_pcb->pcb_save.sv_87, fpregs, sizeof *fpregs);
2652         return (0);
2653 }
2654
2655 int
2656 set_fpregs(struct lwp *lp, struct fpreg *fpregs)
2657 {
2658         if (cpu_fxsr) {
2659                 set_fpregs_xmm((struct save87 *)fpregs,
2660                                &lp->lwp_thread->td_pcb->pcb_save.sv_xmm);
2661                 return (0);
2662         }
2663         bcopy(fpregs, &lp->lwp_thread->td_pcb->pcb_save.sv_87, sizeof *fpregs);
2664         return (0);
2665 }
2666
2667 int
2668 fill_dbregs(struct lwp *lp, struct dbreg *dbregs)
2669 {
2670         struct pcb *pcb;
2671
2672         if (lp == NULL) {
2673                 dbregs->dr[0] = rdr0();
2674                 dbregs->dr[1] = rdr1();
2675                 dbregs->dr[2] = rdr2();
2676                 dbregs->dr[3] = rdr3();
2677                 dbregs->dr[4] = rdr4();
2678                 dbregs->dr[5] = rdr5();
2679                 dbregs->dr[6] = rdr6();
2680                 dbregs->dr[7] = rdr7();
2681                 return (0);
2682         }
2683         if (lp->lwp_thread == NULL || (pcb = lp->lwp_thread->td_pcb) == NULL)
2684                 return EINVAL;
2685         dbregs->dr[0] = pcb->pcb_dr0;
2686         dbregs->dr[1] = pcb->pcb_dr1;
2687         dbregs->dr[2] = pcb->pcb_dr2;
2688         dbregs->dr[3] = pcb->pcb_dr3;
2689         dbregs->dr[4] = 0;
2690         dbregs->dr[5] = 0;
2691         dbregs->dr[6] = pcb->pcb_dr6;
2692         dbregs->dr[7] = pcb->pcb_dr7;
2693         return (0);
2694 }
2695
2696 int
2697 set_dbregs(struct lwp *lp, struct dbreg *dbregs)
2698 {
2699         if (lp == NULL) {
2700                 load_dr0(dbregs->dr[0]);
2701                 load_dr1(dbregs->dr[1]);
2702                 load_dr2(dbregs->dr[2]);
2703                 load_dr3(dbregs->dr[3]);
2704                 load_dr4(dbregs->dr[4]);
2705                 load_dr5(dbregs->dr[5]);
2706                 load_dr6(dbregs->dr[6]);
2707                 load_dr7(dbregs->dr[7]);
2708         } else {
2709                 struct pcb *pcb;
2710                 struct ucred *ucred;
2711                 int i;
2712                 uint64_t mask1, mask2;
2713
2714                 /*
2715                  * Don't let an illegal value for dr7 get set.  Specifically,
2716                  * check for undefined settings.  Setting these bit patterns
2717                  * result in undefined behaviour and can lead to an unexpected
2718                  * TRCTRAP.
2719                  */
2720                 /* JG this loop looks unreadable */
2721                 /* Check 4 2-bit fields for invalid patterns.
2722                  * These fields are R/Wi, for i = 0..3
2723                  */
2724                 /* Is 10 in LENi allowed when running in compatibility mode? */
2725                 /* Pattern 10 in R/Wi might be used to indicate
2726                  * breakpoint on I/O. Further analysis should be
2727                  * carried to decide if it is safe and useful to
2728                  * provide access to that capability
2729                  */
2730                 for (i = 0, mask1 = 0x3<<16, mask2 = 0x2<<16; i < 4;
2731                      i++, mask1 <<= 4, mask2 <<= 4)
2732                         if ((dbregs->dr[7] & mask1) == mask2)
2733                                 return (EINVAL);
2734
2735                 pcb = lp->lwp_thread->td_pcb;
2736                 ucred = lp->lwp_proc->p_ucred;
2737
2738                 /*
2739                  * Don't let a process set a breakpoint that is not within the
2740                  * process's address space.  If a process could do this, it
2741                  * could halt the system by setting a breakpoint in the kernel
2742                  * (if ddb was enabled).  Thus, we need to check to make sure
2743                  * that no breakpoints are being enabled for addresses outside
2744                  * process's address space, unless, perhaps, we were called by
2745                  * uid 0.
2746                  *
2747                  * XXX - what about when the watched area of the user's
2748                  * address space is written into from within the kernel
2749                  * ... wouldn't that still cause a breakpoint to be generated
2750                  * from within kernel mode?
2751                  */
2752
2753                 if (priv_check_cred(ucred, PRIV_ROOT, 0) != 0) {
2754                         if (dbregs->dr[7] & 0x3) {
2755                                 /* dr0 is enabled */
2756                                 if (dbregs->dr[0] >= VM_MAX_USER_ADDRESS)
2757                                         return (EINVAL);
2758                         }
2759
2760                         if (dbregs->dr[7] & (0x3<<2)) {
2761                                 /* dr1 is enabled */
2762                                 if (dbregs->dr[1] >= VM_MAX_USER_ADDRESS)
2763                                         return (EINVAL);
2764                         }
2765
2766                         if (dbregs->dr[7] & (0x3<<4)) {
2767                                 /* dr2 is enabled */
2768                                 if (dbregs->dr[2] >= VM_MAX_USER_ADDRESS)
2769                                         return (EINVAL);
2770                         }
2771
2772                         if (dbregs->dr[7] & (0x3<<6)) {
2773                                 /* dr3 is enabled */
2774                                 if (dbregs->dr[3] >= VM_MAX_USER_ADDRESS)
2775                                         return (EINVAL);
2776                         }
2777                 }
2778
2779                 pcb->pcb_dr0 = dbregs->dr[0];
2780                 pcb->pcb_dr1 = dbregs->dr[1];
2781                 pcb->pcb_dr2 = dbregs->dr[2];
2782                 pcb->pcb_dr3 = dbregs->dr[3];
2783                 pcb->pcb_dr6 = dbregs->dr[6];
2784                 pcb->pcb_dr7 = dbregs->dr[7];
2785
2786                 pcb->pcb_flags |= PCB_DBREGS;
2787         }
2788
2789         return (0);
2790 }
2791
2792 /*
2793  * Return > 0 if a hardware breakpoint has been hit, and the
2794  * breakpoint was in user space.  Return 0, otherwise.
2795  */
2796 int
2797 user_dbreg_trap(void)
2798 {
2799         u_int64_t dr7, dr6; /* debug registers dr6 and dr7 */
2800         u_int64_t bp;       /* breakpoint bits extracted from dr6 */
2801         int nbp;            /* number of breakpoints that triggered */
2802         caddr_t addr[4];    /* breakpoint addresses */
2803         int i;
2804
2805         dr7 = rdr7();
2806         if ((dr7 & 0xff) == 0) {
2807                 /*
2808                  * all GE and LE bits in the dr7 register are zero,
2809                  * thus the trap couldn't have been caused by the
2810                  * hardware debug registers
2811                  */
2812                 return 0;
2813         }
2814
2815         nbp = 0;
2816         dr6 = rdr6();
2817         bp = dr6 & 0xf;
2818
2819         if (bp == 0) {
2820                 /*
2821                  * None of the breakpoint bits are set meaning this
2822                  * trap was not caused by any of the debug registers
2823                  */
2824                 return 0;
2825         }
2826
2827         /*
2828          * at least one of the breakpoints were hit, check to see
2829          * which ones and if any of them are user space addresses
2830          */
2831
2832         if (bp & 0x01) {
2833                 addr[nbp++] = (caddr_t)rdr0();
2834         }
2835         if (bp & 0x02) {
2836                 addr[nbp++] = (caddr_t)rdr1();
2837         }
2838         if (bp & 0x04) {
2839                 addr[nbp++] = (caddr_t)rdr2();
2840         }
2841         if (bp & 0x08) {
2842                 addr[nbp++] = (caddr_t)rdr3();
2843         }
2844
2845         for (i=0; i<nbp; i++) {
2846                 if (addr[i] <
2847                     (caddr_t)VM_MAX_USER_ADDRESS) {
2848                         /*
2849                          * addr[i] is in user space
2850                          */
2851                         return nbp;
2852                 }
2853         }
2854
2855         /*
2856          * None of the breakpoints are in user space.
2857          */
2858         return 0;
2859 }
2860
2861
#ifndef DDB
/*
 * Stand-in used when the kernel is built without DDB: there is no
 * debugger to enter, so the request is only reported on the console.
 */
void
Debugger(const char *msg)
{
        kprintf("Debugger(\"%s\") called.\n", msg);
}
#endif /* no DDB */
2869
2870 #ifdef DDB
2871
2872 /*
2873  * Provide inb() and outb() as functions.  They are normally only
2874  * available as macros calling inlined functions, thus cannot be
2875  * called inside DDB.
2876  *
2877  * The actual code is stolen from <machine/cpufunc.h>, and de-inlined.
2878  */
2879
2880 #undef inb
2881 #undef outb
2882
2883 /* silence compiler warnings */
2884 u_char inb(u_int);
2885 void outb(u_int, u_char);
2886
2887 u_char
2888 inb(u_int port)
2889 {
2890         u_char  data;
2891         /*
2892          * We use %%dx and not %1 here because i/o is done at %dx and not at
2893          * %edx, while gcc generates inferior code (movw instead of movl)
2894          * if we tell it to load (u_short) port.
2895          */
2896         __asm __volatile("inb %%dx,%0" : "=a" (data) : "d" (port));
2897         return (data);
2898 }
2899
2900 void
2901 outb(u_int port, u_char data)
2902 {
2903         u_char  al;
2904         /*
2905          * Use an unnecessary assignment to help gcc's register allocator.
2906          * This make a large difference for gcc-1.40 and a tiny difference
2907          * for gcc-2.6.0.  For gcc-1.40, al had to be ``asm("ax")'' for
2908          * best results.  gcc-2.6.0 can't handle this.
2909          */
2910         al = data;
2911         __asm __volatile("outb %0,%%dx" : : "a" (al), "d" (port));
2912 }
2913
2914 #endif /* DDB */
2915
2916
2917
2918 /*
2919  * initialize all the SMP locks
2920  */
2921
2922 /* critical region when masking or unmasking interrupts */
2923 struct spinlock_deprecated imen_spinlock;
2924
2925 /* lock region used by kernel profiling */
2926 struct spinlock_deprecated mcount_spinlock;
2927
2928 /* locks com (tty) data/hardware accesses: a FASTINTR() */
2929 struct spinlock_deprecated com_spinlock;
2930
2931 /* lock regions around the clock hardware */
2932 struct spinlock_deprecated clock_spinlock;
2933
2934 static void
2935 init_locks(void)
2936 {
2937         /*
2938          * Get the initial mplock with a count of 1 for the BSP.
2939          * This uses a LOGICAL cpu ID, ie BSP == 0.
2940          */
2941         cpu_get_initial_mplock();
2942         /* DEPRECATED */
2943         spin_init_deprecated(&mcount_spinlock);
2944         spin_init_deprecated(&imen_spinlock);
2945         spin_init_deprecated(&com_spinlock);
2946         spin_init_deprecated(&clock_spinlock);
2947
2948         /* our token pool needs to work early */
2949         lwkt_token_pool_init();
2950 }
2951
2952 boolean_t
2953 cpu_mwait_hint_valid(uint32_t hint)
2954 {
2955         int cx_idx, sub;
2956
2957         cx_idx = MWAIT_EAX_TO_CX(hint);
2958         if (cx_idx >= CPU_MWAIT_CX_MAX)
2959                 return FALSE;
2960
2961         sub = MWAIT_EAX_TO_CX_SUB(hint);
2962         if (sub >= cpu_mwait_cx_info[cx_idx].subcnt)
2963                 return FALSE;
2964
2965         return TRUE;
2966 }
2967
2968 void
2969 cpu_mwait_cx_no_bmsts(void)
2970 {
2971         atomic_clear_int(&cpu_mwait_c3_preamble, CPU_MWAIT_C3_PREAMBLE_BM_STS);
2972 }
2973
2974 void
2975 cpu_mwait_cx_no_bmarb(void)
2976 {
2977         atomic_clear_int(&cpu_mwait_c3_preamble, CPU_MWAIT_C3_PREAMBLE_BM_ARB);
2978 }
2979
2980 static int
2981 cpu_mwait_cx_hint2name(int hint, char *name, int namelen, boolean_t allow_auto)
2982 {
2983         int old_cx_idx, sub = 0;
2984
2985         if (hint >= 0) {
2986                 old_cx_idx = MWAIT_EAX_TO_CX(hint);
2987                 sub = MWAIT_EAX_TO_CX_SUB(hint);
2988         } else if (hint == CPU_MWAIT_HINT_AUTO) {
2989                 old_cx_idx = allow_auto ? CPU_MWAIT_C2 : CPU_MWAIT_CX_MAX;
2990         } else if (hint == CPU_MWAIT_HINT_AUTODEEP) {
2991                 old_cx_idx = allow_auto ? CPU_MWAIT_C3 : CPU_MWAIT_CX_MAX;
2992         } else {
2993                 old_cx_idx = CPU_MWAIT_CX_MAX;
2994         }
2995
2996         if (!CPU_MWAIT_HAS_CX)
2997                 strlcpy(name, "NONE", namelen);
2998         else if (allow_auto && hint == CPU_MWAIT_HINT_AUTO)
2999                 strlcpy(name, "AUTO", namelen);
3000         else if (allow_auto && hint == CPU_MWAIT_HINT_AUTODEEP)
3001                 strlcpy(name, "AUTODEEP", namelen);
3002         else if (old_cx_idx >= CPU_MWAIT_CX_MAX ||
3003             sub >= cpu_mwait_cx_info[old_cx_idx].subcnt)
3004                 strlcpy(name, "INVALID", namelen);
3005         else
3006                 ksnprintf(name, namelen, "C%d/%d", old_cx_idx, sub);
3007
3008         return old_cx_idx;
3009 }
3010
3011 static int
3012 cpu_mwait_cx_name2hint(char *name, int *hint0, boolean_t allow_auto)
3013 {
3014         int cx_idx, sub, hint;
3015         char *ptr, *start;
3016
3017         if (allow_auto && strcmp(name, "AUTO") == 0) {
3018                 hint = CPU_MWAIT_HINT_AUTO;
3019                 cx_idx = CPU_MWAIT_C2;
3020                 goto done;
3021         }
3022         if (allow_auto && strcmp(name, "AUTODEEP") == 0) {
3023                 hint = CPU_MWAIT_HINT_AUTODEEP;
3024                 cx_idx = CPU_MWAIT_C3;
3025                 goto done;
3026         }
3027
3028         if (strlen(name) < 4 || toupper(name[0]) != 'C')
3029                 return -1;
3030         start = &name[1];
3031         ptr = NULL;
3032
3033         cx_idx = strtol(start, &ptr, 10);
3034         if (ptr == start || *ptr != '/')
3035                 return -1;
3036         if (cx_idx < 0 || cx_idx >= CPU_MWAIT_CX_MAX)
3037                 return -1;
3038
3039         start = ptr + 1;
3040         ptr = NULL;
3041
3042         sub = strtol(start, &ptr, 10);
3043         if (*ptr != '\0')
3044                 return -1;
3045         if (sub < 0 || sub >= cpu_mwait_cx_info[cx_idx].subcnt)
3046                 return -1;
3047
3048         hint = MWAIT_EAX_HINT(cx_idx, sub);
3049 done:
3050         *hint0 = hint;
3051         return cx_idx;
3052 }
3053
3054 static int
3055 cpu_mwait_cx_transit(int old_cx_idx, int cx_idx)
3056 {
3057         if (cx_idx >= CPU_MWAIT_C3 && cpu_mwait_c3_preamble)
3058                 return EOPNOTSUPP;
3059         if (old_cx_idx < CPU_MWAIT_C3 && cx_idx >= CPU_MWAIT_C3) {
3060                 int error;
3061
3062                 error = cputimer_intr_powersave_addreq();
3063                 if (error)
3064                         return error;
3065         } else if (old_cx_idx >= CPU_MWAIT_C3 && cx_idx < CPU_MWAIT_C3) {
3066                 cputimer_intr_powersave_remreq();
3067         }
3068         return 0;
3069 }
3070
3071 static int
3072 cpu_mwait_cx_select_sysctl(SYSCTL_HANDLER_ARGS, int *hint0,
3073     boolean_t allow_auto)
3074 {
3075         int error, cx_idx, old_cx_idx, hint;
3076         char name[CPU_MWAIT_CX_NAMELEN];
3077
3078         hint = *hint0;
3079         old_cx_idx = cpu_mwait_cx_hint2name(hint, name, sizeof(name),
3080             allow_auto);
3081
3082         error = sysctl_handle_string(oidp, name, sizeof(name), req);
3083         if (error != 0 || req->newptr == NULL)
3084                 return error;
3085
3086         if (!CPU_MWAIT_HAS_CX)
3087                 return EOPNOTSUPP;
3088
3089         cx_idx = cpu_mwait_cx_name2hint(name, &hint, allow_auto);
3090         if (cx_idx < 0)
3091                 return EINVAL;
3092
3093         error = cpu_mwait_cx_transit(old_cx_idx, cx_idx);
3094         if (error)
3095                 return error;
3096
3097         *hint0 = hint;
3098         return 0;
3099 }
3100
3101 static int
3102 cpu_mwait_cx_setname(struct cpu_idle_stat *stat, const char *cx_name)
3103 {
3104         int error, cx_idx, old_cx_idx, hint;
3105         char name[CPU_MWAIT_CX_NAMELEN];
3106
3107         KASSERT(CPU_MWAIT_HAS_CX, ("cpu does not support mwait CX extension"));
3108
3109         hint = stat->hint;
3110         old_cx_idx = cpu_mwait_cx_hint2name(hint, name, sizeof(name), TRUE);
3111
3112         strlcpy(name, cx_name, sizeof(name));
3113         cx_idx = cpu_mwait_cx_name2hint(name, &hint, TRUE);
3114         if (cx_idx < 0)
3115                 return EINVAL;
3116
3117         error = cpu_mwait_cx_transit(old_cx_idx, cx_idx);
3118         if (error)
3119                 return error;
3120
3121         stat->hint = hint;
3122         return 0;
3123 }
3124
3125 static int
3126 cpu_mwait_cx_idle_sysctl(SYSCTL_HANDLER_ARGS)
3127 {
3128         int hint = cpu_mwait_halt_global;
3129         int error, cx_idx, cpu;
3130         char name[CPU_MWAIT_CX_NAMELEN], cx_name[CPU_MWAIT_CX_NAMELEN];
3131
3132         cpu_mwait_cx_hint2name(hint, name, sizeof(name), TRUE);
3133
3134         error = sysctl_handle_string(oidp, name, sizeof(name), req);
3135         if (error != 0 || req->newptr == NULL)
3136                 return error;
3137
3138         if (!CPU_MWAIT_HAS_CX)
3139                 return EOPNOTSUPP;
3140
3141         /* Save name for later per-cpu CX configuration */
3142         strlcpy(cx_name, name, sizeof(cx_name));
3143
3144         cx_idx = cpu_mwait_cx_name2hint(name, &hint, TRUE);
3145         if (cx_idx < 0)
3146                 return EINVAL;
3147
3148         /* Change per-cpu CX configuration */
3149         for (cpu = 0; cpu < ncpus; ++cpu) {
3150                 error = cpu_mwait_cx_setname(&cpu_idle_stats[cpu], cx_name);
3151                 if (error)
3152                         return error;
3153         }
3154
3155         cpu_mwait_halt_global = hint;
3156         return 0;
3157 }
3158
3159 static int
3160 cpu_mwait_cx_pcpu_idle_sysctl(SYSCTL_HANDLER_ARGS)
3161 {
3162         struct cpu_idle_stat *stat = arg1;
3163         int error;
3164
3165         error = cpu_mwait_cx_select_sysctl(oidp, arg1, arg2, req,
3166             &stat->hint, TRUE);
3167         return error;
3168 }
3169
3170 static int
3171 cpu_mwait_cx_spin_sysctl(SYSCTL_HANDLER_ARGS)
3172 {
3173         int error;
3174
3175         error = cpu_mwait_cx_select_sysctl(oidp, arg1, arg2, req,
3176             &cpu_mwait_spin, FALSE);
3177         return error;
3178 }
3179
/*
 * Manual debugging hook, documented as being called unconditionally from
 * Xtimer (the per-cpu timer interrupt) whether or not the current thread
 * is in a critical section.  It can be useful in tracking down lockups.
 *
 * NOTE: MANUAL DEBUG CODE.  The whole body is compiled out with #if 0,
 * so as built this function does nothing; enable the sections by hand
 * when needed.
 */
#if 0
static int saveticks[SMP_MAXCPU];
static int savecounts[SMP_MAXCPU];
#endif

void
pcpu_timer_always(struct intrframe *frame)
{
#if 0
        globaldata_t gd = mycpu;
        int cpu = gd->gd_cpuid;
        char buf[64];
        short *gptr;
        int i;

        /*
         * Paint one status row per cpu (first 21 cpus only): a spinning
         * counter cell followed by rip, thread name, ticks and infomsg.
         * NOTE(review): the hard-coded address is presumably the kernel
         * mapping of the 80-column VGA text buffer -- confirm before
         * enabling.
         */
        if (cpu <= 20) {
                gptr = (short *)0xFFFFFFFF800b8000 + 80 * cpu;
                *gptr = ((*gptr + 1) & 0x00FF) | 0x0700;
                ++gptr;

                ksnprintf(buf, sizeof(buf), " %p %16s %d %16s ",
                    (void *)frame->if_rip, gd->gd_curthread->td_comm, ticks,
                    gd->gd_infomsg);
                for (i = 0; buf[i] != '\0'; ++i)
                        gptr[i] = 0x0700 | (unsigned char)buf[i];
        }
#if 0
        /*
         * Watchdog: panic if this cpu takes more than 2000 timer
         * interrupts without 'ticks' advancing, or if the tick value
         * last recorded by any cpu differs from 'ticks' by more than 10.
         */
        if (saveticks[gd->gd_cpuid] != ticks) {
                saveticks[gd->gd_cpuid] = ticks;
                savecounts[gd->gd_cpuid] = 0;
        }
        ++savecounts[gd->gd_cpuid];
        if (savecounts[gd->gd_cpuid] > 2000 && panicstr == NULL) {
                panic("cpud %d panicing on ticks failure",
                        gd->gd_cpuid);
        }
        for (i = 0; i < ncpus; ++i) {
                int delta;

                if (saveticks[i] && panicstr == NULL) {
                        delta = saveticks[i] - ticks;
                        if (delta < -10 || delta > 10) {
                                panic("cpu %d panicing on cpu %d watchdog",
                                      gd->gd_cpuid, i);
                        }
                }
        }
#endif
#endif
}