/*
 * Copyright (c) 2003 Peter Wemm.
 * Copyright (c) 1993 The Regents of the University of California.
 * Copyright (c) 2008 The DragonFly Project.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD: src/sys/amd64/include/cpufunc.h,v 1.139 2004/01/28 23:53:04 peter Exp $
 */
/*
 * Functions to provide access to special i386 instructions.
 * This is included in sys/systm.h, and that file should be
 * used in preference to this.
 */
40 #ifndef _CPU_CPUFUNC_H_
41 #define _CPU_CPUFUNC_H_
43 #include <sys/cdefs.h>
44 #include <sys/thread.h>
45 #include <machine/clock.h>
46 #include <machine/psl.h>
47 #include <machine/smp.h>
50 struct region_descriptor
;
54 #define readb(va) (*(volatile u_int8_t *) (va))
55 #define readw(va) (*(volatile u_int16_t *) (va))
56 #define readl(va) (*(volatile u_int32_t *) (va))
57 #define readq(va) (*(volatile u_int64_t *) (va))
59 #define writeb(va, d) (*(volatile u_int8_t *) (va) = (d))
60 #define writew(va, d) (*(volatile u_int16_t *) (va) = (d))
61 #define writel(va, d) (*(volatile u_int32_t *) (va) = (d))
62 #define writeq(va, d) (*(volatile u_int64_t *) (va) = (d))
66 #include <machine/lock.h> /* XXX */
/*
 * Trigger a debugger breakpoint trap (int3).
 */
static __inline void
breakpoint(void)
{
	__asm __volatile("int $3");
}
/*
 * Spin-wait hint to the cpu; the "memory" clobber also acts as a
 * compiler barrier so spin-loop loads are not cached in registers.
 */
static __inline void
cpu_pause(void)
{
	__asm __volatile("pause":::"memory");
}
87 __asm
__volatile("bsfl %1,%0" : "=r" (result
) : "rm" (mask
));
91 static __inline u_long
96 __asm
__volatile("bsfq %1,%0" : "=r" (result
) : "rm" (mask
));
100 static __inline u_long
105 __asm
__volatile("bsfq %1,%0" : "=r" (result
) : "rm" (mask
));
109 static __inline u_int
114 __asm
__volatile("bsrl %1,%0" : "=r" (result
) : "rm" (mask
));
118 static __inline u_long
123 __asm
__volatile("bsrq %1,%0" : "=r" (result
) : "rm" (mask
));
130 __asm
__volatile("clflush %0" : : "m" (*(char *) addr
));
134 do_cpuid(u_int ax
, u_int
*p
)
136 __asm
__volatile("cpuid"
137 : "=a" (p
[0]), "=b" (p
[1]), "=c" (p
[2]), "=d" (p
[3])
142 cpuid_count(u_int ax
, u_int cx
, u_int
*p
)
144 __asm
__volatile("cpuid"
145 : "=a" (p
[0]), "=b" (p
[1]), "=c" (p
[2]), "=d" (p
[3])
146 : "0" (ax
), "c" (cx
));
#ifndef _CPU_DISABLE_INTR_DEFINED

/*
 * Disable maskable interrupts on this cpu.  The "memory" clobber keeps
 * the compiler from moving memory accesses across the cli.
 */
static __inline void
cpu_disable_intr(void)
{
	__asm __volatile("cli" : : : "memory");
}

#endif

#ifndef _CPU_ENABLE_INTR_DEFINED

/*
 * Re-enable maskable interrupts on this cpu.
 */
static __inline void
cpu_enable_intr(void)
{
	__asm __volatile("sti");
}

#endif
/*
 * Cpu and compiler memory ordering fence.  mfence ensures strong read and
 * write ordering.
 *
 * A serializing or fence instruction is required here.  A locked bus
 * cycle on data for which we already own cache mastership is the most
 * portable.
 */
static __inline void
cpu_mfence(void)
{
	__asm __volatile("mfence" : : : "memory");
}

/*
 * cpu_lfence() ensures strong read ordering for reads issued prior
 * to the instruction verses reads issued afterwords.
 *
 * A serializing or fence instruction is required here.  A locked bus
 * cycle on data for which we already own cache mastership is the most
 * portable.
 */
static __inline void
cpu_lfence(void)
{
	__asm __volatile("lfence" : : : "memory");
}

/*
 * cpu_sfence() ensures strong write ordering for writes issued prior
 * to the instruction verses writes issued afterwords.  Writes are
 * ordered on intel cpus so we do not actually have to do anything.
 */
static __inline void
cpu_sfence(void)
{
	/*
	 * Don't use 'sfence' here, as it will create a lot of
	 * unnecessary stalls.
	 */
	__asm __volatile("" : : : "memory");
}

/*
 * cpu_ccfence() prevents the compiler from reordering instructions, in
 * particular stores, relative to the current cpu.  Use cpu_sfence() if
 * you need to guarentee ordering by both the compiler and by the cpu.
 *
 * This also prevents the compiler from caching memory loads into local
 * variables across the routine.
 */
static __inline void
cpu_ccfence(void)
{
	__asm __volatile("" : : : "memory");
}
/*
 * This is a horrible, horrible hack that might have to be put at the
 * end of certain procedures (on a case by case basis), just before it
 * returns to avoid what we believe to be an unreported AMD cpu bug.
 * Found to occur on both a Phenom II X4 820 (two of them), as well
 * as a 48-core built around an Opteron 6168 (Id = 0x100f91 Stepping = 1).
 * The problem does not appear to occur w/Intel cpus.
 *
 * The bug is likely related to either a write combining issue or the
 * Return Address Stack (RAS) hardware cache.
 *
 * In particular, we had to do this for GCC's fill_sons_in_loop() routine
 * which due to its deep recursion and stack flow appears to be able to
 * tickle the amd cpu bug (w/ gcc-4.4.7).  Adding a single 'nop' to the
 * end of the routine just before it returns works around the bug.
 *
 * The bug appears to be extremely sensitive to %rip and %rsp values, to
 * the point where even just inserting an instruction in an unrelated
 * procedure (shifting the entire code base being run) effects the outcome.
 * DragonFly is probably able to more readily reproduce the bug due to
 * the stackgap randomization code.  We would expect OpenBSD (where we got
 * the stackgap randomization code from) to also be able to reproduce the
 * issue.  To date we have only reproduced the issue in DragonFly.
 */
#define	__AMDCPUBUG_DFLY01_AVAILABLE__

static __inline void
cpu_amdcpubug_dfly01(void)
{
	__asm __volatile("nop" : : : "memory");
}
261 #define HAVE_INLINE_FFS
266 return (__builtin_ffs(mask
));
269 #define HAVE_INLINE_FFSL
274 return (__builtin_ffsl(mask
));
277 #define HAVE_INLINE_FLS
282 return (mask
== 0 ? mask
: (int)bsrl((u_int
)mask
) + 1);
285 #define HAVE_INLINE_FLSL
290 return (mask
== 0 ? mask
: (int)bsrq((u_long
)mask
) + 1);
293 #define HAVE_INLINE_FLSLL
296 flsll(long long mask
)
298 return (flsl((long)mask
));
/*
 * Halt this cpu until the next interrupt.
 * NOTE(review): function name reconstructed from context — confirm upstream.
 */
static __inline void
cpu_halt(void)
{
	__asm __volatile("hlt");
}
/*
 * The following complications are to get around gcc not having a
 * constraint letter for the range 0..255.  We still put "d" in the
 * constraint because "i" isn't a valid constraint when the port
 * isn't constant.  This only matters for -O0 because otherwise
 * the non-working version gets optimized away.
 *
 * Use an expression-statement instead of a conditional expression
 * because gcc-2.6.0 would promote the operands of the conditional
 * and produce poor code for "if ((inb(var) & const1) == const2)".
 *
 * The unnecessary test `(port) < 0x10000' is to generate a warning if
 * the `port' has type u_short or smaller.  Such types are pessimal.
 * This actually only works for signed types.  The range check is
 * careful to avoid generating warnings.
 */
#define	inb(port) __extension__ ({					\
	u_char	_data;							\
	if (__builtin_constant_p(port) && ((port) & 0xffff) < 0x100	\
	    && (port) < 0x10000)					\
		_data = inbc(port);					\
	else								\
		_data = inbv(port);					\
	_data; })

#define	outb(port, data) (						\
	__builtin_constant_p(port) && ((port) & 0xffff) < 0x100		\
	&& (port) < 0x10000						\
	? outbc(port, data) : outbv(port, data))
339 static __inline u_char
344 __asm
__volatile("inb %1,%0" : "=a" (data
) : "id" ((u_short
)(port
)));
349 outbc(u_int port
, u_char data
)
351 __asm
__volatile("outb %0,%1" : : "a" (data
), "id" ((u_short
)(port
)));
354 static __inline u_char
359 * We use %%dx and not %1 here because i/o is done at %dx and not at
360 * %edx, while gcc generates inferior code (movw instead of movl)
361 * if we tell it to load (u_short) port.
363 __asm
__volatile("inb %%dx,%0" : "=a" (data
) : "d" (port
));
367 static __inline u_int
372 __asm
__volatile("inl %%dx,%0" : "=a" (data
) : "d" (port
));
377 insb(u_int port
, void *addr
, size_t cnt
)
379 __asm
__volatile("cld; rep; insb"
380 : "+D" (addr
), "+c" (cnt
)
386 insw(u_int port
, void *addr
, size_t cnt
)
388 __asm
__volatile("cld; rep; insw"
389 : "+D" (addr
), "+c" (cnt
)
395 insl(u_int port
, void *addr
, size_t cnt
)
397 __asm
__volatile("cld; rep; insl"
398 : "+D" (addr
), "+c" (cnt
)
/*
 * Invalidate all cache levels WITHOUT writing back dirty lines.
 */
static __inline void
invd(void)
{
	__asm __volatile("invd");
}
#ifndef _CPU_INVLPG_DEFINED

/*
 * Invalidate a particular VA on this cpu only
 *
 * TLB flush for an individual page (even if it has PG_G).
 * Only works on 486+ CPUs (i386 does not have PG_G).
 */
static __inline void
cpu_invlpg(void *addr)
{
	__asm __volatile("invlpg %0" : : "m" (*(char *)addr) : "memory");
}

#endif

/*
 * Cheap cpu relax hint ("rep; nop" == pause encoding).
 * NOTE(review): function name reconstructed — confirm upstream.
 */
static __inline void
cpu_nop(void)
{
	__asm __volatile("rep; nop");
}
435 static __inline u_short
440 __asm
__volatile("inw %%dx,%0" : "=a" (data
) : "d" (port
));
444 static __inline u_int
445 loadandclear(volatile u_int
*addr
)
449 __asm
__volatile("xorl %0,%0; xchgl %1,%0"
450 : "=&r" (result
) : "m" (*addr
));
455 outbv(u_int port
, u_char data
)
459 * Use an unnecessary assignment to help gcc's register allocator.
460 * This make a large difference for gcc-1.40 and a tiny difference
461 * for gcc-2.6.0. For gcc-1.40, al had to be ``asm("ax")'' for
462 * best results. gcc-2.6.0 can't handle this.
465 __asm
__volatile("outb %0,%%dx" : : "a" (al
), "d" (port
));
469 outl(u_int port
, u_int data
)
472 * outl() and outw() aren't used much so we haven't looked at
473 * possible micro-optimizations such as the unnecessary
474 * assignment for them.
476 __asm
__volatile("outl %0,%%dx" : : "a" (data
), "d" (port
));
480 outsb(u_int port
, const void *addr
, size_t cnt
)
482 __asm
__volatile("cld; rep; outsb"
483 : "+S" (addr
), "+c" (cnt
)
488 outsw(u_int port
, const void *addr
, size_t cnt
)
490 __asm
__volatile("cld; rep; outsw"
491 : "+S" (addr
), "+c" (cnt
)
496 outsl(u_int port
, const void *addr
, size_t cnt
)
498 __asm
__volatile("cld; rep; outsl"
499 : "+S" (addr
), "+c" (cnt
)
504 outw(u_int port
, u_short data
)
506 __asm
__volatile("outw %0,%%dx" : : "a" (data
), "d" (port
));
/* Spin-wait hint without the compiler barrier of cpu_pause(). */
static __inline void
ia32_pause(void)
{
	__asm __volatile("pause");
}
515 static __inline u_long
520 __asm
__volatile("pushfq; popq %0" : "=r" (rf
));
524 static __inline u_int64_t
529 __asm
__volatile("rdmsr" : "=a" (low
), "=d" (high
) : "c" (msr
));
530 return (low
| ((u_int64_t
)high
<< 32));
533 static __inline u_int64_t
538 __asm
__volatile("rdpmc" : "=a" (low
), "=d" (high
) : "c" (pmc
));
539 return (low
| ((u_int64_t
)high
<< 32));
542 #define _RDTSC_SUPPORTED_
544 static __inline tsc_uclock_t
549 __asm
__volatile("rdtsc" : "=a" (low
), "=d" (high
));
550 return (low
| ((tsc_uclock_t
)high
<< 32));
554 #include <machine/cputypes.h>
555 #include <machine/md_var.h>
557 static __inline tsc_uclock_t
560 if (cpu_vendor_id
== CPU_VENDOR_INTEL
)
571 __asm
__volatile("wbinvd");
575 void cpu_wbinvd_on_all_cpus_callback(void *arg
);
578 cpu_wbinvd_on_all_cpus(void)
580 lwkt_cpusync_simple(smp_active_mask
, cpu_wbinvd_on_all_cpus_callback
, NULL
);
585 write_rflags(u_long rf
)
587 __asm
__volatile("pushq %0; popfq" : : "r" (rf
));
591 wrmsr(u_int msr
, u_int64_t newval
)
597 __asm
__volatile("wrmsr"
599 : "a" (low
), "d" (high
), "c" (msr
)
604 load_xcr(u_int xcr
, uint64_t newval
)
611 __asm
__volatile("xsetbv"
613 : "a" (low
), "d" (high
), "c" (xcr
)
617 static __inline
uint64_t
622 __asm
__volatile("xgetbv" : "=a" (low
), "=d" (high
) : "c" (xcr
));
623 return (low
| ((uint64_t)high
<< 32));
627 load_cr0(u_long data
)
629 __asm
__volatile("movq %0,%%cr0" : : "r" (data
) : "memory");
632 static __inline u_long
637 __asm
__volatile("movq %%cr0,%0" : "=r" (data
));
642 load_cr2(u_long data
)
644 __asm
__volatile("movq %0,%%cr2" : : "r" (data
) : "memory");
647 static __inline u_long
652 __asm
__volatile("movq %%cr2,%0" : "=r" (data
));
657 load_cr3(u_long data
)
659 __asm
__volatile("movq %0,%%cr3" : : "r" (data
) : "memory");
662 static __inline u_long
667 __asm
__volatile("movq %%cr3,%0" : "=r" (data
));
672 load_cr4(u_long data
)
674 __asm
__volatile("movq %0,%%cr4" : : "r" (data
) : "memory");
677 static __inline u_long
682 __asm
__volatile("movq %%cr4,%0" : "=r" (data
));
#ifndef _CPU_INVLTLB_DEFINED

/*
 * Invalidate the TLB on this cpu only
 * NOTE(review): body reconstructed (cr3 reload idiom) — confirm upstream.
 */
static __inline void
cpu_invltlb(void)
{
	load_cr3(rcr3());
#if defined(SWTCH_OPTIM_STATS)
	++tlb_flush_count;
#endif
}

#endif

void smp_invltlb(void);
void smp_sniff(void);
void hard_sniff(struct trapframe *);
707 static __inline u_short
711 __asm
__volatile("movw %%fs,%0" : "=rm" (sel
));
715 static __inline u_short
719 __asm
__volatile("movw %%gs,%0" : "=rm" (sel
));
726 __asm
__volatile("movw %0,%%ds" : : "rm" (sel
));
732 __asm
__volatile("movw %0,%%es" : : "rm" (sel
));
736 /* This is defined in <machine/specialreg.h> but is too painful to get to */
738 #define MSR_FSBASE 0xc0000100
743 /* Preserve the fsbase value across the selector load */
744 __asm
__volatile("rdmsr; movw %0,%%fs; wrmsr"
745 : : "rm" (sel
), "c" (MSR_FSBASE
) : "eax", "edx");
749 #define MSR_GSBASE 0xc0000101
755 * Preserve the gsbase value across the selector load.
756 * Note that we have to disable interrupts because the gsbase
757 * being trashed happens to be the kernel gsbase at the time.
759 __asm
__volatile("pushfq; cli; rdmsr; movw %0,%%gs; wrmsr; popfq"
760 : : "rm" (sel
), "c" (MSR_GSBASE
) : "eax", "edx");
763 /* Usable by userland */
767 __asm
__volatile("movw %0,%%fs" : : "rm" (sel
));
773 __asm
__volatile("movw %0,%%gs" : : "rm" (sel
));
777 /* void lidt(struct region_descriptor *addr); */
779 lidt(struct region_descriptor
*addr
)
781 __asm
__volatile("lidt (%0)" : : "r" (addr
));
784 /* void lldt(u_short sel); */
788 __asm
__volatile("lldt %0" : : "r" (sel
));
791 /* void ltr(u_short sel); */
795 __asm
__volatile("ltr %0" : : "r" (sel
));
/*
 * Debug register accessors (%dr0-%dr7).  Loads clobber "memory" so the
 * compiler does not move accesses across a breakpoint-config change.
 */
static __inline u_int64_t
rdr0(void)
{
	u_int64_t data;

	__asm __volatile("movq %%dr0,%0" : "=r" (data));
	return (data);
}

static __inline void
load_dr0(u_int64_t dr0)
{
	__asm __volatile("movq %0,%%dr0" : : "r" (dr0) : "memory");
}

static __inline u_int64_t
rdr1(void)
{
	u_int64_t data;

	__asm __volatile("movq %%dr1,%0" : "=r" (data));
	return (data);
}

static __inline void
load_dr1(u_int64_t dr1)
{
	__asm __volatile("movq %0,%%dr1" : : "r" (dr1) : "memory");
}

static __inline u_int64_t
rdr2(void)
{
	u_int64_t data;

	__asm __volatile("movq %%dr2,%0" : "=r" (data));
	return (data);
}

static __inline void
load_dr2(u_int64_t dr2)
{
	__asm __volatile("movq %0,%%dr2" : : "r" (dr2) : "memory");
}

static __inline u_int64_t
rdr3(void)
{
	u_int64_t data;

	__asm __volatile("movq %%dr3,%0" : "=r" (data));
	return (data);
}

static __inline void
load_dr3(u_int64_t dr3)
{
	__asm __volatile("movq %0,%%dr3" : : "r" (dr3) : "memory");
}

static __inline u_int64_t
rdr4(void)
{
	u_int64_t data;

	__asm __volatile("movq %%dr4,%0" : "=r" (data));
	return (data);
}

static __inline void
load_dr4(u_int64_t dr4)
{
	__asm __volatile("movq %0,%%dr4" : : "r" (dr4) : "memory");
}

static __inline u_int64_t
rdr5(void)
{
	u_int64_t data;

	__asm __volatile("movq %%dr5,%0" : "=r" (data));
	return (data);
}

static __inline void
load_dr5(u_int64_t dr5)
{
	__asm __volatile("movq %0,%%dr5" : : "r" (dr5) : "memory");
}

static __inline u_int64_t
rdr6(void)
{
	u_int64_t data;

	__asm __volatile("movq %%dr6,%0" : "=r" (data));
	return (data);
}

static __inline void
load_dr6(u_int64_t dr6)
{
	__asm __volatile("movq %0,%%dr6" : : "r" (dr6) : "memory");
}

static __inline u_int64_t
rdr7(void)
{
	u_int64_t data;

	__asm __volatile("movq %%dr7,%0" : "=r" (data));
	return (data);
}

static __inline void
load_dr7(u_int64_t dr7)
{
	__asm __volatile("movq %0,%%dr7" : : "r" (dr7) : "memory");
}
910 static __inline register_t
915 rflags
= read_rflags();
921 intr_restore(register_t rflags
)
923 write_rflags(rflags
);
926 #else /* !__GNUC__ */
928 int breakpoint(void);
929 void cpu_pause(void);
930 u_int
bsfl(u_int mask
);
931 u_int
bsrl(u_int mask
);
932 void cpu_disable_intr(void);
933 void cpu_enable_intr(void);
934 void cpu_invlpg(u_long addr
);
935 void cpu_invlpg_range(u_long start
, u_long end
);
936 void do_cpuid(u_int ax
, u_int
*p
);
938 u_char
inb(u_int port
);
939 u_int
inl(u_int port
);
940 void insb(u_int port
, void *addr
, size_t cnt
);
941 void insl(u_int port
, void *addr
, size_t cnt
);
942 void insw(u_int port
, void *addr
, size_t cnt
);
944 void invlpg_range(u_int start
, u_int end
);
945 void cpu_invltlb(void);
946 u_short
inw(u_int port
);
947 void load_cr0(u_int cr0
);
948 void load_cr2(u_int cr2
);
949 void load_cr3(u_int cr3
);
950 void load_cr4(u_int cr4
);
951 void load_fs(u_int sel
);
952 void load_gs(u_int sel
);
953 void lidt(struct region_descriptor
*addr
);
954 void lldt(u_short sel
);
955 void ltr(u_short sel
);
956 void outb(u_int port
, u_char data
);
957 void outl(u_int port
, u_int data
);
958 void outsb(u_int port
, void *addr
, size_t cnt
);
959 void outsl(u_int port
, void *addr
, size_t cnt
);
960 void outsw(u_int port
, void *addr
, size_t cnt
);
961 void outw(u_int port
, u_short data
);
962 void ia32_pause(void);
969 u_int64_t
rdmsr(u_int msr
);
970 u_int64_t
rdpmc(u_int pmc
);
971 tsc_uclock_t
rdtsc(void);
972 u_int
read_rflags(void);
974 void write_rflags(u_int rf
);
975 void wrmsr(u_int msr
, u_int64_t newval
);
976 u_int64_t
rdr0(void);
977 void load_dr0(u_int64_t dr0
);
978 u_int64_t
rdr1(void);
979 void load_dr1(u_int64_t dr1
);
980 u_int64_t
rdr2(void);
981 void load_dr2(u_int64_t dr2
);
982 u_int64_t
rdr3(void);
983 void load_dr3(u_int64_t dr3
);
984 u_int64_t
rdr4(void);
985 void load_dr4(u_int64_t dr4
);
986 u_int64_t
rdr5(void);
987 void load_dr5(u_int64_t dr5
);
988 u_int64_t
rdr6(void);
989 void load_dr6(u_int64_t dr6
);
990 u_int64_t
rdr7(void);
991 void load_dr7(u_int64_t dr7
);
992 register_t
intr_disable(void);
993 void intr_restore(register_t rf
);
995 #endif /* __GNUC__ */
997 int rdmsr_safe(u_int msr
, uint64_t *val
);
998 int wrmsr_safe(u_int msr
, uint64_t newval
);
999 void reset_dbregs(void);
1000 void smap_open(void);
1001 void smap_close(void);
1005 #endif /* !_CPU_CPUFUNC_H_ */