// Copyright 2019 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
16 #include "hwy/targets.h"
18 #ifndef __STDC_FORMAT_MACROS
19 #define __STDC_FORMAT_MACROS // before inttypes.h
21 #include <inttypes.h> // IWYU pragma: keep (PRIx64)
24 #include <stdlib.h> // abort / exit
26 #include "hwy/highway.h"
27 #include "hwy/per_target.h" // VectorBytes
29 #if HWY_IS_ASAN || HWY_IS_MSAN || HWY_IS_TSAN
30 #include "sanitizer/common_interface_defs.h" // __sanitizer_print_stack_trace
34 #include <xmmintrin.h>
37 #else // !HWY_COMPILER_MSVC
39 #endif // HWY_COMPILER_MSVC
41 #elif (HWY_ARCH_ARM || HWY_ARCH_PPC) && HWY_OS_LINUX
42 // sys/auxv.h does not always include asm/hwcap.h, or define HWCAP*, hence we
43 // still include this directly. See #1199.
44 #ifndef TOOLCHAIN_MISS_ASM_HWCAP_H
45 #include <asm/hwcap.h>
47 #ifndef TOOLCHAIN_MISS_SYS_AUXV_H
56 // When running tests, this value can be set to the mocked supported targets
57 // mask. Only written to from a single thread before the test starts.
58 int64_t supported_targets_for_test_
= 0;
60 // Mask of targets disabled at runtime with DisableTargets.
61 int64_t supported_mask_
= LimitsMax
<int64_t>();
63 #if HWY_ARCH_X86 && HWY_HAVE_RUNTIME_DISPATCH
66 // Calls CPUID instruction with eax=level and ecx=count and returns the result
67 // in abcd array where abcd = {eax, ebx, ecx, edx} (hence the name abcd).
68 HWY_INLINE
void Cpuid(const uint32_t level
, const uint32_t count
,
69 uint32_t* HWY_RESTRICT abcd
) {
72 __cpuidex(regs
, level
, count
);
73 for (int i
= 0; i
< 4; ++i
) {
76 #else // HWY_COMPILER_MSVC
81 __cpuid_count(level
, count
, a
, b
, c
, d
);
86 #endif // HWY_COMPILER_MSVC
89 HWY_INLINE
bool IsBitSet(const uint32_t reg
, const int index
) {
90 return (reg
& (1U << index
)) != 0;
// Returns the lower 32 bits of extended control register 0.
// Requires CPU support for "OSXSAVE" (see below).
// NOTE(review): the signature and surrounding #if lines were missing from the
// mangled source; reconstructed — confirm against the original file.
uint32_t ReadXCR0() {
#if HWY_COMPILER_MSVC
  return static_cast<uint32_t>(_xgetbv(0));
#else   // !HWY_COMPILER_MSVC
  uint32_t xcr0, xcr0_high;
  const uint32_t index = 0;
  // XGETBV encoded as raw bytes so this also builds with old assemblers.
  asm volatile(".byte 0x0F, 0x01, 0xD0"
               : "=a"(xcr0), "=d"(xcr0_high)
               : "c"(index));
  return xcr0;
#endif  // HWY_COMPILER_MSVC
}
111 const uint32_t max_level
= abcd
[0];
112 return max_level
>= 1 && abcd
[1] == 0x68747541 && abcd
[2] == 0x444d4163 &&
113 abcd
[3] == 0x69746e65;
// Arbitrary bit indices indicating which instruction set extensions are
// supported. Use enum to ensure values are distinct. The specific values are
// irrelevant (only used via Bit() below), but every enumerator referenced by
// FlagsFromCPUID must be present and kSentinel must remain last.
enum class FeatureIndex : uint32_t {
  kSSE = 0,
  kSSE2,
  kSSE3,
  kSSSE3,

  kSSE41,
  kSSE42,
  kCLMUL,
  kAES,

  kAVX,
  kAVX2,
  kF16C,
  kFMA,
  kLZCNT,
  kBMI,
  kBMI2,

  kAVX512F,
  kAVX512VL,
  kAVX512DQ,
  kAVX512BW,
  kAVX512CD,

  kVNNI,
  kVPCLMULQDQ,
  kVBMI,
  kVBMI2,
  kVAES,
  kPOPCNTDQ,
  kBITALG,
  kGFNI,

  kSentinel
};  // enum FeatureIndex

// All feature bits must fit into the uint64_t returned by FlagsFromCPUID.
static_assert(static_cast<size_t>(FeatureIndex::kSentinel) < 64,
              "Too many bits for u64");
157 HWY_INLINE
constexpr uint64_t Bit(FeatureIndex index
) {
158 return 1ull << static_cast<size_t>(index
);
161 // Returns bit array of FeatureIndex from CPUID feature flags.
162 uint64_t FlagsFromCPUID() {
163 uint64_t flags
= 0; // return value
166 const uint32_t max_level
= abcd
[0];
168 // Standard feature flags
170 flags
|= IsBitSet(abcd
[3], 25) ? Bit(FeatureIndex::kSSE
) : 0;
171 flags
|= IsBitSet(abcd
[3], 26) ? Bit(FeatureIndex::kSSE2
) : 0;
172 flags
|= IsBitSet(abcd
[2], 0) ? Bit(FeatureIndex::kSSE3
) : 0;
173 flags
|= IsBitSet(abcd
[2], 1) ? Bit(FeatureIndex::kCLMUL
) : 0;
174 flags
|= IsBitSet(abcd
[2], 9) ? Bit(FeatureIndex::kSSSE3
) : 0;
175 flags
|= IsBitSet(abcd
[2], 12) ? Bit(FeatureIndex::kFMA
) : 0;
176 flags
|= IsBitSet(abcd
[2], 19) ? Bit(FeatureIndex::kSSE41
) : 0;
177 flags
|= IsBitSet(abcd
[2], 20) ? Bit(FeatureIndex::kSSE42
) : 0;
178 flags
|= IsBitSet(abcd
[2], 25) ? Bit(FeatureIndex::kAES
) : 0;
179 flags
|= IsBitSet(abcd
[2], 28) ? Bit(FeatureIndex::kAVX
) : 0;
180 flags
|= IsBitSet(abcd
[2], 29) ? Bit(FeatureIndex::kF16C
) : 0;
182 // Extended feature flags
183 Cpuid(0x80000001U
, 0, abcd
);
184 flags
|= IsBitSet(abcd
[2], 5) ? Bit(FeatureIndex::kLZCNT
) : 0;
187 if (max_level
>= 7) {
189 flags
|= IsBitSet(abcd
[1], 3) ? Bit(FeatureIndex::kBMI
) : 0;
190 flags
|= IsBitSet(abcd
[1], 5) ? Bit(FeatureIndex::kAVX2
) : 0;
191 flags
|= IsBitSet(abcd
[1], 8) ? Bit(FeatureIndex::kBMI2
) : 0;
193 flags
|= IsBitSet(abcd
[1], 16) ? Bit(FeatureIndex::kAVX512F
) : 0;
194 flags
|= IsBitSet(abcd
[1], 17) ? Bit(FeatureIndex::kAVX512DQ
) : 0;
195 flags
|= IsBitSet(abcd
[1], 28) ? Bit(FeatureIndex::kAVX512CD
) : 0;
196 flags
|= IsBitSet(abcd
[1], 30) ? Bit(FeatureIndex::kAVX512BW
) : 0;
197 flags
|= IsBitSet(abcd
[1], 31) ? Bit(FeatureIndex::kAVX512VL
) : 0;
199 flags
|= IsBitSet(abcd
[2], 1) ? Bit(FeatureIndex::kVBMI
) : 0;
200 flags
|= IsBitSet(abcd
[2], 6) ? Bit(FeatureIndex::kVBMI2
) : 0;
201 flags
|= IsBitSet(abcd
[2], 8) ? Bit(FeatureIndex::kGFNI
) : 0;
202 flags
|= IsBitSet(abcd
[2], 9) ? Bit(FeatureIndex::kVAES
) : 0;
203 flags
|= IsBitSet(abcd
[2], 10) ? Bit(FeatureIndex::kVPCLMULQDQ
) : 0;
204 flags
|= IsBitSet(abcd
[2], 11) ? Bit(FeatureIndex::kVNNI
) : 0;
205 flags
|= IsBitSet(abcd
[2], 12) ? Bit(FeatureIndex::kBITALG
) : 0;
206 flags
|= IsBitSet(abcd
[2], 14) ? Bit(FeatureIndex::kPOPCNTDQ
) : 0;
212 // Each Highway target requires a 'group' of multiple features/flags.
213 constexpr uint64_t kGroupSSE2
=
214 Bit(FeatureIndex::kSSE
) | Bit(FeatureIndex::kSSE2
);
216 constexpr uint64_t kGroupSSSE3
=
217 Bit(FeatureIndex::kSSE3
) | Bit(FeatureIndex::kSSSE3
) | kGroupSSE2
;
219 constexpr uint64_t kGroupSSE4
=
220 Bit(FeatureIndex::kSSE41
) | Bit(FeatureIndex::kSSE42
) |
221 Bit(FeatureIndex::kCLMUL
) | Bit(FeatureIndex::kAES
) | kGroupSSSE3
;
223 // We normally assume BMI/BMI2/FMA are available if AVX2 is. This allows us to
224 // use BZHI and (compiler-generated) MULX. However, VirtualBox lacks them
225 // [https://www.virtualbox.org/ticket/15471]. Thus we provide the option of
226 // avoiding using and requiring these so AVX2 can still be used.
227 #ifdef HWY_DISABLE_BMI2_FMA
228 constexpr uint64_t kGroupBMI2_FMA
= 0;
230 constexpr uint64_t kGroupBMI2_FMA
= Bit(FeatureIndex::kBMI
) |
231 Bit(FeatureIndex::kBMI2
) |
232 Bit(FeatureIndex::kFMA
);
235 #ifdef HWY_DISABLE_F16C
236 constexpr uint64_t kGroupF16C
= 0;
238 constexpr uint64_t kGroupF16C
= Bit(FeatureIndex::kF16C
);
241 constexpr uint64_t kGroupAVX2
=
242 Bit(FeatureIndex::kAVX
) | Bit(FeatureIndex::kAVX2
) |
243 Bit(FeatureIndex::kLZCNT
) | kGroupBMI2_FMA
| kGroupF16C
| kGroupSSE4
;
245 constexpr uint64_t kGroupAVX3
=
246 Bit(FeatureIndex::kAVX512F
) | Bit(FeatureIndex::kAVX512VL
) |
247 Bit(FeatureIndex::kAVX512DQ
) | Bit(FeatureIndex::kAVX512BW
) |
248 Bit(FeatureIndex::kAVX512CD
) | kGroupAVX2
;
250 constexpr uint64_t kGroupAVX3_DL
=
251 Bit(FeatureIndex::kVNNI
) | Bit(FeatureIndex::kVPCLMULQDQ
) |
252 Bit(FeatureIndex::kVBMI
) | Bit(FeatureIndex::kVBMI2
) |
253 Bit(FeatureIndex::kVAES
) | Bit(FeatureIndex::kPOPCNTDQ
) |
254 Bit(FeatureIndex::kBITALG
) | Bit(FeatureIndex::kGFNI
) | kGroupAVX3
;
256 int64_t DetectTargets() {
257 int64_t bits
= 0; // return value of supported targets.
259 bits
|= HWY_SSE2
; // always present in x64
262 const uint64_t flags
= FlagsFromCPUID();
263 // Set target bit(s) if all their group's flags are all set.
264 if ((flags
& kGroupAVX3_DL
) == kGroupAVX3_DL
) {
267 if ((flags
& kGroupAVX3
) == kGroupAVX3
) {
270 if ((flags
& kGroupAVX2
) == kGroupAVX2
) {
273 if ((flags
& kGroupSSE4
) == kGroupSSE4
) {
276 if ((flags
& kGroupSSSE3
) == kGroupSSSE3
) {
280 if ((flags
& kGroupSSE2
) == kGroupSSE2
) {
285 // Clear bits if the OS does not support XSAVE - otherwise, registers
286 // are not preserved across context switches.
289 const bool has_osxsave
= IsBitSet(abcd
[2], 27);
291 const uint32_t xcr0
= ReadXCR0();
292 const int64_t min_avx3
= HWY_AVX3
| HWY_AVX3_DL
;
293 const int64_t min_avx2
= HWY_AVX2
| min_avx3
;
295 if (!IsBitSet(xcr0
, 1)) {
297 // The HWY_SSE2, HWY_SSSE3, and HWY_SSE4 bits do not need to be
298 // cleared on x86_64, even if bit 1 of XCR0 is not set, as
299 // the lower 128 bits of XMM0-XMM15 are guaranteed to be
300 // preserved across context switches on x86_64
302 // Only clear the AVX2/AVX3 bits on x86_64 if bit 1 of XCR0 is not set
305 bits
&= ~(HWY_SSE2
| HWY_SSSE3
| HWY_SSE4
| min_avx2
);
309 if (!IsBitSet(xcr0
, 2)) {
313 if (!IsBitSet(xcr0
, 5) || !IsBitSet(xcr0
, 6) || !IsBitSet(xcr0
, 7)) {
318 // This is mainly to work around the slow Zen4 CompressStore. It's unclear
319 // whether subsequent AMD models will be affected; assume yes.
320 if ((bits
& HWY_AVX3_DL
) && IsAMD()) {
321 bits
|= HWY_AVX3_ZEN4
;
328 #elif HWY_ARCH_ARM && HWY_HAVE_RUNTIME_DISPATCH
330 int64_t DetectTargets() {
331 int64_t bits
= 0; // return value of supported targets.
332 using CapBits
= unsigned long; // NOLINT
333 const CapBits hw
= getauxval(AT_HWCAP
);
337 bits
|= HWY_NEON_WITHOUT_AES
; // aarch64 always has NEON and VFPv4..
339 // .. but not necessarily AES, which is required for HWY_NEON.
340 #if defined(HWCAP_AES)
341 if (hw
& HWCAP_AES
) {
346 #if defined(HWCAP_SVE)
347 if (hw
& HWCAP_SVE
) {
352 #if defined(HWCAP2_SVE2) && defined(HWCAP2_SVEAES)
353 const CapBits hw2
= getauxval(AT_HWCAP2
);
354 if ((hw2
& HWCAP2_SVE2
) && (hw2
& HWCAP2_SVEAES
)) {
359 #else // !HWY_ARCH_ARM_A64
361 // Some old auxv.h / hwcap.h do not define these. If not, treat as unsupported.
362 #if defined(HWCAP_NEON) && defined(HWCAP_VFPv4)
363 if ((hw
& HWCAP_NEON
) && (hw
& HWCAP_VFPv4
)) {
364 bits
|= HWY_NEON_WITHOUT_AES
;
368 // aarch32 would check getauxval(AT_HWCAP2) & HWCAP2_AES, but we do not yet
369 // support that platform, and Armv7 lacks AES entirely. Because HWY_NEON
370 // requires native AES instructions, we do not enable that target here.
372 #endif // HWY_ARCH_ARM_A64
376 #elif HWY_ARCH_PPC && HWY_HAVE_RUNTIME_DISPATCH
// Fallback definitions for toolchains whose hwcap.h lacks these constants.
// Values are the documented Linux AT_HWCAP/AT_HWCAP2 bit masks.
#ifndef PPC_FEATURE_HAS_ALTIVEC
#define PPC_FEATURE_HAS_ALTIVEC 0x10000000
#endif

#ifndef PPC_FEATURE_HAS_VSX
#define PPC_FEATURE_HAS_VSX 0x00000080
#endif

#ifndef PPC_FEATURE2_ARCH_2_07
#define PPC_FEATURE2_ARCH_2_07 0x80000000
#endif

#ifndef PPC_FEATURE2_VEC_CRYPTO
#define PPC_FEATURE2_VEC_CRYPTO 0x02000000
#endif

#ifndef PPC_FEATURE2_ARCH_3_00
#define PPC_FEATURE2_ARCH_3_00 0x00800000
#endif

#ifndef PPC_FEATURE2_ARCH_3_1
#define PPC_FEATURE2_ARCH_3_1 0x00040000
#endif
403 using CapBits
= unsigned long; // NOLINT
405 // For AT_HWCAP, the others are for AT_HWCAP2
406 constexpr CapBits kGroupVSX
= PPC_FEATURE_HAS_ALTIVEC
| PPC_FEATURE_HAS_VSX
;
408 #if defined(HWY_DISABLE_PPC8_CRYPTO)
409 constexpr CapBits kGroupPPC8
= PPC_FEATURE2_ARCH_2_07
;
411 constexpr CapBits kGroupPPC8
= PPC_FEATURE2_ARCH_2_07
| PPC_FEATURE2_VEC_CRYPTO
;
413 constexpr CapBits kGroupPPC9
= kGroupPPC8
| PPC_FEATURE2_ARCH_3_00
;
414 constexpr CapBits kGroupPPC10
= kGroupPPC9
| PPC_FEATURE2_ARCH_3_1
;
416 int64_t DetectTargets() {
417 int64_t bits
= 0; // return value of supported targets.
418 const CapBits hw
= getauxval(AT_HWCAP
);
420 if ((hw
& kGroupVSX
) == kGroupVSX
) {
421 const CapBits hw2
= getauxval(AT_HWCAP2
);
422 if ((hw2
& kGroupPPC8
) == kGroupPPC8
) {
425 if ((hw2
& kGroupPPC9
) == kGroupPPC9
) {
428 if ((hw2
& kGroupPPC10
) == kGroupPPC10
) {
435 #endif // HWY_ARCH_X86
437 // Returns targets supported by the CPU, independently of DisableTargets.
438 // Factored out of SupportedTargets to make its structure more obvious. Note
439 // that x86 CPUID may take several hundred cycles.
440 int64_t DetectTargets() {
441 // Apps will use only one of these (the default is EMU128), but compile flags
442 // for this TU may differ from that of the app, so allow both.
443 int64_t bits
= HWY_SCALAR
| HWY_EMU128
;
445 #if HWY_ARCH_X86 && HWY_HAVE_RUNTIME_DISPATCH
446 bits
|= x86::DetectTargets();
447 #elif HWY_ARCH_ARM && HWY_HAVE_RUNTIME_DISPATCH
448 bits
|= arm::DetectTargets();
449 #elif HWY_ARCH_PPC && HWY_HAVE_RUNTIME_DISPATCH
450 bits
|= ppc::DetectTargets();
453 // TODO(janwas): detect support for WASM/RVV.
454 // This file is typically compiled without HWY_IS_TEST, but targets_test has
455 // it set, and will expect all of its HWY_TARGETS (= all attainable) to be
457 bits
|= HWY_ENABLED_BASELINE
;
460 if ((bits
& HWY_ENABLED_BASELINE
) != HWY_ENABLED_BASELINE
) {
462 "WARNING: CPU supports %" PRIx64
" but software requires %" PRIx64
464 bits
, static_cast<int64_t>(HWY_ENABLED_BASELINE
));
472 HWY_DLLEXPORT HWY_NORETURN
void HWY_FORMAT(3, 4)
473 Abort(const char* file
, int line
, const char* format
, ...) {
476 va_start(args
, format
);
477 vsnprintf(buf
, sizeof(buf
), format
, args
);
480 fprintf(stderr
, "Abort at %s:%d: %s\n", file
, line
, buf
);
482 // If compiled with any sanitizer, they can also print a stack trace.
483 #if HWY_IS_ASAN || HWY_IS_MSAN || HWY_IS_TSAN
484 __sanitizer_print_stack_trace();
488 // Now terminate the program:
490 exit(1); // trap/abort just freeze Spike.
491 #elif HWY_IS_DEBUG_BUILD && !HWY_COMPILER_MSVC
492 // Facilitates breaking into a debugger, but don't use this in non-debug
493 // builds because it looks like "illegal instruction", which is misleading.
496 abort(); // Compile error without this due to HWY_NORETURN.
500 HWY_DLLEXPORT
void DisableTargets(int64_t disabled_targets
) {
501 supported_mask_
= static_cast<int64_t>(~disabled_targets
);
502 // This will take effect on the next call to SupportedTargets, which is
503 // called right before GetChosenTarget::Update. However, calling Update here
504 // would make it appear that HWY_DYNAMIC_DISPATCH was called, which we want
505 // to check in tests. We instead de-initialize such that the next
506 // HWY_DYNAMIC_DISPATCH calls GetChosenTarget::Update via FunctionCache.
507 GetChosenTarget().DeInit();
510 HWY_DLLEXPORT
void SetSupportedTargetsForTest(int64_t targets
) {
511 supported_targets_for_test_
= targets
;
512 GetChosenTarget().DeInit(); // see comment above
515 HWY_DLLEXPORT
int64_t SupportedTargets() {
516 int64_t targets
= supported_targets_for_test_
;
517 if (HWY_LIKELY(targets
== 0)) {
518 // Mock not active. Re-detect instead of caching just in case we're on a
519 // heterogeneous ISA (also requires some app support to pin threads). This
520 // is only reached on the first HWY_DYNAMIC_DISPATCH or after each call to
521 // DisableTargets or SetSupportedTargetsForTest.
522 targets
= DetectTargets();
524 // VectorBytes invokes HWY_DYNAMIC_DISPATCH. To prevent infinite recursion,
525 // first set up ChosenTarget. No need to Update() again afterwards with the
526 // final targets - that will be done by a caller of this function.
527 GetChosenTarget().Update(targets
);
529 // Now that we can call VectorBytes, check for targets with specific sizes.
530 if (HWY_ARCH_ARM_A64
) {
531 const size_t vec_bytes
= VectorBytes(); // uncached, see declaration
532 if ((targets
& HWY_SVE
) && vec_bytes
== 32) {
533 targets
= static_cast<int64_t>(targets
| HWY_SVE_256
);
535 targets
= static_cast<int64_t>(targets
& ~HWY_SVE_256
);
537 if ((targets
& HWY_SVE2
) && vec_bytes
== 16) {
538 targets
= static_cast<int64_t>(targets
| HWY_SVE2_128
);
540 targets
= static_cast<int64_t>(targets
& ~HWY_SVE2_128
);
542 } // HWY_ARCH_ARM_A64
545 targets
&= supported_mask_
;
546 return targets
== 0 ? HWY_STATIC_TARGET
: targets
;
549 HWY_DLLEXPORT ChosenTarget
& GetChosenTarget() {
550 static ChosenTarget chosen_target
;
551 return chosen_target
;