1 // Copyright 2020 Google LLC
2 // SPDX-License-Identifier: Apache-2.0
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
8 // http://www.apache.org/licenses/LICENSE-2.0
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
16 #ifndef HIGHWAY_HWY_BASE_H_
17 #define HIGHWAY_HWY_BASE_H_
19 // For SIMD module implementations and their callers, target-independent.
21 // IWYU pragma: begin_exports
25 #include "hwy/detect_compiler_arch.h"
26 #include "hwy/highway_export.h"
28 // "IWYU pragma: keep" does not work for this include, so hide it from the IDE.
29 #if ((HWY_ARCH_X86 && !defined(HWY_NO_LIBCXX)) || HWY_COMPILER_MSVC) && !HWY_IDE
32 // IWYU pragma: end_exports
35 #include <string.h> // memcpy
38 //------------------------------------------------------------------------------
39 // Compiler-specific definitions
// Two-step stringification: HWY_STR first lets the preprocessor expand its
// argument, then HWY_STR_IMPL converts the resulting tokens to a string
// literal. Calling HWY_STR_IMPL directly stringifies the argument verbatim.
#define HWY_STR_IMPL(expanded) #expanded
#define HWY_STR(token) HWY_STR_IMPL(token)
48 #define HWY_RESTRICT __restrict
49 #define HWY_INLINE __forceinline
50 #define HWY_NOINLINE __declspec(noinline)
52 #define HWY_NORETURN __declspec(noreturn)
53 #define HWY_LIKELY(expr) (expr)
54 #define HWY_UNLIKELY(expr) (expr)
55 #define HWY_PRAGMA(tokens) __pragma(tokens)
56 #define HWY_DIAGNOSTICS(tokens) HWY_PRAGMA(warning(tokens))
57 #define HWY_DIAGNOSTICS_OFF(msc, gcc) HWY_DIAGNOSTICS(msc)
58 #define HWY_MAYBE_UNUSED
59 #define HWY_HAS_ASSUME_ALIGNED 0
60 #if (_MSC_VER >= 1700)
61 #define HWY_MUST_USE_RESULT _Check_return_
63 #define HWY_MUST_USE_RESULT
68 #define HWY_RESTRICT __restrict__
69 // force inlining without optimization enabled creates very inefficient code
70 // that can cause compiler timeout
72 #define HWY_INLINE inline __attribute__((always_inline))
74 #define HWY_INLINE inline
76 #define HWY_NOINLINE __attribute__((noinline))
77 #define HWY_FLATTEN __attribute__((flatten))
78 #define HWY_NORETURN __attribute__((noreturn))
79 #define HWY_LIKELY(expr) __builtin_expect(!!(expr), 1)
80 #define HWY_UNLIKELY(expr) __builtin_expect(!!(expr), 0)
81 #define HWY_PRAGMA(tokens) _Pragma(#tokens)
82 #define HWY_DIAGNOSTICS(tokens) HWY_PRAGMA(GCC diagnostic tokens)
83 #define HWY_DIAGNOSTICS_OFF(msc, gcc) HWY_DIAGNOSTICS(gcc)
84 // Encountered "attribute list cannot appear here" when using the C++17
85 // [[maybe_unused]], so only use the old style attribute for now.
86 #define HWY_MAYBE_UNUSED __attribute__((unused))
87 #define HWY_MUST_USE_RESULT __attribute__((warn_unused_result))
89 #endif // !HWY_COMPILER_MSVC
91 //------------------------------------------------------------------------------
94 // Enables error-checking of format strings.
95 #if HWY_HAS_ATTRIBUTE(__format__)
96 #define HWY_FORMAT(idx_fmt, idx_arg) \
97 __attribute__((__format__(__printf__, idx_fmt, idx_arg)))
99 #define HWY_FORMAT(idx_fmt, idx_arg)
102 // Returns a void* pointer which the compiler then assumes is N-byte aligned.
103 // Example: float* HWY_RESTRICT aligned = (float*)HWY_ASSUME_ALIGNED(in, 32);
105 // The assignment semantics are required by GCC/Clang. ICC provides an in-place
106 // __assume_aligned, whereas MSVC's __assume appears unsuitable.
107 #if HWY_HAS_BUILTIN(__builtin_assume_aligned)
108 #define HWY_ASSUME_ALIGNED(ptr, align) __builtin_assume_aligned((ptr), (align))
110 #define HWY_ASSUME_ALIGNED(ptr, align) (ptr) /* not supported */
113 // Clang and GCC require attributes on each function into which SIMD intrinsics
114 // are inlined. Support both per-function annotation (HWY_ATTR) for lambdas and
115 // automatic annotation via pragmas.
116 #if HWY_COMPILER_CLANG
117 #define HWY_PUSH_ATTRIBUTES(targets_str) \
118 HWY_PRAGMA(clang attribute push(__attribute__((target(targets_str))), \
119 apply_to = function))
120 #define HWY_POP_ATTRIBUTES HWY_PRAGMA(clang attribute pop)
121 #elif HWY_COMPILER_GCC
122 #define HWY_PUSH_ATTRIBUTES(targets_str) \
123 HWY_PRAGMA(GCC push_options) HWY_PRAGMA(GCC target targets_str)
124 #define HWY_POP_ATTRIBUTES HWY_PRAGMA(GCC pop_options)
126 #define HWY_PUSH_ATTRIBUTES(targets_str)
127 #define HWY_POP_ATTRIBUTES
130 //------------------------------------------------------------------------------
133 #define HWY_API static HWY_INLINE HWY_FLATTEN HWY_MAYBE_UNUSED
// Token pasting in two steps, so that macro arguments are expanded before
// they are joined by ##.
#define HWY_CONCAT_IMPL(lhs, rhs) lhs##rhs
#define HWY_CONCAT(lhs, rhs) HWY_CONCAT_IMPL(lhs, rhs)
// Smaller/larger of two values. These are macros, so the selected argument is
// evaluated twice; do not pass expressions with side effects.
#define HWY_MIN(x, y) ((x) < (y) ? (x) : (y))
#define HWY_MAX(x, y) ((x) > (y) ? (x) : (y))
141 #if HWY_COMPILER_GCC_ACTUAL
142 // nielskm: GCC does not support '#pragma GCC unroll' without the factor.
143 #define HWY_UNROLL(factor) HWY_PRAGMA(GCC unroll factor)
144 #define HWY_DEFAULT_UNROLL HWY_UNROLL(4)
145 #elif HWY_COMPILER_CLANG || HWY_COMPILER_ICC || HWY_COMPILER_ICX
146 #define HWY_UNROLL(factor) HWY_PRAGMA(unroll factor)
147 #define HWY_DEFAULT_UNROLL HWY_UNROLL()
149 #define HWY_UNROLL(factor)
150 #define HWY_DEFAULT_UNROLL
153 // Tell a compiler that the expression always evaluates to true.
154 // The expression should be free from any side effects.
155 // Some older compilers may have trouble with complex expressions, therefore
156 // it is advisable to split multiple conditions into separate assume statements,
157 // and manually check the generated code.
158 // OK but could fail:
159 // HWY_ASSUME(x == 2 && y == 3);
161 // HWY_ASSUME(x == 2);
162 // HWY_ASSUME(y == 3);
163 #if HWY_HAS_CPP_ATTRIBUTE(assume)
164 #define HWY_ASSUME(expr) [[assume(expr)]]
165 #elif HWY_COMPILER_MSVC || HWY_COMPILER_ICC
166 #define HWY_ASSUME(expr) __assume(expr)
167 // __builtin_assume() was added in clang 3.6.
168 #elif HWY_COMPILER_CLANG && HWY_HAS_BUILTIN(__builtin_assume)
169 #define HWY_ASSUME(expr) __builtin_assume(expr)
170 // __builtin_unreachable() was added in GCC 4.5, but __has_builtin() was added
171 // later, so check for the compiler version directly.
172 #elif HWY_COMPILER_GCC_ACTUAL >= 405
173 #define HWY_ASSUME(expr) \
174 ((expr) ? static_cast<void>(0) : __builtin_unreachable())
176 #define HWY_ASSUME(expr) static_cast<void>(0)
179 // Compile-time fence to prevent undesirable code reordering. On Clang x86, the
180 // typical asm volatile("" : : : "memory") has no effect, whereas atomic fence
181 // does, without generating code.
182 #if HWY_ARCH_X86 && !defined(HWY_NO_LIBCXX)
183 #define HWY_FENCE std::atomic_thread_fence(std::memory_order_acq_rel)
185 // TODO(janwas): investigate alternatives. On Arm, the above generates barriers.
// Expands to four comma-separated copies of `value`, e.g. to fill the
// initializer of an array passed to LoadDup128.
#define HWY_REP4(value) value, value, value, value
// Forwards the current file and line plus a printf-style message to
// ::hwy::Abort. `##__VA_ARGS__` drops the preceding comma when no variadic
// arguments are given, so HWY_ABORT("text") is also valid.
#define HWY_ABORT(fmt, ...) \
  ::hwy::Abort(__FILE__, __LINE__, fmt, ##__VA_ARGS__)
196 #define HWY_ASSERT(condition) \
198 if (!(condition)) { \
199 HWY_ABORT("Assert %s", #condition); \
203 #if HWY_HAS_FEATURE(memory_sanitizer) || defined(MEMORY_SANITIZER)
204 #define HWY_IS_MSAN 1
206 #define HWY_IS_MSAN 0
209 #if HWY_HAS_FEATURE(address_sanitizer) || defined(ADDRESS_SANITIZER)
210 #define HWY_IS_ASAN 1
212 #define HWY_IS_ASAN 0
215 #if HWY_HAS_FEATURE(thread_sanitizer) || defined(THREAD_SANITIZER)
216 #define HWY_IS_TSAN 1
218 #define HWY_IS_TSAN 0
221 // MSAN may cause lengthy build times or false positives e.g. in AVX3 DemoteTo.
222 // You can disable MSAN by adding this attribute to the function that fails.
224 #define HWY_ATTR_NO_MSAN __attribute__((no_sanitize_memory))
226 #define HWY_ATTR_NO_MSAN
229 // For enabling HWY_DASSERT and shortening tests in slower debug builds
230 #if !defined(HWY_IS_DEBUG_BUILD)
231 // Clang does not define NDEBUG, but it and GCC define __OPTIMIZE__, and recent
232 // MSVC defines NDEBUG (if not, could instead check _DEBUG).
233 #if (!defined(__OPTIMIZE__) && !defined(NDEBUG)) || HWY_IS_ASAN || \
234 HWY_IS_MSAN || HWY_IS_TSAN || defined(__clang_analyzer__)
235 #define HWY_IS_DEBUG_BUILD 1
237 #define HWY_IS_DEBUG_BUILD 0
239 #endif // HWY_IS_DEBUG_BUILD
241 #if HWY_IS_DEBUG_BUILD
242 #define HWY_DASSERT(condition) HWY_ASSERT(condition)
244 #define HWY_DASSERT(condition) \
251 //------------------------------------------------------------------------------
252 // kMaxVectorSize (undocumented, pending removal)
255 static constexpr HWY_MAYBE_UNUSED
size_t kMaxVectorSize
= 64; // AVX-512
256 #elif HWY_ARCH_RVV && defined(__riscv_v_intrinsic) && \
257 __riscv_v_intrinsic >= 11000
258 // Not actually an upper bound on the size.
259 static constexpr HWY_MAYBE_UNUSED
size_t kMaxVectorSize
= 4096;
261 static constexpr HWY_MAYBE_UNUSED
size_t kMaxVectorSize
= 16;
264 //------------------------------------------------------------------------------
267 // Potentially useful for LoadDup128 and capped vectors. In other cases, arrays
268 // should be allocated dynamically via aligned_allocator.h because Lanes() may
269 // exceed the stack size.
271 #define HWY_ALIGN_MAX alignas(64)
272 #elif HWY_ARCH_RVV && defined(__riscv_v_intrinsic) && \
273 __riscv_v_intrinsic >= 11000
274 #define HWY_ALIGN_MAX alignas(8) // only elements need be aligned
276 #define HWY_ALIGN_MAX alignas(16)
279 //------------------------------------------------------------------------------
282 // Match [u]int##_t naming scheme so rvv-inl.h macros can obtain the type name
283 // by concatenating base type and bits.
285 #pragma pack(push, 1)
287 // ACLE (https://gcc.gnu.org/onlinedocs/gcc/Half-Precision.html):
288 // always supported on Armv8, for Armv7 only if -mfp16-format is given.
289 #if ((HWY_ARCH_ARM_A64 || (__ARM_FP & 2)) && HWY_COMPILER_GCC)
290 using float16_t
= __fp16
;
291 // C11 extension ISO/IEC TS 18661-3:2015 but not supported on all targets.
292 // Required for Clang RVV if the float16 extension is used.
293 #elif HWY_ARCH_RVV && HWY_COMPILER_CLANG && defined(__riscv_zvfh)
294 using float16_t
= _Float16
;
308 using float32_t
= float;
309 using float64_t
= double;
311 #pragma pack(push, 1)
313 // Aligned 128-bit type. Cannot use __int128 because clang doesn't yet align it:
314 // https://reviews.llvm.org/D86310
315 struct alignas(16) uint128_t
{
316 uint64_t lo
; // little-endian layout
320 // 64 bit key plus 64 bit value. Faster than using uint128_t when only the key
321 // field is to be compared (Lt128Upper instead of Lt128).
322 struct alignas(16) K64V64
{
323 uint64_t value
; // little-endian layout
327 // 32 bit key plus 32 bit value. Allows vqsort recursions to terminate earlier
328 // than when considering both to be a 64-bit key.
329 struct alignas(8) K32V32
{
330 uint32_t value
; // little-endian layout
336 static inline HWY_MAYBE_UNUSED
bool operator<(const uint128_t
& a
,
337 const uint128_t
& b
) {
338 return (a
.hi
== b
.hi
) ? a
.lo
< b
.lo
: a
.hi
< b
.hi
;
340 // Required for std::greater.
341 static inline HWY_MAYBE_UNUSED
bool operator>(const uint128_t
& a
,
342 const uint128_t
& b
) {
345 static inline HWY_MAYBE_UNUSED
bool operator==(const uint128_t
& a
,
346 const uint128_t
& b
) {
347 return a
.lo
== b
.lo
&& a
.hi
== b
.hi
;
350 static inline HWY_MAYBE_UNUSED
bool operator<(const K64V64
& a
,
352 return a
.key
< b
.key
;
354 // Required for std::greater.
355 static inline HWY_MAYBE_UNUSED
bool operator>(const K64V64
& a
,
359 static inline HWY_MAYBE_UNUSED
bool operator==(const K64V64
& a
,
361 return a
.key
== b
.key
;
364 static inline HWY_MAYBE_UNUSED
bool operator<(const K32V32
& a
,
366 return a
.key
< b
.key
;
368 // Required for std::greater.
369 static inline HWY_MAYBE_UNUSED
bool operator>(const K32V32
& a
,
373 static inline HWY_MAYBE_UNUSED
bool operator==(const K32V32
& a
,
375 return a
.key
== b
.key
;
378 //------------------------------------------------------------------------------
379 // Controlling overload resolution (SFINAE)
381 template <bool Condition
>
384 struct EnableIfT
<true> {
388 template <bool Condition
>
389 using EnableIf
= typename EnableIfT
<Condition
>::type
;
391 template <typename T
, typename U
>
396 template <typename T
>
397 struct IsSameT
<T
, T
> {
401 template <typename T
, typename U
>
402 HWY_API
constexpr bool IsSame() {
403 return IsSameT
<T
, U
>::value
;
406 template <bool Condition
, typename Then
, typename Else
>
411 template <class Then
, class Else
>
412 struct IfT
<false, Then
, Else
> {
416 template <bool Condition
, typename Then
, typename Else
>
417 using If
= typename IfT
<Condition
, Then
, Else
>::type
;
419 // Insert into template/function arguments to enable this overload only for
420 // vectors of exactly, at most (LE), or more than (GT) this many bytes.
422 // As an example, checking for a total size of 16 bytes will match both
423 // Simd<uint8_t, 16, 0> and Simd<uint8_t, 8, 1>.
// Enabled when the total vector size kN * sizeof(T) is equal to / at most /
// greater than `bytes`. The arguments are parenthesized so that expressions
// such as `N + 1` keep their meaning after expansion, and each condition is
// wrapped in parentheses so that a `>` or `>>` inside an argument cannot
// terminate the template argument list early.
#define HWY_IF_V_SIZE(T, kN, bytes) \
  hwy::EnableIf<((kN) * sizeof(T) == (bytes))>* = nullptr
#define HWY_IF_V_SIZE_LE(T, kN, bytes) \
  hwy::EnableIf<((kN) * sizeof(T) <= (bytes))>* = nullptr
#define HWY_IF_V_SIZE_GT(T, kN, bytes) \
  hwy::EnableIf<((kN) * sizeof(T) > (bytes))>* = nullptr
// Enabled when the lane count kN is equal to / at most / greater than `lanes`.
// Both arguments are parenthesized so that argument expressions with lower
// precedence than the comparison (e.g. `a & b`) are compared as a whole.
#define HWY_IF_LANES(kN, lanes) hwy::EnableIf<((kN) == (lanes))>* = nullptr
#define HWY_IF_LANES_LE(kN, lanes) hwy::EnableIf<((kN) <= (lanes))>* = nullptr
#define HWY_IF_LANES_GT(kN, lanes) hwy::EnableIf<((kN) > (lanes))>* = nullptr
// Enabled when IsSigned<T>() is false. IsSigned is qualified with hwy::, like
// the HWY_IF_FLOAT family, so the macro also works outside namespace hwy.
#define HWY_IF_UNSIGNED(T) hwy::EnableIf<!hwy::IsSigned<T>()>* = nullptr
436 #define HWY_IF_SIGNED(T) \
437 hwy::EnableIf<IsSigned<T>() && !IsFloat<T>() && !IsSpecialFloat<T>()>* = \
439 #define HWY_IF_FLOAT(T) hwy::EnableIf<hwy::IsFloat<T>()>* = nullptr
440 #define HWY_IF_NOT_FLOAT(T) hwy::EnableIf<!hwy::IsFloat<T>()>* = nullptr
441 #define HWY_IF_SPECIAL_FLOAT(T) \
442 hwy::EnableIf<hwy::IsSpecialFloat<T>()>* = nullptr
443 #define HWY_IF_NOT_SPECIAL_FLOAT(T) \
444 hwy::EnableIf<!hwy::IsSpecialFloat<T>()>* = nullptr
445 #define HWY_IF_NOT_FLOAT_NOR_SPECIAL(T) \
446 hwy::EnableIf<!hwy::IsFloat<T>() && !hwy::IsSpecialFloat<T>()>* = nullptr
448 #define HWY_IF_T_SIZE(T, bytes) hwy::EnableIf<sizeof(T) == (bytes)>* = nullptr
449 #define HWY_IF_NOT_T_SIZE(T, bytes) \
450 hwy::EnableIf<sizeof(T) != (bytes)>* = nullptr
451 // bit_array = 0x102 means 1 or 8 bytes. There is no NONE_OF because it sounds
452 // too similar. If you want the opposite of this (2 or 4 bytes), ask for those
453 // bits explicitly (0x14) instead of attempting to 'negate' 0x102.
454 #define HWY_IF_T_SIZE_ONE_OF(T, bit_array) \
455 hwy::EnableIf<((size_t{1} << sizeof(T)) & (bit_array)) != 0>* = nullptr
457 // Use instead of HWY_IF_T_SIZE to avoid ambiguity with float/double
// Enabled only for the 32-bit (resp. 64-bit) signed or unsigned integer types.
// IsSame is qualified with hwy:: so the macros also work outside namespace hwy.
#define HWY_IF_UI32(T)                                                      \
  hwy::EnableIf<hwy::IsSame<T, uint32_t>() || hwy::IsSame<T, int32_t>()>* = \
      nullptr
#define HWY_IF_UI64(T)                                                      \
  hwy::EnableIf<hwy::IsSame<T, uint64_t>() || hwy::IsSame<T, int64_t>()>* = \
      nullptr
// Enabled when the number of T lanes within one block equals LANES, where the
// block size is min(sizeof(T) * N, 16) bytes. N is parenthesized so that an
// argument such as `N / 2 + 1` is multiplied as a whole.
#define HWY_IF_LANES_PER_BLOCK(T, N, LANES) \
  hwy::EnableIf<HWY_MIN(sizeof(T) * (N), 16) / sizeof(T) == (LANES)>* = nullptr
467 // Empty struct used as a size tag type.
472 struct RemoveConstT
{
476 struct RemoveConstT
<const T
> {
481 using RemoveConst
= typename RemoveConstT
<T
>::type
;
488 struct RemoveRefT
<T
&> {
493 using RemoveRef
= typename RemoveRefT
<T
>::type
;
495 //------------------------------------------------------------------------------
500 template <typename T
>
503 struct Relations
<uint8_t> {
504 using Unsigned
= uint8_t;
505 using Signed
= int8_t;
506 using Wide
= uint16_t;
507 enum { is_signed
= 0, is_float
= 0 };
510 struct Relations
<int8_t> {
511 using Unsigned
= uint8_t;
512 using Signed
= int8_t;
513 using Wide
= int16_t;
514 enum { is_signed
= 1, is_float
= 0 };
517 struct Relations
<uint16_t> {
518 using Unsigned
= uint16_t;
519 using Signed
= int16_t;
520 using Wide
= uint32_t;
521 using Narrow
= uint8_t;
522 enum { is_signed
= 0, is_float
= 0 };
525 struct Relations
<int16_t> {
526 using Unsigned
= uint16_t;
527 using Signed
= int16_t;
528 using Wide
= int32_t;
529 using Narrow
= int8_t;
530 enum { is_signed
= 1, is_float
= 0 };
533 struct Relations
<uint32_t> {
534 using Unsigned
= uint32_t;
535 using Signed
= int32_t;
537 using Wide
= uint64_t;
538 using Narrow
= uint16_t;
539 enum { is_signed
= 0, is_float
= 0 };
542 struct Relations
<int32_t> {
543 using Unsigned
= uint32_t;
544 using Signed
= int32_t;
546 using Wide
= int64_t;
547 using Narrow
= int16_t;
548 enum { is_signed
= 1, is_float
= 0 };
551 struct Relations
<uint64_t> {
552 using Unsigned
= uint64_t;
553 using Signed
= int64_t;
554 using Float
= double;
555 using Wide
= uint128_t
;
556 using Narrow
= uint32_t;
557 enum { is_signed
= 0, is_float
= 0 };
560 struct Relations
<int64_t> {
561 using Unsigned
= uint64_t;
562 using Signed
= int64_t;
563 using Float
= double;
564 using Narrow
= int32_t;
565 enum { is_signed
= 1, is_float
= 0 };
568 struct Relations
<uint128_t
> {
569 using Unsigned
= uint128_t
;
570 using Narrow
= uint64_t;
571 enum { is_signed
= 0, is_float
= 0 };
574 struct Relations
<float16_t
> {
575 using Unsigned
= uint16_t;
576 using Signed
= int16_t;
577 using Float
= float16_t
;
579 enum { is_signed
= 1, is_float
= 1 };
582 struct Relations
<bfloat16_t
> {
583 using Unsigned
= uint16_t;
584 using Signed
= int16_t;
586 enum { is_signed
= 1, is_float
= 1 };
589 struct Relations
<float> {
590 using Unsigned
= uint32_t;
591 using Signed
= int32_t;
594 using Narrow
= float16_t
;
595 enum { is_signed
= 1, is_float
= 1 };
598 struct Relations
<double> {
599 using Unsigned
= uint64_t;
600 using Signed
= int64_t;
601 using Float
= double;
602 using Narrow
= float;
603 enum { is_signed
= 1, is_float
= 1 };
609 struct TypeFromSize
<1> {
610 using Unsigned
= uint8_t;
611 using Signed
= int8_t;
614 struct TypeFromSize
<2> {
615 using Unsigned
= uint16_t;
616 using Signed
= int16_t;
619 struct TypeFromSize
<4> {
620 using Unsigned
= uint32_t;
621 using Signed
= int32_t;
625 struct TypeFromSize
<8> {
626 using Unsigned
= uint64_t;
627 using Signed
= int64_t;
628 using Float
= double;
631 struct TypeFromSize
<16> {
632 using Unsigned
= uint128_t
;
635 } // namespace detail
637 // Aliases for types of a different category, but the same size.
638 template <typename T
>
639 using MakeUnsigned
= typename
detail::Relations
<T
>::Unsigned
;
640 template <typename T
>
641 using MakeSigned
= typename
detail::Relations
<T
>::Signed
;
642 template <typename T
>
643 using MakeFloat
= typename
detail::Relations
<T
>::Float
;
645 // Aliases for types of the same category, but different size.
646 template <typename T
>
647 using MakeWide
= typename
detail::Relations
<T
>::Wide
;
648 template <typename T
>
649 using MakeNarrow
= typename
detail::Relations
<T
>::Narrow
;
651 // Obtain type from its size [bytes].
653 using UnsignedFromSize
= typename
detail::TypeFromSize
<N
>::Unsigned
;
655 using SignedFromSize
= typename
detail::TypeFromSize
<N
>::Signed
;
657 using FloatFromSize
= typename
detail::TypeFromSize
<N
>::Float
;
659 // Avoid confusion with SizeTag where the parameter is a lane size.
660 using UnsignedTag
= SizeTag
<0>;
661 using SignedTag
= SizeTag
<0x100>; // integer
662 using FloatTag
= SizeTag
<0x200>;
664 template <typename T
, class R
= detail::Relations
<T
>>
665 constexpr auto TypeTag() -> hwy::SizeTag
<((R::is_signed
+ R::is_float
) << 8)> {
666 return hwy::SizeTag
<((R::is_signed
+ R::is_float
) << 8)>();
669 // For when we only want to distinguish FloatTag from everything else.
670 using NonFloatTag
= SizeTag
<0x400>;
672 template <typename T
, class R
= detail::Relations
<T
>>
673 constexpr auto IsFloatTag() -> hwy::SizeTag
<(R::is_float
? 0x200 : 0x400)> {
674 return hwy::SizeTag
<(R::is_float
? 0x200 : 0x400)>();
677 //------------------------------------------------------------------------------
680 template <typename T
>
681 HWY_API
constexpr bool IsFloat() {
682 // Cannot use T(1.25) != T(1) for float16_t, which can only be converted to or
683 // from a float, not compared.
684 return IsSame
<T
, float>() || IsSame
<T
, double>();
687 // These types are often special-cased and not supported in all ops.
688 template <typename T
>
689 HWY_API
constexpr bool IsSpecialFloat() {
690 return IsSame
<T
, float16_t
>() || IsSame
<T
, bfloat16_t
>();
693 template <typename T
>
694 HWY_API
constexpr bool IsSigned() {
698 constexpr bool IsSigned
<float16_t
>() {
702 constexpr bool IsSigned
<bfloat16_t
>() {
706 // Largest/smallest representable integer values.
707 template <typename T
>
708 HWY_API
constexpr T
LimitsMax() {
709 static_assert(!IsFloat
<T
>(), "Only for integer types");
710 using TU
= MakeUnsigned
<T
>;
711 return static_cast<T
>(IsSigned
<T
>() ? (static_cast<TU
>(~0ull) >> 1)
712 : static_cast<TU
>(~0ull));
714 template <typename T
>
715 HWY_API
constexpr T
LimitsMin() {
716 static_assert(!IsFloat
<T
>(), "Only for integer types");
717 return IsSigned
<T
>() ? T(-1) - LimitsMax
<T
>() : T(0);
720 // Largest/smallest representable value (integer or float). This naming avoids
721 // confusion with numeric_limits<float>::min() (the smallest positive value).
722 template <typename T
>
723 HWY_API
constexpr T
LowestValue() {
724 return LimitsMin
<T
>();
727 constexpr float LowestValue
<float>() {
728 return -3.402823466e+38F
;
731 constexpr double LowestValue
<double>() {
732 return -1.7976931348623158e+308;
735 template <typename T
>
736 HWY_API
constexpr T
HighestValue() {
737 return LimitsMax
<T
>();
740 constexpr float HighestValue
<float>() {
741 return 3.402823466e+38F
;
744 constexpr double HighestValue
<double>() {
745 return 1.7976931348623158e+308;
748 // Difference between 1.0 and the next representable value.
749 template <typename T
>
750 HWY_API
constexpr T
Epsilon() {
754 constexpr float Epsilon
<float>() {
755 return 1.192092896e-7f
;
758 constexpr double Epsilon
<double>() {
759 return 2.2204460492503131e-16;
762 // Returns width in bits of the mantissa field in IEEE binary32/64.
763 template <typename T
>
764 constexpr int MantissaBits() {
765 static_assert(sizeof(T
) == 0, "Only instantiate the specializations");
769 constexpr int MantissaBits
<float>() {
773 constexpr int MantissaBits
<double>() {
777 // Returns the (left-shifted by one bit) IEEE binary32/64 representation with
778 // the largest possible (biased) exponent field. Used by IsInf.
779 template <typename T
>
780 constexpr MakeSigned
<T
> MaxExponentTimes2() {
781 return -(MakeSigned
<T
>{1} << (MantissaBits
<T
>() + 1));
784 // Returns bitmask of the sign bit in IEEE binary32/64.
785 template <typename T
>
786 constexpr MakeUnsigned
<T
> SignMask() {
787 return MakeUnsigned
<T
>{1} << (sizeof(T
) * 8 - 1);
790 // Returns bitmask of the exponent field in IEEE binary32/64.
791 template <typename T
>
792 constexpr MakeUnsigned
<T
> ExponentMask() {
793 return (~(MakeUnsigned
<T
>{1} << MantissaBits
<T
>()) + 1) & ~SignMask
<T
>();
796 // Returns bitmask of the mantissa field in IEEE binary32/64.
797 template <typename T
>
798 constexpr MakeUnsigned
<T
> MantissaMask() {
799 return (MakeUnsigned
<T
>{1} << MantissaBits
<T
>()) - 1;
// Returns 1 << mantissa_bits as a floating-point number. All integers whose
// absolute value is less than this can be represented exactly.
804 template <typename T
>
805 constexpr T
MantissaEnd() {
806 static_assert(sizeof(T
) == 0, "Only instantiate the specializations");
810 constexpr float MantissaEnd
<float>() {
811 return 8388608.0f
; // 1 << 23
814 constexpr double MantissaEnd
<double>() {
815 // floating point literal with p52 requires C++17.
816 return 4503599627370496.0; // 1 << 52
819 // Returns width in bits of the exponent field in IEEE binary32/64.
820 template <typename T
>
821 constexpr int ExponentBits() {
822 // Exponent := remaining bits after deducting sign and mantissa.
823 return 8 * sizeof(T
) - 1 - MantissaBits
<T
>();
826 // Returns largest value of the biased exponent field in IEEE binary32/64,
827 // right-shifted so that the LSB is bit zero. Example: 0xFF for float.
828 // This is expressed as a signed integer for more efficient comparison.
829 template <typename T
>
830 constexpr MakeSigned
<T
> MaxExponentField() {
831 return (MakeSigned
<T
>{1} << ExponentBits
<T
>()) - 1;
834 //------------------------------------------------------------------------------
// Returns (a + b - 1) / b converted to the type of `a`, i.e. a / b rounded up
// for non-negative `a` and positive `b`. The intermediate sum a + b - 1 must
// not overflow.
template <typename TNum, typename TDen>
constexpr inline TNum DivCeil(TNum a, TDen b) {
  return static_cast<TNum>((a + b - 1) / b);
}
// Rounds `what` up to the next multiple of `align` (unchanged if it already
// is one). Works for any nonzero `align`; if it is a power of two, the
// compiler emits ADD+AND.
constexpr inline size_t RoundUpTo(size_t what, size_t align) {
  return ((what + align - 1) / align) * align;
}
847 // Undefined results for x == 0.
848 HWY_API
size_t Num0BitsBelowLS1Bit_Nonzero32(const uint32_t x
) {
849 #if HWY_COMPILER_MSVC
850 unsigned long index
; // NOLINT
851 _BitScanForward(&index
, x
);
853 #else // HWY_COMPILER_MSVC
854 return static_cast<size_t>(__builtin_ctz(x
));
855 #endif // HWY_COMPILER_MSVC
858 HWY_API
size_t Num0BitsBelowLS1Bit_Nonzero64(const uint64_t x
) {
859 #if HWY_COMPILER_MSVC
861 unsigned long index
; // NOLINT
862 _BitScanForward64(&index
, x
);
864 #else // HWY_ARCH_X86_64
865 // _BitScanForward64 not available
866 uint32_t lsb
= static_cast<uint32_t>(x
& 0xFFFFFFFF);
867 unsigned long index
; // NOLINT
869 uint32_t msb
= static_cast<uint32_t>(x
>> 32u);
870 _BitScanForward(&index
, msb
);
873 _BitScanForward(&index
, lsb
);
876 #endif // HWY_ARCH_X86_64
877 #else // HWY_COMPILER_MSVC
878 return static_cast<size_t>(__builtin_ctzll(x
));
879 #endif // HWY_COMPILER_MSVC
882 // Undefined results for x == 0.
883 HWY_API
size_t Num0BitsAboveMS1Bit_Nonzero32(const uint32_t x
) {
884 #if HWY_COMPILER_MSVC
885 unsigned long index
; // NOLINT
886 _BitScanReverse(&index
, x
);
888 #else // HWY_COMPILER_MSVC
889 return static_cast<size_t>(__builtin_clz(x
));
890 #endif // HWY_COMPILER_MSVC
893 HWY_API
size_t Num0BitsAboveMS1Bit_Nonzero64(const uint64_t x
) {
894 #if HWY_COMPILER_MSVC
896 unsigned long index
; // NOLINT
897 _BitScanReverse64(&index
, x
);
899 #else // HWY_ARCH_X86_64
900 // _BitScanReverse64 not available
901 const uint32_t msb
= static_cast<uint32_t>(x
>> 32u);
902 unsigned long index
; // NOLINT
904 const uint32_t lsb
= static_cast<uint32_t>(x
& 0xFFFFFFFF);
905 _BitScanReverse(&index
, lsb
);
908 _BitScanReverse(&index
, msb
);
911 #endif // HWY_ARCH_X86_64
912 #else // HWY_COMPILER_MSVC
913 return static_cast<size_t>(__builtin_clzll(x
));
914 #endif // HWY_COMPILER_MSVC
917 HWY_API
size_t PopCount(uint64_t x
) {
918 #if HWY_COMPILER_GCC // includes clang
919 return static_cast<size_t>(__builtin_popcountll(x
));
920 // This instruction has a separate feature flag, but is often called from
921 // non-SIMD code, so we don't want to require dynamic dispatch. It was first
922 // supported by Intel in Nehalem (SSE4.2), but MSVC only predefines a macro
923 // for AVX, so check for that.
924 #elif HWY_COMPILER_MSVC && HWY_ARCH_X86_64 && defined(__AVX__)
925 return _mm_popcnt_u64(x
);
926 #elif HWY_COMPILER_MSVC && HWY_ARCH_X86_32 && defined(__AVX__)
927 return _mm_popcnt_u32(static_cast<uint32_t>(x
& 0xFFFFFFFFu
)) +
928 _mm_popcnt_u32(static_cast<uint32_t>(x
>> 32));
930 x
-= ((x
>> 1) & 0x5555555555555555ULL
);
931 x
= (((x
>> 2) & 0x3333333333333333ULL
) + (x
& 0x3333333333333333ULL
));
932 x
= (((x
>> 4) + x
) & 0x0F0F0F0F0F0F0F0FULL
);
936 return static_cast<size_t>(x
& 0x7Fu
);
940 // Skip HWY_API due to GCC "function not considered for inlining". Previously
941 // such errors were caused by underlying type mismatches, but it's not clear
942 // what is still mismatched despite all the casts.
943 template <typename TI
>
944 /*HWY_API*/ constexpr size_t FloorLog2(TI x
) {
947 : static_cast<size_t>(FloorLog2(static_cast<TI
>(x
>> 1)) + 1);
950 template <typename TI
>
951 /*HWY_API*/ constexpr size_t CeilLog2(TI x
) {
954 : static_cast<size_t>(FloorLog2(static_cast<TI
>(x
- 1)) + 1);
957 template <typename T
>
958 HWY_INLINE
constexpr T
AddWithWraparound(hwy::FloatTag
/*tag*/, T t
, size_t n
) {
959 return t
+ static_cast<T
>(n
);
962 template <typename T
>
963 HWY_INLINE
constexpr T
AddWithWraparound(hwy::NonFloatTag
/*tag*/, T t
,
965 using TU
= MakeUnsigned
<T
>;
966 return static_cast<T
>(
967 static_cast<TU
>(static_cast<TU
>(t
) + static_cast<TU
>(n
)) &
968 hwy::LimitsMax
<TU
>());
971 #if HWY_COMPILER_MSVC && HWY_ARCH_X86_64
972 #pragma intrinsic(_umul128)
975 // 64 x 64 = 128 bit multiplication
976 HWY_API
uint64_t Mul128(uint64_t a
, uint64_t b
, uint64_t* HWY_RESTRICT upper
) {
977 #if defined(__SIZEOF_INT128__)
978 __uint128_t product
= (__uint128_t
)a
* (__uint128_t
)b
;
979 *upper
= (uint64_t)(product
>> 64);
980 return (uint64_t)(product
& 0xFFFFFFFFFFFFFFFFULL
);
981 #elif HWY_COMPILER_MSVC && HWY_ARCH_X86_64
982 return _umul128(a
, b
, upper
);
984 constexpr uint64_t kLo32
= 0xFFFFFFFFU
;
985 const uint64_t lo_lo
= (a
& kLo32
) * (b
& kLo32
);
986 const uint64_t hi_lo
= (a
>> 32) * (b
& kLo32
);
987 const uint64_t lo_hi
= (a
& kLo32
) * (b
>> 32);
988 const uint64_t hi_hi
= (a
>> 32) * (b
>> 32);
989 const uint64_t t
= (lo_lo
>> 32) + (hi_lo
& kLo32
) + lo_hi
;
990 *upper
= (hi_lo
>> 32) + (t
>> 32) + hi_hi
;
991 return (t
<< 32) | (lo_lo
& kLo32
);
995 #if HWY_COMPILER_MSVC
996 #pragma intrinsic(memcpy)
997 #pragma intrinsic(memset)
1000 // The source/destination must not overlap/alias.
1001 template <size_t kBytes
, typename From
, typename To
>
1002 HWY_API
void CopyBytes(const From
* from
, To
* to
) {
1003 #if HWY_COMPILER_MSVC
1004 memcpy(to
, from
, kBytes
);
1007 static_cast<void*>(to
), static_cast<const void*>(from
), kBytes
);
1011 // Same as CopyBytes, but for same-sized objects; avoids a size argument.
1012 template <typename From
, typename To
>
1013 HWY_API
void CopySameSize(const From
* HWY_RESTRICT from
, To
* HWY_RESTRICT to
) {
1014 static_assert(sizeof(From
) == sizeof(To
), "");
1015 CopyBytes
<sizeof(From
)>(from
, to
);
1018 template <size_t kBytes
, typename To
>
1019 HWY_API
void ZeroBytes(To
* to
) {
1020 #if HWY_COMPILER_MSVC
1021 memset(to
, 0, kBytes
);
1023 __builtin_memset(to
, 0, kBytes
);
1027 HWY_API
float F32FromBF16(bfloat16_t bf
) {
1028 uint32_t bits
= bf
.bits
;
1031 CopySameSize(&bits
, &f
);
1035 HWY_API bfloat16_t
BF16FromF32(float f
) {
1037 CopySameSize(&f
, &bits
);
1039 bf
.bits
= static_cast<uint16_t>(bits
>> 16);
1043 HWY_DLLEXPORT HWY_NORETURN
void HWY_FORMAT(3, 4)
1044 Abort(const char* file
, int line
, const char* format
, ...);
1046 // Prevents the compiler from eliding the computations that led to "output".
1048 HWY_API
void PreventElision(T
&& output
) {
1049 #if HWY_COMPILER_MSVC
1050 // MSVC does not support inline assembly anymore (and never supported GCC's
1051 // RTL constraints). Self-assignment with #pragma optimize("off") might be
1052 // expected to prevent elision, but it does not with MSVC 2015. Type-punning
1053 // with volatile pointers generates inefficient code on MSVC 2017.
1054 static std::atomic
<RemoveRef
<T
>> dummy
;
1055 dummy
.store(output
, std::memory_order_relaxed
);
1057 // Works by indicating to the compiler that "output" is being read and
1058 // modified. The +r constraint avoids unnecessary writes to memory, but only
1059 // works for built-in types (typically FuncOutput).
1060 asm volatile("" : "+r"(output
) : : "memory");
1066 #endif // HIGHWAY_HWY_BASE_H_