Bug 1867925 - Mark some storage-access-api tests as intermittent after wpt-sync....
[gecko.git] / third_party / highway / hwy / base.h
blob0ca755a8f84ab070c78650d6408842ac52cecf2b
// Copyright 2020 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
16 #ifndef HIGHWAY_HWY_BASE_H_
17 #define HIGHWAY_HWY_BASE_H_
19 // For SIMD module implementations and their callers, target-independent.
21 // IWYU pragma: begin_exports
22 #include <stddef.h>
23 #include <stdint.h>
25 #include "hwy/detect_compiler_arch.h"
26 #include "hwy/highway_export.h"
28 // "IWYU pragma: keep" does not work for this include, so hide it from the IDE.
29 #if ((HWY_ARCH_X86 && !defined(HWY_NO_LIBCXX)) || HWY_COMPILER_MSVC) && !HWY_IDE
30 #include <atomic>
31 #endif
32 // IWYU pragma: end_exports
34 #if HWY_COMPILER_MSVC
35 #include <string.h> // memcpy
36 #endif
//------------------------------------------------------------------------------
// Compiler-specific definitions

// Two-level stringification: HWY_STR first macro-expands its argument and then
// HWY_STR_IMPL applies the # operator to the result. Calling HWY_STR_IMPL
// directly stringifies the argument's spelling without expanding it.
#define HWY_STR_IMPL(macro) #macro
#define HWY_STR(macro) HWY_STR_IMPL(macro)
// Portable spellings of compiler-specific keywords, attributes and pragmas.
// The first branch targets MSVC, the second uses the GCC/Clang syntax.
#if HWY_COMPILER_MSVC

#include <intrin.h>

#define HWY_RESTRICT __restrict
#define HWY_INLINE __forceinline
#define HWY_NOINLINE __declspec(noinline)
#define HWY_FLATTEN
#define HWY_NORETURN __declspec(noreturn)
#define HWY_LIKELY(expr) (expr)
#define HWY_UNLIKELY(expr) (expr)
#define HWY_PRAGMA(tokens) __pragma(tokens)
#define HWY_DIAGNOSTICS(tokens) HWY_PRAGMA(warning(tokens))
// Selects the MSVC-flavored argument; the GCC one is discarded.
#define HWY_DIAGNOSTICS_OFF(msc, gcc) HWY_DIAGNOSTICS(msc)
#define HWY_MAYBE_UNUSED
#define HWY_HAS_ASSUME_ALIGNED 0
#if (_MSC_VER >= 1700)
#define HWY_MUST_USE_RESULT _Check_return_
#else
#define HWY_MUST_USE_RESULT
#endif

#else

#define HWY_RESTRICT __restrict__
// force inlining without optimization enabled creates very inefficient code
// that can cause compiler timeout
#ifdef __OPTIMIZE__
#define HWY_INLINE inline __attribute__((always_inline))
#else
#define HWY_INLINE inline
#endif
#define HWY_NOINLINE __attribute__((noinline))
#define HWY_FLATTEN __attribute__((flatten))
#define HWY_NORETURN __attribute__((noreturn))
// The !! normalizes any scalar expression to 0/1 before the branch hint.
#define HWY_LIKELY(expr) __builtin_expect(!!(expr), 1)
#define HWY_UNLIKELY(expr) __builtin_expect(!!(expr), 0)
#define HWY_PRAGMA(tokens) _Pragma(#tokens)
#define HWY_DIAGNOSTICS(tokens) HWY_PRAGMA(GCC diagnostic tokens)
// Selects the GCC-flavored argument; the MSVC one is discarded.
#define HWY_DIAGNOSTICS_OFF(msc, gcc) HWY_DIAGNOSTICS(gcc)
// Encountered "attribute list cannot appear here" when using the C++17
// [[maybe_unused]], so only use the old style attribute for now.
#define HWY_MAYBE_UNUSED __attribute__((unused))
#define HWY_MUST_USE_RESULT __attribute__((warn_unused_result))

#endif  // !HWY_COMPILER_MSVC
91 //------------------------------------------------------------------------------
92 // Builtin/attributes
94 // Enables error-checking of format strings.
95 #if HWY_HAS_ATTRIBUTE(__format__)
96 #define HWY_FORMAT(idx_fmt, idx_arg) \
97 __attribute__((__format__(__printf__, idx_fmt, idx_arg)))
98 #else
99 #define HWY_FORMAT(idx_fmt, idx_arg)
100 #endif
102 // Returns a void* pointer which the compiler then assumes is N-byte aligned.
103 // Example: float* HWY_RESTRICT aligned = (float*)HWY_ASSUME_ALIGNED(in, 32);
105 // The assignment semantics are required by GCC/Clang. ICC provides an in-place
106 // __assume_aligned, whereas MSVC's __assume appears unsuitable.
107 #if HWY_HAS_BUILTIN(__builtin_assume_aligned)
108 #define HWY_ASSUME_ALIGNED(ptr, align) __builtin_assume_aligned((ptr), (align))
109 #else
110 #define HWY_ASSUME_ALIGNED(ptr, align) (ptr) /* not supported */
111 #endif
// Clang and GCC require attributes on each function into which SIMD intrinsics
// are inlined. Support both per-function annotation (HWY_ATTR) for lambdas and
// automatic annotation via pragmas.
// HWY_PUSH_ATTRIBUTES(targets_str) applies the target string to subsequent
// functions until the matching HWY_POP_ATTRIBUTES; both are no-ops elsewhere.
#if HWY_COMPILER_CLANG
#define HWY_PUSH_ATTRIBUTES(targets_str)                                \
  HWY_PRAGMA(clang attribute push(__attribute__((target(targets_str))), \
                                  apply_to = function))
#define HWY_POP_ATTRIBUTES HWY_PRAGMA(clang attribute pop)
#elif HWY_COMPILER_GCC
#define HWY_PUSH_ATTRIBUTES(targets_str) \
  HWY_PRAGMA(GCC push_options) HWY_PRAGMA(GCC target targets_str)
#define HWY_POP_ATTRIBUTES HWY_PRAGMA(GCC pop_options)
#else
#define HWY_PUSH_ATTRIBUTES(targets_str)
#define HWY_POP_ATTRIBUTES
#endif
//------------------------------------------------------------------------------
// Macros

// Decoration for functions defined in this header: internal linkage, inlined
// and flattened where supported, no unused-function warning.
#define HWY_API static HWY_INLINE HWY_FLATTEN HWY_MAYBE_UNUSED

// Token pasting; the extra level lets `a` and `b` be macro-expanded first.
#define HWY_CONCAT_IMPL(a, b) a##b
#define HWY_CONCAT(a, b) HWY_CONCAT_IMPL(a, b)

// Note: as with any function-like macro, each argument may be evaluated twice.
#define HWY_MIN(a, b) ((a) < (b) ? (a) : (b))
#define HWY_MAX(a, b) ((a) > (b) ? (a) : (b))
// Loop-unrolling hints, placed immediately before a loop. HWY_UNROLL(factor)
// requests a specific factor; HWY_DEFAULT_UNROLL uses the per-compiler default
// chosen below. Both expand to nothing on other compilers.
#if HWY_COMPILER_GCC_ACTUAL
// nielskm: GCC does not support '#pragma GCC unroll' without the factor.
#define HWY_UNROLL(factor) HWY_PRAGMA(GCC unroll factor)
#define HWY_DEFAULT_UNROLL HWY_UNROLL(4)
#elif HWY_COMPILER_CLANG || HWY_COMPILER_ICC || HWY_COMPILER_ICX
#define HWY_UNROLL(factor) HWY_PRAGMA(unroll factor)
#define HWY_DEFAULT_UNROLL HWY_UNROLL()
#else
#define HWY_UNROLL(factor)
#define HWY_DEFAULT_UNROLL
#endif
153 // Tell a compiler that the expression always evaluates to true.
154 // The expression should be free from any side effects.
155 // Some older compilers may have trouble with complex expressions, therefore
156 // it is advisable to split multiple conditions into separate assume statements,
157 // and manually check the generated code.
158 // OK but could fail:
159 // HWY_ASSUME(x == 2 && y == 3);
160 // Better:
161 // HWY_ASSUME(x == 2);
162 // HWY_ASSUME(y == 3);
163 #if HWY_HAS_CPP_ATTRIBUTE(assume)
164 #define HWY_ASSUME(expr) [[assume(expr)]]
165 #elif HWY_COMPILER_MSVC || HWY_COMPILER_ICC
166 #define HWY_ASSUME(expr) __assume(expr)
167 // __builtin_assume() was added in clang 3.6.
168 #elif HWY_COMPILER_CLANG && HWY_HAS_BUILTIN(__builtin_assume)
169 #define HWY_ASSUME(expr) __builtin_assume(expr)
170 // __builtin_unreachable() was added in GCC 4.5, but __has_builtin() was added
171 // later, so check for the compiler version directly.
172 #elif HWY_COMPILER_GCC_ACTUAL >= 405
173 #define HWY_ASSUME(expr) \
174 ((expr) ? static_cast<void>(0) : __builtin_unreachable())
175 #else
176 #define HWY_ASSUME(expr) static_cast<void>(0)
177 #endif
// Compile-time fence to prevent undesirable code reordering. On Clang x86, the
// typical asm volatile("" : : : "memory") has no effect, whereas atomic fence
// does, without generating code.
#if HWY_ARCH_X86 && !defined(HWY_NO_LIBCXX)
#define HWY_FENCE std::atomic_thread_fence(std::memory_order_acq_rel)
#else
// TODO(janwas): investigate alternatives. On Arm, the above generates barriers.
// Until then, HWY_FENCE expands to nothing here.
#define HWY_FENCE
#endif
// 4 instances of a given literal value, useful as input to LoadDup128.
// Expands to a comma-separated list, e.g. inside a braced initializer.
#define HWY_REP4(literal) literal, literal, literal, literal
// Calls hwy::Abort (which does not return) with the current file and line plus
// a printf-style message. `##__VA_ARGS__` allows omitting the variadic part.
#define HWY_ABORT(format, ...) \
  ::hwy::Abort(__FILE__, __LINE__, format, ##__VA_ARGS__)

// Always enabled. Aborts with the stringified condition if it is false; the
// do/while(0) wrapper makes the macro usable as a single statement.
#define HWY_ASSERT(condition)             \
  do {                                    \
    if (!(condition)) {                   \
      HWY_ABORT("Assert %s", #condition); \
    }                                     \
  } while (0)
203 #if HWY_HAS_FEATURE(memory_sanitizer) || defined(MEMORY_SANITIZER)
204 #define HWY_IS_MSAN 1
205 #else
206 #define HWY_IS_MSAN 0
207 #endif
209 #if HWY_HAS_FEATURE(address_sanitizer) || defined(ADDRESS_SANITIZER)
210 #define HWY_IS_ASAN 1
211 #else
212 #define HWY_IS_ASAN 0
213 #endif
215 #if HWY_HAS_FEATURE(thread_sanitizer) || defined(THREAD_SANITIZER)
216 #define HWY_IS_TSAN 1
217 #else
218 #define HWY_IS_TSAN 0
219 #endif
221 // MSAN may cause lengthy build times or false positives e.g. in AVX3 DemoteTo.
222 // You can disable MSAN by adding this attribute to the function that fails.
223 #if HWY_IS_MSAN
224 #define HWY_ATTR_NO_MSAN __attribute__((no_sanitize_memory))
225 #else
226 #define HWY_ATTR_NO_MSAN
227 #endif
// For enabling HWY_DASSERT and shortening tests in slower debug builds.
// Users may predefine HWY_IS_DEBUG_BUILD to override the detection below.
#if !defined(HWY_IS_DEBUG_BUILD)
// Clang does not define NDEBUG, but it and GCC define __OPTIMIZE__, and recent
// MSVC defines NDEBUG (if not, could instead check _DEBUG).
// Sanitizer and static-analyzer builds are also treated as debug builds.
#if (!defined(__OPTIMIZE__) && !defined(NDEBUG)) || HWY_IS_ASAN || \
    HWY_IS_MSAN || HWY_IS_TSAN || defined(__clang_analyzer__)
#define HWY_IS_DEBUG_BUILD 1
#else
#define HWY_IS_DEBUG_BUILD 0
#endif
#endif  // HWY_IS_DEBUG_BUILD

// Debug-only assertion: forwards to HWY_ASSERT in debug builds, otherwise an
// empty statement that does not evaluate `condition`.
#if HWY_IS_DEBUG_BUILD
#define HWY_DASSERT(condition) HWY_ASSERT(condition)
#else
#define HWY_DASSERT(condition) \
  do {                         \
  } while (0)
#endif
249 namespace hwy {
251 //------------------------------------------------------------------------------
252 // kMaxVectorSize (undocumented, pending removal)
254 #if HWY_ARCH_X86
255 static constexpr HWY_MAYBE_UNUSED size_t kMaxVectorSize = 64; // AVX-512
256 #elif HWY_ARCH_RVV && defined(__riscv_v_intrinsic) && \
257 __riscv_v_intrinsic >= 11000
258 // Not actually an upper bound on the size.
259 static constexpr HWY_MAYBE_UNUSED size_t kMaxVectorSize = 4096;
260 #else
261 static constexpr HWY_MAYBE_UNUSED size_t kMaxVectorSize = 16;
262 #endif
//------------------------------------------------------------------------------
// Alignment

// Potentially useful for LoadDup128 and capped vectors. In other cases, arrays
// should be allocated dynamically via aligned_allocator.h because Lanes() may
// exceed the stack size.
// Expands to an alignas() specifier chosen per architecture.
#if HWY_ARCH_X86
#define HWY_ALIGN_MAX alignas(64)
#elif HWY_ARCH_RVV && defined(__riscv_v_intrinsic) && \
    __riscv_v_intrinsic >= 11000
#define HWY_ALIGN_MAX alignas(8)  // only elements need be aligned
#else
#define HWY_ALIGN_MAX alignas(16)
#endif
//------------------------------------------------------------------------------
// Lane types

// Match [u]int##_t naming scheme so rvv-inl.h macros can obtain the type name
// by concatenating base type and bits.

#pragma pack(push, 1)

// ACLE (https://gcc.gnu.org/onlinedocs/gcc/Half-Precision.html):
// always supported on Armv8, for Armv7 only if -mfp16-format is given.
#if ((HWY_ARCH_ARM_A64 || (__ARM_FP & 2)) && HWY_COMPILER_GCC)
using float16_t = __fp16;
// C11 extension ISO/IEC TS 18661-3:2015 but not supported on all targets.
// Required for Clang RVV if the float16 extension is used.
#elif HWY_ARCH_RVV && HWY_COMPILER_CLANG && defined(__riscv_zvfh)
using float16_t = _Float16;
// Otherwise emulate
#else
// Opaque 16-bit storage; only the raw bit pattern is accessible.
struct float16_t {
  uint16_t bits;
};
#endif

// Opaque 16-bit storage for bfloat16; converted via F32FromBF16/BF16FromF32.
struct bfloat16_t {
  uint16_t bits;
};

#pragma pack(pop)

using float32_t = float;
using float64_t = double;
#pragma pack(push, 1)

// Aligned 128-bit type. Cannot use __int128 because clang doesn't yet align it:
// https://reviews.llvm.org/D86310
struct alignas(16) uint128_t {
  uint64_t lo;  // little-endian layout
  uint64_t hi;
};

// 64 bit key plus 64 bit value. Faster than using uint128_t when only the key
// field is to be compared (Lt128Upper instead of Lt128).
struct alignas(16) K64V64 {
  uint64_t value;  // little-endian layout
  uint64_t key;
};

// 32 bit key plus 32 bit value. Allows vqsort recursions to terminate earlier
// than when considering both to be a 64-bit key.
struct alignas(8) K32V32 {
  uint32_t value;  // little-endian layout
  uint32_t key;
};

#pragma pack(pop)
336 static inline HWY_MAYBE_UNUSED bool operator<(const uint128_t& a,
337 const uint128_t& b) {
338 return (a.hi == b.hi) ? a.lo < b.lo : a.hi < b.hi;
340 // Required for std::greater.
341 static inline HWY_MAYBE_UNUSED bool operator>(const uint128_t& a,
342 const uint128_t& b) {
343 return b < a;
345 static inline HWY_MAYBE_UNUSED bool operator==(const uint128_t& a,
346 const uint128_t& b) {
347 return a.lo == b.lo && a.hi == b.hi;
350 static inline HWY_MAYBE_UNUSED bool operator<(const K64V64& a,
351 const K64V64& b) {
352 return a.key < b.key;
354 // Required for std::greater.
355 static inline HWY_MAYBE_UNUSED bool operator>(const K64V64& a,
356 const K64V64& b) {
357 return b < a;
359 static inline HWY_MAYBE_UNUSED bool operator==(const K64V64& a,
360 const K64V64& b) {
361 return a.key == b.key;
364 static inline HWY_MAYBE_UNUSED bool operator<(const K32V32& a,
365 const K32V32& b) {
366 return a.key < b.key;
368 // Required for std::greater.
369 static inline HWY_MAYBE_UNUSED bool operator>(const K32V32& a,
370 const K32V32& b) {
371 return b < a;
373 static inline HWY_MAYBE_UNUSED bool operator==(const K32V32& a,
374 const K32V32& b) {
375 return a.key == b.key;
//------------------------------------------------------------------------------
// Controlling overload resolution (SFINAE)

// Primary template has no `type` member, so EnableIf<false> is a substitution
// failure; only the `true` specialization provides `type` (= void).
template <bool Condition>
struct EnableIfT {};
template <>
struct EnableIfT<true> {
  using type = void;
};

// Alias for `void` if Condition is true, otherwise ill-formed (SFINAE).
template <bool Condition>
using EnableIf = typename EnableIfT<Condition>::type;
// Type-equality trait: `value` is 1 if both template arguments are the same
// type (including cv-qualifiers), otherwise 0.
template <typename T, typename U>
struct IsSameT {
  enum { value = 0 };
};

template <typename T>
struct IsSameT<T, T> {
  enum { value = 1 };
};
401 template <typename T, typename U>
402 HWY_API constexpr bool IsSame() {
403 return IsSameT<T, U>::value;
// Compile-time type selection: the primary template picks `Then`, the
// specialization for Condition == false picks `Else`.
template <bool Condition, typename Then, typename Else>
struct IfT {
  using type = Then;
};

template <class Then, class Else>
struct IfT<false, Then, Else> {
  using type = Else;
};

// Alias for `Then` if Condition is true, otherwise `Else`.
template <bool Condition, typename Then, typename Else>
using If = typename IfT<Condition, Then, Else>::type;
// Insert into template/function arguments to enable this overload only for
// vectors of exactly, at most (LE), or more than (GT) this many bytes.
//
// As an example, checking for a total size of 16 bytes will match both
// Simd<uint8_t, 16, 0> and Simd<uint8_t, 8, 1>.
//
// Value arguments are parenthesized so that expression arguments such as
// `N + 1` keep their meaning, and all traits are qualified with hwy:: so the
// macros also work when expanded outside namespace hwy.
#define HWY_IF_V_SIZE(T, kN, bytes) \
  hwy::EnableIf<((kN) * sizeof(T) == (bytes))>* = nullptr
#define HWY_IF_V_SIZE_LE(T, kN, bytes) \
  hwy::EnableIf<((kN) * sizeof(T) <= (bytes))>* = nullptr
#define HWY_IF_V_SIZE_GT(T, kN, bytes) \
  hwy::EnableIf<((kN) * sizeof(T) > (bytes))>* = nullptr

#define HWY_IF_LANES(kN, lanes) hwy::EnableIf<((kN) == (lanes))>* = nullptr
#define HWY_IF_LANES_LE(kN, lanes) hwy::EnableIf<((kN) <= (lanes))>* = nullptr
#define HWY_IF_LANES_GT(kN, lanes) hwy::EnableIf<((kN) > (lanes))>* = nullptr

#define HWY_IF_UNSIGNED(T) hwy::EnableIf<!hwy::IsSigned<T>()>* = nullptr
#define HWY_IF_SIGNED(T)                                    \
  hwy::EnableIf<hwy::IsSigned<T>() && !hwy::IsFloat<T>() && \
                !hwy::IsSpecialFloat<T>()>* = nullptr
#define HWY_IF_FLOAT(T) hwy::EnableIf<hwy::IsFloat<T>()>* = nullptr
#define HWY_IF_NOT_FLOAT(T) hwy::EnableIf<!hwy::IsFloat<T>()>* = nullptr
#define HWY_IF_SPECIAL_FLOAT(T) \
  hwy::EnableIf<hwy::IsSpecialFloat<T>()>* = nullptr
#define HWY_IF_NOT_SPECIAL_FLOAT(T) \
  hwy::EnableIf<!hwy::IsSpecialFloat<T>()>* = nullptr
#define HWY_IF_NOT_FLOAT_NOR_SPECIAL(T) \
  hwy::EnableIf<!hwy::IsFloat<T>() && !hwy::IsSpecialFloat<T>()>* = nullptr

#define HWY_IF_T_SIZE(T, bytes) hwy::EnableIf<sizeof(T) == (bytes)>* = nullptr
#define HWY_IF_NOT_T_SIZE(T, bytes) \
  hwy::EnableIf<sizeof(T) != (bytes)>* = nullptr
// bit_array = 0x102 means 1 or 8 bytes. There is no NONE_OF because it sounds
// too similar. If you want the opposite of this (2 or 4 bytes), ask for those
// bits explicitly (0x14) instead of attempting to 'negate' 0x102.
#define HWY_IF_T_SIZE_ONE_OF(T, bit_array) \
  hwy::EnableIf<((size_t{1} << sizeof(T)) & (bit_array)) != 0>* = nullptr

// Use instead of HWY_IF_T_SIZE to avoid ambiguity with float/double
// overloads.
#define HWY_IF_UI32(T)                                                       \
  hwy::EnableIf<hwy::IsSame<T, uint32_t>() || hwy::IsSame<T, int32_t>()>* = \
      nullptr
#define HWY_IF_UI64(T)                                                       \
  hwy::EnableIf<hwy::IsSame<T, uint64_t>() || hwy::IsSame<T, int64_t>()>* = \
      nullptr

// Enables the overload if the number of T lanes within a 16-byte block (or in
// the whole vector, if smaller) equals LANES.
#define HWY_IF_LANES_PER_BLOCK(T, N, LANES)                             \
  hwy::EnableIf<HWY_MIN(sizeof(T) * (N), 16) / sizeof(T) == (LANES)>* = \
      nullptr
// Empty struct used as a size tag type. Distinct values of N yield distinct
// types, which allows selecting overloads by passing a SizeTag<N> argument.
template <size_t N>
struct SizeTag {};
// Strips a top-level `const` from T; other types are passed through unchanged.
template <class T>
struct RemoveConstT {
  using type = T;
};
template <class T>
struct RemoveConstT<const T> {
  using type = T;
};

template <class T>
using RemoveConst = typename RemoveConstT<T>::type;
// Strips a reference from T; non-reference types are passed through unchanged.
// Handles both lvalue (T&) and rvalue (T&&) references, matching the behavior
// of std::remove_reference.
template <class T>
struct RemoveRefT {
  using type = T;
};
template <class T>
struct RemoveRefT<T&> {
  using type = T;
};
template <class T>
struct RemoveRefT<T&&> {
  using type = T;
};

template <class T>
using RemoveRef = typename RemoveRefT<T>::type;
495 //------------------------------------------------------------------------------
496 // Type relations
498 namespace detail {
500 template <typename T>
501 struct Relations;
502 template <>
503 struct Relations<uint8_t> {
504 using Unsigned = uint8_t;
505 using Signed = int8_t;
506 using Wide = uint16_t;
507 enum { is_signed = 0, is_float = 0 };
509 template <>
510 struct Relations<int8_t> {
511 using Unsigned = uint8_t;
512 using Signed = int8_t;
513 using Wide = int16_t;
514 enum { is_signed = 1, is_float = 0 };
516 template <>
517 struct Relations<uint16_t> {
518 using Unsigned = uint16_t;
519 using Signed = int16_t;
520 using Wide = uint32_t;
521 using Narrow = uint8_t;
522 enum { is_signed = 0, is_float = 0 };
524 template <>
525 struct Relations<int16_t> {
526 using Unsigned = uint16_t;
527 using Signed = int16_t;
528 using Wide = int32_t;
529 using Narrow = int8_t;
530 enum { is_signed = 1, is_float = 0 };
532 template <>
533 struct Relations<uint32_t> {
534 using Unsigned = uint32_t;
535 using Signed = int32_t;
536 using Float = float;
537 using Wide = uint64_t;
538 using Narrow = uint16_t;
539 enum { is_signed = 0, is_float = 0 };
541 template <>
542 struct Relations<int32_t> {
543 using Unsigned = uint32_t;
544 using Signed = int32_t;
545 using Float = float;
546 using Wide = int64_t;
547 using Narrow = int16_t;
548 enum { is_signed = 1, is_float = 0 };
550 template <>
551 struct Relations<uint64_t> {
552 using Unsigned = uint64_t;
553 using Signed = int64_t;
554 using Float = double;
555 using Wide = uint128_t;
556 using Narrow = uint32_t;
557 enum { is_signed = 0, is_float = 0 };
559 template <>
560 struct Relations<int64_t> {
561 using Unsigned = uint64_t;
562 using Signed = int64_t;
563 using Float = double;
564 using Narrow = int32_t;
565 enum { is_signed = 1, is_float = 0 };
567 template <>
568 struct Relations<uint128_t> {
569 using Unsigned = uint128_t;
570 using Narrow = uint64_t;
571 enum { is_signed = 0, is_float = 0 };
573 template <>
574 struct Relations<float16_t> {
575 using Unsigned = uint16_t;
576 using Signed = int16_t;
577 using Float = float16_t;
578 using Wide = float;
579 enum { is_signed = 1, is_float = 1 };
581 template <>
582 struct Relations<bfloat16_t> {
583 using Unsigned = uint16_t;
584 using Signed = int16_t;
585 using Wide = float;
586 enum { is_signed = 1, is_float = 1 };
588 template <>
589 struct Relations<float> {
590 using Unsigned = uint32_t;
591 using Signed = int32_t;
592 using Float = float;
593 using Wide = double;
594 using Narrow = float16_t;
595 enum { is_signed = 1, is_float = 1 };
597 template <>
598 struct Relations<double> {
599 using Unsigned = uint64_t;
600 using Signed = int64_t;
601 using Float = double;
602 using Narrow = float;
603 enum { is_signed = 1, is_float = 1 };
606 template <size_t N>
607 struct TypeFromSize;
608 template <>
609 struct TypeFromSize<1> {
610 using Unsigned = uint8_t;
611 using Signed = int8_t;
613 template <>
614 struct TypeFromSize<2> {
615 using Unsigned = uint16_t;
616 using Signed = int16_t;
618 template <>
619 struct TypeFromSize<4> {
620 using Unsigned = uint32_t;
621 using Signed = int32_t;
622 using Float = float;
624 template <>
625 struct TypeFromSize<8> {
626 using Unsigned = uint64_t;
627 using Signed = int64_t;
628 using Float = double;
630 template <>
631 struct TypeFromSize<16> {
632 using Unsigned = uint128_t;
635 } // namespace detail
637 // Aliases for types of a different category, but the same size.
638 template <typename T>
639 using MakeUnsigned = typename detail::Relations<T>::Unsigned;
640 template <typename T>
641 using MakeSigned = typename detail::Relations<T>::Signed;
642 template <typename T>
643 using MakeFloat = typename detail::Relations<T>::Float;
645 // Aliases for types of the same category, but different size.
646 template <typename T>
647 using MakeWide = typename detail::Relations<T>::Wide;
648 template <typename T>
649 using MakeNarrow = typename detail::Relations<T>::Narrow;
651 // Obtain type from its size [bytes].
652 template <size_t N>
653 using UnsignedFromSize = typename detail::TypeFromSize<N>::Unsigned;
654 template <size_t N>
655 using SignedFromSize = typename detail::TypeFromSize<N>::Signed;
656 template <size_t N>
657 using FloatFromSize = typename detail::TypeFromSize<N>::Float;
659 // Avoid confusion with SizeTag where the parameter is a lane size.
660 using UnsignedTag = SizeTag<0>;
661 using SignedTag = SizeTag<0x100>; // integer
662 using FloatTag = SizeTag<0x200>;
664 template <typename T, class R = detail::Relations<T>>
665 constexpr auto TypeTag() -> hwy::SizeTag<((R::is_signed + R::is_float) << 8)> {
666 return hwy::SizeTag<((R::is_signed + R::is_float) << 8)>();
669 // For when we only want to distinguish FloatTag from everything else.
670 using NonFloatTag = SizeTag<0x400>;
672 template <typename T, class R = detail::Relations<T>>
673 constexpr auto IsFloatTag() -> hwy::SizeTag<(R::is_float ? 0x200 : 0x400)> {
674 return hwy::SizeTag<(R::is_float ? 0x200 : 0x400)>();
677 //------------------------------------------------------------------------------
678 // Type traits
680 template <typename T>
681 HWY_API constexpr bool IsFloat() {
682 // Cannot use T(1.25) != T(1) for float16_t, which can only be converted to or
683 // from a float, not compared.
684 return IsSame<T, float>() || IsSame<T, double>();
687 // These types are often special-cased and not supported in all ops.
688 template <typename T>
689 HWY_API constexpr bool IsSpecialFloat() {
690 return IsSame<T, float16_t>() || IsSame<T, bfloat16_t>();
693 template <typename T>
694 HWY_API constexpr bool IsSigned() {
695 return T(0) > T(-1);
697 template <>
698 constexpr bool IsSigned<float16_t>() {
699 return true;
701 template <>
702 constexpr bool IsSigned<bfloat16_t>() {
703 return true;
706 // Largest/smallest representable integer values.
707 template <typename T>
708 HWY_API constexpr T LimitsMax() {
709 static_assert(!IsFloat<T>(), "Only for integer types");
710 using TU = MakeUnsigned<T>;
711 return static_cast<T>(IsSigned<T>() ? (static_cast<TU>(~0ull) >> 1)
712 : static_cast<TU>(~0ull));
714 template <typename T>
715 HWY_API constexpr T LimitsMin() {
716 static_assert(!IsFloat<T>(), "Only for integer types");
717 return IsSigned<T>() ? T(-1) - LimitsMax<T>() : T(0);
720 // Largest/smallest representable value (integer or float). This naming avoids
721 // confusion with numeric_limits<float>::min() (the smallest positive value).
722 template <typename T>
723 HWY_API constexpr T LowestValue() {
724 return LimitsMin<T>();
726 template <>
727 constexpr float LowestValue<float>() {
728 return -3.402823466e+38F;
730 template <>
731 constexpr double LowestValue<double>() {
732 return -1.7976931348623158e+308;
735 template <typename T>
736 HWY_API constexpr T HighestValue() {
737 return LimitsMax<T>();
739 template <>
740 constexpr float HighestValue<float>() {
741 return 3.402823466e+38F;
743 template <>
744 constexpr double HighestValue<double>() {
745 return 1.7976931348623158e+308;
748 // Difference between 1.0 and the next representable value.
749 template <typename T>
750 HWY_API constexpr T Epsilon() {
751 return 1;
753 template <>
754 constexpr float Epsilon<float>() {
755 return 1.192092896e-7f;
757 template <>
758 constexpr double Epsilon<double>() {
759 return 2.2204460492503131e-16;
// Returns width in bits of the mantissa field in IEEE binary32/64.
// Only the float/double specializations may be instantiated; the dependent
// static_assert in the primary template rejects any other type.
template <typename T>
constexpr int MantissaBits() {
  static_assert(sizeof(T) == 0, "Only instantiate the specializations");
  return 0;
}
template <>
constexpr int MantissaBits<float>() {
  return 23;
}
template <>
constexpr int MantissaBits<double>() {
  return 52;
}
777 // Returns the (left-shifted by one bit) IEEE binary32/64 representation with
778 // the largest possible (biased) exponent field. Used by IsInf.
779 template <typename T>
780 constexpr MakeSigned<T> MaxExponentTimes2() {
781 return -(MakeSigned<T>{1} << (MantissaBits<T>() + 1));
784 // Returns bitmask of the sign bit in IEEE binary32/64.
785 template <typename T>
786 constexpr MakeUnsigned<T> SignMask() {
787 return MakeUnsigned<T>{1} << (sizeof(T) * 8 - 1);
790 // Returns bitmask of the exponent field in IEEE binary32/64.
791 template <typename T>
792 constexpr MakeUnsigned<T> ExponentMask() {
793 return (~(MakeUnsigned<T>{1} << MantissaBits<T>()) + 1) & ~SignMask<T>();
796 // Returns bitmask of the mantissa field in IEEE binary32/64.
797 template <typename T>
798 constexpr MakeUnsigned<T> MantissaMask() {
799 return (MakeUnsigned<T>{1} << MantissaBits<T>()) - 1;
// Returns 1 << mantissa_bits as a floating-point number. All integers whose
// absolute value are less than this can be represented exactly.
// Only the float/double specializations may be instantiated.
template <typename T>
constexpr T MantissaEnd() {
  static_assert(sizeof(T) == 0, "Only instantiate the specializations");
  return 0;
}
template <>
constexpr float MantissaEnd<float>() {
  return 8388608.0f;  // 1 << 23
}
template <>
constexpr double MantissaEnd<double>() {
  // floating point literal with p52 requires C++17.
  return 4503599627370496.0;  // 1 << 52
}
819 // Returns width in bits of the exponent field in IEEE binary32/64.
820 template <typename T>
821 constexpr int ExponentBits() {
822 // Exponent := remaining bits after deducting sign and mantissa.
823 return 8 * sizeof(T) - 1 - MantissaBits<T>();
826 // Returns largest value of the biased exponent field in IEEE binary32/64,
827 // right-shifted so that the LSB is bit zero. Example: 0xFF for float.
828 // This is expressed as a signed integer for more efficient comparison.
829 template <typename T>
830 constexpr MakeSigned<T> MaxExponentField() {
831 return (MakeSigned<T>{1} << ExponentBits<T>()) - 1;
//------------------------------------------------------------------------------
// Helper functions

// Returns a / b rounded up, for non-negative `a` and positive `b`. The
// intermediate a + b - 1 must not overflow; the result is converted to T1.
template <typename T1, typename T2>
constexpr inline T1 DivCeil(T1 a, T2 b) {
  return (a + b - 1) / b;
}

// Returns the smallest multiple of `align` that is >= `what`.
// Works for any `align`; if a power of two, compiler emits ADD+AND.
constexpr inline size_t RoundUpTo(size_t what, size_t align) {
  return DivCeil(what, align) * align;
}
847 // Undefined results for x == 0.
848 HWY_API size_t Num0BitsBelowLS1Bit_Nonzero32(const uint32_t x) {
849 #if HWY_COMPILER_MSVC
850 unsigned long index; // NOLINT
851 _BitScanForward(&index, x);
852 return index;
853 #else // HWY_COMPILER_MSVC
854 return static_cast<size_t>(__builtin_ctz(x));
855 #endif // HWY_COMPILER_MSVC
858 HWY_API size_t Num0BitsBelowLS1Bit_Nonzero64(const uint64_t x) {
859 #if HWY_COMPILER_MSVC
860 #if HWY_ARCH_X86_64
861 unsigned long index; // NOLINT
862 _BitScanForward64(&index, x);
863 return index;
864 #else // HWY_ARCH_X86_64
865 // _BitScanForward64 not available
866 uint32_t lsb = static_cast<uint32_t>(x & 0xFFFFFFFF);
867 unsigned long index; // NOLINT
868 if (lsb == 0) {
869 uint32_t msb = static_cast<uint32_t>(x >> 32u);
870 _BitScanForward(&index, msb);
871 return 32 + index;
872 } else {
873 _BitScanForward(&index, lsb);
874 return index;
876 #endif // HWY_ARCH_X86_64
877 #else // HWY_COMPILER_MSVC
878 return static_cast<size_t>(__builtin_ctzll(x));
879 #endif // HWY_COMPILER_MSVC
882 // Undefined results for x == 0.
883 HWY_API size_t Num0BitsAboveMS1Bit_Nonzero32(const uint32_t x) {
884 #if HWY_COMPILER_MSVC
885 unsigned long index; // NOLINT
886 _BitScanReverse(&index, x);
887 return 31 - index;
888 #else // HWY_COMPILER_MSVC
889 return static_cast<size_t>(__builtin_clz(x));
890 #endif // HWY_COMPILER_MSVC
893 HWY_API size_t Num0BitsAboveMS1Bit_Nonzero64(const uint64_t x) {
894 #if HWY_COMPILER_MSVC
895 #if HWY_ARCH_X86_64
896 unsigned long index; // NOLINT
897 _BitScanReverse64(&index, x);
898 return 63 - index;
899 #else // HWY_ARCH_X86_64
900 // _BitScanReverse64 not available
901 const uint32_t msb = static_cast<uint32_t>(x >> 32u);
902 unsigned long index; // NOLINT
903 if (msb == 0) {
904 const uint32_t lsb = static_cast<uint32_t>(x & 0xFFFFFFFF);
905 _BitScanReverse(&index, lsb);
906 return 63 - index;
907 } else {
908 _BitScanReverse(&index, msb);
909 return 31 - index;
911 #endif // HWY_ARCH_X86_64
912 #else // HWY_COMPILER_MSVC
913 return static_cast<size_t>(__builtin_clzll(x));
914 #endif // HWY_COMPILER_MSVC
917 HWY_API size_t PopCount(uint64_t x) {
918 #if HWY_COMPILER_GCC // includes clang
919 return static_cast<size_t>(__builtin_popcountll(x));
920 // This instruction has a separate feature flag, but is often called from
921 // non-SIMD code, so we don't want to require dynamic dispatch. It was first
922 // supported by Intel in Nehalem (SSE4.2), but MSVC only predefines a macro
923 // for AVX, so check for that.
924 #elif HWY_COMPILER_MSVC && HWY_ARCH_X86_64 && defined(__AVX__)
925 return _mm_popcnt_u64(x);
926 #elif HWY_COMPILER_MSVC && HWY_ARCH_X86_32 && defined(__AVX__)
927 return _mm_popcnt_u32(static_cast<uint32_t>(x & 0xFFFFFFFFu)) +
928 _mm_popcnt_u32(static_cast<uint32_t>(x >> 32));
929 #else
930 x -= ((x >> 1) & 0x5555555555555555ULL);
931 x = (((x >> 2) & 0x3333333333333333ULL) + (x & 0x3333333333333333ULL));
932 x = (((x >> 4) + x) & 0x0F0F0F0F0F0F0F0FULL);
933 x += (x >> 8);
934 x += (x >> 16);
935 x += (x >> 32);
936 return static_cast<size_t>(x & 0x7Fu);
937 #endif
// Skip HWY_API due to GCC "function not considered for inlining". Previously
// such errors were caused by underlying type mismatches, but it's not clear
// what is still mismatched despite all the casts.

// Returns floor(log2(x)). Requires x >= 1: for x == 0 the recursion never
// reaches its base case.
template <typename TI>
/*HWY_API*/ constexpr size_t FloorLog2(TI x) {
  return x == TI{1}
             ? size_t{0}
             : static_cast<size_t>(FloorLog2(static_cast<TI>(x >> 1)) + 1);
}

// Returns ceil(log2(x)), computed as FloorLog2(x - 1) + 1 for x > 1.
// Requires x >= 1.
template <typename TI>
/*HWY_API*/ constexpr size_t CeilLog2(TI x) {
  return x == TI{1}
             ? size_t{0}
             : static_cast<size_t>(FloorLog2(static_cast<TI>(x - 1)) + 1);
}
957 template <typename T>
958 HWY_INLINE constexpr T AddWithWraparound(hwy::FloatTag /*tag*/, T t, size_t n) {
959 return t + static_cast<T>(n);
962 template <typename T>
963 HWY_INLINE constexpr T AddWithWraparound(hwy::NonFloatTag /*tag*/, T t,
964 size_t n) {
965 using TU = MakeUnsigned<T>;
966 return static_cast<T>(
967 static_cast<TU>(static_cast<TU>(t) + static_cast<TU>(n)) &
968 hwy::LimitsMax<TU>());
971 #if HWY_COMPILER_MSVC && HWY_ARCH_X86_64
972 #pragma intrinsic(_umul128)
973 #endif
975 // 64 x 64 = 128 bit multiplication
976 HWY_API uint64_t Mul128(uint64_t a, uint64_t b, uint64_t* HWY_RESTRICT upper) {
977 #if defined(__SIZEOF_INT128__)
978 __uint128_t product = (__uint128_t)a * (__uint128_t)b;
979 *upper = (uint64_t)(product >> 64);
980 return (uint64_t)(product & 0xFFFFFFFFFFFFFFFFULL);
981 #elif HWY_COMPILER_MSVC && HWY_ARCH_X86_64
982 return _umul128(a, b, upper);
983 #else
984 constexpr uint64_t kLo32 = 0xFFFFFFFFU;
985 const uint64_t lo_lo = (a & kLo32) * (b & kLo32);
986 const uint64_t hi_lo = (a >> 32) * (b & kLo32);
987 const uint64_t lo_hi = (a & kLo32) * (b >> 32);
988 const uint64_t hi_hi = (a >> 32) * (b >> 32);
989 const uint64_t t = (lo_lo >> 32) + (hi_lo & kLo32) + lo_hi;
990 *upper = (hi_lo >> 32) + (t >> 32) + hi_hi;
991 return (t << 32) | (lo_lo & kLo32);
992 #endif
995 #if HWY_COMPILER_MSVC
996 #pragma intrinsic(memcpy)
997 #pragma intrinsic(memset)
998 #endif
1000 // The source/destination must not overlap/alias.
1001 template <size_t kBytes, typename From, typename To>
1002 HWY_API void CopyBytes(const From* from, To* to) {
1003 #if HWY_COMPILER_MSVC
1004 memcpy(to, from, kBytes);
1005 #else
1006 __builtin_memcpy(
1007 static_cast<void*>(to), static_cast<const void*>(from), kBytes);
1008 #endif
1011 // Same as CopyBytes, but for same-sized objects; avoids a size argument.
1012 template <typename From, typename To>
1013 HWY_API void CopySameSize(const From* HWY_RESTRICT from, To* HWY_RESTRICT to) {
1014 static_assert(sizeof(From) == sizeof(To), "");
1015 CopyBytes<sizeof(From)>(from, to);
1018 template <size_t kBytes, typename To>
1019 HWY_API void ZeroBytes(To* to) {
1020 #if HWY_COMPILER_MSVC
1021 memset(to, 0, kBytes);
1022 #else
1023 __builtin_memset(to, 0, kBytes);
1024 #endif
1027 HWY_API float F32FromBF16(bfloat16_t bf) {
1028 uint32_t bits = bf.bits;
1029 bits <<= 16;
1030 float f;
1031 CopySameSize(&bits, &f);
1032 return f;
1035 HWY_API bfloat16_t BF16FromF32(float f) {
1036 uint32_t bits;
1037 CopySameSize(&f, &bits);
1038 bfloat16_t bf;
1039 bf.bits = static_cast<uint16_t>(bits >> 16);
1040 return bf;
1043 HWY_DLLEXPORT HWY_NORETURN void HWY_FORMAT(3, 4)
1044 Abort(const char* file, int line, const char* format, ...);
1046 // Prevents the compiler from eliding the computations that led to "output".
1047 template <class T>
1048 HWY_API void PreventElision(T&& output) {
1049 #if HWY_COMPILER_MSVC
1050 // MSVC does not support inline assembly anymore (and never supported GCC's
1051 // RTL constraints). Self-assignment with #pragma optimize("off") might be
1052 // expected to prevent elision, but it does not with MSVC 2015. Type-punning
1053 // with volatile pointers generates inefficient code on MSVC 2017.
1054 static std::atomic<RemoveRef<T>> dummy;
1055 dummy.store(output, std::memory_order_relaxed);
1056 #else
1057 // Works by indicating to the compiler that "output" is being read and
1058 // modified. The +r constraint avoids unnecessary writes to memory, but only
1059 // works for built-in types (typically FuncOutput).
1060 asm volatile("" : "+r"(output) : : "memory");
1061 #endif
1064 } // namespace hwy
1066 #endif // HIGHWAY_HWY_BASE_H_