third_party/highway/hwy/base.h

   1 // Copyright 2020 Google LLC
   2 // SPDX-License-Identifier: Apache-2.0
   3 //
   4 // Licensed under the Apache License, Version 2.0 (the "License");
   5 // you may not use this file except in compliance with the License.
   6 // You may obtain a copy of the License at
   7 //
   8 //      http://www.apache.org/licenses/LICENSE-2.0
   9 //
  10 // Unless required by applicable law or agreed to in writing, software
  11 // distributed under the License is distributed on an "AS IS" BASIS,
  12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13 // See the License for the specific language governing permissions and
  14 // limitations under the License.
  15
  16 #ifndef HIGHWAY_HWY_BASE_H_
  17 #define HIGHWAY_HWY_BASE_H_
  18
  19 // For SIMD module implementations and their callers, target-independent.
  20
  21 // IWYU pragma: begin_exports
  22 #include <stddef.h>
  23 #include <stdint.h>
  24
  25 #include "hwy/detect_compiler_arch.h"
  26 #include "hwy/highway_export.h"
  27
  28 // "IWYU pragma: keep" does not work for this include, so hide it from the IDE.
  29 #if ((HWY_ARCH_X86 && !defined(HWY_NO_LIBCXX)) || HWY_COMPILER_MSVC) && !HWY_IDE
  30 #include <atomic>
  31 #endif
  32 // IWYU pragma: end_exports
  33
  34 #if HWY_COMPILER_MSVC
  35 #include <string.h>  // memcpy
  36 #endif
  37
  38 //------------------------------------------------------------------------------
  39 // Compiler-specific definitions
  40
// Stringification helpers. HWY_STR_IMPL applies the preprocessor `#` operator
// directly to its argument; HWY_STR adds one level of indirection so that a
// macro argument is expanded before it is turned into a string literal.
#define HWY_STR_IMPL(macro) #macro
#define HWY_STR(macro) HWY_STR_IMPL(macro)
  43
// Two parallel sets of qualifier/attribute/hint macros: one using MSVC
// spellings, one using GCC-style __attribute__ spellings.
#if HWY_COMPILER_MSVC

#include <intrin.h>

#define HWY_RESTRICT __restrict
#define HWY_INLINE __forceinline
#define HWY_NOINLINE __declspec(noinline)
// Expands to nothing in this branch.
#define HWY_FLATTEN
#define HWY_NORETURN __declspec(noreturn)
// Branch hints expand to the bare expression in this branch.
#define HWY_LIKELY(expr) (expr)
#define HWY_UNLIKELY(expr) (expr)
#define HWY_PRAGMA(tokens) __pragma(tokens)
#define HWY_DIAGNOSTICS(tokens) HWY_PRAGMA(warning(tokens))
// Callers pass one argument per compiler family; only `msc` is used here.
#define HWY_DIAGNOSTICS_OFF(msc, gcc) HWY_DIAGNOSTICS(msc)
#define HWY_MAYBE_UNUSED
#define HWY_HAS_ASSUME_ALIGNED 0
// _Check_return_ is only used when _MSC_VER is at least 1700.
#if (_MSC_VER >= 1700)
#define HWY_MUST_USE_RESULT _Check_return_
#else
#define HWY_MUST_USE_RESULT
#endif

#else

#define HWY_RESTRICT __restrict__
// Forcing inlining without optimization enabled creates very inefficient code
// that can cause compiler timeouts, so only request always_inline when
// __OPTIMIZE__ is defined.
#ifdef __OPTIMIZE__
#define HWY_INLINE inline __attribute__((always_inline))
#else
#define HWY_INLINE inline
#endif
#define HWY_NOINLINE __attribute__((noinline))
#define HWY_FLATTEN __attribute__((flatten))
#define HWY_NORETURN __attribute__((noreturn))
// `!!` normalizes the expression to 0 or 1 before it is passed to the builtin.
#define HWY_LIKELY(expr) __builtin_expect(!!(expr), 1)
#define HWY_UNLIKELY(expr) __builtin_expect(!!(expr), 0)
#define HWY_PRAGMA(tokens) _Pragma(#tokens)
#define HWY_DIAGNOSTICS(tokens) HWY_PRAGMA(GCC diagnostic tokens)
// Callers pass one argument per compiler family; only `gcc` is used here.
#define HWY_DIAGNOSTICS_OFF(msc, gcc) HWY_DIAGNOSTICS(gcc)
// Encountered "attribute list cannot appear here" when using the C++17
// [[maybe_unused]], so only use the old style attribute for now.
#define HWY_MAYBE_UNUSED __attribute__((unused))
#define HWY_MUST_USE_RESULT __attribute__((warn_unused_result))

#endif  // !HWY_COMPILER_MSVC
  90
  91 //------------------------------------------------------------------------------
  92 // Builtin/attributes
  93
// Enables error-checking of format strings. When the __format__ attribute is
// available, annotates a function as printf-like: `idx_fmt` and `idx_arg` are
// forwarded as the attribute's format-string index and first-argument index.
// Otherwise expands to nothing.
#if HWY_HAS_ATTRIBUTE(__format__)
#define HWY_FORMAT(idx_fmt, idx_arg) \
  __attribute__((__format__(__printf__, idx_fmt, idx_arg)))
#else
#define HWY_FORMAT(idx_fmt, idx_arg)
#endif
 101
// Returns a void* pointer which the compiler then assumes is N-byte aligned.
// Example: float* HWY_RESTRICT aligned = (float*)HWY_ASSUME_ALIGNED(in, 32);
//
// The assignment semantics are required by GCC/Clang. ICC provides an in-place
// __assume_aligned, whereas MSVC's __assume appears unsuitable.
//
// If __builtin_assume_aligned is unavailable, the macro simply yields `ptr`
// and `align` is ignored.
#if HWY_HAS_BUILTIN(__builtin_assume_aligned)
#define HWY_ASSUME_ALIGNED(ptr, align) __builtin_assume_aligned((ptr), (align))
#else
#define HWY_ASSUME_ALIGNED(ptr, align) (ptr) /* not supported */
#endif
 112
// Clang and GCC require attributes on each function into which SIMD intrinsics
// are inlined. Support both per-function annotation (HWY_ATTR) for lambdas and
// automatic annotation via pragmas.
//
// HWY_PUSH_ATTRIBUTES(targets_str) begins a region in which functions receive
// the given target string; HWY_POP_ATTRIBUTES ends it. Both expand to nothing
// for compilers other than Clang and GCC.
#if HWY_COMPILER_CLANG
#define HWY_PUSH_ATTRIBUTES(targets_str)                                \
  HWY_PRAGMA(clang attribute push(__attribute__((target(targets_str))), \
                                  apply_to = function))
#define HWY_POP_ATTRIBUTES HWY_PRAGMA(clang attribute pop)
#elif HWY_COMPILER_GCC
#define HWY_PUSH_ATTRIBUTES(targets_str) \
  HWY_PRAGMA(GCC push_options) HWY_PRAGMA(GCC target targets_str)
#define HWY_POP_ATTRIBUTES HWY_PRAGMA(GCC pop_options)
#else
#define HWY_PUSH_ATTRIBUTES(targets_str)
#define HWY_POP_ATTRIBUTES
#endif
 129
 130 //------------------------------------------------------------------------------
 131 // Macros
 132
// Prefix for functions defined in headers: internal linkage, inlined,
// flattened, and exempt from unused-function warnings.
#define HWY_API static HWY_INLINE HWY_FLATTEN HWY_MAYBE_UNUSED

// Token pasting. HWY_CONCAT adds one level of indirection so that macro
// arguments are expanded before being joined by HWY_CONCAT_IMPL.
#define HWY_CONCAT_IMPL(a, b) a##b
#define HWY_CONCAT(a, b) HWY_CONCAT_IMPL(a, b)

// Minimum/maximum of two expressions. Note that the selected argument is
// evaluated twice, so arguments should not have side effects.
#define HWY_MIN(a, b) ((a) < (b) ? (a) : (b))
#define HWY_MAX(a, b) ((a) > (b) ? (a) : (b))
 140
// Loop-unrolling pragmas. HWY_UNROLL(factor) requests a specific factor;
// HWY_DEFAULT_UNROLL lets the compiler choose where that is supported. Both
// expand to nothing on compilers not listed below.
#if HWY_COMPILER_GCC_ACTUAL
// nielskm: GCC does not support '#pragma GCC unroll' without the factor, hence
// the fixed default of 4.
#define HWY_UNROLL(factor) HWY_PRAGMA(GCC unroll factor)
#define HWY_DEFAULT_UNROLL HWY_UNROLL(4)
#elif HWY_COMPILER_CLANG || HWY_COMPILER_ICC || HWY_COMPILER_ICX
#define HWY_UNROLL(factor) HWY_PRAGMA(unroll factor)
#define HWY_DEFAULT_UNROLL HWY_UNROLL()
#else
#define HWY_UNROLL(factor)
#define HWY_DEFAULT_UNROLL
#endif
 152
// Tell a compiler that the expression always evaluates to true.
// The expression should be free from any side effects.
// Some older compilers may have trouble with complex expressions, therefore
// it is advisable to split multiple conditions into separate assume statements,
// and manually check the generated code.
// OK but could fail:
//   HWY_ASSUME(x == 2 && y == 3);
// Better:
//   HWY_ASSUME(x == 2);
//   HWY_ASSUME(y == 3);
//
// The first available mechanism in the chain below is used; if none is
// available, the macro is a no-op that does not evaluate `expr`.
#if HWY_HAS_CPP_ATTRIBUTE(assume)
#define HWY_ASSUME(expr) [[assume(expr)]]
#elif HWY_COMPILER_MSVC || HWY_COMPILER_ICC
#define HWY_ASSUME(expr) __assume(expr)
// __builtin_assume() was added in clang 3.6.
#elif HWY_COMPILER_CLANG && HWY_HAS_BUILTIN(__builtin_assume)
#define HWY_ASSUME(expr) __builtin_assume(expr)
// __builtin_unreachable() was added in GCC 4.5, but __has_builtin() was added
// later, so check for the compiler version directly.
#elif HWY_COMPILER_GCC_ACTUAL >= 405
#define HWY_ASSUME(expr) \
  ((expr) ? static_cast<void>(0) : __builtin_unreachable())
#else
#define HWY_ASSUME(expr) static_cast<void>(0)
#endif
 178
// Compile-time fence to prevent undesirable code reordering. On Clang x86, the
// typical asm volatile("" : : : "memory") has no effect, whereas atomic fence
// does, without generating code.
//
// Only defined as a real fence on x86 when the C++ standard library is
// available (HWY_NO_LIBCXX not defined); elsewhere it expands to nothing.
#if HWY_ARCH_X86 && !defined(HWY_NO_LIBCXX)
#define HWY_FENCE std::atomic_thread_fence(std::memory_order_acq_rel)
#else
// TODO(janwas): investigate alternatives. On Arm, the above generates barriers.
#define HWY_FENCE
#endif
 188
// 4 instances of a given literal value, useful as input to LoadDup128.
#define HWY_REP4(literal) literal, literal, literal, literal

// Calls ::hwy::Abort with the current file and line, followed by a
// printf-style format string and its optional arguments. `##__VA_ARGS__`
// drops the preceding comma when no extra arguments are given.
#define HWY_ABORT(format, ...) \
  ::hwy::Abort(__FILE__, __LINE__, format, ##__VA_ARGS__)

// Always enabled. Aborts with the stringified condition if it is false. The
// do/while(0) wrapper makes the macro usable as a single statement.
#define HWY_ASSERT(condition)             \
  do {                                    \
    if (!(condition)) {                   \
      HWY_ABORT("Assert %s", #condition); \
    }                                     \
  } while (0)
 202
// Sanitizer detection: each HWY_IS_*SAN is 1 if the compiler reports the
// corresponding feature or the matching *_SANITIZER macro is defined, else 0.
#if HWY_HAS_FEATURE(memory_sanitizer) || defined(MEMORY_SANITIZER)
#define HWY_IS_MSAN 1
#else
#define HWY_IS_MSAN 0
#endif

#if HWY_HAS_FEATURE(address_sanitizer) || defined(ADDRESS_SANITIZER)
#define HWY_IS_ASAN 1
#else
#define HWY_IS_ASAN 0
#endif

#if HWY_HAS_FEATURE(thread_sanitizer) || defined(THREAD_SANITIZER)
#define HWY_IS_TSAN 1
#else
#define HWY_IS_TSAN 0
#endif

// MSAN may cause lengthy build times or false positives e.g. in AVX3 DemoteTo.
// You can disable MSAN by adding this attribute to the function that fails.
// Expands to nothing when MSAN is not enabled.
#if HWY_IS_MSAN
#define HWY_ATTR_NO_MSAN __attribute__((no_sanitize_memory))
#else
#define HWY_ATTR_NO_MSAN
#endif
 228
// For enabling HWY_DASSERT and shortening tests in slower debug builds.
// Users may predefine HWY_IS_DEBUG_BUILD to override the detection below.
#if !defined(HWY_IS_DEBUG_BUILD)
// Clang does not define NDEBUG, but it and GCC define __OPTIMIZE__, and recent
// MSVC defines NDEBUG (if not, could instead check _DEBUG).
// Sanitizer builds and the Clang static analyzer are also treated as debug.
#if (!defined(__OPTIMIZE__) && !defined(NDEBUG)) || HWY_IS_ASAN || \
    HWY_IS_MSAN || HWY_IS_TSAN || defined(__clang_analyzer__)
#define HWY_IS_DEBUG_BUILD 1
#else
#define HWY_IS_DEBUG_BUILD 0
#endif
#endif  // HWY_IS_DEBUG_BUILD

// Debug-only assertion: same as HWY_ASSERT in debug builds; otherwise an empty
// statement that does not evaluate `condition`.
#if HWY_IS_DEBUG_BUILD
#define HWY_DASSERT(condition) HWY_ASSERT(condition)
#else
#define HWY_DASSERT(condition) \
  do {                         \
  } while (0)
#endif
 248
 249 namespace hwy {
 250
 251 //------------------------------------------------------------------------------
 252 // kMaxVectorSize (undocumented, pending removal)
 253
 254 #if HWY_ARCH_X86
 255 static constexpr HWY_MAYBE_UNUSED size_t kMaxVectorSize = 64;  // AVX-512
 256 #elif HWY_ARCH_RVV && defined(__riscv_v_intrinsic) && \
 257     __riscv_v_intrinsic >= 11000
 258 // Not actually an upper bound on the size.
 259 static constexpr HWY_MAYBE_UNUSED size_t kMaxVectorSize = 4096;
 260 #else
 261 static constexpr HWY_MAYBE_UNUSED size_t kMaxVectorSize = 16;
 262 #endif
 263
 264 //------------------------------------------------------------------------------
 265 // Alignment
 266
// Potentially useful for LoadDup128 and capped vectors. In other cases, arrays
// should be allocated dynamically via aligned_allocator.h because Lanes() may
// exceed the stack size.
//
// Expands to an alignas() specifier whose value depends on the target.
#if HWY_ARCH_X86
#define HWY_ALIGN_MAX alignas(64)
#elif HWY_ARCH_RVV && defined(__riscv_v_intrinsic) && \
    __riscv_v_intrinsic >= 11000
#define HWY_ALIGN_MAX alignas(8)  // only elements need be aligned
#else
#define HWY_ALIGN_MAX alignas(16)
#endif
 278
 279 //------------------------------------------------------------------------------
 280 // Lane types
 281
 282 // Match [u]int##_t naming scheme so rvv-inl.h macros can obtain the type name
 283 // by concatenating base type and bits.
 284
 285 #pragma pack(push, 1)
 286
 287 // ACLE (https://gcc.gnu.org/onlinedocs/gcc/Half-Precision.html):
 288 // always supported on Armv8, for Armv7 only if -mfp16-format is given.
 289 #if ((HWY_ARCH_ARM_A64 || (__ARM_FP & 2)) && HWY_COMPILER_GCC)
 290 using float16_t = __fp16;
 291 // C11 extension ISO/IEC TS 18661-3:2015 but not supported on all targets.
 292 // Required for Clang RVV if the float16 extension is used.
 293 #elif HWY_ARCH_RVV && HWY_COMPILER_CLANG && defined(__riscv_zvfh)
 294 using float16_t = _Float16;
 295 // Otherwise emulate
 296 #else
 297 struct float16_t {
 298   uint16_t bits;
 299 };
 300 #endif
 301
 302 struct bfloat16_t {
 303   uint16_t bits;
 304 };
 305
 306 #pragma pack(pop)
 307
 308 using float32_t = float;
 309 using float64_t = double;
 310
#pragma pack(push, 1)

// Aligned 128-bit type, stored as two 64-bit halves. Cannot use __int128
// because clang doesn't yet align it: https://reviews.llvm.org/D86310
struct alignas(16) uint128_t {
  uint64_t lo;  // little-endian layout
  uint64_t hi;
};

// 64 bit key plus 64 bit value. Faster than using uint128_t when only the key
// field is to be compared (Lt128Upper instead of Lt128).
// The key occupies the same position as uint128_t::hi.
struct alignas(16) K64V64 {
  uint64_t value;  // little-endian layout
  uint64_t key;
};

// 32 bit key plus 32 bit value. Allows vqsort recursions to terminate earlier
// than when considering both to be a 64-bit key.
struct alignas(8) K32V32 {
  uint32_t value;  // little-endian layout
  uint32_t key;
};

#pragma pack(pop)
 335
 336 static inline HWY_MAYBE_UNUSED bool operator<(const uint128_t& a,
 337                                               const uint128_t& b) {
 338   return (a.hi == b.hi) ? a.lo < b.lo : a.hi < b.hi;
 339 }
 340 // Required for std::greater.
 341 static inline HWY_MAYBE_UNUSED bool operator>(const uint128_t& a,
 342                                               const uint128_t& b) {
 343   return b < a;
 344 }
 345 static inline HWY_MAYBE_UNUSED bool operator==(const uint128_t& a,
 346                                                const uint128_t& b) {
 347   return a.lo == b.lo && a.hi == b.hi;
 348 }
 349
 350 static inline HWY_MAYBE_UNUSED bool operator<(const K64V64& a,
 351                                               const K64V64& b) {
 352   return a.key < b.key;
 353 }
 354 // Required for std::greater.
 355 static inline HWY_MAYBE_UNUSED bool operator>(const K64V64& a,
 356                                               const K64V64& b) {
 357   return b < a;
 358 }
 359 static inline HWY_MAYBE_UNUSED bool operator==(const K64V64& a,
 360                                                const K64V64& b) {
 361   return a.key == b.key;
 362 }
 363
 364 static inline HWY_MAYBE_UNUSED bool operator<(const K32V32& a,
 365                                               const K32V32& b) {
 366   return a.key < b.key;
 367 }
 368 // Required for std::greater.
 369 static inline HWY_MAYBE_UNUSED bool operator>(const K32V32& a,
 370                                               const K32V32& b) {
 371   return b < a;
 372 }
 373 static inline HWY_MAYBE_UNUSED bool operator==(const K32V32& a,
 374                                                const K32V32& b) {
 375   return a.key == b.key;
 376 }
 377
 378 //------------------------------------------------------------------------------
 379 // Controlling overload resolution (SFINAE)
 380
 381 template <bool Condition>
 382 struct EnableIfT {};
 383 template <>
 384 struct EnableIfT<true> {
 385   using type = void;
 386 };
 387
 388 template <bool Condition>
 389 using EnableIf = typename EnableIfT<Condition>::type;
 390
 391 template <typename T, typename U>
 392 struct IsSameT {
 393   enum { value = 0 };
 394 };
 395
 396 template <typename T>
 397 struct IsSameT<T, T> {
 398   enum { value = 1 };
 399 };
 400
 401 template <typename T, typename U>
 402 HWY_API constexpr bool IsSame() {
 403   return IsSameT<T, U>::value;
 404 }
 405
 406 template <bool Condition, typename Then, typename Else>
 407 struct IfT {
 408   using type = Then;
 409 };
 410
 411 template <class Then, class Else>
 412 struct IfT<false, Then, Else> {
 413   using type = Else;
 414 };
 415
 416 template <bool Condition, typename Then, typename Else>
 417 using If = typename IfT<Condition, Then, Else>::type;
 418
 419 // Insert into template/function arguments to enable this overload only for
 420 // vectors of exactly, at most (LE), or more than (GT) this many bytes.
 421 //
 422 // As an example, checking for a total size of 16 bytes will match both
 423 // Simd<uint8_t, 16, 0> and Simd<uint8_t, 8, 1>.
 424 #define HWY_IF_V_SIZE(T, kN, bytes) \
 425   hwy::EnableIf<kN * sizeof(T) == bytes>* = nullptr
 426 #define HWY_IF_V_SIZE_LE(T, kN, bytes) \
 427   hwy::EnableIf<kN * sizeof(T) <= bytes>* = nullptr
 428 #define HWY_IF_V_SIZE_GT(T, kN, bytes) \
 429   hwy::EnableIf<(kN * sizeof(T) > bytes)>* = nullptr
 430
 431 #define HWY_IF_LANES(kN, lanes) hwy::EnableIf<(kN == lanes)>* = nullptr
 432 #define HWY_IF_LANES_LE(kN, lanes) hwy::EnableIf<(kN <= lanes)>* = nullptr
 433 #define HWY_IF_LANES_GT(kN, lanes) hwy::EnableIf<(kN > lanes)>* = nullptr
 434
 435 #define HWY_IF_UNSIGNED(T) hwy::EnableIf<!IsSigned<T>()>* = nullptr
 436 #define HWY_IF_SIGNED(T)                                                   \
 437   hwy::EnableIf<IsSigned<T>() && !IsFloat<T>() && !IsSpecialFloat<T>()>* = \
 438       nullptr
 439 #define HWY_IF_FLOAT(T) hwy::EnableIf<hwy::IsFloat<T>()>* = nullptr
 440 #define HWY_IF_NOT_FLOAT(T) hwy::EnableIf<!hwy::IsFloat<T>()>* = nullptr
 441 #define HWY_IF_SPECIAL_FLOAT(T) \
 442   hwy::EnableIf<hwy::IsSpecialFloat<T>()>* = nullptr
 443 #define HWY_IF_NOT_SPECIAL_FLOAT(T) \
 444   hwy::EnableIf<!hwy::IsSpecialFloat<T>()>* = nullptr
 445 #define HWY_IF_NOT_FLOAT_NOR_SPECIAL(T) \
 446   hwy::EnableIf<!hwy::IsFloat<T>() && !hwy::IsSpecialFloat<T>()>* = nullptr
 447
 448 #define HWY_IF_T_SIZE(T, bytes) hwy::EnableIf<sizeof(T) == (bytes)>* = nullptr
 449 #define HWY_IF_NOT_T_SIZE(T, bytes) \
 450   hwy::EnableIf<sizeof(T) != (bytes)>* = nullptr
 451 // bit_array = 0x102 means 1 or 8 bytes. There is no NONE_OF because it sounds
 452 // too similar. If you want the opposite of this (2 or 4 bytes), ask for those
 453 // bits explicitly (0x14) instead of attempting to 'negate' 0x102.
 454 #define HWY_IF_T_SIZE_ONE_OF(T, bit_array) \
 455   hwy::EnableIf<((size_t{1} << sizeof(T)) & (bit_array)) != 0>* = nullptr
 456
 457 // Use instead of HWY_IF_T_SIZE to avoid ambiguity with float/double
 458 // overloads.
 459 #define HWY_IF_UI32(T) \
 460   hwy::EnableIf<IsSame<T, uint32_t>() || IsSame<T, int32_t>()>* = nullptr
 461 #define HWY_IF_UI64(T) \
 462   hwy::EnableIf<IsSame<T, uint64_t>() || IsSame<T, int64_t>()>* = nullptr
 463
 464 #define HWY_IF_LANES_PER_BLOCK(T, N, LANES) \
 465   hwy::EnableIf<HWY_MIN(sizeof(T) * N, 16) / sizeof(T) == (LANES)>* = nullptr
 466
 467 // Empty struct used as a size tag type.
 468 template <size_t N>
 469 struct SizeTag {};
 470
 471 template <class T>
 472 struct RemoveConstT {
 473   using type = T;
 474 };
 475 template <class T>
 476 struct RemoveConstT<const T> {
 477   using type = T;
 478 };
 479
 480 template <class T>
 481 using RemoveConst = typename RemoveConstT<T>::type;
 482
 483 template <class T>
 484 struct RemoveRefT {
 485   using type = T;
 486 };
 487 template <class T>
 488 struct RemoveRefT<T&> {
 489   using type = T;
 490 };
 491
 492 template <class T>
 493 using RemoveRef = typename RemoveRefT<T>::type;
 494
 495 //------------------------------------------------------------------------------
 496 // Type relations
 497
 498 namespace detail {
 499
 500 template <typename T>
 501 struct Relations;
 502 template <>
 503 struct Relations<uint8_t> {
 504   using Unsigned = uint8_t;
 505   using Signed = int8_t;
 506   using Wide = uint16_t;
 507   enum { is_signed = 0, is_float = 0 };
 508 };
 509 template <>
 510 struct Relations<int8_t> {
 511   using Unsigned = uint8_t;
 512   using Signed = int8_t;
 513   using Wide = int16_t;
 514   enum { is_signed = 1, is_float = 0 };
 515 };
 516 template <>
 517 struct Relations<uint16_t> {
 518   using Unsigned = uint16_t;
 519   using Signed = int16_t;
 520   using Wide = uint32_t;
 521   using Narrow = uint8_t;
 522   enum { is_signed = 0, is_float = 0 };
 523 };
 524 template <>
 525 struct Relations<int16_t> {
 526   using Unsigned = uint16_t;
 527   using Signed = int16_t;
 528   using Wide = int32_t;
 529   using Narrow = int8_t;
 530   enum { is_signed = 1, is_float = 0 };
 531 };
 532 template <>
 533 struct Relations<uint32_t> {
 534   using Unsigned = uint32_t;
 535   using Signed = int32_t;
 536   using Float = float;
 537   using Wide = uint64_t;
 538   using Narrow = uint16_t;
 539   enum { is_signed = 0, is_float = 0 };
 540 };
 541 template <>
 542 struct Relations<int32_t> {
 543   using Unsigned = uint32_t;
 544   using Signed = int32_t;
 545   using Float = float;
 546   using Wide = int64_t;
 547   using Narrow = int16_t;
 548   enum { is_signed = 1, is_float = 0 };
 549 };
 550 template <>
 551 struct Relations<uint64_t> {
 552   using Unsigned = uint64_t;
 553   using Signed = int64_t;
 554   using Float = double;
 555   using Wide = uint128_t;
 556   using Narrow = uint32_t;
 557   enum { is_signed = 0, is_float = 0 };
 558 };
 559 template <>
 560 struct Relations<int64_t> {
 561   using Unsigned = uint64_t;
 562   using Signed = int64_t;
 563   using Float = double;
 564   using Narrow = int32_t;
 565   enum { is_signed = 1, is_float = 0 };
 566 };
 567 template <>
 568 struct Relations<uint128_t> {
 569   using Unsigned = uint128_t;
 570   using Narrow = uint64_t;
 571   enum { is_signed = 0, is_float = 0 };
 572 };
 573 template <>
 574 struct Relations<float16_t> {
 575   using Unsigned = uint16_t;
 576   using Signed = int16_t;
 577   using Float = float16_t;
 578   using Wide = float;
 579   enum { is_signed = 1, is_float = 1 };
 580 };
 581 template <>
 582 struct Relations<bfloat16_t> {
 583   using Unsigned = uint16_t;
 584   using Signed = int16_t;
 585   using Wide = float;
 586   enum { is_signed = 1, is_float = 1 };
 587 };
 588 template <>
 589 struct Relations<float> {
 590   using Unsigned = uint32_t;
 591   using Signed = int32_t;
 592   using Float = float;
 593   using Wide = double;
 594   using Narrow = float16_t;
 595   enum { is_signed = 1, is_float = 1 };
 596 };
 597 template <>
 598 struct Relations<double> {
 599   using Unsigned = uint64_t;
 600   using Signed = int64_t;
 601   using Float = double;
 602   using Narrow = float;
 603   enum { is_signed = 1, is_float = 1 };
 604 };
 605
 606 template <size_t N>
 607 struct TypeFromSize;
 608 template <>
 609 struct TypeFromSize<1> {
 610   using Unsigned = uint8_t;
 611   using Signed = int8_t;
 612 };
 613 template <>
 614 struct TypeFromSize<2> {
 615   using Unsigned = uint16_t;
 616   using Signed = int16_t;
 617 };
 618 template <>
 619 struct TypeFromSize<4> {
 620   using Unsigned = uint32_t;
 621   using Signed = int32_t;
 622   using Float = float;
 623 };
 624 template <>
 625 struct TypeFromSize<8> {
 626   using Unsigned = uint64_t;
 627   using Signed = int64_t;
 628   using Float = double;
 629 };
 630 template <>
 631 struct TypeFromSize<16> {
 632   using Unsigned = uint128_t;
 633 };
 634
 635 }  // namespace detail
 636
 637 // Aliases for types of a different category, but the same size.
 638 template <typename T>
 639 using MakeUnsigned = typename detail::Relations<T>::Unsigned;
 640 template <typename T>
 641 using MakeSigned = typename detail::Relations<T>::Signed;
 642 template <typename T>
 643 using MakeFloat = typename detail::Relations<T>::Float;
 644
 645 // Aliases for types of the same category, but different size.
 646 template <typename T>
 647 using MakeWide = typename detail::Relations<T>::Wide;
 648 template <typename T>
 649 using MakeNarrow = typename detail::Relations<T>::Narrow;
 650
 651 // Obtain type from its size [bytes].
 652 template <size_t N>
 653 using UnsignedFromSize = typename detail::TypeFromSize<N>::Unsigned;
 654 template <size_t N>
 655 using SignedFromSize = typename detail::TypeFromSize<N>::Signed;
 656 template <size_t N>
 657 using FloatFromSize = typename detail::TypeFromSize<N>::Float;
 658
 659 // Avoid confusion with SizeTag where the parameter is a lane size.
 660 using UnsignedTag = SizeTag<0>;
 661 using SignedTag = SizeTag<0x100>;  // integer
 662 using FloatTag = SizeTag<0x200>;
 663
 664 template <typename T, class R = detail::Relations<T>>
 665 constexpr auto TypeTag() -> hwy::SizeTag<((R::is_signed + R::is_float) << 8)> {
 666   return hwy::SizeTag<((R::is_signed + R::is_float) << 8)>();
 667 }
 668
 669 // For when we only want to distinguish FloatTag from everything else.
 670 using NonFloatTag = SizeTag<0x400>;
 671
 672 template <typename T, class R = detail::Relations<T>>
 673 constexpr auto IsFloatTag() -> hwy::SizeTag<(R::is_float ? 0x200 : 0x400)> {
 674   return hwy::SizeTag<(R::is_float ? 0x200 : 0x400)>();
 675 }
 676
 677 //------------------------------------------------------------------------------
 678 // Type traits
 679
 680 template <typename T>
 681 HWY_API constexpr bool IsFloat() {
 682   // Cannot use T(1.25) != T(1) for float16_t, which can only be converted to or
 683   // from a float, not compared.
 684   return IsSame<T, float>() || IsSame<T, double>();
 685 }
 686
 687 // These types are often special-cased and not supported in all ops.
 688 template <typename T>
 689 HWY_API constexpr bool IsSpecialFloat() {
 690   return IsSame<T, float16_t>() || IsSame<T, bfloat16_t>();
 691 }
 692
 693 template <typename T>
 694 HWY_API constexpr bool IsSigned() {
 695   return T(0) > T(-1);
 696 }
 697 template <>
 698 constexpr bool IsSigned<float16_t>() {
 699   return true;
 700 }
 701 template <>
 702 constexpr bool IsSigned<bfloat16_t>() {
 703   return true;
 704 }
 705
 706 // Largest/smallest representable integer values.
 707 template <typename T>
 708 HWY_API constexpr T LimitsMax() {
 709   static_assert(!IsFloat<T>(), "Only for integer types");
 710   using TU = MakeUnsigned<T>;
 711   return static_cast<T>(IsSigned<T>() ? (static_cast<TU>(~0ull) >> 1)
 712                                       : static_cast<TU>(~0ull));
 713 }
 714 template <typename T>
 715 HWY_API constexpr T LimitsMin() {
 716   static_assert(!IsFloat<T>(), "Only for integer types");
 717   return IsSigned<T>() ? T(-1) - LimitsMax<T>() : T(0);
 718 }
 719
// Largest/smallest representable value (integer or float). This naming avoids
// confusion with numeric_limits<float>::min() (the smallest positive value).
//
// The primary template forwards to LimitsMin and thus covers integer types.
template <typename T>
HWY_API constexpr T LowestValue() {
  return LimitsMin<T>();
}
// float and double: a negative literal with the same magnitude as the literal
// returned by HighestValue.
template <>
constexpr float LowestValue<float>() {
  return -3.402823466e+38F;
}
template <>
constexpr double LowestValue<double>() {
  return -1.7976931348623158e+308;
}
 734
// Largest representable value. The primary template forwards to LimitsMax and
// thus covers integer types.
template <typename T>
HWY_API constexpr T HighestValue() {
  return LimitsMax<T>();
}
// float and double return a positive literal.
template <>
constexpr float HighestValue<float>() {
  return 3.402823466e+38F;
}
template <>
constexpr double HighestValue<double>() {
  return 1.7976931348623158e+308;
}
 747
// Difference between 1.0 and the next representable value.
// The primary template returns 1; float and double are specialized below.
template <typename T>
HWY_API constexpr T Epsilon() {
  return 1;
}
template <>
constexpr float Epsilon<float>() {
  return 1.192092896e-7f;
}
template <>
constexpr double Epsilon<double>() {
  return 2.2204460492503131e-16;
}
 761
// Returns width in bits of the mantissa field in IEEE binary32/64.
// Only defined for float and double; instantiating the primary template for
// any other type fails the static_assert (sizeof(T) == 0 is never true, but
// depends on T so that it is only checked upon instantiation).
template <typename T>
constexpr int MantissaBits() {
  static_assert(sizeof(T) == 0, "Only instantiate the specializations");
  return 0;
}
template <>
constexpr int MantissaBits<float>() {
  return 23;
}
template <>
constexpr int MantissaBits<double>() {
  return 52;
}
 776
 777 // Returns the (left-shifted by one bit) IEEE binary32/64 representation with
 778 // the largest possible (biased) exponent field. Used by IsInf.
 779 template <typename T>
 780 constexpr MakeSigned<T> MaxExponentTimes2() {
 781   return -(MakeSigned<T>{1} << (MantissaBits<T>() + 1));
 782 }
 783
 784 // Returns bitmask of the sign bit in IEEE binary32/64.
 785 template <typename T>
 786 constexpr MakeUnsigned<T> SignMask() {
 787   return MakeUnsigned<T>{1} << (sizeof(T) * 8 - 1);
 788 }
 789
 790 // Returns bitmask of the exponent field in IEEE binary32/64.
 791 template <typename T>
 792 constexpr MakeUnsigned<T> ExponentMask() {
 793   return (~(MakeUnsigned<T>{1} << MantissaBits<T>()) + 1) & ~SignMask<T>();
 794 }
 795
 796 // Returns bitmask of the mantissa field in IEEE binary32/64.
 797 template <typename T>
 798 constexpr MakeUnsigned<T> MantissaMask() {
 799   return (MakeUnsigned<T>{1} << MantissaBits<T>()) - 1;
 800 }
 801
// Returns 1 << mantissa_bits as a floating-point number. All integers whose
// absolute value is less than this can be represented exactly.
// Only defined for float and double; instantiating the primary template for
// any other type fails the static_assert.
template <typename T>
constexpr T MantissaEnd() {
  static_assert(sizeof(T) == 0, "Only instantiate the specializations");
  return 0;
}
template <>
constexpr float MantissaEnd<float>() {
  return 8388608.0f;  // 1 << 23
}
template <>
constexpr double MantissaEnd<double>() {
  // floating point literal with p52 requires C++17.
  return 4503599627370496.0;  // 1 << 52
}
 818
 819 // Returns width in bits of the exponent field in IEEE binary32/64.
 820 template <typename T>
 821 constexpr int ExponentBits() {
 822   // Exponent := remaining bits after deducting sign and mantissa.
 823   return 8 * sizeof(T) - 1 - MantissaBits<T>();
 824 }
 825
 826 // Returns largest value of the biased exponent field in IEEE binary32/64,
 827 // right-shifted so that the LSB is bit zero. Example: 0xFF for float.
 828 // This is expressed as a signed integer for more efficient comparison.
 829 template <typename T>
 830 constexpr MakeSigned<T> MaxExponentField() {
 831   return (MakeSigned<T>{1} << ExponentBits<T>()) - 1;
 832 }
 833
 834 //------------------------------------------------------------------------------
 835 // Helper functions
 836
 837 template <typename T1, typename T2>
 838 constexpr inline T1 DivCeil(T1 a, T2 b) {
 839   return (a + b - 1) / b;
 840 }
 841
 842 // Works for any `align`; if a power of two, compiler emits ADD+AND.
 843 constexpr inline size_t RoundUpTo(size_t what, size_t align) {
 844   return DivCeil(what, align) * align;
 845 }
 846
 847 // Undefined results for x == 0.
 848 HWY_API size_t Num0BitsBelowLS1Bit_Nonzero32(const uint32_t x) {
 849 #if HWY_COMPILER_MSVC
 850   unsigned long index;  // NOLINT
 851   _BitScanForward(&index, x);
 852   return index;
 853 #else   // HWY_COMPILER_MSVC
 854   return static_cast<size_t>(__builtin_ctz(x));
 855 #endif  // HWY_COMPILER_MSVC
 856 }
 857
 858 HWY_API size_t Num0BitsBelowLS1Bit_Nonzero64(const uint64_t x) {
 859 #if HWY_COMPILER_MSVC
 860 #if HWY_ARCH_X86_64
 861   unsigned long index;  // NOLINT
 862   _BitScanForward64(&index, x);
 863   return index;
 864 #else   // HWY_ARCH_X86_64
 865   // _BitScanForward64 not available
 866   uint32_t lsb = static_cast<uint32_t>(x & 0xFFFFFFFF);
 867   unsigned long index;  // NOLINT
 868   if (lsb == 0) {
 869     uint32_t msb = static_cast<uint32_t>(x >> 32u);
 870     _BitScanForward(&index, msb);
 871     return 32 + index;
 872   } else {
 873     _BitScanForward(&index, lsb);
 874     return index;
 875   }
 876 #endif  // HWY_ARCH_X86_64
 877 #else   // HWY_COMPILER_MSVC
 878   return static_cast<size_t>(__builtin_ctzll(x));
 879 #endif  // HWY_COMPILER_MSVC
 880 }
 881
 882 // Undefined results for x == 0.
 883 HWY_API size_t Num0BitsAboveMS1Bit_Nonzero32(const uint32_t x) {
 884 #if HWY_COMPILER_MSVC
 885   unsigned long index;  // NOLINT
 886   _BitScanReverse(&index, x);
 887   return 31 - index;
 888 #else   // HWY_COMPILER_MSVC
 889   return static_cast<size_t>(__builtin_clz(x));
 890 #endif  // HWY_COMPILER_MSVC
 891 }
 892
 893 HWY_API size_t Num0BitsAboveMS1Bit_Nonzero64(const uint64_t x) {
 894 #if HWY_COMPILER_MSVC
 895 #if HWY_ARCH_X86_64
 896   unsigned long index;  // NOLINT
 897   _BitScanReverse64(&index, x);
 898   return 63 - index;
 899 #else   // HWY_ARCH_X86_64
 900   // _BitScanReverse64 not available
 901   const uint32_t msb = static_cast<uint32_t>(x >> 32u);
 902   unsigned long index;  // NOLINT
 903   if (msb == 0) {
 904     const uint32_t lsb = static_cast<uint32_t>(x & 0xFFFFFFFF);
 905     _BitScanReverse(&index, lsb);
 906     return 63 - index;
 907   } else {
 908     _BitScanReverse(&index, msb);
 909     return 31 - index;
 910   }
 911 #endif  // HWY_ARCH_X86_64
 912 #else   // HWY_COMPILER_MSVC
 913   return static_cast<size_t>(__builtin_clzll(x));
 914 #endif  // HWY_COMPILER_MSVC
 915 }
 916
 917 HWY_API size_t PopCount(uint64_t x) {
 918 #if HWY_COMPILER_GCC  // includes clang
 919   return static_cast<size_t>(__builtin_popcountll(x));
 920   // This instruction has a separate feature flag, but is often called from
 921   // non-SIMD code, so we don't want to require dynamic dispatch. It was first
 922   // supported by Intel in Nehalem (SSE4.2), but MSVC only predefines a macro
 923   // for AVX, so check for that.
 924 #elif HWY_COMPILER_MSVC && HWY_ARCH_X86_64 && defined(__AVX__)
 925   return _mm_popcnt_u64(x);
 926 #elif HWY_COMPILER_MSVC && HWY_ARCH_X86_32 && defined(__AVX__)
 927   return _mm_popcnt_u32(static_cast<uint32_t>(x & 0xFFFFFFFFu)) +
 928          _mm_popcnt_u32(static_cast<uint32_t>(x >> 32));
 929 #else
 930   x -= ((x >> 1) & 0x5555555555555555ULL);
 931   x = (((x >> 2) & 0x3333333333333333ULL) + (x & 0x3333333333333333ULL));
 932   x = (((x >> 4) + x) & 0x0F0F0F0F0F0F0F0FULL);
 933   x += (x >> 8);
 934   x += (x >> 16);
 935   x += (x >> 32);
 936   return static_cast<size_t>(x & 0x7Fu);
 937 #endif
 938 }
 939
 940 // Skip HWY_API due to GCC "function not considered for inlining". Previously
 941 // such errors were caused by underlying type mismatches, but it's not clear
 942 // what is still mismatched despite all the casts.
 943 template <typename TI>
 944 /*HWY_API*/ constexpr size_t FloorLog2(TI x) {
 945   return x == TI{1}
 946              ? 0
 947              : static_cast<size_t>(FloorLog2(static_cast<TI>(x >> 1)) + 1);
 948 }
 949
 950 template <typename TI>
 951 /*HWY_API*/ constexpr size_t CeilLog2(TI x) {
 952   return x == TI{1}
 953              ? 0
 954              : static_cast<size_t>(FloorLog2(static_cast<TI>(x - 1)) + 1);
 955 }
 956
// Overload selected by hwy::FloatTag: converts n to T and returns t + n using
// T's ordinary addition. The tag parameter is unused; it only picks this
// overload.
template <typename T>
HWY_INLINE constexpr T AddWithWraparound(hwy::FloatTag /*tag*/, T t, size_t n) {
  return t + static_cast<T>(n);
}
 961
 962 template <typename T>
 963 HWY_INLINE constexpr T AddWithWraparound(hwy::NonFloatTag /*tag*/, T t,
 964                                          size_t n) {
 965   using TU = MakeUnsigned<T>;
 966   return static_cast<T>(
 967       static_cast<TU>(static_cast<TU>(t) + static_cast<TU>(n)) &
 968       hwy::LimitsMax<TU>());
 969 }
 970
 971 #if HWY_COMPILER_MSVC && HWY_ARCH_X86_64
 972 #pragma intrinsic(_umul128)
 973 #endif
 974
 975 // 64 x 64 = 128 bit multiplication
 976 HWY_API uint64_t Mul128(uint64_t a, uint64_t b, uint64_t* HWY_RESTRICT upper) {
 977 #if defined(__SIZEOF_INT128__)
 978   __uint128_t product = (__uint128_t)a * (__uint128_t)b;
 979   *upper = (uint64_t)(product >> 64);
 980   return (uint64_t)(product & 0xFFFFFFFFFFFFFFFFULL);
 981 #elif HWY_COMPILER_MSVC && HWY_ARCH_X86_64
 982   return _umul128(a, b, upper);
 983 #else
 984   constexpr uint64_t kLo32 = 0xFFFFFFFFU;
 985   const uint64_t lo_lo = (a & kLo32) * (b & kLo32);
 986   const uint64_t hi_lo = (a >> 32) * (b & kLo32);
 987   const uint64_t lo_hi = (a & kLo32) * (b >> 32);
 988   const uint64_t hi_hi = (a >> 32) * (b >> 32);
 989   const uint64_t t = (lo_lo >> 32) + (hi_lo & kLo32) + lo_hi;
 990   *upper = (hi_lo >> 32) + (t >> 32) + hi_hi;
 991   return (t << 32) | (lo_lo & kLo32);
 992 #endif
 993 }
 994
 995 #if HWY_COMPILER_MSVC
 996 #pragma intrinsic(memcpy)
 997 #pragma intrinsic(memset)
 998 #endif
 999
1000 // The source/destination must not overlap/alias.
1001 template <size_t kBytes, typename From, typename To>
1002 HWY_API void CopyBytes(const From* from, To* to) {
1003 #if HWY_COMPILER_MSVC
1004   memcpy(to, from, kBytes);
1005 #else
1006   __builtin_memcpy(
1007       static_cast<void*>(to), static_cast<const void*>(from), kBytes);
1008 #endif
1009 }
1010
1011 // Same as CopyBytes, but for same-sized objects; avoids a size argument.
1012 template <typename From, typename To>
1013 HWY_API void CopySameSize(const From* HWY_RESTRICT from, To* HWY_RESTRICT to) {
1014   static_assert(sizeof(From) == sizeof(To), "");
1015   CopyBytes<sizeof(From)>(from, to);
1016 }
1017
1018 template <size_t kBytes, typename To>
1019 HWY_API void ZeroBytes(To* to) {
1020 #if HWY_COMPILER_MSVC
1021   memset(to, 0, kBytes);
1022 #else
1023   __builtin_memset(to, 0, kBytes);
1024 #endif
1025 }
1026
1027 HWY_API float F32FromBF16(bfloat16_t bf) {
1028   uint32_t bits = bf.bits;
1029   bits <<= 16;
1030   float f;
1031   CopySameSize(&bits, &f);
1032   return f;
1033 }
1034
1035 HWY_API bfloat16_t BF16FromF32(float f) {
1036   uint32_t bits;
1037   CopySameSize(&f, &bits);
1038   bfloat16_t bf;
1039   bf.bits = static_cast<uint16_t>(bits >> 16);
1040   return bf;
1041 }
1042
// Declaration only. Marked HWY_NORETURN, so the function does not return to
// its caller. `file` and `line` presumably identify the call site. `format`
// and the variadic arguments are annotated via HWY_FORMAT(3, 4), presumably
// for printf-style format checking (confirm against the macro definition).
HWY_DLLEXPORT HWY_NORETURN void HWY_FORMAT(3, 4)
    Abort(const char* file, int line, const char* format, ...);
1045
1046 // Prevents the compiler from eliding the computations that led to "output".
1047 template <class T>
1048 HWY_API void PreventElision(T&& output) {
1049 #if HWY_COMPILER_MSVC
1050   // MSVC does not support inline assembly anymore (and never supported GCC's
1051   // RTL constraints). Self-assignment with #pragma optimize("off") might be
1052   // expected to prevent elision, but it does not with MSVC 2015. Type-punning
1053   // with volatile pointers generates inefficient code on MSVC 2017.
1054   static std::atomic<RemoveRef<T>> dummy;
1055   dummy.store(output, std::memory_order_relaxed);
1056 #else
1057   // Works by indicating to the compiler that "output" is being read and
1058   // modified. The +r constraint avoids unnecessary writes to memory, but only
1059   // works for built-in types (typically FuncOutput).
1060   asm volatile("" : "+r"(output) : : "memory");
1061 #endif
1062 }
1063
1064 }  // namespace hwy
1065
1066 #endif  // HIGHWAY_HWY_BASE_H_