1 // Copyright 2020 Google LLC
2 // SPDX-License-Identifier: Apache-2.0
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
8 // http://www.apache.org/licenses/LICENSE-2.0
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
16 // Main header required before using vector types.
18 // IWYU pragma: begin_exports
20 #include "hwy/detect_compiler_arch.h"
21 #include "hwy/highway_export.h"
22 #include "hwy/targets.h"
23 // IWYU pragma: end_exports
25 // This include guard is checked by foreach_target, so avoid the usual _H_
26 // suffix to prevent copybara from renaming it. NOTE: ops/*-inl.h are included
27 // after/outside this include guard.
28 #ifndef HWY_HIGHWAY_INCLUDED
29 #define HWY_HIGHWAY_INCLUDED
33 // API version (https://semver.org/); keep in sync with CMakeLists.txt.
38 //------------------------------------------------------------------------------
39 // Shorthand for tags (defined in shared-inl.h) used to select overloads.
// Note that ScalableTag<T> is preferred over HWY_FULL, and CappedTag<T, N> over
// HWY_CAPPED(T, N).
// HWY_FULL(T[,LMUL=1]) names the tag of a native vector/group. LMUL is the
// number of registers in the group; targets without register groups ignore it.
//
// Argument-counting machinery: HWY_CHOOSE_FULL appends the two candidate macro
// names after the caller's arguments and then picks the third item, which is
// HWY_FULL1 for one argument and HWY_FULL2 for two.
#define HWY_3TH_ARG(first, second, third, ...) third
// MSVC groups __VA_ARGS__ into a single argument; re-scanning a parenthesized
// argument list works around that.
#define HWY_FULL_RECOMPOSER(parenthesized_args) HWY_3TH_ARG parenthesized_args
// The trailing comma avoids a -pedantic false alarm.
#define HWY_CHOOSE_FULL(...) \
  HWY_FULL_RECOMPOSER((__VA_ARGS__, HWY_FULL2, HWY_FULL1, ))

// One-argument form: full vector of T.
#define HWY_FULL1(T) hwy::HWY_NAMESPACE::ScalableTag<T>
// Two-argument form: the second ScalableTag argument is CeilLog2 of LMUL.
#define HWY_FULL2(T, LMUL) \
  hwy::HWY_NAMESPACE::ScalableTag<T, hwy::CeilLog2(HWY_MAX(0, LMUL))>

// Public entry point: selects HWY_FULL1 or HWY_FULL2 by argument count.
#define HWY_FULL(...) HWY_CHOOSE_FULL(__VA_ARGS__())(__VA_ARGS__)

// Vector of up to MAX_N lanes. It's better to use full vectors where possible.
#define HWY_CAPPED(T, MAX_N) hwy::HWY_NAMESPACE::CappedTag<T, MAX_N>
59 //------------------------------------------------------------------------------
60 // Export user functions for static/dynamic dispatch
62 // Evaluates to 0 inside a translation unit if it is generating anything but the
63 // static target (the last one if multiple targets are enabled). Used to prevent
64 // redefinitions of HWY_EXPORT. Unless foreach_target.h is included, we only
65 // compile once anyway, so this is 1 unless it is or has been included.
// HWY_STATIC_DISPATCH(FUNC_NAME) is the namespace-qualified FUNC_NAME for
// HWY_STATIC_TARGET (the only defined namespace unless HWY_TARGET_INCLUDE is
// defined), and can be used to deduce the return type of Choose*.
//
// Exactly one branch of the chain below is taken; each maps FUNC_NAME into the
// N_<TARGET> namespace matching HWY_STATIC_TARGET. The chain must be closed by
// the #endif at its end, otherwise it swallows the rest of the header.
#if HWY_STATIC_TARGET == HWY_SCALAR
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_SCALAR::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_EMU128
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_EMU128::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_RVV
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_RVV::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_WASM_EMU256
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_WASM_EMU256::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_WASM
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_WASM::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_NEON_WITHOUT_AES
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_NEON_WITHOUT_AES::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_NEON
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_NEON::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_SVE
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_SVE::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_SVE2
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_SVE2::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_SVE_256
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_SVE_256::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_SVE2_128
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_SVE2_128::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_PPC8
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_PPC8::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_PPC9
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_PPC9::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_PPC10
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_PPC10::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_Z14
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_Z14::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_Z15
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_Z15::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_SSE2
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_SSE2::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_SSSE3
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_SSSE3::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_SSE4
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_SSE4::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_AVX2
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_AVX2::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_AVX3
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_AVX3::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_AVX3_DL
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_AVX3_DL::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_AVX3_ZEN4
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_AVX3_ZEN4::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_AVX3_SPR
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_AVX3_SPR::FUNC_NAME
#endif
// HWY_CHOOSE_*(FUNC_NAME) expands to the function pointer for that target or
// nullptr if that target was not compiled.
// Fallback entry: prefer EMU128, then SCALAR, when either is among the
// compiled targets (HWY_TARGETS).
#if HWY_TARGETS & HWY_EMU128
#define HWY_CHOOSE_FALLBACK(FUNC_NAME) &N_EMU128::FUNC_NAME
#elif HWY_TARGETS & HWY_SCALAR
#define HWY_CHOOSE_FALLBACK(FUNC_NAME) &N_SCALAR::FUNC_NAME
#else
// When HWY_SCALAR/HWY_EMU128 are not present and other targets were disabled at
// runtime, fall back to the baseline with HWY_STATIC_DISPATCH().
#define HWY_CHOOSE_FALLBACK(FUNC_NAME) &HWY_STATIC_DISPATCH(FUNC_NAME)
#endif

// Per-target entries: address of the function in the N_<TARGET> namespace if
// that target's bit is set in HWY_TARGETS, otherwise nullptr.
#if HWY_TARGETS & HWY_WASM_EMU256
#define HWY_CHOOSE_WASM_EMU256(FUNC_NAME) &N_WASM_EMU256::FUNC_NAME
#else
#define HWY_CHOOSE_WASM_EMU256(FUNC_NAME) nullptr
#endif

#if HWY_TARGETS & HWY_WASM
#define HWY_CHOOSE_WASM(FUNC_NAME) &N_WASM::FUNC_NAME
#else
#define HWY_CHOOSE_WASM(FUNC_NAME) nullptr
#endif

#if HWY_TARGETS & HWY_RVV
#define HWY_CHOOSE_RVV(FUNC_NAME) &N_RVV::FUNC_NAME
#else
#define HWY_CHOOSE_RVV(FUNC_NAME) nullptr
#endif

#if HWY_TARGETS & HWY_NEON_WITHOUT_AES
#define HWY_CHOOSE_NEON_WITHOUT_AES(FUNC_NAME) &N_NEON_WITHOUT_AES::FUNC_NAME
#else
#define HWY_CHOOSE_NEON_WITHOUT_AES(FUNC_NAME) nullptr
#endif

#if HWY_TARGETS & HWY_NEON
#define HWY_CHOOSE_NEON(FUNC_NAME) &N_NEON::FUNC_NAME
#else
#define HWY_CHOOSE_NEON(FUNC_NAME) nullptr
#endif

#if HWY_TARGETS & HWY_SVE
#define HWY_CHOOSE_SVE(FUNC_NAME) &N_SVE::FUNC_NAME
#else
#define HWY_CHOOSE_SVE(FUNC_NAME) nullptr
#endif

#if HWY_TARGETS & HWY_SVE2
#define HWY_CHOOSE_SVE2(FUNC_NAME) &N_SVE2::FUNC_NAME
#else
#define HWY_CHOOSE_SVE2(FUNC_NAME) nullptr
#endif

#if HWY_TARGETS & HWY_SVE_256
#define HWY_CHOOSE_SVE_256(FUNC_NAME) &N_SVE_256::FUNC_NAME
#else
#define HWY_CHOOSE_SVE_256(FUNC_NAME) nullptr
#endif

#if HWY_TARGETS & HWY_SVE2_128
#define HWY_CHOOSE_SVE2_128(FUNC_NAME) &N_SVE2_128::FUNC_NAME
#else
#define HWY_CHOOSE_SVE2_128(FUNC_NAME) nullptr
#endif

#if HWY_TARGETS & HWY_PPC8
#define HWY_CHOOSE_PPC8(FUNC_NAME) &N_PPC8::FUNC_NAME
#else
#define HWY_CHOOSE_PPC8(FUNC_NAME) nullptr
#endif

#if HWY_TARGETS & HWY_PPC9
#define HWY_CHOOSE_PPC9(FUNC_NAME) &N_PPC9::FUNC_NAME
#else
#define HWY_CHOOSE_PPC9(FUNC_NAME) nullptr
#endif

#if HWY_TARGETS & HWY_PPC10
#define HWY_CHOOSE_PPC10(FUNC_NAME) &N_PPC10::FUNC_NAME
#else
#define HWY_CHOOSE_PPC10(FUNC_NAME) nullptr
#endif

#if HWY_TARGETS & HWY_Z14
#define HWY_CHOOSE_Z14(FUNC_NAME) &N_Z14::FUNC_NAME
#else
#define HWY_CHOOSE_Z14(FUNC_NAME) nullptr
#endif

#if HWY_TARGETS & HWY_Z15
#define HWY_CHOOSE_Z15(FUNC_NAME) &N_Z15::FUNC_NAME
#else
#define HWY_CHOOSE_Z15(FUNC_NAME) nullptr
#endif

#if HWY_TARGETS & HWY_SSE2
#define HWY_CHOOSE_SSE2(FUNC_NAME) &N_SSE2::FUNC_NAME
#else
#define HWY_CHOOSE_SSE2(FUNC_NAME) nullptr
#endif

#if HWY_TARGETS & HWY_SSSE3
#define HWY_CHOOSE_SSSE3(FUNC_NAME) &N_SSSE3::FUNC_NAME
#else
#define HWY_CHOOSE_SSSE3(FUNC_NAME) nullptr
#endif

#if HWY_TARGETS & HWY_SSE4
#define HWY_CHOOSE_SSE4(FUNC_NAME) &N_SSE4::FUNC_NAME
#else
#define HWY_CHOOSE_SSE4(FUNC_NAME) nullptr
#endif

#if HWY_TARGETS & HWY_AVX2
#define HWY_CHOOSE_AVX2(FUNC_NAME) &N_AVX2::FUNC_NAME
#else
#define HWY_CHOOSE_AVX2(FUNC_NAME) nullptr
#endif

#if HWY_TARGETS & HWY_AVX3
#define HWY_CHOOSE_AVX3(FUNC_NAME) &N_AVX3::FUNC_NAME
#else
#define HWY_CHOOSE_AVX3(FUNC_NAME) nullptr
#endif

#if HWY_TARGETS & HWY_AVX3_DL
#define HWY_CHOOSE_AVX3_DL(FUNC_NAME) &N_AVX3_DL::FUNC_NAME
#else
#define HWY_CHOOSE_AVX3_DL(FUNC_NAME) nullptr
#endif

#if HWY_TARGETS & HWY_AVX3_ZEN4
#define HWY_CHOOSE_AVX3_ZEN4(FUNC_NAME) &N_AVX3_ZEN4::FUNC_NAME
#else
#define HWY_CHOOSE_AVX3_ZEN4(FUNC_NAME) nullptr
#endif

#if HWY_TARGETS & HWY_AVX3_SPR
#define HWY_CHOOSE_AVX3_SPR(FUNC_NAME) &N_AVX3_SPR::FUNC_NAME
#else
#define HWY_CHOOSE_AVX3_SPR(FUNC_NAME) nullptr
#endif
// MSVC 2017 workaround: the non-type template parameter to ChooseAndCall
// apparently cannot be an array. Use a function pointer instead, which has the
// disadvantage that we call the static (not best) target on the first call to
// any HWY_DYNAMIC_DISPATCH.
//
// HWY_DISPATCH_WORKAROUND is 1 only for MSVC versions below 1915, else 0. The
// two definitions are mutually exclusive branches of one conditional.
#if HWY_COMPILER_MSVC && HWY_COMPILER_MSVC < 1915
#define HWY_DISPATCH_WORKAROUND 1
#else
#define HWY_DISPATCH_WORKAROUND 0
#endif
277 // Provides a static member function which is what is called during the first
278 // HWY_DYNAMIC_DISPATCH, where GetIndex is still zero, and instantiations of
279 // this function are the first entry in the tables created by HWY_EXPORT.
280 template <typename RetType
, typename
... Args
>
281 struct FunctionCache
{
283 typedef RetType(FunctionType
)(Args
...);
285 #if HWY_DISPATCH_WORKAROUND
286 template <FunctionType
* const func
>
287 static RetType
ChooseAndCall(Args
... args
) {
288 ChosenTarget
& chosen_target
= GetChosenTarget();
289 chosen_target
.Update(SupportedTargets());
290 return (*func
)(args
...);
293 // A template function that when instantiated has the same signature as the
294 // function being called. This function initializes the bit array of targets
295 // supported by the current CPU and then calls the appropriate entry within
296 // the HWY_EXPORT table. Subsequent calls via HWY_DYNAMIC_DISPATCH to any
297 // exported functions, even those defined by different translation units,
298 // will dispatch directly to the best available target.
299 template <FunctionType
* const table
[]>
300 static RetType
ChooseAndCall(Args
... args
) {
301 ChosenTarget
& chosen_target
= GetChosenTarget();
302 chosen_target
.Update(SupportedTargets());
303 return (table
[chosen_target
.GetIndex()])(args
...);
305 #endif // HWY_DISPATCH_WORKAROUND
308 // Used to deduce the template parameters RetType and Args from a function.
309 template <typename RetType
, typename
... Args
>
310 FunctionCache
<RetType
, Args
...> DeduceFunctionCache(RetType (*)(Args
...)) {
311 return FunctionCache
<RetType
, Args
...>();
// Name of the per-function dispatch table: FUNC_NAME with the suffix
// HighwayDispatchTable appended via HWY_CONCAT.
#define HWY_DISPATCH_TABLE(FUNC_NAME) HWY_CONCAT(FUNC_NAME, HighwayDispatchTable)
// HWY_EXPORT(FUNC_NAME); expands to a static array that is used by
// HWY_DYNAMIC_DISPATCH() to call the appropriate function at runtime. This
// static array must be defined at the same namespace level as the function
// it is exporting.
321 // After being exported, it can be called from other parts of the same source
322 // file using HWY_DYNAMIC_DISPATCH(), in particular from a function wrapper
323 // like in the following example:
325 // #include "hwy/highway.h"
326 // HWY_BEFORE_NAMESPACE();
327 // namespace skeleton {
328 // namespace HWY_NAMESPACE {
330 // void MyFunction(int a, char b, const char* c) { ... }
332 // // NOLINTNEXTLINE(google-readability-namespace-comments)
333 // } // namespace HWY_NAMESPACE
334 // } // namespace skeleton
335 // HWY_AFTER_NAMESPACE();
337 // namespace skeleton {
338 // HWY_EXPORT(MyFunction); // Defines the dispatch table in this scope.
// void MyFunction(int a, char b, const char* c) {
//   return HWY_DYNAMIC_DISPATCH(MyFunction)(a, b, c);
// }
343 // } // namespace skeleton
// Three mutually exclusive definitions of HWY_EXPORT follow. The test
// (HWY_TARGETS & (HWY_TARGETS - 1)) == 0 is true when at most one bit of
// HWY_TARGETS is set, i.e. only a single target is being compiled.
#if HWY_IDE || ((HWY_TARGETS & (HWY_TARGETS - 1)) == 0)

// Simplified version for IDE or the dynamic dispatch case with only one target.
// This case still uses a table, although of a single element, to provide the
// same compile error conditions as with the dynamic dispatch case when multiple
// targets are being compiled.
#define HWY_EXPORT(FUNC_NAME)                                             \
  HWY_MAYBE_UNUSED static decltype(&HWY_STATIC_DISPATCH(FUNC_NAME)) const \
      HWY_DISPATCH_TABLE(FUNC_NAME)[1] = {&HWY_STATIC_DISPATCH(FUNC_NAME)}
#define HWY_DYNAMIC_DISPATCH(FUNC_NAME) HWY_STATIC_DISPATCH(FUNC_NAME)
#define HWY_DYNAMIC_POINTER(FUNC_NAME) &HWY_STATIC_DISPATCH(FUNC_NAME)

#else

// Simplified version for MSVC 2017: function pointer instead of table.
#if HWY_DISPATCH_WORKAROUND

#define HWY_EXPORT(FUNC_NAME)                                                \
  static decltype(&HWY_STATIC_DISPATCH(FUNC_NAME)) const HWY_DISPATCH_TABLE( \
      FUNC_NAME)[HWY_MAX_DYNAMIC_TARGETS + 2] = {                            \
      /* The first entry in the table initializes the global cache and       \
       * calls the function from HWY_STATIC_TARGET. */                       \
      &decltype(hwy::DeduceFunctionCache(&HWY_STATIC_DISPATCH(               \
          FUNC_NAME)))::ChooseAndCall<&HWY_STATIC_DISPATCH(FUNC_NAME)>,      \
      HWY_CHOOSE_TARGET_LIST(FUNC_NAME),                                     \
      HWY_CHOOSE_FALLBACK(FUNC_NAME),                                        \
  }

#else

// Dynamic dispatch case with one entry per dynamic target plus the fallback
// target and the initialization wrapper.
#define HWY_EXPORT(FUNC_NAME)                                                \
  static decltype(&HWY_STATIC_DISPATCH(FUNC_NAME)) const HWY_DISPATCH_TABLE( \
      FUNC_NAME)[HWY_MAX_DYNAMIC_TARGETS + 2] = {                            \
      /* The first entry in the table initializes the global cache and       \
       * calls the appropriate function. */                                  \
      &decltype(hwy::DeduceFunctionCache(&HWY_STATIC_DISPATCH(               \
          FUNC_NAME)))::ChooseAndCall<HWY_DISPATCH_TABLE(FUNC_NAME)>,        \
      HWY_CHOOSE_TARGET_LIST(FUNC_NAME),                                     \
      HWY_CHOOSE_FALLBACK(FUNC_NAME),                                        \
  }

#endif  // HWY_DISPATCH_WORKAROUND

// Both expand to the table entry at the index reported by the chosen target;
// HWY_DYNAMIC_DISPATCH dereferences it so it can be called directly, whereas
// HWY_DYNAMIC_POINTER yields the function pointer itself.
#define HWY_DYNAMIC_DISPATCH(FUNC_NAME) \
  (*(HWY_DISPATCH_TABLE(FUNC_NAME)[hwy::GetChosenTarget().GetIndex()]))
#define HWY_DYNAMIC_POINTER(FUNC_NAME) \
  (HWY_DISPATCH_TABLE(FUNC_NAME)[hwy::GetChosenTarget().GetIndex()])

#endif  // HWY_IDE || ((HWY_TARGETS & (HWY_TARGETS - 1)) == 0)
// DEPRECATED aliases kept for existing callers; each simply forwards to the
// HWY_HAVE_* macro of the same suffix. Please use HWY_HAVE_* instead.
#define HWY_CAP_FLOAT64   HWY_HAVE_FLOAT64
#define HWY_CAP_FLOAT16   HWY_HAVE_FLOAT16
#define HWY_CAP_INTEGER64 HWY_HAVE_INTEGER64
405 #endif // HWY_HIGHWAY_INCLUDED
407 //------------------------------------------------------------------------------
409 // NOTE: the following definitions and ops/*.h depend on HWY_TARGET, so we want
410 // to include them once per target, which is ensured by the toggle check.
411 // Because ops/*.h are included under it, they do not need their own guard.
412 #if defined(HWY_HIGHWAY_PER_TARGET) == defined(HWY_TARGET_TOGGLE)
413 #ifdef HWY_HIGHWAY_PER_TARGET
414 #undef HWY_HIGHWAY_PER_TARGET
416 #define HWY_HIGHWAY_PER_TARGET
419 // These define ops inside namespace hwy::HWY_NAMESPACE.
420 #if HWY_TARGET == HWY_SSE2 || HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
421 #include "hwy/ops/x86_128-inl.h"
422 #elif HWY_TARGET == HWY_AVX2
423 #include "hwy/ops/x86_256-inl.h"
424 #elif HWY_TARGET == HWY_AVX3 || HWY_TARGET == HWY_AVX3_DL || \
425 HWY_TARGET == HWY_AVX3_ZEN4 || HWY_TARGET == HWY_AVX3_SPR
426 #include "hwy/ops/x86_512-inl.h"
427 #elif HWY_TARGET == HWY_Z14 || HWY_TARGET == HWY_Z15 || \
428 HWY_TARGET == HWY_PPC8 || HWY_TARGET == HWY_PPC9 || \
429 HWY_TARGET == HWY_PPC10
430 #include "hwy/ops/ppc_vsx-inl.h"
431 #elif HWY_TARGET == HWY_NEON || HWY_TARGET == HWY_NEON_WITHOUT_AES
432 #include "hwy/ops/arm_neon-inl.h"
433 #elif HWY_TARGET == HWY_SVE || HWY_TARGET == HWY_SVE2 || \
434 HWY_TARGET == HWY_SVE_256 || HWY_TARGET == HWY_SVE2_128
435 #include "hwy/ops/arm_sve-inl.h"
436 #elif HWY_TARGET == HWY_WASM_EMU256
437 #include "hwy/ops/wasm_256-inl.h"
438 #elif HWY_TARGET == HWY_WASM
439 #include "hwy/ops/wasm_128-inl.h"
440 #elif HWY_TARGET == HWY_RVV
441 #include "hwy/ops/rvv-inl.h"
442 #elif HWY_TARGET == HWY_EMU128
443 #include "hwy/ops/emu128-inl.h"
444 #elif HWY_TARGET == HWY_SCALAR
445 #include "hwy/ops/scalar-inl.h"
447 #pragma message("HWY_TARGET does not match any known target")
450 #include "hwy/ops/generic_ops-inl.h"
452 #endif // HWY_HIGHWAY_PER_TARGET