1 /*===---- avxintrin.h - AVX intrinsics -------------------------------------===
3 * Permission is hereby granted, free of charge, to any person obtaining a copy
4 * of this software and associated documentation files (the "Software"), to deal
5 * in the Software without restriction, including without limitation the rights
6 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 * copies of the Software, and to permit persons to whom the Software is
8 * furnished to do so, subject to the following conditions:
10 * The above copyright notice and this permission notice shall be included in
11 * all copies or substantial portions of the Software.
13 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21 *===-----------------------------------------------------------------------===
25 #error "Never use <avxintrin.h> directly; include <immintrin.h> instead."
/* Internal 256-bit vector element types used by the AVX intrinsics below. */
typedef double __v4df __attribute__ ((__vector_size__ (32)));
typedef float __v8sf __attribute__ ((__vector_size__ (32)));
typedef long long __v4di __attribute__ ((__vector_size__ (32)));
typedef int __v8si __attribute__ ((__vector_size__ (32)));
typedef short __v16hi __attribute__ ((__vector_size__ (32)));
typedef char __v32qi __attribute__ ((__vector_size__ (32)));

/* Public 256-bit vector types exposed to users of <immintrin.h>. */
typedef float __m256 __attribute__ ((__vector_size__ (32)));
typedef double __m256d __attribute__((__vector_size__(32)));
typedef long long __m256i __attribute__((__vector_size__(32)));
40 static __inline __m256d
__attribute__((__always_inline__
, __nodebug__
))
41 _mm256_add_pd(__m256d a
, __m256d b
)
46 static __inline __m256
__attribute__((__always_inline__
, __nodebug__
))
47 _mm256_add_ps(__m256 a
, __m256 b
)
52 static __inline __m256d
__attribute__((__always_inline__
, __nodebug__
))
53 _mm256_sub_pd(__m256d a
, __m256d b
)
58 static __inline __m256
__attribute__((__always_inline__
, __nodebug__
))
59 _mm256_sub_ps(__m256 a
, __m256 b
)
64 static __inline __m256d
__attribute__((__always_inline__
, __nodebug__
))
65 _mm256_addsub_pd(__m256d a
, __m256d b
)
67 return (__m256d
)__builtin_ia32_addsubpd256((__v4df
)a
, (__v4df
)b
);
70 static __inline __m256
__attribute__((__always_inline__
, __nodebug__
))
71 _mm256_addsub_ps(__m256 a
, __m256 b
)
73 return (__m256
)__builtin_ia32_addsubps256((__v8sf
)a
, (__v8sf
)b
);
76 static __inline __m256d
__attribute__((__always_inline__
, __nodebug__
))
77 _mm256_div_pd(__m256d a
, __m256d b
)
82 static __inline __m256
__attribute__((__always_inline__
, __nodebug__
))
83 _mm256_div_ps(__m256 a
, __m256 b
)
88 static __inline __m256d
__attribute__((__always_inline__
, __nodebug__
))
89 _mm256_max_pd(__m256d a
, __m256d b
)
91 return (__m256d
)__builtin_ia32_maxpd256((__v4df
)a
, (__v4df
)b
);
94 static __inline __m256
__attribute__((__always_inline__
, __nodebug__
))
95 _mm256_max_ps(__m256 a
, __m256 b
)
97 return (__m256
)__builtin_ia32_maxps256((__v8sf
)a
, (__v8sf
)b
);
100 static __inline __m256d
__attribute__((__always_inline__
, __nodebug__
))
101 _mm256_min_pd(__m256d a
, __m256d b
)
103 return (__m256d
)__builtin_ia32_minpd256((__v4df
)a
, (__v4df
)b
);
106 static __inline __m256
__attribute__((__always_inline__
, __nodebug__
))
107 _mm256_min_ps(__m256 a
, __m256 b
)
109 return (__m256
)__builtin_ia32_minps256((__v8sf
)a
, (__v8sf
)b
);
112 static __inline __m256d
__attribute__((__always_inline__
, __nodebug__
))
113 _mm256_mul_pd(__m256d a
, __m256d b
)
118 static __inline __m256
__attribute__((__always_inline__
, __nodebug__
))
119 _mm256_mul_ps(__m256 a
, __m256 b
)
124 static __inline __m256d
__attribute__((__always_inline__
, __nodebug__
))
125 _mm256_sqrt_pd(__m256d a
)
127 return (__m256d
)__builtin_ia32_sqrtpd256((__v4df
)a
);
130 static __inline __m256
__attribute__((__always_inline__
, __nodebug__
))
131 _mm256_sqrt_ps(__m256 a
)
133 return (__m256
)__builtin_ia32_sqrtps256((__v8sf
)a
);
136 static __inline __m256
__attribute__((__always_inline__
, __nodebug__
))
137 _mm256_rsqrt_ps(__m256 a
)
139 return (__m256
)__builtin_ia32_rsqrtps256((__v8sf
)a
);
142 static __inline __m256
__attribute__((__always_inline__
, __nodebug__
))
143 _mm256_rcp_ps(__m256 a
)
145 return (__m256
)__builtin_ia32_rcpps256((__v8sf
)a
);
148 static __inline __m256d
__attribute__((__always_inline__
, __nodebug__
))
149 _mm256_round_pd(__m256d v
, const int m
)
151 return (__m256d
)__builtin_ia32_roundpd256((__v4df
)v
, m
);
154 static __inline __m256
__attribute__((__always_inline__
, __nodebug__
))
155 _mm256_round_ps(__m256 v
, const int m
)
157 return (__m256
)__builtin_ia32_roundps256((__v8sf
)v
, m
);
/* Convenience wrappers over _mm256_round_* with fixed rounding modes. */
#define _mm256_ceil_pd(V)  _mm256_round_pd((V), _MM_FROUND_CEIL)
#define _mm256_floor_pd(V) _mm256_round_pd((V), _MM_FROUND_FLOOR)
#define _mm256_ceil_ps(V)  _mm256_round_ps((V), _MM_FROUND_CEIL)
#define _mm256_floor_ps(V) _mm256_round_ps((V), _MM_FROUND_FLOOR)
166 static __inline __m256d
__attribute__((__always_inline__
, __nodebug__
))
167 _mm256_and_pd(__m256d a
, __m256d b
)
169 return (__m256d
)((__v4di
)a
& (__v4di
)b
);
172 static __inline __m256
__attribute__((__always_inline__
, __nodebug__
))
173 _mm256_and_ps(__m256 a
, __m256 b
)
175 return (__m256
)((__v8si
)a
& (__v8si
)b
);
178 static __inline __m256d
__attribute__((__always_inline__
, __nodebug__
))
179 _mm256_andnot_pd(__m256d a
, __m256d b
)
181 return (__m256d
)(~(__v4di
)a
& (__v4di
)b
);
184 static __inline __m256
__attribute__((__always_inline__
, __nodebug__
))
185 _mm256_andnot_ps(__m256 a
, __m256 b
)
187 return (__m256
)(~(__v8si
)a
& (__v8si
)b
);
190 static __inline __m256d
__attribute__((__always_inline__
, __nodebug__
))
191 _mm256_or_pd(__m256d a
, __m256d b
)
193 return (__m256d
)((__v4di
)a
| (__v4di
)b
);
196 static __inline __m256
__attribute__((__always_inline__
, __nodebug__
))
197 _mm256_or_ps(__m256 a
, __m256 b
)
199 return (__m256
)((__v8si
)a
| (__v8si
)b
);
202 static __inline __m256d
__attribute__((__always_inline__
, __nodebug__
))
203 _mm256_xor_pd(__m256d a
, __m256d b
)
205 return (__m256d
)((__v4di
)a
^ (__v4di
)b
);
208 static __inline __m256
__attribute__((__always_inline__
, __nodebug__
))
209 _mm256_xor_ps(__m256 a
, __m256 b
)
211 return (__m256
)((__v8si
)a
^ (__v8si
)b
);
214 /* Horizontal arithmetic */
215 static __inline __m256d
__attribute__((__always_inline__
, __nodebug__
))
216 _mm256_hadd_pd(__m256d a
, __m256d b
)
218 return (__m256d
)__builtin_ia32_haddpd256((__v4df
)a
, (__v4df
)b
);
221 static __inline __m256
__attribute__((__always_inline__
, __nodebug__
))
222 _mm256_hadd_ps(__m256 a
, __m256 b
)
224 return (__m256
)__builtin_ia32_haddps256((__v8sf
)a
, (__v8sf
)b
);
227 static __inline __m256d
__attribute__((__always_inline__
, __nodebug__
))
228 _mm256_hsub_pd(__m256d a
, __m256d b
)
230 return (__m256d
)__builtin_ia32_hsubpd256((__v4df
)a
, (__v4df
)b
);
233 static __inline __m256
__attribute__((__always_inline__
, __nodebug__
))
234 _mm256_hsub_ps(__m256 a
, __m256 b
)
236 return (__m256
)__builtin_ia32_hsubps256((__v8sf
)a
, (__v8sf
)b
);
239 /* Vector permutations */
240 static __inline __m128d
__attribute__((__always_inline__
, __nodebug__
))
241 _mm_permutevar_pd(__m128d a
, __m128i c
)
243 return (__m128d
)__builtin_ia32_vpermilvarpd((__v2df
)a
, (__v2di
)c
);
246 static __inline __m256d
__attribute__((__always_inline__
, __nodebug__
))
247 _mm256_permutevar_pd(__m256d a
, __m256i c
)
249 return (__m256d
)__builtin_ia32_vpermilvarpd256((__v4df
)a
, (__v4di
)c
);
252 static __inline __m128
__attribute__((__always_inline__
, __nodebug__
))
253 _mm_permutevar_ps(__m128 a
, __m128i c
)
255 return (__m128
)__builtin_ia32_vpermilvarps((__v4sf
)a
, (__v4si
)c
);
258 static __inline __m256
__attribute__((__always_inline__
, __nodebug__
))
259 _mm256_permutevar_ps(__m256 a
, __m256i c
)
261 return (__m256
)__builtin_ia32_vpermilvarps256((__v8sf
)a
,
265 static __inline __m128d
__attribute__((__always_inline__
, __nodebug__
))
266 _mm_permute_pd(__m128d a
, const int c
)
268 return (__m128d
)__builtin_ia32_vpermilpd((__v2df
)a
, c
);
271 static __inline __m256d
__attribute__((__always_inline__
, __nodebug__
))
272 _mm256_permute_pd(__m256d a
, const int c
)
274 return (__m256d
)__builtin_ia32_vpermilpd256((__v4df
)a
, c
);
277 static __inline __m128
__attribute__((__always_inline__
, __nodebug__
))
278 _mm_permute_ps(__m128 a
, const int c
)
280 return (__m128
)__builtin_ia32_vpermilps((__v4sf
)a
, c
);
283 static __inline __m256
__attribute__((__always_inline__
, __nodebug__
))
284 _mm256_permute_ps(__m256 a
, const int c
)
286 return (__m256
)__builtin_ia32_vpermilps256((__v8sf
)a
, c
);
289 static __inline __m256d
__attribute__((__always_inline__
, __nodebug__
))
290 _mm256_permute2f128_pd(__m256d a
, __m256d b
, const int c
)
292 return (__m256d
)__builtin_ia32_vperm2f128_pd256((__v4df
)a
, (__v4df
)b
, c
);
295 static __inline __m256
__attribute__((__always_inline__
, __nodebug__
))
296 _mm256_permute2f128_ps(__m256 a
, __m256 b
, const int c
)
298 return (__m256
)__builtin_ia32_vperm2f128_ps256((__v8sf
)a
, (__v8sf
)b
, c
);
301 static __inline __m256i
__attribute__((__always_inline__
, __nodebug__
))
302 _mm256_permute2f128_si256(__m256i a
, __m256i b
, const int c
)
304 return (__m256i
)__builtin_ia32_vperm2f128_si256((__v8si
)a
, (__v8si
)b
, c
);
308 static __inline __m256d
__attribute__((__always_inline__
, __nodebug__
))
309 _mm256_blend_pd(__m256d a
, __m256d b
, const int c
)
311 return (__m256d
)__builtin_ia32_blendpd256((__v4df
)a
, (__v4df
)b
, c
);
314 static __inline __m256
__attribute__((__always_inline__
, __nodebug__
))
315 _mm256_blend_ps(__m256 a
, __m256 b
, const int c
)
317 return (__m256
)__builtin_ia32_blendps256((__v8sf
)a
, (__v8sf
)b
, c
);
320 static __inline __m256d
__attribute__((__always_inline__
, __nodebug__
))
321 _mm256_blendv_pd(__m256d a
, __m256d b
, __m256d c
)
323 return (__m256d
)__builtin_ia32_blendvpd256((__v4df
)a
, (__v4df
)b
, (__v4df
)c
);
326 static __inline __m256
__attribute__((__always_inline__
, __nodebug__
))
327 _mm256_blendv_ps(__m256 a
, __m256 b
, __m256 c
)
329 return (__m256
)__builtin_ia32_blendvps256((__v8sf
)a
, (__v8sf
)b
, (__v8sf
)c
);
332 /* Vector Dot Product */
333 static __inline __m256
__attribute__((__always_inline__
, __nodebug__
))
334 _mm256_dp_ps(__m256 a
, __m256 b
, const int c
)
336 return (__m256
)__builtin_ia32_dpps256((__v8sf
)a
, (__v8sf
)b
, c
);
/* Shuffle single-precision elements within each 128-bit lane of a and b.
   BUG FIX: the fifth index was written `(mask) & 0x3 + 4`; `+` binds tighter
   than `&`, so it evaluated as `(mask) & 7` instead of the intended
   `((mask) & 0x3) + 4`. Parenthesized to match the other indices. */
#define _mm256_shuffle_ps(a, b, mask) \
        (__builtin_shufflevector((__v8sf)(a), (__v8sf)(b), \
        (mask) & 0x3, ((mask) & 0xc) >> 2, \
        (((mask) & 0x30) >> 4) + 8, (((mask) & 0xc0) >> 6) + 8, \
        ((mask) & 0x3) + 4, (((mask) & 0xc) >> 2) + 4, \
        (((mask) & 0x30) >> 4) + 12, (((mask) & 0xc0) >> 6) + 12))
/* Shuffle double-precision elements within each 128-bit lane of a and b.
   NOTE(review): the first index line `(mask) & 0x1` was dropped by the
   extraction; reconstructed from upstream clang avxintrin.h. */
#define _mm256_shuffle_pd(a, b, mask) \
        (__builtin_shufflevector((__v4df)(a), (__v4df)(b), \
        (mask) & 0x1, \
        (((mask) & 0x2) >> 1) + 4, \
        (((mask) & 0x4) >> 2) + 2, \
        (((mask) & 0x8) >> 3) + 6))
/* Comparison predicates for _mm_cmp_* / _mm256_cmp_* (VCMPPD/VCMPPS imm8).
   OQ/UQ = ordered/unordered, quiet (non-signaling); OS/US = signaling. */
#define _CMP_EQ_OQ    0x00 /* Equal (ordered, non-signaling)  */
#define _CMP_LT_OS    0x01 /* Less-than (ordered, signaling)  */
#define _CMP_LE_OS    0x02 /* Less-than-or-equal (ordered, signaling)  */
#define _CMP_UNORD_Q  0x03 /* Unordered (non-signaling)  */
#define _CMP_NEQ_UQ   0x04 /* Not-equal (unordered, non-signaling)  */
#define _CMP_NLT_US   0x05 /* Not-less-than (unordered, signaling)  */
#define _CMP_NLE_US   0x06 /* Not-less-than-or-equal (unordered, signaling)  */
#define _CMP_ORD_Q    0x07 /* Ordered (non-signaling)   */
#define _CMP_EQ_UQ    0x08 /* Equal (unordered, non-signaling)  */
#define _CMP_NGE_US   0x09 /* Not-greater-than-or-equal (unordered, signaling)  */
#define _CMP_NGT_US   0x0a /* Not-greater-than (unordered, signaling)  */
#define _CMP_FALSE_OQ 0x0b /* False (ordered, non-signaling)  */
#define _CMP_NEQ_OQ   0x0c /* Not-equal (ordered, non-signaling)  */
#define _CMP_GE_OS    0x0d /* Greater-than-or-equal (ordered, signaling)  */
#define _CMP_GT_OS    0x0e /* Greater-than (ordered, signaling)  */
#define _CMP_TRUE_UQ  0x0f /* True (unordered, non-signaling)  */
#define _CMP_EQ_OS    0x10 /* Equal (ordered, signaling)  */
#define _CMP_LT_OQ    0x11 /* Less-than (ordered, non-signaling)  */
#define _CMP_LE_OQ    0x12 /* Less-than-or-equal (ordered, non-signaling)  */
#define _CMP_UNORD_S  0x13 /* Unordered (signaling)  */
#define _CMP_NEQ_US   0x14 /* Not-equal (unordered, signaling)  */
#define _CMP_NLT_UQ   0x15 /* Not-less-than (unordered, non-signaling)  */
#define _CMP_NLE_UQ   0x16 /* Not-less-than-or-equal (unordered, non-signaling)  */
#define _CMP_ORD_S    0x17 /* Ordered (signaling)  */
#define _CMP_EQ_US    0x18 /* Equal (unordered, signaling)  */
#define _CMP_NGE_UQ   0x19 /* Not-greater-than-or-equal (unordered, non-signaling)  */
#define _CMP_NGT_UQ   0x1a /* Not-greater-than (unordered, non-signaling)  */
#define _CMP_FALSE_OS 0x1b /* False (ordered, signaling)  */
#define _CMP_NEQ_OS   0x1c /* Not-equal (ordered, signaling)  */
#define _CMP_GE_OQ    0x1d /* Greater-than-or-equal (ordered, non-signaling)  */
#define _CMP_GT_OQ    0x1e /* Greater-than (ordered, non-signaling)  */
#define _CMP_TRUE_US  0x1f /* True (unordered, signaling)  */
388 static __inline __m128d
__attribute__((__always_inline__
, __nodebug__
))
389 _mm_cmp_pd(__m128d a
, __m128d b
, const int c
)
391 return (__m128d
)__builtin_ia32_cmppd((__v2df
)a
, (__v2df
)b
, c
);
394 static __inline __m128
__attribute__((__always_inline__
, __nodebug__
))
395 _mm_cmp_ps(__m128 a
, __m128 b
, const int c
)
397 return (__m128
)__builtin_ia32_cmpps((__v4sf
)a
, (__v4sf
)b
, c
);
400 static __inline __m256d
__attribute__((__always_inline__
, __nodebug__
))
401 _mm256_cmp_pd(__m256d a
, __m256d b
, const int c
)
403 return (__m256d
)__builtin_ia32_cmppd256((__v4df
)a
, (__v4df
)b
, c
);
406 static __inline __m256
__attribute__((__always_inline__
, __nodebug__
))
407 _mm256_cmp_ps(__m256 a
, __m256 b
, const int c
)
409 return (__m256
)__builtin_ia32_cmpps256((__v8sf
)a
, (__v8sf
)b
, c
);
412 static __inline __m128d
__attribute__((__always_inline__
, __nodebug__
))
413 _mm_cmp_sd(__m128d a
, __m128d b
, const int c
)
415 return (__m128d
)__builtin_ia32_cmpsd((__v2df
)a
, (__v2df
)b
, c
);
418 static __inline __m128
__attribute__((__always_inline__
, __nodebug__
))
419 _mm_cmp_ss(__m128 a
, __m128 b
, const int c
)
421 return (__m128
)__builtin_ia32_cmpss((__v4sf
)a
, (__v4sf
)b
, c
);
425 static __inline __m128d
__attribute__((__always_inline__
, __nodebug__
))
426 _mm256_extractf128_pd(__m256d a
, const int o
)
428 return (__m128d
)__builtin_ia32_vextractf128_pd256((__v4df
)a
, o
);
431 static __inline __m128
__attribute__((__always_inline__
, __nodebug__
))
432 _mm256_extractf128_ps(__m256 a
, const int o
)
434 return (__m128
)__builtin_ia32_vextractf128_ps256((__v8sf
)a
, o
);
437 static __inline __m128i
__attribute__((__always_inline__
, __nodebug__
))
438 _mm256_extractf128_si256(__m256i a
, const int o
)
440 return (__m128i
)__builtin_ia32_vextractf128_si256((__v8si
)a
, o
);
443 static __inline
int __attribute__((__always_inline__
, __nodebug__
))
444 _mm256_extract_epi32(__m256i a
, int const imm
)
446 __v8si b
= (__v8si
)a
;
450 static __inline
int __attribute__((__always_inline__
, __nodebug__
))
451 _mm256_extract_epi16(__m256i a
, int const imm
)
453 __v16hi b
= (__v16hi
)a
;
457 static __inline
int __attribute__((__always_inline__
, __nodebug__
))
458 _mm256_extract_epi8(__m256i a
, int const imm
)
460 __v32qi b
= (__v32qi
)a
;
465 static __inline
long long __attribute__((__always_inline__
, __nodebug__
))
466 _mm256_extract_epi64(__m256i a
, const int imm
)
468 __v4di b
= (__v4di
)a
;
474 static __inline __m256d
__attribute__((__always_inline__
, __nodebug__
))
475 _mm256_insertf128_pd(__m256d a
, __m128d b
, const int o
)
477 return (__m256d
)__builtin_ia32_vinsertf128_pd256((__v4df
)a
, (__v2df
)b
, o
);
480 static __inline __m256
__attribute__((__always_inline__
, __nodebug__
))
481 _mm256_insertf128_ps(__m256 a
, __m128 b
, const int o
)
483 return (__m256
)__builtin_ia32_vinsertf128_ps256((__v8sf
)a
, (__v4sf
)b
, o
);
486 static __inline __m256i
__attribute__((__always_inline__
, __nodebug__
))
487 _mm256_insertf128_si256(__m256i a
, __m128i b
, const int o
)
489 return (__m256i
)__builtin_ia32_vinsertf128_si256((__v8si
)a
, (__v4si
)b
, o
);
492 static __inline __m256i
__attribute__((__always_inline__
, __nodebug__
))
493 _mm256_insert_epi32(__m256i a
, int b
, int const imm
)
495 __v8si c
= (__v8si
)a
;
500 static __inline __m256i
__attribute__((__always_inline__
, __nodebug__
))
501 _mm256_insert_epi16(__m256i a
, int b
, int const imm
)
503 __v16hi c
= (__v16hi
)a
;
508 static __inline __m256i
__attribute__((__always_inline__
, __nodebug__
))
509 _mm256_insert_epi8(__m256i a
, int b
, int const imm
)
511 __v32qi c
= (__v32qi
)a
;
517 static __inline __m256i
__attribute__((__always_inline__
, __nodebug__
))
518 _mm256_insert_epi64(__m256i a
, int b
, int const imm
)
520 __v4di c
= (__v4di
)a
;
527 static __inline __m256d
__attribute__((__always_inline__
, __nodebug__
))
528 _mm256_cvtepi32_pd(__m128i a
)
530 return (__m256d
)__builtin_ia32_cvtdq2pd256((__v4si
) a
);
533 static __inline __m256
__attribute__((__always_inline__
, __nodebug__
))
534 _mm256_cvtepi32_ps(__m256i a
)
536 return (__m256
)__builtin_ia32_cvtdq2ps256((__v8si
) a
);
539 static __inline __m128
__attribute__((__always_inline__
, __nodebug__
))
540 _mm256_cvtpd_ps(__m256d a
)
542 return (__m128
)__builtin_ia32_cvtpd2ps256((__v4df
) a
);
545 static __inline __m256i
__attribute__((__always_inline__
, __nodebug__
))
546 _mm256_cvtps_epi32(__m256 a
)
548 return (__m256i
)__builtin_ia32_cvtps2dq256((__v8sf
) a
);
551 static __inline __m256d
__attribute__((__always_inline__
, __nodebug__
))
552 _mm256_cvtps_pd(__m128 a
)
554 return (__m256d
)__builtin_ia32_cvtps2pd256((__v4sf
) a
);
557 static __inline __m128i
__attribute__((__always_inline__
, __nodebug__
))
558 _mm256_cvttpd_epi32(__m256d a
)
560 return (__m128i
)__builtin_ia32_cvttpd2dq256((__v4df
) a
);
563 static __inline __m128i
__attribute__((__always_inline__
, __nodebug__
))
564 _mm256_cvtpd_epi32(__m256d a
)
566 return (__m128i
)__builtin_ia32_cvtpd2dq256((__v4df
) a
);
569 static __inline __m256i
__attribute__((__always_inline__
, __nodebug__
))
570 _mm256_cvttps_epi32(__m256 a
)
572 return (__m256i
)__builtin_ia32_cvttps2dq256((__v8sf
) a
);
575 /* Vector replicate */
576 static __inline __m256
__attribute__((__always_inline__
, __nodebug__
))
577 _mm256_movehdup_ps(__m256 a
)
579 return __builtin_shufflevector(a
, a
, 1, 1, 3, 3, 5, 5, 7, 7);
582 static __inline __m256
__attribute__((__always_inline__
, __nodebug__
))
583 _mm256_moveldup_ps(__m256 a
)
585 return __builtin_shufflevector(a
, a
, 0, 0, 2, 2, 4, 4, 6, 6);
588 static __inline __m256d
__attribute__((__always_inline__
, __nodebug__
))
589 _mm256_movedup_pd(__m256d a
)
591 return __builtin_shufflevector(a
, a
, 0, 0, 2, 2);
594 /* Unpack and Interleave */
595 static __inline __m256d
__attribute__((__always_inline__
, __nodebug__
))
596 _mm256_unpackhi_pd(__m256d a
, __m256d b
)
598 return __builtin_shufflevector(a
, b
, 1, 5, 1+2, 5+2);
601 static __inline __m256d
__attribute__((__always_inline__
, __nodebug__
))
602 _mm256_unpacklo_pd(__m256d a
, __m256d b
)
604 return __builtin_shufflevector(a
, b
, 0, 4, 0+2, 4+2);
607 static __inline __m256
__attribute__((__always_inline__
, __nodebug__
))
608 _mm256_unpackhi_ps(__m256 a
, __m256 b
)
610 return __builtin_shufflevector(a
, b
, 2, 10, 2+1, 10+1, 6, 14, 6+1, 14+1);
613 static __inline __m256
__attribute__((__always_inline__
, __nodebug__
))
614 _mm256_unpacklo_ps(__m256 a
, __m256 b
)
616 return __builtin_shufflevector(a
, b
, 0, 8, 0+1, 8+1, 4, 12, 4+1, 12+1);
620 static __inline
int __attribute__((__always_inline__
, __nodebug__
))
621 _mm_testz_pd(__m128d a
, __m128d b
)
623 return __builtin_ia32_vtestzpd((__v2df
)a
, (__v2df
)b
);
626 static __inline
int __attribute__((__always_inline__
, __nodebug__
))
627 _mm_testc_pd(__m128d a
, __m128d b
)
629 return __builtin_ia32_vtestcpd((__v2df
)a
, (__v2df
)b
);
632 static __inline
int __attribute__((__always_inline__
, __nodebug__
))
633 _mm_testnzc_pd(__m128d a
, __m128d b
)
635 return __builtin_ia32_vtestnzcpd((__v2df
)a
, (__v2df
)b
);
638 static __inline
int __attribute__((__always_inline__
, __nodebug__
))
639 _mm_testz_ps(__m128 a
, __m128 b
)
641 return __builtin_ia32_vtestzps((__v4sf
)a
, (__v4sf
)b
);
644 static __inline
int __attribute__((__always_inline__
, __nodebug__
))
645 _mm_testc_ps(__m128 a
, __m128 b
)
647 return __builtin_ia32_vtestcps((__v4sf
)a
, (__v4sf
)b
);
650 static __inline
int __attribute__((__always_inline__
, __nodebug__
))
651 _mm_testnzc_ps(__m128 a
, __m128 b
)
653 return __builtin_ia32_vtestnzcps((__v4sf
)a
, (__v4sf
)b
);
656 static __inline
int __attribute__((__always_inline__
, __nodebug__
))
657 _mm256_testz_pd(__m256d a
, __m256d b
)
659 return __builtin_ia32_vtestzpd256((__v4df
)a
, (__v4df
)b
);
662 static __inline
int __attribute__((__always_inline__
, __nodebug__
))
663 _mm256_testc_pd(__m256d a
, __m256d b
)
665 return __builtin_ia32_vtestcpd256((__v4df
)a
, (__v4df
)b
);
668 static __inline
int __attribute__((__always_inline__
, __nodebug__
))
669 _mm256_testnzc_pd(__m256d a
, __m256d b
)
671 return __builtin_ia32_vtestnzcpd256((__v4df
)a
, (__v4df
)b
);
674 static __inline
int __attribute__((__always_inline__
, __nodebug__
))
675 _mm256_testz_ps(__m256 a
, __m256 b
)
677 return __builtin_ia32_vtestzps256((__v8sf
)a
, (__v8sf
)b
);
680 static __inline
int __attribute__((__always_inline__
, __nodebug__
))
681 _mm256_testc_ps(__m256 a
, __m256 b
)
683 return __builtin_ia32_vtestcps256((__v8sf
)a
, (__v8sf
)b
);
686 static __inline
int __attribute__((__always_inline__
, __nodebug__
))
687 _mm256_testnzc_ps(__m256 a
, __m256 b
)
689 return __builtin_ia32_vtestnzcps256((__v8sf
)a
, (__v8sf
)b
);
692 static __inline
int __attribute__((__always_inline__
, __nodebug__
))
693 _mm256_testz_si256(__m256i a
, __m256i b
)
695 return __builtin_ia32_ptestz256((__v4di
)a
, (__v4di
)b
);
698 static __inline
int __attribute__((__always_inline__
, __nodebug__
))
699 _mm256_testc_si256(__m256i a
, __m256i b
)
701 return __builtin_ia32_ptestc256((__v4di
)a
, (__v4di
)b
);
704 static __inline
int __attribute__((__always_inline__
, __nodebug__
))
705 _mm256_testnzc_si256(__m256i a
, __m256i b
)
707 return __builtin_ia32_ptestnzc256((__v4di
)a
, (__v4di
)b
);
710 /* Vector extract sign mask */
711 static __inline
int __attribute__((__always_inline__
, __nodebug__
))
712 _mm256_movemask_pd(__m256d a
)
714 return __builtin_ia32_movmskpd256((__v4df
)a
);
717 static __inline
int __attribute__((__always_inline__
, __nodebug__
))
718 _mm256_movemask_ps(__m256 a
)
720 return __builtin_ia32_movmskps256((__v8sf
)a
);
/* Zero all YMM registers (zeroall) or just their upper 128 bits (zeroupper).
   NOTE(review): the _mm256_zeroall signature line was dropped by the
   extraction; reconstructed from upstream clang avxintrin.h. */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_zeroall(void)
{
  __builtin_ia32_vzeroall();
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_zeroupper(void)
{
  __builtin_ia32_vzeroupper();
}
736 /* Vector load with broadcast */
737 static __inline __m128
__attribute__((__always_inline__
, __nodebug__
))
738 _mm_broadcast_ss(float const *a
)
740 return (__m128
)__builtin_ia32_vbroadcastss(a
);
743 static __inline __m256d
__attribute__((__always_inline__
, __nodebug__
))
744 _mm256_broadcast_sd(double const *a
)
746 return (__m256d
)__builtin_ia32_vbroadcastsd256(a
);
749 static __inline __m256
__attribute__((__always_inline__
, __nodebug__
))
750 _mm256_broadcast_ss(float const *a
)
752 return (__m256
)__builtin_ia32_vbroadcastss256(a
);
755 static __inline __m256d
__attribute__((__always_inline__
, __nodebug__
))
756 _mm256_broadcast_pd(__m128d
const *a
)
758 return (__m256d
)__builtin_ia32_vbroadcastf128_pd256(a
);
761 static __inline __m256
__attribute__((__always_inline__
, __nodebug__
))
762 _mm256_broadcast_ps(__m128
const *a
)
764 return (__m256
)__builtin_ia32_vbroadcastf128_ps256(a
);
768 static __inline __m256d
__attribute__((__always_inline__
, __nodebug__
))
769 _mm256_load_pd(double const *p
)
771 return *(__m256d
*)p
;
774 static __inline __m256
__attribute__((__always_inline__
, __nodebug__
))
775 _mm256_load_ps(float const *p
)
780 static __inline __m256d
__attribute__((__always_inline__
, __nodebug__
))
781 _mm256_loadu_pd(double const *p
)
783 return (__m256d
)__builtin_ia32_loadupd256(p
);
786 static __inline __m256
__attribute__((__always_inline__
, __nodebug__
))
787 _mm256_loadu_ps(float const *p
)
789 return (__m256
)__builtin_ia32_loadups256(p
);
792 static __inline __m256i
__attribute__((__always_inline__
, __nodebug__
))
793 _mm256_load_si256(__m256i
const *p
)
798 static __inline __m256i
__attribute__((__always_inline__
, __nodebug__
))
799 _mm256_loadu_si256(__m256i
const *p
)
801 return (__m256i
)__builtin_ia32_loaddqu256((char const *)p
);
804 static __inline __m256i
__attribute__((__always_inline__
, __nodebug__
))
805 _mm256_lddqu_si256(__m256i
const *p
)
807 return (__m256i
)__builtin_ia32_lddqu256((char const *)p
);
811 static __inline
void __attribute__((__always_inline__
, __nodebug__
))
812 _mm256_store_pd(double *p
, __m256d a
)
817 static __inline
void __attribute__((__always_inline__
, __nodebug__
))
818 _mm256_store_ps(float *p
, __m256 a
)
823 static __inline
void __attribute__((__always_inline__
, __nodebug__
))
824 _mm256_storeu_pd(double *p
, __m256d a
)
826 __builtin_ia32_storeupd256(p
, (__v4df
)a
);
829 static __inline
void __attribute__((__always_inline__
, __nodebug__
))
830 _mm256_storeu_ps(float *p
, __m256 a
)
832 __builtin_ia32_storeups256(p
, (__v8sf
)a
);
835 static __inline
void __attribute__((__always_inline__
, __nodebug__
))
836 _mm256_store_si256(__m256i
*p
, __m256i a
)
841 static __inline
void __attribute__((__always_inline__
, __nodebug__
))
842 _mm256_storeu_si256(__m256i
*p
, __m256i a
)
844 __builtin_ia32_storedqu256((char *)p
, (__v32qi
)a
);
847 /* Conditional load ops */
848 static __inline __m128d
__attribute__((__always_inline__
, __nodebug__
))
849 _mm_maskload_pd(double const *p
, __m128d m
)
851 return (__m128d
)__builtin_ia32_maskloadpd((const __v2df
*)p
, (__v2df
)m
);
854 static __inline __m256d
__attribute__((__always_inline__
, __nodebug__
))
855 _mm256_maskload_pd(double const *p
, __m256d m
)
857 return (__m256d
)__builtin_ia32_maskloadpd256((const __v4df
*)p
, (__v4df
)m
);
860 static __inline __m128
__attribute__((__always_inline__
, __nodebug__
))
861 _mm_maskload_ps(float const *p
, __m128 m
)
863 return (__m128
)__builtin_ia32_maskloadps((const __v4sf
*)p
, (__v4sf
)m
);
866 static __inline __m256
__attribute__((__always_inline__
, __nodebug__
))
867 _mm256_maskload_ps(float const *p
, __m256 m
)
869 return (__m256
)__builtin_ia32_maskloadps256((const __v8sf
*)p
, (__v8sf
)m
);
872 /* Conditional store ops */
873 static __inline
void __attribute__((__always_inline__
, __nodebug__
))
874 _mm256_maskstore_ps(float *p
, __m256 m
, __m256 a
)
876 __builtin_ia32_maskstoreps256((__v8sf
*)p
, (__v8sf
)m
, (__v8sf
)a
);
879 static __inline
void __attribute__((__always_inline__
, __nodebug__
))
880 _mm_maskstore_pd(double *p
, __m128d m
, __m128d a
)
882 __builtin_ia32_maskstorepd((__v2df
*)p
, (__v2df
)m
, (__v2df
)a
);
885 static __inline
void __attribute__((__always_inline__
, __nodebug__
))
886 _mm256_maskstore_pd(double *p
, __m256d m
, __m256d a
)
888 __builtin_ia32_maskstorepd256((__v4df
*)p
, (__v4df
)m
, (__v4df
)a
);
891 static __inline
void __attribute__((__always_inline__
, __nodebug__
))
892 _mm_maskstore_ps(float *p
, __m128 m
, __m128 a
)
894 __builtin_ia32_maskstoreps((__v4sf
*)p
, (__v4sf
)m
, (__v4sf
)a
);
897 /* Cacheability support ops */
898 static __inline
void __attribute__((__always_inline__
, __nodebug__
))
899 _mm256_stream_si256(__m256i
*a
, __m256i b
)
901 __builtin_ia32_movntdq256((__v4di
*)a
, (__v4di
)b
);
904 static __inline
void __attribute__((__always_inline__
, __nodebug__
))
905 _mm256_stream_pd(double *a
, __m256d b
)
907 __builtin_ia32_movntpd256(a
, (__v4df
)b
);
910 static __inline
void __attribute__((__always_inline__
, __nodebug__
))
911 _mm256_stream_ps(float *p
, __m256 a
)
913 __builtin_ia32_movntps256(p
, (__v8sf
)a
);
917 static __inline __m256d
__attribute__((__always_inline__
, __nodebug__
))
918 _mm256_set_pd(double a
, double b
, double c
, double d
)
920 return (__m256d
){ d
, c
, b
, a
};
923 static __inline __m256
__attribute__((__always_inline__
, __nodebug__
))
924 _mm256_set_ps(float a
, float b
, float c
, float d
,
925 float e
, float f
, float g
, float h
)
927 return (__m256
){ h
, g
, f
, e
, d
, c
, b
, a
};
930 static __inline __m256i
__attribute__((__always_inline__
, __nodebug__
))
931 _mm256_set_epi32(int i0
, int i1
, int i2
, int i3
,
932 int i4
, int i5
, int i6
, int i7
)
934 return (__m256i
)(__v8si
){ i7
, i6
, i5
, i4
, i3
, i2
, i1
, i0
};
937 static __inline __m256i
__attribute__((__always_inline__
, __nodebug__
))
938 _mm256_set_epi16(short w15
, short w14
, short w13
, short w12
,
939 short w11
, short w10
, short w09
, short w08
,
940 short w07
, short w06
, short w05
, short w04
,
941 short w03
, short w02
, short w01
, short w00
)
943 return (__m256i
)(__v16hi
){ w00
, w01
, w02
, w03
, w04
, w05
, w06
, w07
,
944 w08
, w09
, w10
, w11
, w12
, w13
, w14
, w15
};
947 static __inline __m256i
__attribute__((__always_inline__
, __nodebug__
))
948 _mm256_set_epi8(char b31
, char b30
, char b29
, char b28
,
949 char b27
, char b26
, char b25
, char b24
,
950 char b23
, char b22
, char b21
, char b20
,
951 char b19
, char b18
, char b17
, char b16
,
952 char b15
, char b14
, char b13
, char b12
,
953 char b11
, char b10
, char b09
, char b08
,
954 char b07
, char b06
, char b05
, char b04
,
955 char b03
, char b02
, char b01
, char b00
)
957 return (__m256i
)(__v32qi
){
958 b00
, b01
, b02
, b03
, b04
, b05
, b06
, b07
,
959 b08
, b09
, b10
, b11
, b12
, b13
, b14
, b15
,
960 b16
, b17
, b18
, b19
, b20
, b21
, b22
, b23
,
961 b24
, b25
, b26
, b27
, b28
, b29
, b30
, b31
965 static __inline __m256i
__attribute__((__always_inline__
, __nodebug__
))
966 _mm256_set_epi64x(long long a
, long long b
, long long c
, long long d
)
968 return (__m256i
)(__v4di
){ d
, c
, b
, a
};
971 /* Create vectors with elements in reverse order */
972 static __inline __m256d
__attribute__((__always_inline__
, __nodebug__
))
973 _mm256_setr_pd(double a
, double b
, double c
, double d
)
975 return (__m256d
){ a
, b
, c
, d
};
978 static __inline __m256
__attribute__((__always_inline__
, __nodebug__
))
979 _mm256_setr_ps(float a
, float b
, float c
, float d
,
980 float e
, float f
, float g
, float h
)
982 return (__m256
){ a
, b
, c
, d
, e
, f
, g
, h
};
985 static __inline __m256i
__attribute__((__always_inline__
, __nodebug__
))
986 _mm256_setr_epi32(int i0
, int i1
, int i2
, int i3
,
987 int i4
, int i5
, int i6
, int i7
)
989 return (__m256i
)(__v8si
){ i0
, i1
, i2
, i3
, i4
, i5
, i6
, i7
};
992 static __inline __m256i
__attribute__((__always_inline__
, __nodebug__
))
993 _mm256_setr_epi16(short w15
, short w14
, short w13
, short w12
,
994 short w11
, short w10
, short w09
, short w08
,
995 short w07
, short w06
, short w05
, short w04
,
996 short w03
, short w02
, short w01
, short w00
)
998 return (__m256i
)(__v16hi
){ w15
, w14
, w13
, w12
, w11
, w10
, w09
, w08
,
999 w07
, w06
, w05
, w04
, w03
, w02
, w01
, w00
};
1002 static __inline __m256i
__attribute__((__always_inline__
, __nodebug__
))
1003 _mm256_setr_epi8(char b31
, char b30
, char b29
, char b28
,
1004 char b27
, char b26
, char b25
, char b24
,
1005 char b23
, char b22
, char b21
, char b20
,
1006 char b19
, char b18
, char b17
, char b16
,
1007 char b15
, char b14
, char b13
, char b12
,
1008 char b11
, char b10
, char b09
, char b08
,
1009 char b07
, char b06
, char b05
, char b04
,
1010 char b03
, char b02
, char b01
, char b00
)
1012 return (__m256i
)(__v32qi
){
1013 b31
, b30
, b29
, b28
, b27
, b26
, b25
, b24
,
1014 b23
, b22
, b21
, b20
, b19
, b18
, b17
, b16
,
1015 b15
, b14
, b13
, b12
, b11
, b10
, b09
, b08
,
1016 b07
, b06
, b05
, b04
, b03
, b02
, b01
, b00
};
1019 static __inline __m256i
__attribute__((__always_inline__
, __nodebug__
))
1020 _mm256_setr_epi64x(long long a
, long long b
, long long c
, long long d
)
1022 return (__m256i
)(__v4di
){ a
, b
, c
, d
};
1025 /* Create vectors with repeated elements */
1026 static __inline __m256d
__attribute__((__always_inline__
, __nodebug__
))
1027 _mm256_set1_pd(double w
)
1029 return (__m256d
){ w
, w
, w
, w
};
1032 static __inline __m256
__attribute__((__always_inline__
, __nodebug__
))
1033 _mm256_set1_ps(float w
)
1035 return (__m256
){ w
, w
, w
, w
, w
, w
, w
, w
};
1038 static __inline __m256i
__attribute__((__always_inline__
, __nodebug__
))
1039 _mm256_set1_epi32(int i
)
1041 return (__m256i
)(__v8si
){ i
, i
, i
, i
, i
, i
, i
, i
};
1044 static __inline __m256i
__attribute__((__always_inline__
, __nodebug__
))
1045 _mm256_set1_epi16(short w
)
1047 return (__m256i
)(__v16hi
){ w
, w
, w
, w
, w
, w
, w
, w
, w
, w
, w
, w
, w
, w
, w
, w
};
1050 static __inline __m256i
__attribute__((__always_inline__
, __nodebug__
))
1051 _mm256_set1_epi8(char b
)
1053 return (__m256i
)(__v32qi
){ b
, b
, b
, b
, b
, b
, b
, b
, b
, b
, b
, b
, b
, b
, b
, b
,
1054 b
, b
, b
, b
, b
, b
, b
, b
, b
, b
, b
, b
, b
, b
, b
, b
};
1057 static __inline __m256i
__attribute__((__always_inline__
, __nodebug__
))
1058 _mm256_set1_epi64x(long long q
)
1060 return (__m256i
)(__v4di
){ q
, q
, q
, q
};
1063 /* Create zeroed vectors */
1064 static __inline __m256d
__attribute__((__always_inline__
, __nodebug__
))
1065 _mm256_setzero_pd(void)
1067 return (__m256d
){ 0, 0, 0, 0 };
1070 static __inline __m256
__attribute__((__always_inline__
, __nodebug__
))
1071 _mm256_setzero_ps(void)
1073 return (__m256
){ 0, 0, 0, 0, 0, 0, 0, 0 };
1076 static __inline __m256i
__attribute__((__always_inline__
, __nodebug__
))
1077 _mm256_setzero_si256(void)
1079 return (__m256i
){ 0LL, 0LL, 0LL, 0LL };
1082 /* Cast between vector types */
1083 static __inline __m256
__attribute__((__always_inline__
, __nodebug__
))
1084 _mm256_castpd_ps(__m256d in
)
1089 static __inline __m256i
__attribute__((__always_inline__
, __nodebug__
))
1090 _mm256_castpd_si256(__m256d in
)
1095 static __inline __m256d
__attribute__((__always_inline__
, __nodebug__
))
1096 _mm256_castps_pd(__m256 in
)
1101 static __inline __m256i
__attribute__((__always_inline__
, __nodebug__
))
1102 _mm256_castps_si256(__m256 in
)
1107 static __inline __m256
__attribute__((__always_inline__
, __nodebug__
))
1108 _mm256_castsi256_ps(__m256i in
)
1113 static __inline __m256d
__attribute__((__always_inline__
, __nodebug__
))
1114 _mm256_castsi256_pd(__m256i in
)
1119 static __inline __m128d
__attribute__((__always_inline__
, __nodebug__
))
1120 _mm256_castpd256_pd128(__m256d in
)
1122 return __builtin_shufflevector(in
, in
, 0, 1);
1125 static __inline __m128
__attribute__((__always_inline__
, __nodebug__
))
1126 _mm256_castps256_ps128(__m256 in
)
1128 return __builtin_shufflevector(in
, in
, 0, 1, 2, 3);
1131 static __inline __m128i
__attribute__((__always_inline__
, __nodebug__
))
1132 _mm256_castsi256_si128(__m256i in
)
1134 return __builtin_shufflevector(in
, in
, 0, 1);
1137 static __inline __m256d
__attribute__((__always_inline__
, __nodebug__
))
1138 _mm256_castpd128_pd256(__m128d in
)
1140 __m128d zero
= _mm_setzero_pd();
1141 return __builtin_shufflevector(in
, zero
, 0, 1, 2, 2);
1144 static __inline __m256
__attribute__((__always_inline__
, __nodebug__
))
1145 _mm256_castps128_ps256(__m128 in
)
1147 __m128 zero
= _mm_setzero_ps();
1148 return __builtin_shufflevector(in
, zero
, 0, 1, 2, 3, 4, 4, 4, 4);
1151 static __inline __m256i
__attribute__((__always_inline__
, __nodebug__
))
1152 _mm256_castsi128_si256(__m128i in
)
1154 __m128i zero
= _mm_setzero_si128();
1155 return __builtin_shufflevector(in
, zero
, 0, 1, 2, 2);