/* Copyright (C) 2008-2024 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 11.0.  */
#ifndef _IMMINTRIN_H_INCLUDED
# error "Never use <avxintrin.h> directly; include <immintrin.h> instead."
#endif

#ifndef _AVXINTRIN_H_INCLUDED
#define _AVXINTRIN_H_INCLUDED

#ifndef __AVX__
#pragma GCC push_options
#pragma GCC target("avx")
#define __DISABLE_AVX__
#endif /* __AVX__ */
/* Internal data types for implementing the intrinsics.  */
typedef double __v4df __attribute__ ((__vector_size__ (32)));
typedef float __v8sf __attribute__ ((__vector_size__ (32)));
typedef long long __v4di __attribute__ ((__vector_size__ (32)));
typedef unsigned long long __v4du __attribute__ ((__vector_size__ (32)));
typedef int __v8si __attribute__ ((__vector_size__ (32)));
typedef unsigned int __v8su __attribute__ ((__vector_size__ (32)));
typedef short __v16hi __attribute__ ((__vector_size__ (32)));
typedef unsigned short __v16hu __attribute__ ((__vector_size__ (32)));
typedef char __v32qi __attribute__ ((__vector_size__ (32)));
typedef signed char __v32qs __attribute__ ((__vector_size__ (32)));
typedef unsigned char __v32qu __attribute__ ((__vector_size__ (32)));
/* The Intel API is flexible enough that we must allow aliasing with other
   vector types, and their scalar components.  */
typedef float __m256 __attribute__ ((__vector_size__ (32),
                                     __may_alias__));
typedef long long __m256i __attribute__ ((__vector_size__ (32),
                                          __may_alias__));
typedef double __m256d __attribute__ ((__vector_size__ (32),
                                       __may_alias__));

/* Unaligned versions of the same types.  */
typedef float __m256_u __attribute__ ((__vector_size__ (32),
                                       __may_alias__,
                                       __aligned__ (1)));
typedef long long __m256i_u __attribute__ ((__vector_size__ (32),
                                            __may_alias__,
                                            __aligned__ (1)));
typedef double __m256d_u __attribute__ ((__vector_size__ (32),
                                         __may_alias__,
                                         __aligned__ (1)));
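
/* Illustrative sketch (not part of the original header): __may_alias__
   permits accessing the same storage through these vector types and
   their element types, and the _u variants drop the 32-byte alignment
   requirement so unaligned accesses are well defined:

     _Alignas (32) double buf[8] = { 0.0 };
     __m256d a = *(__m256d *) buf;            aligned access: OK
     __m256d u = *(__m256d_u *) (buf + 1);    unaligned access: OK via _u

   A plain (__m256d *) dereference of an unaligned address would be
   undefined behavior and may fault at run time.  */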
/* Compare predicates for scalar and packed compare intrinsics.  */

/* Equal (unordered, non-signaling)  */
#define _CMP_EQ_UQ 0x08
/* Not-greater-than-or-equal (unordered, signaling)  */
#define _CMP_NGE_US 0x09
/* Not-greater-than (unordered, signaling)  */
#define _CMP_NGT_US 0x0a
/* False (ordered, non-signaling)  */
#define _CMP_FALSE_OQ 0x0b
/* Not-equal (ordered, non-signaling)  */
#define _CMP_NEQ_OQ 0x0c
/* Greater-than-or-equal (ordered, signaling)  */
#define _CMP_GE_OS 0x0d
/* Greater-than (ordered, signaling)  */
#define _CMP_GT_OS 0x0e
/* True (unordered, non-signaling)  */
#define _CMP_TRUE_UQ 0x0f
/* Equal (ordered, signaling)  */
#define _CMP_EQ_OS 0x10
/* Less-than (ordered, non-signaling)  */
#define _CMP_LT_OQ 0x11
/* Less-than-or-equal (ordered, non-signaling)  */
#define _CMP_LE_OQ 0x12
/* Unordered (signaling)  */
#define _CMP_UNORD_S 0x13
/* Not-equal (unordered, signaling)  */
#define _CMP_NEQ_US 0x14
/* Not-less-than (unordered, non-signaling)  */
#define _CMP_NLT_UQ 0x15
/* Not-less-than-or-equal (unordered, non-signaling)  */
#define _CMP_NLE_UQ 0x16
/* Ordered (signaling)  */
#define _CMP_ORD_S 0x17
/* Equal (unordered, signaling)  */
#define _CMP_EQ_US 0x18
/* Not-greater-than-or-equal (unordered, non-signaling)  */
#define _CMP_NGE_UQ 0x19
/* Not-greater-than (unordered, non-signaling)  */
#define _CMP_NGT_UQ 0x1a
/* False (ordered, signaling)  */
#define _CMP_FALSE_OS 0x1b
/* Not-equal (ordered, signaling)  */
#define _CMP_NEQ_OS 0x1c
/* Greater-than-or-equal (ordered, non-signaling)  */
#define _CMP_GE_OQ 0x1d
/* Greater-than (ordered, non-signaling)  */
#define _CMP_GT_OQ 0x1e
/* True (unordered, signaling)  */
#define _CMP_TRUE_US 0x1f
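
/* Illustrative usage (a sketch, not part of the original header): a
   predicate is passed as the immediate argument of the _mm256_cmp_*
   intrinsics defined below; each lane becomes all-ones when the
   predicate holds and all-zeros otherwise:

     __m256d a = _mm256_set1_pd (1.0);
     __m256d b = _mm256_set1_pd (2.0);
     __m256d lt = _mm256_cmp_pd (a, b, _CMP_LT_OQ);
     int m = _mm256_movemask_pd (lt);    m == 0xf: a < b held in every lane
*/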
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_add_pd (__m256d __A, __m256d __B)
{
  return (__m256d) ((__v4df)__A + (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_add_ps (__m256 __A, __m256 __B)
{
  return (__m256) ((__v8sf)__A + (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_addsub_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_addsubpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_addsub_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_addsubps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_and_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_andpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_and_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_andps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_andnot_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_andnpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_andnot_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_andnps256 ((__v8sf)__A, (__v8sf)__B);
}
/* Double/single precision floating point blend instructions - select
   data from 2 sources using constant/variable mask.  */

#ifdef __OPTIMIZE__
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_blend_pd (__m256d __X, __m256d __Y, const int __M)
{
  return (__m256d) __builtin_ia32_blendpd256 ((__v4df)__X,
                                              (__v4df)__Y,
                                              __M);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_blend_ps (__m256 __X, __m256 __Y, const int __M)
{
  return (__m256) __builtin_ia32_blendps256 ((__v8sf)__X,
                                             (__v8sf)__Y,
                                             __M);
}
#else
#define _mm256_blend_pd(X, Y, M) \
  ((__m256d) __builtin_ia32_blendpd256 ((__v4df)(__m256d)(X), \
                                        (__v4df)(__m256d)(Y), (int)(M)))

#define _mm256_blend_ps(X, Y, M) \
  ((__m256) __builtin_ia32_blendps256 ((__v8sf)(__m256)(X), \
                                       (__v8sf)(__m256)(Y), (int)(M)))
#endif

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_blendv_pd (__m256d __X, __m256d __Y, __m256d __M)
{
  return (__m256d) __builtin_ia32_blendvpd256 ((__v4df)__X,
                                               (__v4df)__Y,
                                               (__v4df)__M);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_blendv_ps (__m256 __X, __m256 __Y, __m256 __M)
{
  return (__m256) __builtin_ia32_blendvps256 ((__v8sf)__X,
                                              (__v8sf)__Y,
                                              (__v8sf)__M);
}
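
/* Illustrative sketch (not part of the original header): _mm256_blend_*
   take a compile-time lane-selection immediate, while _mm256_blendv_*
   select per lane from the sign bit of a runtime mask, e.g. one produced
   by a compare.  A lane-wise maximum built this way:

     __m256d gt = _mm256_cmp_pd (x, y, _CMP_GT_OQ);
     __m256d mx = _mm256_blendv_pd (y, x, gt);    take x where x > y
*/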
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_div_pd (__m256d __A, __m256d __B)
{
  return (__m256d) ((__v4df)__A / (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_div_ps (__m256 __A, __m256 __B)
{
  return (__m256) ((__v8sf)__A / (__v8sf)__B);
}
/* Dot product instructions with mask-defined summing and zeroing parts
   of result.  */

#ifdef __OPTIMIZE__
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_dp_ps (__m256 __X, __m256 __Y, const int __M)
{
  return (__m256) __builtin_ia32_dpps256 ((__v8sf)__X,
                                          (__v8sf)__Y,
                                          __M);
}
#else
#define _mm256_dp_ps(X, Y, M) \
  ((__m256) __builtin_ia32_dpps256 ((__v8sf)(__m256)(X), \
                                    (__v8sf)(__m256)(Y), (int)(M)))
#endif
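
/* Illustrative note (a sketch, not part of the original header): the
   immediate's high nibble selects which elements of each 128-bit lane
   enter the products, the low nibble selects which result elements
   receive the sum, and the remaining elements are zeroed.  With mask
   0xff each lane holds its full 4-element dot product in all four slots:

     __m256 d = _mm256_dp_ps (a, b, 0xff);
     d[0..3] == dot (a[0..3], b[0..3]); d[4..7] == dot (a[4..7], b[4..7])
*/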
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hadd_pd (__m256d __X, __m256d __Y)
{
  return (__m256d) __builtin_ia32_haddpd256 ((__v4df)__X, (__v4df)__Y);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hadd_ps (__m256 __X, __m256 __Y)
{
  return (__m256) __builtin_ia32_haddps256 ((__v8sf)__X, (__v8sf)__Y);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hsub_pd (__m256d __X, __m256d __Y)
{
  return (__m256d) __builtin_ia32_hsubpd256 ((__v4df)__X, (__v4df)__Y);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hsub_ps (__m256 __X, __m256 __Y)
{
  return (__m256) __builtin_ia32_hsubps256 ((__v8sf)__X, (__v8sf)__Y);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_max_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_maxpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_max_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_maxps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_min_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_minpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_min_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_minps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mul_pd (__m256d __A, __m256d __B)
{
  return (__m256d) ((__v4df)__A * (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mul_ps (__m256 __A, __m256 __B)
{
  return (__m256) ((__v8sf)__A * (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_or_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_orpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_or_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_orps256 ((__v8sf)__A, (__v8sf)__B);
}
#ifdef __OPTIMIZE__
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_shuffle_pd (__m256d __A, __m256d __B, const int __mask)
{
  return (__m256d) __builtin_ia32_shufpd256 ((__v4df)__A, (__v4df)__B,
                                             __mask);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_shuffle_ps (__m256 __A, __m256 __B, const int __mask)
{
  return (__m256) __builtin_ia32_shufps256 ((__v8sf)__A, (__v8sf)__B,
                                            __mask);
}
#else
#define _mm256_shuffle_pd(A, B, N) \
  ((__m256d)__builtin_ia32_shufpd256 ((__v4df)(__m256d)(A), \
                                      (__v4df)(__m256d)(B), (int)(N)))

#define _mm256_shuffle_ps(A, B, N) \
  ((__m256) __builtin_ia32_shufps256 ((__v8sf)(__m256)(A), \
                                      (__v8sf)(__m256)(B), (int)(N)))
#endif
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sub_pd (__m256d __A, __m256d __B)
{
  return (__m256d) ((__v4df)__A - (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sub_ps (__m256 __A, __m256 __B)
{
  return (__m256) ((__v8sf)__A - (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_xor_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_xorpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_xor_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_xorps256 ((__v8sf)__A, (__v8sf)__B);
}
#ifdef __OPTIMIZE__
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmp_pd (__m256d __X, __m256d __Y, const int __P)
{
  return (__m256d) __builtin_ia32_cmppd256 ((__v4df)__X, (__v4df)__Y,
                                            __P);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmp_ps (__m256 __X, __m256 __Y, const int __P)
{
  return (__m256) __builtin_ia32_cmpps256 ((__v8sf)__X, (__v8sf)__Y,
                                           __P);
}
#else
#define _mm256_cmp_pd(X, Y, P) \
  ((__m256d) __builtin_ia32_cmppd256 ((__v4df)(__m256d)(X), \
                                      (__v4df)(__m256d)(Y), (int)(P)))

#define _mm256_cmp_ps(X, Y, P) \
  ((__m256) __builtin_ia32_cmpps256 ((__v8sf)(__m256)(X), \
                                     (__v8sf)(__m256)(Y), (int)(P)))
#endif
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtsi256_si32 (__m256i __A)
{
  __v8si __B = (__v8si) __A;
  return __B[0];
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepi32_pd (__m128i __A)
{
  return (__m256d)__builtin_ia32_cvtdq2pd256 ((__v4si) __A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepi32_ps (__m256i __A)
{
  return (__m256)__builtin_ia32_cvtdq2ps256 ((__v8si) __A);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtpd_ps (__m256d __A)
{
  return (__m128)__builtin_ia32_cvtpd2ps256 ((__v4df) __A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtps_epi32 (__m256 __A)
{
  return (__m256i)__builtin_ia32_cvtps2dq256 ((__v8sf) __A);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtps_pd (__m128 __A)
{
  return (__m256d)__builtin_ia32_cvtps2pd256 ((__v4sf) __A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvttpd_epi32 (__m256d __A)
{
  return (__m128i)__builtin_ia32_cvttpd2dq256 ((__v4df) __A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtpd_epi32 (__m256d __A)
{
  return (__m128i)__builtin_ia32_cvtpd2dq256 ((__v4df) __A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvttps_epi32 (__m256 __A)
{
  return (__m256i)__builtin_ia32_cvttps2dq256 ((__v8sf) __A);
}

extern __inline double
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtsd_f64 (__m256d __A)
{
  return __A[0];
}

extern __inline float
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtss_f32 (__m256 __A)
{
  return __A[0];
}
#ifdef __OPTIMIZE__
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extractf128_pd (__m256d __X, const int __N)
{
  return (__m128d) __builtin_ia32_vextractf128_pd256 ((__v4df)__X, __N);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extractf128_ps (__m256 __X, const int __N)
{
  return (__m128) __builtin_ia32_vextractf128_ps256 ((__v8sf)__X, __N);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extractf128_si256 (__m256i __X, const int __N)
{
  return (__m128i) __builtin_ia32_vextractf128_si256 ((__v8si)__X, __N);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extract_epi32 (__m256i __X, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 2);
  return _mm_extract_epi32 (__Y, __N % 4);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extract_epi16 (__m256i __X, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 3);
  return _mm_extract_epi16 (__Y, __N % 8);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extract_epi8 (__m256i __X, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 4);
  return _mm_extract_epi8 (__Y, __N % 16);
}

#ifdef __x86_64__
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extract_epi64 (__m256i __X, const int __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 1);
  return _mm_extract_epi64 (__Y, __N % 2);
}
#endif
#else
#define _mm256_extractf128_pd(X, N) \
  ((__m128d) __builtin_ia32_vextractf128_pd256 ((__v4df)(__m256d)(X), \
                                                (int)(N)))

#define _mm256_extractf128_ps(X, N) \
  ((__m128) __builtin_ia32_vextractf128_ps256 ((__v8sf)(__m256)(X), \
                                               (int)(N)))

#define _mm256_extractf128_si256(X, N) \
  ((__m128i) __builtin_ia32_vextractf128_si256 ((__v8si)(__m256i)(X), \
                                                (int)(N)))

#define _mm256_extract_epi32(X, N) \
  (__extension__ \
   ({ \
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 2); \
      _mm_extract_epi32 (__Y, (N) % 4); \
    }))

#define _mm256_extract_epi16(X, N) \
  (__extension__ \
   ({ \
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 3); \
      _mm_extract_epi16 (__Y, (N) % 8); \
    }))

#define _mm256_extract_epi8(X, N) \
  (__extension__ \
   ({ \
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 4); \
      _mm_extract_epi8 (__Y, (N) % 16); \
    }))

#ifdef __x86_64__
#define _mm256_extract_epi64(X, N) \
  (__extension__ \
   ({ \
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 1); \
      _mm_extract_epi64 (__Y, (N) % 2); \
    }))
#endif
#endif
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_zeroall (void)
{
  __builtin_ia32_vzeroall ();
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_zeroupper (void)
{
  __builtin_ia32_vzeroupper ();
}
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_permutevar_pd (__m128d __A, __m128i __C)
{
  return (__m128d) __builtin_ia32_vpermilvarpd ((__v2df)__A,
                                                (__v2di)__C);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permutevar_pd (__m256d __A, __m256i __C)
{
  return (__m256d) __builtin_ia32_vpermilvarpd256 ((__v4df)__A,
                                                   (__v4di)__C);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_permutevar_ps (__m128 __A, __m128i __C)
{
  return (__m128) __builtin_ia32_vpermilvarps ((__v4sf)__A,
                                               (__v4si)__C);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permutevar_ps (__m256 __A, __m256i __C)
{
  return (__m256) __builtin_ia32_vpermilvarps256 ((__v8sf)__A,
                                                  (__v8si)__C);
}

#ifdef __OPTIMIZE__
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_permute_pd (__m128d __X, const int __C)
{
  return (__m128d) __builtin_ia32_vpermilpd ((__v2df)__X, __C);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute_pd (__m256d __X, const int __C)
{
  return (__m256d) __builtin_ia32_vpermilpd256 ((__v4df)__X, __C);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_permute_ps (__m128 __X, const int __C)
{
  return (__m128) __builtin_ia32_vpermilps ((__v4sf)__X, __C);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute_ps (__m256 __X, const int __C)
{
  return (__m256) __builtin_ia32_vpermilps256 ((__v8sf)__X, __C);
}
#else
#define _mm_permute_pd(X, C) \
  ((__m128d) __builtin_ia32_vpermilpd ((__v2df)(__m128d)(X), (int)(C)))

#define _mm256_permute_pd(X, C) \
  ((__m256d) __builtin_ia32_vpermilpd256 ((__v4df)(__m256d)(X), (int)(C)))

#define _mm_permute_ps(X, C) \
  ((__m128) __builtin_ia32_vpermilps ((__v4sf)(__m128)(X), (int)(C)))

#define _mm256_permute_ps(X, C) \
  ((__m256) __builtin_ia32_vpermilps256 ((__v8sf)(__m256)(X), (int)(C)))
#endif
#ifdef __OPTIMIZE__
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute2f128_pd (__m256d __X, __m256d __Y, const int __C)
{
  return (__m256d) __builtin_ia32_vperm2f128_pd256 ((__v4df)__X,
                                                    (__v4df)__Y,
                                                    __C);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute2f128_ps (__m256 __X, __m256 __Y, const int __C)
{
  return (__m256) __builtin_ia32_vperm2f128_ps256 ((__v8sf)__X,
                                                   (__v8sf)__Y,
                                                   __C);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute2f128_si256 (__m256i __X, __m256i __Y, const int __C)
{
  return (__m256i) __builtin_ia32_vperm2f128_si256 ((__v8si)__X,
                                                    (__v8si)__Y,
                                                    __C);
}
#else
#define _mm256_permute2f128_pd(X, Y, C) \
  ((__m256d) __builtin_ia32_vperm2f128_pd256 ((__v4df)(__m256d)(X), \
                                              (__v4df)(__m256d)(Y), \
                                              (int)(C)))

#define _mm256_permute2f128_ps(X, Y, C) \
  ((__m256) __builtin_ia32_vperm2f128_ps256 ((__v8sf)(__m256)(X), \
                                             (__v8sf)(__m256)(Y), \
                                             (int)(C)))

#define _mm256_permute2f128_si256(X, Y, C) \
  ((__m256i) __builtin_ia32_vperm2f128_si256 ((__v8si)(__m256i)(X), \
                                              (__v8si)(__m256i)(Y), \
                                              (int)(C)))
#endif
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_broadcast_ss (float const *__X)
{
  return (__m128) __builtin_ia32_vbroadcastss (__X);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcast_sd (double const *__X)
{
  return (__m256d) __builtin_ia32_vbroadcastsd256 (__X);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcast_ss (float const *__X)
{
  return (__m256) __builtin_ia32_vbroadcastss256 (__X);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcast_pd (__m128d const *__X)
{
  return (__m256d) __builtin_ia32_vbroadcastf128_pd256 (__X);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcast_ps (__m128 const *__X)
{
  return (__m256) __builtin_ia32_vbroadcastf128_ps256 (__X);
}
#ifdef __OPTIMIZE__
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insertf128_pd (__m256d __X, __m128d __Y, const int __O)
{
  return (__m256d) __builtin_ia32_vinsertf128_pd256 ((__v4df)__X,
                                                     (__v2df)__Y,
                                                     __O);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insertf128_ps (__m256 __X, __m128 __Y, const int __O)
{
  return (__m256) __builtin_ia32_vinsertf128_ps256 ((__v8sf)__X,
                                                    (__v4sf)__Y,
                                                    __O);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insertf128_si256 (__m256i __X, __m128i __Y, const int __O)
{
  return (__m256i) __builtin_ia32_vinsertf128_si256 ((__v8si)__X,
                                                     (__v4si)__Y,
                                                     __O);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insert_epi32 (__m256i __X, int __D, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 2);
  __Y = _mm_insert_epi32 (__Y, __D, __N % 4);
  return _mm256_insertf128_si256 (__X, __Y, __N >> 2);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insert_epi16 (__m256i __X, int __D, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 3);
  __Y = _mm_insert_epi16 (__Y, __D, __N % 8);
  return _mm256_insertf128_si256 (__X, __Y, __N >> 3);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insert_epi8 (__m256i __X, int __D, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 4);
  __Y = _mm_insert_epi8 (__Y, __D, __N % 16);
  return _mm256_insertf128_si256 (__X, __Y, __N >> 4);
}

#ifdef __x86_64__
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insert_epi64 (__m256i __X, long long __D, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 1);
  __Y = _mm_insert_epi64 (__Y, __D, __N % 2);
  return _mm256_insertf128_si256 (__X, __Y, __N >> 1);
}
#endif
#else
#define _mm256_insertf128_pd(X, Y, O) \
  ((__m256d) __builtin_ia32_vinsertf128_pd256 ((__v4df)(__m256d)(X), \
                                               (__v2df)(__m128d)(Y), \
                                               (int)(O)))

#define _mm256_insertf128_ps(X, Y, O) \
  ((__m256) __builtin_ia32_vinsertf128_ps256 ((__v8sf)(__m256)(X), \
                                              (__v4sf)(__m128)(Y), \
                                              (int)(O)))

#define _mm256_insertf128_si256(X, Y, O) \
  ((__m256i) __builtin_ia32_vinsertf128_si256 ((__v8si)(__m256i)(X), \
                                               (__v4si)(__m128i)(Y), \
                                               (int)(O)))

#define _mm256_insert_epi32(X, D, N) \
  (__extension__ \
   ({ \
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 2); \
      __Y = _mm_insert_epi32 (__Y, (D), (N) % 4); \
      _mm256_insertf128_si256 ((X), __Y, (N) >> 2); \
    }))

#define _mm256_insert_epi16(X, D, N) \
  (__extension__ \
   ({ \
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 3); \
      __Y = _mm_insert_epi16 (__Y, (D), (N) % 8); \
      _mm256_insertf128_si256 ((X), __Y, (N) >> 3); \
    }))

#define _mm256_insert_epi8(X, D, N) \
  (__extension__ \
   ({ \
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 4); \
      __Y = _mm_insert_epi8 (__Y, (D), (N) % 16); \
      _mm256_insertf128_si256 ((X), __Y, (N) >> 4); \
    }))

#ifdef __x86_64__
#define _mm256_insert_epi64(X, D, N) \
  (__extension__ \
   ({ \
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 1); \
      __Y = _mm_insert_epi64 (__Y, (D), (N) % 2); \
      _mm256_insertf128_si256 ((X), __Y, (N) >> 1); \
    }))
#endif
#endif
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_load_pd (double const *__P)
{
  return *(__m256d *)__P;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_store_pd (double *__P, __m256d __A)
{
  *(__m256d *)__P = __A;
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_load_ps (float const *__P)
{
  return *(__m256 *)__P;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_store_ps (float *__P, __m256 __A)
{
  *(__m256 *)__P = __A;
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_loadu_pd (double const *__P)
{
  return *(__m256d_u *)__P;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_storeu_pd (double *__P, __m256d __A)
{
  *(__m256d_u *)__P = __A;
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_loadu_ps (float const *__P)
{
  return *(__m256_u *)__P;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_storeu_ps (float *__P, __m256 __A)
{
  *(__m256_u *)__P = __A;
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_load_si256 (__m256i const *__P)
{
  return *__P;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_store_si256 (__m256i *__P, __m256i __A)
{
  *__P = __A;
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_loadu_si256 (__m256i_u const *__P)
{
  return *__P;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_storeu_si256 (__m256i_u *__P, __m256i __A)
{
  *__P = __A;
}
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskload_pd (double const *__P, __m128i __M)
{
  return (__m128d) __builtin_ia32_maskloadpd ((const __v2df *)__P,
                                              (__v2di)__M);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskstore_pd (double *__P, __m128i __M, __m128d __A)
{
  __builtin_ia32_maskstorepd ((__v2df *)__P, (__v2di)__M, (__v2df)__A);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskload_pd (double const *__P, __m256i __M)
{
  return (__m256d) __builtin_ia32_maskloadpd256 ((const __v4df *)__P,
                                                 (__v4di)__M);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskstore_pd (double *__P, __m256i __M, __m256d __A)
{
  __builtin_ia32_maskstorepd256 ((__v4df *)__P, (__v4di)__M, (__v4df)__A);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskload_ps (float const *__P, __m128i __M)
{
  return (__m128) __builtin_ia32_maskloadps ((const __v4sf *)__P,
                                             (__v4si)__M);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskstore_ps (float *__P, __m128i __M, __m128 __A)
{
  __builtin_ia32_maskstoreps ((__v4sf *)__P, (__v4si)__M, (__v4sf)__A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskload_ps (float const *__P, __m256i __M)
{
  return (__m256) __builtin_ia32_maskloadps256 ((const __v8sf *)__P,
                                                (__v8si)__M);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskstore_ps (float *__P, __m256i __M, __m256 __A)
{
  __builtin_ia32_maskstoreps256 ((__v8sf *)__P, (__v8si)__M, (__v8sf)__A);
}
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_movehdup_ps (__m256 __X)
{
  return (__m256) __builtin_ia32_movshdup256 ((__v8sf)__X);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_moveldup_ps (__m256 __X)
{
  return (__m256) __builtin_ia32_movsldup256 ((__v8sf)__X);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_movedup_pd (__m256d __X)
{
  return (__m256d) __builtin_ia32_movddup256 ((__v4df)__X);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_lddqu_si256 (__m256i const *__P)
{
  return (__m256i) __builtin_ia32_lddqu256 ((char const *)__P);
}
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_stream_si256 (__m256i *__A, __m256i __B)
{
  __builtin_ia32_movntdq256 ((__v4di *)__A, (__v4di)__B);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_stream_pd (double *__A, __m256d __B)
{
  __builtin_ia32_movntpd256 (__A, (__v4df)__B);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_stream_ps (float *__P, __m256 __A)
{
  __builtin_ia32_movntps256 (__P, (__v8sf)__A);
}
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_rcp_ps (__m256 __A)
{
  return (__m256) __builtin_ia32_rcpps256 ((__v8sf)__A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_rsqrt_ps (__m256 __A)
{
  return (__m256) __builtin_ia32_rsqrtps256 ((__v8sf)__A);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sqrt_pd (__m256d __A)
{
  return (__m256d) __builtin_ia32_sqrtpd256 ((__v4df)__A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sqrt_ps (__m256 __A)
{
  return (__m256) __builtin_ia32_sqrtps256 ((__v8sf)__A);
}
#ifdef __OPTIMIZE__
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_round_pd (__m256d __V, const int __M)
{
  return (__m256d) __builtin_ia32_roundpd256 ((__v4df)__V, __M);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_round_ps (__m256 __V, const int __M)
{
  return (__m256) __builtin_ia32_roundps256 ((__v8sf)__V, __M);
}
#else
#define _mm256_round_pd(V, M) \
  ((__m256d) __builtin_ia32_roundpd256 ((__v4df)(__m256d)(V), (int)(M)))

#define _mm256_round_ps(V, M) \
  ((__m256) __builtin_ia32_roundps256 ((__v8sf)(__m256)(V), (int)(M)))
#endif

#define _mm256_ceil_pd(V)  _mm256_round_pd ((V), _MM_FROUND_CEIL)
#define _mm256_floor_pd(V) _mm256_round_pd ((V), _MM_FROUND_FLOOR)
#define _mm256_ceil_ps(V)  _mm256_round_ps ((V), _MM_FROUND_CEIL)
#define _mm256_floor_ps(V) _mm256_round_ps ((V), _MM_FROUND_FLOOR)
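
/* Illustrative usage (not part of the original header): the
   _MM_FROUND_* rounding-mode immediates come from <smmintrin.h>:

     __m256d v = _mm256_set1_pd (1.5);
     __m256d f = _mm256_floor_pd (v);    every element becomes 1.0
     __m256d c = _mm256_ceil_pd (v);     every element becomes 2.0
*/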
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpackhi_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_unpckhpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpacklo_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_unpcklpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpackhi_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_unpckhps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpacklo_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_unpcklps256 ((__v8sf)__A, (__v8sf)__B);
}
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testz_pd (__m128d __M, __m128d __V)
{
  return __builtin_ia32_vtestzpd ((__v2df)__M, (__v2df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testc_pd (__m128d __M, __m128d __V)
{
  return __builtin_ia32_vtestcpd ((__v2df)__M, (__v2df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testnzc_pd (__m128d __M, __m128d __V)
{
  return __builtin_ia32_vtestnzcpd ((__v2df)__M, (__v2df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testz_ps (__m128 __M, __m128 __V)
{
  return __builtin_ia32_vtestzps ((__v4sf)__M, (__v4sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testc_ps (__m128 __M, __m128 __V)
{
  return __builtin_ia32_vtestcps ((__v4sf)__M, (__v4sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testnzc_ps (__m128 __M, __m128 __V)
{
  return __builtin_ia32_vtestnzcps ((__v4sf)__M, (__v4sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testz_pd (__m256d __M, __m256d __V)
{
  return __builtin_ia32_vtestzpd256 ((__v4df)__M, (__v4df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testc_pd (__m256d __M, __m256d __V)
{
  return __builtin_ia32_vtestcpd256 ((__v4df)__M, (__v4df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testnzc_pd (__m256d __M, __m256d __V)
{
  return __builtin_ia32_vtestnzcpd256 ((__v4df)__M, (__v4df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testz_ps (__m256 __M, __m256 __V)
{
  return __builtin_ia32_vtestzps256 ((__v8sf)__M, (__v8sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testc_ps (__m256 __M, __m256 __V)
{
  return __builtin_ia32_vtestcps256 ((__v8sf)__M, (__v8sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testnzc_ps (__m256 __M, __m256 __V)
{
  return __builtin_ia32_vtestnzcps256 ((__v8sf)__M, (__v8sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testz_si256 (__m256i __M, __m256i __V)
{
  return __builtin_ia32_ptestz256 ((__v4di)__M, (__v4di)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testc_si256 (__m256i __M, __m256i __V)
{
  return __builtin_ia32_ptestc256 ((__v4di)__M, (__v4di)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testnzc_si256 (__m256i __M, __m256i __V)
{
  return __builtin_ia32_ptestnzc256 ((__v4di)__M, (__v4di)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_movemask_pd (__m256d __A)
{
  return __builtin_ia32_movmskpd256 ((__v4df)__A);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_movemask_ps (__m256 __A)
{
  return __builtin_ia32_movmskps256 ((__v8sf)__A);
}
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_undefined_pd (void)
{
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Winit-self"
  __m256d __Y = __Y;
#pragma GCC diagnostic pop
  return __Y;
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_undefined_ps (void)
{
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Winit-self"
  __m256 __Y = __Y;
#pragma GCC diagnostic pop
  return __Y;
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_undefined_si256 (void)
{
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Winit-self"
  __m256i __Y = __Y;
#pragma GCC diagnostic pop
  return __Y;
}
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setzero_pd (void)
{
  return __extension__ (__m256d){ 0.0, 0.0, 0.0, 0.0 };
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setzero_ps (void)
{
  return __extension__ (__m256){ 0.0, 0.0, 0.0, 0.0,
                                 0.0, 0.0, 0.0, 0.0 };
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setzero_si256 (void)
{
  return __extension__ (__m256i)(__v4di){ 0, 0, 0, 0 };
}
/* Create the vector [A B C D].  */
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_pd (double __A, double __B, double __C, double __D)
{
  return __extension__ (__m256d){ __D, __C, __B, __A };
}
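
/* Illustrative note (not part of the original header): the _mm256_set_*
   intrinsics below all list arguments from the highest element down, so
   the last argument lands in element 0:

     __m256d v = _mm256_set_pd (3.0, 2.0, 1.0, 0.0);
     _mm256_cvtsd_f64 (v) == 0.0 and v[3] == 3.0
*/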
/* Create the vector [A B C D E F G H].  */
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_ps (float __A, float __B, float __C, float __D,
               float __E, float __F, float __G, float __H)
{
  return __extension__ (__m256){ __H, __G, __F, __E,
                                 __D, __C, __B, __A };
}

/* Create the vector [A B C D E F G H].  */
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_epi32 (int __A, int __B, int __C, int __D,
                  int __E, int __F, int __G, int __H)
{
  return __extension__ (__m256i)(__v8si){ __H, __G, __F, __E,
                                          __D, __C, __B, __A };
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_epi16 (short __q15, short __q14, short __q13, short __q12,
                  short __q11, short __q10, short __q09, short __q08,
                  short __q07, short __q06, short __q05, short __q04,
                  short __q03, short __q02, short __q01, short __q00)
{
  return __extension__ (__m256i)(__v16hi){
    __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07,
    __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15
  };
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_epi8 (char __q31, char __q30, char __q29, char __q28,
                 char __q27, char __q26, char __q25, char __q24,
                 char __q23, char __q22, char __q21, char __q20,
                 char __q19, char __q18, char __q17, char __q16,
                 char __q15, char __q14, char __q13, char __q12,
                 char __q11, char __q10, char __q09, char __q08,
                 char __q07, char __q06, char __q05, char __q04,
                 char __q03, char __q02, char __q01, char __q00)
{
  return __extension__ (__m256i)(__v32qi){
    __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07,
    __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15,
    __q16, __q17, __q18, __q19, __q20, __q21, __q22, __q23,
    __q24, __q25, __q26, __q27, __q28, __q29, __q30, __q31
  };
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_epi64x (long long __A, long long __B, long long __C,
                   long long __D)
{
  return __extension__ (__m256i)(__v4di){ __D, __C, __B, __A };
}

/* Create a vector with all elements equal to A.  */
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_pd (double __A)
{
  return __extension__ (__m256d){ __A, __A, __A, __A };
}

/* Create a vector with all elements equal to A.  */
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_ps (float __A)
{
  return __extension__ (__m256){ __A, __A, __A, __A,
                                 __A, __A, __A, __A };
}

/* Create a vector with all elements equal to A.  */
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_epi32 (int __A)
{
  return __extension__ (__m256i)(__v8si){ __A, __A, __A, __A,
                                          __A, __A, __A, __A };
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_epi16 (short __A)
{
  return _mm256_set_epi16 (__A, __A, __A, __A, __A, __A, __A, __A,
                           __A, __A, __A, __A, __A, __A, __A, __A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_epi8 (char __A)
{
  return _mm256_set_epi8 (__A, __A, __A, __A, __A, __A, __A, __A,
                          __A, __A, __A, __A, __A, __A, __A, __A,
                          __A, __A, __A, __A, __A, __A, __A, __A,
                          __A, __A, __A, __A, __A, __A, __A, __A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_epi64x (long long __A)
{
  return __extension__ (__m256i)(__v4di){ __A, __A, __A, __A };
}
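
/* Illustrative contrast (a sketch, not part of the original header):
   _mm256_set1_* broadcast a value already held in a register, while
   _mm256_broadcast_sd/_ss above broadcast from a memory operand:

     double x = 2.5;
     __m256d a = _mm256_set1_pd (x);
     __m256d b = _mm256_broadcast_sd (&x);    same lanes as a
*/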
/* Create vectors of elements in the reversed order from the
   _mm256_set_XXX functions.  */

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_pd (double __A, double __B, double __C, double __D)
{
  return _mm256_set_pd (__D, __C, __B, __A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_ps (float __A, float __B, float __C, float __D,
                float __E, float __F, float __G, float __H)
{
  return _mm256_set_ps (__H, __G, __F, __E, __D, __C, __B, __A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_epi32 (int __A, int __B, int __C, int __D,
                   int __E, int __F, int __G, int __H)
{
  return _mm256_set_epi32 (__H, __G, __F, __E, __D, __C, __B, __A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_epi16 (short __q15, short __q14, short __q13, short __q12,
                   short __q11, short __q10, short __q09, short __q08,
                   short __q07, short __q06, short __q05, short __q04,
                   short __q03, short __q02, short __q01, short __q00)
{
  return _mm256_set_epi16 (__q00, __q01, __q02, __q03,
                           __q04, __q05, __q06, __q07,
                           __q08, __q09, __q10, __q11,
                           __q12, __q13, __q14, __q15);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_epi8 (char __q31, char __q30, char __q29, char __q28,
                  char __q27, char __q26, char __q25, char __q24,
                  char __q23, char __q22, char __q21, char __q20,
                  char __q19, char __q18, char __q17, char __q16,
                  char __q15, char __q14, char __q13, char __q12,
                  char __q11, char __q10, char __q09, char __q08,
                  char __q07, char __q06, char __q05, char __q04,
                  char __q03, char __q02, char __q01, char __q00)
{
  return _mm256_set_epi8 (__q00, __q01, __q02, __q03,
                          __q04, __q05, __q06, __q07,
                          __q08, __q09, __q10, __q11,
                          __q12, __q13, __q14, __q15,
                          __q16, __q17, __q18, __q19,
                          __q20, __q21, __q22, __q23,
                          __q24, __q25, __q26, __q27,
                          __q28, __q29, __q30, __q31);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_epi64x (long long __A, long long __B, long long __C,
                    long long __D)
{
  return _mm256_set_epi64x (__D, __C, __B, __A);
}
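
/* Illustrative usage (not part of the original header): the setr forms
   list elements in memory order, matching an unaligned load from an
   array:

     double buf[4] = { 0.0, 1.0, 2.0, 3.0 };
     __m256d a = _mm256_setr_pd (0.0, 1.0, 2.0, 3.0);
     __m256d b = _mm256_loadu_pd (buf);    a and b are equal lane-wise
*/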
/* Casts between various SP, DP, INT vector types.  Note that these do no
   conversion of values, they just change the type.  */
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castpd_ps (__m256d __A)
{
  return (__m256) __A;
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castpd_si256 (__m256d __A)
{
  return (__m256i) __A;
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castps_pd (__m256 __A)
{
  return (__m256d) __A;
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castps_si256 (__m256 __A)
{
  return (__m256i) __A;
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castsi256_ps (__m256i __A)
{
  return (__m256) __A;
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castsi256_pd (__m256i __A)
{
  return (__m256d) __A;
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castpd256_pd128 (__m256d __A)
{
  return (__m128d) __builtin_ia32_pd_pd256 ((__v4df)__A);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castps256_ps128 (__m256 __A)
{
  return (__m128) __builtin_ia32_ps_ps256 ((__v8sf)__A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castsi256_si128 (__m256i __A)
{
  return (__m128i) __builtin_ia32_si_si256 ((__v8si)__A);
}
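
/* Illustrative note (not part of the original header): the casts above
   reinterpret bits and compile to no instructions:

     __m256d d = _mm256_set1_pd (1.0);
     __m256i i = _mm256_castpd_si256 (d);
     each 64-bit lane of i now holds 0x3ff0000000000000
*/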
/* When a cast is done from a 128 to a 256-bit type, the low 128 bits of
   the 256-bit result contain the source parameter value and the upper 128
   bits of the result are undefined.  These intrinsics shouldn't
   generate any extra moves.  */

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castpd128_pd256 (__m128d __A)
{
  return (__m256d) __builtin_ia32_pd256_pd ((__v2df)__A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castps128_ps256 (__m128 __A)
{
  return (__m256) __builtin_ia32_ps256_ps ((__v4sf)__A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castsi128_si256 (__m128i __A)
{
  return (__m256i) __builtin_ia32_si256_si ((__v4si)__A);
}

/* Similarly, but with zero extension instead of undefined values.  */

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_zextpd128_pd256 (__m128d __A)
{
  return _mm256_insertf128_pd (_mm256_setzero_pd (), __A, 0);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_zextps128_ps256 (__m128 __A)
{
  return _mm256_insertf128_ps (_mm256_setzero_ps (), __A, 0);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_zextsi128_si256 (__m128i __A)
{
  return _mm256_insertf128_si256 (_mm256_setzero_si256 (), __A, 0);
}
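
/* Illustrative contrast (not part of the original header): after a plain
   128->256 cast the upper lane is unspecified, while the zext forms
   guarantee zeros there:

     __m128d lo = _mm_set1_pd (4.0);
     __m256d u = _mm256_castpd128_pd256 (lo);    upper 128 bits undefined
     __m256d z = _mm256_zextpd128_pd256 (lo);    upper 128 bits are 0.0
*/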
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_m128 (__m128 __H, __m128 __L)
{
  return _mm256_insertf128_ps (_mm256_castps128_ps256 (__L), __H, 1);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_m128d (__m128d __H, __m128d __L)
{
  return _mm256_insertf128_pd (_mm256_castpd128_pd256 (__L), __H, 1);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_m128i (__m128i __H, __m128i __L)
{
  return _mm256_insertf128_si256 (_mm256_castsi128_si256 (__L), __H, 1);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_m128 (__m128 __L, __m128 __H)
{
  return _mm256_set_m128 (__H, __L);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_m128d (__m128d __L, __m128d __H)
{
  return _mm256_set_m128d (__H, __L);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_m128i (__m128i __L, __m128i __H)
{
  return _mm256_set_m128i (__H, __L);
}
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_loadu2_m128 (float const *__PH, float const *__PL)
{
  return _mm256_insertf128_ps (_mm256_castps128_ps256 (_mm_loadu_ps (__PL)),
                               _mm_loadu_ps (__PH), 1);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_storeu2_m128 (float *__PH, float *__PL, __m256 __A)
{
  _mm_storeu_ps (__PL, _mm256_castps256_ps128 (__A));
  _mm_storeu_ps (__PH, _mm256_extractf128_ps (__A, 1));
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_loadu2_m128d (double const *__PH, double const *__PL)
{
  return _mm256_insertf128_pd (_mm256_castpd128_pd256 (_mm_loadu_pd (__PL)),
                               _mm_loadu_pd (__PH), 1);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_storeu2_m128d (double *__PH, double *__PL, __m256d __A)
{
  _mm_storeu_pd (__PL, _mm256_castpd256_pd128 (__A));
  _mm_storeu_pd (__PH, _mm256_extractf128_pd (__A, 1));
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_loadu2_m128i (__m128i_u const *__PH, __m128i_u const *__PL)
{
  return _mm256_insertf128_si256 (_mm256_castsi128_si256 (_mm_loadu_si128 (__PL)),
                                  _mm_loadu_si128 (__PH), 1);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_storeu2_m128i (__m128i_u *__PH, __m128i_u *__PL, __m256i __A)
{
  _mm_storeu_si128 (__PL, _mm256_castsi256_si128 (__A));
  _mm_storeu_si128 (__PH, _mm256_extractf128_si256 (__A, 1));
}
#ifdef __DISABLE_AVX__
#undef __DISABLE_AVX__
#pragma GCC pop_options
#endif /* __DISABLE_AVX__ */

#endif /* _AVXINTRIN_H_INCLUDED */