/* Copyright (C) 2008-2016 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 11.0.  */

#ifndef _IMMINTRIN_H_INCLUDED
# error "Never use <avxintrin.h> directly; include <immintrin.h> instead."
#endif

#ifndef _AVXINTRIN_H_INCLUDED
#define _AVXINTRIN_H_INCLUDED

#ifndef __AVX__
#pragma GCC push_options
#pragma GCC target("avx")
#define __DISABLE_AVX__
#endif /* __AVX__ */

/* Internal data types for implementing the intrinsics.  */
typedef double __v4df __attribute__ ((__vector_size__ (32)));
typedef float __v8sf __attribute__ ((__vector_size__ (32)));
typedef long long __v4di __attribute__ ((__vector_size__ (32)));
typedef unsigned long long __v4du __attribute__ ((__vector_size__ (32)));
typedef int __v8si __attribute__ ((__vector_size__ (32)));
typedef unsigned int __v8su __attribute__ ((__vector_size__ (32)));
typedef short __v16hi __attribute__ ((__vector_size__ (32)));
typedef unsigned short __v16hu __attribute__ ((__vector_size__ (32)));
typedef char __v32qi __attribute__ ((__vector_size__ (32)));
typedef unsigned char __v32qu __attribute__ ((__vector_size__ (32)));

/* The Intel API is flexible enough that we must allow aliasing with other
   vector types, and their scalar components.  */
typedef float __m256 __attribute__ ((__vector_size__ (32),
				     __may_alias__));
typedef long long __m256i __attribute__ ((__vector_size__ (32),
					  __may_alias__));
typedef double __m256d __attribute__ ((__vector_size__ (32),
				       __may_alias__));

/* Unaligned version of the same types.  */
typedef float __m256_u __attribute__ ((__vector_size__ (32),
				       __may_alias__,
				       __aligned__ (1)));
typedef long long __m256i_u __attribute__ ((__vector_size__ (32),
					    __may_alias__,
					    __aligned__ (1)));
typedef double __m256d_u __attribute__ ((__vector_size__ (32),
					 __may_alias__,
					 __aligned__ (1)));

/* Compare predicates for scalar and packed compare intrinsics.  */

/* Equal (ordered, non-signaling)  */
#define _CMP_EQ_OQ	0x00
/* Less-than (ordered, signaling)  */
#define _CMP_LT_OS	0x01
/* Less-than-or-equal (ordered, signaling)  */
#define _CMP_LE_OS	0x02
/* Unordered (non-signaling)  */
#define _CMP_UNORD_Q	0x03
/* Not-equal (unordered, non-signaling)  */
#define _CMP_NEQ_UQ	0x04
/* Not-less-than (unordered, signaling)  */
#define _CMP_NLT_US	0x05
/* Not-less-than-or-equal (unordered, signaling)  */
#define _CMP_NLE_US	0x06
/* Ordered (non-signaling)  */
#define _CMP_ORD_Q	0x07
/* Equal (unordered, non-signaling)  */
#define _CMP_EQ_UQ	0x08
/* Not-greater-than-or-equal (unordered, signaling)  */
#define _CMP_NGE_US	0x09
/* Not-greater-than (unordered, signaling)  */
#define _CMP_NGT_US	0x0a
/* False (ordered, non-signaling)  */
#define _CMP_FALSE_OQ	0x0b
/* Not-equal (ordered, non-signaling)  */
#define _CMP_NEQ_OQ	0x0c
/* Greater-than-or-equal (ordered, signaling)  */
#define _CMP_GE_OS	0x0d
/* Greater-than (ordered, signaling)  */
#define _CMP_GT_OS	0x0e
/* True (unordered, non-signaling)  */
#define _CMP_TRUE_UQ	0x0f
/* Equal (ordered, signaling)  */
#define _CMP_EQ_OS	0x10
/* Less-than (ordered, non-signaling)  */
#define _CMP_LT_OQ	0x11
/* Less-than-or-equal (ordered, non-signaling)  */
#define _CMP_LE_OQ	0x12
/* Unordered (signaling)  */
#define _CMP_UNORD_S	0x13
/* Not-equal (unordered, signaling)  */
#define _CMP_NEQ_US	0x14
/* Not-less-than (unordered, non-signaling)  */
#define _CMP_NLT_UQ	0x15
/* Not-less-than-or-equal (unordered, non-signaling)  */
#define _CMP_NLE_UQ	0x16
/* Ordered (signaling)  */
#define _CMP_ORD_S	0x17
/* Equal (unordered, signaling)  */
#define _CMP_EQ_US	0x18
/* Not-greater-than-or-equal (unordered, non-signaling)  */
#define _CMP_NGE_UQ	0x19
/* Not-greater-than (unordered, non-signaling)  */
#define _CMP_NGT_UQ	0x1a
/* False (ordered, signaling)  */
#define _CMP_FALSE_OS	0x1b
/* Not-equal (ordered, signaling)  */
#define _CMP_NEQ_OS	0x1c
/* Greater-than-or-equal (ordered, non-signaling)  */
#define _CMP_GE_OQ	0x1d
/* Greater-than (ordered, non-signaling)  */
#define _CMP_GT_OQ	0x1e
/* True (unordered, signaling)  */
#define _CMP_TRUE_US	0x1f

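/* Usage sketch (illustrative, not part of this header): a predicate
   is passed as the immediate operand of the compare intrinsics
   defined further below, e.g.

     __m256d __lt = _mm256_cmp_pd (__a, __b, _CMP_LT_OQ);

   which yields all-ones in every element where __a < __b and
   all-zeros elsewhere (__a and __b are assumed __m256d values).  */
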
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_add_pd (__m256d __A, __m256d __B)
{
  return (__m256d) ((__v4df)__A + (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_add_ps (__m256 __A, __m256 __B)
{
  return (__m256) ((__v8sf)__A + (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_addsub_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_addsubpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_addsub_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_addsubps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_and_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_andpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_and_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_andps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_andnot_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_andnpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_andnot_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_andnps256 ((__v8sf)__A, (__v8sf)__B);
}

/* Double/single precision floating point blend instructions - select
   data from 2 sources using constant/variable mask.  */

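/* Usage sketch (illustrative): with a constant mask, bit I selects
   element I of the second source.  With assumed inputs

     __m256d __a = _mm256_set1_pd (1.0);
     __m256d __b = _mm256_set1_pd (2.0);
     __m256d __r = _mm256_blend_pd (__a, __b, 0x5);

   the result is { 2.0, 1.0, 2.0, 1.0 }.  _mm256_blendv_pd instead
   selects per element from the sign bit of a third vector operand.  */
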
#ifdef __OPTIMIZE__
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_blend_pd (__m256d __X, __m256d __Y, const int __M)
{
  return (__m256d) __builtin_ia32_blendpd256 ((__v4df)__X,
					      (__v4df)__Y,
					      __M);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_blend_ps (__m256 __X, __m256 __Y, const int __M)
{
  return (__m256) __builtin_ia32_blendps256 ((__v8sf)__X,
					     (__v8sf)__Y,
					     __M);
}
#else
#define _mm256_blend_pd(X, Y, M)					\
  ((__m256d) __builtin_ia32_blendpd256 ((__v4df)(__m256d)(X),		\
					(__v4df)(__m256d)(Y), (int)(M)))

#define _mm256_blend_ps(X, Y, M)					\
  ((__m256) __builtin_ia32_blendps256 ((__v8sf)(__m256)(X),		\
				       (__v8sf)(__m256)(Y), (int)(M)))
#endif

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_blendv_pd (__m256d __X, __m256d __Y, __m256d __M)
{
  return (__m256d) __builtin_ia32_blendvpd256 ((__v4df)__X,
					       (__v4df)__Y,
					       (__v4df)__M);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_blendv_ps (__m256 __X, __m256 __Y, __m256 __M)
{
  return (__m256) __builtin_ia32_blendvps256 ((__v8sf)__X,
					      (__v8sf)__Y,
					      (__v8sf)__M);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_div_pd (__m256d __A, __m256d __B)
{
  return (__m256d) ((__v4df)__A / (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_div_ps (__m256 __A, __m256 __B)
{
  return (__m256) ((__v8sf)__A / (__v8sf)__B);
}

/* Dot product instructions with mask-defined summing and zeroing parts
   of result.  */

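/* Usage sketch (illustrative): the high nibble of the mask selects
   which element products enter each 128-bit lane's sum, and the low
   nibble selects which result elements receive that sum (the rest
   are zeroed).  A 4-element dot product within each lane, stored in
   element 0 of the lane (__x and __y are assumed __m256 values):

     __m256 __d = _mm256_dp_ps (__x, __y, 0xF1);  */
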
#ifdef __OPTIMIZE__
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_dp_ps (__m256 __X, __m256 __Y, const int __M)
{
  return (__m256) __builtin_ia32_dpps256 ((__v8sf)__X,
					  (__v8sf)__Y,
					  __M);
}
#else
#define _mm256_dp_ps(X, Y, M)						\
  ((__m256) __builtin_ia32_dpps256 ((__v8sf)(__m256)(X),		\
				    (__v8sf)(__m256)(Y), (int)(M)))
#endif

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hadd_pd (__m256d __X, __m256d __Y)
{
  return (__m256d) __builtin_ia32_haddpd256 ((__v4df)__X, (__v4df)__Y);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hadd_ps (__m256 __X, __m256 __Y)
{
  return (__m256) __builtin_ia32_haddps256 ((__v8sf)__X, (__v8sf)__Y);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hsub_pd (__m256d __X, __m256d __Y)
{
  return (__m256d) __builtin_ia32_hsubpd256 ((__v4df)__X, (__v4df)__Y);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hsub_ps (__m256 __X, __m256 __Y)
{
  return (__m256) __builtin_ia32_hsubps256 ((__v8sf)__X, (__v8sf)__Y);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_max_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_maxpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_max_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_maxps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_min_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_minpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_min_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_minps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mul_pd (__m256d __A, __m256d __B)
{
  return (__m256d) ((__v4df)__A * (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mul_ps (__m256 __A, __m256 __B)
{
  return (__m256) ((__v8sf)__A * (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_or_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_orpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_or_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_orps256 ((__v8sf)__A, (__v8sf)__B);
}

#ifdef __OPTIMIZE__
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_shuffle_pd (__m256d __A, __m256d __B, const int __mask)
{
  return (__m256d) __builtin_ia32_shufpd256 ((__v4df)__A, (__v4df)__B,
					     __mask);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_shuffle_ps (__m256 __A, __m256 __B, const int __mask)
{
  return (__m256) __builtin_ia32_shufps256 ((__v8sf)__A, (__v8sf)__B,
					    __mask);
}
#else
#define _mm256_shuffle_pd(A, B, N)					\
  ((__m256d)__builtin_ia32_shufpd256 ((__v4df)(__m256d)(A),		\
				      (__v4df)(__m256d)(B), (int)(N)))

#define _mm256_shuffle_ps(A, B, N)					\
  ((__m256) __builtin_ia32_shufps256 ((__v8sf)(__m256)(A),		\
				      (__v8sf)(__m256)(B), (int)(N)))
#endif

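/* Usage sketch (illustrative): the shuffle mask is applied per
   128-bit lane.  For _mm256_shuffle_pd, bits 0-1 pick the low-lane
   elements (first from A, then from B) and bits 2-3 do the same for
   the high lane:

     __m256d __a = _mm256_set_pd (3.0, 2.0, 1.0, 0.0);
     __m256d __b = _mm256_set_pd (7.0, 6.0, 5.0, 4.0);
     __m256d __r = _mm256_shuffle_pd (__a, __b, 0x5);

   gives { 1.0, 4.0, 3.0, 6.0 }.  */
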
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sub_pd (__m256d __A, __m256d __B)
{
  return (__m256d) ((__v4df)__A - (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sub_ps (__m256 __A, __m256 __B)
{
  return (__m256) ((__v8sf)__A - (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_xor_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_xorpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_xor_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_xorps256 ((__v8sf)__A, (__v8sf)__B);
}

#ifdef __OPTIMIZE__
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmp_pd (__m128d __X, __m128d __Y, const int __P)
{
  return (__m128d) __builtin_ia32_cmppd ((__v2df)__X, (__v2df)__Y, __P);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmp_ps (__m128 __X, __m128 __Y, const int __P)
{
  return (__m128) __builtin_ia32_cmpps ((__v4sf)__X, (__v4sf)__Y, __P);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmp_pd (__m256d __X, __m256d __Y, const int __P)
{
  return (__m256d) __builtin_ia32_cmppd256 ((__v4df)__X, (__v4df)__Y,
					    __P);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmp_ps (__m256 __X, __m256 __Y, const int __P)
{
  return (__m256) __builtin_ia32_cmpps256 ((__v8sf)__X, (__v8sf)__Y,
					   __P);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmp_sd (__m128d __X, __m128d __Y, const int __P)
{
  return (__m128d) __builtin_ia32_cmpsd ((__v2df)__X, (__v2df)__Y, __P);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmp_ss (__m128 __X, __m128 __Y, const int __P)
{
  return (__m128) __builtin_ia32_cmpss ((__v4sf)__X, (__v4sf)__Y, __P);
}
#else
#define _mm_cmp_pd(X, Y, P)						\
  ((__m128d) __builtin_ia32_cmppd ((__v2df)(__m128d)(X),		\
				   (__v2df)(__m128d)(Y), (int)(P)))

#define _mm_cmp_ps(X, Y, P)						\
  ((__m128) __builtin_ia32_cmpps ((__v4sf)(__m128)(X),			\
				  (__v4sf)(__m128)(Y), (int)(P)))

#define _mm256_cmp_pd(X, Y, P)						\
  ((__m256d) __builtin_ia32_cmppd256 ((__v4df)(__m256d)(X),		\
				      (__v4df)(__m256d)(Y), (int)(P)))

#define _mm256_cmp_ps(X, Y, P)						\
  ((__m256) __builtin_ia32_cmpps256 ((__v8sf)(__m256)(X),		\
				     (__v8sf)(__m256)(Y), (int)(P)))

#define _mm_cmp_sd(X, Y, P)						\
  ((__m128d) __builtin_ia32_cmpsd ((__v2df)(__m128d)(X),		\
				   (__v2df)(__m128d)(Y), (int)(P)))

#define _mm_cmp_ss(X, Y, P)						\
  ((__m128) __builtin_ia32_cmpss ((__v4sf)(__m128)(X),			\
				  (__v4sf)(__m128)(Y), (int)(P)))
#endif

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepi32_pd (__m128i __A)
{
  return (__m256d)__builtin_ia32_cvtdq2pd256 ((__v4si) __A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepi32_ps (__m256i __A)
{
  return (__m256)__builtin_ia32_cvtdq2ps256 ((__v8si) __A);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtpd_ps (__m256d __A)
{
  return (__m128)__builtin_ia32_cvtpd2ps256 ((__v4df) __A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtps_epi32 (__m256 __A)
{
  return (__m256i)__builtin_ia32_cvtps2dq256 ((__v8sf) __A);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtps_pd (__m128 __A)
{
  return (__m256d)__builtin_ia32_cvtps2pd256 ((__v4sf) __A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvttpd_epi32 (__m256d __A)
{
  return (__m128i)__builtin_ia32_cvttpd2dq256 ((__v4df) __A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtpd_epi32 (__m256d __A)
{
  return (__m128i)__builtin_ia32_cvtpd2dq256 ((__v4df) __A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvttps_epi32 (__m256 __A)
{
  return (__m256i)__builtin_ia32_cvttps2dq256 ((__v8sf) __A);
}

#ifdef __OPTIMIZE__
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extractf128_pd (__m256d __X, const int __N)
{
  return (__m128d) __builtin_ia32_vextractf128_pd256 ((__v4df)__X, __N);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extractf128_ps (__m256 __X, const int __N)
{
  return (__m128) __builtin_ia32_vextractf128_ps256 ((__v8sf)__X, __N);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extractf128_si256 (__m256i __X, const int __N)
{
  return (__m128i) __builtin_ia32_vextractf128_si256 ((__v8si)__X, __N);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extract_epi32 (__m256i __X, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 2);
  return _mm_extract_epi32 (__Y, __N % 4);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extract_epi16 (__m256i __X, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 3);
  return _mm_extract_epi16 (__Y, __N % 8);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extract_epi8 (__m256i __X, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 4);
  return _mm_extract_epi8 (__Y, __N % 16);
}

#ifdef __x86_64__
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extract_epi64 (__m256i __X, const int __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 1);
  return _mm_extract_epi64 (__Y, __N % 2);
}
#endif
#else
#define _mm256_extractf128_pd(X, N)					\
  ((__m128d) __builtin_ia32_vextractf128_pd256 ((__v4df)(__m256d)(X),	\
						(int)(N)))

#define _mm256_extractf128_ps(X, N)					\
  ((__m128) __builtin_ia32_vextractf128_ps256 ((__v8sf)(__m256)(X),	\
					       (int)(N)))

#define _mm256_extractf128_si256(X, N)					\
  ((__m128i) __builtin_ia32_vextractf128_si256 ((__v8si)(__m256i)(X),	\
						(int)(N)))

#define _mm256_extract_epi32(X, N)					\
  (__extension__							\
   ({									\
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 2);		\
      _mm_extract_epi32 (__Y, (N) % 4);					\
    }))

#define _mm256_extract_epi16(X, N)					\
  (__extension__							\
   ({									\
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 3);		\
      _mm_extract_epi16 (__Y, (N) % 8);					\
    }))

#define _mm256_extract_epi8(X, N)					\
  (__extension__							\
   ({									\
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 4);		\
      _mm_extract_epi8 (__Y, (N) % 16);					\
    }))

#ifdef __x86_64__
#define _mm256_extract_epi64(X, N)					\
  (__extension__							\
   ({									\
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 1);		\
      _mm_extract_epi64 (__Y, (N) % 2);					\
    }))
#endif
#endif

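/* Usage sketch (illustrative): extracting element 5 of an 8 x 32-bit
   vector goes through the upper 128-bit half; 5 >> 2 selects the
   half and 5 % 4 the element within it (__v is an assumed __m256i):

     int __e = _mm256_extract_epi32 (__v, 5);  */
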
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_zeroall (void)
{
  __builtin_ia32_vzeroall ();
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_zeroupper (void)
{
  __builtin_ia32_vzeroupper ();
}

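/* A note on usage (hedged, not from the Intel specification): mixing
   256-bit AVX code with legacy SSE code can incur state-transition
   penalties on some microarchitectures; executing _mm256_zeroupper ()
   before such a transition is the usual mitigation.  */
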
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_permutevar_pd (__m128d __A, __m128i __C)
{
  return (__m128d) __builtin_ia32_vpermilvarpd ((__v2df)__A,
						(__v2di)__C);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permutevar_pd (__m256d __A, __m256i __C)
{
  return (__m256d) __builtin_ia32_vpermilvarpd256 ((__v4df)__A,
						   (__v4di)__C);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_permutevar_ps (__m128 __A, __m128i __C)
{
  return (__m128) __builtin_ia32_vpermilvarps ((__v4sf)__A,
					       (__v4si)__C);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permutevar_ps (__m256 __A, __m256i __C)
{
  return (__m256) __builtin_ia32_vpermilvarps256 ((__v8sf)__A,
						  (__v8si)__C);
}

#ifdef __OPTIMIZE__
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_permute_pd (__m128d __X, const int __C)
{
  return (__m128d) __builtin_ia32_vpermilpd ((__v2df)__X, __C);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute_pd (__m256d __X, const int __C)
{
  return (__m256d) __builtin_ia32_vpermilpd256 ((__v4df)__X, __C);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_permute_ps (__m128 __X, const int __C)
{
  return (__m128) __builtin_ia32_vpermilps ((__v4sf)__X, __C);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute_ps (__m256 __X, const int __C)
{
  return (__m256) __builtin_ia32_vpermilps256 ((__v8sf)__X, __C);
}
#else
#define _mm_permute_pd(X, C)						\
  ((__m128d) __builtin_ia32_vpermilpd ((__v2df)(__m128d)(X), (int)(C)))

#define _mm256_permute_pd(X, C)						\
  ((__m256d) __builtin_ia32_vpermilpd256 ((__v4df)(__m256d)(X),	(int)(C)))

#define _mm_permute_ps(X, C)						\
  ((__m128) __builtin_ia32_vpermilps ((__v4sf)(__m128)(X), (int)(C)))

#define _mm256_permute_ps(X, C)						\
  ((__m256) __builtin_ia32_vpermilps256 ((__v8sf)(__m256)(X), (int)(C)))
#endif

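/* Usage sketch (illustrative): the control byte of _mm256_permute_ps
   holds four 2-bit element selectors that are applied identically to
   each 128-bit lane; for example, broadcasting element 0 of every
   lane of an assumed __m256 value __x:

     __m256 __r = _mm256_permute_ps (__x, 0x00);  */
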
#ifdef __OPTIMIZE__
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute2f128_pd (__m256d __X, __m256d __Y, const int __C)
{
  return (__m256d) __builtin_ia32_vperm2f128_pd256 ((__v4df)__X,
						    (__v4df)__Y,
						    __C);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute2f128_ps (__m256 __X, __m256 __Y, const int __C)
{
  return (__m256) __builtin_ia32_vperm2f128_ps256 ((__v8sf)__X,
						   (__v8sf)__Y,
						   __C);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute2f128_si256 (__m256i __X, __m256i __Y, const int __C)
{
  return (__m256i) __builtin_ia32_vperm2f128_si256 ((__v8si)__X,
						    (__v8si)__Y,
						    __C);
}
#else
#define _mm256_permute2f128_pd(X, Y, C)					\
  ((__m256d) __builtin_ia32_vperm2f128_pd256 ((__v4df)(__m256d)(X),	\
					      (__v4df)(__m256d)(Y),	\
					      (int)(C)))

#define _mm256_permute2f128_ps(X, Y, C)					\
  ((__m256) __builtin_ia32_vperm2f128_ps256 ((__v8sf)(__m256)(X),	\
					     (__v8sf)(__m256)(Y),	\
					     (int)(C)))

#define _mm256_permute2f128_si256(X, Y, C)				\
  ((__m256i) __builtin_ia32_vperm2f128_si256 ((__v8si)(__m256i)(X),	\
					      (__v8si)(__m256i)(Y),	\
					      (int)(C)))
#endif

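/* Usage sketch (illustrative): each nibble of the control byte picks
   a 128-bit source lane for the corresponding result half (values
   0-1 name the first operand's lanes, 2-3 the second's), and setting
   bit 3 of a nibble zeroes that half instead.  Swapping the halves
   of an assumed __m256d value __x:

     __m256d __r = _mm256_permute2f128_pd (__x, __x, 0x01);  */
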
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_broadcast_ss (float const *__X)
{
  return (__m128) __builtin_ia32_vbroadcastss (__X);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcast_sd (double const *__X)
{
  return (__m256d) __builtin_ia32_vbroadcastsd256 (__X);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcast_ss (float const *__X)
{
  return (__m256) __builtin_ia32_vbroadcastss256 (__X);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcast_pd (__m128d const *__X)
{
  return (__m256d) __builtin_ia32_vbroadcastf128_pd256 (__X);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcast_ps (__m128 const *__X)
{
  return (__m256) __builtin_ia32_vbroadcastf128_ps256 (__X);
}

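/* Usage sketch (illustrative): the broadcasts replicate a memory
   operand, e.g. filling all four doubles from one assumed scalar __d:

     __m256d __v = _mm256_broadcast_sd (&__d);

   _mm256_broadcast_pd and _mm256_broadcast_ps replicate a whole
   128-bit vector into both halves of the result.  */
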
#ifdef __OPTIMIZE__
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insertf128_pd (__m256d __X, __m128d __Y, const int __O)
{
  return (__m256d) __builtin_ia32_vinsertf128_pd256 ((__v4df)__X,
						     (__v2df)__Y,
						     __O);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insertf128_ps (__m256 __X, __m128 __Y, const int __O)
{
  return (__m256) __builtin_ia32_vinsertf128_ps256 ((__v8sf)__X,
						    (__v4sf)__Y,
						    __O);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insertf128_si256 (__m256i __X, __m128i __Y, const int __O)
{
  return (__m256i) __builtin_ia32_vinsertf128_si256 ((__v8si)__X,
						     (__v4si)__Y,
						     __O);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insert_epi32 (__m256i __X, int __D, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 2);
  __Y = _mm_insert_epi32 (__Y, __D, __N % 4);
  return _mm256_insertf128_si256 (__X, __Y, __N >> 2);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insert_epi16 (__m256i __X, int __D, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 3);
  __Y = _mm_insert_epi16 (__Y, __D, __N % 8);
  return _mm256_insertf128_si256 (__X, __Y, __N >> 3);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insert_epi8 (__m256i __X, int __D, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 4);
  __Y = _mm_insert_epi8 (__Y, __D, __N % 16);
  return _mm256_insertf128_si256 (__X, __Y, __N >> 4);
}

#ifdef __x86_64__
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insert_epi64 (__m256i __X, long long __D, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 1);
  __Y = _mm_insert_epi64 (__Y, __D, __N % 2);
  return _mm256_insertf128_si256 (__X, __Y, __N >> 1);
}
#endif
#else
#define _mm256_insertf128_pd(X, Y, O)					\
  ((__m256d) __builtin_ia32_vinsertf128_pd256 ((__v4df)(__m256d)(X),	\
					       (__v2df)(__m128d)(Y),	\
					       (int)(O)))

#define _mm256_insertf128_ps(X, Y, O)					\
  ((__m256) __builtin_ia32_vinsertf128_ps256 ((__v8sf)(__m256)(X),	\
					      (__v4sf)(__m128)(Y),	\
					      (int)(O)))

#define _mm256_insertf128_si256(X, Y, O)				\
  ((__m256i) __builtin_ia32_vinsertf128_si256 ((__v8si)(__m256i)(X),	\
					       (__v4si)(__m128i)(Y),	\
					       (int)(O)))

#define _mm256_insert_epi32(X, D, N)					\
  (__extension__							\
   ({									\
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 2);		\
      __Y = _mm_insert_epi32 (__Y, (D), (N) % 4);			\
      _mm256_insertf128_si256 ((X), __Y, (N) >> 2);			\
    }))

#define _mm256_insert_epi16(X, D, N)					\
  (__extension__							\
   ({									\
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 3);		\
      __Y = _mm_insert_epi16 (__Y, (D), (N) % 8);			\
      _mm256_insertf128_si256 ((X), __Y, (N) >> 3);			\
    }))

#define _mm256_insert_epi8(X, D, N)					\
  (__extension__							\
   ({									\
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 4);		\
      __Y = _mm_insert_epi8 (__Y, (D), (N) % 16);			\
      _mm256_insertf128_si256 ((X), __Y, (N) >> 4);			\
    }))

#ifdef __x86_64__
#define _mm256_insert_epi64(X, D, N)					\
  (__extension__							\
   ({									\
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 1);		\
      __Y = _mm_insert_epi64 (__Y, (D), (N) % 2);			\
      _mm256_insertf128_si256 ((X), __Y, (N) >> 1);			\
    }))
#endif
#endif

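/* Usage sketch (illustrative): the insert intrinsics return a copy
   with one element replaced; the source vector is not modified.
   Replacing element 7 (the last 32-bit element) of an assumed
   __m256i value __v:

     __m256i __w = _mm256_insert_epi32 (__v, 99, 7);  */
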
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_load_pd (double const *__P)
{
  return *(__m256d *)__P;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_store_pd (double *__P, __m256d __A)
{
  *(__m256d *)__P = __A;
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_load_ps (float const *__P)
{
  return *(__m256 *)__P;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_store_ps (float *__P, __m256 __A)
{
  *(__m256 *)__P = __A;
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_loadu_pd (double const *__P)
{
  return *(__m256d_u *)__P;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_storeu_pd (double *__P, __m256d __A)
{
  *(__m256d_u *)__P = __A;
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_loadu_ps (float const *__P)
{
  return *(__m256_u *)__P;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_storeu_ps (float *__P, __m256 __A)
{
  *(__m256_u *)__P = __A;
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_load_si256 (__m256i const *__P)
{
  return *__P;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_store_si256 (__m256i *__P, __m256i __A)
{
  *__P = __A;
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_loadu_si256 (__m256i_u const *__P)
{
  return *__P;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_storeu_si256 (__m256i_u *__P, __m256i __A)
{
  *__P = __A;
}

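/* Usage sketch (illustrative): the load/store intrinsics above
   require 32-byte aligned addresses, while the loadu/storeu forms
   accept any alignment.  With an assumed double array __a of at
   least five elements:

     __m256d __v = _mm256_loadu_pd (__a + 1);

   Using _mm256_load_pd on a misaligned address is undefined and may
   fault at run time.  */
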
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskload_pd (double const *__P, __m128i __M)
{
  return (__m128d) __builtin_ia32_maskloadpd ((const __v2df *)__P,
					      (__v2di)__M);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskstore_pd (double *__P, __m128i __M, __m128d __A)
{
  __builtin_ia32_maskstorepd ((__v2df *)__P, (__v2di)__M, (__v2df)__A);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskload_pd (double const *__P, __m256i __M)
{
  return (__m256d) __builtin_ia32_maskloadpd256 ((const __v4df *)__P,
						 (__v4di)__M);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskstore_pd (double *__P, __m256i __M, __m256d __A)
{
  __builtin_ia32_maskstorepd256 ((__v4df *)__P, (__v4di)__M, (__v4df)__A);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskload_ps (float const *__P, __m128i __M)
{
  return (__m128) __builtin_ia32_maskloadps ((const __v4sf *)__P,
					     (__v4si)__M);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskstore_ps (float *__P, __m128i __M, __m128 __A)
{
  __builtin_ia32_maskstoreps ((__v4sf *)__P, (__v4si)__M, (__v4sf)__A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskload_ps (float const *__P, __m256i __M)
{
  return (__m256) __builtin_ia32_maskloadps256 ((const __v8sf *)__P,
						(__v8si)__M);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskstore_ps (float *__P, __m256i __M, __m256 __A)
{
  __builtin_ia32_maskstoreps256 ((__v8sf *)__P, (__v8si)__M, (__v8sf)__A);
}

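/* Usage sketch (illustrative): the mask loads and stores are
   controlled by the sign bit of each mask element; elements whose
   bit is clear read as zero and are left unwritten on store.
   Loading only the first two doubles from an assumed pointer __p:

     __m256i __m = _mm256_set_epi64x (0, 0, -1, -1);
     __m256d __v = _mm256_maskload_pd (__p, __m);  */
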
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_movehdup_ps (__m256 __X)
{
  return (__m256) __builtin_ia32_movshdup256 ((__v8sf)__X);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_moveldup_ps (__m256 __X)
{
  return (__m256) __builtin_ia32_movsldup256 ((__v8sf)__X);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_movedup_pd (__m256d __X)
{
  return (__m256d) __builtin_ia32_movddup256 ((__v4df)__X);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_lddqu_si256 (__m256i const *__P)
{
  return (__m256i) __builtin_ia32_lddqu256 ((char const *)__P);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_stream_si256 (__m256i *__A, __m256i __B)
{
  __builtin_ia32_movntdq256 ((__v4di *)__A, (__v4di)__B);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_stream_pd (double *__A, __m256d __B)
{
  __builtin_ia32_movntpd256 (__A, (__v4df)__B);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_stream_ps (float *__P, __m256 __A)
{
  __builtin_ia32_movntps256 (__P, (__v8sf)__A);
}

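/* A note on usage (hedged): the stream intrinsics perform
   non-temporal stores that bypass the cache hierarchy; the
   destination must be 32-byte aligned, and a fence such as
   _mm_sfence () is needed before the data is observed by another
   thread.  */
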
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_rcp_ps (__m256 __A)
{
  return (__m256) __builtin_ia32_rcpps256 ((__v8sf)__A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_rsqrt_ps (__m256 __A)
{
  return (__m256) __builtin_ia32_rsqrtps256 ((__v8sf)__A);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sqrt_pd (__m256d __A)
{
  return (__m256d) __builtin_ia32_sqrtpd256 ((__v4df)__A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sqrt_ps (__m256 __A)
{
  return (__m256) __builtin_ia32_sqrtps256 ((__v8sf)__A);
}

#ifdef __OPTIMIZE__
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_round_pd (__m256d __V, const int __M)
{
  return (__m256d) __builtin_ia32_roundpd256 ((__v4df)__V, __M);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_round_ps (__m256 __V, const int __M)
{
  return (__m256) __builtin_ia32_roundps256 ((__v8sf)__V, __M);
}
#else
#define _mm256_round_pd(V, M)						\
  ((__m256d) __builtin_ia32_roundpd256 ((__v4df)(__m256d)(V), (int)(M)))

#define _mm256_round_ps(V, M)						\
  ((__m256) __builtin_ia32_roundps256 ((__v8sf)(__m256)(V), (int)(M)))
#endif

#define _mm256_ceil_pd(V)	_mm256_round_pd ((V), _MM_FROUND_CEIL)
#define _mm256_floor_pd(V)	_mm256_round_pd ((V), _MM_FROUND_FLOOR)
#define _mm256_ceil_ps(V)	_mm256_round_ps ((V), _MM_FROUND_CEIL)
#define _mm256_floor_ps(V)	_mm256_round_ps ((V), _MM_FROUND_FLOOR)

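/* Usage sketch (illustrative): the _MM_FROUND_* constants come from
   <smmintrin.h>; for example, rounding each element of an assumed
   __m256d value __v toward positive infinity:

     __m256d __up = _mm256_ceil_pd (__v);  */
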
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpackhi_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_unpckhpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpacklo_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_unpcklpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpackhi_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_unpckhps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpacklo_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_unpcklps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testz_pd (__m128d __M, __m128d __V)
{
  return __builtin_ia32_vtestzpd ((__v2df)__M, (__v2df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testc_pd (__m128d __M, __m128d __V)
{
  return __builtin_ia32_vtestcpd ((__v2df)__M, (__v2df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testnzc_pd (__m128d __M, __m128d __V)
{
  return __builtin_ia32_vtestnzcpd ((__v2df)__M, (__v2df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testz_ps (__m128 __M, __m128 __V)
{
  return __builtin_ia32_vtestzps ((__v4sf)__M, (__v4sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testc_ps (__m128 __M, __m128 __V)
{
  return __builtin_ia32_vtestcps ((__v4sf)__M, (__v4sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testnzc_ps (__m128 __M, __m128 __V)
{
  return __builtin_ia32_vtestnzcps ((__v4sf)__M, (__v4sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testz_pd (__m256d __M, __m256d __V)
{
  return __builtin_ia32_vtestzpd256 ((__v4df)__M, (__v4df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testc_pd (__m256d __M, __m256d __V)
{
  return __builtin_ia32_vtestcpd256 ((__v4df)__M, (__v4df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testnzc_pd (__m256d __M, __m256d __V)
{
  return __builtin_ia32_vtestnzcpd256 ((__v4df)__M, (__v4df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testz_ps (__m256 __M, __m256 __V)
{
  return __builtin_ia32_vtestzps256 ((__v8sf)__M, (__v8sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testc_ps (__m256 __M, __m256 __V)
{
  return __builtin_ia32_vtestcps256 ((__v8sf)__M, (__v8sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testnzc_ps (__m256 __M, __m256 __V)
{
  return __builtin_ia32_vtestnzcps256 ((__v8sf)__M, (__v8sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testz_si256 (__m256i __M, __m256i __V)
{
  return __builtin_ia32_ptestz256 ((__v4di)__M, (__v4di)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testc_si256 (__m256i __M, __m256i __V)
{
  return __builtin_ia32_ptestc256 ((__v4di)__M, (__v4di)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testnzc_si256 (__m256i __M, __m256i __V)
{
  return __builtin_ia32_ptestnzc256 ((__v4di)__M, (__v4di)__V);
}

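/* Usage sketch (illustrative): _mm256_testz_si256 returns 1 when the
   bitwise AND of its operands is all zeros, and _mm256_testc_si256
   when the AND-NOT is; the _pd/_ps variants test only the sign bits.
   Checking whether an assumed __m256i value __v is all-zero:

     int __is_zero = _mm256_testz_si256 (__v, __v);  */
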
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_movemask_pd (__m256d __A)
{
  return __builtin_ia32_movmskpd256 ((__v4df)__A);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_movemask_ps (__m256 __A)
{
  return __builtin_ia32_movmskps256 ((__v8sf)__A);
}

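/* The self-initializations below intentionally leave the value
   undefined while keeping the compiler from warning about the use of
   an uninitialized variable; callers must not rely on the contents.  */
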
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_undefined_pd (void)
{
  __m256d __Y = __Y;
  return __Y;
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_undefined_ps (void)
{
  __m256 __Y = __Y;
  return __Y;
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_undefined_si256 (void)
{
  __m256i __Y = __Y;
  return __Y;
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setzero_pd (void)
{
  return __extension__ (__m256d){ 0.0, 0.0, 0.0, 0.0 };
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setzero_ps (void)
{
  return __extension__ (__m256){ 0.0, 0.0, 0.0, 0.0,
				 0.0, 0.0, 0.0, 0.0 };
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setzero_si256 (void)
{
  return __extension__ (__m256i)(__v4di){ 0, 0, 0, 0 };
}

/* Create the vector [A B C D].  */
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_pd (double __A, double __B, double __C, double __D)
{
  return __extension__ (__m256d){ __D, __C, __B, __A };
}

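/* Usage sketch (illustrative): the arguments run from the highest
   element down to element 0, so

     __m256d __v = _mm256_set_pd (3.0, 2.0, 1.0, 0.0);

   stores { 0.0, 1.0, 2.0, 3.0 } in memory order.  */
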
/* Create the vector [A B C D E F G H].  */
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_ps (float __A, float __B, float __C, float __D,
	       float __E, float __F, float __G, float __H)
{
  return __extension__ (__m256){ __H, __G, __F, __E,
				 __D, __C, __B, __A };
}

/* Create the vector [A B C D E F G H].  */
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_epi32 (int __A, int __B, int __C, int __D,
		  int __E, int __F, int __G, int __H)
{
  return __extension__ (__m256i)(__v8si){ __H, __G, __F, __E,
					  __D, __C, __B, __A };
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_epi16 (short __q15, short __q14, short __q13, short __q12,
		  short __q11, short __q10, short __q09, short __q08,
		  short __q07, short __q06, short __q05, short __q04,
		  short __q03, short __q02, short __q01, short __q00)
{
  return __extension__ (__m256i)(__v16hi){
    __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07,
    __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15
  };
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_epi8 (char __q31, char __q30, char __q29, char __q28,
		 char __q27, char __q26, char __q25, char __q24,
		 char __q23, char __q22, char __q21, char __q20,
		 char __q19, char __q18, char __q17, char __q16,
		 char __q15, char __q14, char __q13, char __q12,
		 char __q11, char __q10, char __q09, char __q08,
		 char __q07, char __q06, char __q05, char __q04,
		 char __q03, char __q02, char __q01, char __q00)
{
  return __extension__ (__m256i)(__v32qi){
    __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07,
    __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15,
    __q16, __q17, __q18, __q19, __q20, __q21, __q22, __q23,
    __q24, __q25, __q26, __q27, __q28, __q29, __q30, __q31
  };
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_epi64x (long long __A, long long __B, long long __C,
		   long long __D)
{
  return __extension__ (__m256i)(__v4di){ __D, __C, __B, __A };
}

/* Create a vector with all elements equal to A.  */
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_pd (double __A)
{
  return __extension__ (__m256d){ __A, __A, __A, __A };
}

/* Create a vector with all elements equal to A.  */
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_ps (float __A)
{
  return __extension__ (__m256){ __A, __A, __A, __A,
				 __A, __A, __A, __A };
}

/* Create a vector with all elements equal to A.  */
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_epi32 (int __A)
{
  return __extension__ (__m256i)(__v8si){ __A, __A, __A, __A,
					  __A, __A, __A, __A };
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_epi16 (short __A)
{
  return _mm256_set_epi16 (__A, __A, __A, __A, __A, __A, __A, __A,
			   __A, __A, __A, __A, __A, __A, __A, __A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_epi8 (char __A)
{
  return _mm256_set_epi8 (__A, __A, __A, __A, __A, __A, __A, __A,
			  __A, __A, __A, __A, __A, __A, __A, __A,
			  __A, __A, __A, __A, __A, __A, __A, __A,
			  __A, __A, __A, __A, __A, __A, __A, __A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_epi64x (long long __A)
{
  return __extension__ (__m256i)(__v4di){ __A, __A, __A, __A };
}

/* Create vectors of elements in the reversed order from the
   _mm256_set_XXX functions.  */

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_pd (double __A, double __B, double __C, double __D)
{
  return _mm256_set_pd (__D, __C, __B, __A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_ps (float __A, float __B, float __C, float __D,
		float __E, float __F, float __G, float __H)
{
  return _mm256_set_ps (__H, __G, __F, __E, __D, __C, __B, __A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_epi32 (int __A, int __B, int __C, int __D,
		   int __E, int __F, int __G, int __H)
{
  return _mm256_set_epi32 (__H, __G, __F, __E, __D, __C, __B, __A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_epi16 (short __q15, short __q14, short __q13, short __q12,
		   short __q11, short __q10, short __q09, short __q08,
		   short __q07, short __q06, short __q05, short __q04,
		   short __q03, short __q02, short __q01, short __q00)
{
  return _mm256_set_epi16 (__q00, __q01, __q02, __q03,
			   __q04, __q05, __q06, __q07,
			   __q08, __q09, __q10, __q11,
			   __q12, __q13, __q14, __q15);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_epi8 (char __q31, char __q30, char __q29, char __q28,
		  char __q27, char __q26, char __q25, char __q24,
		  char __q23, char __q22, char __q21, char __q20,
		  char __q19, char __q18, char __q17, char __q16,
		  char __q15, char __q14, char __q13, char __q12,
		  char __q11, char __q10, char __q09, char __q08,
		  char __q07, char __q06, char __q05, char __q04,
		  char __q03, char __q02, char __q01, char __q00)
{
  return _mm256_set_epi8 (__q00, __q01, __q02, __q03,
			  __q04, __q05, __q06, __q07,
			  __q08, __q09, __q10, __q11,
			  __q12, __q13, __q14, __q15,
			  __q16, __q17, __q18, __q19,
			  __q20, __q21, __q22, __q23,
			  __q24, __q25, __q26, __q27,
			  __q28, __q29, __q30, __q31);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_epi64x (long long __A, long long __B, long long __C,
		    long long __D)
{
  return _mm256_set_epi64x (__D, __C, __B, __A);
}

/* Casts between various SP, DP, INT vector types.  Note that these
   perform no conversion of values; they just change the type.  */
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castpd_ps (__m256d __A)
{
  return (__m256) __A;
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castpd_si256 (__m256d __A)
{
  return (__m256i) __A;
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castps_pd (__m256 __A)
{
  return (__m256d) __A;
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castps_si256(__m256 __A)
{
  return (__m256i) __A;
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castsi256_ps (__m256i __A)
{
  return (__m256) __A;
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castsi256_pd (__m256i __A)
{
  return (__m256d) __A;
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castpd256_pd128 (__m256d __A)
{
  return (__m128d) __builtin_ia32_pd_pd256 ((__v4df)__A);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castps256_ps128 (__m256 __A)
{
  return (__m128) __builtin_ia32_ps_ps256 ((__v8sf)__A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castsi256_si128 (__m256i __A)
{
  return (__m128i) __builtin_ia32_si_si256 ((__v8si)__A);
}

/* When a cast is done from a 128-bit to a 256-bit type, the low 128
   bits of the 256-bit result contain the source parameter value and
   the upper 128 bits of the result are undefined.  These intrinsics
   shouldn't generate any extra moves.  */

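/* Usage sketch (illustrative): a common idiom is to widen a 128-bit
   value and then fill the undefined upper half explicitly, e.g. with
   assumed __m128d values __x and __y:

     __m256d __v
       = _mm256_insertf128_pd (_mm256_castpd128_pd256 (__x), __y, 1);  */
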
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castpd128_pd256 (__m128d __A)
{
  return (__m256d) __builtin_ia32_pd256_pd ((__v2df)__A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castps128_ps256 (__m128 __A)
{
  return (__m256) __builtin_ia32_ps256_ps ((__v4sf)__A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castsi128_si256 (__m128i __A)
{
  return (__m256i) __builtin_ia32_si256_si ((__v4si)__A);
}

#ifdef __DISABLE_AVX__
#undef __DISABLE_AVX__
#pragma GCC pop_options
#endif /* __DISABLE_AVX__ */

#endif /* _AVXINTRIN_H_INCLUDED */