/* Copyright (C) 2008-2014 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 11.0.  */

#ifndef _IMMINTRIN_H_INCLUDED
# error "Never use <avxintrin.h> directly; include <immintrin.h> instead."
#endif

#ifndef _AVXINTRIN_H_INCLUDED
#define _AVXINTRIN_H_INCLUDED

#ifndef __AVX__
#pragma GCC push_options
#pragma GCC target("avx")
#define __DISABLE_AVX__
#endif /* __AVX__ */

/* Internal data types for implementing the intrinsics.  */
typedef double __v4df __attribute__ ((__vector_size__ (32)));
typedef float __v8sf __attribute__ ((__vector_size__ (32)));
typedef long long __v4di __attribute__ ((__vector_size__ (32)));
typedef int __v8si __attribute__ ((__vector_size__ (32)));
typedef short __v16hi __attribute__ ((__vector_size__ (32)));
typedef char __v32qi __attribute__ ((__vector_size__ (32)));

/* The Intel API is flexible enough that we must allow aliasing with other
   vector types, and their scalar components.  */
typedef float __m256 __attribute__ ((__vector_size__ (32),
                                     __may_alias__));
typedef long long __m256i __attribute__ ((__vector_size__ (32),
                                          __may_alias__));
typedef double __m256d __attribute__ ((__vector_size__ (32),
                                       __may_alias__));

/* Compare predicates for scalar and packed compare intrinsics.  */

/* Equal (ordered, non-signaling)  */
#define _CMP_EQ_OQ	0x00
/* Less-than (ordered, signaling)  */
#define _CMP_LT_OS	0x01
/* Less-than-or-equal (ordered, signaling)  */
#define _CMP_LE_OS	0x02
/* Unordered (non-signaling)  */
#define _CMP_UNORD_Q	0x03
/* Not-equal (unordered, non-signaling)  */
#define _CMP_NEQ_UQ	0x04
/* Not-less-than (unordered, signaling)  */
#define _CMP_NLT_US	0x05
/* Not-less-than-or-equal (unordered, signaling)  */
#define _CMP_NLE_US	0x06
/* Ordered (non-signaling)  */
#define _CMP_ORD_Q	0x07
/* Equal (unordered, non-signaling)  */
#define _CMP_EQ_UQ	0x08
/* Not-greater-than-or-equal (unordered, signaling)  */
#define _CMP_NGE_US	0x09
/* Not-greater-than (unordered, signaling)  */
#define _CMP_NGT_US	0x0a
/* False (ordered, non-signaling)  */
#define _CMP_FALSE_OQ	0x0b
/* Not-equal (ordered, non-signaling)  */
#define _CMP_NEQ_OQ	0x0c
/* Greater-than-or-equal (ordered, signaling)  */
#define _CMP_GE_OS	0x0d
/* Greater-than (ordered, signaling)  */
#define _CMP_GT_OS	0x0e
/* True (unordered, non-signaling)  */
#define _CMP_TRUE_UQ	0x0f
/* Equal (ordered, signaling)  */
#define _CMP_EQ_OS	0x10
/* Less-than (ordered, non-signaling)  */
#define _CMP_LT_OQ	0x11
/* Less-than-or-equal (ordered, non-signaling)  */
#define _CMP_LE_OQ	0x12
/* Unordered (signaling)  */
#define _CMP_UNORD_S	0x13
/* Not-equal (unordered, signaling)  */
#define _CMP_NEQ_US	0x14
/* Not-less-than (unordered, non-signaling)  */
#define _CMP_NLT_UQ	0x15
/* Not-less-than-or-equal (unordered, non-signaling)  */
#define _CMP_NLE_UQ	0x16
/* Ordered (signaling)  */
#define _CMP_ORD_S	0x17
/* Equal (unordered, signaling)  */
#define _CMP_EQ_US	0x18
/* Not-greater-than-or-equal (unordered, non-signaling)  */
#define _CMP_NGE_UQ	0x19
/* Not-greater-than (unordered, non-signaling)  */
#define _CMP_NGT_UQ	0x1a
/* False (ordered, signaling)  */
#define _CMP_FALSE_OS	0x1b
/* Not-equal (ordered, signaling)  */
#define _CMP_NEQ_OS	0x1c
/* Greater-than-or-equal (ordered, non-signaling)  */
#define _CMP_GE_OQ	0x1d
/* Greater-than (ordered, non-signaling)  */
#define _CMP_GT_OQ	0x1e
/* True (unordered, signaling)  */
#define _CMP_TRUE_US	0x1f

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_add_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_addpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_add_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_addps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_addsub_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_addsubpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_addsub_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_addsubps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_and_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_andpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_and_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_andps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_andnot_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_andnpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_andnot_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_andnps256 ((__v8sf)__A, (__v8sf)__B);
}

/* Double/single precision floating point blend instructions - select
   data from 2 sources using constant/variable mask.  */

#ifdef __OPTIMIZE__
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_blend_pd (__m256d __X, __m256d __Y, const int __M)
{
  return (__m256d) __builtin_ia32_blendpd256 ((__v4df)__X,
                                              (__v4df)__Y,
                                              __M);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_blend_ps (__m256 __X, __m256 __Y, const int __M)
{
  return (__m256) __builtin_ia32_blendps256 ((__v8sf)__X,
                                             (__v8sf)__Y,
                                             __M);
}
#else
#define _mm256_blend_pd(X, Y, M) \
  ((__m256d) __builtin_ia32_blendpd256 ((__v4df)(__m256d)(X), \
                                        (__v4df)(__m256d)(Y), (int)(M)))

#define _mm256_blend_ps(X, Y, M) \
  ((__m256) __builtin_ia32_blendps256 ((__v8sf)(__m256)(X), \
                                       (__v8sf)(__m256)(Y), (int)(M)))
#endif

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_blendv_pd (__m256d __X, __m256d __Y, __m256d __M)
{
  return (__m256d) __builtin_ia32_blendvpd256 ((__v4df)__X,
                                               (__v4df)__Y,
                                               (__v4df)__M);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_blendv_ps (__m256 __X, __m256 __Y, __m256 __M)
{
  return (__m256) __builtin_ia32_blendvps256 ((__v8sf)__X,
                                              (__v8sf)__Y,
                                              (__v8sf)__M);
}
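
/* Usage sketch (illustrative only; the variables below are hypothetical,
   not part of this header).  For the immediate forms, bit i of the mask
   selects result element i from __Y when set and from __X when clear;
   the blendv forms make the same per-element choice from the sign bit
   of the corresponding mask element.

     __m256d x = _mm256_set_pd (4.0, 3.0, 2.0, 1.0);
     __m256d y = _mm256_set_pd (40.0, 30.0, 20.0, 10.0);
     __m256d r = _mm256_blend_pd (x, y, 0x5);

   Here r holds { 10.0, 2.0, 30.0, 4.0 } from element 0 upward.  */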

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_div_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_divpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_div_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_divps256 ((__v8sf)__A, (__v8sf)__B);
}

/* Dot product instructions with mask-defined summing and zeroing parts
   of result.  */

#ifdef __OPTIMIZE__
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_dp_ps (__m256 __X, __m256 __Y, const int __M)
{
  return (__m256) __builtin_ia32_dpps256 ((__v8sf)__X,
                                          (__v8sf)__Y,
                                          __M);
}
#else
#define _mm256_dp_ps(X, Y, M) \
  ((__m256) __builtin_ia32_dpps256 ((__v8sf)(__m256)(X), \
                                    (__v8sf)(__m256)(Y), (int)(M)))
#endif
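
/* Usage sketch (illustrative only, hypothetical variables).  The dot
   product operates within each 128-bit lane: the high four mask bits
   choose which element pairs are multiplied and summed, and the low
   four bits choose which result slots receive that sum (the rest are
   zeroed).

     __m256 a = _mm256_set1_ps (1.0f);
     __m256 b = _mm256_set1_ps (2.0f);
     __m256 d = _mm256_dp_ps (a, b, 0xff);

   With mask 0xff every slot of both lanes holds 8.0f, the full
   four-element dot product of each lane.  */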

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hadd_pd (__m256d __X, __m256d __Y)
{
  return (__m256d) __builtin_ia32_haddpd256 ((__v4df)__X, (__v4df)__Y);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hadd_ps (__m256 __X, __m256 __Y)
{
  return (__m256) __builtin_ia32_haddps256 ((__v8sf)__X, (__v8sf)__Y);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hsub_pd (__m256d __X, __m256d __Y)
{
  return (__m256d) __builtin_ia32_hsubpd256 ((__v4df)__X, (__v4df)__Y);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hsub_ps (__m256 __X, __m256 __Y)
{
  return (__m256) __builtin_ia32_hsubps256 ((__v8sf)__X, (__v8sf)__Y);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_max_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_maxpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_max_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_maxps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_min_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_minpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_min_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_minps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mul_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_mulpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mul_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_mulps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_or_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_orpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_or_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_orps256 ((__v8sf)__A, (__v8sf)__B);
}

#ifdef __OPTIMIZE__
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_shuffle_pd (__m256d __A, __m256d __B, const int __mask)
{
  return (__m256d) __builtin_ia32_shufpd256 ((__v4df)__A, (__v4df)__B,
                                             __mask);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_shuffle_ps (__m256 __A, __m256 __B, const int __mask)
{
  return (__m256) __builtin_ia32_shufps256 ((__v8sf)__A, (__v8sf)__B,
                                            __mask);
}
#else
#define _mm256_shuffle_pd(A, B, N) \
  ((__m256d)__builtin_ia32_shufpd256 ((__v4df)(__m256d)(A), \
                                      (__v4df)(__m256d)(B), (int)(N)))

#define _mm256_shuffle_ps(A, B, N) \
  ((__m256) __builtin_ia32_shufps256 ((__v8sf)(__m256)(A), \
                                      (__v8sf)(__m256)(B), (int)(N)))
#endif

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sub_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_subpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sub_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_subps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_xor_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_xorpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_xor_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_xorps256 ((__v8sf)__A, (__v8sf)__B);
}

#ifdef __OPTIMIZE__
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmp_pd (__m128d __X, __m128d __Y, const int __P)
{
  return (__m128d) __builtin_ia32_cmppd ((__v2df)__X, (__v2df)__Y, __P);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmp_ps (__m128 __X, __m128 __Y, const int __P)
{
  return (__m128) __builtin_ia32_cmpps ((__v4sf)__X, (__v4sf)__Y, __P);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmp_pd (__m256d __X, __m256d __Y, const int __P)
{
  return (__m256d) __builtin_ia32_cmppd256 ((__v4df)__X, (__v4df)__Y,
                                            __P);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmp_ps (__m256 __X, __m256 __Y, const int __P)
{
  return (__m256) __builtin_ia32_cmpps256 ((__v8sf)__X, (__v8sf)__Y,
                                           __P);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmp_sd (__m128d __X, __m128d __Y, const int __P)
{
  return (__m128d) __builtin_ia32_cmpsd ((__v2df)__X, (__v2df)__Y, __P);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmp_ss (__m128 __X, __m128 __Y, const int __P)
{
  return (__m128) __builtin_ia32_cmpss ((__v4sf)__X, (__v4sf)__Y, __P);
}
#else
#define _mm_cmp_pd(X, Y, P) \
  ((__m128d) __builtin_ia32_cmppd ((__v2df)(__m128d)(X), \
                                   (__v2df)(__m128d)(Y), (int)(P)))

#define _mm_cmp_ps(X, Y, P) \
  ((__m128) __builtin_ia32_cmpps ((__v4sf)(__m128)(X), \
                                  (__v4sf)(__m128)(Y), (int)(P)))

#define _mm256_cmp_pd(X, Y, P) \
  ((__m256d) __builtin_ia32_cmppd256 ((__v4df)(__m256d)(X), \
                                      (__v4df)(__m256d)(Y), (int)(P)))

#define _mm256_cmp_ps(X, Y, P) \
  ((__m256) __builtin_ia32_cmpps256 ((__v8sf)(__m256)(X), \
                                     (__v8sf)(__m256)(Y), (int)(P)))

#define _mm_cmp_sd(X, Y, P) \
  ((__m128d) __builtin_ia32_cmpsd ((__v2df)(__m128d)(X), \
                                   (__v2df)(__m128d)(Y), (int)(P)))

#define _mm_cmp_ss(X, Y, P) \
  ((__m128) __builtin_ia32_cmpss ((__v4sf)(__m128)(X), \
                                  (__v4sf)(__m128)(Y), (int)(P)))
#endif
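
/* Usage sketch (illustrative only, hypothetical variables).  The _CMP_*
   predicates above select the comparison; each result element is all
   ones when the predicate holds and all zeros otherwise, so the result
   is directly usable as a mask (e.g. with the blendv or movemask
   intrinsics).

     __m256d a = _mm256_set_pd (4.0, 3.0, 2.0, 1.0);
     __m256d b = _mm256_set1_pd (2.5);
     __m256d lt = _mm256_cmp_pd (a, b, _CMP_LT_OQ);
     int bits = _mm256_movemask_pd (lt);

   Elements 0 and 1 (values 1.0 and 2.0) compare less-than, so bits ==
   0x3; with the quiet "OQ" predicate a NaN input compares false without
   signaling an exception.  */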

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepi32_pd (__m128i __A)
{
  return (__m256d)__builtin_ia32_cvtdq2pd256 ((__v4si) __A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepi32_ps (__m256i __A)
{
  return (__m256)__builtin_ia32_cvtdq2ps256 ((__v8si) __A);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtpd_ps (__m256d __A)
{
  return (__m128)__builtin_ia32_cvtpd2ps256 ((__v4df) __A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtps_epi32 (__m256 __A)
{
  return (__m256i)__builtin_ia32_cvtps2dq256 ((__v8sf) __A);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtps_pd (__m128 __A)
{
  return (__m256d)__builtin_ia32_cvtps2pd256 ((__v4sf) __A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvttpd_epi32 (__m256d __A)
{
  return (__m128i)__builtin_ia32_cvttpd2dq256 ((__v4df) __A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtpd_epi32 (__m256d __A)
{
  return (__m128i)__builtin_ia32_cvtpd2dq256 ((__v4df) __A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvttps_epi32 (__m256 __A)
{
  return (__m256i)__builtin_ia32_cvttps2dq256 ((__v8sf) __A);
}

#ifdef __OPTIMIZE__
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extractf128_pd (__m256d __X, const int __N)
{
  return (__m128d) __builtin_ia32_vextractf128_pd256 ((__v4df)__X, __N);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extractf128_ps (__m256 __X, const int __N)
{
  return (__m128) __builtin_ia32_vextractf128_ps256 ((__v8sf)__X, __N);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extractf128_si256 (__m256i __X, const int __N)
{
  return (__m128i) __builtin_ia32_vextractf128_si256 ((__v8si)__X, __N);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extract_epi32 (__m256i __X, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 2);
  return _mm_extract_epi32 (__Y, __N % 4);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extract_epi16 (__m256i __X, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 3);
  return _mm_extract_epi16 (__Y, __N % 8);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extract_epi8 (__m256i __X, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 4);
  return _mm_extract_epi8 (__Y, __N % 16);
}

#ifdef __x86_64__
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extract_epi64 (__m256i __X, const int __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 1);
  return _mm_extract_epi64 (__Y, __N % 2);
}
#endif
#else
#define _mm256_extractf128_pd(X, N) \
  ((__m128d) __builtin_ia32_vextractf128_pd256 ((__v4df)(__m256d)(X), \
                                                (int)(N)))

#define _mm256_extractf128_ps(X, N) \
  ((__m128) __builtin_ia32_vextractf128_ps256 ((__v8sf)(__m256)(X), \
                                               (int)(N)))

#define _mm256_extractf128_si256(X, N) \
  ((__m128i) __builtin_ia32_vextractf128_si256 ((__v8si)(__m256i)(X), \
                                                (int)(N)))

#define _mm256_extract_epi32(X, N) \
  (__extension__ \
    ({ \
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 2); \
      _mm_extract_epi32 (__Y, (N) % 4); \
    }))

#define _mm256_extract_epi16(X, N) \
  (__extension__ \
    ({ \
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 3); \
      _mm_extract_epi16 (__Y, (N) % 8); \
    }))

#define _mm256_extract_epi8(X, N) \
  (__extension__ \
    ({ \
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 4); \
      _mm_extract_epi8 (__Y, (N) % 16); \
    }))

#ifdef __x86_64__
#define _mm256_extract_epi64(X, N) \
  (__extension__ \
    ({ \
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 1); \
      _mm_extract_epi64 (__Y, (N) % 2); \
    }))
#endif
#endif
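
/* Usage sketch (illustrative only, hypothetical variables).  AVX has no
   256-bit element extract, so the epi variants above select the 128-bit
   half with __N >> k and then the element within it with __N % lanes.

     __m256i v = _mm256_set_epi32 (7, 6, 5, 4, 3, 2, 1, 0);
     int e = _mm256_extract_epi32 (v, 5);

   For __N == 5, 5 >> 2 picks the upper half { 4, 5, 6, 7 } and 5 % 4
   picks its element 1, so e == 5.  */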

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_zeroall (void)
{
  __builtin_ia32_vzeroall ();
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_zeroupper (void)
{
  __builtin_ia32_vzeroupper ();
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_permutevar_pd (__m128d __A, __m128i __C)
{
  return (__m128d) __builtin_ia32_vpermilvarpd ((__v2df)__A,
                                                (__v2di)__C);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permutevar_pd (__m256d __A, __m256i __C)
{
  return (__m256d) __builtin_ia32_vpermilvarpd256 ((__v4df)__A,
                                                   (__v4di)__C);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_permutevar_ps (__m128 __A, __m128i __C)
{
  return (__m128) __builtin_ia32_vpermilvarps ((__v4sf)__A,
                                               (__v4si)__C);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permutevar_ps (__m256 __A, __m256i __C)
{
  return (__m256) __builtin_ia32_vpermilvarps256 ((__v8sf)__A,
                                                  (__v8si)__C);
}

#ifdef __OPTIMIZE__
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_permute_pd (__m128d __X, const int __C)
{
  return (__m128d) __builtin_ia32_vpermilpd ((__v2df)__X, __C);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute_pd (__m256d __X, const int __C)
{
  return (__m256d) __builtin_ia32_vpermilpd256 ((__v4df)__X, __C);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_permute_ps (__m128 __X, const int __C)
{
  return (__m128) __builtin_ia32_vpermilps ((__v4sf)__X, __C);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute_ps (__m256 __X, const int __C)
{
  return (__m256) __builtin_ia32_vpermilps256 ((__v8sf)__X, __C);
}
#else
#define _mm_permute_pd(X, C) \
  ((__m128d) __builtin_ia32_vpermilpd ((__v2df)(__m128d)(X), (int)(C)))

#define _mm256_permute_pd(X, C) \
  ((__m256d) __builtin_ia32_vpermilpd256 ((__v4df)(__m256d)(X), (int)(C)))

#define _mm_permute_ps(X, C) \
  ((__m128) __builtin_ia32_vpermilps ((__v4sf)(__m128)(X), (int)(C)))

#define _mm256_permute_ps(X, C) \
  ((__m256) __builtin_ia32_vpermilps256 ((__v8sf)(__m256)(X), (int)(C)))
#endif

#ifdef __OPTIMIZE__
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute2f128_pd (__m256d __X, __m256d __Y, const int __C)
{
  return (__m256d) __builtin_ia32_vperm2f128_pd256 ((__v4df)__X,
                                                    (__v4df)__Y,
                                                    __C);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute2f128_ps (__m256 __X, __m256 __Y, const int __C)
{
  return (__m256) __builtin_ia32_vperm2f128_ps256 ((__v8sf)__X,
                                                   (__v8sf)__Y,
                                                   __C);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute2f128_si256 (__m256i __X, __m256i __Y, const int __C)
{
  return (__m256i) __builtin_ia32_vperm2f128_si256 ((__v8si)__X,
                                                    (__v8si)__Y,
                                                    __C);
}
#else
#define _mm256_permute2f128_pd(X, Y, C) \
  ((__m256d) __builtin_ia32_vperm2f128_pd256 ((__v4df)(__m256d)(X), \
                                              (__v4df)(__m256d)(Y), \
                                              (int)(C)))

#define _mm256_permute2f128_ps(X, Y, C) \
  ((__m256) __builtin_ia32_vperm2f128_ps256 ((__v8sf)(__m256)(X), \
                                             (__v8sf)(__m256)(Y), \
                                             (int)(C)))

#define _mm256_permute2f128_si256(X, Y, C) \
  ((__m256i) __builtin_ia32_vperm2f128_si256 ((__v8si)(__m256i)(X), \
                                              (__v8si)(__m256i)(Y), \
                                              (int)(C)))
#endif
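
/* Usage sketch (illustrative only, hypothetical variables).  The control
   byte selects a whole 128-bit half for each result lane: bits 1:0 pick
   the low result lane (0/1 = low/high half of __X, 2/3 = low/high half
   of __Y), bits 5:4 pick the high result lane, and bits 3 and 7 zero
   the respective lane instead.

     __m256d x = _mm256_set_pd (4.0, 3.0, 2.0, 1.0);
     __m256d y = _mm256_set_pd (40.0, 30.0, 20.0, 10.0);
     __m256d r = _mm256_permute2f128_pd (x, y, 0x20);

   Control 0x20 combines the low half of x with the low half of y, so r
   holds { 1.0, 2.0, 10.0, 20.0 }.  */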

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_broadcast_ss (float const *__X)
{
  return (__m128) __builtin_ia32_vbroadcastss (__X);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcast_sd (double const *__X)
{
  return (__m256d) __builtin_ia32_vbroadcastsd256 (__X);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcast_ss (float const *__X)
{
  return (__m256) __builtin_ia32_vbroadcastss256 (__X);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcast_pd (__m128d const *__X)
{
  return (__m256d) __builtin_ia32_vbroadcastf128_pd256 (__X);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcast_ps (__m128 const *__X)
{
  return (__m256) __builtin_ia32_vbroadcastf128_ps256 (__X);
}

#ifdef __OPTIMIZE__
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insertf128_pd (__m256d __X, __m128d __Y, const int __O)
{
  return (__m256d) __builtin_ia32_vinsertf128_pd256 ((__v4df)__X,
                                                     (__v2df)__Y,
                                                     __O);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insertf128_ps (__m256 __X, __m128 __Y, const int __O)
{
  return (__m256) __builtin_ia32_vinsertf128_ps256 ((__v8sf)__X,
                                                    (__v4sf)__Y,
                                                    __O);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insertf128_si256 (__m256i __X, __m128i __Y, const int __O)
{
  return (__m256i) __builtin_ia32_vinsertf128_si256 ((__v8si)__X,
                                                     (__v4si)__Y,
                                                     __O);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insert_epi32 (__m256i __X, int __D, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 2);
  __Y = _mm_insert_epi32 (__Y, __D, __N % 4);
  return _mm256_insertf128_si256 (__X, __Y, __N >> 2);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insert_epi16 (__m256i __X, int __D, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 3);
  __Y = _mm_insert_epi16 (__Y, __D, __N % 8);
  return _mm256_insertf128_si256 (__X, __Y, __N >> 3);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insert_epi8 (__m256i __X, int __D, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 4);
  __Y = _mm_insert_epi8 (__Y, __D, __N % 16);
  return _mm256_insertf128_si256 (__X, __Y, __N >> 4);
}

#ifdef __x86_64__
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insert_epi64 (__m256i __X, long long __D, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 1);
  __Y = _mm_insert_epi64 (__Y, __D, __N % 2);
  return _mm256_insertf128_si256 (__X, __Y, __N >> 1);
}
#endif
#else
#define _mm256_insertf128_pd(X, Y, O) \
  ((__m256d) __builtin_ia32_vinsertf128_pd256 ((__v4df)(__m256d)(X), \
                                               (__v2df)(__m128d)(Y), \
                                               (int)(O)))

#define _mm256_insertf128_ps(X, Y, O) \
  ((__m256) __builtin_ia32_vinsertf128_ps256 ((__v8sf)(__m256)(X), \
                                              (__v4sf)(__m128)(Y), \
                                              (int)(O)))

#define _mm256_insertf128_si256(X, Y, O) \
  ((__m256i) __builtin_ia32_vinsertf128_si256 ((__v8si)(__m256i)(X), \
                                               (__v4si)(__m128i)(Y), \
                                               (int)(O)))

#define _mm256_insert_epi32(X, D, N) \
  (__extension__ \
    ({ \
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 2); \
      __Y = _mm_insert_epi32 (__Y, (D), (N) % 4); \
      _mm256_insertf128_si256 ((X), __Y, (N) >> 2); \
    }))

#define _mm256_insert_epi16(X, D, N) \
  (__extension__ \
    ({ \
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 3); \
      __Y = _mm_insert_epi16 (__Y, (D), (N) % 8); \
      _mm256_insertf128_si256 ((X), __Y, (N) >> 3); \
    }))

#define _mm256_insert_epi8(X, D, N) \
  (__extension__ \
    ({ \
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 4); \
      __Y = _mm_insert_epi8 (__Y, (D), (N) % 16); \
      _mm256_insertf128_si256 ((X), __Y, (N) >> 4); \
    }))

#ifdef __x86_64__
#define _mm256_insert_epi64(X, D, N) \
  (__extension__ \
    ({ \
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 1); \
      __Y = _mm_insert_epi64 (__Y, (D), (N) % 2); \
      _mm256_insertf128_si256 ((X), __Y, (N) >> 1); \
    }))
#endif
#endif
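
/* Usage sketch (illustrative only, hypothetical variables).  The epi
   insert variants mirror the extracts: pull out the affected 128-bit
   half, patch one element there, and write the half back.

     __m256i v = _mm256_setzero_si256 ();
     v = _mm256_insert_epi32 (v, 42, 6);

   Index 6 lands in the upper half (6 >> 2 == 1) at slot 2 (6 % 4), so
   only element 6 of v becomes 42.  */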

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_load_pd (double const *__P)
{
  return *(__m256d *)__P;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_store_pd (double *__P, __m256d __A)
{
  *(__m256d *)__P = __A;
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_load_ps (float const *__P)
{
  return *(__m256 *)__P;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_store_ps (float *__P, __m256 __A)
{
  *(__m256 *)__P = __A;
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_loadu_pd (double const *__P)
{
  return (__m256d) __builtin_ia32_loadupd256 (__P);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_storeu_pd (double *__P, __m256d __A)
{
  __builtin_ia32_storeupd256 (__P, (__v4df)__A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_loadu_ps (float const *__P)
{
  return (__m256) __builtin_ia32_loadups256 (__P);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_storeu_ps (float *__P, __m256 __A)
{
  __builtin_ia32_storeups256 (__P, (__v8sf)__A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_load_si256 (__m256i const *__P)
{
  return *__P;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_store_si256 (__m256i *__P, __m256i __A)
{
  *__P = __A;
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_loadu_si256 (__m256i const *__P)
{
  return (__m256i) __builtin_ia32_loaddqu256 ((char const *)__P);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_storeu_si256 (__m256i *__P, __m256i __A)
{
  __builtin_ia32_storedqu256 ((char *)__P, (__v32qi)__A);
}
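
/* Usage sketch (illustrative only, hypothetical variables).  The plain
   load/store forms dereference a 32-byte vector type and therefore
   require 32-byte alignment (an unaligned address typically faults);
   the loadu/storeu forms accept any alignment.

     double in[4] __attribute__ ((aligned (32))) = { 1.0, 2.0, 3.0, 4.0 };
     double out[4];
     __m256d v = _mm256_load_pd (in);
     _mm256_storeu_pd (out, v);
*/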

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskload_pd (double const *__P, __m128i __M)
{
  return (__m128d) __builtin_ia32_maskloadpd ((const __v2df *)__P,
                                              (__v2di)__M);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskstore_pd (double *__P, __m128i __M, __m128d __A)
{
  __builtin_ia32_maskstorepd ((__v2df *)__P, (__v2di)__M, (__v2df)__A);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskload_pd (double const *__P, __m256i __M)
{
  return (__m256d) __builtin_ia32_maskloadpd256 ((const __v4df *)__P,
                                                 (__v4di)__M);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskstore_pd (double *__P, __m256i __M, __m256d __A)
{
  __builtin_ia32_maskstorepd256 ((__v4df *)__P, (__v4di)__M, (__v4df)__A);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskload_ps (float const *__P, __m128i __M)
{
  return (__m128) __builtin_ia32_maskloadps ((const __v4sf *)__P,
                                             (__v4si)__M);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskstore_ps (float *__P, __m128i __M, __m128 __A)
{
  __builtin_ia32_maskstoreps ((__v4sf *)__P, (__v4si)__M, (__v4sf)__A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskload_ps (float const *__P, __m256i __M)
{
  return (__m256) __builtin_ia32_maskloadps256 ((const __v8sf *)__P,
                                                (__v8si)__M);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskstore_ps (float *__P, __m256i __M, __m256 __A)
{
  __builtin_ia32_maskstoreps256 ((__v8sf *)__P, (__v8si)__M, (__v8sf)__A);
}
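
/* Usage sketch (illustrative only, hypothetical variables).  Only
   elements whose mask element has the sign (top) bit set participate:
   masked-off load elements read as zero, and masked-off store elements
   leave memory untouched, so inactive elements do not fault.

     double src[4] = { 1.0, 2.0, 3.0, 4.0 };
     __m256i m = _mm256_set_epi64x (0, -1, 0, -1);
     __m256d v = _mm256_maskload_pd (src, m);

   The mask enables elements 0 and 2, so v holds { 1.0, 0.0, 3.0, 0.0 }.  */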

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_movehdup_ps (__m256 __X)
{
  return (__m256) __builtin_ia32_movshdup256 ((__v8sf)__X);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_moveldup_ps (__m256 __X)
{
  return (__m256) __builtin_ia32_movsldup256 ((__v8sf)__X);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_movedup_pd (__m256d __X)
{
  return (__m256d) __builtin_ia32_movddup256 ((__v4df)__X);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_lddqu_si256 (__m256i const *__P)
{
  return (__m256i) __builtin_ia32_lddqu256 ((char const *)__P);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_stream_si256 (__m256i *__A, __m256i __B)
{
  __builtin_ia32_movntdq256 ((__v4di *)__A, (__v4di)__B);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_stream_pd (double *__A, __m256d __B)
{
  __builtin_ia32_movntpd256 (__A, (__v4df)__B);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_stream_ps (float *__P, __m256 __A)
{
  __builtin_ia32_movntps256 (__P, (__v8sf)__A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_rcp_ps (__m256 __A)
{
  return (__m256) __builtin_ia32_rcpps256 ((__v8sf)__A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_rsqrt_ps (__m256 __A)
{
  return (__m256) __builtin_ia32_rsqrtps256 ((__v8sf)__A);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sqrt_pd (__m256d __A)
{
  return (__m256d) __builtin_ia32_sqrtpd256 ((__v4df)__A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sqrt_ps (__m256 __A)
{
  return (__m256) __builtin_ia32_sqrtps256 ((__v8sf)__A);
}

#ifdef __OPTIMIZE__
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_round_pd (__m256d __V, const int __M)
{
  return (__m256d) __builtin_ia32_roundpd256 ((__v4df)__V, __M);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_round_ps (__m256 __V, const int __M)
{
  return (__m256) __builtin_ia32_roundps256 ((__v8sf)__V, __M);
}
#else
#define _mm256_round_pd(V, M) \
  ((__m256d) __builtin_ia32_roundpd256 ((__v4df)(__m256d)(V), (int)(M)))

#define _mm256_round_ps(V, M) \
  ((__m256) __builtin_ia32_roundps256 ((__v8sf)(__m256)(V), (int)(M)))
#endif

#define _mm256_ceil_pd(V)	_mm256_round_pd ((V), _MM_FROUND_CEIL)
#define _mm256_floor_pd(V)	_mm256_round_pd ((V), _MM_FROUND_FLOOR)
#define _mm256_ceil_ps(V)	_mm256_round_ps ((V), _MM_FROUND_CEIL)
#define _mm256_floor_ps(V)	_mm256_round_ps ((V), _MM_FROUND_FLOOR)
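
/* Usage sketch (illustrative only, hypothetical variables).  The
   rounding-control argument takes the _MM_FROUND_* values from
   <smmintrin.h>; the ceil/floor macros above cover the common cases.

     __m256d v = _mm256_set1_pd (1.5);
     __m256d up = _mm256_ceil_pd (v);
     __m256d ne = _mm256_round_pd (v, _MM_FROUND_TO_NEAREST_INT
                                      | _MM_FROUND_NO_EXC);

   Here every element of up is 2.0, and round-to-nearest-even also
   yields 2.0 for 1.5.  */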

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpackhi_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_unpckhpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpacklo_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_unpcklpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpackhi_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_unpckhps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpacklo_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_unpcklps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testz_pd (__m128d __M, __m128d __V)
{
  return __builtin_ia32_vtestzpd ((__v2df)__M, (__v2df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testc_pd (__m128d __M, __m128d __V)
{
  return __builtin_ia32_vtestcpd ((__v2df)__M, (__v2df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testnzc_pd (__m128d __M, __m128d __V)
{
  return __builtin_ia32_vtestnzcpd ((__v2df)__M, (__v2df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testz_ps (__m128 __M, __m128 __V)
{
  return __builtin_ia32_vtestzps ((__v4sf)__M, (__v4sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testc_ps (__m128 __M, __m128 __V)
{
  return __builtin_ia32_vtestcps ((__v4sf)__M, (__v4sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testnzc_ps (__m128 __M, __m128 __V)
{
  return __builtin_ia32_vtestnzcps ((__v4sf)__M, (__v4sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testz_pd (__m256d __M, __m256d __V)
{
  return __builtin_ia32_vtestzpd256 ((__v4df)__M, (__v4df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testc_pd (__m256d __M, __m256d __V)
{
  return __builtin_ia32_vtestcpd256 ((__v4df)__M, (__v4df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testnzc_pd (__m256d __M, __m256d __V)
{
  return __builtin_ia32_vtestnzcpd256 ((__v4df)__M, (__v4df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testz_ps (__m256 __M, __m256 __V)
{
  return __builtin_ia32_vtestzps256 ((__v8sf)__M, (__v8sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testc_ps (__m256 __M, __m256 __V)
{
  return __builtin_ia32_vtestcps256 ((__v8sf)__M, (__v8sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testnzc_ps (__m256 __M, __m256 __V)
{
  return __builtin_ia32_vtestnzcps256 ((__v8sf)__M, (__v8sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testz_si256 (__m256i __M, __m256i __V)
{
  return __builtin_ia32_ptestz256 ((__v4di)__M, (__v4di)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testc_si256 (__m256i __M, __m256i __V)
{
  return __builtin_ia32_ptestc256 ((__v4di)__M, (__v4di)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testnzc_si256 (__m256i __M, __m256i __V)
{
  return __builtin_ia32_ptestnzc256 ((__v4di)__M, (__v4di)__V);
}
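
/* Usage sketch (illustrative only, hypothetical variables).  The test
   intrinsics set flags from an AND: testz returns 1 when __M AND __V is
   all zero, testc returns 1 when (NOT __M) AND __V is all zero, and
   testnzc returns 1 when neither holds.  The pd/ps forms look only at
   the sign bit of each element.

     __m256i zero = _mm256_setzero_si256 ();
     __m256i ones = _mm256_set1_epi32 (-1);
     int z = _mm256_testz_si256 (zero, ones);

   Since zero AND ones is all zero, z == 1.  */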

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_movemask_pd (__m256d __A)
{
  return __builtin_ia32_movmskpd256 ((__v4df)__A);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_movemask_ps (__m256 __A)
{
  return __builtin_ia32_movmskps256 ((__v8sf)__A);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_undefined_pd (void)
{
  __m256d __Y = __Y;
  return __Y;
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_undefined_ps (void)
{
  __m256 __Y = __Y;
  return __Y;
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_undefined_si256 (void)
{
  __m256i __Y = __Y;
  return __Y;
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setzero_pd (void)
{
  return __extension__ (__m256d){ 0.0, 0.0, 0.0, 0.0 };
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setzero_ps (void)
{
  return __extension__ (__m256){ 0.0, 0.0, 0.0, 0.0,
                                 0.0, 0.0, 0.0, 0.0 };
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setzero_si256 (void)
{
  return __extension__ (__m256i)(__v4di){ 0, 0, 0, 0 };
}

/* Create the vector [A B C D].  */
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_pd (double __A, double __B, double __C, double __D)
{
  return __extension__ (__m256d){ __D, __C, __B, __A };
}

/* Create the vector [A B C D E F G H].  */
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_ps (float __A, float __B, float __C, float __D,
               float __E, float __F, float __G, float __H)
{
  return __extension__ (__m256){ __H, __G, __F, __E,
                                 __D, __C, __B, __A };
}

/* Create the vector [A B C D E F G H].  */
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_epi32 (int __A, int __B, int __C, int __D,
                  int __E, int __F, int __G, int __H)
{
  return __extension__ (__m256i)(__v8si){ __H, __G, __F, __E,
                                          __D, __C, __B, __A };
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_epi16 (short __q15, short __q14, short __q13, short __q12,
                  short __q11, short __q10, short __q09, short __q08,
                  short __q07, short __q06, short __q05, short __q04,
                  short __q03, short __q02, short __q01, short __q00)
{
  return __extension__ (__m256i)(__v16hi){
    __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07,
    __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15
  };
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_epi8 (char __q31, char __q30, char __q29, char __q28,
                 char __q27, char __q26, char __q25, char __q24,
                 char __q23, char __q22, char __q21, char __q20,
                 char __q19, char __q18, char __q17, char __q16,
                 char __q15, char __q14, char __q13, char __q12,
                 char __q11, char __q10, char __q09, char __q08,
                 char __q07, char __q06, char __q05, char __q04,
                 char __q03, char __q02, char __q01, char __q00)
{
  return __extension__ (__m256i)(__v32qi){
    __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07,
    __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15,
    __q16, __q17, __q18, __q19, __q20, __q21, __q22, __q23,
    __q24, __q25, __q26, __q27, __q28, __q29, __q30, __q31
  };
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_epi64x (long long __A, long long __B, long long __C,
                   long long __D)
{
  return __extension__ (__m256i)(__v4di){ __D, __C, __B, __A };
}

/* Create a vector with all elements equal to A.  */
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_pd (double __A)
{
  return __extension__ (__m256d){ __A, __A, __A, __A };
}

/* Create a vector with all elements equal to A.  */
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_ps (float __A)
{
  return __extension__ (__m256){ __A, __A, __A, __A,
                                 __A, __A, __A, __A };
}

/* Create a vector with all elements equal to A.  */
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_epi32 (int __A)
{
  return __extension__ (__m256i)(__v8si){ __A, __A, __A, __A,
                                          __A, __A, __A, __A };
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_epi16 (short __A)
{
  return _mm256_set_epi16 (__A, __A, __A, __A, __A, __A, __A, __A,
                           __A, __A, __A, __A, __A, __A, __A, __A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_epi8 (char __A)
{
  return _mm256_set_epi8 (__A, __A, __A, __A, __A, __A, __A, __A,
                          __A, __A, __A, __A, __A, __A, __A, __A,
                          __A, __A, __A, __A, __A, __A, __A, __A,
                          __A, __A, __A, __A, __A, __A, __A, __A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_epi64x (long long __A)
{
  return __extension__ (__m256i)(__v4di){ __A, __A, __A, __A };
}

/* Create vectors of elements in the reversed order from the
   _mm256_set_XXX functions.  */

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_pd (double __A, double __B, double __C, double __D)
{
  return _mm256_set_pd (__D, __C, __B, __A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_ps (float __A, float __B, float __C, float __D,
                float __E, float __F, float __G, float __H)
{
  return _mm256_set_ps (__H, __G, __F, __E, __D, __C, __B, __A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_epi32 (int __A, int __B, int __C, int __D,
                   int __E, int __F, int __G, int __H)
{
  return _mm256_set_epi32 (__H, __G, __F, __E, __D, __C, __B, __A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_epi16 (short __q15, short __q14, short __q13, short __q12,
                   short __q11, short __q10, short __q09, short __q08,
                   short __q07, short __q06, short __q05, short __q04,
                   short __q03, short __q02, short __q01, short __q00)
{
  return _mm256_set_epi16 (__q00, __q01, __q02, __q03,
                           __q04, __q05, __q06, __q07,
                           __q08, __q09, __q10, __q11,
                           __q12, __q13, __q14, __q15);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_epi8 (char __q31, char __q30, char __q29, char __q28,
                  char __q27, char __q26, char __q25, char __q24,
                  char __q23, char __q22, char __q21, char __q20,
                  char __q19, char __q18, char __q17, char __q16,
                  char __q15, char __q14, char __q13, char __q12,
                  char __q11, char __q10, char __q09, char __q08,
                  char __q07, char __q06, char __q05, char __q04,
                  char __q03, char __q02, char __q01, char __q00)
{
  return _mm256_set_epi8 (__q00, __q01, __q02, __q03,
                          __q04, __q05, __q06, __q07,
                          __q08, __q09, __q10, __q11,
                          __q12, __q13, __q14, __q15,
                          __q16, __q17, __q18, __q19,
                          __q20, __q21, __q22, __q23,
                          __q24, __q25, __q26, __q27,
                          __q28, __q29, __q30, __q31);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_epi64x (long long __A, long long __B, long long __C,
                    long long __D)
{
  return _mm256_set_epi64x (__D, __C, __B, __A);
}
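
/* Usage sketch (illustrative only, hypothetical variables).  The set
   forms take arguments from the highest element down to element 0,
   while the setr forms take them in memory (element 0 first) order, so
   the two calls below build the same vector.

     __m256d a = _mm256_set_pd  (4.0, 3.0, 2.0, 1.0);
     __m256d b = _mm256_setr_pd (1.0, 2.0, 3.0, 4.0);

   Both hold { 1.0, 2.0, 3.0, 4.0 } from element 0 upward.  */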

/* Casts between various SP, DP, INT vector types.  Note that these do no
   conversion of values, they just change the type.  */
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castpd_ps (__m256d __A)
{
  return (__m256) __A;
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castpd_si256 (__m256d __A)
{
  return (__m256i) __A;
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castps_pd (__m256 __A)
{
  return (__m256d) __A;
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castps_si256 (__m256 __A)
{
  return (__m256i) __A;
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castsi256_ps (__m256i __A)
{
  return (__m256) __A;
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castsi256_pd (__m256i __A)
{
  return (__m256d) __A;
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castpd256_pd128 (__m256d __A)
{
  return (__m128d) __builtin_ia32_pd_pd256 ((__v4df)__A);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castps256_ps128 (__m256 __A)
{
  return (__m128) __builtin_ia32_ps_ps256 ((__v8sf)__A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castsi256_si128 (__m256i __A)
{
  return (__m128i) __builtin_ia32_si_si256 ((__v8si)__A);
}

/* When a cast is done from a 128-bit to a 256-bit type, the low 128 bits
   of the 256-bit result contain the value of the source parameter and the
   upper 128 bits of the result are undefined.  Those intrinsics shouldn't
   generate any extra moves.  */

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castpd128_pd256 (__m128d __A)
{
  return (__m256d) __builtin_ia32_pd256_pd ((__v2df)__A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castps128_ps256 (__m128 __A)
{
  return (__m256) __builtin_ia32_ps256_ps ((__v4sf)__A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castsi128_si256 (__m128i __A)
{
  return (__m256i) __builtin_ia32_si256_si ((__v4si)__A);
}
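
/* Usage sketch (illustrative only, hypothetical variables).  The casts
   just retype the same bits and should compile to no instructions; the
   256-to-128 forms keep the low half, and the 128-to-256 forms leave
   the upper half undefined (zero or insert it explicitly if its value
   matters).

     __m256d d    = _mm256_set1_pd (1.0);
     __m256i bits = _mm256_castpd_si256 (d);
     __m128d lo   = _mm256_castpd256_pd128 (d);
*/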

#ifdef __DISABLE_AVX__
#undef __DISABLE_AVX__
#pragma GCC pop_options
#endif /* __DISABLE_AVX__ */

#endif /* _AVXINTRIN_H_INCLUDED */