/* Copyright (C) 2008-2013 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 11.0.  */
#ifndef _IMMINTRIN_H_INCLUDED
# error "Never use <avxintrin.h> directly; include <immintrin.h> instead."
#endif

#ifndef _AVXINTRIN_H_INCLUDED
#define _AVXINTRIN_H_INCLUDED

#ifndef __AVX__
#pragma GCC push_options
#pragma GCC target("avx")
#define __DISABLE_AVX__
#endif /* __AVX__ */
/* Internal data types for implementing the intrinsics.  */
typedef double __v4df __attribute__ ((__vector_size__ (32)));
typedef float __v8sf __attribute__ ((__vector_size__ (32)));
typedef long long __v4di __attribute__ ((__vector_size__ (32)));
typedef int __v8si __attribute__ ((__vector_size__ (32)));
typedef short __v16hi __attribute__ ((__vector_size__ (32)));
typedef char __v32qi __attribute__ ((__vector_size__ (32)));
/* The Intel API is flexible enough that we must allow aliasing with other
   vector types, and their scalar components.  */
typedef float __m256 __attribute__ ((__vector_size__ (32),
				     __may_alias__));
typedef long long __m256i __attribute__ ((__vector_size__ (32),
					  __may_alias__));
typedef double __m256d __attribute__ ((__vector_size__ (32),
				       __may_alias__));
/* Compare predicates for scalar and packed compare intrinsics.  */

/* Equal (ordered, non-signaling)  */
#define _CMP_EQ_OQ	0x00
/* Less-than (ordered, signaling)  */
#define _CMP_LT_OS	0x01
/* Less-than-or-equal (ordered, signaling)  */
#define _CMP_LE_OS	0x02
/* Unordered (non-signaling)  */
#define _CMP_UNORD_Q	0x03
/* Not-equal (unordered, non-signaling)  */
#define _CMP_NEQ_UQ	0x04
/* Not-less-than (unordered, signaling)  */
#define _CMP_NLT_US	0x05
/* Not-less-than-or-equal (unordered, signaling)  */
#define _CMP_NLE_US	0x06
/* Ordered (non-signaling)  */
#define _CMP_ORD_Q	0x07
/* Equal (unordered, non-signaling)  */
#define _CMP_EQ_UQ	0x08
/* Not-greater-than-or-equal (unordered, signaling)  */
#define _CMP_NGE_US	0x09
/* Not-greater-than (unordered, signaling)  */
#define _CMP_NGT_US	0x0a
/* False (ordered, non-signaling)  */
#define _CMP_FALSE_OQ	0x0b
/* Not-equal (ordered, non-signaling)  */
#define _CMP_NEQ_OQ	0x0c
/* Greater-than-or-equal (ordered, signaling)  */
#define _CMP_GE_OS	0x0d
/* Greater-than (ordered, signaling)  */
#define _CMP_GT_OS	0x0e
/* True (unordered, non-signaling)  */
#define _CMP_TRUE_UQ	0x0f
/* Equal (ordered, signaling)  */
#define _CMP_EQ_OS	0x10
/* Less-than (ordered, non-signaling)  */
#define _CMP_LT_OQ	0x11
/* Less-than-or-equal (ordered, non-signaling)  */
#define _CMP_LE_OQ	0x12
/* Unordered (signaling)  */
#define _CMP_UNORD_S	0x13
/* Not-equal (unordered, signaling)  */
#define _CMP_NEQ_US	0x14
/* Not-less-than (unordered, non-signaling)  */
#define _CMP_NLT_UQ	0x15
/* Not-less-than-or-equal (unordered, non-signaling)  */
#define _CMP_NLE_UQ	0x16
/* Ordered (signaling)  */
#define _CMP_ORD_S	0x17
/* Equal (unordered, signaling)  */
#define _CMP_EQ_US	0x18
/* Not-greater-than-or-equal (unordered, non-signaling)  */
#define _CMP_NGE_UQ	0x19
/* Not-greater-than (unordered, non-signaling)  */
#define _CMP_NGT_UQ	0x1a
/* False (ordered, signaling)  */
#define _CMP_FALSE_OS	0x1b
/* Not-equal (ordered, signaling)  */
#define _CMP_NEQ_OS	0x1c
/* Greater-than-or-equal (ordered, non-signaling)  */
#define _CMP_GE_OQ	0x1d
/* Greater-than (ordered, non-signaling)  */
#define _CMP_GT_OQ	0x1e
/* True (unordered, signaling)  */
#define _CMP_TRUE_US	0x1f
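
/* Illustrative sketch (not part of the original header): the predicates
   above are passed as the immediate argument of the cmp intrinsics
   defined later in this file.  Assuming two __m256d values __x and __y,
   testing whether any lane of __x is less-than the corresponding lane
   of __y could look like:

     __m256d __mask = _mm256_cmp_pd (__x, __y, _CMP_LT_OQ);
     int __any_lt = _mm256_movemask_pd (__mask) != 0;
*/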
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_add_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_addpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_add_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_addps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_addsub_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_addsubpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_addsub_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_addsubps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_and_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_andpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_and_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_andps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_andnot_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_andnpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_andnot_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_andnps256 ((__v8sf)__A, (__v8sf)__B);
}
/* Double/single precision floating point blend instructions - select
   data from 2 sources using constant/variable mask.  */

#ifdef __OPTIMIZE__
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_blend_pd (__m256d __X, __m256d __Y, const int __M)
{
  return (__m256d) __builtin_ia32_blendpd256 ((__v4df)__X,
					      (__v4df)__Y,
					      __M);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_blend_ps (__m256 __X, __m256 __Y, const int __M)
{
  return (__m256) __builtin_ia32_blendps256 ((__v8sf)__X,
					     (__v8sf)__Y,
					     __M);
}
#else
#define _mm256_blend_pd(X, Y, M)					\
  ((__m256d) __builtin_ia32_blendpd256 ((__v4df)(__m256d)(X),		\
					(__v4df)(__m256d)(Y), (int)(M)))

#define _mm256_blend_ps(X, Y, M)					\
  ((__m256) __builtin_ia32_blendps256 ((__v8sf)(__m256)(X),		\
				       (__v8sf)(__m256)(Y), (int)(M)))
#endif
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_blendv_pd (__m256d __X, __m256d __Y, __m256d __M)
{
  return (__m256d) __builtin_ia32_blendvpd256 ((__v4df)__X,
					       (__v4df)__Y,
					       (__v4df)__M);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_blendv_ps (__m256 __X, __m256 __Y, __m256 __M)
{
  return (__m256) __builtin_ia32_blendvps256 ((__v8sf)__X,
					      (__v8sf)__Y,
					      (__v8sf)__M);
}
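
/* Illustrative sketch (not part of the original header): for the
   constant-mask forms, mask bit i set selects element i of the second
   source, clear selects the first.  So with hypothetical __m256d values
   __x and __y, mask 0x5 (binary 0101) takes elements 0 and 2 from __y
   and elements 1 and 3 from __x:

     __m256d __r = _mm256_blend_pd (__x, __y, 0x5);
*/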
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_div_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_divpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_div_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_divps256 ((__v8sf)__A, (__v8sf)__B);
}
/* Dot product instructions with mask-defined summing and zeroing parts
   of result.  */

#ifdef __OPTIMIZE__
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_dp_ps (__m256 __X, __m256 __Y, const int __M)
{
  return (__m256) __builtin_ia32_dpps256 ((__v8sf)__X,
					  (__v8sf)__Y,
					  __M);
}
#else
#define _mm256_dp_ps(X, Y, M)						\
  ((__m256) __builtin_ia32_dpps256 ((__v8sf)(__m256)(X),		\
				    (__v8sf)(__m256)(Y), (int)(M)))
#endif
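
/* Illustrative sketch (not part of the original header): the high
   nibble of the mask selects which element products are summed and the
   low nibble selects which result elements receive the sum, applied
   independently within each 128-bit lane.  With mask 0xff, all four
   products of a lane are summed and broadcast to every element of that
   lane; assuming __m256 values __a and __b:

     __m256 __dots = _mm256_dp_ps (__a, __b, 0xff);
*/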
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hadd_pd (__m256d __X, __m256d __Y)
{
  return (__m256d) __builtin_ia32_haddpd256 ((__v4df)__X, (__v4df)__Y);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hadd_ps (__m256 __X, __m256 __Y)
{
  return (__m256) __builtin_ia32_haddps256 ((__v8sf)__X, (__v8sf)__Y);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hsub_pd (__m256d __X, __m256d __Y)
{
  return (__m256d) __builtin_ia32_hsubpd256 ((__v4df)__X, (__v4df)__Y);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hsub_ps (__m256 __X, __m256 __Y)
{
  return (__m256) __builtin_ia32_hsubps256 ((__v8sf)__X, (__v8sf)__Y);
}
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_max_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_maxpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_max_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_maxps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_min_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_minpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_min_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_minps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mul_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_mulpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mul_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_mulps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_or_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_orpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_or_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_orps256 ((__v8sf)__A, (__v8sf)__B);
}
#ifdef __OPTIMIZE__
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_shuffle_pd (__m256d __A, __m256d __B, const int __mask)
{
  return (__m256d) __builtin_ia32_shufpd256 ((__v4df)__A, (__v4df)__B,
					     __mask);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_shuffle_ps (__m256 __A, __m256 __B, const int __mask)
{
  return (__m256) __builtin_ia32_shufps256 ((__v8sf)__A, (__v8sf)__B,
					    __mask);
}
#else
#define _mm256_shuffle_pd(A, B, N)					\
  ((__m256d)__builtin_ia32_shufpd256 ((__v4df)(__m256d)(A),		\
				      (__v4df)(__m256d)(B), (int)(N)))

#define _mm256_shuffle_ps(A, B, N)					\
  ((__m256) __builtin_ia32_shufps256 ((__v8sf)(__m256)(A),		\
				      (__v8sf)(__m256)(B), (int)(N)))
#endif
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sub_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_subpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sub_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_subps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_xor_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_xorpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_xor_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_xorps256 ((__v8sf)__A, (__v8sf)__B);
}
#ifdef __OPTIMIZE__
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmp_pd (__m128d __X, __m128d __Y, const int __P)
{
  return (__m128d) __builtin_ia32_cmppd ((__v2df)__X, (__v2df)__Y, __P);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmp_ps (__m128 __X, __m128 __Y, const int __P)
{
  return (__m128) __builtin_ia32_cmpps ((__v4sf)__X, (__v4sf)__Y, __P);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmp_pd (__m256d __X, __m256d __Y, const int __P)
{
  return (__m256d) __builtin_ia32_cmppd256 ((__v4df)__X, (__v4df)__Y,
					    __P);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmp_ps (__m256 __X, __m256 __Y, const int __P)
{
  return (__m256) __builtin_ia32_cmpps256 ((__v8sf)__X, (__v8sf)__Y,
					   __P);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmp_sd (__m128d __X, __m128d __Y, const int __P)
{
  return (__m128d) __builtin_ia32_cmpsd ((__v2df)__X, (__v2df)__Y, __P);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmp_ss (__m128 __X, __m128 __Y, const int __P)
{
  return (__m128) __builtin_ia32_cmpss ((__v4sf)__X, (__v4sf)__Y, __P);
}
#else
#define _mm_cmp_pd(X, Y, P)						\
  ((__m128d) __builtin_ia32_cmppd ((__v2df)(__m128d)(X),		\
				   (__v2df)(__m128d)(Y), (int)(P)))

#define _mm_cmp_ps(X, Y, P)						\
  ((__m128) __builtin_ia32_cmpps ((__v4sf)(__m128)(X),			\
				  (__v4sf)(__m128)(Y), (int)(P)))

#define _mm256_cmp_pd(X, Y, P)						\
  ((__m256d) __builtin_ia32_cmppd256 ((__v4df)(__m256d)(X),		\
				      (__v4df)(__m256d)(Y), (int)(P)))

#define _mm256_cmp_ps(X, Y, P)						\
  ((__m256) __builtin_ia32_cmpps256 ((__v8sf)(__m256)(X),		\
				     (__v8sf)(__m256)(Y), (int)(P)))

#define _mm_cmp_sd(X, Y, P)						\
  ((__m128d) __builtin_ia32_cmpsd ((__v2df)(__m128d)(X),		\
				   (__v2df)(__m128d)(Y), (int)(P)))

#define _mm_cmp_ss(X, Y, P)						\
  ((__m128) __builtin_ia32_cmpss ((__v4sf)(__m128)(X),			\
				  (__v4sf)(__m128)(Y), (int)(P)))
#endif
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepi32_pd (__m128i __A)
{
  return (__m256d)__builtin_ia32_cvtdq2pd256 ((__v4si) __A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepi32_ps (__m256i __A)
{
  return (__m256)__builtin_ia32_cvtdq2ps256 ((__v8si) __A);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtpd_ps (__m256d __A)
{
  return (__m128)__builtin_ia32_cvtpd2ps256 ((__v4df) __A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtps_epi32 (__m256 __A)
{
  return (__m256i)__builtin_ia32_cvtps2dq256 ((__v8sf) __A);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtps_pd (__m128 __A)
{
  return (__m256d)__builtin_ia32_cvtps2pd256 ((__v4sf) __A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvttpd_epi32 (__m256d __A)
{
  return (__m128i)__builtin_ia32_cvttpd2dq256 ((__v4df) __A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtpd_epi32 (__m256d __A)
{
  return (__m128i)__builtin_ia32_cvtpd2dq256 ((__v4df) __A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvttps_epi32 (__m256 __A)
{
  return (__m256i)__builtin_ia32_cvttps2dq256 ((__v8sf) __A);
}
#ifdef __OPTIMIZE__
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extractf128_pd (__m256d __X, const int __N)
{
  return (__m128d) __builtin_ia32_vextractf128_pd256 ((__v4df)__X, __N);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extractf128_ps (__m256 __X, const int __N)
{
  return (__m128) __builtin_ia32_vextractf128_ps256 ((__v8sf)__X, __N);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extractf128_si256 (__m256i __X, const int __N)
{
  return (__m128i) __builtin_ia32_vextractf128_si256 ((__v8si)__X, __N);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extract_epi32 (__m256i __X, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 2);
  return _mm_extract_epi32 (__Y, __N % 4);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extract_epi16 (__m256i __X, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 3);
  return _mm_extract_epi16 (__Y, __N % 8);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extract_epi8 (__m256i __X, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 4);
  return _mm_extract_epi8 (__Y, __N % 16);
}

#ifdef __x86_64__
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extract_epi64 (__m256i __X, const int __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 1);
  return _mm_extract_epi64 (__Y, __N % 2);
}
#endif
#else
#define _mm256_extractf128_pd(X, N)					\
  ((__m128d) __builtin_ia32_vextractf128_pd256 ((__v4df)(__m256d)(X),	\
						(int)(N)))

#define _mm256_extractf128_ps(X, N)					\
  ((__m128) __builtin_ia32_vextractf128_ps256 ((__v8sf)(__m256)(X),	\
					       (int)(N)))

#define _mm256_extractf128_si256(X, N)					\
  ((__m128i) __builtin_ia32_vextractf128_si256 ((__v8si)(__m256i)(X),	\
						(int)(N)))

#define _mm256_extract_epi32(X, N)					\
  (__extension__							\
   ({									\
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 2);		\
      _mm_extract_epi32 (__Y, (N) % 4);					\
    }))

#define _mm256_extract_epi16(X, N)					\
  (__extension__							\
   ({									\
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 3);		\
      _mm_extract_epi16 (__Y, (N) % 8);					\
    }))

#define _mm256_extract_epi8(X, N)					\
  (__extension__							\
   ({									\
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 4);		\
      _mm_extract_epi8 (__Y, (N) % 16);					\
    }))

#ifdef __x86_64__
#define _mm256_extract_epi64(X, N)					\
  (__extension__							\
   ({									\
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 1);		\
      _mm_extract_epi64 (__Y, (N) % 2);					\
    }))
#endif
#endif
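
/* Illustrative sketch (not part of the original header): AVX has no
   single 256-bit element-extract instruction, so the intrinsics above
   first select the 128-bit half holding element N, then extract within
   that half.  For example, with a hypothetical __m256i value __x,
   extracting 32-bit element 5 reads element 1 of the upper half:

     int __e5 = _mm256_extract_epi32 (__x, 5);
*/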
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_zeroall (void)
{
  __builtin_ia32_vzeroall ();
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_zeroupper (void)
{
  __builtin_ia32_vzeroupper ();
}
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_permutevar_pd (__m128d __A, __m128i __C)
{
  return (__m128d) __builtin_ia32_vpermilvarpd ((__v2df)__A,
						(__v2di)__C);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permutevar_pd (__m256d __A, __m256i __C)
{
  return (__m256d) __builtin_ia32_vpermilvarpd256 ((__v4df)__A,
						   (__v4di)__C);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_permutevar_ps (__m128 __A, __m128i __C)
{
  return (__m128) __builtin_ia32_vpermilvarps ((__v4sf)__A,
					       (__v4si)__C);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permutevar_ps (__m256 __A, __m256i __C)
{
  return (__m256) __builtin_ia32_vpermilvarps256 ((__v8sf)__A,
						  (__v8si)__C);
}
#ifdef __OPTIMIZE__
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_permute_pd (__m128d __X, const int __C)
{
  return (__m128d) __builtin_ia32_vpermilpd ((__v2df)__X, __C);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute_pd (__m256d __X, const int __C)
{
  return (__m256d) __builtin_ia32_vpermilpd256 ((__v4df)__X, __C);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_permute_ps (__m128 __X, const int __C)
{
  return (__m128) __builtin_ia32_vpermilps ((__v4sf)__X, __C);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute_ps (__m256 __X, const int __C)
{
  return (__m256) __builtin_ia32_vpermilps256 ((__v8sf)__X, __C);
}
#else
#define _mm_permute_pd(X, C)						\
  ((__m128d) __builtin_ia32_vpermilpd ((__v2df)(__m128d)(X), (int)(C)))

#define _mm256_permute_pd(X, C)						\
  ((__m256d) __builtin_ia32_vpermilpd256 ((__v4df)(__m256d)(X), (int)(C)))

#define _mm_permute_ps(X, C)						\
  ((__m128) __builtin_ia32_vpermilps ((__v4sf)(__m128)(X), (int)(C)))

#define _mm256_permute_ps(X, C)						\
  ((__m256) __builtin_ia32_vpermilps256 ((__v8sf)(__m256)(X), (int)(C)))
#endif
#ifdef __OPTIMIZE__
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute2f128_pd (__m256d __X, __m256d __Y, const int __C)
{
  return (__m256d) __builtin_ia32_vperm2f128_pd256 ((__v4df)__X,
						    (__v4df)__Y,
						    __C);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute2f128_ps (__m256 __X, __m256 __Y, const int __C)
{
  return (__m256) __builtin_ia32_vperm2f128_ps256 ((__v8sf)__X,
						   (__v8sf)__Y,
						   __C);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute2f128_si256 (__m256i __X, __m256i __Y, const int __C)
{
  return (__m256i) __builtin_ia32_vperm2f128_si256 ((__v8si)__X,
						    (__v8si)__Y,
						    __C);
}
#else
#define _mm256_permute2f128_pd(X, Y, C)					\
  ((__m256d) __builtin_ia32_vperm2f128_pd256 ((__v4df)(__m256d)(X),	\
					      (__v4df)(__m256d)(Y),	\
					      (int)(C)))

#define _mm256_permute2f128_ps(X, Y, C)					\
  ((__m256) __builtin_ia32_vperm2f128_ps256 ((__v8sf)(__m256)(X),	\
					     (__v8sf)(__m256)(Y),	\
					     (int)(C)))

#define _mm256_permute2f128_si256(X, Y, C)				\
  ((__m256i) __builtin_ia32_vperm2f128_si256 ((__v8si)(__m256i)(X),	\
					      (__v8si)(__m256i)(Y),	\
					      (int)(C)))
#endif
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_broadcast_ss (float const *__X)
{
  return (__m128) __builtin_ia32_vbroadcastss (__X);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcast_sd (double const *__X)
{
  return (__m256d) __builtin_ia32_vbroadcastsd256 (__X);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcast_ss (float const *__X)
{
  return (__m256) __builtin_ia32_vbroadcastss256 (__X);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcast_pd (__m128d const *__X)
{
  return (__m256d) __builtin_ia32_vbroadcastf128_pd256 (__X);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcast_ps (__m128 const *__X)
{
  return (__m256) __builtin_ia32_vbroadcastf128_ps256 (__X);
}
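
/* Illustrative sketch (not part of the original header): the broadcast
   intrinsics take a pointer and replicate the pointed-to value across
   every element, e.g. eight copies of a scalar gain factor:

     static const float __gain = 0.5f;
     __m256 __g = _mm256_broadcast_ss (&__gain);
*/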
#ifdef __OPTIMIZE__
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insertf128_pd (__m256d __X, __m128d __Y, const int __O)
{
  return (__m256d) __builtin_ia32_vinsertf128_pd256 ((__v4df)__X,
						     (__v2df)__Y,
						     __O);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insertf128_ps (__m256 __X, __m128 __Y, const int __O)
{
  return (__m256) __builtin_ia32_vinsertf128_ps256 ((__v8sf)__X,
						    (__v4sf)__Y,
						    __O);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insertf128_si256 (__m256i __X, __m128i __Y, const int __O)
{
  return (__m256i) __builtin_ia32_vinsertf128_si256 ((__v8si)__X,
						     (__v4si)__Y,
						     __O);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insert_epi32 (__m256i __X, int __D, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 2);
  __Y = _mm_insert_epi32 (__Y, __D, __N % 4);
  return _mm256_insertf128_si256 (__X, __Y, __N >> 2);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insert_epi16 (__m256i __X, int __D, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 3);
  __Y = _mm_insert_epi16 (__Y, __D, __N % 8);
  return _mm256_insertf128_si256 (__X, __Y, __N >> 3);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insert_epi8 (__m256i __X, int __D, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 4);
  __Y = _mm_insert_epi8 (__Y, __D, __N % 16);
  return _mm256_insertf128_si256 (__X, __Y, __N >> 4);
}

#ifdef __x86_64__
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insert_epi64 (__m256i __X, long long __D, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 1);
  __Y = _mm_insert_epi64 (__Y, __D, __N % 2);
  return _mm256_insertf128_si256 (__X, __Y, __N >> 1);
}
#endif
#else
#define _mm256_insertf128_pd(X, Y, O)					\
  ((__m256d) __builtin_ia32_vinsertf128_pd256 ((__v4df)(__m256d)(X),	\
					       (__v2df)(__m128d)(Y),	\
					       (int)(O)))

#define _mm256_insertf128_ps(X, Y, O)					\
  ((__m256) __builtin_ia32_vinsertf128_ps256 ((__v8sf)(__m256)(X),	\
					      (__v4sf)(__m128)(Y),	\
					      (int)(O)))

#define _mm256_insertf128_si256(X, Y, O)				\
  ((__m256i) __builtin_ia32_vinsertf128_si256 ((__v8si)(__m256i)(X),	\
					       (__v4si)(__m128i)(Y),	\
					       (int)(O)))

#define _mm256_insert_epi32(X, D, N)					\
  (__extension__							\
   ({									\
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 2);		\
      __Y = _mm_insert_epi32 (__Y, (D), (N) % 4);			\
      _mm256_insertf128_si256 ((X), __Y, (N) >> 2);			\
    }))

#define _mm256_insert_epi16(X, D, N)					\
  (__extension__							\
   ({									\
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 3);		\
      __Y = _mm_insert_epi16 (__Y, (D), (N) % 8);			\
      _mm256_insertf128_si256 ((X), __Y, (N) >> 3);			\
    }))

#define _mm256_insert_epi8(X, D, N)					\
  (__extension__							\
   ({									\
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 4);		\
      __Y = _mm_insert_epi8 (__Y, (D), (N) % 16);			\
      _mm256_insertf128_si256 ((X), __Y, (N) >> 4);			\
    }))

#ifdef __x86_64__
#define _mm256_insert_epi64(X, D, N)					\
  (__extension__							\
   ({									\
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 1);		\
      __Y = _mm_insert_epi64 (__Y, (D), (N) % 2);			\
      _mm256_insertf128_si256 ((X), __Y, (N) >> 1);			\
    }))
#endif
#endif
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_load_pd (double const *__P)
{
  return *(__m256d *)__P;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_store_pd (double *__P, __m256d __A)
{
  *(__m256d *)__P = __A;
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_load_ps (float const *__P)
{
  return *(__m256 *)__P;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_store_ps (float *__P, __m256 __A)
{
  *(__m256 *)__P = __A;
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_loadu_pd (double const *__P)
{
  return (__m256d) __builtin_ia32_loadupd256 (__P);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_storeu_pd (double *__P, __m256d __A)
{
  __builtin_ia32_storeupd256 (__P, (__v4df)__A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_loadu_ps (float const *__P)
{
  return (__m256) __builtin_ia32_loadups256 (__P);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_storeu_ps (float *__P, __m256 __A)
{
  __builtin_ia32_storeups256 (__P, (__v8sf)__A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_load_si256 (__m256i const *__P)
{
  return *__P;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_store_si256 (__m256i *__P, __m256i __A)
{
  *__P = __A;
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_loadu_si256 (__m256i const *__P)
{
  return (__m256i) __builtin_ia32_loaddqu256 ((char const *)__P);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_storeu_si256 (__m256i *__P, __m256i __A)
{
  __builtin_ia32_storedqu256 ((char *)__P, (__v32qi)__A);
}
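
/* Illustrative sketch (not part of the original header): the plain
   load/store forms require 32-byte alignment, while the loadu/storeu
   forms accept any address.  Assuming float *dst, float *src and
   size_t i, n, a copy loop over arbitrarily aligned buffers would use
   the unaligned forms:

     for (i = 0; i + 8 <= n; i += 8)
       _mm256_storeu_ps (dst + i, _mm256_loadu_ps (src + i));
*/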
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskload_pd (double const *__P, __m128i __M)
{
  return (__m128d) __builtin_ia32_maskloadpd ((const __v2df *)__P,
					      (__v2di)__M);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskstore_pd (double *__P, __m128i __M, __m128d __A)
{
  __builtin_ia32_maskstorepd ((__v2df *)__P, (__v2di)__M, (__v2df)__A);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskload_pd (double const *__P, __m256i __M)
{
  return (__m256d) __builtin_ia32_maskloadpd256 ((const __v4df *)__P,
						 (__v4di)__M);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskstore_pd (double *__P, __m256i __M, __m256d __A)
{
  __builtin_ia32_maskstorepd256 ((__v4df *)__P, (__v4di)__M, (__v4df)__A);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskload_ps (float const *__P, __m128i __M)
{
  return (__m128) __builtin_ia32_maskloadps ((const __v4sf *)__P,
					     (__v4si)__M);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskstore_ps (float *__P, __m128i __M, __m128 __A)
{
  __builtin_ia32_maskstoreps ((__v4sf *)__P, (__v4si)__M, (__v4sf)__A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskload_ps (float const *__P, __m256i __M)
{
  return (__m256) __builtin_ia32_maskloadps256 ((const __v8sf *)__P,
						(__v8si)__M);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskstore_ps (float *__P, __m256i __M, __m256 __A)
{
  __builtin_ia32_maskstoreps256 ((__v8sf *)__P, (__v8si)__M, (__v8sf)__A);
}
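
/* Illustrative sketch (not part of the original header): only elements
   whose mask element has its most significant bit set are loaded or
   stored, which is useful for loop remainders.  Assuming a float
   pointer __p, loading just the first three of four floats (element 3
   is zeroed) could look like:

     __m128i __m = _mm_set_epi32 (0, -1, -1, -1);
     __m128 __v = _mm_maskload_ps (__p, __m);
*/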
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_movehdup_ps (__m256 __X)
{
  return (__m256) __builtin_ia32_movshdup256 ((__v8sf)__X);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_moveldup_ps (__m256 __X)
{
  return (__m256) __builtin_ia32_movsldup256 ((__v8sf)__X);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_movedup_pd (__m256d __X)
{
  return (__m256d) __builtin_ia32_movddup256 ((__v4df)__X);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_lddqu_si256 (__m256i const *__P)
{
  return (__m256i) __builtin_ia32_lddqu256 ((char const *)__P);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_stream_si256 (__m256i *__A, __m256i __B)
{
  __builtin_ia32_movntdq256 ((__v4di *)__A, (__v4di)__B);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_stream_pd (double *__A, __m256d __B)
{
  __builtin_ia32_movntpd256 (__A, (__v4df)__B);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_stream_ps (float *__P, __m256 __A)
{
  __builtin_ia32_movntps256 (__P, (__v8sf)__A);
}
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_rcp_ps (__m256 __A)
{
  return (__m256) __builtin_ia32_rcpps256 ((__v8sf)__A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_rsqrt_ps (__m256 __A)
{
  return (__m256) __builtin_ia32_rsqrtps256 ((__v8sf)__A);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sqrt_pd (__m256d __A)
{
  return (__m256d) __builtin_ia32_sqrtpd256 ((__v4df)__A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sqrt_ps (__m256 __A)
{
  return (__m256) __builtin_ia32_sqrtps256 ((__v8sf)__A);
}
#ifdef __OPTIMIZE__
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_round_pd (__m256d __V, const int __M)
{
  return (__m256d) __builtin_ia32_roundpd256 ((__v4df)__V, __M);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_round_ps (__m256 __V, const int __M)
{
  return (__m256) __builtin_ia32_roundps256 ((__v8sf)__V, __M);
}
#else
#define _mm256_round_pd(V, M)						\
  ((__m256d) __builtin_ia32_roundpd256 ((__v4df)(__m256d)(V), (int)(M)))

#define _mm256_round_ps(V, M)						\
  ((__m256) __builtin_ia32_roundps256 ((__v8sf)(__m256)(V), (int)(M)))
#endif

#define _mm256_ceil_pd(V)	_mm256_round_pd ((V), _MM_FROUND_CEIL)
#define _mm256_floor_pd(V)	_mm256_round_pd ((V), _MM_FROUND_FLOOR)
#define _mm256_ceil_ps(V)	_mm256_round_ps ((V), _MM_FROUND_CEIL)
#define _mm256_floor_ps(V)	_mm256_round_ps ((V), _MM_FROUND_FLOOR)
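
/* Illustrative sketch (not part of the original header): _MM_FROUND_CEIL
   and _MM_FROUND_FLOOR are the rounding-control macros from
   <smmintrin.h>, so _mm256_floor_pd (__x) is simply
   _mm256_round_pd (__x, _MM_FROUND_FLOOR).  Other modes can be
   requested directly, e.g. truncation toward zero:

     __m256d __t = _mm256_round_pd (__x, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
*/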
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpackhi_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_unpckhpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpacklo_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_unpcklpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpackhi_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_unpckhps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpacklo_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_unpcklps256 ((__v8sf)__A, (__v8sf)__B);
}
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testz_pd (__m128d __M, __m128d __V)
{
  return __builtin_ia32_vtestzpd ((__v2df)__M, (__v2df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testc_pd (__m128d __M, __m128d __V)
{
  return __builtin_ia32_vtestcpd ((__v2df)__M, (__v2df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testnzc_pd (__m128d __M, __m128d __V)
{
  return __builtin_ia32_vtestnzcpd ((__v2df)__M, (__v2df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testz_ps (__m128 __M, __m128 __V)
{
  return __builtin_ia32_vtestzps ((__v4sf)__M, (__v4sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testc_ps (__m128 __M, __m128 __V)
{
  return __builtin_ia32_vtestcps ((__v4sf)__M, (__v4sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testnzc_ps (__m128 __M, __m128 __V)
{
  return __builtin_ia32_vtestnzcps ((__v4sf)__M, (__v4sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testz_pd (__m256d __M, __m256d __V)
{
  return __builtin_ia32_vtestzpd256 ((__v4df)__M, (__v4df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testc_pd (__m256d __M, __m256d __V)
{
  return __builtin_ia32_vtestcpd256 ((__v4df)__M, (__v4df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testnzc_pd (__m256d __M, __m256d __V)
{
  return __builtin_ia32_vtestnzcpd256 ((__v4df)__M, (__v4df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testz_ps (__m256 __M, __m256 __V)
{
  return __builtin_ia32_vtestzps256 ((__v8sf)__M, (__v8sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testc_ps (__m256 __M, __m256 __V)
{
  return __builtin_ia32_vtestcps256 ((__v8sf)__M, (__v8sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testnzc_ps (__m256 __M, __m256 __V)
{
  return __builtin_ia32_vtestnzcps256 ((__v8sf)__M, (__v8sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testz_si256 (__m256i __M, __m256i __V)
{
  return __builtin_ia32_ptestz256 ((__v4di)__M, (__v4di)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testc_si256 (__m256i __M, __m256i __V)
{
  return __builtin_ia32_ptestc256 ((__v4di)__M, (__v4di)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testnzc_si256 (__m256i __M, __m256i __V)
{
  return __builtin_ia32_ptestnzc256 ((__v4di)__M, (__v4di)__V);
}
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_movemask_pd (__m256d __A)
{
  return __builtin_ia32_movmskpd256 ((__v4df)__A);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_movemask_ps (__m256 __A)
{
  return __builtin_ia32_movmskps256 ((__v8sf)__A);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setzero_pd (void)
{
  return __extension__ (__m256d){ 0.0, 0.0, 0.0, 0.0 };
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setzero_ps (void)
{
  return __extension__ (__m256){ 0.0, 0.0, 0.0, 0.0,
				 0.0, 0.0, 0.0, 0.0 };
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setzero_si256 (void)
{
  return __extension__ (__m256i)(__v4di){ 0, 0, 0, 0 };
}
/* Create the vector [A B C D].  */
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_pd (double __A, double __B, double __C, double __D)
{
  return __extension__ (__m256d){ __D, __C, __B, __A };
}
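
/* Illustrative note (not part of the original header): the _mm256_set_XXX
   arguments run from the highest element down, so in

     __m256d __v = _mm256_set_pd (3.0, 2.0, 1.0, 0.0);

   element 0 of __v is 0.0 and element 3 is 3.0.  */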
/* Create the vector [A B C D E F G H].  */
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_ps (float __A, float __B, float __C, float __D,
	       float __E, float __F, float __G, float __H)
{
  return __extension__ (__m256){ __H, __G, __F, __E,
				 __D, __C, __B, __A };
}

/* Create the vector [A B C D E F G H].  */
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_epi32 (int __A, int __B, int __C, int __D,
		  int __E, int __F, int __G, int __H)
{
  return __extension__ (__m256i)(__v8si){ __H, __G, __F, __E,
					  __D, __C, __B, __A };
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_epi16 (short __q15, short __q14, short __q13, short __q12,
		  short __q11, short __q10, short __q09, short __q08,
		  short __q07, short __q06, short __q05, short __q04,
		  short __q03, short __q02, short __q01, short __q00)
{
  return __extension__ (__m256i)(__v16hi){
    __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07,
    __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15
  };
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_epi8 (char __q31, char __q30, char __q29, char __q28,
		 char __q27, char __q26, char __q25, char __q24,
		 char __q23, char __q22, char __q21, char __q20,
		 char __q19, char __q18, char __q17, char __q16,
		 char __q15, char __q14, char __q13, char __q12,
		 char __q11, char __q10, char __q09, char __q08,
		 char __q07, char __q06, char __q05, char __q04,
		 char __q03, char __q02, char __q01, char __q00)
{
  return __extension__ (__m256i)(__v32qi){
    __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07,
    __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15,
    __q16, __q17, __q18, __q19, __q20, __q21, __q22, __q23,
    __q24, __q25, __q26, __q27, __q28, __q29, __q30, __q31
  };
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_epi64x (long long __A, long long __B, long long __C,
		   long long __D)
{
  return __extension__ (__m256i)(__v4di){ __D, __C, __B, __A };
}
/* Create a vector with all elements equal to A.  */
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_pd (double __A)
{
  return __extension__ (__m256d){ __A, __A, __A, __A };
}

/* Create a vector with all elements equal to A.  */
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_ps (float __A)
{
  return __extension__ (__m256){ __A, __A, __A, __A,
				 __A, __A, __A, __A };
}

/* Create a vector with all elements equal to A.  */
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_epi32 (int __A)
{
  return __extension__ (__m256i)(__v8si){ __A, __A, __A, __A,
					  __A, __A, __A, __A };
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_epi16 (short __A)
{
  return _mm256_set_epi16 (__A, __A, __A, __A, __A, __A, __A, __A,
			   __A, __A, __A, __A, __A, __A, __A, __A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_epi8 (char __A)
{
  return _mm256_set_epi8 (__A, __A, __A, __A, __A, __A, __A, __A,
			  __A, __A, __A, __A, __A, __A, __A, __A,
			  __A, __A, __A, __A, __A, __A, __A, __A,
			  __A, __A, __A, __A, __A, __A, __A, __A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_epi64x (long long __A)
{
  return __extension__ (__m256i)(__v4di){ __A, __A, __A, __A };
}
/* Create vectors of elements in the reversed order from the
   _mm256_set_XXX functions.  */

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_pd (double __A, double __B, double __C, double __D)
{
  return _mm256_set_pd (__D, __C, __B, __A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_ps (float __A, float __B, float __C, float __D,
		float __E, float __F, float __G, float __H)
{
  return _mm256_set_ps (__H, __G, __F, __E, __D, __C, __B, __A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_epi32 (int __A, int __B, int __C, int __D,
		   int __E, int __F, int __G, int __H)
{
  return _mm256_set_epi32 (__H, __G, __F, __E, __D, __C, __B, __A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_epi16 (short __q15, short __q14, short __q13, short __q12,
		   short __q11, short __q10, short __q09, short __q08,
		   short __q07, short __q06, short __q05, short __q04,
		   short __q03, short __q02, short __q01, short __q00)
{
  return _mm256_set_epi16 (__q00, __q01, __q02, __q03,
			   __q04, __q05, __q06, __q07,
			   __q08, __q09, __q10, __q11,
			   __q12, __q13, __q14, __q15);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_epi8 (char __q31, char __q30, char __q29, char __q28,
		  char __q27, char __q26, char __q25, char __q24,
		  char __q23, char __q22, char __q21, char __q20,
		  char __q19, char __q18, char __q17, char __q16,
		  char __q15, char __q14, char __q13, char __q12,
		  char __q11, char __q10, char __q09, char __q08,
		  char __q07, char __q06, char __q05, char __q04,
		  char __q03, char __q02, char __q01, char __q00)
{
  return _mm256_set_epi8 (__q00, __q01, __q02, __q03,
			  __q04, __q05, __q06, __q07,
			  __q08, __q09, __q10, __q11,
			  __q12, __q13, __q14, __q15,
			  __q16, __q17, __q18, __q19,
			  __q20, __q21, __q22, __q23,
			  __q24, __q25, __q26, __q27,
			  __q28, __q29, __q30, __q31);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_epi64x (long long __A, long long __B, long long __C,
		    long long __D)
{
  return _mm256_set_epi64x (__D, __C, __B, __A);
}
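
/* Illustrative note (not part of the original header): the _mm256_setr_XXX
   forms take arguments in memory order, so these two calls build the
   same vector:

     __m256i __a = _mm256_setr_epi64x (0, 1, 2, 3);
     __m256i __b = _mm256_set_epi64x (3, 2, 1, 0);
*/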
/* Casts between various SP, DP, INT vector types.  Note that these do no
   conversion of values, they just change the type.  */
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castpd_ps (__m256d __A)
{
  return (__m256) __A;
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castpd_si256 (__m256d __A)
{
  return (__m256i) __A;
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castps_pd (__m256 __A)
{
  return (__m256d) __A;
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castps_si256(__m256 __A)
{
  return (__m256i) __A;
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castsi256_ps (__m256i __A)
{
  return (__m256) __A;
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castsi256_pd (__m256i __A)
{
  return (__m256d) __A;
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castpd256_pd128 (__m256d __A)
{
  return (__m128d) __builtin_ia32_pd_pd256 ((__v4df)__A);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castps256_ps128 (__m256 __A)
{
  return (__m128) __builtin_ia32_ps_ps256 ((__v8sf)__A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castsi256_si128 (__m256i __A)
{
  return (__m128i) __builtin_ia32_si_si256 ((__v8si)__A);
}
/* When a cast is done from a 128-bit to a 256-bit type, the low 128 bits
   of the 256-bit result contain the source parameter value and the
   upper 128 bits of the result are undefined.  These intrinsics
   shouldn't generate any extra moves.  */

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castpd128_pd256 (__m128d __A)
{
  return (__m256d) __builtin_ia32_pd256_pd ((__v2df)__A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castps128_ps256 (__m128 __A)
{
  return (__m256) __builtin_ia32_ps256_ps ((__v4sf)__A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castsi128_si256 (__m128i __A)
{
  return (__m256i) __builtin_ia32_si256_si ((__v4si)__A);
}
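
/* Illustrative sketch (not part of the original header): because the
   upper 128 bits are undefined after a 128-to-256 cast, code that needs
   them must write them explicitly.  Assuming __m128d values __lo and
   __hi, a defined combination could look like:

     __m256d __wide = _mm256_insertf128_pd (_mm256_castpd128_pd256 (__lo),
					    __hi, 1);
*/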
#ifdef __DISABLE_AVX__
#undef __DISABLE_AVX__
#pragma GCC pop_options
#endif /* __DISABLE_AVX__ */

#endif /* _AVXINTRIN_H_INCLUDED */