mingw/lib/gcc/mingw32/4.4.0/include/avxintrin.h

/* Copyright (C) 2008, 2009 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 11.0.  */

#ifndef _IMMINTRIN_H_INCLUDED
# error "Never use <avxintrin.h> directly; include <immintrin.h> instead."
#endif

/* Internal data types for implementing the intrinsics.  */
typedef double __v4df __attribute__ ((__vector_size__ (32)));
typedef float __v8sf __attribute__ ((__vector_size__ (32)));
typedef long long __v4di __attribute__ ((__vector_size__ (32)));
typedef int __v8si __attribute__ ((__vector_size__ (32)));
typedef short __v16hi __attribute__ ((__vector_size__ (32)));
typedef char __v32qi __attribute__ ((__vector_size__ (32)));

/* The Intel API is flexible enough that we must allow aliasing with other
   vector types, and their scalar components.  */
typedef float __m256 __attribute__ ((__vector_size__ (32), __may_alias__));
typedef long long __m256i __attribute__ ((__vector_size__ (32), __may_alias__));
typedef double __m256d __attribute__ ((__vector_size__ (32), __may_alias__));

/* Compare predicates for scalar and packed compare intrinsics;
   a usage sketch follows this list.  */

/* Equal (ordered, non-signaling)  */
#define _CMP_EQ_OQ 0x00
/* Less-than (ordered, signaling)  */
#define _CMP_LT_OS 0x01
/* Less-than-or-equal (ordered, signaling)  */
#define _CMP_LE_OS 0x02
/* Unordered (non-signaling)  */
#define _CMP_UNORD_Q 0x03
/* Not-equal (unordered, non-signaling)  */
#define _CMP_NEQ_UQ 0x04
/* Not-less-than (unordered, signaling)  */
#define _CMP_NLT_US 0x05
/* Not-less-than-or-equal (unordered, signaling)  */
#define _CMP_NLE_US 0x06
/* Ordered (non-signaling)  */
#define _CMP_ORD_Q 0x07
/* Equal (unordered, non-signaling)  */
#define _CMP_EQ_UQ 0x08
/* Not-greater-than-or-equal (unordered, signaling)  */
#define _CMP_NGE_US 0x09
/* Not-greater-than (unordered, signaling)  */
#define _CMP_NGT_US 0x0a
/* False (ordered, non-signaling)  */
#define _CMP_FALSE_OQ 0x0b
/* Not-equal (ordered, non-signaling)  */
#define _CMP_NEQ_OQ 0x0c
/* Greater-than-or-equal (ordered, signaling)  */
#define _CMP_GE_OS 0x0d
/* Greater-than (ordered, signaling)  */
#define _CMP_GT_OS 0x0e
/* True (unordered, non-signaling)  */
#define _CMP_TRUE_UQ 0x0f
/* Equal (ordered, signaling)  */
#define _CMP_EQ_OS 0x10
/* Less-than (ordered, non-signaling)  */
#define _CMP_LT_OQ 0x11
/* Less-than-or-equal (ordered, non-signaling)  */
#define _CMP_LE_OQ 0x12
/* Unordered (signaling)  */
#define _CMP_UNORD_S 0x13
/* Not-equal (unordered, signaling)  */
#define _CMP_NEQ_US 0x14
/* Not-less-than (unordered, non-signaling)  */
#define _CMP_NLT_UQ 0x15
/* Not-less-than-or-equal (unordered, non-signaling)  */
#define _CMP_NLE_UQ 0x16
/* Ordered (signaling)  */
#define _CMP_ORD_S 0x17
/* Equal (unordered, signaling)  */
#define _CMP_EQ_US 0x18
/* Not-greater-than-or-equal (unordered, non-signaling)  */
#define _CMP_NGE_UQ 0x19
/* Not-greater-than (unordered, non-signaling)  */
#define _CMP_NGT_UQ 0x1a
/* False (ordered, signaling)  */
#define _CMP_FALSE_OS 0x1b
/* Not-equal (ordered, signaling)  */
#define _CMP_NEQ_OS 0x1c
/* Greater-than-or-equal (ordered, non-signaling)  */
#define _CMP_GE_OQ 0x1d
/* Greater-than (ordered, non-signaling)  */
#define _CMP_GT_OQ 0x1e
/* True (unordered, signaling)  */
#define _CMP_TRUE_US 0x1f
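
/* Usage sketch (illustrative, not part of the original header): the
   predicate constants above are passed as the immediate argument of the
   compare intrinsics defined further below, e.g.

     __m256d a  = _mm256_set1_pd (1.0);
     __m256d b  = _mm256_set1_pd (2.0);
     __m256d lt = _mm256_cmp_pd (a, b, _CMP_LT_OQ);
     int mask   = _mm256_movemask_pd (lt);    mask is 0xf, all lanes true

   "O"/"U" selects ordered vs. unordered handling of NaN operands, and
   "S"/"Q" selects whether a quiet NaN operand raises the invalid-operation
   exception (signaling) or not (quiet).  */
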
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_add_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_addpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_add_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_addps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_addsub_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_addsubpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_addsub_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_addsubps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_and_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_andpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_and_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_andps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_andnot_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_andnpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_andnot_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_andnps256 ((__v8sf)__A, (__v8sf)__B);
}

/* Double/single precision floating point blend instructions - select
   data from 2 sources using constant/variable mask (a usage sketch
   follows this block).  */

#ifdef __OPTIMIZE__
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_blend_pd (__m256d __X, __m256d __Y, const int __M)
{
  return (__m256d) __builtin_ia32_blendpd256 ((__v4df)__X, (__v4df)__Y, __M);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_blend_ps (__m256 __X, __m256 __Y, const int __M)
{
  return (__m256) __builtin_ia32_blendps256 ((__v8sf)__X, (__v8sf)__Y, __M);
}
#else
#define _mm256_blend_pd(X, Y, M) \
  ((__m256d) __builtin_ia32_blendpd256 ((__v4df)(__m256d)(X), \
					(__v4df)(__m256d)(Y), (int)(M)))

#define _mm256_blend_ps(X, Y, M) \
  ((__m256) __builtin_ia32_blendps256 ((__v8sf)(__m256)(X), \
				       (__v8sf)(__m256)(Y), (int)(M)))
#endif

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_blendv_pd (__m256d __X, __m256d __Y, __m256d __M)
{
  return (__m256d) __builtin_ia32_blendvpd256 ((__v4df)__X, (__v4df)__Y, (__v4df)__M);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_blendv_ps (__m256 __X, __m256 __Y, __m256 __M)
{
  return (__m256) __builtin_ia32_blendvps256 ((__v8sf)__X, (__v8sf)__Y, (__v8sf)__M);
}
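
/* Usage sketch (illustrative, not part of the original header): the
   immediate form picks element i from __Y when bit i of the constant mask
   is set, while the variable form looks at the sign bit of each mask
   element:

     __m256d x  = _mm256_set_pd (3.0, 2.0, 1.0, 0.0);
     __m256d y  = _mm256_set_pd (7.0, 6.0, 5.0, 4.0);
     __m256d r1 = _mm256_blend_pd (x, y, 0x5);       elements {4.0, 1.0, 6.0, 3.0}
     __m256d m  = _mm256_set_pd (-1.0, 1.0, -1.0, 1.0);
     __m256d r2 = _mm256_blendv_pd (x, y, m);        elements {0.0, 5.0, 2.0, 7.0}  */
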
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_div_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_divpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_div_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_divps256 ((__v8sf)__A, (__v8sf)__B);
}

/* Dot product instructions with mask-defined summing and zeroing parts
   of result; a usage sketch follows this block.  */

#ifdef __OPTIMIZE__
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_dp_ps (__m256 __X, __m256 __Y, const int __M)
{
  return (__m256) __builtin_ia32_dpps256 ((__v8sf)__X, (__v8sf)__Y, __M);
}
#else
#define _mm256_dp_ps(X, Y, M) \
  ((__m256) __builtin_ia32_dpps256 ((__v8sf)(__m256)(X), \
				    (__v8sf)(__m256)(Y), (int)(M)))
#endif
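
/* Usage sketch (illustrative, not part of the original header): for
   _mm256_dp_ps the high four mask bits select which elements of each
   128-bit lane enter the product sum and the low four bits select which
   result elements receive that sum; the rest are zeroed.  With mask 0xF1
   the full four-element dot product of each lane lands in element 0 of
   that lane:

     __m256 a = _mm256_set1_ps (1.0f);
     __m256 b = _mm256_set1_ps (2.0f);
     __m256 d = _mm256_dp_ps (a, b, 0xF1);
     elements 0 and 4 of d are 8.0f, all other elements are 0.0f  */
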
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hadd_pd (__m256d __X, __m256d __Y)
{
  return (__m256d) __builtin_ia32_haddpd256 ((__v4df)__X, (__v4df)__Y);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hadd_ps (__m256 __X, __m256 __Y)
{
  return (__m256) __builtin_ia32_haddps256 ((__v8sf)__X, (__v8sf)__Y);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hsub_pd (__m256d __X, __m256d __Y)
{
  return (__m256d) __builtin_ia32_hsubpd256 ((__v4df)__X, (__v4df)__Y);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hsub_ps (__m256 __X, __m256 __Y)
{
  return (__m256) __builtin_ia32_hsubps256 ((__v8sf)__X, (__v8sf)__Y);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_max_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_maxpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_max_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_maxps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_min_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_minpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_min_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_minps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mul_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_mulpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mul_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_mulps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_or_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_orpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_or_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_orps256 ((__v8sf)__A, (__v8sf)__B);
}

#ifdef __OPTIMIZE__
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_shuffle_pd (__m256d __A, __m256d __B, const int __mask)
{
  return (__m256d) __builtin_ia32_shufpd256 ((__v4df)__A, (__v4df)__B, __mask);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_shuffle_ps (__m256 __A, __m256 __B, const int __mask)
{
  return (__m256) __builtin_ia32_shufps256 ((__v8sf)__A, (__v8sf)__B, __mask);
}
#else
#define _mm256_shuffle_pd(A, B, N) \
  ((__m256d)__builtin_ia32_shufpd256 ((__v4df)(__m256d)(A), \
				      (__v4df)(__m256d)(B), (int)(N)))

#define _mm256_shuffle_ps(A, B, N) \
  ((__m256) __builtin_ia32_shufps256 ((__v8sf)(__m256)(A), \
				      (__v8sf)(__m256)(B), (int)(N)))
#endif

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sub_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_subpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sub_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_subps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_xor_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_xorpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_xor_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_xorps256 ((__v8sf)__A, (__v8sf)__B);
}

#ifdef __OPTIMIZE__
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmp_pd (__m128d __X, __m128d __Y, const int __P)
{
  return (__m128d) __builtin_ia32_cmppd ((__v2df)__X, (__v2df)__Y, __P);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmp_ps (__m128 __X, __m128 __Y, const int __P)
{
  return (__m128) __builtin_ia32_cmpps ((__v4sf)__X, (__v4sf)__Y, __P);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmp_pd (__m256d __X, __m256d __Y, const int __P)
{
  return (__m256d) __builtin_ia32_cmppd256 ((__v4df)__X, (__v4df)__Y, __P);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmp_ps (__m256 __X, __m256 __Y, const int __P)
{
  return (__m256) __builtin_ia32_cmpps256 ((__v8sf)__X, (__v8sf)__Y, __P);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmp_sd (__m128d __X, __m128d __Y, const int __P)
{
  return (__m128d) __builtin_ia32_cmpsd ((__v2df)__X, (__v2df)__Y, __P);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmp_ss (__m128 __X, __m128 __Y, const int __P)
{
  return (__m128) __builtin_ia32_cmpss ((__v4sf)__X, (__v4sf)__Y, __P);
}
#else
#define _mm_cmp_pd(X, Y, P) \
  ((__m128d) __builtin_ia32_cmppd ((__v2df)(__m128d)(X), \
				   (__v2df)(__m128d)(Y), (int)(P)))

#define _mm_cmp_ps(X, Y, P) \
  ((__m128) __builtin_ia32_cmpps ((__v4sf)(__m128)(X), \
				  (__v4sf)(__m128)(Y), (int)(P)))

#define _mm256_cmp_pd(X, Y, P) \
  ((__m256d) __builtin_ia32_cmppd256 ((__v4df)(__m256d)(X), \
				      (__v4df)(__m256d)(Y), (int)(P)))

#define _mm256_cmp_ps(X, Y, P) \
  ((__m256) __builtin_ia32_cmpps256 ((__v8sf)(__m256)(X), \
				     (__v8sf)(__m256)(Y), (int)(P)))

#define _mm_cmp_sd(X, Y, P) \
  ((__m128d) __builtin_ia32_cmpsd ((__v2df)(__m128d)(X), \
				   (__v2df)(__m128d)(Y), (int)(P)))

#define _mm_cmp_ss(X, Y, P) \
  ((__m128) __builtin_ia32_cmpss ((__v4sf)(__m128)(X), \
				  (__v4sf)(__m128)(Y), (int)(P)))
#endif

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepi32_pd (__m128i __A)
{
  return (__m256d)__builtin_ia32_cvtdq2pd256 ((__v4si) __A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepi32_ps (__m256i __A)
{
  return (__m256)__builtin_ia32_cvtdq2ps256 ((__v8si) __A);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtpd_ps (__m256d __A)
{
  return (__m128)__builtin_ia32_cvtpd2ps256 ((__v4df) __A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtps_epi32 (__m256 __A)
{
  return (__m256i)__builtin_ia32_cvtps2dq256 ((__v8sf) __A);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtps_pd (__m128 __A)
{
  return (__m256d)__builtin_ia32_cvtps2pd256 ((__v4sf) __A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvttpd_epi32 (__m256d __A)
{
  return (__m128i)__builtin_ia32_cvttpd2dq256 ((__v4df) __A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtpd_epi32 (__m256d __A)
{
  return (__m128i)__builtin_ia32_cvtpd2dq256 ((__v4df) __A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvttps_epi32 (__m256 __A)
{
  return (__m256i)__builtin_ia32_cvttps2dq256 ((__v8sf) __A);
}

#ifdef __OPTIMIZE__
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extractf128_pd (__m256d __X, const int __N)
{
  return (__m128d) __builtin_ia32_vextractf128_pd256 ((__v4df)__X, __N);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extractf128_ps (__m256 __X, const int __N)
{
  return (__m128) __builtin_ia32_vextractf128_ps256 ((__v8sf)__X, __N);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extractf128_si256 (__m256i __X, const int __N)
{
  return (__m128i) __builtin_ia32_vextractf128_si256 ((__v8si)__X, __N);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extract_epi32 (__m256i __X, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 2);
  return _mm_extract_epi32 (__Y, __N % 4);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extract_epi16 (__m256i __X, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 3);
  return _mm_extract_epi16 (__Y, __N % 8);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extract_epi8 (__m256i __X, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 4);
  return _mm_extract_epi8 (__Y, __N % 16);
}

#ifdef __x86_64__
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extract_epi64 (__m256i __X, const int __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 1);
  return _mm_extract_epi64 (__Y, __N % 2);
}
#endif
#else
#define _mm256_extractf128_pd(X, N) \
  ((__m128d) __builtin_ia32_vextractf128_pd256 ((__v4df)(__m256d)(X), (int)(N)))

#define _mm256_extractf128_ps(X, N) \
  ((__m128) __builtin_ia32_vextractf128_ps256 ((__v8sf)(__m256)(X), (int)(N)))

#define _mm256_extractf128_si256(X, N) \
  ((__m128i) __builtin_ia32_vextractf128_si256 ((__v8si)(__m256i)(X), (int)(N)))

#define _mm256_extract_epi32(X, N) \
  (__extension__ \
   ({ \
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 2); \
      _mm_extract_epi32 (__Y, (N) % 4); \
    }))

#define _mm256_extract_epi16(X, N) \
  (__extension__ \
   ({ \
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 3); \
      _mm_extract_epi16 (__Y, (N) % 8); \
    }))

#define _mm256_extract_epi8(X, N) \
  (__extension__ \
   ({ \
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 4); \
      _mm_extract_epi8 (__Y, (N) % 16); \
    }))

#ifdef __x86_64__
#define _mm256_extract_epi64(X, N) \
  (__extension__ \
   ({ \
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 1); \
      _mm_extract_epi64 (__Y, (N) % 2); \
    }))
#endif
#endif
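
/* Usage sketch (illustrative, not part of the original header): the element
   extracts above have no dedicated 256-bit instruction; they first pull out
   the 128-bit lane holding the element and then use the SSE4.1 extract on
   that lane, e.g.

     __m256i v = _mm256_set_epi32 (7, 6, 5, 4, 3, 2, 1, 0);
     int e5 = _mm256_extract_epi32 (v, 5);
     equivalent to _mm_extract_epi32 (_mm256_extractf128_si256 (v, 1), 1),
     so e5 == 5  */
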
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_zeroall (void)
{
  __builtin_ia32_vzeroall ();
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_zeroupper (void)
{
  __builtin_ia32_vzeroupper ();
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_permutevar_pd (__m128d __A, __m128i __C)
{
  return (__m128d) __builtin_ia32_vpermilvarpd ((__v2df)__A, (__v2di)__C);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permutevar_pd (__m256d __A, __m256i __C)
{
  return (__m256d) __builtin_ia32_vpermilvarpd256 ((__v4df)__A, (__v4di)__C);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_permutevar_ps (__m128 __A, __m128i __C)
{
  return (__m128) __builtin_ia32_vpermilvarps ((__v4sf)__A, (__v4si)__C);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permutevar_ps (__m256 __A, __m256i __C)
{
  return (__m256) __builtin_ia32_vpermilvarps256 ((__v8sf)__A, (__v8si)__C);
}

#ifdef __OPTIMIZE__
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_permute_pd (__m128d __X, const int __C)
{
  return (__m128d) __builtin_ia32_vpermilpd ((__v2df)__X, __C);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute_pd (__m256d __X, const int __C)
{
  return (__m256d) __builtin_ia32_vpermilpd256 ((__v4df)__X, __C);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_permute_ps (__m128 __X, const int __C)
{
  return (__m128) __builtin_ia32_vpermilps ((__v4sf)__X, __C);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute_ps (__m256 __X, const int __C)
{
  return (__m256) __builtin_ia32_vpermilps256 ((__v8sf)__X, __C);
}
#else
#define _mm_permute_pd(X, C) \
  ((__m128d) __builtin_ia32_vpermilpd ((__v2df)(__m128d)(X), (int)(C)))

#define _mm256_permute_pd(X, C) \
  ((__m256d) __builtin_ia32_vpermilpd256 ((__v4df)(__m256d)(X), (int)(C)))

#define _mm_permute_ps(X, C) \
  ((__m128) __builtin_ia32_vpermilps ((__v4sf)(__m128)(X), (int)(C)))

#define _mm256_permute_ps(X, C) \
  ((__m256) __builtin_ia32_vpermilps256 ((__v8sf)(__m256)(X), (int)(C)))
#endif

#ifdef __OPTIMIZE__
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute2f128_pd (__m256d __X, __m256d __Y, const int __C)
{
  return (__m256d) __builtin_ia32_vperm2f128_pd256 ((__v4df)__X, (__v4df)__Y, __C);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute2f128_ps (__m256 __X, __m256 __Y, const int __C)
{
  return (__m256) __builtin_ia32_vperm2f128_ps256 ((__v8sf)__X, (__v8sf)__Y, __C);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute2f128_si256 (__m256i __X, __m256i __Y, const int __C)
{
  return (__m256i) __builtin_ia32_vperm2f128_si256 ((__v8si)__X, (__v8si)__Y, __C);
}
#else
#define _mm256_permute2f128_pd(X, Y, C) \
  ((__m256d) __builtin_ia32_vperm2f128_pd256 ((__v4df)(__m256d)(X), \
					      (__v4df)(__m256d)(Y), \
					      (int)(C)))

#define _mm256_permute2f128_ps(X, Y, C) \
  ((__m256) __builtin_ia32_vperm2f128_ps256 ((__v8sf)(__m256)(X), \
					     (__v8sf)(__m256)(Y), \
					     (int)(C)))

#define _mm256_permute2f128_si256(X, Y, C) \
  ((__m256i) __builtin_ia32_vperm2f128_si256 ((__v8si)(__m256i)(X), \
					      (__v8si)(__m256i)(Y), \
					      (int)(C)))
#endif

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_broadcast_ss (float const *__X)
{
  return (__m128) __builtin_ia32_vbroadcastss (__X);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcast_sd (double const *__X)
{
  return (__m256d) __builtin_ia32_vbroadcastsd256 (__X);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcast_ss (float const *__X)
{
  return (__m256) __builtin_ia32_vbroadcastss256 (__X);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcast_pd (__m128d const *__X)
{
  return (__m256d) __builtin_ia32_vbroadcastf128_pd256 (__X);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcast_ps (__m128 const *__X)
{
  return (__m256) __builtin_ia32_vbroadcastf128_ps256 (__X);
}

#ifdef __OPTIMIZE__
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insertf128_pd (__m256d __X, __m128d __Y, const int __O)
{
  return (__m256d) __builtin_ia32_vinsertf128_pd256 ((__v4df)__X, (__v2df)__Y, __O);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insertf128_ps (__m256 __X, __m128 __Y, const int __O)
{
  return (__m256) __builtin_ia32_vinsertf128_ps256 ((__v8sf)__X, (__v4sf)__Y, __O);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insertf128_si256 (__m256i __X, __m128i __Y, const int __O)
{
  return (__m256i) __builtin_ia32_vinsertf128_si256 ((__v8si)__X, (__v4si)__Y, __O);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insert_epi32 (__m256i __X, int __D, int const __N)
{
  /* Use the 32-bit element insert here; the text as shown called
     _mm_insert_epi16, which would write the wrong element width.  */
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 2);
  __Y = _mm_insert_epi32 (__Y, __D, __N % 4);
  return _mm256_insertf128_si256 (__X, __Y, __N >> 2);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insert_epi16 (__m256i __X, int __D, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 3);
  __Y = _mm_insert_epi16 (__Y, __D, __N % 8);
  return _mm256_insertf128_si256 (__X, __Y, __N >> 3);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insert_epi8 (__m256i __X, int __D, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 4);
  __Y = _mm_insert_epi8 (__Y, __D, __N % 16);
  return _mm256_insertf128_si256 (__X, __Y, __N >> 4);
}

#ifdef __x86_64__
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insert_epi64 (__m256i __X, long long __D, int const __N)
{
  /* Take a 64-bit value and use the 64-bit element insert; the text as
     shown passed an int through _mm_insert_epi16.  */
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 1);
  __Y = _mm_insert_epi64 (__Y, __D, __N % 2);
  return _mm256_insertf128_si256 (__X, __Y, __N >> 1);
}
#endif
#else
#define _mm256_insertf128_pd(X, Y, O) \
  ((__m256d) __builtin_ia32_vinsertf128_pd256 ((__v4df)(__m256d)(X), \
					       (__v2df)(__m128d)(Y), \
					       (int)(O)))

#define _mm256_insertf128_ps(X, Y, O) \
  ((__m256) __builtin_ia32_vinsertf128_ps256 ((__v8sf)(__m256)(X), \
					      (__v4sf)(__m128)(Y), \
					      (int)(O)))

#define _mm256_insertf128_si256(X, Y, O) \
  ((__m256i) __builtin_ia32_vinsertf128_si256 ((__v8si)(__m256i)(X), \
					       (__v4si)(__m128i)(Y), \
					       (int)(O)))

#define _mm256_insert_epi32(X, D, N) \
  (__extension__ \
   ({ \
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 2); \
      __Y = _mm_insert_epi32 (__Y, (D), (N) % 4); \
      _mm256_insertf128_si256 ((X), __Y, (N) >> 2); \
    }))

#define _mm256_insert_epi16(X, D, N) \
  (__extension__ \
   ({ \
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 3); \
      __Y = _mm_insert_epi16 (__Y, (D), (N) % 8); \
      _mm256_insertf128_si256 ((X), __Y, (N) >> 3); \
    }))

#define _mm256_insert_epi8(X, D, N) \
  (__extension__ \
   ({ \
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 4); \
      __Y = _mm_insert_epi8 (__Y, (D), (N) % 16); \
      _mm256_insertf128_si256 ((X), __Y, (N) >> 4); \
    }))

#ifdef __x86_64__
#define _mm256_insert_epi64(X, D, N) \
  (__extension__ \
   ({ \
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 1); \
      __Y = _mm_insert_epi64 (__Y, (D), (N) % 2); \
      _mm256_insertf128_si256 ((X), __Y, (N) >> 1); \
    }))
#endif
#endif
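
/* Usage sketch (illustrative, not part of the original header): element
   inserts follow the same lane-split pattern as the extracts, rewriting
   one 128-bit lane and putting it back:

     __m256i v = _mm256_setzero_si256 ();
     __m256i w = _mm256_insert_epi32 (v, 42, 6);
     _mm256_extract_epi32 (w, 6) now yields 42, the other elements stay 0  */
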
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_load_pd (double const *__P)
{
  return *(__m256d *)__P;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_store_pd (double *__P, __m256d __A)
{
  *(__m256d *)__P = __A;
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_load_ps (float const *__P)
{
  return *(__m256 *)__P;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_store_ps (float *__P, __m256 __A)
{
  *(__m256 *)__P = __A;
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_loadu_pd (double const *__P)
{
  return (__m256d) __builtin_ia32_loadupd256 (__P);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_storeu_pd (double *__P, __m256d __A)
{
  __builtin_ia32_storeupd256 (__P, (__v4df)__A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_loadu_ps (float const *__P)
{
  return (__m256) __builtin_ia32_loadups256 (__P);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_storeu_ps (float *__P, __m256 __A)
{
  __builtin_ia32_storeups256 (__P, (__v8sf)__A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_load_si256 (__m256i const *__P)
{
  return *__P;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_store_si256 (__m256i *__P, __m256i __A)
{
  *__P = __A;
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_loadu_si256 (__m256i const *__P)
{
  return (__m256i) __builtin_ia32_loaddqu256 ((char const *)__P);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_storeu_si256 (__m256i *__P, __m256i __A)
{
  __builtin_ia32_storedqu256 ((char *)__P, (__v32qi)__A);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskload_pd (double const *__P, __m128d __M)
{
  return (__m128d) __builtin_ia32_maskloadpd ((const __v2df *)__P, (__v2df)__M);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskstore_pd (double *__P, __m128d __M, __m128d __A)
{
  __builtin_ia32_maskstorepd ((__v2df *)__P, (__v2df)__M, (__v2df)__A);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskload_pd (double const *__P, __m256d __M)
{
  return (__m256d) __builtin_ia32_maskloadpd256 ((const __v4df *)__P, (__v4df)__M);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskstore_pd (double *__P, __m256d __M, __m256d __A)
{
  __builtin_ia32_maskstorepd256 ((__v4df *)__P, (__v4df)__M, (__v4df)__A);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskload_ps (float const *__P, __m128 __M)
{
  return (__m128) __builtin_ia32_maskloadps ((const __v4sf *)__P, (__v4sf)__M);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskstore_ps (float *__P, __m128 __M, __m128 __A)
{
  __builtin_ia32_maskstoreps ((__v4sf *)__P, (__v4sf)__M, (__v4sf)__A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskload_ps (float const *__P, __m256 __M)
{
  return (__m256) __builtin_ia32_maskloadps256 ((const __v8sf *)__P, (__v8sf)__M);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskstore_ps (float *__P, __m256 __M, __m256 __A)
{
  __builtin_ia32_maskstoreps256 ((__v8sf *)__P, (__v8sf)__M, (__v8sf)__A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_movehdup_ps (__m256 __X)
{
  return (__m256) __builtin_ia32_movshdup256 ((__v8sf)__X);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_moveldup_ps (__m256 __X)
{
  return (__m256) __builtin_ia32_movsldup256 ((__v8sf)__X);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_movedup_pd (__m256d __X)
{
  return (__m256d) __builtin_ia32_movddup256 ((__v4df)__X);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_lddqu_si256 (__m256i const *__P)
{
  return (__m256i) __builtin_ia32_lddqu256 ((char const *)__P);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_stream_si256 (__m256i *__A, __m256i __B)
{
  __builtin_ia32_movntdq256 ((__v4di *)__A, (__v4di)__B);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_stream_pd (double *__A, __m256d __B)
{
  __builtin_ia32_movntpd256 (__A, (__v4df)__B);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_stream_ps (float *__P, __m256 __A)
{
  __builtin_ia32_movntps256 (__P, (__v8sf)__A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_rcp_ps (__m256 __A)
{
  return (__m256) __builtin_ia32_rcpps256 ((__v8sf)__A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_rsqrt_ps (__m256 __A)
{
  return (__m256) __builtin_ia32_rsqrtps256 ((__v8sf)__A);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sqrt_pd (__m256d __A)
{
  return (__m256d) __builtin_ia32_sqrtpd256 ((__v4df)__A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sqrt_ps (__m256 __A)
{
  return (__m256) __builtin_ia32_sqrtps256 ((__v8sf)__A);
}

#ifdef __OPTIMIZE__
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_round_pd (__m256d __V, const int __M)
{
  return (__m256d) __builtin_ia32_roundpd256 ((__v4df)__V, __M);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_round_ps (__m256 __V, const int __M)
{
  return (__m256) __builtin_ia32_roundps256 ((__v8sf)__V, __M);
}
#else
#define _mm256_round_pd(V, M) \
  ((__m256d) __builtin_ia32_roundpd256 ((__v4df)(__m256d)(V), (int)(M)))

#define _mm256_round_ps(V, M) \
  ((__m256) __builtin_ia32_roundps256 ((__v8sf)(__m256)(V), (int)(M)))
#endif

#define _mm256_ceil_pd(V)  _mm256_round_pd ((V), _MM_FROUND_CEIL)
#define _mm256_floor_pd(V) _mm256_round_pd ((V), _MM_FROUND_FLOOR)
#define _mm256_ceil_ps(V)  _mm256_round_ps ((V), _MM_FROUND_CEIL)
#define _mm256_floor_ps(V) _mm256_round_ps ((V), _MM_FROUND_FLOOR)

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpackhi_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_unpckhpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpacklo_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_unpcklpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpackhi_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_unpckhps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpacklo_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_unpcklps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testz_pd (__m128d __M, __m128d __V)
{
  return __builtin_ia32_vtestzpd ((__v2df)__M, (__v2df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testc_pd (__m128d __M, __m128d __V)
{
  return __builtin_ia32_vtestcpd ((__v2df)__M, (__v2df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testnzc_pd (__m128d __M, __m128d __V)
{
  return __builtin_ia32_vtestnzcpd ((__v2df)__M, (__v2df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testz_ps (__m128 __M, __m128 __V)
{
  return __builtin_ia32_vtestzps ((__v4sf)__M, (__v4sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testc_ps (__m128 __M, __m128 __V)
{
  return __builtin_ia32_vtestcps ((__v4sf)__M, (__v4sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testnzc_ps (__m128 __M, __m128 __V)
{
  return __builtin_ia32_vtestnzcps ((__v4sf)__M, (__v4sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testz_pd (__m256d __M, __m256d __V)
{
  return __builtin_ia32_vtestzpd256 ((__v4df)__M, (__v4df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testc_pd (__m256d __M, __m256d __V)
{
  return __builtin_ia32_vtestcpd256 ((__v4df)__M, (__v4df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testnzc_pd (__m256d __M, __m256d __V)
{
  return __builtin_ia32_vtestnzcpd256 ((__v4df)__M, (__v4df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testz_ps (__m256 __M, __m256 __V)
{
  return __builtin_ia32_vtestzps256 ((__v8sf)__M, (__v8sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testc_ps (__m256 __M, __m256 __V)
{
  return __builtin_ia32_vtestcps256 ((__v8sf)__M, (__v8sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testnzc_ps (__m256 __M, __m256 __V)
{
  return __builtin_ia32_vtestnzcps256 ((__v8sf)__M, (__v8sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testz_si256 (__m256i __M, __m256i __V)
{
  return __builtin_ia32_ptestz256 ((__v4di)__M, (__v4di)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testc_si256 (__m256i __M, __m256i __V)
{
  return __builtin_ia32_ptestc256 ((__v4di)__M, (__v4di)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testnzc_si256 (__m256i __M, __m256i __V)
{
  return __builtin_ia32_ptestnzc256 ((__v4di)__M, (__v4di)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_movemask_pd (__m256d __A)
{
  return __builtin_ia32_movmskpd256 ((__v4df)__A);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_movemask_ps (__m256 __A)
{
  return __builtin_ia32_movmskps256 ((__v8sf)__A);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setzero_pd (void)
{
  return __extension__ (__m256d){ 0.0, 0.0, 0.0, 0.0 };
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setzero_ps (void)
{
  return __extension__ (__m256){ 0.0, 0.0, 0.0, 0.0,
				 0.0, 0.0, 0.0, 0.0 };
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setzero_si256 (void)
{
  return __extension__ (__m256i)(__v4di){ 0, 0, 0, 0 };
}

/* Create the vector [A B C D].  */
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_pd (double __A, double __B, double __C, double __D)
{
  return __extension__ (__m256d){ __D, __C, __B, __A };
}
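
/* Usage sketch (illustrative, not part of the original header): the
   arguments are listed from the highest element down, so the first
   argument lands in the highest-numbered (last in memory) element:

     __m256d v = _mm256_set_pd (3.0, 2.0, 1.0, 0.0);
     double out[4];
     _mm256_storeu_pd (out, v);
     out is now { 0.0, 1.0, 2.0, 3.0 }

   The _mm256_setr_* variants further below take the same values in
   memory order instead.  */
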
/* Create the vector [A B C D E F G H].  */
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_ps (float __A, float __B, float __C, float __D,
	       float __E, float __F, float __G, float __H)
{
  return __extension__ (__m256){ __H, __G, __F, __E,
				 __D, __C, __B, __A };
}

/* Create the vector [A B C D E F G H].  */
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_epi32 (int __A, int __B, int __C, int __D,
		  int __E, int __F, int __G, int __H)
{
  return __extension__ (__m256i)(__v8si){ __H, __G, __F, __E,
					  __D, __C, __B, __A };
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_epi16 (short __q15, short __q14, short __q13, short __q12,
		  short __q11, short __q10, short __q09, short __q08,
		  short __q07, short __q06, short __q05, short __q04,
		  short __q03, short __q02, short __q01, short __q00)
{
  return __extension__ (__m256i)(__v16hi){
    __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07,
    __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15
  };
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_epi8 (char __q31, char __q30, char __q29, char __q28,
		 char __q27, char __q26, char __q25, char __q24,
		 char __q23, char __q22, char __q21, char __q20,
		 char __q19, char __q18, char __q17, char __q16,
		 char __q15, char __q14, char __q13, char __q12,
		 char __q11, char __q10, char __q09, char __q08,
		 char __q07, char __q06, char __q05, char __q04,
		 char __q03, char __q02, char __q01, char __q00)
{
  return __extension__ (__m256i)(__v32qi){
    __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07,
    __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15,
    __q16, __q17, __q18, __q19, __q20, __q21, __q22, __q23,
    __q24, __q25, __q26, __q27, __q28, __q29, __q30, __q31
  };
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_epi64x (long long __A, long long __B, long long __C,
		   long long __D)
{
  return __extension__ (__m256i)(__v4di){ __D, __C, __B, __A };
}

/* Create a vector with all elements equal to A.  */
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_pd (double __A)
{
  return __extension__ (__m256d){ __A, __A, __A, __A };
}

/* Create a vector with all elements equal to A.  */
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_ps (float __A)
{
  return __extension__ (__m256){ __A, __A, __A, __A,
				 __A, __A, __A, __A };
}

/* Create a vector with all elements equal to A.  */
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_epi32 (int __A)
{
  return __extension__ (__m256i)(__v8si){ __A, __A, __A, __A,
					  __A, __A, __A, __A };
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_epi16 (short __A)
{
  return _mm256_set_epi16 (__A, __A, __A, __A, __A, __A, __A, __A,
			   __A, __A, __A, __A, __A, __A, __A, __A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_epi8 (char __A)
{
  return _mm256_set_epi8 (__A, __A, __A, __A, __A, __A, __A, __A,
			  __A, __A, __A, __A, __A, __A, __A, __A,
			  __A, __A, __A, __A, __A, __A, __A, __A,
			  __A, __A, __A, __A, __A, __A, __A, __A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_epi64x (long long __A)
{
  return __extension__ (__m256i)(__v4di){ __A, __A, __A, __A };
}

/* Create vectors of elements in the reversed order from the
   _mm256_set_XXX functions.  */

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_pd (double __A, double __B, double __C, double __D)
{
  return _mm256_set_pd (__D, __C, __B, __A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_ps (float __A, float __B, float __C, float __D,
		float __E, float __F, float __G, float __H)
{
  return _mm256_set_ps (__H, __G, __F, __E, __D, __C, __B, __A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_epi32 (int __A, int __B, int __C, int __D,
		   int __E, int __F, int __G, int __H)
{
  return _mm256_set_epi32 (__H, __G, __F, __E, __D, __C, __B, __A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_epi16 (short __q15, short __q14, short __q13, short __q12,
		   short __q11, short __q10, short __q09, short __q08,
		   short __q07, short __q06, short __q05, short __q04,
		   short __q03, short __q02, short __q01, short __q00)
{
  return _mm256_set_epi16 (__q00, __q01, __q02, __q03,
			   __q04, __q05, __q06, __q07,
			   __q08, __q09, __q10, __q11,
			   __q12, __q13, __q14, __q15);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_epi8 (char __q31, char __q30, char __q29, char __q28,
		  char __q27, char __q26, char __q25, char __q24,
		  char __q23, char __q22, char __q21, char __q20,
		  char __q19, char __q18, char __q17, char __q16,
		  char __q15, char __q14, char __q13, char __q12,
		  char __q11, char __q10, char __q09, char __q08,
		  char __q07, char __q06, char __q05, char __q04,
		  char __q03, char __q02, char __q01, char __q00)
{
  return _mm256_set_epi8 (__q00, __q01, __q02, __q03,
			  __q04, __q05, __q06, __q07,
			  __q08, __q09, __q10, __q11,
			  __q12, __q13, __q14, __q15,
			  __q16, __q17, __q18, __q19,
			  __q20, __q21, __q22, __q23,
			  __q24, __q25, __q26, __q27,
			  __q28, __q29, __q30, __q31);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_epi64x (long long __A, long long __B, long long __C,
		    long long __D)
{
  return _mm256_set_epi64x (__D, __C, __B, __A);
}

/* Casts between various SP, DP, INT vector types.  Note that these do no
   conversion of values, they just change the type; a usage sketch follows
   this group.  */
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castpd_ps (__m256d __A)
{
  return (__m256) __A;
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castpd_si256 (__m256d __A)
{
  return (__m256i) __A;
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castps_pd (__m256 __A)
{
  return (__m256d) __A;
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castps_si256 (__m256 __A)
{
  return (__m256i) __A;
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castsi256_ps (__m256i __A)
{
  return (__m256) __A;
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castsi256_pd (__m256i __A)
{
  return (__m256d) __A;
}
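
/* Usage sketch (illustrative, not part of the original header): the casts
   above only relabel the 256 bits, unlike the _mm256_cvt* intrinsics, which
   convert values:

     __m256i ones  = _mm256_set1_epi32 (1);
     __m256  as_ps = _mm256_castsi256_ps (ones);   bit pattern kept: tiny denormal floats
     __m256  cvt   = _mm256_cvtepi32_ps (ones);    numeric conversion: 1.0f in every element  */
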
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castpd256_pd128 (__m256d __A)
{
  return (__m128d) __builtin_ia32_pd_pd256 ((__v4df)__A);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castps256_ps128 (__m256 __A)
{
  return (__m128) __builtin_ia32_ps_ps256 ((__v8sf)__A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castsi256_si128 (__m256i __A)
{
  return (__m128i) __builtin_ia32_si_si256 ((__v8si)__A);
}

/* When cast is done from a 128 to 256-bit type, the low 128 bits of
   the 256-bit result contain source parameter value and the upper 128
   bits of the result are undefined.  Those intrinsics shouldn't
   generate any extra moves.  */

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castpd128_pd256 (__m128d __A)
{
  return (__m256d) __builtin_ia32_pd256_pd ((__v2df)__A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castps128_ps256 (__m128 __A)
{
  return (__m256) __builtin_ia32_ps256_ps ((__v4sf)__A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castsi128_si256 (__m128i __A)
{
  return (__m256i) __builtin_ia32_si256_si ((__v4si)__A);
}
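
/* Usage sketch (illustrative, not part of the original header): after a
   128-to-256-bit cast only the low lane is meaningful, so the upper lane
   must be filled explicitly, e.g. via _mm256_insertf128_ps:

     __m128 lo = _mm_set1_ps (1.0f);
     __m128 hi = _mm_set1_ps (2.0f);
     __m256 v  = _mm256_insertf128_ps (_mm256_castps128_ps256 (lo), hi, 1);
     v now holds 1.0f in elements 0-3 and 2.0f in elements 4-7  */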