/* Copyright (C) 2008 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with GCC; see the file COPYING.  If not, write to
   the Free Software Foundation, 59 Temple Place - Suite 330,
   Boston, MA 02111-1307, USA.  */

/* As a special exception, if you include this header file into source
   files compiled by GCC, this header file does not by itself cause
   the resulting executable to be covered by the GNU General Public
   License.  This exception does not however invalidate any other
   reasons why the executable file might be covered by the GNU General
   Public License.  */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 11.0.  */
#ifndef _GMMINTRIN_H_INCLUDED
#define _GMMINTRIN_H_INCLUDED

#ifndef __AVX__
# error "AVX instruction set not enabled"
#else

/* We need definitions from the SSE4, SSSE3, SSE3, SSE2 and SSE header
   files.  */
#include <smmintrin.h>
/* Internal data types for implementing the intrinsics.  */
typedef double __v4df __attribute__ ((__vector_size__ (32)));
typedef float __v8sf __attribute__ ((__vector_size__ (32)));
typedef long long __v4di __attribute__ ((__vector_size__ (32)));
typedef int __v8si __attribute__ ((__vector_size__ (32)));
typedef short __v16hi __attribute__ ((__vector_size__ (32)));
typedef char __v32qi __attribute__ ((__vector_size__ (32)));

/* The Intel API is flexible enough that we must allow aliasing with other
   vector types, and their scalar components.  */
typedef float __m256 __attribute__ ((__vector_size__ (32),
                                     __may_alias__));
typedef long long __m256i __attribute__ ((__vector_size__ (32),
                                          __may_alias__));
typedef double __m256d __attribute__ ((__vector_size__ (32),
                                       __may_alias__));
/* Compare predicates for scalar and packed compare intrinsics.  */

/* Equal (ordered, non-signaling)  */
#define _CMP_EQ_OQ      0x00
/* Less-than (ordered, signaling)  */
#define _CMP_LT_OS      0x01
/* Less-than-or-equal (ordered, signaling)  */
#define _CMP_LE_OS      0x02
/* Unordered (non-signaling)  */
#define _CMP_UNORD_Q    0x03
/* Not-equal (unordered, non-signaling)  */
#define _CMP_NEQ_UQ     0x04
/* Not-less-than (unordered, signaling)  */
#define _CMP_NLT_US     0x05
/* Not-less-than-or-equal (unordered, signaling)  */
#define _CMP_NLE_US     0x06
/* Ordered (non-signaling)  */
#define _CMP_ORD_Q      0x07
/* Equal (unordered, non-signaling)  */
#define _CMP_EQ_UQ      0x08
/* Not-greater-than-or-equal (unordered, signaling)  */
#define _CMP_NGE_US     0x09
/* Not-greater-than (unordered, signaling)  */
#define _CMP_NGT_US     0x0a
/* False (ordered, non-signaling)  */
#define _CMP_FALSE_OQ   0x0b
/* Not-equal (ordered, non-signaling)  */
#define _CMP_NEQ_OQ     0x0c
/* Greater-than-or-equal (ordered, signaling)  */
#define _CMP_GE_OS      0x0d
/* Greater-than (ordered, signaling)  */
#define _CMP_GT_OS      0x0e
/* True (unordered, non-signaling)  */
#define _CMP_TRUE_UQ    0x0f
/* Equal (ordered, signaling)  */
#define _CMP_EQ_OS      0x10
/* Less-than (ordered, non-signaling)  */
#define _CMP_LT_OQ      0x11
/* Less-than-or-equal (ordered, non-signaling)  */
#define _CMP_LE_OQ      0x12
/* Unordered (signaling)  */
#define _CMP_UNORD_S    0x13
/* Not-equal (unordered, signaling)  */
#define _CMP_NEQ_US     0x14
/* Not-less-than (unordered, non-signaling)  */
#define _CMP_NLT_UQ     0x15
/* Not-less-than-or-equal (unordered, non-signaling)  */
#define _CMP_NLE_UQ     0x16
/* Ordered (signaling)  */
#define _CMP_ORD_S      0x17
/* Equal (unordered, signaling)  */
#define _CMP_EQ_US      0x18
/* Not-greater-than-or-equal (unordered, non-signaling)  */
#define _CMP_NGE_UQ     0x19
/* Not-greater-than (unordered, non-signaling)  */
#define _CMP_NGT_UQ     0x1a
/* False (ordered, signaling)  */
#define _CMP_FALSE_OS   0x1b
/* Not-equal (ordered, signaling)  */
#define _CMP_NEQ_OS     0x1c
/* Greater-than-or-equal (ordered, non-signaling)  */
#define _CMP_GE_OQ      0x1d
/* Greater-than (ordered, non-signaling)  */
#define _CMP_GT_OQ      0x1e
/* True (unordered, signaling)  */
#define _CMP_TRUE_US    0x1f
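
/* An illustrative sketch (not part of the original header): with a NaN
   operand, ordered ("O") predicates compare false and unordered ("U")
   predicates compare true, e.g. with the _mm256_cmp_pd intrinsic
   defined further below:

     __m256d x = _mm256_set1_pd (__builtin_nan (""));
     __m256d y = _mm256_setzero_pd ();
     __m256d eq  = _mm256_cmp_pd (x, y, _CMP_EQ_OQ);   - all lanes false
     __m256d neq = _mm256_cmp_pd (x, y, _CMP_NEQ_UQ);  - all lanes true

   Signaling ("S") predicates additionally raise the invalid-operation
   exception on quiet NaN inputs; non-signaling ("Q") ones do not.  */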

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_add_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_addpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_add_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_addps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_addsub_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_addsubpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_addsub_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_addsubps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_and_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_andpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_and_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_andps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_andnot_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_andnpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_andnot_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_andnps256 ((__v8sf)__A, (__v8sf)__B);
}

/* Double/single precision floating point blend instructions - select
   data from 2 sources using constant/variable mask.  */

#ifdef __OPTIMIZE__
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_blend_pd (__m256d __X, __m256d __Y, const int __M)
{
  return (__m256d) __builtin_ia32_blendpd256 ((__v4df)__X,
                                              (__v4df)__Y,
                                              __M);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_blend_ps (__m256 __X, __m256 __Y, const int __M)
{
  return (__m256) __builtin_ia32_blendps256 ((__v8sf)__X,
                                             (__v8sf)__Y,
                                             __M);
}
#else
#define _mm256_blend_pd(X, Y, M)                                        \
  ((__m256d) __builtin_ia32_blendpd256 ((__v4df)(__m256d)(X),           \
                                        (__v4df)(__m256d)(Y), (int)(M)))

#define _mm256_blend_ps(X, Y, M)                                        \
  ((__m256) __builtin_ia32_blendps256 ((__v8sf)(__m256)(X),             \
                                       (__v8sf)(__m256)(Y), (int)(M)))
#endif
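
/* A usage sketch (illustrative, not part of the original header): for
   _mm256_blend_pd, bit i of the constant mask selects element i from Y
   when set, otherwise from X.  M = 0x5 (binary 0101) takes elements 0
   and 2 from y and elements 1 and 3 from x:

     __m256d r = _mm256_blend_pd (x, y, 0x5);  */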

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_blendv_pd (__m256d __X, __m256d __Y, __m256d __M)
{
  return (__m256d) __builtin_ia32_blendvpd256 ((__v4df)__X,
                                               (__v4df)__Y,
                                               (__v4df)__M);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_blendv_ps (__m256 __X, __m256 __Y, __m256 __M)
{
  return (__m256) __builtin_ia32_blendvps256 ((__v8sf)__X,
                                              (__v8sf)__Y,
                                              (__v8sf)__M);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_div_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_divpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_div_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_divps256 ((__v8sf)__A, (__v8sf)__B);
}

/* Dot product instructions with mask-defined summing and zeroing parts
   of result.  */

#ifdef __OPTIMIZE__
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_dp_ps (__m256 __X, __m256 __Y, const int __M)
{
  return (__m256) __builtin_ia32_dpps256 ((__v8sf)__X,
                                          (__v8sf)__Y,
                                          __M);
}
#else
#define _mm256_dp_ps(X, Y, M)                                           \
  ((__m256) __builtin_ia32_dpps256 ((__v8sf)(__m256)(X),                \
                                    (__v8sf)(__m256)(Y), (int)(M)))
#endif
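
/* A usage sketch (illustrative, not part of the original header): the
   operation works separately on each 128-bit lane.  The high nibble of
   the mask selects which elements enter the lane's dot product; the low
   nibble selects which result elements receive the sum, with the rest
   zeroed.  With M = 0xF1, all four products are summed and the sum is
   stored in element 0 of each lane:

     __m256 d = _mm256_dp_ps (x, y, 0xF1);  */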

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hadd_pd (__m256d __X, __m256d __Y)
{
  return (__m256d) __builtin_ia32_haddpd256 ((__v4df)__X, (__v4df)__Y);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hadd_ps (__m256 __X, __m256 __Y)
{
  return (__m256) __builtin_ia32_haddps256 ((__v8sf)__X, (__v8sf)__Y);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hsub_pd (__m256d __X, __m256d __Y)
{
  return (__m256d) __builtin_ia32_hsubpd256 ((__v4df)__X, (__v4df)__Y);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hsub_ps (__m256 __X, __m256 __Y)
{
  return (__m256) __builtin_ia32_hsubps256 ((__v8sf)__X, (__v8sf)__Y);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_max_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_maxpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_max_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_maxps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_min_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_minpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_min_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_minps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mul_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_mulpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mul_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_mulps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_or_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_orpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_or_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_orps256 ((__v8sf)__A, (__v8sf)__B);
}

#ifdef __OPTIMIZE__
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_shuffle_pd (__m256d __A, __m256d __B, const int __mask)
{
  return (__m256d) __builtin_ia32_shufpd256 ((__v4df)__A, (__v4df)__B,
                                             __mask);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_shuffle_ps (__m256 __A, __m256 __B, const int __mask)
{
  return (__m256) __builtin_ia32_shufps256 ((__v8sf)__A, (__v8sf)__B,
                                            __mask);
}
#else
#define _mm256_shuffle_pd(A, B, N)                                      \
  ((__m256d)__builtin_ia32_shufpd256 ((__v4df)(__m256d)(A),             \
                                      (__v4df)(__m256d)(B), (int)(N)))

#define _mm256_shuffle_ps(A, B, N)                                      \
  ((__m256) __builtin_ia32_shufps256 ((__v8sf)(__m256)(A),              \
                                      (__v8sf)(__m256)(B), (int)(N)))
#endif

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sub_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_subpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sub_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_subps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_xor_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_xorpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_xor_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_xorps256 ((__v8sf)__A, (__v8sf)__B);
}

#ifdef __OPTIMIZE__
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmp_pd (__m128d __X, __m128d __Y, const int __P)
{
  return (__m128d) __builtin_ia32_cmppd ((__v2df)__X, (__v2df)__Y, __P);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmp_ps (__m128 __X, __m128 __Y, const int __P)
{
  return (__m128) __builtin_ia32_cmpps ((__v4sf)__X, (__v4sf)__Y, __P);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmp_pd (__m256d __X, __m256d __Y, const int __P)
{
  return (__m256d) __builtin_ia32_cmppd256 ((__v4df)__X, (__v4df)__Y,
                                            __P);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmp_ps (__m256 __X, __m256 __Y, const int __P)
{
  return (__m256) __builtin_ia32_cmpps256 ((__v8sf)__X, (__v8sf)__Y,
                                           __P);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmp_sd (__m128d __X, __m128d __Y, const int __P)
{
  return (__m128d) __builtin_ia32_cmpsd ((__v2df)__X, (__v2df)__Y, __P);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmp_ss (__m128 __X, __m128 __Y, const int __P)
{
  return (__m128) __builtin_ia32_cmpss ((__v4sf)__X, (__v4sf)__Y, __P);
}
#else
#define _mm_cmp_pd(X, Y, P)                                             \
  ((__m128d) __builtin_ia32_cmppd ((__v2df)(__m128d)(X),                \
                                   (__v2df)(__m128d)(Y), (int)(P)))

#define _mm_cmp_ps(X, Y, P)                                             \
  ((__m128) __builtin_ia32_cmpps ((__v4sf)(__m128)(X),                  \
                                  (__v4sf)(__m128)(Y), (int)(P)))

#define _mm256_cmp_pd(X, Y, P)                                          \
  ((__m256d) __builtin_ia32_cmppd256 ((__v4df)(__m256d)(X),             \
                                      (__v4df)(__m256d)(Y), (int)(P)))

#define _mm256_cmp_ps(X, Y, P)                                          \
  ((__m256) __builtin_ia32_cmpps256 ((__v8sf)(__m256)(X),               \
                                     (__v8sf)(__m256)(Y), (int)(P)))

#define _mm_cmp_sd(X, Y, P)                                             \
  ((__m128d) __builtin_ia32_cmpsd ((__v2df)(__m128d)(X),                \
                                   (__v2df)(__m128d)(Y), (int)(P)))

#define _mm_cmp_ss(X, Y, P)                                             \
  ((__m128) __builtin_ia32_cmpss ((__v4sf)(__m128)(X),                  \
                                  (__v4sf)(__m128)(Y), (int)(P)))
#endif

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepi32_pd (__m128i __A)
{
  return (__m256d)__builtin_ia32_cvtdq2pd256 ((__v4si) __A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepi32_ps (__m256i __A)
{
  return (__m256)__builtin_ia32_cvtdq2ps256 ((__v8si) __A);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtpd_ps (__m256d __A)
{
  return (__m128)__builtin_ia32_cvtpd2ps256 ((__v4df) __A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtps_epi32 (__m256 __A)
{
  return (__m256i)__builtin_ia32_cvtps2dq256 ((__v8sf) __A);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtps_pd (__m128 __A)
{
  return (__m256d)__builtin_ia32_cvtps2pd256 ((__v4sf) __A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvttpd_epi32 (__m256d __A)
{
  return (__m128i)__builtin_ia32_cvttpd2dq256 ((__v4df) __A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtpd_epi32 (__m256d __A)
{
  return (__m128i)__builtin_ia32_cvtpd2dq256 ((__v4df) __A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvttps_epi32 (__m256 __A)
{
  return (__m256i)__builtin_ia32_cvttps2dq256 ((__v8sf) __A);
}

#ifdef __OPTIMIZE__
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extractf128_pd (__m256d __X, const int __N)
{
  return (__m128d) __builtin_ia32_vextractf128_pd256 ((__v4df)__X, __N);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extractf128_ps (__m256 __X, const int __N)
{
  return (__m128) __builtin_ia32_vextractf128_ps256 ((__v8sf)__X, __N);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extractf128_si256 (__m256i __X, const int __N)
{
  return (__m128i) __builtin_ia32_vextractf128_si256 ((__v8si)__X, __N);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extract_epi32 (__m256i __X, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 2);
  return _mm_extract_epi32 (__Y, __N % 4);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extract_epi16 (__m256i __X, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 3);
  return _mm_extract_epi16 (__Y, __N % 8);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extract_epi8 (__m256i __X, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 4);
  return _mm_extract_epi8 (__Y, __N % 16);
}

#ifdef __x86_64__
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extract_epi64 (__m256i __X, const int __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 1);
  return _mm_extract_epi64 (__Y, __N % 2);
}
#endif
#else
#define _mm256_extractf128_pd(X, N)                                     \
  ((__m128d) __builtin_ia32_vextractf128_pd256 ((__v4df)(__m256d)(X),   \
                                                (int)(N)))

#define _mm256_extractf128_ps(X, N)                                     \
  ((__m128) __builtin_ia32_vextractf128_ps256 ((__v8sf)(__m256)(X),     \
                                               (int)(N)))

#define _mm256_extractf128_si256(X, N)                                  \
  ((__m128i) __builtin_ia32_vextractf128_si256 ((__v8si)(__m256i)(X),   \
                                                (int)(N)))

#define _mm256_extract_epi32(X, N)                                      \
  (__extension__                                                        \
   ({                                                                   \
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 2);           \
      _mm_extract_epi32 (__Y, (N) % 4);                                 \
    }))

#define _mm256_extract_epi16(X, N)                                      \
  (__extension__                                                        \
   ({                                                                   \
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 3);           \
      _mm_extract_epi16 (__Y, (N) % 8);                                 \
    }))

#define _mm256_extract_epi8(X, N)                                       \
  (__extension__                                                        \
   ({                                                                   \
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 4);           \
      _mm_extract_epi8 (__Y, (N) % 16);                                 \
    }))

#ifdef __x86_64__
#define _mm256_extract_epi64(X, N)                                      \
  (__extension__                                                        \
   ({                                                                   \
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 1);           \
      _mm_extract_epi64 (__Y, (N) % 2);                                 \
    }))
#endif
#endif
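
/* An illustrative note (not part of the original header): the element
   extract helpers split the 256-bit vector into its two 128-bit halves
   and reuse the SSE4.1 extracts.  For example, element 5 of a vector of
   eight ints lives in the upper half at position 1:

     int e5 = _mm_extract_epi32 (_mm256_extractf128_si256 (v, 5 >> 2),
                                 5 % 4);  */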

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_zeroall (void)
{
  __builtin_ia32_vzeroall ();
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_zeroupper (void)
{
  __builtin_ia32_vzeroupper ();
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_permutevar_pd (__m128d __A, __m128i __C)
{
  return (__m128d) __builtin_ia32_vpermilvarpd ((__v2df)__A,
                                                (__v2di)__C);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permutevar_pd (__m256d __A, __m256i __C)
{
  return (__m256d) __builtin_ia32_vpermilvarpd256 ((__v4df)__A,
                                                   (__v4di)__C);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_permutevar_ps (__m128 __A, __m128i __C)
{
  return (__m128) __builtin_ia32_vpermilvarps ((__v4sf)__A,
                                               (__v4si)__C);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permutevar_ps (__m256 __A, __m256i __C)
{
  return (__m256) __builtin_ia32_vpermilvarps256 ((__v8sf)__A,
                                                  (__v8si)__C);
}

#ifdef __OPTIMIZE__
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_permute_pd (__m128d __X, const int __C)
{
  return (__m128d) __builtin_ia32_vpermilpd ((__v2df)__X, __C);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute_pd (__m256d __X, const int __C)
{
  return (__m256d) __builtin_ia32_vpermilpd256 ((__v4df)__X, __C);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_permute_ps (__m128 __X, const int __C)
{
  return (__m128) __builtin_ia32_vpermilps ((__v4sf)__X, __C);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute_ps (__m256 __X, const int __C)
{
  return (__m256) __builtin_ia32_vpermilps256 ((__v8sf)__X, __C);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_permute2_pd (__m128d __X, __m128d __Y, __m128i __C, const int __I)
{
  return (__m128d) __builtin_ia32_vpermil2pd ((__v2df)__X,
                                              (__v2df)__Y,
                                              (__v2di)__C,
                                              __I);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute2_pd (__m256d __X, __m256d __Y, __m256i __C, const int __I)
{
  return (__m256d) __builtin_ia32_vpermil2pd256 ((__v4df)__X,
                                                 (__v4df)__Y,
                                                 (__v4di)__C,
                                                 __I);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_permute2_ps (__m128 __X, __m128 __Y, __m128i __C, const int __I)
{
  return (__m128) __builtin_ia32_vpermil2ps ((__v4sf)__X,
                                             (__v4sf)__Y,
                                             (__v4si)__C,
                                             __I);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute2_ps (__m256 __X, __m256 __Y, __m256i __C, const int __I)
{
  return (__m256) __builtin_ia32_vpermil2ps256 ((__v8sf)__X,
                                                (__v8sf)__Y,
                                                (__v8si)__C,
                                                __I);
}
#else
#define _mm_permute_pd(X, C)                                            \
  ((__m128d) __builtin_ia32_vpermilpd ((__v2df)(__m128d)(X), (int)(C)))

#define _mm256_permute_pd(X, C)                                         \
  ((__m256d) __builtin_ia32_vpermilpd256 ((__v4df)(__m256d)(X), (int)(C)))

#define _mm_permute_ps(X, C)                                            \
  ((__m128) __builtin_ia32_vpermilps ((__v4sf)(__m128)(X), (int)(C)))

#define _mm256_permute_ps(X, C)                                         \
  ((__m256) __builtin_ia32_vpermilps256 ((__v8sf)(__m256)(X), (int)(C)))

#define _mm_permute2_pd(X, Y, C, I)                                     \
  ((__m128d) __builtin_ia32_vpermil2pd ((__v2df)(__m128d)(X),           \
                                        (__v2df)(__m128d)(Y),           \
                                        (__v2di)(__m128i)(C),           \
                                        (int)(I)))

#define _mm256_permute2_pd(X, Y, C, I)                                  \
  ((__m256d) __builtin_ia32_vpermil2pd256 ((__v4df)(__m256d)(X),        \
                                           (__v4df)(__m256d)(Y),        \
                                           (__v4di)(__m256i)(C),        \
                                           (int)(I)))

#define _mm_permute2_ps(X, Y, C, I)                                     \
  ((__m128) __builtin_ia32_vpermil2ps ((__v4sf)(__m128)(X),             \
                                       (__v4sf)(__m128)(Y),             \
                                       (__v4si)(__m128i)(C),            \
                                       (int)(I)))

#define _mm256_permute2_ps(X, Y, C, I)                                  \
  ((__m256) __builtin_ia32_vpermil2ps256 ((__v8sf)(__m256)(X),          \
                                          (__v8sf)(__m256)(Y),          \
                                          (__v8si)(__m256i)(C),         \
                                          (int)(I)))
#endif
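
/* A usage sketch (illustrative, not part of the original header): for
   _mm256_permute_pd the control uses one bit per destination element,
   selecting within each 128-bit lane.  C = 0x5 (binary 0101) picks the
   high element of its lane for positions 0 and 2 and the low element
   for positions 1 and 3, i.e. it swaps the pair within each lane:

     __m256d r = _mm256_permute_pd (x, 0x5);  */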

#ifdef __OPTIMIZE__
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute2f128_pd (__m256d __X, __m256d __Y, const int __C)
{
  return (__m256d) __builtin_ia32_vperm2f128_pd256 ((__v4df)__X,
                                                    (__v4df)__Y,
                                                    __C);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute2f128_ps (__m256 __X, __m256 __Y, const int __C)
{
  return (__m256) __builtin_ia32_vperm2f128_ps256 ((__v8sf)__X,
                                                   (__v8sf)__Y,
                                                   __C);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute2f128_si256 (__m256i __X, __m256i __Y, const int __C)
{
  return (__m256i) __builtin_ia32_vperm2f128_si256 ((__v8si)__X,
                                                    (__v8si)__Y,
                                                    __C);
}
#else
#define _mm256_permute2f128_pd(X, Y, C)                                 \
  ((__m256d) __builtin_ia32_vperm2f128_pd256 ((__v4df)(__m256d)(X),     \
                                              (__v4df)(__m256d)(Y),     \
                                              (int)(C)))

#define _mm256_permute2f128_ps(X, Y, C)                                 \
  ((__m256) __builtin_ia32_vperm2f128_ps256 ((__v8sf)(__m256)(X),       \
                                             (__v8sf)(__m256)(Y),       \
                                             (int)(C)))

#define _mm256_permute2f128_si256(X, Y, C)                              \
  ((__m256i) __builtin_ia32_vperm2f128_si256 ((__v8si)(__m256i)(X),     \
                                              (__v8si)(__m256i)(Y),     \
                                              (int)(C)))
#endif

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_broadcast_ss (float const *__X)
{
  return (__m128) __builtin_ia32_vbroadcastss (__X);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcast_sd (double const *__X)
{
  return (__m256d) __builtin_ia32_vbroadcastsd256 (__X);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcast_ss (float const *__X)
{
  return (__m256) __builtin_ia32_vbroadcastss256 (__X);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcast_pd (__m128d const *__X)
{
  return (__m256d) __builtin_ia32_vbroadcastf128_pd256 (__X);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcast_ps (__m128 const *__X)
{
  return (__m256) __builtin_ia32_vbroadcastf128_ps256 (__X);
}

#ifdef __OPTIMIZE__
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insertf128_pd (__m256d __X, __m128d __Y, const int __O)
{
  return (__m256d) __builtin_ia32_vinsertf128_pd256 ((__v4df)__X,
                                                     (__v2df)__Y,
                                                     __O);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insertf128_ps (__m256 __X, __m128 __Y, const int __O)
{
  return (__m256) __builtin_ia32_vinsertf128_ps256 ((__v8sf)__X,
                                                    (__v4sf)__Y,
                                                    __O);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insertf128_si256 (__m256i __X, __m128i __Y, const int __O)
{
  return (__m256i) __builtin_ia32_vinsertf128_si256 ((__v8si)__X,
                                                     (__v4si)__Y,
                                                     __O);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insert_epi32 (__m256i __X, int __D, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 2);
  __Y = _mm_insert_epi32 (__Y, __D, __N % 4);
  return _mm256_insertf128_si256 (__X, __Y, __N >> 2);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insert_epi16 (__m256i __X, int __D, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 3);
  __Y = _mm_insert_epi16 (__Y, __D, __N % 8);
  return _mm256_insertf128_si256 (__X, __Y, __N >> 3);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insert_epi8 (__m256i __X, int __D, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 4);
  __Y = _mm_insert_epi8 (__Y, __D, __N % 16);
  return _mm256_insertf128_si256 (__X, __Y, __N >> 4);
}

#ifdef __x86_64__
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insert_epi64 (__m256i __X, long long __D, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 1);
  __Y = _mm_insert_epi64 (__Y, __D, __N % 2);
  return _mm256_insertf128_si256 (__X, __Y, __N >> 1);
}
#endif
#else
#define _mm256_insertf128_pd(X, Y, O)                                   \
  ((__m256d) __builtin_ia32_vinsertf128_pd256 ((__v4df)(__m256d)(X),    \
                                               (__v2df)(__m128d)(Y),    \
                                               (int)(O)))

#define _mm256_insertf128_ps(X, Y, O)                                   \
  ((__m256) __builtin_ia32_vinsertf128_ps256 ((__v8sf)(__m256)(X),      \
                                              (__v4sf)(__m128)(Y),      \
                                              (int)(O)))

#define _mm256_insertf128_si256(X, Y, O)                                \
  ((__m256i) __builtin_ia32_vinsertf128_si256 ((__v8si)(__m256i)(X),    \
                                               (__v4si)(__m128i)(Y),    \
                                               (int)(O)))

#define _mm256_insert_epi32(X, D, N)                                    \
  (__extension__                                                        \
   ({                                                                   \
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 2);           \
      __Y = _mm_insert_epi32 (__Y, (D), (N) % 4);                       \
      _mm256_insertf128_si256 ((X), __Y, (N) >> 2);                     \
    }))

#define _mm256_insert_epi16(X, D, N)                                    \
  (__extension__                                                        \
   ({                                                                   \
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 3);           \
      __Y = _mm_insert_epi16 (__Y, (D), (N) % 8);                       \
      _mm256_insertf128_si256 ((X), __Y, (N) >> 3);                     \
    }))

#define _mm256_insert_epi8(X, D, N)                                     \
  (__extension__                                                        \
   ({                                                                   \
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 4);           \
      __Y = _mm_insert_epi8 (__Y, (D), (N) % 16);                       \
      _mm256_insertf128_si256 ((X), __Y, (N) >> 4);                     \
    }))

#ifdef __x86_64__
#define _mm256_insert_epi64(X, D, N)                                    \
  (__extension__                                                        \
   ({                                                                   \
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 1);           \
      __Y = _mm_insert_epi64 (__Y, (D), (N) % 2);                       \
      _mm256_insertf128_si256 ((X), __Y, (N) >> 1);                     \
    }))
#endif
#endif

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_load_pd (double const *__P)
{
  return *(__m256d *)__P;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_store_pd (double *__P, __m256d __A)
{
  *(__m256d *)__P = __A;
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_load_ps (float const *__P)
{
  return *(__m256 *)__P;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_store_ps (float *__P, __m256 __A)
{
  *(__m256 *)__P = __A;
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_loadu_pd (double const *__P)
{
  return (__m256d) __builtin_ia32_loadupd256 (__P);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_storeu_pd (double *__P, __m256d __A)
{
  __builtin_ia32_storeupd256 (__P, (__v4df)__A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_loadu_ps (float const *__P)
{
  return (__m256) __builtin_ia32_loadups256 (__P);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_storeu_ps (float *__P, __m256 __A)
{
  __builtin_ia32_storeups256 (__P, (__v8sf)__A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_load_si256 (__m256i const *__P)
{
  return *__P;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_store_si256 (__m256i *__P, __m256i __A)
{
  *__P = __A;
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_loadu_si256 (__m256i const *__P)
{
  return (__m256i) __builtin_ia32_loaddqu256 ((char const *)__P);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_storeu_si256 (__m256i *__P, __m256i __A)
{
  __builtin_ia32_storedqu256 ((char *)__P, (__v32qi)__A);
}
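
/* An illustrative note (not part of the original header): the plain
   load/store intrinsics above require 32-byte aligned addresses, while
   the "u" variants accept any alignment.  A minimal sketch using a
   suitably aligned buffer:

     double buf[4] __attribute__ ((aligned (32))) = { 1, 2, 3, 4 };
     __m256d v = _mm256_load_pd (buf);
     double out[4];
     _mm256_storeu_pd (out, v);  */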

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskload_pd (double const *__P, __m128d __M)
{
  return (__m128d) __builtin_ia32_maskloadpd ((const __v2df *)__P,
                                              (__v2df)__M);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskstore_pd (double *__P, __m128d __M, __m128d __A)
{
  __builtin_ia32_maskstorepd ((__v2df *)__P, (__v2df)__M, (__v2df)__A);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskload_pd (double const *__P, __m256d __M)
{
  return (__m256d) __builtin_ia32_maskloadpd256 ((const __v4df *)__P,
                                                 (__v4df)__M);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskstore_pd (double *__P, __m256d __M, __m256d __A)
{
  __builtin_ia32_maskstorepd256 ((__v4df *)__P, (__v4df)__M, (__v4df)__A);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskload_ps (float const *__P, __m128 __M)
{
  return (__m128) __builtin_ia32_maskloadps ((const __v4sf *)__P,
                                             (__v4sf)__M);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskstore_ps (float *__P, __m128 __M, __m128 __A)
{
  __builtin_ia32_maskstoreps ((__v4sf *)__P, (__v4sf)__M, (__v4sf)__A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskload_ps (float const *__P, __m256 __M)
{
  return (__m256) __builtin_ia32_maskloadps256 ((const __v8sf *)__P,
                                                (__v8sf)__M);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskstore_ps (float *__P, __m256 __M, __m256 __A)
{
  __builtin_ia32_maskstoreps256 ((__v8sf *)__P, (__v8sf)__M, (__v8sf)__A);
}
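
/* A usage sketch (illustrative, not part of the original header): the
   mask argument uses the most significant bit of each element; only
   elements whose mask bit is set are loaded or stored, and masked-out
   load elements read as zero.  Loading the first and third doubles of
   a quadruple:

     double src[4] = { 1, 2, 3, 4 };
     __m256d m = _mm256_castsi256_pd
       (_mm256_set_epi64x (0, -1LL, 0, -1LL));
     __m256d v = _mm256_maskload_pd (src, m);  */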

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_movehdup_ps (__m256 __X)
{
  return (__m256) __builtin_ia32_movshdup256 ((__v8sf)__X);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_moveldup_ps (__m256 __X)
{
  return (__m256) __builtin_ia32_movsldup256 ((__v8sf)__X);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_movedup_pd (__m256d __X)
{
  return (__m256d) __builtin_ia32_movddup256 ((__v4df)__X);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_lddqu_si256 (__m256i const *__P)
{
  return (__m256i) __builtin_ia32_lddqu256 ((char const *)__P);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_rcp_ps (__m256 __A)
{
  return (__m256) __builtin_ia32_rcpps256 ((__v8sf)__A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_rsqrt_ps (__m256 __A)
{
  return (__m256) __builtin_ia32_rsqrtps256 ((__v8sf)__A);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sqrt_pd (__m256d __A)
{
  return (__m256d) __builtin_ia32_sqrtpd256 ((__v4df)__A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sqrt_ps (__m256 __A)
{
  return (__m256) __builtin_ia32_sqrtps256 ((__v8sf)__A);
}

#ifdef __OPTIMIZE__
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_round_pd (__m256d __V, const int __M)
{
  return (__m256d) __builtin_ia32_roundpd256 ((__v4df)__V, __M);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_round_ps (__m256 __V, const int __M)
{
  return (__m256) __builtin_ia32_roundps256 ((__v8sf)__V, __M);
}
#else
#define _mm256_round_pd(V, M)                                           \
  ((__m256d) __builtin_ia32_roundpd256 ((__v4df)(__m256d)(V), (int)(M)))

#define _mm256_round_ps(V, M)                                           \
  ((__m256) __builtin_ia32_roundps256 ((__v8sf)(__m256)(V), (int)(M)))
#endif

#define _mm256_ceil_pd(V)       _mm256_round_pd ((V), _MM_FROUND_CEIL)
#define _mm256_floor_pd(V)      _mm256_round_pd ((V), _MM_FROUND_FLOOR)
#define _mm256_ceil_ps(V)       _mm256_round_ps ((V), _MM_FROUND_CEIL)
#define _mm256_floor_ps(V)      _mm256_round_ps ((V), _MM_FROUND_FLOOR)

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpackhi_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_unpckhpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpacklo_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_unpcklpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpackhi_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_unpckhps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpacklo_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_unpcklps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testz_pd (__m128d __M, __m128d __V)
{
  return __builtin_ia32_vtestzpd ((__v2df)__M, (__v2df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testc_pd (__m128d __M, __m128d __V)
{
  return __builtin_ia32_vtestcpd ((__v2df)__M, (__v2df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testnzc_pd (__m128d __M, __m128d __V)
{
  return __builtin_ia32_vtestnzcpd ((__v2df)__M, (__v2df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testz_ps (__m128 __M, __m128 __V)
{
  return __builtin_ia32_vtestzps ((__v4sf)__M, (__v4sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testc_ps (__m128 __M, __m128 __V)
{
  return __builtin_ia32_vtestcps ((__v4sf)__M, (__v4sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testnzc_ps (__m128 __M, __m128 __V)
{
  return __builtin_ia32_vtestnzcps ((__v4sf)__M, (__v4sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testz_pd (__m256d __M, __m256d __V)
{
  return __builtin_ia32_vtestzpd256 ((__v4df)__M, (__v4df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testc_pd (__m256d __M, __m256d __V)
{
  return __builtin_ia32_vtestcpd256 ((__v4df)__M, (__v4df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testnzc_pd (__m256d __M, __m256d __V)
{
  return __builtin_ia32_vtestnzcpd256 ((__v4df)__M, (__v4df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testz_ps (__m256 __M, __m256 __V)
{
  return __builtin_ia32_vtestzps256 ((__v8sf)__M, (__v8sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testc_ps (__m256 __M, __m256 __V)
{
  return __builtin_ia32_vtestcps256 ((__v8sf)__M, (__v8sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testnzc_ps (__m256 __M, __m256 __V)
{
  return __builtin_ia32_vtestnzcps256 ((__v8sf)__M, (__v8sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testz_si256 (__m256i __M, __m256i __V)
{
  return __builtin_ia32_ptestz256 ((__v4di)__M, (__v4di)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testc_si256 (__m256i __M, __m256i __V)
{
  return __builtin_ia32_ptestc256 ((__v4di)__M, (__v4di)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testnzc_si256 (__m256i __M, __m256i __V)
{
  return __builtin_ia32_ptestnzc256 ((__v4di)__M, (__v4di)__V);
}
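
/* A usage sketch (illustrative, not part of the original header):
   _mm256_testz_si256 returns 1 when the bitwise AND of its operands is
   all zeros, so testing a vector against itself checks whether every
   bit of it is zero:

     int all_zero = _mm256_testz_si256 (v, v);

   The _pd and _ps variants consider only the sign bit of each
   element.  */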

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_movemask_pd (__m256d __A)
{
  return __builtin_ia32_movmskpd256 ((__v4df)__A);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_movemask_ps (__m256 __A)
{
  return __builtin_ia32_movmskps256 ((__v8sf)__A);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setzero_pd (void)
{
  return __extension__ (__m256d){ 0.0, 0.0, 0.0, 0.0 };
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setzero_ps (void)
{
  return __extension__ (__m256){ 0.0, 0.0, 0.0, 0.0,
                                 0.0, 0.0, 0.0, 0.0 };
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setzero_si256 (void)
{
  return __extension__ (__m256i)(__v4di){ 0, 0, 0, 0 };
}

/* Create the vector [A B C D].  */
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_pd (double __A, double __B, double __C, double __D)
{
  return __extension__ (__m256d){ __D, __C, __B, __A };
}

/* Create the vector [A B C D E F G H].  */
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_ps (float __A, float __B, float __C, float __D,
               float __E, float __F, float __G, float __H)
{
  return __extension__ (__m256){ __H, __G, __F, __E,
                                 __D, __C, __B, __A };
}

/* Create the vector [A B C D E F G H].  */
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_epi32 (int __A, int __B, int __C, int __D,
                  int __E, int __F, int __G, int __H)
{
  return __extension__ (__m256i)(__v8si){ __H, __G, __F, __E,
                                          __D, __C, __B, __A };
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_epi16 (short __q15, short __q14, short __q13, short __q12,
                  short __q11, short __q10, short __q09, short __q08,
                  short __q07, short __q06, short __q05, short __q04,
                  short __q03, short __q02, short __q01, short __q00)
{
  return __extension__ (__m256i)(__v16hi){
    __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07,
    __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15
  };
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_epi8 (char __q31, char __q30, char __q29, char __q28,
                 char __q27, char __q26, char __q25, char __q24,
                 char __q23, char __q22, char __q21, char __q20,
                 char __q19, char __q18, char __q17, char __q16,
                 char __q15, char __q14, char __q13, char __q12,
                 char __q11, char __q10, char __q09, char __q08,
                 char __q07, char __q06, char __q05, char __q04,
                 char __q03, char __q02, char __q01, char __q00)
{
  return __extension__ (__m256i)(__v32qi){
    __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07,
    __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15,
    __q16, __q17, __q18, __q19, __q20, __q21, __q22, __q23,
    __q24, __q25, __q26, __q27, __q28, __q29, __q30, __q31
  };
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_epi64x (long long __A, long long __B, long long __C,
                   long long __D)
{
  return __extension__ (__m256i)(__v4di){ __D, __C, __B, __A };
}
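
/* An illustrative note (not part of the original header): the set
   intrinsics take their arguments from the highest element down to the
   lowest, so the values appear reversed in memory order:

     __m256d v = _mm256_set_pd (4.0, 3.0, 2.0, 1.0);
     double a[4];
     _mm256_storeu_pd (a, v);   - now a[0] is 1.0 and a[3] is 4.0

   The _mm256_setr_XXX variants further below take arguments in memory
   order instead.  */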

/* Create a vector with all elements equal to A.  */
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_pd (double __A)
{
  return __extension__ (__m256d){ __A, __A, __A, __A };
}

/* Create a vector with all elements equal to A.  */
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_ps (float __A)
{
  return __extension__ (__m256){ __A, __A, __A, __A,
                                 __A, __A, __A, __A };
}

/* Create a vector with all elements equal to A.  */
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_epi32 (int __A)
{
  return __extension__ (__m256i)(__v8si){ __A, __A, __A, __A,
                                          __A, __A, __A, __A };
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_epi16 (short __A)
{
  return _mm256_set_epi16 (__A, __A, __A, __A, __A, __A, __A, __A,
                           __A, __A, __A, __A, __A, __A, __A, __A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_epi8 (char __A)
{
  return _mm256_set_epi8 (__A, __A, __A, __A, __A, __A, __A, __A,
                          __A, __A, __A, __A, __A, __A, __A, __A,
                          __A, __A, __A, __A, __A, __A, __A, __A,
                          __A, __A, __A, __A, __A, __A, __A, __A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_epi64x (long long __A)
{
  return __extension__ (__m256i)(__v4di){ __A, __A, __A, __A };
}

/* Create vectors of elements in the reversed order from the
   _mm256_set_XXX functions.  */

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_pd (double __A, double __B, double __C, double __D)
{
  return _mm256_set_pd (__D, __C, __B, __A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_ps (float __A, float __B, float __C, float __D,
                float __E, float __F, float __G, float __H)
{
  return _mm256_set_ps (__H, __G, __F, __E, __D, __C, __B, __A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_epi32 (int __A, int __B, int __C, int __D,
                   int __E, int __F, int __G, int __H)
{
  return _mm256_set_epi32 (__H, __G, __F, __E, __D, __C, __B, __A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_epi16 (short __q15, short __q14, short __q13, short __q12,
                   short __q11, short __q10, short __q09, short __q08,
                   short __q07, short __q06, short __q05, short __q04,
                   short __q03, short __q02, short __q01, short __q00)
{
  return _mm256_set_epi16 (__q00, __q01, __q02, __q03,
                           __q04, __q05, __q06, __q07,
                           __q08, __q09, __q10, __q11,
                           __q12, __q13, __q14, __q15);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_epi8 (char __q31, char __q30, char __q29, char __q28,
                  char __q27, char __q26, char __q25, char __q24,
                  char __q23, char __q22, char __q21, char __q20,
                  char __q19, char __q18, char __q17, char __q16,
                  char __q15, char __q14, char __q13, char __q12,
                  char __q11, char __q10, char __q09, char __q08,
                  char __q07, char __q06, char __q05, char __q04,
                  char __q03, char __q02, char __q01, char __q00)
{
  return _mm256_set_epi8 (__q00, __q01, __q02, __q03,
                          __q04, __q05, __q06, __q07,
                          __q08, __q09, __q10, __q11,
                          __q12, __q13, __q14, __q15,
                          __q16, __q17, __q18, __q19,
                          __q20, __q21, __q22, __q23,
                          __q24, __q25, __q26, __q27,
                          __q28, __q29, __q30, __q31);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_epi64x (long long __A, long long __B, long long __C,
                    long long __D)
{
  return _mm256_set_epi64x (__D, __C, __B, __A);
}

/* Casts between various SP, DP, INT vector types.  Note that these do no
   conversion of values, they just change the type.  */
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castpd_ps (__m256d __A)
{
  return (__m256) __A;
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castpd_si256 (__m256d __A)
{
  return (__m256i) __A;
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castps_pd (__m256 __A)
{
  return (__m256d) __A;
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castps_si256 (__m256 __A)
{
  return (__m256i) __A;
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castsi256_ps (__m256i __A)
{
  return (__m256) __A;
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castsi256_pd (__m256i __A)
{
  return (__m256d) __A;
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castpd256_pd128 (__m256d __A)
{
  return (__m128d) __builtin_ia32_pd_pd256 ((__v4df)__A);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castps256_ps128 (__m256 __A)
{
  return (__m128) __builtin_ia32_ps_ps256 ((__v8sf)__A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castsi256_si128 (__m256i __A)
{
  return (__m128i) __builtin_ia32_si_si256 ((__v8si)__A);
}

/* When a cast is done from a 128 to a 256-bit type, the low 128 bits of
   the 256-bit result contain the source parameter value and the upper 128
   bits of the result are undefined.  Those intrinsics shouldn't
   generate any extra moves.  */

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castpd128_pd256 (__m128d __A)
{
  return (__m256d) __builtin_ia32_pd256_pd ((__v2df)__A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castps128_ps256 (__m128 __A)
{
  return (__m256) __builtin_ia32_ps256_ps ((__v4sf)__A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castsi128_si256 (__m128i __A)
{
  return (__m256i) __builtin_ia32_si256_si ((__v4si)__A);
}
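
/* A usage sketch (illustrative, not part of the original header):
   because the 128-to-256-bit casts leave the upper half undefined,
   code that needs the upper half cleared can insert into an explicit
   zero vector instead:

     __m256d wide = _mm256_insertf128_pd (_mm256_setzero_pd (), lo, 0);

   where lo is an __m128d value.  */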

#endif /* __AVX__ */

#endif /* _GMMINTRIN_H_INCLUDED */