gcc/config/i386/avx2intrin.h
/* Copyright (C) 2011-2014 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

#ifndef _IMMINTRIN_H_INCLUDED
# error "Never use <avx2intrin.h> directly; include <immintrin.h> instead."
#endif

#ifndef _AVX2INTRIN_H_INCLUDED
#define _AVX2INTRIN_H_INCLUDED

#ifndef __AVX2__
#pragma GCC push_options
#pragma GCC target("avx2")
#define __DISABLE_AVX2__
#endif /* __AVX2__ */
/* Sum absolute 8-bit integer difference of adjacent groups of 4
   byte integers in the first 2 operands.  Starting offsets within
   operands are determined by the 3rd mask operand.  */
#ifdef __OPTIMIZE__
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mpsadbw_epu8 (__m256i __X, __m256i __Y, const int __M)
{
  return (__m256i) __builtin_ia32_mpsadbw256 ((__v32qi)__X,
                                              (__v32qi)__Y, __M);
}
#else
#define _mm256_mpsadbw_epu8(X, Y, M) \
  ((__m256i) __builtin_ia32_mpsadbw256 ((__v32qi)(__m256i)(X), \
                                        (__v32qi)(__m256i)(Y), (int)(M)))
#endif
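
/* Illustrative use (a sketch, not part of the header API): the mask
   operand must be a compile-time constant, e.g.

     __m256i a = _mm256_loadu_si256 ((const __m256i *) p);
     __m256i b = _mm256_loadu_si256 ((const __m256i *) q);
     __m256i sads = _mm256_mpsadbw_epu8 (a, b, 0x05);

   Each 128-bit lane yields eight 16-bit sums of absolute differences
   between a sliding 4-byte window of the first operand and a 4-byte
   block of the second operand selected by the immediate.  */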
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_abs_epi8 (__m256i __A)
{
  return (__m256i)__builtin_ia32_pabsb256 ((__v32qi)__A);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_abs_epi16 (__m256i __A)
{
  return (__m256i)__builtin_ia32_pabsw256 ((__v16hi)__A);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_abs_epi32 (__m256i __A)
{
  return (__m256i)__builtin_ia32_pabsd256 ((__v8si)__A);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_packs_epi32 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_packssdw256 ((__v8si)__A, (__v8si)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_packs_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_packsswb256 ((__v16hi)__A, (__v16hi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_packus_epi32 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_packusdw256 ((__v8si)__A, (__v8si)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_packus_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_packuswb256 ((__v16hi)__A, (__v16hi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_add_epi8 (__m256i __A, __m256i __B)
{
  return (__m256i) ((__v32qu)__A + (__v32qu)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_add_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i) ((__v16hu)__A + (__v16hu)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_add_epi32 (__m256i __A, __m256i __B)
{
  return (__m256i) ((__v8su)__A + (__v8su)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_add_epi64 (__m256i __A, __m256i __B)
{
  return (__m256i) ((__v4du)__A + (__v4du)__B);
}
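
/* Note (editorial): the plain add/subtract/multiply/compare intrinsics
   in this header are written with GNU C generic vector extensions
   rather than target builtins; the casts to unsigned element types
   such as __v8su keep element overflow well defined as wraparound.
   An illustrative equivalence:

     __m256i sum = _mm256_add_epi32 (x, y);
     // behaves the same as the vector-extension expression:
     __m256i sum2 = (__m256i) ((__v8su) x + (__v8su) y);  */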
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_adds_epi8 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_paddsb256 ((__v32qi)__A, (__v32qi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_adds_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_paddsw256 ((__v16hi)__A, (__v16hi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_adds_epu8 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_paddusb256 ((__v32qi)__A, (__v32qi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_adds_epu16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_paddusw256 ((__v16hi)__A, (__v16hi)__B);
}

#ifdef __OPTIMIZE__
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_alignr_epi8 (__m256i __A, __m256i __B, const int __N)
{
  return (__m256i) __builtin_ia32_palignr256 ((__v4di)__A,
                                              (__v4di)__B,
                                              __N * 8);
}
#else
/* Without optimization, (__N * 8) would end up in a vector register
   and the insn would not be matched; use a macro instead.  */
#define _mm256_alignr_epi8(A, B, N) \
  ((__m256i) __builtin_ia32_palignr256 ((__v4di)(__m256i)(A), \
                                        (__v4di)(__m256i)(B), \
                                        (int)(N) * 8))
#endif
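
/* Illustrative use (a sketch): __N is a compile-time byte count, and
   the concatenate-and-shift is performed independently within each
   128-bit lane, e.g.

     // per lane, shift the 32-byte concatenation hi:lo right 5 bytes
     __m256i r = _mm256_alignr_epi8 (hi, lo, 5);  */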
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_and_si256 (__m256i __A, __m256i __B)
{
  return (__m256i) ((__v4du)__A & (__v4du)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_andnot_si256 (__m256i __A, __m256i __B)
{
  return (__m256i) __builtin_ia32_andnotsi256 ((__v4di)__A, (__v4di)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_avg_epu8 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pavgb256 ((__v32qi)__A, (__v32qi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_avg_epu16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pavgw256 ((__v16hi)__A, (__v16hi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_blendv_epi8 (__m256i __X, __m256i __Y, __m256i __M)
{
  return (__m256i) __builtin_ia32_pblendvb256 ((__v32qi)__X,
                                               (__v32qi)__Y,
                                               (__v32qi)__M);
}

#ifdef __OPTIMIZE__
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_blend_epi16 (__m256i __X, __m256i __Y, const int __M)
{
  return (__m256i) __builtin_ia32_pblendw256 ((__v16hi)__X,
                                              (__v16hi)__Y,
                                              __M);
}
#else
#define _mm256_blend_epi16(X, Y, M) \
  ((__m256i) __builtin_ia32_pblendw256 ((__v16hi)(__m256i)(X), \
                                        (__v16hi)(__m256i)(Y), (int)(M)))
#endif
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmpeq_epi8 (__m256i __A, __m256i __B)
{
  return (__m256i) ((__v32qi)__A == (__v32qi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmpeq_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i) ((__v16hi)__A == (__v16hi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmpeq_epi32 (__m256i __A, __m256i __B)
{
  return (__m256i) ((__v8si)__A == (__v8si)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmpeq_epi64 (__m256i __A, __m256i __B)
{
  return (__m256i) ((__v4di)__A == (__v4di)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmpgt_epi8 (__m256i __A, __m256i __B)
{
  return (__m256i) ((__v32qi)__A > (__v32qi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmpgt_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i) ((__v16hi)__A > (__v16hi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmpgt_epi32 (__m256i __A, __m256i __B)
{
  return (__m256i) ((__v8si)__A > (__v8si)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmpgt_epi64 (__m256i __A, __m256i __B)
{
  return (__m256i) ((__v4di)__A > (__v4di)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hadd_epi16 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_phaddw256 ((__v16hi)__X,
                                             (__v16hi)__Y);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hadd_epi32 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_phaddd256 ((__v8si)__X, (__v8si)__Y);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hadds_epi16 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_phaddsw256 ((__v16hi)__X,
                                              (__v16hi)__Y);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hsub_epi16 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_phsubw256 ((__v16hi)__X,
                                             (__v16hi)__Y);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hsub_epi32 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_phsubd256 ((__v8si)__X, (__v8si)__Y);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hsubs_epi16 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_phsubsw256 ((__v16hi)__X,
                                              (__v16hi)__Y);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maddubs_epi16 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_pmaddubsw256 ((__v32qi)__X,
                                                (__v32qi)__Y);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_madd_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pmaddwd256 ((__v16hi)__A,
                                             (__v16hi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_max_epi8 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pmaxsb256 ((__v32qi)__A, (__v32qi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_max_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pmaxsw256 ((__v16hi)__A, (__v16hi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_max_epi32 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pmaxsd256 ((__v8si)__A, (__v8si)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_max_epu8 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pmaxub256 ((__v32qi)__A, (__v32qi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_max_epu16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pmaxuw256 ((__v16hi)__A, (__v16hi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_max_epu32 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pmaxud256 ((__v8si)__A, (__v8si)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_min_epi8 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pminsb256 ((__v32qi)__A, (__v32qi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_min_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pminsw256 ((__v16hi)__A, (__v16hi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_min_epi32 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pminsd256 ((__v8si)__A, (__v8si)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_min_epu8 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pminub256 ((__v32qi)__A, (__v32qi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_min_epu16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pminuw256 ((__v16hi)__A, (__v16hi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_min_epu32 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pminud256 ((__v8si)__A, (__v8si)__B);
}

extern __inline int
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_movemask_epi8 (__m256i __A)
{
  return __builtin_ia32_pmovmskb256 ((__v32qi)__A);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepi8_epi16 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pmovsxbw256 ((__v16qi)__X);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepi8_epi32 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pmovsxbd256 ((__v16qi)__X);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepi8_epi64 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pmovsxbq256 ((__v16qi)__X);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepi16_epi32 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pmovsxwd256 ((__v8hi)__X);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepi16_epi64 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pmovsxwq256 ((__v8hi)__X);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepi32_epi64 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pmovsxdq256 ((__v4si)__X);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepu8_epi16 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pmovzxbw256 ((__v16qi)__X);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepu8_epi32 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pmovzxbd256 ((__v16qi)__X);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepu8_epi64 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pmovzxbq256 ((__v16qi)__X);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepu16_epi32 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pmovzxwd256 ((__v8hi)__X);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepu16_epi64 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pmovzxwq256 ((__v8hi)__X);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepu32_epi64 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pmovzxdq256 ((__v4si)__X);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mul_epi32 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_pmuldq256 ((__v8si)__X, (__v8si)__Y);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mulhrs_epi16 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_pmulhrsw256 ((__v16hi)__X,
                                               (__v16hi)__Y);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mulhi_epu16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pmulhuw256 ((__v16hi)__A, (__v16hi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mulhi_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pmulhw256 ((__v16hi)__A, (__v16hi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mullo_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i) ((__v16hu)__A * (__v16hu)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mullo_epi32 (__m256i __A, __m256i __B)
{
  return (__m256i) ((__v8su)__A * (__v8su)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mul_epu32 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pmuludq256 ((__v8si)__A, (__v8si)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_or_si256 (__m256i __A, __m256i __B)
{
  return (__m256i) ((__v4du)__A | (__v4du)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sad_epu8 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_psadbw256 ((__v32qi)__A, (__v32qi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_shuffle_epi8 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_pshufb256 ((__v32qi)__X,
                                             (__v32qi)__Y);
}

#ifdef __OPTIMIZE__
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_shuffle_epi32 (__m256i __A, const int __mask)
{
  return (__m256i)__builtin_ia32_pshufd256 ((__v8si)__A, __mask);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_shufflehi_epi16 (__m256i __A, const int __mask)
{
  return (__m256i)__builtin_ia32_pshufhw256 ((__v16hi)__A, __mask);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_shufflelo_epi16 (__m256i __A, const int __mask)
{
  return (__m256i)__builtin_ia32_pshuflw256 ((__v16hi)__A, __mask);
}
#else
#define _mm256_shuffle_epi32(A, N) \
  ((__m256i)__builtin_ia32_pshufd256 ((__v8si)(__m256i)(A), (int)(N)))
#define _mm256_shufflehi_epi16(A, N) \
  ((__m256i)__builtin_ia32_pshufhw256 ((__v16hi)(__m256i)(A), (int)(N)))
#define _mm256_shufflelo_epi16(A, N) \
  ((__m256i)__builtin_ia32_pshuflw256 ((__v16hi)(__m256i)(A), (int)(N)))
#endif
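
/* Illustrative use (a sketch): the shuffle immediates use the usual
   2-bit-per-element encoding, so the _MM_SHUFFLE macro from
   <xmmintrin.h> can build them, e.g.

     // broadcast element 0 of each 128-bit lane
     __m256i r = _mm256_shuffle_epi32 (a, _MM_SHUFFLE (0, 0, 0, 0));

   As with the other immediate-operand intrinsics, the mask must be a
   compile-time constant.  */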
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sign_epi8 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_psignb256 ((__v32qi)__X, (__v32qi)__Y);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sign_epi16 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_psignw256 ((__v16hi)__X, (__v16hi)__Y);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sign_epi32 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_psignd256 ((__v8si)__X, (__v8si)__Y);
}

#ifdef __OPTIMIZE__
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_slli_si256 (__m256i __A, const int __N)
{
  return (__m256i)__builtin_ia32_pslldqi256 (__A, __N * 8);
}
#else
#define _mm256_slli_si256(A, N) \
  ((__m256i)__builtin_ia32_pslldqi256 ((__m256i)(A), (int)(N) * 8))
#endif
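
/* Illustrative note: the whole-register byte shifts take a byte count
   applied independently to each 128-bit lane, while the underlying
   builtin takes a bit count, hence the __N * 8 above.  For example:

     __m256i r = _mm256_slli_si256 (a, 4);  // shift each lane left 4 bytes  */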
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_slli_epi16 (__m256i __A, int __B)
{
  return (__m256i)__builtin_ia32_psllwi256 ((__v16hi)__A, __B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sll_epi16 (__m256i __A, __m128i __B)
{
  return (__m256i)__builtin_ia32_psllw256((__v16hi)__A, (__v8hi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_slli_epi32 (__m256i __A, int __B)
{
  return (__m256i)__builtin_ia32_pslldi256 ((__v8si)__A, __B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sll_epi32 (__m256i __A, __m128i __B)
{
  return (__m256i)__builtin_ia32_pslld256((__v8si)__A, (__v4si)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_slli_epi64 (__m256i __A, int __B)
{
  return (__m256i)__builtin_ia32_psllqi256 ((__v4di)__A, __B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sll_epi64 (__m256i __A, __m128i __B)
{
  return (__m256i)__builtin_ia32_psllq256((__v4di)__A, (__v2di)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_srai_epi16 (__m256i __A, int __B)
{
  return (__m256i)__builtin_ia32_psrawi256 ((__v16hi)__A, __B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sra_epi16 (__m256i __A, __m128i __B)
{
  return (__m256i)__builtin_ia32_psraw256 ((__v16hi)__A, (__v8hi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_srai_epi32 (__m256i __A, int __B)
{
  return (__m256i)__builtin_ia32_psradi256 ((__v8si)__A, __B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sra_epi32 (__m256i __A, __m128i __B)
{
  return (__m256i)__builtin_ia32_psrad256 ((__v8si)__A, (__v4si)__B);
}

#ifdef __OPTIMIZE__
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_srli_si256 (__m256i __A, const int __N)
{
  return (__m256i)__builtin_ia32_psrldqi256 (__A, __N * 8);
}
#else
#define _mm256_srli_si256(A, N) \
  ((__m256i)__builtin_ia32_psrldqi256 ((__m256i)(A), (int)(N) * 8))
#endif
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_srli_epi16 (__m256i __A, int __B)
{
  return (__m256i)__builtin_ia32_psrlwi256 ((__v16hi)__A, __B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_srl_epi16 (__m256i __A, __m128i __B)
{
  return (__m256i)__builtin_ia32_psrlw256((__v16hi)__A, (__v8hi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_srli_epi32 (__m256i __A, int __B)
{
  return (__m256i)__builtin_ia32_psrldi256 ((__v8si)__A, __B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_srl_epi32 (__m256i __A, __m128i __B)
{
  return (__m256i)__builtin_ia32_psrld256((__v8si)__A, (__v4si)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_srli_epi64 (__m256i __A, int __B)
{
  return (__m256i)__builtin_ia32_psrlqi256 ((__v4di)__A, __B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_srl_epi64 (__m256i __A, __m128i __B)
{
  return (__m256i)__builtin_ia32_psrlq256((__v4di)__A, (__v2di)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sub_epi8 (__m256i __A, __m256i __B)
{
  return (__m256i) ((__v32qu)__A - (__v32qu)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sub_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i) ((__v16hu)__A - (__v16hu)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sub_epi32 (__m256i __A, __m256i __B)
{
  return (__m256i) ((__v8su)__A - (__v8su)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sub_epi64 (__m256i __A, __m256i __B)
{
  return (__m256i) ((__v4du)__A - (__v4du)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_subs_epi8 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_psubsb256 ((__v32qi)__A, (__v32qi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_subs_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_psubsw256 ((__v16hi)__A, (__v16hi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_subs_epu8 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_psubusb256 ((__v32qi)__A, (__v32qi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_subs_epu16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_psubusw256 ((__v16hi)__A, (__v16hi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpackhi_epi8 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_punpckhbw256 ((__v32qi)__A, (__v32qi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpackhi_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_punpckhwd256 ((__v16hi)__A, (__v16hi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpackhi_epi32 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_punpckhdq256 ((__v8si)__A, (__v8si)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpackhi_epi64 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_punpckhqdq256 ((__v4di)__A, (__v4di)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpacklo_epi8 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_punpcklbw256 ((__v32qi)__A, (__v32qi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpacklo_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_punpcklwd256 ((__v16hi)__A, (__v16hi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpacklo_epi32 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_punpckldq256 ((__v8si)__A, (__v8si)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpacklo_epi64 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_punpcklqdq256 ((__v4di)__A, (__v4di)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_xor_si256 (__m256i __A, __m256i __B)
{
  return (__m256i) ((__v4du)__A ^ (__v4du)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_stream_load_si256 (__m256i const *__X)
{
  return (__m256i) __builtin_ia32_movntdqa256 ((__v4di *) __X);
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_broadcastss_ps (__m128 __X)
{
  return (__m128) __builtin_ia32_vbroadcastss_ps ((__v4sf)__X);
}

extern __inline __m256
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcastss_ps (__m128 __X)
{
  return (__m256) __builtin_ia32_vbroadcastss_ps256 ((__v4sf)__X);
}

extern __inline __m256d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcastsd_pd (__m128d __X)
{
  return (__m256d) __builtin_ia32_vbroadcastsd_pd256 ((__v2df)__X);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcastsi128_si256 (__m128i __X)
{
  return (__m256i) __builtin_ia32_vbroadcastsi256 ((__v2di)__X);
}

#ifdef __OPTIMIZE__
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_blend_epi32 (__m128i __X, __m128i __Y, const int __M)
{
  return (__m128i) __builtin_ia32_pblendd128 ((__v4si)__X,
                                              (__v4si)__Y,
                                              __M);
}
#else
#define _mm_blend_epi32(X, Y, M) \
  ((__m128i) __builtin_ia32_pblendd128 ((__v4si)(__m128i)(X), \
                                        (__v4si)(__m128i)(Y), (int)(M)))
#endif

#ifdef __OPTIMIZE__
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_blend_epi32 (__m256i __X, __m256i __Y, const int __M)
{
  return (__m256i) __builtin_ia32_pblendd256 ((__v8si)__X,
                                              (__v8si)__Y,
                                              __M);
}
#else
#define _mm256_blend_epi32(X, Y, M) \
  ((__m256i) __builtin_ia32_pblendd256 ((__v8si)(__m256i)(X), \
                                        (__v8si)(__m256i)(Y), (int)(M)))
#endif
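
/* Illustrative use (a sketch): for the dword blends, bit I of the
   immediate selects element I from __Y when set and from __X when
   clear, e.g.

     // take elements 0 and 1 from y, the rest from x
     __m256i r = _mm256_blend_epi32 (x, y, 0x03);  */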
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcastb_epi8 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pbroadcastb256 ((__v16qi)__X);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcastw_epi16 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pbroadcastw256 ((__v8hi)__X);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcastd_epi32 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pbroadcastd256 ((__v4si)__X);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcastq_epi64 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pbroadcastq256 ((__v2di)__X);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_broadcastb_epi8 (__m128i __X)
{
  return (__m128i) __builtin_ia32_pbroadcastb128 ((__v16qi)__X);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_broadcastw_epi16 (__m128i __X)
{
  return (__m128i) __builtin_ia32_pbroadcastw128 ((__v8hi)__X);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_broadcastd_epi32 (__m128i __X)
{
  return (__m128i) __builtin_ia32_pbroadcastd128 ((__v4si)__X);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_broadcastq_epi64 (__m128i __X)
{
  return (__m128i) __builtin_ia32_pbroadcastq128 ((__v2di)__X);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permutevar8x32_epi32 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_permvarsi256 ((__v8si)__X, (__v8si)__Y);
}

#ifdef __OPTIMIZE__
extern __inline __m256d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute4x64_pd (__m256d __X, const int __M)
{
  return (__m256d) __builtin_ia32_permdf256 ((__v4df)__X, __M);
}
#else
#define _mm256_permute4x64_pd(X, M) \
  ((__m256d) __builtin_ia32_permdf256 ((__v4df)(__m256d)(X), (int)(M)))
#endif

extern __inline __m256
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permutevar8x32_ps (__m256 __X, __m256i __Y)
{
  return (__m256) __builtin_ia32_permvarsf256 ((__v8sf)__X, (__v8si)__Y);
}

#ifdef __OPTIMIZE__
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute4x64_epi64 (__m256i __X, const int __M)
{
  return (__m256i) __builtin_ia32_permdi256 ((__v4di)__X, __M);
}
#else
#define _mm256_permute4x64_epi64(X, M) \
  ((__m256i) __builtin_ia32_permdi256 ((__v4di)(__m256i)(X), (int)(M)))
#endif

#ifdef __OPTIMIZE__
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute2x128_si256 (__m256i __X, __m256i __Y, const int __M)
{
  return (__m256i) __builtin_ia32_permti256 ((__v4di)__X, (__v4di)__Y, __M);
}
#else
#define _mm256_permute2x128_si256(X, Y, M) \
  ((__m256i) __builtin_ia32_permti256 ((__v4di)(__m256i)(X), (__v4di)(__m256i)(Y), (int)(M)))
#endif

#ifdef __OPTIMIZE__
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extracti128_si256 (__m256i __X, const int __M)
{
  return (__m128i) __builtin_ia32_extract128i256 ((__v4di)__X, __M);
}
#else
#define _mm256_extracti128_si256(X, M) \
  ((__m128i) __builtin_ia32_extract128i256 ((__v4di)(__m256i)(X), (int)(M)))
#endif

#ifdef __OPTIMIZE__
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_inserti128_si256 (__m256i __X, __m128i __Y, const int __M)
{
  return (__m256i) __builtin_ia32_insert128i256 ((__v4di)__X, (__v2di)__Y, __M);
}
#else
#define _mm256_inserti128_si256(X, Y, M) \
  ((__m256i) __builtin_ia32_insert128i256 ((__v4di)(__m256i)(X), \
                                           (__v2di)(__m128i)(Y), \
                                           (int)(M)))
#endif
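
/* Illustrative use (a sketch): the extract/insert immediates select
   the low (0) or high (1) 128-bit half, so the halves of a vector can
   be swapped with

     __m128i lo = _mm256_extracti128_si256 (v, 0);
     __m128i hi = _mm256_extracti128_si256 (v, 1);
     __m256i r  = _mm256_inserti128_si256 (_mm256_inserti128_si256 (v, lo, 1),
                                           hi, 0);

   (_mm256_permute2x128_si256 (v, v, 1) performs the same swap in one
   instruction.)  */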
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskload_epi32 (int const *__X, __m256i __M)
{
  return (__m256i) __builtin_ia32_maskloadd256 ((const __v8si *)__X,
                                                (__v8si)__M);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskload_epi64 (long long const *__X, __m256i __M)
{
  return (__m256i) __builtin_ia32_maskloadq256 ((const __v4di *)__X,
                                                (__v4di)__M);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskload_epi32 (int const *__X, __m128i __M)
{
  return (__m128i) __builtin_ia32_maskloadd ((const __v4si *)__X,
                                             (__v4si)__M);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskload_epi64 (long long const *__X, __m128i __M)
{
  return (__m128i) __builtin_ia32_maskloadq ((const __v2di *)__X,
                                             (__v2di)__M);
}

extern __inline void
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskstore_epi32 (int *__X, __m256i __M, __m256i __Y)
{
  __builtin_ia32_maskstored256 ((__v8si *)__X, (__v8si)__M, (__v8si)__Y);
}

extern __inline void
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskstore_epi64 (long long *__X, __m256i __M, __m256i __Y)
{
  __builtin_ia32_maskstoreq256 ((__v4di *)__X, (__v4di)__M, (__v4di)__Y);
}

extern __inline void
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskstore_epi32 (int *__X, __m128i __M, __m128i __Y)
{
  __builtin_ia32_maskstored ((__v4si *)__X, (__v4si)__M, (__v4si)__Y);
}

extern __inline void
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskstore_epi64 (long long *__X, __m128i __M, __m128i __Y)
{
  __builtin_ia32_maskstoreq ((__v2di *)__X, (__v2di)__M, (__v2di)__Y);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sllv_epi32 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_psllv8si ((__v8si)__X, (__v8si)__Y);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_sllv_epi32 (__m128i __X, __m128i __Y)
{
  return (__m128i) __builtin_ia32_psllv4si ((__v4si)__X, (__v4si)__Y);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sllv_epi64 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_psllv4di ((__v4di)__X, (__v4di)__Y);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_sllv_epi64 (__m128i __X, __m128i __Y)
{
  return (__m128i) __builtin_ia32_psllv2di ((__v2di)__X, (__v2di)__Y);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_srav_epi32 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_psrav8si ((__v8si)__X, (__v8si)__Y);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_srav_epi32 (__m128i __X, __m128i __Y)
{
  return (__m128i) __builtin_ia32_psrav4si ((__v4si)__X, (__v4si)__Y);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_srlv_epi32 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_psrlv8si ((__v8si)__X, (__v8si)__Y);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_srlv_epi32 (__m128i __X, __m128i __Y)
{
  return (__m128i) __builtin_ia32_psrlv4si ((__v4si)__X, (__v4si)__Y);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_srlv_epi64 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_psrlv4di ((__v4di)__X, (__v4di)__Y);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_srlv_epi64 (__m128i __X, __m128i __Y)
{
  return (__m128i) __builtin_ia32_psrlv2di ((__v2di)__X, (__v2di)__Y);
}
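
/* The gathers below load each result element from the address
   base + index[i] * scale, for those elements whose mask element has
   its sign bit set; the remaining elements are taken from the source
   operand.  The maskless forms pass an all-ones mask, built either by
   comparing a value with itself or from a literal ~0 vector.
   Illustrative use (a sketch; `table` is a hypothetical array):

     double table[256];
     __m128i idx = _mm_set_epi32 (0, 0, 7, 3);
     __m128d v = _mm_i32gather_pd (table, idx, 8);  // { table[3], table[7] }  */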
#ifdef __OPTIMIZE__
extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_i32gather_pd (double const *base, __m128i index, const int scale)
{
  __v2df zero = _mm_setzero_pd ();
  __v2df mask = _mm_cmpeq_pd (zero, zero);

  return (__m128d) __builtin_ia32_gathersiv2df (_mm_undefined_pd (),
                                                base,
                                                (__v4si)index,
                                                mask,
                                                scale);
}

extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_i32gather_pd (__m128d src, double const *base, __m128i index,
                       __m128d mask, const int scale)
{
  return (__m128d) __builtin_ia32_gathersiv2df ((__v2df)src,
                                                base,
                                                (__v4si)index,
                                                (__v2df)mask,
                                                scale);
}

extern __inline __m256d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_i32gather_pd (double const *base, __m128i index, const int scale)
{
  __v4df zero = _mm256_setzero_pd ();
  __v4df mask = _mm256_cmp_pd (zero, zero, _CMP_EQ_OQ);

  return (__m256d) __builtin_ia32_gathersiv4df (_mm256_undefined_pd (),
                                                base,
                                                (__v4si)index,
                                                mask,
                                                scale);
}

extern __inline __m256d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_i32gather_pd (__m256d src, double const *base,
                          __m128i index, __m256d mask, const int scale)
{
  return (__m256d) __builtin_ia32_gathersiv4df ((__v4df)src,
                                                base,
                                                (__v4si)index,
                                                (__v4df)mask,
                                                scale);
}

extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_i64gather_pd (double const *base, __m128i index, const int scale)
{
  __v2df src = _mm_setzero_pd ();
  __v2df mask = _mm_cmpeq_pd (src, src);

  return (__m128d) __builtin_ia32_gatherdiv2df (src,
                                                base,
                                                (__v2di)index,
                                                mask,
                                                scale);
}

extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_i64gather_pd (__m128d src, double const *base, __m128i index,
                       __m128d mask, const int scale)
{
  return (__m128d) __builtin_ia32_gatherdiv2df ((__v2df)src,
                                                base,
                                                (__v2di)index,
                                                (__v2df)mask,
                                                scale);
}

extern __inline __m256d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_i64gather_pd (double const *base, __m256i index, const int scale)
{
  __v4df src = _mm256_setzero_pd ();
  __v4df mask = _mm256_cmp_pd (src, src, _CMP_EQ_OQ);

  return (__m256d) __builtin_ia32_gatherdiv4df (src,
                                                base,
                                                (__v4di)index,
                                                mask,
                                                scale);
}

extern __inline __m256d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_i64gather_pd (__m256d src, double const *base,
                          __m256i index, __m256d mask, const int scale)
{
  return (__m256d) __builtin_ia32_gatherdiv4df ((__v4df)src,
                                                base,
                                                (__v4di)index,
                                                (__v4df)mask,
                                                scale);
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_i32gather_ps (float const *base, __m128i index, const int scale)
{
  __v4sf src = _mm_setzero_ps ();
  __v4sf mask = _mm_cmpeq_ps (src, src);

  return (__m128) __builtin_ia32_gathersiv4sf (src,
                                               base,
                                               (__v4si)index,
                                               mask,
                                               scale);
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_i32gather_ps (__m128 src, float const *base, __m128i index,
                       __m128 mask, const int scale)
{
  return (__m128) __builtin_ia32_gathersiv4sf ((__v4sf)src,
                                               base,
                                               (__v4si)index,
                                               (__v4sf)mask,
                                               scale);
}

extern __inline __m256
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_i32gather_ps (float const *base, __m256i index, const int scale)
{
  __v8sf src = _mm256_setzero_ps ();
  __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ);

  return (__m256) __builtin_ia32_gathersiv8sf (src,
                                               base,
                                               (__v8si)index,
                                               mask,
                                               scale);
}

extern __inline __m256
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_i32gather_ps (__m256 src, float const *base,
                          __m256i index, __m256 mask, const int scale)
{
  return (__m256) __builtin_ia32_gathersiv8sf ((__v8sf)src,
                                               base,
                                               (__v8si)index,
                                               (__v8sf)mask,
                                               scale);
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_i64gather_ps (float const *base, __m128i index, const int scale)
{
  __v4sf src = _mm_setzero_ps ();
  __v4sf mask = _mm_cmpeq_ps (src, src);

  return (__m128) __builtin_ia32_gatherdiv4sf (src,
                                               base,
                                               (__v2di)index,
                                               mask,
                                               scale);
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_i64gather_ps (__m128 src, float const *base, __m128i index,
                       __m128 mask, const int scale)
{
  return (__m128) __builtin_ia32_gatherdiv4sf ((__v4sf)src,
                                               base,
                                               (__v2di)index,
                                               (__v4sf)mask,
                                               scale);
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_i64gather_ps (float const *base, __m256i index, const int scale)
{
  __v4sf src = _mm_setzero_ps ();
  __v4sf mask = _mm_cmpeq_ps (src, src);

  return (__m128) __builtin_ia32_gatherdiv4sf256 (src,
                                                  base,
                                                  (__v4di)index,
                                                  mask,
                                                  scale);
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_i64gather_ps (__m128 src, float const *base,
                          __m256i index, __m128 mask, const int scale)
{
  return (__m128) __builtin_ia32_gatherdiv4sf256 ((__v4sf)src,
                                                  base,
                                                  (__v4di)index,
                                                  (__v4sf)mask,
                                                  scale);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_i32gather_epi64 (long long int const *base,
                     __m128i index, const int scale)
{
  __v2di src = __extension__ (__v2di){ 0, 0 };
  __v2di mask = __extension__ (__v2di){ ~0, ~0 };

  return (__m128i) __builtin_ia32_gathersiv2di (src,
                                                base,
                                                (__v4si)index,
                                                mask,
                                                scale);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_i32gather_epi64 (__m128i src, long long int const *base,
                          __m128i index, __m128i mask, const int scale)
{
  return (__m128i) __builtin_ia32_gathersiv2di ((__v2di)src,
                                                base,
                                                (__v4si)index,
                                                (__v2di)mask,
                                                scale);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_i32gather_epi64 (long long int const *base,
                        __m128i index, const int scale)
{
  __v4di src = __extension__ (__v4di){ 0, 0, 0, 0 };
  __v4di mask = __extension__ (__v4di){ ~0, ~0, ~0, ~0 };

  return (__m256i) __builtin_ia32_gathersiv4di (src,
                                                base,
                                                (__v4si)index,
                                                mask,
                                                scale);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_i32gather_epi64 (__m256i src, long long int const *base,
                             __m128i index, __m256i mask, const int scale)
{
  return (__m256i) __builtin_ia32_gathersiv4di ((__v4di)src,
                                                base,
                                                (__v4si)index,
                                                (__v4di)mask,
                                                scale);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_i64gather_epi64 (long long int const *base,
                     __m128i index, const int scale)
{
  __v2di src = __extension__ (__v2di){ 0, 0 };
  __v2di mask = __extension__ (__v2di){ ~0, ~0 };

  return (__m128i) __builtin_ia32_gatherdiv2di (src,
                                                base,
                                                (__v2di)index,
                                                mask,
                                                scale);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_i64gather_epi64 (__m128i src, long long int const *base, __m128i index,
                          __m128i mask, const int scale)
{
  return (__m128i) __builtin_ia32_gatherdiv2di ((__v2di)src,
                                                base,
                                                (__v2di)index,
                                                (__v2di)mask,
                                                scale);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_i64gather_epi64 (long long int const *base,
                        __m256i index, const int scale)
{
  __v4di src = __extension__ (__v4di){ 0, 0, 0, 0 };
  __v4di mask = __extension__ (__v4di){ ~0, ~0, ~0, ~0 };

  return (__m256i) __builtin_ia32_gatherdiv4di (src,
                                                base,
                                                (__v4di)index,
                                                mask,
                                                scale);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_i64gather_epi64 (__m256i src, long long int const *base,
                             __m256i index, __m256i mask, const int scale)
{
  return (__m256i) __builtin_ia32_gatherdiv4di ((__v4di)src,
                                                base,
                                                (__v4di)index,
                                                (__v4di)mask,
                                                scale);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_i32gather_epi32 (int const *base, __m128i index, const int scale)
{
  __v4si src = __extension__ (__v4si){ 0, 0, 0, 0 };
  __v4si mask = __extension__ (__v4si){ ~0, ~0, ~0, ~0 };

  return (__m128i) __builtin_ia32_gathersiv4si (src,
                                                base,
                                                (__v4si)index,
                                                mask,
                                                scale);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_i32gather_epi32 (__m128i src, int const *base, __m128i index,
                          __m128i mask, const int scale)
{
  return (__m128i) __builtin_ia32_gathersiv4si ((__v4si)src,
                                                base,
                                                (__v4si)index,
                                                (__v4si)mask,
                                                scale);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_i32gather_epi32 (int const *base, __m256i index, const int scale)
{
  __v8si src = __extension__ (__v8si){ 0, 0, 0, 0, 0, 0, 0, 0 };
  __v8si mask = __extension__ (__v8si){ ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0 };

  return (__m256i) __builtin_ia32_gathersiv8si (src,
                                                base,
                                                (__v8si)index,
                                                mask,
                                                scale);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_i32gather_epi32 (__m256i src, int const *base,
                             __m256i index, __m256i mask, const int scale)
{
  return (__m256i) __builtin_ia32_gathersiv8si ((__v8si)src,
                                                base,
                                                (__v8si)index,
                                                (__v8si)mask,
                                                scale);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_i64gather_epi32 (int const *base, __m128i index, const int scale)
{
  __v4si src = __extension__ (__v4si){ 0, 0, 0, 0 };
  __v4si mask = __extension__ (__v4si){ ~0, ~0, ~0, ~0 };

  return (__m128i) __builtin_ia32_gatherdiv4si (src,
                                                base,
                                                (__v2di)index,
                                                mask,
                                                scale);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_i64gather_epi32 (__m128i src, int const *base, __m128i index,
                          __m128i mask, const int scale)
{
  return (__m128i) __builtin_ia32_gatherdiv4si ((__v4si)src,
                                                base,
                                                (__v2di)index,
                                                (__v4si)mask,
                                                scale);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_i64gather_epi32 (int const *base, __m256i index, const int scale)
{
  __v4si src = __extension__ (__v4si){ 0, 0, 0, 0 };
  __v4si mask = __extension__ (__v4si){ ~0, ~0, ~0, ~0 };

  return (__m128i) __builtin_ia32_gatherdiv4si256 (src,
                                                   base,
                                                   (__v4di)index,
                                                   mask,
                                                   scale);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_i64gather_epi32 (__m128i src, int const *base,
                             __m256i index, __m128i mask, const int scale)
{
  return (__m128i) __builtin_ia32_gatherdiv4si256 ((__v4si)src,
                                                   base,
                                                   (__v4di)index,
                                                   (__v4si)mask,
                                                   scale);
}
#else /* __OPTIMIZE__ */
#define _mm_i32gather_pd(BASE, INDEX, SCALE) \
  (__m128d) __builtin_ia32_gathersiv2df ((__v2df) _mm_setzero_pd (), \
                                         (double const *)BASE, \
                                         (__v4si)(__m128i)INDEX, \
                                         (__v2df)_mm_set1_pd ( \
                                           (double)(long long int) -1), \
                                         (int)SCALE)

#define _mm_mask_i32gather_pd(SRC, BASE, INDEX, MASK, SCALE) \
  (__m128d) __builtin_ia32_gathersiv2df ((__v2df)(__m128d)SRC, \
                                         (double const *)BASE, \
                                         (__v4si)(__m128i)INDEX, \
                                         (__v2df)(__m128d)MASK, \
                                         (int)SCALE)

#define _mm256_i32gather_pd(BASE, INDEX, SCALE) \
  (__m256d) __builtin_ia32_gathersiv4df ((__v4df) _mm256_setzero_pd (), \
                                         (double const *)BASE, \
                                         (__v4si)(__m128i)INDEX, \
                                         (__v4df)_mm256_set1_pd ( \
                                           (double)(long long int) -1), \
                                         (int)SCALE)

#define _mm256_mask_i32gather_pd(SRC, BASE, INDEX, MASK, SCALE) \
  (__m256d) __builtin_ia32_gathersiv4df ((__v4df)(__m256d)SRC, \
                                         (double const *)BASE, \
                                         (__v4si)(__m128i)INDEX, \
                                         (__v4df)(__m256d)MASK, \
                                         (int)SCALE)

#define _mm_i64gather_pd(BASE, INDEX, SCALE) \
  (__m128d) __builtin_ia32_gatherdiv2df ((__v2df) _mm_setzero_pd (), \
                                         (double const *)BASE, \
                                         (__v2di)(__m128i)INDEX, \
                                         (__v2df)_mm_set1_pd ( \
                                           (double)(long long int) -1), \
                                         (int)SCALE)

#define _mm_mask_i64gather_pd(SRC, BASE, INDEX, MASK, SCALE) \
  (__m128d) __builtin_ia32_gatherdiv2df ((__v2df)(__m128d)SRC, \
                                         (double const *)BASE, \
                                         (__v2di)(__m128i)INDEX, \
                                         (__v2df)(__m128d)MASK, \
                                         (int)SCALE)

#define _mm256_i64gather_pd(BASE, INDEX, SCALE) \
  (__m256d) __builtin_ia32_gatherdiv4df ((__v4df) _mm256_setzero_pd (), \
                                         (double const *)BASE, \
                                         (__v4di)(__m256i)INDEX, \
                                         (__v4df)_mm256_set1_pd ( \
                                           (double)(long long int) -1), \
                                         (int)SCALE)

#define _mm256_mask_i64gather_pd(SRC, BASE, INDEX, MASK, SCALE) \
  (__m256d) __builtin_ia32_gatherdiv4df ((__v4df)(__m256d)SRC, \
                                         (double const *)BASE, \
                                         (__v4di)(__m256i)INDEX, \
                                         (__v4df)(__m256d)MASK, \
                                         (int)SCALE)
#define _mm_i32gather_ps(BASE, INDEX, SCALE) \
  (__m128) __builtin_ia32_gathersiv4sf ((__v4sf) _mm_setzero_ps (), \
                                        (float const *)BASE, \
                                        (__v4si)(__m128i)INDEX, \
                                        _mm_set1_ps ((float)(int) -1), \
                                        (int)SCALE)

#define _mm_mask_i32gather_ps(SRC, BASE, INDEX, MASK, SCALE) \
  (__m128) __builtin_ia32_gathersiv4sf ((__v4sf)(__m128)SRC, \
                                        (float const *)BASE, \
                                        (__v4si)(__m128i)INDEX, \
                                        (__v4sf)(__m128)MASK, \
                                        (int)SCALE)

#define _mm256_i32gather_ps(BASE, INDEX, SCALE) \
  (__m256) __builtin_ia32_gathersiv8sf ((__v8sf) _mm256_setzero_ps (), \
                                        (float const *)BASE, \
                                        (__v8si)(__m256i)INDEX, \
                                        (__v8sf)_mm256_set1_ps ( \
                                          (float)(int) -1), \
                                        (int)SCALE)

#define _mm256_mask_i32gather_ps(SRC, BASE, INDEX, MASK, SCALE) \
  (__m256) __builtin_ia32_gathersiv8sf ((__v8sf)(__m256)SRC, \
                                        (float const *)BASE, \
                                        (__v8si)(__m256i)INDEX, \
                                        (__v8sf)(__m256)MASK, \
                                        (int)SCALE)

#define _mm_i64gather_ps(BASE, INDEX, SCALE) \
  (__m128) __builtin_ia32_gatherdiv4sf ((__v4sf) _mm_setzero_ps (), \
                                        (float const *)BASE, \
                                        (__v2di)(__m128i)INDEX, \
                                        (__v4sf)_mm_set1_ps ( \
                                          (float)(int) -1), \
                                        (int)SCALE)

#define _mm_mask_i64gather_ps(SRC, BASE, INDEX, MASK, SCALE) \
  (__m128) __builtin_ia32_gatherdiv4sf ((__v4sf)(__m128)SRC, \
                                        (float const *)BASE, \
                                        (__v2di)(__m128i)INDEX, \
                                        (__v4sf)(__m128)MASK, \
                                        (int)SCALE)

#define _mm256_i64gather_ps(BASE, INDEX, SCALE) \
  (__m128) __builtin_ia32_gatherdiv4sf256 ((__v4sf) _mm_setzero_ps (), \
                                           (float const *)BASE, \
                                           (__v4di)(__m256i)INDEX, \
                                           (__v4sf)_mm_set1_ps ( \
                                             (float)(int) -1), \
                                           (int)SCALE)

#define _mm256_mask_i64gather_ps(SRC, BASE, INDEX, MASK, SCALE) \
  (__m128) __builtin_ia32_gatherdiv4sf256 ((__v4sf)(__m128)SRC, \
                                           (float const *)BASE, \
                                           (__v4di)(__m256i)INDEX, \
                                           (__v4sf)(__m128)MASK, \
                                           (int)SCALE)
#define _mm_i32gather_epi64(BASE, INDEX, SCALE) \
  (__m128i) __builtin_ia32_gathersiv2di ((__v2di) _mm_setzero_si128 (), \
                                         (long long const *)BASE, \
                                         (__v4si)(__m128i)INDEX, \
                                         (__v2di)_mm_set1_epi64x (-1), \
                                         (int)SCALE)

#define _mm_mask_i32gather_epi64(SRC, BASE, INDEX, MASK, SCALE) \
  (__m128i) __builtin_ia32_gathersiv2di ((__v2di)(__m128i)SRC, \
                                         (long long const *)BASE, \
                                         (__v4si)(__m128i)INDEX, \
                                         (__v2di)(__m128i)MASK, \
                                         (int)SCALE)

#define _mm256_i32gather_epi64(BASE, INDEX, SCALE) \
  (__m256i) __builtin_ia32_gathersiv4di ((__v4di) _mm256_setzero_si256 (), \
                                         (long long const *)BASE, \
                                         (__v4si)(__m128i)INDEX, \
                                         (__v4di)_mm256_set1_epi64x (-1), \
                                         (int)SCALE)

#define _mm256_mask_i32gather_epi64(SRC, BASE, INDEX, MASK, SCALE) \
  (__m256i) __builtin_ia32_gathersiv4di ((__v4di)(__m256i)SRC, \
                                         (long long const *)BASE, \
                                         (__v4si)(__m128i)INDEX, \
                                         (__v4di)(__m256i)MASK, \
                                         (int)SCALE)

#define _mm_i64gather_epi64(BASE, INDEX, SCALE) \
  (__m128i) __builtin_ia32_gatherdiv2di ((__v2di) _mm_setzero_si128 (), \
                                         (long long const *)BASE, \
                                         (__v2di)(__m128i)INDEX, \
                                         (__v2di)_mm_set1_epi64x (-1), \
                                         (int)SCALE)

#define _mm_mask_i64gather_epi64(SRC, BASE, INDEX, MASK, SCALE) \
  (__m128i) __builtin_ia32_gatherdiv2di ((__v2di)(__m128i)SRC, \
                                         (long long const *)BASE, \
                                         (__v2di)(__m128i)INDEX, \
                                         (__v2di)(__m128i)MASK, \
                                         (int)SCALE)

#define _mm256_i64gather_epi64(BASE, INDEX, SCALE) \
  (__m256i) __builtin_ia32_gatherdiv4di ((__v4di) _mm256_setzero_si256 (), \
                                         (long long const *)BASE, \
                                         (__v4di)(__m256i)INDEX, \
                                         (__v4di)_mm256_set1_epi64x (-1), \
                                         (int)SCALE)

#define _mm256_mask_i64gather_epi64(SRC, BASE, INDEX, MASK, SCALE) \
  (__m256i) __builtin_ia32_gatherdiv4di ((__v4di)(__m256i)SRC, \
                                         (long long const *)BASE, \
                                         (__v4di)(__m256i)INDEX, \
                                         (__v4di)(__m256i)MASK, \
                                         (int)SCALE)

#define _mm_i32gather_epi32(BASE, INDEX, SCALE) \
  (__m128i) __builtin_ia32_gathersiv4si ((__v4si) _mm_setzero_si128 (), \
                                         (int const *)BASE, \
                                         (__v4si)(__m128i)INDEX, \
                                         (__v4si)_mm_set1_epi32 (-1), \
                                         (int)SCALE)

#define _mm_mask_i32gather_epi32(SRC, BASE, INDEX, MASK, SCALE) \
  (__m128i) __builtin_ia32_gathersiv4si ((__v4si)(__m128i)SRC, \
                                         (int const *)BASE, \
                                         (__v4si)(__m128i)INDEX, \
                                         (__v4si)(__m128i)MASK, \
                                         (int)SCALE)

#define _mm256_i32gather_epi32(BASE, INDEX, SCALE) \
  (__m256i) __builtin_ia32_gathersiv8si ((__v8si) _mm256_setzero_si256 (), \
                                         (int const *)BASE, \
                                         (__v8si)(__m256i)INDEX, \
                                         (__v8si)_mm256_set1_epi32 (-1), \
                                         (int)SCALE)

#define _mm256_mask_i32gather_epi32(SRC, BASE, INDEX, MASK, SCALE) \
  (__m256i) __builtin_ia32_gathersiv8si ((__v8si)(__m256i)SRC, \
                                         (int const *)BASE, \
                                         (__v8si)(__m256i)INDEX, \
                                         (__v8si)(__m256i)MASK, \
                                         (int)SCALE)

#define _mm_i64gather_epi32(BASE, INDEX, SCALE) \
  (__m128i) __builtin_ia32_gatherdiv4si ((__v4si) _mm_setzero_si128 (), \
                                         (int const *)BASE, \
                                         (__v2di)(__m128i)INDEX, \
                                         (__v4si)_mm_set1_epi32 (-1), \
                                         (int)SCALE)

#define _mm_mask_i64gather_epi32(SRC, BASE, INDEX, MASK, SCALE) \
  (__m128i) __builtin_ia32_gatherdiv4si ((__v4si)(__m128i)SRC, \
                                         (int const *)BASE, \
                                         (__v2di)(__m128i)INDEX, \
                                         (__v4si)(__m128i)MASK, \
                                         (int)SCALE)

#define _mm256_i64gather_epi32(BASE, INDEX, SCALE) \
  (__m128i) __builtin_ia32_gatherdiv4si256 ((__v4si) _mm_setzero_si128 (), \
                                            (int const *)BASE, \
                                            (__v4di)(__m256i)INDEX, \
                                            (__v4si)_mm_set1_epi32 (-1), \
                                            (int)SCALE)

#define _mm256_mask_i64gather_epi32(SRC, BASE, INDEX, MASK, SCALE) \
  (__m128i) __builtin_ia32_gatherdiv4si256 ((__v4si)(__m128i)SRC, \
                                            (int const *)BASE, \
                                            (__v4di)(__m256i)INDEX, \
                                            (__v4si)(__m128i)MASK, \
                                            (int)SCALE)
#endif /* __OPTIMIZE__ */
#ifdef __DISABLE_AVX2__
#undef __DISABLE_AVX2__
#pragma GCC pop_options
#endif /* __DISABLE_AVX2__ */

#endif /* _AVX2INTRIN_H_INCLUDED */