/* gcc/config/i386/avx2intrin.h — AVX2 intrinsics wrappers.  */
/* Copyright (C) 2011-2015 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */
24 #ifndef _IMMINTRIN_H_INCLUDED
25 # error "Never use <avx2intrin.h> directly; include <immintrin.h> instead."
26 #endif
28 #ifndef _AVX2INTRIN_H_INCLUDED
29 #define _AVX2INTRIN_H_INCLUDED
31 #ifndef __AVX2__
32 #pragma GCC push_options
33 #pragma GCC target("avx2")
34 #define __DISABLE_AVX2__
35 #endif /* __AVX2__ */
/* Sum absolute 8-bit integer difference of adjacent groups of 4
   byte integers in the first 2 operands.  Starting offsets within
   operands are determined by the 3rd mask operand.  */
#ifdef __OPTIMIZE__
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mpsadbw_epu8 (__m256i __X, __m256i __Y, const int __M)
{
  return (__m256i) __builtin_ia32_mpsadbw256 ((__v32qi)__X,
					      (__v32qi)__Y, __M);
}
#else
/* Without optimization the immediate would not fold into the insn;
   use a macro so __M stays a compile-time constant.  */
#define _mm256_mpsadbw_epu8(X, Y, M)					\
  ((__m256i) __builtin_ia32_mpsadbw256 ((__v32qi)(__m256i)(X),		\
					(__v32qi)(__m256i)(Y), (int)(M)))
#endif
54 extern __inline __m256i
55 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
56 _mm256_abs_epi8 (__m256i __A)
58 return (__m256i)__builtin_ia32_pabsb256 ((__v32qi)__A);
61 extern __inline __m256i
62 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
63 _mm256_abs_epi16 (__m256i __A)
65 return (__m256i)__builtin_ia32_pabsw256 ((__v16hi)__A);
68 extern __inline __m256i
69 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
70 _mm256_abs_epi32 (__m256i __A)
72 return (__m256i)__builtin_ia32_pabsd256 ((__v8si)__A);
75 extern __inline __m256i
76 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
77 _mm256_packs_epi32 (__m256i __A, __m256i __B)
79 return (__m256i)__builtin_ia32_packssdw256 ((__v8si)__A, (__v8si)__B);
82 extern __inline __m256i
83 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
84 _mm256_packs_epi16 (__m256i __A, __m256i __B)
86 return (__m256i)__builtin_ia32_packsswb256 ((__v16hi)__A, (__v16hi)__B);
89 extern __inline __m256i
90 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
91 _mm256_packus_epi32 (__m256i __A, __m256i __B)
93 return (__m256i)__builtin_ia32_packusdw256 ((__v8si)__A, (__v8si)__B);
96 extern __inline __m256i
97 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
98 _mm256_packus_epi16 (__m256i __A, __m256i __B)
100 return (__m256i)__builtin_ia32_packuswb256 ((__v16hi)__A, (__v16hi)__B);
103 extern __inline __m256i
104 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
105 _mm256_add_epi8 (__m256i __A, __m256i __B)
107 return (__m256i) ((__v32qu)__A + (__v32qu)__B);
110 extern __inline __m256i
111 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
112 _mm256_add_epi16 (__m256i __A, __m256i __B)
114 return (__m256i) ((__v16hu)__A + (__v16hu)__B);
117 extern __inline __m256i
118 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
119 _mm256_add_epi32 (__m256i __A, __m256i __B)
121 return (__m256i) ((__v8su)__A + (__v8su)__B);
124 extern __inline __m256i
125 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
126 _mm256_add_epi64 (__m256i __A, __m256i __B)
128 return (__m256i) ((__v4du)__A + (__v4du)__B);
131 extern __inline __m256i
132 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
133 _mm256_adds_epi8 (__m256i __A, __m256i __B)
135 return (__m256i)__builtin_ia32_paddsb256 ((__v32qi)__A, (__v32qi)__B);
138 extern __inline __m256i
139 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
140 _mm256_adds_epi16 (__m256i __A, __m256i __B)
142 return (__m256i)__builtin_ia32_paddsw256 ((__v16hi)__A, (__v16hi)__B);
145 extern __inline __m256i
146 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
147 _mm256_adds_epu8 (__m256i __A, __m256i __B)
149 return (__m256i)__builtin_ia32_paddusb256 ((__v32qi)__A, (__v32qi)__B);
152 extern __inline __m256i
153 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
154 _mm256_adds_epu16 (__m256i __A, __m256i __B)
156 return (__m256i)__builtin_ia32_paddusw256 ((__v16hi)__A, (__v16hi)__B);
/* Concatenate each 128-bit lane pair of __A/__B and shift right by
   __N bytes (the builtin takes the count in bits, hence * 8).  */
#ifdef __OPTIMIZE__
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_alignr_epi8 (__m256i __A, __m256i __B, const int __N)
{
  return (__m256i) __builtin_ia32_palignr256 ((__v4di)__A,
					      (__v4di)__B,
					      __N * 8);
}
#else
/* In that case (__N*8) will be in vreg, and insn will not be matched. */
/* Use define instead */
#define _mm256_alignr_epi8(A, B, N)				   \
  ((__m256i) __builtin_ia32_palignr256 ((__v4di)(__m256i)(A),	   \
					(__v4di)(__m256i)(B),	   \
					(int)(N) * 8))
#endif
177 extern __inline __m256i
178 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
179 _mm256_and_si256 (__m256i __A, __m256i __B)
181 return (__m256i) ((__v4du)__A & (__v4du)__B);
184 extern __inline __m256i
185 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
186 _mm256_andnot_si256 (__m256i __A, __m256i __B)
188 return (__m256i) __builtin_ia32_andnotsi256 ((__v4di)__A, (__v4di)__B);
191 extern __inline __m256i
192 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
193 _mm256_avg_epu8 (__m256i __A, __m256i __B)
195 return (__m256i)__builtin_ia32_pavgb256 ((__v32qi)__A, (__v32qi)__B);
198 extern __inline __m256i
199 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
200 _mm256_avg_epu16 (__m256i __A, __m256i __B)
202 return (__m256i)__builtin_ia32_pavgw256 ((__v16hi)__A, (__v16hi)__B);
205 extern __inline __m256i
206 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
207 _mm256_blendv_epi8 (__m256i __X, __m256i __Y, __m256i __M)
209 return (__m256i) __builtin_ia32_pblendvb256 ((__v32qi)__X,
210 (__v32qi)__Y,
211 (__v32qi)__M);
/* Select 16-bit elements from __X or __Y per immediate mask __M.  */
#ifdef __OPTIMIZE__
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_blend_epi16 (__m256i __X, __m256i __Y, const int __M)
{
  return (__m256i) __builtin_ia32_pblendw256 ((__v16hi)__X,
					      (__v16hi)__Y,
					      __M);
}
#else
#define _mm256_blend_epi16(X, Y, M)					\
  ((__m256i) __builtin_ia32_pblendw256 ((__v16hi)(__m256i)(X),		\
					(__v16hi)(__m256i)(Y), (int)(M)))
#endif
229 extern __inline __m256i
230 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
231 _mm256_cmpeq_epi8 (__m256i __A, __m256i __B)
233 return (__m256i) ((__v32qi)__A == (__v32qi)__B);
236 extern __inline __m256i
237 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
238 _mm256_cmpeq_epi16 (__m256i __A, __m256i __B)
240 return (__m256i) ((__v16hi)__A == (__v16hi)__B);
243 extern __inline __m256i
244 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
245 _mm256_cmpeq_epi32 (__m256i __A, __m256i __B)
247 return (__m256i) ((__v8si)__A == (__v8si)__B);
250 extern __inline __m256i
251 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
252 _mm256_cmpeq_epi64 (__m256i __A, __m256i __B)
254 return (__m256i) ((__v4di)__A == (__v4di)__B);
257 extern __inline __m256i
258 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
259 _mm256_cmpgt_epi8 (__m256i __A, __m256i __B)
261 return (__m256i) ((__v32qi)__A > (__v32qi)__B);
264 extern __inline __m256i
265 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
266 _mm256_cmpgt_epi16 (__m256i __A, __m256i __B)
268 return (__m256i) ((__v16hi)__A > (__v16hi)__B);
271 extern __inline __m256i
272 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
273 _mm256_cmpgt_epi32 (__m256i __A, __m256i __B)
275 return (__m256i) ((__v8si)__A > (__v8si)__B);
278 extern __inline __m256i
279 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
280 _mm256_cmpgt_epi64 (__m256i __A, __m256i __B)
282 return (__m256i) ((__v4di)__A > (__v4di)__B);
285 extern __inline __m256i
286 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
287 _mm256_hadd_epi16 (__m256i __X, __m256i __Y)
289 return (__m256i) __builtin_ia32_phaddw256 ((__v16hi)__X,
290 (__v16hi)__Y);
293 extern __inline __m256i
294 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
295 _mm256_hadd_epi32 (__m256i __X, __m256i __Y)
297 return (__m256i) __builtin_ia32_phaddd256 ((__v8si)__X, (__v8si)__Y);
300 extern __inline __m256i
301 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
302 _mm256_hadds_epi16 (__m256i __X, __m256i __Y)
304 return (__m256i) __builtin_ia32_phaddsw256 ((__v16hi)__X,
305 (__v16hi)__Y);
308 extern __inline __m256i
309 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
310 _mm256_hsub_epi16 (__m256i __X, __m256i __Y)
312 return (__m256i) __builtin_ia32_phsubw256 ((__v16hi)__X,
313 (__v16hi)__Y);
316 extern __inline __m256i
317 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
318 _mm256_hsub_epi32 (__m256i __X, __m256i __Y)
320 return (__m256i) __builtin_ia32_phsubd256 ((__v8si)__X, (__v8si)__Y);
323 extern __inline __m256i
324 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
325 _mm256_hsubs_epi16 (__m256i __X, __m256i __Y)
327 return (__m256i) __builtin_ia32_phsubsw256 ((__v16hi)__X,
328 (__v16hi)__Y);
331 extern __inline __m256i
332 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
333 _mm256_maddubs_epi16 (__m256i __X, __m256i __Y)
335 return (__m256i) __builtin_ia32_pmaddubsw256 ((__v32qi)__X,
336 (__v32qi)__Y);
339 extern __inline __m256i
340 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
341 _mm256_madd_epi16 (__m256i __A, __m256i __B)
343 return (__m256i)__builtin_ia32_pmaddwd256 ((__v16hi)__A,
344 (__v16hi)__B);
347 extern __inline __m256i
348 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
349 _mm256_max_epi8 (__m256i __A, __m256i __B)
351 return (__m256i)__builtin_ia32_pmaxsb256 ((__v32qi)__A, (__v32qi)__B);
354 extern __inline __m256i
355 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
356 _mm256_max_epi16 (__m256i __A, __m256i __B)
358 return (__m256i)__builtin_ia32_pmaxsw256 ((__v16hi)__A, (__v16hi)__B);
361 extern __inline __m256i
362 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
363 _mm256_max_epi32 (__m256i __A, __m256i __B)
365 return (__m256i)__builtin_ia32_pmaxsd256 ((__v8si)__A, (__v8si)__B);
368 extern __inline __m256i
369 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
370 _mm256_max_epu8 (__m256i __A, __m256i __B)
372 return (__m256i)__builtin_ia32_pmaxub256 ((__v32qi)__A, (__v32qi)__B);
375 extern __inline __m256i
376 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
377 _mm256_max_epu16 (__m256i __A, __m256i __B)
379 return (__m256i)__builtin_ia32_pmaxuw256 ((__v16hi)__A, (__v16hi)__B);
382 extern __inline __m256i
383 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
384 _mm256_max_epu32 (__m256i __A, __m256i __B)
386 return (__m256i)__builtin_ia32_pmaxud256 ((__v8si)__A, (__v8si)__B);
389 extern __inline __m256i
390 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
391 _mm256_min_epi8 (__m256i __A, __m256i __B)
393 return (__m256i)__builtin_ia32_pminsb256 ((__v32qi)__A, (__v32qi)__B);
396 extern __inline __m256i
397 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
398 _mm256_min_epi16 (__m256i __A, __m256i __B)
400 return (__m256i)__builtin_ia32_pminsw256 ((__v16hi)__A, (__v16hi)__B);
403 extern __inline __m256i
404 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
405 _mm256_min_epi32 (__m256i __A, __m256i __B)
407 return (__m256i)__builtin_ia32_pminsd256 ((__v8si)__A, (__v8si)__B);
410 extern __inline __m256i
411 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
412 _mm256_min_epu8 (__m256i __A, __m256i __B)
414 return (__m256i)__builtin_ia32_pminub256 ((__v32qi)__A, (__v32qi)__B);
417 extern __inline __m256i
418 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
419 _mm256_min_epu16 (__m256i __A, __m256i __B)
421 return (__m256i)__builtin_ia32_pminuw256 ((__v16hi)__A, (__v16hi)__B);
424 extern __inline __m256i
425 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
426 _mm256_min_epu32 (__m256i __A, __m256i __B)
428 return (__m256i)__builtin_ia32_pminud256 ((__v8si)__A, (__v8si)__B);
431 extern __inline int
432 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
433 _mm256_movemask_epi8 (__m256i __A)
435 return __builtin_ia32_pmovmskb256 ((__v32qi)__A);
438 extern __inline __m256i
439 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
440 _mm256_cvtepi8_epi16 (__m128i __X)
442 return (__m256i) __builtin_ia32_pmovsxbw256 ((__v16qi)__X);
445 extern __inline __m256i
446 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
447 _mm256_cvtepi8_epi32 (__m128i __X)
449 return (__m256i) __builtin_ia32_pmovsxbd256 ((__v16qi)__X);
452 extern __inline __m256i
453 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
454 _mm256_cvtepi8_epi64 (__m128i __X)
456 return (__m256i) __builtin_ia32_pmovsxbq256 ((__v16qi)__X);
459 extern __inline __m256i
460 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
461 _mm256_cvtepi16_epi32 (__m128i __X)
463 return (__m256i) __builtin_ia32_pmovsxwd256 ((__v8hi)__X);
466 extern __inline __m256i
467 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
468 _mm256_cvtepi16_epi64 (__m128i __X)
470 return (__m256i) __builtin_ia32_pmovsxwq256 ((__v8hi)__X);
473 extern __inline __m256i
474 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
475 _mm256_cvtepi32_epi64 (__m128i __X)
477 return (__m256i) __builtin_ia32_pmovsxdq256 ((__v4si)__X);
480 extern __inline __m256i
481 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
482 _mm256_cvtepu8_epi16 (__m128i __X)
484 return (__m256i) __builtin_ia32_pmovzxbw256 ((__v16qi)__X);
487 extern __inline __m256i
488 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
489 _mm256_cvtepu8_epi32 (__m128i __X)
491 return (__m256i) __builtin_ia32_pmovzxbd256 ((__v16qi)__X);
494 extern __inline __m256i
495 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
496 _mm256_cvtepu8_epi64 (__m128i __X)
498 return (__m256i) __builtin_ia32_pmovzxbq256 ((__v16qi)__X);
501 extern __inline __m256i
502 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
503 _mm256_cvtepu16_epi32 (__m128i __X)
505 return (__m256i) __builtin_ia32_pmovzxwd256 ((__v8hi)__X);
508 extern __inline __m256i
509 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
510 _mm256_cvtepu16_epi64 (__m128i __X)
512 return (__m256i) __builtin_ia32_pmovzxwq256 ((__v8hi)__X);
515 extern __inline __m256i
516 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
517 _mm256_cvtepu32_epi64 (__m128i __X)
519 return (__m256i) __builtin_ia32_pmovzxdq256 ((__v4si)__X);
522 extern __inline __m256i
523 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
524 _mm256_mul_epi32 (__m256i __X, __m256i __Y)
526 return (__m256i) __builtin_ia32_pmuldq256 ((__v8si)__X, (__v8si)__Y);
529 extern __inline __m256i
530 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
531 _mm256_mulhrs_epi16 (__m256i __X, __m256i __Y)
533 return (__m256i) __builtin_ia32_pmulhrsw256 ((__v16hi)__X,
534 (__v16hi)__Y);
537 extern __inline __m256i
538 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
539 _mm256_mulhi_epu16 (__m256i __A, __m256i __B)
541 return (__m256i)__builtin_ia32_pmulhuw256 ((__v16hi)__A, (__v16hi)__B);
544 extern __inline __m256i
545 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
546 _mm256_mulhi_epi16 (__m256i __A, __m256i __B)
548 return (__m256i)__builtin_ia32_pmulhw256 ((__v16hi)__A, (__v16hi)__B);
551 extern __inline __m256i
552 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
553 _mm256_mullo_epi16 (__m256i __A, __m256i __B)
555 return (__m256i) ((__v16hu)__A * (__v16hu)__B);
558 extern __inline __m256i
559 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
560 _mm256_mullo_epi32 (__m256i __A, __m256i __B)
562 return (__m256i) ((__v8su)__A * (__v8su)__B);
565 extern __inline __m256i
566 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
567 _mm256_mul_epu32 (__m256i __A, __m256i __B)
569 return (__m256i)__builtin_ia32_pmuludq256 ((__v8si)__A, (__v8si)__B);
572 extern __inline __m256i
573 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
574 _mm256_or_si256 (__m256i __A, __m256i __B)
576 return (__m256i) ((__v4du)__A | (__v4du)__B);
579 extern __inline __m256i
580 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
581 _mm256_sad_epu8 (__m256i __A, __m256i __B)
583 return (__m256i)__builtin_ia32_psadbw256 ((__v32qi)__A, (__v32qi)__B);
586 extern __inline __m256i
587 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
588 _mm256_shuffle_epi8 (__m256i __X, __m256i __Y)
590 return (__m256i) __builtin_ia32_pshufb256 ((__v32qi)__X,
591 (__v32qi)__Y);
/* Immediate-controlled shuffles; macro forms keep the mask a constant
   when not optimizing.  */
#ifdef __OPTIMIZE__
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_shuffle_epi32 (__m256i __A, const int __mask)
{
  return (__m256i)__builtin_ia32_pshufd256 ((__v8si)__A, __mask);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_shufflehi_epi16 (__m256i __A, const int __mask)
{
  return (__m256i)__builtin_ia32_pshufhw256 ((__v16hi)__A, __mask);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_shufflelo_epi16 (__m256i __A, const int __mask)
{
  return (__m256i)__builtin_ia32_pshuflw256 ((__v16hi)__A, __mask);
}
#else
#define _mm256_shuffle_epi32(A, N) \
  ((__m256i)__builtin_ia32_pshufd256 ((__v8si)(__m256i)(A), (int)(N)))
#define _mm256_shufflehi_epi16(A, N) \
  ((__m256i)__builtin_ia32_pshufhw256 ((__v16hi)(__m256i)(A), (int)(N)))
#define _mm256_shufflelo_epi16(A, N) \
  ((__m256i)__builtin_ia32_pshuflw256 ((__v16hi)(__m256i)(A), (int)(N)))
#endif
624 extern __inline __m256i
625 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
626 _mm256_sign_epi8 (__m256i __X, __m256i __Y)
628 return (__m256i) __builtin_ia32_psignb256 ((__v32qi)__X, (__v32qi)__Y);
631 extern __inline __m256i
632 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
633 _mm256_sign_epi16 (__m256i __X, __m256i __Y)
635 return (__m256i) __builtin_ia32_psignw256 ((__v16hi)__X, (__v16hi)__Y);
638 extern __inline __m256i
639 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
640 _mm256_sign_epi32 (__m256i __X, __m256i __Y)
642 return (__m256i) __builtin_ia32_psignd256 ((__v8si)__X, (__v8si)__Y);
/* Byte shift left within each 128-bit lane; the builtin takes the
   count in bits, hence * 8.  _mm256_bslli_epi128 is a synonym.  */
#ifdef __OPTIMIZE__
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_bslli_epi128 (__m256i __A, const int __N)
{
  return (__m256i)__builtin_ia32_pslldqi256 (__A, __N * 8);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_slli_si256 (__m256i __A, const int __N)
{
  return (__m256i)__builtin_ia32_pslldqi256 (__A, __N * 8);
}
#else
#define _mm256_bslli_epi128(A, N) \
  ((__m256i)__builtin_ia32_pslldqi256 ((__m256i)(A), (int)(N) * 8))
#define _mm256_slli_si256(A, N) \
  ((__m256i)__builtin_ia32_pslldqi256 ((__m256i)(A), (int)(N) * 8))
#endif
666 extern __inline __m256i
667 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
668 _mm256_slli_epi16 (__m256i __A, int __B)
670 return (__m256i)__builtin_ia32_psllwi256 ((__v16hi)__A, __B);
673 extern __inline __m256i
674 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
675 _mm256_sll_epi16 (__m256i __A, __m128i __B)
677 return (__m256i)__builtin_ia32_psllw256((__v16hi)__A, (__v8hi)__B);
680 extern __inline __m256i
681 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
682 _mm256_slli_epi32 (__m256i __A, int __B)
684 return (__m256i)__builtin_ia32_pslldi256 ((__v8si)__A, __B);
687 extern __inline __m256i
688 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
689 _mm256_sll_epi32 (__m256i __A, __m128i __B)
691 return (__m256i)__builtin_ia32_pslld256((__v8si)__A, (__v4si)__B);
694 extern __inline __m256i
695 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
696 _mm256_slli_epi64 (__m256i __A, int __B)
698 return (__m256i)__builtin_ia32_psllqi256 ((__v4di)__A, __B);
701 extern __inline __m256i
702 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
703 _mm256_sll_epi64 (__m256i __A, __m128i __B)
705 return (__m256i)__builtin_ia32_psllq256((__v4di)__A, (__v2di)__B);
708 extern __inline __m256i
709 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
710 _mm256_srai_epi16 (__m256i __A, int __B)
712 return (__m256i)__builtin_ia32_psrawi256 ((__v16hi)__A, __B);
715 extern __inline __m256i
716 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
717 _mm256_sra_epi16 (__m256i __A, __m128i __B)
719 return (__m256i)__builtin_ia32_psraw256 ((__v16hi)__A, (__v8hi)__B);
722 extern __inline __m256i
723 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
724 _mm256_srai_epi32 (__m256i __A, int __B)
726 return (__m256i)__builtin_ia32_psradi256 ((__v8si)__A, __B);
729 extern __inline __m256i
730 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
731 _mm256_sra_epi32 (__m256i __A, __m128i __B)
733 return (__m256i)__builtin_ia32_psrad256 ((__v8si)__A, (__v4si)__B);
/* Byte shift right within each 128-bit lane; the builtin takes the
   count in bits, hence * 8.  _mm256_bsrli_epi128 is a synonym.  */
#ifdef __OPTIMIZE__
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_bsrli_epi128 (__m256i __A, const int __N)
{
  return (__m256i)__builtin_ia32_psrldqi256 (__A, __N * 8);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_srli_si256 (__m256i __A, const int __N)
{
  return (__m256i)__builtin_ia32_psrldqi256 (__A, __N * 8);
}
#else
#define _mm256_bsrli_epi128(A, N) \
  ((__m256i)__builtin_ia32_psrldqi256 ((__m256i)(A), (int)(N) * 8))
#define _mm256_srli_si256(A, N) \
  ((__m256i)__builtin_ia32_psrldqi256 ((__m256i)(A), (int)(N) * 8))
#endif
757 extern __inline __m256i
758 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
759 _mm256_srli_epi16 (__m256i __A, int __B)
761 return (__m256i)__builtin_ia32_psrlwi256 ((__v16hi)__A, __B);
764 extern __inline __m256i
765 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
766 _mm256_srl_epi16 (__m256i __A, __m128i __B)
768 return (__m256i)__builtin_ia32_psrlw256((__v16hi)__A, (__v8hi)__B);
771 extern __inline __m256i
772 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
773 _mm256_srli_epi32 (__m256i __A, int __B)
775 return (__m256i)__builtin_ia32_psrldi256 ((__v8si)__A, __B);
778 extern __inline __m256i
779 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
780 _mm256_srl_epi32 (__m256i __A, __m128i __B)
782 return (__m256i)__builtin_ia32_psrld256((__v8si)__A, (__v4si)__B);
785 extern __inline __m256i
786 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
787 _mm256_srli_epi64 (__m256i __A, int __B)
789 return (__m256i)__builtin_ia32_psrlqi256 ((__v4di)__A, __B);
792 extern __inline __m256i
793 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
794 _mm256_srl_epi64 (__m256i __A, __m128i __B)
796 return (__m256i)__builtin_ia32_psrlq256((__v4di)__A, (__v2di)__B);
799 extern __inline __m256i
800 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
801 _mm256_sub_epi8 (__m256i __A, __m256i __B)
803 return (__m256i) ((__v32qu)__A - (__v32qu)__B);
806 extern __inline __m256i
807 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
808 _mm256_sub_epi16 (__m256i __A, __m256i __B)
810 return (__m256i) ((__v16hu)__A - (__v16hu)__B);
813 extern __inline __m256i
814 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
815 _mm256_sub_epi32 (__m256i __A, __m256i __B)
817 return (__m256i) ((__v8su)__A - (__v8su)__B);
820 extern __inline __m256i
821 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
822 _mm256_sub_epi64 (__m256i __A, __m256i __B)
824 return (__m256i) ((__v4du)__A - (__v4du)__B);
827 extern __inline __m256i
828 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
829 _mm256_subs_epi8 (__m256i __A, __m256i __B)
831 return (__m256i)__builtin_ia32_psubsb256 ((__v32qi)__A, (__v32qi)__B);
834 extern __inline __m256i
835 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
836 _mm256_subs_epi16 (__m256i __A, __m256i __B)
838 return (__m256i)__builtin_ia32_psubsw256 ((__v16hi)__A, (__v16hi)__B);
841 extern __inline __m256i
842 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
843 _mm256_subs_epu8 (__m256i __A, __m256i __B)
845 return (__m256i)__builtin_ia32_psubusb256 ((__v32qi)__A, (__v32qi)__B);
848 extern __inline __m256i
849 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
850 _mm256_subs_epu16 (__m256i __A, __m256i __B)
852 return (__m256i)__builtin_ia32_psubusw256 ((__v16hi)__A, (__v16hi)__B);
855 extern __inline __m256i
856 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
857 _mm256_unpackhi_epi8 (__m256i __A, __m256i __B)
859 return (__m256i)__builtin_ia32_punpckhbw256 ((__v32qi)__A, (__v32qi)__B);
862 extern __inline __m256i
863 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
864 _mm256_unpackhi_epi16 (__m256i __A, __m256i __B)
866 return (__m256i)__builtin_ia32_punpckhwd256 ((__v16hi)__A, (__v16hi)__B);
869 extern __inline __m256i
870 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
871 _mm256_unpackhi_epi32 (__m256i __A, __m256i __B)
873 return (__m256i)__builtin_ia32_punpckhdq256 ((__v8si)__A, (__v8si)__B);
876 extern __inline __m256i
877 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
878 _mm256_unpackhi_epi64 (__m256i __A, __m256i __B)
880 return (__m256i)__builtin_ia32_punpckhqdq256 ((__v4di)__A, (__v4di)__B);
883 extern __inline __m256i
884 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
885 _mm256_unpacklo_epi8 (__m256i __A, __m256i __B)
887 return (__m256i)__builtin_ia32_punpcklbw256 ((__v32qi)__A, (__v32qi)__B);
890 extern __inline __m256i
891 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
892 _mm256_unpacklo_epi16 (__m256i __A, __m256i __B)
894 return (__m256i)__builtin_ia32_punpcklwd256 ((__v16hi)__A, (__v16hi)__B);
897 extern __inline __m256i
898 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
899 _mm256_unpacklo_epi32 (__m256i __A, __m256i __B)
901 return (__m256i)__builtin_ia32_punpckldq256 ((__v8si)__A, (__v8si)__B);
904 extern __inline __m256i
905 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
906 _mm256_unpacklo_epi64 (__m256i __A, __m256i __B)
908 return (__m256i)__builtin_ia32_punpcklqdq256 ((__v4di)__A, (__v4di)__B);
911 extern __inline __m256i
912 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
913 _mm256_xor_si256 (__m256i __A, __m256i __B)
915 return (__m256i) ((__v4du)__A ^ (__v4du)__B);
918 extern __inline __m256i
919 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
920 _mm256_stream_load_si256 (__m256i const *__X)
922 return (__m256i) __builtin_ia32_movntdqa256 ((__v4di *) __X);
925 extern __inline __m128
926 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
927 _mm_broadcastss_ps (__m128 __X)
929 return (__m128) __builtin_ia32_vbroadcastss_ps ((__v4sf)__X);
932 extern __inline __m256
933 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
934 _mm256_broadcastss_ps (__m128 __X)
936 return (__m256) __builtin_ia32_vbroadcastss_ps256 ((__v4sf)__X);
939 extern __inline __m256d
940 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
941 _mm256_broadcastsd_pd (__m128d __X)
943 return (__m256d) __builtin_ia32_vbroadcastsd_pd256 ((__v2df)__X);
946 extern __inline __m256i
947 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
948 _mm256_broadcastsi128_si256 (__m128i __X)
950 return (__m256i) __builtin_ia32_vbroadcastsi256 ((__v2di)__X);
/* Select 32-bit elements from __X or __Y per immediate mask __M.  */
#ifdef __OPTIMIZE__
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_blend_epi32 (__m128i __X, __m128i __Y, const int __M)
{
  return (__m128i) __builtin_ia32_pblendd128 ((__v4si)__X,
					      (__v4si)__Y,
					      __M);
}
#else
#define _mm_blend_epi32(X, Y, M)					\
  ((__m128i) __builtin_ia32_pblendd128 ((__v4si)(__m128i)(X),		\
					(__v4si)(__m128i)(Y), (int)(M)))
#endif

#ifdef __OPTIMIZE__
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_blend_epi32 (__m256i __X, __m256i __Y, const int __M)
{
  return (__m256i) __builtin_ia32_pblendd256 ((__v8si)__X,
					      (__v8si)__Y,
					      __M);
}
#else
#define _mm256_blend_epi32(X, Y, M)					\
  ((__m256i) __builtin_ia32_pblendd256 ((__v8si)(__m256i)(X),		\
					(__v8si)(__m256i)(Y), (int)(M)))
#endif
983 extern __inline __m256i
984 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
985 _mm256_broadcastb_epi8 (__m128i __X)
987 return (__m256i) __builtin_ia32_pbroadcastb256 ((__v16qi)__X);
990 extern __inline __m256i
991 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
992 _mm256_broadcastw_epi16 (__m128i __X)
994 return (__m256i) __builtin_ia32_pbroadcastw256 ((__v8hi)__X);
997 extern __inline __m256i
998 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
999 _mm256_broadcastd_epi32 (__m128i __X)
1001 return (__m256i) __builtin_ia32_pbroadcastd256 ((__v4si)__X);
1004 extern __inline __m256i
1005 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1006 _mm256_broadcastq_epi64 (__m128i __X)
1008 return (__m256i) __builtin_ia32_pbroadcastq256 ((__v2di)__X);
1011 extern __inline __m128i
1012 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1013 _mm_broadcastb_epi8 (__m128i __X)
1015 return (__m128i) __builtin_ia32_pbroadcastb128 ((__v16qi)__X);
1018 extern __inline __m128i
1019 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1020 _mm_broadcastw_epi16 (__m128i __X)
1022 return (__m128i) __builtin_ia32_pbroadcastw128 ((__v8hi)__X);
1025 extern __inline __m128i
1026 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1027 _mm_broadcastd_epi32 (__m128i __X)
1029 return (__m128i) __builtin_ia32_pbroadcastd128 ((__v4si)__X);
1032 extern __inline __m128i
1033 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1034 _mm_broadcastq_epi64 (__m128i __X)
1036 return (__m128i) __builtin_ia32_pbroadcastq128 ((__v2di)__X);
1039 extern __inline __m256i
1040 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1041 _mm256_permutevar8x32_epi32 (__m256i __X, __m256i __Y)
1043 return (__m256i) __builtin_ia32_permvarsi256 ((__v8si)__X, (__v8si)__Y);
#ifdef __OPTIMIZE__
extern __inline __m256d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute4x64_pd (__m256d __X, const int __M)
{
  /* Shuffle the four doubles of __X; each 2-bit field of the immediate
     __M selects a source element.  */
  return (__m256d) __builtin_ia32_permdf256 ((__v4df)__X, __M);
}
#else
#define _mm256_permute4x64_pd(X, M) \
  ((__m256d) __builtin_ia32_permdf256 ((__v4df)(__m256d)(X), (int)(M)))
#endif
1058 extern __inline __m256
1059 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1060 _mm256_permutevar8x32_ps (__m256 __X, __m256i __Y)
1062 return (__m256) __builtin_ia32_permvarsf256 ((__v8sf)__X, (__v8si)__Y);
#ifdef __OPTIMIZE__
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute4x64_epi64 (__m256i __X, const int __M)
{
  /* Integer counterpart of _mm256_permute4x64_pd.  */
  return (__m256i) __builtin_ia32_permdi256 ((__v4di)__X, __M);
}
#else
#define _mm256_permute4x64_epi64(X, M) \
  ((__m256i) __builtin_ia32_permdi256 ((__v4di)(__m256i)(X), (int)(M)))
#endif
#ifdef __OPTIMIZE__
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute2x128_si256 (__m256i __X, __m256i __Y, const int __M)
{
  /* Select 128-bit halves from __X and __Y per the control byte __M.  */
  return (__m256i) __builtin_ia32_permti256 ((__v4di)__X, (__v4di)__Y, __M);
}
#else
#define _mm256_permute2x128_si256(X, Y, M) \
  ((__m256i) __builtin_ia32_permti256 ((__v4di)(__m256i)(X), \
				       (__v4di)(__m256i)(Y), (int)(M)))
#endif
#ifdef __OPTIMIZE__
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extracti128_si256 (__m256i __X, const int __M)
{
  /* Extract the 128-bit lane of __X selected by the immediate __M.  */
  return (__m128i) __builtin_ia32_extract128i256 ((__v4di)__X, __M);
}
#else
#define _mm256_extracti128_si256(X, M) \
  ((__m128i) __builtin_ia32_extract128i256 ((__v4di)(__m256i)(X), (int)(M)))
#endif
#ifdef __OPTIMIZE__
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_inserti128_si256 (__m256i __X, __m128i __Y, const int __M)
{
  /* Insert __Y into the 128-bit lane of __X selected by __M.  */
  return (__m256i) __builtin_ia32_insert128i256 ((__v4di)__X, (__v2di)__Y,
						 __M);
}
#else
#define _mm256_inserti128_si256(X, Y, M) \
  ((__m256i) __builtin_ia32_insert128i256 ((__v4di)(__m256i)(X), \
					   (__v2di)(__m128i)(Y), \
					   (int)(M)))
#endif
1116 extern __inline __m256i
1117 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1118 _mm256_maskload_epi32 (int const *__X, __m256i __M )
1120 return (__m256i) __builtin_ia32_maskloadd256 ((const __v8si *)__X,
1121 (__v8si)__M);
1124 extern __inline __m256i
1125 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1126 _mm256_maskload_epi64 (long long const *__X, __m256i __M )
1128 return (__m256i) __builtin_ia32_maskloadq256 ((const __v4di *)__X,
1129 (__v4di)__M);
1132 extern __inline __m128i
1133 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1134 _mm_maskload_epi32 (int const *__X, __m128i __M )
1136 return (__m128i) __builtin_ia32_maskloadd ((const __v4si *)__X,
1137 (__v4si)__M);
1140 extern __inline __m128i
1141 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1142 _mm_maskload_epi64 (long long const *__X, __m128i __M )
1144 return (__m128i) __builtin_ia32_maskloadq ((const __v2di *)__X,
1145 (__v2di)__M);
1148 extern __inline void
1149 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1150 _mm256_maskstore_epi32 (int *__X, __m256i __M, __m256i __Y )
1152 __builtin_ia32_maskstored256 ((__v8si *)__X, (__v8si)__M, (__v8si)__Y);
1155 extern __inline void
1156 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1157 _mm256_maskstore_epi64 (long long *__X, __m256i __M, __m256i __Y )
1159 __builtin_ia32_maskstoreq256 ((__v4di *)__X, (__v4di)__M, (__v4di)__Y);
1162 extern __inline void
1163 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1164 _mm_maskstore_epi32 (int *__X, __m128i __M, __m128i __Y )
1166 __builtin_ia32_maskstored ((__v4si *)__X, (__v4si)__M, (__v4si)__Y);
1169 extern __inline void
1170 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1171 _mm_maskstore_epi64 (long long *__X, __m128i __M, __m128i __Y )
1173 __builtin_ia32_maskstoreq (( __v2di *)__X, (__v2di)__M, (__v2di)__Y);
1176 extern __inline __m256i
1177 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1178 _mm256_sllv_epi32 (__m256i __X, __m256i __Y)
1180 return (__m256i) __builtin_ia32_psllv8si ((__v8si)__X, (__v8si)__Y);
1183 extern __inline __m128i
1184 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1185 _mm_sllv_epi32 (__m128i __X, __m128i __Y)
1187 return (__m128i) __builtin_ia32_psllv4si ((__v4si)__X, (__v4si)__Y);
1190 extern __inline __m256i
1191 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1192 _mm256_sllv_epi64 (__m256i __X, __m256i __Y)
1194 return (__m256i) __builtin_ia32_psllv4di ((__v4di)__X, (__v4di)__Y);
1197 extern __inline __m128i
1198 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1199 _mm_sllv_epi64 (__m128i __X, __m128i __Y)
1201 return (__m128i) __builtin_ia32_psllv2di ((__v2di)__X, (__v2di)__Y);
1204 extern __inline __m256i
1205 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1206 _mm256_srav_epi32 (__m256i __X, __m256i __Y)
1208 return (__m256i) __builtin_ia32_psrav8si ((__v8si)__X, (__v8si)__Y);
1211 extern __inline __m128i
1212 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1213 _mm_srav_epi32 (__m128i __X, __m128i __Y)
1215 return (__m128i) __builtin_ia32_psrav4si ((__v4si)__X, (__v4si)__Y);
1218 extern __inline __m256i
1219 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1220 _mm256_srlv_epi32 (__m256i __X, __m256i __Y)
1222 return (__m256i) __builtin_ia32_psrlv8si ((__v8si)__X, (__v8si)__Y);
1225 extern __inline __m128i
1226 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1227 _mm_srlv_epi32 (__m128i __X, __m128i __Y)
1229 return (__m128i) __builtin_ia32_psrlv4si ((__v4si)__X, (__v4si)__Y);
1232 extern __inline __m256i
1233 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1234 _mm256_srlv_epi64 (__m256i __X, __m256i __Y)
1236 return (__m256i) __builtin_ia32_psrlv4di ((__v4di)__X, (__v4di)__Y);
1239 extern __inline __m128i
1240 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1241 _mm_srlv_epi64 (__m128i __X, __m128i __Y)
1243 return (__m128i) __builtin_ia32_psrlv2di ((__v2di)__X, (__v2di)__Y);
1246 #ifdef __OPTIMIZE__
1247 extern __inline __m128d
1248 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1249 _mm_i32gather_pd (double const *base, __m128i index, const int scale)
1251 __v2df zero = _mm_setzero_pd ();
1252 __v2df mask = _mm_cmpeq_pd (zero, zero);
1254 return (__m128d) __builtin_ia32_gathersiv2df (_mm_undefined_pd (),
1255 base,
1256 (__v4si)index,
1257 mask,
1258 scale);
1261 extern __inline __m128d
1262 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1263 _mm_mask_i32gather_pd (__m128d src, double const *base, __m128i index,
1264 __m128d mask, const int scale)
1266 return (__m128d) __builtin_ia32_gathersiv2df ((__v2df)src,
1267 base,
1268 (__v4si)index,
1269 (__v2df)mask,
1270 scale);
1273 extern __inline __m256d
1274 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1275 _mm256_i32gather_pd (double const *base, __m128i index, const int scale)
1277 __v4df zero = _mm256_setzero_pd ();
1278 __v4df mask = _mm256_cmp_pd (zero, zero, _CMP_EQ_OQ);
1280 return (__m256d) __builtin_ia32_gathersiv4df (_mm256_undefined_pd (),
1281 base,
1282 (__v4si)index,
1283 mask,
1284 scale);
1287 extern __inline __m256d
1288 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1289 _mm256_mask_i32gather_pd (__m256d src, double const *base,
1290 __m128i index, __m256d mask, const int scale)
1292 return (__m256d) __builtin_ia32_gathersiv4df ((__v4df)src,
1293 base,
1294 (__v4si)index,
1295 (__v4df)mask,
1296 scale);
1299 extern __inline __m128d
1300 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1301 _mm_i64gather_pd (double const *base, __m128i index, const int scale)
1303 __v2df src = _mm_setzero_pd ();
1304 __v2df mask = _mm_cmpeq_pd (src, src);
1306 return (__m128d) __builtin_ia32_gatherdiv2df (src,
1307 base,
1308 (__v2di)index,
1309 mask,
1310 scale);
1313 extern __inline __m128d
1314 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1315 _mm_mask_i64gather_pd (__m128d src, double const *base, __m128i index,
1316 __m128d mask, const int scale)
1318 return (__m128d) __builtin_ia32_gatherdiv2df ((__v2df)src,
1319 base,
1320 (__v2di)index,
1321 (__v2df)mask,
1322 scale);
1325 extern __inline __m256d
1326 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1327 _mm256_i64gather_pd (double const *base, __m256i index, const int scale)
1329 __v4df src = _mm256_setzero_pd ();
1330 __v4df mask = _mm256_cmp_pd (src, src, _CMP_EQ_OQ);
1332 return (__m256d) __builtin_ia32_gatherdiv4df (src,
1333 base,
1334 (__v4di)index,
1335 mask,
1336 scale);
1339 extern __inline __m256d
1340 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1341 _mm256_mask_i64gather_pd (__m256d src, double const *base,
1342 __m256i index, __m256d mask, const int scale)
1344 return (__m256d) __builtin_ia32_gatherdiv4df ((__v4df)src,
1345 base,
1346 (__v4di)index,
1347 (__v4df)mask,
1348 scale);
1351 extern __inline __m128
1352 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1353 _mm_i32gather_ps (float const *base, __m128i index, const int scale)
1355 __v4sf src = _mm_setzero_ps ();
1356 __v4sf mask = _mm_cmpeq_ps (src, src);
1358 return (__m128) __builtin_ia32_gathersiv4sf (src,
1359 base,
1360 (__v4si)index,
1361 mask,
1362 scale);
1365 extern __inline __m128
1366 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1367 _mm_mask_i32gather_ps (__m128 src, float const *base, __m128i index,
1368 __m128 mask, const int scale)
1370 return (__m128) __builtin_ia32_gathersiv4sf ((__v4sf)src,
1371 base,
1372 (__v4si)index,
1373 (__v4sf)mask,
1374 scale);
1377 extern __inline __m256
1378 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1379 _mm256_i32gather_ps (float const *base, __m256i index, const int scale)
1381 __v8sf src = _mm256_setzero_ps ();
1382 __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ);
1384 return (__m256) __builtin_ia32_gathersiv8sf (src,
1385 base,
1386 (__v8si)index,
1387 mask,
1388 scale);
1391 extern __inline __m256
1392 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1393 _mm256_mask_i32gather_ps (__m256 src, float const *base,
1394 __m256i index, __m256 mask, const int scale)
1396 return (__m256) __builtin_ia32_gathersiv8sf ((__v8sf)src,
1397 base,
1398 (__v8si)index,
1399 (__v8sf)mask,
1400 scale);
1403 extern __inline __m128
1404 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1405 _mm_i64gather_ps (float const *base, __m128i index, const int scale)
1407 __v4sf src = _mm_setzero_ps ();
1408 __v4sf mask = _mm_cmpeq_ps (src, src);
1410 return (__m128) __builtin_ia32_gatherdiv4sf (src,
1411 base,
1412 (__v2di)index,
1413 mask,
1414 scale);
1417 extern __inline __m128
1418 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1419 _mm_mask_i64gather_ps (__m128 src, float const *base, __m128i index,
1420 __m128 mask, const int scale)
1422 return (__m128) __builtin_ia32_gatherdiv4sf ((__v4sf)src,
1423 base,
1424 (__v2di)index,
1425 (__v4sf)mask,
1426 scale);
1429 extern __inline __m128
1430 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1431 _mm256_i64gather_ps (float const *base, __m256i index, const int scale)
1433 __v4sf src = _mm_setzero_ps ();
1434 __v4sf mask = _mm_cmpeq_ps (src, src);
1436 return (__m128) __builtin_ia32_gatherdiv4sf256 (src,
1437 base,
1438 (__v4di)index,
1439 mask,
1440 scale);
1443 extern __inline __m128
1444 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1445 _mm256_mask_i64gather_ps (__m128 src, float const *base,
1446 __m256i index, __m128 mask, const int scale)
1448 return (__m128) __builtin_ia32_gatherdiv4sf256 ((__v4sf)src,
1449 base,
1450 (__v4di)index,
1451 (__v4sf)mask,
1452 scale);
1455 extern __inline __m128i
1456 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1457 _mm_i32gather_epi64 (long long int const *base,
1458 __m128i index, const int scale)
1460 __v2di src = __extension__ (__v2di){ 0, 0 };
1461 __v2di mask = __extension__ (__v2di){ ~0, ~0 };
1463 return (__m128i) __builtin_ia32_gathersiv2di (src,
1464 base,
1465 (__v4si)index,
1466 mask,
1467 scale);
1470 extern __inline __m128i
1471 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1472 _mm_mask_i32gather_epi64 (__m128i src, long long int const *base,
1473 __m128i index, __m128i mask, const int scale)
1475 return (__m128i) __builtin_ia32_gathersiv2di ((__v2di)src,
1476 base,
1477 (__v4si)index,
1478 (__v2di)mask,
1479 scale);
1482 extern __inline __m256i
1483 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1484 _mm256_i32gather_epi64 (long long int const *base,
1485 __m128i index, const int scale)
1487 __v4di src = __extension__ (__v4di){ 0, 0, 0, 0 };
1488 __v4di mask = __extension__ (__v4di){ ~0, ~0, ~0, ~0 };
1490 return (__m256i) __builtin_ia32_gathersiv4di (src,
1491 base,
1492 (__v4si)index,
1493 mask,
1494 scale);
1497 extern __inline __m256i
1498 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1499 _mm256_mask_i32gather_epi64 (__m256i src, long long int const *base,
1500 __m128i index, __m256i mask, const int scale)
1502 return (__m256i) __builtin_ia32_gathersiv4di ((__v4di)src,
1503 base,
1504 (__v4si)index,
1505 (__v4di)mask,
1506 scale);
1509 extern __inline __m128i
1510 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1511 _mm_i64gather_epi64 (long long int const *base,
1512 __m128i index, const int scale)
1514 __v2di src = __extension__ (__v2di){ 0, 0 };
1515 __v2di mask = __extension__ (__v2di){ ~0, ~0 };
1517 return (__m128i) __builtin_ia32_gatherdiv2di (src,
1518 base,
1519 (__v2di)index,
1520 mask,
1521 scale);
1524 extern __inline __m128i
1525 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1526 _mm_mask_i64gather_epi64 (__m128i src, long long int const *base, __m128i index,
1527 __m128i mask, const int scale)
1529 return (__m128i) __builtin_ia32_gatherdiv2di ((__v2di)src,
1530 base,
1531 (__v2di)index,
1532 (__v2di)mask,
1533 scale);
1536 extern __inline __m256i
1537 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1538 _mm256_i64gather_epi64 (long long int const *base,
1539 __m256i index, const int scale)
1541 __v4di src = __extension__ (__v4di){ 0, 0, 0, 0 };
1542 __v4di mask = __extension__ (__v4di){ ~0, ~0, ~0, ~0 };
1544 return (__m256i) __builtin_ia32_gatherdiv4di (src,
1545 base,
1546 (__v4di)index,
1547 mask,
1548 scale);
1551 extern __inline __m256i
1552 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1553 _mm256_mask_i64gather_epi64 (__m256i src, long long int const *base,
1554 __m256i index, __m256i mask, const int scale)
1556 return (__m256i) __builtin_ia32_gatherdiv4di ((__v4di)src,
1557 base,
1558 (__v4di)index,
1559 (__v4di)mask,
1560 scale);
1563 extern __inline __m128i
1564 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1565 _mm_i32gather_epi32 (int const *base, __m128i index, const int scale)
1567 __v4si src = __extension__ (__v4si){ 0, 0, 0, 0 };
1568 __v4si mask = __extension__ (__v4si){ ~0, ~0, ~0, ~0 };
1570 return (__m128i) __builtin_ia32_gathersiv4si (src,
1571 base,
1572 (__v4si)index,
1573 mask,
1574 scale);
1577 extern __inline __m128i
1578 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1579 _mm_mask_i32gather_epi32 (__m128i src, int const *base, __m128i index,
1580 __m128i mask, const int scale)
1582 return (__m128i) __builtin_ia32_gathersiv4si ((__v4si)src,
1583 base,
1584 (__v4si)index,
1585 (__v4si)mask,
1586 scale);
1589 extern __inline __m256i
1590 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1591 _mm256_i32gather_epi32 (int const *base, __m256i index, const int scale)
1593 __v8si src = __extension__ (__v8si){ 0, 0, 0, 0, 0, 0, 0, 0 };
1594 __v8si mask = __extension__ (__v8si){ ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0 };
1596 return (__m256i) __builtin_ia32_gathersiv8si (src,
1597 base,
1598 (__v8si)index,
1599 mask,
1600 scale);
1603 extern __inline __m256i
1604 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1605 _mm256_mask_i32gather_epi32 (__m256i src, int const *base,
1606 __m256i index, __m256i mask, const int scale)
1608 return (__m256i) __builtin_ia32_gathersiv8si ((__v8si)src,
1609 base,
1610 (__v8si)index,
1611 (__v8si)mask,
1612 scale);
1615 extern __inline __m128i
1616 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1617 _mm_i64gather_epi32 (int const *base, __m128i index, const int scale)
1619 __v4si src = __extension__ (__v4si){ 0, 0, 0, 0 };
1620 __v4si mask = __extension__ (__v4si){ ~0, ~0, ~0, ~0 };
1622 return (__m128i) __builtin_ia32_gatherdiv4si (src,
1623 base,
1624 (__v2di)index,
1625 mask,
1626 scale);
1629 extern __inline __m128i
1630 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1631 _mm_mask_i64gather_epi32 (__m128i src, int const *base, __m128i index,
1632 __m128i mask, const int scale)
1634 return (__m128i) __builtin_ia32_gatherdiv4si ((__v4si)src,
1635 base,
1636 (__v2di)index,
1637 (__v4si)mask,
1638 scale);
1641 extern __inline __m128i
1642 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1643 _mm256_i64gather_epi32 (int const *base, __m256i index, const int scale)
1645 __v4si src = __extension__ (__v4si){ 0, 0, 0, 0 };
1646 __v4si mask = __extension__ (__v4si){ ~0, ~0, ~0, ~0 };
1648 return (__m128i) __builtin_ia32_gatherdiv4si256 (src,
1649 base,
1650 (__v4di)index,
1651 mask,
1652 scale);
1655 extern __inline __m128i
1656 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1657 _mm256_mask_i64gather_epi32 (__m128i src, int const *base,
1658 __m256i index, __m128i mask, const int scale)
1660 return (__m128i) __builtin_ia32_gatherdiv4si256 ((__v4si)src,
1661 base,
1662 (__v4di)index,
1663 (__v4si)mask,
1664 scale);
1666 #else /* __OPTIMIZE__ */
1667 #define _mm_i32gather_pd(BASE, INDEX, SCALE) \
1668 (__m128d) __builtin_ia32_gathersiv2df ((__v2df) _mm_setzero_pd (), \
1669 (double const *)BASE, \
1670 (__v4si)(__m128i)INDEX, \
1671 (__v2df)_mm_set1_pd( \
1672 (double)(long long int) -1), \
1673 (int)SCALE)
1675 #define _mm_mask_i32gather_pd(SRC, BASE, INDEX, MASK, SCALE) \
1676 (__m128d) __builtin_ia32_gathersiv2df ((__v2df)(__m128d)SRC, \
1677 (double const *)BASE, \
1678 (__v4si)(__m128i)INDEX, \
1679 (__v2df)(__m128d)MASK, \
1680 (int)SCALE)
1682 #define _mm256_i32gather_pd(BASE, INDEX, SCALE) \
1683 (__m256d) __builtin_ia32_gathersiv4df ((__v4df) _mm256_setzero_pd (), \
1684 (double const *)BASE, \
1685 (__v4si)(__m128i)INDEX, \
1686 (__v4df)_mm256_set1_pd( \
1687 (double)(long long int) -1), \
1688 (int)SCALE)
1690 #define _mm256_mask_i32gather_pd(SRC, BASE, INDEX, MASK, SCALE) \
1691 (__m256d) __builtin_ia32_gathersiv4df ((__v4df)(__m256d)SRC, \
1692 (double const *)BASE, \
1693 (__v4si)(__m128i)INDEX, \
1694 (__v4df)(__m256d)MASK, \
1695 (int)SCALE)
1697 #define _mm_i64gather_pd(BASE, INDEX, SCALE) \
1698 (__m128d) __builtin_ia32_gatherdiv2df ((__v2df) _mm_setzero_pd (), \
1699 (double const *)BASE, \
1700 (__v2di)(__m128i)INDEX, \
1701 (__v2df)_mm_set1_pd( \
1702 (double)(long long int) -1), \
1703 (int)SCALE)
1705 #define _mm_mask_i64gather_pd(SRC, BASE, INDEX, MASK, SCALE) \
1706 (__m128d) __builtin_ia32_gatherdiv2df ((__v2df)(__m128d)SRC, \
1707 (double const *)BASE, \
1708 (__v2di)(__m128i)INDEX, \
1709 (__v2df)(__m128d)MASK, \
1710 (int)SCALE)
1712 #define _mm256_i64gather_pd(BASE, INDEX, SCALE) \
1713 (__m256d) __builtin_ia32_gatherdiv4df ((__v4df) _mm256_setzero_pd (), \
1714 (double const *)BASE, \
1715 (__v4di)(__m256i)INDEX, \
1716 (__v4df)_mm256_set1_pd( \
1717 (double)(long long int) -1), \
1718 (int)SCALE)
1720 #define _mm256_mask_i64gather_pd(SRC, BASE, INDEX, MASK, SCALE) \
1721 (__m256d) __builtin_ia32_gatherdiv4df ((__v4df)(__m256d)SRC, \
1722 (double const *)BASE, \
1723 (__v4di)(__m256i)INDEX, \
1724 (__v4df)(__m256d)MASK, \
1725 (int)SCALE)
/* Non-optimizing macro forms of the single-precision gathers.
   Fixes relative to the previous revision: SRC/MASK operands are
   __m128/__m256 values, so the intermediate casts must go through the
   single-precision vector types (the old code went through
   __m128d/__m256d — bit-identical at run time but type-incorrect and
   inconsistent with the inline versions), and _mm_i64gather_ps must
   build its zero source with _mm_setzero_ps, not _mm_setzero_pd.  */
#define _mm_i32gather_ps(BASE, INDEX, SCALE) \
  (__m128) __builtin_ia32_gathersiv4sf ((__v4sf) _mm_setzero_ps (), \
					(float const *)BASE, \
					(__v4si)(__m128i)INDEX, \
					(__v4sf)_mm_set1_ps ( \
					  (float)(int) -1), \
					(int)SCALE)

#define _mm_mask_i32gather_ps(SRC, BASE, INDEX, MASK, SCALE) \
  (__m128) __builtin_ia32_gathersiv4sf ((__v4sf)(__m128)SRC, \
					(float const *)BASE, \
					(__v4si)(__m128i)INDEX, \
					(__v4sf)(__m128)MASK, \
					(int)SCALE)

#define _mm256_i32gather_ps(BASE, INDEX, SCALE) \
  (__m256) __builtin_ia32_gathersiv8sf ((__v8sf) _mm256_setzero_ps (), \
					(float const *)BASE, \
					(__v8si)(__m256i)INDEX, \
					(__v8sf)_mm256_set1_ps ( \
					  (float)(int) -1), \
					(int)SCALE)

#define _mm256_mask_i32gather_ps(SRC, BASE, INDEX, MASK, SCALE) \
  (__m256) __builtin_ia32_gathersiv8sf ((__v8sf)(__m256)SRC, \
					(float const *)BASE, \
					(__v8si)(__m256i)INDEX, \
					(__v8sf)(__m256)MASK, \
					(int)SCALE)

#define _mm_i64gather_ps(BASE, INDEX, SCALE) \
  (__m128) __builtin_ia32_gatherdiv4sf ((__v4sf) _mm_setzero_ps (), \
					(float const *)BASE, \
					(__v2di)(__m128i)INDEX, \
					(__v4sf)_mm_set1_ps ( \
					  (float)(int) -1), \
					(int)SCALE)

#define _mm_mask_i64gather_ps(SRC, BASE, INDEX, MASK, SCALE) \
  (__m128) __builtin_ia32_gatherdiv4sf ((__v4sf)(__m128)SRC, \
					(float const *)BASE, \
					(__v2di)(__m128i)INDEX, \
					(__v4sf)(__m128)MASK, \
					(int)SCALE)

#define _mm256_i64gather_ps(BASE, INDEX, SCALE) \
  (__m128) __builtin_ia32_gatherdiv4sf256 ((__v4sf) _mm_setzero_ps (), \
					   (float const *)BASE, \
					   (__v4di)(__m256i)INDEX, \
					   (__v4sf)_mm_set1_ps( \
					     (float)(int) -1), \
					   (int)SCALE)

#define _mm256_mask_i64gather_ps(SRC, BASE, INDEX, MASK, SCALE) \
  (__m128) __builtin_ia32_gatherdiv4sf256 ((__v4sf)(__m128)SRC, \
					   (float const *)BASE, \
					   (__v4di)(__m256i)INDEX, \
					   (__v4sf)(__m128)MASK, \
					   (int)SCALE)
1786 #define _mm_i32gather_epi64(BASE, INDEX, SCALE) \
1787 (__m128i) __builtin_ia32_gathersiv2di ((__v2di) _mm_setzero_si128 (), \
1788 (long long const *)BASE, \
1789 (__v4si)(__m128i)INDEX, \
1790 (__v2di)_mm_set1_epi64x (-1), \
1791 (int)SCALE)
1793 #define _mm_mask_i32gather_epi64(SRC, BASE, INDEX, MASK, SCALE) \
1794 (__m128i) __builtin_ia32_gathersiv2di ((__v2di)(__m128i)SRC, \
1795 (long long const *)BASE, \
1796 (__v4si)(__m128i)INDEX, \
1797 (__v2di)(__m128i)MASK, \
1798 (int)SCALE)
1800 #define _mm256_i32gather_epi64(BASE, INDEX, SCALE) \
1801 (__m256i) __builtin_ia32_gathersiv4di ((__v4di) _mm256_setzero_si256 (), \
1802 (long long const *)BASE, \
1803 (__v4si)(__m128i)INDEX, \
1804 (__v4di)_mm256_set1_epi64x (-1), \
1805 (int)SCALE)
1807 #define _mm256_mask_i32gather_epi64(SRC, BASE, INDEX, MASK, SCALE) \
1808 (__m256i) __builtin_ia32_gathersiv4di ((__v4di)(__m256i)SRC, \
1809 (long long const *)BASE, \
1810 (__v4si)(__m128i)INDEX, \
1811 (__v4di)(__m256i)MASK, \
1812 (int)SCALE)
1814 #define _mm_i64gather_epi64(BASE, INDEX, SCALE) \
1815 (__m128i) __builtin_ia32_gatherdiv2di ((__v2di) _mm_setzero_si128 (), \
1816 (long long const *)BASE, \
1817 (__v2di)(__m128i)INDEX, \
1818 (__v2di)_mm_set1_epi64x (-1), \
1819 (int)SCALE)
1821 #define _mm_mask_i64gather_epi64(SRC, BASE, INDEX, MASK, SCALE) \
1822 (__m128i) __builtin_ia32_gatherdiv2di ((__v2di)(__m128i)SRC, \
1823 (long long const *)BASE, \
1824 (__v2di)(__m128i)INDEX, \
1825 (__v2di)(__m128i)MASK, \
1826 (int)SCALE)
1828 #define _mm256_i64gather_epi64(BASE, INDEX, SCALE) \
1829 (__m256i) __builtin_ia32_gatherdiv4di ((__v4di) _mm256_setzero_si256 (), \
1830 (long long const *)BASE, \
1831 (__v4di)(__m256i)INDEX, \
1832 (__v4di)_mm256_set1_epi64x (-1), \
1833 (int)SCALE)
1835 #define _mm256_mask_i64gather_epi64(SRC, BASE, INDEX, MASK, SCALE) \
1836 (__m256i) __builtin_ia32_gatherdiv4di ((__v4di)(__m256i)SRC, \
1837 (long long const *)BASE, \
1838 (__v4di)(__m256i)INDEX, \
1839 (__v4di)(__m256i)MASK, \
1840 (int)SCALE)
1842 #define _mm_i32gather_epi32(BASE, INDEX, SCALE) \
1843 (__m128i) __builtin_ia32_gathersiv4si ((__v4si) _mm_setzero_si128 (), \
1844 (int const *)BASE, \
1845 (__v4si)(__m128i)INDEX, \
1846 (__v4si)_mm_set1_epi32 (-1), \
1847 (int)SCALE)
1849 #define _mm_mask_i32gather_epi32(SRC, BASE, INDEX, MASK, SCALE) \
1850 (__m128i) __builtin_ia32_gathersiv4si ((__v4si)(__m128i)SRC, \
1851 (int const *)BASE, \
1852 (__v4si)(__m128i)INDEX, \
1853 (__v4si)(__m128i)MASK, \
1854 (int)SCALE)
1856 #define _mm256_i32gather_epi32(BASE, INDEX, SCALE) \
1857 (__m256i) __builtin_ia32_gathersiv8si ((__v8si) _mm256_setzero_si256 (), \
1858 (int const *)BASE, \
1859 (__v8si)(__m256i)INDEX, \
1860 (__v8si)_mm256_set1_epi32 (-1), \
1861 (int)SCALE)
1863 #define _mm256_mask_i32gather_epi32(SRC, BASE, INDEX, MASK, SCALE) \
1864 (__m256i) __builtin_ia32_gathersiv8si ((__v8si)(__m256i)SRC, \
1865 (int const *)BASE, \
1866 (__v8si)(__m256i)INDEX, \
1867 (__v8si)(__m256i)MASK, \
1868 (int)SCALE)
1870 #define _mm_i64gather_epi32(BASE, INDEX, SCALE) \
1871 (__m128i) __builtin_ia32_gatherdiv4si ((__v4si) _mm_setzero_si128 (), \
1872 (int const *)BASE, \
1873 (__v2di)(__m128i)INDEX, \
1874 (__v4si)_mm_set1_epi32 (-1), \
1875 (int)SCALE)
1877 #define _mm_mask_i64gather_epi32(SRC, BASE, INDEX, MASK, SCALE) \
1878 (__m128i) __builtin_ia32_gatherdiv4si ((__v4si)(__m128i)SRC, \
1879 (int const *)BASE, \
1880 (__v2di)(__m128i)INDEX, \
1881 (__v4si)(__m128i)MASK, \
1882 (int)SCALE)
1884 #define _mm256_i64gather_epi32(BASE, INDEX, SCALE) \
1885 (__m128i) __builtin_ia32_gatherdiv4si256 ((__v4si) _mm_setzero_si128 (), \
1886 (int const *)BASE, \
1887 (__v4di)(__m256i)INDEX, \
1888 (__v4si)_mm_set1_epi32(-1), \
1889 (int)SCALE)
1891 #define _mm256_mask_i64gather_epi32(SRC, BASE, INDEX, MASK, SCALE) \
1892 (__m128i) __builtin_ia32_gatherdiv4si256 ((__v4si)(__m128i)SRC, \
1893 (int const *)BASE, \
1894 (__v4di)(__m256i)INDEX, \
1895 (__v4si)(__m128i)MASK, \
1896 (int)SCALE)
1897 #endif /* __OPTIMIZE__ */
1899 #ifdef __DISABLE_AVX2__
1900 #undef __DISABLE_AVX2__
1901 #pragma GCC pop_options
1902 #endif /* __DISABLE_AVX2__ */
1904 #endif /* _AVX2INTRIN_H_INCLUDED */