/* gcc/config/i386/avx2intrin.h (official-gcc.git; cf. PR 56490).  */
/* Copyright (C) 2011-2013 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */
24 #ifndef _IMMINTRIN_H_INCLUDED
25 # error "Never use <avx2intrin.h> directly; include <immintrin.h> instead."
26 #endif
/* Sum absolute 8-bit integer difference of adjacent groups of 4
   byte integers in the first 2 operands.  Starting offsets within
   operands are determined by the 3rd mask operand.  */
#ifdef __OPTIMIZE__
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mpsadbw_epu8 (__m256i __X, __m256i __Y, const int __M)
{
  return (__m256i) __builtin_ia32_mpsadbw256 ((__v32qi)__X,
					      (__v32qi)__Y, __M);
}
#else
/* __M must be an immediate, so fall back to a macro when not optimizing.  */
#define _mm256_mpsadbw_epu8(X, Y, M)					\
  ((__m256i) __builtin_ia32_mpsadbw256 ((__v32qi)(__m256i)(X),		\
					(__v32qi)(__m256i)(Y), (int)(M)))
#endif
45 extern __inline __m256i
46 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
47 _mm256_abs_epi8 (__m256i __A)
49 return (__m256i)__builtin_ia32_pabsb256 ((__v32qi)__A);
52 extern __inline __m256i
53 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
54 _mm256_abs_epi16 (__m256i __A)
56 return (__m256i)__builtin_ia32_pabsw256 ((__v16hi)__A);
59 extern __inline __m256i
60 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
61 _mm256_abs_epi32 (__m256i __A)
63 return (__m256i)__builtin_ia32_pabsd256 ((__v8si)__A);
66 extern __inline __m256i
67 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
68 _mm256_packs_epi32 (__m256i __A, __m256i __B)
70 return (__m256i)__builtin_ia32_packssdw256 ((__v8si)__A, (__v8si)__B);
73 extern __inline __m256i
74 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
75 _mm256_packs_epi16 (__m256i __A, __m256i __B)
77 return (__m256i)__builtin_ia32_packsswb256 ((__v16hi)__A, (__v16hi)__B);
80 extern __inline __m256i
81 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
82 _mm256_packus_epi32 (__m256i __A, __m256i __B)
84 return (__m256i)__builtin_ia32_packusdw256 ((__v8si)__A, (__v8si)__B);
87 extern __inline __m256i
88 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
89 _mm256_packus_epi16 (__m256i __A, __m256i __B)
91 return (__m256i)__builtin_ia32_packuswb256 ((__v16hi)__A, (__v16hi)__B);
94 extern __inline __m256i
95 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
96 _mm256_add_epi8 (__m256i __A, __m256i __B)
98 return (__m256i)__builtin_ia32_paddb256 ((__v32qi)__A, (__v32qi)__B);
101 extern __inline __m256i
102 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
103 _mm256_add_epi16 (__m256i __A, __m256i __B)
105 return (__m256i)__builtin_ia32_paddw256 ((__v16hi)__A, (__v16hi)__B);
108 extern __inline __m256i
109 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
110 _mm256_add_epi32 (__m256i __A, __m256i __B)
112 return (__m256i)__builtin_ia32_paddd256 ((__v8si)__A, (__v8si)__B);
115 extern __inline __m256i
116 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
117 _mm256_add_epi64 (__m256i __A, __m256i __B)
119 return (__m256i)__builtin_ia32_paddq256 ((__v4di)__A, (__v4di)__B);
122 extern __inline __m256i
123 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
124 _mm256_adds_epi8 (__m256i __A, __m256i __B)
126 return (__m256i)__builtin_ia32_paddsb256 ((__v32qi)__A, (__v32qi)__B);
129 extern __inline __m256i
130 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
131 _mm256_adds_epi16 (__m256i __A, __m256i __B)
133 return (__m256i)__builtin_ia32_paddsw256 ((__v16hi)__A, (__v16hi)__B);
136 extern __inline __m256i
137 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
138 _mm256_adds_epu8 (__m256i __A, __m256i __B)
140 return (__m256i)__builtin_ia32_paddusb256 ((__v32qi)__A, (__v32qi)__B);
143 extern __inline __m256i
144 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
145 _mm256_adds_epu16 (__m256i __A, __m256i __B)
147 return (__m256i)__builtin_ia32_paddusw256 ((__v16hi)__A, (__v16hi)__B);
/* Concatenate and byte-shift right within each 128-bit lane (VPALIGNR).
   The builtin takes the shift in bits, hence the * 8.  */
#ifdef __OPTIMIZE__
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_alignr_epi8 (__m256i __A, __m256i __B, const int __N)
{
  return (__m256i) __builtin_ia32_palignr256 ((__v4di)__A,
					      (__v4di)__B,
					      __N * 8);
}
#else
/* In that case (__N*8) will be in vreg, and insn will not be matched. */
/* Use define instead */
#define _mm256_alignr_epi8(A, B, N)					\
  ((__m256i) __builtin_ia32_palignr256 ((__v4di)(__m256i)(A),		\
					(__v4di)(__m256i)(B),		\
					(int)(N) * 8))
#endif
168 extern __inline __m256i
169 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
170 _mm256_and_si256 (__m256i __A, __m256i __B)
172 return (__m256i) __builtin_ia32_andsi256 ((__v4di)__A, (__v4di)__B);
175 extern __inline __m256i
176 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
177 _mm256_andnot_si256 (__m256i __A, __m256i __B)
179 return (__m256i) __builtin_ia32_andnotsi256 ((__v4di)__A, (__v4di)__B);
182 extern __inline __m256i
183 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
184 _mm256_avg_epu8 (__m256i __A, __m256i __B)
186 return (__m256i)__builtin_ia32_pavgb256 ((__v32qi)__A, (__v32qi)__B);
189 extern __inline __m256i
190 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
191 _mm256_avg_epu16 (__m256i __A, __m256i __B)
193 return (__m256i)__builtin_ia32_pavgw256 ((__v16hi)__A, (__v16hi)__B);
196 extern __inline __m256i
197 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
198 _mm256_blendv_epi8 (__m256i __X, __m256i __Y, __m256i __M)
200 return (__m256i) __builtin_ia32_pblendvb256 ((__v32qi)__X,
201 (__v32qi)__Y,
202 (__v32qi)__M);
205 #ifdef __OPTIMIZE__
206 extern __inline __m256i
207 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
208 _mm256_blend_epi16 (__m256i __X, __m256i __Y, const int __M)
210 return (__m256i) __builtin_ia32_pblendw256 ((__v16hi)__X,
211 (__v16hi)__Y,
212 __M);
214 #else
215 #define _mm256_blend_epi16(X, Y, M) \
216 ((__m256i) __builtin_ia32_pblendw256 ((__v16hi)(__m256i)(X), \
217 (__v16hi)(__m256i)(Y), (int)(M)))
218 #endif
220 extern __inline __m256i
221 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
222 _mm256_cmpeq_epi8 (__m256i __A, __m256i __B)
224 return (__m256i)__builtin_ia32_pcmpeqb256 ((__v32qi)__A, (__v32qi)__B);
227 extern __inline __m256i
228 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
229 _mm256_cmpeq_epi16 (__m256i __A, __m256i __B)
231 return (__m256i)__builtin_ia32_pcmpeqw256 ((__v16hi)__A, (__v16hi)__B);
234 extern __inline __m256i
235 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
236 _mm256_cmpeq_epi32 (__m256i __A, __m256i __B)
238 return (__m256i)__builtin_ia32_pcmpeqd256 ((__v8si)__A, (__v8si)__B);
241 extern __inline __m256i
242 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
243 _mm256_cmpeq_epi64 (__m256i __A, __m256i __B)
245 return (__m256i)__builtin_ia32_pcmpeqq256 ((__v4di)__A, (__v4di)__B);
248 extern __inline __m256i
249 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
250 _mm256_cmpgt_epi8 (__m256i __A, __m256i __B)
252 return (__m256i)__builtin_ia32_pcmpgtb256 ((__v32qi)__A,
253 (__v32qi)__B);
256 extern __inline __m256i
257 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
258 _mm256_cmpgt_epi16 (__m256i __A, __m256i __B)
260 return (__m256i)__builtin_ia32_pcmpgtw256 ((__v16hi)__A,
261 (__v16hi)__B);
264 extern __inline __m256i
265 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
266 _mm256_cmpgt_epi32 (__m256i __A, __m256i __B)
268 return (__m256i)__builtin_ia32_pcmpgtd256 ((__v8si)__A,
269 (__v8si)__B);
272 extern __inline __m256i
273 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
274 _mm256_cmpgt_epi64 (__m256i __A, __m256i __B)
276 return (__m256i)__builtin_ia32_pcmpgtq256 ((__v4di)__A, (__v4di)__B);
279 extern __inline __m256i
280 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
281 _mm256_hadd_epi16 (__m256i __X, __m256i __Y)
283 return (__m256i) __builtin_ia32_phaddw256 ((__v16hi)__X,
284 (__v16hi)__Y);
287 extern __inline __m256i
288 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
289 _mm256_hadd_epi32 (__m256i __X, __m256i __Y)
291 return (__m256i) __builtin_ia32_phaddd256 ((__v8si)__X, (__v8si)__Y);
294 extern __inline __m256i
295 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
296 _mm256_hadds_epi16 (__m256i __X, __m256i __Y)
298 return (__m256i) __builtin_ia32_phaddsw256 ((__v16hi)__X,
299 (__v16hi)__Y);
302 extern __inline __m256i
303 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
304 _mm256_hsub_epi16 (__m256i __X, __m256i __Y)
306 return (__m256i) __builtin_ia32_phsubw256 ((__v16hi)__X,
307 (__v16hi)__Y);
310 extern __inline __m256i
311 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
312 _mm256_hsub_epi32 (__m256i __X, __m256i __Y)
314 return (__m256i) __builtin_ia32_phsubd256 ((__v8si)__X, (__v8si)__Y);
317 extern __inline __m256i
318 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
319 _mm256_hsubs_epi16 (__m256i __X, __m256i __Y)
321 return (__m256i) __builtin_ia32_phsubsw256 ((__v16hi)__X,
322 (__v16hi)__Y);
325 extern __inline __m256i
326 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
327 _mm256_maddubs_epi16 (__m256i __X, __m256i __Y)
329 return (__m256i) __builtin_ia32_pmaddubsw256 ((__v32qi)__X,
330 (__v32qi)__Y);
333 extern __inline __m256i
334 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
335 _mm256_madd_epi16 (__m256i __A, __m256i __B)
337 return (__m256i)__builtin_ia32_pmaddwd256 ((__v16hi)__A,
338 (__v16hi)__B);
341 extern __inline __m256i
342 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
343 _mm256_max_epi8 (__m256i __A, __m256i __B)
345 return (__m256i)__builtin_ia32_pmaxsb256 ((__v32qi)__A, (__v32qi)__B);
348 extern __inline __m256i
349 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
350 _mm256_max_epi16 (__m256i __A, __m256i __B)
352 return (__m256i)__builtin_ia32_pmaxsw256 ((__v16hi)__A, (__v16hi)__B);
355 extern __inline __m256i
356 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
357 _mm256_max_epi32 (__m256i __A, __m256i __B)
359 return (__m256i)__builtin_ia32_pmaxsd256 ((__v8si)__A, (__v8si)__B);
362 extern __inline __m256i
363 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
364 _mm256_max_epu8 (__m256i __A, __m256i __B)
366 return (__m256i)__builtin_ia32_pmaxub256 ((__v32qi)__A, (__v32qi)__B);
369 extern __inline __m256i
370 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
371 _mm256_max_epu16 (__m256i __A, __m256i __B)
373 return (__m256i)__builtin_ia32_pmaxuw256 ((__v16hi)__A, (__v16hi)__B);
376 extern __inline __m256i
377 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
378 _mm256_max_epu32 (__m256i __A, __m256i __B)
380 return (__m256i)__builtin_ia32_pmaxud256 ((__v8si)__A, (__v8si)__B);
383 extern __inline __m256i
384 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
385 _mm256_min_epi8 (__m256i __A, __m256i __B)
387 return (__m256i)__builtin_ia32_pminsb256 ((__v32qi)__A, (__v32qi)__B);
390 extern __inline __m256i
391 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
392 _mm256_min_epi16 (__m256i __A, __m256i __B)
394 return (__m256i)__builtin_ia32_pminsw256 ((__v16hi)__A, (__v16hi)__B);
397 extern __inline __m256i
398 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
399 _mm256_min_epi32 (__m256i __A, __m256i __B)
401 return (__m256i)__builtin_ia32_pminsd256 ((__v8si)__A, (__v8si)__B);
404 extern __inline __m256i
405 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
406 _mm256_min_epu8 (__m256i __A, __m256i __B)
408 return (__m256i)__builtin_ia32_pminub256 ((__v32qi)__A, (__v32qi)__B);
411 extern __inline __m256i
412 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
413 _mm256_min_epu16 (__m256i __A, __m256i __B)
415 return (__m256i)__builtin_ia32_pminuw256 ((__v16hi)__A, (__v16hi)__B);
418 extern __inline __m256i
419 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
420 _mm256_min_epu32 (__m256i __A, __m256i __B)
422 return (__m256i)__builtin_ia32_pminud256 ((__v8si)__A, (__v8si)__B);
425 extern __inline int
426 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
427 _mm256_movemask_epi8 (__m256i __A)
429 return __builtin_ia32_pmovmskb256 ((__v32qi)__A);
432 extern __inline __m256i
433 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
434 _mm256_cvtepi8_epi16 (__m128i __X)
436 return (__m256i) __builtin_ia32_pmovsxbw256 ((__v16qi)__X);
439 extern __inline __m256i
440 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
441 _mm256_cvtepi8_epi32 (__m128i __X)
443 return (__m256i) __builtin_ia32_pmovsxbd256 ((__v16qi)__X);
446 extern __inline __m256i
447 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
448 _mm256_cvtepi8_epi64 (__m128i __X)
450 return (__m256i) __builtin_ia32_pmovsxbq256 ((__v16qi)__X);
453 extern __inline __m256i
454 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
455 _mm256_cvtepi16_epi32 (__m128i __X)
457 return (__m256i) __builtin_ia32_pmovsxwd256 ((__v8hi)__X);
460 extern __inline __m256i
461 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
462 _mm256_cvtepi16_epi64 (__m128i __X)
464 return (__m256i) __builtin_ia32_pmovsxwq256 ((__v8hi)__X);
467 extern __inline __m256i
468 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
469 _mm256_cvtepi32_epi64 (__m128i __X)
471 return (__m256i) __builtin_ia32_pmovsxdq256 ((__v4si)__X);
474 extern __inline __m256i
475 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
476 _mm256_cvtepu8_epi16 (__m128i __X)
478 return (__m256i) __builtin_ia32_pmovzxbw256 ((__v16qi)__X);
481 extern __inline __m256i
482 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
483 _mm256_cvtepu8_epi32 (__m128i __X)
485 return (__m256i) __builtin_ia32_pmovzxbd256 ((__v16qi)__X);
488 extern __inline __m256i
489 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
490 _mm256_cvtepu8_epi64 (__m128i __X)
492 return (__m256i) __builtin_ia32_pmovzxbq256 ((__v16qi)__X);
495 extern __inline __m256i
496 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
497 _mm256_cvtepu16_epi32 (__m128i __X)
499 return (__m256i) __builtin_ia32_pmovzxwd256 ((__v8hi)__X);
502 extern __inline __m256i
503 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
504 _mm256_cvtepu16_epi64 (__m128i __X)
506 return (__m256i) __builtin_ia32_pmovzxwq256 ((__v8hi)__X);
509 extern __inline __m256i
510 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
511 _mm256_cvtepu32_epi64 (__m128i __X)
513 return (__m256i) __builtin_ia32_pmovzxdq256 ((__v4si)__X);
516 extern __inline __m256i
517 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
518 _mm256_mul_epi32 (__m256i __X, __m256i __Y)
520 return (__m256i) __builtin_ia32_pmuldq256 ((__v8si)__X, (__v8si)__Y);
523 extern __inline __m256i
524 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
525 _mm256_mulhrs_epi16 (__m256i __X, __m256i __Y)
527 return (__m256i) __builtin_ia32_pmulhrsw256 ((__v16hi)__X,
528 (__v16hi)__Y);
531 extern __inline __m256i
532 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
533 _mm256_mulhi_epu16 (__m256i __A, __m256i __B)
535 return (__m256i)__builtin_ia32_pmulhuw256 ((__v16hi)__A, (__v16hi)__B);
538 extern __inline __m256i
539 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
540 _mm256_mulhi_epi16 (__m256i __A, __m256i __B)
542 return (__m256i)__builtin_ia32_pmulhw256 ((__v16hi)__A, (__v16hi)__B);
545 extern __inline __m256i
546 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
547 _mm256_mullo_epi16 (__m256i __A, __m256i __B)
549 return (__m256i)__builtin_ia32_pmullw256 ((__v16hi)__A, (__v16hi)__B);
552 extern __inline __m256i
553 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
554 _mm256_mullo_epi32 (__m256i __A, __m256i __B)
556 return (__m256i)__builtin_ia32_pmulld256 ((__v8si)__A, (__v8si)__B);
559 extern __inline __m256i
560 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
561 _mm256_mul_epu32 (__m256i __A, __m256i __B)
563 return (__m256i)__builtin_ia32_pmuludq256 ((__v8si)__A, (__v8si)__B);
566 extern __inline __m256i
567 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
568 _mm256_or_si256 (__m256i __A, __m256i __B)
570 return (__m256i)__builtin_ia32_por256 ((__v4di)__A, (__v4di)__B);
573 extern __inline __m256i
574 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
575 _mm256_sad_epu8 (__m256i __A, __m256i __B)
577 return (__m256i)__builtin_ia32_psadbw256 ((__v32qi)__A, (__v32qi)__B);
580 extern __inline __m256i
581 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
582 _mm256_shuffle_epi8 (__m256i __X, __m256i __Y)
584 return (__m256i) __builtin_ia32_pshufb256 ((__v32qi)__X,
585 (__v32qi)__Y);
/* Immediate-controlled shuffles (VPSHUFD/VPSHUFHW/VPSHUFLW); macro
   fallbacks keep the mask an immediate when not optimizing.  */
#ifdef __OPTIMIZE__
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_shuffle_epi32 (__m256i __A, const int __mask)
{
  return (__m256i)__builtin_ia32_pshufd256 ((__v8si)__A, __mask);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_shufflehi_epi16 (__m256i __A, const int __mask)
{
  return (__m256i)__builtin_ia32_pshufhw256 ((__v16hi)__A, __mask);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_shufflelo_epi16 (__m256i __A, const int __mask)
{
  return (__m256i)__builtin_ia32_pshuflw256 ((__v16hi)__A, __mask);
}
#else
#define _mm256_shuffle_epi32(A, N) \
  ((__m256i)__builtin_ia32_pshufd256 ((__v8si)(__m256i)(A), (int)(N)))
#define _mm256_shufflehi_epi16(A, N) \
  ((__m256i)__builtin_ia32_pshufhw256 ((__v16hi)(__m256i)(A), (int)(N)))
#define _mm256_shufflelo_epi16(A, N) \
  ((__m256i)__builtin_ia32_pshuflw256 ((__v16hi)(__m256i)(A), (int)(N)))
#endif
618 extern __inline __m256i
619 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
620 _mm256_sign_epi8 (__m256i __X, __m256i __Y)
622 return (__m256i) __builtin_ia32_psignb256 ((__v32qi)__X, (__v32qi)__Y);
625 extern __inline __m256i
626 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
627 _mm256_sign_epi16 (__m256i __X, __m256i __Y)
629 return (__m256i) __builtin_ia32_psignw256 ((__v16hi)__X, (__v16hi)__Y);
632 extern __inline __m256i
633 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
634 _mm256_sign_epi32 (__m256i __X, __m256i __Y)
636 return (__m256i) __builtin_ia32_psignd256 ((__v8si)__X, (__v8si)__Y);
/* Byte-wise left shift of each 128-bit lane (VPSLLDQ); the builtin
   takes a bit count, hence the * 8.  */
#ifdef __OPTIMIZE__
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_slli_si256 (__m256i __A, const int __N)
{
  return (__m256i)__builtin_ia32_pslldqi256 (__A, __N * 8);
}
#else
#define _mm256_slli_si256(A, N) \
  ((__m256i)__builtin_ia32_pslldqi256 ((__m256i)(A), (int)(N) * 8))
#endif
651 extern __inline __m256i
652 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
653 _mm256_slli_epi16 (__m256i __A, int __B)
655 return (__m256i)__builtin_ia32_psllwi256 ((__v16hi)__A, __B);
658 extern __inline __m256i
659 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
660 _mm256_sll_epi16 (__m256i __A, __m128i __B)
662 return (__m256i)__builtin_ia32_psllw256((__v16hi)__A, (__v8hi)__B);
665 extern __inline __m256i
666 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
667 _mm256_slli_epi32 (__m256i __A, int __B)
669 return (__m256i)__builtin_ia32_pslldi256 ((__v8si)__A, __B);
672 extern __inline __m256i
673 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
674 _mm256_sll_epi32 (__m256i __A, __m128i __B)
676 return (__m256i)__builtin_ia32_pslld256((__v8si)__A, (__v4si)__B);
679 extern __inline __m256i
680 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
681 _mm256_slli_epi64 (__m256i __A, int __B)
683 return (__m256i)__builtin_ia32_psllqi256 ((__v4di)__A, __B);
686 extern __inline __m256i
687 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
688 _mm256_sll_epi64 (__m256i __A, __m128i __B)
690 return (__m256i)__builtin_ia32_psllq256((__v4di)__A, (__v2di)__B);
693 extern __inline __m256i
694 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
695 _mm256_srai_epi16 (__m256i __A, int __B)
697 return (__m256i)__builtin_ia32_psrawi256 ((__v16hi)__A, __B);
700 extern __inline __m256i
701 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
702 _mm256_sra_epi16 (__m256i __A, __m128i __B)
704 return (__m256i)__builtin_ia32_psraw256 ((__v16hi)__A, (__v8hi)__B);
707 extern __inline __m256i
708 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
709 _mm256_srai_epi32 (__m256i __A, int __B)
711 return (__m256i)__builtin_ia32_psradi256 ((__v8si)__A, __B);
714 extern __inline __m256i
715 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
716 _mm256_sra_epi32 (__m256i __A, __m128i __B)
718 return (__m256i)__builtin_ia32_psrad256 ((__v8si)__A, (__v4si)__B);
/* Byte-wise right shift of each 128-bit lane (VPSRLDQ); the builtin
   takes a bit count, hence the * 8.  */
#ifdef __OPTIMIZE__
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_srli_si256 (__m256i __A, const int __N)
{
  return (__m256i)__builtin_ia32_psrldqi256 (__A, __N * 8);
}
#else
#define _mm256_srli_si256(A, N) \
  ((__m256i)__builtin_ia32_psrldqi256 ((__m256i)(A), (int)(N) * 8))
#endif
733 extern __inline __m256i
734 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
735 _mm256_srli_epi16 (__m256i __A, int __B)
737 return (__m256i)__builtin_ia32_psrlwi256 ((__v16hi)__A, __B);
740 extern __inline __m256i
741 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
742 _mm256_srl_epi16 (__m256i __A, __m128i __B)
744 return (__m256i)__builtin_ia32_psrlw256((__v16hi)__A, (__v8hi)__B);
747 extern __inline __m256i
748 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
749 _mm256_srli_epi32 (__m256i __A, int __B)
751 return (__m256i)__builtin_ia32_psrldi256 ((__v8si)__A, __B);
754 extern __inline __m256i
755 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
756 _mm256_srl_epi32 (__m256i __A, __m128i __B)
758 return (__m256i)__builtin_ia32_psrld256((__v8si)__A, (__v4si)__B);
761 extern __inline __m256i
762 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
763 _mm256_srli_epi64 (__m256i __A, int __B)
765 return (__m256i)__builtin_ia32_psrlqi256 ((__v4di)__A, __B);
768 extern __inline __m256i
769 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
770 _mm256_srl_epi64 (__m256i __A, __m128i __B)
772 return (__m256i)__builtin_ia32_psrlq256((__v4di)__A, (__v2di)__B);
775 extern __inline __m256i
776 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
777 _mm256_sub_epi8 (__m256i __A, __m256i __B)
779 return (__m256i)__builtin_ia32_psubb256 ((__v32qi)__A, (__v32qi)__B);
782 extern __inline __m256i
783 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
784 _mm256_sub_epi16 (__m256i __A, __m256i __B)
786 return (__m256i)__builtin_ia32_psubw256 ((__v16hi)__A, (__v16hi)__B);
789 extern __inline __m256i
790 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
791 _mm256_sub_epi32 (__m256i __A, __m256i __B)
793 return (__m256i)__builtin_ia32_psubd256 ((__v8si)__A, (__v8si)__B);
796 extern __inline __m256i
797 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
798 _mm256_sub_epi64 (__m256i __A, __m256i __B)
800 return (__m256i)__builtin_ia32_psubq256 ((__v4di)__A, (__v4di)__B);
803 extern __inline __m256i
804 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
805 _mm256_subs_epi8 (__m256i __A, __m256i __B)
807 return (__m256i)__builtin_ia32_psubsb256 ((__v32qi)__A, (__v32qi)__B);
810 extern __inline __m256i
811 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
812 _mm256_subs_epi16 (__m256i __A, __m256i __B)
814 return (__m256i)__builtin_ia32_psubsw256 ((__v16hi)__A, (__v16hi)__B);
817 extern __inline __m256i
818 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
819 _mm256_subs_epu8 (__m256i __A, __m256i __B)
821 return (__m256i)__builtin_ia32_psubusb256 ((__v32qi)__A, (__v32qi)__B);
824 extern __inline __m256i
825 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
826 _mm256_subs_epu16 (__m256i __A, __m256i __B)
828 return (__m256i)__builtin_ia32_psubusw256 ((__v16hi)__A, (__v16hi)__B);
831 extern __inline __m256i
832 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
833 _mm256_unpackhi_epi8 (__m256i __A, __m256i __B)
835 return (__m256i)__builtin_ia32_punpckhbw256 ((__v32qi)__A, (__v32qi)__B);
838 extern __inline __m256i
839 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
840 _mm256_unpackhi_epi16 (__m256i __A, __m256i __B)
842 return (__m256i)__builtin_ia32_punpckhwd256 ((__v16hi)__A, (__v16hi)__B);
845 extern __inline __m256i
846 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
847 _mm256_unpackhi_epi32 (__m256i __A, __m256i __B)
849 return (__m256i)__builtin_ia32_punpckhdq256 ((__v8si)__A, (__v8si)__B);
852 extern __inline __m256i
853 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
854 _mm256_unpackhi_epi64 (__m256i __A, __m256i __B)
856 return (__m256i)__builtin_ia32_punpckhqdq256 ((__v4di)__A, (__v4di)__B);
859 extern __inline __m256i
860 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
861 _mm256_unpacklo_epi8 (__m256i __A, __m256i __B)
863 return (__m256i)__builtin_ia32_punpcklbw256 ((__v32qi)__A, (__v32qi)__B);
866 extern __inline __m256i
867 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
868 _mm256_unpacklo_epi16 (__m256i __A, __m256i __B)
870 return (__m256i)__builtin_ia32_punpcklwd256 ((__v16hi)__A, (__v16hi)__B);
873 extern __inline __m256i
874 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
875 _mm256_unpacklo_epi32 (__m256i __A, __m256i __B)
877 return (__m256i)__builtin_ia32_punpckldq256 ((__v8si)__A, (__v8si)__B);
880 extern __inline __m256i
881 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
882 _mm256_unpacklo_epi64 (__m256i __A, __m256i __B)
884 return (__m256i)__builtin_ia32_punpcklqdq256 ((__v4di)__A, (__v4di)__B);
887 extern __inline __m256i
888 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
889 _mm256_xor_si256 (__m256i __A, __m256i __B)
891 return (__m256i)__builtin_ia32_pxor256 ((__v4di)__A, (__v4di)__B);
894 extern __inline __m256i
895 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
896 _mm256_stream_load_si256 (__m256i const *__X)
898 return (__m256i) __builtin_ia32_movntdqa256 ((__v4di *) __X);
901 extern __inline __m128
902 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
903 _mm_broadcastss_ps (__m128 __X)
905 return (__m128) __builtin_ia32_vbroadcastss_ps ((__v4sf)__X);
908 extern __inline __m256
909 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
910 _mm256_broadcastss_ps (__m128 __X)
912 return (__m256) __builtin_ia32_vbroadcastss_ps256 ((__v4sf)__X);
915 extern __inline __m256d
916 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
917 _mm256_broadcastsd_pd (__m128d __X)
919 return (__m256d) __builtin_ia32_vbroadcastsd_pd256 ((__v2df)__X);
922 extern __inline __m256i
923 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
924 _mm_broadcastsi128_si256 (__m128i __X)
926 return (__m256i) __builtin_ia32_vbroadcastsi256 ((__v2di)__X);
/* Dword blends under an immediate mask (VPBLENDD), 128- and 256-bit
   forms; macro fallbacks keep the mask an immediate.  */
#ifdef __OPTIMIZE__
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_blend_epi32 (__m128i __X, __m128i __Y, const int __M)
{
  return (__m128i) __builtin_ia32_pblendd128 ((__v4si)__X,
					      (__v4si)__Y,
					      __M);
}
#else
#define _mm_blend_epi32(X, Y, M)					\
  ((__m128i) __builtin_ia32_pblendd128 ((__v4si)(__m128i)(X),		\
					(__v4si)(__m128i)(Y), (int)(M)))
#endif

#ifdef __OPTIMIZE__
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_blend_epi32 (__m256i __X, __m256i __Y, const int __M)
{
  return (__m256i) __builtin_ia32_pblendd256 ((__v8si)__X,
					      (__v8si)__Y,
					      __M);
}
#else
#define _mm256_blend_epi32(X, Y, M)					\
  ((__m256i) __builtin_ia32_pblendd256 ((__v8si)(__m256i)(X),		\
					(__v8si)(__m256i)(Y), (int)(M)))
#endif
959 extern __inline __m256i
960 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
961 _mm256_broadcastb_epi8 (__m128i __X)
963 return (__m256i) __builtin_ia32_pbroadcastb256 ((__v16qi)__X);
966 extern __inline __m256i
967 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
968 _mm256_broadcastw_epi16 (__m128i __X)
970 return (__m256i) __builtin_ia32_pbroadcastw256 ((__v8hi)__X);
973 extern __inline __m256i
974 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
975 _mm256_broadcastd_epi32 (__m128i __X)
977 return (__m256i) __builtin_ia32_pbroadcastd256 ((__v4si)__X);
980 extern __inline __m256i
981 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
982 _mm256_broadcastq_epi64 (__m128i __X)
984 return (__m256i) __builtin_ia32_pbroadcastq256 ((__v2di)__X);
987 extern __inline __m128i
988 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
989 _mm_broadcastb_epi8 (__m128i __X)
991 return (__m128i) __builtin_ia32_pbroadcastb128 ((__v16qi)__X);
994 extern __inline __m128i
995 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
996 _mm_broadcastw_epi16 (__m128i __X)
998 return (__m128i) __builtin_ia32_pbroadcastw128 ((__v8hi)__X);
1001 extern __inline __m128i
1002 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1003 _mm_broadcastd_epi32 (__m128i __X)
1005 return (__m128i) __builtin_ia32_pbroadcastd128 ((__v4si)__X);
1008 extern __inline __m128i
1009 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1010 _mm_broadcastq_epi64 (__m128i __X)
1012 return (__m128i) __builtin_ia32_pbroadcastq128 ((__v2di)__X);
1015 extern __inline __m256i
1016 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1017 _mm256_permutevar8x32_epi32 (__m256i __X, __m256i __Y)
1019 return (__m256i) __builtin_ia32_permvarsi256 ((__v8si)__X, (__v8si)__Y);
1022 #ifdef __OPTIMIZE__
1023 extern __inline __m256d
1024 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1025 _mm256_permute4x64_pd (__m256d __X, const int __M)
1027 return (__m256d) __builtin_ia32_permdf256 ((__v4df)__X, __M);
1029 #else
1030 #define _mm256_permute4x64_pd(X, M) \
1031 ((__m256d) __builtin_ia32_permdf256 ((__v4df)(__m256d)(X), (int)(M)))
1032 #endif
1034 extern __inline __m256
1035 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1036 _mm256_permutevar8x32_ps (__m256 __X, __m256i __Y)
1038 return (__m256) __builtin_ia32_permvarsf256 ((__v8sf)__X, (__v8si)__Y);
1041 #ifdef __OPTIMIZE__
1042 extern __inline __m256i
1043 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1044 _mm256_permute4x64_epi64 (__m256i __X, const int __M)
1046 return (__m256i) __builtin_ia32_permdi256 ((__v4di)__X, __M);
1048 #else
1049 #define _mm256_permute4x64_epi64(X, M) \
1050 ((__m256i) __builtin_ia32_permdi256 ((__v4di)(__m256i)(X), (int)(M)))
1051 #endif
#ifdef __OPTIMIZE__
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute2x128_si256 (__m256i __X, __m256i __Y, const int __M)
{
  /* Combine 128-bit halves of __X/__Y as selected by immediate __M.  */
  return (__m256i) __builtin_ia32_permti256 ((__v4di)__X, (__v4di)__Y, __M);
}
#else
#define _mm256_permute2x128_si256(X, Y, M)				\
  ((__m256i) __builtin_ia32_permti256 ((__v4di)(__m256i)(X),		\
				       (__v4di)(__m256i)(Y), (int)(M)))
#endif

#ifdef __OPTIMIZE__
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extracti128_si256 (__m256i __X, const int __M)
{
  /* Extract the 128-bit half of __X selected by immediate __M.  */
  return (__m128i) __builtin_ia32_extract128i256 ((__v4di)__X, __M);
}
#else
#define _mm256_extracti128_si256(X, M) \
  ((__m128i) __builtin_ia32_extract128i256 ((__v4di)(__m256i)(X), (int)(M)))
#endif

#ifdef __OPTIMIZE__
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_inserti128_si256 (__m256i __X, __m128i __Y, const int __M)
{
  /* Insert __Y into the 128-bit half of __X selected by __M.  */
  return (__m256i) __builtin_ia32_insert128i256 ((__v4di)__X,
						 (__v2di)__Y, __M);
}
#else
#define _mm256_inserti128_si256(X, Y, M) \
  ((__m256i) __builtin_ia32_insert128i256 ((__v4di)(__m256i)(X), \
					   (__v2di)(__m128i)(Y), \
					   (int)(M)))
#endif
1092 extern __inline __m256i
1093 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1094 _mm256_maskload_epi32 (int const *__X, __m256i __M )
1096 return (__m256i) __builtin_ia32_maskloadd256 ((const __v8si *)__X,
1097 (__v8si)__M);
1100 extern __inline __m256i
1101 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1102 _mm256_maskload_epi64 (long long const *__X, __m256i __M )
1104 return (__m256i) __builtin_ia32_maskloadq256 ((const __v4di *)__X,
1105 (__v4di)__M);
1108 extern __inline __m128i
1109 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1110 _mm_maskload_epi32 (int const *__X, __m128i __M )
1112 return (__m128i) __builtin_ia32_maskloadd ((const __v4si *)__X,
1113 (__v4si)__M);
1116 extern __inline __m128i
1117 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1118 _mm_maskload_epi64 (long long const *__X, __m128i __M )
1120 return (__m128i) __builtin_ia32_maskloadq ((const __v2di *)__X,
1121 (__v2di)__M);
1124 extern __inline void
1125 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1126 _mm256_maskstore_epi32 (int *__X, __m256i __M, __m256i __Y )
1128 __builtin_ia32_maskstored256 ((__v8si *)__X, (__v8si)__M, (__v8si)__Y);
1131 extern __inline void
1132 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1133 _mm256_maskstore_epi64 (long long *__X, __m256i __M, __m256i __Y )
1135 __builtin_ia32_maskstoreq256 ((__v4di *)__X, (__v4di)__M, (__v4di)__Y);
1138 extern __inline void
1139 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1140 _mm_maskstore_epi32 (int *__X, __m128i __M, __m128i __Y )
1142 __builtin_ia32_maskstored ((__v4si *)__X, (__v4si)__M, (__v4si)__Y);
1145 extern __inline void
1146 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1147 _mm_maskstore_epi64 (long long *__X, __m128i __M, __m128i __Y )
1149 __builtin_ia32_maskstoreq (( __v2di *)__X, (__v2di)__M, (__v2di)__Y);
1152 extern __inline __m256i
1153 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1154 _mm256_sllv_epi32 (__m256i __X, __m256i __Y)
1156 return (__m256i) __builtin_ia32_psllv8si ((__v8si)__X, (__v8si)__Y);
1159 extern __inline __m128i
1160 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1161 _mm_sllv_epi32 (__m128i __X, __m128i __Y)
1163 return (__m128i) __builtin_ia32_psllv4si ((__v4si)__X, (__v4si)__Y);
1166 extern __inline __m256i
1167 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1168 _mm256_sllv_epi64 (__m256i __X, __m256i __Y)
1170 return (__m256i) __builtin_ia32_psllv4di ((__v4di)__X, (__v4di)__Y);
1173 extern __inline __m128i
1174 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1175 _mm_sllv_epi64 (__m128i __X, __m128i __Y)
1177 return (__m128i) __builtin_ia32_psllv2di ((__v2di)__X, (__v2di)__Y);
1180 extern __inline __m256i
1181 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1182 _mm256_srav_epi32 (__m256i __X, __m256i __Y)
1184 return (__m256i) __builtin_ia32_psrav8si ((__v8si)__X, (__v8si)__Y);
1187 extern __inline __m128i
1188 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1189 _mm_srav_epi32 (__m128i __X, __m128i __Y)
1191 return (__m128i) __builtin_ia32_psrav4si ((__v4si)__X, (__v4si)__Y);
1194 extern __inline __m256i
1195 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1196 _mm256_srlv_epi32 (__m256i __X, __m256i __Y)
1198 return (__m256i) __builtin_ia32_psrlv8si ((__v8si)__X, (__v8si)__Y);
1201 extern __inline __m128i
1202 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1203 _mm_srlv_epi32 (__m128i __X, __m128i __Y)
1205 return (__m128i) __builtin_ia32_psrlv4si ((__v4si)__X, (__v4si)__Y);
1208 extern __inline __m256i
1209 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1210 _mm256_srlv_epi64 (__m256i __X, __m256i __Y)
1212 return (__m256i) __builtin_ia32_psrlv4di ((__v4di)__X, (__v4di)__Y);
1215 extern __inline __m128i
1216 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1217 _mm_srlv_epi64 (__m128i __X, __m128i __Y)
1219 return (__m128i) __builtin_ia32_psrlv2di ((__v2di)__X, (__v2di)__Y);
1222 #ifdef __OPTIMIZE__
1223 extern __inline __m128d
1224 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1225 _mm_i32gather_pd (double const *base, __m128i index, const int scale)
1227 __v2df src = _mm_setzero_pd ();
1228 __v2df mask = _mm_cmpeq_pd (src, src);
1230 return (__m128d) __builtin_ia32_gathersiv2df (src,
1231 base,
1232 (__v4si)index,
1233 mask,
1234 scale);
1237 extern __inline __m128d
1238 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1239 _mm_mask_i32gather_pd (__m128d src, double const *base, __m128i index,
1240 __m128d mask, const int scale)
1242 return (__m128d) __builtin_ia32_gathersiv2df ((__v2df)src,
1243 base,
1244 (__v4si)index,
1245 (__v2df)mask,
1246 scale);
1249 extern __inline __m256d
1250 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1251 _mm256_i32gather_pd (double const *base, __m128i index, const int scale)
1253 __v4df src = _mm256_setzero_pd ();
1254 __v4df mask = _mm256_cmp_pd (src, src, _CMP_EQ_OQ);
1256 return (__m256d) __builtin_ia32_gathersiv4df (src,
1257 base,
1258 (__v4si)index,
1259 mask,
1260 scale);
1263 extern __inline __m256d
1264 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1265 _mm256_mask_i32gather_pd (__m256d src, double const *base,
1266 __m128i index, __m256d mask, const int scale)
1268 return (__m256d) __builtin_ia32_gathersiv4df ((__v4df)src,
1269 base,
1270 (__v4si)index,
1271 (__v4df)mask,
1272 scale);
1275 extern __inline __m128d
1276 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1277 _mm_i64gather_pd (double const *base, __m128i index, const int scale)
1279 __v2df src = _mm_setzero_pd ();
1280 __v2df mask = _mm_cmpeq_pd (src, src);
1282 return (__m128d) __builtin_ia32_gatherdiv2df (src,
1283 base,
1284 (__v2di)index,
1285 mask,
1286 scale);
1289 extern __inline __m128d
1290 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1291 _mm_mask_i64gather_pd (__m128d src, double const *base, __m128i index,
1292 __m128d mask, const int scale)
1294 return (__m128d) __builtin_ia32_gatherdiv2df ((__v2df)src,
1295 base,
1296 (__v2di)index,
1297 (__v2df)mask,
1298 scale);
1301 extern __inline __m256d
1302 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1303 _mm256_i64gather_pd (double const *base, __m256i index, const int scale)
1305 __v4df src = _mm256_setzero_pd ();
1306 __v4df mask = _mm256_cmp_pd (src, src, _CMP_EQ_OQ);
1308 return (__m256d) __builtin_ia32_gatherdiv4df (src,
1309 base,
1310 (__v4di)index,
1311 mask,
1312 scale);
1315 extern __inline __m256d
1316 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1317 _mm256_mask_i64gather_pd (__m256d src, double const *base,
1318 __m256i index, __m256d mask, const int scale)
1320 return (__m256d) __builtin_ia32_gatherdiv4df ((__v4df)src,
1321 base,
1322 (__v4di)index,
1323 (__v4df)mask,
1324 scale);
1327 extern __inline __m128
1328 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1329 _mm_i32gather_ps (float const *base, __m128i index, const int scale)
1331 __v4sf src = _mm_setzero_ps ();
1332 __v4sf mask = _mm_cmpeq_ps (src, src);
1334 return (__m128) __builtin_ia32_gathersiv4sf (src,
1335 base,
1336 (__v4si)index,
1337 mask,
1338 scale);
1341 extern __inline __m128
1342 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1343 _mm_mask_i32gather_ps (__m128 src, float const *base, __m128i index,
1344 __m128 mask, const int scale)
1346 return (__m128) __builtin_ia32_gathersiv4sf ((__v4sf)src,
1347 base,
1348 (__v4si)index,
1349 (__v4sf)mask,
1350 scale);
1353 extern __inline __m256
1354 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1355 _mm256_i32gather_ps (float const *base, __m256i index, const int scale)
1357 __v8sf src = _mm256_setzero_ps ();
1358 __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ);
1360 return (__m256) __builtin_ia32_gathersiv8sf (src,
1361 base,
1362 (__v8si)index,
1363 mask,
1364 scale);
1367 extern __inline __m256
1368 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1369 _mm256_mask_i32gather_ps (__m256 src, float const *base,
1370 __m256i index, __m256 mask, const int scale)
1372 return (__m256) __builtin_ia32_gathersiv8sf ((__v8sf)src,
1373 base,
1374 (__v8si)index,
1375 (__v8sf)mask,
1376 scale);
1379 extern __inline __m128
1380 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1381 _mm_i64gather_ps (float const *base, __m128i index, const int scale)
1383 __v4sf src = _mm_setzero_ps ();
1384 __v4sf mask = _mm_cmpeq_ps (src, src);
1386 return (__m128) __builtin_ia32_gatherdiv4sf (src,
1387 base,
1388 (__v2di)index,
1389 mask,
1390 scale);
1393 extern __inline __m128
1394 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1395 _mm_mask_i64gather_ps (__m128 src, float const *base, __m128i index,
1396 __m128 mask, const int scale)
1398 return (__m128) __builtin_ia32_gatherdiv4sf ((__v4sf)src,
1399 base,
1400 (__v2di)index,
1401 (__v4sf)mask,
1402 scale);
1405 extern __inline __m128
1406 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1407 _mm256_i64gather_ps (float const *base, __m256i index, const int scale)
1409 __v4sf src = _mm_setzero_ps ();
1410 __v4sf mask = _mm_cmpeq_ps (src, src);
1412 return (__m128) __builtin_ia32_gatherdiv4sf256 (src,
1413 base,
1414 (__v4di)index,
1415 mask,
1416 scale);
1419 extern __inline __m128
1420 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1421 _mm256_mask_i64gather_ps (__m128 src, float const *base,
1422 __m256i index, __m128 mask, const int scale)
1424 return (__m128) __builtin_ia32_gatherdiv4sf256 ((__v4sf)src,
1425 base,
1426 (__v4di)index,
1427 (__v4sf)mask,
1428 scale);
1431 extern __inline __m128i
1432 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1433 _mm_i32gather_epi64 (long long int const *base,
1434 __m128i index, const int scale)
1436 __v2di src = __extension__ (__v2di){ 0, 0 };
1437 __v2di mask = __extension__ (__v2di){ ~0, ~0 };
1439 return (__m128i) __builtin_ia32_gathersiv2di (src,
1440 base,
1441 (__v4si)index,
1442 mask,
1443 scale);
1446 extern __inline __m128i
1447 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1448 _mm_mask_i32gather_epi64 (__m128i src, long long int const *base,
1449 __m128i index, __m128i mask, const int scale)
1451 return (__m128i) __builtin_ia32_gathersiv2di ((__v2di)src,
1452 base,
1453 (__v4si)index,
1454 (__v2di)mask,
1455 scale);
1458 extern __inline __m256i
1459 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1460 _mm256_i32gather_epi64 (long long int const *base,
1461 __m128i index, const int scale)
1463 __v4di src = __extension__ (__v4di){ 0, 0, 0, 0 };
1464 __v4di mask = __extension__ (__v4di){ ~0, ~0, ~0, ~0 };
1466 return (__m256i) __builtin_ia32_gathersiv4di (src,
1467 base,
1468 (__v4si)index,
1469 mask,
1470 scale);
1473 extern __inline __m256i
1474 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1475 _mm256_mask_i32gather_epi64 (__m256i src, long long int const *base,
1476 __m128i index, __m256i mask, const int scale)
1478 return (__m256i) __builtin_ia32_gathersiv4di ((__v4di)src,
1479 base,
1480 (__v4si)index,
1481 (__v4di)mask,
1482 scale);
1485 extern __inline __m128i
1486 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1487 _mm_i64gather_epi64 (long long int const *base,
1488 __m128i index, const int scale)
1490 __v2di src = __extension__ (__v2di){ 0, 0 };
1491 __v2di mask = __extension__ (__v2di){ ~0, ~0 };
1493 return (__m128i) __builtin_ia32_gatherdiv2di (src,
1494 base,
1495 (__v2di)index,
1496 mask,
1497 scale);
1500 extern __inline __m128i
1501 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1502 _mm_mask_i64gather_epi64 (__m128i src, long long int const *base, __m128i index,
1503 __m128i mask, const int scale)
1505 return (__m128i) __builtin_ia32_gatherdiv2di ((__v2di)src,
1506 base,
1507 (__v2di)index,
1508 (__v2di)mask,
1509 scale);
1512 extern __inline __m256i
1513 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1514 _mm256_i64gather_epi64 (long long int const *base,
1515 __m256i index, const int scale)
1517 __v4di src = __extension__ (__v4di){ 0, 0, 0, 0 };
1518 __v4di mask = __extension__ (__v4di){ ~0, ~0, ~0, ~0 };
1520 return (__m256i) __builtin_ia32_gatherdiv4di (src,
1521 base,
1522 (__v4di)index,
1523 mask,
1524 scale);
1527 extern __inline __m256i
1528 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1529 _mm256_mask_i64gather_epi64 (__m256i src, long long int const *base,
1530 __m256i index, __m256i mask, const int scale)
1532 return (__m256i) __builtin_ia32_gatherdiv4di ((__v4di)src,
1533 base,
1534 (__v4di)index,
1535 (__v4di)mask,
1536 scale);
1539 extern __inline __m128i
1540 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1541 _mm_i32gather_epi32 (int const *base, __m128i index, const int scale)
1543 __v4si src = __extension__ (__v4si){ 0, 0, 0, 0 };
1544 __v4si mask = __extension__ (__v4si){ ~0, ~0, ~0, ~0 };
1546 return (__m128i) __builtin_ia32_gathersiv4si (src,
1547 base,
1548 (__v4si)index,
1549 mask,
1550 scale);
1553 extern __inline __m128i
1554 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1555 _mm_mask_i32gather_epi32 (__m128i src, int const *base, __m128i index,
1556 __m128i mask, const int scale)
1558 return (__m128i) __builtin_ia32_gathersiv4si ((__v4si)src,
1559 base,
1560 (__v4si)index,
1561 (__v4si)mask,
1562 scale);
1565 extern __inline __m256i
1566 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1567 _mm256_i32gather_epi32 (int const *base, __m256i index, const int scale)
1569 __v8si src = __extension__ (__v8si){ 0, 0, 0, 0, 0, 0, 0, 0 };
1570 __v8si mask = __extension__ (__v8si){ ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0 };
1572 return (__m256i) __builtin_ia32_gathersiv8si (src,
1573 base,
1574 (__v8si)index,
1575 mask,
1576 scale);
1579 extern __inline __m256i
1580 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1581 _mm256_mask_i32gather_epi32 (__m256i src, int const *base,
1582 __m256i index, __m256i mask, const int scale)
1584 return (__m256i) __builtin_ia32_gathersiv8si ((__v8si)src,
1585 base,
1586 (__v8si)index,
1587 (__v8si)mask,
1588 scale);
1591 extern __inline __m128i
1592 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1593 _mm_i64gather_epi32 (int const *base, __m128i index, const int scale)
1595 __v4si src = __extension__ (__v4si){ 0, 0, 0, 0 };
1596 __v4si mask = __extension__ (__v4si){ ~0, ~0, ~0, ~0 };
1598 return (__m128i) __builtin_ia32_gatherdiv4si (src,
1599 base,
1600 (__v2di)index,
1601 mask,
1602 scale);
1605 extern __inline __m128i
1606 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1607 _mm_mask_i64gather_epi32 (__m128i src, int const *base, __m128i index,
1608 __m128i mask, const int scale)
1610 return (__m128i) __builtin_ia32_gatherdiv4si ((__v4si)src,
1611 base,
1612 (__v2di)index,
1613 (__v4si)mask,
1614 scale);
1617 extern __inline __m128i
1618 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1619 _mm256_i64gather_epi32 (int const *base, __m256i index, const int scale)
1621 __v4si src = __extension__ (__v4si){ 0, 0, 0, 0 };
1622 __v4si mask = __extension__ (__v4si){ ~0, ~0, ~0, ~0 };
1624 return (__m128i) __builtin_ia32_gatherdiv4si256 (src,
1625 base,
1626 (__v4di)index,
1627 mask,
1628 scale);
1631 extern __inline __m128i
1632 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1633 _mm256_mask_i64gather_epi32 (__m128i src, int const *base,
1634 __m256i index, __m128i mask, const int scale)
1636 return (__m128i) __builtin_ia32_gatherdiv4si256 ((__v4si)src,
1637 base,
1638 (__v4di)index,
1639 (__v4si)mask,
1640 scale);
1642 #else /* __OPTIMIZE__ */
1643 #define _mm_i32gather_pd(BASE, INDEX, SCALE) \
1644 (__m128d) __builtin_ia32_gathersiv2df ((__v2df) _mm_setzero_pd (), \
1645 (double const *)BASE, \
1646 (__v4si)(__m128i)INDEX, \
1647 (__v2df)_mm_set1_pd( \
1648 (double)(long long int) -1), \
1649 (int)SCALE)
1651 #define _mm_mask_i32gather_pd(SRC, BASE, INDEX, MASK, SCALE) \
1652 (__m128d) __builtin_ia32_gathersiv2df ((__v2df)(__m128d)SRC, \
1653 (double const *)BASE, \
1654 (__v4si)(__m128i)INDEX, \
1655 (__v2df)(__m128d)MASK, \
1656 (int)SCALE)
1658 #define _mm256_i32gather_pd(BASE, INDEX, SCALE) \
1659 (__m256d) __builtin_ia32_gathersiv4df ((__v4df) _mm256_setzero_pd (), \
1660 (double const *)BASE, \
1661 (__v4si)(__m128i)INDEX, \
1662 (__v4df)_mm256_set1_pd( \
1663 (double)(long long int) -1), \
1664 (int)SCALE)
1666 #define _mm256_mask_i32gather_pd(SRC, BASE, INDEX, MASK, SCALE) \
1667 (__m256d) __builtin_ia32_gathersiv4df ((__v4df)(__m256d)SRC, \
1668 (double const *)BASE, \
1669 (__v4si)(__m128i)INDEX, \
1670 (__v4df)(__m256d)MASK, \
1671 (int)SCALE)
1673 #define _mm_i64gather_pd(BASE, INDEX, SCALE) \
1674 (__m128d) __builtin_ia32_gatherdiv2df ((__v2df) _mm_setzero_pd (), \
1675 (double const *)BASE, \
1676 (__v2di)(__m128i)INDEX, \
1677 (__v2df)_mm_set1_pd( \
1678 (double)(long long int) -1), \
1679 (int)SCALE)
1681 #define _mm_mask_i64gather_pd(SRC, BASE, INDEX, MASK, SCALE) \
1682 (__m128d) __builtin_ia32_gatherdiv2df ((__v2df)(__m128d)SRC, \
1683 (double const *)BASE, \
1684 (__v2di)(__m128i)INDEX, \
1685 (__v2df)(__m128d)MASK, \
1686 (int)SCALE)
1688 #define _mm256_i64gather_pd(BASE, INDEX, SCALE) \
1689 (__m256d) __builtin_ia32_gatherdiv4df ((__v4df) _mm256_setzero_pd (), \
1690 (double const *)BASE, \
1691 (__v4di)(__m256i)INDEX, \
1692 (__v4df)_mm256_set1_pd( \
1693 (double)(long long int) -1), \
1694 (int)SCALE)
1696 #define _mm256_mask_i64gather_pd(SRC, BASE, INDEX, MASK, SCALE) \
1697 (__m256d) __builtin_ia32_gatherdiv4df ((__v4df)(__m256d)SRC, \
1698 (double const *)BASE, \
1699 (__v4di)(__m256i)INDEX, \
1700 (__v4df)(__m256d)MASK, \
1701 (int)SCALE)
/* Macro forms of the single-precision gathers.  SRC and MASK carry
   float elements, so they must be converted through the float vector
   types __m128/__m256 — not the double types __m128d/__m256d — before
   the element-level cast to __v4sf/__v8sf, and the default source must
   be a float zero (_mm_setzero_ps, not _mm_setzero_pd).  The original
   double-typed casts happened to preserve bits but were type-incorrect.  */
#define _mm_i32gather_ps(BASE, INDEX, SCALE)				\
  (__m128) __builtin_ia32_gathersiv4sf ((__v4sf) _mm_setzero_ps (),	\
					(float const *)BASE,		\
					(__v4si)(__m128i)INDEX,		\
					_mm_set1_ps ((float)(int) -1),	\
					(int)SCALE)

#define _mm_mask_i32gather_ps(SRC, BASE, INDEX, MASK, SCALE)	\
  (__m128) __builtin_ia32_gathersiv4sf ((__v4sf)(__m128)SRC,	\
					(float const *)BASE,	\
					(__v4si)(__m128i)INDEX,	\
					(__v4sf)(__m128)MASK,	\
					(int)SCALE)

#define _mm256_i32gather_ps(BASE, INDEX, SCALE)				   \
  (__m256) __builtin_ia32_gathersiv8sf ((__v8sf) _mm256_setzero_ps (),	   \
					(float const *)BASE,		   \
					(__v8si)(__m256i)INDEX,		   \
					(__v8sf)_mm256_set1_ps (	   \
					  (float)(int) -1),		   \
					(int)SCALE)

#define _mm256_mask_i32gather_ps(SRC, BASE, INDEX, MASK, SCALE)	\
  (__m256) __builtin_ia32_gathersiv8sf ((__v8sf)(__m256)SRC,	\
					(float const *)BASE,	\
					(__v8si)(__m256i)INDEX,	\
					(__v8sf)(__m256)MASK,	\
					(int)SCALE)

#define _mm_i64gather_ps(BASE, INDEX, SCALE)				\
  (__m128) __builtin_ia32_gatherdiv4sf ((__v4sf) _mm_setzero_ps (),	\
					(float const *)BASE,		\
					(__v2di)(__m128i)INDEX,		\
					(__v4sf)_mm_set1_ps (		\
					  (float)(int) -1),		\
					(int)SCALE)

#define _mm_mask_i64gather_ps(SRC, BASE, INDEX, MASK, SCALE)	\
  (__m128) __builtin_ia32_gatherdiv4sf ((__v4sf)(__m128)SRC,	\
					(float const *)BASE,	\
					(__v2di)(__m128i)INDEX,	\
					(__v4sf)(__m128)MASK,	\
					(int)SCALE)

#define _mm256_i64gather_ps(BASE, INDEX, SCALE)				   \
  (__m128) __builtin_ia32_gatherdiv4sf256 ((__v4sf) _mm_setzero_ps (),	   \
					   (float const *)BASE,		   \
					   (__v4di)(__m256i)INDEX,	   \
					   (__v4sf)_mm_set1_ps(		   \
					     (float)(int) -1),		   \
					   (int)SCALE)

#define _mm256_mask_i64gather_ps(SRC, BASE, INDEX, MASK, SCALE)	   \
  (__m128) __builtin_ia32_gatherdiv4sf256 ((__v4sf)(__m128)SRC,	   \
					   (float const *)BASE,	   \
					   (__v4di)(__m256i)INDEX, \
					   (__v4sf)(__m128)MASK,   \
					   (int)SCALE)
1762 #define _mm_i32gather_epi64(BASE, INDEX, SCALE) \
1763 (__m128i) __builtin_ia32_gathersiv2di ((__v2di) _mm_setzero_si128 (), \
1764 (long long const *)BASE, \
1765 (__v4si)(__m128i)INDEX, \
1766 (__v2di)_mm_set1_epi64x (-1), \
1767 (int)SCALE)
1769 #define _mm_mask_i32gather_epi64(SRC, BASE, INDEX, MASK, SCALE) \
1770 (__m128i) __builtin_ia32_gathersiv2di ((__v2di)(__m128i)SRC, \
1771 (long long const *)BASE, \
1772 (__v4si)(__m128i)INDEX, \
1773 (__v2di)(__m128i)MASK, \
1774 (int)SCALE)
1776 #define _mm256_i32gather_epi64(BASE, INDEX, SCALE) \
1777 (__m256i) __builtin_ia32_gathersiv4di ((__v4di) _mm256_setzero_si256 (), \
1778 (long long const *)BASE, \
1779 (__v4si)(__m128i)INDEX, \
1780 (__v4di)_mm256_set1_epi64x (-1), \
1781 (int)SCALE)
1783 #define _mm256_mask_i32gather_epi64(SRC, BASE, INDEX, MASK, SCALE) \
1784 (__m256i) __builtin_ia32_gathersiv4di ((__v4di)(__m256i)SRC, \
1785 (long long const *)BASE, \
1786 (__v4si)(__m128i)INDEX, \
1787 (__v4di)(__m256i)MASK, \
1788 (int)SCALE)
1790 #define _mm_i64gather_epi64(BASE, INDEX, SCALE) \
1791 (__m128i) __builtin_ia32_gatherdiv2di ((__v2di) _mm_setzero_si128 (), \
1792 (long long const *)BASE, \
1793 (__v2di)(__m128i)INDEX, \
1794 (__v2di)_mm_set1_epi64x (-1), \
1795 (int)SCALE)
1797 #define _mm_mask_i64gather_epi64(SRC, BASE, INDEX, MASK, SCALE) \
1798 (__m128i) __builtin_ia32_gatherdiv2di ((__v2di)(__m128i)SRC, \
1799 (long long const *)BASE, \
1800 (__v2di)(__m128i)INDEX, \
1801 (__v2di)(__m128i)MASK, \
1802 (int)SCALE)
1804 #define _mm256_i64gather_epi64(BASE, INDEX, SCALE) \
1805 (__m256i) __builtin_ia32_gatherdiv4di ((__v4di) _mm256_setzero_si256 (), \
1806 (long long const *)BASE, \
1807 (__v4di)(__m256i)INDEX, \
1808 (__v4di)_mm256_set1_epi64x (-1), \
1809 (int)SCALE)
1811 #define _mm256_mask_i64gather_epi64(SRC, BASE, INDEX, MASK, SCALE) \
1812 (__m256i) __builtin_ia32_gatherdiv4di ((__v4di)(__m256i)SRC, \
1813 (long long const *)BASE, \
1814 (__v4di)(__m256i)INDEX, \
1815 (__v4di)(__m256i)MASK, \
1816 (int)SCALE)
1818 #define _mm_i32gather_epi32(BASE, INDEX, SCALE) \
1819 (__m128i) __builtin_ia32_gathersiv4si ((__v4si) _mm_setzero_si128 (), \
1820 (int const *)BASE, \
1821 (__v4si)(__m128i)INDEX, \
1822 (__v4si)_mm_set1_epi32 (-1), \
1823 (int)SCALE)
1825 #define _mm_mask_i32gather_epi32(SRC, BASE, INDEX, MASK, SCALE) \
1826 (__m128i) __builtin_ia32_gathersiv4si ((__v4si)(__m128i)SRC, \
1827 (int const *)BASE, \
1828 (__v4si)(__m128i)INDEX, \
1829 (__v4si)(__m128i)MASK, \
1830 (int)SCALE)
1832 #define _mm256_i32gather_epi32(BASE, INDEX, SCALE) \
1833 (__m256i) __builtin_ia32_gathersiv8si ((__v8si) _mm256_setzero_si256 (), \
1834 (int const *)BASE, \
1835 (__v8si)(__m256i)INDEX, \
1836 (__v8si)_mm256_set1_epi32 (-1), \
1837 (int)SCALE)
1839 #define _mm256_mask_i32gather_epi32(SRC, BASE, INDEX, MASK, SCALE) \
1840 (__m256i) __builtin_ia32_gathersiv8si ((__v8si)(__m256i)SRC, \
1841 (int const *)BASE, \
1842 (__v8si)(__m256i)INDEX, \
1843 (__v8si)(__m256i)MASK, \
1844 (int)SCALE)
1846 #define _mm_i64gather_epi32(BASE, INDEX, SCALE) \
1847 (__m128i) __builtin_ia32_gatherdiv4si ((__v4si) _mm_setzero_si128 (), \
1848 (int const *)BASE, \
1849 (__v2di)(__m128i)INDEX, \
1850 (__v4si)_mm_set1_epi32 (-1), \
1851 (int)SCALE)
1853 #define _mm_mask_i64gather_epi32(SRC, BASE, INDEX, MASK, SCALE) \
1854 (__m128i) __builtin_ia32_gatherdiv4si ((__v4si)(__m128i)SRC, \
1855 (int const *)BASE, \
1856 (__v2di)(__m128i)INDEX, \
1857 (__v4si)(__m128i)MASK, \
1858 (int)SCALE)
1860 #define _mm256_i64gather_epi32(BASE, INDEX, SCALE) \
1861 (__m128i) __builtin_ia32_gatherdiv4si256 ((__v4si) _mm_setzero_si128 (), \
1862 (int const *)BASE, \
1863 (__v4di)(__m256i)INDEX, \
1864 (__v4si)_mm_set1_epi32(-1), \
1865 (int)SCALE)
1867 #define _mm256_mask_i64gather_epi32(SRC, BASE, INDEX, MASK, SCALE) \
1868 (__m128i) __builtin_ia32_gatherdiv4si256 ((__v4si)(__m128i)SRC, \
1869 (int const *)BASE, \
1870 (__v4di)(__m256i)INDEX, \
1871 (__v4si)(__m128i)MASK, \
1872 (int)SCALE)
1873 #endif /* __OPTIMIZE__ */