/* Copyright (C) 2011-2024 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */
#ifndef _IMMINTRIN_H_INCLUDED
# error "Never use <avx2intrin.h> directly; include <immintrin.h> instead."
#endif
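/* Illustrative note, not part of the original header: user code is
   expected to pull these intrinsics in through the umbrella header
   with AVX2 enabled, e.g.

     // compile with: gcc -mavx2 example.c
     #include <immintrin.h>

   rather than including <avx2intrin.h> directly.  */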
#ifndef _AVX2INTRIN_H_INCLUDED
#define _AVX2INTRIN_H_INCLUDED

#ifndef __AVX2__
#pragma GCC push_options
#pragma GCC target("avx2")
#define __DISABLE_AVX2__
#endif /* __AVX2__ */
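/* Illustrative note, not part of the original header: when __AVX2__ is
   not enabled globally, the pragmas above turn it on just for this
   header, and __DISABLE_AVX2__ records that a matching
   "#pragma GCC pop_options" later in the header must restore the
   previous target.  Code compiled without -mavx2 can still use these
   intrinsics from a function carrying the target attribute, e.g.
   (assumed example):

     __attribute__ ((target ("avx2")))
     __m256i add_epi32_avx2 (__m256i a, __m256i b)
     {
       return _mm256_add_epi32 (a, b);
     }
*/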
/* Sum absolute 8-bit integer difference of adjacent groups of 4
   byte integers in the first 2 operands.  Starting offsets within
   operands are determined by the 3rd mask operand.  */
#ifdef __OPTIMIZE__
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mpsadbw_epu8 (__m256i __X, __m256i __Y, const int __M)
{
  return (__m256i) __builtin_ia32_mpsadbw256 ((__v32qi)__X,
                                              (__v32qi)__Y, __M);
}
#else
#define _mm256_mpsadbw_epu8(X, Y, M) \
  ((__m256i) __builtin_ia32_mpsadbw256 ((__v32qi)(__m256i)(X), \
                                        (__v32qi)(__m256i)(Y), (int)(M)))
#endif
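/* Illustrative example, an assumed usage sketch rather than part of
   the original header: with a mask of 0, each 16-bit result element
   is the sum of absolute differences between a sliding 4-byte group
   of the first operand and the first 4 bytes of the same 128-bit
   lane of the second operand:

     __m256i x = _mm256_loadu_si256 ((__m256i const *) src);
     __m256i y = _mm256_loadu_si256 ((__m256i const *) ref);
     __m256i sad = _mm256_mpsadbw_epu8 (x, y, 0);

   src and ref are assumed to point to at least 32 readable bytes.  */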
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_abs_epi8 (__m256i __A)
{
  return (__m256i)__builtin_ia32_pabsb256 ((__v32qi)__A);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_abs_epi16 (__m256i __A)
{
  return (__m256i)__builtin_ia32_pabsw256 ((__v16hi)__A);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_abs_epi32 (__m256i __A)
{
  return (__m256i)__builtin_ia32_pabsd256 ((__v8si)__A);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_packs_epi32 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_packssdw256 ((__v8si)__A, (__v8si)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_packs_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_packsswb256 ((__v16hi)__A, (__v16hi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_packus_epi32 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_packusdw256 ((__v8si)__A, (__v8si)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_packus_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_packuswb256 ((__v16hi)__A, (__v16hi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_add_epi8 (__m256i __A, __m256i __B)
{
  return (__m256i) ((__v32qu)__A + (__v32qu)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_add_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i) ((__v16hu)__A + (__v16hu)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_add_epi32 (__m256i __A, __m256i __B)
{
  return (__m256i) ((__v8su)__A + (__v8su)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_add_epi64 (__m256i __A, __m256i __B)
{
  return (__m256i) ((__v4du)__A + (__v4du)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_adds_epi8 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_paddsb256 ((__v32qi)__A, (__v32qi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_adds_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_paddsw256 ((__v16hi)__A, (__v16hi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_adds_epu8 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_paddusb256 ((__v32qi)__A, (__v32qi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_adds_epu16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_paddusw256 ((__v16hi)__A, (__v16hi)__B);
}
#ifdef __OPTIMIZE__
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_alignr_epi8 (__m256i __A, __m256i __B, const int __N)
{
  return (__m256i) __builtin_ia32_palignr256 ((__v4di)__A,
                                              (__v4di)__B,
                                              __N * 8);
}
#else
/* Without optimization, (__N * 8) would end up in a vector register
   and the insn pattern would not be matched, so use a macro instead.  */
#define _mm256_alignr_epi8(A, B, N) \
  ((__m256i) __builtin_ia32_palignr256 ((__v4di)(__m256i)(A), \
                                        (__v4di)(__m256i)(B), \
                                        (int)(N) * 8))
#endif
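/* Illustrative example, an assumed usage sketch rather than part of
   the original header: as with the underlying VPALIGNR instruction,
   the concatenation and byte shift happen independently within each
   128-bit lane, so

     __m256i r = _mm256_alignr_epi8 (a, b, 4);

   yields, per lane, bytes 4..19 of the value formed by placing that
   lane of a above (more significant than) the same lane of b.  */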
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_and_si256 (__m256i __A, __m256i __B)
{
  return (__m256i) ((__v4du)__A & (__v4du)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_andnot_si256 (__m256i __A, __m256i __B)
{
  return (__m256i) __builtin_ia32_andnotsi256 ((__v4di)__A, (__v4di)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_avg_epu8 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pavgb256 ((__v32qi)__A, (__v32qi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_avg_epu16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pavgw256 ((__v16hi)__A, (__v16hi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_blendv_epi8 (__m256i __X, __m256i __Y, __m256i __M)
{
  return (__m256i) __builtin_ia32_pblendvb256 ((__v32qi)__X,
                                               (__v32qi)__Y,
                                               (__v32qi)__M);
}

#ifdef __OPTIMIZE__
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_blend_epi16 (__m256i __X, __m256i __Y, const int __M)
{
  return (__m256i) __builtin_ia32_pblendw256 ((__v16hi)__X,
                                              (__v16hi)__Y,
                                              __M);
}
#else
#define _mm256_blend_epi16(X, Y, M) \
  ((__m256i) __builtin_ia32_pblendw256 ((__v16hi)(__m256i)(X), \
                                        (__v16hi)(__m256i)(Y), (int)(M)))
#endif
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmpeq_epi8 (__m256i __A, __m256i __B)
{
  return (__m256i) ((__v32qi)__A == (__v32qi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmpeq_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i) ((__v16hi)__A == (__v16hi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmpeq_epi32 (__m256i __A, __m256i __B)
{
  return (__m256i) ((__v8si)__A == (__v8si)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmpeq_epi64 (__m256i __A, __m256i __B)
{
  return (__m256i) ((__v4di)__A == (__v4di)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmpgt_epi8 (__m256i __A, __m256i __B)
{
  return (__m256i) ((__v32qs)__A > (__v32qs)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmpgt_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i) ((__v16hi)__A > (__v16hi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmpgt_epi32 (__m256i __A, __m256i __B)
{
  return (__m256i) ((__v8si)__A > (__v8si)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmpgt_epi64 (__m256i __A, __m256i __B)
{
  return (__m256i) ((__v4di)__A > (__v4di)__B);
}
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hadd_epi16 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_phaddw256 ((__v16hi)__X,
                                             (__v16hi)__Y);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hadd_epi32 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_phaddd256 ((__v8si)__X, (__v8si)__Y);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hadds_epi16 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_phaddsw256 ((__v16hi)__X,
                                              (__v16hi)__Y);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hsub_epi16 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_phsubw256 ((__v16hi)__X,
                                             (__v16hi)__Y);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hsub_epi32 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_phsubd256 ((__v8si)__X, (__v8si)__Y);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hsubs_epi16 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_phsubsw256 ((__v16hi)__X,
                                              (__v16hi)__Y);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maddubs_epi16 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_pmaddubsw256 ((__v32qi)__X,
                                                (__v32qi)__Y);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_madd_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pmaddwd256 ((__v16hi)__A,
                                             (__v16hi)__B);
}
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_max_epi8 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pmaxsb256 ((__v32qi)__A, (__v32qi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_max_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pmaxsw256 ((__v16hi)__A, (__v16hi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_max_epi32 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pmaxsd256 ((__v8si)__A, (__v8si)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_max_epu8 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pmaxub256 ((__v32qi)__A, (__v32qi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_max_epu16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pmaxuw256 ((__v16hi)__A, (__v16hi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_max_epu32 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pmaxud256 ((__v8si)__A, (__v8si)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_min_epi8 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pminsb256 ((__v32qi)__A, (__v32qi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_min_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pminsw256 ((__v16hi)__A, (__v16hi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_min_epi32 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pminsd256 ((__v8si)__A, (__v8si)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_min_epu8 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pminub256 ((__v32qi)__A, (__v32qi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_min_epu16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pminuw256 ((__v16hi)__A, (__v16hi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_min_epu32 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pminud256 ((__v8si)__A, (__v8si)__B);
}
extern __inline int
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_movemask_epi8 (__m256i __A)
{
  return __builtin_ia32_pmovmskb256 ((__v32qi)__A);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepi8_epi16 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pmovsxbw256 ((__v16qi)__X);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepi8_epi32 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pmovsxbd256 ((__v16qi)__X);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepi8_epi64 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pmovsxbq256 ((__v16qi)__X);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepi16_epi32 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pmovsxwd256 ((__v8hi)__X);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepi16_epi64 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pmovsxwq256 ((__v8hi)__X);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepi32_epi64 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pmovsxdq256 ((__v4si)__X);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepu8_epi16 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pmovzxbw256 ((__v16qi)__X);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepu8_epi32 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pmovzxbd256 ((__v16qi)__X);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepu8_epi64 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pmovzxbq256 ((__v16qi)__X);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepu16_epi32 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pmovzxwd256 ((__v8hi)__X);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepu16_epi64 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pmovzxwq256 ((__v8hi)__X);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepu32_epi64 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pmovzxdq256 ((__v4si)__X);
}
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mul_epi32 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_pmuldq256 ((__v8si)__X, (__v8si)__Y);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mulhrs_epi16 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_pmulhrsw256 ((__v16hi)__X,
                                               (__v16hi)__Y);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mulhi_epu16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pmulhuw256 ((__v16hi)__A, (__v16hi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mulhi_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pmulhw256 ((__v16hi)__A, (__v16hi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mullo_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i) ((__v16hu)__A * (__v16hu)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mullo_epi32 (__m256i __A, __m256i __B)
{
  return (__m256i) ((__v8su)__A * (__v8su)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mul_epu32 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pmuludq256 ((__v8si)__A, (__v8si)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_or_si256 (__m256i __A, __m256i __B)
{
  return (__m256i) ((__v4du)__A | (__v4du)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sad_epu8 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_psadbw256 ((__v32qi)__A, (__v32qi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_shuffle_epi8 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_pshufb256 ((__v32qi)__X,
                                             (__v32qi)__Y);
}
#ifdef __OPTIMIZE__
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_shuffle_epi32 (__m256i __A, const int __mask)
{
  return (__m256i)__builtin_ia32_pshufd256 ((__v8si)__A, __mask);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_shufflehi_epi16 (__m256i __A, const int __mask)
{
  return (__m256i)__builtin_ia32_pshufhw256 ((__v16hi)__A, __mask);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_shufflelo_epi16 (__m256i __A, const int __mask)
{
  return (__m256i)__builtin_ia32_pshuflw256 ((__v16hi)__A, __mask);
}
#else
#define _mm256_shuffle_epi32(A, N) \
  ((__m256i)__builtin_ia32_pshufd256 ((__v8si)(__m256i)(A), (int)(N)))
#define _mm256_shufflehi_epi16(A, N) \
  ((__m256i)__builtin_ia32_pshufhw256 ((__v16hi)(__m256i)(A), (int)(N)))
#define _mm256_shufflelo_epi16(A, N) \
  ((__m256i)__builtin_ia32_pshuflw256 ((__v16hi)(__m256i)(A), (int)(N)))
#endif
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sign_epi8 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_psignb256 ((__v32qi)__X, (__v32qi)__Y);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sign_epi16 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_psignw256 ((__v16hi)__X, (__v16hi)__Y);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sign_epi32 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_psignd256 ((__v8si)__X, (__v8si)__Y);
}
#ifdef __OPTIMIZE__
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_bslli_epi128 (__m256i __A, const int __N)
{
  return (__m256i)__builtin_ia32_pslldqi256 (__A, __N * 8);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_slli_si256 (__m256i __A, const int __N)
{
  return (__m256i)__builtin_ia32_pslldqi256 (__A, __N * 8);
}
#else
#define _mm256_bslli_epi128(A, N) \
  ((__m256i)__builtin_ia32_pslldqi256 ((__m256i)(A), (int)(N) * 8))
#define _mm256_slli_si256(A, N) \
  ((__m256i)__builtin_ia32_pslldqi256 ((__m256i)(A), (int)(N) * 8))
#endif
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_slli_epi16 (__m256i __A, int __B)
{
  return (__m256i)__builtin_ia32_psllwi256 ((__v16hi)__A, __B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sll_epi16 (__m256i __A, __m128i __B)
{
  return (__m256i)__builtin_ia32_psllw256 ((__v16hi)__A, (__v8hi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_slli_epi32 (__m256i __A, int __B)
{
  return (__m256i)__builtin_ia32_pslldi256 ((__v8si)__A, __B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sll_epi32 (__m256i __A, __m128i __B)
{
  return (__m256i)__builtin_ia32_pslld256 ((__v8si)__A, (__v4si)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_slli_epi64 (__m256i __A, int __B)
{
  return (__m256i)__builtin_ia32_psllqi256 ((__v4di)__A, __B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sll_epi64 (__m256i __A, __m128i __B)
{
  return (__m256i)__builtin_ia32_psllq256 ((__v4di)__A, (__v2di)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_srai_epi16 (__m256i __A, int __B)
{
  return (__m256i)__builtin_ia32_psrawi256 ((__v16hi)__A, __B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sra_epi16 (__m256i __A, __m128i __B)
{
  return (__m256i)__builtin_ia32_psraw256 ((__v16hi)__A, (__v8hi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_srai_epi32 (__m256i __A, int __B)
{
  return (__m256i)__builtin_ia32_psradi256 ((__v8si)__A, __B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sra_epi32 (__m256i __A, __m128i __B)
{
  return (__m256i)__builtin_ia32_psrad256 ((__v8si)__A, (__v4si)__B);
}
#ifdef __OPTIMIZE__
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_bsrli_epi128 (__m256i __A, const int __N)
{
  return (__m256i)__builtin_ia32_psrldqi256 (__A, __N * 8);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_srli_si256 (__m256i __A, const int __N)
{
  return (__m256i)__builtin_ia32_psrldqi256 (__A, __N * 8);
}
#else
#define _mm256_bsrli_epi128(A, N) \
  ((__m256i)__builtin_ia32_psrldqi256 ((__m256i)(A), (int)(N) * 8))
#define _mm256_srli_si256(A, N) \
  ((__m256i)__builtin_ia32_psrldqi256 ((__m256i)(A), (int)(N) * 8))
#endif
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_srli_epi16 (__m256i __A, int __B)
{
  return (__m256i)__builtin_ia32_psrlwi256 ((__v16hi)__A, __B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_srl_epi16 (__m256i __A, __m128i __B)
{
  return (__m256i)__builtin_ia32_psrlw256 ((__v16hi)__A, (__v8hi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_srli_epi32 (__m256i __A, int __B)
{
  return (__m256i)__builtin_ia32_psrldi256 ((__v8si)__A, __B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_srl_epi32 (__m256i __A, __m128i __B)
{
  return (__m256i)__builtin_ia32_psrld256 ((__v8si)__A, (__v4si)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_srli_epi64 (__m256i __A, int __B)
{
  return (__m256i)__builtin_ia32_psrlqi256 ((__v4di)__A, __B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_srl_epi64 (__m256i __A, __m128i __B)
{
  return (__m256i)__builtin_ia32_psrlq256 ((__v4di)__A, (__v2di)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sub_epi8 (__m256i __A, __m256i __B)
{
  return (__m256i) ((__v32qu)__A - (__v32qu)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sub_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i) ((__v16hu)__A - (__v16hu)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sub_epi32 (__m256i __A, __m256i __B)
{
  return (__m256i) ((__v8su)__A - (__v8su)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sub_epi64 (__m256i __A, __m256i __B)
{
  return (__m256i) ((__v4du)__A - (__v4du)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_subs_epi8 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_psubsb256 ((__v32qi)__A, (__v32qi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_subs_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_psubsw256 ((__v16hi)__A, (__v16hi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_subs_epu8 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_psubusb256 ((__v32qi)__A, (__v32qi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_subs_epu16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_psubusw256 ((__v16hi)__A, (__v16hi)__B);
}
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpackhi_epi8 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_punpckhbw256 ((__v32qi)__A, (__v32qi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpackhi_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_punpckhwd256 ((__v16hi)__A, (__v16hi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpackhi_epi32 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_punpckhdq256 ((__v8si)__A, (__v8si)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpackhi_epi64 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_punpckhqdq256 ((__v4di)__A, (__v4di)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpacklo_epi8 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_punpcklbw256 ((__v32qi)__A, (__v32qi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpacklo_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_punpcklwd256 ((__v16hi)__A, (__v16hi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpacklo_epi32 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_punpckldq256 ((__v8si)__A, (__v8si)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpacklo_epi64 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_punpcklqdq256 ((__v4di)__A, (__v4di)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_xor_si256 (__m256i __A, __m256i __B)
{
  return (__m256i) ((__v4du)__A ^ (__v4du)__B);
}
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_stream_load_si256 (__m256i const *__X)
{
  return (__m256i) __builtin_ia32_movntdqa256 ((__v4di *) __X);
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_broadcastss_ps (__m128 __X)
{
  return (__m128) __builtin_ia32_vbroadcastss_ps ((__v4sf)__X);
}

extern __inline __m256
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcastss_ps (__m128 __X)
{
  return (__m256) __builtin_ia32_vbroadcastss_ps256 ((__v4sf)__X);
}

extern __inline __m256d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcastsd_pd (__m128d __X)
{
  return (__m256d) __builtin_ia32_vbroadcastsd_pd256 ((__v2df)__X);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcastsi128_si256 (__m128i __X)
{
  return (__m256i) __builtin_ia32_vbroadcastsi256 ((__v2di)__X);
}

#define _mm_broadcastsi128_si256(X) _mm256_broadcastsi128_si256(X)
#define _mm_broadcastsd_pd(X) _mm_movedup_pd(X)
#ifdef __OPTIMIZE__
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_blend_epi32 (__m128i __X, __m128i __Y, const int __M)
{
  return (__m128i) __builtin_ia32_pblendd128 ((__v4si)__X,
                                              (__v4si)__Y,
                                              __M);
}
#else
#define _mm_blend_epi32(X, Y, M) \
  ((__m128i) __builtin_ia32_pblendd128 ((__v4si)(__m128i)(X), \
                                        (__v4si)(__m128i)(Y), (int)(M)))
#endif

#ifdef __OPTIMIZE__
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_blend_epi32 (__m256i __X, __m256i __Y, const int __M)
{
  return (__m256i) __builtin_ia32_pblendd256 ((__v8si)__X,
                                              (__v8si)__Y,
                                              __M);
}
#else
#define _mm256_blend_epi32(X, Y, M) \
  ((__m256i) __builtin_ia32_pblendd256 ((__v8si)(__m256i)(X), \
                                        (__v8si)(__m256i)(Y), (int)(M)))
#endif
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcastb_epi8 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pbroadcastb256 ((__v16qi)__X);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcastw_epi16 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pbroadcastw256 ((__v8hi)__X);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcastd_epi32 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pbroadcastd256 ((__v4si)__X);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcastq_epi64 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pbroadcastq256 ((__v2di)__X);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_broadcastb_epi8 (__m128i __X)
{
  return (__m128i) __builtin_ia32_pbroadcastb128 ((__v16qi)__X);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_broadcastw_epi16 (__m128i __X)
{
  return (__m128i) __builtin_ia32_pbroadcastw128 ((__v8hi)__X);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_broadcastd_epi32 (__m128i __X)
{
  return (__m128i) __builtin_ia32_pbroadcastd128 ((__v4si)__X);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_broadcastq_epi64 (__m128i __X)
{
  return (__m128i) __builtin_ia32_pbroadcastq128 ((__v2di)__X);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permutevar8x32_epi32 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_permvarsi256 ((__v8si)__X, (__v8si)__Y);
}
#ifdef __OPTIMIZE__
extern __inline __m256d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute4x64_pd (__m256d __X, const int __M)
{
  return (__m256d) __builtin_ia32_permdf256 ((__v4df)__X, __M);
}
#else
#define _mm256_permute4x64_pd(X, M) \
  ((__m256d) __builtin_ia32_permdf256 ((__v4df)(__m256d)(X), (int)(M)))
#endif

extern __inline __m256
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permutevar8x32_ps (__m256 __X, __m256i __Y)
{
  return (__m256) __builtin_ia32_permvarsf256 ((__v8sf)__X, (__v8si)__Y);
}

#ifdef __OPTIMIZE__
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute4x64_epi64 (__m256i __X, const int __M)
{
  return (__m256i) __builtin_ia32_permdi256 ((__v4di)__X, __M);
}
#else
#define _mm256_permute4x64_epi64(X, M) \
  ((__m256i) __builtin_ia32_permdi256 ((__v4di)(__m256i)(X), (int)(M)))
#endif

#ifdef __OPTIMIZE__
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute2x128_si256 (__m256i __X, __m256i __Y, const int __M)
{
  return (__m256i) __builtin_ia32_permti256 ((__v4di)__X, (__v4di)__Y, __M);
}
#else
#define _mm256_permute2x128_si256(X, Y, M) \
  ((__m256i) __builtin_ia32_permti256 ((__v4di)(__m256i)(X), \
                                       (__v4di)(__m256i)(Y), (int)(M)))
#endif

#ifdef __OPTIMIZE__
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extracti128_si256 (__m256i __X, const int __M)
{
  return (__m128i) __builtin_ia32_extract128i256 ((__v4di)__X, __M);
}
#else
#define _mm256_extracti128_si256(X, M) \
  ((__m128i) __builtin_ia32_extract128i256 ((__v4di)(__m256i)(X), (int)(M)))
#endif

#ifdef __OPTIMIZE__
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_inserti128_si256 (__m256i __X, __m128i __Y, const int __M)
{
  return (__m256i) __builtin_ia32_insert128i256 ((__v4di)__X, (__v2di)__Y, __M);
}
#else
#define _mm256_inserti128_si256(X, Y, M) \
  ((__m256i) __builtin_ia32_insert128i256 ((__v4di)(__m256i)(X), \
                                           (__v2di)(__m128i)(Y), \
                                           (int)(M)))
#endif
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskload_epi32 (int const *__X, __m256i __M)
{
  return (__m256i) __builtin_ia32_maskloadd256 ((const __v8si *)__X,
                                                (__v8si)__M);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskload_epi64 (long long const *__X, __m256i __M)
{
  return (__m256i) __builtin_ia32_maskloadq256 ((const __v4di *)__X,
                                                (__v4di)__M);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskload_epi32 (int const *__X, __m128i __M)
{
  return (__m128i) __builtin_ia32_maskloadd ((const __v4si *)__X,
                                             (__v4si)__M);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskload_epi64 (long long const *__X, __m128i __M)
{
  return (__m128i) __builtin_ia32_maskloadq ((const __v2di *)__X,
                                             (__v2di)__M);
}

extern __inline void
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskstore_epi32 (int *__X, __m256i __M, __m256i __Y)
{
  __builtin_ia32_maskstored256 ((__v8si *)__X, (__v8si)__M, (__v8si)__Y);
}

extern __inline void
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskstore_epi64 (long long *__X, __m256i __M, __m256i __Y)
{
  __builtin_ia32_maskstoreq256 ((__v4di *)__X, (__v4di)__M, (__v4di)__Y);
}

extern __inline void
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskstore_epi32 (int *__X, __m128i __M, __m128i __Y)
{
  __builtin_ia32_maskstored ((__v4si *)__X, (__v4si)__M, (__v4si)__Y);
}

extern __inline void
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskstore_epi64 (long long *__X, __m128i __M, __m128i __Y)
{
  __builtin_ia32_maskstoreq ((__v2di *)__X, (__v2di)__M, (__v2di)__Y);
}
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sllv_epi32 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_psllv8si ((__v8si)__X, (__v8si)__Y);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_sllv_epi32 (__m128i __X, __m128i __Y)
{
  return (__m128i) __builtin_ia32_psllv4si ((__v4si)__X, (__v4si)__Y);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sllv_epi64 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_psllv4di ((__v4di)__X, (__v4di)__Y);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_sllv_epi64 (__m128i __X, __m128i __Y)
{
  return (__m128i) __builtin_ia32_psllv2di ((__v2di)__X, (__v2di)__Y);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_srav_epi32 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_psrav8si ((__v8si)__X, (__v8si)__Y);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_srav_epi32 (__m128i __X, __m128i __Y)
{
  return (__m128i) __builtin_ia32_psrav4si ((__v4si)__X, (__v4si)__Y);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_srlv_epi32 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_psrlv8si ((__v8si)__X, (__v8si)__Y);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_srlv_epi32 (__m128i __X, __m128i __Y)
{
  return (__m128i) __builtin_ia32_psrlv4si ((__v4si)__X, (__v4si)__Y);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_srlv_epi64 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_psrlv4di ((__v4di)__X, (__v4di)__Y);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_srlv_epi64 (__m128i __X, __m128i __Y)
{
  return (__m128i) __builtin_ia32_psrlv2di ((__v2di)__X, (__v2di)__Y);
}
#ifdef __OPTIMIZE__
extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_i32gather_pd (double const *__base, __m128i __index, const int __scale)
{
  __v2df __zero = _mm_setzero_pd ();
  __v2df __mask = _mm_cmpeq_pd (__zero, __zero);

  return (__m128d) __builtin_ia32_gathersiv2df (_mm_undefined_pd (),
                                                __base,
                                                (__v4si)__index,
                                                __mask,
                                                __scale);
}

extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_i32gather_pd (__m128d __src, double const *__base, __m128i __index,
                       __m128d __mask, const int __scale)
{
  return (__m128d) __builtin_ia32_gathersiv2df ((__v2df)__src,
                                                __base,
                                                (__v4si)__index,
                                                (__v2df)__mask,
                                                __scale);
}

extern __inline __m256d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_i32gather_pd (double const *__base, __m128i __index, const int __scale)
{
  __v4df __zero = _mm256_setzero_pd ();
  __v4df __mask = _mm256_cmp_pd (__zero, __zero, _CMP_EQ_OQ);

  return (__m256d) __builtin_ia32_gathersiv4df (_mm256_undefined_pd (),
                                                __base,
                                                (__v4si)__index,
                                                __mask,
                                                __scale);
}

extern __inline __m256d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_i32gather_pd (__m256d __src, double const *__base,
                          __m128i __index, __m256d __mask, const int __scale)
{
  return (__m256d) __builtin_ia32_gathersiv4df ((__v4df)__src,
                                                __base,
                                                (__v4si)__index,
                                                (__v4df)__mask,
                                                __scale);
}

extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_i64gather_pd (double const *__base, __m128i __index, const int __scale)
{
  __v2df __src = _mm_setzero_pd ();
  __v2df __mask = _mm_cmpeq_pd (__src, __src);

  return (__m128d) __builtin_ia32_gatherdiv2df (__src,
                                                __base,
                                                (__v2di)__index,
                                                __mask,
                                                __scale);
}

extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_i64gather_pd (__m128d __src, double const *__base, __m128i __index,
                       __m128d __mask, const int __scale)
{
  return (__m128d) __builtin_ia32_gatherdiv2df ((__v2df)__src,
                                                __base,
                                                (__v2di)__index,
                                                (__v2df)__mask,
                                                __scale);
}

extern __inline __m256d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_i64gather_pd (double const *__base, __m256i __index, const int __scale)
{
  __v4df __src = _mm256_setzero_pd ();
  __v4df __mask = _mm256_cmp_pd (__src, __src, _CMP_EQ_OQ);

  return (__m256d) __builtin_ia32_gatherdiv4df (__src,
                                                __base,
                                                (__v4di)__index,
                                                __mask,
                                                __scale);
}

extern __inline __m256d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_i64gather_pd (__m256d __src, double const *__base,
                          __m256i __index, __m256d __mask, const int __scale)
{
  return (__m256d) __builtin_ia32_gatherdiv4df ((__v4df)__src,
                                                __base,
                                                (__v4di)__index,
                                                (__v4df)__mask,
                                                __scale);
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_i32gather_ps (float const *__base, __m128i __index, const int __scale)
{
  __v4sf __src = _mm_setzero_ps ();
  __v4sf __mask = _mm_cmpeq_ps (__src, __src);

  return (__m128) __builtin_ia32_gathersiv4sf (__src,
                                               __base,
                                               (__v4si)__index,
                                               __mask,
                                               __scale);
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_i32gather_ps (__m128 __src, float const *__base, __m128i __index,
                       __m128 __mask, const int __scale)
{
  return (__m128) __builtin_ia32_gathersiv4sf ((__v4sf)__src,
                                               __base,
                                               (__v4si)__index,
                                               (__v4sf)__mask,
                                               __scale);
}

extern __inline __m256
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_i32gather_ps (float const *__base, __m256i __index, const int __scale)
{
  __v8sf __src = _mm256_setzero_ps ();
  __v8sf __mask = _mm256_cmp_ps (__src, __src, _CMP_EQ_OQ);

  return (__m256) __builtin_ia32_gathersiv8sf (__src,
                                               __base,
                                               (__v8si)__index,
                                               __mask,
                                               __scale);
}

extern __inline __m256
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_i32gather_ps (__m256 __src, float const *__base,
                          __m256i __index, __m256 __mask, const int __scale)
{
  return (__m256) __builtin_ia32_gathersiv8sf ((__v8sf)__src,
                                               __base,
                                               (__v8si)__index,
                                               (__v8sf)__mask,
                                               __scale);
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_i64gather_ps (float const *__base, __m128i __index, const int __scale)
{
  __v4sf __src = _mm_setzero_ps ();
  __v4sf __mask = _mm_cmpeq_ps (__src, __src);

  return (__m128) __builtin_ia32_gatherdiv4sf (__src,
                                               __base,
                                               (__v2di)__index,
                                               __mask,
                                               __scale);
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_i64gather_ps (__m128 __src, float const *__base, __m128i __index,
                       __m128 __mask, const int __scale)
{
  return (__m128) __builtin_ia32_gatherdiv4sf ((__v4sf)__src,
                                               __base,
                                               (__v2di)__index,
                                               (__v4sf)__mask,
                                               __scale);
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_i64gather_ps (float const *__base, __m256i __index, const int __scale)
{
  __v4sf __src = _mm_setzero_ps ();
  __v4sf __mask = _mm_cmpeq_ps (__src, __src);

  return (__m128) __builtin_ia32_gatherdiv4sf256 (__src,
                                                  __base,
                                                  (__v4di)__index,
                                                  __mask,
                                                  __scale);
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_i64gather_ps (__m128 __src, float const *__base,
                          __m256i __index, __m128 __mask, const int __scale)
{
  return (__m128) __builtin_ia32_gatherdiv4sf256 ((__v4sf)__src,
                                                  __base,
                                                  (__v4di)__index,
                                                  (__v4sf)__mask,
                                                  __scale);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_i32gather_epi64 (long long int const *__base,
                     __m128i __index, const int __scale)
{
  __v2di __src = __extension__ (__v2di){ 0, 0 };
  __v2di __mask = __extension__ (__v2di){ ~0, ~0 };

  return (__m128i) __builtin_ia32_gathersiv2di (__src,
                                                __base,
                                                (__v4si)__index,
                                                __mask,
                                                __scale);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_i32gather_epi64 (__m128i __src, long long int const *__base,
                          __m128i __index, __m128i __mask, const int __scale)
{
  return (__m128i) __builtin_ia32_gathersiv2di ((__v2di)__src,
                                                __base,
                                                (__v4si)__index,
                                                (__v2di)__mask,
                                                __scale);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_i32gather_epi64 (long long int const *__base,
                        __m128i __index, const int __scale)
{
  __v4di __src = __extension__ (__v4di){ 0, 0, 0, 0 };
  __v4di __mask = __extension__ (__v4di){ ~0, ~0, ~0, ~0 };

  return (__m256i) __builtin_ia32_gathersiv4di (__src,
                                                __base,
                                                (__v4si)__index,
                                                __mask,
                                                __scale);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_i32gather_epi64 (__m256i __src, long long int const *__base,
                             __m128i __index, __m256i __mask,
                             const int __scale)
{
  return (__m256i) __builtin_ia32_gathersiv4di ((__v4di)__src,
                                                __base,
                                                (__v4si)__index,
                                                (__v4di)__mask,
                                                __scale);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_i64gather_epi64 (long long int const *__base,
                     __m128i __index, const int __scale)
{
  __v2di __src = __extension__ (__v2di){ 0, 0 };
  __v2di __mask = __extension__ (__v2di){ ~0, ~0 };

  return (__m128i) __builtin_ia32_gatherdiv2di (__src,
                                                __base,
                                                (__v2di)__index,
                                                __mask,
                                                __scale);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_i64gather_epi64 (__m128i __src, long long int const *__base,
                          __m128i __index, __m128i __mask, const int __scale)
{
  return (__m128i) __builtin_ia32_gatherdiv2di ((__v2di)__src,
                                                __base,
                                                (__v2di)__index,
                                                (__v2di)__mask,
                                                __scale);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_i64gather_epi64 (long long int const *__base,
                        __m256i __index, const int __scale)
{
  __v4di __src = __extension__ (__v4di){ 0, 0, 0, 0 };
  __v4di __mask = __extension__ (__v4di){ ~0, ~0, ~0, ~0 };

  return (__m256i) __builtin_ia32_gatherdiv4di (__src,
                                                __base,
                                                (__v4di)__index,
                                                __mask,
                                                __scale);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_i64gather_epi64 (__m256i __src, long long int const *__base,
                             __m256i __index, __m256i __mask,
                             const int __scale)
{
  return (__m256i) __builtin_ia32_gatherdiv4di ((__v4di)__src,
                                                __base,
                                                (__v4di)__index,
                                                (__v4di)__mask,
                                                __scale);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_i32gather_epi32 (int const *__base, __m128i __index, const int __scale)
{
  __v4si __src = __extension__ (__v4si){ 0, 0, 0, 0 };
  __v4si __mask = __extension__ (__v4si){ ~0, ~0, ~0, ~0 };

  return (__m128i) __builtin_ia32_gathersiv4si (__src,
                                                __base,
                                                (__v4si)__index,
                                                __mask,
                                                __scale);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_i32gather_epi32 (__m128i __src, int const *__base, __m128i __index,
                          __m128i __mask, const int __scale)
{
  return (__m128i) __builtin_ia32_gathersiv4si ((__v4si)__src,
                                                __base,
                                                (__v4si)__index,
                                                (__v4si)__mask,
                                                __scale);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_i32gather_epi32 (int const *__base, __m256i __index, const int __scale)
{
  __v8si __src = __extension__ (__v8si){ 0, 0, 0, 0, 0, 0, 0, 0 };
  __v8si __mask = __extension__ (__v8si){ ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0 };

  return (__m256i) __builtin_ia32_gathersiv8si (__src,
                                                __base,
                                                (__v8si)__index,
                                                __mask,
                                                __scale);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_i32gather_epi32 (__m256i __src, int const *__base,
                             __m256i __index, __m256i __mask,
                             const int __scale)
{
  return (__m256i) __builtin_ia32_gathersiv8si ((__v8si)__src,
                                                __base,
                                                (__v8si)__index,
                                                (__v8si)__mask,
                                                __scale);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_i64gather_epi32 (int const *__base, __m128i __index, const int __scale)
{
  __v4si __src = __extension__ (__v4si){ 0, 0, 0, 0 };
  __v4si __mask = __extension__ (__v4si){ ~0, ~0, ~0, ~0 };

  return (__m128i) __builtin_ia32_gatherdiv4si (__src,
                                                __base,
                                                (__v2di)__index,
                                                __mask,
                                                __scale);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_i64gather_epi32 (__m128i __src, int const *__base, __m128i __index,
                          __m128i __mask, const int __scale)
{
  return (__m128i) __builtin_ia32_gatherdiv4si ((__v4si)__src,
                                                __base,
                                                (__v2di)__index,
                                                (__v4si)__mask,
                                                __scale);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_i64gather_epi32 (int const *__base, __m256i __index, const int __scale)
{
  __v4si __src = __extension__ (__v4si){ 0, 0, 0, 0 };
  __v4si __mask = __extension__ (__v4si){ ~0, ~0, ~0, ~0 };

  return (__m128i) __builtin_ia32_gatherdiv4si256 (__src,
                                                   __base,
                                                   (__v4di)__index,
                                                   __mask,
                                                   __scale);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_i64gather_epi32 (__m128i __src, int const *__base,
                             __m256i __index, __m128i __mask,
                             const int __scale)
{
  return (__m128i) __builtin_ia32_gatherdiv4si256 ((__v4si)__src,
                                                   __base,
                                                   (__v4di)__index,
                                                   (__v4si)__mask,
                                                   __scale);
}
1673 #else /* __OPTIMIZE__ */
1674 #define _mm_i32gather_pd(BASE, INDEX, SCALE) \
1675 (__m128d) __builtin_ia32_gathersiv2df ((__v2df) _mm_setzero_pd (), \
1676 (double const *) (BASE), \
1677 (__v4si)(__m128i) (INDEX), \
1678 (__v2df) \
1679 _mm_cmpeq_pd (_mm_setzero_pd (),\
1680 _mm_setzero_pd ()),\
1681 (int) (SCALE))
1683 #define _mm_mask_i32gather_pd(SRC, BASE, INDEX, MASK, SCALE) \
1684 (__m128d) __builtin_ia32_gathersiv2df ((__v2df)(__m128d) (SRC), \
1685 (double const *) (BASE), \
1686 (__v4si)(__m128i) (INDEX), \
1687 (__v2df)(__m128d) (MASK), \
1688 (int) (SCALE))
1690 #define _mm256_i32gather_pd(BASE, INDEX, SCALE) \
1691 (__m256d) __builtin_ia32_gathersiv4df ((__v4df) _mm256_setzero_pd (), \
1692 (double const *) (BASE), \
1693 (__v4si)(__m128i) (INDEX), \
1694 (__v4df) \
1695 _mm256_cmp_pd (_mm256_setzero_pd (),\
1696 _mm256_setzero_pd (),\
1697 _CMP_EQ_OQ), \
1698 (int) (SCALE))
1700 #define _mm256_mask_i32gather_pd(SRC, BASE, INDEX, MASK, SCALE) \
1701 (__m256d) __builtin_ia32_gathersiv4df ((__v4df)(__m256d) (SRC), \
1702 (double const *) (BASE), \
1703 (__v4si)(__m128i) (INDEX), \
1704 (__v4df)(__m256d) (MASK), \
1705 (int) (SCALE))
1707 #define _mm_i64gather_pd(BASE, INDEX, SCALE) \
1708 (__m128d) __builtin_ia32_gatherdiv2df ((__v2df) _mm_setzero_pd (), \
1709 (double const *) (BASE), \
1710 (__v2di)(__m128i) (INDEX), \
1711 (__v2df) \
1712 _mm_cmpeq_pd (_mm_setzero_pd (),\
1713 _mm_setzero_pd ()),\
1714 (int) (SCALE))
1716 #define _mm_mask_i64gather_pd(SRC, BASE, INDEX, MASK, SCALE) \
1717 (__m128d) __builtin_ia32_gatherdiv2df ((__v2df)(__m128d) (SRC), \
1718 (double const *) (BASE), \
1719 (__v2di)(__m128i) (INDEX), \
1720 (__v2df)(__m128d) (MASK), \
1721 (int) (SCALE))
1723 #define _mm256_i64gather_pd(BASE, INDEX, SCALE) \
1724 (__m256d) __builtin_ia32_gatherdiv4df ((__v4df) _mm256_setzero_pd (), \
1725 (double const *) (BASE), \
1726 (__v4di)(__m256i) (INDEX), \
1727 (__v4df) \
1728 _mm256_cmp_pd (_mm256_setzero_pd (),\
1729 _mm256_setzero_pd (),\
1730 _CMP_EQ_OQ), \
1731 (int) (SCALE))
1733 #define _mm256_mask_i64gather_pd(SRC, BASE, INDEX, MASK, SCALE) \
1734 (__m256d) __builtin_ia32_gatherdiv4df ((__v4df)(__m256d) (SRC), \
1735 (double const *) (BASE), \
1736 (__v4di)(__m256i) (INDEX), \
1737 (__v4df)(__m256d) (MASK), \
1738 (int) (SCALE))
1740 #define _mm_i32gather_ps(BASE, INDEX, SCALE) \
1741 (__m128) __builtin_ia32_gathersiv4sf ((__v4sf) _mm_setzero_ps (), \
1742 (float const *) (BASE), \
1743 (__v4si)(__m128i) (INDEX), \
1744 (__v4sf) \
1745 _mm_cmpeq_ps (_mm_setzero_ps (),\
1746 _mm_setzero_ps ()),\
1747 (int) (SCALE))
1749 #define _mm_mask_i32gather_ps(SRC, BASE, INDEX, MASK, SCALE) \
1750 (__m128) __builtin_ia32_gathersiv4sf ((__v4sf)(__m128) (SRC), \
1751 (float const *) (BASE), \
1752 (__v4si)(__m128i) (INDEX), \
1753 (__v4sf)(__m128) (MASK), \
1754 (int) (SCALE))
1756 #define _mm256_i32gather_ps(BASE, INDEX, SCALE) \
1757 (__m256) __builtin_ia32_gathersiv8sf ((__v8sf) _mm256_setzero_ps (), \
1758 (float const *) (BASE), \
1759 (__v8si)(__m256i) (INDEX), \
1760 (__v8sf) \
1761 _mm256_cmp_ps (_mm256_setzero_ps (),\
1762 _mm256_setzero_ps (),\
1763 _CMP_EQ_OQ), \
1764 (int) (SCALE))
1766 #define _mm256_mask_i32gather_ps(SRC, BASE, INDEX, MASK, SCALE) \
1767 (__m256) __builtin_ia32_gathersiv8sf ((__v8sf)(__m256) (SRC), \
1768 (float const *) (BASE), \
1769 (__v8si)(__m256i) (INDEX), \
1770 (__v8sf)(__m256) (MASK), \
1771 (int) (SCALE))
1773 #define _mm_i64gather_ps(BASE, INDEX, SCALE) \
1774 (__m128) __builtin_ia32_gatherdiv4sf ((__v4sf) _mm_setzero_pd (), \
1775 (float const *) (BASE), \
1776 (__v2di)(__m128i) (INDEX), \
1777 (__v4sf) \
1778 _mm_cmpeq_ps (_mm_setzero_ps (),\
1779 _mm_setzero_ps ()),\
1780 (int) (SCALE))
1782 #define _mm_mask_i64gather_ps(SRC, BASE, INDEX, MASK, SCALE) \
1783 (__m128) __builtin_ia32_gatherdiv4sf ((__v4sf)(__m128) (SRC), \
1784 (float const *) (BASE), \
1785 (__v2di)(__m128i) (INDEX), \
1786 (__v4sf)(__m128) (MASK), \
1787 (int) (SCALE))
1789 #define _mm256_i64gather_ps(BASE, INDEX, SCALE) \
1790 (__m128) __builtin_ia32_gatherdiv4sf256 ((__v4sf) _mm_setzero_ps (), \
1791 (float const *) (BASE), \
1792 (__v4di)(__m256i) (INDEX), \
1793 (__v4sf) \
1794 _mm_cmpeq_ps (_mm_setzero_ps (),\
1795 _mm_setzero_ps ()),\
1796 (int) (SCALE))
1798 #define _mm256_mask_i64gather_ps(SRC, BASE, INDEX, MASK, SCALE) \
1799 (__m128) __builtin_ia32_gatherdiv4sf256 ((__v4sf)(__m128) (SRC), \
1800 (float const *) (BASE), \
1801 (__v4di)(__m256i) (INDEX), \
1802 (__v4sf)(__m128) (MASK), \
1803 (int) (SCALE))
#define _mm_i32gather_epi64(BASE, INDEX, SCALE) \
  (__m128i) __builtin_ia32_gathersiv2di ((__v2di) _mm_setzero_si128 (), \
					 (long long const *) (BASE), \
					 (__v4si)(__m128i) (INDEX), \
					 (__v2di)_mm_set1_epi64x (-1), \
					 (int) (SCALE))

#define _mm_mask_i32gather_epi64(SRC, BASE, INDEX, MASK, SCALE) \
  (__m128i) __builtin_ia32_gathersiv2di ((__v2di)(__m128i) (SRC), \
					 (long long const *) (BASE), \
					 (__v4si)(__m128i) (INDEX), \
					 (__v2di)(__m128i) (MASK), \
					 (int) (SCALE))

#define _mm256_i32gather_epi64(BASE, INDEX, SCALE) \
  (__m256i) __builtin_ia32_gathersiv4di ((__v4di) _mm256_setzero_si256 (), \
					 (long long const *) (BASE), \
					 (__v4si)(__m128i) (INDEX), \
					 (__v4di)_mm256_set1_epi64x (-1), \
					 (int) (SCALE))

#define _mm256_mask_i32gather_epi64(SRC, BASE, INDEX, MASK, SCALE) \
  (__m256i) __builtin_ia32_gathersiv4di ((__v4di)(__m256i) (SRC), \
					 (long long const *) (BASE), \
					 (__v4si)(__m128i) (INDEX), \
					 (__v4di)(__m256i) (MASK), \
					 (int) (SCALE))

#define _mm_i64gather_epi64(BASE, INDEX, SCALE) \
  (__m128i) __builtin_ia32_gatherdiv2di ((__v2di) _mm_setzero_si128 (), \
					 (long long const *) (BASE), \
					 (__v2di)(__m128i) (INDEX), \
					 (__v2di)_mm_set1_epi64x (-1), \
					 (int) (SCALE))

#define _mm_mask_i64gather_epi64(SRC, BASE, INDEX, MASK, SCALE) \
  (__m128i) __builtin_ia32_gatherdiv2di ((__v2di)(__m128i) (SRC), \
					 (long long const *) (BASE), \
					 (__v2di)(__m128i) (INDEX), \
					 (__v2di)(__m128i) (MASK), \
					 (int) (SCALE))

#define _mm256_i64gather_epi64(BASE, INDEX, SCALE) \
  (__m256i) __builtin_ia32_gatherdiv4di ((__v4di) _mm256_setzero_si256 (), \
					 (long long const *) (BASE), \
					 (__v4di)(__m256i) (INDEX), \
					 (__v4di)_mm256_set1_epi64x (-1), \
					 (int) (SCALE))

#define _mm256_mask_i64gather_epi64(SRC, BASE, INDEX, MASK, SCALE) \
  (__m256i) __builtin_ia32_gatherdiv4di ((__v4di)(__m256i) (SRC), \
					 (long long const *) (BASE), \
					 (__v4di)(__m256i) (INDEX), \
					 (__v4di)(__m256i) (MASK), \
					 (int) (SCALE))
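
/* Gather 32-bit integer elements.  As with the float forms, 64-bit
   indices over a 256-bit index vector yield only a 128-bit result.  */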
#define _mm_i32gather_epi32(BASE, INDEX, SCALE) \
  (__m128i) __builtin_ia32_gathersiv4si ((__v4si) _mm_setzero_si128 (), \
					 (int const *) (BASE), \
					 (__v4si)(__m128i) (INDEX), \
					 (__v4si)_mm_set1_epi32 (-1), \
					 (int) (SCALE))

#define _mm_mask_i32gather_epi32(SRC, BASE, INDEX, MASK, SCALE) \
  (__m128i) __builtin_ia32_gathersiv4si ((__v4si)(__m128i) (SRC), \
					 (int const *) (BASE), \
					 (__v4si)(__m128i) (INDEX), \
					 (__v4si)(__m128i) (MASK), \
					 (int) (SCALE))

#define _mm256_i32gather_epi32(BASE, INDEX, SCALE) \
  (__m256i) __builtin_ia32_gathersiv8si ((__v8si) _mm256_setzero_si256 (), \
					 (int const *) (BASE), \
					 (__v8si)(__m256i) (INDEX), \
					 (__v8si)_mm256_set1_epi32 (-1), \
					 (int) (SCALE))

#define _mm256_mask_i32gather_epi32(SRC, BASE, INDEX, MASK, SCALE) \
  (__m256i) __builtin_ia32_gathersiv8si ((__v8si)(__m256i) (SRC), \
					 (int const *) (BASE), \
					 (__v8si)(__m256i) (INDEX), \
					 (__v8si)(__m256i) (MASK), \
					 (int) (SCALE))

#define _mm_i64gather_epi32(BASE, INDEX, SCALE) \
  (__m128i) __builtin_ia32_gatherdiv4si ((__v4si) _mm_setzero_si128 (), \
					 (int const *) (BASE), \
					 (__v2di)(__m128i) (INDEX), \
					 (__v4si)_mm_set1_epi32 (-1), \
					 (int) (SCALE))

#define _mm_mask_i64gather_epi32(SRC, BASE, INDEX, MASK, SCALE) \
  (__m128i) __builtin_ia32_gatherdiv4si ((__v4si)(__m128i) (SRC), \
					 (int const *) (BASE), \
					 (__v2di)(__m128i) (INDEX), \
					 (__v4si)(__m128i) (MASK), \
					 (int) (SCALE))

#define _mm256_i64gather_epi32(BASE, INDEX, SCALE) \
  (__m128i) __builtin_ia32_gatherdiv4si256 ((__v4si) _mm_setzero_si128 (), \
					    (int const *) (BASE), \
					    (__v4di)(__m256i) (INDEX), \
					    (__v4si)_mm_set1_epi32 (-1), \
					    (int) (SCALE))

#define _mm256_mask_i64gather_epi32(SRC, BASE, INDEX, MASK, SCALE) \
  (__m128i) __builtin_ia32_gatherdiv4si256 ((__v4si)(__m128i) (SRC), \
					    (int const *) (BASE), \
					    (__v4di)(__m256i) (INDEX), \
					    (__v4si)(__m128i) (MASK), \
					    (int) (SCALE))

#endif /* __OPTIMIZE__ */
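
/* Horizontal reductions over packed 16-bit elements.  Each helper
   macro repeatedly folds the upper half of the active elements onto
   the lower half with a shuffle, applies the operator, and halves the
   active width until the result sits in element 0.  Illustrative use
   (the names below are the caller's own):

     __m128i v = _mm_set1_epi16 (3);
     short sum = _mm_reduce_add_epi16 (v);	// 8 * 3 == 24  */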
#define _MM_REDUCE_OPERATOR_BASIC_EPI16(op) \
  __v8hi __T1 = (__v8hi)__W; \
  __v8hi __T2 = __builtin_shufflevector (__T1, __T1, 4, 5, 6, 7, 4, 5, 6, 7); \
  __v8hi __T3 = __T1 op __T2; \
  __v8hi __T4 = __builtin_shufflevector (__T3, __T3, 2, 3, 2, 3, 4, 5, 6, 7); \
  __v8hi __T5 = __T3 op __T4; \
  __v8hi __T6 = __builtin_shufflevector (__T5, __T5, 1, 1, 2, 3, 4, 5, 6, 7); \
  __v8hi __T7 = __T5 op __T6; \
  return __T7[0]

extern __inline short
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_reduce_add_epi16 (__m128i __W)
{
  _MM_REDUCE_OPERATOR_BASIC_EPI16 (+);
}

extern __inline short
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_reduce_mul_epi16 (__m128i __W)
{
  _MM_REDUCE_OPERATOR_BASIC_EPI16 (*);
}

extern __inline short
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_reduce_and_epi16 (__m128i __W)
{
  _MM_REDUCE_OPERATOR_BASIC_EPI16 (&);
}

extern __inline short
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_reduce_or_epi16 (__m128i __W)
{
  _MM_REDUCE_OPERATOR_BASIC_EPI16 (|);
}
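
/* Min/max reductions cannot be expressed with a plain C operator, so
   the folding steps call the corresponding packed min/max intrinsic
   through the _mm_##op token paste instead.  */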
#define _MM_REDUCE_OPERATOR_MAX_MIN_EP16(op) \
  __m128i __T1 = (__m128i)__builtin_shufflevector ((__v8hi)__V, \
						   (__v8hi)__V, 4, 5, 6, 7, 4, 5, 6, 7); \
  __m128i __T2 = _mm_##op (__V, __T1); \
  __m128i __T3 = (__m128i)__builtin_shufflevector ((__v8hi)__T2, \
						   (__v8hi)__T2, 2, 3, 2, 3, 4, 5, 6, 7); \
  __m128i __T4 = _mm_##op (__T2, __T3); \
  __m128i __T5 = (__m128i)__builtin_shufflevector ((__v8hi)__T4, \
						   (__v8hi)__T4, 1, 1, 2, 3, 4, 5, 6, 7); \
  __v8hi __T6 = (__v8hi)_mm_##op (__T4, __T5); \
  return __T6[0]

extern __inline short
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_reduce_max_epi16 (__m128i __V)
{
  _MM_REDUCE_OPERATOR_MAX_MIN_EP16 (max_epi16);
}

extern __inline unsigned short
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_reduce_max_epu16 (__m128i __V)
{
  _MM_REDUCE_OPERATOR_MAX_MIN_EP16 (max_epu16);
}

extern __inline short
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_reduce_min_epi16 (__m128i __V)
{
  _MM_REDUCE_OPERATOR_MAX_MIN_EP16 (min_epi16);
}

extern __inline unsigned short
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_reduce_min_epu16 (__m128i __V)
{
  _MM_REDUCE_OPERATOR_MAX_MIN_EP16 (min_epu16);
}
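
/* 256-bit variants: first extract the two 128-bit lanes with
   _mm256_extracti128_si256 and combine them, then proceed as in the
   128-bit reductions above.  */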
#define _MM256_REDUCE_OPERATOR_BASIC_EPI16(op) \
  __v8hi __T1 = (__v8hi)_mm256_extracti128_si256 (__W, 0); \
  __v8hi __T2 = (__v8hi)_mm256_extracti128_si256 (__W, 1); \
  __v8hi __T3 = __T1 op __T2; \
  __v8hi __T4 = __builtin_shufflevector (__T3, __T3, 4, 5, 6, 7, 4, 5, 6, 7); \
  __v8hi __T5 = __T3 op __T4; \
  __v8hi __T6 = __builtin_shufflevector (__T5, __T5, 2, 3, 2, 3, 4, 5, 6, 7); \
  __v8hi __T7 = __T5 op __T6; \
  __v8hi __T8 = __builtin_shufflevector (__T7, __T7, 1, 1, 2, 3, 4, 5, 6, 7); \
  __v8hi __T9 = __T7 op __T8; \
  return __T9[0]

extern __inline short
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_reduce_add_epi16 (__m256i __W)
{
  _MM256_REDUCE_OPERATOR_BASIC_EPI16 (+);
}

extern __inline short
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_reduce_mul_epi16 (__m256i __W)
{
  _MM256_REDUCE_OPERATOR_BASIC_EPI16 (*);
}

extern __inline short
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_reduce_and_epi16 (__m256i __W)
{
  _MM256_REDUCE_OPERATOR_BASIC_EPI16 (&);
}

extern __inline short
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_reduce_or_epi16 (__m256i __W)
{
  _MM256_REDUCE_OPERATOR_BASIC_EPI16 (|);
}

#define _MM256_REDUCE_OPERATOR_MAX_MIN_EP16(op) \
  __m128i __T1 = _mm256_extracti128_si256 (__V, 0); \
  __m128i __T2 = _mm256_extracti128_si256 (__V, 1); \
  __m128i __T3 = _mm_##op (__T1, __T2); \
  __m128i __T4 = (__m128i)__builtin_shufflevector ((__v8hi)__T3, \
						   (__v8hi)__T3, 4, 5, 6, 7, 4, 5, 6, 7); \
  __m128i __T5 = _mm_##op (__T3, __T4); \
  __m128i __T6 = (__m128i)__builtin_shufflevector ((__v8hi)__T5, \
						   (__v8hi)__T5, 2, 3, 2, 3, 4, 5, 6, 7); \
  __m128i __T7 = _mm_##op (__T5, __T6); \
  __m128i __T8 = (__m128i)__builtin_shufflevector ((__v8hi)__T7, \
						   (__v8hi)__T7, 1, 1, 2, 3, 4, 5, 6, 7); \
  __v8hi __T9 = (__v8hi)_mm_##op (__T7, __T8); \
  return __T9[0]

extern __inline short
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_reduce_max_epi16 (__m256i __V)
{
  _MM256_REDUCE_OPERATOR_MAX_MIN_EP16 (max_epi16);
}

extern __inline unsigned short
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_reduce_max_epu16 (__m256i __V)
{
  _MM256_REDUCE_OPERATOR_MAX_MIN_EP16 (max_epu16);
}

extern __inline short
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_reduce_min_epi16 (__m256i __V)
{
  _MM256_REDUCE_OPERATOR_MAX_MIN_EP16 (min_epi16);
}

extern __inline unsigned short
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_reduce_min_epu16 (__m256i __V)
{
  _MM256_REDUCE_OPERATOR_MAX_MIN_EP16 (min_epu16);
}
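
/* The same reduction scheme for packed 8-bit elements; sixteen lanes
   per 128 bits require one extra shuffle/combine step.  */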
#define _MM_REDUCE_OPERATOR_BASIC_EPI8(op) \
  __v16qi __T1 = (__v16qi)__W; \
  __v16qi __T2 = __builtin_shufflevector (__T1, __T1, \
		  8, 9, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15); \
  __v16qi __T3 = __T1 op __T2; \
  __v16qi __T4 = __builtin_shufflevector (__T3, __T3, \
		  4, 5, 6, 7, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); \
  __v16qi __T5 = __T3 op __T4; \
  __v16qi __T6 = __builtin_shufflevector (__T5, __T5, \
		  2, 3, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); \
  __v16qi __T7 = __T5 op __T6; \
  __v16qi __T8 = __builtin_shufflevector (__T7, __T7, \
		  1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); \
  __v16qi __T9 = __T7 op __T8; \
  return __T9[0]

extern __inline char
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_reduce_add_epi8 (__m128i __W)
{
  _MM_REDUCE_OPERATOR_BASIC_EPI8 (+);
}

extern __inline char
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_reduce_mul_epi8 (__m128i __W)
{
  _MM_REDUCE_OPERATOR_BASIC_EPI8 (*);
}

extern __inline char
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_reduce_and_epi8 (__m128i __W)
{
  _MM_REDUCE_OPERATOR_BASIC_EPI8 (&);
}

extern __inline char
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_reduce_or_epi8 (__m128i __W)
{
  _MM_REDUCE_OPERATOR_BASIC_EPI8 (|);
}

#define _MM_REDUCE_OPERATOR_MAX_MIN_EP8(op) \
  __m128i __T1 = (__m128i)__builtin_shufflevector ((__v16qi)__V, (__v16qi)__V, \
		  8, 9, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15); \
  __m128i __T2 = _mm_##op (__V, __T1); \
  __m128i __T3 = (__m128i)__builtin_shufflevector ((__v16qi)__T2, \
						   (__v16qi)__T2, \
		  4, 5, 6, 7, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); \
  __m128i __T4 = _mm_##op (__T2, __T3); \
  __m128i __T5 = (__m128i)__builtin_shufflevector ((__v16qi)__T4, \
						   (__v16qi)__T4, \
		  2, 3, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); \
  __m128i __T6 = _mm_##op (__T4, __T5); \
  __m128i __T7 = (__m128i)__builtin_shufflevector ((__v16qi)__T6, \
						   (__v16qi)__T6, \
		  1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); \
  __v16qi __T8 = (__v16qi)_mm_##op (__T6, __T7); \
  return __T8[0]

extern __inline signed char
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_reduce_max_epi8 (__m128i __V)
{
  _MM_REDUCE_OPERATOR_MAX_MIN_EP8 (max_epi8);
}

extern __inline unsigned char
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_reduce_max_epu8 (__m128i __V)
{
  _MM_REDUCE_OPERATOR_MAX_MIN_EP8 (max_epu8);
}

extern __inline signed char
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_reduce_min_epi8 (__m128i __V)
{
  _MM_REDUCE_OPERATOR_MAX_MIN_EP8 (min_epi8);
}

extern __inline unsigned char
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_reduce_min_epu8 (__m128i __V)
{
  _MM_REDUCE_OPERATOR_MAX_MIN_EP8 (min_epu8);
}
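
/* 256-bit byte reductions: fold the two 128-bit lanes together, then
   four shuffle/combine steps reduce the remaining sixteen bytes.  */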
#define _MM256_REDUCE_OPERATOR_BASIC_EPI8(op) \
  __v16qi __T1 = (__v16qi)_mm256_extracti128_si256 (__W, 0); \
  __v16qi __T2 = (__v16qi)_mm256_extracti128_si256 (__W, 1); \
  __v16qi __T3 = __T1 op __T2; \
  __v16qi __T4 = __builtin_shufflevector (__T3, __T3, \
		  8, 9, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15); \
  __v16qi __T5 = __T3 op __T4; \
  __v16qi __T6 = __builtin_shufflevector (__T5, __T5, \
		  4, 5, 6, 7, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); \
  __v16qi __T7 = __T5 op __T6; \
  __v16qi __T8 = __builtin_shufflevector (__T7, __T7, \
		  2, 3, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); \
  __v16qi __T9 = __T7 op __T8; \
  __v16qi __T10 = __builtin_shufflevector (__T9, __T9, \
		  1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); \
  __v16qi __T11 = __T9 op __T10; \
  return __T11[0]

extern __inline char
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_reduce_add_epi8 (__m256i __W)
{
  _MM256_REDUCE_OPERATOR_BASIC_EPI8 (+);
}

extern __inline char
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_reduce_mul_epi8 (__m256i __W)
{
  _MM256_REDUCE_OPERATOR_BASIC_EPI8 (*);
}

extern __inline char
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_reduce_and_epi8 (__m256i __W)
{
  _MM256_REDUCE_OPERATOR_BASIC_EPI8 (&);
}

extern __inline char
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_reduce_or_epi8 (__m256i __W)
{
  _MM256_REDUCE_OPERATOR_BASIC_EPI8 (|);
}

#define _MM256_REDUCE_OPERATOR_MAX_MIN_EP8(op) \
  __m128i __T1 = _mm256_extracti128_si256 (__V, 0); \
  __m128i __T2 = _mm256_extracti128_si256 (__V, 1); \
  __m128i __T3 = _mm_##op (__T1, __T2); \
  __m128i __T4 = (__m128i)__builtin_shufflevector ((__v16qi)__T3, \
						   (__v16qi)__T3, \
		  8, 9, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15); \
  __m128i __T5 = _mm_##op (__T3, __T4); \
  __m128i __T6 = (__m128i)__builtin_shufflevector ((__v16qi)__T5, \
						   (__v16qi)__T5, \
		  4, 5, 6, 7, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); \
  __m128i __T7 = _mm_##op (__T5, __T6); \
  __m128i __T8 = (__m128i)__builtin_shufflevector ((__v16qi)__T7, \
						   (__v16qi)__T7, \
		  2, 3, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); \
  __m128i __T9 = _mm_##op (__T7, __T8); \
  __m128i __T10 = (__m128i)__builtin_shufflevector ((__v16qi)__T9, \
						    (__v16qi)__T9, \
		  1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); \
  __v16qi __T11 = (__v16qi)_mm_##op (__T9, __T10); \
  return __T11[0]

extern __inline signed char
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_reduce_max_epi8 (__m256i __V)
{
  _MM256_REDUCE_OPERATOR_MAX_MIN_EP8 (max_epi8);
}

extern __inline unsigned char
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_reduce_max_epu8 (__m256i __V)
{
  _MM256_REDUCE_OPERATOR_MAX_MIN_EP8 (max_epu8);
}

extern __inline signed char
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_reduce_min_epi8 (__m256i __V)
{
  _MM256_REDUCE_OPERATOR_MAX_MIN_EP8 (min_epi8);
}

extern __inline unsigned char
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_reduce_min_epu8 (__m256i __V)
{
  _MM256_REDUCE_OPERATOR_MAX_MIN_EP8 (min_epu8);
}

#ifdef __DISABLE_AVX2__
#undef __DISABLE_AVX2__
#pragma GCC pop_options
#endif /* __DISABLE_AVX2__ */

#endif /* _AVX2INTRIN_H_INCLUDED */