/* Copyright (C) 2003 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with GCC; see the file COPYING.  If not, write to
   the Free Software Foundation, 59 Temple Place - Suite 330,
   Boston, MA 02111-1307, USA.  */
/* As a special exception, if you include this header file into source
   files compiled by GCC, this header file does not by itself cause
   the resulting executable to be covered by the GNU General Public
   License.  This exception does not however invalidate any other
   reasons why the executable file might be covered by the GNU General
   Public License.  */
/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 8.0.  */
30 #ifndef _EMMINTRIN_H_INCLUDED
31 #define _EMMINTRIN_H_INCLUDED
34 #include <xmmintrin.h>
37 typedef int __v2df
__attribute__ ((mode (V2DF
)));
38 typedef int __v2di
__attribute__ ((mode (V2DI
)));
39 typedef int __v4si
__attribute__ ((mode (V4SI
)));
40 typedef int __v8hi
__attribute__ ((mode (V8HI
)));
41 typedef int __v16qi
__attribute__ ((mode (V16QI
)));
/* Build the two-bit selector used with the SHUFPD instruction:
   FP1 supplies bit 1 and FP0 supplies bit 0.  */
#define _MM_SHUFFLE2(fp1,fp0) ((fp0) | ((fp1) << 1))
47 #define __m128i __v2di
48 #define __m128d __v2df
50 /* Create a vector with element 0 as *P and the rest zero. */
51 static __inline __m128d
52 _mm_load_sd (double const *__P
)
54 return (__m128d
) __builtin_ia32_loadsd (__P
);
57 /* Create a vector with all two elements equal to *P. */
58 static __inline __m128d
59 _mm_load1_pd (double const *__P
)
61 __v2df __tmp
= __builtin_ia32_loadsd (__P
);
62 return (__m128d
) __builtin_ia32_shufpd (__tmp
, __tmp
, _MM_SHUFFLE2 (0,0));
65 static __inline __m128d
66 _mm_load_pd1 (double const *__P
)
68 return _mm_load1_pd (__P
);
71 /* Load two DPFP values from P. The address must be 16-byte aligned. */
72 static __inline __m128d
73 _mm_load_pd (double const *__P
)
75 return (__m128d
) __builtin_ia32_loadapd (__P
);
78 /* Load two DPFP values from P. The address need not be 16-byte aligned. */
79 static __inline __m128d
80 _mm_loadu_pd (double const *__P
)
82 return (__m128d
) __builtin_ia32_loadupd (__P
);
85 /* Load two DPFP values in reverse order. The address must be aligned. */
86 static __inline __m128d
87 _mm_loadr_pd (double const *__P
)
89 __v2df __tmp
= __builtin_ia32_loadapd (__P
);
90 return (__m128d
) __builtin_ia32_shufpd (__tmp
, __tmp
, _MM_SHUFFLE2 (0,1));
93 /* Create a vector with element 0 as F and the rest zero. */
94 static __inline __m128d
95 _mm_set_sd (double __F
)
97 return (__m128d
) __builtin_ia32_loadsd (&__F
);
100 /* Create a vector with all two elements equal to F. */
101 static __inline __m128d
102 _mm_set1_pd (double __F
)
104 __v2df __tmp
= __builtin_ia32_loadsd (&__F
);
105 return (__m128d
) __builtin_ia32_shufpd (__tmp
, __tmp
, _MM_SHUFFLE2 (0,0));
108 static __inline __m128d
109 _mm_set_pd1 (double __F
)
111 return _mm_set1_pd (__F
);
114 /* Create the vector [Z Y]. */
115 static __inline __m128d
116 _mm_set_pd (double __Z
, double __Y
)
129 /* Create the vector [Y Z]. */
130 static __inline __m128d
131 _mm_setr_pd (double __Z
, double __Y
)
133 return _mm_set_pd (__Y
, __Z
);
136 /* Create a vector of zeros. */
137 static __inline __m128d
138 _mm_setzero_pd (void)
140 return (__m128d
) __builtin_ia32_setzeropd ();
143 /* Stores the lower DPFP value. */
145 _mm_store_sd (double *__P
, __m128d __A
)
147 __builtin_ia32_storesd (__P
, (__v2df
)__A
);
150 /* Store the lower DPFP value across two words. */
152 _mm_store1_pd (double *__P
, __m128d __A
)
154 __v2df __va
= (__v2df
)__A
;
155 __v2df __tmp
= __builtin_ia32_shufpd (__va
, __va
, _MM_SHUFFLE2 (0,0));
156 __builtin_ia32_storeapd (__P
, __tmp
);
160 _mm_store_pd1 (double *__P
, __m128d __A
)
162 _mm_store1_pd (__P
, __A
);
165 /* Store two DPFP values. The address must be 16-byte aligned. */
167 _mm_store_pd (double *__P
, __m128d __A
)
169 __builtin_ia32_storeapd (__P
, (__v2df
)__A
);
172 /* Store two DPFP values. The address need not be 16-byte aligned. */
174 _mm_storeu_pd (double *__P
, __m128d __A
)
176 __builtin_ia32_storeupd (__P
, (__v2df
)__A
);
179 /* Store two DPFP values in reverse order. The address must be aligned. */
181 _mm_storer_pd (double *__P
, __m128d __A
)
183 __v2df __va
= (__v2df
)__A
;
184 __v2df __tmp
= __builtin_ia32_shufpd (__va
, __va
, _MM_SHUFFLE2 (0,1));
185 __builtin_ia32_storeapd (__P
, __tmp
);
188 /* Sets the low DPFP value of A from the low value of B. */
189 static __inline __m128d
190 _mm_move_sd (__m128d __A
, __m128d __B
)
192 return (__m128d
) __builtin_ia32_movsd ((__v2df
)__A
, (__v2df
)__B
);
196 static __inline __m128d
197 _mm_add_pd (__m128d __A
, __m128d __B
)
199 return (__m128d
)__builtin_ia32_addpd ((__v2df
)__A
, (__v2df
)__B
);
202 static __inline __m128d
203 _mm_add_sd (__m128d __A
, __m128d __B
)
205 return (__m128d
)__builtin_ia32_addsd ((__v2df
)__A
, (__v2df
)__B
);
208 static __inline __m128d
209 _mm_sub_pd (__m128d __A
, __m128d __B
)
211 return (__m128d
)__builtin_ia32_subpd ((__v2df
)__A
, (__v2df
)__B
);
214 static __inline __m128d
215 _mm_sub_sd (__m128d __A
, __m128d __B
)
217 return (__m128d
)__builtin_ia32_subsd ((__v2df
)__A
, (__v2df
)__B
);
220 static __inline __m128d
221 _mm_mul_pd (__m128d __A
, __m128d __B
)
223 return (__m128d
)__builtin_ia32_mulpd ((__v2df
)__A
, (__v2df
)__B
);
226 static __inline __m128d
227 _mm_mul_sd (__m128d __A
, __m128d __B
)
229 return (__m128d
)__builtin_ia32_mulsd ((__v2df
)__A
, (__v2df
)__B
);
232 static __inline __m128d
233 _mm_div_pd (__m128d __A
, __m128d __B
)
235 return (__m128d
)__builtin_ia32_divpd ((__v2df
)__A
, (__v2df
)__B
);
238 static __inline __m128d
239 _mm_div_sd (__m128d __A
, __m128d __B
)
241 return (__m128d
)__builtin_ia32_divsd ((__v2df
)__A
, (__v2df
)__B
);
244 static __inline __m128d
245 _mm_sqrt_pd (__m128d __A
)
247 return (__m128d
)__builtin_ia32_sqrtpd ((__v2df
)__A
);
250 /* Return pair {sqrt (A[0), B[1]}. */
251 static __inline __m128d
252 _mm_sqrt_sd (__m128d __A
, __m128d __B
)
254 __v2df __tmp
= __builtin_ia32_movsd ((__v2df
)__A
, (__v2df
)__B
);
255 return (__m128d
)__builtin_ia32_sqrtsd ((__v2df
)__tmp
);
258 static __inline __m128d
259 _mm_min_pd (__m128d __A
, __m128d __B
)
261 return (__m128d
)__builtin_ia32_minpd ((__v2df
)__A
, (__v2df
)__B
);
264 static __inline __m128d
265 _mm_min_sd (__m128d __A
, __m128d __B
)
267 return (__m128d
)__builtin_ia32_minsd ((__v2df
)__A
, (__v2df
)__B
);
270 static __inline __m128d
271 _mm_max_pd (__m128d __A
, __m128d __B
)
273 return (__m128d
)__builtin_ia32_maxpd ((__v2df
)__A
, (__v2df
)__B
);
276 static __inline __m128d
277 _mm_max_sd (__m128d __A
, __m128d __B
)
279 return (__m128d
)__builtin_ia32_maxsd ((__v2df
)__A
, (__v2df
)__B
);
282 static __inline __m128d
283 _mm_and_pd (__m128d __A
, __m128d __B
)
285 return (__m128d
)__builtin_ia32_andpd ((__v2df
)__A
, (__v2df
)__B
);
288 static __inline __m128d
289 _mm_andnot_pd (__m128d __A
, __m128d __B
)
291 return (__m128d
)__builtin_ia32_andnpd ((__v2df
)__A
, (__v2df
)__B
);
294 static __inline __m128d
295 _mm_or_pd (__m128d __A
, __m128d __B
)
297 return (__m128d
)__builtin_ia32_orpd ((__v2df
)__A
, (__v2df
)__B
);
300 static __inline __m128d
301 _mm_xor_pd (__m128d __A
, __m128d __B
)
303 return (__m128d
)__builtin_ia32_xorpd ((__v2df
)__A
, (__v2df
)__B
);
306 static __inline __m128d
307 _mm_cmpeq_pd (__m128d __A
, __m128d __B
)
309 return (__m128d
)__builtin_ia32_cmpeqpd ((__v2df
)__A
, (__v2df
)__B
);
312 static __inline __m128d
313 _mm_cmplt_pd (__m128d __A
, __m128d __B
)
315 return (__m128d
)__builtin_ia32_cmpltpd ((__v2df
)__A
, (__v2df
)__B
);
318 static __inline __m128d
319 _mm_cmple_pd (__m128d __A
, __m128d __B
)
321 return (__m128d
)__builtin_ia32_cmplepd ((__v2df
)__A
, (__v2df
)__B
);
324 static __inline __m128d
325 _mm_cmpgt_pd (__m128d __A
, __m128d __B
)
327 return (__m128d
)__builtin_ia32_cmpgtpd ((__v2df
)__A
, (__v2df
)__B
);
330 static __inline __m128d
331 _mm_cmpge_pd (__m128d __A
, __m128d __B
)
333 return (__m128d
)__builtin_ia32_cmpgepd ((__v2df
)__A
, (__v2df
)__B
);
336 static __inline __m128d
337 _mm_cmpneq_pd (__m128d __A
, __m128d __B
)
339 return (__m128d
)__builtin_ia32_cmpneqpd ((__v2df
)__A
, (__v2df
)__B
);
342 static __inline __m128d
343 _mm_cmpnlt_pd (__m128d __A
, __m128d __B
)
345 return (__m128d
)__builtin_ia32_cmpnltpd ((__v2df
)__A
, (__v2df
)__B
);
348 static __inline __m128d
349 _mm_cmpnle_pd (__m128d __A
, __m128d __B
)
351 return (__m128d
)__builtin_ia32_cmpnlepd ((__v2df
)__A
, (__v2df
)__B
);
354 static __inline __m128d
355 _mm_cmpngt_pd (__m128d __A
, __m128d __B
)
357 return (__m128d
)__builtin_ia32_cmpngtpd ((__v2df
)__A
, (__v2df
)__B
);
360 static __inline __m128d
361 _mm_cmpnge_pd (__m128d __A
, __m128d __B
)
363 return (__m128d
)__builtin_ia32_cmpngepd ((__v2df
)__A
, (__v2df
)__B
);
366 static __inline __m128d
367 _mm_cmpord_pd (__m128d __A
, __m128d __B
)
369 return (__m128d
)__builtin_ia32_cmpordpd ((__v2df
)__A
, (__v2df
)__B
);
372 static __inline __m128d
373 _mm_cmpunord_pd (__m128d __A
, __m128d __B
)
375 return (__m128d
)__builtin_ia32_cmpunordpd ((__v2df
)__A
, (__v2df
)__B
);
378 static __inline __m128d
379 _mm_cmpeq_sd (__m128d __A
, __m128d __B
)
381 return (__m128d
)__builtin_ia32_cmpeqsd ((__v2df
)__A
, (__v2df
)__B
);
384 static __inline __m128d
385 _mm_cmplt_sd (__m128d __A
, __m128d __B
)
387 return (__m128d
)__builtin_ia32_cmpltsd ((__v2df
)__A
, (__v2df
)__B
);
390 static __inline __m128d
391 _mm_cmple_sd (__m128d __A
, __m128d __B
)
393 return (__m128d
)__builtin_ia32_cmplesd ((__v2df
)__A
, (__v2df
)__B
);
396 static __inline __m128d
397 _mm_cmpgt_sd (__m128d __A
, __m128d __B
)
399 return (__m128d
) __builtin_ia32_movsd ((__v2df
) __A
,
401 __builtin_ia32_cmpltsd ((__v2df
) __B
,
406 static __inline __m128d
407 _mm_cmpge_sd (__m128d __A
, __m128d __B
)
409 return (__m128d
) __builtin_ia32_movsd ((__v2df
) __A
,
411 __builtin_ia32_cmplesd ((__v2df
) __B
,
416 static __inline __m128d
417 _mm_cmpneq_sd (__m128d __A
, __m128d __B
)
419 return (__m128d
)__builtin_ia32_cmpneqsd ((__v2df
)__A
, (__v2df
)__B
);
422 static __inline __m128d
423 _mm_cmpnlt_sd (__m128d __A
, __m128d __B
)
425 return (__m128d
)__builtin_ia32_cmpnltsd ((__v2df
)__A
, (__v2df
)__B
);
428 static __inline __m128d
429 _mm_cmpnle_sd (__m128d __A
, __m128d __B
)
431 return (__m128d
)__builtin_ia32_cmpnlesd ((__v2df
)__A
, (__v2df
)__B
);
434 static __inline __m128d
435 _mm_cmpngt_sd (__m128d __A
, __m128d __B
)
437 return (__m128d
) __builtin_ia32_movsd ((__v2df
) __A
,
439 __builtin_ia32_cmpnltsd ((__v2df
) __B
,
444 static __inline __m128d
445 _mm_cmpnge_sd (__m128d __A
, __m128d __B
)
447 return (__m128d
) __builtin_ia32_movsd ((__v2df
) __A
,
449 __builtin_ia32_cmpnlesd ((__v2df
) __B
,
454 static __inline __m128d
455 _mm_cmpord_sd (__m128d __A
, __m128d __B
)
457 return (__m128d
)__builtin_ia32_cmpordsd ((__v2df
)__A
, (__v2df
)__B
);
460 static __inline __m128d
461 _mm_cmpunord_sd (__m128d __A
, __m128d __B
)
463 return (__m128d
)__builtin_ia32_cmpunordsd ((__v2df
)__A
, (__v2df
)__B
);
467 _mm_comieq_sd (__m128d __A
, __m128d __B
)
469 return __builtin_ia32_comisdeq ((__v2df
)__A
, (__v2df
)__B
);
473 _mm_comilt_sd (__m128d __A
, __m128d __B
)
475 return __builtin_ia32_comisdlt ((__v2df
)__A
, (__v2df
)__B
);
479 _mm_comile_sd (__m128d __A
, __m128d __B
)
481 return __builtin_ia32_comisdle ((__v2df
)__A
, (__v2df
)__B
);
485 _mm_comigt_sd (__m128d __A
, __m128d __B
)
487 return __builtin_ia32_comisdgt ((__v2df
)__A
, (__v2df
)__B
);
491 _mm_comige_sd (__m128d __A
, __m128d __B
)
493 return __builtin_ia32_comisdge ((__v2df
)__A
, (__v2df
)__B
);
497 _mm_comineq_sd (__m128d __A
, __m128d __B
)
499 return __builtin_ia32_comisdneq ((__v2df
)__A
, (__v2df
)__B
);
503 _mm_ucomieq_sd (__m128d __A
, __m128d __B
)
505 return __builtin_ia32_ucomisdeq ((__v2df
)__A
, (__v2df
)__B
);
509 _mm_ucomilt_sd (__m128d __A
, __m128d __B
)
511 return __builtin_ia32_ucomisdlt ((__v2df
)__A
, (__v2df
)__B
);
515 _mm_ucomile_sd (__m128d __A
, __m128d __B
)
517 return __builtin_ia32_ucomisdle ((__v2df
)__A
, (__v2df
)__B
);
521 _mm_ucomigt_sd (__m128d __A
, __m128d __B
)
523 return __builtin_ia32_ucomisdgt ((__v2df
)__A
, (__v2df
)__B
);
527 _mm_ucomige_sd (__m128d __A
, __m128d __B
)
529 return __builtin_ia32_ucomisdge ((__v2df
)__A
, (__v2df
)__B
);
533 _mm_ucomineq_sd (__m128d __A
, __m128d __B
)
535 return __builtin_ia32_ucomisdneq ((__v2df
)__A
, (__v2df
)__B
);
538 /* Create a vector with element 0 as *P and the rest zero. */
540 static __inline __m128i
541 _mm_load_si128 (__m128i
const *__P
)
543 return (__m128i
) __builtin_ia32_loaddqa ((char const *)__P
);
546 static __inline __m128i
547 _mm_loadu_si128 (__m128i
const *__P
)
549 return (__m128i
) __builtin_ia32_loaddqu ((char const *)__P
);
552 static __inline __m128i
553 _mm_loadl_epi64 (__m128i
const *__P
)
555 return (__m128i
) __builtin_ia32_movq2dq (*(unsigned long long *)__P
);
559 _mm_store_si128 (__m128i
*__P
, __m128i __B
)
561 __builtin_ia32_storedqa ((char *)__P
, (__v16qi
)__B
);
565 _mm_storeu_si128 (__m128i
*__P
, __m128i __B
)
567 __builtin_ia32_storedqu ((char *)__P
, (__v16qi
)__B
);
571 _mm_storel_epi64 (__m128i
*__P
, __m128i __B
)
573 *(long long *)__P
= __builtin_ia32_movdq2q ((__v2di
)__B
);
576 static __inline __m64
577 _mm_movepi64_pi64 (__m128i __B
)
579 return (__m64
) __builtin_ia32_movdq2q ((__v2di
)__B
);
582 static __inline __m128i
583 _mm_move_epi64 (__m128i __A
)
585 return (__m128i
) __builtin_ia32_movq ((__v2di
)__A
);
588 /* Create a vector of zeros. */
589 static __inline __m128i
590 _mm_setzero_si128 (void)
592 return (__m128i
) __builtin_ia32_setzero128 ();
595 static __inline __m128i
596 _mm_set_epi64 (__m64 __A
, __m64 __B
)
598 __v2di __tmp
= (__v2di
)__builtin_ia32_movq2dq ((unsigned long long)__A
);
599 __v2di __tmp2
= (__v2di
)__builtin_ia32_movq2dq ((unsigned long long)__B
);
600 return (__m128i
)__builtin_ia32_punpcklqdq128 (__tmp2
, __tmp
);
603 /* Create the vector [Z Y X W]. */
604 static __inline __m128i
605 _mm_set_epi32 (int __Z
, int __Y
, int __X
, int __W
)
621 /* Create the vector [Z Y]. */
622 static __inline __m128i
623 _mm_set_epi64x (long long __Z
, long long __Y
)
637 /* Create the vector [S T U V Z Y X W]. */
638 static __inline __m128i
639 _mm_set_epi16 (short __Z
, short __Y
, short __X
, short __W
,
640 short __V
, short __U
, short __T
, short __S
)
659 /* Create the vector [S T U V Z Y X W]. */
660 static __inline __m128i
661 _mm_set_epi8 (char __Z
, char __Y
, char __X
, char __W
,
662 char __V
, char __U
, char __T
, char __S
,
663 char __Z1
, char __Y1
, char __X1
, char __W1
,
664 char __V1
, char __U1
, char __T1
, char __S1
)
691 static __inline __m128i
692 _mm_set1_epi64 (__m64 __A
)
694 __v2di __tmp
= (__v2di
)__builtin_ia32_movq2dq ((unsigned long long)__A
);
695 return (__m128i
)__builtin_ia32_punpcklqdq128 (__tmp
, __tmp
);
698 static __inline __m128i
699 _mm_set1_epi32 (int __A
)
701 __v4si __tmp
= (__v4si
)__builtin_ia32_loadd (&__A
);
702 return (__m128i
) __builtin_ia32_pshufd ((__v4si
)__tmp
, _MM_SHUFFLE (0,0,0,0));
706 static __inline __m128i
707 _mm_set1_epi64x (long long __A
)
709 __v2di __tmp
= (__v2di
)__builtin_ia32_movq2dq ((unsigned long long)__A
);
710 return (__m128i
) __builtin_ia32_shufpd ((__v2df
)__tmp
, (__v2df
)__tmp
, _MM_SHUFFLE2 (0,0));
714 static __inline __m128i
715 _mm_set1_epi16 (short __A
)
717 int __Acopy
= (unsigned short)__A
;
718 __v4si __tmp
= (__v4si
)__builtin_ia32_loadd (&__Acopy
);
719 __tmp
= (__v4si
)__builtin_ia32_punpcklwd128 ((__v8hi
)__tmp
, (__v8hi
)__tmp
);
720 return (__m128i
) __builtin_ia32_pshufd ((__v4si
)__tmp
, _MM_SHUFFLE (0,0,0,0));
723 static __inline __m128i
724 _mm_set1_epi8 (char __A
)
726 int __Acopy
= (unsigned char)__A
;
727 __v4si __tmp
= (__v4si
)__builtin_ia32_loadd (&__Acopy
);
728 __tmp
= (__v4si
)__builtin_ia32_punpcklbw128 ((__v16qi
)__tmp
, (__v16qi
)__tmp
);
729 __tmp
= (__v4si
)__builtin_ia32_punpcklbw128 ((__v16qi
)__tmp
, (__v16qi
)__tmp
);
730 return (__m128i
) __builtin_ia32_pshufd ((__v4si
)__tmp
, _MM_SHUFFLE (0,0,0,0));
733 static __inline __m128i
734 _mm_setr_epi64 (__m64 __A
, __m64 __B
)
736 __v2di __tmp
= (__v2di
)__builtin_ia32_movq2dq ((unsigned long long)__A
);
737 __v2di __tmp2
= (__v2di
)__builtin_ia32_movq2dq ((unsigned long long)__B
);
738 return (__m128i
)__builtin_ia32_punpcklqdq128 (__tmp
, __tmp2
);
741 /* Create the vector [Z Y X W]. */
742 static __inline __m128i
743 _mm_setr_epi32 (int __W
, int __X
, int __Y
, int __Z
)
757 /* Create the vector [S T U V Z Y X W]. */
758 static __inline __m128i
759 _mm_setr_epi16 (short __S
, short __T
, short __U
, short __V
,
760 short __W
, short __X
, short __Y
, short __Z
)
779 /* Create the vector [S T U V Z Y X W]. */
780 static __inline __m128i
781 _mm_setr_epi8 (char __S1
, char __T1
, char __U1
, char __V1
,
782 char __W1
, char __X1
, char __Y1
, char __Z1
,
783 char __S
, char __T
, char __U
, char __V
,
784 char __W
, char __X
, char __Y
, char __Z
)
811 static __inline __m128d
812 _mm_cvtepi32_pd (__m128i __A
)
814 return (__m128d
)__builtin_ia32_cvtdq2pd ((__v4si
) __A
);
817 static __inline __m128
818 _mm_cvtepi32_ps (__m128i __A
)
820 return (__m128
)__builtin_ia32_cvtdq2ps ((__v4si
) __A
);
823 static __inline __m128i
824 _mm_cvtpd_epi32 (__m128d __A
)
826 return (__m128i
)__builtin_ia32_cvtpd2dq ((__v2df
) __A
);
829 static __inline __m64
830 _mm_cvtpd_pi32 (__m128d __A
)
832 return (__m64
)__builtin_ia32_cvtpd2pi ((__v2df
) __A
);
835 static __inline __m128
836 _mm_cvtpd_ps (__m128d __A
)
838 return (__m128
)__builtin_ia32_cvtpd2ps ((__v2df
) __A
);
841 static __inline __m128i
842 _mm_cvttpd_epi32 (__m128d __A
)
844 return (__m128i
)__builtin_ia32_cvttpd2dq ((__v2df
) __A
);
847 static __inline __m64
848 _mm_cvttpd_pi32 (__m128d __A
)
850 return (__m64
)__builtin_ia32_cvttpd2pi ((__v2df
) __A
);
853 static __inline __m128d
854 _mm_cvtpi32_pd (__m64 __A
)
856 return (__m128d
)__builtin_ia32_cvtpi2pd ((__v2si
) __A
);
859 static __inline __m128i
860 _mm_cvtps_epi32 (__m128 __A
)
862 return (__m128i
)__builtin_ia32_cvtps2dq ((__v4sf
) __A
);
865 static __inline __m128i
866 _mm_cvttps_epi32 (__m128 __A
)
868 return (__m128i
)__builtin_ia32_cvttps2dq ((__v4sf
) __A
);
871 static __inline __m128d
872 _mm_cvtps_pd (__m128 __A
)
874 return (__m128d
)__builtin_ia32_cvtps2pd ((__v4sf
) __A
);
878 _mm_cvtsd_si32 (__m128d __A
)
880 return __builtin_ia32_cvtsd2si ((__v2df
) __A
);
884 static __inline
long long
885 _mm_cvtsd_si64x (__m128d __A
)
887 return __builtin_ia32_cvtsd2si64 ((__v2df
) __A
);
892 _mm_cvttsd_si32 (__m128d __A
)
894 return __builtin_ia32_cvttsd2si ((__v2df
) __A
);
898 static __inline
long long
899 _mm_cvttsd_si64x (__m128d __A
)
901 return __builtin_ia32_cvttsd2si64 ((__v2df
) __A
);
905 static __inline __m128
906 _mm_cvtsd_ss (__m128 __A
, __m128d __B
)
908 return (__m128
)__builtin_ia32_cvtsd2ss ((__v4sf
) __A
, (__v2df
) __B
);
911 static __inline __m128d
912 _mm_cvtsi32_sd (__m128d __A
, int __B
)
914 return (__m128d
)__builtin_ia32_cvtsi2sd ((__v2df
) __A
, __B
);
918 static __inline __m128d
919 _mm_cvtsi64x_sd (__m128d __A
, long long __B
)
921 return (__m128d
)__builtin_ia32_cvtsi642sd ((__v2df
) __A
, __B
);
925 static __inline __m128d
926 _mm_cvtss_sd (__m128d __A
, __m128 __B
)
928 return (__m128d
)__builtin_ia32_cvtss2sd ((__v2df
) __A
, (__v4sf
)__B
);
/* Shuffle the doubles of __A and __B according to the selector __C (see
   _MM_SHUFFLE2).  This is a macro because the selector is handed to the
   builtin as written.  Every argument is parenthesized so that the
   vector casts apply to the whole argument expression, not just to its
   first operand.  */
#define _mm_shuffle_pd(__A, __B, __C) \
  ((__m128d)__builtin_ia32_shufpd ((__v2df)(__A), (__v2df)(__B), (__C)))
933 static __inline __m128d
934 _mm_unpackhi_pd (__m128d __A
, __m128d __B
)
936 return (__m128d
)__builtin_ia32_unpckhpd ((__v2df
)__A
, (__v2df
)__B
);
939 static __inline __m128d
940 _mm_unpacklo_pd (__m128d __A
, __m128d __B
)
942 return (__m128d
)__builtin_ia32_unpcklpd ((__v2df
)__A
, (__v2df
)__B
);
945 static __inline __m128d
946 _mm_loadh_pd (__m128d __A
, double const *__B
)
948 return (__m128d
)__builtin_ia32_loadhpd ((__v2df
)__A
, (__v2si
*)__B
);
952 _mm_storeh_pd (double *__A
, __m128d __B
)
954 __builtin_ia32_storehpd ((__v2si
*)__A
, (__v2df
)__B
);
957 static __inline __m128d
958 _mm_loadl_pd (__m128d __A
, double const *__B
)
960 return (__m128d
)__builtin_ia32_loadlpd ((__v2df
)__A
, (__v2si
*)__B
);
964 _mm_storel_pd (double *__A
, __m128d __B
)
966 __builtin_ia32_storelpd ((__v2si
*)__A
, (__v2df
)__B
);
970 _mm_movemask_pd (__m128d __A
)
972 return __builtin_ia32_movmskpd ((__v2df
)__A
);
975 static __inline __m128i
976 _mm_packs_epi16 (__m128i __A
, __m128i __B
)
978 return (__m128i
)__builtin_ia32_packsswb128 ((__v8hi
)__A
, (__v8hi
)__B
);
981 static __inline __m128i
982 _mm_packs_epi32 (__m128i __A
, __m128i __B
)
984 return (__m128i
)__builtin_ia32_packssdw128 ((__v4si
)__A
, (__v4si
)__B
);
987 static __inline __m128i
988 _mm_packus_epi16 (__m128i __A
, __m128i __B
)
990 return (__m128i
)__builtin_ia32_packuswb128 ((__v8hi
)__A
, (__v8hi
)__B
);
993 static __inline __m128i
994 _mm_unpackhi_epi8 (__m128i __A
, __m128i __B
)
996 return (__m128i
)__builtin_ia32_punpckhbw128 ((__v16qi
)__A
, (__v16qi
)__B
);
999 static __inline __m128i
1000 _mm_unpackhi_epi16 (__m128i __A
, __m128i __B
)
1002 return (__m128i
)__builtin_ia32_punpckhwd128 ((__v8hi
)__A
, (__v8hi
)__B
);
1005 static __inline __m128i
1006 _mm_unpackhi_epi32 (__m128i __A
, __m128i __B
)
1008 return (__m128i
)__builtin_ia32_punpckhdq128 ((__v4si
)__A
, (__v4si
)__B
);
1011 static __inline __m128i
1012 _mm_unpackhi_epi64 (__m128i __A
, __m128i __B
)
1014 return (__m128i
)__builtin_ia32_punpckhqdq128 ((__v2di
)__A
, (__v2di
)__B
);
1017 static __inline __m128i
1018 _mm_unpacklo_epi8 (__m128i __A
, __m128i __B
)
1020 return (__m128i
)__builtin_ia32_punpcklbw128 ((__v16qi
)__A
, (__v16qi
)__B
);
1023 static __inline __m128i
1024 _mm_unpacklo_epi16 (__m128i __A
, __m128i __B
)
1026 return (__m128i
)__builtin_ia32_punpcklwd128 ((__v8hi
)__A
, (__v8hi
)__B
);
1029 static __inline __m128i
1030 _mm_unpacklo_epi32 (__m128i __A
, __m128i __B
)
1032 return (__m128i
)__builtin_ia32_punpckldq128 ((__v4si
)__A
, (__v4si
)__B
);
1035 static __inline __m128i
1036 _mm_unpacklo_epi64 (__m128i __A
, __m128i __B
)
1038 return (__m128i
)__builtin_ia32_punpcklqdq128 ((__v2di
)__A
, (__v2di
)__B
);
1041 static __inline __m128i
1042 _mm_add_epi8 (__m128i __A
, __m128i __B
)
1044 return (__m128i
)__builtin_ia32_paddb128 ((__v16qi
)__A
, (__v16qi
)__B
);
1047 static __inline __m128i
1048 _mm_add_epi16 (__m128i __A
, __m128i __B
)
1050 return (__m128i
)__builtin_ia32_paddw128 ((__v8hi
)__A
, (__v8hi
)__B
);
1053 static __inline __m128i
1054 _mm_add_epi32 (__m128i __A
, __m128i __B
)
1056 return (__m128i
)__builtin_ia32_paddd128 ((__v4si
)__A
, (__v4si
)__B
);
1059 static __inline __m128i
1060 _mm_add_epi64 (__m128i __A
, __m128i __B
)
1062 return (__m128i
)__builtin_ia32_paddq128 ((__v2di
)__A
, (__v2di
)__B
);
1065 static __inline __m128i
1066 _mm_adds_epi8 (__m128i __A
, __m128i __B
)
1068 return (__m128i
)__builtin_ia32_paddsb128 ((__v16qi
)__A
, (__v16qi
)__B
);
1071 static __inline __m128i
1072 _mm_adds_epi16 (__m128i __A
, __m128i __B
)
1074 return (__m128i
)__builtin_ia32_paddsw128 ((__v8hi
)__A
, (__v8hi
)__B
);
1077 static __inline __m128i
1078 _mm_adds_epu8 (__m128i __A
, __m128i __B
)
1080 return (__m128i
)__builtin_ia32_paddusb128 ((__v16qi
)__A
, (__v16qi
)__B
);
1083 static __inline __m128i
1084 _mm_adds_epu16 (__m128i __A
, __m128i __B
)
1086 return (__m128i
)__builtin_ia32_paddusw128 ((__v8hi
)__A
, (__v8hi
)__B
);
1089 static __inline __m128i
1090 _mm_sub_epi8 (__m128i __A
, __m128i __B
)
1092 return (__m128i
)__builtin_ia32_psubb128 ((__v16qi
)__A
, (__v16qi
)__B
);
1095 static __inline __m128i
1096 _mm_sub_epi16 (__m128i __A
, __m128i __B
)
1098 return (__m128i
)__builtin_ia32_psubw128 ((__v8hi
)__A
, (__v8hi
)__B
);
1101 static __inline __m128i
1102 _mm_sub_epi32 (__m128i __A
, __m128i __B
)
1104 return (__m128i
)__builtin_ia32_psubd128 ((__v4si
)__A
, (__v4si
)__B
);
1107 static __inline __m128i
1108 _mm_sub_epi64 (__m128i __A
, __m128i __B
)
1110 return (__m128i
)__builtin_ia32_psubq128 ((__v2di
)__A
, (__v2di
)__B
);
1113 static __inline __m128i
1114 _mm_subs_epi8 (__m128i __A
, __m128i __B
)
1116 return (__m128i
)__builtin_ia32_psubsb128 ((__v16qi
)__A
, (__v16qi
)__B
);
1119 static __inline __m128i
1120 _mm_subs_epi16 (__m128i __A
, __m128i __B
)
1122 return (__m128i
)__builtin_ia32_psubsw128 ((__v8hi
)__A
, (__v8hi
)__B
);
1125 static __inline __m128i
1126 _mm_subs_epu8 (__m128i __A
, __m128i __B
)
1128 return (__m128i
)__builtin_ia32_psubusb128 ((__v16qi
)__A
, (__v16qi
)__B
);
1131 static __inline __m128i
1132 _mm_subs_epu16 (__m128i __A
, __m128i __B
)
1134 return (__m128i
)__builtin_ia32_psubusw128 ((__v8hi
)__A
, (__v8hi
)__B
);
1137 static __inline __m128i
1138 _mm_madd_epi16 (__m128i __A
, __m128i __B
)
1140 return (__m128i
)__builtin_ia32_pmaddwd128 ((__v8hi
)__A
, (__v8hi
)__B
);
1143 static __inline __m128i
1144 _mm_mulhi_epi16 (__m128i __A
, __m128i __B
)
1146 return (__m128i
)__builtin_ia32_pmulhw128 ((__v8hi
)__A
, (__v8hi
)__B
);
1149 static __inline __m128i
1150 _mm_mullo_epi16 (__m128i __A
, __m128i __B
)
1152 return (__m128i
)__builtin_ia32_pmullw128 ((__v8hi
)__A
, (__v8hi
)__B
);
1155 static __inline __m64
1156 _mm_mul_su32 (__m64 __A
, __m64 __B
)
1158 return (__m64
)__builtin_ia32_pmuludq ((__v2si
)__A
, (__v2si
)__B
);
1161 static __inline __m128i
1162 _mm_mul_epu32 (__m128i __A
, __m128i __B
)
1164 return (__m128i
)__builtin_ia32_pmuludq128 ((__v4si
)__A
, (__v4si
)__B
);
1167 static __inline __m128i
1168 _mm_sll_epi16 (__m128i __A
, __m128i __B
)
1170 return (__m128i
)__builtin_ia32_psllw128 ((__v8hi
)__A
, (__v2di
)__B
);
1173 static __inline __m128i
1174 _mm_sll_epi32 (__m128i __A
, __m128i __B
)
1176 return (__m128i
)__builtin_ia32_pslld128 ((__v4si
)__A
, (__v2di
)__B
);
1179 static __inline __m128i
1180 _mm_sll_epi64 (__m128i __A
, __m128i __B
)
1182 return (__m128i
)__builtin_ia32_psllq128 ((__v2di
)__A
, (__v2di
)__B
);
1185 static __inline __m128i
1186 _mm_sra_epi16 (__m128i __A
, __m128i __B
)
1188 return (__m128i
)__builtin_ia32_psraw128 ((__v8hi
)__A
, (__v2di
)__B
);
1191 static __inline __m128i
1192 _mm_sra_epi32 (__m128i __A
, __m128i __B
)
1194 return (__m128i
)__builtin_ia32_psrad128 ((__v4si
)__A
, (__v2di
)__B
);
1197 static __inline __m128i
1198 _mm_srl_epi16 (__m128i __A
, __m128i __B
)
1200 return (__m128i
)__builtin_ia32_psrlw128 ((__v8hi
)__A
, (__v2di
)__B
);
1203 static __inline __m128i
1204 _mm_srl_epi32 (__m128i __A
, __m128i __B
)
1206 return (__m128i
)__builtin_ia32_psrld128 ((__v4si
)__A
, (__v2di
)__B
);
1209 static __inline __m128i
1210 _mm_srl_epi64 (__m128i __A
, __m128i __B
)
1212 return (__m128i
)__builtin_ia32_psrlq128 ((__v2di
)__A
, (__v2di
)__B
);
1215 static __inline __m128i
1216 _mm_slli_epi16 (__m128i __A
, int __B
)
1218 return (__m128i
)__builtin_ia32_psllwi128 ((__v8hi
)__A
, __B
);
1221 static __inline __m128i
1222 _mm_slli_epi32 (__m128i __A
, int __B
)
1224 return (__m128i
)__builtin_ia32_pslldi128 ((__v4si
)__A
, __B
);
1227 static __inline __m128i
1228 _mm_slli_epi64 (__m128i __A
, int __B
)
1230 return (__m128i
)__builtin_ia32_psllqi128 ((__v2di
)__A
, __B
);
1233 static __inline __m128i
1234 _mm_srai_epi16 (__m128i __A
, int __B
)
1236 return (__m128i
)__builtin_ia32_psrawi128 ((__v8hi
)__A
, __B
);
1239 static __inline __m128i
1240 _mm_srai_epi32 (__m128i __A
, int __B
)
1242 return (__m128i
)__builtin_ia32_psradi128 ((__v4si
)__A
, __B
);
/* Shift the whole 128-bit value __A right (srli) or left (slli) by
   __B, using the psrldqi128 / pslldqi128 builtins.

   The public entry points are the two macros at the end.  The function
   forms are kept for reference only and are compiled out: the shift
   count would reach the builtin as a function parameter rather than as
   the literal the caller wrote (presumably the builtin needs an
   immediate -- confirm), and the macros of the same names supersede
   them anyway.  Compared with the earlier text, the left-shift
   function now carries its own name (it had been given the
   right-shift name) and both return statements are terminated.  */
#if 0
static __m128i __attribute__((__always_inline__))
_mm_srli_si128 (__m128i __A, const int __B)
{
  return ((__m128i)__builtin_ia32_psrldqi128 (__A, __B));
}

static __m128i __attribute__((__always_inline__))
_mm_slli_si128 (__m128i __A, const int __B)
{
  return ((__m128i)__builtin_ia32_pslldqi128 (__A, __B));
}
#endif
#define _mm_srli_si128(__A, __B) ((__m128i)__builtin_ia32_psrldqi128 (__A, __B))
#define _mm_slli_si128(__A, __B) ((__m128i)__builtin_ia32_pslldqi128 (__A, __B))
1261 static __inline __m128i
1262 _mm_srli_epi16 (__m128i __A
, int __B
)
1264 return (__m128i
)__builtin_ia32_psrlwi128 ((__v8hi
)__A
, __B
);
1267 static __inline __m128i
1268 _mm_srli_epi32 (__m128i __A
, int __B
)
1270 return (__m128i
)__builtin_ia32_psrldi128 ((__v4si
)__A
, __B
);
1273 static __inline __m128i
1274 _mm_srli_epi64 (__m128i __A
, int __B
)
1276 return (__m128i
)__builtin_ia32_psrlqi128 ((__v2di
)__A
, __B
);
1279 static __inline __m128i
1280 _mm_and_si128 (__m128i __A
, __m128i __B
)
1282 return (__m128i
)__builtin_ia32_pand128 ((__v2di
)__A
, (__v2di
)__B
);
1285 static __inline __m128i
1286 _mm_andnot_si128 (__m128i __A
, __m128i __B
)
1288 return (__m128i
)__builtin_ia32_pandn128 ((__v2di
)__A
, (__v2di
)__B
);
1291 static __inline __m128i
1292 _mm_or_si128 (__m128i __A
, __m128i __B
)
1294 return (__m128i
)__builtin_ia32_por128 ((__v2di
)__A
, (__v2di
)__B
);
1297 static __inline __m128i
1298 _mm_xor_si128 (__m128i __A
, __m128i __B
)
1300 return (__m128i
)__builtin_ia32_pxor128 ((__v2di
)__A
, (__v2di
)__B
);
1303 static __inline __m128i
1304 _mm_cmpeq_epi8 (__m128i __A
, __m128i __B
)
1306 return (__m128i
)__builtin_ia32_pcmpeqb128 ((__v16qi
)__A
, (__v16qi
)__B
);
1309 static __inline __m128i
1310 _mm_cmpeq_epi16 (__m128i __A
, __m128i __B
)
1312 return (__m128i
)__builtin_ia32_pcmpeqw128 ((__v8hi
)__A
, (__v8hi
)__B
);
1315 static __inline __m128i
1316 _mm_cmpeq_epi32 (__m128i __A
, __m128i __B
)
1318 return (__m128i
)__builtin_ia32_pcmpeqd128 ((__v4si
)__A
, (__v4si
)__B
);
1321 static __inline __m128i
1322 _mm_cmplt_epi8 (__m128i __A
, __m128i __B
)
1324 return (__m128i
)__builtin_ia32_pcmpgtb128 ((__v16qi
)__B
, (__v16qi
)__A
);
1327 static __inline __m128i
1328 _mm_cmplt_epi16 (__m128i __A
, __m128i __B
)
1330 return (__m128i
)__builtin_ia32_pcmpgtw128 ((__v8hi
)__B
, (__v8hi
)__A
);
1333 static __inline __m128i
1334 _mm_cmplt_epi32 (__m128i __A
, __m128i __B
)
1336 return (__m128i
)__builtin_ia32_pcmpgtd128 ((__v4si
)__B
, (__v4si
)__A
);
1339 static __inline __m128i
1340 _mm_cmpgt_epi8 (__m128i __A
, __m128i __B
)
1342 return (__m128i
)__builtin_ia32_pcmpgtb128 ((__v16qi
)__A
, (__v16qi
)__B
);
1345 static __inline __m128i
1346 _mm_cmpgt_epi16 (__m128i __A
, __m128i __B
)
1348 return (__m128i
)__builtin_ia32_pcmpgtw128 ((__v8hi
)__A
, (__v8hi
)__B
);
1351 static __inline __m128i
1352 _mm_cmpgt_epi32 (__m128i __A
, __m128i __B
)
1354 return (__m128i
)__builtin_ia32_pcmpgtd128 ((__v4si
)__A
, (__v4si
)__B
);
/* Extract from / insert into the 16-bit elements of __A via the pextrw
   and pinsrw builtins.  These are macros because the element selector
   is handed to the builtin as written.  Arguments are parenthesized so
   that the vector cast covers the whole argument expression, and the
   expansions are wrapped so they behave as single primary
   expressions.  */
#define _mm_extract_epi16(__A, __B) \
  (__builtin_ia32_pextrw128 ((__v8hi)(__A), (__B)))

#define _mm_insert_epi16(__A, __B, __C) \
  ((__m128i)__builtin_ia32_pinsrw128 ((__v8hi)(__A), (__B), (__C)))
1361 static __inline __m128i
1362 _mm_max_epi16 (__m128i __A
, __m128i __B
)
1364 return (__m128i
)__builtin_ia32_pmaxsw128 ((__v8hi
)__A
, (__v8hi
)__B
);
1367 static __inline __m128i
1368 _mm_max_epu8 (__m128i __A
, __m128i __B
)
1370 return (__m128i
)__builtin_ia32_pmaxub128 ((__v16qi
)__A
, (__v16qi
)__B
);
1373 static __inline __m128i
1374 _mm_min_epi16 (__m128i __A
, __m128i __B
)
1376 return (__m128i
)__builtin_ia32_pminsw128 ((__v8hi
)__A
, (__v8hi
)__B
);
1379 static __inline __m128i
1380 _mm_min_epu8 (__m128i __A
, __m128i __B
)
1382 return (__m128i
)__builtin_ia32_pminub128 ((__v16qi
)__A
, (__v16qi
)__B
);
1386 _mm_movemask_epi8 (__m128i __A
)
1388 return __builtin_ia32_pmovmskb128 ((__v16qi
)__A
);
1391 static __inline __m128i
1392 _mm_mulhi_epu16 (__m128i __A
, __m128i __B
)
1394 return (__m128i
)__builtin_ia32_pmulhuw128 ((__v8hi
)__A
, (__v8hi
)__B
);
/* Integer shuffles via the pshufhw, pshuflw and pshufd builtins.  These
   are macros because the selector __B is handed to the builtin as
   written.  Arguments are parenthesized so that the vector cast covers
   the whole argument expression.  */
#define _mm_shufflehi_epi16(__A, __B) \
  ((__m128i)__builtin_ia32_pshufhw ((__v8hi)(__A), (__B)))
#define _mm_shufflelo_epi16(__A, __B) \
  ((__m128i)__builtin_ia32_pshuflw ((__v8hi)(__A), (__B)))
#define _mm_shuffle_epi32(__A, __B) \
  ((__m128i)__builtin_ia32_pshufd ((__v4si)(__A), (__B)))
1401 static __inline
void
1402 _mm_maskmoveu_si128 (__m128i __A
, __m128i __B
, char *__C
)
1404 __builtin_ia32_maskmovdqu ((__v16qi
)__A
, (__v16qi
)__B
, __C
);
1407 static __inline __m128i
1408 _mm_avg_epu8 (__m128i __A
, __m128i __B
)
1410 return (__m128i
)__builtin_ia32_pavgb128 ((__v16qi
)__A
, (__v16qi
)__B
);
1413 static __inline __m128i
1414 _mm_avg_epu16 (__m128i __A
, __m128i __B
)
1416 return (__m128i
)__builtin_ia32_pavgw128 ((__v8hi
)__A
, (__v8hi
)__B
);
1419 static __inline __m128i
1420 _mm_sad_epu8 (__m128i __A
, __m128i __B
)
1422 return (__m128i
)__builtin_ia32_psadbw128 ((__v16qi
)__A
, (__v16qi
)__B
);
1425 static __inline
void
1426 _mm_stream_si32 (int *__A
, int __B
)
1428 __builtin_ia32_movnti (__A
, __B
);
1431 static __inline
void
1432 _mm_stream_si128 (__m128i
*__A
, __m128i __B
)
1434 __builtin_ia32_movntdq ((__v2di
*)__A
, (__v2di
)__B
);
1437 static __inline
void
1438 _mm_stream_pd (double *__A
, __m128d __B
)
1440 __builtin_ia32_movntpd (__A
, (__v2df
)__B
);
1443 static __inline __m128i
1444 _mm_movpi64_epi64 (__m64 __A
)
1446 return (__m128i
)__builtin_ia32_movq2dq ((unsigned long long)__A
);
/* Invoke the clflush builtin on the address A.  The function returns
   nothing, so the call is a plain statement: ISO C does not allow
   `return' with an expression in a function returning void.  */
static __inline void
_mm_clflush (void const *__A)
{
  __builtin_ia32_clflush (__A);
}

/* Invoke the lfence builtin.
   NOTE(review): the function-name line was missing from the text this
   was restored from; the name is taken from the builtin it wraps --
   confirm against the upstream header.  */
static __inline void
_mm_lfence (void)
{
  __builtin_ia32_lfence ();
}

/* Invoke the mfence builtin.
   NOTE(review): name restored from the builtin, as for _mm_lfence.  */
static __inline void
_mm_mfence (void)
{
  __builtin_ia32_mfence ();
}
1467 static __inline __m128i
1468 _mm_cvtsi32_si128 (int __A
)
1470 return (__m128i
) __builtin_ia32_loadd (&__A
);
1474 static __inline __m128i
1475 _mm_cvtsi64x_si128 (long long __A
)
1477 return (__m128i
) __builtin_ia32_movq2dq (__A
);
1482 _mm_cvtsi128_si32 (__m128i __A
)
1485 __builtin_ia32_stored (&__tmp
, (__v4si
)__A
);
1490 static __inline
long long
1491 _mm_cvtsi128_si64x (__m128i __A
)
1493 return __builtin_ia32_movdq2q ((__v2di
)__A
);
1497 #endif /* __SSE2__ */
1499 #endif /* _EMMINTRIN_H_INCLUDED */