1 /*===---- emmintrin.h - SSE2 intrinsics ------------------------------------===
3 * Permission is hereby granted, free of charge, to any person obtaining a copy
4 * of this software and associated documentation files (the "Software"), to deal
5 * in the Software without restriction, including without limitation the rights
6 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 * copies of the Software, and to permit persons to whom the Software is
8 * furnished to do so, subject to the following conditions:
10 * The above copyright notice and this permission notice shall be included in
11 * all copies or substantial portions of the Software.
13 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 *
 *===-----------------------------------------------------------------------===
 */
27 #include <xmmintrin.h>
/* Public 128-bit vector types exposed by this header. */
typedef double __m128d __attribute__((__vector_size__(16)));
typedef long long __m128i __attribute__((__vector_size__(16)));

/* Internal lane views: floating-point and signed integer elements. */
typedef double __v2df __attribute__((__vector_size__(16)));
typedef long long __v2di __attribute__((__vector_size__(16)));
typedef short __v8hi __attribute__((__vector_size__(16)));
typedef char __v16qi __attribute__((__vector_size__(16)));

/* Internal lane views: unsigned integer elements. */
typedef unsigned long long __v2du __attribute__((__vector_size__(16)));
typedef unsigned short __v8hu __attribute__((__vector_size__(16)));
typedef unsigned char __v16qu __attribute__((__vector_size__(16)));

/* An explicitly signed byte view is needed in addition to the plain char one.
 * This type is internal and should not appear in the interface. */
typedef signed char __v16qs __attribute__((__vector_size__(16)));
47 #include <f16cintrin.h>
49 /* Define the default attributes for the functions in this file. */
/* Bundles three attributes: __always_inline__, __nodebug__ and
 * __target__("sse2"). Every function in this file that is declared with
 * __DEFAULT_FN_ATTRS receives all three. */
50 #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse2")))
/// \brief Scalar double-precision add taking two [2 x double] operands and
///    returning a [2 x double] vector.
/// NOTE(review): the function body is missing from this listing. Presumably
/// only the low elements are added and the upper element of __a is passed
/// through -- confirm against the complete header.
52 static __inline__ __m128d __DEFAULT_FN_ATTRS
53 _mm_add_sd(__m128d __a
, __m128d __b
)
59 static __inline__ __m128d __DEFAULT_FN_ATTRS
60 _mm_add_pd(__m128d __a
, __m128d __b
)
62 return (__m128d
)((__v2df
)__a
+ (__v2df
)__b
);
/// \brief Scalar double-precision subtract taking two [2 x double] operands
///    and returning a [2 x double] vector.
/// NOTE(review): the function body is missing from this listing. Presumably
/// only the low elements are subtracted and the upper element of __a is
/// passed through -- confirm against the complete header.
65 static __inline__ __m128d __DEFAULT_FN_ATTRS
66 _mm_sub_sd(__m128d __a
, __m128d __b
)
72 static __inline__ __m128d __DEFAULT_FN_ATTRS
73 _mm_sub_pd(__m128d __a
, __m128d __b
)
75 return (__m128d
)((__v2df
)__a
- (__v2df
)__b
);
/// \brief Scalar double-precision multiply taking two [2 x double] operands
///    and returning a [2 x double] vector.
/// NOTE(review): the function body is missing from this listing. Presumably
/// only the low elements are multiplied and the upper element of __a is
/// passed through -- confirm against the complete header.
78 static __inline__ __m128d __DEFAULT_FN_ATTRS
79 _mm_mul_sd(__m128d __a
, __m128d __b
)
85 static __inline__ __m128d __DEFAULT_FN_ATTRS
86 _mm_mul_pd(__m128d __a
, __m128d __b
)
88 return (__m128d
)((__v2df
)__a
* (__v2df
)__b
);
/// \brief Scalar double-precision divide taking two [2 x double] operands and
///    returning a [2 x double] vector.
/// NOTE(review): the function body is missing from this listing. Presumably
/// only the low elements are divided and the upper element of __a is passed
/// through -- confirm against the complete header.
91 static __inline__ __m128d __DEFAULT_FN_ATTRS
92 _mm_div_sd(__m128d __a
, __m128d __b
)
98 static __inline__ __m128d __DEFAULT_FN_ATTRS
99 _mm_div_pd(__m128d __a
, __m128d __b
)
101 return (__m128d
)((__v2df
)__a
/ (__v2df
)__b
);
104 static __inline__ __m128d __DEFAULT_FN_ATTRS
105 _mm_sqrt_sd(__m128d __a
, __m128d __b
)
107 __m128d __c
= __builtin_ia32_sqrtsd((__v2df
)__b
);
108 return (__m128d
) { __c
[0], __a
[1] };
111 static __inline__ __m128d __DEFAULT_FN_ATTRS
112 _mm_sqrt_pd(__m128d __a
)
114 return __builtin_ia32_sqrtpd((__v2df
)__a
);
117 static __inline__ __m128d __DEFAULT_FN_ATTRS
118 _mm_min_sd(__m128d __a
, __m128d __b
)
120 return __builtin_ia32_minsd((__v2df
)__a
, (__v2df
)__b
);
123 static __inline__ __m128d __DEFAULT_FN_ATTRS
124 _mm_min_pd(__m128d __a
, __m128d __b
)
126 return __builtin_ia32_minpd((__v2df
)__a
, (__v2df
)__b
);
129 static __inline__ __m128d __DEFAULT_FN_ATTRS
130 _mm_max_sd(__m128d __a
, __m128d __b
)
132 return __builtin_ia32_maxsd((__v2df
)__a
, (__v2df
)__b
);
135 static __inline__ __m128d __DEFAULT_FN_ATTRS
136 _mm_max_pd(__m128d __a
, __m128d __b
)
138 return __builtin_ia32_maxpd((__v2df
)__a
, (__v2df
)__b
);
141 static __inline__ __m128d __DEFAULT_FN_ATTRS
142 _mm_and_pd(__m128d __a
, __m128d __b
)
144 return (__m128d
)((__v4su
)__a
& (__v4su
)__b
);
147 static __inline__ __m128d __DEFAULT_FN_ATTRS
148 _mm_andnot_pd(__m128d __a
, __m128d __b
)
150 return (__m128d
)(~(__v4su
)__a
& (__v4su
)__b
);
153 static __inline__ __m128d __DEFAULT_FN_ATTRS
154 _mm_or_pd(__m128d __a
, __m128d __b
)
156 return (__m128d
)((__v4su
)__a
| (__v4su
)__b
);
159 static __inline__ __m128d __DEFAULT_FN_ATTRS
160 _mm_xor_pd(__m128d __a
, __m128d __b
)
162 return (__m128d
)((__v4su
)__a
^ (__v4su
)__b
);
165 static __inline__ __m128d __DEFAULT_FN_ATTRS
166 _mm_cmpeq_pd(__m128d __a
, __m128d __b
)
168 return (__m128d
)__builtin_ia32_cmpeqpd((__v2df
)__a
, (__v2df
)__b
);
171 static __inline__ __m128d __DEFAULT_FN_ATTRS
172 _mm_cmplt_pd(__m128d __a
, __m128d __b
)
174 return (__m128d
)__builtin_ia32_cmpltpd((__v2df
)__a
, (__v2df
)__b
);
177 static __inline__ __m128d __DEFAULT_FN_ATTRS
178 _mm_cmple_pd(__m128d __a
, __m128d __b
)
180 return (__m128d
)__builtin_ia32_cmplepd((__v2df
)__a
, (__v2df
)__b
);
183 static __inline__ __m128d __DEFAULT_FN_ATTRS
184 _mm_cmpgt_pd(__m128d __a
, __m128d __b
)
186 return (__m128d
)__builtin_ia32_cmpltpd((__v2df
)__b
, (__v2df
)__a
);
189 static __inline__ __m128d __DEFAULT_FN_ATTRS
190 _mm_cmpge_pd(__m128d __a
, __m128d __b
)
192 return (__m128d
)__builtin_ia32_cmplepd((__v2df
)__b
, (__v2df
)__a
);
195 static __inline__ __m128d __DEFAULT_FN_ATTRS
196 _mm_cmpord_pd(__m128d __a
, __m128d __b
)
198 return (__m128d
)__builtin_ia32_cmpordpd((__v2df
)__a
, (__v2df
)__b
);
201 static __inline__ __m128d __DEFAULT_FN_ATTRS
202 _mm_cmpunord_pd(__m128d __a
, __m128d __b
)
204 return (__m128d
)__builtin_ia32_cmpunordpd((__v2df
)__a
, (__v2df
)__b
);
207 static __inline__ __m128d __DEFAULT_FN_ATTRS
208 _mm_cmpneq_pd(__m128d __a
, __m128d __b
)
210 return (__m128d
)__builtin_ia32_cmpneqpd((__v2df
)__a
, (__v2df
)__b
);
213 static __inline__ __m128d __DEFAULT_FN_ATTRS
214 _mm_cmpnlt_pd(__m128d __a
, __m128d __b
)
216 return (__m128d
)__builtin_ia32_cmpnltpd((__v2df
)__a
, (__v2df
)__b
);
219 static __inline__ __m128d __DEFAULT_FN_ATTRS
220 _mm_cmpnle_pd(__m128d __a
, __m128d __b
)
222 return (__m128d
)__builtin_ia32_cmpnlepd((__v2df
)__a
, (__v2df
)__b
);
225 static __inline__ __m128d __DEFAULT_FN_ATTRS
226 _mm_cmpngt_pd(__m128d __a
, __m128d __b
)
228 return (__m128d
)__builtin_ia32_cmpnltpd((__v2df
)__b
, (__v2df
)__a
);
231 static __inline__ __m128d __DEFAULT_FN_ATTRS
232 _mm_cmpnge_pd(__m128d __a
, __m128d __b
)
234 return (__m128d
)__builtin_ia32_cmpnlepd((__v2df
)__b
, (__v2df
)__a
);
237 static __inline__ __m128d __DEFAULT_FN_ATTRS
238 _mm_cmpeq_sd(__m128d __a
, __m128d __b
)
240 return (__m128d
)__builtin_ia32_cmpeqsd((__v2df
)__a
, (__v2df
)__b
);
243 static __inline__ __m128d __DEFAULT_FN_ATTRS
244 _mm_cmplt_sd(__m128d __a
, __m128d __b
)
246 return (__m128d
)__builtin_ia32_cmpltsd((__v2df
)__a
, (__v2df
)__b
);
249 static __inline__ __m128d __DEFAULT_FN_ATTRS
250 _mm_cmple_sd(__m128d __a
, __m128d __b
)
252 return (__m128d
)__builtin_ia32_cmplesd((__v2df
)__a
, (__v2df
)__b
);
255 static __inline__ __m128d __DEFAULT_FN_ATTRS
256 _mm_cmpgt_sd(__m128d __a
, __m128d __b
)
258 __m128d __c
= __builtin_ia32_cmpltsd((__v2df
)__b
, (__v2df
)__a
);
259 return (__m128d
) { __c
[0], __a
[1] };
262 static __inline__ __m128d __DEFAULT_FN_ATTRS
263 _mm_cmpge_sd(__m128d __a
, __m128d __b
)
265 __m128d __c
= __builtin_ia32_cmplesd((__v2df
)__b
, (__v2df
)__a
);
266 return (__m128d
) { __c
[0], __a
[1] };
269 static __inline__ __m128d __DEFAULT_FN_ATTRS
270 _mm_cmpord_sd(__m128d __a
, __m128d __b
)
272 return (__m128d
)__builtin_ia32_cmpordsd((__v2df
)__a
, (__v2df
)__b
);
275 static __inline__ __m128d __DEFAULT_FN_ATTRS
276 _mm_cmpunord_sd(__m128d __a
, __m128d __b
)
278 return (__m128d
)__builtin_ia32_cmpunordsd((__v2df
)__a
, (__v2df
)__b
);
281 static __inline__ __m128d __DEFAULT_FN_ATTRS
282 _mm_cmpneq_sd(__m128d __a
, __m128d __b
)
284 return (__m128d
)__builtin_ia32_cmpneqsd((__v2df
)__a
, (__v2df
)__b
);
287 static __inline__ __m128d __DEFAULT_FN_ATTRS
288 _mm_cmpnlt_sd(__m128d __a
, __m128d __b
)
290 return (__m128d
)__builtin_ia32_cmpnltsd((__v2df
)__a
, (__v2df
)__b
);
293 static __inline__ __m128d __DEFAULT_FN_ATTRS
294 _mm_cmpnle_sd(__m128d __a
, __m128d __b
)
296 return (__m128d
)__builtin_ia32_cmpnlesd((__v2df
)__a
, (__v2df
)__b
);
299 static __inline__ __m128d __DEFAULT_FN_ATTRS
300 _mm_cmpngt_sd(__m128d __a
, __m128d __b
)
302 __m128d __c
= __builtin_ia32_cmpnltsd((__v2df
)__b
, (__v2df
)__a
);
303 return (__m128d
) { __c
[0], __a
[1] };
306 static __inline__ __m128d __DEFAULT_FN_ATTRS
307 _mm_cmpnge_sd(__m128d __a
, __m128d __b
)
309 __m128d __c
= __builtin_ia32_cmpnlesd((__v2df
)__b
, (__v2df
)__a
);
310 return (__m128d
) { __c
[0], __a
[1] };
313 static __inline__
int __DEFAULT_FN_ATTRS
314 _mm_comieq_sd(__m128d __a
, __m128d __b
)
316 return __builtin_ia32_comisdeq((__v2df
)__a
, (__v2df
)__b
);
319 static __inline__
int __DEFAULT_FN_ATTRS
320 _mm_comilt_sd(__m128d __a
, __m128d __b
)
322 return __builtin_ia32_comisdlt((__v2df
)__a
, (__v2df
)__b
);
325 static __inline__
int __DEFAULT_FN_ATTRS
326 _mm_comile_sd(__m128d __a
, __m128d __b
)
328 return __builtin_ia32_comisdle((__v2df
)__a
, (__v2df
)__b
);
331 static __inline__
int __DEFAULT_FN_ATTRS
332 _mm_comigt_sd(__m128d __a
, __m128d __b
)
334 return __builtin_ia32_comisdgt((__v2df
)__a
, (__v2df
)__b
);
337 static __inline__
int __DEFAULT_FN_ATTRS
338 _mm_comige_sd(__m128d __a
, __m128d __b
)
340 return __builtin_ia32_comisdge((__v2df
)__a
, (__v2df
)__b
);
343 static __inline__
int __DEFAULT_FN_ATTRS
344 _mm_comineq_sd(__m128d __a
, __m128d __b
)
346 return __builtin_ia32_comisdneq((__v2df
)__a
, (__v2df
)__b
);
349 static __inline__
int __DEFAULT_FN_ATTRS
350 _mm_ucomieq_sd(__m128d __a
, __m128d __b
)
352 return __builtin_ia32_ucomisdeq((__v2df
)__a
, (__v2df
)__b
);
355 static __inline__
int __DEFAULT_FN_ATTRS
356 _mm_ucomilt_sd(__m128d __a
, __m128d __b
)
358 return __builtin_ia32_ucomisdlt((__v2df
)__a
, (__v2df
)__b
);
361 static __inline__
int __DEFAULT_FN_ATTRS
362 _mm_ucomile_sd(__m128d __a
, __m128d __b
)
364 return __builtin_ia32_ucomisdle((__v2df
)__a
, (__v2df
)__b
);
367 static __inline__
int __DEFAULT_FN_ATTRS
368 _mm_ucomigt_sd(__m128d __a
, __m128d __b
)
370 return __builtin_ia32_ucomisdgt((__v2df
)__a
, (__v2df
)__b
);
373 static __inline__
int __DEFAULT_FN_ATTRS
374 _mm_ucomige_sd(__m128d __a
, __m128d __b
)
376 return __builtin_ia32_ucomisdge((__v2df
)__a
, (__v2df
)__b
);
379 static __inline__
int __DEFAULT_FN_ATTRS
380 _mm_ucomineq_sd(__m128d __a
, __m128d __b
)
382 return __builtin_ia32_ucomisdneq((__v2df
)__a
, (__v2df
)__b
);
385 static __inline__ __m128 __DEFAULT_FN_ATTRS
386 _mm_cvtpd_ps(__m128d __a
)
388 return __builtin_ia32_cvtpd2ps((__v2df
)__a
);
391 static __inline__ __m128d __DEFAULT_FN_ATTRS
392 _mm_cvtps_pd(__m128 __a
)
394 return (__m128d
) __builtin_convertvector(
395 __builtin_shufflevector((__v4sf
)__a
, (__v4sf
)__a
, 0, 1), __v2df
);
398 static __inline__ __m128d __DEFAULT_FN_ATTRS
399 _mm_cvtepi32_pd(__m128i __a
)
401 return (__m128d
) __builtin_convertvector(
402 __builtin_shufflevector((__v4si
)__a
, (__v4si
)__a
, 0, 1), __v2df
);
405 static __inline__ __m128i __DEFAULT_FN_ATTRS
406 _mm_cvtpd_epi32(__m128d __a
)
408 return __builtin_ia32_cvtpd2dq((__v2df
)__a
);
411 static __inline__
int __DEFAULT_FN_ATTRS
412 _mm_cvtsd_si32(__m128d __a
)
414 return __builtin_ia32_cvtsd2si((__v2df
)__a
);
/// \brief Takes a __m128 (__a) and a __m128d (__b) and returns a __m128.
/// NOTE(review): the function body is missing from this listing. Presumably
/// the low double of __b is converted to float and placed in element 0 of
/// __a -- confirm against the complete header.
417 static __inline__ __m128 __DEFAULT_FN_ATTRS
418 _mm_cvtsd_ss(__m128 __a
, __m128d __b
)
/// \brief Takes a __m128d (__a) and an int (__b) and returns a __m128d.
/// NOTE(review): the function body is missing from this listing. Presumably
/// __b is converted to double and placed in element 0 of __a -- confirm
/// against the complete header.
424 static __inline__ __m128d __DEFAULT_FN_ATTRS
425 _mm_cvtsi32_sd(__m128d __a
, int __b
)
/// \brief Takes a __m128d (__a) and a __m128 (__b) and returns a __m128d.
/// NOTE(review): the function body is missing from this listing. Presumably
/// the low float of __b is converted to double and placed in element 0 of
/// __a -- confirm against the complete header.
431 static __inline__ __m128d __DEFAULT_FN_ATTRS
432 _mm_cvtss_sd(__m128d __a
, __m128 __b
)
438 static __inline__ __m128i __DEFAULT_FN_ATTRS
439 _mm_cvttpd_epi32(__m128d __a
)
441 return (__m128i
)__builtin_ia32_cvttpd2dq((__v2df
)__a
);
/// \brief Takes a __m128d and returns an int.
/// NOTE(review): the function body is missing from this listing. Presumably
/// the low double of __a is converted to int with truncation -- confirm
/// against the complete header.
444 static __inline__
int __DEFAULT_FN_ATTRS
445 _mm_cvttsd_si32(__m128d __a
)
450 static __inline__ __m64 __DEFAULT_FN_ATTRS
451 _mm_cvtpd_pi32(__m128d __a
)
453 return (__m64
)__builtin_ia32_cvtpd2pi((__v2df
)__a
);
456 static __inline__ __m64 __DEFAULT_FN_ATTRS
457 _mm_cvttpd_pi32(__m128d __a
)
459 return (__m64
)__builtin_ia32_cvttpd2pi((__v2df
)__a
);
462 static __inline__ __m128d __DEFAULT_FN_ATTRS
463 _mm_cvtpi32_pd(__m64 __a
)
465 return __builtin_ia32_cvtpi2pd((__v2si
)__a
);
/// \brief Takes a __m128d and returns a double.
/// NOTE(review): the function body is missing from this listing. Presumably
/// element 0 of __a is returned -- confirm against the complete header.
468 static __inline__
double __DEFAULT_FN_ATTRS
469 _mm_cvtsd_f64(__m128d __a
)
474 static __inline__ __m128d __DEFAULT_FN_ATTRS
475 _mm_load_pd(double const *__dp
)
477 return *(__m128d
*)__dp
;
480 static __inline__ __m128d __DEFAULT_FN_ATTRS
481 _mm_load1_pd(double const *__dp
)
483 struct __mm_load1_pd_struct
{
485 } __attribute__((__packed__
, __may_alias__
));
486 double __u
= ((struct __mm_load1_pd_struct
*)__dp
)->__u
;
487 return (__m128d
){ __u
, __u
};
/// \brief Alternate spelling of _mm_load1_pd(): expands to a call of that
///    function with the same argument.
490 #define _mm_load_pd1(dp) _mm_load1_pd(dp)
492 static __inline__ __m128d __DEFAULT_FN_ATTRS
493 _mm_loadr_pd(double const *__dp
)
495 __m128d __u
= *(__m128d
*)__dp
;
496 return __builtin_shufflevector((__v2df
)__u
, (__v2df
)__u
, 1, 0);
499 static __inline__ __m128d __DEFAULT_FN_ATTRS
500 _mm_loadu_pd(double const *__dp
)
504 } __attribute__((__packed__
, __may_alias__
));
505 return ((struct __loadu_pd
*)__dp
)->__v
;
508 static __inline__ __m128i __DEFAULT_FN_ATTRS
509 _mm_loadu_si64(void const *__a
)
511 struct __loadu_si64
{
513 } __attribute__((__packed__
, __may_alias__
));
514 long long __u
= ((struct __loadu_si64
*)__a
)->__v
;
515 return (__m128i
){__u
, 0L};
518 static __inline__ __m128d __DEFAULT_FN_ATTRS
519 _mm_load_sd(double const *__dp
)
521 struct __mm_load_sd_struct
{
523 } __attribute__((__packed__
, __may_alias__
));
524 double __u
= ((struct __mm_load_sd_struct
*)__dp
)->__u
;
525 return (__m128d
){ __u
, 0 };
528 static __inline__ __m128d __DEFAULT_FN_ATTRS
529 _mm_loadh_pd(__m128d __a
, double const *__dp
)
531 struct __mm_loadh_pd_struct
{
533 } __attribute__((__packed__
, __may_alias__
));
534 double __u
= ((struct __mm_loadh_pd_struct
*)__dp
)->__u
;
535 return (__m128d
){ __a
[0], __u
};
538 static __inline__ __m128d __DEFAULT_FN_ATTRS
539 _mm_loadl_pd(__m128d __a
, double const *__dp
)
541 struct __mm_loadl_pd_struct
{
543 } __attribute__((__packed__
, __may_alias__
));
544 double __u
= ((struct __mm_loadl_pd_struct
*)__dp
)->__u
;
545 return (__m128d
){ __u
, __a
[1] };
548 static __inline__ __m128d __DEFAULT_FN_ATTRS
549 _mm_undefined_pd(void)
551 return (__m128d
)__builtin_ia32_undef128();
554 static __inline__ __m128d __DEFAULT_FN_ATTRS
555 _mm_set_sd(double __w
)
557 return (__m128d
){ __w
, 0 };
560 static __inline__ __m128d __DEFAULT_FN_ATTRS
561 _mm_set1_pd(double __w
)
563 return (__m128d
){ __w
, __w
};
566 static __inline__ __m128d __DEFAULT_FN_ATTRS
567 _mm_set_pd(double __w
, double __x
)
569 return (__m128d
){ __x
, __w
};
572 static __inline__ __m128d __DEFAULT_FN_ATTRS
573 _mm_setr_pd(double __w
, double __x
)
575 return (__m128d
){ __w
, __x
};
/// \brief Returns a [2 x double] vector with both elements set to zero.
/// NOTE(review): the line carrying the function name and parameter list is
/// missing from this listing; presumably this is _mm_setzero_pd(void) --
/// confirm against the complete header.
578 static __inline__ __m128d __DEFAULT_FN_ATTRS
581 return (__m128d
){ 0, 0 };
584 static __inline__ __m128d __DEFAULT_FN_ATTRS
585 _mm_move_sd(__m128d __a
, __m128d __b
)
587 return (__m128d
){ __b
[0], __a
[1] };
590 static __inline__
void __DEFAULT_FN_ATTRS
591 _mm_store_sd(double *__dp
, __m128d __a
)
593 struct __mm_store_sd_struct
{
595 } __attribute__((__packed__
, __may_alias__
));
596 ((struct __mm_store_sd_struct
*)__dp
)->__u
= __a
[0];
599 static __inline__
void __DEFAULT_FN_ATTRS
600 _mm_store_pd(double *__dp
, __m128d __a
)
602 *(__m128d
*)__dp
= __a
;
605 static __inline__
void __DEFAULT_FN_ATTRS
606 _mm_store1_pd(double *__dp
, __m128d __a
)
608 __a
= __builtin_shufflevector((__v2df
)__a
, (__v2df
)__a
, 0, 0);
609 _mm_store_pd(__dp
, __a
);
612 static __inline__
void __DEFAULT_FN_ATTRS
613 _mm_store_pd1(double *__dp
, __m128d __a
)
615 return _mm_store1_pd(__dp
, __a
);
618 static __inline__
void __DEFAULT_FN_ATTRS
619 _mm_storeu_pd(double *__dp
, __m128d __a
)
623 } __attribute__((__packed__
, __may_alias__
));
624 ((struct __storeu_pd
*)__dp
)->__v
= __a
;
627 static __inline__
void __DEFAULT_FN_ATTRS
628 _mm_storer_pd(double *__dp
, __m128d __a
)
630 __a
= __builtin_shufflevector((__v2df
)__a
, (__v2df
)__a
, 1, 0);
631 *(__m128d
*)__dp
= __a
;
634 static __inline__
void __DEFAULT_FN_ATTRS
635 _mm_storeh_pd(double *__dp
, __m128d __a
)
637 struct __mm_storeh_pd_struct
{
639 } __attribute__((__packed__
, __may_alias__
));
640 ((struct __mm_storeh_pd_struct
*)__dp
)->__u
= __a
[1];
643 static __inline__
void __DEFAULT_FN_ATTRS
644 _mm_storel_pd(double *__dp
, __m128d __a
)
646 struct __mm_storeh_pd_struct
{
648 } __attribute__((__packed__
, __may_alias__
));
649 ((struct __mm_storeh_pd_struct
*)__dp
)->__u
= __a
[0];
652 static __inline__ __m128i __DEFAULT_FN_ATTRS
653 _mm_add_epi8(__m128i __a
, __m128i __b
)
655 return (__m128i
)((__v16qu
)__a
+ (__v16qu
)__b
);
658 static __inline__ __m128i __DEFAULT_FN_ATTRS
659 _mm_add_epi16(__m128i __a
, __m128i __b
)
661 return (__m128i
)((__v8hu
)__a
+ (__v8hu
)__b
);
664 static __inline__ __m128i __DEFAULT_FN_ATTRS
665 _mm_add_epi32(__m128i __a
, __m128i __b
)
667 return (__m128i
)((__v4su
)__a
+ (__v4su
)__b
);
670 static __inline__ __m64 __DEFAULT_FN_ATTRS
671 _mm_add_si64(__m64 __a
, __m64 __b
)
673 return (__m64
)__builtin_ia32_paddq((__v1di
)__a
, (__v1di
)__b
);
676 static __inline__ __m128i __DEFAULT_FN_ATTRS
677 _mm_add_epi64(__m128i __a
, __m128i __b
)
679 return (__m128i
)((__v2du
)__a
+ (__v2du
)__b
);
682 static __inline__ __m128i __DEFAULT_FN_ATTRS
683 _mm_adds_epi8(__m128i __a
, __m128i __b
)
685 return (__m128i
)__builtin_ia32_paddsb128((__v16qi
)__a
, (__v16qi
)__b
);
688 static __inline__ __m128i __DEFAULT_FN_ATTRS
689 _mm_adds_epi16(__m128i __a
, __m128i __b
)
691 return (__m128i
)__builtin_ia32_paddsw128((__v8hi
)__a
, (__v8hi
)__b
);
694 static __inline__ __m128i __DEFAULT_FN_ATTRS
695 _mm_adds_epu8(__m128i __a
, __m128i __b
)
697 return (__m128i
)__builtin_ia32_paddusb128((__v16qi
)__a
, (__v16qi
)__b
);
700 static __inline__ __m128i __DEFAULT_FN_ATTRS
701 _mm_adds_epu16(__m128i __a
, __m128i __b
)
703 return (__m128i
)__builtin_ia32_paddusw128((__v8hi
)__a
, (__v8hi
)__b
);
706 static __inline__ __m128i __DEFAULT_FN_ATTRS
707 _mm_avg_epu8(__m128i __a
, __m128i __b
)
709 return (__m128i
)__builtin_ia32_pavgb128((__v16qi
)__a
, (__v16qi
)__b
);
712 static __inline__ __m128i __DEFAULT_FN_ATTRS
713 _mm_avg_epu16(__m128i __a
, __m128i __b
)
715 return (__m128i
)__builtin_ia32_pavgw128((__v8hi
)__a
, (__v8hi
)__b
);
718 static __inline__ __m128i __DEFAULT_FN_ATTRS
719 _mm_madd_epi16(__m128i __a
, __m128i __b
)
721 return (__m128i
)__builtin_ia32_pmaddwd128((__v8hi
)__a
, (__v8hi
)__b
);
724 static __inline__ __m128i __DEFAULT_FN_ATTRS
725 _mm_max_epi16(__m128i __a
, __m128i __b
)
727 return (__m128i
)__builtin_ia32_pmaxsw128((__v8hi
)__a
, (__v8hi
)__b
);
730 static __inline__ __m128i __DEFAULT_FN_ATTRS
731 _mm_max_epu8(__m128i __a
, __m128i __b
)
733 return (__m128i
)__builtin_ia32_pmaxub128((__v16qi
)__a
, (__v16qi
)__b
);
736 static __inline__ __m128i __DEFAULT_FN_ATTRS
737 _mm_min_epi16(__m128i __a
, __m128i __b
)
739 return (__m128i
)__builtin_ia32_pminsw128((__v8hi
)__a
, (__v8hi
)__b
);
742 static __inline__ __m128i __DEFAULT_FN_ATTRS
743 _mm_min_epu8(__m128i __a
, __m128i __b
)
745 return (__m128i
)__builtin_ia32_pminub128((__v16qi
)__a
, (__v16qi
)__b
);
748 static __inline__ __m128i __DEFAULT_FN_ATTRS
749 _mm_mulhi_epi16(__m128i __a
, __m128i __b
)
751 return (__m128i
)__builtin_ia32_pmulhw128((__v8hi
)__a
, (__v8hi
)__b
);
754 static __inline__ __m128i __DEFAULT_FN_ATTRS
755 _mm_mulhi_epu16(__m128i __a
, __m128i __b
)
757 return (__m128i
)__builtin_ia32_pmulhuw128((__v8hi
)__a
, (__v8hi
)__b
);
760 /// \brief Multiplies the corresponding elements of two [8 x short] vectors and
761 /// returns a vector containing the low-order 16 bits of each 32-bit product
762 /// in the corresponding element.
764 /// \headerfile <x86intrin.h>
766 /// This intrinsic corresponds to the \c VPMULLW / PMULLW instruction.
769 /// A 128-bit integer vector containing one of the source operands.
771 /// A 128-bit integer vector containing one of the source operands.
772 /// \returns A 128-bit integer vector containing the products of both operands.
773 static __inline__ __m128i __DEFAULT_FN_ATTRS
774 _mm_mullo_epi16(__m128i __a
, __m128i __b
)
776 return (__m128i
)((__v8hu
)__a
* (__v8hu
)__b
);
779 /// \brief Multiplies 32-bit unsigned integer values contained in the lower bits
780 /// of the two 64-bit integer vectors and returns the 64-bit unsigned
783 /// \headerfile <x86intrin.h>
785 /// This intrinsic corresponds to the \c PMULUDQ instruction.
788 /// A 64-bit integer containing one of the source operands.
790 /// A 64-bit integer containing one of the source operands.
791 /// \returns A 64-bit integer vector containing the product of both operands.
792 static __inline__ __m64 __DEFAULT_FN_ATTRS
793 _mm_mul_su32(__m64 __a
, __m64 __b
)
795 return __builtin_ia32_pmuludq((__v2si
)__a
, (__v2si
)__b
);
798 /// \brief Multiplies 32-bit unsigned integer values contained in the lower
799 /// bits of the corresponding elements of two [2 x i64] vectors, and returns
800 /// the 64-bit products in the corresponding elements of a [2 x i64] vector.
802 /// \headerfile <x86intrin.h>
804 /// This intrinsic corresponds to the \c VPMULUDQ / PMULUDQ instruction.
807 /// A [2 x i64] vector containing one of the source operands.
809 /// A [2 x i64] vector containing one of the source operands.
810 /// \returns A [2 x i64] vector containing the product of both operands.
811 static __inline__ __m128i __DEFAULT_FN_ATTRS
812 _mm_mul_epu32(__m128i __a
, __m128i __b
)
814 return __builtin_ia32_pmuludq128((__v4si
)__a
, (__v4si
)__b
);
817 /// \brief Computes the absolute differences of corresponding 8-bit integer
818 /// values in two 128-bit vectors. Sums the first 8 absolute differences, and
819 /// separately sums the second 8 absolute differences. Packss these two
820 /// unsigned 16-bit integer sums into the upper and lower elements of a
821 /// [2 x i64] vector.
823 /// \headerfile <x86intrin.h>
825 /// This intrinsic corresponds to the \c VPSADBW / PSADBW instruction.
828 /// A 128-bit integer vector containing one of the source operands.
830 /// A 128-bit integer vector containing one of the source operands.
831 /// \returns A [2 x i64] vector containing the sums of the sets of absolute
832 /// differences between both operands.
833 static __inline__ __m128i __DEFAULT_FN_ATTRS
834 _mm_sad_epu8(__m128i __a
, __m128i __b
)
836 return __builtin_ia32_psadbw128((__v16qi
)__a
, (__v16qi
)__b
);
839 /// \brief Subtracts the corresponding 8-bit integer values in the operands.
841 /// \headerfile <x86intrin.h>
843 /// This intrinsic corresponds to the \c VPSUBB / PSUBB instruction.
846 /// A 128-bit integer vector containing the minuends.
848 /// A 128-bit integer vector containing the subtrahends.
849 /// \returns A 128-bit integer vector containing the differences of the values
851 static __inline__ __m128i __DEFAULT_FN_ATTRS
852 _mm_sub_epi8(__m128i __a
, __m128i __b
)
854 return (__m128i
)((__v16qu
)__a
- (__v16qu
)__b
);
857 /// \brief Subtracts the corresponding 16-bit integer values in the operands.
859 /// \headerfile <x86intrin.h>
861 /// This intrinsic corresponds to the \c VPSUBW / PSUBW instruction.
864 /// A 128-bit integer vector containing the minuends.
866 /// A 128-bit integer vector containing the subtrahends.
867 /// \returns A 128-bit integer vector containing the differences of the values
869 static __inline__ __m128i __DEFAULT_FN_ATTRS
870 _mm_sub_epi16(__m128i __a
, __m128i __b
)
872 return (__m128i
)((__v8hu
)__a
- (__v8hu
)__b
);
875 /// \brief Subtracts the corresponding 32-bit integer values in the operands.
877 /// \headerfile <x86intrin.h>
879 /// This intrinsic corresponds to the \c VPSUBD / PSUBD instruction.
882 /// A 128-bit integer vector containing the minuends.
884 /// A 128-bit integer vector containing the subtrahends.
885 /// \returns A 128-bit integer vector containing the differences of the values
887 static __inline__ __m128i __DEFAULT_FN_ATTRS
888 _mm_sub_epi32(__m128i __a
, __m128i __b
)
890 return (__m128i
)((__v4su
)__a
- (__v4su
)__b
);
893 /// \brief Subtracts signed or unsigned 64-bit integer values and writes the
894 /// difference to the corresponding bits in the destination.
896 /// \headerfile <x86intrin.h>
898 /// This intrinsic corresponds to the \c PSUBQ instruction.
901 /// A 64-bit integer vector containing the minuend.
903 /// A 64-bit integer vector containing the subtrahend.
904 /// \returns A 64-bit integer vector containing the difference of the values in
906 static __inline__ __m64 __DEFAULT_FN_ATTRS
907 _mm_sub_si64(__m64 __a
, __m64 __b
)
909 return (__m64
)__builtin_ia32_psubq((__v1di
)__a
, (__v1di
)__b
);
912 /// \brief Subtracts the corresponding elements of two [2 x i64] vectors.
914 /// \headerfile <x86intrin.h>
916 /// This intrinsic corresponds to the \c VPSUBQ / PSUBQ instruction.
919 /// A 128-bit integer vector containing the minuends.
921 /// A 128-bit integer vector containing the subtrahends.
922 /// \returns A 128-bit integer vector containing the differences of the values
924 static __inline__ __m128i __DEFAULT_FN_ATTRS
925 _mm_sub_epi64(__m128i __a
, __m128i __b
)
927 return (__m128i
)((__v2du
)__a
- (__v2du
)__b
);
930 /// \brief Subtracts corresponding 8-bit signed integer values in the input and
931 /// returns the differences in the corresponding bytes in the destination.
932 /// Differences greater than 7Fh are saturated to 7Fh, and differences less
933 /// than 80h are saturated to 80h.
935 /// \headerfile <x86intrin.h>
937 /// This intrinsic corresponds to the \c VPSUBSB / PSUBSB instruction.
940 /// A 128-bit integer vector containing the minuends.
942 /// A 128-bit integer vector containing the subtrahends.
943 /// \returns A 128-bit integer vector containing the differences of the values
945 static __inline__ __m128i __DEFAULT_FN_ATTRS
946 _mm_subs_epi8(__m128i __a
, __m128i __b
)
948 return (__m128i
)__builtin_ia32_psubsb128((__v16qi
)__a
, (__v16qi
)__b
);
951 /// \brief Subtracts corresponding 16-bit signed integer values in the input and
952 /// returns the differences in the corresponding bytes in the destination.
953 /// Differences greater than 7FFFh are saturated to 7FFFh, and values less
954 /// than 8000h are saturated to 8000h.
956 /// \headerfile <x86intrin.h>
958 /// This intrinsic corresponds to the \c VPSUBSW / PSUBSW instruction.
961 /// A 128-bit integer vector containing the minuends.
963 /// A 128-bit integer vector containing the subtrahends.
964 /// \returns A 128-bit integer vector containing the differences of the values
966 static __inline__ __m128i __DEFAULT_FN_ATTRS
967 _mm_subs_epi16(__m128i __a
, __m128i __b
)
969 return (__m128i
)__builtin_ia32_psubsw128((__v8hi
)__a
, (__v8hi
)__b
);
972 /// \brief Subtracts corresponding 8-bit unsigned integer values in the input
973 /// and returns the differences in the corresponding bytes in the
974 /// destination. Differences less than 00h are saturated to 00h.
976 /// \headerfile <x86intrin.h>
978 /// This intrinsic corresponds to the \c VPSUBUSB / PSUBUSB instruction.
981 /// A 128-bit integer vector containing the minuends.
983 /// A 128-bit integer vector containing the subtrahends.
984 /// \returns A 128-bit integer vector containing the unsigned integer
985 /// differences of the values in the operands.
986 static __inline__ __m128i __DEFAULT_FN_ATTRS
987 _mm_subs_epu8(__m128i __a
, __m128i __b
)
989 return (__m128i
)__builtin_ia32_psubusb128((__v16qi
)__a
, (__v16qi
)__b
);
992 /// \brief Subtracts corresponding 16-bit unsigned integer values in the input
993 /// and returns the differences in the corresponding bytes in the
994 /// destination. Differences less than 0000h are saturated to 0000h.
996 /// \headerfile <x86intrin.h>
998 /// This intrinsic corresponds to the \c VPSUBUSW / PSUBUSW instruction.
1001 /// A 128-bit integer vector containing the minuends.
1003 /// A 128-bit integer vector containing the subtrahends.
1004 /// \returns A 128-bit integer vector containing the unsigned integer
1005 /// differences of the values in the operands.
1006 static __inline__ __m128i __DEFAULT_FN_ATTRS
1007 _mm_subs_epu16(__m128i __a
, __m128i __b
)
1009 return (__m128i
)__builtin_ia32_psubusw128((__v8hi
)__a
, (__v8hi
)__b
);
1012 /// \brief Performs a bitwise AND of two 128-bit integer vectors.
1014 /// \headerfile <x86intrin.h>
1016 /// This intrinsic corresponds to the \c VPAND / PAND instruction.
1019 /// A 128-bit integer vector containing one of the source operands.
1021 /// A 128-bit integer vector containing one of the source operands.
1022 /// \returns A 128-bit integer vector containing the bitwise AND of the values
1023 /// in both operands.
1024 static __inline__ __m128i __DEFAULT_FN_ATTRS
1025 _mm_and_si128(__m128i __a
, __m128i __b
)
1027 return (__m128i
)((__v2du
)__a
& (__v2du
)__b
);
1030 /// \brief Performs a bitwise AND of two 128-bit integer vectors, using the
1031 /// one's complement of the values contained in the first source operand.
1033 /// \headerfile <x86intrin.h>
1035 /// This intrinsic corresponds to the \c VPANDN / PANDN instruction.
1038 /// A 128-bit vector containing the left source operand. The one's complement
1039 /// of this value is used in the bitwise AND.
1041 /// A 128-bit vector containing the right source operand.
1042 /// \returns A 128-bit integer vector containing the bitwise AND of the one's
1043 /// complement of the first operand and the values in the second operand.
1044 static __inline__ __m128i __DEFAULT_FN_ATTRS
1045 _mm_andnot_si128(__m128i __a
, __m128i __b
)
1047 return (__m128i
)(~(__v2du
)__a
& (__v2du
)__b
);
1049 /// \brief Performs a bitwise OR of two 128-bit integer vectors.
1051 /// \headerfile <x86intrin.h>
1053 /// This intrinsic corresponds to the \c VPOR / POR instruction.
1056 /// A 128-bit integer vector containing one of the source operands.
1058 /// A 128-bit integer vector containing one of the source operands.
1059 /// \returns A 128-bit integer vector containing the bitwise OR of the values
1060 /// in both operands.
1061 static __inline__ __m128i __DEFAULT_FN_ATTRS
1062 _mm_or_si128(__m128i __a
, __m128i __b
)
1064 return (__m128i
)((__v2du
)__a
| (__v2du
)__b
);
1067 /// \brief Performs a bitwise exclusive OR of two 128-bit integer vectors.
1069 /// \headerfile <x86intrin.h>
1071 /// This intrinsic corresponds to the \c VPXOR / PXOR instruction.
1074 /// A 128-bit integer vector containing one of the source operands.
1076 /// A 128-bit integer vector containing one of the source operands.
1077 /// \returns A 128-bit integer vector containing the bitwise exclusive OR of the
1078 /// values in both operands.
1079 static __inline__ __m128i __DEFAULT_FN_ATTRS
1080 _mm_xor_si128(__m128i __a
, __m128i __b
)
1082 return (__m128i
)((__v2du
)__a
^ (__v2du
)__b
);
/// \brief Left-shifts the 128-bit integer vector operand by the specified
///    number of bytes. Low-order bits are cleared.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_slli_si128(__m128i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the \c VPSLLDQ / PSLLDQ instruction.
///
/// \param a
///    A 128-bit integer vector containing the source operand.
/// \param imm
///    An immediate value specifying the number of bytes to left-shift
///    operand a.
/// \returns A 128-bit integer vector containing the left-shifted value.
/* Implemented as a 16-byte shuffle over the concatenation of a zero vector
 * (shuffle indices 0-15) and a (shuffle indices 16-31).
 * - If any of bits 4-7 of (char)imm is set, output byte i uses index i, so
 *   every byte is taken from the zero vector.
 * - Otherwise output byte i uses index 16 + i - imm: bytes with i < imm come
 *   from the zero vector, the rest are byte i - imm of a.
 * imm is expanded in every index expression. */
1102 #define _mm_slli_si128(a, imm) __extension__ ({ \
1103 (__m128i)__builtin_shufflevector( \
1104 (__v16qi)_mm_setzero_si128(), \
1105 (__v16qi)(__m128i)(a), \
1106 ((char)(imm)&0xF0) ? 0 : 16 - (char)(imm), \
1107 ((char)(imm)&0xF0) ? 1 : 17 - (char)(imm), \
1108 ((char)(imm)&0xF0) ? 2 : 18 - (char)(imm), \
1109 ((char)(imm)&0xF0) ? 3 : 19 - (char)(imm), \
1110 ((char)(imm)&0xF0) ? 4 : 20 - (char)(imm), \
1111 ((char)(imm)&0xF0) ? 5 : 21 - (char)(imm), \
1112 ((char)(imm)&0xF0) ? 6 : 22 - (char)(imm), \
1113 ((char)(imm)&0xF0) ? 7 : 23 - (char)(imm), \
1114 ((char)(imm)&0xF0) ? 8 : 24 - (char)(imm), \
1115 ((char)(imm)&0xF0) ? 9 : 25 - (char)(imm), \
1116 ((char)(imm)&0xF0) ? 10 : 26 - (char)(imm), \
1117 ((char)(imm)&0xF0) ? 11 : 27 - (char)(imm), \
1118 ((char)(imm)&0xF0) ? 12 : 28 - (char)(imm), \
1119 ((char)(imm)&0xF0) ? 13 : 29 - (char)(imm), \
1120 ((char)(imm)&0xF0) ? 14 : 30 - (char)(imm), \
1121 ((char)(imm)&0xF0) ? 15 : 31 - (char)(imm)); })
1123 #define _mm_bslli_si128(a, imm) \
1124 _mm_slli_si128((a), (imm))
1126 /// \brief Left-shifts each 16-bit value in the 128-bit integer vector operand
1127 /// by the specified number of bits. Low-order bits are cleared.
1129 /// \headerfile <x86intrin.h>
1131 /// This intrinsic corresponds to the \c VPSLLW / PSLLW instruction.
1134 /// A 128-bit integer vector containing the source operand.
1136 /// An integer value specifying the number of bits to left-shift each value
1138 /// \returns A 128-bit integer vector containing the left-shifted values.
1139 static __inline__ __m128i __DEFAULT_FN_ATTRS
1140 _mm_slli_epi16(__m128i __a
, int __count
)
1142 return (__m128i
)__builtin_ia32_psllwi128((__v8hi
)__a
, __count
);
1145 /// \brief Left-shifts each 16-bit value in the 128-bit integer vector operand
1146 /// by the specified number of bits. Low-order bits are cleared.
1148 /// \headerfile <x86intrin.h>
1150 /// This intrinsic corresponds to the \c VPSLLW / PSLLW instruction.
1153 /// A 128-bit integer vector containing the source operand.
1155 /// A 128-bit integer vector in which bits [63:0] specify the number of bits
1156 /// to left-shift each value in operand __a.
1157 /// \returns A 128-bit integer vector containing the left-shifted values.
1158 static __inline__ __m128i __DEFAULT_FN_ATTRS
1159 _mm_sll_epi16(__m128i __a
, __m128i __count
)
1161 return (__m128i
)__builtin_ia32_psllw128((__v8hi
)__a
, (__v8hi
)__count
);
1164 /// \brief Left-shifts each 32-bit value in the 128-bit integer vector operand
1165 /// by the specified number of bits. Low-order bits are cleared.
1167 /// \headerfile <x86intrin.h>
1169 /// This intrinsic corresponds to the \c VPSLLD / PSLLD instruction.
1172 /// A 128-bit integer vector containing the source operand.
1174 /// An integer value specifying the number of bits to left-shift each value
1176 /// \returns A 128-bit integer vector containing the left-shifted values.
1177 static __inline__ __m128i __DEFAULT_FN_ATTRS
1178 _mm_slli_epi32(__m128i __a
, int __count
)
1180 return (__m128i
)__builtin_ia32_pslldi128((__v4si
)__a
, __count
);
1183 /// \brief Left-shifts each 32-bit value in the 128-bit integer vector operand
1184 /// by the specified number of bits. Low-order bits are cleared.
1186 /// \headerfile <x86intrin.h>
1188 /// This intrinsic corresponds to the \c VPSLLD / PSLLD instruction.
1191 /// A 128-bit integer vector containing the source operand.
1193 /// A 128-bit integer vector in which bits [63:0] specify the number of bits
1194 /// to left-shift each value in operand __a.
1195 /// \returns A 128-bit integer vector containing the left-shifted values.
1196 static __inline__ __m128i __DEFAULT_FN_ATTRS
1197 _mm_sll_epi32(__m128i __a
, __m128i __count
)
1199 return (__m128i
)__builtin_ia32_pslld128((__v4si
)__a
, (__v4si
)__count
);
1202 /// \brief Left-shifts each 64-bit value in the 128-bit integer vector operand
1203 /// by the specified number of bits. Low-order bits are cleared.
1205 /// \headerfile <x86intrin.h>
1207 /// This intrinsic corresponds to the \c VPSLLQ / PSLLQ instruction.
1210 /// A 128-bit integer vector containing the source operand.
1212 /// An integer value specifying the number of bits to left-shift each value
1214 /// \returns A 128-bit integer vector containing the left-shifted values.
1215 static __inline__ __m128i __DEFAULT_FN_ATTRS
1216 _mm_slli_epi64(__m128i __a
, int __count
)
1218 return __builtin_ia32_psllqi128((__v2di
)__a
, __count
);
1221 /// \brief Left-shifts each 64-bit value in the 128-bit integer vector operand
1222 /// by the specified number of bits. Low-order bits are cleared.
1224 /// \headerfile <x86intrin.h>
1226 /// This intrinsic corresponds to the \c VPSLLQ / PSLLQ instruction.
1229 /// A 128-bit integer vector containing the source operand.
1231 /// A 128-bit integer vector in which bits [63:0] specify the number of bits
1232 /// to left-shift each value in operand __a.
1233 /// \returns A 128-bit integer vector containing the left-shifted values.
1234 static __inline__ __m128i __DEFAULT_FN_ATTRS
1235 _mm_sll_epi64(__m128i __a
, __m128i __count
)
1237 return __builtin_ia32_psllq128((__v2di
)__a
, (__v2di
)__count
);
1240 /// \brief Right-shifts each 16-bit value in the 128-bit integer vector operand
1241 /// by the specified number of bits. High-order bits are filled with the sign
1242 /// bit of the initial value.
1244 /// \headerfile <x86intrin.h>
1246 /// This intrinsic corresponds to the \c VPSRAW / PSRAW instruction.
1249 /// A 128-bit integer vector containing the source operand.
1251 /// An integer value specifying the number of bits to right-shift each value
1253 /// \returns A 128-bit integer vector containing the right-shifted values.
1254 static __inline__ __m128i __DEFAULT_FN_ATTRS
1255 _mm_srai_epi16(__m128i __a
, int __count
)
1257 return (__m128i
)__builtin_ia32_psrawi128((__v8hi
)__a
, __count
);
1260 /// \brief Right-shifts each 16-bit value in the 128-bit integer vector operand
1261 /// by the specified number of bits. High-order bits are filled with the sign
1262 /// bit of the initial value.
1264 /// \headerfile <x86intrin.h>
1266 /// This intrinsic corresponds to the \c VPSRAW / PSRAW instruction.
1269 /// A 128-bit integer vector containing the source operand.
1271 /// A 128-bit integer vector in which bits [63:0] specify the number of bits
1272 /// to right-shift each value in operand __a.
1273 /// \returns A 128-bit integer vector containing the right-shifted values.
1274 static __inline__ __m128i __DEFAULT_FN_ATTRS
1275 _mm_sra_epi16(__m128i __a
, __m128i __count
)
1277 return (__m128i
)__builtin_ia32_psraw128((__v8hi
)__a
, (__v8hi
)__count
);
1280 /// \brief Right-shifts each 32-bit value in the 128-bit integer vector operand
1281 /// by the specified number of bits. High-order bits are filled with the sign
1282 /// bit of the initial value.
1284 /// \headerfile <x86intrin.h>
1286 /// This intrinsic corresponds to the \c VPSRAD / PSRAD instruction.
1289 /// A 128-bit integer vector containing the source operand.
1291 /// An integer value specifying the number of bits to right-shift each value
1293 /// \returns A 128-bit integer vector containing the right-shifted values.
1294 static __inline__ __m128i __DEFAULT_FN_ATTRS
1295 _mm_srai_epi32(__m128i __a
, int __count
)
1297 return (__m128i
)__builtin_ia32_psradi128((__v4si
)__a
, __count
);
1300 /// \brief Right-shifts each 32-bit value in the 128-bit integer vector operand
1301 /// by the specified number of bits. High-order bits are filled with the sign
1302 /// bit of the initial value.
1304 /// \headerfile <x86intrin.h>
1306 /// This intrinsic corresponds to the \c VPSRAD / PSRAD instruction.
1309 /// A 128-bit integer vector containing the source operand.
1311 /// A 128-bit integer vector in which bits [63:0] specify the number of bits
1312 /// to right-shift each value in operand __a.
1313 /// \returns A 128-bit integer vector containing the right-shifted values.
1314 static __inline__ __m128i __DEFAULT_FN_ATTRS
1315 _mm_sra_epi32(__m128i __a
, __m128i __count
)
1317 return (__m128i
)__builtin_ia32_psrad128((__v4si
)__a
, (__v4si
)__count
);
/// \brief Right-shifts the 128-bit integer vector operand by the specified
///    number of bytes. High-order bits are cleared.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_srli_si128(__m128i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the \c VPSRLDQ / PSRLDQ instruction.
///
/// \param a
///    A 128-bit integer vector containing the source operand.
/// \param imm
///    An immediate value specifying the number of bytes to right-shift operand
///    a.
/// \returns A 128-bit integer vector containing the right-shifted value.
///
/// Implementation note: the shuffle concatenates a (indices 0-15) with an
/// all-zero vector (indices 16-31). Result byte i uses index imm + i, so the
/// highest imm bytes are taken from the zero vector. If any of bits [7:4] of
/// (char)imm is set, every index falls within 16-31 and the whole result is
/// zero. imm is expanded many times and is used as a shuffle index, so it has
/// to be a constant expression.
#define _mm_srli_si128(a, imm) __extension__ ({ \
  (__m128i)__builtin_shufflevector( \
    (__v16qi)(__m128i)(a), \
    (__v16qi)_mm_setzero_si128(), \
    ((char)(imm)&0xF0) ? 16 : (char)(imm) + 0, \
    ((char)(imm)&0xF0) ? 17 : (char)(imm) + 1, \
    ((char)(imm)&0xF0) ? 18 : (char)(imm) + 2, \
    ((char)(imm)&0xF0) ? 19 : (char)(imm) + 3, \
    ((char)(imm)&0xF0) ? 20 : (char)(imm) + 4, \
    ((char)(imm)&0xF0) ? 21 : (char)(imm) + 5, \
    ((char)(imm)&0xF0) ? 22 : (char)(imm) + 6, \
    ((char)(imm)&0xF0) ? 23 : (char)(imm) + 7, \
    ((char)(imm)&0xF0) ? 24 : (char)(imm) + 8, \
    ((char)(imm)&0xF0) ? 25 : (char)(imm) + 9, \
    ((char)(imm)&0xF0) ? 26 : (char)(imm) + 10, \
    ((char)(imm)&0xF0) ? 27 : (char)(imm) + 11, \
    ((char)(imm)&0xF0) ? 28 : (char)(imm) + 12, \
    ((char)(imm)&0xF0) ? 29 : (char)(imm) + 13, \
    ((char)(imm)&0xF0) ? 30 : (char)(imm) + 14, \
    ((char)(imm)&0xF0) ? 31 : (char)(imm) + 15); })

/// Alternate spelling of _mm_srli_si128; both arguments are forwarded
/// unchanged.
#define _mm_bsrli_si128(a, imm) \
  _mm_srli_si128((a), (imm))
1361 /// \brief Right-shifts each of 16-bit values in the 128-bit integer vector
1362 /// operand by the specified number of bits. High-order bits are cleared.
1364 /// \headerfile <x86intrin.h>
1366 /// This intrinsic corresponds to the \c VPSRLW / PSRLW instruction.
1369 /// A 128-bit integer vector containing the source operand.
1371 /// An integer value specifying the number of bits to right-shift each value
1373 /// \returns A 128-bit integer vector containing the right-shifted values.
1374 static __inline__ __m128i __DEFAULT_FN_ATTRS
1375 _mm_srli_epi16(__m128i __a
, int __count
)
1377 return (__m128i
)__builtin_ia32_psrlwi128((__v8hi
)__a
, __count
);
1380 /// \brief Right-shifts each of 16-bit values in the 128-bit integer vector
1381 /// operand by the specified number of bits. High-order bits are cleared.
1383 /// \headerfile <x86intrin.h>
1385 /// This intrinsic corresponds to the \c VPSRLW / PSRLW instruction.
1388 /// A 128-bit integer vector containing the source operand.
1390 /// A 128-bit integer vector in which bits [63:0] specify the number of bits
1391 /// to right-shift each value in operand __a.
1392 /// \returns A 128-bit integer vector containing the right-shifted values.
1393 static __inline__ __m128i __DEFAULT_FN_ATTRS
1394 _mm_srl_epi16(__m128i __a
, __m128i __count
)
1396 return (__m128i
)__builtin_ia32_psrlw128((__v8hi
)__a
, (__v8hi
)__count
);
1399 /// \brief Right-shifts each of 32-bit values in the 128-bit integer vector
1400 /// operand by the specified number of bits. High-order bits are cleared.
1402 /// \headerfile <x86intrin.h>
1404 /// This intrinsic corresponds to the \c VPSRLD / PSRLD instruction.
1407 /// A 128-bit integer vector containing the source operand.
1409 /// An integer value specifying the number of bits to right-shift each value
1411 /// \returns A 128-bit integer vector containing the right-shifted values.
1412 static __inline__ __m128i __DEFAULT_FN_ATTRS
1413 _mm_srli_epi32(__m128i __a
, int __count
)
1415 return (__m128i
)__builtin_ia32_psrldi128((__v4si
)__a
, __count
);
1418 /// \brief Right-shifts each of 32-bit values in the 128-bit integer vector
1419 /// operand by the specified number of bits. High-order bits are cleared.
1421 /// \headerfile <x86intrin.h>
1423 /// This intrinsic corresponds to the \c VPSRLD / PSRLD instruction.
1426 /// A 128-bit integer vector containing the source operand.
1428 /// A 128-bit integer vector in which bits [63:0] specify the number of bits
1429 /// to right-shift each value in operand __a.
1430 /// \returns A 128-bit integer vector containing the right-shifted values.
1431 static __inline__ __m128i __DEFAULT_FN_ATTRS
1432 _mm_srl_epi32(__m128i __a
, __m128i __count
)
1434 return (__m128i
)__builtin_ia32_psrld128((__v4si
)__a
, (__v4si
)__count
);
1437 /// \brief Right-shifts each of 64-bit values in the 128-bit integer vector
1438 /// operand by the specified number of bits. High-order bits are cleared.
1440 /// \headerfile <x86intrin.h>
1442 /// This intrinsic corresponds to the \c VPSRLQ / PSRLQ instruction.
1445 /// A 128-bit integer vector containing the source operand.
1447 /// An integer value specifying the number of bits to right-shift each value
1449 /// \returns A 128-bit integer vector containing the right-shifted values.
1450 static __inline__ __m128i __DEFAULT_FN_ATTRS
1451 _mm_srli_epi64(__m128i __a
, int __count
)
1453 return __builtin_ia32_psrlqi128((__v2di
)__a
, __count
);
1456 /// \brief Right-shifts each of 64-bit values in the 128-bit integer vector
1457 /// operand by the specified number of bits. High-order bits are cleared.
1459 /// \headerfile <x86intrin.h>
1461 /// This intrinsic corresponds to the \c VPSRLQ / PSRLQ instruction.
1464 /// A 128-bit integer vector containing the source operand.
1466 /// A 128-bit integer vector in which bits [63:0] specify the number of bits
1467 /// to right-shift each value in operand __a.
1468 /// \returns A 128-bit integer vector containing the right-shifted values.
1469 static __inline__ __m128i __DEFAULT_FN_ATTRS
1470 _mm_srl_epi64(__m128i __a
, __m128i __count
)
1472 return __builtin_ia32_psrlq128((__v2di
)__a
, (__v2di
)__count
);
1475 /// \brief Compares each of the corresponding 8-bit values of the 128-bit
1476 /// integer vectors for equality. Each comparison yields 0h for false, FFh
1479 /// \headerfile <x86intrin.h>
1481 /// This intrinsic corresponds to the \c VPCMPEQB / PCMPEQB instruction.
1484 /// A 128-bit integer vector.
1486 /// A 128-bit integer vector.
1487 /// \returns A 128-bit integer vector containing the comparison results.
1488 static __inline__ __m128i __DEFAULT_FN_ATTRS
1489 _mm_cmpeq_epi8(__m128i __a
, __m128i __b
)
1491 return (__m128i
)((__v16qi
)__a
== (__v16qi
)__b
);
1494 /// \brief Compares each of the corresponding 16-bit values of the 128-bit
1495 /// integer vectors for equality. Each comparison yields 0h for false, FFFFh
1498 /// \headerfile <x86intrin.h>
1500 /// This intrinsic corresponds to the \c VPCMPEQW / PCMPEQW instruction.
1503 /// A 128-bit integer vector.
1505 /// A 128-bit integer vector.
1506 /// \returns A 128-bit integer vector containing the comparison results.
1507 static __inline__ __m128i __DEFAULT_FN_ATTRS
1508 _mm_cmpeq_epi16(__m128i __a
, __m128i __b
)
1510 return (__m128i
)((__v8hi
)__a
== (__v8hi
)__b
);
1513 /// \brief Compares each of the corresponding 32-bit values of the 128-bit
1514 /// integer vectors for equality. Each comparison yields 0h for false,
1515 /// FFFFFFFFh for true.
1517 /// \headerfile <x86intrin.h>
1519 /// This intrinsic corresponds to the \c VPCMPEQD / PCMPEQD instruction.
1522 /// A 128-bit integer vector.
1524 /// A 128-bit integer vector.
1525 /// \returns A 128-bit integer vector containing the comparison results.
1526 static __inline__ __m128i __DEFAULT_FN_ATTRS
1527 _mm_cmpeq_epi32(__m128i __a
, __m128i __b
)
1529 return (__m128i
)((__v4si
)__a
== (__v4si
)__b
);
1532 /// \brief Compares each of the corresponding signed 8-bit values of the 128-bit
1533 /// integer vectors to determine if the values in the first operand are
1534 /// greater than those in the second operand. Each comparison yields 0h for
1535 /// false, FFh for true.
1537 /// \headerfile <x86intrin.h>
1539 /// This intrinsic corresponds to the \c VPCMPGTB / PCMPGTB instruction.
1542 /// A 128-bit integer vector.
1544 /// A 128-bit integer vector.
1545 /// \returns A 128-bit integer vector containing the comparison results.
1546 static __inline__ __m128i __DEFAULT_FN_ATTRS
1547 _mm_cmpgt_epi8(__m128i __a
, __m128i __b
)
1549 /* This function always performs a signed comparison, but __v16qi is a char
1550 which may be signed or unsigned, so use __v16qs. */
1551 return (__m128i
)((__v16qs
)__a
> (__v16qs
)__b
);
1554 /// \brief Compares each of the corresponding signed 16-bit values of the
1555 /// 128-bit integer vectors to determine if the values in the first operand
1556 /// are greater than those in the second operand. Each comparison yields 0h
1557 /// for false, FFFFh for true.
1559 /// \headerfile <x86intrin.h>
1561 /// This intrinsic corresponds to the \c VPCMPGTW / PCMPGTW instruction.
1564 /// A 128-bit integer vector.
1566 /// A 128-bit integer vector.
1567 /// \returns A 128-bit integer vector containing the comparison results.
1568 static __inline__ __m128i __DEFAULT_FN_ATTRS
1569 _mm_cmpgt_epi16(__m128i __a
, __m128i __b
)
1571 return (__m128i
)((__v8hi
)__a
> (__v8hi
)__b
);
1574 /// \brief Compares each of the corresponding signed 32-bit values of the
1575 /// 128-bit integer vectors to determine if the values in the first operand
1576 /// are greater than those in the second operand. Each comparison yields 0h
1577 /// for false, FFFFFFFFh for true.
1579 /// \headerfile <x86intrin.h>
1581 /// This intrinsic corresponds to the \c VPCMPGTD / PCMPGTD instruction.
1584 /// A 128-bit integer vector.
1586 /// A 128-bit integer vector.
1587 /// \returns A 128-bit integer vector containing the comparison results.
1588 static __inline__ __m128i __DEFAULT_FN_ATTRS
1589 _mm_cmpgt_epi32(__m128i __a
, __m128i __b
)
1591 return (__m128i
)((__v4si
)__a
> (__v4si
)__b
);
1594 /// \brief Compares each of the corresponding signed 8-bit values of the 128-bit
1595 /// integer vectors to determine if the values in the first operand are less
1596 /// than those in the second operand. Each comparison yields 0h for false,
1599 /// \headerfile <x86intrin.h>
1601 /// This intrinsic corresponds to the \c VPCMPGTB / PCMPGTB instruction.
1604 /// A 128-bit integer vector.
1606 /// A 128-bit integer vector.
1607 /// \returns A 128-bit integer vector containing the comparison results.
1608 static __inline__ __m128i __DEFAULT_FN_ATTRS
1609 _mm_cmplt_epi8(__m128i __a
, __m128i __b
)
1611 return _mm_cmpgt_epi8(__b
, __a
);
1614 /// \brief Compares each of the corresponding signed 16-bit values of the
1615 /// 128-bit integer vectors to determine if the values in the first operand
1616 /// are less than those in the second operand. Each comparison yields 0h for
1617 /// false, FFFFh for true.
1619 /// \headerfile <x86intrin.h>
1621 /// This intrinsic corresponds to the \c VPCMPGTW / PCMPGTW instruction.
1624 /// A 128-bit integer vector.
1626 /// A 128-bit integer vector.
1627 /// \returns A 128-bit integer vector containing the comparison results.
1628 static __inline__ __m128i __DEFAULT_FN_ATTRS
1629 _mm_cmplt_epi16(__m128i __a
, __m128i __b
)
1631 return _mm_cmpgt_epi16(__b
, __a
);
1634 /// \brief Compares each of the corresponding signed 32-bit values of the
1635 /// 128-bit integer vectors to determine if the values in the first operand
1636 /// are less than those in the second operand. Each comparison yields 0h for
1637 /// false, FFFFFFFFh for true.
1639 /// \headerfile <x86intrin.h>
1641 /// This intrinsic corresponds to the \c VPCMPGTD / PCMPGTD instruction.
1644 /// A 128-bit integer vector.
1646 /// A 128-bit integer vector.
1647 /// \returns A 128-bit integer vector containing the comparison results.
1648 static __inline__ __m128i __DEFAULT_FN_ATTRS
1649 _mm_cmplt_epi32(__m128i __a
, __m128i __b
)
1651 return _mm_cmpgt_epi32(__b
, __a
);
1655 /// \brief Converts a 64-bit signed integer value from the second operand into a
1656 /// double-precision value and returns it in the lower element of a [2 x
1657 /// double] vector; the upper element of the returned vector is copied from
1658 /// the upper element of the first operand.
1660 /// \headerfile <x86intrin.h>
1662 /// This intrinsic corresponds to the \c VCVTSI2SD / CVTSI2SD instruction.
1665 /// A 128-bit vector of [2 x double]. The upper 64 bits of this operand are
1666 /// copied to the upper 64 bits of the destination.
1668 /// A 64-bit signed integer operand containing the value to be converted.
1669 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
1670 /// converted value of the second operand. The upper 64 bits are copied from
1671 /// the upper 64 bits of the first operand.
1672 static __inline__ __m128d __DEFAULT_FN_ATTRS
1673 _mm_cvtsi64_sd(__m128d __a
, long long __b
)
1679 /// \brief Converts the first (lower) element of a vector of [2 x double] into a
1680 /// 64-bit signed integer value, according to the current rounding mode.
1682 /// \headerfile <x86intrin.h>
1684 /// This intrinsic corresponds to the \c VCVTSD2SI / CVTSD2SI instruction.
1687 /// A 128-bit vector of [2 x double]. The lower 64 bits are used in the
1689 /// \returns A 64-bit signed integer containing the converted value.
1690 static __inline__
long long __DEFAULT_FN_ATTRS
1691 _mm_cvtsd_si64(__m128d __a
)
1693 return __builtin_ia32_cvtsd2si64((__v2df
)__a
);
1696 /// \brief Converts the first (lower) element of a vector of [2 x double] into a
1697 /// 64-bit signed integer value, truncating the result when it is inexact.
1699 /// \headerfile <x86intrin.h>
1701 /// This intrinsic corresponds to the \c VCVTTSD2SI / CVTTSD2SI instruction.
1704 /// A 128-bit vector of [2 x double]. The lower 64 bits are used in the
1706 /// \returns A 64-bit signed integer containing the converted value.
1707 static __inline__
long long __DEFAULT_FN_ATTRS
1708 _mm_cvttsd_si64(__m128d __a
)
1714 /// \brief Converts a vector of [4 x i32] into a vector of [4 x float].
1716 /// \headerfile <x86intrin.h>
1718 /// This intrinsic corresponds to the \c VCVTDQ2PS / CVTDQ2PS instruction.
1721 /// A 128-bit integer vector.
1722 /// \returns A 128-bit vector of [4 x float] containing the converted values.
1723 static __inline__ __m128 __DEFAULT_FN_ATTRS
1724 _mm_cvtepi32_ps(__m128i __a
)
1726 return __builtin_ia32_cvtdq2ps((__v4si
)__a
);
1729 /// \brief Converts a vector of [4 x float] into a vector of [4 x i32].
1731 /// \headerfile <x86intrin.h>
1733 /// This intrinsic corresponds to the \c VCVTPS2DQ / CVTPS2DQ instruction.
1736 /// A 128-bit vector of [4 x float].
1737 /// \returns A 128-bit integer vector of [4 x i32] containing the converted
1739 static __inline__ __m128i __DEFAULT_FN_ATTRS
1740 _mm_cvtps_epi32(__m128 __a
)
1742 return (__m128i
)__builtin_ia32_cvtps2dq((__v4sf
)__a
);
1745 /// \brief Converts a vector of [4 x float] into a vector of [4 x i32],
1746 /// truncating the result when it is inexact.
1748 /// \headerfile <x86intrin.h>
1750 /// This intrinsic corresponds to the \c VCVTTPS2DQ / CVTTPS2DQ instruction.
1753 /// A 128-bit vector of [4 x float].
1754 /// \returns A 128-bit vector of [4 x i32] containing the converted values.
1755 static __inline__ __m128i __DEFAULT_FN_ATTRS
1756 _mm_cvttps_epi32(__m128 __a
)
1758 return (__m128i
)__builtin_convertvector((__v4sf
)__a
, __v4si
);
1761 /// \brief Returns a vector of [4 x i32] where the lowest element is the input
1762 /// operand and the remaining elements are zero.
1764 /// \headerfile <x86intrin.h>
1766 /// This intrinsic corresponds to the \c VMOVD / MOVD instruction.
1769 /// A 32-bit signed integer operand.
1770 /// \returns A 128-bit vector of [4 x i32].
1771 static __inline__ __m128i __DEFAULT_FN_ATTRS
1772 _mm_cvtsi32_si128(int __a
)
1774 return (__m128i
)(__v4si
){ __a
, 0, 0, 0 };
1778 /// \brief Returns a vector of [2 x i64] where the lower element is the input
1779 /// operand and the upper element is zero.
1781 /// \headerfile <x86intrin.h>
1783 /// This intrinsic corresponds to the \c VMOVQ / MOVQ instruction.
1786 /// A 64-bit signed integer operand containing the value to be converted.
1787 /// \returns A 128-bit vector of [2 x i64] containing the converted value.
1788 static __inline__ __m128i __DEFAULT_FN_ATTRS
1789 _mm_cvtsi64_si128(long long __a
)
1791 return (__m128i
){ __a
, 0 };
1795 /// \brief Moves the least significant 32 bits of a vector of [4 x i32] to a
1796 /// 32-bit signed integer value.
1798 /// \headerfile <x86intrin.h>
1800 /// This intrinsic corresponds to the \c VMOVD / MOVD instruction.
1803 /// A vector of [4 x i32]. The least significant 32 bits are moved to the
1805 /// \returns A 32-bit signed integer containing the moved value.
1806 static __inline__
int __DEFAULT_FN_ATTRS
1807 _mm_cvtsi128_si32(__m128i __a
)
1809 __v4si __b
= (__v4si
)__a
;
1814 /// \brief Moves the least significant 64 bits of a vector of [2 x i64] to a
1815 /// 64-bit signed integer value.
1817 /// \headerfile <x86intrin.h>
1819 /// This intrinsic corresponds to the \c VMOVQ / MOVQ instruction.
1822 /// A vector of [2 x i64]. The least significant 64 bits are moved to the
1824 /// \returns A 64-bit signed integer containing the moved value.
1825 static __inline__
long long __DEFAULT_FN_ATTRS
1826 _mm_cvtsi128_si64(__m128i __a
)
1832 /// \brief Moves packed integer values from an aligned 128-bit memory location
1833 /// to elements in a 128-bit integer vector.
1835 /// \headerfile <x86intrin.h>
1837 /// This intrinsic corresponds to the \c VMOVDQA / MOVDQA instruction.
1840 /// An aligned pointer to a memory location containing integer values.
1841 /// \returns A 128-bit integer vector containing the moved values.
1842 static __inline__ __m128i __DEFAULT_FN_ATTRS
1843 _mm_load_si128(__m128i
const *__p
)
1848 /// \brief Moves packed integer values from an unaligned 128-bit memory location
1849 /// to elements in a 128-bit integer vector.
1851 /// \headerfile <x86intrin.h>
1853 /// This intrinsic corresponds to the \c VMOVDQU / MOVDQU instruction.
1856 /// A pointer to a memory location containing integer values.
1857 /// \returns A 128-bit integer vector containing the moved values.
1858 static __inline__ __m128i __DEFAULT_FN_ATTRS
1859 _mm_loadu_si128(__m128i
const *__p
)
1861 struct __loadu_si128
{
1863 } __attribute__((__packed__
, __may_alias__
));
1864 return ((struct __loadu_si128
*)__p
)->__v
;
1867 /// \brief Returns a vector of [2 x i64] where the lower element is taken from
1868 /// the lower element of the operand, and the upper element is zero.
1870 /// \headerfile <x86intrin.h>
1872 /// This intrinsic corresponds to the \c VMOVQ / MOVQ instruction.
1875 /// A 128-bit vector of [2 x i64]. Bits [63:0] are written to bits [63:0] of
1876 /// the destination.
1877 /// \returns A 128-bit vector of [2 x i64]. The lower order bits contain the
1878 /// moved value. The higher order bits are cleared.
1879 static __inline__ __m128i __DEFAULT_FN_ATTRS
1880 _mm_loadl_epi64(__m128i
const *__p
)
1882 struct __mm_loadl_epi64_struct
{
1884 } __attribute__((__packed__
, __may_alias__
));
1885 return (__m128i
) { ((struct __mm_loadl_epi64_struct
*)__p
)->__u
, 0};
1888 /// \brief Generates a 128-bit vector of [4 x i32] with unspecified content.
1889 /// This could be used as an argument to another intrinsic function where the
1890 /// argument is required but the value is not actually used.
1892 /// \headerfile <x86intrin.h>
1894 /// This intrinsic has no corresponding instruction.
1896 /// \returns A 128-bit vector of [4 x i32] with unspecified content.
1897 static __inline__ __m128i __DEFAULT_FN_ATTRS
1898 _mm_undefined_si128(void)
1900 return (__m128i
)__builtin_ia32_undef128();
1903 /// \brief Initializes both 64-bit values in a 128-bit vector of [2 x i64] with
1904 /// the specified 64-bit integer values.
1906 /// \headerfile <x86intrin.h>
1908 /// This intrinsic is a utility function and does not correspond to a specific
1912 /// A 64-bit integer value used to initialize the upper 64 bits of the
1913 /// destination vector of [2 x i64].
1915 /// A 64-bit integer value used to initialize the lower 64 bits of the
1916 /// destination vector of [2 x i64].
1917 /// \returns An initialized 128-bit vector of [2 x i64] containing the values
1918 /// provided in the operands.
1919 static __inline__ __m128i __DEFAULT_FN_ATTRS
1920 _mm_set_epi64x(long long __q1
, long long __q0
)
1922 return (__m128i
){ __q0
, __q1
};
1925 /// \brief Initializes both 64-bit values in a 128-bit vector of [2 x i64] with
1926 /// the specified 64-bit integer values.
1928 /// \headerfile <x86intrin.h>
1930 /// This intrinsic is a utility function and does not correspond to a specific
1934 /// A 64-bit integer value used to initialize the upper 64 bits of the
1935 /// destination vector of [2 x i64].
1937 /// A 64-bit integer value used to initialize the lower 64 bits of the
1938 /// destination vector of [2 x i64].
1939 /// \returns An initialized 128-bit vector of [2 x i64] containing the values
1940 /// provided in the operands.
1941 static __inline__ __m128i __DEFAULT_FN_ATTRS
1942 _mm_set_epi64(__m64 __q1
, __m64 __q0
)
1944 return (__m128i
){ (long long)__q0
, (long long)__q1
};
1947 /// \brief Initializes the 32-bit values in a 128-bit vector of [4 x i32] with
1948 /// the specified 32-bit integer values.
1950 /// \headerfile <x86intrin.h>
1952 /// This intrinsic is a utility function and does not correspond to a specific
1956 /// A 32-bit integer value used to initialize bits [127:96] of the
1957 /// destination vector.
1959 /// A 32-bit integer value used to initialize bits [95:64] of the destination
1962 /// A 32-bit integer value used to initialize bits [63:32] of the destination
1965 /// A 32-bit integer value used to initialize bits [31:0] of the destination
1967 /// \returns An initialized 128-bit vector of [4 x i32] containing the values
1968 /// provided in the operands.
1969 static __inline__ __m128i __DEFAULT_FN_ATTRS
1970 _mm_set_epi32(int __i3
, int __i2
, int __i1
, int __i0
)
1972 return (__m128i
)(__v4si
){ __i0
, __i1
, __i2
, __i3
};
1975 /// \brief Initializes the 16-bit values in a 128-bit vector of [8 x i16] with
1976 /// the specified 16-bit integer values.
1978 /// \headerfile <x86intrin.h>
1980 /// This intrinsic is a utility function and does not correspond to a specific
1984 /// A 16-bit integer value used to initialize bits [127:112] of the
1985 /// destination vector.
1987 /// A 16-bit integer value used to initialize bits [111:96] of the
1988 /// destination vector.
1990 /// A 16-bit integer value used to initialize bits [95:80] of the destination
1993 /// A 16-bit integer value used to initialize bits [79:64] of the destination
1996 /// A 16-bit integer value used to initialize bits [63:48] of the destination
1999 /// A 16-bit integer value used to initialize bits [47:32] of the destination
2002 /// A 16-bit integer value used to initialize bits [31:16] of the destination
2005 /// A 16-bit integer value used to initialize bits [15:0] of the destination
2007 /// \returns An initialized 128-bit vector of [8 x i16] containing the values
2008 /// provided in the operands.
2009 static __inline__ __m128i __DEFAULT_FN_ATTRS
2010 _mm_set_epi16(short __w7
, short __w6
, short __w5
, short __w4
, short __w3
, short __w2
, short __w1
, short __w0
)
2012 return (__m128i
)(__v8hi
){ __w0
, __w1
, __w2
, __w3
, __w4
, __w5
, __w6
, __w7
};
2015 /// \brief Initializes the 8-bit values in a 128-bit vector of [16 x i8] with
2016 /// the specified 8-bit integer values.
2018 /// \headerfile <x86intrin.h>
2020 /// This intrinsic is a utility function and does not correspond to a specific
2024 /// Initializes bits [127:120] of the destination vector.
2026 /// Initializes bits [119:112] of the destination vector.
2028 /// Initializes bits [111:104] of the destination vector.
2030 /// Initializes bits [103:96] of the destination vector.
2032 /// Initializes bits [95:88] of the destination vector.
2034 /// Initializes bits [87:80] of the destination vector.
2036 /// Initializes bits [79:72] of the destination vector.
2038 /// Initializes bits [71:64] of the destination vector.
2040 /// Initializes bits [63:56] of the destination vector.
2042 /// Initializes bits [55:48] of the destination vector.
2044 /// Initializes bits [47:40] of the destination vector.
2046 /// Initializes bits [39:32] of the destination vector.
2048 /// Initializes bits [31:24] of the destination vector.
2050 /// Initializes bits [23:16] of the destination vector.
2052 /// Initializes bits [15:8] of the destination vector.
2054 /// Initializes bits [7:0] of the destination vector.
2055 /// \returns An initialized 128-bit vector of [16 x i8] containing the values
2056 /// provided in the operands.
2057 static __inline__ __m128i __DEFAULT_FN_ATTRS
2058 _mm_set_epi8(char __b15
, char __b14
, char __b13
, char __b12
, char __b11
, char __b10
, char __b9
, char __b8
, char __b7
, char __b6
, char __b5
, char __b4
, char __b3
, char __b2
, char __b1
, char __b0
)
2060 return (__m128i
)(__v16qi
){ __b0
, __b1
, __b2
, __b3
, __b4
, __b5
, __b6
, __b7
, __b8
, __b9
, __b10
, __b11
, __b12
, __b13
, __b14
, __b15
};
2063 /// \brief Initializes both values in a 128-bit integer vector with the
2064 /// specified 64-bit integer value.
2066 /// \headerfile <x86intrin.h>
2068 /// This intrinsic is a utility function and does not correspond to a specific
2072 /// Integer value used to initialize the elements of the destination integer
2074 /// \returns An initialized 128-bit integer vector of [2 x i64] with both
2075 /// elements containing the value provided in the operand.
2076 static __inline__ __m128i __DEFAULT_FN_ATTRS
2077 _mm_set1_epi64x(long long __q
)
2079 return (__m128i
){ __q
, __q
};
2082 /// \brief Initializes both values in a 128-bit vector of [2 x i64] with the
2083 /// specified 64-bit value.
2085 /// \headerfile <x86intrin.h>
2087 /// This intrinsic is a utility function and does not correspond to a specific
2091 /// A 64-bit value used to initialize the elements of the destination integer
2093 /// \returns An initialized 128-bit vector of [2 x i64] with all elements
2094 /// containing the value provided in the operand.
2095 static __inline__ __m128i __DEFAULT_FN_ATTRS
2096 _mm_set1_epi64(__m64 __q
)
2098 return (__m128i
){ (long long)__q
, (long long)__q
};
2101 /// \brief Initializes all values in a 128-bit vector of [4 x i32] with the
2102 /// specified 32-bit value.
2104 /// \headerfile <x86intrin.h>
2106 /// This intrinsic is a utility function and does not correspond to a specific
2110 /// A 32-bit value used to initialize the elements of the destination integer
2112 /// \returns An initialized 128-bit vector of [4 x i32] with all elements
2113 /// containing the value provided in the operand.
2114 static __inline__ __m128i __DEFAULT_FN_ATTRS
2115 _mm_set1_epi32(int __i
)
2117 return (__m128i
)(__v4si
){ __i
, __i
, __i
, __i
};
2120 /// \brief Initializes all values in a 128-bit vector of [8 x i16] with the
2121 /// specified 16-bit value.
2123 /// \headerfile <x86intrin.h>
2125 /// This intrinsic is a utility function and does not correspond to a specific
2129 /// A 16-bit value used to initialize the elements of the destination integer
2131 /// \returns An initialized 128-bit vector of [8 x i16] with all elements
2132 /// containing the value provided in the operand.
2133 static __inline__ __m128i __DEFAULT_FN_ATTRS
2134 _mm_set1_epi16(short __w
)
2136 return (__m128i
)(__v8hi
){ __w
, __w
, __w
, __w
, __w
, __w
, __w
, __w
};
2139 /// \brief Initializes all values in a 128-bit vector of [16 x i8] with the
2140 /// specified 8-bit value.
2142 /// \headerfile <x86intrin.h>
2144 /// This intrinsic is a utility function and does not correspond to a specific
2148 /// An 8-bit value used to initialize the elements of the destination integer
2150 /// \returns An initialized 128-bit vector of [16 x i8] with all elements
2151 /// containing the value provided in the operand.
2152 static __inline__ __m128i __DEFAULT_FN_ATTRS
2153 _mm_set1_epi8(char __b
)
2155 return (__m128i
)(__v16qi
){ __b
, __b
, __b
, __b
, __b
, __b
, __b
, __b
, __b
, __b
, __b
, __b
, __b
, __b
, __b
, __b
};
2158 static __inline__ __m128i __DEFAULT_FN_ATTRS
2159 _mm_setr_epi64(__m64 __q0
, __m64 __q1
)
2161 return (__m128i
){ (long long)__q0
, (long long)__q1
};
2164 static __inline__ __m128i __DEFAULT_FN_ATTRS
2165 _mm_setr_epi32(int __i0
, int __i1
, int __i2
, int __i3
)
2167 return (__m128i
)(__v4si
){ __i0
, __i1
, __i2
, __i3
};
2170 static __inline__ __m128i __DEFAULT_FN_ATTRS
2171 _mm_setr_epi16(short __w0
, short __w1
, short __w2
, short __w3
, short __w4
, short __w5
, short __w6
, short __w7
)
2173 return (__m128i
)(__v8hi
){ __w0
, __w1
, __w2
, __w3
, __w4
, __w5
, __w6
, __w7
};
2176 static __inline__ __m128i __DEFAULT_FN_ATTRS
2177 _mm_setr_epi8(char __b0
, char __b1
, char __b2
, char __b3
, char __b4
, char __b5
, char __b6
, char __b7
, char __b8
, char __b9
, char __b10
, char __b11
, char __b12
, char __b13
, char __b14
, char __b15
)
2179 return (__m128i
)(__v16qi
){ __b0
, __b1
, __b2
, __b3
, __b4
, __b5
, __b6
, __b7
, __b8
, __b9
, __b10
, __b11
, __b12
, __b13
, __b14
, __b15
};
2182 static __inline__ __m128i __DEFAULT_FN_ATTRS
2183 _mm_setzero_si128(void)
2185 return (__m128i
){ 0LL, 0LL };
2188 static __inline__
void __DEFAULT_FN_ATTRS
2189 _mm_store_si128(__m128i
*__p
, __m128i __b
)
2194 static __inline__
void __DEFAULT_FN_ATTRS
2195 _mm_storeu_si128(__m128i
*__p
, __m128i __b
)
2197 struct __storeu_si128
{
2199 } __attribute__((__packed__
, __may_alias__
));
2200 ((struct __storeu_si128
*)__p
)->__v
= __b
;
2203 static __inline__
void __DEFAULT_FN_ATTRS
2204 _mm_maskmoveu_si128(__m128i __d
, __m128i __n
, char *__p
)
2206 __builtin_ia32_maskmovdqu((__v16qi
)__d
, (__v16qi
)__n
, __p
);
2209 static __inline__
void __DEFAULT_FN_ATTRS
2210 _mm_storel_epi64(__m128i
*__p
, __m128i __a
)
2212 struct __mm_storel_epi64_struct
{
2214 } __attribute__((__packed__
, __may_alias__
));
2215 ((struct __mm_storel_epi64_struct
*)__p
)->__u
= __a
[0];
2218 static __inline__
void __DEFAULT_FN_ATTRS
2219 _mm_stream_pd(double *__p
, __m128d __a
)
2221 __builtin_nontemporal_store((__v2df
)__a
, (__v2df
*)__p
);
2224 static __inline__
void __DEFAULT_FN_ATTRS
2225 _mm_stream_si128(__m128i
*__p
, __m128i __a
)
2227 __builtin_nontemporal_store((__v2di
)__a
, (__v2di
*)__p
);
2230 static __inline__
void __DEFAULT_FN_ATTRS
2231 _mm_stream_si32(int *__p
, int __a
)
2233 __builtin_ia32_movnti(__p
, __a
);
2237 static __inline__
void __DEFAULT_FN_ATTRS
2238 _mm_stream_si64(long long *__p
, long long __a
)
2240 __builtin_ia32_movnti64(__p
, __a
);
2244 static __inline__
void __DEFAULT_FN_ATTRS
2245 _mm_clflush(void const *__p
)
2247 __builtin_ia32_clflush(__p
);
2250 static __inline__
void __DEFAULT_FN_ATTRS
2253 __builtin_ia32_lfence();
2256 static __inline__
void __DEFAULT_FN_ATTRS
2259 __builtin_ia32_mfence();
2262 static __inline__ __m128i __DEFAULT_FN_ATTRS
2263 _mm_packs_epi16(__m128i __a
, __m128i __b
)
2265 return (__m128i
)__builtin_ia32_packsswb128((__v8hi
)__a
, (__v8hi
)__b
);
2268 static __inline__ __m128i __DEFAULT_FN_ATTRS
2269 _mm_packs_epi32(__m128i __a
, __m128i __b
)
2271 return (__m128i
)__builtin_ia32_packssdw128((__v4si
)__a
, (__v4si
)__b
);
2274 static __inline__ __m128i __DEFAULT_FN_ATTRS
2275 _mm_packus_epi16(__m128i __a
, __m128i __b
)
2277 return (__m128i
)__builtin_ia32_packuswb128((__v8hi
)__a
, (__v8hi
)__b
);
2280 static __inline__
int __DEFAULT_FN_ATTRS
2281 _mm_extract_epi16(__m128i __a
, int __imm
)
2283 __v8hi __b
= (__v8hi
)__a
;
2284 return (unsigned short)__b
[__imm
& 7];
2287 static __inline__ __m128i __DEFAULT_FN_ATTRS
2288 _mm_insert_epi16(__m128i __a
, int __b
, int __imm
)
2290 __v8hi __c
= (__v8hi
)__a
;
2291 __c
[__imm
& 7] = __b
;
2292 return (__m128i
)__c
;
2295 static __inline__
int __DEFAULT_FN_ATTRS
2296 _mm_movemask_epi8(__m128i __a
)
2298 return __builtin_ia32_pmovmskb128((__v16qi
)__a
);
/* Permutes the four 32-bit elements of a. imm must be a constant expression:
   bits [1:0] select the source element for result element 0, bits [3:2] for
   element 1, bits [5:4] for element 2 and bits [7:6] for element 3. The second
   shuffle operand is never selected (all indices are in 0..3). */
#define _mm_shuffle_epi32(a, imm) __extension__ ({ \
  (__m128i)__builtin_shufflevector((__v4si)(__m128i)(a), \
                                   (__v4si)_mm_undefined_si128(), \
                                   (imm) & 3, \
                                   ((imm) >> 2) & 3, \
                                   ((imm) >> 4) & 3, \
                                   ((imm) >> 6) & 3); })
/* Permutes the four low 16-bit elements of a and copies the four high
   elements unchanged. imm must be a constant expression: bits [1:0], [3:2],
   [5:4] and [7:6] select (among elements 0..3) the source for result elements
   0, 1, 2 and 3 respectively. The second shuffle operand is never selected. */
#define _mm_shufflelo_epi16(a, imm) __extension__ ({ \
  (__m128i)__builtin_shufflevector((__v8hi)(__m128i)(a), \
                                   (__v8hi)_mm_undefined_si128(), \
                                   ((imm) >> 0) & 0x3, ((imm) >> 2) & 0x3, \
                                   ((imm) >> 4) & 0x3, ((imm) >> 6) & 0x3, \
                                   4, 5, 6, 7); })
/* Permutes the four high 16-bit elements of a and copies the four low
   elements unchanged. imm must be a constant expression: bits [1:0], [3:2],
   [5:4] and [7:6] select (among elements 4..7) the source for result elements
   4, 5, 6 and 7 respectively. The second shuffle operand is never selected. */
#define _mm_shufflehi_epi16(a, imm) __extension__ ({ \
  (__m128i)__builtin_shufflevector((__v8hi)(__m128i)(a), \
                                   (__v8hi)_mm_undefined_si128(), \
                                   0, 1, 2, 3, \
                                   4 + (((imm) >> 0) & 0x3), \
                                   4 + (((imm) >> 2) & 0x3), \
                                   4 + (((imm) >> 4) & 0x3), \
                                   4 + (((imm) >> 6) & 0x3)); })
2323 static __inline__ __m128i __DEFAULT_FN_ATTRS
2324 _mm_unpackhi_epi8(__m128i __a
, __m128i __b
)
2326 return (__m128i
)__builtin_shufflevector((__v16qi
)__a
, (__v16qi
)__b
, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15);
2329 static __inline__ __m128i __DEFAULT_FN_ATTRS
2330 _mm_unpackhi_epi16(__m128i __a
, __m128i __b
)
2332 return (__m128i
)__builtin_shufflevector((__v8hi
)__a
, (__v8hi
)__b
, 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7);
2335 static __inline__ __m128i __DEFAULT_FN_ATTRS
2336 _mm_unpackhi_epi32(__m128i __a
, __m128i __b
)
2338 return (__m128i
)__builtin_shufflevector((__v4si
)__a
, (__v4si
)__b
, 2, 4+2, 3, 4+3);
2341 static __inline__ __m128i __DEFAULT_FN_ATTRS
2342 _mm_unpackhi_epi64(__m128i __a
, __m128i __b
)
2344 return (__m128i
)__builtin_shufflevector((__v2di
)__a
, (__v2di
)__b
, 1, 2+1);
2347 static __inline__ __m128i __DEFAULT_FN_ATTRS
2348 _mm_unpacklo_epi8(__m128i __a
, __m128i __b
)
2350 return (__m128i
)__builtin_shufflevector((__v16qi
)__a
, (__v16qi
)__b
, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7);
2353 static __inline__ __m128i __DEFAULT_FN_ATTRS
2354 _mm_unpacklo_epi16(__m128i __a
, __m128i __b
)
2356 return (__m128i
)__builtin_shufflevector((__v8hi
)__a
, (__v8hi
)__b
, 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3);
2359 static __inline__ __m128i __DEFAULT_FN_ATTRS
2360 _mm_unpacklo_epi32(__m128i __a
, __m128i __b
)
2362 return (__m128i
)__builtin_shufflevector((__v4si
)__a
, (__v4si
)__b
, 0, 4+0, 1, 4+1);
2365 static __inline__ __m128i __DEFAULT_FN_ATTRS
2366 _mm_unpacklo_epi64(__m128i __a
, __m128i __b
)
2368 return (__m128i
)__builtin_shufflevector((__v2di
)__a
, (__v2di
)__b
, 0, 2+0);
2371 static __inline__ __m64 __DEFAULT_FN_ATTRS
2372 _mm_movepi64_pi64(__m128i __a
)
2374 return (__m64
)__a
[0];
2377 static __inline__ __m128i __DEFAULT_FN_ATTRS
2378 _mm_movpi64_epi64(__m64 __a
)
2380 return (__m128i
){ (long long)__a
, 0 };
2383 static __inline__ __m128i __DEFAULT_FN_ATTRS
2384 _mm_move_epi64(__m128i __a
)
2386 return __builtin_shufflevector((__v2di
)__a
, (__m128i
){ 0 }, 0, 2);
2389 static __inline__ __m128d __DEFAULT_FN_ATTRS
2390 _mm_unpackhi_pd(__m128d __a
, __m128d __b
)
2392 return __builtin_shufflevector((__v2df
)__a
, (__v2df
)__b
, 1, 2+1);
2395 static __inline__ __m128d __DEFAULT_FN_ATTRS
2396 _mm_unpacklo_pd(__m128d __a
, __m128d __b
)
2398 return __builtin_shufflevector((__v2df
)__a
, (__v2df
)__b
, 0, 2+0);
2401 static __inline__
int __DEFAULT_FN_ATTRS
2402 _mm_movemask_pd(__m128d __a
)
2404 return __builtin_ia32_movmskpd((__v2df
)__a
);
/* Builds a vector of [2 x double] from one element of a and one element of b.
   i must be a constant expression: bit 0 selects which element of a becomes
   result element 0, bit 1 selects which element of b becomes result
   element 1. */
#define _mm_shuffle_pd(a, b, i) __extension__ ({ \
  (__m128d)__builtin_shufflevector((__v2df)(__m128d)(a), \
                                   (__v2df)(__m128d)(b), \
                                   ((i) & 1), \
                                   2 + (((i) >> 1) & 1)); })
2412 static __inline__ __m128 __DEFAULT_FN_ATTRS
2413 _mm_castpd_ps(__m128d __a
)
2418 static __inline__ __m128i __DEFAULT_FN_ATTRS
2419 _mm_castpd_si128(__m128d __a
)
2421 return (__m128i
)__a
;
2424 static __inline__ __m128d __DEFAULT_FN_ATTRS
2425 _mm_castps_pd(__m128 __a
)
2427 return (__m128d
)__a
;
2430 static __inline__ __m128i __DEFAULT_FN_ATTRS
2431 _mm_castps_si128(__m128 __a
)
2433 return (__m128i
)__a
;
2436 static __inline__ __m128 __DEFAULT_FN_ATTRS
2437 _mm_castsi128_ps(__m128i __a
)
2442 static __inline__ __m128d __DEFAULT_FN_ATTRS
2443 _mm_castsi128_pd(__m128i __a
)
2445 return (__m128d
)__a
;
2448 static __inline__
void __DEFAULT_FN_ATTRS
2451 __builtin_ia32_pause();
2454 #undef __DEFAULT_FN_ATTRS
/* Packs two selectors into one immediate: x lands in bit 1 and y in bit 0,
   matching the bit layout read by _mm_shuffle_pd(). */
#define _MM_SHUFFLE2(x, y) ((y) | ((x) << 1))
2458 #endif /* __EMMINTRIN_H */