/* Copyright (C) 2002 Free Software Foundation, Inc.

   This file is part of GNU CC.

   GNU CC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option)
   any later version.

   GNU CC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with GNU CC; see the file COPYING.  If not, write to
   the Free Software Foundation, 59 Temple Place - Suite 330,
   Boston, MA 02111-1307, USA.  */

/* As a special exception, if you include this header file into source
   files compiled by GCC, this header file does not by itself cause
   the resulting executable to be covered by the GNU General Public
   License.  This exception does not however invalidate any other
   reasons why the executable file might be covered by the GNU General
   Public License.  */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 5.0.  */

#ifndef _XMMINTRIN_H_INCLUDED
#define _XMMINTRIN_H_INCLUDED

/* We need type definitions from the MMX header file.  */
#include <mmintrin.h>

/* The data type intended for user use.  */
typedef int __m128 __attribute__ ((__mode__(__V4SF__)));

/* Internal data types for implementing the intrinsics.  */
typedef int __v4sf __attribute__ ((__mode__(__V4SF__)));
typedef int __v4si __attribute__ ((__mode__(__V4SI__)));

/* Create a selector for use with the SHUFPS instruction.  */
#define _MM_SHUFFLE(fp3,fp2,fp1,fp0) \
 (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | (fp0))
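
/* For example, _MM_SHUFFLE (3,2,1,0) computes
   (3 << 6) | (2 << 4) | (1 << 2) | 0 == 0xE4, the identity selector,
   and _MM_SHUFFLE (0,0,0,0) == 0x00 broadcasts element zero.  Each
   two-bit field selects the source element for one destination
   position, lowest field first.  */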

/* Constants for use with _mm_prefetch.  */
enum _mm_hint
{
  _MM_HINT_T0 = 3,
  _MM_HINT_T1 = 2,
  _MM_HINT_T2 = 1,
  _MM_HINT_NTA = 0
};

/* Bits in the MXCSR.  */
#define _MM_EXCEPT_MASK       0x003f
#define _MM_EXCEPT_INVALID    0x0001
#define _MM_EXCEPT_DENORM     0x0002
#define _MM_EXCEPT_DIV_ZERO   0x0004
#define _MM_EXCEPT_OVERFLOW   0x0008
#define _MM_EXCEPT_UNDERFLOW  0x0010
#define _MM_EXCEPT_INEXACT    0x0020

#define _MM_MASK_MASK         0x1f80
#define _MM_MASK_INVALID      0x0080
#define _MM_MASK_DENORM       0x0100
#define _MM_MASK_DIV_ZERO     0x0200
#define _MM_MASK_OVERFLOW     0x0400
#define _MM_MASK_UNDERFLOW    0x0800
#define _MM_MASK_INEXACT      0x1000

#define _MM_ROUND_MASK        0x6000
#define _MM_ROUND_NEAREST     0x0000
#define _MM_ROUND_DOWN        0x2000
#define _MM_ROUND_UP          0x4000
#define _MM_ROUND_TOWARD_ZERO 0x6000

#define _MM_FLUSH_ZERO_MASK   0x8000
#define _MM_FLUSH_ZERO_ON     0x8000
#define _MM_FLUSH_ZERO_OFF    0x0000

/* Perform the respective operation on the lower SPFP (single-precision
   floating-point) values of A and B; the upper three SPFP values are
   passed through from A.  */

static __inline __m128
_mm_add_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_addss ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128
_mm_sub_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_subss ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128
_mm_mul_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_mulss ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128
_mm_div_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_divss ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128
_mm_sqrt_ss (__m128 __A)
{
  return (__m128) __builtin_ia32_sqrtss ((__v4sf)__A);
}

static __inline __m128
_mm_rcp_ss (__m128 __A)
{
  return (__m128) __builtin_ia32_rcpss ((__v4sf)__A);
}

static __inline __m128
_mm_rsqrt_ss (__m128 __A)
{
  return (__m128) __builtin_ia32_rsqrtss ((__v4sf)__A);
}

static __inline __m128
_mm_min_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_minss ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128
_mm_max_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_maxss ((__v4sf)__A, (__v4sf)__B);
}

/* Perform the respective operation on the four SPFP values in A and B.  */

static __inline __m128
_mm_add_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_addps ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128
_mm_sub_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_subps ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128
_mm_mul_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_mulps ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128
_mm_div_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_divps ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128
_mm_sqrt_ps (__m128 __A)
{
  return (__m128) __builtin_ia32_sqrtps ((__v4sf)__A);
}

static __inline __m128
_mm_rcp_ps (__m128 __A)
{
  return (__m128) __builtin_ia32_rcpps ((__v4sf)__A);
}

static __inline __m128
_mm_rsqrt_ps (__m128 __A)
{
  return (__m128) __builtin_ia32_rsqrtps ((__v4sf)__A);
}

static __inline __m128
_mm_min_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_minps ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128
_mm_max_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_maxps ((__v4sf)__A, (__v4sf)__B);
}

/* Perform logical bit-wise operations on 128-bit values.  */

static __inline __m128
_mm_and_ps (__m128 __A, __m128 __B)
{
  return __builtin_ia32_andps (__A, __B);
}

static __inline __m128
_mm_andnot_ps (__m128 __A, __m128 __B)
{
  return __builtin_ia32_andnps (__A, __B);
}

static __inline __m128
_mm_or_ps (__m128 __A, __m128 __B)
{
  return __builtin_ia32_orps (__A, __B);
}

static __inline __m128
_mm_xor_ps (__m128 __A, __m128 __B)
{
  return __builtin_ia32_xorps (__A, __B);
}

/* Perform a comparison on the lower SPFP values of A and B.  If the
   comparison is true, place a mask of all ones in the result, otherwise a
   mask of zeros.  The upper three SPFP values are passed through from A.  */

static __inline __m128
_mm_cmpeq_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpeqss ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128
_mm_cmplt_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpltss ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128
_mm_cmple_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpless ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128
_mm_cmpgt_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpgtss ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128
_mm_cmpge_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpgess ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128
_mm_cmpneq_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpneqss ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128
_mm_cmpnlt_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpnltss ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128
_mm_cmpnle_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpnless ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128
_mm_cmpngt_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpngtss ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128
_mm_cmpnge_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpngess ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128
_mm_cmpord_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpordss ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128
_mm_cmpunord_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpunordss ((__v4sf)__A, (__v4sf)__B);
}

/* Perform a comparison on the four SPFP values of A and B.  For each
   element, if the comparison is true, place a mask of all ones in the
   result, otherwise a mask of zeros.  */

static __inline __m128
_mm_cmpeq_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpeqps ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128
_mm_cmplt_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpltps ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128
_mm_cmple_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpleps ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128
_mm_cmpgt_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpgtps ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128
_mm_cmpge_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpgeps ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128
_mm_cmpneq_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpneqps ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128
_mm_cmpnlt_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpnltps ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128
_mm_cmpnle_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpnleps ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128
_mm_cmpngt_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpngtps ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128
_mm_cmpnge_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpngeps ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128
_mm_cmpord_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpordps ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128
_mm_cmpunord_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpunordps ((__v4sf)__A, (__v4sf)__B);
}

/* Compare the lower SPFP values of A and B and return 1 if true
   and 0 if false.  */

static __inline int
_mm_comieq_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_comieq ((__v4sf)__A, (__v4sf)__B);
}

static __inline int
_mm_comilt_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_comilt ((__v4sf)__A, (__v4sf)__B);
}

static __inline int
_mm_comile_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_comile ((__v4sf)__A, (__v4sf)__B);
}

static __inline int
_mm_comigt_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_comigt ((__v4sf)__A, (__v4sf)__B);
}

static __inline int
_mm_comige_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_comige ((__v4sf)__A, (__v4sf)__B);
}

static __inline int
_mm_comineq_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_comineq ((__v4sf)__A, (__v4sf)__B);
}

static __inline int
_mm_ucomieq_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_ucomieq ((__v4sf)__A, (__v4sf)__B);
}

static __inline int
_mm_ucomilt_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_ucomilt ((__v4sf)__A, (__v4sf)__B);
}

static __inline int
_mm_ucomile_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_ucomile ((__v4sf)__A, (__v4sf)__B);
}

static __inline int
_mm_ucomigt_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_ucomigt ((__v4sf)__A, (__v4sf)__B);
}

static __inline int
_mm_ucomige_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_ucomige ((__v4sf)__A, (__v4sf)__B);
}

static __inline int
_mm_ucomineq_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_ucomineq ((__v4sf)__A, (__v4sf)__B);
}

/* Convert the lower SPFP value to a 32-bit integer according to the current
   rounding mode.  */
static __inline int
_mm_cvtss_si32 (__m128 __A)
{
  return __builtin_ia32_cvtss2si ((__v4sf) __A);
}

/* Convert the two lower SPFP values to 32-bit integers according to the
   current rounding mode.  Return the integers in packed form.  */
static __inline __m64
_mm_cvtps_pi32 (__m128 __A)
{
  return (__m64) __builtin_ia32_cvtps2pi ((__v4sf) __A);
}

/* Truncate the lower SPFP value to a 32-bit integer.  */
static __inline int
_mm_cvttss_si32 (__m128 __A)
{
  return __builtin_ia32_cvttss2si ((__v4sf) __A);
}

/* Truncate the two lower SPFP values to 32-bit integers.  Return the
   integers in packed form.  */
static __inline __m64
_mm_cvttps_pi32 (__m128 __A)
{
  return (__m64) __builtin_ia32_cvttps2pi ((__v4sf) __A);
}

/* Convert B to a SPFP value and insert it as element zero in A.  */
static __inline __m128
_mm_cvtsi32_ss (__m128 __A, int __B)
{
  return (__m128) __builtin_ia32_cvtsi2ss ((__v4sf) __A, __B);
}

/* Convert the two 32-bit values in B to SPFP form and insert them
   as the two lower elements in A.  */
static __inline __m128
_mm_cvtpi32_ps (__m128 __A, __m64 __B)
{
  return (__m128) __builtin_ia32_cvtpi2ps ((__v4sf) __A, (__v2si)__B);
}

/* Convert the four signed 16-bit values in A to SPFP form.  */
static __inline __m128
_mm_cvtpi16_ps (__m64 __A)
{
  __v4hi __sign;
  __v2si __hisi, __losi;
  __v4sf __r;

  /* This comparison against zero gives us a mask that can be used to
     fill in the missing sign bits in the unpack operations below, so
     that we get signed values after unpacking.  */
  __sign = (__v4hi) __builtin_ia32_mmx_zero ();
  __sign = __builtin_ia32_pcmpgtw (__sign, (__v4hi)__A);

  /* Convert the four words to doublewords.  */
  __hisi = (__v2si) __builtin_ia32_punpckhwd ((__v4hi)__A, __sign);
  __losi = (__v2si) __builtin_ia32_punpcklwd ((__v4hi)__A, __sign);

  /* Convert the doublewords to floating point two at a time.  */
  __r = (__v4sf) __builtin_ia32_setzerops ();
  __r = __builtin_ia32_cvtpi2ps (__r, __hisi);
  __r = __builtin_ia32_movlhps (__r, __r);
  __r = __builtin_ia32_cvtpi2ps (__r, __losi);

  return (__m128) __r;
}

/* Convert the four unsigned 16-bit values in A to SPFP form.  */
static __inline __m128
_mm_cvtpu16_ps (__m64 __A)
{
  __v4hi __zero = (__v4hi) __builtin_ia32_mmx_zero ();
  __v2si __hisi, __losi;
  __v4sf __r;

  /* Convert the four words to doublewords.  */
  __hisi = (__v2si) __builtin_ia32_punpckhwd ((__v4hi)__A, __zero);
  __losi = (__v2si) __builtin_ia32_punpcklwd ((__v4hi)__A, __zero);

  /* Convert the doublewords to floating point two at a time.  */
  __r = (__v4sf) __builtin_ia32_setzerops ();
  __r = __builtin_ia32_cvtpi2ps (__r, __hisi);
  __r = __builtin_ia32_movlhps (__r, __r);
  __r = __builtin_ia32_cvtpi2ps (__r, __losi);

  return (__m128) __r;
}

/* Convert the low four signed 8-bit values in A to SPFP form.  */
static __inline __m128
_mm_cvtpi8_ps (__m64 __A)
{
  __v8qi __sign;

  /* This comparison against zero gives us a mask that can be used to
     fill in the missing sign bits in the unpack operations below, so
     that we get signed values after unpacking.  */
  __sign = (__v8qi) __builtin_ia32_mmx_zero ();
  __sign = __builtin_ia32_pcmpgtb (__sign, (__v8qi)__A);

  /* Convert the four low bytes to words.  */
  __A = (__m64) __builtin_ia32_punpcklbw ((__v8qi)__A, __sign);

  return _mm_cvtpi16_ps(__A);
}

/* Convert the low four unsigned 8-bit values in A to SPFP form.  */
static __inline __m128
_mm_cvtpu8_ps(__m64 __A)
{
  __v8qi __zero = (__v8qi) __builtin_ia32_mmx_zero ();
  __A = (__m64) __builtin_ia32_punpcklbw ((__v8qi)__A, __zero);
  return _mm_cvtpu16_ps(__A);
}

/* Convert the four signed 32-bit values in A and B to SPFP form.  */
static __inline __m128
_mm_cvtpi32x2_ps(__m64 __A, __m64 __B)
{
  __v4sf __zero = (__v4sf) __builtin_ia32_setzerops ();
  __v4sf __sfa = __builtin_ia32_cvtpi2ps (__zero, (__v2si)__A);
  __v4sf __sfb = __builtin_ia32_cvtpi2ps (__zero, (__v2si)__B);
  return (__m128) __builtin_ia32_movlhps (__sfa, __sfb);
}

/* Convert the four SPFP values in A to four signed 16-bit integers.  */
static __inline __m64
_mm_cvtps_pi16(__m128 __A)
{
  __v4sf __hisf = (__v4sf)__A;
  __v4sf __losf = __builtin_ia32_movhlps (__hisf, __hisf);
  __v2si __hisi = __builtin_ia32_cvtps2pi (__hisf);
  __v2si __losi = __builtin_ia32_cvtps2pi (__losf);
  /* PACKSSDW packs its first operand into the low words, so the pair
     converted from the low half of A must come first.  */
  return (__m64) __builtin_ia32_packssdw (__hisi, __losi);
}

/* Convert the four SPFP values in A to four signed 8-bit integers.  */
static __inline __m64
_mm_cvtps_pi8(__m128 __A)
{
  __v4hi __tmp = (__v4hi) _mm_cvtps_pi16 (__A);
  __v4hi __zero = (__v4hi) __builtin_ia32_mmx_zero ();
  return (__m64) __builtin_ia32_packsswb (__tmp, __zero);
}

/* Selects four specific SPFP values from A and B based on MASK.  */
#if 0
static __inline __m128
_mm_shuffle_ps (__m128 __A, __m128 __B, int __mask)
{
  return (__m128) __builtin_ia32_shufps ((__v4sf)__A, (__v4sf)__B, __mask);
}
#else
#define _mm_shuffle_ps(A, B, MASK) \
 ((__m128) __builtin_ia32_shufps ((__v4sf)(A), (__v4sf)(B), (MASK)))
#endif

/* Selects and interleaves the upper two SPFP values from A and B.  */
static __inline __m128
_mm_unpackhi_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_unpckhps ((__v4sf)__A, (__v4sf)__B);
}

/* Selects and interleaves the lower two SPFP values from A and B.  */
static __inline __m128
_mm_unpacklo_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_unpcklps ((__v4sf)__A, (__v4sf)__B);
}

/* Sets the upper two SPFP values with 64-bits of data loaded from P;
   the lower two values are passed through from A.  */
static __inline __m128
_mm_loadh_pi (__m128 __A, __m64 *__P)
{
  return (__m128) __builtin_ia32_loadhps ((__v4sf)__A, (__v2si *)__P);
}

/* Stores the upper two SPFP values of A into P.  */
static __inline void
_mm_storeh_pi (__m64 *__P, __m128 __A)
{
  __builtin_ia32_storehps ((__v2si *)__P, (__v4sf)__A);
}

/* Moves the upper two values of B into the lower two values of A.  */
static __inline __m128
_mm_movehl_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_movhlps ((__v4sf)__A, (__v4sf)__B);
}

/* Moves the lower two values of B into the upper two values of A.  */
static __inline __m128
_mm_movelh_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_movlhps ((__v4sf)__A, (__v4sf)__B);
}

/* Sets the lower two SPFP values with 64-bits of data loaded from P;
   the upper two values are passed through from A.  */
static __inline __m128
_mm_loadl_pi (__m128 __A, __m64 *__P)
{
  return (__m128) __builtin_ia32_loadlps ((__v4sf)__A, (__v2si *)__P);
}

/* Stores the lower two SPFP values of A into P.  */
static __inline void
_mm_storel_pi (__m64 *__P, __m128 __A)
{
  __builtin_ia32_storelps ((__v2si *)__P, (__v4sf)__A);
}

/* Creates a 4-bit mask from the most significant bits of the SPFP values.  */
static __inline int
_mm_movemask_ps (__m128 __A)
{
  return __builtin_ia32_movmskps ((__v4sf)__A);
}

/* Return the contents of the control register.  */
static __inline unsigned int
_mm_getcsr (void)
{
  return __builtin_ia32_stmxcsr ();
}

/* Read exception bits from the control register.  */
static __inline unsigned int
_MM_GET_EXCEPTION_STATE (void)
{
  return _mm_getcsr() & _MM_EXCEPT_MASK;
}

static __inline unsigned int
_MM_GET_EXCEPTION_MASK (void)
{
  return _mm_getcsr() & _MM_MASK_MASK;
}

static __inline unsigned int
_MM_GET_ROUNDING_MODE (void)
{
  return _mm_getcsr() & _MM_ROUND_MASK;
}

static __inline unsigned int
_MM_GET_FLUSH_ZERO_MODE (void)
{
  return _mm_getcsr() & _MM_FLUSH_ZERO_MASK;
}

/* Set the control register to I.  */
static __inline void
_mm_setcsr (unsigned int __I)
{
  __builtin_ia32_ldmxcsr (__I);
}

/* Set exception bits in the control register.  */
static __inline void
_MM_SET_EXCEPTION_STATE(unsigned int __mask)
{
  _mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | __mask);
}

static __inline void
_MM_SET_EXCEPTION_MASK (unsigned int __mask)
{
  _mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | __mask);
}

static __inline void
_MM_SET_ROUNDING_MODE (unsigned int __mode)
{
  _mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | __mode);
}

static __inline void
_MM_SET_FLUSH_ZERO_MODE (unsigned int __mode)
{
  _mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | __mode);
}
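
/* Usage sketch (illustrative only; the __example_* name is
   hypothetical): temporarily switch rounding to truncation, then
   restore the caller's mode.  */
#if 0
static void
__example_round_toward_zero (void)
{
  unsigned int __saved = _MM_GET_ROUNDING_MODE ();
  _MM_SET_ROUNDING_MODE (_MM_ROUND_TOWARD_ZERO);
  /* ... SPFP arithmetic that should truncate ... */
  _MM_SET_ROUNDING_MODE (__saved);
}
#endif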

/* Create a vector with element 0 as *P and the rest zero.  */
static __inline __m128
_mm_load_ss (float *__P)
{
  return (__m128) __builtin_ia32_loadss (__P);
}

/* Create a vector with all four elements equal to *P.  */
static __inline __m128
_mm_load1_ps (float *__P)
{
  __v4sf __tmp = __builtin_ia32_loadss (__P);
  return (__m128) __builtin_ia32_shufps (__tmp, __tmp, _MM_SHUFFLE (0,0,0,0));
}

static __inline __m128
_mm_load_ps1 (float *__P)
{
  return _mm_load1_ps (__P);
}

/* Load four SPFP values from P.  The address must be 16-byte aligned.  */
static __inline __m128
_mm_load_ps (float *__P)
{
  return (__m128) __builtin_ia32_loadaps (__P);
}

/* Load four SPFP values from P.  The address need not be 16-byte aligned.  */
static __inline __m128
_mm_loadu_ps (float *__P)
{
  return (__m128) __builtin_ia32_loadups (__P);
}

/* Load four SPFP values in reverse order.  The address must be aligned.  */
static __inline __m128
_mm_loadr_ps (float *__P)
{
  __v4sf __tmp = __builtin_ia32_loadaps (__P);
  return (__m128) __builtin_ia32_shufps (__tmp, __tmp, _MM_SHUFFLE (0,1,2,3));
}

/* Create a vector with element 0 as F and the rest zero.  */
static __inline __m128
_mm_set_ss (float __F)
{
  return (__m128) __builtin_ia32_loadss (&__F);
}

/* Create a vector with all four elements equal to F.  */
static __inline __m128
_mm_set1_ps (float __F)
{
  __v4sf __tmp = __builtin_ia32_loadss (&__F);
  return (__m128) __builtin_ia32_shufps (__tmp, __tmp, _MM_SHUFFLE (0,0,0,0));
}

static __inline __m128
_mm_set_ps1 (float __F)
{
  return _mm_set1_ps (__F);
}

/* Create the vector [Z Y X W].  */
static __inline __m128
_mm_set_ps (float __Z, float __Y, float __X, float __W)
{
  union {
    float __a[4];
    __m128 __v;
  } __u;

  __u.__a[0] = __W;
  __u.__a[1] = __X;
  __u.__a[2] = __Y;
  __u.__a[3] = __Z;

  return __u.__v;
}

/* Create the vector [W X Y Z].  */
static __inline __m128
_mm_setr_ps (float __Z, float __Y, float __X, float __W)
{
  return _mm_set_ps (__W, __X, __Y, __Z);
}
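
/* For example, after _mm_set_ps (3.0f, 2.0f, 1.0f, 0.0f) element zero
   holds 0.0f, whereas _mm_setr_ps (3.0f, 2.0f, 1.0f, 0.0f) puts 3.0f
   in element zero; the two differ only in argument order.  */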

/* Create a vector of zeros.  */
static __inline __m128
_mm_setzero_ps (void)
{
  return (__m128) __builtin_ia32_setzerops ();
}

/* Stores the lower SPFP value.  */
static __inline void
_mm_store_ss (float *__P, __m128 __A)
{
  __builtin_ia32_storess (__P, (__v4sf)__A);
}

/* Store the lower SPFP value across four words.  */
static __inline void
_mm_store1_ps (float *__P, __m128 __A)
{
  __v4sf __va = (__v4sf)__A;
  __v4sf __tmp = __builtin_ia32_shufps (__va, __va, _MM_SHUFFLE (0,0,0,0));
  __builtin_ia32_storeaps (__P, __tmp);
}

static __inline void
_mm_store_ps1 (float *__P, __m128 __A)
{
  _mm_store1_ps (__P, __A);
}

/* Store four SPFP values.  The address must be 16-byte aligned.  */
static __inline void
_mm_store_ps (float *__P, __m128 __A)
{
  __builtin_ia32_storeaps (__P, (__v4sf)__A);
}

/* Store four SPFP values.  The address need not be 16-byte aligned.  */
static __inline void
_mm_storeu_ps (float *__P, __m128 __A)
{
  __builtin_ia32_storeups (__P, (__v4sf)__A);
}

/* Store four SPFP values in reverse order.  The address must be aligned.  */
static __inline void
_mm_storer_ps (float *__P, __m128 __A)
{
  __v4sf __va = (__v4sf)__A;
  __v4sf __tmp = __builtin_ia32_shufps (__va, __va, _MM_SHUFFLE (0,1,2,3));
  __builtin_ia32_storeaps (__P, __tmp);
}

/* Sets the low SPFP value of A from the low value of B.  */
static __inline __m128
_mm_move_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_movss ((__v4sf)__A, (__v4sf)__B);
}

/* Extracts one of the four words of A.  The selector N must be immediate.  */
#if 0
static __inline int
_mm_extract_pi16 (__m64 __A, int __N)
{
  return __builtin_ia32_pextrw ((__v4hi)__A, __N);
}
#else
#define _mm_extract_pi16(A, N) \
  __builtin_ia32_pextrw ((__v4hi)(A), (N))
#endif

/* Inserts word D into one of four words of A.  The selector N must be
   immediate.  */
#if 0
static __inline __m64
_mm_insert_pi16 (__m64 __A, int __D, int __N)
{
  return (__m64)__builtin_ia32_pinsrw ((__v4hi)__A, __D, __N);
}
#else
#define _mm_insert_pi16(A, D, N) \
  ((__m64) __builtin_ia32_pinsrw ((__v4hi)(A), (D), (N)))
#endif

/* Compute the element-wise maximum of signed 16-bit values.  */
static __inline __m64
_mm_max_pi16 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_ia32_pmaxsw ((__v4hi)__A, (__v4hi)__B);
}

/* Compute the element-wise maximum of unsigned 8-bit values.  */
static __inline __m64
_mm_max_pu8 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_ia32_pmaxub ((__v8qi)__A, (__v8qi)__B);
}

/* Compute the element-wise minimum of signed 16-bit values.  */
static __inline __m64
_mm_min_pi16 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_ia32_pminsw ((__v4hi)__A, (__v4hi)__B);
}

/* Compute the element-wise minimum of unsigned 8-bit values.  */
static __inline __m64
_mm_min_pu8 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_ia32_pminub ((__v8qi)__A, (__v8qi)__B);
}

/* Create an 8-bit mask of the signs of 8-bit values.  */
static __inline int
_mm_movemask_pi8 (__m64 __A)
{
  return __builtin_ia32_pmovmskb ((__v8qi)__A);
}

/* Multiply four unsigned 16-bit values in A by four unsigned 16-bit values
   in B and produce the high 16 bits of the 32-bit results.  */
static __inline __m64
_mm_mulhi_pu16 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_ia32_pmulhuw ((__v4hi)__A, (__v4hi)__B);
}

/* Return a combination of the four 16-bit values in A.  The selector
   must be an immediate.  */
#if 0
static __inline __m64
_mm_shuffle_pi16 (__m64 __A, int __N)
{
  return (__m64) __builtin_ia32_pshufw ((__v4hi)__A, __N);
}
#else
#define _mm_shuffle_pi16(A, N) \
  ((__m64) __builtin_ia32_pshufw ((__v4hi)(A), (N)))
#endif

/* Conditionally store byte elements of A into P.  The high bit of each
   byte in the selector N determines whether the corresponding byte from
   A is stored.  */
static __inline void
_mm_maskmove_si64 (__m64 __A, __m64 __N, char *__P)
{
  __builtin_ia32_maskmovq ((__v8qi)__A, (__v8qi)__N, __P);
}
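
/* Usage sketch (illustrative only; the __example_* name is hypothetical
   and _mm_setr_pi8 is assumed from <mmintrin.h>): store only the low
   four bytes of A by giving those selector bytes a set high bit.  */
#if 0
static void
__example_store_low_half (__m64 __a, char *__p)
{
  __m64 __mask = _mm_setr_pi8 (-1, -1, -1, -1, 0, 0, 0, 0);
  _mm_maskmove_si64 (__a, __mask, __p);
}
#endif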

/* Compute the rounded averages of the unsigned 8-bit values in A and B.  */
static __inline __m64
_mm_avg_pu8 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_ia32_pavgb ((__v8qi)__A, (__v8qi)__B);
}

/* Compute the rounded averages of the unsigned 16-bit values in A and B.  */
static __inline __m64
_mm_avg_pu16 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_ia32_pavgw ((__v4hi)__A, (__v4hi)__B);
}

/* Compute the sum of the absolute differences of the unsigned 8-bit
   values in A and B.  Return the value in the lower 16-bit word; the
   upper words are cleared.  */
static __inline __m64
_mm_sad_pu8 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_ia32_psadbw ((__v8qi)__A, (__v8qi)__B);
}

/* Loads one cache line from address P to a location "closer" to the
   processor.  The selector I specifies the type of prefetch operation.  */
#if 0
static __inline void
_mm_prefetch (void *__P, enum _mm_hint __I)
{
  __builtin_prefetch (__P, 0, __I);
}
#else
#define _mm_prefetch(P, I) \
  __builtin_prefetch ((P), 0, (I))
#endif
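
/* Usage sketch (illustrative only; the __example_* name and the
   look-ahead distance are hypothetical): prefetch well ahead of the
   element currently being summed.  */
#if 0
static float
__example_sum (const float *__p, int __n)
{
  float __s = 0.0f;
  int __i;
  for (__i = 0; __i < __n; ++__i)
    {
      if ((__i & 15) == 0)
	_mm_prefetch ((void *)(__p + __i + 64), _MM_HINT_T0);
      __s += __p[__i];
    }
  return __s;
}
#endif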

/* Stores the data in A to the address P without polluting the caches.  */
static __inline void
_mm_stream_pi (__m64 *__P, __m64 __A)
{
  __builtin_ia32_movntq (__P, __A);
}

/* Likewise.  The address must be 16-byte aligned.  */
static __inline void
_mm_stream_ps (float *__P, __m128 __A)
{
  __builtin_ia32_movntps (__P, (__v4sf)__A);
}

/* Guarantees that every preceding store is globally visible before
   any subsequent store.  */
static __inline void
_mm_sfence (void)
{
  __builtin_ia32_sfence ();
}
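
/* Usage sketch (illustrative only; the __example_* name is
   hypothetical): make a non-temporal store visible before publishing
   a flag that readers poll.  */
#if 0
static void
__example_publish (float *__dst, __m128 __v, volatile int *__flag)
{
  _mm_stream_ps (__dst, __v);	/* store bypassing the cache */
  _mm_sfence ();		/* order it before the flag write */
  *__flag = 1;
}
#endif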

/* The execution of the next instruction is delayed by an implementation
   specific amount of time.  The instruction does not modify the
   architectural state.  */
static __inline void
_mm_pause (void)
{
  __asm__ __volatile__ ("rep; nop" : : );
}

/* Transpose the 4x4 matrix composed of row[0-3].  */
#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3)			\
do {									\
  __v4sf __r0 = (row0), __r1 = (row1), __r2 = (row2), __r3 = (row3);	\
  __v4sf __t0 = __builtin_ia32_shufps (__r0, __r1, 0x44);		\
  __v4sf __t1 = __builtin_ia32_shufps (__r0, __r1, 0xEE);		\
  __v4sf __t2 = __builtin_ia32_shufps (__r2, __r3, 0x44);		\
  __v4sf __t3 = __builtin_ia32_shufps (__r2, __r3, 0xEE);		\
  (row0) = __builtin_ia32_shufps (__t0, __t2, 0x88);			\
  (row1) = __builtin_ia32_shufps (__t0, __t2, 0xDD);			\
  (row2) = __builtin_ia32_shufps (__t1, __t3, 0x88);			\
  (row3) = __builtin_ia32_shufps (__t1, __t3, 0xDD);			\
} while (0)
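
/* Usage sketch (illustrative only; the __example_* name is
   hypothetical): transpose four rows held in an array.  */
#if 0
static void
__example_transpose (__m128 __row[4])
{
  _MM_TRANSPOSE4_PS (__row[0], __row[1], __row[2], __row[3]);
}
#endif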

#ifdef __SSE2__
/* SSE2 */
typedef int __v2df __attribute__ ((mode (V2DF)));
typedef int __v2di __attribute__ ((mode (V2DI)));
typedef int __v8hi __attribute__ ((mode (V8HI)));
typedef int __v16qi __attribute__ ((mode (V16QI)));

#define __m128i __m128
#define __m128d __v2df

static __inline __m128d
_mm_add_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_addpd ((__v2df)__A, (__v2df)__B);
}

static __inline __m128d
_mm_add_sd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_addsd ((__v2df)__A, (__v2df)__B);
}

static __inline __m128d
_mm_sub_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_subpd ((__v2df)__A, (__v2df)__B);
}

static __inline __m128d
_mm_sub_sd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_subsd ((__v2df)__A, (__v2df)__B);
}

static __inline __m128d
_mm_mul_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_mulpd ((__v2df)__A, (__v2df)__B);
}

static __inline __m128d
_mm_mul_sd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_mulsd ((__v2df)__A, (__v2df)__B);
}

static __inline __m128d
_mm_div_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_divpd ((__v2df)__A, (__v2df)__B);
}

static __inline __m128d
_mm_div_sd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_divsd ((__v2df)__A, (__v2df)__B);
}

static __inline __m128d
_mm_sqrt_pd (__m128d __A)
{
  return (__m128d)__builtin_ia32_sqrtpd ((__v2df)__A);
}

static __inline __m128d
_mm_sqrt_sd (__m128d __A)
{
  return (__m128d)__builtin_ia32_sqrtsd ((__v2df)__A);
}

static __inline __m128d
_mm_min_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_minpd ((__v2df)__A, (__v2df)__B);
}

static __inline __m128d
_mm_min_sd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_minsd ((__v2df)__A, (__v2df)__B);
}

static __inline __m128d
_mm_max_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_maxpd ((__v2df)__A, (__v2df)__B);
}

static __inline __m128d
_mm_max_sd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_maxsd ((__v2df)__A, (__v2df)__B);
}

static __inline __m128d
_mm_and_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_andpd ((__v2df)__A, (__v2df)__B);
}

static __inline __m128d
_mm_andnot_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_andnpd ((__v2df)__A, (__v2df)__B);
}

static __inline __m128d
_mm_or_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_orpd ((__v2df)__A, (__v2df)__B);
}

static __inline __m128d
_mm_xor_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_xorpd ((__v2df)__A, (__v2df)__B);
}

static __inline __m128d
_mm_cmpeq_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmpeqpd ((__v2df)__A, (__v2df)__B);
}

static __inline __m128d
_mm_cmplt_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmpltpd ((__v2df)__A, (__v2df)__B);
}

static __inline __m128d
_mm_cmple_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmplepd ((__v2df)__A, (__v2df)__B);
}

static __inline __m128d
_mm_cmpgt_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmpgtpd ((__v2df)__A, (__v2df)__B);
}

static __inline __m128d
_mm_cmpge_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmpgepd ((__v2df)__A, (__v2df)__B);
}

static __inline __m128d
_mm_cmpneq_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmpneqpd ((__v2df)__A, (__v2df)__B);
}

static __inline __m128d
_mm_cmpnlt_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmpnltpd ((__v2df)__A, (__v2df)__B);
}

static __inline __m128d
_mm_cmpnle_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmpnlepd ((__v2df)__A, (__v2df)__B);
}

static __inline __m128d
_mm_cmpngt_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmpngtpd ((__v2df)__A, (__v2df)__B);
}

static __inline __m128d
_mm_cmpnge_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmpngepd ((__v2df)__A, (__v2df)__B);
}

static __inline __m128d
_mm_cmpord_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmpordpd ((__v2df)__A, (__v2df)__B);
}

static __inline __m128d
_mm_cmpunord_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmpunordpd ((__v2df)__A, (__v2df)__B);
}

static __inline __m128d
_mm_cmpeq_sd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmpeqsd ((__v2df)__A, (__v2df)__B);
}

static __inline __m128d
_mm_cmplt_sd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmpltsd ((__v2df)__A, (__v2df)__B);
}

static __inline __m128d
_mm_cmple_sd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmplesd ((__v2df)__A, (__v2df)__B);
}

static __inline __m128d
_mm_cmpgt_sd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmpgtsd ((__v2df)__A, (__v2df)__B);
}

static __inline __m128d
_mm_cmpge_sd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmpgesd ((__v2df)__A, (__v2df)__B);
}

static __inline __m128d
_mm_cmpneq_sd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmpneqsd ((__v2df)__A, (__v2df)__B);
}

static __inline __m128d
_mm_cmpnlt_sd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmpnltsd ((__v2df)__A, (__v2df)__B);
}

static __inline __m128d
_mm_cmpnle_sd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmpnlesd ((__v2df)__A, (__v2df)__B);
}

static __inline __m128d
_mm_cmpngt_sd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmpngtsd ((__v2df)__A, (__v2df)__B);
}

static __inline __m128d
_mm_cmpnge_sd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmpngesd ((__v2df)__A, (__v2df)__B);
}

static __inline __m128d
_mm_cmpord_sd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmpordsd ((__v2df)__A, (__v2df)__B);
}

static __inline __m128d
_mm_cmpunord_sd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmpunordsd ((__v2df)__A, (__v2df)__B);
}

static __inline int
_mm_comieq_sd (__m128d __A, __m128d __B)
{
  return __builtin_ia32_comisdeq ((__v2df)__A, (__v2df)__B);
}

static __inline int
_mm_comilt_sd (__m128d __A, __m128d __B)
{
  return __builtin_ia32_comisdlt ((__v2df)__A, (__v2df)__B);
}

static __inline int
_mm_comile_sd (__m128d __A, __m128d __B)
{
  return __builtin_ia32_comisdle ((__v2df)__A, (__v2df)__B);
}

static __inline int
_mm_comigt_sd (__m128d __A, __m128d __B)
{
  return __builtin_ia32_comisdgt ((__v2df)__A, (__v2df)__B);
}

static __inline int
_mm_comige_sd (__m128d __A, __m128d __B)
{
  return __builtin_ia32_comisdge ((__v2df)__A, (__v2df)__B);
}

static __inline int
_mm_comineq_sd (__m128d __A, __m128d __B)
{
  return __builtin_ia32_comisdneq ((__v2df)__A, (__v2df)__B);
}

static __inline int
_mm_ucomieq_sd (__m128d __A, __m128d __B)
{
  return __builtin_ia32_ucomisdeq ((__v2df)__A, (__v2df)__B);
}

static __inline int
_mm_ucomilt_sd (__m128d __A, __m128d __B)
{
  return __builtin_ia32_ucomisdlt ((__v2df)__A, (__v2df)__B);
}

static __inline int
_mm_ucomile_sd (__m128d __A, __m128d __B)
{
  return __builtin_ia32_ucomisdle ((__v2df)__A, (__v2df)__B);
}

static __inline int
_mm_ucomigt_sd (__m128d __A, __m128d __B)
{
  return __builtin_ia32_ucomisdgt ((__v2df)__A, (__v2df)__B);
}

static __inline int
_mm_ucomige_sd (__m128d __A, __m128d __B)
{
  return __builtin_ia32_ucomisdge ((__v2df)__A, (__v2df)__B);
}

static __inline int
_mm_ucomineq_sd (__m128d __A, __m128d __B)
{
  return __builtin_ia32_ucomisdneq ((__v2df)__A, (__v2df)__B);
}

static __inline __m128d
_mm_cvtepi32_pd (__m128i __A)
{
  return (__m128d)__builtin_ia32_cvtdq2pd ((__v4si) __A);
}

static __inline __m128
_mm_cvtepi32_ps (__m128i __A)
{
  return (__m128)__builtin_ia32_cvtdq2ps ((__v4si) __A);
}

static __inline __m128i
_mm_cvtpd_epi32 (__m128d __A)
{
  return (__m128i)__builtin_ia32_cvtpd2dq ((__v2df) __A);
}

static __inline __m64
_mm_cvtpd_pi32 (__m128d __A)
{
  return (__m64)__builtin_ia32_cvtpd2pi ((__v2df) __A);
}

static __inline __m128
_mm_cvtpd_ps (__m128d __A)
{
  return (__m128)__builtin_ia32_cvtpd2ps ((__v2df) __A);
}

static __inline __m128i
_mm_cvttpd_epi32 (__m128d __A)
{
  return (__m128i)__builtin_ia32_cvttpd2dq ((__v2df) __A);
}

static __inline __m64
_mm_cvttpd_pi32 (__m128d __A)
{
  return (__m64)__builtin_ia32_cvttpd2pi ((__v2df) __A);
}

static __inline __m128d
_mm_cvtpi32_pd (__m64 __A)
{
  return (__m128d)__builtin_ia32_cvtpi2pd ((__v2si) __A);
}

static __inline __m128i
_mm_cvtps_epi32 (__m128 __A)
{
  return (__m128i)__builtin_ia32_cvtps2dq ((__v4sf) __A);
}

static __inline __m128i
_mm_cvttps_epi32 (__m128 __A)
{
  return (__m128i)__builtin_ia32_cvttps2dq ((__v4sf) __A);
}

static __inline __m128d
_mm_cvtps_pd (__m128 __A)
{
  return (__m128d)__builtin_ia32_cvtps2pd ((__v4sf) __A);
}

static __inline int
_mm_cvtsd_si32 (__m128d __A)
{
  return __builtin_ia32_cvtsd2si ((__v2df) __A);
}

static __inline int
_mm_cvttsd_si32 (__m128d __A)
{
  return __builtin_ia32_cvttsd2si ((__v2df) __A);
}

static __inline __m128
_mm_cvtsd_ss (__m128 __A, __m128d __B)
{
  return (__m128)__builtin_ia32_cvtsd2ss ((__v4sf) __A, (__v2df) __B);
}

static __inline __m128d
_mm_cvtsi32_sd (__m128d __A, int __B)
{
  return (__m128d)__builtin_ia32_cvtsi2sd ((__v2df) __A, __B);
}

static __inline __m128d
_mm_cvtss_sd (__m128d __A, __m128 __B)
{
  return (__m128d)__builtin_ia32_cvtss2sd ((__v2df) __A, (__v4sf)__B);
}

#define _mm_shuffle_pd(__A, __B, __C) ((__m128d)__builtin_ia32_shufpd ((__v2df)(__A), (__v2df)(__B), (__C)))

static __inline __m128d
_mm_unpackhi_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_unpckhpd ((__v2df)__A, (__v2df)__B);
}

static __inline __m128d
_mm_unpacklo_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_unpcklpd ((__v2df)__A, (__v2df)__B);
}

static __inline __m128d
_mm_loadh_pd (__m128d __A, __m128d *__B)
{
  return (__m128d)__builtin_ia32_loadhpd ((__v2df)__A, (__v2si *)__B);
}

static __inline void
_mm_storeh_pd (__m128d *__A, __m128d __B)
{
  __builtin_ia32_storehpd ((__v2si *)__A, (__v2df)__B);
}

static __inline __m128d
_mm_loadl_pd (__m128d __A, __m128d *__B)
{
  return (__m128d)__builtin_ia32_loadlpd ((__v2df)__A, (__v2si *)__B);
}

static __inline void
_mm_storel_pd (__m128d *__A, __m128d __B)
{
  __builtin_ia32_storelpd ((__v2si *)__A, (__v2df)__B);
}

static __inline int
_mm_movemask_pd (__m128d __A)
{
  return __builtin_ia32_movmskpd ((__v2df)__A);
}

static __inline __m128i
_mm_packs_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_packsswb128 ((__v8hi)__A, (__v8hi)__B);
}

static __inline __m128i
_mm_packs_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_packssdw128 ((__v4si)__A, (__v4si)__B);
}

static __inline __m128i
_mm_packus_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_packuswb128 ((__v8hi)__A, (__v8hi)__B);
}

static __inline __m128i
_mm_unpackhi_epi8 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_punpckhbw128 ((__v16qi)__A, (__v16qi)__B);
}

static __inline __m128i
_mm_unpackhi_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_punpckhwd128 ((__v8hi)__A, (__v8hi)__B);
}

static __inline __m128i
_mm_unpackhi_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_punpckhdq128 ((__v4si)__A, (__v4si)__B);
}

static __inline __m128i
_mm_unpacklo_epi8 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_punpcklbw128 ((__v16qi)__A, (__v16qi)__B);
}

static __inline __m128i
_mm_unpacklo_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_punpcklwd128 ((__v8hi)__A, (__v8hi)__B);
}

static __inline __m128i
_mm_unpacklo_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_punpckldq128 ((__v4si)__A, (__v4si)__B);
}

static __inline __m128i
_mm_add_epi8 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_paddb128 ((__v16qi)__A, (__v16qi)__B);
}

static __inline __m128i
_mm_add_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_paddw128 ((__v8hi)__A, (__v8hi)__B);
}

static __inline __m128i
_mm_add_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_paddd128 ((__v4si)__A, (__v4si)__B);
}

static __inline __m128i
_mm_add_epi64 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_paddq128 ((__v2di)__A, (__v2di)__B);
}

static __inline __m128i
_mm_adds_epi8 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_paddsb128 ((__v16qi)__A, (__v16qi)__B);
}

static __inline __m128i
_mm_adds_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_paddsw128 ((__v8hi)__A, (__v8hi)__B);
}

static __inline __m128i
_mm_adds_epu8 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_paddusb128 ((__v16qi)__A, (__v16qi)__B);
}

static __inline __m128i
_mm_adds_epu16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_paddusw128 ((__v8hi)__A, (__v8hi)__B);
}

static __inline __m128i
_mm_sub_epi8 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_psubb128 ((__v16qi)__A, (__v16qi)__B);
}

static __inline __m128i
_mm_sub_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_psubw128 ((__v8hi)__A, (__v8hi)__B);
}

static __inline __m128i
_mm_sub_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_psubd128 ((__v4si)__A, (__v4si)__B);
}

static __inline __m128i
_mm_sub_epi64 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_psubq128 ((__v2di)__A, (__v2di)__B);
}

static __inline __m128i
_mm_subs_epi8 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_psubsb128 ((__v16qi)__A, (__v16qi)__B);
}

static __inline __m128i
_mm_subs_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_psubsw128 ((__v8hi)__A, (__v8hi)__B);
}

static __inline __m128i
_mm_subs_epu8 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_psubusb128 ((__v16qi)__A, (__v16qi)__B);
}

static __inline __m128i
_mm_subs_epu16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_psubusw128 ((__v8hi)__A, (__v8hi)__B);
}

static __inline __m128i
_mm_madd_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pmaddwd128 ((__v8hi)__A, (__v8hi)__B);
}

static __inline __m128i
_mm_mulhi_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pmulhw128 ((__v8hi)__A, (__v8hi)__B);
}

static __inline __m128i
_mm_mullo_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pmullw128 ((__v8hi)__A, (__v8hi)__B);
}

static __inline __m64
_mm_mul_pu16 (__m64 __A, __m64 __B)
{
  return (__m64)__builtin_ia32_pmuludq ((__v2si)__A, (__v2si)__B);
}

static __inline __m128i
_mm_mul_epu16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pmuludq128 ((__v4si)__A, (__v4si)__B);
}

static __inline __m128i
_mm_sll_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_psllw128 ((__v8hi)__A, (__v2di)__B);
}

static __inline __m128i
_mm_sll_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pslld128 ((__v4si)__A, (__v2di)__B);
}

static __inline __m128i
_mm_sll_epi64 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_psllq128 ((__v2di)__A, (__v2di)__B);
}

static __inline __m128i
_mm_sra_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_psraw128 ((__v8hi)__A, (__v2di)__B);
}

static __inline __m128i
_mm_sra_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_psrad128 ((__v4si)__A, (__v2di)__B);
}

static __inline __m128i
_mm_srl_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_psrlw128 ((__v8hi)__A, (__v2di)__B);
}

static __inline __m128i
_mm_srl_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_psrld128 ((__v4si)__A, (__v2di)__B);
}

static __inline __m128i
_mm_srl_epi64 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_psrlq128 ((__v2di)__A, (__v2di)__B);
}

static __inline __m128i
_mm_slli_epi16 (__m128i __A, int __B)
{
  return (__m128i)__builtin_ia32_psllwi128 ((__v8hi)__A, __B);
}

static __inline __m128i
_mm_slli_epi32 (__m128i __A, int __B)
{
  return (__m128i)__builtin_ia32_pslldi128 ((__v4si)__A, __B);
}

static __inline __m128i
_mm_slli_epi64 (__m128i __A, int __B)
{
  return (__m128i)__builtin_ia32_psllqi128 ((__v2di)__A, __B);
}

static __inline __m128i
_mm_srai_epi16 (__m128i __A, int __B)
{
  return (__m128i)__builtin_ia32_psrawi128 ((__v8hi)__A, __B);
}

static __inline __m128i
_mm_srai_epi32 (__m128i __A, int __B)
{
  return (__m128i)__builtin_ia32_psradi128 ((__v4si)__A, __B);
}

static __inline __m128i
_mm_srli_epi16 (__m128i __A, int __B)
{
  return (__m128i)__builtin_ia32_psrlwi128 ((__v8hi)__A, __B);
}

static __inline __m128i
_mm_srli_epi32 (__m128i __A, int __B)
{
  return (__m128i)__builtin_ia32_psrldi128 ((__v4si)__A, __B);
}

static __inline __m128i
_mm_srli_epi64 (__m128i __A, int __B)
{
  return (__m128i)__builtin_ia32_psrlqi128 ((__v2di)__A, __B);
}

static __inline __m128i
_mm_and_si128 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pand128 ((__v2di)__A, (__v2di)__B);
}

static __inline __m128i
_mm_andnot_si128 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pandn128 ((__v2di)__A, (__v2di)__B);
}

static __inline __m128i
_mm_or_si128 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_por128 ((__v2di)__A, (__v2di)__B);
}

static __inline __m128i
_mm_xor_si128 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pxor128 ((__v2di)__A, (__v2di)__B);
}

static __inline __m128i
_mm_cmpeq_epi8 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pcmpeqb128 ((__v16qi)__A, (__v16qi)__B);
}

static __inline __m128i
_mm_cmpeq_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pcmpeqw128 ((__v8hi)__A, (__v8hi)__B);
}

static __inline __m128i
_mm_cmpeq_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pcmpeqd128 ((__v4si)__A, (__v4si)__B);
}

static __inline __m128i
_mm_cmpgt_epi8 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pcmpgtb128 ((__v16qi)__A, (__v16qi)__B);
}

static __inline __m128i
_mm_cmpgt_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pcmpgtw128 ((__v8hi)__A, (__v8hi)__B);
}

static __inline __m128i
_mm_cmpgt_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pcmpgtd128 ((__v4si)__A, (__v4si)__B);
}

#define _mm_extract_epi16(__A, __B) __builtin_ia32_pextrw128 ((__v8hi)(__A), (__B))

#define _mm_insert_epi16(__A, __B, __C) ((__m128i)__builtin_ia32_pinsrw128 ((__v8hi)(__A), (__B), (__C)))

static __inline __m128i
_mm_max_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pmaxsw128 ((__v8hi)__A, (__v8hi)__B);
}

static __inline __m128i
_mm_max_epu8 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pmaxub128 ((__v16qi)__A, (__v16qi)__B);
}

static __inline __m128i
_mm_min_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pminsw128 ((__v8hi)__A, (__v8hi)__B);
}

static __inline __m128i
_mm_min_epu8 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pminub128 ((__v16qi)__A, (__v16qi)__B);
}

static __inline int
_mm_movemask_epi8 (__m128i __A)
{
  return __builtin_ia32_pmovmskb128 ((__v16qi)__A);
}

static __inline __m128i
_mm_mulhi_epu16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pmulhuw128 ((__v8hi)__A, (__v8hi)__B);
}

#define _mm_shufflehi_epi16(__A, __B) ((__m128i)__builtin_ia32_pshufhw128 ((__v8hi)(__A), (__B)))
#define _mm_shufflelo_epi16(__A, __B) ((__m128i)__builtin_ia32_pshuflw128 ((__v8hi)(__A), (__B)))
#define _mm_shuffle_epi32(__A, __B) ((__m128i)__builtin_ia32_pshufd ((__v4si)(__A), (__B)))

static __inline void
_mm_maskmoveu_si128 (__m128i __A, __m128i __B, char *__C)
{
  __builtin_ia32_maskmovdqu ((__v16qi)__A, (__v16qi)__B, __C);
}

static __inline __m128i
_mm_avg_epu8 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pavgb128 ((__v16qi)__A, (__v16qi)__B);
}

static __inline __m128i
_mm_avg_epu16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pavgw128 ((__v8hi)__A, (__v8hi)__B);
}

static __inline __m128i
_mm_sad_epu8 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_psadbw128 ((__v16qi)__A, (__v16qi)__B);
}

static __inline void
_mm_stream_si32 (int *__A, int __B)
{
  __builtin_ia32_movnti (__A, __B);
}

static __inline void
_mm_stream_si128 (__m128i *__A, __m128i __B)
{
  __builtin_ia32_movntdq ((__v2di *)__A, (__v2di)__B);
}

static __inline void
_mm_stream_pd (__m128d *__A, __m128d __B)
{
  __builtin_ia32_movntpd (__A, (__v2df)__B);
}

static __inline __m128i
_mm_movpi64_epi64 (__m64 __A)
{
  return (__m128i)__builtin_ia32_movq2dq ((unsigned long long)__A);
}

static __inline void
_mm_clflush (void *__A)
{
  __builtin_ia32_clflush (__A);
}

static __inline void
_mm_lfence (void)
{
  __builtin_ia32_lfence ();
}

static __inline void
_mm_mfence (void)
{
  __builtin_ia32_mfence ();
}

#endif /* __SSE2__ */

#endif /* _XMMINTRIN_H_INCLUDED */