/* Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with GCC; see the file COPYING.  If not, write to
   the Free Software Foundation, 59 Temple Place - Suite 330,
   Boston, MA 02111-1307, USA.  */

/* As a special exception, if you include this header file into source
   files compiled by GCC, this header file does not by itself cause
   the resulting executable to be covered by the GNU General Public
   License.  This exception does not however invalidate any other
   reasons why the executable file might be covered by the GNU General
   Public License.  */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 8.0.  */

#ifndef _XMMINTRIN_H_INCLUDED
#define _XMMINTRIN_H_INCLUDED

#ifndef __SSE__
# error "SSE instruction set not enabled"
#else

/* We need type definitions from the MMX header file.  */
#include <mmintrin.h>

/* Get _mm_malloc () and _mm_free ().  */
#include <mm_malloc.h>
/* The data type intended for user use.  */
typedef float __m128 __attribute__ ((__vector_size__ (16)));

/* Internal data types for implementing the intrinsics.  */
typedef float __v4sf __attribute__ ((__vector_size__ (16)));

/* Create a selector for use with the SHUFPS instruction.  */
#define _MM_SHUFFLE(fp3,fp2,fp1,fp0) \
 (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | (fp0))

/* Constants for use with _mm_prefetch.  */
enum _mm_hint
{
  _MM_HINT_T0 = 3,
  _MM_HINT_T1 = 2,
  _MM_HINT_T2 = 1,
  _MM_HINT_NTA = 0
};
/* Bits in the MXCSR.  */
#define _MM_EXCEPT_MASK       0x003f
#define _MM_EXCEPT_INVALID    0x0001
#define _MM_EXCEPT_DENORM     0x0002
#define _MM_EXCEPT_DIV_ZERO   0x0004
#define _MM_EXCEPT_OVERFLOW   0x0008
#define _MM_EXCEPT_UNDERFLOW  0x0010
#define _MM_EXCEPT_INEXACT    0x0020

#define _MM_MASK_MASK         0x1f80
#define _MM_MASK_INVALID      0x0080
#define _MM_MASK_DENORM       0x0100
#define _MM_MASK_DIV_ZERO     0x0200
#define _MM_MASK_OVERFLOW     0x0400
#define _MM_MASK_UNDERFLOW    0x0800
#define _MM_MASK_INEXACT      0x1000

#define _MM_ROUND_MASK        0x6000
#define _MM_ROUND_NEAREST     0x0000
#define _MM_ROUND_DOWN        0x2000
#define _MM_ROUND_UP          0x4000
#define _MM_ROUND_TOWARD_ZERO 0x6000

#define _MM_FLUSH_ZERO_MASK   0x8000
#define _MM_FLUSH_ZERO_ON     0x8000
#define _MM_FLUSH_ZERO_OFF    0x0000

/* Create a vector of zeros.  */
static __inline __m128
_mm_setzero_ps (void)
{
  return (__m128){ 0.0f, 0.0f, 0.0f, 0.0f };
}
/* Perform the respective operation on the lower SPFP (single-precision
   floating-point) values of A and B; the upper three SPFP values are
   passed through from A.  */

static __inline __m128
_mm_add_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_addss ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128
_mm_sub_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_subss ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128
_mm_mul_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_mulss ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128
_mm_div_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_divss ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128
_mm_sqrt_ss (__m128 __A)
{
  return (__m128) __builtin_ia32_sqrtss ((__v4sf)__A);
}

static __inline __m128
_mm_rcp_ss (__m128 __A)
{
  return (__m128) __builtin_ia32_rcpss ((__v4sf)__A);
}

static __inline __m128
_mm_rsqrt_ss (__m128 __A)
{
  return (__m128) __builtin_ia32_rsqrtss ((__v4sf)__A);
}

static __inline __m128
_mm_min_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_minss ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128
_mm_max_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_maxss ((__v4sf)__A, (__v4sf)__B);
}

/* Perform the respective operation on the four SPFP values in A and B.  */

static __inline __m128
_mm_add_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_addps ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128
_mm_sub_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_subps ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128
_mm_mul_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_mulps ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128
_mm_div_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_divps ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128
_mm_sqrt_ps (__m128 __A)
{
  return (__m128) __builtin_ia32_sqrtps ((__v4sf)__A);
}

static __inline __m128
_mm_rcp_ps (__m128 __A)
{
  return (__m128) __builtin_ia32_rcpps ((__v4sf)__A);
}

static __inline __m128
_mm_rsqrt_ps (__m128 __A)
{
  return (__m128) __builtin_ia32_rsqrtps ((__v4sf)__A);
}

static __inline __m128
_mm_min_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_minps ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128
_mm_max_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_maxps ((__v4sf)__A, (__v4sf)__B);
}
/* Perform logical bit-wise operations on 128-bit values.  */

static __inline __m128
_mm_and_ps (__m128 __A, __m128 __B)
{
  return __builtin_ia32_andps (__A, __B);
}

static __inline __m128
_mm_andnot_ps (__m128 __A, __m128 __B)
{
  return __builtin_ia32_andnps (__A, __B);
}

static __inline __m128
_mm_or_ps (__m128 __A, __m128 __B)
{
  return __builtin_ia32_orps (__A, __B);
}

static __inline __m128
_mm_xor_ps (__m128 __A, __m128 __B)
{
  return __builtin_ia32_xorps (__A, __B);
}
/* Perform a comparison on the lower SPFP values of A and B.  If the
   comparison is true, place a mask of all ones in the result, otherwise a
   mask of zeros.  The upper three SPFP values are passed through from A.  */

static __inline __m128
_mm_cmpeq_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpeqss ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128
_mm_cmplt_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpltss ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128
_mm_cmple_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpless ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128
_mm_cmpgt_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_movss ((__v4sf) __A,
					(__v4sf)
					__builtin_ia32_cmpltss ((__v4sf) __B,
								(__v4sf)
								__A));
}

static __inline __m128
_mm_cmpge_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_movss ((__v4sf) __A,
					(__v4sf)
					__builtin_ia32_cmpless ((__v4sf) __B,
								(__v4sf)
								__A));
}

static __inline __m128
_mm_cmpneq_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpneqss ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128
_mm_cmpnlt_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpnltss ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128
_mm_cmpnle_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpnless ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128
_mm_cmpngt_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_movss ((__v4sf) __A,
					(__v4sf)
					__builtin_ia32_cmpnltss ((__v4sf) __B,
								 (__v4sf)
								 __A));
}

static __inline __m128
_mm_cmpnge_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_movss ((__v4sf) __A,
					(__v4sf)
					__builtin_ia32_cmpnless ((__v4sf) __B,
								 (__v4sf)
								 __A));
}

static __inline __m128
_mm_cmpord_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpordss ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128
_mm_cmpunord_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpunordss ((__v4sf)__A, (__v4sf)__B);
}
/* Perform a comparison on the four SPFP values of A and B.  For each
   element, if the comparison is true, place a mask of all ones in the
   result, otherwise a mask of zeros.  */

static __inline __m128
_mm_cmpeq_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpeqps ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128
_mm_cmplt_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpltps ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128
_mm_cmple_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpleps ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128
_mm_cmpgt_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpgtps ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128
_mm_cmpge_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpgeps ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128
_mm_cmpneq_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpneqps ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128
_mm_cmpnlt_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpnltps ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128
_mm_cmpnle_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpnleps ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128
_mm_cmpngt_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpngtps ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128
_mm_cmpnge_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpngeps ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128
_mm_cmpord_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpordps ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128
_mm_cmpunord_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpunordps ((__v4sf)__A, (__v4sf)__B);
}
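
/* Usage sketch (illustrative only, not part of the GCC header): the packed
   comparisons above produce per-element bit masks, which combine with the
   logical operations to select between two vectors without branching.
   __select_min is a hypothetical helper used only for this example.  */
#if 0
static __inline __m128
__select_min (__m128 __a, __m128 __b)
{
  /* All ones in each lane where a < b, all zeros elsewhere.  */
  __m128 __mask = _mm_cmplt_ps (__a, __b);
  /* Take A where the mask is set, B where it is clear.  */
  return _mm_or_ps (_mm_and_ps (__mask, __a),
		    _mm_andnot_ps (__mask, __b));
}
#endif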
/* Compare the lower SPFP values of A and B and return 1 if true
   and 0 if false.  */

static __inline int
_mm_comieq_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_comieq ((__v4sf)__A, (__v4sf)__B);
}

static __inline int
_mm_comilt_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_comilt ((__v4sf)__A, (__v4sf)__B);
}

static __inline int
_mm_comile_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_comile ((__v4sf)__A, (__v4sf)__B);
}

static __inline int
_mm_comigt_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_comigt ((__v4sf)__A, (__v4sf)__B);
}

static __inline int
_mm_comige_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_comige ((__v4sf)__A, (__v4sf)__B);
}

static __inline int
_mm_comineq_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_comineq ((__v4sf)__A, (__v4sf)__B);
}

static __inline int
_mm_ucomieq_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_ucomieq ((__v4sf)__A, (__v4sf)__B);
}

static __inline int
_mm_ucomilt_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_ucomilt ((__v4sf)__A, (__v4sf)__B);
}

static __inline int
_mm_ucomile_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_ucomile ((__v4sf)__A, (__v4sf)__B);
}

static __inline int
_mm_ucomigt_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_ucomigt ((__v4sf)__A, (__v4sf)__B);
}

static __inline int
_mm_ucomige_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_ucomige ((__v4sf)__A, (__v4sf)__B);
}

static __inline int
_mm_ucomineq_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_ucomineq ((__v4sf)__A, (__v4sf)__B);
}
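
/* Usage sketch (illustrative only, not part of the GCC header): unlike the
   _mm_cmp*_ss intrinsics above, the comi/ucomi forms return a plain int, so
   they can drive ordinary control flow on the lowest elements.
   __lower_is_smaller is a hypothetical name.  */
#if 0
static __inline int
__lower_is_smaller (__m128 __a, __m128 __b)
{
  /* Nonzero exactly when the low element of A is less than that of B.  */
  return _mm_comilt_ss (__a, __b);
}
#endif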
/* Convert the lower SPFP value to a 32-bit integer according to the current
   rounding mode.  */
static __inline int
_mm_cvtss_si32 (__m128 __A)
{
  return __builtin_ia32_cvtss2si ((__v4sf) __A);
}

static __inline int
_mm_cvt_ss2si (__m128 __A)
{
  return _mm_cvtss_si32 (__A);
}

#ifdef __x86_64__
/* Convert the lower SPFP value to a 64-bit integer according to the current
   rounding mode.  */
static __inline long long
_mm_cvtss_si64x (__m128 __A)
{
  return __builtin_ia32_cvtss2si64 ((__v4sf) __A);
}
#endif

/* Convert the two lower SPFP values to 32-bit integers according to the
   current rounding mode.  Return the integers in packed form.  */
static __inline __m64
_mm_cvtps_pi32 (__m128 __A)
{
  return (__m64) __builtin_ia32_cvtps2pi ((__v4sf) __A);
}

static __inline __m64
_mm_cvt_ps2pi (__m128 __A)
{
  return _mm_cvtps_pi32 (__A);
}

/* Truncate the lower SPFP value to a 32-bit integer.  */
static __inline int
_mm_cvttss_si32 (__m128 __A)
{
  return __builtin_ia32_cvttss2si ((__v4sf) __A);
}

static __inline int
_mm_cvtt_ss2si (__m128 __A)
{
  return _mm_cvttss_si32 (__A);
}

#ifdef __x86_64__
/* Truncate the lower SPFP value to a 64-bit integer.  */
static __inline long long
_mm_cvttss_si64x (__m128 __A)
{
  return __builtin_ia32_cvttss2si64 ((__v4sf) __A);
}
#endif

/* Truncate the two lower SPFP values to 32-bit integers.  Return the
   integers in packed form.  */
static __inline __m64
_mm_cvttps_pi32 (__m128 __A)
{
  return (__m64) __builtin_ia32_cvttps2pi ((__v4sf) __A);
}

static __inline __m64
_mm_cvtt_ps2pi (__m128 __A)
{
  return _mm_cvttps_pi32 (__A);
}

/* Convert B to a SPFP value and insert it as element zero in A.  */
static __inline __m128
_mm_cvtsi32_ss (__m128 __A, int __B)
{
  return (__m128) __builtin_ia32_cvtsi2ss ((__v4sf) __A, __B);
}

static __inline __m128
_mm_cvt_si2ss (__m128 __A, int __B)
{
  return _mm_cvtsi32_ss (__A, __B);
}

#ifdef __x86_64__
/* Convert B to a SPFP value and insert it as element zero in A.  */
static __inline __m128
_mm_cvtsi64x_ss (__m128 __A, long long __B)
{
  return (__m128) __builtin_ia32_cvtsi642ss ((__v4sf) __A, __B);
}
#endif

/* Convert the two 32-bit values in B to SPFP form and insert them
   as the two lower elements in A.  */
static __inline __m128
_mm_cvtpi32_ps (__m128 __A, __m64 __B)
{
  return (__m128) __builtin_ia32_cvtpi2ps ((__v4sf) __A, (__v2si)__B);
}

static __inline __m128
_mm_cvt_pi2ps (__m128 __A, __m64 __B)
{
  return _mm_cvtpi32_ps (__A, __B);
}
/* Convert the four signed 16-bit values in A to SPFP form.  */
static __inline __m128
_mm_cvtpi16_ps (__m64 __A)
{
  __v4hi __sign;
  __v2si __hisi, __losi;
  __v4sf __r;

  /* This comparison against zero gives us a mask that can be used to
     fill in the missing sign bits in the unpack operations below, so
     that we get signed values after unpacking.  */
  __sign = __builtin_ia32_pcmpgtw ((__v4hi)0LL, (__v4hi)__A);

  /* Convert the four words to doublewords.  */
  __hisi = (__v2si) __builtin_ia32_punpckhwd ((__v4hi)__A, __sign);
  __losi = (__v2si) __builtin_ia32_punpcklwd ((__v4hi)__A, __sign);

  /* Convert the doublewords to floating point two at a time.  */
  __r = (__v4sf) _mm_setzero_ps ();
  __r = __builtin_ia32_cvtpi2ps (__r, __hisi);
  __r = __builtin_ia32_movlhps (__r, __r);
  __r = __builtin_ia32_cvtpi2ps (__r, __losi);

  return (__m128) __r;
}

/* Convert the four unsigned 16-bit values in A to SPFP form.  */
static __inline __m128
_mm_cvtpu16_ps (__m64 __A)
{
  __v2si __hisi, __losi;
  __v4sf __r;

  /* Convert the four words to doublewords.  */
  __hisi = (__v2si) __builtin_ia32_punpckhwd ((__v4hi)__A, (__v4hi)0LL);
  __losi = (__v2si) __builtin_ia32_punpcklwd ((__v4hi)__A, (__v4hi)0LL);

  /* Convert the doublewords to floating point two at a time.  */
  __r = (__v4sf) _mm_setzero_ps ();
  __r = __builtin_ia32_cvtpi2ps (__r, __hisi);
  __r = __builtin_ia32_movlhps (__r, __r);
  __r = __builtin_ia32_cvtpi2ps (__r, __losi);

  return (__m128) __r;
}

/* Convert the low four signed 8-bit values in A to SPFP form.  */
static __inline __m128
_mm_cvtpi8_ps (__m64 __A)
{
  __v8qi __sign;

  /* This comparison against zero gives us a mask that can be used to
     fill in the missing sign bits in the unpack operations below, so
     that we get signed values after unpacking.  */
  __sign = __builtin_ia32_pcmpgtb ((__v8qi)0LL, (__v8qi)__A);

  /* Convert the four low bytes to words.  */
  __A = (__m64) __builtin_ia32_punpcklbw ((__v8qi)__A, __sign);

  return _mm_cvtpi16_ps(__A);
}

/* Convert the low four unsigned 8-bit values in A to SPFP form.  */
static __inline __m128
_mm_cvtpu8_ps(__m64 __A)
{
  __A = (__m64) __builtin_ia32_punpcklbw ((__v8qi)__A, (__v8qi)0LL);
  return _mm_cvtpu16_ps(__A);
}

/* Convert the four signed 32-bit values in A and B to SPFP form.  */
static __inline __m128
_mm_cvtpi32x2_ps(__m64 __A, __m64 __B)
{
  __v4sf __zero = (__v4sf) _mm_setzero_ps ();
  __v4sf __sfa = __builtin_ia32_cvtpi2ps (__zero, (__v2si)__A);
  __v4sf __sfb = __builtin_ia32_cvtpi2ps (__zero, (__v2si)__B);
  return (__m128) __builtin_ia32_movlhps (__sfa, __sfb);
}

/* Convert the four SPFP values in A to four signed 16-bit integers.  */
static __inline __m64
_mm_cvtps_pi16(__m128 __A)
{
  __v4sf __hisf = (__v4sf)__A;
  __v4sf __losf = __builtin_ia32_movhlps (__hisf, __hisf);
  __v2si __hisi = __builtin_ia32_cvtps2pi (__hisf);
  __v2si __losi = __builtin_ia32_cvtps2pi (__losf);
  return (__m64) __builtin_ia32_packssdw (__hisi, __losi);
}

/* Convert the four SPFP values in A to four signed 8-bit integers.  */
static __inline __m64
_mm_cvtps_pi8(__m128 __A)
{
  __v4hi __tmp = (__v4hi) _mm_cvtps_pi16 (__A);
  return (__m64) __builtin_ia32_packsswb (__tmp, (__v4hi)0LL);
}
/* Selects four specific SPFP values from A and B based on MASK.  */
#if 0
static __inline __m128
_mm_shuffle_ps (__m128 __A, __m128 __B, int __mask)
{
  return (__m128) __builtin_ia32_shufps ((__v4sf)__A, (__v4sf)__B, __mask);
}
#else
#define _mm_shuffle_ps(A, B, MASK) \
 ((__m128) __builtin_ia32_shufps ((__v4sf)(A), (__v4sf)(B), (MASK)))
#endif
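
/* Usage sketch (illustrative only, not part of the GCC header): the
   _MM_SHUFFLE selector reads from right to left; the two low fields pick
   elements of the first operand and the two high fields pick elements of
   the second.  __example_shuffle is a hypothetical helper.  */
#if 0
static __inline __m128
__example_shuffle (__m128 __a, __m128 __b)
{
  /* Result lanes: { a[0], a[1], b[2], b[3] }.  */
  return _mm_shuffle_ps (__a, __b, _MM_SHUFFLE (3, 2, 1, 0));
}
#endif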
/* Selects and interleaves the upper two SPFP values from A and B.  */
static __inline __m128
_mm_unpackhi_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_unpckhps ((__v4sf)__A, (__v4sf)__B);
}

/* Selects and interleaves the lower two SPFP values from A and B.  */
static __inline __m128
_mm_unpacklo_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_unpcklps ((__v4sf)__A, (__v4sf)__B);
}

/* Sets the upper two SPFP values with 64-bits of data loaded from P;
   the lower two values are passed through from A.  */
static __inline __m128
_mm_loadh_pi (__m128 __A, __m64 const *__P)
{
  return (__m128) __builtin_ia32_loadhps ((__v4sf)__A, (__v2si *)__P);
}

/* Stores the upper two SPFP values of A into P.  */
static __inline void
_mm_storeh_pi (__m64 *__P, __m128 __A)
{
  __builtin_ia32_storehps ((__v2si *)__P, (__v4sf)__A);
}

/* Moves the upper two values of B into the lower two values of A.  */
static __inline __m128
_mm_movehl_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_movhlps ((__v4sf)__A, (__v4sf)__B);
}

/* Moves the lower two values of B into the upper two values of A.  */
static __inline __m128
_mm_movelh_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_movlhps ((__v4sf)__A, (__v4sf)__B);
}

/* Sets the lower two SPFP values with 64-bits of data loaded from P;
   the upper two values are passed through from A.  */
static __inline __m128
_mm_loadl_pi (__m128 __A, __m64 const *__P)
{
  return (__m128) __builtin_ia32_loadlps ((__v4sf)__A, (__v2si *)__P);
}

/* Stores the lower two SPFP values of A into P.  */
static __inline void
_mm_storel_pi (__m64 *__P, __m128 __A)
{
  __builtin_ia32_storelps ((__v2si *)__P, (__v4sf)__A);
}

/* Creates a 4-bit mask from the most significant bits of the SPFP values.  */
static __inline int
_mm_movemask_ps (__m128 __A)
{
  return __builtin_ia32_movmskps ((__v4sf)__A);
}
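
/* Usage sketch (illustrative only, not part of the GCC header): combining a
   packed comparison with _mm_movemask_ps gives a cheap whole-vector test.
   __all_nonnegative is a hypothetical helper; lanes holding NaN count as
   nonnegative here because the comparison is false for them.  */
#if 0
static __inline int
__all_nonnegative (__m128 __a)
{
  /* The mask has one bit per lane, taken from the lane's sign bit.  */
  return _mm_movemask_ps (_mm_cmplt_ps (__a, _mm_setzero_ps ())) == 0;
}
#endif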
/* Return the contents of the control register.  */
static __inline unsigned int
_mm_getcsr (void)
{
  return __builtin_ia32_stmxcsr ();
}

/* Read exception bits from the control register.  */
static __inline unsigned int
_MM_GET_EXCEPTION_STATE (void)
{
  return _mm_getcsr() & _MM_EXCEPT_MASK;
}

static __inline unsigned int
_MM_GET_EXCEPTION_MASK (void)
{
  return _mm_getcsr() & _MM_MASK_MASK;
}

static __inline unsigned int
_MM_GET_ROUNDING_MODE (void)
{
  return _mm_getcsr() & _MM_ROUND_MASK;
}

static __inline unsigned int
_MM_GET_FLUSH_ZERO_MODE (void)
{
  return _mm_getcsr() & _MM_FLUSH_ZERO_MASK;
}

/* Set the control register to I.  */
static __inline void
_mm_setcsr (unsigned int __I)
{
  __builtin_ia32_ldmxcsr (__I);
}

/* Set exception bits in the control register.  */
static __inline void
_MM_SET_EXCEPTION_STATE(unsigned int __mask)
{
  _mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | __mask);
}

static __inline void
_MM_SET_EXCEPTION_MASK (unsigned int __mask)
{
  _mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | __mask);
}

static __inline void
_MM_SET_ROUNDING_MODE (unsigned int __mode)
{
  _mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | __mode);
}

static __inline void
_MM_SET_FLUSH_ZERO_MODE (unsigned int __mode)
{
  _mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | __mode);
}
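
/* Usage sketch (illustrative only, not part of the GCC header): saving and
   restoring the rounding-mode field of the MXCSR around an operation.
   _mm_cvttss_si32 already truncates directly; the point here is only the
   save/restore pattern.  __convert_toward_zero is a hypothetical name.  */
#if 0
static __inline int
__convert_toward_zero (__m128 __a)
{
  unsigned int __saved = _MM_GET_ROUNDING_MODE ();
  int __r;

  _MM_SET_ROUNDING_MODE (_MM_ROUND_TOWARD_ZERO);
  __r = _mm_cvtss_si32 (__a);
  _MM_SET_ROUNDING_MODE (__saved);
  return __r;
}
#endif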
/* Create a vector with element 0 as F and the rest zero.  */
static __inline __m128
_mm_set_ss (float __F)
{
  return (__m128)(__v4sf){ __F, 0, 0, 0 };
}

/* Create a vector with all four elements equal to F.  */
static __inline __m128
_mm_set1_ps (float __F)
{
  return (__m128)(__v4sf){ __F, __F, __F, __F };
}

static __inline __m128
_mm_set_ps1 (float __F)
{
  return _mm_set1_ps (__F);
}

/* Create a vector with element 0 as *P and the rest zero.  */
static __inline __m128
_mm_load_ss (float const *__P)
{
  return _mm_set_ss (*__P);
}

/* Create a vector with all four elements equal to *P.  */
static __inline __m128
_mm_load1_ps (float const *__P)
{
  return _mm_set1_ps (*__P);
}

static __inline __m128
_mm_load_ps1 (float const *__P)
{
  return _mm_load1_ps (__P);
}

/* Load four SPFP values from P.  The address must be 16-byte aligned.  */
static __inline __m128
_mm_load_ps (float const *__P)
{
  return (__m128) *(__v4sf *)__P;
}

/* Load four SPFP values from P.  The address need not be 16-byte aligned.  */
static __inline __m128
_mm_loadu_ps (float const *__P)
{
  return (__m128) __builtin_ia32_loadups (__P);
}

/* Load four SPFP values in reverse order.  The address must be aligned.  */
static __inline __m128
_mm_loadr_ps (float const *__P)
{
  __v4sf __tmp = *(__v4sf *)__P;
  return (__m128) __builtin_ia32_shufps (__tmp, __tmp, _MM_SHUFFLE (0,1,2,3));
}

/* Create the vector [Z Y X W].  */
static __inline __m128
_mm_set_ps (const float __Z, const float __Y, const float __X, const float __W)
{
  return (__m128)(__v4sf){ __W, __X, __Y, __Z };
}

/* Create the vector [W X Y Z].  */
static __inline __m128
_mm_setr_ps (float __Z, float __Y, float __X, float __W)
{
  return (__m128)(__v4sf){ __Z, __Y, __X, __W };
}
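
/* Usage sketch (illustrative only, not part of the GCC header): _mm_set_ps
   takes its arguments from the highest element down to element 0, while
   _mm_setr_ps takes them in memory order, so the two vectors built below
   are identical.  __example_element_order is a hypothetical name.  */
#if 0
static __inline __m128
__example_element_order (void)
{
  __m128 __a = _mm_set_ps  (4.0f, 3.0f, 2.0f, 1.0f);	/* element 0 is 1.0f */
  __m128 __b = _mm_setr_ps (1.0f, 2.0f, 3.0f, 4.0f);	/* element 0 is 1.0f */

  /* Every lane compares equal, so this returns an all-ones mask.  */
  return _mm_cmpeq_ps (__a, __b);
}
#endif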
/* Stores the lower SPFP value.  */
static __inline void
_mm_store_ss (float *__P, __m128 __A)
{
  *__P = __builtin_ia32_vec_ext_v4sf ((__v4sf)__A, 0);
}

/* Store four SPFP values.  The address must be 16-byte aligned.  */
static __inline void
_mm_store_ps (float *__P, __m128 __A)
{
  *(__v4sf *)__P = (__v4sf)__A;
}

/* Store four SPFP values.  The address need not be 16-byte aligned.  */
static __inline void
_mm_storeu_ps (float *__P, __m128 __A)
{
  __builtin_ia32_storeups (__P, (__v4sf)__A);
}

/* Store the lower SPFP value across four words.  */
static __inline void
_mm_store1_ps (float *__P, __m128 __A)
{
  __v4sf __va = (__v4sf)__A;
  __v4sf __tmp = __builtin_ia32_shufps (__va, __va, _MM_SHUFFLE (0,0,0,0));
  _mm_storeu_ps (__P, __tmp);
}

static __inline void
_mm_store_ps1 (float *__P, __m128 __A)
{
  _mm_store1_ps (__P, __A);
}

/* Store four SPFP values in reverse order.  The address must be aligned.  */
static __inline void
_mm_storer_ps (float *__P, __m128 __A)
{
  __v4sf __va = (__v4sf)__A;
  __v4sf __tmp = __builtin_ia32_shufps (__va, __va, _MM_SHUFFLE (0,1,2,3));
  _mm_store_ps (__P, __tmp);
}

/* Sets the low SPFP value of A from the low value of B.  */
static __inline __m128
_mm_move_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_movss ((__v4sf)__A, (__v4sf)__B);
}
/* Extracts one of the four words of A.  The selector N must be immediate.  */
#if 0
static __inline int __attribute__((__always_inline__))
_mm_extract_pi16 (__m64 const __A, int const __N)
{
  return __builtin_ia32_vec_ext_v4hi ((__v4hi)__A, __N);
}

static __inline int __attribute__((__always_inline__))
_m_pextrw (__m64 const __A, int const __N)
{
  return _mm_extract_pi16 (__A, __N);
}
#else
#define _mm_extract_pi16(A, N) __builtin_ia32_vec_ext_v4hi ((__v4hi)(A), (N))
#define _m_pextrw(A, N) _mm_extract_pi16((A), (N))
#endif

/* Inserts word D into one of four words of A.  The selector N must be
   immediate.  */
#if 0
static __inline __m64 __attribute__((__always_inline__))
_mm_insert_pi16 (__m64 const __A, int const __D, int const __N)
{
  return (__m64) __builtin_ia32_vec_set_v4hi ((__v4hi)__A, __D, __N);
}

static __inline __m64 __attribute__((__always_inline__))
_m_pinsrw (__m64 const __A, int const __D, int const __N)
{
  return _mm_insert_pi16 (__A, __D, __N);
}
#else
#define _mm_insert_pi16(A, D, N) \
  ((__m64) __builtin_ia32_vec_set_v4hi ((__v4hi)(A), (D), (N)))
#define _m_pinsrw(A, D, N) _mm_insert_pi16((A), (D), (N))
#endif
/* Compute the element-wise maximum of signed 16-bit values.  */
static __inline __m64
_mm_max_pi16 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_ia32_pmaxsw ((__v4hi)__A, (__v4hi)__B);
}

static __inline __m64
_m_pmaxsw (__m64 __A, __m64 __B)
{
  return _mm_max_pi16 (__A, __B);
}

/* Compute the element-wise maximum of unsigned 8-bit values.  */
static __inline __m64
_mm_max_pu8 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_ia32_pmaxub ((__v8qi)__A, (__v8qi)__B);
}

static __inline __m64
_m_pmaxub (__m64 __A, __m64 __B)
{
  return _mm_max_pu8 (__A, __B);
}

/* Compute the element-wise minimum of signed 16-bit values.  */
static __inline __m64
_mm_min_pi16 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_ia32_pminsw ((__v4hi)__A, (__v4hi)__B);
}

static __inline __m64
_m_pminsw (__m64 __A, __m64 __B)
{
  return _mm_min_pi16 (__A, __B);
}

/* Compute the element-wise minimum of unsigned 8-bit values.  */
static __inline __m64
_mm_min_pu8 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_ia32_pminub ((__v8qi)__A, (__v8qi)__B);
}

static __inline __m64
_m_pminub (__m64 __A, __m64 __B)
{
  return _mm_min_pu8 (__A, __B);
}

/* Create an 8-bit mask of the signs of 8-bit values.  */
static __inline int
_mm_movemask_pi8 (__m64 __A)
{
  return __builtin_ia32_pmovmskb ((__v8qi)__A);
}

static __inline int
_m_pmovmskb (__m64 __A)
{
  return _mm_movemask_pi8 (__A);
}

/* Multiply four unsigned 16-bit values in A by four unsigned 16-bit values
   in B and produce the high 16 bits of the 32-bit results.  */
static __inline __m64
_mm_mulhi_pu16 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_ia32_pmulhuw ((__v4hi)__A, (__v4hi)__B);
}

static __inline __m64
_m_pmulhuw (__m64 __A, __m64 __B)
{
  return _mm_mulhi_pu16 (__A, __B);
}
/* Return a combination of the four 16-bit values in A.  The selector
   must be an immediate.  */
#if 0
static __inline __m64
_mm_shuffle_pi16 (__m64 __A, int __N)
{
  return (__m64) __builtin_ia32_pshufw ((__v4hi)__A, __N);
}

static __inline __m64
_m_pshufw (__m64 __A, int __N)
{
  return _mm_shuffle_pi16 (__A, __N);
}
#else
#define _mm_shuffle_pi16(A, N) \
  ((__m64) __builtin_ia32_pshufw ((__v4hi)(A), (N)))
#define _m_pshufw(A, N) _mm_shuffle_pi16 ((A), (N))
#endif

/* Conditionally store byte elements of A into P.  The high bit of each
   byte in the selector N determines whether the corresponding byte from
   A is stored.  */
static __inline void
_mm_maskmove_si64 (__m64 __A, __m64 __N, char *__P)
{
  __builtin_ia32_maskmovq ((__v8qi)__A, (__v8qi)__N, __P);
}

static __inline void
_m_maskmovq (__m64 __A, __m64 __N, char *__P)
{
  _mm_maskmove_si64 (__A, __N, __P);
}
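
/* Usage sketch (illustrative only, not part of the GCC header):
   _mm_maskmove_si64 writes only the bytes of A whose selector byte has its
   high bit set, leaving the other destination bytes untouched.  The helper
   name and the use of _mm_setr_pi8 from <mmintrin.h> are assumptions made
   for this example.  */
#if 0
static __inline void
__store_low_four_bytes (__m64 __a, char *__p)
{
  /* High bit set in the four low selector bytes only.  */
  __m64 __mask = _mm_setr_pi8 (-1, -1, -1, -1, 0, 0, 0, 0);
  _mm_maskmove_si64 (__a, __mask, __p);
}
#endif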
/* Compute the rounded averages of the unsigned 8-bit values in A and B.  */
static __inline __m64
_mm_avg_pu8 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_ia32_pavgb ((__v8qi)__A, (__v8qi)__B);
}

static __inline __m64
_m_pavgb (__m64 __A, __m64 __B)
{
  return _mm_avg_pu8 (__A, __B);
}

/* Compute the rounded averages of the unsigned 16-bit values in A and B.  */
static __inline __m64
_mm_avg_pu16 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_ia32_pavgw ((__v4hi)__A, (__v4hi)__B);
}

static __inline __m64
_m_pavgw (__m64 __A, __m64 __B)
{
  return _mm_avg_pu16 (__A, __B);
}

/* Compute the sum of the absolute differences of the unsigned 8-bit
   values in A and B.  Return the value in the lower 16-bit word; the
   upper words are cleared.  */
static __inline __m64
_mm_sad_pu8 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_ia32_psadbw ((__v8qi)__A, (__v8qi)__B);
}

static __inline __m64
_m_psadbw (__m64 __A, __m64 __B)
{
  return _mm_sad_pu8 (__A, __B);
}
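
/* Usage sketch (illustrative only, not part of the GCC header): _mm_sad_pu8
   reduces eight byte differences to a single 16-bit sum, the building block
   of block-matching metrics.  __sad8 is a hypothetical helper, and the use
   of _mm_cvtsi64_si32 from <mmintrin.h> is an assumption here.  */
#if 0
static __inline int
__sad8 (__m64 __a, __m64 __b)
{
  /* The sum is in the low 16 bits; the upper bits of the result are zero.  */
  return _mm_cvtsi64_si32 (_mm_sad_pu8 (__a, __b));
}
#endif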
/* Loads one cache line from address P to a location "closer" to the
   processor.  The selector I specifies the type of prefetch operation.  */
#if 0
static __inline void
_mm_prefetch (void *__P, enum _mm_hint __I)
{
  __builtin_prefetch (__P, 0, __I);
}
#else
#define _mm_prefetch(P, I) \
  __builtin_prefetch ((P), 0, (I))
#endif
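
/* Usage sketch (illustrative only, not part of the GCC header): prefetching
   data a fixed distance ahead of the current pointer with the non-temporal
   hint; the helper name and the 256-byte distance are arbitrary choices for
   this example.  */
#if 0
static __inline void
__prefetch_ahead (const char *__p)
{
  _mm_prefetch (__p + 256, _MM_HINT_NTA);
}
#endif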
/* Stores the data in A to the address P without polluting the caches.  */
static __inline void
_mm_stream_pi (__m64 *__P, __m64 __A)
{
  __builtin_ia32_movntq ((unsigned long long *)__P, (unsigned long long)__A);
}

/* Likewise.  The address must be 16-byte aligned.  */
static __inline void
_mm_stream_ps (float *__P, __m128 __A)
{
  __builtin_ia32_movntps (__P, (__v4sf)__A);
}

/* Guarantees that every preceding store is globally visible before
   any subsequent store.  */
static __inline void
_mm_sfence (void)
{
  __builtin_ia32_sfence ();
}

/* The execution of the next instruction is delayed by an implementation
   specific amount of time.  The instruction does not modify the
   architectural state.  */
static __inline void
_mm_pause (void)
{
  __asm__ __volatile__ ("rep; nop" : : );
}
/* Transpose the 4x4 matrix composed of row[0-3].  */
#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3)			\
do {									\
  __v4sf __r0 = (row0), __r1 = (row1), __r2 = (row2), __r3 = (row3);	\
  __v4sf __t0 = __builtin_ia32_shufps (__r0, __r1, 0x44);		\
  __v4sf __t2 = __builtin_ia32_shufps (__r0, __r1, 0xEE);		\
  __v4sf __t1 = __builtin_ia32_shufps (__r2, __r3, 0x44);		\
  __v4sf __t3 = __builtin_ia32_shufps (__r2, __r3, 0xEE);		\
  (row0) = __builtin_ia32_shufps (__t0, __t1, 0x88);			\
  (row1) = __builtin_ia32_shufps (__t0, __t1, 0xDD);			\
  (row2) = __builtin_ia32_shufps (__t2, __t3, 0x88);			\
  (row3) = __builtin_ia32_shufps (__t2, __t3, 0xDD);			\
} while (0)
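
/* Usage sketch (illustrative only, not part of the GCC header):
   _MM_TRANSPOSE4_PS rewrites its four row arguments in place, so they must
   be modifiable lvalues.  __transpose_example is a hypothetical helper that
   transposes a row-major 4x4 matrix stored at a 16-byte aligned address.  */
#if 0
static __inline void
__transpose_example (float *__mat)
{
  __m128 __row0 = _mm_load_ps (__mat + 0);
  __m128 __row1 = _mm_load_ps (__mat + 4);
  __m128 __row2 = _mm_load_ps (__mat + 8);
  __m128 __row3 = _mm_load_ps (__mat + 12);

  _MM_TRANSPOSE4_PS (__row0, __row1, __row2, __row3);

  _mm_store_ps (__mat + 0, __row0);
  _mm_store_ps (__mat + 4, __row1);
  _mm_store_ps (__mat + 8, __row2);
  _mm_store_ps (__mat + 12, __row3);
}
#endif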
/* For backward source compatibility.  */
#include <emmintrin.h>

#endif /* __SSE__ */
#endif /* _XMMINTRIN_H_INCLUDED */