/* Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with GCC; see the file COPYING.  If not, write to
   the Free Software Foundation, 59 Temple Place - Suite 330,
   Boston, MA 02111-1307, USA.  */

/* As a special exception, if you include this header file into source
   files compiled by GCC, this header file does not by itself cause
   the resulting executable to be covered by the GNU General Public
   License.  This exception does not however invalidate any other
   reasons why the executable file might be covered by the GNU General
   Public License.  */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 8.0.  */

#ifndef _XMMINTRIN_H_INCLUDED
#define _XMMINTRIN_H_INCLUDED

#ifndef __SSE__
# error "SSE instruction set not enabled"
#else

/* We need type definitions from the MMX header file.  */
#include <mmintrin.h>

/* Get _mm_malloc () and _mm_free ().  */
#include <mm_malloc.h>
/* The data type intended for user use.  */
typedef float __m128 __attribute__ ((__vector_size__ (16)));

/* Internal data types for implementing the intrinsics.  */
typedef float __v4sf __attribute__ ((__vector_size__ (16)));

/* Create a selector for use with the SHUFPS instruction.  */
#define _MM_SHUFFLE(fp3,fp2,fp1,fp0) \
 (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | (fp0))

/* Constants for use with _mm_prefetch.  */
enum _mm_hint
{
  _MM_HINT_T0 = 3,
  _MM_HINT_T1 = 2,
  _MM_HINT_T2 = 1,
  _MM_HINT_NTA = 0
};
/* Bits in the MXCSR.  */
#define _MM_EXCEPT_MASK       0x003f
#define _MM_EXCEPT_INVALID    0x0001
#define _MM_EXCEPT_DENORM     0x0002
#define _MM_EXCEPT_DIV_ZERO   0x0004
#define _MM_EXCEPT_OVERFLOW   0x0008
#define _MM_EXCEPT_UNDERFLOW  0x0010
#define _MM_EXCEPT_INEXACT    0x0020

#define _MM_MASK_MASK         0x1f80
#define _MM_MASK_INVALID      0x0080
#define _MM_MASK_DENORM       0x0100
#define _MM_MASK_DIV_ZERO     0x0200
#define _MM_MASK_OVERFLOW     0x0400
#define _MM_MASK_UNDERFLOW    0x0800
#define _MM_MASK_INEXACT      0x1000

#define _MM_ROUND_MASK        0x6000
#define _MM_ROUND_NEAREST     0x0000
#define _MM_ROUND_DOWN        0x2000
#define _MM_ROUND_UP          0x4000
#define _MM_ROUND_TOWARD_ZERO 0x6000

#define _MM_FLUSH_ZERO_MASK   0x8000
#define _MM_FLUSH_ZERO_ON     0x8000
#define _MM_FLUSH_ZERO_OFF    0x0000

/* Create a vector of zeros.  */
static __inline __m128
_mm_setzero_ps (void)
{
  return (__m128){ 0.0f, 0.0f, 0.0f, 0.0f };
}
/* Perform the respective operation on the lower SPFP (single-precision
   floating-point) values of A and B; the upper three SPFP values are
   passed through from A.  */

static __inline __m128
_mm_add_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_addss ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128
_mm_sub_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_subss ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128
_mm_mul_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_mulss ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128
_mm_div_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_divss ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128
_mm_sqrt_ss (__m128 __A)
{
  return (__m128) __builtin_ia32_sqrtss ((__v4sf)__A);
}

static __inline __m128
_mm_rcp_ss (__m128 __A)
{
  return (__m128) __builtin_ia32_rcpss ((__v4sf)__A);
}

static __inline __m128
_mm_rsqrt_ss (__m128 __A)
{
  return (__m128) __builtin_ia32_rsqrtss ((__v4sf)__A);
}

static __inline __m128
_mm_min_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_minss ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128
_mm_max_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_maxss ((__v4sf)__A, (__v4sf)__B);
}

/* Perform the respective operation on the four SPFP values in A and B.  */

static __inline __m128
_mm_add_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_addps ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128
_mm_sub_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_subps ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128
_mm_mul_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_mulps ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128
_mm_div_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_divps ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128
_mm_sqrt_ps (__m128 __A)
{
  return (__m128) __builtin_ia32_sqrtps ((__v4sf)__A);
}

static __inline __m128
_mm_rcp_ps (__m128 __A)
{
  return (__m128) __builtin_ia32_rcpps ((__v4sf)__A);
}

static __inline __m128
_mm_rsqrt_ps (__m128 __A)
{
  return (__m128) __builtin_ia32_rsqrtps ((__v4sf)__A);
}

static __inline __m128
_mm_min_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_minps ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128
_mm_max_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_maxps ((__v4sf)__A, (__v4sf)__B);
}
/* Perform logical bit-wise operations on 128-bit values.  */

static __inline __m128
_mm_and_ps (__m128 __A, __m128 __B)
{
  return __builtin_ia32_andps (__A, __B);
}

static __inline __m128
_mm_andnot_ps (__m128 __A, __m128 __B)
{
  return __builtin_ia32_andnps (__A, __B);
}

static __inline __m128
_mm_or_ps (__m128 __A, __m128 __B)
{
  return __builtin_ia32_orps (__A, __B);
}

static __inline __m128
_mm_xor_ps (__m128 __A, __m128 __B)
{
  return __builtin_ia32_xorps (__A, __B);
}
/* Perform a comparison on the lower SPFP values of A and B.  If the
   comparison is true, place a mask of all ones in the result, otherwise a
   mask of zeros.  The upper three SPFP values are passed through from A.  */

static __inline __m128
_mm_cmpeq_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpeqss ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128
_mm_cmplt_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpltss ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128
_mm_cmple_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpless ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128
_mm_cmpgt_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_movss ((__v4sf) __A,
					(__v4sf)
					__builtin_ia32_cmpltss ((__v4sf) __B,
								(__v4sf)
								__A));
}

static __inline __m128
_mm_cmpge_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_movss ((__v4sf) __A,
					(__v4sf)
					__builtin_ia32_cmpless ((__v4sf) __B,
								(__v4sf)
								__A));
}

static __inline __m128
_mm_cmpneq_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpneqss ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128
_mm_cmpnlt_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpnltss ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128
_mm_cmpnle_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpnless ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128
_mm_cmpngt_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_movss ((__v4sf) __A,
					(__v4sf)
					__builtin_ia32_cmpnltss ((__v4sf) __B,
								 (__v4sf)
								 __A));
}

static __inline __m128
_mm_cmpnge_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_movss ((__v4sf) __A,
					(__v4sf)
					__builtin_ia32_cmpnless ((__v4sf) __B,
								 (__v4sf)
								 __A));
}

static __inline __m128
_mm_cmpord_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpordss ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128
_mm_cmpunord_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpunordss ((__v4sf)__A, (__v4sf)__B);
}
/* Perform a comparison on the four SPFP values of A and B.  For each
   element, if the comparison is true, place a mask of all ones in the
   result, otherwise a mask of zeros.  */

static __inline __m128
_mm_cmpeq_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpeqps ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128
_mm_cmplt_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpltps ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128
_mm_cmple_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpleps ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128
_mm_cmpgt_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpgtps ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128
_mm_cmpge_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpgeps ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128
_mm_cmpneq_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpneqps ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128
_mm_cmpnlt_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpnltps ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128
_mm_cmpnle_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpnleps ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128
_mm_cmpngt_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpngtps ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128
_mm_cmpnge_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpngeps ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128
_mm_cmpord_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpordps ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128
_mm_cmpunord_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpunordps ((__v4sf)__A, (__v4sf)__B);
}
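
/* Usage sketch (illustrative only, not part of the GCC header): the packed
   comparisons above produce per-element bit masks, which combine with the
   logical operations to select between two vectors without branching.
   __select_min is a hypothetical helper used only for this example.  */
#if 0
static __inline __m128
__select_min (__m128 __a, __m128 __b)
{
  /* All ones in each lane where a < b, all zeros elsewhere.  */
  __m128 __mask = _mm_cmplt_ps (__a, __b);
  /* Take A where the mask is set, B where it is clear.  */
  return _mm_or_ps (_mm_and_ps (__mask, __a),
		    _mm_andnot_ps (__mask, __b));
}
#endif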
/* Compare the lower SPFP values of A and B and return 1 if true
   and 0 if false.  */

static __inline int
_mm_comieq_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_comieq ((__v4sf)__A, (__v4sf)__B);
}

static __inline int
_mm_comilt_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_comilt ((__v4sf)__A, (__v4sf)__B);
}

static __inline int
_mm_comile_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_comile ((__v4sf)__A, (__v4sf)__B);
}

static __inline int
_mm_comigt_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_comigt ((__v4sf)__A, (__v4sf)__B);
}

static __inline int
_mm_comige_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_comige ((__v4sf)__A, (__v4sf)__B);
}

static __inline int
_mm_comineq_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_comineq ((__v4sf)__A, (__v4sf)__B);
}

static __inline int
_mm_ucomieq_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_ucomieq ((__v4sf)__A, (__v4sf)__B);
}

static __inline int
_mm_ucomilt_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_ucomilt ((__v4sf)__A, (__v4sf)__B);
}

static __inline int
_mm_ucomile_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_ucomile ((__v4sf)__A, (__v4sf)__B);
}

static __inline int
_mm_ucomigt_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_ucomigt ((__v4sf)__A, (__v4sf)__B);
}

static __inline int
_mm_ucomige_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_ucomige ((__v4sf)__A, (__v4sf)__B);
}

static __inline int
_mm_ucomineq_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_ucomineq ((__v4sf)__A, (__v4sf)__B);
}
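
/* Usage sketch (illustrative only, not part of the GCC header): unlike the
   _mm_cmp*_ss intrinsics above, the comi/ucomi forms return a plain int, so
   they can drive ordinary control flow on the lowest elements.
   __lower_is_smaller is a hypothetical name.  */
#if 0
static __inline int
__lower_is_smaller (__m128 __a, __m128 __b)
{
  /* Nonzero exactly when the low element of A is less than that of B.  */
  return _mm_comilt_ss (__a, __b);
}
#endif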
/* Convert the lower SPFP value to a 32-bit integer according to the current
   rounding mode.  */
static __inline int
_mm_cvtss_si32 (__m128 __A)
{
  return __builtin_ia32_cvtss2si ((__v4sf) __A);
}

static __inline int
_mm_cvt_ss2si (__m128 __A)
{
  return _mm_cvtss_si32 (__A);
}

#ifdef __x86_64__
/* Convert the lower SPFP value to a 64-bit integer according to the current
   rounding mode.  */
static __inline long long
_mm_cvtss_si64x (__m128 __A)
{
  return __builtin_ia32_cvtss2si64 ((__v4sf) __A);
}
#endif

/* Convert the two lower SPFP values to 32-bit integers according to the
   current rounding mode.  Return the integers in packed form.  */
static __inline __m64
_mm_cvtps_pi32 (__m128 __A)
{
  return (__m64) __builtin_ia32_cvtps2pi ((__v4sf) __A);
}

static __inline __m64
_mm_cvt_ps2pi (__m128 __A)
{
  return _mm_cvtps_pi32 (__A);
}

/* Truncate the lower SPFP value to a 32-bit integer.  */
static __inline int
_mm_cvttss_si32 (__m128 __A)
{
  return __builtin_ia32_cvttss2si ((__v4sf) __A);
}

static __inline int
_mm_cvtt_ss2si (__m128 __A)
{
  return _mm_cvttss_si32 (__A);
}

#ifdef __x86_64__
/* Truncate the lower SPFP value to a 64-bit integer.  */
static __inline long long
_mm_cvttss_si64x (__m128 __A)
{
  return __builtin_ia32_cvttss2si64 ((__v4sf) __A);
}
#endif

/* Truncate the two lower SPFP values to 32-bit integers.  Return the
   integers in packed form.  */
static __inline __m64
_mm_cvttps_pi32 (__m128 __A)
{
  return (__m64) __builtin_ia32_cvttps2pi ((__v4sf) __A);
}

static __inline __m64
_mm_cvtt_ps2pi (__m128 __A)
{
  return _mm_cvttps_pi32 (__A);
}

/* Convert B to a SPFP value and insert it as element zero in A.  */
static __inline __m128
_mm_cvtsi32_ss (__m128 __A, int __B)
{
  return (__m128) __builtin_ia32_cvtsi2ss ((__v4sf) __A, __B);
}

static __inline __m128
_mm_cvt_si2ss (__m128 __A, int __B)
{
  return _mm_cvtsi32_ss (__A, __B);
}

#ifdef __x86_64__
/* Convert B to a SPFP value and insert it as element zero in A.  */
static __inline __m128
_mm_cvtsi64x_ss (__m128 __A, long long __B)
{
  return (__m128) __builtin_ia32_cvtsi642ss ((__v4sf) __A, __B);
}
#endif

/* Convert the two 32-bit values in B to SPFP form and insert them
   as the two lower elements in A.  */
static __inline __m128
_mm_cvtpi32_ps (__m128 __A, __m64 __B)
{
  return (__m128) __builtin_ia32_cvtpi2ps ((__v4sf) __A, (__v2si)__B);
}

static __inline __m128
_mm_cvt_pi2ps (__m128 __A, __m64 __B)
{
  return _mm_cvtpi32_ps (__A, __B);
}
/* Convert the four signed 16-bit values in A to SPFP form.  */
static __inline __m128
_mm_cvtpi16_ps (__m64 __A)
{
  __v4hi __sign;
  __v2si __hisi, __losi;
  __v4sf __r;

  /* This comparison against zero gives us a mask that can be used to
     fill in the missing sign bits in the unpack operations below, so
     that we get signed values after unpacking.  */
  __sign = __builtin_ia32_pcmpgtw ((__v4hi)0LL, (__v4hi)__A);

  /* Convert the four words to doublewords.  */
  __hisi = (__v2si) __builtin_ia32_punpckhwd ((__v4hi)__A, __sign);
  __losi = (__v2si) __builtin_ia32_punpcklwd ((__v4hi)__A, __sign);

  /* Convert the doublewords to floating point two at a time.  */
  __r = (__v4sf) _mm_setzero_ps ();
  __r = __builtin_ia32_cvtpi2ps (__r, __hisi);
  __r = __builtin_ia32_movlhps (__r, __r);
  __r = __builtin_ia32_cvtpi2ps (__r, __losi);

  return (__m128) __r;
}

/* Convert the four unsigned 16-bit values in A to SPFP form.  */
static __inline __m128
_mm_cvtpu16_ps (__m64 __A)
{
  __v2si __hisi, __losi;
  __v4sf __r;

  /* Convert the four words to doublewords.  */
  __hisi = (__v2si) __builtin_ia32_punpckhwd ((__v4hi)__A, (__v4hi)0LL);
  __losi = (__v2si) __builtin_ia32_punpcklwd ((__v4hi)__A, (__v4hi)0LL);

  /* Convert the doublewords to floating point two at a time.  */
  __r = (__v4sf) _mm_setzero_ps ();
  __r = __builtin_ia32_cvtpi2ps (__r, __hisi);
  __r = __builtin_ia32_movlhps (__r, __r);
  __r = __builtin_ia32_cvtpi2ps (__r, __losi);

  return (__m128) __r;
}

/* Convert the low four signed 8-bit values in A to SPFP form.  */
static __inline __m128
_mm_cvtpi8_ps (__m64 __A)
{
  __v8qi __sign;

  /* This comparison against zero gives us a mask that can be used to
     fill in the missing sign bits in the unpack operations below, so
     that we get signed values after unpacking.  */
  __sign = __builtin_ia32_pcmpgtb ((__v8qi)0LL, (__v8qi)__A);

  /* Convert the four low bytes to words.  */
  __A = (__m64) __builtin_ia32_punpcklbw ((__v8qi)__A, __sign);

  return _mm_cvtpi16_ps(__A);
}

/* Convert the low four unsigned 8-bit values in A to SPFP form.  */
static __inline __m128
_mm_cvtpu8_ps(__m64 __A)
{
  __A = (__m64) __builtin_ia32_punpcklbw ((__v8qi)__A, (__v8qi)0LL);
  return _mm_cvtpu16_ps(__A);
}

/* Convert the four signed 32-bit values in A and B to SPFP form.  */
static __inline __m128
_mm_cvtpi32x2_ps(__m64 __A, __m64 __B)
{
  __v4sf __zero = (__v4sf) _mm_setzero_ps ();
  __v4sf __sfa = __builtin_ia32_cvtpi2ps (__zero, (__v2si)__A);
  __v4sf __sfb = __builtin_ia32_cvtpi2ps (__zero, (__v2si)__B);
  return (__m128) __builtin_ia32_movlhps (__sfa, __sfb);
}

/* Convert the four SPFP values in A to four signed 16-bit integers.  */
static __inline __m64
_mm_cvtps_pi16(__m128 __A)
{
  __v4sf __hisf = (__v4sf)__A;
  __v4sf __losf = __builtin_ia32_movhlps (__hisf, __hisf);
  __v2si __hisi = __builtin_ia32_cvtps2pi (__hisf);
  __v2si __losi = __builtin_ia32_cvtps2pi (__losf);
  return (__m64) __builtin_ia32_packssdw (__hisi, __losi);
}

/* Convert the four SPFP values in A to four signed 8-bit integers.  */
static __inline __m64
_mm_cvtps_pi8(__m128 __A)
{
  __v4hi __tmp = (__v4hi) _mm_cvtps_pi16 (__A);
  return (__m64) __builtin_ia32_packsswb (__tmp, (__v4hi)0LL);
}
/* Selects four specific SPFP values from A and B based on MASK.  */
#if 0
static __inline __m128
_mm_shuffle_ps (__m128 __A, __m128 __B, int __mask)
{
  return (__m128) __builtin_ia32_shufps ((__v4sf)__A, (__v4sf)__B, __mask);
}
#else
#define _mm_shuffle_ps(A, B, MASK) \
 ((__m128) __builtin_ia32_shufps ((__v4sf)(A), (__v4sf)(B), (MASK)))
#endif
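
/* Usage sketch (illustrative only, not part of the GCC header): the
   _MM_SHUFFLE selector reads from right to left; the two low fields pick
   elements of the first operand and the two high fields pick elements of
   the second.  __example_shuffle is a hypothetical helper.  */
#if 0
static __inline __m128
__example_shuffle (__m128 __a, __m128 __b)
{
  /* Result lanes: { a[0], a[1], b[2], b[3] }.  */
  return _mm_shuffle_ps (__a, __b, _MM_SHUFFLE (3, 2, 1, 0));
}
#endif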
/* Selects and interleaves the upper two SPFP values from A and B.  */
static __inline __m128
_mm_unpackhi_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_unpckhps ((__v4sf)__A, (__v4sf)__B);
}

/* Selects and interleaves the lower two SPFP values from A and B.  */
static __inline __m128
_mm_unpacklo_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_unpcklps ((__v4sf)__A, (__v4sf)__B);
}

/* Sets the upper two SPFP values with 64-bits of data loaded from P;
   the lower two values are passed through from A.  */
static __inline __m128
_mm_loadh_pi (__m128 __A, __m64 const *__P)
{
  return (__m128) __builtin_ia32_loadhps ((__v4sf)__A, (__v2si *)__P);
}

/* Stores the upper two SPFP values of A into P.  */
static __inline void
_mm_storeh_pi (__m64 *__P, __m128 __A)
{
  __builtin_ia32_storehps ((__v2si *)__P, (__v4sf)__A);
}

/* Moves the upper two values of B into the lower two values of A.  */
static __inline __m128
_mm_movehl_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_movhlps ((__v4sf)__A, (__v4sf)__B);
}

/* Moves the lower two values of B into the upper two values of A.  */
static __inline __m128
_mm_movelh_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_movlhps ((__v4sf)__A, (__v4sf)__B);
}

/* Sets the lower two SPFP values with 64-bits of data loaded from P;
   the upper two values are passed through from A.  */
static __inline __m128
_mm_loadl_pi (__m128 __A, __m64 const *__P)
{
  return (__m128) __builtin_ia32_loadlps ((__v4sf)__A, (__v2si *)__P);
}

/* Stores the lower two SPFP values of A into P.  */
static __inline void
_mm_storel_pi (__m64 *__P, __m128 __A)
{
  __builtin_ia32_storelps ((__v2si *)__P, (__v4sf)__A);
}

/* Creates a 4-bit mask from the most significant bits of the SPFP values.  */
static __inline int
_mm_movemask_ps (__m128 __A)
{
  return __builtin_ia32_movmskps ((__v4sf)__A);
}
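
/* Usage sketch (illustrative only, not part of the GCC header): combining a
   packed comparison with _mm_movemask_ps gives a cheap whole-vector test.
   __all_nonnegative is a hypothetical helper; lanes holding NaN count as
   nonnegative here because the comparison is false for them.  */
#if 0
static __inline int
__all_nonnegative (__m128 __a)
{
  /* The mask has one bit per lane, taken from the lane's sign bit.  */
  return _mm_movemask_ps (_mm_cmplt_ps (__a, _mm_setzero_ps ())) == 0;
}
#endif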
/* Return the contents of the control register.  */
static __inline unsigned int
_mm_getcsr (void)
{
  return __builtin_ia32_stmxcsr ();
}

/* Read exception bits from the control register.  */
static __inline unsigned int
_MM_GET_EXCEPTION_STATE (void)
{
  return _mm_getcsr() & _MM_EXCEPT_MASK;
}

static __inline unsigned int
_MM_GET_EXCEPTION_MASK (void)
{
  return _mm_getcsr() & _MM_MASK_MASK;
}

static __inline unsigned int
_MM_GET_ROUNDING_MODE (void)
{
  return _mm_getcsr() & _MM_ROUND_MASK;
}

static __inline unsigned int
_MM_GET_FLUSH_ZERO_MODE (void)
{
  return _mm_getcsr() & _MM_FLUSH_ZERO_MASK;
}

/* Set the control register to I.  */
static __inline void
_mm_setcsr (unsigned int __I)
{
  __builtin_ia32_ldmxcsr (__I);
}

/* Set exception bits in the control register.  */
static __inline void
_MM_SET_EXCEPTION_STATE(unsigned int __mask)
{
  _mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | __mask);
}

static __inline void
_MM_SET_EXCEPTION_MASK (unsigned int __mask)
{
  _mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | __mask);
}

static __inline void
_MM_SET_ROUNDING_MODE (unsigned int __mode)
{
  _mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | __mode);
}

static __inline void
_MM_SET_FLUSH_ZERO_MODE (unsigned int __mode)
{
  _mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | __mode);
}
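
/* Usage sketch (illustrative only, not part of the GCC header): saving and
   restoring the rounding-mode field of the MXCSR around an operation.
   _mm_cvttss_si32 already truncates directly; the point here is only the
   save/restore pattern.  __convert_toward_zero is a hypothetical name.  */
#if 0
static __inline int
__convert_toward_zero (__m128 __a)
{
  unsigned int __saved = _MM_GET_ROUNDING_MODE ();
  int __r;

  _MM_SET_ROUNDING_MODE (_MM_ROUND_TOWARD_ZERO);
  __r = _mm_cvtss_si32 (__a);
  _MM_SET_ROUNDING_MODE (__saved);
  return __r;
}
#endif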
/* Create a vector with element 0 as F and the rest zero.  */
static __inline __m128
_mm_set_ss (float __F)
{
  return (__m128)(__v4sf){ __F, 0, 0, 0 };
}

/* Create a vector with all four elements equal to F.  */
static __inline __m128
_mm_set1_ps (float __F)
{
  return (__m128)(__v4sf){ __F, __F, __F, __F };
}

static __inline __m128
_mm_set_ps1 (float __F)
{
  return _mm_set1_ps (__F);
}

/* Create a vector with element 0 as *P and the rest zero.  */
static __inline __m128
_mm_load_ss (float const *__P)
{
  return _mm_set_ss (*__P);
}

/* Create a vector with all four elements equal to *P.  */
static __inline __m128
_mm_load1_ps (float const *__P)
{
  return _mm_set1_ps (*__P);
}

static __inline __m128
_mm_load_ps1 (float const *__P)
{
  return _mm_load1_ps (__P);
}

/* Load four SPFP values from P.  The address must be 16-byte aligned.  */
static __inline __m128
_mm_load_ps (float const *__P)
{
  return (__m128) *(__v4sf *)__P;
}

/* Load four SPFP values from P.  The address need not be 16-byte aligned.  */
static __inline __m128
_mm_loadu_ps (float const *__P)
{
  return (__m128) __builtin_ia32_loadups (__P);
}

/* Load four SPFP values in reverse order.  The address must be aligned.  */
static __inline __m128
_mm_loadr_ps (float const *__P)
{
  __v4sf __tmp = *(__v4sf *)__P;
  return (__m128) __builtin_ia32_shufps (__tmp, __tmp, _MM_SHUFFLE (0,1,2,3));
}

/* Create the vector [Z Y X W].  */
static __inline __m128
_mm_set_ps (const float __Z, const float __Y, const float __X, const float __W)
{
  return (__m128)(__v4sf){ __W, __X, __Y, __Z };
}

/* Create the vector [W X Y Z].  */
static __inline __m128
_mm_setr_ps (float __Z, float __Y, float __X, float __W)
{
  return (__m128)(__v4sf){ __Z, __Y, __X, __W };
}
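
/* Usage sketch (illustrative only, not part of the GCC header): _mm_set_ps
   takes its arguments from the highest element down to element 0, while
   _mm_setr_ps takes them in memory order, so the two vectors built below
   are identical.  __example_element_order is a hypothetical name.  */
#if 0
static __inline __m128
__example_element_order (void)
{
  __m128 __a = _mm_set_ps  (4.0f, 3.0f, 2.0f, 1.0f);	/* element 0 is 1.0f */
  __m128 __b = _mm_setr_ps (1.0f, 2.0f, 3.0f, 4.0f);	/* element 0 is 1.0f */

  /* Every lane compares equal, so this returns an all-ones mask.  */
  return _mm_cmpeq_ps (__a, __b);
}
#endif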
/* Stores the lower SPFP value.  */
static __inline void
_mm_store_ss (float *__P, __m128 __A)
{
  *__P = __builtin_ia32_vec_ext_v4sf ((__v4sf)__A, 0);
}

/* Store four SPFP values.  The address must be 16-byte aligned.  */
static __inline void
_mm_store_ps (float *__P, __m128 __A)
{
  *(__v4sf *)__P = (__v4sf)__A;
}

/* Store four SPFP values.  The address need not be 16-byte aligned.  */
static __inline void
_mm_storeu_ps (float *__P, __m128 __A)
{
  __builtin_ia32_storeups (__P, (__v4sf)__A);
}

/* Store the lower SPFP value across four words.  */
static __inline void
_mm_store1_ps (float *__P, __m128 __A)
{
  __v4sf __va = (__v4sf)__A;
  __v4sf __tmp = __builtin_ia32_shufps (__va, __va, _MM_SHUFFLE (0,0,0,0));
  _mm_storeu_ps (__P, __tmp);
}

static __inline void
_mm_store_ps1 (float *__P, __m128 __A)
{
  _mm_store1_ps (__P, __A);
}

/* Store four SPFP values in reverse order.  The address must be aligned.  */
static __inline void
_mm_storer_ps (float *__P, __m128 __A)
{
  __v4sf __va = (__v4sf)__A;
  __v4sf __tmp = __builtin_ia32_shufps (__va, __va, _MM_SHUFFLE (0,1,2,3));
  _mm_store_ps (__P, __tmp);
}

/* Sets the low SPFP value of A from the low value of B.  */
static __inline __m128
_mm_move_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_movss ((__v4sf)__A, (__v4sf)__B);
}
/* Extracts one of the four words of A.  The selector N must be immediate.  */
#if 0
static __inline int __attribute__((__always_inline__))
_mm_extract_pi16 (__m64 const __A, int const __N)
{
  return __builtin_ia32_vec_ext_v4hi ((__v4hi)__A, __N);
}

static __inline int __attribute__((__always_inline__))
_m_pextrw (__m64 const __A, int const __N)
{
  return _mm_extract_pi16 (__A, __N);
}
#else
#define _mm_extract_pi16(A, N) __builtin_ia32_vec_ext_v4hi ((__v4hi)(A), (N))
#define _m_pextrw(A, N) _mm_extract_pi16((A), (N))
#endif

/* Inserts word D into one of four words of A.  The selector N must be
   immediate.  */
#if 0
static __inline __m64 __attribute__((__always_inline__))
_mm_insert_pi16 (__m64 const __A, int const __D, int const __N)
{
  return (__m64) __builtin_ia32_vec_set_v4hi ((__v4hi)__A, __D, __N);
}

static __inline __m64 __attribute__((__always_inline__))
_m_pinsrw (__m64 const __A, int const __D, int const __N)
{
  return _mm_insert_pi16 (__A, __D, __N);
}
#else
#define _mm_insert_pi16(A, D, N) \
  ((__m64) __builtin_ia32_vec_set_v4hi ((__v4hi)(A), (D), (N)))
#define _m_pinsrw(A, D, N) _mm_insert_pi16((A), (D), (N))
#endif
/* Compute the element-wise maximum of signed 16-bit values.  */
static __inline __m64
_mm_max_pi16 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_ia32_pmaxsw ((__v4hi)__A, (__v4hi)__B);
}

static __inline __m64
_m_pmaxsw (__m64 __A, __m64 __B)
{
  return _mm_max_pi16 (__A, __B);
}

/* Compute the element-wise maximum of unsigned 8-bit values.  */
static __inline __m64
_mm_max_pu8 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_ia32_pmaxub ((__v8qi)__A, (__v8qi)__B);
}

static __inline __m64
_m_pmaxub (__m64 __A, __m64 __B)
{
  return _mm_max_pu8 (__A, __B);
}

/* Compute the element-wise minimum of signed 16-bit values.  */
static __inline __m64
_mm_min_pi16 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_ia32_pminsw ((__v4hi)__A, (__v4hi)__B);
}

static __inline __m64
_m_pminsw (__m64 __A, __m64 __B)
{
  return _mm_min_pi16 (__A, __B);
}

/* Compute the element-wise minimum of unsigned 8-bit values.  */
static __inline __m64
_mm_min_pu8 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_ia32_pminub ((__v8qi)__A, (__v8qi)__B);
}

static __inline __m64
_m_pminub (__m64 __A, __m64 __B)
{
  return _mm_min_pu8 (__A, __B);
}

/* Create an 8-bit mask of the signs of 8-bit values.  */
static __inline int
_mm_movemask_pi8 (__m64 __A)
{
  return __builtin_ia32_pmovmskb ((__v8qi)__A);
}

static __inline int
_m_pmovmskb (__m64 __A)
{
  return _mm_movemask_pi8 (__A);
}

/* Multiply four unsigned 16-bit values in A by four unsigned 16-bit values
   in B and produce the high 16 bits of the 32-bit results.  */
static __inline __m64
_mm_mulhi_pu16 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_ia32_pmulhuw ((__v4hi)__A, (__v4hi)__B);
}

static __inline __m64
_m_pmulhuw (__m64 __A, __m64 __B)
{
  return _mm_mulhi_pu16 (__A, __B);
}
/* Return a combination of the four 16-bit values in A.  The selector
   must be an immediate.  */
#if 0
static __inline __m64
_mm_shuffle_pi16 (__m64 __A, int __N)
{
  return (__m64) __builtin_ia32_pshufw ((__v4hi)__A, __N);
}

static __inline __m64
_m_pshufw (__m64 __A, int __N)
{
  return _mm_shuffle_pi16 (__A, __N);
}
#else
#define _mm_shuffle_pi16(A, N) \
  ((__m64) __builtin_ia32_pshufw ((__v4hi)(A), (N)))
#define _m_pshufw(A, N) _mm_shuffle_pi16 ((A), (N))
#endif

/* Conditionally store byte elements of A into P.  The high bit of each
   byte in the selector N determines whether the corresponding byte from
   A is stored.  */
static __inline void
_mm_maskmove_si64 (__m64 __A, __m64 __N, char *__P)
{
  __builtin_ia32_maskmovq ((__v8qi)__A, (__v8qi)__N, __P);
}

static __inline void
_m_maskmovq (__m64 __A, __m64 __N, char *__P)
{
  _mm_maskmove_si64 (__A, __N, __P);
}
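
/* Usage sketch (illustrative only, not part of the GCC header):
   _mm_maskmove_si64 writes only the bytes of A whose selector byte has its
   high bit set, leaving the other destination bytes untouched.  The helper
   name and the use of _mm_setr_pi8 from <mmintrin.h> are assumptions made
   for this example.  */
#if 0
static __inline void
__store_low_four_bytes (__m64 __a, char *__p)
{
  /* High bit set in the four low selector bytes only.  */
  __m64 __mask = _mm_setr_pi8 (-1, -1, -1, -1, 0, 0, 0, 0);
  _mm_maskmove_si64 (__a, __mask, __p);
}
#endif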
/* Compute the rounded averages of the unsigned 8-bit values in A and B.  */
static __inline __m64
_mm_avg_pu8 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_ia32_pavgb ((__v8qi)__A, (__v8qi)__B);
}

static __inline __m64
_m_pavgb (__m64 __A, __m64 __B)
{
  return _mm_avg_pu8 (__A, __B);
}

/* Compute the rounded averages of the unsigned 16-bit values in A and B.  */
static __inline __m64
_mm_avg_pu16 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_ia32_pavgw ((__v4hi)__A, (__v4hi)__B);
}

static __inline __m64
_m_pavgw (__m64 __A, __m64 __B)
{
  return _mm_avg_pu16 (__A, __B);
}

/* Compute the sum of the absolute differences of the unsigned 8-bit
   values in A and B.  Return the value in the lower 16-bit word; the
   upper words are cleared.  */
static __inline __m64
_mm_sad_pu8 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_ia32_psadbw ((__v8qi)__A, (__v8qi)__B);
}

static __inline __m64
_m_psadbw (__m64 __A, __m64 __B)
{
  return _mm_sad_pu8 (__A, __B);
}
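
/* Usage sketch (illustrative only, not part of the GCC header): _mm_sad_pu8
   reduces eight byte differences to a single 16-bit sum, the building block
   of block-matching metrics.  __sad8 is a hypothetical helper, and the use
   of _mm_cvtsi64_si32 from <mmintrin.h> is an assumption here.  */
#if 0
static __inline int
__sad8 (__m64 __a, __m64 __b)
{
  /* The sum is in the low 16 bits; the upper bits of the result are zero.  */
  return _mm_cvtsi64_si32 (_mm_sad_pu8 (__a, __b));
}
#endif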
/* Loads one cache line from address P to a location "closer" to the
   processor.  The selector I specifies the type of prefetch operation.  */
#if 0
static __inline void
_mm_prefetch (void *__P, enum _mm_hint __I)
{
  __builtin_prefetch (__P, 0, __I);
}
#else
#define _mm_prefetch(P, I) \
  __builtin_prefetch ((P), 0, (I))
#endif
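
/* Usage sketch (illustrative only, not part of the GCC header): prefetching
   data a fixed distance ahead of the current pointer with the non-temporal
   hint; the helper name and the 256-byte distance are arbitrary choices for
   this example.  */
#if 0
static __inline void
__prefetch_ahead (const char *__p)
{
  _mm_prefetch (__p + 256, _MM_HINT_NTA);
}
#endif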
/* Stores the data in A to the address P without polluting the caches.  */
static __inline void
_mm_stream_pi (__m64 *__P, __m64 __A)
{
  __builtin_ia32_movntq ((unsigned long long *)__P, (unsigned long long)__A);
}

/* Likewise.  The address must be 16-byte aligned.  */
static __inline void
_mm_stream_ps (float *__P, __m128 __A)
{
  __builtin_ia32_movntps (__P, (__v4sf)__A);
}

/* Guarantees that every preceding store is globally visible before
   any subsequent store.  */
static __inline void
_mm_sfence (void)
{
  __builtin_ia32_sfence ();
}

/* The execution of the next instruction is delayed by an implementation
   specific amount of time.  The instruction does not modify the
   architectural state.  */
static __inline void
_mm_pause (void)
{
  __asm__ __volatile__ ("rep; nop" : : );
}
/* Transpose the 4x4 matrix composed of row[0-3].  */
#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3)			\
do {									\
  __v4sf __r0 = (row0), __r1 = (row1), __r2 = (row2), __r3 = (row3);	\
  __v4sf __t0 = __builtin_ia32_shufps (__r0, __r1, 0x44);		\
  __v4sf __t2 = __builtin_ia32_shufps (__r0, __r1, 0xEE);		\
  __v4sf __t1 = __builtin_ia32_shufps (__r2, __r3, 0x44);		\
  __v4sf __t3 = __builtin_ia32_shufps (__r2, __r3, 0xEE);		\
  (row0) = __builtin_ia32_shufps (__t0, __t1, 0x88);			\
  (row1) = __builtin_ia32_shufps (__t0, __t1, 0xDD);			\
  (row2) = __builtin_ia32_shufps (__t2, __t3, 0x88);			\
  (row3) = __builtin_ia32_shufps (__t2, __t3, 0xDD);			\
} while (0)
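
/* Usage sketch (illustrative only, not part of the GCC header):
   _MM_TRANSPOSE4_PS rewrites its four row arguments in place, so they must
   be modifiable lvalues.  __transpose_example is a hypothetical helper that
   transposes a row-major 4x4 matrix stored at a 16-byte aligned address.  */
#if 0
static __inline void
__transpose_example (float *__mat)
{
  __m128 __row0 = _mm_load_ps (__mat + 0);
  __m128 __row1 = _mm_load_ps (__mat + 4);
  __m128 __row2 = _mm_load_ps (__mat + 8);
  __m128 __row3 = _mm_load_ps (__mat + 12);

  _MM_TRANSPOSE4_PS (__row0, __row1, __row2, __row3);

  _mm_store_ps (__mat + 0, __row0);
  _mm_store_ps (__mat + 4, __row1);
  _mm_store_ps (__mat + 8, __row2);
  _mm_store_ps (__mat + 12, __row3);
}
#endif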
/* For backward source compatibility.  */
#include <emmintrin.h>

#endif /* __SSE__ */
#endif /* _XMMINTRIN_H_INCLUDED */