/* Copyright (C) 2002-2013 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 9.0.  */

#ifndef _XMMINTRIN_H_INCLUDED
#define _XMMINTRIN_H_INCLUDED

/* We need type definitions from the MMX header file.  */
#include <mmintrin.h>

/* Get _mm_malloc () and _mm_free ().  */
#include <mm_malloc.h>

#ifndef __SSE__
#pragma GCC push_options
#pragma GCC target("sse")
#define __DISABLE_SSE__
#endif /* __SSE__ */

/* The Intel API is flexible enough that we must allow aliasing with other
   vector types, and their scalar components.  */
typedef float __m128 __attribute__ ((__vector_size__ (16), __may_alias__));

/* Internal data types for implementing the intrinsics.  */
typedef float __v4sf __attribute__ ((__vector_size__ (16)));

/* Create a selector for use with the SHUFPS instruction.  */
#define _MM_SHUFFLE(fp3,fp2,fp1,fp0) \
 (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | (fp0))
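
/* Illustrative note (not part of the upstream header): the selector packs
   two bits per destination element, so _MM_SHUFFLE (3, 2, 1, 0) evaluates to
   (3 << 6) | (2 << 4) | (1 << 2) | 0 = 0xE4, the identity shuffle.  A sketch
   of typical use with _mm_shuffle_ps, declared further below:

     __m128 v = _mm_setr_ps (1.0f, 2.0f, 3.0f, 4.0f);
     __m128 r = _mm_shuffle_ps (v, v, _MM_SHUFFLE (0, 1, 2, 3));

   Here r holds (4, 3, 2, 1): result element i is chosen by selector bits
   2i and 2i+1, the low two elements from the first operand and the high
   two from the second.  */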
/* Constants for use with _mm_prefetch.  */
enum _mm_hint
{
  _MM_HINT_T0 = 3,
  _MM_HINT_T1 = 2,
  _MM_HINT_T2 = 1,
  _MM_HINT_NTA = 0
};

/* Bits in the MXCSR.  */
#define _MM_EXCEPT_MASK       0x003f
#define _MM_EXCEPT_INVALID    0x0001
#define _MM_EXCEPT_DENORM     0x0002
#define _MM_EXCEPT_DIV_ZERO   0x0004
#define _MM_EXCEPT_OVERFLOW   0x0008
#define _MM_EXCEPT_UNDERFLOW  0x0010
#define _MM_EXCEPT_INEXACT    0x0020

#define _MM_MASK_MASK         0x1f80
#define _MM_MASK_INVALID      0x0080
#define _MM_MASK_DENORM       0x0100
#define _MM_MASK_DIV_ZERO     0x0200
#define _MM_MASK_OVERFLOW     0x0400
#define _MM_MASK_UNDERFLOW    0x0800
#define _MM_MASK_INEXACT      0x1000

#define _MM_ROUND_MASK        0x6000
#define _MM_ROUND_NEAREST     0x0000
#define _MM_ROUND_DOWN        0x2000
#define _MM_ROUND_UP          0x4000
#define _MM_ROUND_TOWARD_ZERO 0x6000

#define _MM_FLUSH_ZERO_MASK   0x8000
#define _MM_FLUSH_ZERO_ON     0x8000
#define _MM_FLUSH_ZERO_OFF    0x0000

/* Create a vector of zeros.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setzero_ps (void)
{
  return __extension__ (__m128){ 0.0f, 0.0f, 0.0f, 0.0f };
}

/* Perform the respective operation on the lower SPFP (single-precision
   floating-point) values of A and B; the upper three SPFP values are
   passed through from A.  */

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_addss ((__v4sf)__A, (__v4sf)__B);
}
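
/* Illustrative note (not part of the upstream header): only element 0 takes
   part in an _ss operation; elements 1-3 of the result come from the first
   operand unchanged.  A sketch:

     __m128 a = _mm_setr_ps (1.0f, 2.0f, 3.0f, 4.0f);
     __m128 b = _mm_setr_ps (10.0f, 20.0f, 30.0f, 40.0f);
     __m128 r = _mm_add_ss (a, b);

   Here r holds (11, 2, 3, 4): the sum lands in element 0 only.  */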
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_subss ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mul_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_mulss ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_div_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_divss ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sqrt_ss (__m128 __A)
{
  return (__m128) __builtin_ia32_sqrtss ((__v4sf)__A);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_rcp_ss (__m128 __A)
{
  return (__m128) __builtin_ia32_rcpss ((__v4sf)__A);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_rsqrt_ss (__m128 __A)
{
  return (__m128) __builtin_ia32_rsqrtss ((__v4sf)__A);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_minss ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_maxss ((__v4sf)__A, (__v4sf)__B);
}

/* Perform the respective operation on the four SPFP values in A and B.  */

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_addps ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_subps ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mul_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_mulps ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_div_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_divps ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sqrt_ps (__m128 __A)
{
  return (__m128) __builtin_ia32_sqrtps ((__v4sf)__A);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_rcp_ps (__m128 __A)
{
  return (__m128) __builtin_ia32_rcpps ((__v4sf)__A);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_rsqrt_ps (__m128 __A)
{
  return (__m128) __builtin_ia32_rsqrtps ((__v4sf)__A);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_minps ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_maxps ((__v4sf)__A, (__v4sf)__B);
}

/* Perform logical bit-wise operations on 128-bit values.  */

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_and_ps (__m128 __A, __m128 __B)
{
  return __builtin_ia32_andps (__A, __B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_andnot_ps (__m128 __A, __m128 __B)
{
  return __builtin_ia32_andnps (__A, __B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_or_ps (__m128 __A, __m128 __B)
{
  return __builtin_ia32_orps (__A, __B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_xor_ps (__m128 __A, __m128 __B)
{
  return __builtin_ia32_xorps (__A, __B);
}

/* Perform a comparison on the lower SPFP values of A and B.  If the
   comparison is true, place a mask of all ones in the result, otherwise a
   mask of zeros.  The upper three SPFP values are passed through from A.  */

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpeqss ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmplt_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpltss ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmple_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpless ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_movss ((__v4sf) __A,
                                        (__v4sf)
                                        __builtin_ia32_cmpltss ((__v4sf) __B,
                                                                (__v4sf)
                                                                __A));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpge_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_movss ((__v4sf) __A,
                                        (__v4sf)
                                        __builtin_ia32_cmpless ((__v4sf) __B,
                                                                (__v4sf)
                                                                __A));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpneq_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpneqss ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnlt_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpnltss ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnle_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpnless ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpngt_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_movss ((__v4sf) __A,
                                        (__v4sf)
                                        __builtin_ia32_cmpnltss ((__v4sf) __B,
                                                                 (__v4sf)
                                                                 __A));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnge_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_movss ((__v4sf) __A,
                                        (__v4sf)
                                        __builtin_ia32_cmpnless ((__v4sf) __B,
                                                                 (__v4sf)
                                                                 __A));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpord_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpordss ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpunord_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpunordss ((__v4sf)__A, (__v4sf)__B);
}

/* Perform a comparison on the four SPFP values of A and B.  For each
   element, if the comparison is true, place a mask of all ones in the
   result, otherwise a mask of zeros.  */

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpeqps ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmplt_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpltps ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmple_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpleps ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpgtps ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpge_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpgeps ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpneq_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpneqps ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnlt_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpnltps ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnle_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpnleps ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpngt_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpngtps ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnge_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpngeps ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpord_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpordps ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpunord_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpunordps ((__v4sf)__A, (__v4sf)__B);
}
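
/* Illustrative note (not part of the upstream header): because a true
   comparison yields a lane of all one bits, the result can drive a
   branchless per-lane select with the logical intrinsics above.  A sketch
   that recomputes an element-wise minimum of two hypothetical vectors
   a and b by hand:

     __m128 mask = _mm_cmplt_ps (a, b);
     __m128 min  = _mm_or_ps (_mm_and_ps (mask, a),
                              _mm_andnot_ps (mask, b));

   Each lane of min takes a where a < b and b otherwise.  */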
/* Compare the lower SPFP values of A and B and return 1 if true
   and 0 if false.  */

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comieq_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_comieq ((__v4sf)__A, (__v4sf)__B);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comilt_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_comilt ((__v4sf)__A, (__v4sf)__B);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comile_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_comile ((__v4sf)__A, (__v4sf)__B);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comigt_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_comigt ((__v4sf)__A, (__v4sf)__B);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comige_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_comige ((__v4sf)__A, (__v4sf)__B);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comineq_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_comineq ((__v4sf)__A, (__v4sf)__B);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomieq_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_ucomieq ((__v4sf)__A, (__v4sf)__B);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomilt_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_ucomilt ((__v4sf)__A, (__v4sf)__B);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomile_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_ucomile ((__v4sf)__A, (__v4sf)__B);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomigt_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_ucomigt ((__v4sf)__A, (__v4sf)__B);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomige_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_ucomige ((__v4sf)__A, (__v4sf)__B);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomineq_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_ucomineq ((__v4sf)__A, (__v4sf)__B);
}

/* Convert the lower SPFP value to a 32-bit integer according to the current
   rounding mode.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtss_si32 (__m128 __A)
{
  return __builtin_ia32_cvtss2si ((__v4sf) __A);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvt_ss2si (__m128 __A)
{
  return _mm_cvtss_si32 (__A);
}

#ifdef __x86_64__
/* Convert the lower SPFP value to a 32-bit integer according to the
   current rounding mode.  */

/* Intel intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtss_si64 (__m128 __A)
{
  return __builtin_ia32_cvtss2si64 ((__v4sf) __A);
}

/* Microsoft intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtss_si64x (__m128 __A)
{
  return __builtin_ia32_cvtss2si64 ((__v4sf) __A);
}
#endif

/* Convert the two lower SPFP values to 32-bit integers according to the
   current rounding mode.  Return the integers in packed form.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtps_pi32 (__m128 __A)
{
  return (__m64) __builtin_ia32_cvtps2pi ((__v4sf) __A);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvt_ps2pi (__m128 __A)
{
  return _mm_cvtps_pi32 (__A);
}

/* Truncate the lower SPFP value to a 32-bit integer.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttss_si32 (__m128 __A)
{
  return __builtin_ia32_cvttss2si ((__v4sf) __A);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtt_ss2si (__m128 __A)
{
  return _mm_cvttss_si32 (__A);
}

#ifdef __x86_64__
/* Truncate the lower SPFP value to a 32-bit integer.  */

/* Intel intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttss_si64 (__m128 __A)
{
  return __builtin_ia32_cvttss2si64 ((__v4sf) __A);
}

/* Microsoft intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttss_si64x (__m128 __A)
{
  return __builtin_ia32_cvttss2si64 ((__v4sf) __A);
}
#endif

/* Truncate the two lower SPFP values to 32-bit integers.  Return the
   integers in packed form.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttps_pi32 (__m128 __A)
{
  return (__m64) __builtin_ia32_cvttps2pi ((__v4sf) __A);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtt_ps2pi (__m128 __A)
{
  return _mm_cvttps_pi32 (__A);
}
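
/* Illustrative note (not part of the upstream header): _mm_cvtss_si32
   honours the current MXCSR rounding mode, while _mm_cvttss_si32 always
   truncates toward zero.  A sketch, assuming the default round-to-nearest
   mode and using _mm_set_ss declared further below:

     int a = _mm_cvtss_si32 (_mm_set_ss (1.7f));
     int b = _mm_cvttss_si32 (_mm_set_ss (1.7f));

   Here a is 2 and b is 1.  */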
/* Convert B to a SPFP value and insert it as element zero in A.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi32_ss (__m128 __A, int __B)
{
  return (__m128) __builtin_ia32_cvtsi2ss ((__v4sf) __A, __B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvt_si2ss (__m128 __A, int __B)
{
  return _mm_cvtsi32_ss (__A, __B);
}

#ifdef __x86_64__
/* Convert B to a SPFP value and insert it as element zero in A.  */

/* Intel intrinsic.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64_ss (__m128 __A, long long __B)
{
  return (__m128) __builtin_ia32_cvtsi642ss ((__v4sf) __A, __B);
}

/* Microsoft intrinsic.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64x_ss (__m128 __A, long long __B)
{
  return (__m128) __builtin_ia32_cvtsi642ss ((__v4sf) __A, __B);
}
#endif

/* Convert the two 32-bit values in B to SPFP form and insert them
   as the two lower elements in A.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpi32_ps (__m128 __A, __m64 __B)
{
  return (__m128) __builtin_ia32_cvtpi2ps ((__v4sf) __A, (__v2si)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvt_pi2ps (__m128 __A, __m64 __B)
{
  return _mm_cvtpi32_ps (__A, __B);
}

/* Convert the four signed 16-bit values in A to SPFP form.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpi16_ps (__m64 __A)
{
  __v4hi __sign;
  __v2si __hisi, __losi;
  __v4sf __zero, __ra, __rb;

  /* This comparison against zero gives us a mask that can be used to
     fill in the missing sign bits in the unpack operations below, so
     that we get signed values after unpacking.  */
  __sign = __builtin_ia32_pcmpgtw ((__v4hi)0LL, (__v4hi)__A);

  /* Convert the four words to doublewords.  */
  __losi = (__v2si) __builtin_ia32_punpcklwd ((__v4hi)__A, __sign);
  __hisi = (__v2si) __builtin_ia32_punpckhwd ((__v4hi)__A, __sign);

  /* Convert the doublewords to floating point two at a time.  */
  __zero = (__v4sf) _mm_setzero_ps ();
  __ra = __builtin_ia32_cvtpi2ps (__zero, __losi);
  __rb = __builtin_ia32_cvtpi2ps (__ra, __hisi);

  return (__m128) __builtin_ia32_movlhps (__ra, __rb);
}

/* Convert the four unsigned 16-bit values in A to SPFP form.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpu16_ps (__m64 __A)
{
  __v2si __hisi, __losi;
  __v4sf __zero, __ra, __rb;

  /* Convert the four words to doublewords.  */
  __losi = (__v2si) __builtin_ia32_punpcklwd ((__v4hi)__A, (__v4hi)0LL);
  __hisi = (__v2si) __builtin_ia32_punpckhwd ((__v4hi)__A, (__v4hi)0LL);

  /* Convert the doublewords to floating point two at a time.  */
  __zero = (__v4sf) _mm_setzero_ps ();
  __ra = __builtin_ia32_cvtpi2ps (__zero, __losi);
  __rb = __builtin_ia32_cvtpi2ps (__ra, __hisi);

  return (__m128) __builtin_ia32_movlhps (__ra, __rb);
}

/* Convert the low four signed 8-bit values in A to SPFP form.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpi8_ps (__m64 __A)
{
  __v8qi __sign;

  /* This comparison against zero gives us a mask that can be used to
     fill in the missing sign bits in the unpack operations below, so
     that we get signed values after unpacking.  */
  __sign = __builtin_ia32_pcmpgtb ((__v8qi)0LL, (__v8qi)__A);

  /* Convert the four low bytes to words.  */
  __A = (__m64) __builtin_ia32_punpcklbw ((__v8qi)__A, __sign);

  return _mm_cvtpi16_ps(__A);
}

/* Convert the low four unsigned 8-bit values in A to SPFP form.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpu8_ps(__m64 __A)
{
  __A = (__m64) __builtin_ia32_punpcklbw ((__v8qi)__A, (__v8qi)0LL);
  return _mm_cvtpu16_ps(__A);
}

/* Convert the four signed 32-bit values in A and B to SPFP form.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpi32x2_ps(__m64 __A, __m64 __B)
{
  __v4sf __zero = (__v4sf) _mm_setzero_ps ();
  __v4sf __sfa = __builtin_ia32_cvtpi2ps (__zero, (__v2si)__A);
  __v4sf __sfb = __builtin_ia32_cvtpi2ps (__sfa, (__v2si)__B);
  return (__m128) __builtin_ia32_movlhps (__sfa, __sfb);
}

/* Convert the four SPFP values in A to four signed 16-bit integers.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtps_pi16(__m128 __A)
{
  __v4sf __hisf = (__v4sf)__A;
  __v4sf __losf = __builtin_ia32_movhlps (__hisf, __hisf);
  __v2si __hisi = __builtin_ia32_cvtps2pi (__hisf);
  __v2si __losi = __builtin_ia32_cvtps2pi (__losf);
  return (__m64) __builtin_ia32_packssdw (__hisi, __losi);
}

/* Convert the four SPFP values in A to four signed 8-bit integers.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtps_pi8(__m128 __A)
{
  __v4hi __tmp = (__v4hi) _mm_cvtps_pi16 (__A);
  return (__m64) __builtin_ia32_packsswb (__tmp, (__v4hi)0LL);
}

/* Selects four specific SPFP values from A and B based on MASK.  */
#ifdef __OPTIMIZE__
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_ps (__m128 __A, __m128 __B, int const __mask)
{
  return (__m128) __builtin_ia32_shufps ((__v4sf)__A, (__v4sf)__B, __mask);
}
#else
#define _mm_shuffle_ps(A, B, MASK) \
  ((__m128) __builtin_ia32_shufps ((__v4sf)(__m128)(A), \
                                   (__v4sf)(__m128)(B), (int)(MASK)))
#endif

/* Selects and interleaves the upper two SPFP values from A and B.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_unpckhps ((__v4sf)__A, (__v4sf)__B);
}

/* Selects and interleaves the lower two SPFP values from A and B.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_unpcklps ((__v4sf)__A, (__v4sf)__B);
}

/* Sets the upper two SPFP values with 64-bits of data loaded from P;
   the lower two values are passed through from A.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadh_pi (__m128 __A, __m64 const *__P)
{
  return (__m128) __builtin_ia32_loadhps ((__v4sf)__A, (const __v2sf *)__P);
}

/* Stores the upper two SPFP values of A into P.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storeh_pi (__m64 *__P, __m128 __A)
{
  __builtin_ia32_storehps ((__v2sf *)__P, (__v4sf)__A);
}

/* Moves the upper two values of B into the lower two values of A.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movehl_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_movhlps ((__v4sf)__A, (__v4sf)__B);
}

/* Moves the lower two values of B into the upper two values of A.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movelh_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_movlhps ((__v4sf)__A, (__v4sf)__B);
}

/* Sets the lower two SPFP values with 64-bits of data loaded from P;
   the upper two values are passed through from A.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadl_pi (__m128 __A, __m64 const *__P)
{
  return (__m128) __builtin_ia32_loadlps ((__v4sf)__A, (const __v2sf *)__P);
}

/* Stores the lower two SPFP values of A into P.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storel_pi (__m64 *__P, __m128 __A)
{
  __builtin_ia32_storelps ((__v2sf *)__P, (__v4sf)__A);
}

/* Creates a 4-bit mask from the most significant bits of the SPFP values.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movemask_ps (__m128 __A)
{
  return __builtin_ia32_movmskps ((__v4sf)__A);
}

/* Return the contents of the control register.  */
extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_getcsr (void)
{
  return __builtin_ia32_stmxcsr ();
}

/* Read exception bits from the control register.  */
extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_MM_GET_EXCEPTION_STATE (void)
{
  return _mm_getcsr() & _MM_EXCEPT_MASK;
}

extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_MM_GET_EXCEPTION_MASK (void)
{
  return _mm_getcsr() & _MM_MASK_MASK;
}

extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_MM_GET_ROUNDING_MODE (void)
{
  return _mm_getcsr() & _MM_ROUND_MASK;
}

extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_MM_GET_FLUSH_ZERO_MODE (void)
{
  return _mm_getcsr() & _MM_FLUSH_ZERO_MASK;
}

/* Set the control register to I.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setcsr (unsigned int __I)
{
  __builtin_ia32_ldmxcsr (__I);
}

/* Set exception bits in the control register.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_MM_SET_EXCEPTION_STATE(unsigned int __mask)
{
  _mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | __mask);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_MM_SET_EXCEPTION_MASK (unsigned int __mask)
{
  _mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | __mask);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_MM_SET_ROUNDING_MODE (unsigned int __mode)
{
  _mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | __mode);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_MM_SET_FLUSH_ZERO_MODE (unsigned int __mode)
{
  _mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | __mode);
}
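
/* Illustrative note (not part of the upstream header): the _MM_SET_* helpers
   are read-modify-write wrappers around _mm_getcsr/_mm_setcsr, so only the
   named MXCSR field changes.  A sketch that switches to truncation and
   enables flush-to-zero for denormal results:

     _MM_SET_ROUNDING_MODE (_MM_ROUND_TOWARD_ZERO);
     _MM_SET_FLUSH_ZERO_MODE (_MM_FLUSH_ZERO_ON);
     unsigned int mode = _MM_GET_ROUNDING_MODE ();

   Here mode compares equal to _MM_ROUND_TOWARD_ZERO.  */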
/* Create a vector with element 0 as F and the rest zero.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_ss (float __F)
{
  return __extension__ (__m128)(__v4sf){ __F, 0.0f, 0.0f, 0.0f };
}

/* Create a vector with all four elements equal to F.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_ps (float __F)
{
  return __extension__ (__m128)(__v4sf){ __F, __F, __F, __F };
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_ps1 (float __F)
{
  return _mm_set1_ps (__F);
}

/* Create a vector with element 0 as *P and the rest zero.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load_ss (float const *__P)
{
  return _mm_set_ss (*__P);
}

/* Create a vector with all four elements equal to *P.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load1_ps (float const *__P)
{
  return _mm_set1_ps (*__P);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load_ps1 (float const *__P)
{
  return _mm_load1_ps (__P);
}

/* Load four SPFP values from P.  The address must be 16-byte aligned.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load_ps (float const *__P)
{
  return (__m128) *(__v4sf *)__P;
}

/* Load four SPFP values from P.  The address need not be 16-byte aligned.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadu_ps (float const *__P)
{
  return (__m128) __builtin_ia32_loadups (__P);
}
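
/* Illustrative note (not part of the upstream header): _mm_load_ps requires
   a 16-byte-aligned address, which <mm_malloc.h> (included above) can
   provide, while _mm_loadu_ps accepts any address.  A sketch using store
   intrinsics declared further below:

     float *p = (float *) _mm_malloc (8 * sizeof (float), 16);
     _mm_store_ps (p, _mm_set1_ps (1.0f));
     _mm_store_ps (p + 4, _mm_set1_ps (2.0f));
     __m128 v = _mm_load_ps (p);
     __m128 w = _mm_loadu_ps (p + 1);
     _mm_free (p);

   The aligned load reads p directly; the unaligned load may read any
   offset at the cost of a slower instruction on older processors.  */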
/* Load four SPFP values in reverse order.  The address must be aligned.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadr_ps (float const *__P)
{
  __v4sf __tmp = *(__v4sf *)__P;
  return (__m128) __builtin_ia32_shufps (__tmp, __tmp, _MM_SHUFFLE (0,1,2,3));
}

/* Create the vector [Z Y X W].  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_ps (const float __Z, const float __Y, const float __X, const float __W)
{
  return __extension__ (__m128)(__v4sf){ __W, __X, __Y, __Z };
}

/* Create the vector [W X Y Z].  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_ps (float __Z, float __Y, float __X, float __W)
{
  return __extension__ (__m128)(__v4sf){ __Z, __Y, __X, __W };
}
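
/* Illustrative note (not part of the upstream header): _mm_set_ps lists its
   arguments from the highest element down, _mm_setr_ps from the lowest up,
   so the two calls below build the same vector:

     __m128 a = _mm_set_ps  (4.0f, 3.0f, 2.0f, 1.0f);
     __m128 b = _mm_setr_ps (1.0f, 2.0f, 3.0f, 4.0f);

   In both, element 0 is 1.0f and element 3 is 4.0f.  */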
/* Stores the lower SPFP value.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store_ss (float *__P, __m128 __A)
{
  *__P = __builtin_ia32_vec_ext_v4sf ((__v4sf)__A, 0);
}

extern __inline float __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtss_f32 (__m128 __A)
{
  return __builtin_ia32_vec_ext_v4sf ((__v4sf)__A, 0);
}

/* Store four SPFP values.  The address must be 16-byte aligned.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store_ps (float *__P, __m128 __A)
{
  *(__v4sf *)__P = (__v4sf)__A;
}

/* Store four SPFP values.  The address need not be 16-byte aligned.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storeu_ps (float *__P, __m128 __A)
{
  __builtin_ia32_storeups (__P, (__v4sf)__A);
}

/* Store the lower SPFP value across four words.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store1_ps (float *__P, __m128 __A)
{
  __v4sf __va = (__v4sf)__A;
  __v4sf __tmp = __builtin_ia32_shufps (__va, __va, _MM_SHUFFLE (0,0,0,0));
  _mm_storeu_ps (__P, __tmp);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store_ps1 (float *__P, __m128 __A)
{
  _mm_store1_ps (__P, __A);
}

/* Store four SPFP values in reverse order.  The address must be aligned.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storer_ps (float *__P, __m128 __A)
{
  __v4sf __va = (__v4sf)__A;
  __v4sf __tmp = __builtin_ia32_shufps (__va, __va, _MM_SHUFFLE (0,1,2,3));
  _mm_store_ps (__P, __tmp);
}

/* Sets the low SPFP value of A from the low value of B.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_move_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_movss ((__v4sf)__A, (__v4sf)__B);
}

/* Extracts one of the four words of A.  The selector N must be immediate.  */
#ifdef __OPTIMIZE__
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_extract_pi16 (__m64 const __A, int const __N)
{
  return __builtin_ia32_vec_ext_v4hi ((__v4hi)__A, __N);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pextrw (__m64 const __A, int const __N)
{
  return _mm_extract_pi16 (__A, __N);
}
#else
#define _mm_extract_pi16(A, N) \
  ((int) __builtin_ia32_vec_ext_v4hi ((__v4hi)(__m64)(A), (int)(N)))

#define _m_pextrw(A, N) _mm_extract_pi16(A, N)
#endif

/* Inserts word D into one of four words of A.  The selector N must be
   immediate.  */
#ifdef __OPTIMIZE__
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_insert_pi16 (__m64 const __A, int const __D, int const __N)
{
  return (__m64) __builtin_ia32_vec_set_v4hi ((__v4hi)__A, __D, __N);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pinsrw (__m64 const __A, int const __D, int const __N)
{
  return _mm_insert_pi16 (__A, __D, __N);
}
#else
#define _mm_insert_pi16(A, D, N) \
  ((__m64) __builtin_ia32_vec_set_v4hi ((__v4hi)(__m64)(A), \
                                        (int)(D), (int)(N)))

#define _m_pinsrw(A, D, N) _mm_insert_pi16(A, D, N)
#endif

/* Compute the element-wise maximum of signed 16-bit values.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_pi16 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_ia32_pmaxsw ((__v4hi)__A, (__v4hi)__B);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmaxsw (__m64 __A, __m64 __B)
{
  return _mm_max_pi16 (__A, __B);
}

/* Compute the element-wise maximum of unsigned 8-bit values.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_pu8 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_ia32_pmaxub ((__v8qi)__A, (__v8qi)__B);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmaxub (__m64 __A, __m64 __B)
{
  return _mm_max_pu8 (__A, __B);
}

/* Compute the element-wise minimum of signed 16-bit values.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_pi16 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_ia32_pminsw ((__v4hi)__A, (__v4hi)__B);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pminsw (__m64 __A, __m64 __B)
{
  return _mm_min_pi16 (__A, __B);
}

/* Compute the element-wise minimum of unsigned 8-bit values.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_pu8 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_ia32_pminub ((__v8qi)__A, (__v8qi)__B);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pminub (__m64 __A, __m64 __B)
{
  return _mm_min_pu8 (__A, __B);
}

/* Create an 8-bit mask of the signs of 8-bit values.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movemask_pi8 (__m64 __A)
{
  return __builtin_ia32_pmovmskb ((__v8qi)__A);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmovmskb (__m64 __A)
{
  return _mm_movemask_pi8 (__A);
}

/* Multiply four unsigned 16-bit values in A by four unsigned 16-bit values
   in B and produce the high 16 bits of the 32-bit results.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhi_pu16 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_ia32_pmulhuw ((__v4hi)__A, (__v4hi)__B);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmulhuw (__m64 __A, __m64 __B)
{
  return _mm_mulhi_pu16 (__A, __B);
}

/* Return a combination of the four 16-bit values in A.  The selector
   must be an immediate.  */
#ifdef __OPTIMIZE__
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_pi16 (__m64 __A, int const __N)
{
  return (__m64) __builtin_ia32_pshufw ((__v4hi)__A, __N);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pshufw (__m64 __A, int const __N)
{
  return _mm_shuffle_pi16 (__A, __N);
}
#else
#define _mm_shuffle_pi16(A, N) \
  ((__m64) __builtin_ia32_pshufw ((__v4hi)(__m64)(A), (int)(N)))

#define _m_pshufw(A, N) _mm_shuffle_pi16 (A, N)
#endif

/* Conditionally store byte elements of A into P.  The high bit of each
   byte in the selector N determines whether the corresponding byte from
   A is stored.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskmove_si64 (__m64 __A, __m64 __N, char *__P)
{
  __builtin_ia32_maskmovq ((__v8qi)__A, (__v8qi)__N, __P);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_maskmovq (__m64 __A, __m64 __N, char *__P)
{
  _mm_maskmove_si64 (__A, __N, __P);
}

/* Compute the rounded averages of the unsigned 8-bit values in A and B.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_avg_pu8 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_ia32_pavgb ((__v8qi)__A, (__v8qi)__B);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pavgb (__m64 __A, __m64 __B)
{
  return _mm_avg_pu8 (__A, __B);
}

/* Compute the rounded averages of the unsigned 16-bit values in A and B.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_avg_pu16 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_ia32_pavgw ((__v4hi)__A, (__v4hi)__B);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pavgw (__m64 __A, __m64 __B)
{
  return _mm_avg_pu16 (__A, __B);
}

/* Compute the sum of the absolute differences of the unsigned 8-bit
   values in A and B.  Return the value in the lower 16-bit word; the
   upper words are cleared.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sad_pu8 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_ia32_psadbw ((__v8qi)__A, (__v8qi)__B);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psadbw (__m64 __A, __m64 __B)
{
  return _mm_sad_pu8 (__A, __B);
}

/* Loads one cache line from address P to a location "closer" to the
   processor.  The selector I specifies the type of prefetch operation.  */
#ifdef __OPTIMIZE__
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_prefetch (const void *__P, enum _mm_hint __I)
{
  __builtin_prefetch (__P, 0, __I);
}
#else
#define _mm_prefetch(P, I) \
  __builtin_prefetch ((P), 0, (I))
#endif
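
/* Illustrative note (not part of the upstream header): the hint selects how
   close to the core the fetched line should be kept; _MM_HINT_NTA marks the
   data as non-temporal so it is less likely to displace useful cache
   contents.  A sketch, where p is a hypothetical pointer into an array that
   is being processed sequentially:

     _mm_prefetch (p + 64, _MM_HINT_T0);

   This requests the line 64 floats ahead of the current position.  */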
/* Stores the data in A to the address P without polluting the caches.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_stream_pi (__m64 *__P, __m64 __A)
{
  __builtin_ia32_movntq ((unsigned long long *)__P, (unsigned long long)__A);
}

/* Likewise.  The address must be 16-byte aligned.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_stream_ps (float *__P, __m128 __A)
{
  __builtin_ia32_movntps (__P, (__v4sf)__A);
}

/* Guarantees that every preceding store is globally visible before
   any subsequent store.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sfence (void)
{
  __builtin_ia32_sfence ();
}
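
/* Illustrative note (not part of the upstream header): non-temporal stores
   are weakly ordered, so a producer typically publishes them with an sfence
   before signalling a consumer.  A sketch, where dst is a hypothetical
   16-byte-aligned float pointer and v a previously computed __m128:

     _mm_stream_ps (dst, v);
     _mm_sfence ();

   The fence ensures the streaming store is globally visible before any
   store that follows it.  */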
/* The execution of the next instruction is delayed by an implementation
   specific amount of time.  The instruction does not modify the
   architectural state.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_pause (void)
{
  __builtin_ia32_pause ();
}

/* Transpose the 4x4 matrix composed of row[0-3].  */
#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
do { \
  __v4sf __r0 = (row0), __r1 = (row1), __r2 = (row2), __r3 = (row3); \
  __v4sf __t0 = __builtin_ia32_unpcklps (__r0, __r1); \
  __v4sf __t1 = __builtin_ia32_unpcklps (__r2, __r3); \
  __v4sf __t2 = __builtin_ia32_unpckhps (__r0, __r1); \
  __v4sf __t3 = __builtin_ia32_unpckhps (__r2, __r3); \
  (row0) = __builtin_ia32_movlhps (__t0, __t1); \
  (row1) = __builtin_ia32_movhlps (__t1, __t0); \
  (row2) = __builtin_ia32_movlhps (__t2, __t3); \
  (row3) = __builtin_ia32_movhlps (__t3, __t2); \
} while (0)
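
/* Illustrative note (not part of the upstream header): the macro rewrites
   its arguments in place, so it is applied to four named __m128 rows.  A
   sketch:

     __m128 r0 = _mm_setr_ps ( 1.0f,  2.0f,  3.0f,  4.0f);
     __m128 r1 = _mm_setr_ps ( 5.0f,  6.0f,  7.0f,  8.0f);
     __m128 r2 = _mm_setr_ps ( 9.0f, 10.0f, 11.0f, 12.0f);
     __m128 r3 = _mm_setr_ps (13.0f, 14.0f, 15.0f, 16.0f);
     _MM_TRANSPOSE4_PS (r0, r1, r2, r3);

   Afterwards r0 holds (1, 5, 9, 13), the first column of the original
   matrix, and the other rows hold the remaining columns.  */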
/* For backward source compatibility.  */
# include <emmintrin.h>

#ifdef __DISABLE_SSE__
#undef __DISABLE_SSE__
#pragma GCC pop_options
#endif /* __DISABLE_SSE__ */

#endif /* _XMMINTRIN_H_INCLUDED */