1 /* Copyright (C) 2002-2018 Free Software Foundation, Inc.
3 This file is part of GCC.
5 GCC is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 3, or (at your option)
8 any later version.
10 GCC is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
15 Under Section 7 of GPL version 3, you are granted additional
16 permissions described in the GCC Runtime Library Exception, version
17 3.1, as published by the Free Software Foundation.
19 You should have received a copy of the GNU General Public License and
20 a copy of the GCC Runtime Library Exception along with this program;
21 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
22 <http://www.gnu.org/licenses/>. */
24 /* Implemented from the specification included in the Intel C++ Compiler
25 User Guide and Reference, version 9.0. */
27 #ifndef NO_WARN_X86_INTRINSICS
28 /* This header is distributed to simplify porting x86_64 code that
29 makes explicit use of Intel intrinsics to powerpc64le.
30 It is the user's responsibility to determine if the results are
31 acceptable and make additional changes as necessary.
32 Note that much code that uses Intel intrinsics can be rewritten in
33 standard C or GNU C extensions, which are more portable and better
34 optimized across multiple targets.
36 In the specific case of X86 SSE (__m128) intrinsics, the PowerPC
37 VMX/VSX ISA is a good match for vector float SIMD operations.
However, scalar float operations in vector (XMM) registers require
the POWER8 VSX ISA (2.07) level.  There are also important
differences in the data format and placement of float scalars in the
vector register.  For PowerISA, scalar floats in FPRs (the leftmost
64 bits of the low 32 VSRs) are in double format, while X86_64 SSE
uses the rightmost 32 bits of the XMM register.  These differences
require extra steps on POWER to match the SSE scalar float semantics.
46 Most SSE scalar float intrinsic operations can be performed more
47 efficiently as C language float scalar operations or optimized to
48 use vector SIMD operations. We recommend this for new applications.
Another difference is the format and details of the X86_64 MXCSR vs
the PowerISA FPSCR / VSCR registers.  We recommend applications
replace direct access to the MXCSR with the more portable <fenv.h>
POSIX APIs. */
54 #error "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error."
55 #endif
57 #ifndef _XMMINTRIN_H_INCLUDED
58 #define _XMMINTRIN_H_INCLUDED
60 /* Define four value permute mask */
61 #define _MM_SHUFFLE(w,x,y,z) (((w) << 6) | ((x) << 4) | ((y) << 2) | (z))
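/* A minimal illustration (not part of the API): _MM_SHUFFLE packs four
   2-bit element selectors into one 8-bit immediate, with the selector for
   the highest result element in the most significant bits.  For example:

     _MM_SHUFFLE (3, 2, 1, 0)   expands to 0xE4  (the identity selector)
     _MM_SHUFFLE (0, 1, 2, 3)   expands to 0x1B  (reverse the elements)

   The resulting value is the MASK argument expected by _mm_shuffle_ps
   further below.  */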
63 #include <altivec.h>
65 /* Avoid collisions between altivec.h and strict adherence to C++ and
66 C11 standards. This should eventually be done inside altivec.h itself,
67 but only after testing a full distro build. */
68 #if defined(__STRICT_ANSI__) && (defined(__cplusplus) || \
69 (defined(__STDC_VERSION__) && \
70 __STDC_VERSION__ >= 201112L))
71 #undef vector
72 #undef pixel
73 #undef bool
74 #endif
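/* Illustrative note: once the context-sensitive keywords above are removed,
   strictly conforming translation units should spell the AltiVec types with
   the reserved identifiers, e.g.:

     __vector float vf = { 1.0f, 2.0f, 3.0f, 4.0f };
     __vector __bool int vb = vec_cmpeq (vf, vf);
 */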
76 #include <assert.h>
78 /* We need type definitions from the MMX header file. */
79 #include <mmintrin.h>
81 /* Get _mm_malloc () and _mm_free (). */
82 #include <mm_malloc.h>
84 /* The Intel API is flexible enough that we must allow aliasing with other
85 vector types, and their scalar components. */
86 typedef float __m128 __attribute__ ((__vector_size__ (16), __may_alias__));
88 /* Unaligned version of the same type. */
89 typedef float __m128_u __attribute__ ((__vector_size__ (16), __may_alias__,
90 __aligned__ (1)));
92 /* Internal data types for implementing the intrinsics. */
93 typedef float __v4sf __attribute__ ((__vector_size__ (16)));
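/* Because of __may_alias__, an __m128 can be reinterpreted as the internal
   __v4sf type (and vice versa) without violating strict aliasing.  A minimal
   sketch:

     __m128 x = _mm_set1_ps (1.0f);
     float lane0 = ((__v4sf) x)[0];   // per-element access via the GNU
                                      // vector extension
 */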
95 /* Create an undefined vector. */
96 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
97 _mm_undefined_ps (void)
99 __m128 __Y = __Y;
100 return __Y;
103 /* Create a vector of zeros. */
104 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
105 _mm_setzero_ps (void)
107 return __extension__ (__m128){ 0.0f, 0.0f, 0.0f, 0.0f };
110 /* Load four SPFP values from P. The address must be 16-byte aligned. */
111 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
112 _mm_load_ps (float const *__P)
114 assert(((unsigned long)__P & 0xfUL) == 0UL);
115 return ((__m128)vec_ld(0, (__v4sf*)__P));
118 /* Load four SPFP values from P. The address need not be 16-byte aligned. */
119 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
120 _mm_loadu_ps (float const *__P)
122 return (vec_vsx_ld(0, __P));
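/* Usage sketch: _mm_load_ps asserts 16-byte alignment (see above), while
   _mm_loadu_ps accepts any float pointer:

     float aligned_buf[4] __attribute__ ((aligned (16))) = { 1, 2, 3, 4 };
     float plain_buf[5] = { 0, 1, 2, 3, 4 };
     __m128 a = _mm_load_ps (aligned_buf);      // alignment required
     __m128 b = _mm_loadu_ps (plain_buf + 1);   // arbitrary alignment
 */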
125 /* Load four SPFP values in reverse order. The address must be aligned. */
126 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
127 _mm_loadr_ps (float const *__P)
129 __v4sf __tmp;
130 __m128 result;
131 static const __vector unsigned char permute_vector =
132 { 0x1C, 0x1D, 0x1E, 0x1F, 0x18, 0x19, 0x1A, 0x1B, 0x14, 0x15, 0x16,
133 0x17, 0x10, 0x11, 0x12, 0x13 };
135 __tmp = vec_ld (0, (__v4sf *) __P);
136 result = (__m128) vec_perm (__tmp, __tmp, permute_vector);
137 return result;
140 /* Create a vector with all four elements equal to F. */
141 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
142 _mm_set1_ps (float __F)
144 return __extension__ (__m128)(__v4sf){ __F, __F, __F, __F };
147 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
148 _mm_set_ps1 (float __F)
150 return _mm_set1_ps (__F);
153 /* Create the vector [Z Y X W]. */
154 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
155 _mm_set_ps (const float __Z, const float __Y, const float __X, const float __W)
157 return __extension__ (__m128)(__v4sf){ __W, __X, __Y, __Z };
160 /* Create the vector [W X Y Z]. */
161 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
162 _mm_setr_ps (float __Z, float __Y, float __X, float __W)
164 return __extension__ (__m128)(__v4sf){ __Z, __Y, __X, __W };
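/* Note the element order: _mm_set_ps lists arguments from the highest
   element down, _mm_setr_ps from element 0 up.  For example:

     __m128 a = _mm_set_ps  (3.0f, 2.0f, 1.0f, 0.0f);   // element 0 is 0.0f
     __m128 b = _mm_setr_ps (3.0f, 2.0f, 1.0f, 0.0f);   // element 0 is 3.0f
 */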
167 /* Store four SPFP values. The address must be 16-byte aligned. */
168 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
169 _mm_store_ps (float *__P, __m128 __A)
171 assert(((unsigned long)__P & 0xfUL) == 0UL);
172 vec_st((__v4sf)__A, 0, (__v4sf*)__P);
175 /* Store four SPFP values. The address need not be 16-byte aligned. */
176 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
177 _mm_storeu_ps (float *__P, __m128 __A)
179 *(__m128_u *)__P = __A;
182 /* Store four SPFP values in reverse order. The address must be aligned. */
183 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
184 _mm_storer_ps (float *__P, __m128 __A)
186 __v4sf __tmp;
187 static const __vector unsigned char permute_vector =
188 { 0x1C, 0x1D, 0x1E, 0x1F, 0x18, 0x19, 0x1A, 0x1B, 0x14, 0x15, 0x16,
189 0x17, 0x10, 0x11, 0x12, 0x13 };
191 __tmp = (__m128) vec_perm (__A, __A, permute_vector);
193 _mm_store_ps (__P, __tmp);
196 /* Store the lower SPFP value across four words. */
197 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
198 _mm_store1_ps (float *__P, __m128 __A)
200 __v4sf __va = vec_splat((__v4sf)__A, 0);
201 _mm_store_ps (__P, __va);
204 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
205 _mm_store_ps1 (float *__P, __m128 __A)
207 _mm_store1_ps (__P, __A);
210 /* Create a vector with element 0 as F and the rest zero. */
211 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
212 _mm_set_ss (float __F)
214 return __extension__ (__m128)(__v4sf){ __F, 0.0f, 0.0f, 0.0f };
217 /* Sets the low SPFP value of A from the low value of B. */
218 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
219 _mm_move_ss (__m128 __A, __m128 __B)
221 static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
223 return (vec_sel ((__v4sf)__A, (__v4sf)__B, mask));
226 /* Create a vector with element 0 as *P and the rest zero. */
227 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
228 _mm_load_ss (float const *__P)
230 return _mm_set_ss (*__P);
233 /* Stores the lower SPFP value. */
234 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
235 _mm_store_ss (float *__P, __m128 __A)
237 *__P = ((__v4sf)__A)[0];
240 /* Perform the respective operation on the lower SPFP (single-precision
241 floating-point) values of A and B; the upper three SPFP values are
242 passed through from A. */
244 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
245 _mm_add_ss (__m128 __A, __m128 __B)
247 #ifdef _ARCH_PWR7
248 __m128 a, b, c;
249 static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
/* PowerISA VSX does not allow partial (for just the lower float)
   results.  So to ensure we don't generate spurious exceptions
   (from the upper float values) we splat the lower float
   before we do the operation.  */
254 a = vec_splat (__A, 0);
255 b = vec_splat (__B, 0);
256 c = a + b;
257 /* Then we merge the lower float result with the original upper
258 float elements from __A. */
259 return (vec_sel (__A, c, mask));
260 #else
261 __A[0] = __A[0] + __B[0];
262 return (__A);
263 #endif
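/* A small example of the scalar-in-vector semantics: only element 0 is
   computed, the upper elements come from the first operand.

     __m128 a = _mm_set_ps (4.0f, 3.0f, 2.0f, 1.0f);   // a = {1, 2, 3, 4}
     __m128 b = _mm_set1_ps (10.0f);
     __m128 r = _mm_add_ss (a, b);                      // r = {11, 2, 3, 4}
 */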
266 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
267 _mm_sub_ss (__m128 __A, __m128 __B)
269 #ifdef _ARCH_PWR7
270 __m128 a, b, c;
271 static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
/* PowerISA VSX does not allow partial (for just the lower float)
   results.  So to ensure we don't generate spurious exceptions
   (from the upper float values) we splat the lower float
   before we do the operation.  */
276 a = vec_splat (__A, 0);
277 b = vec_splat (__B, 0);
278 c = a - b;
279 /* Then we merge the lower float result with the original upper
280 float elements from __A. */
281 return (vec_sel (__A, c, mask));
282 #else
283 __A[0] = __A[0] - __B[0];
284 return (__A);
285 #endif
288 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
289 _mm_mul_ss (__m128 __A, __m128 __B)
291 #ifdef _ARCH_PWR7
292 __m128 a, b, c;
293 static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
/* PowerISA VSX does not allow partial (for just the lower float)
   results.  So to ensure we don't generate spurious exceptions
   (from the upper float values) we splat the lower float
   before we do the operation.  */
298 a = vec_splat (__A, 0);
299 b = vec_splat (__B, 0);
300 c = a * b;
301 /* Then we merge the lower float result with the original upper
302 float elements from __A. */
303 return (vec_sel (__A, c, mask));
304 #else
305 __A[0] = __A[0] * __B[0];
306 return (__A);
307 #endif
310 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
311 _mm_div_ss (__m128 __A, __m128 __B)
313 #ifdef _ARCH_PWR7
314 __m128 a, b, c;
315 static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
/* PowerISA VSX does not allow partial (for just the lower float)
   results.  So to ensure we don't generate spurious exceptions
   (from the upper float values) we splat the lower float
   before we do the operation.  */
320 a = vec_splat (__A, 0);
321 b = vec_splat (__B, 0);
322 c = a / b;
323 /* Then we merge the lower float result with the original upper
324 float elements from __A. */
325 return (vec_sel (__A, c, mask));
326 #else
327 __A[0] = __A[0] / __B[0];
328 return (__A);
329 #endif
332 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
333 _mm_sqrt_ss (__m128 __A)
335 __m128 a, c;
336 static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
/* PowerISA VSX does not allow partial (for just the lower float)
 * results.  So to ensure we don't generate spurious exceptions
 * (from the upper float values) we splat the lower float
 * before we do the operation. */
341 a = vec_splat (__A, 0);
342 c = vec_sqrt (a);
343 /* Then we merge the lower float result with the original upper
344 * float elements from __A. */
345 return (vec_sel (__A, c, mask));
348 /* Perform the respective operation on the four SPFP values in A and B. */
349 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
350 _mm_add_ps (__m128 __A, __m128 __B)
352 return (__m128) ((__v4sf)__A + (__v4sf)__B);
355 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
356 _mm_sub_ps (__m128 __A, __m128 __B)
358 return (__m128) ((__v4sf)__A - (__v4sf)__B);
361 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
362 _mm_mul_ps (__m128 __A, __m128 __B)
364 return (__m128) ((__v4sf)__A * (__v4sf)__B);
367 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
368 _mm_div_ps (__m128 __A, __m128 __B)
370 return (__m128) ((__v4sf)__A / (__v4sf)__B);
373 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
374 _mm_sqrt_ps (__m128 __A)
376 return (vec_sqrt ((__v4sf)__A));
379 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
380 _mm_rcp_ps (__m128 __A)
382 return (vec_re ((__v4sf)__A));
385 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
386 _mm_rsqrt_ps (__m128 __A)
388 return (vec_rsqrte (__A));
391 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
392 _mm_rcp_ss (__m128 __A)
394 __m128 a, c;
395 static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
/* PowerISA VSX does not allow partial (for just the lower float)
 * results.  So to ensure we don't generate spurious exceptions
 * (from the upper float values) we splat the lower float
 * before we do the operation. */
400 a = vec_splat (__A, 0);
401 c = _mm_rcp_ps (a);
402 /* Then we merge the lower float result with the original upper
403 * float elements from __A. */
404 return (vec_sel (__A, c, mask));
407 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
408 _mm_rsqrt_ss (__m128 __A)
410 __m128 a, c;
411 static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
/* PowerISA VSX does not allow partial (for just the lower float)
 * results.  So to ensure we don't generate spurious exceptions
 * (from the upper float values) we splat the lower float
 * before we do the operation. */
416 a = vec_splat (__A, 0);
417 c = vec_rsqrte (a);
418 /* Then we merge the lower float result with the original upper
419 * float elements from __A. */
420 return (vec_sel (__A, c, mask));
423 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
424 _mm_min_ss (__m128 __A, __m128 __B)
426 __v4sf a, b, c;
427 static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
/* PowerISA VSX does not allow partial (for just the lower float)
 * results.  So to ensure we don't generate spurious exceptions
 * (from the upper float values) we splat the lower float
 * before we do the operation. */
432 a = vec_splat ((__v4sf)__A, 0);
433 b = vec_splat ((__v4sf)__B, 0);
434 c = vec_min (a, b);
435 /* Then we merge the lower float result with the original upper
436 * float elements from __A. */
437 return (vec_sel ((__v4sf)__A, c, mask));
440 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
441 _mm_max_ss (__m128 __A, __m128 __B)
443 __v4sf a, b, c;
444 static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
/* PowerISA VSX does not allow partial (for just the lower float)
 * results.  So to ensure we don't generate spurious exceptions
 * (from the upper float values) we splat the lower float
 * before we do the operation. */
449 a = vec_splat (__A, 0);
450 b = vec_splat (__B, 0);
451 c = vec_max (a, b);
452 /* Then we merge the lower float result with the original upper
453 * float elements from __A. */
454 return (vec_sel ((__v4sf)__A, c, mask));
457 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
458 _mm_min_ps (__m128 __A, __m128 __B)
460 __vector __bool int m = vec_cmpgt ((__v4sf) __B, (__v4sf) __A);
461 return vec_sel (__B, __A, m);
464 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
465 _mm_max_ps (__m128 __A, __m128 __B)
467 __vector __bool int m = vec_cmpgt ((__v4sf) __A, (__v4sf) __B);
468 return vec_sel (__B, __A, m);
471 /* Perform logical bit-wise operations on 128-bit values. */
472 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
473 _mm_and_ps (__m128 __A, __m128 __B)
475 return ((__m128)vec_and ((__v4sf)__A, (__v4sf)__B));
476 // return __builtin_ia32_andps (__A, __B);
479 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
480 _mm_andnot_ps (__m128 __A, __m128 __B)
482 return ((__m128)vec_andc ((__v4sf)__B, (__v4sf)__A));
485 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
486 _mm_or_ps (__m128 __A, __m128 __B)
488 return ((__m128)vec_or ((__v4sf)__A, (__v4sf)__B));
491 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
492 _mm_xor_ps (__m128 __A, __m128 __B)
494 return ((__m128)vec_xor ((__v4sf)__A, (__v4sf)__B));
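/* These bitwise operations are commonly combined into a branchless select,
   picking elements from one vector where a compare mask is set and from the
   other elsewhere.  A minimal sketch (x and y are placeholder __m128 values,
   not part of this header):

     __m128 mask = _mm_cmpgt_ps (x, y);
     __m128 r = _mm_or_ps (_mm_and_ps (mask, x), _mm_andnot_ps (mask, y));
 */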
497 /* Perform a comparison on the four SPFP values of A and B. For each
498 element, if the comparison is true, place a mask of all ones in the
499 result, otherwise a mask of zeros. */
500 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
501 _mm_cmpeq_ps (__m128 __A, __m128 __B)
503 return ((__m128)vec_cmpeq ((__v4sf)__A,(__v4sf) __B));
506 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
507 _mm_cmplt_ps (__m128 __A, __m128 __B)
509 return ((__m128)vec_cmplt ((__v4sf)__A, (__v4sf)__B));
512 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
513 _mm_cmple_ps (__m128 __A, __m128 __B)
515 return ((__m128)vec_cmple ((__v4sf)__A, (__v4sf)__B));
518 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
519 _mm_cmpgt_ps (__m128 __A, __m128 __B)
521 return ((__m128)vec_cmpgt ((__v4sf)__A, (__v4sf)__B));
524 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
525 _mm_cmpge_ps (__m128 __A, __m128 __B)
527 return ((__m128)vec_cmpge ((__v4sf)__A, (__v4sf)__B));
530 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
531 _mm_cmpneq_ps (__m128 __A, __m128 __B)
533 __v4sf temp = (__v4sf ) vec_cmpeq ((__v4sf) __A, (__v4sf)__B);
534 return ((__m128)vec_nor (temp, temp));
537 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
538 _mm_cmpnlt_ps (__m128 __A, __m128 __B)
540 return ((__m128)vec_cmpge ((__v4sf)__A, (__v4sf)__B));
543 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
544 _mm_cmpnle_ps (__m128 __A, __m128 __B)
546 return ((__m128)vec_cmpgt ((__v4sf)__A, (__v4sf)__B));
549 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
550 _mm_cmpngt_ps (__m128 __A, __m128 __B)
552 return ((__m128)vec_cmple ((__v4sf)__A, (__v4sf)__B));
555 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
556 _mm_cmpnge_ps (__m128 __A, __m128 __B)
558 return ((__m128)vec_cmplt ((__v4sf)__A, (__v4sf)__B));
561 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
562 _mm_cmpord_ps (__m128 __A, __m128 __B)
564 __vector unsigned int a, b;
565 __vector unsigned int c, d;
566 static const __vector unsigned int float_exp_mask =
567 { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 };
569 a = (__vector unsigned int) vec_abs ((__v4sf)__A);
570 b = (__vector unsigned int) vec_abs ((__v4sf)__B);
571 c = (__vector unsigned int) vec_cmpgt (float_exp_mask, a);
572 d = (__vector unsigned int) vec_cmpgt (float_exp_mask, b);
573 return ((__m128 ) vec_and (c, d));
576 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
577 _mm_cmpunord_ps (__m128 __A, __m128 __B)
579 __vector unsigned int a, b;
580 __vector unsigned int c, d;
581 static const __vector unsigned int float_exp_mask =
582 { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 };
584 a = (__vector unsigned int) vec_abs ((__v4sf)__A);
585 b = (__vector unsigned int) vec_abs ((__v4sf)__B);
586 c = (__vector unsigned int) vec_cmpgt (a, float_exp_mask);
587 d = (__vector unsigned int) vec_cmpgt (b, float_exp_mask);
588 return ((__m128 ) vec_or (c, d));
591 /* Perform a comparison on the lower SPFP values of A and B. If the
592 comparison is true, place a mask of all ones in the result, otherwise a
593 mask of zeros. The upper three SPFP values are passed through from A. */
594 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
595 _mm_cmpeq_ss (__m128 __A, __m128 __B)
597 static const __vector unsigned int mask =
598 { 0xffffffff, 0, 0, 0 };
599 __v4sf a, b, c;
/* PowerISA VMX does not allow partial (for just element 0)
 * results.  So to ensure we don't generate spurious exceptions
 * (from the upper elements) we splat the lower float
 * before we do the operation. */
604 a = vec_splat ((__v4sf) __A, 0);
605 b = vec_splat ((__v4sf) __B, 0);
606 c = (__v4sf) vec_cmpeq(a, b);
607 /* Then we merge the lower float result with the original upper
608 * float elements from __A. */
609 return ((__m128)vec_sel ((__v4sf)__A, c, mask));
612 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
613 _mm_cmplt_ss (__m128 __A, __m128 __B)
615 static const __vector unsigned int mask =
616 { 0xffffffff, 0, 0, 0 };
617 __v4sf a, b, c;
/* PowerISA VMX does not allow partial (for just element 0)
 * results.  So to ensure we don't generate spurious exceptions
 * (from the upper elements) we splat the lower float
 * before we do the operation. */
622 a = vec_splat ((__v4sf) __A, 0);
623 b = vec_splat ((__v4sf) __B, 0);
624 c = (__v4sf) vec_cmplt(a, b);
625 /* Then we merge the lower float result with the original upper
626 * float elements from __A. */
627 return ((__m128)vec_sel ((__v4sf)__A, c, mask));
630 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
631 _mm_cmple_ss (__m128 __A, __m128 __B)
633 static const __vector unsigned int mask =
634 { 0xffffffff, 0, 0, 0 };
635 __v4sf a, b, c;
/* PowerISA VMX does not allow partial (for just element 0)
 * results.  So to ensure we don't generate spurious exceptions
 * (from the upper elements) we splat the lower float
 * before we do the operation. */
640 a = vec_splat ((__v4sf) __A, 0);
641 b = vec_splat ((__v4sf) __B, 0);
642 c = (__v4sf) vec_cmple(a, b);
643 /* Then we merge the lower float result with the original upper
644 * float elements from __A. */
645 return ((__m128)vec_sel ((__v4sf)__A, c, mask));
648 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
649 _mm_cmpgt_ss (__m128 __A, __m128 __B)
651 static const __vector unsigned int mask =
652 { 0xffffffff, 0, 0, 0 };
653 __v4sf a, b, c;
/* PowerISA VMX does not allow partial (for just element 0)
 * results.  So to ensure we don't generate spurious exceptions
 * (from the upper elements) we splat the lower float
 * before we do the operation. */
658 a = vec_splat ((__v4sf) __A, 0);
659 b = vec_splat ((__v4sf) __B, 0);
660 c = (__v4sf) vec_cmpgt(a, b);
661 /* Then we merge the lower float result with the original upper
662 * float elements from __A. */
663 return ((__m128)vec_sel ((__v4sf)__A, c, mask));
666 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
667 _mm_cmpge_ss (__m128 __A, __m128 __B)
669 static const __vector unsigned int mask =
670 { 0xffffffff, 0, 0, 0 };
671 __v4sf a, b, c;
/* PowerISA VMX does not allow partial (for just element 0)
 * results.  So to ensure we don't generate spurious exceptions
 * (from the upper elements) we splat the lower float
 * before we do the operation. */
676 a = vec_splat ((__v4sf) __A, 0);
677 b = vec_splat ((__v4sf) __B, 0);
678 c = (__v4sf) vec_cmpge(a, b);
679 /* Then we merge the lower float result with the original upper
680 * float elements from __A. */
681 return ((__m128)vec_sel ((__v4sf)__A, c, mask));
684 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
685 _mm_cmpneq_ss (__m128 __A, __m128 __B)
687 static const __vector unsigned int mask =
688 { 0xffffffff, 0, 0, 0 };
689 __v4sf a, b, c;
/* PowerISA VMX does not allow partial (for just element 0)
 * results.  So to ensure we don't generate spurious exceptions
 * (from the upper elements) we splat the lower float
 * before we do the operation. */
694 a = vec_splat ((__v4sf) __A, 0);
695 b = vec_splat ((__v4sf) __B, 0);
696 c = (__v4sf) vec_cmpeq(a, b);
697 c = vec_nor (c, c);
698 /* Then we merge the lower float result with the original upper
699 * float elements from __A. */
700 return ((__m128)vec_sel ((__v4sf)__A, c, mask));
703 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
704 _mm_cmpnlt_ss (__m128 __A, __m128 __B)
706 static const __vector unsigned int mask =
707 { 0xffffffff, 0, 0, 0 };
708 __v4sf a, b, c;
/* PowerISA VMX does not allow partial (for just element 0)
 * results.  So to ensure we don't generate spurious exceptions
 * (from the upper elements) we splat the lower float
 * before we do the operation. */
713 a = vec_splat ((__v4sf) __A, 0);
714 b = vec_splat ((__v4sf) __B, 0);
715 c = (__v4sf) vec_cmpge(a, b);
716 /* Then we merge the lower float result with the original upper
717 * float elements from __A. */
718 return ((__m128)vec_sel ((__v4sf)__A, c, mask));
721 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
722 _mm_cmpnle_ss (__m128 __A, __m128 __B)
724 static const __vector unsigned int mask =
725 { 0xffffffff, 0, 0, 0 };
726 __v4sf a, b, c;
/* PowerISA VMX does not allow partial (for just element 0)
 * results.  So to ensure we don't generate spurious exceptions
 * (from the upper elements) we splat the lower float
 * before we do the operation. */
731 a = vec_splat ((__v4sf) __A, 0);
732 b = vec_splat ((__v4sf) __B, 0);
733 c = (__v4sf) vec_cmpgt(a, b);
734 /* Then we merge the lower float result with the original upper
735 * float elements from __A. */
736 return ((__m128)vec_sel ((__v4sf)__A, c, mask));
739 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
740 _mm_cmpngt_ss (__m128 __A, __m128 __B)
742 static const __vector unsigned int mask =
743 { 0xffffffff, 0, 0, 0 };
744 __v4sf a, b, c;
/* PowerISA VMX does not allow partial (for just element 0)
 * results.  So to ensure we don't generate spurious exceptions
 * (from the upper elements) we splat the lower float
 * before we do the operation. */
749 a = vec_splat ((__v4sf) __A, 0);
750 b = vec_splat ((__v4sf) __B, 0);
751 c = (__v4sf) vec_cmple(a, b);
752 /* Then we merge the lower float result with the original upper
753 * float elements from __A. */
754 return ((__m128)vec_sel ((__v4sf)__A, c, mask));
757 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
758 _mm_cmpnge_ss (__m128 __A, __m128 __B)
760 static const __vector unsigned int mask =
761 { 0xffffffff, 0, 0, 0 };
762 __v4sf a, b, c;
/* PowerISA VMX does not allow partial (for just element 0)
 * results.  So to ensure we don't generate spurious exceptions
 * (from the upper elements) we splat the lower float
 * before we do the operation. */
767 a = vec_splat ((__v4sf) __A, 0);
768 b = vec_splat ((__v4sf) __B, 0);
769 c = (__v4sf) vec_cmplt(a, b);
770 /* Then we merge the lower float result with the original upper
771 * float elements from __A. */
772 return ((__m128)vec_sel ((__v4sf)__A, c, mask));
775 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
776 _mm_cmpord_ss (__m128 __A, __m128 __B)
778 __vector unsigned int a, b;
779 __vector unsigned int c, d;
780 static const __vector unsigned int float_exp_mask =
781 { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 };
782 static const __vector unsigned int mask =
783 { 0xffffffff, 0, 0, 0 };
785 a = (__vector unsigned int) vec_abs ((__v4sf)__A);
786 b = (__vector unsigned int) vec_abs ((__v4sf)__B);
787 c = (__vector unsigned int) vec_cmpgt (float_exp_mask, a);
788 d = (__vector unsigned int) vec_cmpgt (float_exp_mask, b);
789 c = vec_and (c, d);
790 /* Then we merge the lower float result with the original upper
791 * float elements from __A. */
792 return ((__m128)vec_sel ((__v4sf)__A, (__v4sf)c, mask));
795 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
796 _mm_cmpunord_ss (__m128 __A, __m128 __B)
798 __vector unsigned int a, b;
799 __vector unsigned int c, d;
800 static const __vector unsigned int float_exp_mask =
801 { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 };
802 static const __vector unsigned int mask =
803 { 0xffffffff, 0, 0, 0 };
805 a = (__vector unsigned int) vec_abs ((__v4sf)__A);
806 b = (__vector unsigned int) vec_abs ((__v4sf)__B);
807 c = (__vector unsigned int) vec_cmpgt (a, float_exp_mask);
808 d = (__vector unsigned int) vec_cmpgt (b, float_exp_mask);
809 c = vec_or (c, d);
810 /* Then we merge the lower float result with the original upper
811 * float elements from __A. */
812 return ((__m128)vec_sel ((__v4sf)__A, (__v4sf)c, mask));
815 /* Compare the lower SPFP values of A and B and return 1 if true
816 and 0 if false. */
817 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
818 _mm_comieq_ss (__m128 __A, __m128 __B)
820 return (__A[0] == __B[0]);
823 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
824 _mm_comilt_ss (__m128 __A, __m128 __B)
826 return (__A[0] < __B[0]);
829 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
830 _mm_comile_ss (__m128 __A, __m128 __B)
832 return (__A[0] <= __B[0]);
835 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
836 _mm_comigt_ss (__m128 __A, __m128 __B)
838 return (__A[0] > __B[0]);
841 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
842 _mm_comige_ss (__m128 __A, __m128 __B)
844 return (__A[0] >= __B[0]);
847 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
848 _mm_comineq_ss (__m128 __A, __m128 __B)
850 return (__A[0] != __B[0]);
/* FIXME
 * The _mm_ucomi??_ss implementations below are exactly the same as
 * the _mm_comi??_ss ones because GCC for PowerPC only generates
 * unordered compares (scalar and vector).
 * Technically _mm_comieq_ss et al. should be using the ordered
 * compare and signal for QNaNs.
 * The _mm_ucomieq_ss et al. should be OK as is.
 */
861 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
862 _mm_ucomieq_ss (__m128 __A, __m128 __B)
864 return (__A[0] == __B[0]);
867 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
868 _mm_ucomilt_ss (__m128 __A, __m128 __B)
870 return (__A[0] < __B[0]);
873 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
874 _mm_ucomile_ss (__m128 __A, __m128 __B)
876 return (__A[0] <= __B[0]);
879 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
880 _mm_ucomigt_ss (__m128 __A, __m128 __B)
882 return (__A[0] > __B[0]);
885 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
886 _mm_ucomige_ss (__m128 __A, __m128 __B)
888 return (__A[0] >= __B[0]);
891 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
892 _mm_ucomineq_ss (__m128 __A, __m128 __B)
894 return (__A[0] != __B[0]);
897 extern __inline float __attribute__((__gnu_inline__, __always_inline__, __artificial__))
898 _mm_cvtss_f32 (__m128 __A)
900 return ((__v4sf)__A)[0];
903 /* Convert the lower SPFP value to a 32-bit integer according to the current
904 rounding mode. */
905 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
906 _mm_cvtss_si32 (__m128 __A)
908 __m64 res = 0;
909 #ifdef _ARCH_PWR8
910 __m128 vtmp;
911 double dtmp;
912 __asm__(
913 "xxsldwi %x1,%x3,%x3,3;\n"
914 "xscvspdp %x2,%x1;\n"
915 "fctiw %2,%2;\n"
916 "mfvsrd %0,%x2;\n"
917 : "=r" (res),
918 "=&wa" (vtmp),
919 "=f" (dtmp)
920 : "wa" (__A)
921 : );
922 #else
923 res = __builtin_rint(__A[0]);
924 #endif
925 return (res);
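/* The conversion honours the current rounding mode, so with the default
   round-to-nearest-even mode in effect:

     int i = _mm_cvtss_si32 (_mm_set_ss (2.5f));   // i == 2
     int j = _mm_cvtss_si32 (_mm_set_ss (3.5f));   // j == 4
 */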
928 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
929 _mm_cvt_ss2si (__m128 __A)
931 return _mm_cvtss_si32 (__A);
/* Convert the lower SPFP value to a 64-bit integer according to the
   current rounding mode.  */
937 /* Intel intrinsic. */
938 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
939 _mm_cvtss_si64 (__m128 __A)
941 __m64 res = 0;
942 #ifdef _ARCH_PWR8
943 __m128 vtmp;
944 double dtmp;
945 __asm__(
946 "xxsldwi %x1,%x3,%x3,3;\n"
947 "xscvspdp %x2,%x1;\n"
948 "fctid %2,%2;\n"
949 "mfvsrd %0,%x2;\n"
950 : "=r" (res),
951 "=&wa" (vtmp),
952 "=f" (dtmp)
953 : "wa" (__A)
954 : );
955 #else
956 res = __builtin_llrint(__A[0]);
957 #endif
958 return (res);
961 /* Microsoft intrinsic. */
962 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
963 _mm_cvtss_si64x (__m128 __A)
965 return _mm_cvtss_si64 ((__v4sf) __A);
968 /* Constants for use with _mm_prefetch. */
969 enum _mm_hint
/* _MM_HINT_ET is _MM_HINT_T with the 3rd bit set.  */
972 _MM_HINT_ET0 = 7,
973 _MM_HINT_ET1 = 6,
974 _MM_HINT_T0 = 3,
975 _MM_HINT_T1 = 2,
976 _MM_HINT_T2 = 1,
977 _MM_HINT_NTA = 0
980 /* Loads one cache line from address P to a location "closer" to the
981 processor. The selector I specifies the type of prefetch operation. */
982 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
983 _mm_prefetch (const void *__P, enum _mm_hint __I)
/* Current PowerPC hardware ignores the hint parameters.  */
986 __builtin_prefetch (__P);
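/* Typical usage is prefetching a line ahead of the current position; on
   PowerPC every hint currently maps to the same __builtin_prefetch call.
   A sketch, assuming a byte pointer p into the data being streamed:

     _mm_prefetch (p + 256, _MM_HINT_T0);
 */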
989 /* Convert the two lower SPFP values to 32-bit integers according to the
990 current rounding mode. Return the integers in packed form. */
991 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
992 _mm_cvtps_pi32 (__m128 __A)
995 __v4sf temp, rounded;
996 __vector unsigned long long result;
998 /* Splat two lower SPFP values to both halves. */
999 temp = (__v4sf) vec_splat ((__vector long long)__A, 0);
1000 rounded = vec_rint(temp);
1001 result = (__vector unsigned long long) vec_cts (rounded, 0);
1003 return (__m64) ((__vector long long) result)[0];
1006 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1007 _mm_cvt_ps2pi (__m128 __A)
1009 return _mm_cvtps_pi32 (__A);
1012 /* Truncate the lower SPFP value to a 32-bit integer. */
1013 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1014 _mm_cvttss_si32 (__m128 __A)
1016 /* Extract the lower float element. */
1017 float temp = __A[0];
1018 /* truncate to 32-bit integer and return. */
1019 return temp;
1022 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1023 _mm_cvtt_ss2si (__m128 __A)
1025 return _mm_cvttss_si32 (__A);
1028 /* Intel intrinsic. */
1029 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1030 _mm_cvttss_si64 (__m128 __A)
1032 /* Extract the lower float element. */
1033 float temp = __A[0];
/* Truncate to a 64-bit integer and return.  */
1035 return temp;
1038 /* Microsoft intrinsic. */
1039 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1040 _mm_cvttss_si64x (__m128 __A)
1042 /* Extract the lower float element. */
1043 float temp = __A[0];
/* Truncate to a 64-bit integer and return.  */
1045 return temp;
1048 /* Truncate the two lower SPFP values to 32-bit integers. Return the
1049 integers in packed form. */
1050 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1051 _mm_cvttps_pi32 (__m128 __A)
1053 __v4sf temp;
1054 __vector unsigned long long result;
1056 /* Splat two lower SPFP values to both halves. */
1057 temp = (__v4sf) vec_splat ((__vector long long)__A, 0);
1058 result = (__vector unsigned long long) vec_cts (temp, 0);
1060 return (__m64) ((__vector long long) result)[0];
1063 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1064 _mm_cvtt_ps2pi (__m128 __A)
1066 return _mm_cvttps_pi32 (__A);
1069 /* Convert B to a SPFP value and insert it as element zero in A. */
1070 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1071 _mm_cvtsi32_ss (__m128 __A, int __B)
1073 float temp = __B;
1074 __A[0] = temp;
1076 return __A;
1079 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1080 _mm_cvt_si2ss (__m128 __A, int __B)
1082 return _mm_cvtsi32_ss (__A, __B);
1085 /* Convert B to a SPFP value and insert it as element zero in A. */
1086 /* Intel intrinsic. */
1087 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1088 _mm_cvtsi64_ss (__m128 __A, long long __B)
1090 float temp = __B;
1091 __A[0] = temp;
1093 return __A;
1096 /* Microsoft intrinsic. */
1097 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1098 _mm_cvtsi64x_ss (__m128 __A, long long __B)
1100 return _mm_cvtsi64_ss (__A, __B);
1103 /* Convert the two 32-bit values in B to SPFP form and insert them
1104 as the two lower elements in A. */
1105 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1106 _mm_cvtpi32_ps (__m128 __A, __m64 __B)
1108 __vector signed int vm1;
1109 __vector float vf1;
1111 vm1 = (__vector signed int) (__vector unsigned long long) {__B, __B};
1112 vf1 = (__vector float) vec_ctf (vm1, 0);
1114 return ((__m128) (__vector unsigned long long)
1115 { ((__vector unsigned long long)vf1) [0],
1116 ((__vector unsigned long long)__A) [1]});
1119 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1120 _mm_cvt_pi2ps (__m128 __A, __m64 __B)
1122 return _mm_cvtpi32_ps (__A, __B);
1125 /* Convert the four signed 16-bit values in A to SPFP form. */
1126 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1127 _mm_cvtpi16_ps (__m64 __A)
1129 __vector signed short vs8;
1130 __vector signed int vi4;
1131 __vector float vf1;
1133 vs8 = (__vector signed short) (__vector unsigned long long) { __A, __A };
1134 vi4 = vec_vupklsh (vs8);
1135 vf1 = (__vector float) vec_ctf (vi4, 0);
1137 return (__m128) vf1;
1140 /* Convert the four unsigned 16-bit values in A to SPFP form. */
1141 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1142 _mm_cvtpu16_ps (__m64 __A)
1144 const __vector unsigned short zero =
1145 { 0, 0, 0, 0, 0, 0, 0, 0 };
1146 __vector unsigned short vs8;
1147 __vector unsigned int vi4;
1148 __vector float vf1;
1150 vs8 = (__vector unsigned short) (__vector unsigned long long) { __A, __A };
1151 vi4 = (__vector unsigned int) vec_vmrglh (vs8, zero);
1152 vf1 = (__vector float) vec_ctf (vi4, 0);
1154 return (__m128) vf1;
1157 /* Convert the low four signed 8-bit values in A to SPFP form. */
1158 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1159 _mm_cvtpi8_ps (__m64 __A)
1161 __vector signed char vc16;
1162 __vector signed short vs8;
1163 __vector signed int vi4;
1164 __vector float vf1;
1166 vc16 = (__vector signed char) (__vector unsigned long long) { __A, __A };
1167 vs8 = vec_vupkhsb (vc16);
1168 vi4 = vec_vupkhsh (vs8);
1169 vf1 = (__vector float) vec_ctf (vi4, 0);
1171 return (__m128) vf1;
1174 /* Convert the low four unsigned 8-bit values in A to SPFP form. */
1175 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1177 _mm_cvtpu8_ps (__m64 __A)
1179 const __vector unsigned char zero =
1180 { 0, 0, 0, 0, 0, 0, 0, 0 };
1181 __vector unsigned char vc16;
1182 __vector unsigned short vs8;
1183 __vector unsigned int vi4;
1184 __vector float vf1;
1186 vc16 = (__vector unsigned char) (__vector unsigned long long) { __A, __A };
1187 vs8 = (__vector unsigned short) vec_vmrglb (vc16, zero);
1188 vi4 = (__vector unsigned int) vec_vmrghh (vs8,
1189 (__vector unsigned short) zero);
1190 vf1 = (__vector float) vec_ctf (vi4, 0);
1192 return (__m128) vf1;
1195 /* Convert the four signed 32-bit values in A and B to SPFP form. */
1196 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1197 _mm_cvtpi32x2_ps (__m64 __A, __m64 __B)
1199 __vector signed int vi4;
1200 __vector float vf4;
1202 vi4 = (__vector signed int) (__vector unsigned long long) { __B, __A };
1203 vf4 = (__vector float) vec_ctf (vi4, 0);
1204 return (__m128) vf4;
1207 /* Convert the four SPFP values in A to four signed 16-bit integers. */
1208 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1209 _mm_cvtps_pi16 (__m128 __A)
1211 __v4sf rounded;
1212 __vector signed int temp;
1213 __vector unsigned long long result;
1215 rounded = vec_rint(__A);
1216 temp = vec_cts (rounded, 0);
1217 result = (__vector unsigned long long) vec_pack (temp, temp);
1219 return (__m64) ((__vector long long) result)[0];
1222 /* Convert the four SPFP values in A to four signed 8-bit integers. */
1223 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1224 _mm_cvtps_pi8 (__m128 __A)
1226 __v4sf rounded;
1227 __vector signed int tmp_i;
1228 static const __vector signed int zero = {0, 0, 0, 0};
1229 __vector signed short tmp_s;
1230 __vector signed char res_v;
1232 rounded = vec_rint(__A);
1233 tmp_i = vec_cts (rounded, 0);
1234 tmp_s = vec_pack (tmp_i, zero);
1235 res_v = vec_pack (tmp_s, tmp_s);
1236 return (__m64) ((__vector long long) res_v)[0];
1239 /* Selects four specific SPFP values from A and B based on MASK. */
1240 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1242 _mm_shuffle_ps (__m128 __A, __m128 __B, int const __mask)
1244 unsigned long element_selector_10 = __mask & 0x03;
1245 unsigned long element_selector_32 = (__mask >> 2) & 0x03;
1246 unsigned long element_selector_54 = (__mask >> 4) & 0x03;
1247 unsigned long element_selector_76 = (__mask >> 6) & 0x03;
1248 static const unsigned int permute_selectors[4] =
1250 #ifdef __LITTLE_ENDIAN__
1251 0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
1252 #elif __BIG_ENDIAN__
1253 0x0C0D0E0F, 0x08090A0B, 0x04050607, 0x00010203
1254 #endif
1256 __vector unsigned int t;
1258 #ifdef __LITTLE_ENDIAN__
1259 t[0] = permute_selectors[element_selector_10];
1260 t[1] = permute_selectors[element_selector_32];
1261 t[2] = permute_selectors[element_selector_54] + 0x10101010;
1262 t[3] = permute_selectors[element_selector_76] + 0x10101010;
1263 #elif __BIG_ENDIAN__
1264 t[3] = permute_selectors[element_selector_10] + 0x10101010;
1265 t[2] = permute_selectors[element_selector_32] + 0x10101010;
1266 t[1] = permute_selectors[element_selector_54];
1267 t[0] = permute_selectors[element_selector_76];
1268 #endif
1269 return vec_perm ((__v4sf) __A, (__v4sf)__B, (__vector unsigned char)t);
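/* Worked example of the selector encoding used above:

     __m128 a = _mm_set_ps (3.0f, 2.0f, 1.0f, 0.0f);   // a = {0, 1, 2, 3}
     __m128 b = _mm_set_ps (7.0f, 6.0f, 5.0f, 4.0f);   // b = {4, 5, 6, 7}
     // Elements 0-1 of the result come from A, elements 2-3 from B.
     __m128 r = _mm_shuffle_ps (a, b, _MM_SHUFFLE (3, 2, 0, 1));
     // r = {a[1], a[0], b[2], b[3]} = {1, 0, 6, 7}
 */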
1272 /* Selects and interleaves the upper two SPFP values from A and B. */
1273 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1274 _mm_unpackhi_ps (__m128 __A, __m128 __B)
1276 return (__m128) vec_vmrglw ((__v4sf) __A, (__v4sf)__B);
1279 /* Selects and interleaves the lower two SPFP values from A and B. */
1280 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1281 _mm_unpacklo_ps (__m128 __A, __m128 __B)
1283 return (__m128) vec_vmrghw ((__v4sf) __A, (__v4sf)__B);
1286 /* Sets the upper two SPFP values with 64-bits of data loaded from P;
1287 the lower two values are passed through from A. */
1288 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1289 _mm_loadh_pi (__m128 __A, __m64 const *__P)
1291 __vector unsigned long long __a = (__vector unsigned long long)__A;
1292 __vector unsigned long long __p = vec_splats(*__P);
1293 __a [1] = __p [1];
1295 return (__m128)__a;
1298 /* Stores the upper two SPFP values of A into P. */
1299 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1300 _mm_storeh_pi (__m64 *__P, __m128 __A)
1302 __vector unsigned long long __a = (__vector unsigned long long) __A;
1304 *__P = __a[1];
1307 /* Moves the upper two values of B into the lower two values of A. */
1308 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1309 _mm_movehl_ps (__m128 __A, __m128 __B)
1311 return (__m128) vec_mergel ((__vector unsigned long long)__B,
1312 (__vector unsigned long long)__A);
1315 /* Moves the lower two values of B into the upper two values of A. */
1316 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1317 _mm_movelh_ps (__m128 __A, __m128 __B)
1319 return (__m128) vec_mergeh ((__vector unsigned long long)__A,
1320 (__vector unsigned long long)__B);
1323 /* Sets the lower two SPFP values with 64-bits of data loaded from P;
1324 the upper two values are passed through from A. */
1325 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1326 _mm_loadl_pi (__m128 __A, __m64 const *__P)
1328 __vector unsigned long long __a = (__vector unsigned long long)__A;
1329 __vector unsigned long long __p = vec_splats(*__P);
1330 __a [0] = __p [0];
1332 return (__m128)__a;
1335 /* Stores the lower two SPFP values of A into P. */
1336 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1337 _mm_storel_pi (__m64 *__P, __m128 __A)
1339 __vector unsigned long long __a = (__vector unsigned long long) __A;
1341 *__P = __a[0];
1344 #ifdef _ARCH_PWR8
1345 /* Intrinsic functions that require PowerISA 2.07 minimum. */
1347 /* Creates a 4-bit mask from the most significant bits of the SPFP values. */
1348 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1349 _mm_movemask_ps (__m128 __A)
1351 __vector unsigned long long result;
1352 static const __vector unsigned int perm_mask =
1354 #ifdef __LITTLE_ENDIAN__
1355 0x00204060, 0x80808080, 0x80808080, 0x80808080
1356 #elif __BIG_ENDIAN__
1357 0x80808080, 0x80808080, 0x80808080, 0x00204060
1358 #endif
1361 result = ((__vector unsigned long long)
1362 vec_vbpermq ((__vector unsigned char) __A,
1363 (__vector unsigned char) perm_mask));
1365 #ifdef __LITTLE_ENDIAN__
1366 return result[1];
1367 #elif __BIG_ENDIAN__
1368 return result[0];
1369 #endif
1371 #endif /* _ARCH_PWR8 */
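/* _mm_movemask_ps is typically used to reduce a vector compare to a scalar
   test, e.g. "is any element of x negative?".  A minimal sketch (x is a
   placeholder __m128 value; the intrinsic requires _ARCH_PWR8, matching the
   guard above):

     int any_negative =
       _mm_movemask_ps (_mm_cmplt_ps (x, _mm_setzero_ps ())) != 0;
 */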
1373 /* Create a vector with all four elements equal to *P. */
1374 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1375 _mm_load1_ps (float const *__P)
1377 return _mm_set1_ps (*__P);
1380 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1381 _mm_load_ps1 (float const *__P)
1383 return _mm_load1_ps (__P);
1386 /* Extracts one of the four words of A. The selector N must be immediate. */
1387 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1388 _mm_extract_pi16 (__m64 const __A, int const __N)
1390 unsigned int shiftr = __N & 3;
1391 #ifdef __BIG_ENDIAN__
1392 shiftr = 3 - shiftr;
1393 #endif
1395 return ((__A >> (shiftr * 16)) & 0xffff);
1398 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1399 _m_pextrw (__m64 const __A, int const __N)
1401 return _mm_extract_pi16 (__A, __N);
1404 /* Inserts word D into one of four words of A. The selector N must be
1405 immediate. */
1406 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1407 _mm_insert_pi16 (__m64 const __A, int const __D, int const __N)
1409 const int shiftl = (__N & 3) * 16;
1410 const __m64 shiftD = (const __m64) __D << shiftl;
1411 const __m64 mask = 0xffffUL << shiftl;
1412 __m64 result = (__A & (~mask)) | (shiftD & mask);
1414 return (result);
1417 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1418 _m_pinsrw (__m64 const __A, int const __D, int const __N)
1420 return _mm_insert_pi16 (__A, __D, __N);
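/* Example (little-endian layout shown): extract word 2 of an __m64 and
   insert a replacement value in its place.

     __m64 v = 0x0004000300020001UL;          // halfwords {1, 2, 3, 4}
     int w = _mm_extract_pi16 (v, 2);         // w == 3
     __m64 u = _mm_insert_pi16 (v, 9, 2);     // u == 0x0004000900020001UL
 */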
1423 /* Compute the element-wise maximum of signed 16-bit values. */
1424 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1426 _mm_max_pi16 (__m64 __A, __m64 __B)
1428 #if _ARCH_PWR8
1429 __vector signed short a, b, r;
1430 __vector __bool short c;
1432 a = (__vector signed short)vec_splats (__A);
1433 b = (__vector signed short)vec_splats (__B);
1434 c = (__vector __bool short)vec_cmpgt (a, b);
1435 r = vec_sel (b, a, c);
1436 return (__m64) ((__vector long long) r)[0];
1437 #else
1438 __m64_union m1, m2, res;
1440 m1.as_m64 = __A;
1441 m2.as_m64 = __B;
1443 res.as_short[0] =
1444 (m1.as_short[0] > m2.as_short[0]) ? m1.as_short[0] : m2.as_short[0];
1445 res.as_short[1] =
1446 (m1.as_short[1] > m2.as_short[1]) ? m1.as_short[1] : m2.as_short[1];
1447 res.as_short[2] =
1448 (m1.as_short[2] > m2.as_short[2]) ? m1.as_short[2] : m2.as_short[2];
1449 res.as_short[3] =
1450 (m1.as_short[3] > m2.as_short[3]) ? m1.as_short[3] : m2.as_short[3];
1452 return (__m64) res.as_m64;
1453 #endif
1456 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1457 _m_pmaxsw (__m64 __A, __m64 __B)
1459 return _mm_max_pi16 (__A, __B);
1462 /* Compute the element-wise maximum of unsigned 8-bit values. */
1463 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1464 _mm_max_pu8 (__m64 __A, __m64 __B)
1466 #if _ARCH_PWR8
1467 __vector unsigned char a, b, r;
1468 __vector __bool char c;
1470 a = (__vector unsigned char)vec_splats (__A);
1471 b = (__vector unsigned char)vec_splats (__B);
1472 c = (__vector __bool char)vec_cmpgt (a, b);
1473 r = vec_sel (b, a, c);
1474 return (__m64) ((__vector long long) r)[0];
1475 #else
1476 __m64_union m1, m2, res;
1477 long i;
1479 m1.as_m64 = __A;
1480 m2.as_m64 = __B;
1483 for (i = 0; i < 8; i++)
1484 res.as_char[i] =
1485 ((unsigned char) m1.as_char[i] > (unsigned char) m2.as_char[i]) ?
1486 m1.as_char[i] : m2.as_char[i];
1488 return (__m64) res.as_m64;
1489 #endif
1492 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1493 _m_pmaxub (__m64 __A, __m64 __B)
1495 return _mm_max_pu8 (__A, __B);
1498 /* Compute the element-wise minimum of signed 16-bit values. */
1499 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1500 _mm_min_pi16 (__m64 __A, __m64 __B)
1502 #if _ARCH_PWR8
1503 __vector signed short a, b, r;
1504 __vector __bool short c;
1506 a = (__vector signed short)vec_splats (__A);
1507 b = (__vector signed short)vec_splats (__B);
1508 c = (__vector __bool short)vec_cmplt (a, b);
1509 r = vec_sel (b, a, c);
1510 return (__m64) ((__vector long long) r)[0];
1511 #else
1512 __m64_union m1, m2, res;
1514 m1.as_m64 = __A;
1515 m2.as_m64 = __B;
1517 res.as_short[0] =
1518 (m1.as_short[0] < m2.as_short[0]) ? m1.as_short[0] : m2.as_short[0];
1519 res.as_short[1] =
1520 (m1.as_short[1] < m2.as_short[1]) ? m1.as_short[1] : m2.as_short[1];
1521 res.as_short[2] =
1522 (m1.as_short[2] < m2.as_short[2]) ? m1.as_short[2] : m2.as_short[2];
1523 res.as_short[3] =
1524 (m1.as_short[3] < m2.as_short[3]) ? m1.as_short[3] : m2.as_short[3];
1526 return (__m64) res.as_m64;
1527 #endif
1530 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1531 _m_pminsw (__m64 __A, __m64 __B)
1533 return _mm_min_pi16 (__A, __B);
1536 /* Compute the element-wise minimum of unsigned 8-bit values. */
1537 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1538 _mm_min_pu8 (__m64 __A, __m64 __B)
1540 #if _ARCH_PWR8
1541 __vector unsigned char a, b, r;
1542 __vector __bool char c;
1544 a = (__vector unsigned char)vec_splats (__A);
1545 b = (__vector unsigned char)vec_splats (__B);
1546 c = (__vector __bool char)vec_cmplt (a, b);
1547 r = vec_sel (b, a, c);
1548 return (__m64) ((__vector long long) r)[0];
1549 #else
1550 __m64_union m1, m2, res;
1551 long i;
1553 m1.as_m64 = __A;
1554 m2.as_m64 = __B;
1557 for (i = 0; i < 8; i++)
1558 res.as_char[i] =
1559 ((unsigned char) m1.as_char[i] < (unsigned char) m2.as_char[i]) ?
1560 m1.as_char[i] : m2.as_char[i];
1562 return (__m64) res.as_m64;
1563 #endif
1566 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1567 _m_pminub (__m64 __A, __m64 __B)
1569 return _mm_min_pu8 (__A, __B);
1572 /* Create an 8-bit mask of the signs of 8-bit values. */
1573 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1574 _mm_movemask_pi8 (__m64 __A)
unsigned long long p = 0x0008101820283038UL; /* Permute control for the sign bits.  */
1578 return __builtin_bpermd (p, __A);
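/* The constant above is the bpermd bit-select control: its bytes 0x00,
   0x08, ..., 0x38 are PowerPC bit numbers (0 = most significant bit) that
   pick the sign bit of each byte of __A, so the gathered 8-bit result has
   bit i equal to the sign of byte i, matching the x86 semantics.  */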
1581 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1582 _m_pmovmskb (__m64 __A)
1584 return _mm_movemask_pi8 (__A);
1587 /* Multiply four unsigned 16-bit values in A by four unsigned 16-bit values
1588 in B and produce the high 16 bits of the 32-bit results. */
1589 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1590 _mm_mulhi_pu16 (__m64 __A, __m64 __B)
1592 __vector unsigned short a, b;
1593 __vector unsigned short c;
1594 __vector unsigned int w0, w1;
1595 __vector unsigned char xform1 = {
1596 0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17,
1597 0x0A, 0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
1600 a = (__vector unsigned short)vec_splats (__A);
1601 b = (__vector unsigned short)vec_splats (__B);
1603 w0 = vec_vmuleuh (a, b);
1604 w1 = vec_vmulouh (a, b);
1605 c = (__vector unsigned short)vec_perm (w0, w1, xform1);
1607 return (__m64) ((__vector long long) c)[0];
1610 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1611 _m_pmulhuw (__m64 __A, __m64 __B)
1613 return _mm_mulhi_pu16 (__A, __B);
1616 /* Return a combination of the four 16-bit values in A. The selector
1617 must be an immediate. */
1618 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1619 _mm_shuffle_pi16 (__m64 __A, int const __N)
1621 unsigned long element_selector_10 = __N & 0x03;
1622 unsigned long element_selector_32 = (__N >> 2) & 0x03;
1623 unsigned long element_selector_54 = (__N >> 4) & 0x03;
1624 unsigned long element_selector_76 = (__N >> 6) & 0x03;
1625 static const unsigned short permute_selectors[4] =
1627 #ifdef __LITTLE_ENDIAN__
1628 0x0908, 0x0B0A, 0x0D0C, 0x0F0E
1629 #elif __BIG_ENDIAN__
1630 0x0607, 0x0405, 0x0203, 0x0001
1631 #endif
1633 __m64_union t;
1634 __vector unsigned long long a, p, r;
1636 #ifdef __LITTLE_ENDIAN__
1637 t.as_short[0] = permute_selectors[element_selector_10];
1638 t.as_short[1] = permute_selectors[element_selector_32];
1639 t.as_short[2] = permute_selectors[element_selector_54];
1640 t.as_short[3] = permute_selectors[element_selector_76];
1641 #elif __BIG_ENDIAN__
1642 t.as_short[3] = permute_selectors[element_selector_10];
1643 t.as_short[2] = permute_selectors[element_selector_32];
1644 t.as_short[1] = permute_selectors[element_selector_54];
1645 t.as_short[0] = permute_selectors[element_selector_76];
1646 #endif
1647 p = vec_splats (t.as_m64);
1648 a = vec_splats (__A);
1649 r = vec_perm (a, a, (__vector unsigned char)p);
1650 return (__m64) ((__vector long long) r)[0];
1653 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1654 _m_pshufw (__m64 __A, int const __N)
1656 return _mm_shuffle_pi16 (__A, __N);
1659 /* Conditionally store byte elements of A into P. The high bit of each
1660 byte in the selector N determines whether the corresponding byte from
1661 A is stored. */
1662 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1663 _mm_maskmove_si64 (__m64 __A, __m64 __N, char *__P)
1665 __m64 hibit = 0x8080808080808080UL;
1666 __m64 mask, tmp;
1667 __m64 *p = (__m64*)__P;
1669 tmp = *p;
1670 mask = _mm_cmpeq_pi8 ((__N & hibit), hibit);
1671 tmp = (tmp & (~mask)) | (__A & mask);
1672 *p = tmp;
1675 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1676 _m_maskmovq (__m64 __A, __m64 __N, char *__P)
1678 _mm_maskmove_si64 (__A, __N, __P);
1681 /* Compute the rounded averages of the unsigned 8-bit values in A and B. */
1682 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1683 _mm_avg_pu8 (__m64 __A, __m64 __B)
1685 __vector unsigned char a, b, c;
1687 a = (__vector unsigned char)vec_splats (__A);
1688 b = (__vector unsigned char)vec_splats (__B);
1689 c = vec_avg (a, b);
1690 return (__m64) ((__vector long long) c)[0];
1693 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1694 _m_pavgb (__m64 __A, __m64 __B)
1696 return _mm_avg_pu8 (__A, __B);
1699 /* Compute the rounded averages of the unsigned 16-bit values in A and B. */
1700 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1701 _mm_avg_pu16 (__m64 __A, __m64 __B)
1703 __vector unsigned short a, b, c;
1705 a = (__vector unsigned short)vec_splats (__A);
1706 b = (__vector unsigned short)vec_splats (__B);
1707 c = vec_avg (a, b);
1708 return (__m64) ((__vector long long) c)[0];
1711 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1712 _m_pavgw (__m64 __A, __m64 __B)
1714 return _mm_avg_pu16 (__A, __B);
1717 /* Compute the sum of the absolute differences of the unsigned 8-bit
1718 values in A and B. Return the value in the lower 16-bit word; the
1719 upper words are cleared. */
1720 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1721 _mm_sad_pu8 (__m64 __A, __m64 __B)
1723 __vector unsigned char a, b;
1724 __vector unsigned char vmin, vmax, vabsdiff;
1725 __vector signed int vsum;
1726 const __vector unsigned int zero =
1727 { 0, 0, 0, 0 };
1728 unsigned short result;
1730 a = (__vector unsigned char) (__vector unsigned long long) { 0UL, __A };
1731 b = (__vector unsigned char) (__vector unsigned long long) { 0UL, __B };
1732 vmin = vec_min (a, b);
1733 vmax = vec_max (a, b);
1734 vabsdiff = vec_sub (vmax, vmin);
1735 /* Sum four groups of bytes into integers. */
1736 vsum = (__vector signed int) vec_sum4s (vabsdiff, zero);
1737 /* Sum across four integers with integer result. */
1738 vsum = vec_sums (vsum, (__vector signed int) zero);
/* The sum is in the rightmost 32 bits of the vector result.
   Transfer to a GPR and truncate to 16 bits.  */
1741 result = vsum[3];
1742 return (result);
1745 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1746 _m_psadbw (__m64 __A, __m64 __B)
1748 return _mm_sad_pu8 (__A, __B);
1751 /* Stores the data in A to the address P without polluting the caches. */
1752 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1753 _mm_stream_pi (__m64 *__P, __m64 __A)
1755 /* Use the data cache block touch for store transient. */
1756 __asm__ (
1757 " dcbtstt 0,%0"
1759 : "b" (__P)
1760 : "memory"
1762 *__P = __A;
1765 /* Likewise. The address must be 16-byte aligned. */
1766 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1767 _mm_stream_ps (float *__P, __m128 __A)
1769 /* Use the data cache block touch for store transient. */
1770 __asm__ (
1771 " dcbtstt 0,%0"
1773 : "b" (__P)
1774 : "memory"
1776 _mm_store_ps (__P, __A);
1779 /* Guarantees that every preceding store is globally visible before
1780 any subsequent store. */
1781 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1782 _mm_sfence (void)
/* Generate a lightweight sync.  */
1785 __atomic_thread_fence (__ATOMIC_RELEASE);
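/* A typical pairing with the non-temporal stores above: publish data with
   _mm_stream_ps, then fence before setting a flag another thread reads.
   A sketch, assuming out points to a 16-byte aligned buffer, v is the
   vector being stored, and ready is a shared flag:

     _mm_stream_ps (out, v);
     _mm_sfence ();
     *ready = 1;
 */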
1788 /* The execution of the next instruction is delayed by an implementation
1789 specific amount of time. The instruction does not modify the
1790 architectural state. This is after the pop_options pragma because
1791 it does not require SSE support in the processor--the encoding is a
1792 nop on processors that do not support it. */
1793 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1794 _mm_pause (void)
1796 /* There is no exact match with this construct, but the following is
1797 close to the desired effect. */
1798 #if _ARCH_PWR8
/* On power8 and later processors we can depend on Program Priority
   (PRI) and the associated "very low" PRI setting.  Since we don't
   know what PRI this thread is running at we: 1) save the current
   PRI from the PPR SPR into a local GPR, 2) set the PRI to "very
   low" via the special or 31,31,31 encoding, 3) issue an "isync" to
   ensure the PRI change takes effect before we execute any more
   instructions.
   Now we can execute a lwsync (release barrier) while we execute
   this thread at "very low" PRI.  Finally we restore the original
   PRI and continue execution. */
1809 unsigned long __PPR;
1811 __asm__ volatile (
1812 " mfppr %0;"
1813 " or 31,31,31;"
1814 " isync;"
1815 " lwsync;"
1816 " isync;"
1817 " mtppr %0;"
1818 : "=r" (__PPR)
1820 : "memory"
1822 #else
1823 /* For older processor where we may not even have Program Priority
1824 controls we can only depend on Heavy Weight Sync. */
1825 __atomic_thread_fence (__ATOMIC_SEQ_CST);
1826 #endif
1829 /* Transpose the 4x4 matrix composed of row[0-3]. */
1830 #define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
1831 do { \
1832 __v4sf __r0 = (row0), __r1 = (row1), __r2 = (row2), __r3 = (row3); \
1833 __v4sf __t0 = vec_vmrghw (__r0, __r1); \
1834 __v4sf __t1 = vec_vmrghw (__r2, __r3); \
1835 __v4sf __t2 = vec_vmrglw (__r0, __r1); \
1836 __v4sf __t3 = vec_vmrglw (__r2, __r3); \
1837 (row0) = (__v4sf)vec_mergeh ((__vector long long)__t0, \
1838 (__vector long long)__t1); \
1839 (row1) = (__v4sf)vec_mergel ((__vector long long)__t0, \
1840 (__vector long long)__t1); \
1841 (row2) = (__v4sf)vec_mergeh ((__vector long long)__t2, \
1842 (__vector long long)__t3); \
1843 (row3) = (__v4sf)vec_mergel ((__vector long long)__t2, \
1844 (__vector long long)__t3); \
1845 } while (0)
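/* Usage example: transpose four rows in place.

     __m128 r0 = _mm_set_ps ( 3.0f,  2.0f,  1.0f,  0.0f);
     __m128 r1 = _mm_set_ps ( 7.0f,  6.0f,  5.0f,  4.0f);
     __m128 r2 = _mm_set_ps (11.0f, 10.0f,  9.0f,  8.0f);
     __m128 r3 = _mm_set_ps (15.0f, 14.0f, 13.0f, 12.0f);
     _MM_TRANSPOSE4_PS (r0, r1, r2, r3);
     // Now r0 = {0, 4, 8, 12}, r1 = {1, 5, 9, 13},
     //     r2 = {2, 6, 10, 14}, r3 = {3, 7, 11, 15}.
 */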
1847 /* For backward source compatibility. */
1848 //# include <emmintrin.h>
1850 #endif /* _XMMINTRIN_H_INCLUDED */