gcc/config/rs6000/xmmintrin.h
1 /* Copyright (C) 2002-2017 Free Software Foundation, Inc.
3 This file is part of GCC.
5 GCC is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 3, or (at your option)
8 any later version.
10 GCC is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
15 Under Section 7 of GPL version 3, you are granted additional
16 permissions described in the GCC Runtime Library Exception, version
17 3.1, as published by the Free Software Foundation.
19 You should have received a copy of the GNU General Public License and
20 a copy of the GCC Runtime Library Exception along with this program;
21 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
22 <http://www.gnu.org/licenses/>. */
24 /* Implemented from the specification included in the Intel C++ Compiler
25 User Guide and Reference, version 9.0. */
27 #ifndef NO_WARN_X86_INTRINSICS
28 /* This header is distributed to simplify porting x86_64 code that
29 makes explicit use of Intel intrinsics to powerpc64le.
30 It is the user's responsibility to determine if the results are
31 acceptable and make additional changes as necessary.
32 Note that much code that uses Intel intrinsics can be rewritten in
33 standard C or GNU C extensions, which are more portable and better
34 optimized across multiple targets.
36 In the specific case of X86 SSE (__m128) intrinsics, the PowerPC
37 VMX/VSX ISA is a good match for vector float SIMD operations.
38 However, scalar float operations in vector (XMM) registers require
39 the POWER8 VSX ISA (2.07) level. Also, there are important
40 differences in the data format and placement of float scalars in the
41 vector register. For PowerISA, scalar floats in FPRs (the leftmost
42 64 bits of the low 32 VSRs) are kept in double format, while X86_64 SSE
43 uses the rightmost 32 bits of the XMM register. These differences require
44 extra steps on POWER to match the SSE scalar float semantics.
46 Most SSE scalar float intrinsic operations can be performed more
47 efficiently as C language float scalar operations or optimized to
48 use vector SIMD operations. We recommend this for new applications.
50 Another difference is the format and details of the X86_64 MXCSR vs
51 the PowerISA FPSCR / VSCR registers. We recommend applications
52 replace direct access to the MXCSR with the more portable <fenv.h>
53 POSIX APIs. */
54 #warning "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this warning."
55 #endif
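/* Usage sketch (kept under "#if 0" so it is never compiled): the portable
   <fenv.h> interface recommended above can replace direct MXCSR access for
   rounding-mode control.  The function name example_lrint_toward_zero is
   illustrative only.  */
#if 0
#include <fenv.h>
#include <math.h>

static long
example_lrint_toward_zero (float __f)
{
  int __old_mode = fegetround ();	/* Save the current rounding mode.  */
  long __result;

  fesetround (FE_TOWARDZERO);		/* Stands in for writing MXCSR.RC.  */
  __result = lrintf (__f);		/* Converts using the current mode.  */
  fesetround (__old_mode);		/* Restore the saved mode.  */
  return __result;
}
#endif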
57 #ifndef _XMMINTRIN_H_INCLUDED
58 #define _XMMINTRIN_H_INCLUDED
60 #include <altivec.h>
61 #include <assert.h>
63 /* We need type definitions from the MMX header file. */
64 #include <mmintrin.h>
66 /* Get _mm_malloc () and _mm_free (). */
67 #include <mm_malloc.h>
69 /* The Intel API is flexible enough that we must allow aliasing with other
70 vector types, and their scalar components. */
71 typedef float __m128 __attribute__ ((__vector_size__ (16), __may_alias__));
73 /* Internal data types for implementing the intrinsics. */
74 typedef float __v4sf __attribute__ ((__vector_size__ (16)));
76 /* Create an undefined vector. */
77 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
78 _mm_undefined_ps (void)
80 __m128 __Y = __Y;
81 return __Y;
84 /* Create a vector of zeros. */
85 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
86 _mm_setzero_ps (void)
88 return __extension__ (__m128){ 0.0f, 0.0f, 0.0f, 0.0f };
91 /* Load four SPFP values from P. The address must be 16-byte aligned. */
92 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
93 _mm_load_ps (float const *__P)
95 assert(((unsigned long)__P & 0xfUL) == 0UL);
96 return ((__m128)vec_ld(0, (__v4sf*)__P));
99 /* Load four SPFP values from P. The address need not be 16-byte aligned. */
100 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
101 _mm_loadu_ps (float const *__P)
103 return (vec_vsx_ld(0, __P));
106 /* Load four SPFP values in reverse order. The address must be aligned. */
107 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
108 _mm_loadr_ps (float const *__P)
110 __v4sf __tmp;
111 __m128 result;
112 static const __vector unsigned char permute_vector =
113 { 0x1C, 0x1D, 0x1E, 0x1F, 0x18, 0x19, 0x1A, 0x1B, 0x14, 0x15, 0x16,
114 0x17, 0x10, 0x11, 0x12, 0x13 };
116 __tmp = vec_ld (0, (__v4sf *) __P);
117 result = (__m128) vec_perm (__tmp, __tmp, permute_vector);
118 return result;
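/* Usage sketch (not compiled): _mm_load_ps asserts 16-byte alignment, while
   _mm_loadu_ps accepts any address via vec_vsx_ld.  The function name
   example_sum4 is illustrative only.  */
#if 0
static float
example_sum4 (const float *__p)
{
  /* __p need not be aligned; use the unaligned load.  */
  __m128 __v = _mm_loadu_ps (__p);
  /* A 16-byte aligned buffer satisfies the _mm_store_ps assertion.  */
  float __out[4] __attribute__ ((__aligned__ (16)));

  _mm_store_ps (__out, __v);
  return __out[0] + __out[1] + __out[2] + __out[3];
}
#endif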
121 /* Create a vector with all four elements equal to F. */
122 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
123 _mm_set1_ps (float __F)
125 return __extension__ (__m128)(__v4sf){ __F, __F, __F, __F };
128 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
129 _mm_set_ps1 (float __F)
131 return _mm_set1_ps (__F);
134 /* Create the vector [Z Y X W]. */
135 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
136 _mm_set_ps (const float __Z, const float __Y, const float __X, const float __W)
138 return __extension__ (__m128)(__v4sf){ __W, __X, __Y, __Z };
141 /* Create the vector [W X Y Z]. */
142 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
143 _mm_setr_ps (float __Z, float __Y, float __X, float __W)
145 return __extension__ (__m128)(__v4sf){ __Z, __Y, __X, __W };
148 /* Store four SPFP values. The address must be 16-byte aligned. */
149 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
150 _mm_store_ps (float *__P, __m128 __A)
152 assert(((unsigned long)__P & 0xfUL) == 0UL);
153 vec_st((__v4sf)__A, 0, (__v4sf*)__P);
156 /* Store four SPFP values. The address need not be 16-byte aligned. */
157 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
158 _mm_storeu_ps (float *__P, __m128 __A)
160 *(__m128 *)__P = __A;
163 /* Store four SPFP values in reverse order. The address must be aligned. */
164 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
165 _mm_storer_ps (float *__P, __m128 __A)
167 __v4sf __tmp;
168 static const __vector unsigned char permute_vector =
169 { 0x1C, 0x1D, 0x1E, 0x1F, 0x18, 0x19, 0x1A, 0x1B, 0x14, 0x15, 0x16,
170 0x17, 0x10, 0x11, 0x12, 0x13 };
172 __tmp = (__m128) vec_perm (__A, __A, permute_vector);
174 _mm_store_ps (__P, __tmp);
177 /* Store the lower SPFP value across four words. */
178 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
179 _mm_store1_ps (float *__P, __m128 __A)
181 __v4sf __va = vec_splat((__v4sf)__A, 0);
182 _mm_store_ps (__P, __va);
185 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
186 _mm_store_ps1 (float *__P, __m128 __A)
188 _mm_store1_ps (__P, __A);
191 /* Create a vector with element 0 as F and the rest zero. */
192 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
193 _mm_set_ss (float __F)
195 return __extension__ (__m128)(__v4sf){ __F, 0.0f, 0.0f, 0.0f };
198 /* Sets the low SPFP value of A from the low value of B. */
199 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
200 _mm_move_ss (__m128 __A, __m128 __B)
202 static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
204 return (vec_sel ((__v4sf)__A, (__v4sf)__B, mask));
207 /* Create a vector with element 0 as *P and the rest zero. */
208 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
209 _mm_load_ss (float const *__P)
211 return _mm_set_ss (*__P);
214 /* Stores the lower SPFP value. */
215 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
216 _mm_store_ss (float *__P, __m128 __A)
218 *__P = ((__v4sf)__A)[0];
221 /* Perform the respective operation on the lower SPFP (single-precision
222 floating-point) values of A and B; the upper three SPFP values are
223 passed through from A. */
225 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
226 _mm_add_ss (__m128 __A, __m128 __B)
228 #ifdef _ARCH_PWR7
229 __m128 a, b, c;
230 static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
231 /* PowerISA VSX does not allow partial (for just the lower float)
232 results. So to ensure we don't generate spurious exceptions
233 (from the upper float values) we splat the lower float
234 before we do the operation. */
235 a = vec_splat (__A, 0);
236 b = vec_splat (__B, 0);
237 c = a + b;
238 /* Then we merge the lower float result with the original upper
239 float elements from __A. */
240 return (vec_sel (__A, c, mask));
241 #else
242 __A[0] = __A[0] + __B[0];
243 return (__A);
244 #endif
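/* The pattern above (splat element 0 of both operands, do the full-vector
   operation, then select element 0 of the result back into __A) is shared
   by all of the *_ss arithmetic below.  For new code the same effect is a
   plain scalar C expression, as recommended in the header comment.  Sketch
   only (not compiled); example_add_ss_scalar is an illustrative name.  */
#if 0
static __m128
example_add_ss_scalar (__m128 __A, __m128 __B)
{
  /* Element 0 is updated; elements 1-3 of __A pass through unchanged.  */
  __A[0] = __A[0] + __B[0];
  return __A;
}
#endif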
247 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
248 _mm_sub_ss (__m128 __A, __m128 __B)
250 #ifdef _ARCH_PWR7
251 __m128 a, b, c;
252 static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
253 /* PowerISA VSX does not allow partial (for just the lower float)
254 results. So to ensure we don't generate spurious exceptions
255 (from the upper float values) we splat the lower float
256 before we do the operation. */
257 a = vec_splat (__A, 0);
258 b = vec_splat (__B, 0);
259 c = a - b;
260 /* Then we merge the lower float result with the original upper
261 float elements from __A. */
262 return (vec_sel (__A, c, mask));
263 #else
264 __A[0] = __A[0] - __B[0];
265 return (__A);
266 #endif
269 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
270 _mm_mul_ss (__m128 __A, __m128 __B)
272 #ifdef _ARCH_PWR7
273 __m128 a, b, c;
274 static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
275 /* PowerISA VSX does not allow partial (for just the lower float)
276 results. So to ensure we don't generate spurious exceptions
277 (from the upper float values) we splat the lower float
278 before we do the operation. */
279 a = vec_splat (__A, 0);
280 b = vec_splat (__B, 0);
281 c = a * b;
282 /* Then we merge the lower float result with the original upper
283 float elements from __A. */
284 return (vec_sel (__A, c, mask));
285 #else
286 __A[0] = __A[0] * __B[0];
287 return (__A);
288 #endif
291 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
292 _mm_div_ss (__m128 __A, __m128 __B)
294 #ifdef _ARCH_PWR7
295 __m128 a, b, c;
296 static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
297 /* PowerISA VSX does not allow partial (for just the lower float)
298 results. So to ensure we don't generate spurious exceptions
299 (from the upper float values) we splat the lower float
300 before we do the operation. */
301 a = vec_splat (__A, 0);
302 b = vec_splat (__B, 0);
303 c = a / b;
304 /* Then we merge the lower float result with the original upper
305 float elements from __A. */
306 return (vec_sel (__A, c, mask));
307 #else
308 __A[0] = __A[0] / __B[0];
309 return (__A);
310 #endif
313 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
314 _mm_sqrt_ss (__m128 __A)
316 __m128 a, c;
317 static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
318 /* PowerISA VSX does not allow partial (for just the lower float)
319 * results. So to ensure we don't generate spurious exceptions
320 * (from the upper float values) we splat the lower float
321 * before we do the operation. */
322 a = vec_splat (__A, 0);
323 c = vec_sqrt (a);
324 /* Then we merge the lower float result with the original upper
325 * float elements from __A. */
326 return (vec_sel (__A, c, mask));
329 /* Perform the respective operation on the four SPFP values in A and B. */
330 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
331 _mm_add_ps (__m128 __A, __m128 __B)
333 return (__m128) ((__v4sf)__A + (__v4sf)__B);
336 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
337 _mm_sub_ps (__m128 __A, __m128 __B)
339 return (__m128) ((__v4sf)__A - (__v4sf)__B);
342 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
343 _mm_mul_ps (__m128 __A, __m128 __B)
345 return (__m128) ((__v4sf)__A * (__v4sf)__B);
348 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
349 _mm_div_ps (__m128 __A, __m128 __B)
351 return (__m128) ((__v4sf)__A / (__v4sf)__B);
354 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
355 _mm_sqrt_ps (__m128 __A)
357 return (vec_sqrt ((__v4sf)__A));
360 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
361 _mm_rcp_ps (__m128 __A)
363 return (vec_re ((__v4sf)__A));
366 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
367 _mm_rsqrt_ps (__m128 __A)
369 return (vec_rsqrte (__A));
372 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
373 _mm_rcp_ss (__m128 __A)
375 __m128 a, c;
376 static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
377 /* PowerISA VSX does not allow partial (for just the lower float)
378 * results. So to ensure we don't generate spurious exceptions
379 * (from the upper float values) we splat the lower float
380 * before we do the operation. */
381 a = vec_splat (__A, 0);
382 c = _mm_rcp_ps (a);
383 /* Then we merge the lower float result with the original upper
384 * float elements from __A. */
385 return (vec_sel (__A, c, mask));
388 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
389 _mm_rsqrt_ss (__m128 __A)
391 __m128 a, c;
392 static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
393 /* PowerISA VSX does not allow partial (for just the lower float)
394 * results. So to ensure we don't generate spurious exceptions
395 * (from the upper float values) we splat the lower float
396 * before we do the operation. */
397 a = vec_splat (__A, 0);
398 c = vec_rsqrte (a);
399 /* Then we merge the lower float result with the original upper
400 * float elements from __A. */
401 return (vec_sel (__A, c, mask));
404 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
405 _mm_min_ss (__m128 __A, __m128 __B)
407 __v4sf a, b, c;
408 static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
409 /* PowerISA VSX does not allow partial (for just lower float)
410 * results. So to ensure we don't generate spurious exceptions
411 * (from the upper float values) we splat the lower float
412 * before we do the operation. */
413 a = vec_splat ((__v4sf)__A, 0);
414 b = vec_splat ((__v4sf)__B, 0);
415 c = vec_min (a, b);
416 /* Then we merge the lower float result with the original upper
417 * float elements from __A. */
418 return (vec_sel ((__v4sf)__A, c, mask));
421 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
422 _mm_max_ss (__m128 __A, __m128 __B)
424 __v4sf a, b, c;
425 static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
426 /* PowerISA VSX does not allow partial (for just lower float)
427 * results. So to ensure we don't generate spurious exceptions
428 * (from the upper float values) we splat the lower float
429 * before we do the operation. */
430 a = vec_splat (__A, 0);
431 b = vec_splat (__B, 0);
432 c = vec_max (a, b);
433 /* Then we merge the lower float result with the original upper
434 * float elements from __A. */
435 return (vec_sel ((__v4sf)__A, c, mask));
438 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
439 _mm_min_ps (__m128 __A, __m128 __B)
441 return ((__m128)vec_min ((__v4sf)__A,(__v4sf) __B));
444 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
445 _mm_max_ps (__m128 __A, __m128 __B)
447 return ((__m128)vec_max ((__v4sf)__A, (__v4sf)__B));
450 /* Perform logical bit-wise operations on 128-bit values. */
451 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
452 _mm_and_ps (__m128 __A, __m128 __B)
454 return ((__m128)vec_and ((__v4sf)__A, (__v4sf)__B));
455 // return __builtin_ia32_andps (__A, __B);
458 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
459 _mm_andnot_ps (__m128 __A, __m128 __B)
461 return ((__m128)vec_andc ((__v4sf)__B, (__v4sf)__A));
464 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
465 _mm_or_ps (__m128 __A, __m128 __B)
467 return ((__m128)vec_or ((__v4sf)__A, (__v4sf)__B));
470 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
471 _mm_xor_ps (__m128 __A, __m128 __B)
473 return ((__m128)vec_xor ((__v4sf)__A, (__v4sf)__B));
476 /* Perform a comparison on the four SPFP values of A and B. For each
477 element, if the comparison is true, place a mask of all ones in the
478 result, otherwise a mask of zeros. */
479 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
480 _mm_cmpeq_ps (__m128 __A, __m128 __B)
482 return ((__m128)vec_cmpeq ((__v4sf)__A,(__v4sf) __B));
485 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
486 _mm_cmplt_ps (__m128 __A, __m128 __B)
488 return ((__m128)vec_cmplt ((__v4sf)__A, (__v4sf)__B));
491 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
492 _mm_cmple_ps (__m128 __A, __m128 __B)
494 return ((__m128)vec_cmple ((__v4sf)__A, (__v4sf)__B));
497 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
498 _mm_cmpgt_ps (__m128 __A, __m128 __B)
500 return ((__m128)vec_cmpgt ((__v4sf)__A, (__v4sf)__B));
503 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
504 _mm_cmpge_ps (__m128 __A, __m128 __B)
506 return ((__m128)vec_cmpge ((__v4sf)__A, (__v4sf)__B));
509 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
510 _mm_cmpneq_ps (__m128 __A, __m128 __B)
512 __v4sf temp = (__v4sf ) vec_cmpeq ((__v4sf) __A, (__v4sf)__B);
513 return ((__m128)vec_nor (temp, temp));
516 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
517 _mm_cmpnlt_ps (__m128 __A, __m128 __B)
519 return ((__m128)vec_cmpge ((__v4sf)__A, (__v4sf)__B));
522 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
523 _mm_cmpnle_ps (__m128 __A, __m128 __B)
525 return ((__m128)vec_cmpgt ((__v4sf)__A, (__v4sf)__B));
528 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
529 _mm_cmpngt_ps (__m128 __A, __m128 __B)
531 return ((__m128)vec_cmple ((__v4sf)__A, (__v4sf)__B));
534 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
535 _mm_cmpnge_ps (__m128 __A, __m128 __B)
537 return ((__m128)vec_cmplt ((__v4sf)__A, (__v4sf)__B));
540 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
541 _mm_cmpord_ps (__m128 __A, __m128 __B)
543 __vector unsigned int a, b;
544 __vector unsigned int c, d;
545 static const __vector unsigned int float_exp_mask =
546 { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 };
548 a = (__vector unsigned int) vec_abs ((__v4sf)__A);
549 b = (__vector unsigned int) vec_abs ((__v4sf)__B);
550 c = (__vector unsigned int) vec_cmpgt (float_exp_mask, a);
551 d = (__vector unsigned int) vec_cmpgt (float_exp_mask, b);
552 return ((__m128 ) vec_and (c, d));
555 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
556 _mm_cmpunord_ps (__m128 __A, __m128 __B)
558 __vector unsigned int a, b;
559 __vector unsigned int c, d;
560 static const __vector unsigned int float_exp_mask =
561 { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 };
563 a = (__vector unsigned int) vec_abs ((__v4sf)__A);
564 b = (__vector unsigned int) vec_abs ((__v4sf)__B);
565 c = (__vector unsigned int) vec_cmpgt (a, float_exp_mask);
566 d = (__vector unsigned int) vec_cmpgt (b, float_exp_mask);
567 return ((__m128 ) vec_or (c, d));
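/* The two functions above rely on the IEEE-754 encoding: a float is a NaN
   exactly when its absolute value, viewed as an unsigned integer, is
   greater than the exponent mask 0x7f800000.  A scalar sketch of the same
   test (not compiled; example_is_nan is an illustrative name):  */
#if 0
static int
example_is_nan (float __x)
{
  union { float __f; unsigned int __u; } __v = { __x };

  /* Clear the sign bit, then compare against the exponent mask.
     0x7fc00000 (a quiet NaN) > 0x7f800000, while infinity (0x7f800000)
     and every finite value compare less than or equal.  */
  return (__v.__u & 0x7fffffffu) > 0x7f800000u;
}
#endif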
570 /* Perform a comparison on the lower SPFP values of A and B. If the
571 comparison is true, place a mask of all ones in the result, otherwise a
572 mask of zeros. The upper three SPFP values are passed through from A. */
573 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
574 _mm_cmpeq_ss (__m128 __A, __m128 __B)
576 static const __vector unsigned int mask =
577 { 0xffffffff, 0, 0, 0 };
578 __v4sf a, b, c;
579 /* PowerISA VMX does not allow partial (for just element 0)
580 * results. So to ensure we don't generate spurious exceptions
581 * (from the upper elements) we splat the lower float
582 * before we do the operation. */
583 a = vec_splat ((__v4sf) __A, 0);
584 b = vec_splat ((__v4sf) __B, 0);
585 c = (__v4sf) vec_cmpeq(a, b);
586 /* Then we merge the lower float result with the original upper
587 * float elements from __A. */
588 return ((__m128)vec_sel ((__v4sf)__A, c, mask));
591 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
592 _mm_cmplt_ss (__m128 __A, __m128 __B)
594 static const __vector unsigned int mask =
595 { 0xffffffff, 0, 0, 0 };
596 __v4sf a, b, c;
597 /* PowerISA VMX does not allow partial (for just element 0)
598 * results. So to ensure we don't generate spurious exceptions
599 * (from the upper elements) we splat the lower float
600 * before we do the operation. */
601 a = vec_splat ((__v4sf) __A, 0);
602 b = vec_splat ((__v4sf) __B, 0);
603 c = (__v4sf) vec_cmplt(a, b);
604 /* Then we merge the lower float result with the original upper
605 * float elements from __A. */
606 return ((__m128)vec_sel ((__v4sf)__A, c, mask));
609 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
610 _mm_cmple_ss (__m128 __A, __m128 __B)
612 static const __vector unsigned int mask =
613 { 0xffffffff, 0, 0, 0 };
614 __v4sf a, b, c;
615 /* PowerISA VMX does not allow partial (for just element 0)
616 * results. So to ensure we don't generate spurious exceptions
617 * (from the upper elements) we splat the lower float
618 * before we do the operation. */
619 a = vec_splat ((__v4sf) __A, 0);
620 b = vec_splat ((__v4sf) __B, 0);
621 c = (__v4sf) vec_cmple(a, b);
622 /* Then we merge the lower float result with the original upper
623 * float elements from __A. */
624 return ((__m128)vec_sel ((__v4sf)__A, c, mask));
627 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
628 _mm_cmpgt_ss (__m128 __A, __m128 __B)
630 static const __vector unsigned int mask =
631 { 0xffffffff, 0, 0, 0 };
632 __v4sf a, b, c;
633 /* PowerISA VMX does not allow partial (for just element 0)
634 * results. So to ensure we don't generate spurious exceptions
635 * (from the upper elements) we splat the lower float
636 * before we do the operation. */
637 a = vec_splat ((__v4sf) __A, 0);
638 b = vec_splat ((__v4sf) __B, 0);
639 c = (__v4sf) vec_cmpgt(a, b);
640 /* Then we merge the lower float result with the original upper
641 * float elements from __A. */
642 return ((__m128)vec_sel ((__v4sf)__A, c, mask));
645 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
646 _mm_cmpge_ss (__m128 __A, __m128 __B)
648 static const __vector unsigned int mask =
649 { 0xffffffff, 0, 0, 0 };
650 __v4sf a, b, c;
651 /* PowerISA VMX does not allow partial (for just element 0)
652 * results. So to ensure we don't generate spurious exceptions
653 * (from the upper elements) we splat the lower float
654 * before we do the operation. */
655 a = vec_splat ((__v4sf) __A, 0);
656 b = vec_splat ((__v4sf) __B, 0);
657 c = (__v4sf) vec_cmpge(a, b);
658 /* Then we merge the lower float result with the original upper
659 * float elements from __A. */
660 return ((__m128)vec_sel ((__v4sf)__A, c, mask));
663 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
664 _mm_cmpneq_ss (__m128 __A, __m128 __B)
666 static const __vector unsigned int mask =
667 { 0xffffffff, 0, 0, 0 };
668 __v4sf a, b, c;
669 /* PowerISA VMX does not allow partial (for just element 0)
670 * results. So to ensure we don't generate spurious exceptions
671 * (from the upper elements) we splat the lower float
672 * before we do the operation. */
673 a = vec_splat ((__v4sf) __A, 0);
674 b = vec_splat ((__v4sf) __B, 0);
675 c = (__v4sf) vec_cmpeq(a, b);
676 c = vec_nor (c, c);
677 /* Then we merge the lower float result with the original upper
678 * float elements from __A. */
679 return ((__m128)vec_sel ((__v4sf)__A, c, mask));
682 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
683 _mm_cmpnlt_ss (__m128 __A, __m128 __B)
685 static const __vector unsigned int mask =
686 { 0xffffffff, 0, 0, 0 };
687 __v4sf a, b, c;
688 /* PowerISA VMX does not allow partial (for just element 0)
689 * results. So to ensure we don't generate spurious exceptions
690 * (from the upper elements) we splat the lower float
691 * before we do the operation. */
692 a = vec_splat ((__v4sf) __A, 0);
693 b = vec_splat ((__v4sf) __B, 0);
694 c = (__v4sf) vec_cmpge(a, b);
695 /* Then we merge the lower float result with the original upper
696 * float elements from __A. */
697 return ((__m128)vec_sel ((__v4sf)__A, c, mask));
700 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
701 _mm_cmpnle_ss (__m128 __A, __m128 __B)
703 static const __vector unsigned int mask =
704 { 0xffffffff, 0, 0, 0 };
705 __v4sf a, b, c;
706 /* PowerISA VMX does not allow partial (for just element 0)
707 * results. So to ensure we don't generate spurious exceptions
708 * (from the upper elements) we splat the lower float
709 * before we do the operation. */
710 a = vec_splat ((__v4sf) __A, 0);
711 b = vec_splat ((__v4sf) __B, 0);
712 c = (__v4sf) vec_cmpgt(a, b);
713 /* Then we merge the lower float result with the original upper
714 * float elements from __A. */
715 return ((__m128)vec_sel ((__v4sf)__A, c, mask));
718 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
719 _mm_cmpngt_ss (__m128 __A, __m128 __B)
721 static const __vector unsigned int mask =
722 { 0xffffffff, 0, 0, 0 };
723 __v4sf a, b, c;
724 /* PowerISA VMX does not allow partial (for just element 0)
725 * results. So to ensure we don't generate spurious exceptions
726 * (from the upper elements) we splat the lower float
727 * before we do the operation. */
728 a = vec_splat ((__v4sf) __A, 0);
729 b = vec_splat ((__v4sf) __B, 0);
730 c = (__v4sf) vec_cmple(a, b);
731 /* Then we merge the lower float result with the original upper
732 * float elements from __A. */
733 return ((__m128)vec_sel ((__v4sf)__A, c, mask));
736 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
737 _mm_cmpnge_ss (__m128 __A, __m128 __B)
739 static const __vector unsigned int mask =
740 { 0xffffffff, 0, 0, 0 };
741 __v4sf a, b, c;
742 /* PowerISA VMX does not allow partial (for just element 0)
743 * results. So to ensure we don't generate spurious exceptions
744 * (from the upper elements) we splat the lower float
745 * before we do the operation. */
746 a = vec_splat ((__v4sf) __A, 0);
747 b = vec_splat ((__v4sf) __B, 0);
748 c = (__v4sf) vec_cmplt(a, b);
749 /* Then we merge the lower float result with the original upper
750 * float elements from __A. */
751 return ((__m128)vec_sel ((__v4sf)__A, c, mask));
754 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
755 _mm_cmpord_ss (__m128 __A, __m128 __B)
757 __vector unsigned int a, b;
758 __vector unsigned int c, d;
759 static const __vector unsigned int float_exp_mask =
760 { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 };
761 static const __vector unsigned int mask =
762 { 0xffffffff, 0, 0, 0 };
764 a = (__vector unsigned int) vec_abs ((__v4sf)__A);
765 b = (__vector unsigned int) vec_abs ((__v4sf)__B);
766 c = (__vector unsigned int) vec_cmpgt (float_exp_mask, a);
767 d = (__vector unsigned int) vec_cmpgt (float_exp_mask, b);
768 c = vec_and (c, d);
769 /* Then we merge the lower float result with the original upper
770 * float elements from __A. */
771 return ((__m128)vec_sel ((__v4sf)__A, (__v4sf)c, mask));
774 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
775 _mm_cmpunord_ss (__m128 __A, __m128 __B)
777 __vector unsigned int a, b;
778 __vector unsigned int c, d;
779 static const __vector unsigned int float_exp_mask =
780 { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 };
781 static const __vector unsigned int mask =
782 { 0xffffffff, 0, 0, 0 };
784 a = (__vector unsigned int) vec_abs ((__v4sf)__A);
785 b = (__vector unsigned int) vec_abs ((__v4sf)__B);
786 c = (__vector unsigned int) vec_cmpgt (a, float_exp_mask);
787 d = (__vector unsigned int) vec_cmpgt (b, float_exp_mask);
788 c = vec_or (c, d);
789 /* Then we merge the lower float result with the original upper
790 * float elements from __A. */
791 return ((__m128)vec_sel ((__v4sf)__A, (__v4sf)c, mask));
794 /* Compare the lower SPFP values of A and B and return 1 if true
795 and 0 if false. */
796 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
797 _mm_comieq_ss (__m128 __A, __m128 __B)
799 return (__A[0] == __B[0]);
802 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
803 _mm_comilt_ss (__m128 __A, __m128 __B)
805 return (__A[0] < __B[0]);
808 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
809 _mm_comile_ss (__m128 __A, __m128 __B)
811 return (__A[0] <= __B[0]);
814 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
815 _mm_comigt_ss (__m128 __A, __m128 __B)
817 return (__A[0] > __B[0]);
820 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
821 _mm_comige_ss (__m128 __A, __m128 __B)
823 return (__A[0] >= __B[0]);
826 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
827 _mm_comineq_ss (__m128 __A, __m128 __B)
829 return (__A[0] != __B[0]);
832 /* FIXME
833 * The _mm_ucomi??_ss implementations below are exactly the same as
834 * the _mm_comi??_ss implementations because GCC for PowerPC only
835 * generates unordered compares (scalar and vector).
836 * Technically _mm_comieq_ss et al. should use the ordered compare
837 * and signal on QNaNs.
838 * The _mm_ucomieq_ss et al. should be OK as is. */
840 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
841 _mm_ucomieq_ss (__m128 __A, __m128 __B)
843 return (__A[0] == __B[0]);
846 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
847 _mm_ucomilt_ss (__m128 __A, __m128 __B)
849 return (__A[0] < __B[0]);
852 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
853 _mm_ucomile_ss (__m128 __A, __m128 __B)
855 return (__A[0] <= __B[0]);
858 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
859 _mm_ucomigt_ss (__m128 __A, __m128 __B)
861 return (__A[0] > __B[0]);
864 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
865 _mm_ucomige_ss (__m128 __A, __m128 __B)
867 return (__A[0] >= __B[0]);
870 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
871 _mm_ucomineq_ss (__m128 __A, __m128 __B)
873 return (__A[0] != __B[0]);
876 extern __inline float __attribute__((__gnu_inline__, __always_inline__, __artificial__))
877 _mm_cvtss_f32 (__m128 __A)
879 return ((__v4sf)__A)[0];
882 /* Convert the lower SPFP value to a 32-bit integer according to the current
883 rounding mode. */
884 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
885 _mm_cvtss_si32 (__m128 __A)
887 __m64 res = 0;
888 #ifdef _ARCH_PWR8
889 __m128 vtmp;
890 __asm__(
891 "xxsldwi %x1,%x2,%x2,3;\n"
892 "xscvspdp %x1,%x1;\n"
893 "fctiw %1,%1;\n"
894 "mfvsrd %0,%x1;\n"
895 : "=r" (res),
896 "=&wi" (vtmp)
897 : "wa" (__A)
898 : );
899 #else
900 res = __builtin_rint(__A[0]);
901 #endif
902 return (res);
905 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
906 _mm_cvt_ss2si (__m128 __A)
908 return _mm_cvtss_si32 (__A);
911 /* Convert the lower SPFP value to a 64-bit integer according to the
912 current rounding mode. */
914 /* Intel intrinsic. */
915 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
916 _mm_cvtss_si64 (__m128 __A)
918 __m64 res = 0;
919 #ifdef _ARCH_PWR8
920 __m128 vtmp;
921 __asm__(
922 "xxsldwi %x1,%x2,%x2,3;\n"
923 "xscvspdp %x1,%x1;\n"
924 "fctid %1,%1;\n"
925 "mfvsrd %0,%x1;\n"
926 : "=r" (res),
927 "=&wi" (vtmp)
928 : "wa" (__A)
929 : );
930 #else
931 res = __builtin_llrint(__A[0]);
932 #endif
933 return (res);
936 /* Microsoft intrinsic. */
937 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
938 _mm_cvtss_si64x (__m128 __A)
940 return _mm_cvtss_si64 ((__v4sf) __A);
943 /* Constants for use with _mm_prefetch. */
944 enum _mm_hint
946 /* _MM_HINT_ET is _MM_HINT_T with the 3rd bit set. */
947 _MM_HINT_ET0 = 7,
948 _MM_HINT_ET1 = 6,
949 _MM_HINT_T0 = 3,
950 _MM_HINT_T1 = 2,
951 _MM_HINT_T2 = 1,
952 _MM_HINT_NTA = 0
955 /* Loads one cache line from address P to a location "closer" to the
956 processor. The selector I specifies the type of prefetch operation. */
957 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
958 _mm_prefetch (const void *__P, enum _mm_hint __I)
960 /* Current PowerPC ignores the hint parameter. */
961 __builtin_prefetch (__P);
964 /* Convert the two lower SPFP values to 32-bit integers according to the
965 current rounding mode. Return the integers in packed form. */
966 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
967 _mm_cvtps_pi32 (__m128 __A)
970 __v4sf temp, rounded;
971 __vector __m64 result;
973 /* Splat two lower SPFP values to both halves. */
974 temp = (__v4sf) vec_splat ((__vector long long)__A, 0);
975 rounded = vec_rint(temp);
976 result = (__vector __m64) vec_cts (rounded, 0);
978 return ((__m64) __builtin_unpack_vector_int128 ((__vector __int128)result, 0));
981 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
982 _mm_cvt_ps2pi (__m128 __A)
984 return _mm_cvtps_pi32 (__A);
987 /* Truncate the lower SPFP value to a 32-bit integer. */
988 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
989 _mm_cvttss_si32 (__m128 __A)
991 /* Extract the lower float element. */
992 float temp = __A[0];
993 /* truncate to 32-bit integer and return. */
994 return temp;
997 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
998 _mm_cvtt_ss2si (__m128 __A)
1000 return _mm_cvttss_si32 (__A);
1003 /* Intel intrinsic. */
1004 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1005 _mm_cvttss_si64 (__m128 __A)
1007 /* Extract the lower float element. */
1008 float temp = __A[0];
1009 /* Truncate to 64-bit integer and return. */
1010 return temp;
1013 /* Microsoft intrinsic. */
1014 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1015 _mm_cvttss_si64x (__m128 __A)
1017 /* Extract the lower float element. */
1018 float temp = __A[0];
1019 /* Truncate to 64-bit integer and return. */
1020 return temp;
1023 /* Truncate the two lower SPFP values to 32-bit integers. Return the
1024 integers in packed form. */
1025 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1026 _mm_cvttps_pi32 (__m128 __A)
1028 __v4sf temp;
1029 __vector __m64 result;
1031 /* Splat two lower SPFP values to both halves. */
1032 temp = (__v4sf) vec_splat ((__vector long long)__A, 0);
1033 result = (__vector __m64) vec_cts (temp, 0);
1035 return ((__m64) __builtin_unpack_vector_int128 ((__vector __int128)result, 0));
1038 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1039 _mm_cvtt_ps2pi (__m128 __A)
1041 return _mm_cvttps_pi32 (__A);
1044 /* Convert B to a SPFP value and insert it as element zero in A. */
1045 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1046 _mm_cvtsi32_ss (__m128 __A, int __B)
1048 float temp = __B;
1049 __A[0] = temp;
1051 return __A;
1054 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1055 _mm_cvt_si2ss (__m128 __A, int __B)
1057 return _mm_cvtsi32_ss (__A, __B);
1060 /* Convert B to a SPFP value and insert it as element zero in A. */
1061 /* Intel intrinsic. */
1062 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1063 _mm_cvtsi64_ss (__m128 __A, long long __B)
1065 float temp = __B;
1066 __A[0] = temp;
1068 return __A;
1071 /* Microsoft intrinsic. */
1072 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1073 _mm_cvtsi64x_ss (__m128 __A, long long __B)
1075 return _mm_cvtsi64_ss (__A, __B);
1078 /* Convert the two 32-bit values in B to SPFP form and insert them
1079 as the two lower elements in A. */
1080 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1081 _mm_cvtpi32_ps (__m128 __A, __m64 __B)
1083 __vector signed int vm1;
1084 __vector float vf1;
1086 vm1 = (__vector signed int) __builtin_pack_vector_int128 (__B, __B);
1087 vf1 = (__vector float) vec_ctf (vm1, 0);
1089 return ((__m128) (__vector __m64)
1090 { ((__vector __m64)vf1) [0], ((__vector __m64)__A) [1]});
1093 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1094 _mm_cvt_pi2ps (__m128 __A, __m64 __B)
1096 return _mm_cvtpi32_ps (__A, __B);
1099 /* Convert the four signed 16-bit values in A to SPFP form. */
1100 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1101 _mm_cvtpi16_ps (__m64 __A)
1103 __vector signed short vs8;
1104 __vector signed int vi4;
1105 __vector float vf1;
1107 vs8 = (__vector signed short) __builtin_pack_vector_int128 (__A, __A);
1108 vi4 = vec_vupklsh (vs8);
1109 vf1 = (__vector float) vec_ctf (vi4, 0);
1111 return (__m128) vf1;
1114 /* Convert the four unsigned 16-bit values in A to SPFP form. */
1115 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1116 _mm_cvtpu16_ps (__m64 __A)
1118 const __vector unsigned short zero =
1119 { 0, 0, 0, 0, 0, 0, 0, 0 };
1120 __vector unsigned short vs8;
1121 __vector unsigned int vi4;
1122 __vector float vf1;
1124 vs8 = (__vector unsigned short) __builtin_pack_vector_int128 (__A, __A);
1125 vi4 = (__vector unsigned int) vec_vmrglh (vs8, zero);
1126 vf1 = (__vector float) vec_ctf (vi4, 0);
1128 return (__m128) vf1;
1131 /* Convert the low four signed 8-bit values in A to SPFP form. */
1132 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1133 _mm_cvtpi8_ps (__m64 __A)
1135 __vector signed char vc16;
1136 __vector signed short vs8;
1137 __vector signed int vi4;
1138 __vector float vf1;
1140 vc16 = (__vector signed char) __builtin_pack_vector_int128 (__A, __A);
1141 vs8 = vec_vupkhsb (vc16);
1142 vi4 = vec_vupkhsh (vs8);
1143 vf1 = (__vector float) vec_ctf (vi4, 0);
1145 return (__m128) vf1;
1148 /* Convert the low four unsigned 8-bit values in A to SPFP form. */
1149 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1151 _mm_cvtpu8_ps (__m64 __A)
1153 const __vector unsigned char zero =
1154 { 0, 0, 0, 0, 0, 0, 0, 0 };
1155 __vector unsigned char vc16;
1156 __vector unsigned short vs8;
1157 __vector unsigned int vi4;
1158 __vector float vf1;
1160 vc16 = (__vector unsigned char) __builtin_pack_vector_int128 (__A, __A);
1161 vs8 = (__vector unsigned short) vec_vmrglb (vc16, zero);
1162 vi4 = (__vector unsigned int) vec_vmrghh (vs8,
1163 (__vector unsigned short) zero);
1164 vf1 = (__vector float) vec_ctf (vi4, 0);
1166 return (__m128) vf1;
1169 /* Convert the four signed 32-bit values in A and B to SPFP form. */
1170 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1171 _mm_cvtpi32x2_ps(__m64 __A, __m64 __B)
1173 __vector signed int vi4;
1174 __vector float vf4;
1176 vi4 = (__vector signed int) __builtin_pack_vector_int128 (__B, __A);
1177 vf4 = (__vector float) vec_ctf (vi4, 0);
1178 return (__m128) vf4;
1181 /* Convert the four SPFP values in A to four signed 16-bit integers. */
1182 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1183 _mm_cvtps_pi16(__m128 __A)
1185 __v4sf rounded;
1186 __vector signed int temp;
1187 __vector __m64 result;
1189 rounded = vec_rint(__A);
1190 temp = vec_cts (rounded, 0);
1191 result = (__vector __m64) vec_pack (temp, temp);
1193 return ((__m64) __builtin_unpack_vector_int128 ((__vector __int128)result, 0));
1196 /* Convert the four SPFP values in A to four signed 8-bit integers. */
1197 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1198 _mm_cvtps_pi8(__m128 __A)
1200 __v4sf rounded;
1201 __vector signed int tmp_i;
1202 static const __vector signed int zero = {0, 0, 0, 0};
1203 __vector signed short tmp_s;
1204 __vector signed char res_v;
1205 __m64 result;
1207 rounded = vec_rint(__A);
1208 tmp_i = vec_cts (rounded, 0);
1209 tmp_s = vec_pack (tmp_i, zero);
1210 res_v = vec_pack (tmp_s, tmp_s);
1211 result = (__m64) __builtin_unpack_vector_int128 ((__vector __int128)res_v, 0);
1213 return (result);
1216 /* Selects four specific SPFP values from A and B based on MASK. */
1217 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1219 _mm_shuffle_ps (__m128 __A, __m128 __B, int const __mask)
1221 unsigned long element_selector_10 = __mask & 0x03;
1222 unsigned long element_selector_32 = (__mask >> 2) & 0x03;
1223 unsigned long element_selector_54 = (__mask >> 4) & 0x03;
1224 unsigned long element_selector_76 = (__mask >> 6) & 0x03;
1225 static const unsigned int permute_selectors[4] =
1227 #ifdef __LITTLE_ENDIAN__
1228 0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
1229 #elif __BIG_ENDIAN__
1230 0x0C0D0E0F, 0x08090A0B, 0x04050607, 0x00010203
1231 #endif
1233 __vector unsigned int t;
1235 #ifdef __LITTLE_ENDIAN__
1236 t[0] = permute_selectors[element_selector_10];
1237 t[1] = permute_selectors[element_selector_32];
1238 t[2] = permute_selectors[element_selector_54] + 0x10101010;
1239 t[3] = permute_selectors[element_selector_76] + 0x10101010;
1240 #elif __BIG_ENDIAN__
1241 t[3] = permute_selectors[element_selector_10] + 0x10101010;
1242 t[2] = permute_selectors[element_selector_32] + 0x10101010;
1243 t[1] = permute_selectors[element_selector_54];
1244 t[0] = permute_selectors[element_selector_76];
1245 #endif
1246 return vec_perm ((__v4sf) __A, (__v4sf)__B, (__vector unsigned char)t);
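/* The 8-bit mask is decoded two bits at a time: bits 1:0 and 3:2 select
   elements of __A for result elements 0 and 1, and bits 5:4 and 7:6 select
   elements of __B for result elements 2 and 3.  Usage sketch (not compiled;
   example_shuffle is an illustrative name):  */
#if 0
static __m128
example_shuffle (__m128 __a, __m128 __b)
{
  /* Mask 0x44 (binary 01 00 01 00) selects { __a[0], __a[1], __b[0],
     __b[1] }, the same lanes _mm_movelh_ps (__a, __b) produces.  */
  return _mm_shuffle_ps (__a, __b, 0x44);
}
#endif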
1249 /* Selects and interleaves the upper two SPFP values from A and B. */
1250 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1251 _mm_unpackhi_ps (__m128 __A, __m128 __B)
1253 return (__m128) vec_vmrglw ((__v4sf) __A, (__v4sf)__B);
1256 /* Selects and interleaves the lower two SPFP values from A and B. */
1257 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1258 _mm_unpacklo_ps (__m128 __A, __m128 __B)
1260 return (__m128) vec_vmrghw ((__v4sf) __A, (__v4sf)__B);
1263 /* Sets the upper two SPFP values with 64-bits of data loaded from P;
1264 the lower two values are passed through from A. */
1265 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1266 _mm_loadh_pi (__m128 __A, __m64 const *__P)
1268 __vector __m64 __a = (__vector __m64)__A;
1269 __vector __m64 __p = vec_splats(*__P);
1270 __a [1] = __p [1];
1272 return (__m128)__a;
1275 /* Stores the upper two SPFP values of A into P. */
1276 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1277 _mm_storeh_pi (__m64 *__P, __m128 __A)
1279 __vector __m64 __a = (__vector __m64) __A;
1281 *__P = __a[1];
1284 /* Moves the upper two values of B into the lower two values of A. */
1285 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1286 _mm_movehl_ps (__m128 __A, __m128 __B)
1288 return (__m128) vec_mergel ((__vector __m64)__B, (__vector __m64)__A);
1291 /* Moves the lower two values of B into the upper two values of A. */
1292 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1293 _mm_movelh_ps (__m128 __A, __m128 __B)
1295 return (__m128) vec_mergeh ((__vector __m64)__A, (__vector __m64)__B);
1298 /* Sets the lower two SPFP values with 64-bits of data loaded from P;
1299 the upper two values are passed through from A. */
1300 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1301 _mm_loadl_pi (__m128 __A, __m64 const *__P)
1303 __vector __m64 __a = (__vector __m64)__A;
1304 __vector __m64 __p = vec_splats(*__P);
1305 __a [0] = __p [0];
1307 return (__m128)__a;
1310 /* Stores the lower two SPFP values of A into P. */
1311 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1312 _mm_storel_pi (__m64 *__P, __m128 __A)
1314 __vector __m64 __a = (__vector __m64) __A;
1316 *__P = __a[0];
1319 #ifdef _ARCH_PWR8
1320 /* Intrinsic functions that require PowerISA 2.07 minimum. */
1322 /* Creates a 4-bit mask from the most significant bits of the SPFP values. */
1323 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1324 _mm_movemask_ps (__m128 __A)
1326 __vector __m64 result;
1327 static const __vector unsigned int perm_mask =
1329 #ifdef __LITTLE_ENDIAN__
1330 0x00204060, 0x80808080, 0x80808080, 0x80808080
1331 #elif __BIG_ENDIAN__
1332 0x80808080, 0x80808080, 0x80808080, 0x00204060
1333 #endif
1336 result = (__vector __m64) vec_vbpermq ((__vector unsigned char) __A,
1337 (__vector unsigned char) perm_mask);
1339 #ifdef __LITTLE_ENDIAN__
1340 return result[1];
1341 #elif __BIG_ENDIAN__
1342 return result[0];
1343 #endif
1345 #endif /* _ARCH_PWR8 */
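/* Usage sketch for _mm_movemask_ps (not compiled; assumes _ARCH_PWR8 since
   the intrinsic above is only defined there; example_any_negative is an
   illustrative name).  Each result bit is the sign bit of one element.  */
#if 0
static int
example_any_negative (__m128 __v)
{
  /* Bits 0-3 hold the four sign bits; a nonzero value means at least one
     element is negative (or is negative zero, or a NaN with the sign bit
     set).  */
  return _mm_movemask_ps (__v) != 0;
}
#endif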
1347 /* Create a vector with all four elements equal to *P. */
1348 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1349 _mm_load1_ps (float const *__P)
1351 return _mm_set1_ps (*__P);
1354 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1355 _mm_load_ps1 (float const *__P)
1357 return _mm_load1_ps (__P);
1360 /* Extracts one of the four words of A. The selector N must be immediate. */
1361 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1362 _mm_extract_pi16 (__m64 const __A, int const __N)
1364 const int shiftr = (__N & 3) * 16;
1366 return ((__A >> shiftr) & 0xffff);
1369 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1370 _m_pextrw (__m64 const __A, int const __N)
1372 return _mm_extract_pi16 (__A, __N);
1375 /* Inserts word D into one of four words of A. The selector N must be
1376 immediate. */
1377 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1378 _mm_insert_pi16 (__m64 const __A, int const __D, int const __N)
1380 const int shiftl = (__N & 3) * 16;
1381 const __m64 shiftD = (const __m64) __D << shiftl;
1382 const __m64 mask = 0xffffUL << shiftl;
1383 __m64 result = (__A & (~mask)) | (shiftD & mask);
1385 return (result);
1388 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1389 _m_pinsrw (__m64 const __A, int const __D, int const __N)
1391 return _mm_insert_pi16 (__A, __D, __N);
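/* In the two routines above, element N of the __m64 value occupies bits
   16*N through 16*N+15, so extraction and insertion reduce to shifts and
   masks on the 64-bit scalar.  Round-trip sketch (not compiled;
   example_increment_word2 is an illustrative name):  */
#if 0
static __m64
example_increment_word2 (__m64 __a)
{
  /* Read 16-bit element 2, add one, and write it back.  */
  int __w = _mm_extract_pi16 (__a, 2);
  return _mm_insert_pi16 (__a, __w + 1, 2);
}
#endif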
1394 /* Compute the element-wise maximum of signed 16-bit values. */
1395 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1397 _mm_max_pi16 (__m64 __A, __m64 __B)
1399 #if _ARCH_PWR8
1400 __vector signed short a, b, r;
1401 __vector bool short c;
1403 a = (__vector signed short)vec_splats (__A);
1404 b = (__vector signed short)vec_splats (__B);
1405 c = (__vector bool short)vec_cmpgt (a, b);
1406 r = vec_sel (b, a, c);
1407 return (__builtin_unpack_vector_int128 ((__vector __int128_t)r, 0));
1408 #else
1409 __m64_union m1, m2, res;
1411 m1.as_m64 = __A;
1412 m2.as_m64 = __B;
1414 res.as_short[0] =
1415 (m1.as_short[0] > m2.as_short[0]) ? m1.as_short[0] : m2.as_short[0];
1416 res.as_short[1] =
1417 (m1.as_short[1] > m2.as_short[1]) ? m1.as_short[1] : m2.as_short[1];
1418 res.as_short[2] =
1419 (m1.as_short[2] > m2.as_short[2]) ? m1.as_short[2] : m2.as_short[2];
1420 res.as_short[3] =
1421 (m1.as_short[3] > m2.as_short[3]) ? m1.as_short[3] : m2.as_short[3];
1423 return (__m64) res.as_m64;
1424 #endif
1427 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1428 _m_pmaxsw (__m64 __A, __m64 __B)
1430 return _mm_max_pi16 (__A, __B);
1433 /* Compute the element-wise maximum of unsigned 8-bit values. */
1434 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1435 _mm_max_pu8 (__m64 __A, __m64 __B)
1437 #if _ARCH_PWR8
1438 __vector unsigned char a, b, r;
1439 __vector bool char c;
1441 a = (__vector unsigned char)vec_splats (__A);
1442 b = (__vector unsigned char)vec_splats (__B);
1443 c = (__vector bool char)vec_cmpgt (a, b);
1444 r = vec_sel (b, a, c);
1445 return (__builtin_unpack_vector_int128 ((__vector __int128_t)r, 0));
1446 #else
1447 __m64_union m1, m2, res;
1448 long i;
1450 m1.as_m64 = __A;
1451 m2.as_m64 = __B;
1454 for (i = 0; i < 8; i++)
1455 res.as_char[i] =
1456 ((unsigned char) m1.as_char[i] > (unsigned char) m2.as_char[i]) ?
1457 m1.as_char[i] : m2.as_char[i];
1459 return (__m64) res.as_m64;
1460 #endif
1463 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1464 _m_pmaxub (__m64 __A, __m64 __B)
1466 return _mm_max_pu8 (__A, __B);
1469 /* Compute the element-wise minimum of signed 16-bit values. */
1470 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1471 _mm_min_pi16 (__m64 __A, __m64 __B)
1473 #if _ARCH_PWR8
1474 __vector signed short a, b, r;
1475 __vector bool short c;
1477 a = (__vector signed short)vec_splats (__A);
1478 b = (__vector signed short)vec_splats (__B);
1479 c = (__vector bool short)vec_cmplt (a, b);
1480 r = vec_sel (b, a, c);
1481 return (__builtin_unpack_vector_int128 ((__vector __int128_t)r, 0));
1482 #else
1483 __m64_union m1, m2, res;
1485 m1.as_m64 = __A;
1486 m2.as_m64 = __B;
1488 res.as_short[0] =
1489 (m1.as_short[0] < m2.as_short[0]) ? m1.as_short[0] : m2.as_short[0];
1490 res.as_short[1] =
1491 (m1.as_short[1] < m2.as_short[1]) ? m1.as_short[1] : m2.as_short[1];
1492 res.as_short[2] =
1493 (m1.as_short[2] < m2.as_short[2]) ? m1.as_short[2] : m2.as_short[2];
1494 res.as_short[3] =
1495 (m1.as_short[3] < m2.as_short[3]) ? m1.as_short[3] : m2.as_short[3];
1497 return (__m64) res.as_m64;
1498 #endif
1501 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1502 _m_pminsw (__m64 __A, __m64 __B)
1504 return _mm_min_pi16 (__A, __B);
1507 /* Compute the element-wise minimum of unsigned 8-bit values. */
1508 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1509 _mm_min_pu8 (__m64 __A, __m64 __B)
1511 #if _ARCH_PWR8
1512 __vector unsigned char a, b, r;
1513 __vector bool char c;
1515 a = (__vector unsigned char)vec_splats (__A);
1516 b = (__vector unsigned char)vec_splats (__B);
1517 c = (__vector bool char)vec_cmplt (a, b);
1518 r = vec_sel (b, a, c);
1519 return (__builtin_unpack_vector_int128 ((__vector __int128_t)r, 0));
1520 #else
1521 __m64_union m1, m2, res;
1522 long i;
1524 m1.as_m64 = __A;
1525 m2.as_m64 = __B;
1528 for (i = 0; i < 8; i++)
1529 res.as_char[i] =
1530 ((unsigned char) m1.as_char[i] < (unsigned char) m2.as_char[i]) ?
1531 m1.as_char[i] : m2.as_char[i];
1533 return (__m64) res.as_m64;
1534 #endif
1537 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1538 _m_pminub (__m64 __A, __m64 __B)
1540 return _mm_min_pu8 (__A, __B);
1543 /* Create an 8-bit mask of the signs of 8-bit values. */
1544 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1545 _mm_movemask_pi8 (__m64 __A)
1547 unsigned long p = 0x0008101820283038UL; // permute control for sign bits
1549 return __builtin_bpermd (p, __A);
1552 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1553 _m_pmovmskb (__m64 __A)
1555 return _mm_movemask_pi8 (__A);
1558 /* Multiply four unsigned 16-bit values in A by four unsigned 16-bit values
1559 in B and produce the high 16 bits of the 32-bit results. */
1560 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1561 _mm_mulhi_pu16 (__m64 __A, __m64 __B)
1563 __vector unsigned short a, b;
1564 __vector unsigned short c;
1565 __vector unsigned int w0, w1;
1566 __vector unsigned char xform1 = {
1567 0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17,
1568 0x0A, 0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
1571 a = (__vector unsigned short)vec_splats (__A);
1572 b = (__vector unsigned short)vec_splats (__B);
1574 w0 = vec_vmuleuh (a, b);
1575 w1 = vec_vmulouh (a, b);
1576 c = (__vector unsigned short)vec_perm (w0, w1, xform1);
1578 return (__builtin_unpack_vector_int128 ((__vector __int128)c, 0));
1581 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1582 _m_pmulhuw (__m64 __A, __m64 __B)
1584 return _mm_mulhi_pu16 (__A, __B);
1587 /* Return a combination of the four 16-bit values in A. The selector
1588 must be an immediate. */
1589 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1590 _mm_shuffle_pi16 (__m64 __A, int const __N)
1592 unsigned long element_selector_10 = __N & 0x03;
1593 unsigned long element_selector_32 = (__N >> 2) & 0x03;
1594 unsigned long element_selector_54 = (__N >> 4) & 0x03;
1595 unsigned long element_selector_76 = (__N >> 6) & 0x03;
1596 static const unsigned short permute_selectors[4] =
1598 #ifdef __LITTLE_ENDIAN__
1599 0x0908, 0x0B0A, 0x0D0C, 0x0F0E
1600 #elif __BIG_ENDIAN__
1601 0x0607, 0x0405, 0x0203, 0x0001
1602 #endif
1604 __m64_union t;
1605 __vector __m64 a, p, r;
1607 #ifdef __LITTLE_ENDIAN__
1608 t.as_short[0] = permute_selectors[element_selector_10];
1609 t.as_short[1] = permute_selectors[element_selector_32];
1610 t.as_short[2] = permute_selectors[element_selector_54];
1611 t.as_short[3] = permute_selectors[element_selector_76];
1612 #elif __BIG_ENDIAN__
1613 t.as_short[3] = permute_selectors[element_selector_10];
1614 t.as_short[2] = permute_selectors[element_selector_32];
1615 t.as_short[1] = permute_selectors[element_selector_54];
1616 t.as_short[0] = permute_selectors[element_selector_76];
1617 #endif
1618 p = vec_splats (t.as_m64);
1619 a = vec_splats (__A);
1620 r = vec_perm (a, a, (__vector unsigned char)p);
1621 return (__builtin_unpack_vector_int128 ((__vector __int128)r, 0));
1624 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1625 _m_pshufw (__m64 __A, int const __N)
1627 return _mm_shuffle_pi16 (__A, __N);
1630 /* Conditionally store byte elements of A into P. The high bit of each
1631 byte in the selector N determines whether the corresponding byte from
1632 A is stored. */
1633 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1634 _mm_maskmove_si64 (__m64 __A, __m64 __N, char *__P)
1636 __m64 hibit = 0x8080808080808080UL;
1637 __m64 mask, tmp;
1638 __m64 *p = (__m64*)__P;
1640 tmp = *p;
1641 mask = _mm_cmpeq_pi8 ((__N & hibit), hibit);
1642 tmp = (tmp & (~mask)) | (__A & mask);
1643 *p = tmp;
1646 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1647 _m_maskmovq (__m64 __A, __m64 __N, char *__P)
1649 _mm_maskmove_si64 (__A, __N, __P);
1652 /* Compute the rounded averages of the unsigned 8-bit values in A and B. */
1653 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1654 _mm_avg_pu8 (__m64 __A, __m64 __B)
1656 __vector unsigned char a, b, c;
1658 a = (__vector unsigned char)vec_splats (__A);
1659 b = (__vector unsigned char)vec_splats (__B);
1660 c = vec_avg (a, b);
1661 return (__builtin_unpack_vector_int128 ((__vector __int128)c, 0));
1664 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1665 _m_pavgb (__m64 __A, __m64 __B)
1667 return _mm_avg_pu8 (__A, __B);
1670 /* Compute the rounded averages of the unsigned 16-bit values in A and B. */
1671 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1672 _mm_avg_pu16 (__m64 __A, __m64 __B)
1674 __vector unsigned short a, b, c;
1676 a = (__vector unsigned short)vec_splats (__A);
1677 b = (__vector unsigned short)vec_splats (__B);
1678 c = vec_avg (a, b);
1679 return (__builtin_unpack_vector_int128 ((__vector __int128)c, 0));
1682 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1683 _m_pavgw (__m64 __A, __m64 __B)
1685 return _mm_avg_pu16 (__A, __B);
1688 /* Compute the sum of the absolute differences of the unsigned 8-bit
1689 values in A and B. Return the value in the lower 16-bit word; the
1690 upper words are cleared. */
1691 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1692 _mm_sad_pu8 (__m64 __A, __m64 __B)
1694 __vector unsigned char a, b;
1695 __vector unsigned char vmin, vmax, vabsdiff;
1696 __vector signed int vsum;
1697 const __vector unsigned int zero =
1698 { 0, 0, 0, 0 };
1699 unsigned short result;
1701 a = (__vector unsigned char) __builtin_pack_vector_int128 (0UL, __A);
1702 b = (__vector unsigned char) __builtin_pack_vector_int128 (0UL, __B);
1703 vmin = vec_min (a, b);
1704 vmax = vec_max (a, b);
1705 vabsdiff = vec_sub (vmax, vmin);
1706 /* Sum four groups of bytes into integers. */
1707 vsum = (__vector signed int) vec_sum4s (vabsdiff, zero);
1708 /* Sum across four integers with integer result. */
1709 vsum = vec_sums (vsum, (__vector signed int) zero);
1710 /* The sum is in the rightmost 32 bits of the vector result.
1711 Transfer to a GPR and truncate to 16 bits. */
1712 result = vsum[3];
1713 return (result);
1716 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1717 _m_psadbw (__m64 __A, __m64 __B)
1719 return _mm_sad_pu8 (__A, __B);
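/* Usage sketch for _mm_sad_pu8 (not compiled): the sum of the eight byte
   absolute differences lands in the low 16 bits of the result, which makes
   it convenient for block-matching style metrics.  The name
   example_block_distance is illustrative only.  */
#if 0
static unsigned int
example_block_distance (const unsigned char *__x, const unsigned char *__y)
{
  __m64 __a, __b;

  /* Copy two 8-byte blocks into __m64 values and accumulate the sum of
     absolute byte differences.  */
  __builtin_memcpy (&__a, __x, sizeof (__a));
  __builtin_memcpy (&__b, __y, sizeof (__b));
  return (unsigned int) _mm_sad_pu8 (__a, __b);
}
#endif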
1722 /* Stores the data in A to the address P without polluting the caches. */
1723 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1724 _mm_stream_pi (__m64 *__P, __m64 __A)
1726 /* Use the data cache block touch for store transient. */
1727 __asm__ (
1728 " dcbtstt 0,%0"
1730 : "b" (__P)
1731 : "memory"
1733 *__P = __A;
1736 /* Likewise. The address must be 16-byte aligned. */
1737 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1738 _mm_stream_ps (float *__P, __m128 __A)
1740 /* Use the data cache block touch for store transient. */
1741 __asm__ (
1742 " dcbtstt 0,%0"
1744 : "b" (__P)
1745 : "memory"
1747 _mm_store_ps (__P, __A);
1750 /* Guarantees that every preceding store is globally visible before
1751 any subsequent store. */
1752 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1753 _mm_sfence (void)
1755 /* Generate a lightweight sync. */
1756 __atomic_thread_fence (__ATOMIC_RELEASE);
1759 /* The execution of the next instruction is delayed by an implementation
1760 specific amount of time. The instruction does not modify the
1761 architectural state. This is after the pop_options pragma because
1762 it does not require SSE support in the processor--the encoding is a
1763 nop on processors that do not support it. */
1764 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1765 _mm_pause (void)
1767 /* There is no exact match with this construct, but the following is
1768 close to the desired effect. */
1769 #if _ARCH_PWR8
1770 /* On power8 and later processors we can depend on Program Priority
1771 (PRI) and the associated "very low" PRI setting. Since we don't know
1772 what PRI this thread is running at, we: 1) save the current PRI
1773 from the PPR SPR into a local GPR, 2) set the PRI to "very low"
1774 via the special or 31,31,31 encoding, and 3) issue an "isync" to
1775 ensure the PRI change takes effect before we execute any more
1776 instructions.
1777 Now we can execute a lwsync (release barrier) while we execute
1778 this thread at "very low" PRI. Finally we restore the original
1779 PRI and continue execution. */
1780 unsigned long __PPR;
1782 __asm__ volatile (
1783 " mfppr %0;"
1784 " or 31,31,31;"
1785 " isync;"
1786 " lwsync;"
1787 " isync;"
1788 " mtppr %0;"
1789 : "=r" (__PPR)
1791 : "memory"
1793 #else
1794 /* For older processor where we may not even have Program Priority
1795 controls we can only depend on Heavy Weight Sync. */
1796 __atomic_thread_fence (__ATOMIC_SEQ_CST);
1797 #endif
1800 /* Transpose the 4x4 matrix composed of row[0-3]. */
1801 #define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
1802 do { \
1803 __v4sf __r0 = (row0), __r1 = (row1), __r2 = (row2), __r3 = (row3); \
1804 __v4sf __t0 = vec_vmrghw (__r0, __r1); \
1805 __v4sf __t1 = vec_vmrghw (__r2, __r3); \
1806 __v4sf __t2 = vec_vmrglw (__r0, __r1); \
1807 __v4sf __t3 = vec_vmrglw (__r2, __r3); \
1808 (row0) = (__v4sf)vec_mergeh ((__vector long long)__t0, \
1809 (__vector long long)__t1); \
1810 (row1) = (__v4sf)vec_mergel ((__vector long long)__t0, \
1811 (__vector long long)__t1); \
1812 (row2) = (__v4sf)vec_mergeh ((__vector long long)__t2, \
1813 (__vector long long)__t3); \
1814 (row3) = (__v4sf)vec_mergel ((__vector long long)__t2, \
1815 (__vector long long)__t3); \
1816 } while (0)
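/* Usage sketch for _MM_TRANSPOSE4_PS (not compiled): transpose a 4x4 float
   matrix held in memory, one row per __m128.  The name example_transpose
   is illustrative only.  */
#if 0
static void
example_transpose (float __mat[4][4])
{
  __m128 __v0 = _mm_loadu_ps (__mat[0]);
  __m128 __v1 = _mm_loadu_ps (__mat[1]);
  __m128 __v2 = _mm_loadu_ps (__mat[2]);
  __m128 __v3 = _mm_loadu_ps (__mat[3]);

  /* After the macro, __v0..__v3 hold the columns of the original rows.  */
  _MM_TRANSPOSE4_PS (__v0, __v1, __v2, __v3);

  _mm_storeu_ps (__mat[0], __v0);
  _mm_storeu_ps (__mat[1], __v1);
  _mm_storeu_ps (__mat[2], __v2);
  _mm_storeu_ps (__mat[3], __v3);
}
#endif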
1818 /* For backward source compatibility. */
1819 //# include <emmintrin.h>
1821 #endif /* _XMMINTRIN_H_INCLUDED */