/* Copyright (C) 2002-2018 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */
/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 9.0.  */

#ifndef NO_WARN_X86_INTRINSICS
/* This header is distributed to simplify porting x86_64 code that
   makes explicit use of Intel intrinsics to powerpc64le.
   It is the user's responsibility to determine if the results are
   acceptable and make additional changes as necessary.
   Note that much code that uses Intel intrinsics can be rewritten in
   standard C or GNU C extensions, which are more portable and better
   optimized across multiple targets.

   In the specific case of X86 MMX (__m64) intrinsics, the PowerPC
   target does not support a native __vector_size__ (8) type.  Instead
   we typedef __m64 to a 64-bit unsigned long long, which is natively
   supported in 64-bit mode.  This works well for the _si64 and some
   _pi32 operations, but starts to generate long sequences for _pi16
   and _pi8 operations.  For those cases it is better (faster and
   smaller code) to transfer __m64 data to the PowerPC vector 128-bit
   unit, perform the operation, and then transfer the result back to
   the __m64 type.  This implies that the direct register move
   instructions, introduced with power8, are available for efficient
   implementation of these transfers.

   Most MMX intrinsic operations can be performed efficiently as
   C language 64-bit scalar operations or optimized to use the newer
   128-bit SSE/Altivec operations.  We recommend this for new
   applications.  */
#error "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error."
#endif
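
/* A minimal sketch of the rewrite recommended above (not part of the
   original header; the type and function name are illustrative only):
   the same packed-byte addition that _mm_add_pi8 provides, expressed with
   the GNU C vector extension at the full 128-bit Altivec/VSX width, which
   GCC can optimize directly for the target:

     typedef unsigned char uv16qi __attribute__ ((vector_size (16)));

     static inline uv16qi
     add_bytes (uv16qi a, uv16qi b)
     {
       return a + b;   // element-wise add, lowered to a single vector add
     }
 */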

#ifndef _MMINTRIN_H_INCLUDED
#define _MMINTRIN_H_INCLUDED

#include <altivec.h>
/* The Intel API is flexible enough that we must allow aliasing with other
   vector types, and their scalar components.  */
typedef __attribute__ ((__aligned__ (8))) unsigned long long __m64;

typedef __attribute__ ((__aligned__ (8)))
union
  {
    __m64 as_m64;
    char as_char[8];
    signed char as_signed_char [8];
    short as_short[4];
    int as_int[2];
    long long as_long_long;
    float as_float[2];
    double as_double;
  } __m64_union;
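
/* Illustrative note (assumes the little-endian powerpc64le layout this
   header targets, not part of the original header): as_char[0] of the
   union above is the least significant byte of as_m64, so union element 0
   corresponds to MMX element 0, e.g.

     __m64_union u;
     u.as_m64 = 0x0706050403020100ULL;
     // u.as_char[0] == 0x00 and u.as_short[3] == 0x0706
 */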

/* Empty the multimedia state.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_empty (void)
{
  /* nothing to do on PowerPC.  */
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_empty (void)
{
  /* nothing to do on PowerPC.  */
}

/* Convert I to a __m64 object.  The integer is zero-extended to 64-bits.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi32_si64 (int __i)
{
  return (__m64) (unsigned int) __i;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_from_int (int __i)
{
  return _mm_cvtsi32_si64 (__i);
}

/* Convert the lower 32 bits of the __m64 object into an integer.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64_si32 (__m64 __i)
{
  return ((int) __i);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_to_int (__m64 __i)
{
  return _mm_cvtsi64_si32 (__i);
}

/* Convert I to a __m64 object.  */

/* Intel intrinsic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_from_int64 (long long __i)
{
  return (__m64) __i;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64_m64 (long long __i)
{
  return (__m64) __i;
}

/* Microsoft intrinsic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64x_si64 (long long __i)
{
  return (__m64) __i;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pi64x (long long __i)
{
  return (__m64) __i;
}

/* Convert the __m64 object to a 64bit integer.  */

/* Intel intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_to_int64 (__m64 __i)
{
  return (long long)__i;
}

extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtm64_si64 (__m64 __i)
{
  return (long long) __i;
}

/* Microsoft intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64_si64x (__m64 __i)
{
  return (long long) __i;
}
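
/* Usage sketch (illustrative, not part of the original header): because
   __m64 is a 64-bit integer type here, the 64-bit conversions above are
   plain register moves, e.g.

     __m64 m = _mm_cvtsi64_m64 (0x0001000200030004LL);
     long long x = _mm_cvtm64_si64 (m);
     // x == 0x0001000200030004LL
 */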

#ifdef _ARCH_PWR8
/* Pack the four 16-bit values from M1 into the lower four 8-bit values of
   the result, and the four 16-bit values from M2 into the upper four 8-bit
   values of the result, all with signed saturation.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packs_pi16 (__m64 __m1, __m64 __m2)
{
  __vector signed short vm1;
  __vector signed char vresult;

  vm1 = (__vector signed short) (__vector unsigned long long) { __m2, __m1 };
  vresult = vec_vpkshss (vm1, vm1);
  return (__m64) ((__vector long long) vresult)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_packsswb (__m64 __m1, __m64 __m2)
{
  return _mm_packs_pi16 (__m1, __m2);
}

/* Pack the two 32-bit values from M1 into the lower two 16-bit values of
   the result, and the two 32-bit values from M2 into the upper two 16-bit
   values of the result, all with signed saturation.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packs_pi32 (__m64 __m1, __m64 __m2)
{
  __vector signed int vm1;
  __vector signed short vresult;

  vm1 = (__vector signed int) (__vector unsigned long long) { __m2, __m1 };
  vresult = vec_vpkswss (vm1, vm1);
  return (__m64) ((__vector long long) vresult)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_packssdw (__m64 __m1, __m64 __m2)
{
  return _mm_packs_pi32 (__m1, __m2);
}

/* Pack the four 16-bit values from M1 into the lower four 8-bit values of
   the result, and the four 16-bit values from M2 into the upper four 8-bit
   values of the result, all with unsigned saturation.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packs_pu16 (__m64 __m1, __m64 __m2)
{
  __vector signed short vm1;
  __vector unsigned char vresult;

  vm1 = (__vector signed short) (__vector unsigned long long) { __m2, __m1 };
  vresult = vec_vpkshus (vm1, vm1);
  return (__m64) ((__vector long long) vresult)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_packuswb (__m64 __m1, __m64 __m2)
{
  return _mm_packs_pu16 (__m1, __m2);
}
#endif /* end ARCH_PWR8 */
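
/* Worked example (illustrative, not part of the original header): with the
   element ordering described above (element 0 is least significant),
   _mm_packs_pi16 saturates each 16-bit lane to a signed 8-bit lane, __m1
   supplying the low four result bytes and __m2 the high four:

     __m64 a = _mm_set_pi16 (300, -300, 5, -5);
     __m64 b = _mm_set_pi16 (1, 2, 3, 4);
     __m64 r = _mm_packs_pi16 (a, b);
     // r == _mm_set_pi8 (1, 2, 3, 4, 127, -128, 5, -5)
 */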

/* Interleave the four 8-bit values from the high half of M1 with the four
   8-bit values from the high half of M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_pi8 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector unsigned char a, b, c;

  a = (__vector unsigned char)vec_splats (__m1);
  b = (__vector unsigned char)vec_splats (__m2);
  c = vec_mergel (a, b);
  return (__m64) ((__vector long long) c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_char[0] = m1.as_char[4];
  res.as_char[1] = m2.as_char[4];
  res.as_char[2] = m1.as_char[5];
  res.as_char[3] = m2.as_char[5];
  res.as_char[4] = m1.as_char[6];
  res.as_char[5] = m2.as_char[6];
  res.as_char[6] = m1.as_char[7];
  res.as_char[7] = m2.as_char[7];

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpckhbw (__m64 __m1, __m64 __m2)
{
  return _mm_unpackhi_pi8 (__m1, __m2);
}

/* Interleave the two 16-bit values from the high half of M1 with the two
   16-bit values from the high half of M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_pi16 (__m64 __m1, __m64 __m2)
{
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_short[0] = m1.as_short[2];
  res.as_short[1] = m2.as_short[2];
  res.as_short[2] = m1.as_short[3];
  res.as_short[3] = m2.as_short[3];

  return (__m64) res.as_m64;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpckhwd (__m64 __m1, __m64 __m2)
{
  return _mm_unpackhi_pi16 (__m1, __m2);
}

/* Interleave the 32-bit value from the high half of M1 with the 32-bit
   value from the high half of M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_pi32 (__m64 __m1, __m64 __m2)
{
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_int[0] = m1.as_int[1];
  res.as_int[1] = m2.as_int[1];

  return (__m64) res.as_m64;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpckhdq (__m64 __m1, __m64 __m2)
{
  return _mm_unpackhi_pi32 (__m1, __m2);
}

/* Interleave the four 8-bit values from the low half of M1 with the four
   8-bit values from the low half of M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_pi8 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector unsigned char a, b, c;

  a = (__vector unsigned char)vec_splats (__m1);
  b = (__vector unsigned char)vec_splats (__m2);
  c = vec_mergel (a, b);
  return (__m64) ((__vector long long) c)[1];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_char[0] = m1.as_char[0];
  res.as_char[1] = m2.as_char[0];
  res.as_char[2] = m1.as_char[1];
  res.as_char[3] = m2.as_char[1];
  res.as_char[4] = m1.as_char[2];
  res.as_char[5] = m2.as_char[2];
  res.as_char[6] = m1.as_char[3];
  res.as_char[7] = m2.as_char[3];

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpcklbw (__m64 __m1, __m64 __m2)
{
  return _mm_unpacklo_pi8 (__m1, __m2);
}

/* Interleave the two 16-bit values from the low half of M1 with the two
   16-bit values from the low half of M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_pi16 (__m64 __m1, __m64 __m2)
{
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_short[0] = m1.as_short[0];
  res.as_short[1] = m2.as_short[0];
  res.as_short[2] = m1.as_short[1];
  res.as_short[3] = m2.as_short[1];

  return (__m64) res.as_m64;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpcklwd (__m64 __m1, __m64 __m2)
{
  return _mm_unpacklo_pi16 (__m1, __m2);
}

/* Interleave the 32-bit value from the low half of M1 with the 32-bit
   value from the low half of M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_pi32 (__m64 __m1, __m64 __m2)
{
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_int[0] = m1.as_int[0];
  res.as_int[1] = m2.as_int[0];

  return (__m64) res.as_m64;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpckldq (__m64 __m1, __m64 __m2)
{
  return _mm_unpacklo_pi32 (__m1, __m2);
}

/* Add the 8-bit values in M1 to the 8-bit values in M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_pi8 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector signed char a, b, c;

  a = (__vector signed char)vec_splats (__m1);
  b = (__vector signed char)vec_splats (__m2);
  c = vec_add (a, b);
  return (__m64) ((__vector long long) c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_char[0] = m1.as_char[0] + m2.as_char[0];
  res.as_char[1] = m1.as_char[1] + m2.as_char[1];
  res.as_char[2] = m1.as_char[2] + m2.as_char[2];
  res.as_char[3] = m1.as_char[3] + m2.as_char[3];
  res.as_char[4] = m1.as_char[4] + m2.as_char[4];
  res.as_char[5] = m1.as_char[5] + m2.as_char[5];
  res.as_char[6] = m1.as_char[6] + m2.as_char[6];
  res.as_char[7] = m1.as_char[7] + m2.as_char[7];

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddb (__m64 __m1, __m64 __m2)
{
  return _mm_add_pi8 (__m1, __m2);
}

/* Add the 16-bit values in M1 to the 16-bit values in M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_pi16 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector signed short a, b, c;

  a = (__vector signed short)vec_splats (__m1);
  b = (__vector signed short)vec_splats (__m2);
  c = vec_add (a, b);
  return (__m64) ((__vector long long) c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_short[0] = m1.as_short[0] + m2.as_short[0];
  res.as_short[1] = m1.as_short[1] + m2.as_short[1];
  res.as_short[2] = m1.as_short[2] + m2.as_short[2];
  res.as_short[3] = m1.as_short[3] + m2.as_short[3];

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddw (__m64 __m1, __m64 __m2)
{
  return _mm_add_pi16 (__m1, __m2);
}

/* Add the 32-bit values in M1 to the 32-bit values in M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_pi32 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR9
  __vector signed int a, b, c;

  a = (__vector signed int)vec_splats (__m1);
  b = (__vector signed int)vec_splats (__m2);
  c = vec_add (a, b);
  return (__m64) ((__vector long long) c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_int[0] = m1.as_int[0] + m2.as_int[0];
  res.as_int[1] = m1.as_int[1] + m2.as_int[1];

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddd (__m64 __m1, __m64 __m2)
{
  return _mm_add_pi32 (__m1, __m2);
}

/* Subtract the 8-bit values in M2 from the 8-bit values in M1.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_pi8 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector signed char a, b, c;

  a = (__vector signed char)vec_splats (__m1);
  b = (__vector signed char)vec_splats (__m2);
  c = vec_sub (a, b);
  return (__m64) ((__vector long long) c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_char[0] = m1.as_char[0] - m2.as_char[0];
  res.as_char[1] = m1.as_char[1] - m2.as_char[1];
  res.as_char[2] = m1.as_char[2] - m2.as_char[2];
  res.as_char[3] = m1.as_char[3] - m2.as_char[3];
  res.as_char[4] = m1.as_char[4] - m2.as_char[4];
  res.as_char[5] = m1.as_char[5] - m2.as_char[5];
  res.as_char[6] = m1.as_char[6] - m2.as_char[6];
  res.as_char[7] = m1.as_char[7] - m2.as_char[7];

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubb (__m64 __m1, __m64 __m2)
{
  return _mm_sub_pi8 (__m1, __m2);
}

/* Subtract the 16-bit values in M2 from the 16-bit values in M1.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_pi16 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector signed short a, b, c;

  a = (__vector signed short)vec_splats (__m1);
  b = (__vector signed short)vec_splats (__m2);
  c = vec_sub (a, b);
  return (__m64) ((__vector long long) c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_short[0] = m1.as_short[0] - m2.as_short[0];
  res.as_short[1] = m1.as_short[1] - m2.as_short[1];
  res.as_short[2] = m1.as_short[2] - m2.as_short[2];
  res.as_short[3] = m1.as_short[3] - m2.as_short[3];

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubw (__m64 __m1, __m64 __m2)
{
  return _mm_sub_pi16 (__m1, __m2);
}

/* Subtract the 32-bit values in M2 from the 32-bit values in M1.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_pi32 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR9
  __vector signed int a, b, c;

  a = (__vector signed int)vec_splats (__m1);
  b = (__vector signed int)vec_splats (__m2);
  c = vec_sub (a, b);
  return (__m64) ((__vector long long) c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_int[0] = m1.as_int[0] - m2.as_int[0];
  res.as_int[1] = m1.as_int[1] - m2.as_int[1];

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubd (__m64 __m1, __m64 __m2)
{
  return _mm_sub_pi32 (__m1, __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_si64 (__m64 __m1, __m64 __m2)
{
  return (__m1 + __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_si64 (__m64 __m1, __m64 __m2)
{
  return (__m1 - __m2);
}

/* Shift the 64-bit value in M left by COUNT.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sll_si64 (__m64 __m, __m64 __count)
{
  return (__m << __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psllq (__m64 __m, __m64 __count)
{
  return _mm_sll_si64 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_si64 (__m64 __m, const int __count)
{
  return (__m << __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psllqi (__m64 __m, const int __count)
{
  return _mm_slli_si64 (__m, __count);
}

/* Shift the 64-bit value in M right by COUNT; shift in zeros.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srl_si64 (__m64 __m, __m64 __count)
{
  return (__m >> __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrlq (__m64 __m, __m64 __count)
{
  return _mm_srl_si64 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_si64 (__m64 __m, const int __count)
{
  return (__m >> __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrlqi (__m64 __m, const int __count)
{
  return _mm_srli_si64 (__m, __count);
}
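
/* Usage sketch (illustrative, not part of the original header): the 64-bit
   shifts above are ordinary scalar shifts of the underlying unsigned
   long long, e.g.

     __m64 m = _mm_cvtsi64_m64 (0xFFLL);
     __m64 r = _mm_slli_si64 (m, 8);
     // r holds 0xFF00
 */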

/* Bit-wise AND the 64-bit values in M1 and M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_and_si64 (__m64 __m1, __m64 __m2)
{
  return (__m1 & __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pand (__m64 __m1, __m64 __m2)
{
  return _mm_and_si64 (__m1, __m2);
}

/* Bit-wise complement the 64-bit value in M1 and bit-wise AND it with the
   64-bit value in M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_andnot_si64 (__m64 __m1, __m64 __m2)
{
  return (~__m1 & __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pandn (__m64 __m1, __m64 __m2)
{
  return _mm_andnot_si64 (__m1, __m2);
}

/* Bit-wise inclusive OR the 64-bit values in M1 and M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_or_si64 (__m64 __m1, __m64 __m2)
{
  return (__m1 | __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_por (__m64 __m1, __m64 __m2)
{
  return _mm_or_si64 (__m1, __m2);
}

/* Bit-wise exclusive OR the 64-bit values in M1 and M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_xor_si64 (__m64 __m1, __m64 __m2)
{
  return (__m1 ^ __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pxor (__m64 __m1, __m64 __m2)
{
  return _mm_xor_si64 (__m1, __m2);
}

/* Creates a 64-bit zero.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setzero_si64 (void)
{
  return (__m64) 0;
}

/* Compare eight 8-bit values.  The result of the comparison is 0xFF if the
   test is true and zero if false.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_pi8 (__m64 __m1, __m64 __m2)
{
#ifdef _ARCH_PWR6
  __m64 res;
  __asm__(
      "cmpb %0,%1,%2;\n"
      : "=r" (res)
      : "r" (__m1),
        "r" (__m2)
      : );
  return (res);
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_char[0] = (m1.as_char[0] == m2.as_char[0])? -1: 0;
  res.as_char[1] = (m1.as_char[1] == m2.as_char[1])? -1: 0;
  res.as_char[2] = (m1.as_char[2] == m2.as_char[2])? -1: 0;
  res.as_char[3] = (m1.as_char[3] == m2.as_char[3])? -1: 0;
  res.as_char[4] = (m1.as_char[4] == m2.as_char[4])? -1: 0;
  res.as_char[5] = (m1.as_char[5] == m2.as_char[5])? -1: 0;
  res.as_char[6] = (m1.as_char[6] == m2.as_char[6])? -1: 0;
  res.as_char[7] = (m1.as_char[7] == m2.as_char[7])? -1: 0;

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpeqb (__m64 __m1, __m64 __m2)
{
  return _mm_cmpeq_pi8 (__m1, __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_pi8 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector signed char a, b, c;

  a = (__vector signed char)vec_splats (__m1);
  b = (__vector signed char)vec_splats (__m2);
  c = (__vector signed char)vec_cmpgt (a, b);
  return (__m64) ((__vector long long) c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_char[0] = (m1.as_char[0] > m2.as_char[0])? -1: 0;
  res.as_char[1] = (m1.as_char[1] > m2.as_char[1])? -1: 0;
  res.as_char[2] = (m1.as_char[2] > m2.as_char[2])? -1: 0;
  res.as_char[3] = (m1.as_char[3] > m2.as_char[3])? -1: 0;
  res.as_char[4] = (m1.as_char[4] > m2.as_char[4])? -1: 0;
  res.as_char[5] = (m1.as_char[5] > m2.as_char[5])? -1: 0;
  res.as_char[6] = (m1.as_char[6] > m2.as_char[6])? -1: 0;
  res.as_char[7] = (m1.as_char[7] > m2.as_char[7])? -1: 0;

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpgtb (__m64 __m1, __m64 __m2)
{
  return _mm_cmpgt_pi8 (__m1, __m2);
}

/* Compare four 16-bit values.  The result of the comparison is 0xFFFF if
   the test is true and zero if false.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_pi16 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector signed short a, b, c;

  a = (__vector signed short)vec_splats (__m1);
  b = (__vector signed short)vec_splats (__m2);
  c = (__vector signed short)vec_cmpeq (a, b);
  return (__m64) ((__vector long long) c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_short[0] = (m1.as_short[0] == m2.as_short[0])? -1: 0;
  res.as_short[1] = (m1.as_short[1] == m2.as_short[1])? -1: 0;
  res.as_short[2] = (m1.as_short[2] == m2.as_short[2])? -1: 0;
  res.as_short[3] = (m1.as_short[3] == m2.as_short[3])? -1: 0;

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpeqw (__m64 __m1, __m64 __m2)
{
  return _mm_cmpeq_pi16 (__m1, __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_pi16 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector signed short a, b, c;

  a = (__vector signed short)vec_splats (__m1);
  b = (__vector signed short)vec_splats (__m2);
  c = (__vector signed short)vec_cmpgt (a, b);
  return (__m64) ((__vector long long) c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_short[0] = (m1.as_short[0] > m2.as_short[0])? -1: 0;
  res.as_short[1] = (m1.as_short[1] > m2.as_short[1])? -1: 0;
  res.as_short[2] = (m1.as_short[2] > m2.as_short[2])? -1: 0;
  res.as_short[3] = (m1.as_short[3] > m2.as_short[3])? -1: 0;

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpgtw (__m64 __m1, __m64 __m2)
{
  return _mm_cmpgt_pi16 (__m1, __m2);
}

/* Compare two 32-bit values.  The result of the comparison is 0xFFFFFFFF if
   the test is true and zero if false.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_pi32 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR9
  __vector signed int a, b, c;

  a = (__vector signed int)vec_splats (__m1);
  b = (__vector signed int)vec_splats (__m2);
  c = (__vector signed int)vec_cmpeq (a, b);
  return (__m64) ((__vector long long) c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_int[0] = (m1.as_int[0] == m2.as_int[0])? -1: 0;
  res.as_int[1] = (m1.as_int[1] == m2.as_int[1])? -1: 0;

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpeqd (__m64 __m1, __m64 __m2)
{
  return _mm_cmpeq_pi32 (__m1, __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_pi32 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR9
  __vector signed int a, b, c;

  a = (__vector signed int)vec_splats (__m1);
  b = (__vector signed int)vec_splats (__m2);
  c = (__vector signed int)vec_cmpgt (a, b);
  return (__m64) ((__vector long long) c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_int[0] = (m1.as_int[0] > m2.as_int[0])? -1: 0;
  res.as_int[1] = (m1.as_int[1] > m2.as_int[1])? -1: 0;

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpgtd (__m64 __m1, __m64 __m2)
{
  return _mm_cmpgt_pi32 (__m1, __m2);
}
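
/* Worked example (illustrative, not part of the original header): each lane
   of a compare result is all ones when the test is true and zero otherwise:

     __m64 a = _mm_set_pi8 (1, 2, 3, 4, 5, 6, 7, 8);
     __m64 b = _mm_set_pi8 (1, 0, 3, 0, 5, 0, 7, 0);
     __m64 r = _mm_cmpeq_pi8 (a, b);
     // r == _mm_set_pi8 (-1, 0, -1, 0, -1, 0, -1, 0)
 */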

#if _ARCH_PWR8
/* Add the 8-bit values in M1 to the 8-bit values in M2 using signed
   saturated arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_pi8 (__m64 __m1, __m64 __m2)
{
  __vector signed char a, b, c;

  a = (__vector signed char)vec_splats (__m1);
  b = (__vector signed char)vec_splats (__m2);
  c = vec_adds (a, b);
  return (__m64) ((__vector long long) c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddsb (__m64 __m1, __m64 __m2)
{
  return _mm_adds_pi8 (__m1, __m2);
}

/* Add the 16-bit values in M1 to the 16-bit values in M2 using signed
   saturated arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_pi16 (__m64 __m1, __m64 __m2)
{
  __vector signed short a, b, c;

  a = (__vector signed short)vec_splats (__m1);
  b = (__vector signed short)vec_splats (__m2);
  c = vec_adds (a, b);
  return (__m64) ((__vector long long) c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddsw (__m64 __m1, __m64 __m2)
{
  return _mm_adds_pi16 (__m1, __m2);
}

/* Add the 8-bit values in M1 to the 8-bit values in M2 using unsigned
   saturated arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_pu8 (__m64 __m1, __m64 __m2)
{
  __vector unsigned char a, b, c;

  a = (__vector unsigned char)vec_splats (__m1);
  b = (__vector unsigned char)vec_splats (__m2);
  c = vec_adds (a, b);
  return (__m64) ((__vector long long) c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddusb (__m64 __m1, __m64 __m2)
{
  return _mm_adds_pu8 (__m1, __m2);
}

/* Add the 16-bit values in M1 to the 16-bit values in M2 using unsigned
   saturated arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_pu16 (__m64 __m1, __m64 __m2)
{
  __vector unsigned short a, b, c;

  a = (__vector unsigned short)vec_splats (__m1);
  b = (__vector unsigned short)vec_splats (__m2);
  c = vec_adds (a, b);
  return (__m64) ((__vector long long) c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddusw (__m64 __m1, __m64 __m2)
{
  return _mm_adds_pu16 (__m1, __m2);
}

/* Subtract the 8-bit values in M2 from the 8-bit values in M1 using signed
   saturating arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_pi8 (__m64 __m1, __m64 __m2)
{
  __vector signed char a, b, c;

  a = (__vector signed char)vec_splats (__m1);
  b = (__vector signed char)vec_splats (__m2);
  c = vec_subs (a, b);
  return (__m64) ((__vector long long) c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubsb (__m64 __m1, __m64 __m2)
{
  return _mm_subs_pi8 (__m1, __m2);
}

/* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
   signed saturating arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_pi16 (__m64 __m1, __m64 __m2)
{
  __vector signed short a, b, c;

  a = (__vector signed short)vec_splats (__m1);
  b = (__vector signed short)vec_splats (__m2);
  c = vec_subs (a, b);
  return (__m64) ((__vector long long) c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubsw (__m64 __m1, __m64 __m2)
{
  return _mm_subs_pi16 (__m1, __m2);
}

/* Subtract the 8-bit values in M2 from the 8-bit values in M1 using
   unsigned saturating arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_pu8 (__m64 __m1, __m64 __m2)
{
  __vector unsigned char a, b, c;

  a = (__vector unsigned char)vec_splats (__m1);
  b = (__vector unsigned char)vec_splats (__m2);
  c = vec_subs (a, b);
  return (__m64) ((__vector long long) c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubusb (__m64 __m1, __m64 __m2)
{
  return _mm_subs_pu8 (__m1, __m2);
}

/* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
   unsigned saturating arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_pu16 (__m64 __m1, __m64 __m2)
{
  __vector unsigned short a, b, c;

  a = (__vector unsigned short)vec_splats (__m1);
  b = (__vector unsigned short)vec_splats (__m2);
  c = vec_subs (a, b);
  return (__m64) ((__vector long long) c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubusw (__m64 __m1, __m64 __m2)
{
  return _mm_subs_pu16 (__m1, __m2);
}

/* Multiply four 16-bit values in M1 by four 16-bit values in M2 producing
   four 32-bit intermediate results, which are then summed by pairs to
   produce two 32-bit results.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_madd_pi16 (__m64 __m1, __m64 __m2)
{
  __vector signed short a, b;
  __vector signed int c;
  __vector signed int zero = {0, 0, 0, 0};

  a = (__vector signed short)vec_splats (__m1);
  b = (__vector signed short)vec_splats (__m2);
  c = vec_vmsumshm (a, b, zero);
  return (__m64) ((__vector long long) c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmaddwd (__m64 __m1, __m64 __m2)
{
  return _mm_madd_pi16 (__m1, __m2);
}
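
/* Worked example (illustrative, not part of the original header): with
   element 0 least significant, the pairwise multiply-add computes

     __m64 a = _mm_set_pi16 (4, 3, 2, 1);
     __m64 b = _mm_set_pi16 (8, 7, 6, 5);
     __m64 r = _mm_madd_pi16 (a, b);
     // r == _mm_set_pi32 (4*8 + 3*7, 2*6 + 1*5) == _mm_set_pi32 (53, 17)
 */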

/* Multiply four signed 16-bit values in M1 by four signed 16-bit values in
   M2 and produce the high 16 bits of the 32-bit results.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhi_pi16 (__m64 __m1, __m64 __m2)
{
  __vector signed short a, b;
  __vector signed short c;
  __vector signed int w0, w1;
  __vector unsigned char xform1 = {
      0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17,
      0x0A, 0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
    };

  a = (__vector signed short)vec_splats (__m1);
  b = (__vector signed short)vec_splats (__m2);

  w0 = vec_vmulesh (a, b);
  w1 = vec_vmulosh (a, b);
  c = (__vector signed short)vec_perm (w0, w1, xform1);

  return (__m64) ((__vector long long) c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmulhw (__m64 __m1, __m64 __m2)
{
  return _mm_mulhi_pi16 (__m1, __m2);
}

/* Multiply four 16-bit values in M1 by four 16-bit values in M2 and produce
   the low 16 bits of the results.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mullo_pi16 (__m64 __m1, __m64 __m2)
{
  __vector signed short a, b, c;

  a = (__vector signed short)vec_splats (__m1);
  b = (__vector signed short)vec_splats (__m2);
  c = a * b;
  return (__m64) ((__vector long long) c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmullw (__m64 __m1, __m64 __m2)
{
  return _mm_mullo_pi16 (__m1, __m2);
}

/* Shift four 16-bit values in M left by COUNT.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sll_pi16 (__m64 __m, __m64 __count)
{
  __vector signed short m, r;
  __vector unsigned short c;

  if (__count <= 15)
    {
      m = (__vector signed short)vec_splats (__m);
      c = (__vector unsigned short)vec_splats ((unsigned short)__count);
      r = vec_sl (m, (__vector unsigned short)c);
      return (__m64) ((__vector long long) r)[0];
    }
  else
    return (0);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psllw (__m64 __m, __m64 __count)
{
  return _mm_sll_pi16 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_pi16 (__m64 __m, int __count)
{
  /* Promote int to long then invoke mm_sll_pi16.  */
  return _mm_sll_pi16 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psllwi (__m64 __m, int __count)
{
  return _mm_slli_pi16 (__m, __count);
}

/* Shift two 32-bit values in M left by COUNT.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sll_pi32 (__m64 __m, __m64 __count)
{
  __m64_union m, res;

  m.as_m64 = __m;

  res.as_int[0] = m.as_int[0] << __count;
  res.as_int[1] = m.as_int[1] << __count;
  return (res.as_m64);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pslld (__m64 __m, __m64 __count)
{
  return _mm_sll_pi32 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_pi32 (__m64 __m, int __count)
{
  /* Promote int to long then invoke mm_sll_pi32.  */
  return _mm_sll_pi32 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pslldi (__m64 __m, int __count)
{
  return _mm_slli_pi32 (__m, __count);
}

/* Shift four 16-bit values in M right by COUNT; shift in the sign bit.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sra_pi16 (__m64 __m, __m64 __count)
{
  __vector signed short m, r;
  __vector unsigned short c;

  if (__count <= 15)
    {
      m = (__vector signed short)vec_splats (__m);
      c = (__vector unsigned short)vec_splats ((unsigned short)__count);
      r = vec_sra (m, (__vector unsigned short)c);
      return (__m64) ((__vector long long) r)[0];
    }
  else
    return (0);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psraw (__m64 __m, __m64 __count)
{
  return _mm_sra_pi16 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srai_pi16 (__m64 __m, int __count)
{
  /* Promote int to long then invoke mm_sra_pi16.  */
  return _mm_sra_pi16 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrawi (__m64 __m, int __count)
{
  return _mm_srai_pi16 (__m, __count);
}

/* Shift two 32-bit values in M right by COUNT; shift in the sign bit.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sra_pi32 (__m64 __m, __m64 __count)
{
  __m64_union m, res;

  m.as_m64 = __m;

  res.as_int[0] = m.as_int[0] >> __count;
  res.as_int[1] = m.as_int[1] >> __count;
  return (res.as_m64);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrad (__m64 __m, __m64 __count)
{
  return _mm_sra_pi32 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srai_pi32 (__m64 __m, int __count)
{
  /* Promote int to long then invoke mm_sra_pi32.  */
  return _mm_sra_pi32 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psradi (__m64 __m, int __count)
{
  return _mm_srai_pi32 (__m, __count);
}

/* Shift four 16-bit values in M right by COUNT; shift in zeros.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srl_pi16 (__m64 __m, __m64 __count)
{
  __vector unsigned short m, r;
  __vector unsigned short c;

  if (__count <= 15)
    {
      m = (__vector unsigned short)vec_splats (__m);
      c = (__vector unsigned short)vec_splats ((unsigned short)__count);
      r = vec_sr (m, (__vector unsigned short)c);
      return (__m64) ((__vector long long) r)[0];
    }
  else
    return (0);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrlw (__m64 __m, __m64 __count)
{
  return _mm_srl_pi16 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_pi16 (__m64 __m, int __count)
{
  /* Promote int to long then invoke mm_srl_pi16.  */
  return _mm_srl_pi16 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrlwi (__m64 __m, int __count)
{
  return _mm_srli_pi16 (__m, __count);
}

/* Shift two 32-bit values in M right by COUNT; shift in zeros.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srl_pi32 (__m64 __m, __m64 __count)
{
  __m64_union m, res;

  m.as_m64 = __m;

  res.as_int[0] = (unsigned int)m.as_int[0] >> __count;
  res.as_int[1] = (unsigned int)m.as_int[1] >> __count;
  return (res.as_m64);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrld (__m64 __m, __m64 __count)
{
  return _mm_srl_pi32 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_pi32 (__m64 __m, int __count)
{
  /* Promote int to long then invoke mm_srl_pi32.  */
  return _mm_srl_pi32 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrldi (__m64 __m, int __count)
{
  return _mm_srli_pi32 (__m, __count);
}
#endif /* _ARCH_PWR8 */
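
/* Usage sketch (illustrative, not part of the original header; the packed
   shifts above are only available when _ARCH_PWR8 is defined): the count is
   a scalar applied to every lane, e.g.

     __m64 m = _mm_set_pi16 (8, 4, 2, 1);
     __m64 r = _mm_slli_pi16 (m, 1);
     // r == _mm_set_pi16 (16, 8, 4, 2)
 */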

/* Creates a vector of two 32-bit values; I0 is least significant.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pi32 (int __i1, int __i0)
{
  __m64_union res;

  res.as_int[0] = __i0;
  res.as_int[1] = __i1;
  return (res.as_m64);
}

/* Creates a vector of four 16-bit values; W0 is least significant.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pi16 (short __w3, short __w2, short __w1, short __w0)
{
  __m64_union res;

  res.as_short[0] = __w0;
  res.as_short[1] = __w1;
  res.as_short[2] = __w2;
  res.as_short[3] = __w3;
  return (res.as_m64);
}

/* Creates a vector of eight 8-bit values; B0 is least significant.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pi8 (char __b7, char __b6, char __b5, char __b4,
	     char __b3, char __b2, char __b1, char __b0)
{
  __m64_union res;

  res.as_char[0] = __b0;
  res.as_char[1] = __b1;
  res.as_char[2] = __b2;
  res.as_char[3] = __b3;
  res.as_char[4] = __b4;
  res.as_char[5] = __b5;
  res.as_char[6] = __b6;
  res.as_char[7] = __b7;
  return (res.as_m64);
}

/* Similar, but with the arguments in reverse order.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_pi32 (int __i0, int __i1)
{
  __m64_union res;

  res.as_int[0] = __i0;
  res.as_int[1] = __i1;
  return (res.as_m64);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_pi16 (short __w0, short __w1, short __w2, short __w3)
{
  return _mm_set_pi16 (__w3, __w2, __w1, __w0);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_pi8 (char __b0, char __b1, char __b2, char __b3,
	      char __b4, char __b5, char __b6, char __b7)
{
  return _mm_set_pi8 (__b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0);
}
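
/* Illustrative note (not part of the original header): _mm_set_* takes the
   most significant element first while _mm_setr_* takes the least
   significant first, so

     _mm_set_pi32 (1, 2) and _mm_setr_pi32 (2, 1)

   describe the same __m64 value, whose element 0 is 2.  */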

/* Creates a vector of two 32-bit values, both elements containing I.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_pi32 (int __i)
{
  __m64_union res;

  res.as_int[0] = __i;
  res.as_int[1] = __i;
  return (res.as_m64);
}

/* Creates a vector of four 16-bit values, all elements containing W.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_pi16 (short __w)
{
#if _ARCH_PWR9
  __vector signed short w;

  w = (__vector signed short)vec_splats (__w);
  return (__m64) ((__vector long long) w)[0];
#else
  __m64_union res;

  res.as_short[0] = __w;
  res.as_short[1] = __w;
  res.as_short[2] = __w;
  res.as_short[3] = __w;
  return (res.as_m64);
#endif
}

/* Creates a vector of eight 8-bit values, all elements containing B.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_pi8 (signed char __b)
{
#if _ARCH_PWR8
  __vector signed char b;

  b = (__vector signed char)vec_splats (__b);
  return (__m64) ((__vector long long) b)[0];
#else
  __m64_union res;

  res.as_char[0] = __b;
  res.as_char[1] = __b;
  res.as_char[2] = __b;
  res.as_char[3] = __b;
  res.as_char[4] = __b;
  res.as_char[5] = __b;
  res.as_char[6] = __b;
  res.as_char[7] = __b;
  return (res.as_m64);
#endif
}
#endif /* _MMINTRIN_H_INCLUDED */