gcc/config/i386/mmintrin.h

   1 /* Copyright (C) 2002 Free Software Foundation, Inc.
   2
   3    This file is part of GNU CC.
   4
   5    GNU CC is free software; you can redistribute it and/or modify
   6    it under the terms of the GNU General Public License as published by
   7    the Free Software Foundation; either version 2, or (at your option)
   8    any later version.
   9
  10    GNU CC is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13    GNU General Public License for more details.
  14
  15    You should have received a copy of the GNU General Public License
  16    along with GNU CC; see the file COPYING.  If not, write to
  17    the Free Software Foundation, 59 Temple Place - Suite 330,
  18    Boston, MA 02111-1307, USA.  */
  19
  20 /* As a special exception, if you include this header file into source
  21    files compiled by GCC, this header file does not by itself cause
  22    the resulting executable to be covered by the GNU General Public
  23    License.  This exception does not however invalidate any other
  24    reasons why the executable file might be covered by the GNU General
  25    Public License.  */
  26
  27 /* Implemented from the specification included in the Intel C++ Compiler
  28    User Guide and Reference, version 5.0.  */
  29
  30 #ifndef _MMINTRIN_H_INCLUDED
  31 #define _MMINTRIN_H_INCLUDED
  32
/* The data type intended for user use.  It is declared here as a plain
   unsigned long long; the intrinsics below cast it to one of the internal
   vector types whenever a builtin needs a vector operand.  */
typedef unsigned long long __m64;

/* Internal data types for implementing the intrinsics.  Each one is
   declared through the __mode__ attribute and is used in this file as
   the operand type of the builtins working on 32-bit (__v2si), 16-bit
   (__v4hi) and 8-bit (__v8qi) elements respectively.  */
typedef int __v2si __attribute__ ((__mode__ (__V2SI__)));
typedef int __v4hi __attribute__ ((__mode__ (__V4HI__)));
typedef int __v8qi __attribute__ ((__mode__ (__V8QI__)));
  40
  41 /* Empty the multimedia state.  */
  42 static __inline void
  43 _mm_empty (void)
  44 {
  45   __builtin_ia32_emms ();
  46 }
  47
  48 /* Convert I to a __m64 object.  The integer is zero-extended to 64-bits.  */
  49 static __inline __m64
  50 _mm_cvtsi32_si64 (int __i)
  51 {
  52   return (unsigned int) __i;
  53 }
  54
  55 /* Convert the lower 32 bits of the __m64 object into an integer.  */
  56 static __inline int
  57 _mm_cvtsi64_si32 (__m64 __i)
  58 {
  59   return __i;
  60 }
  61
  62 /* Pack the four 16-bit values from M1 into the lower four 8-bit values of
  63    the result, and the four 16-bit values from M2 into the upper four 8-bit
  64    values of the result, all with signed saturation.  */
  65 static __inline __m64
  66 _mm_packs_pi16 (__m64 __m1, __m64 __m2)
  67 {
  68   return (__m64) __builtin_ia32_packsswb ((__v4hi)__m1, (__v4hi)__m2);
  69 }
  70
  71 /* Pack the two 32-bit values from M1 in to the lower two 16-bit values of
  72    the result, and the two 32-bit values from M2 into the upper two 16-bit
  73    values of the result, all with signed saturation.  */
  74 static __inline __m64
  75 _mm_packs_pi32 (__m64 __m1, __m64 __m2)
  76 {
  77   return (__m64) __builtin_ia32_packssdw ((__v2si)__m1, (__v2si)__m2);
  78 }
  79
  80 /* Pack the four 16-bit values from M1 into the lower four 8-bit values of
  81    the result, and the four 16-bit values from M2 into the upper four 8-bit
  82    values of the result, all with unsigned saturation.  */
  83 static __inline __m64
  84 _mm_packs_pu16 (__m64 __m1, __m64 __m2)
  85 {
  86   return (__m64) __builtin_ia32_packuswb ((__v4hi)__m1, (__v4hi)__m2);
  87 }
  88
  89 /* Interleave the four 8-bit values from the high half of M1 with the four
  90    8-bit values from the high half of M2.  */
  91 static __inline __m64
  92 _mm_unpackhi_pi8 (__m64 __m1, __m64 __m2)
  93 {
  94   return (__m64) __builtin_ia32_punpckhbw ((__v8qi)__m1, (__v8qi)__m2);
  95 }
  96
  97 /* Interleave the two 16-bit values from the high half of M1 with the two
  98    16-bit values from the high half of M2.  */
  99 static __inline __m64
 100 _mm_unpackhi_pi16 (__m64 __m1, __m64 __m2)
 101 {
 102   return (__m64) __builtin_ia32_punpckhwd ((__v4hi)__m1, (__v4hi)__m2);
 103 }
 104
 105 /* Interleave the 32-bit value from the high half of M1 with the 32-bit
 106    value from the high half of M2.  */
 107 static __inline __m64
 108 _mm_unpackhi_pi32 (__m64 __m1, __m64 __m2)
 109 {
 110   return (__m64) __builtin_ia32_punpckhdq ((__v2si)__m1, (__v2si)__m2);
 111 }
 112
 113 /* Interleave the four 8-bit values from the low half of M1 with the four
 114    8-bit values from the low half of M2.  */
 115 static __inline __m64
 116 _mm_unpacklo_pi8 (__m64 __m1, __m64 __m2)
 117 {
 118   return (__m64) __builtin_ia32_punpcklbw ((__v8qi)__m1, (__v8qi)__m2);
 119 }
 120
 121 /* Interleave the two 16-bit values from the low half of M1 with the two
 122    16-bit values from the low half of M2.  */
 123 static __inline __m64
 124 _mm_unpacklo_pi16 (__m64 __m1, __m64 __m2)
 125 {
 126   return (__m64) __builtin_ia32_punpcklwd ((__v4hi)__m1, (__v4hi)__m2);
 127 }
 128
 129 /* Interleave the 32-bit value from the low half of M1 with the 32-bit
 130    value from the low half of M2.  */
 131 static __inline __m64
 132 _mm_unpacklo_pi32 (__m64 __m1, __m64 __m2)
 133 {
 134   return (__m64) __builtin_ia32_punpckldq ((__v2si)__m1, (__v2si)__m2);
 135 }
 136
 137 /* Add the 8-bit values in M1 to the 8-bit values in M2.  */
 138 static __inline __m64
 139 _mm_add_pi8 (__m64 __m1, __m64 __m2)
 140 {
 141   return (__m64) __builtin_ia32_paddb ((__v8qi)__m1, (__v8qi)__m2);
 142 }
 143
 144 /* Add the 16-bit values in M1 to the 16-bit values in M2.  */
 145 static __inline __m64
 146 _mm_add_pi16 (__m64 __m1, __m64 __m2)
 147 {
 148   return (__m64) __builtin_ia32_paddw ((__v4hi)__m1, (__v4hi)__m2);
 149 }
 150
 151 /* Add the 32-bit values in M1 to the 32-bit values in M2.  */
 152 static __inline __m64
 153 _mm_add_pi32 (__m64 __m1, __m64 __m2)
 154 {
 155   return (__m64) __builtin_ia32_paddd ((__v2si)__m1, (__v2si)__m2);
 156 }
 157
 158 /* Add the 8-bit values in M1 to the 8-bit values in M2 using signed
 159    saturated arithmetic.  */
 160 static __inline __m64
 161 _mm_adds_pi8 (__m64 __m1, __m64 __m2)
 162 {
 163   return (__m64) __builtin_ia32_paddsb ((__v8qi)__m1, (__v8qi)__m2);
 164 }
 165
 166 /* Add the 16-bit values in M1 to the 16-bit values in M2 using signed
 167    saturated arithmetic.  */
 168 static __inline __m64
 169 _mm_adds_pi16 (__m64 __m1, __m64 __m2)
 170 {
 171   return (__m64) __builtin_ia32_paddsw ((__v4hi)__m1, (__v4hi)__m2);
 172 }
 173
 174 /* Add the 8-bit values in M1 to the 8-bit values in M2 using unsigned
 175    saturated arithmetic.  */
 176 static __inline __m64
 177 _mm_adds_pu8 (__m64 __m1, __m64 __m2)
 178 {
 179   return (__m64) __builtin_ia32_paddusb ((__v8qi)__m1, (__v8qi)__m2);
 180 }
 181
 182 /* Add the 16-bit values in M1 to the 16-bit values in M2 using unsigned
 183    saturated arithmetic.  */
 184 static __inline __m64
 185 _mm_adds_pu16 (__m64 __m1, __m64 __m2)
 186 {
 187   return (__m64) __builtin_ia32_paddusw ((__v4hi)__m1, (__v4hi)__m2);
 188 }
 189
 190 /* Subtract the 8-bit values in M2 from the 8-bit values in M1.  */
 191 static __inline __m64
 192 _mm_sub_pi8 (__m64 __m1, __m64 __m2)
 193 {
 194   return (__m64) __builtin_ia32_psubb ((__v8qi)__m1, (__v8qi)__m2);
 195 }
 196
 197 /* Subtract the 16-bit values in M2 from the 16-bit values in M1.  */
 198 static __inline __m64
 199 _mm_sub_pi16 (__m64 __m1, __m64 __m2)
 200 {
 201   return (__m64) __builtin_ia32_psubw ((__v4hi)__m1, (__v4hi)__m2);
 202 }
 203
 204 /* Subtract the 32-bit values in M2 from the 32-bit values in M1.  */
 205 static __inline __m64
 206 _mm_sub_pi32 (__m64 __m1, __m64 __m2)
 207 {
 208   return (__m64) __builtin_ia32_psubd ((__v2si)__m1, (__v2si)__m2);
 209 }
 210
 211 /* Subtract the 8-bit values in M2 from the 8-bit values in M1 using signed
 212    saturating arithmetic.  */
 213 static __inline __m64
 214 _mm_subs_pi8 (__m64 __m1, __m64 __m2)
 215 {
 216   return (__m64) __builtin_ia32_psubsb ((__v8qi)__m1, (__v8qi)__m2);
 217 }
 218
 219 /* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
 220    signed saturating arithmetic.  */
 221 static __inline __m64
 222 _mm_subs_pi16 (__m64 __m1, __m64 __m2)
 223 {
 224   return (__m64) __builtin_ia32_psubsw ((__v4hi)__m1, (__v4hi)__m2);
 225 }
 226
 227 /* Subtract the 8-bit values in M2 from the 8-bit values in M1 using
 228    unsigned saturating arithmetic.  */
 229 static __inline __m64
 230 _mm_subs_pu8 (__m64 __m1, __m64 __m2)
 231 {
 232   return (__m64) __builtin_ia32_psubusb ((__v8qi)__m1, (__v8qi)__m2);
 233 }
 234
 235 /* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
 236    unsigned saturating arithmetic.  */
 237 static __inline __m64
 238 _mm_subs_pu16 (__m64 __m1, __m64 __m2)
 239 {
 240   return (__m64) __builtin_ia32_psubusw ((__v4hi)__m1, (__v4hi)__m2);
 241 }
 242
 243 /* Multiply four 16-bit values in M1 by four 16-bit values in M2 producing
 244    four 32-bit intermediate results, which are then summed by pairs to
 245    produce two 32-bit results.  */
 246 static __inline __m64
 247 _mm_madd_pi16 (__m64 __m1, __m64 __m2)
 248 {
 249   return (__m64) __builtin_ia32_pmaddwd ((__v4hi)__m1, (__v4hi)__m2);
 250 }
 251
 252 /* Multiply four signed 16-bit values in M1 by four signed 16-bit values in
 253    M2 and produce the high 16 bits of the 32-bit results.  */
 254 static __inline __m64
 255 _mm_mulhi_pi16 (__m64 __m1, __m64 __m2)
 256 {
 257   return (__m64) __builtin_ia32_pmulhw ((__v4hi)__m1, (__v4hi)__m2);
 258 }
 259
 260 /* Multiply four 16-bit values in M1 by four 16-bit values in M2 and produce
 261    the low 16 bits of the results.  */
 262 static __inline __m64
 263 _mm_mullo_pi16 (__m64 __m1, __m64 __m2)
 264 {
 265   return (__m64) __builtin_ia32_pmullw ((__v4hi)__m1, (__v4hi)__m2);
 266 }
 267
 268 /* Shift four 16-bit values in M left by COUNT.  */
 269 static __inline __m64
 270 _mm_sll_pi16 (__m64 __m, __m64 __count)
 271 {
 272   return (__m64) __builtin_ia32_psllw ((__v4hi)__m, __count);
 273 }
 274
 275 static __inline __m64
 276 _mm_slli_pi16 (__m64 __m, int __count)
 277 {
 278   return (__m64) __builtin_ia32_psllw ((__v4hi)__m, __count);
 279 }
 280
 281 /* Shift two 32-bit values in M left by COUNT.  */
 282 static __inline __m64
 283 _mm_sll_pi32 (__m64 __m, __m64 __count)
 284 {
 285   return (__m64) __builtin_ia32_pslld ((__v2si)__m, __count);
 286 }
 287
 288 static __inline __m64
 289 _mm_slli_pi32 (__m64 __m, int __count)
 290 {
 291   return (__m64) __builtin_ia32_pslld ((__v2si)__m, __count);
 292 }
 293
 294 /* Shift the 64-bit value in M left by COUNT.  */
 295 static __inline __m64
 296 _mm_sll_pi64 (__m64 __m, __m64 __count)
 297 {
 298   return (__m64) __builtin_ia32_psllq (__m, __count);
 299 }
 300
 301 static __inline __m64
 302 _mm_slli_pi64 (__m64 __m, int __count)
 303 {
 304   return (__m64) __builtin_ia32_psllq (__m, __count);
 305 }
 306
 307 /* Shift four 16-bit values in M right by COUNT; shift in the sign bit.  */
 308 static __inline __m64
 309 _mm_sra_pi16 (__m64 __m, __m64 __count)
 310 {
 311   return (__m64) __builtin_ia32_psraw ((__v4hi)__m, __count);
 312 }
 313
 314 static __inline __m64
 315 _mm_srai_pi16 (__m64 __m, int __count)
 316 {
 317   return (__m64) __builtin_ia32_psraw ((__v4hi)__m, __count);
 318 }
 319
 320 /* Shift two 32-bit values in M right by COUNT; shift in the sign bit.  */
 321 static __inline __m64
 322 _mm_sra_pi32 (__m64 __m, __m64 __count)
 323 {
 324   return (__m64) __builtin_ia32_psrad ((__v2si)__m, __count);
 325 }
 326
 327 static __inline __m64
 328 _mm_srai_pi32 (__m64 __m, int __count)
 329 {
 330   return (__m64) __builtin_ia32_psrad ((__v2si)__m, __count);
 331 }
 332
 333 /* Shift four 16-bit values in M right by COUNT; shift in zeros.  */
 334 static __inline __m64
 335 _mm_srl_pi16 (__m64 __m, __m64 __count)
 336 {
 337   return (__m64) __builtin_ia32_psrlw ((__v4hi)__m, __count);
 338 }
 339
 340 static __inline __m64
 341 _mm_srli_pi16 (__m64 __m, int __count)
 342 {
 343   return (__m64) __builtin_ia32_psrlw ((__v4hi)__m, __count);
 344 }
 345
 346 /* Shift two 32-bit values in M right by COUNT; shift in zeros.  */
 347 static __inline __m64
 348 _mm_srl_pi32 (__m64 __m, __m64 __count)
 349 {
 350   return (__m64) __builtin_ia32_psrld ((__v2si)__m, __count);
 351 }
 352
 353 static __inline __m64
 354 _mm_srli_pi32 (__m64 __m, int __count)
 355 {
 356   return (__m64) __builtin_ia32_psrld ((__v2si)__m, __count);
 357 }
 358
 359 /* Shift the 64-bit value in M left by COUNT; shift in zeros.  */
 360 static __inline __m64
 361 _mm_srl_pi64 (__m64 __m, __m64 __count)
 362 {
 363   return (__m64) __builtin_ia32_psrlq (__m, __count);
 364 }
 365
 366 static __inline __m64
 367 _mm_srli_pi64 (__m64 __m, int __count)
 368 {
 369   return (__m64) __builtin_ia32_psrlq (__m, __count);
 370 }
 371
 372 /* Bit-wise AND the 64-bit values in M1 and M2.  */
 373 static __inline __m64
 374 _mm_and_si64 (__m64 __m1, __m64 __m2)
 375 {
 376   return __builtin_ia32_pand (__m1, __m2);
 377 }
 378
 379 /* Bit-wise complement the 64-bit value in M1 and bit-wise AND it with the
 380    64-bit value in M2.  */
 381 static __inline __m64
 382 _mm_andnot_si64 (__m64 __m1, __m64 __m2)
 383 {
 384   return __builtin_ia32_pandn (__m1, __m2);
 385 }
 386
 387 /* Bit-wise inclusive OR the 64-bit values in M1 and M2.  */
 388 static __inline __m64
 389 _mm_or_si64 (__m64 __m1, __m64 __m2)
 390 {
 391   return __builtin_ia32_por (__m1, __m2);
 392 }
 393
 394 /* Bit-wise exclusive OR the 64-bit values in M1 and M2.  */
 395 static __inline __m64
 396 _mm_xor_si64 (__m64 __m1, __m64 __m2)
 397 {
 398   return __builtin_ia32_pxor (__m1, __m2);
 399 }
 400
 401 /* Compare eight 8-bit values.  The result of the comparison is 0xFF if the
 402    test is true and zero if false.  */
 403 static __inline __m64
 404 _mm_cmpeq_pi8 (__m64 __m1, __m64 __m2)
 405 {
 406   return (__m64) __builtin_ia32_pcmpeqb ((__v8qi)__m1, (__v8qi)__m2);
 407 }
 408
 409 static __inline __m64
 410 _mm_cmpgt_pi8 (__m64 __m1, __m64 __m2)
 411 {
 412   return (__m64) __builtin_ia32_pcmpgtb ((__v8qi)__m1, (__v8qi)__m2);
 413 }
 414
 415 /* Compare four 16-bit values.  The result of the comparison is 0xFFFF if
 416    the test is true and zero if false.  */
 417 static __inline __m64
 418 _mm_cmpeq_pi16 (__m64 __m1, __m64 __m2)
 419 {
 420   return (__m64) __builtin_ia32_pcmpeqw ((__v4hi)__m1, (__v4hi)__m2);
 421 }
 422
 423 static __inline __m64
 424 _mm_cmpgt_pi16 (__m64 __m1, __m64 __m2)
 425 {
 426   return (__m64) __builtin_ia32_pcmpgtw ((__v4hi)__m1, (__v4hi)__m2);
 427 }
 428
 429 /* Compare two 32-bit values.  The result of the comparison is 0xFFFFFFFF if
 430    the test is true and zero if false.  */
 431 static __inline __m64
 432 _mm_cmpeq_pi32 (__m64 __m1, __m64 __m2)
 433 {
 434   return (__m64) __builtin_ia32_pcmpeqd ((__v2si)__m1, (__v2si)__m2);
 435 }
 436
 437 static __inline __m64
 438 _mm_cmpgt_pi32 (__m64 __m1, __m64 __m2)
 439 {
 440   return (__m64) __builtin_ia32_pcmpgtd ((__v2si)__m1, (__v2si)__m2);
 441 }
 442
 443 /* Creates a 64-bit zero.  */
 444 static __inline __m64
 445 _mm_setzero_si64 (void)
 446 {
 447   return __builtin_ia32_mmx_zero ();
 448 }
 449
 450 /* Creates a vector of two 32-bit values; I0 is least significant.  */
 451 static __inline __m64
 452 _mm_set_pi32 (int __i1, int __i0)
 453 {
 454   union {
 455     __m64 __q;
 456     struct {
 457       unsigned int __i0;
 458       unsigned int __i1;
 459     } __s;
 460   } __u;
 461
 462   __u.__s.__i0 = __i0;
 463   __u.__s.__i1 = __i1;
 464
 465   return __u.__q;
 466 }
 467
 468 /* Creates a vector of four 16-bit values; W0 is least significant.  */
 469 static __inline __m64
 470 _mm_set_pi16 (short __w3, short __w2, short __w1, short __w0)
 471 {
 472   unsigned int __i1 = (unsigned short)__w3 << 16 | (unsigned short)__w2;
 473   unsigned int __i0 = (unsigned short)__w1 << 16 | (unsigned short)__w0;
 474   return _mm_set_pi32 (__i1, __i0);
 475
 476 }
 477
 478 /* Creates a vector of eight 8-bit values; B0 is least significant.  */
 479 static __inline __m64
 480 _mm_set_pi8 (char __b7, char __b6, char __b5, char __b4,
 481              char __b3, char __b2, char __b1, char __b0)
 482 {
 483   unsigned int __i1, __i0;
 484
 485   __i1 = (unsigned char)__b7;
 486   __i1 = __i1 << 8 | (unsigned char)__b6;
 487   __i1 = __i1 << 8 | (unsigned char)__b5;
 488   __i1 = __i1 << 8 | (unsigned char)__b4;
 489
 490   __i0 = (unsigned char)__b3;
 491   __i0 = __i0 << 8 | (unsigned char)__b2;
 492   __i0 = __i0 << 8 | (unsigned char)__b1;
 493   __i0 = __i0 << 8 | (unsigned char)__b0;
 494
 495   return _mm_set_pi32 (__i1, __i0);
 496 }
 497
 498 /* Similar, but with the arguments in reverse order.  */
 499 static __inline __m64
 500 _mm_setr_pi32 (int __i0, int __i1)
 501 {
 502   return _mm_set_pi32 (__i1, __i0);
 503 }
 504
 505 static __inline __m64
 506 _mm_setr_pi16 (short __w0, short __w1, short __w2, short __w3)
 507 {
 508   return _mm_set_pi16 (__w3, __w2, __w1, __w0);
 509 }
 510
 511 static __inline __m64
 512 _mm_setr_pi8 (char __b0, char __b1, char __b2, char __b3,
 513               char __b4, char __b5, char __b6, char __b7)
 514 {
 515   return _mm_set_pi8 (__b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0);
 516 }
 517
 518 /* Creates a vector of two 32-bit values, both elements containing I.  */
 519 static __inline __m64
 520 _mm_set1_pi32 (int __i)
 521 {
 522   return _mm_set_pi32 (__i, __i);
 523 }
 524
 525 /* Creates a vector of four 16-bit values, all elements containing W.  */
 526 static __inline __m64
 527 _mm_set1_pi16 (short __w)
 528 {
 529   unsigned int __i = (unsigned short)__w << 16 | (unsigned short)__w;
 530   return _mm_set1_pi32 (__i);
 531 }
 532
 533 /* Creates a vector of four 16-bit values, all elements containing B.  */
 534 static __inline __m64
 535 _mm_set1_pi8 (char __b)
 536 {
 537   unsigned int __w = (unsigned char)__b << 8 | (unsigned char)__b;
 538   unsigned int __i = __w << 16 | __w;
 539   return _mm_set1_pi32 (__i);
 540 }
 541
 542 #endif /* _MMINTRIN_H_INCLUDED */