/* Merge from mainline (gomp-merge-2005-02-26).
   [official-gcc.git] gcc/config/i386/mmintrin.h
   blob 68c8313f9aec3b0c06a1c74601add275337cc5b8  */
/* Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with GCC; see the file COPYING.  If not, write to
   the Free Software Foundation, 59 Temple Place - Suite 330,
   Boston, MA 02111-1307, USA.  */

/* As a special exception, if you include this header file into source
   files compiled by GCC, this header file does not by itself cause
   the resulting executable to be covered by the GNU General Public
   License.  This exception does not however invalidate any other
   reasons why the executable file might be covered by the GNU General
   Public License.  */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 8.0.  */
30 #ifndef _MMINTRIN_H_INCLUDED
31 #define _MMINTRIN_H_INCLUDED
33 #ifndef __MMX__
34 # error "MMX instruction set not enabled"
35 #else
36 /* The data type intended for user use. */
37 typedef int __m64 __attribute__ ((__vector_size__ (8)));
39 /* Internal data types for implementing the intrinsics. */
40 typedef int __v2si __attribute__ ((__vector_size__ (8)));
41 typedef short __v4hi __attribute__ ((__vector_size__ (8)));
42 typedef char __v8qi __attribute__ ((__vector_size__ (8)));
/* Empty the multimedia state: clears the x87/MMX tag word so x87
   floating point can be used again after MMX code (emits EMMS).  */
static __inline void
_mm_empty (void)
{
  __builtin_ia32_emms ();
}

/* Alternate (Intel-compatible) name for _mm_empty.  */
static __inline void
_m_empty (void)
{
  _mm_empty ();
}
57 /* Convert I to a __m64 object. The integer is zero-extended to 64-bits. */
58 static __inline __m64
59 _mm_cvtsi32_si64 (int __i)
61 return (__m64) __builtin_ia32_vec_init_v2si (__i, 0);
64 static __inline __m64
65 _m_from_int (int __i)
67 return _mm_cvtsi32_si64 (__i);
70 #ifdef __x86_64__
71 /* Convert I to a __m64 object. */
72 static __inline __m64
73 _mm_cvtsi64x_si64 (long long __i)
75 return (__m64) __i;
78 /* Convert I to a __m64 object. */
79 static __inline __m64
80 _mm_set_pi64x (long long __i)
82 return (__m64) __i;
84 #endif
86 /* Convert the lower 32 bits of the __m64 object into an integer. */
87 static __inline int
88 _mm_cvtsi64_si32 (__m64 __i)
90 return __builtin_ia32_vec_ext_v2si ((__v2si)__i, 0);
93 static __inline int
94 _m_to_int (__m64 __i)
96 return _mm_cvtsi64_si32 (__i);
99 #ifdef __x86_64__
100 /* Convert the lower 32 bits of the __m64 object into an integer. */
101 static __inline long long
102 _mm_cvtsi64_si64x (__m64 __i)
104 return (long long)__i;
106 #endif
108 /* Pack the four 16-bit values from M1 into the lower four 8-bit values of
109 the result, and the four 16-bit values from M2 into the upper four 8-bit
110 values of the result, all with signed saturation. */
111 static __inline __m64
112 _mm_packs_pi16 (__m64 __m1, __m64 __m2)
114 return (__m64) __builtin_ia32_packsswb ((__v4hi)__m1, (__v4hi)__m2);
117 static __inline __m64
118 _m_packsswb (__m64 __m1, __m64 __m2)
120 return _mm_packs_pi16 (__m1, __m2);
123 /* Pack the two 32-bit values from M1 in to the lower two 16-bit values of
124 the result, and the two 32-bit values from M2 into the upper two 16-bit
125 values of the result, all with signed saturation. */
126 static __inline __m64
127 _mm_packs_pi32 (__m64 __m1, __m64 __m2)
129 return (__m64) __builtin_ia32_packssdw ((__v2si)__m1, (__v2si)__m2);
132 static __inline __m64
133 _m_packssdw (__m64 __m1, __m64 __m2)
135 return _mm_packs_pi32 (__m1, __m2);
138 /* Pack the four 16-bit values from M1 into the lower four 8-bit values of
139 the result, and the four 16-bit values from M2 into the upper four 8-bit
140 values of the result, all with unsigned saturation. */
141 static __inline __m64
142 _mm_packs_pu16 (__m64 __m1, __m64 __m2)
144 return (__m64) __builtin_ia32_packuswb ((__v4hi)__m1, (__v4hi)__m2);
147 static __inline __m64
148 _m_packuswb (__m64 __m1, __m64 __m2)
150 return _mm_packs_pu16 (__m1, __m2);
153 /* Interleave the four 8-bit values from the high half of M1 with the four
154 8-bit values from the high half of M2. */
155 static __inline __m64
156 _mm_unpackhi_pi8 (__m64 __m1, __m64 __m2)
158 return (__m64) __builtin_ia32_punpckhbw ((__v8qi)__m1, (__v8qi)__m2);
161 static __inline __m64
162 _m_punpckhbw (__m64 __m1, __m64 __m2)
164 return _mm_unpackhi_pi8 (__m1, __m2);
167 /* Interleave the two 16-bit values from the high half of M1 with the two
168 16-bit values from the high half of M2. */
169 static __inline __m64
170 _mm_unpackhi_pi16 (__m64 __m1, __m64 __m2)
172 return (__m64) __builtin_ia32_punpckhwd ((__v4hi)__m1, (__v4hi)__m2);
175 static __inline __m64
176 _m_punpckhwd (__m64 __m1, __m64 __m2)
178 return _mm_unpackhi_pi16 (__m1, __m2);
181 /* Interleave the 32-bit value from the high half of M1 with the 32-bit
182 value from the high half of M2. */
183 static __inline __m64
184 _mm_unpackhi_pi32 (__m64 __m1, __m64 __m2)
186 return (__m64) __builtin_ia32_punpckhdq ((__v2si)__m1, (__v2si)__m2);
189 static __inline __m64
190 _m_punpckhdq (__m64 __m1, __m64 __m2)
192 return _mm_unpackhi_pi32 (__m1, __m2);
195 /* Interleave the four 8-bit values from the low half of M1 with the four
196 8-bit values from the low half of M2. */
197 static __inline __m64
198 _mm_unpacklo_pi8 (__m64 __m1, __m64 __m2)
200 return (__m64) __builtin_ia32_punpcklbw ((__v8qi)__m1, (__v8qi)__m2);
203 static __inline __m64
204 _m_punpcklbw (__m64 __m1, __m64 __m2)
206 return _mm_unpacklo_pi8 (__m1, __m2);
209 /* Interleave the two 16-bit values from the low half of M1 with the two
210 16-bit values from the low half of M2. */
211 static __inline __m64
212 _mm_unpacklo_pi16 (__m64 __m1, __m64 __m2)
214 return (__m64) __builtin_ia32_punpcklwd ((__v4hi)__m1, (__v4hi)__m2);
217 static __inline __m64
218 _m_punpcklwd (__m64 __m1, __m64 __m2)
220 return _mm_unpacklo_pi16 (__m1, __m2);
223 /* Interleave the 32-bit value from the low half of M1 with the 32-bit
224 value from the low half of M2. */
225 static __inline __m64
226 _mm_unpacklo_pi32 (__m64 __m1, __m64 __m2)
228 return (__m64) __builtin_ia32_punpckldq ((__v2si)__m1, (__v2si)__m2);
231 static __inline __m64
232 _m_punpckldq (__m64 __m1, __m64 __m2)
234 return _mm_unpacklo_pi32 (__m1, __m2);
237 /* Add the 8-bit values in M1 to the 8-bit values in M2. */
238 static __inline __m64
239 _mm_add_pi8 (__m64 __m1, __m64 __m2)
241 return (__m64) __builtin_ia32_paddb ((__v8qi)__m1, (__v8qi)__m2);
244 static __inline __m64
245 _m_paddb (__m64 __m1, __m64 __m2)
247 return _mm_add_pi8 (__m1, __m2);
250 /* Add the 16-bit values in M1 to the 16-bit values in M2. */
251 static __inline __m64
252 _mm_add_pi16 (__m64 __m1, __m64 __m2)
254 return (__m64) __builtin_ia32_paddw ((__v4hi)__m1, (__v4hi)__m2);
257 static __inline __m64
258 _m_paddw (__m64 __m1, __m64 __m2)
260 return _mm_add_pi16 (__m1, __m2);
263 /* Add the 32-bit values in M1 to the 32-bit values in M2. */
264 static __inline __m64
265 _mm_add_pi32 (__m64 __m1, __m64 __m2)
267 return (__m64) __builtin_ia32_paddd ((__v2si)__m1, (__v2si)__m2);
270 static __inline __m64
271 _m_paddd (__m64 __m1, __m64 __m2)
273 return _mm_add_pi32 (__m1, __m2);
276 /* Add the 64-bit values in M1 to the 64-bit values in M2. */
277 static __inline __m64
278 _mm_add_si64 (__m64 __m1, __m64 __m2)
280 return (__m64) __builtin_ia32_paddq ((long long)__m1, (long long)__m2);
283 /* Add the 8-bit values in M1 to the 8-bit values in M2 using signed
284 saturated arithmetic. */
285 static __inline __m64
286 _mm_adds_pi8 (__m64 __m1, __m64 __m2)
288 return (__m64) __builtin_ia32_paddsb ((__v8qi)__m1, (__v8qi)__m2);
291 static __inline __m64
292 _m_paddsb (__m64 __m1, __m64 __m2)
294 return _mm_adds_pi8 (__m1, __m2);
297 /* Add the 16-bit values in M1 to the 16-bit values in M2 using signed
298 saturated arithmetic. */
299 static __inline __m64
300 _mm_adds_pi16 (__m64 __m1, __m64 __m2)
302 return (__m64) __builtin_ia32_paddsw ((__v4hi)__m1, (__v4hi)__m2);
305 static __inline __m64
306 _m_paddsw (__m64 __m1, __m64 __m2)
308 return _mm_adds_pi16 (__m1, __m2);
311 /* Add the 8-bit values in M1 to the 8-bit values in M2 using unsigned
312 saturated arithmetic. */
313 static __inline __m64
314 _mm_adds_pu8 (__m64 __m1, __m64 __m2)
316 return (__m64) __builtin_ia32_paddusb ((__v8qi)__m1, (__v8qi)__m2);
319 static __inline __m64
320 _m_paddusb (__m64 __m1, __m64 __m2)
322 return _mm_adds_pu8 (__m1, __m2);
325 /* Add the 16-bit values in M1 to the 16-bit values in M2 using unsigned
326 saturated arithmetic. */
327 static __inline __m64
328 _mm_adds_pu16 (__m64 __m1, __m64 __m2)
330 return (__m64) __builtin_ia32_paddusw ((__v4hi)__m1, (__v4hi)__m2);
333 static __inline __m64
334 _m_paddusw (__m64 __m1, __m64 __m2)
336 return _mm_adds_pu16 (__m1, __m2);
339 /* Subtract the 8-bit values in M2 from the 8-bit values in M1. */
340 static __inline __m64
341 _mm_sub_pi8 (__m64 __m1, __m64 __m2)
343 return (__m64) __builtin_ia32_psubb ((__v8qi)__m1, (__v8qi)__m2);
346 static __inline __m64
347 _m_psubb (__m64 __m1, __m64 __m2)
349 return _mm_sub_pi8 (__m1, __m2);
352 /* Subtract the 16-bit values in M2 from the 16-bit values in M1. */
353 static __inline __m64
354 _mm_sub_pi16 (__m64 __m1, __m64 __m2)
356 return (__m64) __builtin_ia32_psubw ((__v4hi)__m1, (__v4hi)__m2);
359 static __inline __m64
360 _m_psubw (__m64 __m1, __m64 __m2)
362 return _mm_sub_pi16 (__m1, __m2);
365 /* Subtract the 32-bit values in M2 from the 32-bit values in M1. */
366 static __inline __m64
367 _mm_sub_pi32 (__m64 __m1, __m64 __m2)
369 return (__m64) __builtin_ia32_psubd ((__v2si)__m1, (__v2si)__m2);
372 static __inline __m64
373 _m_psubd (__m64 __m1, __m64 __m2)
375 return _mm_sub_pi32 (__m1, __m2);
378 /* Add the 64-bit values in M1 to the 64-bit values in M2. */
379 static __inline __m64
380 _mm_sub_si64 (__m64 __m1, __m64 __m2)
382 return (__m64) __builtin_ia32_psubq ((long long)__m1, (long long)__m2);
385 /* Subtract the 8-bit values in M2 from the 8-bit values in M1 using signed
386 saturating arithmetic. */
387 static __inline __m64
388 _mm_subs_pi8 (__m64 __m1, __m64 __m2)
390 return (__m64) __builtin_ia32_psubsb ((__v8qi)__m1, (__v8qi)__m2);
393 static __inline __m64
394 _m_psubsb (__m64 __m1, __m64 __m2)
396 return _mm_subs_pi8 (__m1, __m2);
399 /* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
400 signed saturating arithmetic. */
401 static __inline __m64
402 _mm_subs_pi16 (__m64 __m1, __m64 __m2)
404 return (__m64) __builtin_ia32_psubsw ((__v4hi)__m1, (__v4hi)__m2);
407 static __inline __m64
408 _m_psubsw (__m64 __m1, __m64 __m2)
410 return _mm_subs_pi16 (__m1, __m2);
413 /* Subtract the 8-bit values in M2 from the 8-bit values in M1 using
414 unsigned saturating arithmetic. */
415 static __inline __m64
416 _mm_subs_pu8 (__m64 __m1, __m64 __m2)
418 return (__m64) __builtin_ia32_psubusb ((__v8qi)__m1, (__v8qi)__m2);
421 static __inline __m64
422 _m_psubusb (__m64 __m1, __m64 __m2)
424 return _mm_subs_pu8 (__m1, __m2);
427 /* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
428 unsigned saturating arithmetic. */
429 static __inline __m64
430 _mm_subs_pu16 (__m64 __m1, __m64 __m2)
432 return (__m64) __builtin_ia32_psubusw ((__v4hi)__m1, (__v4hi)__m2);
435 static __inline __m64
436 _m_psubusw (__m64 __m1, __m64 __m2)
438 return _mm_subs_pu16 (__m1, __m2);
441 /* Multiply four 16-bit values in M1 by four 16-bit values in M2 producing
442 four 32-bit intermediate results, which are then summed by pairs to
443 produce two 32-bit results. */
444 static __inline __m64
445 _mm_madd_pi16 (__m64 __m1, __m64 __m2)
447 return (__m64) __builtin_ia32_pmaddwd ((__v4hi)__m1, (__v4hi)__m2);
450 static __inline __m64
451 _m_pmaddwd (__m64 __m1, __m64 __m2)
453 return _mm_madd_pi16 (__m1, __m2);
456 /* Multiply four signed 16-bit values in M1 by four signed 16-bit values in
457 M2 and produce the high 16 bits of the 32-bit results. */
458 static __inline __m64
459 _mm_mulhi_pi16 (__m64 __m1, __m64 __m2)
461 return (__m64) __builtin_ia32_pmulhw ((__v4hi)__m1, (__v4hi)__m2);
464 static __inline __m64
465 _m_pmulhw (__m64 __m1, __m64 __m2)
467 return _mm_mulhi_pi16 (__m1, __m2);
470 /* Multiply four 16-bit values in M1 by four 16-bit values in M2 and produce
471 the low 16 bits of the results. */
472 static __inline __m64
473 _mm_mullo_pi16 (__m64 __m1, __m64 __m2)
475 return (__m64) __builtin_ia32_pmullw ((__v4hi)__m1, (__v4hi)__m2);
478 static __inline __m64
479 _m_pmullw (__m64 __m1, __m64 __m2)
481 return _mm_mullo_pi16 (__m1, __m2);
484 /* Shift four 16-bit values in M left by COUNT. */
485 static __inline __m64
486 _mm_sll_pi16 (__m64 __m, __m64 __count)
488 return (__m64) __builtin_ia32_psllw ((__v4hi)__m, (long long)__count);
491 static __inline __m64
492 _m_psllw (__m64 __m, __m64 __count)
494 return _mm_sll_pi16 (__m, __count);
497 static __inline __m64
498 _mm_slli_pi16 (__m64 __m, int __count)
500 return (__m64) __builtin_ia32_psllw ((__v4hi)__m, __count);
503 static __inline __m64
504 _m_psllwi (__m64 __m, int __count)
506 return _mm_slli_pi16 (__m, __count);
509 /* Shift two 32-bit values in M left by COUNT. */
510 static __inline __m64
511 _mm_sll_pi32 (__m64 __m, __m64 __count)
513 return (__m64) __builtin_ia32_pslld ((__v2si)__m, (long long)__count);
516 static __inline __m64
517 _m_pslld (__m64 __m, __m64 __count)
519 return _mm_sll_pi32 (__m, __count);
522 static __inline __m64
523 _mm_slli_pi32 (__m64 __m, int __count)
525 return (__m64) __builtin_ia32_pslld ((__v2si)__m, __count);
528 static __inline __m64
529 _m_pslldi (__m64 __m, int __count)
531 return _mm_slli_pi32 (__m, __count);
534 /* Shift the 64-bit value in M left by COUNT. */
535 static __inline __m64
536 _mm_sll_si64 (__m64 __m, __m64 __count)
538 return (__m64) __builtin_ia32_psllq ((long long)__m, (long long)__count);
541 static __inline __m64
542 _m_psllq (__m64 __m, __m64 __count)
544 return _mm_sll_si64 (__m, __count);
547 static __inline __m64
548 _mm_slli_si64 (__m64 __m, int __count)
550 return (__m64) __builtin_ia32_psllq ((long long)__m, (long long)__count);
553 static __inline __m64
554 _m_psllqi (__m64 __m, int __count)
556 return _mm_slli_si64 (__m, __count);
559 /* Shift four 16-bit values in M right by COUNT; shift in the sign bit. */
560 static __inline __m64
561 _mm_sra_pi16 (__m64 __m, __m64 __count)
563 return (__m64) __builtin_ia32_psraw ((__v4hi)__m, (long long)__count);
566 static __inline __m64
567 _m_psraw (__m64 __m, __m64 __count)
569 return _mm_sra_pi16 (__m, __count);
572 static __inline __m64
573 _mm_srai_pi16 (__m64 __m, int __count)
575 return (__m64) __builtin_ia32_psraw ((__v4hi)__m, __count);
578 static __inline __m64
579 _m_psrawi (__m64 __m, int __count)
581 return _mm_srai_pi16 (__m, __count);
584 /* Shift two 32-bit values in M right by COUNT; shift in the sign bit. */
585 static __inline __m64
586 _mm_sra_pi32 (__m64 __m, __m64 __count)
588 return (__m64) __builtin_ia32_psrad ((__v2si)__m, (long long)__count);
591 static __inline __m64
592 _m_psrad (__m64 __m, __m64 __count)
594 return _mm_sra_pi32 (__m, __count);
597 static __inline __m64
598 _mm_srai_pi32 (__m64 __m, int __count)
600 return (__m64) __builtin_ia32_psrad ((__v2si)__m, __count);
603 static __inline __m64
604 _m_psradi (__m64 __m, int __count)
606 return _mm_srai_pi32 (__m, __count);
609 /* Shift four 16-bit values in M right by COUNT; shift in zeros. */
610 static __inline __m64
611 _mm_srl_pi16 (__m64 __m, __m64 __count)
613 return (__m64) __builtin_ia32_psrlw ((__v4hi)__m, (long long)__count);
616 static __inline __m64
617 _m_psrlw (__m64 __m, __m64 __count)
619 return _mm_srl_pi16 (__m, __count);
622 static __inline __m64
623 _mm_srli_pi16 (__m64 __m, int __count)
625 return (__m64) __builtin_ia32_psrlw ((__v4hi)__m, __count);
628 static __inline __m64
629 _m_psrlwi (__m64 __m, int __count)
631 return _mm_srli_pi16 (__m, __count);
634 /* Shift two 32-bit values in M right by COUNT; shift in zeros. */
635 static __inline __m64
636 _mm_srl_pi32 (__m64 __m, __m64 __count)
638 return (__m64) __builtin_ia32_psrld ((__v2si)__m, (long long)__count);
641 static __inline __m64
642 _m_psrld (__m64 __m, __m64 __count)
644 return _mm_srl_pi32 (__m, __count);
647 static __inline __m64
648 _mm_srli_pi32 (__m64 __m, int __count)
650 return (__m64) __builtin_ia32_psrld ((__v2si)__m, __count);
653 static __inline __m64
654 _m_psrldi (__m64 __m, int __count)
656 return _mm_srli_pi32 (__m, __count);
659 /* Shift the 64-bit value in M left by COUNT; shift in zeros. */
660 static __inline __m64
661 _mm_srl_si64 (__m64 __m, __m64 __count)
663 return (__m64) __builtin_ia32_psrlq ((long long)__m, (long long)__count);
666 static __inline __m64
667 _m_psrlq (__m64 __m, __m64 __count)
669 return _mm_srl_si64 (__m, __count);
672 static __inline __m64
673 _mm_srli_si64 (__m64 __m, int __count)
675 return (__m64) __builtin_ia32_psrlq ((long long)__m, (long long)__count);
678 static __inline __m64
679 _m_psrlqi (__m64 __m, int __count)
681 return _mm_srli_si64 (__m, __count);
684 /* Bit-wise AND the 64-bit values in M1 and M2. */
685 static __inline __m64
686 _mm_and_si64 (__m64 __m1, __m64 __m2)
688 return __builtin_ia32_pand (__m1, __m2);
691 static __inline __m64
692 _m_pand (__m64 __m1, __m64 __m2)
694 return _mm_and_si64 (__m1, __m2);
697 /* Bit-wise complement the 64-bit value in M1 and bit-wise AND it with the
698 64-bit value in M2. */
699 static __inline __m64
700 _mm_andnot_si64 (__m64 __m1, __m64 __m2)
702 return __builtin_ia32_pandn (__m1, __m2);
705 static __inline __m64
706 _m_pandn (__m64 __m1, __m64 __m2)
708 return _mm_andnot_si64 (__m1, __m2);
711 /* Bit-wise inclusive OR the 64-bit values in M1 and M2. */
712 static __inline __m64
713 _mm_or_si64 (__m64 __m1, __m64 __m2)
715 return __builtin_ia32_por (__m1, __m2);
718 static __inline __m64
719 _m_por (__m64 __m1, __m64 __m2)
721 return _mm_or_si64 (__m1, __m2);
724 /* Bit-wise exclusive OR the 64-bit values in M1 and M2. */
725 static __inline __m64
726 _mm_xor_si64 (__m64 __m1, __m64 __m2)
728 return __builtin_ia32_pxor (__m1, __m2);
731 static __inline __m64
732 _m_pxor (__m64 __m1, __m64 __m2)
734 return _mm_xor_si64 (__m1, __m2);
737 /* Compare eight 8-bit values. The result of the comparison is 0xFF if the
738 test is true and zero if false. */
739 static __inline __m64
740 _mm_cmpeq_pi8 (__m64 __m1, __m64 __m2)
742 return (__m64) __builtin_ia32_pcmpeqb ((__v8qi)__m1, (__v8qi)__m2);
745 static __inline __m64
746 _m_pcmpeqb (__m64 __m1, __m64 __m2)
748 return _mm_cmpeq_pi8 (__m1, __m2);
751 static __inline __m64
752 _mm_cmpgt_pi8 (__m64 __m1, __m64 __m2)
754 return (__m64) __builtin_ia32_pcmpgtb ((__v8qi)__m1, (__v8qi)__m2);
757 static __inline __m64
758 _m_pcmpgtb (__m64 __m1, __m64 __m2)
760 return _mm_cmpgt_pi8 (__m1, __m2);
763 /* Compare four 16-bit values. The result of the comparison is 0xFFFF if
764 the test is true and zero if false. */
765 static __inline __m64
766 _mm_cmpeq_pi16 (__m64 __m1, __m64 __m2)
768 return (__m64) __builtin_ia32_pcmpeqw ((__v4hi)__m1, (__v4hi)__m2);
771 static __inline __m64
772 _m_pcmpeqw (__m64 __m1, __m64 __m2)
774 return _mm_cmpeq_pi16 (__m1, __m2);
777 static __inline __m64
778 _mm_cmpgt_pi16 (__m64 __m1, __m64 __m2)
780 return (__m64) __builtin_ia32_pcmpgtw ((__v4hi)__m1, (__v4hi)__m2);
783 static __inline __m64
784 _m_pcmpgtw (__m64 __m1, __m64 __m2)
786 return _mm_cmpgt_pi16 (__m1, __m2);
789 /* Compare two 32-bit values. The result of the comparison is 0xFFFFFFFF if
790 the test is true and zero if false. */
791 static __inline __m64
792 _mm_cmpeq_pi32 (__m64 __m1, __m64 __m2)
794 return (__m64) __builtin_ia32_pcmpeqd ((__v2si)__m1, (__v2si)__m2);
797 static __inline __m64
798 _m_pcmpeqd (__m64 __m1, __m64 __m2)
800 return _mm_cmpeq_pi32 (__m1, __m2);
803 static __inline __m64
804 _mm_cmpgt_pi32 (__m64 __m1, __m64 __m2)
806 return (__m64) __builtin_ia32_pcmpgtd ((__v2si)__m1, (__v2si)__m2);
809 static __inline __m64
810 _m_pcmpgtd (__m64 __m1, __m64 __m2)
812 return _mm_cmpgt_pi32 (__m1, __m2);
815 /* Creates a 64-bit zero. */
816 static __inline __m64
817 _mm_setzero_si64 (void)
819 return (__m64)0LL;
822 /* Creates a vector of two 32-bit values; I0 is least significant. */
823 static __inline __m64
824 _mm_set_pi32 (int __i1, int __i0)
826 return (__m64) __builtin_ia32_vec_init_v2si (__i0, __i1);
829 /* Creates a vector of four 16-bit values; W0 is least significant. */
830 static __inline __m64
831 _mm_set_pi16 (short __w3, short __w2, short __w1, short __w0)
833 return (__m64) __builtin_ia32_vec_init_v4hi (__w0, __w1, __w2, __w3);
836 /* Creates a vector of eight 8-bit values; B0 is least significant. */
837 static __inline __m64
838 _mm_set_pi8 (char __b7, char __b6, char __b5, char __b4,
839 char __b3, char __b2, char __b1, char __b0)
841 return (__m64) __builtin_ia32_vec_init_v8qi (__b0, __b1, __b2, __b3,
842 __b4, __b5, __b6, __b7);
845 /* Similar, but with the arguments in reverse order. */
846 static __inline __m64
847 _mm_setr_pi32 (int __i0, int __i1)
849 return _mm_set_pi32 (__i1, __i0);
852 static __inline __m64
853 _mm_setr_pi16 (short __w0, short __w1, short __w2, short __w3)
855 return _mm_set_pi16 (__w3, __w2, __w1, __w0);
858 static __inline __m64
859 _mm_setr_pi8 (char __b0, char __b1, char __b2, char __b3,
860 char __b4, char __b5, char __b6, char __b7)
862 return _mm_set_pi8 (__b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0);
865 /* Creates a vector of two 32-bit values, both elements containing I. */
866 static __inline __m64
867 _mm_set1_pi32 (int __i)
869 return _mm_set_pi32 (__i, __i);
872 /* Creates a vector of four 16-bit values, all elements containing W. */
873 static __inline __m64
874 _mm_set1_pi16 (short __w)
876 return _mm_set_pi16 (__w, __w, __w, __w);
879 /* Creates a vector of eight 8-bit values, all elements containing B. */
880 static __inline __m64
881 _mm_set1_pi8 (char __b)
883 return _mm_set_pi8 (__b, __b, __b, __b, __b, __b, __b, __b);
886 #endif /* __MMX__ */
887 #endif /* _MMINTRIN_H_INCLUDED */