/* Copyright (C) 2002-2018 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */
/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 9.0.  */

#ifndef NO_WARN_X86_INTRINSICS
/* This header is distributed to simplify porting x86_64 code that
   makes explicit use of Intel intrinsics to powerpc64le.
   It is the user's responsibility to determine if the results are
   acceptable and make additional changes as necessary.
   Note that much code that uses Intel intrinsics can be rewritten in
   standard C or GNU C extensions, which are more portable and better
   optimized across multiple targets.

   In the specific case of X86 MMX (__m64) intrinsics, the PowerPC
   target does not support a native __vector_size__ (8) type.  Instead
   we typedef __m64 to a 64-bit unsigned long long, which is natively
   supported in 64-bit mode.  This works well for the _si64 and some
   _pi32 operations, but starts to generate long sequences for _pi16
   and _pi8 operations.  For those cases it is better (faster and
   smaller code) to transfer __m64 data to the PowerPC vector 128-bit
   unit, perform the operation, and then transfer the result back to
   the __m64 type.  This implies that the direct register move
   instructions, introduced with power8, are available for efficient
   implementation of these transfers.

   Most MMX intrinsic operations can be performed efficiently as
   C language 64-bit scalar operations or optimized to use the newer
   128-bit SSE/Altivec operations.  We recommend this for new
   applications.  */
#error "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error."
#endif
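
/* A minimal sketch of the rewrite recommended above (not part of the
   original header; the type and function name are illustrative only):
   the same packed-byte addition that _mm_add_pi8 provides, expressed with
   the GNU C vector extension at the full 128-bit Altivec/VSX width, which
   GCC can optimize directly for the target:

     typedef unsigned char uv16qi __attribute__ ((vector_size (16)));

     static inline uv16qi
     add_bytes (uv16qi a, uv16qi b)
     {
       return a + b;   // element-wise add, lowered to a single vector add
     }
 */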

#ifndef _MMINTRIN_H_INCLUDED
#define _MMINTRIN_H_INCLUDED

#include <altivec.h>
/* The Intel API is flexible enough that we must allow aliasing with other
   vector types, and their scalar components.  */
typedef __attribute__ ((__aligned__ (8))) unsigned long long __m64;

typedef __attribute__ ((__aligned__ (8)))
union
  {
    __m64 as_m64;
    char as_char[8];
    signed char as_signed_char [8];
    short as_short[4];
    int as_int[2];
    long long as_long_long;
    float as_float[2];
    double as_double;
  } __m64_union;
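
/* Illustrative note (assumes the little-endian powerpc64le layout this
   header targets, not part of the original header): as_char[0] of the
   union above is the least significant byte of as_m64, so union element 0
   corresponds to MMX element 0, e.g.

     __m64_union u;
     u.as_m64 = 0x0706050403020100ULL;
     // u.as_char[0] == 0x00 and u.as_short[3] == 0x0706
 */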

/* Empty the multimedia state.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_empty (void)
{
  /* nothing to do on PowerPC.  */
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_empty (void)
{
  /* nothing to do on PowerPC.  */
}

/* Convert I to a __m64 object.  The integer is zero-extended to 64-bits.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi32_si64 (int __i)
{
  return (__m64) (unsigned int) __i;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_from_int (int __i)
{
  return _mm_cvtsi32_si64 (__i);
}

/* Convert the lower 32 bits of the __m64 object into an integer.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64_si32 (__m64 __i)
{
  return ((int) __i);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_to_int (__m64 __i)
{
  return _mm_cvtsi64_si32 (__i);
}

/* Convert I to a __m64 object.  */

/* Intel intrinsic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_from_int64 (long long __i)
{
  return (__m64) __i;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64_m64 (long long __i)
{
  return (__m64) __i;
}

/* Microsoft intrinsic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64x_si64 (long long __i)
{
  return (__m64) __i;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pi64x (long long __i)
{
  return (__m64) __i;
}

/* Convert the __m64 object to a 64bit integer.  */

/* Intel intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_to_int64 (__m64 __i)
{
  return (long long)__i;
}

extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtm64_si64 (__m64 __i)
{
  return (long long) __i;
}

/* Microsoft intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64_si64x (__m64 __i)
{
  return (long long) __i;
}
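
/* Usage sketch (illustrative, not part of the original header): because
   __m64 is a 64-bit integer type here, the 64-bit conversions above are
   plain register moves, e.g.

     __m64 m = _mm_cvtsi64_m64 (0x0001000200030004LL);
     long long x = _mm_cvtm64_si64 (m);
     // x == 0x0001000200030004LL
 */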

#ifdef _ARCH_PWR8
/* Pack the four 16-bit values from M1 into the lower four 8-bit values of
   the result, and the four 16-bit values from M2 into the upper four 8-bit
   values of the result, all with signed saturation.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packs_pi16 (__m64 __m1, __m64 __m2)
{
  __vector signed short vm1;
  __vector signed char vresult;

  vm1 = (__vector signed short) (__vector unsigned long long) { __m2, __m1 };
  vresult = vec_vpkshss (vm1, vm1);
  return (__m64) ((__vector long long) vresult)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_packsswb (__m64 __m1, __m64 __m2)
{
  return _mm_packs_pi16 (__m1, __m2);
}

/* Pack the two 32-bit values from M1 into the lower two 16-bit values of
   the result, and the two 32-bit values from M2 into the upper two 16-bit
   values of the result, all with signed saturation.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packs_pi32 (__m64 __m1, __m64 __m2)
{
  __vector signed int vm1;
  __vector signed short vresult;

  vm1 = (__vector signed int) (__vector unsigned long long) { __m2, __m1 };
  vresult = vec_vpkswss (vm1, vm1);
  return (__m64) ((__vector long long) vresult)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_packssdw (__m64 __m1, __m64 __m2)
{
  return _mm_packs_pi32 (__m1, __m2);
}

/* Pack the four 16-bit values from M1 into the lower four 8-bit values of
   the result, and the four 16-bit values from M2 into the upper four 8-bit
   values of the result, all with unsigned saturation.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packs_pu16 (__m64 __m1, __m64 __m2)
{
  __vector signed short vm1;
  __vector unsigned char vresult;

  vm1 = (__vector signed short) (__vector unsigned long long) { __m2, __m1 };
  vresult = vec_vpkshus (vm1, vm1);
  return (__m64) ((__vector long long) vresult)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_packuswb (__m64 __m1, __m64 __m2)
{
  return _mm_packs_pu16 (__m1, __m2);
}
#endif /* end ARCH_PWR8 */
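
/* Worked example (illustrative, not part of the original header): with the
   element ordering described above (element 0 is least significant),
   _mm_packs_pi16 saturates each 16-bit lane to a signed 8-bit lane, __m1
   supplying the low four result bytes and __m2 the high four:

     __m64 a = _mm_set_pi16 (300, -300, 5, -5);
     __m64 b = _mm_set_pi16 (1, 2, 3, 4);
     __m64 r = _mm_packs_pi16 (a, b);
     // r == _mm_set_pi8 (1, 2, 3, 4, 127, -128, 5, -5)
 */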

/* Interleave the four 8-bit values from the high half of M1 with the four
   8-bit values from the high half of M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_pi8 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector unsigned char a, b, c;

  a = (__vector unsigned char)vec_splats (__m1);
  b = (__vector unsigned char)vec_splats (__m2);
  c = vec_mergel (a, b);
  return (__m64) ((__vector long long) c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_char[0] = m1.as_char[4];
  res.as_char[1] = m2.as_char[4];
  res.as_char[2] = m1.as_char[5];
  res.as_char[3] = m2.as_char[5];
  res.as_char[4] = m1.as_char[6];
  res.as_char[5] = m2.as_char[6];
  res.as_char[6] = m1.as_char[7];
  res.as_char[7] = m2.as_char[7];

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpckhbw (__m64 __m1, __m64 __m2)
{
  return _mm_unpackhi_pi8 (__m1, __m2);
}

/* Interleave the two 16-bit values from the high half of M1 with the two
   16-bit values from the high half of M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_pi16 (__m64 __m1, __m64 __m2)
{
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_short[0] = m1.as_short[2];
  res.as_short[1] = m2.as_short[2];
  res.as_short[2] = m1.as_short[3];
  res.as_short[3] = m2.as_short[3];

  return (__m64) res.as_m64;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpckhwd (__m64 __m1, __m64 __m2)
{
  return _mm_unpackhi_pi16 (__m1, __m2);
}

/* Interleave the 32-bit value from the high half of M1 with the 32-bit
   value from the high half of M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_pi32 (__m64 __m1, __m64 __m2)
{
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_int[0] = m1.as_int[1];
  res.as_int[1] = m2.as_int[1];

  return (__m64) res.as_m64;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpckhdq (__m64 __m1, __m64 __m2)
{
  return _mm_unpackhi_pi32 (__m1, __m2);
}

/* Interleave the four 8-bit values from the low half of M1 with the four
   8-bit values from the low half of M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_pi8 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector unsigned char a, b, c;

  a = (__vector unsigned char)vec_splats (__m1);
  b = (__vector unsigned char)vec_splats (__m2);
  c = vec_mergel (a, b);
  return (__m64) ((__vector long long) c)[1];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_char[0] = m1.as_char[0];
  res.as_char[1] = m2.as_char[0];
  res.as_char[2] = m1.as_char[1];
  res.as_char[3] = m2.as_char[1];
  res.as_char[4] = m1.as_char[2];
  res.as_char[5] = m2.as_char[2];
  res.as_char[6] = m1.as_char[3];
  res.as_char[7] = m2.as_char[3];

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpcklbw (__m64 __m1, __m64 __m2)
{
  return _mm_unpacklo_pi8 (__m1, __m2);
}

/* Interleave the two 16-bit values from the low half of M1 with the two
   16-bit values from the low half of M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_pi16 (__m64 __m1, __m64 __m2)
{
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_short[0] = m1.as_short[0];
  res.as_short[1] = m2.as_short[0];
  res.as_short[2] = m1.as_short[1];
  res.as_short[3] = m2.as_short[1];

  return (__m64) res.as_m64;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpcklwd (__m64 __m1, __m64 __m2)
{
  return _mm_unpacklo_pi16 (__m1, __m2);
}

/* Interleave the 32-bit value from the low half of M1 with the 32-bit
   value from the low half of M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_pi32 (__m64 __m1, __m64 __m2)
{
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_int[0] = m1.as_int[0];
  res.as_int[1] = m2.as_int[0];

  return (__m64) res.as_m64;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpckldq (__m64 __m1, __m64 __m2)
{
  return _mm_unpacklo_pi32 (__m1, __m2);
}

/* Add the 8-bit values in M1 to the 8-bit values in M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_pi8 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector signed char a, b, c;

  a = (__vector signed char)vec_splats (__m1);
  b = (__vector signed char)vec_splats (__m2);
  c = vec_add (a, b);
  return (__m64) ((__vector long long) c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_char[0] = m1.as_char[0] + m2.as_char[0];
  res.as_char[1] = m1.as_char[1] + m2.as_char[1];
  res.as_char[2] = m1.as_char[2] + m2.as_char[2];
  res.as_char[3] = m1.as_char[3] + m2.as_char[3];
  res.as_char[4] = m1.as_char[4] + m2.as_char[4];
  res.as_char[5] = m1.as_char[5] + m2.as_char[5];
  res.as_char[6] = m1.as_char[6] + m2.as_char[6];
  res.as_char[7] = m1.as_char[7] + m2.as_char[7];

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddb (__m64 __m1, __m64 __m2)
{
  return _mm_add_pi8 (__m1, __m2);
}

/* Add the 16-bit values in M1 to the 16-bit values in M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_pi16 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector signed short a, b, c;

  a = (__vector signed short)vec_splats (__m1);
  b = (__vector signed short)vec_splats (__m2);
  c = vec_add (a, b);
  return (__m64) ((__vector long long) c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_short[0] = m1.as_short[0] + m2.as_short[0];
  res.as_short[1] = m1.as_short[1] + m2.as_short[1];
  res.as_short[2] = m1.as_short[2] + m2.as_short[2];
  res.as_short[3] = m1.as_short[3] + m2.as_short[3];

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddw (__m64 __m1, __m64 __m2)
{
  return _mm_add_pi16 (__m1, __m2);
}

/* Add the 32-bit values in M1 to the 32-bit values in M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_pi32 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR9
  __vector signed int a, b, c;

  a = (__vector signed int)vec_splats (__m1);
  b = (__vector signed int)vec_splats (__m2);
  c = vec_add (a, b);
  return (__m64) ((__vector long long) c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_int[0] = m1.as_int[0] + m2.as_int[0];
  res.as_int[1] = m1.as_int[1] + m2.as_int[1];

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddd (__m64 __m1, __m64 __m2)
{
  return _mm_add_pi32 (__m1, __m2);
}

/* Subtract the 8-bit values in M2 from the 8-bit values in M1.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_pi8 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector signed char a, b, c;

  a = (__vector signed char)vec_splats (__m1);
  b = (__vector signed char)vec_splats (__m2);
  c = vec_sub (a, b);
  return (__m64) ((__vector long long) c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_char[0] = m1.as_char[0] - m2.as_char[0];
  res.as_char[1] = m1.as_char[1] - m2.as_char[1];
  res.as_char[2] = m1.as_char[2] - m2.as_char[2];
  res.as_char[3] = m1.as_char[3] - m2.as_char[3];
  res.as_char[4] = m1.as_char[4] - m2.as_char[4];
  res.as_char[5] = m1.as_char[5] - m2.as_char[5];
  res.as_char[6] = m1.as_char[6] - m2.as_char[6];
  res.as_char[7] = m1.as_char[7] - m2.as_char[7];

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubb (__m64 __m1, __m64 __m2)
{
  return _mm_sub_pi8 (__m1, __m2);
}

/* Subtract the 16-bit values in M2 from the 16-bit values in M1.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_pi16 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector signed short a, b, c;

  a = (__vector signed short)vec_splats (__m1);
  b = (__vector signed short)vec_splats (__m2);
  c = vec_sub (a, b);
  return (__m64) ((__vector long long) c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_short[0] = m1.as_short[0] - m2.as_short[0];
  res.as_short[1] = m1.as_short[1] - m2.as_short[1];
  res.as_short[2] = m1.as_short[2] - m2.as_short[2];
  res.as_short[3] = m1.as_short[3] - m2.as_short[3];

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubw (__m64 __m1, __m64 __m2)
{
  return _mm_sub_pi16 (__m1, __m2);
}

/* Subtract the 32-bit values in M2 from the 32-bit values in M1.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_pi32 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR9
  __vector signed int a, b, c;

  a = (__vector signed int)vec_splats (__m1);
  b = (__vector signed int)vec_splats (__m2);
  c = vec_sub (a, b);
  return (__m64) ((__vector long long) c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_int[0] = m1.as_int[0] - m2.as_int[0];
  res.as_int[1] = m1.as_int[1] - m2.as_int[1];

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubd (__m64 __m1, __m64 __m2)
{
  return _mm_sub_pi32 (__m1, __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_si64 (__m64 __m1, __m64 __m2)
{
  return (__m1 + __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_si64 (__m64 __m1, __m64 __m2)
{
  return (__m1 - __m2);
}

/* Shift the 64-bit value in M left by COUNT.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sll_si64 (__m64 __m, __m64 __count)
{
  return (__m << __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psllq (__m64 __m, __m64 __count)
{
  return _mm_sll_si64 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_si64 (__m64 __m, const int __count)
{
  return (__m << __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psllqi (__m64 __m, const int __count)
{
  return _mm_slli_si64 (__m, __count);
}

/* Shift the 64-bit value in M right by COUNT; shift in zeros.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srl_si64 (__m64 __m, __m64 __count)
{
  return (__m >> __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrlq (__m64 __m, __m64 __count)
{
  return _mm_srl_si64 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_si64 (__m64 __m, const int __count)
{
  return (__m >> __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrlqi (__m64 __m, const int __count)
{
  return _mm_srli_si64 (__m, __count);
}
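
/* Usage sketch (illustrative, not part of the original header): the 64-bit
   shifts above are ordinary scalar shifts of the underlying unsigned
   long long, e.g.

     __m64 m = _mm_cvtsi64_m64 (0xFFLL);
     __m64 r = _mm_slli_si64 (m, 8);
     // r holds 0xFF00
 */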

/* Bit-wise AND the 64-bit values in M1 and M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_and_si64 (__m64 __m1, __m64 __m2)
{
  return (__m1 & __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pand (__m64 __m1, __m64 __m2)
{
  return _mm_and_si64 (__m1, __m2);
}

/* Bit-wise complement the 64-bit value in M1 and bit-wise AND it with the
   64-bit value in M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_andnot_si64 (__m64 __m1, __m64 __m2)
{
  return (~__m1 & __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pandn (__m64 __m1, __m64 __m2)
{
  return _mm_andnot_si64 (__m1, __m2);
}

/* Bit-wise inclusive OR the 64-bit values in M1 and M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_or_si64 (__m64 __m1, __m64 __m2)
{
  return (__m1 | __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_por (__m64 __m1, __m64 __m2)
{
  return _mm_or_si64 (__m1, __m2);
}

/* Bit-wise exclusive OR the 64-bit values in M1 and M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_xor_si64 (__m64 __m1, __m64 __m2)
{
  return (__m1 ^ __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pxor (__m64 __m1, __m64 __m2)
{
  return _mm_xor_si64 (__m1, __m2);
}

/* Creates a 64-bit zero.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setzero_si64 (void)
{
  return (__m64) 0;
}

/* Compare eight 8-bit values.  The result of the comparison is 0xFF if the
   test is true and zero if false.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_pi8 (__m64 __m1, __m64 __m2)
{
#ifdef _ARCH_PWR6
  __m64 res;
  __asm__(
      "cmpb %0,%1,%2;\n"
      : "=r" (res)
      : "r" (__m1),
        "r" (__m2)
      : );
  return (res);
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_char[0] = (m1.as_char[0] == m2.as_char[0])? -1: 0;
  res.as_char[1] = (m1.as_char[1] == m2.as_char[1])? -1: 0;
  res.as_char[2] = (m1.as_char[2] == m2.as_char[2])? -1: 0;
  res.as_char[3] = (m1.as_char[3] == m2.as_char[3])? -1: 0;
  res.as_char[4] = (m1.as_char[4] == m2.as_char[4])? -1: 0;
  res.as_char[5] = (m1.as_char[5] == m2.as_char[5])? -1: 0;
  res.as_char[6] = (m1.as_char[6] == m2.as_char[6])? -1: 0;
  res.as_char[7] = (m1.as_char[7] == m2.as_char[7])? -1: 0;

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpeqb (__m64 __m1, __m64 __m2)
{
  return _mm_cmpeq_pi8 (__m1, __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_pi8 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector signed char a, b, c;

  a = (__vector signed char)vec_splats (__m1);
  b = (__vector signed char)vec_splats (__m2);
  c = (__vector signed char)vec_cmpgt (a, b);
  return (__m64) ((__vector long long) c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_char[0] = (m1.as_char[0] > m2.as_char[0])? -1: 0;
  res.as_char[1] = (m1.as_char[1] > m2.as_char[1])? -1: 0;
  res.as_char[2] = (m1.as_char[2] > m2.as_char[2])? -1: 0;
  res.as_char[3] = (m1.as_char[3] > m2.as_char[3])? -1: 0;
  res.as_char[4] = (m1.as_char[4] > m2.as_char[4])? -1: 0;
  res.as_char[5] = (m1.as_char[5] > m2.as_char[5])? -1: 0;
  res.as_char[6] = (m1.as_char[6] > m2.as_char[6])? -1: 0;
  res.as_char[7] = (m1.as_char[7] > m2.as_char[7])? -1: 0;

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpgtb (__m64 __m1, __m64 __m2)
{
  return _mm_cmpgt_pi8 (__m1, __m2);
}

/* Compare four 16-bit values.  The result of the comparison is 0xFFFF if
   the test is true and zero if false.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_pi16 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector signed short a, b, c;

  a = (__vector signed short)vec_splats (__m1);
  b = (__vector signed short)vec_splats (__m2);
  c = (__vector signed short)vec_cmpeq (a, b);
  return (__m64) ((__vector long long) c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_short[0] = (m1.as_short[0] == m2.as_short[0])? -1: 0;
  res.as_short[1] = (m1.as_short[1] == m2.as_short[1])? -1: 0;
  res.as_short[2] = (m1.as_short[2] == m2.as_short[2])? -1: 0;
  res.as_short[3] = (m1.as_short[3] == m2.as_short[3])? -1: 0;

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpeqw (__m64 __m1, __m64 __m2)
{
  return _mm_cmpeq_pi16 (__m1, __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_pi16 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector signed short a, b, c;

  a = (__vector signed short)vec_splats (__m1);
  b = (__vector signed short)vec_splats (__m2);
  c = (__vector signed short)vec_cmpgt (a, b);
  return (__m64) ((__vector long long) c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_short[0] = (m1.as_short[0] > m2.as_short[0])? -1: 0;
  res.as_short[1] = (m1.as_short[1] > m2.as_short[1])? -1: 0;
  res.as_short[2] = (m1.as_short[2] > m2.as_short[2])? -1: 0;
  res.as_short[3] = (m1.as_short[3] > m2.as_short[3])? -1: 0;

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpgtw (__m64 __m1, __m64 __m2)
{
  return _mm_cmpgt_pi16 (__m1, __m2);
}

/* Compare two 32-bit values.  The result of the comparison is 0xFFFFFFFF if
   the test is true and zero if false.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_pi32 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR9
  __vector signed int a, b, c;

  a = (__vector signed int)vec_splats (__m1);
  b = (__vector signed int)vec_splats (__m2);
  c = (__vector signed int)vec_cmpeq (a, b);
  return (__m64) ((__vector long long) c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_int[0] = (m1.as_int[0] == m2.as_int[0])? -1: 0;
  res.as_int[1] = (m1.as_int[1] == m2.as_int[1])? -1: 0;

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpeqd (__m64 __m1, __m64 __m2)
{
  return _mm_cmpeq_pi32 (__m1, __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_pi32 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR9
  __vector signed int a, b, c;

  a = (__vector signed int)vec_splats (__m1);
  b = (__vector signed int)vec_splats (__m2);
  c = (__vector signed int)vec_cmpgt (a, b);
  return (__m64) ((__vector long long) c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_int[0] = (m1.as_int[0] > m2.as_int[0])? -1: 0;
  res.as_int[1] = (m1.as_int[1] > m2.as_int[1])? -1: 0;

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpgtd (__m64 __m1, __m64 __m2)
{
  return _mm_cmpgt_pi32 (__m1, __m2);
}
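
/* Worked example (illustrative, not part of the original header): each lane
   of a compare result is all ones when the test is true and zero otherwise:

     __m64 a = _mm_set_pi8 (1, 2, 3, 4, 5, 6, 7, 8);
     __m64 b = _mm_set_pi8 (1, 0, 3, 0, 5, 0, 7, 0);
     __m64 r = _mm_cmpeq_pi8 (a, b);
     // r == _mm_set_pi8 (-1, 0, -1, 0, -1, 0, -1, 0)
 */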

#if _ARCH_PWR8
/* Add the 8-bit values in M1 to the 8-bit values in M2 using signed
   saturated arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_pi8 (__m64 __m1, __m64 __m2)
{
  __vector signed char a, b, c;

  a = (__vector signed char)vec_splats (__m1);
  b = (__vector signed char)vec_splats (__m2);
  c = vec_adds (a, b);
  return (__m64) ((__vector long long) c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddsb (__m64 __m1, __m64 __m2)
{
  return _mm_adds_pi8 (__m1, __m2);
}

/* Add the 16-bit values in M1 to the 16-bit values in M2 using signed
   saturated arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_pi16 (__m64 __m1, __m64 __m2)
{
  __vector signed short a, b, c;

  a = (__vector signed short)vec_splats (__m1);
  b = (__vector signed short)vec_splats (__m2);
  c = vec_adds (a, b);
  return (__m64) ((__vector long long) c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddsw (__m64 __m1, __m64 __m2)
{
  return _mm_adds_pi16 (__m1, __m2);
}

/* Add the 8-bit values in M1 to the 8-bit values in M2 using unsigned
   saturated arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_pu8 (__m64 __m1, __m64 __m2)
{
  __vector unsigned char a, b, c;

  a = (__vector unsigned char)vec_splats (__m1);
  b = (__vector unsigned char)vec_splats (__m2);
  c = vec_adds (a, b);
  return (__m64) ((__vector long long) c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddusb (__m64 __m1, __m64 __m2)
{
  return _mm_adds_pu8 (__m1, __m2);
}

/* Add the 16-bit values in M1 to the 16-bit values in M2 using unsigned
   saturated arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_pu16 (__m64 __m1, __m64 __m2)
{
  __vector unsigned short a, b, c;

  a = (__vector unsigned short)vec_splats (__m1);
  b = (__vector unsigned short)vec_splats (__m2);
  c = vec_adds (a, b);
  return (__m64) ((__vector long long) c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddusw (__m64 __m1, __m64 __m2)
{
  return _mm_adds_pu16 (__m1, __m2);
}

/* Subtract the 8-bit values in M2 from the 8-bit values in M1 using signed
   saturating arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_pi8 (__m64 __m1, __m64 __m2)
{
  __vector signed char a, b, c;

  a = (__vector signed char)vec_splats (__m1);
  b = (__vector signed char)vec_splats (__m2);
  c = vec_subs (a, b);
  return (__m64) ((__vector long long) c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubsb (__m64 __m1, __m64 __m2)
{
  return _mm_subs_pi8 (__m1, __m2);
}

/* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
   signed saturating arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_pi16 (__m64 __m1, __m64 __m2)
{
  __vector signed short a, b, c;

  a = (__vector signed short)vec_splats (__m1);
  b = (__vector signed short)vec_splats (__m2);
  c = vec_subs (a, b);
  return (__m64) ((__vector long long) c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubsw (__m64 __m1, __m64 __m2)
{
  return _mm_subs_pi16 (__m1, __m2);
}

/* Subtract the 8-bit values in M2 from the 8-bit values in M1 using
   unsigned saturating arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_pu8 (__m64 __m1, __m64 __m2)
{
  __vector unsigned char a, b, c;

  a = (__vector unsigned char)vec_splats (__m1);
  b = (__vector unsigned char)vec_splats (__m2);
  c = vec_subs (a, b);
  return (__m64) ((__vector long long) c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubusb (__m64 __m1, __m64 __m2)
{
  return _mm_subs_pu8 (__m1, __m2);
}

/* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
   unsigned saturating arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_pu16 (__m64 __m1, __m64 __m2)
{
  __vector unsigned short a, b, c;

  a = (__vector unsigned short)vec_splats (__m1);
  b = (__vector unsigned short)vec_splats (__m2);
  c = vec_subs (a, b);
  return (__m64) ((__vector long long) c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubusw (__m64 __m1, __m64 __m2)
{
  return _mm_subs_pu16 (__m1, __m2);
}

/* Multiply four 16-bit values in M1 by four 16-bit values in M2 producing
   four 32-bit intermediate results, which are then summed by pairs to
   produce two 32-bit results.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_madd_pi16 (__m64 __m1, __m64 __m2)
{
  __vector signed short a, b;
  __vector signed int c;
  __vector signed int zero = {0, 0, 0, 0};

  a = (__vector signed short)vec_splats (__m1);
  b = (__vector signed short)vec_splats (__m2);
  c = vec_vmsumshm (a, b, zero);
  return (__m64) ((__vector long long) c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmaddwd (__m64 __m1, __m64 __m2)
{
  return _mm_madd_pi16 (__m1, __m2);
}
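
/* Worked example (illustrative, not part of the original header): with
   element 0 least significant, the pairwise multiply-add computes

     __m64 a = _mm_set_pi16 (4, 3, 2, 1);
     __m64 b = _mm_set_pi16 (8, 7, 6, 5);
     __m64 r = _mm_madd_pi16 (a, b);
     // r == _mm_set_pi32 (4*8 + 3*7, 2*6 + 1*5) == _mm_set_pi32 (53, 17)
 */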

/* Multiply four signed 16-bit values in M1 by four signed 16-bit values in
   M2 and produce the high 16 bits of the 32-bit results.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhi_pi16 (__m64 __m1, __m64 __m2)
{
  __vector signed short a, b;
  __vector signed short c;
  __vector signed int w0, w1;
  __vector unsigned char xform1 = {
      0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17,
      0x0A, 0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
    };

  a = (__vector signed short)vec_splats (__m1);
  b = (__vector signed short)vec_splats (__m2);

  w0 = vec_vmulesh (a, b);
  w1 = vec_vmulosh (a, b);
  c = (__vector signed short)vec_perm (w0, w1, xform1);

  return (__m64) ((__vector long long) c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmulhw (__m64 __m1, __m64 __m2)
{
  return _mm_mulhi_pi16 (__m1, __m2);
}

/* Multiply four 16-bit values in M1 by four 16-bit values in M2 and produce
   the low 16 bits of the results.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mullo_pi16 (__m64 __m1, __m64 __m2)
{
  __vector signed short a, b, c;

  a = (__vector signed short)vec_splats (__m1);
  b = (__vector signed short)vec_splats (__m2);
  c = a * b;
  return (__m64) ((__vector long long) c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmullw (__m64 __m1, __m64 __m2)
{
  return _mm_mullo_pi16 (__m1, __m2);
}

/* Shift four 16-bit values in M left by COUNT.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sll_pi16 (__m64 __m, __m64 __count)
{
  __vector signed short m, r;
  __vector unsigned short c;

  if (__count <= 15)
    {
      m = (__vector signed short)vec_splats (__m);
      c = (__vector unsigned short)vec_splats ((unsigned short)__count);
      r = vec_sl (m, (__vector unsigned short)c);
      return (__m64) ((__vector long long) r)[0];
    }
  else
    return (0);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psllw (__m64 __m, __m64 __count)
{
  return _mm_sll_pi16 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_pi16 (__m64 __m, int __count)
{
  /* Promote int to long then invoke mm_sll_pi16.  */
  return _mm_sll_pi16 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psllwi (__m64 __m, int __count)
{
  return _mm_slli_pi16 (__m, __count);
}

/* Shift two 32-bit values in M left by COUNT.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sll_pi32 (__m64 __m, __m64 __count)
{
  __m64_union m, res;

  m.as_m64 = __m;

  res.as_int[0] = m.as_int[0] << __count;
  res.as_int[1] = m.as_int[1] << __count;
  return (res.as_m64);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pslld (__m64 __m, __m64 __count)
{
  return _mm_sll_pi32 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_pi32 (__m64 __m, int __count)
{
  /* Promote int to long then invoke mm_sll_pi32.  */
  return _mm_sll_pi32 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pslldi (__m64 __m, int __count)
{
  return _mm_slli_pi32 (__m, __count);
}

/* Shift four 16-bit values in M right by COUNT; shift in the sign bit.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sra_pi16 (__m64 __m, __m64 __count)
{
  __vector signed short m, r;
  __vector unsigned short c;

  if (__count <= 15)
    {
      m = (__vector signed short)vec_splats (__m);
      c = (__vector unsigned short)vec_splats ((unsigned short)__count);
      r = vec_sra (m, (__vector unsigned short)c);
      return (__m64) ((__vector long long) r)[0];
    }
  else
    return (0);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psraw (__m64 __m, __m64 __count)
{
  return _mm_sra_pi16 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srai_pi16 (__m64 __m, int __count)
{
  /* Promote int to long then invoke mm_sra_pi16.  */
  return _mm_sra_pi16 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrawi (__m64 __m, int __count)
{
  return _mm_srai_pi16 (__m, __count);
}

/* Shift two 32-bit values in M right by COUNT; shift in the sign bit.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sra_pi32 (__m64 __m, __m64 __count)
{
  __m64_union m, res;

  m.as_m64 = __m;

  res.as_int[0] = m.as_int[0] >> __count;
  res.as_int[1] = m.as_int[1] >> __count;
  return (res.as_m64);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrad (__m64 __m, __m64 __count)
{
  return _mm_sra_pi32 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srai_pi32 (__m64 __m, int __count)
{
  /* Promote int to long then invoke mm_sra_pi32.  */
  return _mm_sra_pi32 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psradi (__m64 __m, int __count)
{
  return _mm_srai_pi32 (__m, __count);
}

/* Shift four 16-bit values in M right by COUNT; shift in zeros.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srl_pi16 (__m64 __m, __m64 __count)
{
  __vector unsigned short m, r;
  __vector unsigned short c;

  if (__count <= 15)
    {
      m = (__vector unsigned short)vec_splats (__m);
      c = (__vector unsigned short)vec_splats ((unsigned short)__count);
      r = vec_sr (m, (__vector unsigned short)c);
      return (__m64) ((__vector long long) r)[0];
    }
  else
    return (0);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrlw (__m64 __m, __m64 __count)
{
  return _mm_srl_pi16 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_pi16 (__m64 __m, int __count)
{
  /* Promote int to long then invoke mm_srl_pi16.  */
  return _mm_srl_pi16 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrlwi (__m64 __m, int __count)
{
  return _mm_srli_pi16 (__m, __count);
}

/* Shift two 32-bit values in M right by COUNT; shift in zeros.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srl_pi32 (__m64 __m, __m64 __count)
{
  __m64_union m, res;

  m.as_m64 = __m;

  res.as_int[0] = (unsigned int)m.as_int[0] >> __count;
  res.as_int[1] = (unsigned int)m.as_int[1] >> __count;
  return (res.as_m64);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrld (__m64 __m, __m64 __count)
{
  return _mm_srl_pi32 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_pi32 (__m64 __m, int __count)
{
  /* Promote int to long then invoke mm_srl_pi32.  */
  return _mm_srl_pi32 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrldi (__m64 __m, int __count)
{
  return _mm_srli_pi32 (__m, __count);
}
#endif /* _ARCH_PWR8 */
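
/* Usage sketch (illustrative, not part of the original header; the packed
   shifts above are only available when _ARCH_PWR8 is defined): the count is
   a scalar applied to every lane, e.g.

     __m64 m = _mm_set_pi16 (8, 4, 2, 1);
     __m64 r = _mm_slli_pi16 (m, 1);
     // r == _mm_set_pi16 (16, 8, 4, 2)
 */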

/* Creates a vector of two 32-bit values; I0 is least significant.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pi32 (int __i1, int __i0)
{
  __m64_union res;

  res.as_int[0] = __i0;
  res.as_int[1] = __i1;
  return (res.as_m64);
}

/* Creates a vector of four 16-bit values; W0 is least significant.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pi16 (short __w3, short __w2, short __w1, short __w0)
{
  __m64_union res;

  res.as_short[0] = __w0;
  res.as_short[1] = __w1;
  res.as_short[2] = __w2;
  res.as_short[3] = __w3;
  return (res.as_m64);
}

/* Creates a vector of eight 8-bit values; B0 is least significant.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pi8 (char __b7, char __b6, char __b5, char __b4,
	     char __b3, char __b2, char __b1, char __b0)
{
  __m64_union res;

  res.as_char[0] = __b0;
  res.as_char[1] = __b1;
  res.as_char[2] = __b2;
  res.as_char[3] = __b3;
  res.as_char[4] = __b4;
  res.as_char[5] = __b5;
  res.as_char[6] = __b6;
  res.as_char[7] = __b7;
  return (res.as_m64);
}

/* Similar, but with the arguments in reverse order.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_pi32 (int __i0, int __i1)
{
  __m64_union res;

  res.as_int[0] = __i0;
  res.as_int[1] = __i1;
  return (res.as_m64);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_pi16 (short __w0, short __w1, short __w2, short __w3)
{
  return _mm_set_pi16 (__w3, __w2, __w1, __w0);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_pi8 (char __b0, char __b1, char __b2, char __b3,
	      char __b4, char __b5, char __b6, char __b7)
{
  return _mm_set_pi8 (__b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0);
}
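
/* Illustrative note (not part of the original header): _mm_set_* takes the
   most significant element first while _mm_setr_* takes the least
   significant first, so

     _mm_set_pi32 (1, 2) and _mm_setr_pi32 (2, 1)

   describe the same __m64 value, whose element 0 is 2.  */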

/* Creates a vector of two 32-bit values, both elements containing I.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_pi32 (int __i)
{
  __m64_union res;

  res.as_int[0] = __i;
  res.as_int[1] = __i;
  return (res.as_m64);
}

/* Creates a vector of four 16-bit values, all elements containing W.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_pi16 (short __w)
{
#if _ARCH_PWR9
  __vector signed short w;

  w = (__vector signed short)vec_splats (__w);
  return (__m64) ((__vector long long) w)[0];
#else
  __m64_union res;

  res.as_short[0] = __w;
  res.as_short[1] = __w;
  res.as_short[2] = __w;
  res.as_short[3] = __w;
  return (res.as_m64);
#endif
}

/* Creates a vector of eight 8-bit values, all elements containing B.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_pi8 (signed char __b)
{
#if _ARCH_PWR8
  __vector signed char b;

  b = (__vector signed char)vec_splats (__b);
  return (__m64) ((__vector long long) b)[0];
#else
  __m64_union res;

  res.as_char[0] = __b;
  res.as_char[1] = __b;
  res.as_char[2] = __b;
  res.as_char[3] = __b;
  res.as_char[4] = __b;
  res.as_char[5] = __b;
  res.as_char[6] = __b;
  res.as_char[7] = __b;
  return (res.as_m64);
#endif
}
#endif /* _MMINTRIN_H_INCLUDED */