[official-gcc.git] / gcc / config / rs6000 / emmintrin.h
1 /* Copyright (C) 2003-2018 Free Software Foundation, Inc.
3 This file is part of GCC.
5 GCC is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 3, or (at your option)
8 any later version.
10 GCC is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
15 Under Section 7 of GPL version 3, you are granted additional
16 permissions described in the GCC Runtime Library Exception, version
17 3.1, as published by the Free Software Foundation.
19 You should have received a copy of the GNU General Public License and
20 a copy of the GCC Runtime Library Exception along with this program;
21 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
22 <http://www.gnu.org/licenses/>. */
24 /* Implemented from the specification included in the Intel C++ Compiler
25 User Guide and Reference, version 9.0. */
27 #ifndef NO_WARN_X86_INTRINSICS
28 /* This header is distributed to simplify porting x86_64 code that
29 makes explicit use of Intel intrinsics to powerpc64le.
30 It is the user's responsibility to determine if the results are
31 acceptable and make additional changes as necessary.
32 Note that much code that uses Intel intrinsics can be rewritten in
33 standard C or GNU C extensions, which are more portable and better
34 optimized across multiple targets.
36 In the specific case of X86 SSE2 (__m128i, __m128d) intrinsics,
37 the PowerPC VMX/VSX ISA is a good match for vector double SIMD
 38 operations. However, scalar double operations in vector (XMM)
39 registers require the POWER8 VSX ISA (2.07) level. Also there are
40 important differences for data format and placement of double
41 scalars in the vector register.
 43 For PowerISA, scalar double is held in FPRs (the leftmost 64 bits
 44 of the low 32 VSRs), while X86_64 SSE2 uses the rightmost 64 bits
 45 of the XMM register. These differences require extra steps on POWER
 46 to match the SSE2 scalar double semantics.
48 Most SSE2 scalar double intrinsic operations can be performed more
49 efficiently as C language double scalar operations or optimized to
50 use vector SIMD operations. We recommend this for new applications.
 52 Another difference is the format and details of the X86_64 MXCSR vs
 53 the PowerISA FPSCR / VSCR registers. We recommend that applications
 54 replace direct access to the MXCSR with the more portable <fenv.h>
 55 POSIX APIs. */
56 #error "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error."
57 #endif
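/* For example, code that manipulates rounding modes or exception flags
   through the MXCSR (e.g. via _mm_getcsr / _mm_setcsr) can usually be
   expressed with the standard C99/POSIX floating-point environment
   interfaces instead.  A rough, illustrative sketch:

     #include <fenv.h>

     feclearexcept (FE_ALL_EXCEPT);    // instead of clearing MXCSR flag bits
     fesetround (FE_TOWARDZERO);       // instead of setting the MXCSR RC field
     if (fetestexcept (FE_OVERFLOW))   // instead of reading MXCSR status bits
       ;                               // handle the overflow as appropriate

   These interfaces are portable across x86_64 and powerpc64le, unlike the
   MXCSR bit layout.  */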
59 #ifndef EMMINTRIN_H_
60 #define EMMINTRIN_H_
62 #include <altivec.h>
63 #include <assert.h>
65 /* We need definitions from the SSE header files. */
66 #include <xmmintrin.h>
68 /* SSE2 */
69 typedef __vector double __v2df;
70 typedef __vector long long __v2di;
71 typedef __vector unsigned long long __v2du;
72 typedef __vector int __v4si;
73 typedef __vector unsigned int __v4su;
74 typedef __vector short __v8hi;
75 typedef __vector unsigned short __v8hu;
76 typedef __vector signed char __v16qi;
77 typedef __vector unsigned char __v16qu;
79 /* The Intel API is flexible enough that we must allow aliasing with other
80 vector types, and their scalar components. */
81 typedef long long __m128i __attribute__ ((__vector_size__ (16), __may_alias__));
82 typedef double __m128d __attribute__ ((__vector_size__ (16), __may_alias__));
84 /* Unaligned version of the same types. */
85 typedef long long __m128i_u __attribute__ ((__vector_size__ (16), __may_alias__, __aligned__ (1)));
86 typedef double __m128d_u __attribute__ ((__vector_size__ (16), __may_alias__, __aligned__ (1)));
88 /* Define two value permute mask. */
89 #define _MM_SHUFFLE2(x,y) (((x) << 1) | (y))
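/* For instance, _MM_SHUFFLE2 (1, 0) evaluates to 2 (binary 0b10); bit 0
   selects which element of the first operand lands in element [0] of the
   result, and bit 1 selects which element of the second operand lands in
   element [1].  Illustrative use with _mm_shuffle_pd (defined below):

     __m128d r = _mm_shuffle_pd (a, b, _MM_SHUFFLE2 (1, 0));
     // r[0] = a[0], r[1] = b[1]
*/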
91 /* Create a vector with element 0 as F and the rest zero. */
92 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
93 _mm_set_sd (double __F)
95 return __extension__ (__m128d){ __F, 0.0 };
98 /* Create a vector with both elements equal to F. */
99 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
100 _mm_set1_pd (double __F)
102 return __extension__ (__m128d){ __F, __F };
105 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
106 _mm_set_pd1 (double __F)
108 return _mm_set1_pd (__F);
111 /* Create a vector with the lower value X and upper value W. */
112 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
113 _mm_set_pd (double __W, double __X)
115 return __extension__ (__m128d){ __X, __W };
118 /* Create a vector with the lower value W and upper value X. */
119 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
120 _mm_setr_pd (double __W, double __X)
122 return __extension__ (__m128d){ __W, __X };
125 /* Create an undefined vector. */
126 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
127 _mm_undefined_pd (void)
129 __m128d __Y = __Y;
130 return __Y;
133 /* Create a vector of zeros. */
134 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
135 _mm_setzero_pd (void)
137 return (__m128d) vec_splats (0);
140 /* Sets the low DPFP value of A from the low value of B. */
141 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
142 _mm_move_sd (__m128d __A, __m128d __B)
144 __v2df result = (__v2df) __A;
145 result [0] = ((__v2df) __B)[0];
146 return (__m128d) result;
149 /* Load two DPFP values from P. The address must be 16-byte aligned. */
150 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
151 _mm_load_pd (double const *__P)
153 assert(((unsigned long)__P & 0xfUL) == 0UL);
154 return ((__m128d)vec_ld(0, (__v16qu*)__P));
157 /* Load two DPFP values from P. The address need not be 16-byte aligned. */
158 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
159 _mm_loadu_pd (double const *__P)
161 return (vec_vsx_ld(0, __P));
 164 /* Create a vector with both elements equal to *P. */
165 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
166 _mm_load1_pd (double const *__P)
168 return (vec_splats (*__P));
171 /* Create a vector with element 0 as *P and the rest zero. */
172 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
173 _mm_load_sd (double const *__P)
175 return _mm_set_sd (*__P);
178 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
179 _mm_load_pd1 (double const *__P)
181 return _mm_load1_pd (__P);
184 /* Load two DPFP values in reverse order. The address must be aligned. */
185 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
186 _mm_loadr_pd (double const *__P)
188 __v2df __tmp = _mm_load_pd (__P);
189 return (__m128d)vec_xxpermdi (__tmp, __tmp, 2);
192 /* Store two DPFP values. The address must be 16-byte aligned. */
193 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
194 _mm_store_pd (double *__P, __m128d __A)
196 assert(((unsigned long)__P & 0xfUL) == 0UL);
197 vec_st((__v16qu)__A, 0, (__v16qu*)__P);
200 /* Store two DPFP values. The address need not be 16-byte aligned. */
201 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
202 _mm_storeu_pd (double *__P, __m128d __A)
204 *(__m128d_u *)__P = __A;
207 /* Stores the lower DPFP value. */
208 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
209 _mm_store_sd (double *__P, __m128d __A)
211 *__P = ((__v2df)__A)[0];
214 extern __inline double __attribute__((__gnu_inline__, __always_inline__, __artificial__))
215 _mm_cvtsd_f64 (__m128d __A)
217 return ((__v2df)__A)[0];
220 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
221 _mm_storel_pd (double *__P, __m128d __A)
223 _mm_store_sd (__P, __A);
226 /* Stores the upper DPFP value. */
227 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
228 _mm_storeh_pd (double *__P, __m128d __A)
230 *__P = ((__v2df)__A)[1];
232 /* Store the lower DPFP value across two words.
233 The address must be 16-byte aligned. */
234 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
235 _mm_store1_pd (double *__P, __m128d __A)
237 _mm_store_pd (__P, vec_splat (__A, 0));
240 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
241 _mm_store_pd1 (double *__P, __m128d __A)
243 _mm_store1_pd (__P, __A);
246 /* Store two DPFP values in reverse order. The address must be aligned. */
247 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
248 _mm_storer_pd (double *__P, __m128d __A)
250 _mm_store_pd (__P, vec_xxpermdi (__A, __A, 2));
253 /* Intel intrinsic. */
254 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
255 _mm_cvtsi128_si64 (__m128i __A)
257 return ((__v2di)__A)[0];
260 /* Microsoft intrinsic. */
261 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
262 _mm_cvtsi128_si64x (__m128i __A)
264 return ((__v2di)__A)[0];
267 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
268 _mm_add_pd (__m128d __A, __m128d __B)
270 return (__m128d) ((__v2df)__A + (__v2df)__B);
273 /* Add the lower double-precision (64-bit) floating-point element in
274 a and b, store the result in the lower element of dst, and copy
275 the upper element from a to the upper element of dst. */
276 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
277 _mm_add_sd (__m128d __A, __m128d __B)
279 __A[0] = __A[0] + __B[0];
280 return (__A);
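/* Illustrative example with arbitrary values:

     __m128d a = _mm_set_pd (10.0, 1.0);   // a = { 1.0, 10.0 }
     __m128d b = _mm_set_pd (20.0, 2.0);   // b = { 2.0, 20.0 }
     __m128d r = _mm_add_sd (a, b);        // r = { 3.0, 10.0 }

   Only the low elements are added; the high element of the first operand
   passes through unchanged.  */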
283 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
284 _mm_sub_pd (__m128d __A, __m128d __B)
286 return (__m128d) ((__v2df)__A - (__v2df)__B);
289 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
290 _mm_sub_sd (__m128d __A, __m128d __B)
292 __A[0] = __A[0] - __B[0];
293 return (__A);
296 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
297 _mm_mul_pd (__m128d __A, __m128d __B)
299 return (__m128d) ((__v2df)__A * (__v2df)__B);
302 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
303 _mm_mul_sd (__m128d __A, __m128d __B)
305 __A[0] = __A[0] * __B[0];
306 return (__A);
309 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
310 _mm_div_pd (__m128d __A, __m128d __B)
312 return (__m128d) ((__v2df)__A / (__v2df)__B);
315 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
316 _mm_div_sd (__m128d __A, __m128d __B)
318 __A[0] = __A[0] / __B[0];
319 return (__A);
322 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
323 _mm_sqrt_pd (__m128d __A)
325 return (vec_sqrt (__A));
328 /* Return pair {sqrt (B[0]), A[1]}. */
329 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
330 _mm_sqrt_sd (__m128d __A, __m128d __B)
332 __v2df c;
333 c = vec_sqrt ((__v2df) _mm_set1_pd (__B[0]));
334 return (__m128d) _mm_setr_pd (c[0], __A[1]);
337 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
338 _mm_min_pd (__m128d __A, __m128d __B)
340 return (vec_min (__A, __B));
343 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
344 _mm_min_sd (__m128d __A, __m128d __B)
346 __v2df a, b, c;
347 a = vec_splats (__A[0]);
348 b = vec_splats (__B[0]);
349 c = vec_min (a, b);
350 return (__m128d) _mm_setr_pd (c[0], __A[1]);
353 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
354 _mm_max_pd (__m128d __A, __m128d __B)
356 return (vec_max (__A, __B));
359 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
360 _mm_max_sd (__m128d __A, __m128d __B)
362 __v2df a, b, c;
363 a = vec_splats (__A[0]);
364 b = vec_splats (__B[0]);
365 c = vec_max (a, b);
366 return (__m128d) _mm_setr_pd (c[0], __A[1]);
369 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
370 _mm_cmpeq_pd (__m128d __A, __m128d __B)
372 return ((__m128d)vec_cmpeq ((__v2df) __A, (__v2df) __B));
375 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
376 _mm_cmplt_pd (__m128d __A, __m128d __B)
378 return ((__m128d)vec_cmplt ((__v2df) __A, (__v2df) __B));
381 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
382 _mm_cmple_pd (__m128d __A, __m128d __B)
384 return ((__m128d)vec_cmple ((__v2df) __A, (__v2df) __B));
387 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
388 _mm_cmpgt_pd (__m128d __A, __m128d __B)
390 return ((__m128d)vec_cmpgt ((__v2df) __A, (__v2df) __B));
393 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
394 _mm_cmpge_pd (__m128d __A, __m128d __B)
396 return ((__m128d)vec_cmpge ((__v2df) __A,(__v2df) __B));
399 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
400 _mm_cmpneq_pd (__m128d __A, __m128d __B)
402 __v2df temp = (__v2df) vec_cmpeq ((__v2df) __A, (__v2df)__B);
403 return ((__m128d)vec_nor (temp, temp));
406 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
407 _mm_cmpnlt_pd (__m128d __A, __m128d __B)
409 return ((__m128d)vec_cmpge ((__v2df) __A, (__v2df) __B));
412 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
413 _mm_cmpnle_pd (__m128d __A, __m128d __B)
415 return ((__m128d)vec_cmpgt ((__v2df) __A, (__v2df) __B));
418 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
419 _mm_cmpngt_pd (__m128d __A, __m128d __B)
421 return ((__m128d)vec_cmple ((__v2df) __A, (__v2df) __B));
424 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
425 _mm_cmpnge_pd (__m128d __A, __m128d __B)
427 return ((__m128d)vec_cmplt ((__v2df) __A, (__v2df) __B));
430 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
431 _mm_cmpord_pd (__m128d __A, __m128d __B)
433 #if _ARCH_PWR8
434 __v2du c, d;
 435 /* Comparing against self will return false (0's) if NAN. */
436 c = (__v2du)vec_cmpeq (__A, __A);
437 d = (__v2du)vec_cmpeq (__B, __B);
438 #else
439 __v2du a, b;
440 __v2du c, d;
441 const __v2du double_exp_mask = {0x7ff0000000000000, 0x7ff0000000000000};
442 a = (__v2du)vec_abs ((__v2df)__A);
443 b = (__v2du)vec_abs ((__v2df)__B);
444 c = (__v2du)vec_cmpgt (double_exp_mask, a);
445 d = (__v2du)vec_cmpgt (double_exp_mask, b);
446 #endif
447 /* A != NAN and B != NAN. */
448 return ((__m128d)vec_and(c, d));
451 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
452 _mm_cmpunord_pd (__m128d __A, __m128d __B)
454 #if _ARCH_PWR8
455 __v2du c, d;
 456 /* Comparing against self will return false (0's) if NAN. */
457 c = (__v2du)vec_cmpeq ((__v2df)__A, (__v2df)__A);
458 d = (__v2du)vec_cmpeq ((__v2df)__B, (__v2df)__B);
 459 /* A == NAN OR B == NAN converts to:
 460 NOT(A != NAN) OR NOT(B != NAN). */
461 c = vec_nor (c, c);
462 return ((__m128d)vec_orc(c, d));
463 #else
464 __v2du c, d;
 466 /* Comparing against self will return false (0's) if NAN. */
466 c = (__v2du)vec_cmpeq ((__v2df)__A, (__v2df)__A);
467 d = (__v2du)vec_cmpeq ((__v2df)__B, (__v2df)__B);
 468 /* Flip so that true ('1's) indicates NAN. */
469 c = vec_nor (c, c);
470 d = vec_nor (d, d);
471 return ((__m128d)vec_or(c, d));
472 #endif
475 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
476 _mm_cmpeq_sd(__m128d __A, __m128d __B)
478 __v2df a, b, c;
 479 /* PowerISA VSX does not allow partial (for just the lower double)
 480 results. So to ensure we don't generate spurious exceptions
 481 (from the upper double values) we splat the lower double
 482 before we do the operation. */
483 a = vec_splats (__A[0]);
484 b = vec_splats (__B[0]);
485 c = (__v2df) vec_cmpeq(a, b);
486 /* Then we merge the lower double result with the original upper
487 double from __A. */
488 return (__m128d) _mm_setr_pd (c[0], __A[1]);
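/* Illustrative consequence of the splat-before-compare approach (using
   hypothetical values): even if the upper lanes hold signaling NaNs, only
   the splatted low doubles reach vec_cmpeq, so the upper lanes cannot
   raise a spurious invalid-operation exception:

     __m128d a = _mm_set_pd (__builtin_nans (""), 1.0);  // upper lane = SNaN
     __m128d b = _mm_set_pd (__builtin_nans (""), 2.0);
     __m128d r = _mm_cmpeq_sd (a, b);   // compares only 1.0 == 2.0
*/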
491 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
492 _mm_cmplt_sd (__m128d __A, __m128d __B)
494 __v2df a, b, c;
495 a = vec_splats (__A[0]);
496 b = vec_splats (__B[0]);
497 c = (__v2df) vec_cmplt(a, b);
498 return (__m128d) _mm_setr_pd (c[0], __A[1]);
501 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
502 _mm_cmple_sd (__m128d __A, __m128d __B)
504 __v2df a, b, c;
505 a = vec_splats (__A[0]);
506 b = vec_splats (__B[0]);
507 c = (__v2df) vec_cmple(a, b);
508 return (__m128d) _mm_setr_pd (c[0], __A[1]);
511 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
512 _mm_cmpgt_sd (__m128d __A, __m128d __B)
514 __v2df a, b, c;
515 a = vec_splats (__A[0]);
516 b = vec_splats (__B[0]);
517 c = (__v2df) vec_cmpgt(a, b);
518 return (__m128d) _mm_setr_pd (c[0], __A[1]);
521 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
522 _mm_cmpge_sd (__m128d __A, __m128d __B)
524 __v2df a, b, c;
525 a = vec_splats (__A[0]);
526 b = vec_splats (__B[0]);
527 c = (__v2df) vec_cmpge(a, b);
528 return (__m128d) _mm_setr_pd (c[0], __A[1]);
531 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
532 _mm_cmpneq_sd (__m128d __A, __m128d __B)
534 __v2df a, b, c;
535 a = vec_splats (__A[0]);
536 b = vec_splats (__B[0]);
537 c = (__v2df) vec_cmpeq(a, b);
538 c = vec_nor (c, c);
539 return (__m128d) _mm_setr_pd (c[0], __A[1]);
542 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
543 _mm_cmpnlt_sd (__m128d __A, __m128d __B)
545 __v2df a, b, c;
546 a = vec_splats (__A[0]);
547 b = vec_splats (__B[0]);
548 /* Not less than is just greater than or equal. */
549 c = (__v2df) vec_cmpge(a, b);
550 return (__m128d) _mm_setr_pd (c[0], __A[1]);
553 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
554 _mm_cmpnle_sd (__m128d __A, __m128d __B)
556 __v2df a, b, c;
557 a = vec_splats (__A[0]);
558 b = vec_splats (__B[0]);
559 /* Not less than or equal is just greater than. */
 560 c = (__v2df) vec_cmpgt(a, b);
561 return (__m128d) _mm_setr_pd (c[0], __A[1]);
564 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
565 _mm_cmpngt_sd (__m128d __A, __m128d __B)
567 __v2df a, b, c;
568 a = vec_splats (__A[0]);
569 b = vec_splats (__B[0]);
570 /* Not greater than is just less than or equal. */
571 c = (__v2df) vec_cmple(a, b);
572 return (__m128d) _mm_setr_pd (c[0], __A[1]);
575 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
576 _mm_cmpnge_sd (__m128d __A, __m128d __B)
578 __v2df a, b, c;
579 a = vec_splats (__A[0]);
580 b = vec_splats (__B[0]);
581 /* Not greater than or equal is just less than. */
582 c = (__v2df) vec_cmplt(a, b);
583 return (__m128d) _mm_setr_pd (c[0], __A[1]);
586 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
587 _mm_cmpord_sd (__m128d __A, __m128d __B)
589 __v2df r;
590 r = (__v2df)_mm_cmpord_pd (vec_splats (__A[0]), vec_splats (__B[0]));
591 return (__m128d) _mm_setr_pd (r[0], ((__v2df)__A)[1]);
594 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
595 _mm_cmpunord_sd (__m128d __A, __m128d __B)
597 __v2df r;
598 r = _mm_cmpunord_pd (vec_splats (__A[0]), vec_splats (__B[0]));
599 return (__m128d) _mm_setr_pd (r[0], __A[1]);
 602 /* FIXME
 603 The _mm_comi??_sd and _mm_ucomi??_sd implementations below are
 604 exactly the same because GCC for PowerPC only generates unordered
 605 compares (scalar and vector).
 606 Technically _mm_comieq_sd et al. should be using the ordered
 607 compare and signal for QNaNs. The _mm_ucomieq_sd et al. should
 608 be OK. */
609 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
610 _mm_comieq_sd (__m128d __A, __m128d __B)
612 return (__A[0] == __B[0]);
615 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
616 _mm_comilt_sd (__m128d __A, __m128d __B)
618 return (__A[0] < __B[0]);
621 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
622 _mm_comile_sd (__m128d __A, __m128d __B)
624 return (__A[0] <= __B[0]);
627 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
628 _mm_comigt_sd (__m128d __A, __m128d __B)
630 return (__A[0] > __B[0]);
633 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
634 _mm_comige_sd (__m128d __A, __m128d __B)
636 return (__A[0] >= __B[0]);
639 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
640 _mm_comineq_sd (__m128d __A, __m128d __B)
642 return (__A[0] != __B[0]);
645 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
646 _mm_ucomieq_sd (__m128d __A, __m128d __B)
648 return (__A[0] == __B[0]);
651 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
652 _mm_ucomilt_sd (__m128d __A, __m128d __B)
654 return (__A[0] < __B[0]);
657 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
658 _mm_ucomile_sd (__m128d __A, __m128d __B)
660 return (__A[0] <= __B[0]);
663 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
664 _mm_ucomigt_sd (__m128d __A, __m128d __B)
666 return (__A[0] > __B[0]);
669 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
670 _mm_ucomige_sd (__m128d __A, __m128d __B)
672 return (__A[0] >= __B[0]);
675 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
676 _mm_ucomineq_sd (__m128d __A, __m128d __B)
678 return (__A[0] != __B[0]);
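/* Illustrative difference that the FIXME above refers to (sketch using the
   standard <fenv.h> interfaces): on x86, the ordered compare is expected to
   raise the invalid-operation exception for a quiet NaN operand, while the
   unordered compare is not; with this implementation both behave like the
   unordered form.

     feclearexcept (FE_INVALID);
     _mm_comieq_sd (_mm_set_sd (__builtin_nan ("")), _mm_set_sd (0.0));
     // x86 expectation: fetestexcept (FE_INVALID) != 0; here: no exception.
*/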
681 /* Create a vector of Qi, where i is the element number. */
682 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
683 _mm_set_epi64x (long long __q1, long long __q0)
685 return __extension__ (__m128i)(__v2di){ __q0, __q1 };
688 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
689 _mm_set_epi64 (__m64 __q1, __m64 __q0)
691 return _mm_set_epi64x ((long long)__q1, (long long)__q0);
694 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
695 _mm_set_epi32 (int __q3, int __q2, int __q1, int __q0)
697 return __extension__ (__m128i)(__v4si){ __q0, __q1, __q2, __q3 };
700 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
701 _mm_set_epi16 (short __q7, short __q6, short __q5, short __q4,
702 short __q3, short __q2, short __q1, short __q0)
704 return __extension__ (__m128i)(__v8hi){
705 __q0, __q1, __q2, __q3, __q4, __q5, __q6, __q7 };
708 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
709 _mm_set_epi8 (char __q15, char __q14, char __q13, char __q12,
710 char __q11, char __q10, char __q09, char __q08,
711 char __q07, char __q06, char __q05, char __q04,
712 char __q03, char __q02, char __q01, char __q00)
714 return __extension__ (__m128i)(__v16qi){
715 __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07,
716 __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15
720 /* Set all of the elements of the vector to A. */
721 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
722 _mm_set1_epi64x (long long __A)
724 return _mm_set_epi64x (__A, __A);
727 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
728 _mm_set1_epi64 (__m64 __A)
730 return _mm_set_epi64 (__A, __A);
733 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
734 _mm_set1_epi32 (int __A)
736 return _mm_set_epi32 (__A, __A, __A, __A);
739 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
740 _mm_set1_epi16 (short __A)
742 return _mm_set_epi16 (__A, __A, __A, __A, __A, __A, __A, __A);
745 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
746 _mm_set1_epi8 (char __A)
748 return _mm_set_epi8 (__A, __A, __A, __A, __A, __A, __A, __A,
749 __A, __A, __A, __A, __A, __A, __A, __A);
752 /* Create a vector of Qi, where i is the element number.
753 The parameter order is reversed from the _mm_set_epi* functions. */
754 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
755 _mm_setr_epi64 (__m64 __q0, __m64 __q1)
757 return _mm_set_epi64 (__q1, __q0);
760 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
761 _mm_setr_epi32 (int __q0, int __q1, int __q2, int __q3)
763 return _mm_set_epi32 (__q3, __q2, __q1, __q0);
766 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
767 _mm_setr_epi16 (short __q0, short __q1, short __q2, short __q3,
768 short __q4, short __q5, short __q6, short __q7)
770 return _mm_set_epi16 (__q7, __q6, __q5, __q4, __q3, __q2, __q1, __q0);
773 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
774 _mm_setr_epi8 (char __q00, char __q01, char __q02, char __q03,
775 char __q04, char __q05, char __q06, char __q07,
776 char __q08, char __q09, char __q10, char __q11,
777 char __q12, char __q13, char __q14, char __q15)
779 return _mm_set_epi8 (__q15, __q14, __q13, __q12, __q11, __q10, __q09, __q08,
780 __q07, __q06, __q05, __q04, __q03, __q02, __q01, __q00);
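/* Illustrative equivalence of the two parameter orders (arbitrary values):

     _mm_set_epi32 (3, 2, 1, 0)   // element [0] = 0 ... element [3] = 3
     _mm_setr_epi32 (0, 1, 2, 3)  // same vector

   i.e. _mm_setr_epi32 (a, b, c, d) is equivalent to _mm_set_epi32 (d, c, b, a).  */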
 783 /* Load 128 bits of integer data. The address must be 16-byte aligned. */
784 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
785 _mm_load_si128 (__m128i const *__P)
787 return *__P;
790 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
791 _mm_loadu_si128 (__m128i_u const *__P)
793 return (__m128i) (vec_vsx_ld(0, (signed int const *)__P));
796 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
797 _mm_loadl_epi64 (__m128i_u const *__P)
799 return _mm_set_epi64 ((__m64)0LL, *(__m64 *)__P);
802 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
803 _mm_store_si128 (__m128i *__P, __m128i __B)
805 assert(((unsigned long )__P & 0xfUL) == 0UL);
806 vec_st ((__v16qu) __B, 0, (__v16qu*)__P);
809 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
810 _mm_storeu_si128 (__m128i_u *__P, __m128i __B)
812 *__P = __B;
815 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
816 _mm_storel_epi64 (__m128i_u *__P, __m128i __B)
818 *(long long *)__P = ((__v2di)__B)[0];
821 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
822 _mm_movepi64_pi64 (__m128i_u __B)
824 return (__m64) ((__v2di)__B)[0];
827 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
828 _mm_movpi64_epi64 (__m64 __A)
830 return _mm_set_epi64 ((__m64)0LL, __A);
833 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
834 _mm_move_epi64 (__m128i __A)
836 return _mm_set_epi64 ((__m64)0LL, (__m64)__A[0]);
839 /* Create an undefined vector. */
840 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
841 _mm_undefined_si128 (void)
843 __m128i __Y = __Y;
844 return __Y;
847 /* Create a vector of zeros. */
848 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
849 _mm_setzero_si128 (void)
851 return __extension__ (__m128i)(__v4si){ 0, 0, 0, 0 };
854 #ifdef _ARCH_PWR8
855 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
856 _mm_cvtepi32_pd (__m128i __A)
858 __v2di val;
 859 /* For LE we need the Vector Unpack Low Signed Word instruction,
 860 which vec_unpackh generates on this target. */
861 val = (__v2di)vec_unpackh ((__v4si)__A);
863 return (__m128d)vec_ctf (val, 0);
865 #endif
867 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
868 _mm_cvtepi32_ps (__m128i __A)
870 return ((__m128)vec_ctf((__v4si)__A, 0));
873 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
874 _mm_cvtpd_epi32 (__m128d __A)
876 __v2df rounded = vec_rint (__A);
877 __v4si result, temp;
878 const __v4si vzero =
879 { 0, 0, 0, 0 };
881 /* VSX Vector truncate Double-Precision to integer and Convert to
882 Signed Integer Word format with Saturate. */
883 __asm__(
884 "xvcvdpsxws %x0,%x1"
885 : "=wa" (temp)
886 : "wa" (rounded)
887 : );
889 #ifdef _ARCH_PWR8
890 temp = vec_mergeo (temp, temp);
891 result = (__v4si) vec_vpkudum ((__vector long long) temp,
892 (__vector long long) vzero);
893 #else
895 const __v16qu pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
896 0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f };
897 result = (__v4si) vec_perm ((__v16qu) temp, (__v16qu) vzero, pkperm);
899 #endif
900 return (__m128i) result;
903 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
904 _mm_cvtpd_pi32 (__m128d __A)
906 __m128i result = _mm_cvtpd_epi32(__A);
908 return (__m64) result[0];
911 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
912 _mm_cvtpd_ps (__m128d __A)
914 __v4sf result;
915 __v4si temp;
916 const __v4si vzero = { 0, 0, 0, 0 };
918 __asm__(
919 "xvcvdpsp %x0,%x1"
920 : "=wa" (temp)
921 : "wa" (__A)
922 : );
924 #ifdef _ARCH_PWR8
925 temp = vec_mergeo (temp, temp);
926 result = (__v4sf) vec_vpkudum ((__vector long long) temp,
927 (__vector long long) vzero);
928 #else
930 const __v16qu pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
931 0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f };
932 result = (__v4sf) vec_perm ((__v16qu) temp, (__v16qu) vzero, pkperm);
934 #endif
935 return ((__m128)result);
938 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
939 _mm_cvttpd_epi32 (__m128d __A)
941 __v4si result;
942 __v4si temp;
943 const __v4si vzero = { 0, 0, 0, 0 };
945 /* VSX Vector truncate Double-Precision to integer and Convert to
946 Signed Integer Word format with Saturate. */
947 __asm__(
948 "xvcvdpsxws %x0,%x1"
949 : "=wa" (temp)
950 : "wa" (__A)
951 : );
953 #ifdef _ARCH_PWR8
954 temp = vec_mergeo (temp, temp);
955 result = (__v4si) vec_vpkudum ((__vector long long) temp,
956 (__vector long long) vzero);
957 #else
959 const __v16qu pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
960 0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f };
961 result = (__v4si) vec_perm ((__v16qu) temp, (__v16qu) vzero, pkperm);
963 #endif
965 return ((__m128i) result);
968 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
969 _mm_cvttpd_pi32 (__m128d __A)
971 __m128i result = _mm_cvttpd_epi32 (__A);
973 return (__m64) result[0];
976 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
977 _mm_cvtsi128_si32 (__m128i __A)
979 return ((__v4si)__A)[0];
982 #ifdef _ARCH_PWR8
983 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
984 _mm_cvtpi32_pd (__m64 __A)
986 __v4si temp;
987 __v2di tmp2;
988 __v2df result;
990 temp = (__v4si)vec_splats (__A);
991 tmp2 = (__v2di)vec_unpackl (temp);
992 result = vec_ctf ((__vector signed long long) tmp2, 0);
993 return (__m128d)result;
995 #endif
997 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
998 _mm_cvtps_epi32 (__m128 __A)
1000 __v4sf rounded;
1001 __v4si result;
1003 rounded = vec_rint((__v4sf) __A);
1004 result = vec_cts (rounded, 0);
1005 return (__m128i) result;
1008 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1009 _mm_cvttps_epi32 (__m128 __A)
1011 __v4si result;
1013 result = vec_cts ((__v4sf) __A, 0);
1014 return (__m128i) result;
1017 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1018 _mm_cvtps_pd (__m128 __A)
1020 /* Check if vec_doubleh is defined by <altivec.h>. If so use that. */
1021 #ifdef vec_doubleh
1022 return (__m128d) vec_doubleh ((__v4sf)__A);
1023 #else
 1024 /* Otherwise the compiler is not current, so we need to generate the
 1025 equivalent code ourselves. */
1026 __v4sf a = (__v4sf)__A;
1027 __v4sf temp;
1028 __v2df result;
1029 #ifdef __LITTLE_ENDIAN__
1030 /* The input float values are in elements {[0], [1]} but the convert
 1031 instruction needs them in elements {[1], [3]}, so we use two
1032 shift left double vector word immediates to get the elements
1033 lined up. */
1034 temp = __builtin_vsx_xxsldwi (a, a, 3);
1035 temp = __builtin_vsx_xxsldwi (a, temp, 2);
1036 #elif __BIG_ENDIAN__
 1037 /* The input float values are in elements {[0], [1]} but the convert
 1038 instruction needs them in elements {[0], [2]}, so we use a
 1039 merge-high word operation to get the elements lined up. */
1041 temp = vec_vmrghw (a, a);
1042 #endif
1043 __asm__(
1044 " xvcvspdp %x0,%x1"
1045 : "=wa" (result)
1046 : "wa" (temp)
1047 : );
1048 return (__m128d) result;
1049 #endif
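/* Illustrative behavior: the two low single-precision elements are widened
   to double precision (hypothetical values):

     __m128  f = _mm_setr_ps (1.0f, 2.0f, 3.0f, 4.0f);
     __m128d d = _mm_cvtps_pd (f);   // d = { 1.0, 2.0 }
*/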
1052 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1053 _mm_cvtsd_si32 (__m128d __A)
1055 __v2df rounded = vec_rint((__v2df) __A);
1056 int result = ((__v2df)rounded)[0];
1058 return result;
1060 /* Intel intrinsic. */
1061 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1062 _mm_cvtsd_si64 (__m128d __A)
1064 __v2df rounded = vec_rint ((__v2df) __A );
1065 long long result = ((__v2df) rounded)[0];
1067 return result;
1070 /* Microsoft intrinsic. */
1071 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1072 _mm_cvtsd_si64x (__m128d __A)
1074 return _mm_cvtsd_si64 ((__v2df)__A);
1077 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1078 _mm_cvttsd_si32 (__m128d __A)
1080 int result = ((__v2df)__A)[0];
1082 return result;
1085 /* Intel intrinsic. */
1086 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1087 _mm_cvttsd_si64 (__m128d __A)
1089 long long result = ((__v2df)__A)[0];
1091 return result;
1094 /* Microsoft intrinsic. */
1095 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1096 _mm_cvttsd_si64x (__m128d __A)
1098 return _mm_cvttsd_si64 (__A);
1101 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1102 _mm_cvtsd_ss (__m128 __A, __m128d __B)
1104 __v4sf result = (__v4sf)__A;
1106 #ifdef __LITTLE_ENDIAN__
1107 __v4sf temp_s;
1108 /* Copy double element[0] to element [1] for conversion. */
1109 __v2df temp_b = vec_splat((__v2df)__B, 0);
1111 /* Pre-rotate __A left 3 (logically right 1) elements. */
1112 result = __builtin_vsx_xxsldwi (result, result, 3);
1113 /* Convert double to single float scalar in a vector. */
1114 __asm__(
1115 "xscvdpsp %x0,%x1"
1116 : "=wa" (temp_s)
1117 : "wa" (temp_b)
1118 : );
1119 /* Shift the resulting scalar into vector element [0]. */
1120 result = __builtin_vsx_xxsldwi (result, temp_s, 1);
1121 #else
1122 result [0] = ((__v2df)__B)[0];
1123 #endif
1124 return (__m128) result;
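/* Illustrative behavior: only element [0] of the single-precision vector is
   replaced by the converted double; elements [1]..[3] of __A pass through:

     __m128  a = _mm_setr_ps (9.0f, 8.0f, 7.0f, 6.0f);
     __m128d b = _mm_set_sd (1.5);
     __m128  r = _mm_cvtsd_ss (a, b);   // r = { 1.5f, 8.0f, 7.0f, 6.0f }
*/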
1127 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1128 _mm_cvtsi32_sd (__m128d __A, int __B)
1130 __v2df result = (__v2df)__A;
1131 double db = __B;
1132 result [0] = db;
1133 return (__m128d)result;
1136 /* Intel intrinsic. */
1137 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1138 _mm_cvtsi64_sd (__m128d __A, long long __B)
1140 __v2df result = (__v2df)__A;
1141 double db = __B;
1142 result [0] = db;
1143 return (__m128d)result;
1146 /* Microsoft intrinsic. */
1147 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1148 _mm_cvtsi64x_sd (__m128d __A, long long __B)
1150 return _mm_cvtsi64_sd (__A, __B);
1153 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1154 _mm_cvtss_sd (__m128d __A, __m128 __B)
1156 #ifdef __LITTLE_ENDIAN__
1157 /* Use splat to move element [0] into position for the convert. */
1158 __v4sf temp = vec_splat ((__v4sf)__B, 0);
1159 __v2df res;
1160 /* Convert single float scalar to double in a vector. */
1161 __asm__(
1162 "xscvspdp %x0,%x1"
1163 : "=wa" (res)
1164 : "wa" (temp)
1165 : );
1166 return (__m128d) vec_mergel (res, (__v2df)__A);
1167 #else
1168 __v2df res = (__v2df)__A;
1169 res [0] = ((__v4sf)__B) [0];
1170 return (__m128d) res;
1171 #endif
1174 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1175 _mm_shuffle_pd(__m128d __A, __m128d __B, const int __mask)
1177 __vector double result;
1178 const int litmsk = __mask & 0x3;
1180 if (litmsk == 0)
1181 result = vec_mergeh (__A, __B);
1182 #if __GNUC__ < 6
1183 else if (litmsk == 1)
1184 result = vec_xxpermdi (__B, __A, 2);
1185 else if (litmsk == 2)
1186 result = vec_xxpermdi (__B, __A, 1);
1187 #else
1188 else if (litmsk == 1)
1189 result = vec_xxpermdi (__A, __B, 2);
1190 else if (litmsk == 2)
1191 result = vec_xxpermdi (__A, __B, 1);
1192 #endif
1193 else
1194 result = vec_mergel (__A, __B);
1196 return result;
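/* Illustrative mapping of the 2-bit mask (see _MM_SHUFFLE2 above):

     mask 0 (_MM_SHUFFLE2 (0, 0)): result = { __A[0], __B[0] }  (mergeh)
     mask 1 (_MM_SHUFFLE2 (0, 1)): result = { __A[1], __B[0] }
     mask 2 (_MM_SHUFFLE2 (1, 0)): result = { __A[0], __B[1] }
     mask 3 (_MM_SHUFFLE2 (1, 1)): result = { __A[1], __B[1] }  (mergel)
*/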
1199 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1200 _mm_unpackhi_pd (__m128d __A, __m128d __B)
1202 return (__m128d) vec_mergel ((__v2df)__A, (__v2df)__B);
1205 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1206 _mm_unpacklo_pd (__m128d __A, __m128d __B)
1208 return (__m128d) vec_mergeh ((__v2df)__A, (__v2df)__B);
1211 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1212 _mm_loadh_pd (__m128d __A, double const *__B)
1214 __v2df result = (__v2df)__A;
1215 result [1] = *__B;
1216 return (__m128d)result;
1219 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1220 _mm_loadl_pd (__m128d __A, double const *__B)
1222 __v2df result = (__v2df)__A;
1223 result [0] = *__B;
1224 return (__m128d)result;
1227 #ifdef _ARCH_PWR8
1228 /* Intrinsic functions that require PowerISA 2.07 minimum. */
1230 /* Creates a 2-bit mask from the most significant bits of the DPFP values. */
1231 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1232 _mm_movemask_pd (__m128d __A)
1234 __vector unsigned long long result;
1235 static const __vector unsigned int perm_mask =
1237 #ifdef __LITTLE_ENDIAN__
1238 0x80800040, 0x80808080, 0x80808080, 0x80808080
1239 #elif __BIG_ENDIAN__
1240 0x80808080, 0x80808080, 0x80808080, 0x80800040
1241 #endif
1244 result = ((__vector unsigned long long)
1245 vec_vbpermq ((__vector unsigned char) __A,
1246 (__vector unsigned char) perm_mask));
1248 #ifdef __LITTLE_ENDIAN__
1249 return result[1];
1250 #elif __BIG_ENDIAN__
1251 return result[0];
1252 #endif
1254 #endif /* _ARCH_PWR8 */
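/* Illustrative results: bit i of the returned mask is the sign bit of
   element [i]:

     _mm_movemask_pd (_mm_set_pd (2.0, -1.0))   // == 1, element [0] negative
     _mm_movemask_pd (_mm_set_pd (-2.0, 1.0))   // == 2, element [1] negative
*/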
1256 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1257 _mm_packs_epi16 (__m128i __A, __m128i __B)
1259 return (__m128i) vec_packs ((__v8hi) __A, (__v8hi)__B);
1262 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1263 _mm_packs_epi32 (__m128i __A, __m128i __B)
1265 return (__m128i) vec_packs ((__v4si)__A, (__v4si)__B);
1268 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1269 _mm_packus_epi16 (__m128i __A, __m128i __B)
1271 return (__m128i) vec_packsu ((__v8hi) __A, (__v8hi)__B);
1274 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1275 _mm_unpackhi_epi8 (__m128i __A, __m128i __B)
1277 return (__m128i) vec_mergel ((__v16qu)__A, (__v16qu)__B);
1280 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1281 _mm_unpackhi_epi16 (__m128i __A, __m128i __B)
1283 return (__m128i) vec_mergel ((__v8hu)__A, (__v8hu)__B);
1286 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1287 _mm_unpackhi_epi32 (__m128i __A, __m128i __B)
1289 return (__m128i) vec_mergel ((__v4su)__A, (__v4su)__B);
1292 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1293 _mm_unpackhi_epi64 (__m128i __A, __m128i __B)
1295 return (__m128i) vec_mergel ((__vector long long) __A,
1296 (__vector long long) __B);
1299 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1300 _mm_unpacklo_epi8 (__m128i __A, __m128i __B)
1302 return (__m128i) vec_mergeh ((__v16qu)__A, (__v16qu)__B);
1305 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1306 _mm_unpacklo_epi16 (__m128i __A, __m128i __B)
1308 return (__m128i) vec_mergeh ((__v8hi)__A, (__v8hi)__B);
1311 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1312 _mm_unpacklo_epi32 (__m128i __A, __m128i __B)
1314 return (__m128i) vec_mergeh ((__v4si)__A, (__v4si)__B);
1317 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1318 _mm_unpacklo_epi64 (__m128i __A, __m128i __B)
1320 return (__m128i) vec_mergeh ((__vector long long) __A,
1321 (__vector long long) __B);
1324 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1325 _mm_add_epi8 (__m128i __A, __m128i __B)
1327 return (__m128i) ((__v16qu)__A + (__v16qu)__B);
1330 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1331 _mm_add_epi16 (__m128i __A, __m128i __B)
1333 return (__m128i) ((__v8hu)__A + (__v8hu)__B);
1336 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1337 _mm_add_epi32 (__m128i __A, __m128i __B)
1339 return (__m128i) ((__v4su)__A + (__v4su)__B);
1342 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1343 _mm_add_epi64 (__m128i __A, __m128i __B)
1345 return (__m128i) ((__v2du)__A + (__v2du)__B);
1348 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1349 _mm_adds_epi8 (__m128i __A, __m128i __B)
1351 return (__m128i) vec_adds ((__v16qi)__A, (__v16qi)__B);
1354 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1355 _mm_adds_epi16 (__m128i __A, __m128i __B)
1357 return (__m128i) vec_adds ((__v8hi)__A, (__v8hi)__B);
1360 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1361 _mm_adds_epu8 (__m128i __A, __m128i __B)
1363 return (__m128i) vec_adds ((__v16qu)__A, (__v16qu)__B);
1366 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1367 _mm_adds_epu16 (__m128i __A, __m128i __B)
1369 return (__m128i) vec_adds ((__v8hu)__A, (__v8hu)__B);
1372 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1373 _mm_sub_epi8 (__m128i __A, __m128i __B)
1375 return (__m128i) ((__v16qu)__A - (__v16qu)__B);
1378 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1379 _mm_sub_epi16 (__m128i __A, __m128i __B)
1381 return (__m128i) ((__v8hu)__A - (__v8hu)__B);
1384 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1385 _mm_sub_epi32 (__m128i __A, __m128i __B)
1387 return (__m128i) ((__v4su)__A - (__v4su)__B);
1390 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1391 _mm_sub_epi64 (__m128i __A, __m128i __B)
1393 return (__m128i) ((__v2du)__A - (__v2du)__B);
1396 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1397 _mm_subs_epi8 (__m128i __A, __m128i __B)
1399 return (__m128i) vec_subs ((__v16qi)__A, (__v16qi)__B);
1402 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1403 _mm_subs_epi16 (__m128i __A, __m128i __B)
1405 return (__m128i) vec_subs ((__v8hi)__A, (__v8hi)__B);
1408 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1409 _mm_subs_epu8 (__m128i __A, __m128i __B)
1411 return (__m128i) vec_subs ((__v16qu)__A, (__v16qu)__B);
1414 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1415 _mm_subs_epu16 (__m128i __A, __m128i __B)
1417 return (__m128i) vec_subs ((__v8hu)__A, (__v8hu)__B);
1420 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1421 _mm_madd_epi16 (__m128i __A, __m128i __B)
1423 __vector signed int zero = {0, 0, 0, 0};
1425 return (__m128i) vec_vmsumshm ((__v8hi)__A, (__v8hi)__B, zero);
1428 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1429 _mm_mulhi_epi16 (__m128i __A, __m128i __B)
1431 __vector signed int w0, w1;
1433 __vector unsigned char xform1 = {
1434 #ifdef __LITTLE_ENDIAN__
1435 0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17,
1436 0x0A, 0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
1437 #elif __BIG_ENDIAN__
1438 0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15,
1439 0x08, 0x09, 0x18, 0x19, 0x0C, 0x0D, 0x1C, 0x1D
1440 #endif
1443 w0 = vec_vmulesh ((__v8hi)__A, (__v8hi)__B);
1444 w1 = vec_vmulosh ((__v8hi)__A, (__v8hi)__B);
1445 return (__m128i) vec_perm (w0, w1, xform1);
1448 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1449 _mm_mullo_epi16 (__m128i __A, __m128i __B)
1451 return (__m128i) ((__v8hi)__A * (__v8hi)__B);
1454 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1455 _mm_mul_su32 (__m64 __A, __m64 __B)
1457 unsigned int a = __A;
1458 unsigned int b = __B;
1460 return ((__m64)a * (__m64)b);
1463 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1464 _mm_mul_epu32 (__m128i __A, __m128i __B)
1466 #if __GNUC__ < 8
1467 __v2du result;
1469 #ifdef __LITTLE_ENDIAN__
1470 /* VMX Vector Multiply Odd Unsigned Word. */
1471 __asm__(
1472 "vmulouw %0,%1,%2"
1473 : "=v" (result)
1474 : "v" (__A), "v" (__B)
1475 : );
1476 #elif __BIG_ENDIAN__
1477 /* VMX Vector Multiply Even Unsigned Word. */
1478 __asm__(
1479 "vmuleuw %0,%1,%2"
1480 : "=v" (result)
1481 : "v" (__A), "v" (__B)
1482 : );
1483 #endif
1484 return (__m128i) result;
1485 #else
1486 #ifdef __LITTLE_ENDIAN__
1487 return (__m128i) vec_mule ((__v4su)__A, (__v4su)__B);
1488 #elif __BIG_ENDIAN__
1489 return (__m128i) vec_mulo ((__v4su)__A, (__v4su)__B);
1490 #endif
1491 #endif
1494 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1495 _mm_slli_epi16 (__m128i __A, int __B)
1497 __v8hu lshift;
1498 __v8hi result = { 0, 0, 0, 0, 0, 0, 0, 0 };
1500 if (__B >= 0 && __B < 16)
1502 if (__builtin_constant_p(__B))
1503 lshift = (__v8hu) vec_splat_s16(__B);
1504 else
1505 lshift = vec_splats ((unsigned short) __B);
1507 result = vec_sl ((__v8hi) __A, lshift);
1510 return (__m128i) result;
1513 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1514 _mm_slli_epi32 (__m128i __A, int __B)
1516 __v4su lshift;
1517 __v4si result = { 0, 0, 0, 0 };
1519 if (__B >= 0 && __B < 32)
1521 if (__builtin_constant_p(__B) && __B < 16)
1522 lshift = (__v4su) vec_splat_s32(__B);
1523 else
1524 lshift = vec_splats ((unsigned int) __B);
1526 result = vec_sl ((__v4si) __A, lshift);
1529 return (__m128i) result;
1532 #ifdef _ARCH_PWR8
1533 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1534 _mm_slli_epi64 (__m128i __A, int __B)
1536 __v2du lshift;
1537 __v2di result = { 0, 0 };
1539 if (__B >= 0 && __B < 64)
1541 if (__builtin_constant_p(__B) && __B < 16)
1542 lshift = (__v2du) vec_splat_s32(__B);
1543 else
1544 lshift = (__v2du) vec_splats ((unsigned int) __B);
1546 result = vec_sl ((__v2di) __A, lshift);
1549 return (__m128i) result;
1551 #endif
1553 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1554 _mm_srai_epi16 (__m128i __A, int __B)
1556 __v8hu rshift = { 15, 15, 15, 15, 15, 15, 15, 15 };
1557 __v8hi result;
1559 if (__B < 16)
1561 if (__builtin_constant_p(__B))
1562 rshift = (__v8hu) vec_splat_s16(__B);
1563 else
1564 rshift = vec_splats ((unsigned short) __B);
1566 result = vec_sra ((__v8hi) __A, rshift);
1568 return (__m128i) result;
1571 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1572 _mm_srai_epi32 (__m128i __A, int __B)
1574 __v4su rshift = { 31, 31, 31, 31 };
1575 __v4si result;
1577 if (__B < 32)
1579 if (__builtin_constant_p(__B))
1581 if (__B < 16)
1582 rshift = (__v4su) vec_splat_s32(__B);
1583 else
1584 rshift = (__v4su) vec_splats((unsigned int)__B);
1586 else
1587 rshift = vec_splats ((unsigned int) __B);
1589 result = vec_sra ((__v4si) __A, rshift);
1591 return (__m128i) result;
1594 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1595 _mm_bslli_si128 (__m128i __A, const int __N)
1597 __v16qu result;
1598 const __v16qu zeros = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
1600 if (__N < 16)
1601 result = vec_sld ((__v16qu) __A, zeros, __N);
1602 else
1603 result = zeros;
1605 return (__m128i) result;
1608 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1609 _mm_bsrli_si128 (__m128i __A, const int __N)
1611 __v16qu result;
1612 const __v16qu zeros = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
1614 if (__N < 16)
1615 if (__builtin_constant_p(__N))
1616 /* Would like to use Vector Shift Left Double by Octet
1617 Immediate here to use the immediate form and avoid
1618 load of __N * 8 value into a separate VR. */
1619 result = vec_sld (zeros, (__v16qu) __A, (16 - __N));
1620 else
1622 __v16qu shift = vec_splats((unsigned char)(__N*8));
1623 result = vec_sro ((__v16qu)__A, shift);
1625 else
1626 result = zeros;
1628 return (__m128i) result;
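/* Illustrative byte-shift behavior (counts are in bytes, not bits):

     __m128i x = _mm_set_epi32 (4, 3, 2, 1);
     __m128i r = _mm_bsrli_si128 (x, 4);   // r == _mm_set_epi32 (0, 4, 3, 2)
*/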
1631 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1632 _mm_srli_si128 (__m128i __A, const int __N)
1634 return _mm_bsrli_si128 (__A, __N);
1637 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1638 _mm_slli_si128 (__m128i __A, const int _imm5)
1640 __v16qu result;
1641 const __v16qu zeros = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
1643 if (_imm5 < 16)
1644 #ifdef __LITTLE_ENDIAN__
1645 result = vec_sld ((__v16qu) __A, zeros, _imm5);
1646 #elif __BIG_ENDIAN__
1647 result = vec_sld (zeros, (__v16qu) __A, (16 - _imm5));
1648 #endif
1649 else
1650 result = zeros;
1652 return (__m128i) result;
1655 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1657 _mm_srli_epi16 (__m128i __A, int __B)
1659 __v8hu rshift;
1660 __v8hi result = { 0, 0, 0, 0, 0, 0, 0, 0 };
1662 if (__B < 16)
1664 if (__builtin_constant_p(__B))
1665 rshift = (__v8hu) vec_splat_s16(__B);
1666 else
1667 rshift = vec_splats ((unsigned short) __B);
1669 result = vec_sr ((__v8hi) __A, rshift);
1672 return (__m128i) result;
1675 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1676 _mm_srli_epi32 (__m128i __A, int __B)
1678 __v4su rshift;
1679 __v4si result = { 0, 0, 0, 0 };
1681 if (__B < 32)
1683 if (__builtin_constant_p(__B))
1685 if (__B < 16)
1686 rshift = (__v4su) vec_splat_s32(__B);
1687 else
1688 rshift = (__v4su) vec_splats((unsigned int)__B);
1690 else
1691 rshift = vec_splats ((unsigned int) __B);
1693 result = vec_sr ((__v4si) __A, rshift);
1696 return (__m128i) result;
1699 #ifdef _ARCH_PWR8
1700 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1701 _mm_srli_epi64 (__m128i __A, int __B)
1703 __v2du rshift;
1704 __v2di result = { 0, 0 };
1706 if (__B < 64)
1708 if (__builtin_constant_p(__B))
1710 if (__B < 16)
1711 rshift = (__v2du) vec_splat_s32(__B);
1712 else
1713 rshift = (__v2du) vec_splats((unsigned long long)__B);
1715 else
1716 rshift = (__v2du) vec_splats ((unsigned int) __B);
1718 result = vec_sr ((__v2di) __A, rshift);
1721 return (__m128i) result;
1723 #endif
1725 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1726 _mm_sll_epi16 (__m128i __A, __m128i __B)
1728 __v8hu lshift;
1729 __vector __bool short shmask;
1730 const __v8hu shmax = { 15, 15, 15, 15, 15, 15, 15, 15 };
1731 __v8hu result;
1733 #ifdef __LITTLE_ENDIAN__
1734 lshift = vec_splat ((__v8hu) __B, 0);
1735 #elif __BIG_ENDIAN__
1736 lshift = vec_splat ((__v8hu) __B, 3);
1737 #endif
1738 shmask = vec_cmple (lshift, shmax);
1739 result = vec_sl ((__v8hu) __A, lshift);
1740 result = vec_sel ((__v8hu) shmask, result, shmask);
1742 return (__m128i) result;
1745 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1746 _mm_sll_epi32 (__m128i __A, __m128i __B)
1748 __v4su lshift;
1749 __vector __bool int shmask;
1750 const __v4su shmax = { 32, 32, 32, 32 };
1751 __v4su result;
1752 #ifdef __LITTLE_ENDIAN__
1753 lshift = vec_splat ((__v4su) __B, 0);
1754 #elif __BIG_ENDIAN__
1755 lshift = vec_splat ((__v4su) __B, 1);
1756 #endif
1757 shmask = vec_cmplt (lshift, shmax);
1758 result = vec_sl ((__v4su) __A, lshift);
1759 result = vec_sel ((__v4su) shmask, result, shmask);
1761 return (__m128i) result;
1764 #ifdef _ARCH_PWR8
1765 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1766 _mm_sll_epi64 (__m128i __A, __m128i __B)
1768 __v2du lshift;
1769 __vector __bool long long shmask;
1770 const __v2du shmax = { 64, 64 };
1771 __v2du result;
1773 lshift = vec_splat ((__v2du) __B, 0);
1774 shmask = vec_cmplt (lshift, shmax);
1775 result = vec_sl ((__v2du) __A, lshift);
1776 result = vec_sel ((__v2du) shmask, result, shmask);
1778 return (__m128i) result;
1780 #endif
1782 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1783 _mm_sra_epi16 (__m128i __A, __m128i __B)
1785 const __v8hu rshmax = { 15, 15, 15, 15, 15, 15, 15, 15 };
1786 __v8hu rshift;
1787 __v8hi result;
1789 #ifdef __LITTLE_ENDIAN__
1790 rshift = vec_splat ((__v8hu)__B, 0);
1791 #elif __BIG_ENDIAN__
1792 rshift = vec_splat ((__v8hu)__B, 3);
1793 #endif
1794 rshift = vec_min (rshift, rshmax);
1795 result = vec_sra ((__v8hi) __A, rshift);
1797 return (__m128i) result;
1800 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1801 _mm_sra_epi32 (__m128i __A, __m128i __B)
1803 const __v4su rshmax = { 31, 31, 31, 31 };
1804 __v4su rshift;
1805 __v4si result;
1807 #ifdef __LITTLE_ENDIAN__
1808 rshift = vec_splat ((__v4su)__B, 0);
1809 #elif __BIG_ENDIAN__
1810 rshift = vec_splat ((__v4su)__B, 1);
1811 #endif
1812 rshift = vec_min (rshift, rshmax);
1813 result = vec_sra ((__v4si) __A, rshift);
1815 return (__m128i) result;
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srl_epi16 (__m128i __A, __m128i __B)
{
  __v8hu rshift;
  __vector __bool short shmask;
  const __v8hu shmax = { 15, 15, 15, 15, 15, 15, 15, 15 };
  __v8hu result;

#ifdef __LITTLE_ENDIAN__
  rshift = vec_splat ((__v8hu) __B, 0);
#elif __BIG_ENDIAN__
  rshift = vec_splat ((__v8hu) __B, 3);
#endif
  shmask = vec_cmple (rshift, shmax);
  result = vec_sr ((__v8hu) __A, rshift);
  result = vec_sel ((__v8hu) shmask, result, shmask);

  return (__m128i) result;
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srl_epi32 (__m128i __A, __m128i __B)
{
  __v4su rshift;
  __vector __bool int shmask;
  const __v4su shmax = { 32, 32, 32, 32 };
  __v4su result;

#ifdef __LITTLE_ENDIAN__
  rshift = vec_splat ((__v4su) __B, 0);
#elif __BIG_ENDIAN__
  rshift = vec_splat ((__v4su) __B, 1);
#endif
  shmask = vec_cmplt (rshift, shmax);
  result = vec_sr ((__v4su) __A, rshift);
  result = vec_sel ((__v4su) shmask, result, shmask);

  return (__m128i) result;
}

#ifdef _ARCH_PWR8
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srl_epi64 (__m128i __A, __m128i __B)
{
  __v2du rshift;
  __vector __bool long long shmask;
  const __v2du shmax = { 64, 64 };
  __v2du result;

  rshift = vec_splat ((__v2du) __B, 0);
  shmask = vec_cmplt (rshift, shmax);
  result = vec_sr ((__v2du) __A, rshift);
  result = vec_sel ((__v2du) shmask, result, shmask);

  return (__m128i) result;
}
#endif
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_and_pd (__m128d __A, __m128d __B)
{
  return (vec_and ((__v2df) __A, (__v2df) __B));
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_andnot_pd (__m128d __A, __m128d __B)
{
  return (vec_andc ((__v2df) __B, (__v2df) __A));
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_or_pd (__m128d __A, __m128d __B)
{
  return (vec_or ((__v2df) __A, (__v2df) __B));
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_xor_pd (__m128d __A, __m128d __B)
{
  return (vec_xor ((__v2df) __A, (__v2df) __B));
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_and_si128 (__m128i __A, __m128i __B)
{
  return (__m128i)vec_and ((__v2di) __A, (__v2di) __B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_andnot_si128 (__m128i __A, __m128i __B)
{
  return (__m128i)vec_andc ((__v2di) __B, (__v2di) __A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_or_si128 (__m128i __A, __m128i __B)
{
  return (__m128i)vec_or ((__v2di) __A, (__v2di) __B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_xor_si128 (__m128i __A, __m128i __B)
{
  return (__m128i)vec_xor ((__v2di) __A, (__v2di) __B);
}
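/* A note on operand order (sketch assuming SSE2 semantics): the "andnot"
   forms compute (~__A & __B); the first operand is the one complemented,
   which is why the code above passes vec_andc (__B, __A).

     __m128i a = _mm_set1_epi8 (0x0F);
     __m128i b = _mm_set1_epi8 (0x55);
     __m128i r = _mm_andnot_si128 (a, b);   -> each byte of r is 0x50
*/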
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_epi8 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_cmpeq ((__v16qi) __A, (__v16qi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_cmpeq ((__v8hi) __A, (__v8hi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_cmpeq ((__v4si) __A, (__v4si)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmplt_epi8 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_cmplt ((__v16qi) __A, (__v16qi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmplt_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_cmplt ((__v8hi) __A, (__v8hi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmplt_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_cmplt ((__v4si) __A, (__v4si)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_epi8 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_cmpgt ((__v16qi) __A, (__v16qi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_cmpgt ((__v8hi) __A, (__v8hi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_cmpgt ((__v4si) __A, (__v4si)__B);
}
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_extract_epi16 (__m128i const __A, int const __N)
{
  return (unsigned short) ((__v8hi)__A)[__N & 7];
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_insert_epi16 (__m128i const __A, int const __D, int const __N)
{
  __v8hi result = (__v8hi)__A;

  result [(__N & 7)] = __D;

  return (__m128i) result;
}
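/* Illustrative sketch (assuming SSE2 semantics): _mm_extract_epi16
   zero-extends the selected halfword into an int, and this implementation
   masks the element index with 7.

     __m128i v  = _mm_set_epi16 (7, 6, 5, 4, 3, 2, 1, (short) 0x8000);
     int     lo = _mm_extract_epi16 (v, 0);      -> 0x8000, not -32768
     __m128i w  = _mm_insert_epi16 (v, 42, 8);   -> 8 & 7 == 0, so lane 0 becomes 42
*/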
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_max ((__v8hi)__A, (__v8hi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_epu8 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_max ((__v16qu) __A, (__v16qu)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_min ((__v8hi) __A, (__v8hi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_epu8 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_min ((__v16qu) __A, (__v16qu)__B);
}
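/* A reminder (sketch assuming SSE2 semantics): SSE2 only provides a signed
   compare for the 16-bit max/min and an unsigned compare for the 8-bit
   forms, and the vec_max/vec_min element types above follow that.

     __m128i m8  = _mm_max_epu8 (_mm_set1_epi8 ((char) 0x80),
                                 _mm_set1_epi8 (1));       -> each byte 0x80 (128 unsigned)
     __m128i m16 = _mm_max_epi16 (_mm_set1_epi16 (-1),
                                  _mm_set1_epi16 (0));     -> each halfword 0 (signed)
*/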
#ifdef _ARCH_PWR8
/* Intrinsic functions that require PowerISA 2.07 minimum.  */

/* Creates a 16-bit mask from the most significant bits of the sixteen
   8-bit integers in __A and zero extends the upper bits of the result.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movemask_epi8 (__m128i __A)
{
  __vector unsigned long long result;
  static const __vector unsigned char perm_mask =
    {
#ifdef __LITTLE_ENDIAN__
	0x78, 0x70, 0x68, 0x60, 0x58, 0x50, 0x48, 0x40,
	0x38, 0x30, 0x28, 0x20, 0x18, 0x10, 0x08, 0x00
#elif __BIG_ENDIAN__
	0x00, 0x08, 0x10, 0x18, 0x20, 0x28, 0x30, 0x38,
	0x40, 0x48, 0x50, 0x58, 0x60, 0x68, 0x70, 0x78
#endif
    };

  result = ((__vector unsigned long long)
	    vec_vbpermq ((__vector unsigned char) __A,
			 (__vector unsigned char) perm_mask));

#ifdef __LITTLE_ENDIAN__
  return result[1];
#elif __BIG_ENDIAN__
  return result[0];
#endif
}
#endif /* _ARCH_PWR8 */
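/* Illustrative sketch (assuming SSE2 semantics): bit i of the result
   reflects the most significant bit of byte i of __A; the vec_vbpermq
   permute mask above gathers exactly those sixteen bits.

     __m128i v = _mm_set_epi8 ((char) 0x80, 0, 0, 0, 0, 0, 0, 0,
                               0, 0, 0, 0, 0, 0, 0, (char) 0xFF);
     int m = _mm_movemask_epi8 (v);   -> m == 0x8001
*/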
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhi_epu16 (__m128i __A, __m128i __B)
{
  __v4su w0, w1;
  __v16qu xform1 = {
#ifdef __LITTLE_ENDIAN__
      0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17,
      0x0A, 0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
#elif __BIG_ENDIAN__
      0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15,
      0x08, 0x09, 0x18, 0x19, 0x0C, 0x0D, 0x1C, 0x1D
#endif
    };

  w0 = vec_vmuleuh ((__v8hu)__A, (__v8hu)__B);
  w1 = vec_vmulouh ((__v8hu)__A, (__v8hu)__B);
  return (__m128i) vec_perm (w0, w1, xform1);
}
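/* Illustrative sketch (assuming SSE2 semantics): the even/odd widening
   multiplies above produce full 32-bit products, and the permute keeps
   only their high halfwords.

     __m128i a = _mm_set1_epi16 ((short) 0xFFFF);
     __m128i r = _mm_mulhi_epu16 (a, a);
     0xFFFF * 0xFFFF == 0xFFFE0001, so each halfword of r is 0xFFFE.  */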
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shufflehi_epi16 (__m128i __A, const int __mask)
{
  unsigned long element_selector_98 = __mask & 0x03;
  unsigned long element_selector_BA = (__mask >> 2) & 0x03;
  unsigned long element_selector_DC = (__mask >> 4) & 0x03;
  unsigned long element_selector_FE = (__mask >> 6) & 0x03;
  static const unsigned short permute_selectors[4] =
    {
#ifdef __LITTLE_ENDIAN__
      0x0908, 0x0B0A, 0x0D0C, 0x0F0E
#elif __BIG_ENDIAN__
      0x0607, 0x0405, 0x0203, 0x0001
#endif
    };
  __v2du pmask =
#ifdef __LITTLE_ENDIAN__
      { 0x1716151413121110UL, 0x1f1e1d1c1b1a1918UL};
#elif __BIG_ENDIAN__
      { 0x1011121314151617UL, 0x18191a1b1c1d1e1fUL};
#endif
  __m64_union t;
  __v2du a, r;

#ifdef __LITTLE_ENDIAN__
  t.as_short[0] = permute_selectors[element_selector_98];
  t.as_short[1] = permute_selectors[element_selector_BA];
  t.as_short[2] = permute_selectors[element_selector_DC];
  t.as_short[3] = permute_selectors[element_selector_FE];
#elif __BIG_ENDIAN__
  t.as_short[3] = permute_selectors[element_selector_98];
  t.as_short[2] = permute_selectors[element_selector_BA];
  t.as_short[1] = permute_selectors[element_selector_DC];
  t.as_short[0] = permute_selectors[element_selector_FE];
#endif
#ifdef __LITTLE_ENDIAN__
  pmask[1] = t.as_m64;
#elif __BIG_ENDIAN__
  pmask[0] = t.as_m64;
#endif
  a = (__v2du)__A;
  r = vec_perm (a, a, (__vector unsigned char)pmask);
  return (__m128i) r;
}
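/* Illustrative sketch (assuming SSE2 semantics and the _MM_SHUFFLE macro
   from <xmmintrin.h>): only halfwords 4-7 are permuted; halfwords 0-3
   pass through unchanged.

     __m128i v = _mm_set_epi16 (7, 6, 5, 4, 3, 2, 1, 0);
     __m128i r = _mm_shufflehi_epi16 (v, _MM_SHUFFLE (0, 1, 2, 3));
     r keeps 0, 1, 2, 3 in the low halfwords and holds 7, 6, 5, 4
     (the upper halfwords reversed) in lanes 4 through 7.  */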
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shufflelo_epi16 (__m128i __A, const int __mask)
{
  unsigned long element_selector_10 = __mask & 0x03;
  unsigned long element_selector_32 = (__mask >> 2) & 0x03;
  unsigned long element_selector_54 = (__mask >> 4) & 0x03;
  unsigned long element_selector_76 = (__mask >> 6) & 0x03;
  static const unsigned short permute_selectors[4] =
    {
#ifdef __LITTLE_ENDIAN__
      0x0100, 0x0302, 0x0504, 0x0706
#elif __BIG_ENDIAN__
      0x0e0f, 0x0c0d, 0x0a0b, 0x0809
#endif
    };
  __v2du pmask = { 0x1011121314151617UL, 0x1f1e1d1c1b1a1918UL};
  __m64_union t;
  __v2du a, r;

#ifdef __LITTLE_ENDIAN__
  t.as_short[0] = permute_selectors[element_selector_10];
  t.as_short[1] = permute_selectors[element_selector_32];
  t.as_short[2] = permute_selectors[element_selector_54];
  t.as_short[3] = permute_selectors[element_selector_76];
#elif __BIG_ENDIAN__
  t.as_short[3] = permute_selectors[element_selector_10];
  t.as_short[2] = permute_selectors[element_selector_32];
  t.as_short[1] = permute_selectors[element_selector_54];
  t.as_short[0] = permute_selectors[element_selector_76];
#endif
#ifdef __LITTLE_ENDIAN__
  pmask[0] = t.as_m64;
#elif __BIG_ENDIAN__
  pmask[1] = t.as_m64;
#endif
  a = (__v2du)__A;
  r = vec_perm (a, a, (__vector unsigned char)pmask);
  return (__m128i) r;
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_epi32 (__m128i __A, const int __mask)
{
  unsigned long element_selector_10 = __mask & 0x03;
  unsigned long element_selector_32 = (__mask >> 2) & 0x03;
  unsigned long element_selector_54 = (__mask >> 4) & 0x03;
  unsigned long element_selector_76 = (__mask >> 6) & 0x03;
  static const unsigned int permute_selectors[4] =
    {
#ifdef __LITTLE_ENDIAN__
      0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
#elif __BIG_ENDIAN__
      0x0C0D0E0F, 0x08090A0B, 0x04050607, 0x00010203
#endif
    };
  __v4su t;

#ifdef __LITTLE_ENDIAN__
  t[0] = permute_selectors[element_selector_10];
  t[1] = permute_selectors[element_selector_32];
  t[2] = permute_selectors[element_selector_54] + 0x10101010;
  t[3] = permute_selectors[element_selector_76] + 0x10101010;
#elif __BIG_ENDIAN__
  t[3] = permute_selectors[element_selector_10] + 0x10101010;
  t[2] = permute_selectors[element_selector_32] + 0x10101010;
  t[1] = permute_selectors[element_selector_54];
  t[0] = permute_selectors[element_selector_76];
#endif
  return (__m128i)vec_perm ((__v4si) __A, (__v4si)__A, (__vector unsigned char)t);
}
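/* Illustrative sketch (assuming SSE2 semantics and _MM_SHUFFLE from
   <xmmintrin.h>):

     __m128i v   = _mm_set_epi32 (3, 2, 1, 0);
     __m128i id  = _mm_shuffle_epi32 (v, _MM_SHUFFLE (3, 2, 1, 0));   -> unchanged
     __m128i rev = _mm_shuffle_epi32 (v, _MM_SHUFFLE (0, 1, 2, 3));   -> words reversed
*/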
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskmoveu_si128 (__m128i __A, __m128i __B, char *__C)
{
  __v2du hibit = { 0x7f7f7f7f7f7f7f7fUL, 0x7f7f7f7f7f7f7f7fUL};
  __v16qu mask, tmp;
  __m128i_u *p = (__m128i_u*)__C;

  tmp = (__v16qu)_mm_loadu_si128(p);
  mask = (__v16qu)vec_cmpgt ((__v16qu)__B, (__v16qu)hibit);
  tmp = vec_sel (tmp, (__v16qu)__A, mask);
  _mm_storeu_si128 (p, (__m128i)tmp);
}
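/* Usage sketch (assuming SSE2 semantics): only the bytes of __A whose
   corresponding byte in __B has its most significant bit set end up in
   memory.  Note that this implementation performs a full 16-byte
   load/select/store of the destination rather than a byte-masked store.

     unsigned char buf[16] = { 0 };
     __m128i data = _mm_set1_epi8 (0x55);
     __m128i mask = _mm_set_epi8 ((char) 0x80, 0, 0, 0, 0, 0, 0, 0,
                                  0, 0, 0, 0, 0, 0, 0, (char) 0x80);
     _mm_maskmoveu_si128 (data, mask, (char *) buf);
     Afterwards only buf[0] and buf[15] hold 0x55.  */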
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_avg_epu8 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_avg ((__v16qu)__A, (__v16qu)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_avg_epu16 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_avg ((__v8hu)__A, (__v8hu)__B);
}
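/* Illustrative sketch (assuming SSE2 semantics): the averages round up,
   i.e. (a + b + 1) >> 1 per element, which vec_avg also computes.

     __m128i r = _mm_avg_epu8 (_mm_set1_epi8 (1), _mm_set1_epi8 (2));
     Each byte of r is 2.  */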
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sad_epu8 (__m128i __A, __m128i __B)
{
  __v16qu a, b;
  __v16qu vmin, vmax, vabsdiff;
  __v4si vsum;
  const __v4su zero = { 0, 0, 0, 0 };
  __v4si result;

  a = (__v16qu) __A;
  b = (__v16qu) __B;
  vmin = vec_min (a, b);
  vmax = vec_max (a, b);
  vabsdiff = vec_sub (vmax, vmin);
  /* Sum four groups of bytes into integers.  */
  vsum = (__vector signed int) vec_sum4s (vabsdiff, zero);
  /* Sum across four integers with two integer results.  */
  result = vec_sum2s (vsum, (__vector signed int) zero);
  /* Rotate the sums into the correct position.  */
#ifdef __LITTLE_ENDIAN__
  result = vec_sld (result, result, 4);
#elif __BIG_ENDIAN__
  result = vec_sld (result, result, 6);
#endif
  return (__m128i) result;
}
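/* Illustrative sketch (assuming SSE2 semantics): each 64-bit half of the
   result receives the sum of absolute differences of its eight bytes,
   held in the low 16 bits of that half.

     __m128i a = _mm_set1_epi8 (3);
     __m128i b = _mm_set1_epi8 (1);
     __m128i s = _mm_sad_epu8 (a, b);
     Each half of s is 8 * |3 - 1| == 16.  */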
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_stream_si32 (int *__A, int __B)
{
  /* Use the data cache block touch for store transient.  */
  __asm__ (
    "dcbtstt 0,%0"
    :
    : "b" (__A)
    : "memory"
    );
  *__A = __B;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_stream_si64 (long long int *__A, long long int __B)
{
  /* Use the data cache block touch for store transient.  */
  __asm__ (
    "dcbtstt 0,%0"
    :
    : "b" (__A)
    : "memory"
    );
  *__A = __B;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_stream_si128 (__m128i *__A, __m128i __B)
{
  /* Use the data cache block touch for store transient.  */
  __asm__ (
    "dcbtstt 0,%0"
    :
    : "b" (__A)
    : "memory"
    );
  *__A = __B;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_stream_pd (double *__A, __m128d __B)
{
  /* Use the data cache block touch for store transient.  */
  __asm__ (
    "dcbtstt 0,%0"
    :
    : "b" (__A)
    : "memory"
    );
  *(__m128d*)__A = __B;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_clflush (void const *__A)
{
  /* Use the data cache block flush.  */
  __asm__ (
    "dcbf 0,%0"
    :
    : "b" (__A)
    : "memory"
    );
}
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_lfence (void)
{
  /* Use light weight sync for load to load ordering.  */
  __atomic_thread_fence (__ATOMIC_RELEASE);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mfence (void)
{
  /* Use heavy weight sync for any to any ordering.  */
  __atomic_thread_fence (__ATOMIC_SEQ_CST);
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi32_si128 (int __A)
{
  return _mm_set_epi32 (0, 0, 0, __A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64_si128 (long long __A)
{
  return __extension__ (__m128i)(__v2di){ __A, 0LL };
}

/* Microsoft intrinsic.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64x_si128 (long long __A)
{
  return __extension__ (__m128i)(__v2di){ __A, 0LL };
}
/* Casts between various SP, DP, INT vector types.  Note that these do no
   conversion of values, they just change the type.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_castpd_ps(__m128d __A)
{
  return (__m128) __A;
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_castpd_si128(__m128d __A)
{
  return (__m128i) __A;
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_castps_pd(__m128 __A)
{
  return (__m128d) __A;
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_castps_si128(__m128 __A)
{
  return (__m128i) __A;
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_castsi128_ps(__m128i __A)
{
  return (__m128) __A;
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_castsi128_pd(__m128i __A)
{
  return (__m128d) __A;
}
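/* Illustrative sketch (assuming SSE2 semantics): the casts reinterpret
   bits, unlike the _mm_cvt* intrinsics, which convert values.

     __m128i i = _mm_set1_epi32 (1);
     __m128  f = _mm_castsi128_ps (i);   -> bit pattern 0x00000001 (a denormal), not 1.0f
     __m128  c = _mm_cvtepi32_ps (i);    -> each lane is 1.0f
*/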
#endif /* EMMINTRIN_H_ */