1 /* Copyright (C) 2003-2017 Free Software Foundation, Inc.
3 This file is part of GCC.
5 GCC is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 3, or (at your option)
8 any later version.
10 GCC is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
15 Under Section 7 of GPL version 3, you are granted additional
16 permissions described in the GCC Runtime Library Exception, version
17 3.1, as published by the Free Software Foundation.
19 You should have received a copy of the GNU General Public License and
20 a copy of the GCC Runtime Library Exception along with this program;
21 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
22 <http://www.gnu.org/licenses/>. */
24 /* Implemented from the specification included in the Intel C++ Compiler
25 User Guide and Reference, version 9.0. */
#ifndef NO_WARN_X86_INTRINSICS
/* This header is distributed to simplify porting x86_64 code that
   makes explicit use of Intel intrinsics to powerpc64le.
   It is the user's responsibility to determine if the results are
   acceptable and make additional changes as necessary.
   Note that much code that uses Intel intrinsics can be rewritten in
   standard C or GNU C extensions, which are more portable and better
   optimized across multiple targets.

   In the specific case of X86 SSE2 (__m128i, __m128d) intrinsics,
   the PowerPC VMX/VSX ISA is a good match for vector double SIMD
   operations.  However, scalar double operations in vector (XMM)
   registers require the POWER8 VSX ISA (2.07) level.  There are also
   important differences in data format and placement of double
   scalars in the vector register.

   For PowerISA, scalar double is held in FPRs (the leftmost 64 bits
   of the low 32 VSRs), while X86_64 SSE2 uses the rightmost 64 bits
   of the XMM register.  These differences require extra steps on
   POWER to match the SSE2 scalar double semantics.

   Most SSE2 scalar double intrinsic operations can be performed more
   efficiently as C language double scalar operations or optimized to
   use vector SIMD operations.  We recommend this for new applications.

   Another difference is the format and details of the X86_64 MXCSR vs
   the PowerISA FPSCR / VSCR registers.  We recommend applications
   replace direct access to the MXCSR with the more portable <fenv.h>
   POSIX APIs.  */
#warning "Please read comment above.  Use -DNO_WARN_X86_INTRINSICS to disable this warning."
#endif
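/* For example (an illustrative sketch, not part of this header's API):
   code that changes the x86 rounding mode via _mm_getcsr / _mm_setcsr
   can usually be rewritten with the standard <fenv.h> interfaces:

     #include <fenv.h>
     int save = fegetround ();
     fesetround (FE_TOWARDZERO);
     ... rounding-sensitive code ...
     fesetround (save);

   Similarly, feclearexcept () and fetestexcept () cover the MXCSR
   exception-flag bits in a portable way.  */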
59 #ifndef EMMINTRIN_H_
60 #define EMMINTRIN_H_
62 #include <altivec.h>
63 #include <assert.h>
65 /* We need definitions from the SSE header files. */
66 #include <xmmintrin.h>
68 /* SSE2 */
69 typedef __vector double __v2df;
70 typedef __vector long long __v2di;
71 typedef __vector unsigned long long __v2du;
72 typedef __vector int __v4si;
73 typedef __vector unsigned int __v4su;
74 typedef __vector short __v8hi;
75 typedef __vector unsigned short __v8hu;
76 typedef __vector signed char __v16qi;
77 typedef __vector unsigned char __v16qu;
79 /* The Intel API is flexible enough that we must allow aliasing with other
80 vector types, and their scalar components. */
81 typedef long long __m128i __attribute__ ((__vector_size__ (16), __may_alias__));
82 typedef double __m128d __attribute__ ((__vector_size__ (16), __may_alias__));
84 /* Unaligned version of the same types. */
85 typedef long long __m128i_u __attribute__ ((__vector_size__ (16), __may_alias__, __aligned__ (1)));
86 typedef double __m128d_u __attribute__ ((__vector_size__ (16), __may_alias__, __aligned__ (1)));
88 /* Create a vector with element 0 as F and the rest zero. */
89 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
90 _mm_set_sd (double __F)
92 return __extension__ (__m128d){ __F, 0.0 };
95 /* Create a vector with both elements equal to F. */
96 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
97 _mm_set1_pd (double __F)
99 return __extension__ (__m128d){ __F, __F };
102 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
103 _mm_set_pd1 (double __F)
105 return _mm_set1_pd (__F);
108 /* Create a vector with the lower value X and upper value W. */
109 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
110 _mm_set_pd (double __W, double __X)
112 return __extension__ (__m128d){ __X, __W };
115 /* Create a vector with the lower value W and upper value X. */
116 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
117 _mm_setr_pd (double __W, double __X)
119 return __extension__ (__m128d){ __W, __X };
122 /* Create an undefined vector. */
123 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
124 _mm_undefined_pd (void)
126 __m128d __Y = __Y;
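/* The self-assignment above intentionally leaves __Y undefined while
   silencing "may be used uninitialized" warnings; any value satisfies
   the _mm_undefined_pd contract.  */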
127 return __Y;
130 /* Create a vector of zeros. */
131 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
132 _mm_setzero_pd (void)
134 return (__m128d) vec_splats (0);
137 /* Sets the low DPFP value of A from the low value of B. */
138 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
139 _mm_move_sd (__m128d __A, __m128d __B)
141 __v2df result = (__v2df) __A;
142 result [0] = ((__v2df) __B)[0];
143 return (__m128d) result;
146 /* Load two DPFP values from P. The address must be 16-byte aligned. */
147 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
148 _mm_load_pd (double const *__P)
150 assert(((unsigned long)__P & 0xfUL) == 0UL);
151 return ((__m128d)vec_ld(0, (__v16qu*)__P));
154 /* Load two DPFP values from P. The address need not be 16-byte aligned. */
155 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
156 _mm_loadu_pd (double const *__P)
158 return (vec_vsx_ld(0, __P));
161 /* Create a vector with all two elements equal to *P. */
162 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
163 _mm_load1_pd (double const *__P)
165 return (vec_splats (*__P));
168 /* Create a vector with element 0 as *P and the rest zero. */
169 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
170 _mm_load_sd (double const *__P)
172 return _mm_set_sd (*__P);
175 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
176 _mm_load_pd1 (double const *__P)
178 return _mm_load1_pd (__P);
181 /* Load two DPFP values in reverse order. The address must be aligned. */
182 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
183 _mm_loadr_pd (double const *__P)
185 __v2df __tmp = _mm_load_pd (__P);
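/* vec_xxpermdi with immediate 2 swaps the two doublewords of __tmp,
   so the returned vector is { __P[1], __P[0] }.  */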
186 return (__m128d)vec_xxpermdi (__tmp, __tmp, 2);
189 /* Store two DPFP values. The address must be 16-byte aligned. */
190 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
191 _mm_store_pd (double *__P, __m128d __A)
193 assert(((unsigned long)__P & 0xfUL) == 0UL);
194 vec_st((__v16qu)__A, 0, (__v16qu*)__P);
197 /* Store two DPFP values. The address need not be 16-byte aligned. */
198 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
199 _mm_storeu_pd (double *__P, __m128d __A)
201 *(__m128d *)__P = __A;
204 /* Stores the lower DPFP value. */
205 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
206 _mm_store_sd (double *__P, __m128d __A)
208 *__P = ((__v2df)__A)[0];
211 extern __inline double __attribute__((__gnu_inline__, __always_inline__, __artificial__))
212 _mm_cvtsd_f64 (__m128d __A)
214 return ((__v2df)__A)[0];
217 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
218 _mm_storel_pd (double *__P, __m128d __A)
220 _mm_store_sd (__P, __A);
223 /* Stores the upper DPFP value. */
224 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
225 _mm_storeh_pd (double *__P, __m128d __A)
227 *__P = ((__v2df)__A)[1];
229 /* Store the lower DPFP value across two words.
230 The address must be 16-byte aligned. */
231 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
232 _mm_store1_pd (double *__P, __m128d __A)
234 _mm_store_pd (__P, vec_splat (__A, 0));
237 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
238 _mm_store_pd1 (double *__P, __m128d __A)
240 _mm_store1_pd (__P, __A);
243 /* Store two DPFP values in reverse order. The address must be aligned. */
244 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
245 _mm_storer_pd (double *__P, __m128d __A)
247 _mm_store_pd (__P, vec_xxpermdi (__A, __A, 2));
250 /* Intel intrinsic. */
251 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
252 _mm_cvtsi128_si64 (__m128i __A)
254 return ((__v2di)__A)[0];
257 /* Microsoft intrinsic. */
258 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
259 _mm_cvtsi128_si64x (__m128i __A)
261 return ((__v2di)__A)[0];
264 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
265 _mm_add_pd (__m128d __A, __m128d __B)
267 return (__m128d) ((__v2df)__A + (__v2df)__B);
270 /* Add the lower double-precision (64-bit) floating-point element in
271 a and b, store the result in the lower element of dst, and copy
272 the upper element from a to the upper element of dst. */
273 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
274 _mm_add_sd (__m128d __A, __m128d __B)
276 __A[0] = __A[0] + __B[0];
277 return (__A);
280 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
281 _mm_sub_pd (__m128d __A, __m128d __B)
283 return (__m128d) ((__v2df)__A - (__v2df)__B);
286 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
287 _mm_sub_sd (__m128d __A, __m128d __B)
289 __A[0] = __A[0] - __B[0];
290 return (__A);
293 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
294 _mm_mul_pd (__m128d __A, __m128d __B)
296 return (__m128d) ((__v2df)__A * (__v2df)__B);
299 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
300 _mm_mul_sd (__m128d __A, __m128d __B)
302 __A[0] = __A[0] * __B[0];
303 return (__A);
306 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
307 _mm_div_pd (__m128d __A, __m128d __B)
309 return (__m128d) ((__v2df)__A / (__v2df)__B);
312 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
313 _mm_div_sd (__m128d __A, __m128d __B)
315 __A[0] = __A[0] / __B[0];
316 return (__A);
319 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
320 _mm_sqrt_pd (__m128d __A)
322 return (vec_sqrt (__A));
325 /* Return pair {sqrt (B[0]), A[1]}. */
326 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
327 _mm_sqrt_sd (__m128d __A, __m128d __B)
329 __v2df c;
330 c = vec_sqrt ((__v2df) _mm_set1_pd (__B[0]));
331 return (__m128d) _mm_setr_pd (c[0], __A[1]);
334 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
335 _mm_min_pd (__m128d __A, __m128d __B)
337 return (vec_min (__A, __B));
340 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
341 _mm_min_sd (__m128d __A, __m128d __B)
343 __v2df a, b, c;
344 a = vec_splats (__A[0]);
345 b = vec_splats (__B[0]);
346 c = vec_min (a, b);
347 return (__m128d) _mm_setr_pd (c[0], __A[1]);
350 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
351 _mm_max_pd (__m128d __A, __m128d __B)
353 return (vec_max (__A, __B));
356 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
357 _mm_max_sd (__m128d __A, __m128d __B)
359 __v2df a, b, c;
360 a = vec_splats (__A[0]);
361 b = vec_splats (__B[0]);
362 c = vec_max (a, b);
363 return (__m128d) _mm_setr_pd (c[0], __A[1]);
366 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
367 _mm_cmpeq_pd (__m128d __A, __m128d __B)
369 return ((__m128d)vec_cmpeq ((__v2df) __A, (__v2df) __B));
372 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
373 _mm_cmplt_pd (__m128d __A, __m128d __B)
375 return ((__m128d)vec_cmplt ((__v2df) __A, (__v2df) __B));
378 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
379 _mm_cmple_pd (__m128d __A, __m128d __B)
381 return ((__m128d)vec_cmple ((__v2df) __A, (__v2df) __B));
384 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
385 _mm_cmpgt_pd (__m128d __A, __m128d __B)
387 return ((__m128d)vec_cmpgt ((__v2df) __A, (__v2df) __B));
390 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
391 _mm_cmpge_pd (__m128d __A, __m128d __B)
393 return ((__m128d)vec_cmpge ((__v2df) __A,(__v2df) __B));
396 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
397 _mm_cmpneq_pd (__m128d __A, __m128d __B)
399 __v2df temp = (__v2df) vec_cmpeq ((__v2df) __A, (__v2df)__B);
400 return ((__m128d)vec_nor (temp, temp));
403 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
404 _mm_cmpnlt_pd (__m128d __A, __m128d __B)
406 return ((__m128d)vec_cmpge ((__v2df) __A, (__v2df) __B));
409 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
410 _mm_cmpnle_pd (__m128d __A, __m128d __B)
412 return ((__m128d)vec_cmpgt ((__v2df) __A, (__v2df) __B));
415 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
416 _mm_cmpngt_pd (__m128d __A, __m128d __B)
418 return ((__m128d)vec_cmple ((__v2df) __A, (__v2df) __B));
421 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
422 _mm_cmpnge_pd (__m128d __A, __m128d __B)
424 return ((__m128d)vec_cmplt ((__v2df) __A, (__v2df) __B));
427 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
428 _mm_cmpord_pd (__m128d __A, __m128d __B)
430 #if _ARCH_PWR8
431 __v2du c, d;
432 /* Compare against self will return false (0's) if NAN. */
433 c = (__v2du)vec_cmpeq (__A, __A);
434 d = (__v2du)vec_cmpeq (__B, __B);
435 #else
436 __v2du a, b;
437 __v2du c, d;
438 const __v2du double_exp_mask = {0x7ff0000000000000, 0x7ff0000000000000};
439 a = (__v2du)vec_abs ((__v2df)__A);
440 b = (__v2du)vec_abs ((__v2df)__B);
441 c = (__v2du)vec_cmpgt (double_exp_mask, a);
442 d = (__v2du)vec_cmpgt (double_exp_mask, b);
443 #endif
444 /* A != NAN and B != NAN. */
445 return ((__m128d)vec_and(c, d));
448 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
449 _mm_cmpunord_pd (__m128d __A, __m128d __B)
451 #if _ARCH_PWR8
452 __v2du c, d;
453 /* Compare against self will return false (0's) if NAN. */
454 c = (__v2du)vec_cmpeq ((__v2df)__A, (__v2df)__A);
455 d = (__v2du)vec_cmpeq ((__v2df)__B, (__v2df)__B);
/* A == NAN OR B == NAN converts to:
   NOT(A != NAN) OR NOT(B != NAN).  */
458 c = vec_nor (c, c);
459 return ((__m128d)vec_orc(c, d));
460 #else
461 __v2du c, d;
462 /* Compare against self will return false (0's) if NAN. */
463 c = (__v2du)vec_cmpeq ((__v2df)__A, (__v2df)__A);
464 d = (__v2du)vec_cmpeq ((__v2df)__B, (__v2df)__B);
/* Invert so that true ('1's) means NAN.  */
466 c = vec_nor (c, c);
467 d = vec_nor (d, d);
468 return ((__m128d)vec_or(c, d));
469 #endif
472 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
473 _mm_cmpeq_sd(__m128d __A, __m128d __B)
475 __v2df a, b, c;
/* PowerISA VSX does not allow partial (for just the lower double)
   results.  So to ensure we don't generate spurious exceptions
   (from the upper double values) we splat the lower double
   before we do the operation.  */
480 a = vec_splats (__A[0]);
481 b = vec_splats (__B[0]);
482 c = (__v2df) vec_cmpeq(a, b);
483 /* Then we merge the lower double result with the original upper
484 double from __A. */
485 return (__m128d) _mm_setr_pd (c[0], __A[1]);
488 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
489 _mm_cmplt_sd (__m128d __A, __m128d __B)
491 __v2df a, b, c;
492 a = vec_splats (__A[0]);
493 b = vec_splats (__B[0]);
494 c = (__v2df) vec_cmplt(a, b);
495 return (__m128d) _mm_setr_pd (c[0], __A[1]);
498 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
499 _mm_cmple_sd (__m128d __A, __m128d __B)
501 __v2df a, b, c;
502 a = vec_splats (__A[0]);
503 b = vec_splats (__B[0]);
504 c = (__v2df) vec_cmple(a, b);
505 return (__m128d) _mm_setr_pd (c[0], __A[1]);
508 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
509 _mm_cmpgt_sd (__m128d __A, __m128d __B)
511 __v2df a, b, c;
512 a = vec_splats (__A[0]);
513 b = vec_splats (__B[0]);
514 c = (__v2df) vec_cmpgt(a, b);
515 return (__m128d) _mm_setr_pd (c[0], __A[1]);
518 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
519 _mm_cmpge_sd (__m128d __A, __m128d __B)
521 __v2df a, b, c;
522 a = vec_splats (__A[0]);
523 b = vec_splats (__B[0]);
524 c = (__v2df) vec_cmpge(a, b);
525 return (__m128d) _mm_setr_pd (c[0], __A[1]);
528 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
529 _mm_cmpneq_sd (__m128d __A, __m128d __B)
531 __v2df a, b, c;
532 a = vec_splats (__A[0]);
533 b = vec_splats (__B[0]);
534 c = (__v2df) vec_cmpeq(a, b);
535 c = vec_nor (c, c);
536 return (__m128d) _mm_setr_pd (c[0], __A[1]);
539 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
540 _mm_cmpnlt_sd (__m128d __A, __m128d __B)
542 __v2df a, b, c;
543 a = vec_splats (__A[0]);
544 b = vec_splats (__B[0]);
545 /* Not less than is just greater than or equal. */
546 c = (__v2df) vec_cmpge(a, b);
547 return (__m128d) _mm_setr_pd (c[0], __A[1]);
550 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
551 _mm_cmpnle_sd (__m128d __A, __m128d __B)
553 __v2df a, b, c;
554 a = vec_splats (__A[0]);
555 b = vec_splats (__B[0]);
556 /* Not less than or equal is just greater than. */
c = (__v2df) vec_cmpgt(a, b);
558 return (__m128d) _mm_setr_pd (c[0], __A[1]);
561 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
562 _mm_cmpngt_sd (__m128d __A, __m128d __B)
564 __v2df a, b, c;
565 a = vec_splats (__A[0]);
566 b = vec_splats (__B[0]);
567 /* Not greater than is just less than or equal. */
568 c = (__v2df) vec_cmple(a, b);
569 return (__m128d) _mm_setr_pd (c[0], __A[1]);
572 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
573 _mm_cmpnge_sd (__m128d __A, __m128d __B)
575 __v2df a, b, c;
576 a = vec_splats (__A[0]);
577 b = vec_splats (__B[0]);
578 /* Not greater than or equal is just less than. */
579 c = (__v2df) vec_cmplt(a, b);
580 return (__m128d) _mm_setr_pd (c[0], __A[1]);
583 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
584 _mm_cmpord_sd (__m128d __A, __m128d __B)
586 __v2df r;
587 r = (__v2df)_mm_cmpord_pd (vec_splats (__A[0]), vec_splats (__B[0]));
588 return (__m128d) _mm_setr_pd (r[0], ((__v2df)__A)[1]);
591 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
592 _mm_cmpunord_sd (__m128d __A, __m128d __B)
594 __v2df r;
595 r = _mm_cmpunord_pd (vec_splats (__A[0]), vec_splats (__B[0]));
596 return (__m128d) _mm_setr_pd (r[0], __A[1]);
/* FIXME
   The _mm_comi??_sd and _mm_ucomi??_sd implementations below are
   exactly the same because GCC for PowerPC only generates unordered
   compares (scalar and vector).
   Technically _mm_comieq_sd et al. should be using the ordered
   compare and signal for QNaNs.  The _mm_ucomieq_sd et al. should
   be OK.  */
606 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
607 _mm_comieq_sd (__m128d __A, __m128d __B)
609 return (__A[0] == __B[0]);
612 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
613 _mm_comilt_sd (__m128d __A, __m128d __B)
615 return (__A[0] < __B[0]);
618 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
619 _mm_comile_sd (__m128d __A, __m128d __B)
621 return (__A[0] <= __B[0]);
624 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
625 _mm_comigt_sd (__m128d __A, __m128d __B)
627 return (__A[0] > __B[0]);
630 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
631 _mm_comige_sd (__m128d __A, __m128d __B)
633 return (__A[0] >= __B[0]);
636 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
637 _mm_comineq_sd (__m128d __A, __m128d __B)
639 return (__A[0] != __B[0]);
642 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
643 _mm_ucomieq_sd (__m128d __A, __m128d __B)
645 return (__A[0] == __B[0]);
648 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
649 _mm_ucomilt_sd (__m128d __A, __m128d __B)
651 return (__A[0] < __B[0]);
654 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
655 _mm_ucomile_sd (__m128d __A, __m128d __B)
657 return (__A[0] <= __B[0]);
660 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
661 _mm_ucomigt_sd (__m128d __A, __m128d __B)
663 return (__A[0] > __B[0]);
666 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
667 _mm_ucomige_sd (__m128d __A, __m128d __B)
669 return (__A[0] >= __B[0]);
672 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
673 _mm_ucomineq_sd (__m128d __A, __m128d __B)
675 return (__A[0] != __B[0]);
678 /* Create a vector of Qi, where i is the element number. */
679 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
680 _mm_set_epi64x (long long __q1, long long __q0)
682 return __extension__ (__m128i)(__v2di){ __q0, __q1 };
685 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
686 _mm_set_epi64 (__m64 __q1, __m64 __q0)
688 return _mm_set_epi64x ((long long)__q1, (long long)__q0);
691 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
692 _mm_set_epi32 (int __q3, int __q2, int __q1, int __q0)
694 return __extension__ (__m128i)(__v4si){ __q0, __q1, __q2, __q3 };
697 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
698 _mm_set_epi16 (short __q7, short __q6, short __q5, short __q4,
699 short __q3, short __q2, short __q1, short __q0)
701 return __extension__ (__m128i)(__v8hi){
702 __q0, __q1, __q2, __q3, __q4, __q5, __q6, __q7 };
705 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
706 _mm_set_epi8 (char __q15, char __q14, char __q13, char __q12,
707 char __q11, char __q10, char __q09, char __q08,
708 char __q07, char __q06, char __q05, char __q04,
709 char __q03, char __q02, char __q01, char __q00)
711 return __extension__ (__m128i)(__v16qi){
712 __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07,
713 __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15
717 /* Set all of the elements of the vector to A. */
718 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
719 _mm_set1_epi64x (long long __A)
721 return _mm_set_epi64x (__A, __A);
724 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
725 _mm_set1_epi64 (__m64 __A)
727 return _mm_set_epi64 (__A, __A);
730 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
731 _mm_set1_epi32 (int __A)
733 return _mm_set_epi32 (__A, __A, __A, __A);
736 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
737 _mm_set1_epi16 (short __A)
739 return _mm_set_epi16 (__A, __A, __A, __A, __A, __A, __A, __A);
742 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
743 _mm_set1_epi8 (char __A)
745 return _mm_set_epi8 (__A, __A, __A, __A, __A, __A, __A, __A,
746 __A, __A, __A, __A, __A, __A, __A, __A);
749 /* Create a vector of Qi, where i is the element number.
750 The parameter order is reversed from the _mm_set_epi* functions. */
751 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
752 _mm_setr_epi64 (__m64 __q0, __m64 __q1)
754 return _mm_set_epi64 (__q1, __q0);
757 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
758 _mm_setr_epi32 (int __q0, int __q1, int __q2, int __q3)
760 return _mm_set_epi32 (__q3, __q2, __q1, __q0);
763 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
764 _mm_setr_epi16 (short __q0, short __q1, short __q2, short __q3,
765 short __q4, short __q5, short __q6, short __q7)
767 return _mm_set_epi16 (__q7, __q6, __q5, __q4, __q3, __q2, __q1, __q0);
770 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
771 _mm_setr_epi8 (char __q00, char __q01, char __q02, char __q03,
772 char __q04, char __q05, char __q06, char __q07,
773 char __q08, char __q09, char __q10, char __q11,
774 char __q12, char __q13, char __q14, char __q15)
776 return _mm_set_epi8 (__q15, __q14, __q13, __q12, __q11, __q10, __q09, __q08,
777 __q07, __q06, __q05, __q04, __q03, __q02, __q01, __q00);
/* Load 128 bits of integer data.  The address must be 16-byte aligned.  */
781 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
782 _mm_load_si128 (__m128i const *__P)
784 return *__P;
787 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
788 _mm_loadu_si128 (__m128i_u const *__P)
790 return (__m128i) (vec_vsx_ld(0, (signed int const *)__P));
793 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
794 _mm_loadl_epi64 (__m128i_u const *__P)
796 return _mm_set_epi64 ((__m64)0LL, *(__m64 *)__P);
799 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
800 _mm_store_si128 (__m128i *__P, __m128i __B)
802 assert(((unsigned long )__P & 0xfUL) == 0UL);
803 vec_st ((__v16qu) __B, 0, (__v16qu*)__P);
806 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
807 _mm_storeu_si128 (__m128i_u *__P, __m128i __B)
809 *__P = __B;
812 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
813 _mm_storel_epi64 (__m128i_u *__P, __m128i __B)
815 *(long long *)__P = ((__v2di)__B)[0];
818 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
819 _mm_movepi64_pi64 (__m128i_u __B)
821 return (__m64) ((__v2di)__B)[0];
824 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
825 _mm_movpi64_epi64 (__m64 __A)
827 return _mm_set_epi64 ((__m64)0LL, __A);
830 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
831 _mm_move_epi64 (__m128i __A)
833 return _mm_set_epi64 ((__m64)0LL, (__m64)__A[0]);
836 /* Create an undefined vector. */
837 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
838 _mm_undefined_si128 (void)
840 __m128i __Y = __Y;
841 return __Y;
844 /* Create a vector of zeros. */
845 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
846 _mm_setzero_si128 (void)
848 return __extension__ (__m128i)(__v4si){ 0, 0, 0, 0 };
851 #ifdef _ARCH_PWR8
852 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
853 _mm_cvtepi32_pd (__m128i __A)
855 __v2di val;
/* For LE we need to generate the Vector Unpack Low Signed Word
   instruction, which vec_unpackh produces on this target.  */
858 val = (__v2di)vec_unpackh ((__v4si)__A);
860 return (__m128d)vec_ctf (val, 0);
862 #endif
864 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
865 _mm_cvtepi32_ps (__m128i __A)
867 return ((__m128)vec_ctf((__v4si)__A, 0));
870 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
871 _mm_cvtpd_epi32 (__m128d __A)
873 __v2df rounded = vec_rint (__A);
874 __v4si result, temp;
875 const __v4si vzero =
876 { 0, 0, 0, 0 };
878 /* VSX Vector truncate Double-Precision to integer and Convert to
879 Signed Integer Word format with Saturate. */
880 __asm__(
881 "xvcvdpsxws %x0,%x1"
882 : "=wa" (temp)
883 : "wa" (rounded)
884 : );
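/* xvcvdpsxws leaves one converted word in each doubleword of temp;
   the merge/pack (or permute) sequence below gathers those two words
   into the low half of the result and zeroes the upper half, matching
   the _mm_cvtpd_epi32 result layout.  */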
886 #ifdef _ARCH_PWR8
887 temp = vec_mergeo (temp, temp);
888 result = (__v4si)vec_vpkudum ((vector long)temp, (vector long)vzero);
889 #else
891 const __v16qu pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
892 0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f };
893 result = (__v4si) vec_perm ((__v16qu) temp, (__v16qu) vzero, pkperm);
895 #endif
896 return (__m128i) result;
899 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
900 _mm_cvtpd_pi32 (__m128d __A)
902 __m128i result = _mm_cvtpd_epi32(__A);
904 return (__m64) result[0];
907 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
908 _mm_cvtpd_ps (__m128d __A)
910 __v4sf result;
911 __v4si temp;
912 const __v4si vzero = { 0, 0, 0, 0 };
914 __asm__(
915 "xvcvdpsp %x0,%x1"
916 : "=wa" (temp)
917 : "wa" (__A)
918 : );
920 #ifdef _ARCH_PWR8
921 temp = vec_mergeo (temp, temp);
922 result = (__v4sf)vec_vpkudum ((vector long)temp, (vector long)vzero);
923 #else
925 const __v16qu pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
926 0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f };
927 result = (__v4sf) vec_perm ((__v16qu) temp, (__v16qu) vzero, pkperm);
929 #endif
930 return ((__m128)result);
933 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
934 _mm_cvttpd_epi32 (__m128d __A)
936 __v4si result;
937 __v4si temp;
938 const __v4si vzero = { 0, 0, 0, 0 };
940 /* VSX Vector truncate Double-Precision to integer and Convert to
941 Signed Integer Word format with Saturate. */
942 __asm__(
943 "xvcvdpsxws %x0,%x1"
944 : "=wa" (temp)
945 : "wa" (__A)
946 : );
948 #ifdef _ARCH_PWR8
949 temp = vec_mergeo (temp, temp);
950 result = (__v4si)vec_vpkudum ((vector long)temp, (vector long)vzero);
951 #else
953 const __v16qu pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
954 0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f };
955 result = (__v4si) vec_perm ((__v16qu) temp, (__v16qu) vzero, pkperm);
957 #endif
959 return ((__m128i) result);
962 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
963 _mm_cvttpd_pi32 (__m128d __A)
965 __m128i result = _mm_cvttpd_epi32 (__A);
967 return (__m64) result[0];
970 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
971 _mm_cvtsi128_si32 (__m128i __A)
973 return ((__v4si)__A)[0];
976 #ifdef _ARCH_PWR8
977 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
978 _mm_cvtpi32_pd (__m64 __A)
980 __v4si temp;
981 __v2di tmp2;
982 __v2df result;
984 temp = (__v4si)vec_splats (__A);
985 tmp2 = (__v2di)vec_unpackl (temp);
986 result = vec_ctf ((__vector signed long)tmp2, 0);
987 return (__m128d)result;
989 #endif
991 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
992 _mm_cvtps_epi32 (__m128 __A)
994 __v4sf rounded;
995 __v4si result;
997 rounded = vec_rint((__v4sf) __A);
998 result = vec_cts (rounded, 0);
999 return (__m128i) result;
1002 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1003 _mm_cvttps_epi32 (__m128 __A)
1005 __v4si result;
1007 result = vec_cts ((__v4sf) __A, 0);
1008 return (__m128i) result;
1011 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1012 _mm_cvtps_pd (__m128 __A)
1014 /* Check if vec_doubleh is defined by <altivec.h>. If so use that. */
1015 #ifdef vec_doubleh
1016 return (__m128d) vec_doubleh ((__v4sf)__A);
1017 #else
/* Otherwise the compiler is not current, so we need to generate the
   equivalent code.  */
1020 __v4sf a = (__v4sf)__A;
1021 __v4sf temp;
1022 __v2df result;
1023 #ifdef __LITTLE_ENDIAN__
/* The input float values are in elements {[0], [1]} but the convert
   instruction needs them in elements {[1], [3]}, so we use two
   VSX Shift Left Double by Word Immediate operations to get the
   elements lined up.  */
1028 temp = __builtin_vsx_xxsldwi (a, a, 3);
1029 temp = __builtin_vsx_xxsldwi (a, temp, 2);
1030 #elif __BIG_ENDIAN__
/* The input float values are in elements {[0], [1]} but the convert
   instruction needs them in elements {[0], [2]}, so we use a
   Vector Merge High Word to get the elements lined up.  */
1035 temp = vec_vmrghw (a, a);
1036 #endif
1037 __asm__(
1038 " xvcvspdp %x0,%x1"
1039 : "=wa" (result)
1040 : "wa" (temp)
1041 : );
1042 return (__m128d) result;
1043 #endif
1046 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1047 _mm_cvtsd_si32 (__m128d __A)
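/* vec_rint rounds using the current rounding mode, as the Intel
   cvtsd2si instruction does via the MXCSR; the conversion to int
   below is then exact for in-range values.  */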
1049 __v2df rounded = vec_rint((__v2df) __A);
1050 int result = ((__v2df)rounded)[0];
1052 return result;
1054 /* Intel intrinsic. */
1055 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1056 _mm_cvtsd_si64 (__m128d __A)
1058 __v2df rounded = vec_rint ((__v2df) __A );
1059 long long result = ((__v2df) rounded)[0];
1061 return result;
1064 /* Microsoft intrinsic. */
1065 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1066 _mm_cvtsd_si64x (__m128d __A)
1068 return _mm_cvtsd_si64 ((__v2df)__A);
1071 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1072 _mm_cvttsd_si32 (__m128d __A)
1074 int result = ((__v2df)__A)[0];
1076 return result;
1079 /* Intel intrinsic. */
1080 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1081 _mm_cvttsd_si64 (__m128d __A)
1083 long long result = ((__v2df)__A)[0];
1085 return result;
1088 /* Microsoft intrinsic. */
1089 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1090 _mm_cvttsd_si64x (__m128d __A)
1092 return _mm_cvttsd_si64 (__A);
1095 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1096 _mm_cvtsd_ss (__m128 __A, __m128d __B)
1098 __v4sf result = (__v4sf)__A;
1100 #ifdef __LITTLE_ENDIAN__
1101 __v4sf temp_s;
1102 /* Copy double element[0] to element [1] for conversion. */
1103 __v2df temp_b = vec_splat((__v2df)__B, 0);
1105 /* Pre-rotate __A left 3 (logically right 1) elements. */
1106 result = __builtin_vsx_xxsldwi (result, result, 3);
1107 /* Convert double to single float scalar in a vector. */
1108 __asm__(
1109 "xscvdpsp %x0,%x1"
1110 : "=wa" (temp_s)
1111 : "wa" (temp_b)
1112 : );
1113 /* Shift the resulting scalar into vector element [0]. */
1114 result = __builtin_vsx_xxsldwi (result, temp_s, 1);
1115 #else
1116 result [0] = ((__v2df)__B)[0];
1117 #endif
1118 return (__m128) result;
1121 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1122 _mm_cvtsi32_sd (__m128d __A, int __B)
1124 __v2df result = (__v2df)__A;
1125 double db = __B;
1126 result [0] = db;
1127 return (__m128d)result;
1130 /* Intel intrinsic. */
1131 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1132 _mm_cvtsi64_sd (__m128d __A, long long __B)
1134 __v2df result = (__v2df)__A;
1135 double db = __B;
1136 result [0] = db;
1137 return (__m128d)result;
1140 /* Microsoft intrinsic. */
1141 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1142 _mm_cvtsi64x_sd (__m128d __A, long long __B)
1144 return _mm_cvtsi64_sd (__A, __B);
1147 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1148 _mm_cvtss_sd (__m128d __A, __m128 __B)
1150 #ifdef __LITTLE_ENDIAN__
1151 /* Use splat to move element [0] into position for the convert. */
1152 __v4sf temp = vec_splat ((__v4sf)__B, 0);
1153 __v2df res;
1154 /* Convert single float scalar to double in a vector. */
1155 __asm__(
1156 "xscvspdp %x0,%x1"
1157 : "=wa" (res)
1158 : "wa" (temp)
1159 : );
1160 return (__m128d) vec_mergel (res, (__v2df)__A);
1161 #else
1162 __v2df res = (__v2df)__A;
1163 res [0] = ((__v4sf)__B) [0];
1164 return (__m128d) res;
1165 #endif
1168 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1169 _mm_shuffle_pd(__m128d __A, __m128d __B, const int __mask)
1171 __vector double result;
1172 const int litmsk = __mask & 0x3;
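/* Bit 0 of the mask selects element 0 or 1 of __A for the result's
   low element; bit 1 selects element 0 or 1 of __B for the result's
   high element.  Masks 0 and 3 map directly onto vec_mergeh and
   vec_mergel; the mixed cases use vec_xxpermdi, with the operands
   swapped for GCC versions before 6 to match the older builtin's
   operand convention.  */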
1174 if (litmsk == 0)
1175 result = vec_mergeh (__A, __B);
1176 #if __GNUC__ < 6
1177 else if (litmsk == 1)
1178 result = vec_xxpermdi (__B, __A, 2);
1179 else if (litmsk == 2)
1180 result = vec_xxpermdi (__B, __A, 1);
1181 #else
1182 else if (litmsk == 1)
1183 result = vec_xxpermdi (__A, __B, 2);
1184 else if (litmsk == 2)
1185 result = vec_xxpermdi (__A, __B, 1);
1186 #endif
1187 else
1188 result = vec_mergel (__A, __B);
1190 return result;
1193 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1194 _mm_unpackhi_pd (__m128d __A, __m128d __B)
1196 return (__m128d) vec_mergel ((__v2df)__A, (__v2df)__B);
1199 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1200 _mm_unpacklo_pd (__m128d __A, __m128d __B)
1202 return (__m128d) vec_mergeh ((__v2df)__A, (__v2df)__B);
1205 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1206 _mm_loadh_pd (__m128d __A, double const *__B)
1208 __v2df result = (__v2df)__A;
1209 result [1] = *__B;
1210 return (__m128d)result;
1213 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1214 _mm_loadl_pd (__m128d __A, double const *__B)
1216 __v2df result = (__v2df)__A;
1217 result [0] = *__B;
1218 return (__m128d)result;
1221 #ifdef _ARCH_PWR8
1222 /* Intrinsic functions that require PowerISA 2.07 minimum. */
1224 /* Creates a 2-bit mask from the most significant bits of the DPFP values. */
1225 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1226 _mm_movemask_pd (__m128d __A)
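/* vec_vbpermq gathers individual bits of the source under control of
   the permute mask below: the control bytes 0x00 and 0x40 select bit 0
   and bit 64 (the sign bits of the two doublewords), while the 0x80
   bytes contribute zeros.  This yields the 2-bit mask the Intel
   intrinsic defines.  */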
1228 __vector __m64 result;
1229 static const __vector unsigned int perm_mask =
1231 #ifdef __LITTLE_ENDIAN__
1232 0x80800040, 0x80808080, 0x80808080, 0x80808080
1233 #elif __BIG_ENDIAN__
1234 0x80808080, 0x80808080, 0x80808080, 0x80800040
1235 #endif
1238 result = (__vector __m64) vec_vbpermq ((__vector unsigned char) __A,
1239 (__vector unsigned char) perm_mask);
1241 #ifdef __LITTLE_ENDIAN__
1242 return result[1];
1243 #elif __BIG_ENDIAN__
1244 return result[0];
1245 #endif
1247 #endif /* _ARCH_PWR8 */
1249 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1250 _mm_packs_epi16 (__m128i __A, __m128i __B)
1252 return (__m128i) vec_packs ((__v8hi) __A, (__v8hi)__B);
1255 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1256 _mm_packs_epi32 (__m128i __A, __m128i __B)
1258 return (__m128i) vec_packs ((__v4si)__A, (__v4si)__B);
1261 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1262 _mm_packus_epi16 (__m128i __A, __m128i __B)
1264 return (__m128i) vec_packsu ((__v8hi) __A, (__v8hi)__B);
1267 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1268 _mm_unpackhi_epi8 (__m128i __A, __m128i __B)
1270 return (__m128i) vec_mergel ((__v16qu)__A, (__v16qu)__B);
1273 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1274 _mm_unpackhi_epi16 (__m128i __A, __m128i __B)
1276 return (__m128i) vec_mergel ((__v8hu)__A, (__v8hu)__B);
1279 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1280 _mm_unpackhi_epi32 (__m128i __A, __m128i __B)
1282 return (__m128i) vec_mergel ((__v4su)__A, (__v4su)__B);
1285 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1286 _mm_unpackhi_epi64 (__m128i __A, __m128i __B)
1288 return (__m128i) vec_mergel ((__vector long)__A, (__vector long)__B);
1291 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1292 _mm_unpacklo_epi8 (__m128i __A, __m128i __B)
1294 return (__m128i) vec_mergeh ((__v16qu)__A, (__v16qu)__B);
1297 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1298 _mm_unpacklo_epi16 (__m128i __A, __m128i __B)
1300 return (__m128i) vec_mergeh ((__v8hi)__A, (__v8hi)__B);
1303 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1304 _mm_unpacklo_epi32 (__m128i __A, __m128i __B)
1306 return (__m128i) vec_mergeh ((__v4si)__A, (__v4si)__B);
1309 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1310 _mm_unpacklo_epi64 (__m128i __A, __m128i __B)
1312 return (__m128i) vec_mergeh ((__vector long)__A, (__vector long)__B);
1315 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1316 _mm_add_epi8 (__m128i __A, __m128i __B)
1318 return (__m128i) ((__v16qu)__A + (__v16qu)__B);
1321 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1322 _mm_add_epi16 (__m128i __A, __m128i __B)
1324 return (__m128i) ((__v8hu)__A + (__v8hu)__B);
1327 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1328 _mm_add_epi32 (__m128i __A, __m128i __B)
1330 return (__m128i) ((__v4su)__A + (__v4su)__B);
1333 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1334 _mm_add_epi64 (__m128i __A, __m128i __B)
1336 return (__m128i) ((__v2du)__A + (__v2du)__B);
1339 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1340 _mm_adds_epi8 (__m128i __A, __m128i __B)
1342 return (__m128i) vec_adds ((__v16qi)__A, (__v16qi)__B);
1345 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1346 _mm_adds_epi16 (__m128i __A, __m128i __B)
1348 return (__m128i) vec_adds ((__v8hi)__A, (__v8hi)__B);
1351 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1352 _mm_adds_epu8 (__m128i __A, __m128i __B)
1354 return (__m128i) vec_adds ((__v16qu)__A, (__v16qu)__B);
1357 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1358 _mm_adds_epu16 (__m128i __A, __m128i __B)
1360 return (__m128i) vec_adds ((__v8hu)__A, (__v8hu)__B);
1363 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1364 _mm_sub_epi8 (__m128i __A, __m128i __B)
1366 return (__m128i) ((__v16qu)__A - (__v16qu)__B);
1369 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1370 _mm_sub_epi16 (__m128i __A, __m128i __B)
1372 return (__m128i) ((__v8hu)__A - (__v8hu)__B);
1375 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1376 _mm_sub_epi32 (__m128i __A, __m128i __B)
1378 return (__m128i) ((__v4su)__A - (__v4su)__B);
1381 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1382 _mm_sub_epi64 (__m128i __A, __m128i __B)
1384 return (__m128i) ((__v2du)__A - (__v2du)__B);
1387 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1388 _mm_subs_epi8 (__m128i __A, __m128i __B)
1390 return (__m128i) vec_subs ((__v16qi)__A, (__v16qi)__B);
1393 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1394 _mm_subs_epi16 (__m128i __A, __m128i __B)
1396 return (__m128i) vec_subs ((__v8hi)__A, (__v8hi)__B);
1399 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1400 _mm_subs_epu8 (__m128i __A, __m128i __B)
1402 return (__m128i) vec_subs ((__v16qu)__A, (__v16qu)__B);
1405 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1406 _mm_subs_epu16 (__m128i __A, __m128i __B)
1408 return (__m128i) vec_subs ((__v8hu)__A, (__v8hu)__B);
1411 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1412 _mm_madd_epi16 (__m128i __A, __m128i __B)
1414 __vector signed int zero = {0, 0, 0, 0};
1416 return (__m128i) vec_vmsumshm ((__v8hi)__A, (__v8hi)__B, zero);
1419 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1420 _mm_mulhi_epi16 (__m128i __A, __m128i __B)
1422 __vector signed int w0, w1;
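/* vec_vmulesh and vec_vmulosh form the full 32-bit products of the
   even and odd halfword pairs; the permute below extracts the high
   16 bits of each product and restores the original element order.  */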
1424 __vector unsigned char xform1 = {
1425 #ifdef __LITTLE_ENDIAN__
1426 0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17,
1427 0x0A, 0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
1428 #elif __BIG_ENDIAN__
1429 0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15,
1430 0x08, 0x09, 0x18, 0x19, 0x0C, 0x0D, 0x1C, 0x1D
1431 #endif
1434 w0 = vec_vmulesh ((__v8hi)__A, (__v8hi)__B);
1435 w1 = vec_vmulosh ((__v8hi)__A, (__v8hi)__B);
1436 return (__m128i) vec_perm (w0, w1, xform1);
1439 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1440 _mm_mullo_epi16 (__m128i __A, __m128i __B)
1442 return (__m128i) ((__v8hi)__A * (__v8hi)__B);
1445 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1446 _mm_mul_su32 (__m64 __A, __m64 __B)
1448 unsigned int a = __A;
1449 unsigned int b = __B;
1451 return ((__m64)a * (__m64)b);
1454 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1455 _mm_mul_epu32 (__m128i __A, __m128i __B)
1457 #if __GNUC__ < 8
1458 __v2du result;
1460 #ifdef __LITTLE_ENDIAN__
1461 /* VMX Vector Multiply Odd Unsigned Word. */
1462 __asm__(
1463 "vmulouw %0,%1,%2"
1464 : "=v" (result)
1465 : "v" (__A), "v" (__B)
1466 : );
1467 #elif __BIG_ENDIAN__
1468 /* VMX Vector Multiply Even Unsigned Word. */
1469 __asm__(
1470 "vmuleuw %0,%1,%2"
1471 : "=v" (result)
1472 : "v" (__A), "v" (__B)
1473 : );
1474 #endif
1475 return (__m128i) result;
1476 #else
1477 #ifdef __LITTLE_ENDIAN__
1478 return (__m128i) vec_mule ((__v4su)__A, (__v4su)__B);
1479 #elif __BIG_ENDIAN__
1480 return (__m128i) vec_mulo ((__v4su)__A, (__v4su)__B);
1481 #endif
1482 #endif
1485 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1486 _mm_slli_epi16 (__m128i __A, int __B)
1488 __v8hu lshift;
1489 __v8hi result = { 0, 0, 0, 0, 0, 0, 0, 0 };
1491 if (__B < 16)
1493 if (__builtin_constant_p(__B))
1494 lshift = (__v8hu) vec_splat_s16(__B);
1495 else
1496 lshift = vec_splats ((unsigned short) __B);
1498 result = vec_vslh ((__v8hi) __A, lshift);
1501 return (__m128i) result;
1504 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1505 _mm_slli_epi32 (__m128i __A, int __B)
1507 __v4su lshift;
1508 __v4si result = { 0, 0, 0, 0 };
1510 if (__B < 32)
1512 if (__builtin_constant_p(__B))
1513 lshift = (__v4su) vec_splat_s32(__B);
1514 else
1515 lshift = vec_splats ((unsigned int) __B);
1517 result = vec_vslw ((__v4si) __A, lshift);
1520 return (__m128i) result;
1523 #ifdef _ARCH_PWR8
1524 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1525 _mm_slli_epi64 (__m128i __A, int __B)
1527 __v2du lshift;
1528 __v2di result = { 0, 0 };
1530 if (__B < 64)
1532 if (__builtin_constant_p(__B))
1534 if (__B < 32)
1535 lshift = (__v2du) vec_splat_s32(__B);
1536 else
1537 lshift = (__v2du) vec_splats((unsigned long long)__B);
1539 else
1540 lshift = (__v2du) vec_splats ((unsigned int) __B);
1542 result = vec_vsld ((__v2di) __A, lshift);
1545 return (__m128i) result;
1547 #endif
1549 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1550 _mm_srai_epi16 (__m128i __A, int __B)
1552 __v8hu rshift = { 15, 15, 15, 15, 15, 15, 15, 15 };
1553 __v8hi result;
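/* Shift counts of 16 or more saturate to 15: an arithmetic shift by
   15 fills each element with copies of its sign bit, which matches
   the Intel behavior for out-of-range counts.  */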
1555 if (__B < 16)
1557 if (__builtin_constant_p(__B))
1558 rshift = (__v8hu) vec_splat_s16(__B);
1559 else
1560 rshift = vec_splats ((unsigned short) __B);
1562 result = vec_vsrah ((__v8hi) __A, rshift);
1564 return (__m128i) result;
1567 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1568 _mm_srai_epi32 (__m128i __A, int __B)
1570 __v4su rshift = { 31, 31, 31, 31 };
1571 __v4si result;
1573 if (__B < 32)
1575 if (__builtin_constant_p(__B))
1577 if (__B < 16)
1578 rshift = (__v4su) vec_splat_s32(__B);
1579 else
1580 rshift = (__v4su) vec_splats((unsigned int)__B);
1582 else
1583 rshift = vec_splats ((unsigned int) __B);
1585 result = vec_vsraw ((__v4si) __A, rshift);
1587 return (__m128i) result;
1590 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1591 _mm_bslli_si128 (__m128i __A, const int __N)
1593 __v16qu result;
1594 const __v16qu zeros = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
1596 if (__N < 16)
1597 result = vec_sld ((__v16qu) __A, zeros, __N);
1598 else
1599 result = zeros;
1601 return (__m128i) result;
1604 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1605 _mm_bsrli_si128 (__m128i __A, const int __N)
1607 __v16qu result;
1608 const __v16qu zeros = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
1610 if (__N < 16)
1611 if (__builtin_constant_p(__N))
/* We would like to use Vector Shift Left Double by Octet
   Immediate here to get the immediate form and avoid loading
   the __N * 8 value into a separate VR.  */
1615 result = vec_sld (zeros, (__v16qu) __A, (16 - __N));
1616 else
1618 __v16qu shift = vec_splats((unsigned char)(__N*8));
1619 result = vec_sro ((__v16qu)__A, shift);
1621 else
1622 result = zeros;
1624 return (__m128i) result;
1627 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1628 _mm_srli_si128 (__m128i __A, const int __N)
1630 return _mm_bsrli_si128 (__A, __N);
1633 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1634 _mm_slli_si128 (__m128i __A, const int _imm5)
1636 __v16qu result;
1637 const __v16qu zeros = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
1639 if (_imm5 < 16)
1640 #ifdef __LITTLE_ENDIAN__
1641 result = vec_sld ((__v16qu) __A, zeros, _imm5);
1642 #elif __BIG_ENDIAN__
1643 result = vec_sld (zeros, (__v16qu) __A, (16 - _imm5));
1644 #endif
1645 else
1646 result = zeros;
1648 return (__m128i) result;
1651 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1653 _mm_srli_epi16 (__m128i __A, int __B)
1655 __v8hu rshift;
1656 __v8hi result = { 0, 0, 0, 0, 0, 0, 0, 0 };
1658 if (__B < 16)
1660 if (__builtin_constant_p(__B))
1661 rshift = (__v8hu) vec_splat_s16(__B);
1662 else
1663 rshift = vec_splats ((unsigned short) __B);
1665 result = vec_vsrh ((__v8hi) __A, rshift);
1668 return (__m128i) result;
1671 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1672 _mm_srli_epi32 (__m128i __A, int __B)
1674 __v4su rshift;
1675 __v4si result = { 0, 0, 0, 0 };
1677 if (__B < 32)
1679 if (__builtin_constant_p(__B))
1681 if (__B < 16)
1682 rshift = (__v4su) vec_splat_s32(__B);
1683 else
1684 rshift = (__v4su) vec_splats((unsigned int)__B);
1686 else
1687 rshift = vec_splats ((unsigned int) __B);
1689 result = vec_vsrw ((__v4si) __A, rshift);
1692 return (__m128i) result;
1695 #ifdef _ARCH_PWR8
1696 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1697 _mm_srli_epi64 (__m128i __A, int __B)
1699 __v2du rshift;
1700 __v2di result = { 0, 0 };
1702 if (__B < 64)
1704 if (__builtin_constant_p(__B))
1706 if (__B < 16)
1707 rshift = (__v2du) vec_splat_s32(__B);
1708 else
1709 rshift = (__v2du) vec_splats((unsigned long long)__B);
1711 else
1712 rshift = (__v2du) vec_splats ((unsigned int) __B);
1714 result = vec_vsrd ((__v2di) __A, rshift);
1717 return (__m128i) result;
1719 #endif
1721 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1722 _mm_sll_epi16 (__m128i __A, __m128i __B)
1724 __v8hu lshift, shmask;
1725 const __v8hu shmax = { 15, 15, 15, 15, 15, 15, 15, 15 };
1726 __v8hu result;
1728 #ifdef __LITTLE_ENDIAN__
1729 lshift = vec_splat ((__v8hu)__B, 0);
1730 #elif __BIG_ENDIAN__
1731 lshift = vec_splat ((__v8hu)__B, 3);
1732 #endif
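/* shmask is all ones in each element whose shift count is valid
   (<= 15) and all zeros otherwise; the vec_sel below therefore keeps
   the shifted value for valid counts and yields 0 when the count
   exceeds the element size, as the Intel intrinsic requires.  The
   other _mm_sll_* and _mm_srl_* variants below use the same masking
   scheme.  */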
1733 shmask = lshift <= shmax;
1734 result = vec_vslh ((__v8hu) __A, lshift);
1735 result = vec_sel (shmask, result, shmask);
1737 return (__m128i) result;
1740 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1741 _mm_sll_epi32 (__m128i __A, __m128i __B)
1743 __v4su lshift, shmask;
1744 const __v4su shmax = { 32, 32, 32, 32 };
1745 __v4su result;
1746 #ifdef __LITTLE_ENDIAN__
1747 lshift = vec_splat ((__v4su)__B, 0);
1748 #elif __BIG_ENDIAN__
1749 lshift = vec_splat ((__v4su)__B, 1);
1750 #endif
1751 shmask = lshift < shmax;
1752 result = vec_vslw ((__v4su) __A, lshift);
1753 result = vec_sel (shmask, result, shmask);
1755 return (__m128i) result;
1758 #ifdef _ARCH_PWR8
1759 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1760 _mm_sll_epi64 (__m128i __A, __m128i __B)
1762 __v2du lshift, shmask;
1763 const __v2du shmax = { 64, 64 };
1764 __v2du result;
1766 lshift = (__v2du) vec_splat ((__v2du)__B, 0);
1767 shmask = lshift < shmax;
1768 result = vec_vsld ((__v2du) __A, lshift);
1769 result = (__v2du) vec_sel ((__v2df) shmask, (__v2df) result,
1770 (__v2df) shmask);
1772 return (__m128i) result;
1774 #endif
1776 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1777 _mm_sra_epi16 (__m128i __A, __m128i __B)
1779 const __v8hu rshmax = { 15, 15, 15, 15, 15, 15, 15, 15 };
1780 __v8hu rshift;
1781 __v8hi result;
1783 #ifdef __LITTLE_ENDIAN__
1784 rshift = vec_splat ((__v8hu)__B, 0);
1785 #elif __BIG_ENDIAN__
1786 rshift = vec_splat ((__v8hu)__B, 3);
1787 #endif
1788 rshift = vec_min (rshift, rshmax);
1789 result = vec_vsrah ((__v8hi) __A, rshift);
1791 return (__m128i) result;
1794 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1795 _mm_sra_epi32 (__m128i __A, __m128i __B)
1797 const __v4su rshmax = { 31, 31, 31, 31 };
1798 __v4su rshift;
1799 __v4si result;
1801 #ifdef __LITTLE_ENDIAN__
1802 rshift = vec_splat ((__v4su)__B, 0);
1803 #elif __BIG_ENDIAN__
1804 rshift = vec_splat ((__v4su)__B, 1);
1805 #endif
1806 rshift = vec_min (rshift, rshmax);
1807 result = vec_vsraw ((__v4si) __A, rshift);
1809 return (__m128i) result;
1812 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1813 _mm_srl_epi16 (__m128i __A, __m128i __B)
1815 __v8hu rshift, shmask;
1816 const __v8hu shmax = { 15, 15, 15, 15, 15, 15, 15, 15 };
1817 __v8hu result;
1819 #ifdef __LITTLE_ENDIAN__
1820 rshift = vec_splat ((__v8hu)__B, 0);
1821 #elif __BIG_ENDIAN__
1822 rshift = vec_splat ((__v8hu)__B, 3);
1823 #endif
1824 shmask = rshift <= shmax;
1825 result = vec_vsrh ((__v8hu) __A, rshift);
1826 result = vec_sel (shmask, result, shmask);
1828 return (__m128i) result;
1831 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1832 _mm_srl_epi32 (__m128i __A, __m128i __B)
1834 __v4su rshift, shmask;
1835 const __v4su shmax = { 32, 32, 32, 32 };
1836 __v4su result;
1838 #ifdef __LITTLE_ENDIAN__
1839 rshift = vec_splat ((__v4su)__B, 0);
1840 #elif __BIG_ENDIAN__
1841 rshift = vec_splat ((__v4su)__B, 1);
1842 #endif
1843 shmask = rshift < shmax;
1844 result = vec_vsrw ((__v4su) __A, rshift);
1845 result = vec_sel (shmask, result, shmask);
1847 return (__m128i) result;
#ifdef _ARCH_PWR8
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srl_epi64 (__m128i __A, __m128i __B)
{
  __v2du rshift, shmask;
  const __v2du shmax = { 64, 64 };
  __v2du result;

  rshift = (__v2du) vec_splat ((__v2du)__B, 0);
  shmask = rshift < shmax;
  result = vec_vsrd ((__v2du) __A, rshift);
  result = (__v2du) vec_sel ((__v2du) shmask, (__v2du) result,
			     (__v2du) shmask);

  return (__m128i) result;
}
#endif

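/* The doubleword shifts above use vec_vsld/vec_vsrd, which map to the
   vector doubleword shift instructions added with PowerISA 2.07, hence the
   _ARCH_PWR8 guard around them.  */
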
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_and_pd (__m128d __A, __m128d __B)
{
  return (vec_and ((__v2df) __A, (__v2df) __B));
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_andnot_pd (__m128d __A, __m128d __B)
{
  return (vec_andc ((__v2df) __B, (__v2df) __A));
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_or_pd (__m128d __A, __m128d __B)
{
  return (vec_or ((__v2df) __A, (__v2df) __B));
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_xor_pd (__m128d __A, __m128d __B)
{
  return (vec_xor ((__v2df) __A, (__v2df) __B));
}

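/* The *_pd logical operations above are plain bitwise operations on the
   128-bit value; in particular _mm_andnot_pd computes (~__A) & __B
   (vec_andc with the operands swapped), matching the x86 operand order.  */
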
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_epi8 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_cmpeq ((__v16qi) __A, (__v16qi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_cmpeq ((__v8hi) __A, (__v8hi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_cmpeq ((__v4si) __A, (__v4si)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmplt_epi8 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_cmplt ((__v16qi) __A, (__v16qi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmplt_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_cmplt ((__v8hi) __A, (__v8hi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmplt_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_cmplt ((__v4si) __A, (__v4si)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_epi8 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_cmpgt ((__v16qi) __A, (__v16qi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_cmpgt ((__v8hi) __A, (__v8hi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_cmpgt ((__v4si) __A, (__v4si)__B);
}

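/* Each of the integer compares above returns a per-element mask of all
   ones where the relation holds and all zeros where it does not, the same
   encoding SSE2 produces.  Note that _mm_cmplt_* (A, B) is equivalent to
   _mm_cmpgt_* (B, A).  */
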
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_extract_epi16 (__m128i const __A, int const __N)
{
  return (unsigned short) ((__v8hi)__A)[__N & 7];
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_insert_epi16 (__m128i const __A, int const __D, int const __N)
{
  __v8hi result = (__v8hi)__A;

  result [(__N & 7)] = __D;

  return (__m128i) result;
}

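/* In _mm_extract_epi16 and _mm_insert_epi16 the selector __N is reduced
   modulo 8 (__N & 7), so out-of-range selectors wrap around rather than
   being rejected, and the extracted halfword is zero-extended into the int
   result; e.g. _mm_extract_epi16 (__A, 9) reads lane 1.  */
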
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_max ((__v8hi)__A, (__v8hi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_epu8 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_max ((__v16qu) __A, (__v16qu)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_min ((__v8hi) __A, (__v8hi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_epu8 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_min ((__v16qu) __A, (__v16qu)__B);
}

#ifdef _ARCH_PWR8
/* Intrinsic functions that require PowerISA 2.07 minimum.  */

/* Creates a 16-bit mask, one bit per byte, from the most significant bit
   of each of the 16 8-bit integers in A, zero-extended into the int
   result.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movemask_epi8 (__m128i __A)
{
  __vector __m64 result;
  static const __vector unsigned char perm_mask =
    {
#ifdef __LITTLE_ENDIAN__
      0x78, 0x70, 0x68, 0x60, 0x58, 0x50, 0x48, 0x40,
      0x38, 0x30, 0x28, 0x20, 0x18, 0x10, 0x08, 0x00
#elif __BIG_ENDIAN__
      0x00, 0x08, 0x10, 0x18, 0x20, 0x28, 0x30, 0x38,
      0x40, 0x48, 0x50, 0x58, 0x60, 0x68, 0x70, 0x78
#endif
    };

  result = (__vector __m64) vec_vbpermq ((__vector unsigned char) __A,
					 (__vector unsigned char) perm_mask);

#ifdef __LITTLE_ENDIAN__
  return result[1];
#elif __BIG_ENDIAN__
  return result[0];
#endif
}
#endif /* _ARCH_PWR8 */

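/* In _mm_movemask_epi8 above, each byte of perm_mask is a bit index
   (0-127) into __A; vec_vbpermq gathers the addressed bits, here the most
   significant bit of each byte, into a 16-bit field in one doubleword of
   the result.  The two index orders give the x86 bit order for little- and
   big-endian, and the final subscript selects the doubleword that holds
   the gathered bits.  */
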
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhi_epu16 (__m128i __A, __m128i __B)
{
  __v4su w0, w1;
  __v16qu xform1 = {
#ifdef __LITTLE_ENDIAN__
      0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17,
      0x0A, 0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
#elif __BIG_ENDIAN__
      0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15,
      0x08, 0x09, 0x18, 0x19, 0x0C, 0x0D, 0x1C, 0x1D
#endif
    };

  w0 = vec_vmuleuh ((__v8hu)__A, (__v8hu)__B);
  w1 = vec_vmulouh ((__v8hu)__A, (__v8hu)__B);
  return (__m128i) vec_perm (w0, w1, xform1);
}

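/* In _mm_mulhi_epu16 above, vec_vmuleuh/vec_vmulouh form the full 32-bit
   products of the even and odd halfword pairs; the xform1 permute control
   then picks the upper 16 bits of each product and interleaves them back
   into the original element order.  */
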
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shufflehi_epi16 (__m128i __A, const int __mask)
{
  unsigned long element_selector_98 = __mask & 0x03;
  unsigned long element_selector_BA = (__mask >> 2) & 0x03;
  unsigned long element_selector_DC = (__mask >> 4) & 0x03;
  unsigned long element_selector_FE = (__mask >> 6) & 0x03;
  static const unsigned short permute_selectors[4] =
    {
#ifdef __LITTLE_ENDIAN__
      0x0908, 0x0B0A, 0x0D0C, 0x0F0E
#elif __BIG_ENDIAN__
      0x0607, 0x0405, 0x0203, 0x0001
#endif
    };
  __v2du pmask =
#ifdef __LITTLE_ENDIAN__
      { 0x1716151413121110UL, 0x1f1e1d1c1b1a1918UL};
#elif __BIG_ENDIAN__
      { 0x1011121314151617UL, 0x18191a1b1c1d1e1fUL};
#endif
  __m64_union t;
  __v2du a, r;

#ifdef __LITTLE_ENDIAN__
  t.as_short[0] = permute_selectors[element_selector_98];
  t.as_short[1] = permute_selectors[element_selector_BA];
  t.as_short[2] = permute_selectors[element_selector_DC];
  t.as_short[3] = permute_selectors[element_selector_FE];
#elif __BIG_ENDIAN__
  t.as_short[3] = permute_selectors[element_selector_98];
  t.as_short[2] = permute_selectors[element_selector_BA];
  t.as_short[1] = permute_selectors[element_selector_DC];
  t.as_short[0] = permute_selectors[element_selector_FE];
#endif
#ifdef __LITTLE_ENDIAN__
  pmask[1] = t.as_m64;
#elif __BIG_ENDIAN__
  pmask[0] = t.as_m64;
#endif
  a = (__v2du)__A;
  r = vec_perm (a, a, (__vector unsigned char)pmask);
  return (__m128i) r;
}

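/* _mm_shufflehi_epi16 above builds a vec_perm control: pmask starts out
   as an identity byte mapping of __A, then the doubleword covering the
   upper four halfwords is rebuilt from permute_selectors, indexed by the
   four 2-bit fields of __mask, so each of those result halfwords is one of
   elements 4-7 of __A while the low four halfwords pass through
   unchanged.  */
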
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shufflelo_epi16 (__m128i __A, const int __mask)
{
  unsigned long element_selector_10 = __mask & 0x03;
  unsigned long element_selector_32 = (__mask >> 2) & 0x03;
  unsigned long element_selector_54 = (__mask >> 4) & 0x03;
  unsigned long element_selector_76 = (__mask >> 6) & 0x03;
  static const unsigned short permute_selectors[4] =
    {
#ifdef __LITTLE_ENDIAN__
      0x0100, 0x0302, 0x0504, 0x0706
#elif __BIG_ENDIAN__
      0x0e0f, 0x0c0d, 0x0a0b, 0x0809
#endif
    };
  __v2du pmask = { 0x1011121314151617UL, 0x1f1e1d1c1b1a1918UL};
  __m64_union t;
  __v2du a, r;

#ifdef __LITTLE_ENDIAN__
  t.as_short[0] = permute_selectors[element_selector_10];
  t.as_short[1] = permute_selectors[element_selector_32];
  t.as_short[2] = permute_selectors[element_selector_54];
  t.as_short[3] = permute_selectors[element_selector_76];
#elif __BIG_ENDIAN__
  t.as_short[3] = permute_selectors[element_selector_10];
  t.as_short[2] = permute_selectors[element_selector_32];
  t.as_short[1] = permute_selectors[element_selector_54];
  t.as_short[0] = permute_selectors[element_selector_76];
#endif
#ifdef __LITTLE_ENDIAN__
  pmask[0] = t.as_m64;
#elif __BIG_ENDIAN__
  pmask[1] = t.as_m64;
#endif
  a = (__v2du)__A;
  r = vec_perm (a, a, (__vector unsigned char)pmask);
  return (__m128i) r;
}

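/* _mm_shufflelo_epi16 above is the mirror image: only the doubleword
   holding the low four halfwords is rebuilt from the __mask fields, while
   the other doubleword keeps identity selectors so elements 4-7 pass
   through unchanged.  */
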
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_epi32 (__m128i __A, const int __mask)
{
  unsigned long element_selector_10 = __mask & 0x03;
  unsigned long element_selector_32 = (__mask >> 2) & 0x03;
  unsigned long element_selector_54 = (__mask >> 4) & 0x03;
  unsigned long element_selector_76 = (__mask >> 6) & 0x03;
  static const unsigned int permute_selectors[4] =
    {
#ifdef __LITTLE_ENDIAN__
      0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
#elif __BIG_ENDIAN__
      0x0C0D0E0F, 0x08090A0B, 0x04050607, 0x00010203
#endif
    };
  __v4su t;

#ifdef __LITTLE_ENDIAN__
  t[0] = permute_selectors[element_selector_10];
  t[1] = permute_selectors[element_selector_32];
  t[2] = permute_selectors[element_selector_54] + 0x10101010;
  t[3] = permute_selectors[element_selector_76] + 0x10101010;
#elif __BIG_ENDIAN__
  t[3] = permute_selectors[element_selector_10] + 0x10101010;
  t[2] = permute_selectors[element_selector_32] + 0x10101010;
  t[1] = permute_selectors[element_selector_54];
  t[0] = permute_selectors[element_selector_76];
#endif
  return (__m128i) vec_perm ((__v4si) __A, (__v4si) __A,
			     (__vector unsigned char) t);
}

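/* For _mm_shuffle_epi32 above, each 2-bit field of __mask selects one
   32-bit source element and the table supplies the corresponding 4-byte
   vec_perm selector pattern.  Both vec_perm operands are __A, so the
   + 0x10101010 adjustment on two of the words merely points those
   selectors at the second (identical) copy of the source.  */
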
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskmoveu_si128 (__m128i __A, __m128i __B, char *__C)
{
  __v2du hibit = { 0x7f7f7f7f7f7f7f7fUL, 0x7f7f7f7f7f7f7f7fUL};
  __v16qu mask, tmp;
  __m128i *p = (__m128i*)__C;

  tmp = (__v16qu)_mm_loadu_si128(p);
  mask = (__v16qu)vec_cmpgt ((__v16qu)__B, (__v16qu)hibit);
  tmp = vec_sel (tmp, (__v16qu)__A, mask);
  _mm_storeu_si128 (p, (__m128i)tmp);
}

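/* _mm_maskmoveu_si128 above is a read-modify-write of all 16 bytes at
   __C: bytes whose mask byte in __B has its most significant bit set
   (detected with an unsigned compare against 0x7f) are replaced by the
   corresponding bytes of __A, the rest are stored back unchanged.  Unlike
   the x86 instruction, unselected bytes are still rewritten and no
   non-temporal hint is applied.  */
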
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_avg_epu8 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_avg ((__v16qu)__A, (__v16qu)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_avg_epu16 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_avg ((__v8hu)__A, (__v8hu)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sad_epu8 (__m128i __A, __m128i __B)
{
  __v16qu a, b;
  __v16qu vmin, vmax, vabsdiff;
  __v4si vsum;
  const __v4su zero = { 0, 0, 0, 0 };
  __v4si result;

  a = (__v16qu) __A;
  b = (__v16qu) __B;
  vmin = vec_min (a, b);
  vmax = vec_max (a, b);
  vabsdiff = vec_sub (vmax, vmin);
  /* Sum four groups of bytes into integers.  */
  vsum = (__vector signed int) vec_sum4s (vabsdiff, zero);
  /* Sum across four integers with two integer results.  */
  result = vec_sum2s (vsum, (__vector signed int) zero);
  /* Rotate the sums into the correct position.  */
#ifdef __LITTLE_ENDIAN__
  result = vec_sld (result, result, 4);
#elif __BIG_ENDIAN__
  result = vec_sld (result, result, 6);
#endif
  return (__m128i) result;
}

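/* _mm_sad_epu8 above computes the byte-wise absolute differences as
   vec_max - vec_min, sums them in groups with vec_sum4s/vec_sum2s, and
   rotates the two 32-bit totals so that each 64-bit half of the result
   carries the sum for its eight bytes in the low bits (a sum of eight
   byte differences fits in 16 bits).  */
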
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_stream_si32 (int *__A, int __B)
{
  /* Use the data cache block touch for store transient.  */
  __asm__ (
    "dcbtstt 0,%0"
    :
    : "b" (__A)
    : "memory"
    );
  *__A = __B;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_stream_si64 (long long int *__A, long long int __B)
{
  /* Use the data cache block touch for store transient.  */
  __asm__ (
    "dcbtstt 0,%0"
    :
    : "b" (__A)
    : "memory"
    );
  *__A = __B;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_stream_si128 (__m128i *__A, __m128i __B)
{
  /* Use the data cache block touch for store transient.  */
  __asm__ (
    "dcbtstt 0,%0"
    :
    : "b" (__A)
    : "memory"
    );
  *__A = __B;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_stream_pd (double *__A, __m128d __B)
{
  /* Use the data cache block touch for store transient.  */
  __asm__ (
    "dcbtstt 0,%0"
    :
    : "b" (__A)
    : "memory"
    );
  *(__m128d*)__A = __B;
}

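/* The _mm_stream_* functions above only approximate x86 non-temporal
   stores: dcbtstt is a hint that the target cache block is transient,
   and the assignment that follows is an ordinary cacheable store, so no
   write-combining or cache-bypass behavior should be assumed.  */
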
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_clflush (void const *__A)
{
  /* Use the data cache block flush.  */
  __asm__ (
    "dcbf 0,%0"
    :
    : "b" (__A)
    : "memory"
    );
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_lfence (void)
{
  /* Use light weight sync for load to load ordering.  */
  __atomic_thread_fence (__ATOMIC_RELEASE);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mfence (void)
{
  /* Use heavy weight sync for any to any ordering.  */
  __atomic_thread_fence (__ATOMIC_SEQ_CST);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi32_si128 (int __A)
{
  return _mm_set_epi32 (0, 0, 0, __A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64_si128 (long long __A)
{
  return __extension__ (__m128i)(__v2di){ __A, 0LL };
}

/* Microsoft intrinsic.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64x_si128 (long long __A)
{
  return __extension__ (__m128i)(__v2di){ __A, 0LL };
}

/* Casts between various SP, DP, INT vector types.  Note that these do no
   conversion of values, they just change the type.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_castpd_ps(__m128d __A)
{
  return (__m128) __A;
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_castpd_si128(__m128d __A)
{
  return (__m128i) __A;
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_castps_pd(__m128 __A)
{
  return (__m128d) __A;
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_castps_si128(__m128 __A)
{
  return (__m128i) __A;
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_castsi128_ps(__m128i __A)
{
  return (__m128) __A;
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_castsi128_pd(__m128i __A)
{
  return (__m128d) __A;
}

#endif /* EMMINTRIN_H_ */