1 /* Copyright (C) 2003-2018 Free Software Foundation, Inc.
3 This file is part of GCC.
5 GCC is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 3, or (at your option)
8 any later version.
10 GCC is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
15 Under Section 7 of GPL version 3, you are granted additional
16 permissions described in the GCC Runtime Library Exception, version
17 3.1, as published by the Free Software Foundation.
19 You should have received a copy of the GNU General Public License and
20 a copy of the GCC Runtime Library Exception along with this program;
21 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
22 <http://www.gnu.org/licenses/>. */
24 /* Implemented from the specification included in the Intel C++ Compiler
25 User Guide and Reference, version 9.0. */
27 #ifndef NO_WARN_X86_INTRINSICS
28 /* This header is distributed to simplify porting x86_64 code that
29 makes explicit use of Intel intrinsics to powerpc64le.
30 It is the user's responsibility to determine if the results are
31 acceptable and make additional changes as necessary.
32 Note that much code that uses Intel intrinsics can be rewritten in
33 standard C or GNU C extensions, which are more portable and better
34 optimized across multiple targets.
36 In the specific case of X86 SSE2 (__m128i, __m128d) intrinsics,
37 the PowerPC VMX/VSX ISA is a good match for vector double SIMD
38 operations. However, scalar double operations in vector (XMM)
39 registers require the POWER8 VSX ISA (2.07) level. There are also
40 important differences in the data format and placement of double
41 scalars in the vector register.
43 For PowerISA, scalar double is held in FPRs (the leftmost 64 bits of
44 the low 32 VSRs), while X86_64 SSE2 uses the rightmost 64 bits of
45 the XMM register. These differences require extra steps on POWER to
46 match the SSE2 scalar double semantics.
48 Most SSE2 scalar double intrinsic operations can be performed more
49 efficiently as C language double scalar operations or optimized to
50 use vector SIMD operations. We recommend this for new applications.
52 Another difference is the format and details of the X86_64 MXCSR vs
53 the PowerISA FPSCR / VSCR registers. We recommend that applications
54 replace direct access to the MXCSR with the more portable <fenv.h>
55 POSIX APIs (see the illustrative sketch below). */
56 #error "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error."
57 #endif
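/* Illustrative sketch only (not part of this header): the porting
   recommendations above, shown side by side.  The helper names below
   are hypothetical, and the <fenv.h> fragment assumes the standard C99
   rounding-mode functions; adapt it to your own rounding and exception
   requirements.

     #include <emmintrin.h>
     #include <fenv.h>

     double scale_sse2 (double x, double y)
     {
       // SSE2 style: scalar double math carried in XMM vectors.
       __m128d a = _mm_set_sd (x);
       __m128d b = _mm_set_sd (y);
       return _mm_cvtsd_f64 (_mm_mul_sd (a, b));
     }

     double scale_portable (double x, double y)
     {
       // Recommended style: plain C scalar math; rounding-mode control
       // goes through <fenv.h> rather than poking the MXCSR directly.
       int save = fegetround ();
       fesetround (FE_TONEAREST);
       double r = x * y;
       fesetround (save);
       return r;
     }
 */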
59 #ifndef EMMINTRIN_H_
60 #define EMMINTRIN_H_
62 #include <altivec.h>
63 #include <assert.h>
65 /* We need definitions from the SSE header files. */
66 #include <xmmintrin.h>
68 /* SSE2 */
69 typedef __vector double __v2df;
70 typedef __vector long long __v2di;
71 typedef __vector unsigned long long __v2du;
72 typedef __vector int __v4si;
73 typedef __vector unsigned int __v4su;
74 typedef __vector short __v8hi;
75 typedef __vector unsigned short __v8hu;
76 typedef __vector signed char __v16qi;
77 typedef __vector unsigned char __v16qu;
79 /* The Intel API is flexible enough that we must allow aliasing with other
80 vector types, and their scalar components. */
81 typedef long long __m128i __attribute__ ((__vector_size__ (16), __may_alias__));
82 typedef double __m128d __attribute__ ((__vector_size__ (16), __may_alias__));
84 /* Unaligned version of the same types. */
85 typedef long long __m128i_u __attribute__ ((__vector_size__ (16), __may_alias__, __aligned__ (1)));
86 typedef double __m128d_u __attribute__ ((__vector_size__ (16), __may_alias__, __aligned__ (1)));
88 /* Create a vector with element 0 as F and the rest zero. */
89 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
90 _mm_set_sd (double __F)
92 return __extension__ (__m128d){ __F, 0.0 };
95 /* Create a vector with both elements equal to F. */
96 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
97 _mm_set1_pd (double __F)
99 return __extension__ (__m128d){ __F, __F };
102 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
103 _mm_set_pd1 (double __F)
105 return _mm_set1_pd (__F);
108 /* Create a vector with the lower value X and upper value W. */
109 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
110 _mm_set_pd (double __W, double __X)
112 return __extension__ (__m128d){ __X, __W };
115 /* Create a vector with the lower value W and upper value X. */
116 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
117 _mm_setr_pd (double __W, double __X)
119 return __extension__ (__m128d){ __W, __X };
122 /* Create an undefined vector. */
123 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
124 _mm_undefined_pd (void)
126 __m128d __Y = __Y;
127 return __Y;
130 /* Create a vector of zeros. */
131 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
132 _mm_setzero_pd (void)
134 return (__m128d) vec_splats (0);
137 /* Sets the low DPFP value of A from the low value of B. */
138 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
139 _mm_move_sd (__m128d __A, __m128d __B)
141 __v2df result = (__v2df) __A;
142 result [0] = ((__v2df) __B)[0];
143 return (__m128d) result;
146 /* Load two DPFP values from P. The address must be 16-byte aligned. */
147 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
148 _mm_load_pd (double const *__P)
150 assert(((unsigned long)__P & 0xfUL) == 0UL);
151 return ((__m128d)vec_ld(0, (__v16qu*)__P));
154 /* Load two DPFP values from P. The address need not be 16-byte aligned. */
155 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
156 _mm_loadu_pd (double const *__P)
158 return (vec_vsx_ld(0, __P));
161 /* Create a vector with both elements equal to *P. */
162 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
163 _mm_load1_pd (double const *__P)
165 return (vec_splats (*__P));
168 /* Create a vector with element 0 as *P and the rest zero. */
169 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
170 _mm_load_sd (double const *__P)
172 return _mm_set_sd (*__P);
175 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
176 _mm_load_pd1 (double const *__P)
178 return _mm_load1_pd (__P);
181 /* Load two DPFP values in reverse order. The address must be aligned. */
182 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
183 _mm_loadr_pd (double const *__P)
185 __v2df __tmp = _mm_load_pd (__P);
186 return (__m128d)vec_xxpermdi (__tmp, __tmp, 2);
189 /* Store two DPFP values. The address must be 16-byte aligned. */
190 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
191 _mm_store_pd (double *__P, __m128d __A)
193 assert(((unsigned long)__P & 0xfUL) == 0UL);
194 vec_st((__v16qu)__A, 0, (__v16qu*)__P);
197 /* Store two DPFP values. The address need not be 16-byte aligned. */
198 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
199 _mm_storeu_pd (double *__P, __m128d __A)
201 *(__m128d_u *)__P = __A;
204 /* Stores the lower DPFP value. */
205 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
206 _mm_store_sd (double *__P, __m128d __A)
208 *__P = ((__v2df)__A)[0];
211 extern __inline double __attribute__((__gnu_inline__, __always_inline__, __artificial__))
212 _mm_cvtsd_f64 (__m128d __A)
214 return ((__v2df)__A)[0];
217 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
218 _mm_storel_pd (double *__P, __m128d __A)
220 _mm_store_sd (__P, __A);
223 /* Stores the upper DPFP value. */
224 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
225 _mm_storeh_pd (double *__P, __m128d __A)
227 *__P = ((__v2df)__A)[1];
229 /* Store the lower DPFP value across two words.
230 The address must be 16-byte aligned. */
231 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
232 _mm_store1_pd (double *__P, __m128d __A)
234 _mm_store_pd (__P, vec_splat (__A, 0));
237 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
238 _mm_store_pd1 (double *__P, __m128d __A)
240 _mm_store1_pd (__P, __A);
243 /* Store two DPFP values in reverse order. The address must be aligned. */
244 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
245 _mm_storer_pd (double *__P, __m128d __A)
247 _mm_store_pd (__P, vec_xxpermdi (__A, __A, 2));
250 /* Intel intrinsic. */
251 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
252 _mm_cvtsi128_si64 (__m128i __A)
254 return ((__v2di)__A)[0];
257 /* Microsoft intrinsic. */
258 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
259 _mm_cvtsi128_si64x (__m128i __A)
261 return ((__v2di)__A)[0];
264 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
265 _mm_add_pd (__m128d __A, __m128d __B)
267 return (__m128d) ((__v2df)__A + (__v2df)__B);
270 /* Add the lower double-precision (64-bit) floating-point element in
271 a and b, store the result in the lower element of dst, and copy
272 the upper element from a to the upper element of dst. */
273 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
274 _mm_add_sd (__m128d __A, __m128d __B)
276 __A[0] = __A[0] + __B[0];
277 return (__A);
280 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
281 _mm_sub_pd (__m128d __A, __m128d __B)
283 return (__m128d) ((__v2df)__A - (__v2df)__B);
286 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
287 _mm_sub_sd (__m128d __A, __m128d __B)
289 __A[0] = __A[0] - __B[0];
290 return (__A);
293 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
294 _mm_mul_pd (__m128d __A, __m128d __B)
296 return (__m128d) ((__v2df)__A * (__v2df)__B);
299 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
300 _mm_mul_sd (__m128d __A, __m128d __B)
302 __A[0] = __A[0] * __B[0];
303 return (__A);
306 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
307 _mm_div_pd (__m128d __A, __m128d __B)
309 return (__m128d) ((__v2df)__A / (__v2df)__B);
312 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
313 _mm_div_sd (__m128d __A, __m128d __B)
315 __A[0] = __A[0] / __B[0];
316 return (__A);
319 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
320 _mm_sqrt_pd (__m128d __A)
322 return (vec_sqrt (__A));
325 /* Return pair {sqrt (B[0]), A[1]}. */
326 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
327 _mm_sqrt_sd (__m128d __A, __m128d __B)
329 __v2df c;
330 c = vec_sqrt ((__v2df) _mm_set1_pd (__B[0]));
331 return (__m128d) _mm_setr_pd (c[0], __A[1]);
334 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
335 _mm_min_pd (__m128d __A, __m128d __B)
337 return (vec_min (__A, __B));
340 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
341 _mm_min_sd (__m128d __A, __m128d __B)
343 __v2df a, b, c;
344 a = vec_splats (__A[0]);
345 b = vec_splats (__B[0]);
346 c = vec_min (a, b);
347 return (__m128d) _mm_setr_pd (c[0], __A[1]);
350 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
351 _mm_max_pd (__m128d __A, __m128d __B)
353 return (vec_max (__A, __B));
356 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
357 _mm_max_sd (__m128d __A, __m128d __B)
359 __v2df a, b, c;
360 a = vec_splats (__A[0]);
361 b = vec_splats (__B[0]);
362 c = vec_max (a, b);
363 return (__m128d) _mm_setr_pd (c[0], __A[1]);
366 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
367 _mm_cmpeq_pd (__m128d __A, __m128d __B)
369 return ((__m128d)vec_cmpeq ((__v2df) __A, (__v2df) __B));
372 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
373 _mm_cmplt_pd (__m128d __A, __m128d __B)
375 return ((__m128d)vec_cmplt ((__v2df) __A, (__v2df) __B));
378 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
379 _mm_cmple_pd (__m128d __A, __m128d __B)
381 return ((__m128d)vec_cmple ((__v2df) __A, (__v2df) __B));
384 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
385 _mm_cmpgt_pd (__m128d __A, __m128d __B)
387 return ((__m128d)vec_cmpgt ((__v2df) __A, (__v2df) __B));
390 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
391 _mm_cmpge_pd (__m128d __A, __m128d __B)
393 return ((__m128d)vec_cmpge ((__v2df) __A,(__v2df) __B));
396 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
397 _mm_cmpneq_pd (__m128d __A, __m128d __B)
399 __v2df temp = (__v2df) vec_cmpeq ((__v2df) __A, (__v2df)__B);
400 return ((__m128d)vec_nor (temp, temp));
403 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
404 _mm_cmpnlt_pd (__m128d __A, __m128d __B)
406 return ((__m128d)vec_cmpge ((__v2df) __A, (__v2df) __B));
409 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
410 _mm_cmpnle_pd (__m128d __A, __m128d __B)
412 return ((__m128d)vec_cmpgt ((__v2df) __A, (__v2df) __B));
415 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
416 _mm_cmpngt_pd (__m128d __A, __m128d __B)
418 return ((__m128d)vec_cmple ((__v2df) __A, (__v2df) __B));
421 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
422 _mm_cmpnge_pd (__m128d __A, __m128d __B)
424 return ((__m128d)vec_cmplt ((__v2df) __A, (__v2df) __B));
427 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
428 _mm_cmpord_pd (__m128d __A, __m128d __B)
430 #if _ARCH_PWR8
431 __v2du c, d;
432 /* Compare against self will return false (0's) if NAN. */
433 c = (__v2du)vec_cmpeq (__A, __A);
434 d = (__v2du)vec_cmpeq (__B, __B);
435 #else
436 __v2du a, b;
437 __v2du c, d;
438 const __v2du double_exp_mask = {0x7ff0000000000000, 0x7ff0000000000000};
439 a = (__v2du)vec_abs ((__v2df)__A);
440 b = (__v2du)vec_abs ((__v2df)__B);
441 c = (__v2du)vec_cmpgt (double_exp_mask, a);
442 d = (__v2du)vec_cmpgt (double_exp_mask, b);
443 #endif
444 /* A != NAN and B != NAN. */
445 return ((__m128d)vec_and(c, d));
448 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
449 _mm_cmpunord_pd (__m128d __A, __m128d __B)
451 #if _ARCH_PWR8
452 __v2du c, d;
453 /* Compare against self will return false (0's) if NAN. */
454 c = (__v2du)vec_cmpeq ((__v2df)__A, (__v2df)__A);
455 d = (__v2du)vec_cmpeq ((__v2df)__B, (__v2df)__B);
456 /* A == NAN OR B == NAN converts to:
457 NOT(A != NAN) OR NOT(B != NAN). */
458 c = vec_nor (c, c);
459 return ((__m128d)vec_orc(c, d));
460 #else
461 __v2du c, d;
462 /* Compare against self will return false (0's) if NAN. */
463 c = (__v2du)vec_cmpeq ((__v2df)__A, (__v2df)__A);
464 d = (__v2du)vec_cmpeq ((__v2df)__B, (__v2df)__B);
465 /* Invert so that true ('1's) indicates NaN. */
466 c = vec_nor (c, c);
467 d = vec_nor (d, d);
468 return ((__m128d)vec_or(c, d));
469 #endif
472 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
473 _mm_cmpeq_sd(__m128d __A, __m128d __B)
475 __v2df a, b, c;
476 /* PowerISA VSX does not allow partial (for just lower double)
477 results. So to ensure we don't generate spurious exceptions
478 (from the upper double values) we splat the lower double
479 before we do the operation. */
480 a = vec_splats (__A[0]);
481 b = vec_splats (__B[0]);
482 c = (__v2df) vec_cmpeq(a, b);
483 /* Then we merge the lower double result with the original upper
484 double from __A. */
485 return (__m128d) _mm_setr_pd (c[0], __A[1]);
488 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
489 _mm_cmplt_sd (__m128d __A, __m128d __B)
491 __v2df a, b, c;
492 a = vec_splats (__A[0]);
493 b = vec_splats (__B[0]);
494 c = (__v2df) vec_cmplt(a, b);
495 return (__m128d) _mm_setr_pd (c[0], __A[1]);
498 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
499 _mm_cmple_sd (__m128d __A, __m128d __B)
501 __v2df a, b, c;
502 a = vec_splats (__A[0]);
503 b = vec_splats (__B[0]);
504 c = (__v2df) vec_cmple(a, b);
505 return (__m128d) _mm_setr_pd (c[0], __A[1]);
508 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
509 _mm_cmpgt_sd (__m128d __A, __m128d __B)
511 __v2df a, b, c;
512 a = vec_splats (__A[0]);
513 b = vec_splats (__B[0]);
514 c = (__v2df) vec_cmpgt(a, b);
515 return (__m128d) _mm_setr_pd (c[0], __A[1]);
518 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
519 _mm_cmpge_sd (__m128d __A, __m128d __B)
521 __v2df a, b, c;
522 a = vec_splats (__A[0]);
523 b = vec_splats (__B[0]);
524 c = (__v2df) vec_cmpge(a, b);
525 return (__m128d) _mm_setr_pd (c[0], __A[1]);
528 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
529 _mm_cmpneq_sd (__m128d __A, __m128d __B)
531 __v2df a, b, c;
532 a = vec_splats (__A[0]);
533 b = vec_splats (__B[0]);
534 c = (__v2df) vec_cmpeq(a, b);
535 c = vec_nor (c, c);
536 return (__m128d) _mm_setr_pd (c[0], __A[1]);
539 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
540 _mm_cmpnlt_sd (__m128d __A, __m128d __B)
542 __v2df a, b, c;
543 a = vec_splats (__A[0]);
544 b = vec_splats (__B[0]);
545 /* Not less than is just greater than or equal. */
546 c = (__v2df) vec_cmpge(a, b);
547 return (__m128d) _mm_setr_pd (c[0], __A[1]);
550 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
551 _mm_cmpnle_sd (__m128d __A, __m128d __B)
553 __v2df a, b, c;
554 a = vec_splats (__A[0]);
555 b = vec_splats (__B[0]);
556 /* Not less than or equal is just greater than. */
557 c = (__v2df) vec_cmpgt(a, b);
558 return (__m128d) _mm_setr_pd (c[0], __A[1]);
561 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
562 _mm_cmpngt_sd (__m128d __A, __m128d __B)
564 __v2df a, b, c;
565 a = vec_splats (__A[0]);
566 b = vec_splats (__B[0]);
567 /* Not greater than is just less than or equal. */
568 c = (__v2df) vec_cmple(a, b);
569 return (__m128d) _mm_setr_pd (c[0], __A[1]);
572 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
573 _mm_cmpnge_sd (__m128d __A, __m128d __B)
575 __v2df a, b, c;
576 a = vec_splats (__A[0]);
577 b = vec_splats (__B[0]);
578 /* Not greater than or equal is just less than. */
579 c = (__v2df) vec_cmplt(a, b);
580 return (__m128d) _mm_setr_pd (c[0], __A[1]);
583 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
584 _mm_cmpord_sd (__m128d __A, __m128d __B)
586 __v2df r;
587 r = (__v2df)_mm_cmpord_pd (vec_splats (__A[0]), vec_splats (__B[0]));
588 return (__m128d) _mm_setr_pd (r[0], ((__v2df)__A)[1]);
591 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
592 _mm_cmpunord_sd (__m128d __A, __m128d __B)
594 __v2df r;
595 r = _mm_cmpunord_pd (vec_splats (__A[0]), vec_splats (__B[0]));
596 return (__m128d) _mm_setr_pd (r[0], __A[1]);
599 /* FIXME
600 The _mm_comi??_sd and _mm_ucomi??_sd implementations below are
601 exactly the same because GCC for PowerPC only generates unordered
602 compares (scalar and vector).
603 Technically _mm_comieq_sd et al. should be using the ordered
604 compare and signal for QNaNs. The _mm_ucomieq_sd et al. should
605 be OK. */
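/* Hedged illustration of the FIXME above (example values only, not
   part of this header): with the C-level compares used below, both
   flavors return 0 for NaN operands, and the invalid-operation
   exception that the ordered (comi) forms should raise is not
   signaled.

     __m128d n = _mm_set_sd (__builtin_nan (""));
     int c = _mm_comieq_sd (n, n);   // 0; FE_INVALID is not raised here
     int u = _mm_ucomieq_sd (n, n);  // 0; quiet, as ucomi is allowed to be
 */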
606 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
607 _mm_comieq_sd (__m128d __A, __m128d __B)
609 return (__A[0] == __B[0]);
612 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
613 _mm_comilt_sd (__m128d __A, __m128d __B)
615 return (__A[0] < __B[0]);
618 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
619 _mm_comile_sd (__m128d __A, __m128d __B)
621 return (__A[0] <= __B[0]);
624 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
625 _mm_comigt_sd (__m128d __A, __m128d __B)
627 return (__A[0] > __B[0]);
630 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
631 _mm_comige_sd (__m128d __A, __m128d __B)
633 return (__A[0] >= __B[0]);
636 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
637 _mm_comineq_sd (__m128d __A, __m128d __B)
639 return (__A[0] != __B[0]);
642 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
643 _mm_ucomieq_sd (__m128d __A, __m128d __B)
645 return (__A[0] == __B[0]);
648 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
649 _mm_ucomilt_sd (__m128d __A, __m128d __B)
651 return (__A[0] < __B[0]);
654 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
655 _mm_ucomile_sd (__m128d __A, __m128d __B)
657 return (__A[0] <= __B[0]);
660 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
661 _mm_ucomigt_sd (__m128d __A, __m128d __B)
663 return (__A[0] > __B[0]);
666 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
667 _mm_ucomige_sd (__m128d __A, __m128d __B)
669 return (__A[0] >= __B[0]);
672 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
673 _mm_ucomineq_sd (__m128d __A, __m128d __B)
675 return (__A[0] != __B[0]);
678 /* Create a vector of Qi, where i is the element number. */
679 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
680 _mm_set_epi64x (long long __q1, long long __q0)
682 return __extension__ (__m128i)(__v2di){ __q0, __q1 };
685 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
686 _mm_set_epi64 (__m64 __q1, __m64 __q0)
688 return _mm_set_epi64x ((long long)__q1, (long long)__q0);
691 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
692 _mm_set_epi32 (int __q3, int __q2, int __q1, int __q0)
694 return __extension__ (__m128i)(__v4si){ __q0, __q1, __q2, __q3 };
697 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
698 _mm_set_epi16 (short __q7, short __q6, short __q5, short __q4,
699 short __q3, short __q2, short __q1, short __q0)
701 return __extension__ (__m128i)(__v8hi){
702 __q0, __q1, __q2, __q3, __q4, __q5, __q6, __q7 };
705 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
706 _mm_set_epi8 (char __q15, char __q14, char __q13, char __q12,
707 char __q11, char __q10, char __q09, char __q08,
708 char __q07, char __q06, char __q05, char __q04,
709 char __q03, char __q02, char __q01, char __q00)
711 return __extension__ (__m128i)(__v16qi){
712 __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07,
713 __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15
717 /* Set all of the elements of the vector to A. */
718 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
719 _mm_set1_epi64x (long long __A)
721 return _mm_set_epi64x (__A, __A);
724 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
725 _mm_set1_epi64 (__m64 __A)
727 return _mm_set_epi64 (__A, __A);
730 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
731 _mm_set1_epi32 (int __A)
733 return _mm_set_epi32 (__A, __A, __A, __A);
736 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
737 _mm_set1_epi16 (short __A)
739 return _mm_set_epi16 (__A, __A, __A, __A, __A, __A, __A, __A);
742 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
743 _mm_set1_epi8 (char __A)
745 return _mm_set_epi8 (__A, __A, __A, __A, __A, __A, __A, __A,
746 __A, __A, __A, __A, __A, __A, __A, __A);
749 /* Create a vector of Qi, where i is the element number.
750 The parameter order is reversed from the _mm_set_epi* functions. */
751 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
752 _mm_setr_epi64 (__m64 __q0, __m64 __q1)
754 return _mm_set_epi64 (__q1, __q0);
757 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
758 _mm_setr_epi32 (int __q0, int __q1, int __q2, int __q3)
760 return _mm_set_epi32 (__q3, __q2, __q1, __q0);
763 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
764 _mm_setr_epi16 (short __q0, short __q1, short __q2, short __q3,
765 short __q4, short __q5, short __q6, short __q7)
767 return _mm_set_epi16 (__q7, __q6, __q5, __q4, __q3, __q2, __q1, __q0);
770 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
771 _mm_setr_epi8 (char __q00, char __q01, char __q02, char __q03,
772 char __q04, char __q05, char __q06, char __q07,
773 char __q08, char __q09, char __q10, char __q11,
774 char __q12, char __q13, char __q14, char __q15)
776 return _mm_set_epi8 (__q15, __q14, __q13, __q12, __q11, __q10, __q09, __q08,
777 __q07, __q06, __q05, __q04, __q03, __q02, __q01, __q00);
780 /* Load 128 bits of integer data from P. The address must be 16-byte aligned. */
781 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
782 _mm_load_si128 (__m128i const *__P)
784 return *__P;
787 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
788 _mm_loadu_si128 (__m128i_u const *__P)
790 return (__m128i) (vec_vsx_ld(0, (signed int const *)__P));
793 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
794 _mm_loadl_epi64 (__m128i_u const *__P)
796 return _mm_set_epi64 ((__m64)0LL, *(__m64 *)__P);
799 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
800 _mm_store_si128 (__m128i *__P, __m128i __B)
802 assert(((unsigned long )__P & 0xfUL) == 0UL);
803 vec_st ((__v16qu) __B, 0, (__v16qu*)__P);
806 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
807 _mm_storeu_si128 (__m128i_u *__P, __m128i __B)
809 *__P = __B;
812 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
813 _mm_storel_epi64 (__m128i_u *__P, __m128i __B)
815 *(long long *)__P = ((__v2di)__B)[0];
818 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
819 _mm_movepi64_pi64 (__m128i_u __B)
821 return (__m64) ((__v2di)__B)[0];
824 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
825 _mm_movpi64_epi64 (__m64 __A)
827 return _mm_set_epi64 ((__m64)0LL, __A);
830 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
831 _mm_move_epi64 (__m128i __A)
833 return _mm_set_epi64 ((__m64)0LL, (__m64)__A[0]);
836 /* Create an undefined vector. */
837 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
838 _mm_undefined_si128 (void)
840 __m128i __Y = __Y;
841 return __Y;
844 /* Create a vector of zeros. */
845 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
846 _mm_setzero_si128 (void)
848 return __extension__ (__m128i)(__v4si){ 0, 0, 0, 0 };
851 #ifdef _ARCH_PWR8
852 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
853 _mm_cvtepi32_pd (__m128i __A)
855 __v2di val;
856 /* For LE we need the Vector Unpack Low Signed Word instruction,
857 which vec_unpackh generates here. */
858 val = (__v2di)vec_unpackh ((__v4si)__A);
860 return (__m128d)vec_ctf (val, 0);
862 #endif
864 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
865 _mm_cvtepi32_ps (__m128i __A)
867 return ((__m128)vec_ctf((__v4si)__A, 0));
870 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
871 _mm_cvtpd_epi32 (__m128d __A)
873 __v2df rounded = vec_rint (__A);
874 __v4si result, temp;
875 const __v4si vzero =
876 { 0, 0, 0, 0 };
878 /* VSX Vector truncate Double-Precision to integer and Convert to
879 Signed Integer Word format with Saturate. */
880 __asm__(
881 "xvcvdpsxws %x0,%x1"
882 : "=wa" (temp)
883 : "wa" (rounded)
884 : );
886 #ifdef _ARCH_PWR8
887 temp = vec_mergeo (temp, temp);
888 result = (__v4si)vec_vpkudum ((__vector long)temp, (__vector long)vzero);
889 #else
891 const __v16qu pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
892 0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f };
893 result = (__v4si) vec_perm ((__v16qu) temp, (__v16qu) vzero, pkperm);
895 #endif
896 return (__m128i) result;
899 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
900 _mm_cvtpd_pi32 (__m128d __A)
902 __m128i result = _mm_cvtpd_epi32(__A);
904 return (__m64) result[0];
907 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
908 _mm_cvtpd_ps (__m128d __A)
910 __v4sf result;
911 __v4si temp;
912 const __v4si vzero = { 0, 0, 0, 0 };
914 __asm__(
915 "xvcvdpsp %x0,%x1"
916 : "=wa" (temp)
917 : "wa" (__A)
918 : );
920 #ifdef _ARCH_PWR8
921 temp = vec_mergeo (temp, temp);
922 result = (__v4sf)vec_vpkudum ((__vector long)temp, (__vector long)vzero);
923 #else
925 const __v16qu pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
926 0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f };
927 result = (__v4sf) vec_perm ((__v16qu) temp, (__v16qu) vzero, pkperm);
929 #endif
930 return ((__m128)result);
933 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
934 _mm_cvttpd_epi32 (__m128d __A)
936 __v4si result;
937 __v4si temp;
938 const __v4si vzero = { 0, 0, 0, 0 };
940 /* VSX Vector truncate Double-Precision to integer and Convert to
941 Signed Integer Word format with Saturate. */
942 __asm__(
943 "xvcvdpsxws %x0,%x1"
944 : "=wa" (temp)
945 : "wa" (__A)
946 : );
948 #ifdef _ARCH_PWR8
949 temp = vec_mergeo (temp, temp);
950 result = (__v4si)vec_vpkudum ((__vector long)temp, (__vector long)vzero);
951 #else
953 const __v16qu pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
954 0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f };
955 result = (__v4si) vec_perm ((__v16qu) temp, (__v16qu) vzero, pkperm);
957 #endif
959 return ((__m128i) result);
962 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
963 _mm_cvttpd_pi32 (__m128d __A)
965 __m128i result = _mm_cvttpd_epi32 (__A);
967 return (__m64) result[0];
970 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
971 _mm_cvtsi128_si32 (__m128i __A)
973 return ((__v4si)__A)[0];
976 #ifdef _ARCH_PWR8
977 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
978 _mm_cvtpi32_pd (__m64 __A)
980 __v4si temp;
981 __v2di tmp2;
982 __v2df result;
984 temp = (__v4si)vec_splats (__A);
985 tmp2 = (__v2di)vec_unpackl (temp);
986 result = vec_ctf ((__vector signed long)tmp2, 0);
987 return (__m128d)result;
989 #endif
991 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
992 _mm_cvtps_epi32 (__m128 __A)
994 __v4sf rounded;
995 __v4si result;
997 rounded = vec_rint((__v4sf) __A);
998 result = vec_cts (rounded, 0);
999 return (__m128i) result;
1002 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1003 _mm_cvttps_epi32 (__m128 __A)
1005 __v4si result;
1007 result = vec_cts ((__v4sf) __A, 0);
1008 return (__m128i) result;
1011 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1012 _mm_cvtps_pd (__m128 __A)
1014 /* Check if vec_doubleh is defined by <altivec.h>. If so use that. */
1015 #ifdef vec_doubleh
1016 return (__m128d) vec_doubleh ((__v4sf)__A);
1017 #else
1018 /* Otherwise the compiler is not current, so we need to generate the
1019 equivalent code. */
1020 __v4sf a = (__v4sf)__A;
1021 __v4sf temp;
1022 __v2df result;
1023 #ifdef __LITTLE_ENDIAN__
1024 /* The input float values are in elements {[0], [1]} but the convert
1025 instruction needs them in elements {[1], [3]}, so we use two
1026 shift left double vector word immediates to get the elements
1027 lined up. */
1028 temp = __builtin_vsx_xxsldwi (a, a, 3);
1029 temp = __builtin_vsx_xxsldwi (a, temp, 2);
1030 #elif __BIG_ENDIAN__
1031 /* The input float values are in elements {[0], [1]} but the convert
1032 instruction needs them in elements {[0], [2]}, so a single
1033 merge-high of the vector with itself lines the elements
1034 up. */
1035 temp = vec_vmrghw (a, a);
1036 #endif
1037 __asm__(
1038 " xvcvspdp %x0,%x1"
1039 : "=wa" (result)
1040 : "wa" (temp)
1041 : );
1042 return (__m128d) result;
1043 #endif
1046 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1047 _mm_cvtsd_si32 (__m128d __A)
1049 __v2df rounded = vec_rint((__v2df) __A);
1050 int result = ((__v2df)rounded)[0];
1052 return result;
1054 /* Intel intrinsic. */
1055 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1056 _mm_cvtsd_si64 (__m128d __A)
1058 __v2df rounded = vec_rint ((__v2df) __A );
1059 long long result = ((__v2df) rounded)[0];
1061 return result;
1064 /* Microsoft intrinsic. */
1065 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1066 _mm_cvtsd_si64x (__m128d __A)
1068 return _mm_cvtsd_si64 ((__v2df)__A);
1071 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1072 _mm_cvttsd_si32 (__m128d __A)
1074 int result = ((__v2df)__A)[0];
1076 return result;
1079 /* Intel intrinsic. */
1080 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1081 _mm_cvttsd_si64 (__m128d __A)
1083 long long result = ((__v2df)__A)[0];
1085 return result;
1088 /* Microsoft intrinsic. */
1089 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1090 _mm_cvttsd_si64x (__m128d __A)
1092 return _mm_cvttsd_si64 (__A);
1095 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1096 _mm_cvtsd_ss (__m128 __A, __m128d __B)
1098 __v4sf result = (__v4sf)__A;
1100 #ifdef __LITTLE_ENDIAN__
1101 __v4sf temp_s;
1102 /* Copy double element[0] to element [1] for conversion. */
1103 __v2df temp_b = vec_splat((__v2df)__B, 0);
1105 /* Pre-rotate __A left 3 (logically right 1) elements. */
1106 result = __builtin_vsx_xxsldwi (result, result, 3);
1107 /* Convert double to single float scalar in a vector. */
1108 __asm__(
1109 "xscvdpsp %x0,%x1"
1110 : "=wa" (temp_s)
1111 : "wa" (temp_b)
1112 : );
1113 /* Shift the resulting scalar into vector element [0]. */
1114 result = __builtin_vsx_xxsldwi (result, temp_s, 1);
1115 #else
1116 result [0] = ((__v2df)__B)[0];
1117 #endif
1118 return (__m128) result;
1121 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1122 _mm_cvtsi32_sd (__m128d __A, int __B)
1124 __v2df result = (__v2df)__A;
1125 double db = __B;
1126 result [0] = db;
1127 return (__m128d)result;
1130 /* Intel intrinsic. */
1131 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1132 _mm_cvtsi64_sd (__m128d __A, long long __B)
1134 __v2df result = (__v2df)__A;
1135 double db = __B;
1136 result [0] = db;
1137 return (__m128d)result;
1140 /* Microsoft intrinsic. */
1141 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1142 _mm_cvtsi64x_sd (__m128d __A, long long __B)
1144 return _mm_cvtsi64_sd (__A, __B);
1147 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1148 _mm_cvtss_sd (__m128d __A, __m128 __B)
1150 #ifdef __LITTLE_ENDIAN__
1151 /* Use splat to move element [0] into position for the convert. */
1152 __v4sf temp = vec_splat ((__v4sf)__B, 0);
1153 __v2df res;
1154 /* Convert single float scalar to double in a vector. */
1155 __asm__(
1156 "xscvspdp %x0,%x1"
1157 : "=wa" (res)
1158 : "wa" (temp)
1159 : );
1160 return (__m128d) vec_mergel (res, (__v2df)__A);
1161 #else
1162 __v2df res = (__v2df)__A;
1163 res [0] = ((__v4sf)__B) [0];
1164 return (__m128d) res;
1165 #endif
1168 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1169 _mm_shuffle_pd(__m128d __A, __m128d __B, const int __mask)
1171 __vector double result;
1172 const int litmsk = __mask & 0x3;
1174 if (litmsk == 0)
1175 result = vec_mergeh (__A, __B);
1176 #if __GNUC__ < 6
1177 else if (litmsk == 1)
1178 result = vec_xxpermdi (__B, __A, 2);
1179 else if (litmsk == 2)
1180 result = vec_xxpermdi (__B, __A, 1);
1181 #else
1182 else if (litmsk == 1)
1183 result = vec_xxpermdi (__A, __B, 2);
1184 else if (litmsk == 2)
1185 result = vec_xxpermdi (__A, __B, 1);
1186 #endif
1187 else
1188 result = vec_mergel (__A, __B);
1190 return result;
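/* For reference (Intel semantics, unchanged by the code above):
   _mm_shuffle_pd (a, b, m) returns { (m & 1) ? a[1] : a[0],
   (m & 2) ? b[1] : b[0] }, so mask 0 is the vec_mergeh case and
   mask 3 is the vec_mergel case handled above.  */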
1193 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1194 _mm_unpackhi_pd (__m128d __A, __m128d __B)
1196 return (__m128d) vec_mergel ((__v2df)__A, (__v2df)__B);
1199 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1200 _mm_unpacklo_pd (__m128d __A, __m128d __B)
1202 return (__m128d) vec_mergeh ((__v2df)__A, (__v2df)__B);
1205 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1206 _mm_loadh_pd (__m128d __A, double const *__B)
1208 __v2df result = (__v2df)__A;
1209 result [1] = *__B;
1210 return (__m128d)result;
1213 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1214 _mm_loadl_pd (__m128d __A, double const *__B)
1216 __v2df result = (__v2df)__A;
1217 result [0] = *__B;
1218 return (__m128d)result;
1221 #ifdef _ARCH_PWR8
1222 /* Intrinsic functions that require PowerISA 2.07 minimum. */
1224 /* Creates a 2-bit mask from the most significant bits of the DPFP values. */
1225 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1226 _mm_movemask_pd (__m128d __A)
1228 __vector __m64 result;
1229 static const __vector unsigned int perm_mask =
1231 #ifdef __LITTLE_ENDIAN__
1232 0x80800040, 0x80808080, 0x80808080, 0x80808080
1233 #elif __BIG_ENDIAN__
1234 0x80808080, 0x80808080, 0x80808080, 0x80800040
1235 #endif
1238 result = (__vector __m64) vec_vbpermq ((__vector unsigned char) __A,
1239 (__vector unsigned char) perm_mask);
1241 #ifdef __LITTLE_ENDIAN__
1242 return result[1];
1243 #elif __BIG_ENDIAN__
1244 return result[0];
1245 #endif
1247 #endif /* _ARCH_PWR8 */
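/* Worked example for _mm_movemask_pd above (illustrative values only):
   bit [i] of the result is the sign bit of element [i], so
   _mm_movemask_pd (_mm_set_pd (-2.0, 1.0)) yields 0x2: element [1]
   (-2.0) is negative while element [0] (1.0) is not.  */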
1249 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1250 _mm_packs_epi16 (__m128i __A, __m128i __B)
1252 return (__m128i) vec_packs ((__v8hi) __A, (__v8hi)__B);
1255 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1256 _mm_packs_epi32 (__m128i __A, __m128i __B)
1258 return (__m128i) vec_packs ((__v4si)__A, (__v4si)__B);
1261 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1262 _mm_packus_epi16 (__m128i __A, __m128i __B)
1264 return (__m128i) vec_packsu ((__v8hi) __A, (__v8hi)__B);
1267 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1268 _mm_unpackhi_epi8 (__m128i __A, __m128i __B)
1270 return (__m128i) vec_mergel ((__v16qu)__A, (__v16qu)__B);
1273 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1274 _mm_unpackhi_epi16 (__m128i __A, __m128i __B)
1276 return (__m128i) vec_mergel ((__v8hu)__A, (__v8hu)__B);
1279 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1280 _mm_unpackhi_epi32 (__m128i __A, __m128i __B)
1282 return (__m128i) vec_mergel ((__v4su)__A, (__v4su)__B);
1285 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1286 _mm_unpackhi_epi64 (__m128i __A, __m128i __B)
1288 return (__m128i) vec_mergel ((__vector long)__A, (__vector long)__B);
1291 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1292 _mm_unpacklo_epi8 (__m128i __A, __m128i __B)
1294 return (__m128i) vec_mergeh ((__v16qu)__A, (__v16qu)__B);
1297 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1298 _mm_unpacklo_epi16 (__m128i __A, __m128i __B)
1300 return (__m128i) vec_mergeh ((__v8hi)__A, (__v8hi)__B);
1303 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1304 _mm_unpacklo_epi32 (__m128i __A, __m128i __B)
1306 return (__m128i) vec_mergeh ((__v4si)__A, (__v4si)__B);
1309 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1310 _mm_unpacklo_epi64 (__m128i __A, __m128i __B)
1312 return (__m128i) vec_mergeh ((__vector long)__A, (__vector long)__B);
1315 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1316 _mm_add_epi8 (__m128i __A, __m128i __B)
1318 return (__m128i) ((__v16qu)__A + (__v16qu)__B);
1321 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1322 _mm_add_epi16 (__m128i __A, __m128i __B)
1324 return (__m128i) ((__v8hu)__A + (__v8hu)__B);
1327 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1328 _mm_add_epi32 (__m128i __A, __m128i __B)
1330 return (__m128i) ((__v4su)__A + (__v4su)__B);
1333 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1334 _mm_add_epi64 (__m128i __A, __m128i __B)
1336 return (__m128i) ((__v2du)__A + (__v2du)__B);
1339 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1340 _mm_adds_epi8 (__m128i __A, __m128i __B)
1342 return (__m128i) vec_adds ((__v16qi)__A, (__v16qi)__B);
1345 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1346 _mm_adds_epi16 (__m128i __A, __m128i __B)
1348 return (__m128i) vec_adds ((__v8hi)__A, (__v8hi)__B);
1351 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1352 _mm_adds_epu8 (__m128i __A, __m128i __B)
1354 return (__m128i) vec_adds ((__v16qu)__A, (__v16qu)__B);
1357 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1358 _mm_adds_epu16 (__m128i __A, __m128i __B)
1360 return (__m128i) vec_adds ((__v8hu)__A, (__v8hu)__B);
1363 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1364 _mm_sub_epi8 (__m128i __A, __m128i __B)
1366 return (__m128i) ((__v16qu)__A - (__v16qu)__B);
1369 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1370 _mm_sub_epi16 (__m128i __A, __m128i __B)
1372 return (__m128i) ((__v8hu)__A - (__v8hu)__B);
1375 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1376 _mm_sub_epi32 (__m128i __A, __m128i __B)
1378 return (__m128i) ((__v4su)__A - (__v4su)__B);
1381 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1382 _mm_sub_epi64 (__m128i __A, __m128i __B)
1384 return (__m128i) ((__v2du)__A - (__v2du)__B);
1387 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1388 _mm_subs_epi8 (__m128i __A, __m128i __B)
1390 return (__m128i) vec_subs ((__v16qi)__A, (__v16qi)__B);
1393 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1394 _mm_subs_epi16 (__m128i __A, __m128i __B)
1396 return (__m128i) vec_subs ((__v8hi)__A, (__v8hi)__B);
1399 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1400 _mm_subs_epu8 (__m128i __A, __m128i __B)
1402 return (__m128i) vec_subs ((__v16qu)__A, (__v16qu)__B);
1405 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1406 _mm_subs_epu16 (__m128i __A, __m128i __B)
1408 return (__m128i) vec_subs ((__v8hu)__A, (__v8hu)__B);
1411 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1412 _mm_madd_epi16 (__m128i __A, __m128i __B)
1414 __vector signed int zero = {0, 0, 0, 0};
1416 return (__m128i) vec_vmsumshm ((__v8hi)__A, (__v8hi)__B, zero);
1419 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1420 _mm_mulhi_epi16 (__m128i __A, __m128i __B)
1422 __vector signed int w0, w1;
1424 __vector unsigned char xform1 = {
1425 #ifdef __LITTLE_ENDIAN__
1426 0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17,
1427 0x0A, 0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
1428 #elif __BIG_ENDIAN__
1429 0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15,
1430 0x08, 0x09, 0x18, 0x19, 0x0C, 0x0D, 0x1C, 0x1D
1431 #endif
1434 w0 = vec_vmulesh ((__v8hi)__A, (__v8hi)__B);
1435 w1 = vec_vmulosh ((__v8hi)__A, (__v8hi)__B);
1436 return (__m128i) vec_perm (w0, w1, xform1);
1439 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1440 _mm_mullo_epi16 (__m128i __A, __m128i __B)
1442 return (__m128i) ((__v8hi)__A * (__v8hi)__B);
1445 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1446 _mm_mul_su32 (__m64 __A, __m64 __B)
1448 unsigned int a = __A;
1449 unsigned int b = __B;
1451 return ((__m64)a * (__m64)b);
1454 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1455 _mm_mul_epu32 (__m128i __A, __m128i __B)
1457 #if __GNUC__ < 8
1458 __v2du result;
1460 #ifdef __LITTLE_ENDIAN__
1461 /* VMX Vector Multiply Odd Unsigned Word. */
1462 __asm__(
1463 "vmulouw %0,%1,%2"
1464 : "=v" (result)
1465 : "v" (__A), "v" (__B)
1466 : );
1467 #elif __BIG_ENDIAN__
1468 /* VMX Vector Multiply Even Unsigned Word. */
1469 __asm__(
1470 "vmuleuw %0,%1,%2"
1471 : "=v" (result)
1472 : "v" (__A), "v" (__B)
1473 : );
1474 #endif
1475 return (__m128i) result;
1476 #else
1477 #ifdef __LITTLE_ENDIAN__
1478 return (__m128i) vec_mule ((__v4su)__A, (__v4su)__B);
1479 #elif __BIG_ENDIAN__
1480 return (__m128i) vec_mulo ((__v4su)__A, (__v4su)__B);
1481 #endif
1482 #endif
1485 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1486 _mm_slli_epi16 (__m128i __A, int __B)
1488 __v8hu lshift;
1489 __v8hi result = { 0, 0, 0, 0, 0, 0, 0, 0 };
1491 if (__B >= 0 && __B < 16)
1493 if (__builtin_constant_p(__B))
1494 lshift = (__v8hu) vec_splat_s16(__B);
1495 else
1496 lshift = vec_splats ((unsigned short) __B);
1498 result = vec_vslh ((__v8hi) __A, lshift);
1501 return (__m128i) result;
1504 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1505 _mm_slli_epi32 (__m128i __A, int __B)
1507 __v4su lshift;
1508 __v4si result = { 0, 0, 0, 0 };
1510 if (__B >= 0 && __B < 32)
1512 if (__builtin_constant_p(__B) && __B < 16)
1513 lshift = (__v4su) vec_splat_s32(__B);
1514 else
1515 lshift = vec_splats ((unsigned int) __B);
1517 result = vec_vslw ((__v4si) __A, lshift);
1520 return (__m128i) result;
1523 #ifdef _ARCH_PWR8
1524 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1525 _mm_slli_epi64 (__m128i __A, int __B)
1527 __v2du lshift;
1528 __v2di result = { 0, 0 };
1530 if (__B >= 0 && __B < 64)
1532 if (__builtin_constant_p(__B) && __B < 16)
1533 lshift = (__v2du) vec_splat_s32(__B);
1534 else
1535 lshift = (__v2du) vec_splats ((unsigned int) __B);
1537 result = vec_vsld ((__v2di) __A, lshift);
1540 return (__m128i) result;
1542 #endif
1544 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1545 _mm_srai_epi16 (__m128i __A, int __B)
1547 __v8hu rshift = { 15, 15, 15, 15, 15, 15, 15, 15 };
1548 __v8hi result;
1550 if (__B < 16)
1552 if (__builtin_constant_p(__B))
1553 rshift = (__v8hu) vec_splat_s16(__B);
1554 else
1555 rshift = vec_splats ((unsigned short) __B);
1557 result = vec_vsrah ((__v8hi) __A, rshift);
1559 return (__m128i) result;
1562 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1563 _mm_srai_epi32 (__m128i __A, int __B)
1565 __v4su rshift = { 31, 31, 31, 31 };
1566 __v4si result;
1568 if (__B < 32)
1570 if (__builtin_constant_p(__B))
1572 if (__B < 16)
1573 rshift = (__v4su) vec_splat_s32(__B);
1574 else
1575 rshift = (__v4su) vec_splats((unsigned int)__B);
1577 else
1578 rshift = vec_splats ((unsigned int) __B);
1580 result = vec_vsraw ((__v4si) __A, rshift);
1582 return (__m128i) result;
1585 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1586 _mm_bslli_si128 (__m128i __A, const int __N)
1588 __v16qu result;
1589 const __v16qu zeros = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
1591 if (__N < 16)
1592 result = vec_sld ((__v16qu) __A, zeros, __N);
1593 else
1594 result = zeros;
1596 return (__m128i) result;
1599 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1600 _mm_bsrli_si128 (__m128i __A, const int __N)
1602 __v16qu result;
1603 const __v16qu zeros = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
1605 if (__N < 16)
1606 if (__builtin_constant_p(__N))
1607 /* Would like to use Vector Shift Left Double by Octet
1608 Immediate here to use the immediate form and avoid
1609 load of __N * 8 value into a separate VR. */
1610 result = vec_sld (zeros, (__v16qu) __A, (16 - __N));
1611 else
1613 __v16qu shift = vec_splats((unsigned char)(__N*8));
1614 result = vec_sro ((__v16qu)__A, shift);
1616 else
1617 result = zeros;
1619 return (__m128i) result;
1622 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1623 _mm_srli_si128 (__m128i __A, const int __N)
1625 return _mm_bsrli_si128 (__A, __N);
1628 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1629 _mm_slli_si128 (__m128i __A, const int _imm5)
1631 __v16qu result;
1632 const __v16qu zeros = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
1634 if (_imm5 < 16)
1635 #ifdef __LITTLE_ENDIAN__
1636 result = vec_sld ((__v16qu) __A, zeros, _imm5);
1637 #elif __BIG_ENDIAN__
1638 result = vec_sld (zeros, (__v16qu) __A, (16 - _imm5));
1639 #endif
1640 else
1641 result = zeros;
1643 return (__m128i) result;
1646 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1648 _mm_srli_epi16 (__m128i __A, int __B)
1650 __v8hu rshift;
1651 __v8hi result = { 0, 0, 0, 0, 0, 0, 0, 0 };
1653 if (__B < 16)
1655 if (__builtin_constant_p(__B))
1656 rshift = (__v8hu) vec_splat_s16(__B);
1657 else
1658 rshift = vec_splats ((unsigned short) __B);
1660 result = vec_vsrh ((__v8hi) __A, rshift);
1663 return (__m128i) result;
1666 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1667 _mm_srli_epi32 (__m128i __A, int __B)
1669 __v4su rshift;
1670 __v4si result = { 0, 0, 0, 0 };
1672 if (__B < 32)
1674 if (__builtin_constant_p(__B))
1676 if (__B < 16)
1677 rshift = (__v4su) vec_splat_s32(__B);
1678 else
1679 rshift = (__v4su) vec_splats((unsigned int)__B);
1681 else
1682 rshift = vec_splats ((unsigned int) __B);
1684 result = vec_vsrw ((__v4si) __A, rshift);
1687 return (__m128i) result;
1690 #ifdef _ARCH_PWR8
1691 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1692 _mm_srli_epi64 (__m128i __A, int __B)
1694 __v2du rshift;
1695 __v2di result = { 0, 0 };
1697 if (__B < 64)
1699 if (__builtin_constant_p(__B))
1701 if (__B < 16)
1702 rshift = (__v2du) vec_splat_s32(__B);
1703 else
1704 rshift = (__v2du) vec_splats((unsigned long long)__B);
1706 else
1707 rshift = (__v2du) vec_splats ((unsigned int) __B);
1709 result = vec_vsrd ((__v2di) __A, rshift);
1712 return (__m128i) result;
1714 #endif
1716 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1717 _mm_sll_epi16 (__m128i __A, __m128i __B)
1719 __v8hu lshift, shmask;
1720 const __v8hu shmax = { 15, 15, 15, 15, 15, 15, 15, 15 };
1721 __v8hu result;
1723 #ifdef __LITTLE_ENDIAN__
1724 lshift = vec_splat ((__v8hu)__B, 0);
1725 #elif __BIG_ENDIAN__
1726 lshift = vec_splat ((__v8hu)__B, 3);
1727 #endif
1728 shmask = lshift <= shmax;
1729 result = vec_vslh ((__v8hu) __A, lshift);
1730 result = vec_sel (shmask, result, shmask);
1732 return (__m128i) result;
1735 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1736 _mm_sll_epi32 (__m128i __A, __m128i __B)
1738 __v4su lshift, shmask;
1739 const __v4su shmax = { 32, 32, 32, 32 };
1740 __v4su result;
1741 #ifdef __LITTLE_ENDIAN__
1742 lshift = vec_splat ((__v4su)__B, 0);
1743 #elif __BIG_ENDIAN__
1744 lshift = vec_splat ((__v4su)__B, 1);
1745 #endif
1746 shmask = lshift < shmax;
1747 result = vec_vslw ((__v4su) __A, lshift);
1748 result = vec_sel (shmask, result, shmask);
1750 return (__m128i) result;
1753 #ifdef _ARCH_PWR8
1754 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1755 _mm_sll_epi64 (__m128i __A, __m128i __B)
1757 __v2du lshift, shmask;
1758 const __v2du shmax = { 64, 64 };
1759 __v2du result;
1761 lshift = (__v2du) vec_splat ((__v2du)__B, 0);
1762 shmask = lshift < shmax;
1763 result = vec_vsld ((__v2du) __A, lshift);
1764 result = (__v2du) vec_sel ((__v2df) shmask, (__v2df) result,
1765 (__v2df) shmask);
1767 return (__m128i) result;
1769 #endif
1771 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1772 _mm_sra_epi16 (__m128i __A, __m128i __B)
1774 const __v8hu rshmax = { 15, 15, 15, 15, 15, 15, 15, 15 };
1775 __v8hu rshift;
1776 __v8hi result;
1778 #ifdef __LITTLE_ENDIAN__
1779 rshift = vec_splat ((__v8hu)__B, 0);
1780 #elif __BIG_ENDIAN__
1781 rshift = vec_splat ((__v8hu)__B, 3);
1782 #endif
1783 rshift = vec_min (rshift, rshmax);
1784 result = vec_vsrah ((__v8hi) __A, rshift);
1786 return (__m128i) result;
1789 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1790 _mm_sra_epi32 (__m128i __A, __m128i __B)
1792 const __v4su rshmax = { 31, 31, 31, 31 };
1793 __v4su rshift;
1794 __v4si result;
1796 #ifdef __LITTLE_ENDIAN__
1797 rshift = vec_splat ((__v4su)__B, 0);
1798 #elif __BIG_ENDIAN__
1799 rshift = vec_splat ((__v4su)__B, 1);
1800 #endif
1801 rshift = vec_min (rshift, rshmax);
1802 result = vec_vsraw ((__v4si) __A, rshift);
1804 return (__m128i) result;
1807 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1808 _mm_srl_epi16 (__m128i __A, __m128i __B)
1810 __v8hu rshift, shmask;
1811 const __v8hu shmax = { 15, 15, 15, 15, 15, 15, 15, 15 };
1812 __v8hu result;
1814 #ifdef __LITTLE_ENDIAN__
1815 rshift = vec_splat ((__v8hu)__B, 0);
1816 #elif __BIG_ENDIAN__
1817 rshift = vec_splat ((__v8hu)__B, 3);
1818 #endif
1819 shmask = rshift <= shmax;
1820 result = vec_vsrh ((__v8hu) __A, rshift);
1821 result = vec_sel (shmask, result, shmask);
1823 return (__m128i) result;
1826 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1827 _mm_srl_epi32 (__m128i __A, __m128i __B)
1829 __v4su rshift, shmask;
1830 const __v4su shmax = { 32, 32, 32, 32 };
1831 __v4su result;
1833 #ifdef __LITTLE_ENDIAN__
1834 rshift = vec_splat ((__v4su)__B, 0);
1835 #elif __BIG_ENDIAN__
1836 rshift = vec_splat ((__v4su)__B, 1);
1837 #endif
1838 shmask = rshift < shmax;
1839 result = vec_vsrw ((__v4su) __A, rshift);
1840 result = vec_sel (shmask, result, shmask);
1842 return (__m128i) result;
#ifdef _ARCH_PWR8
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srl_epi64 (__m128i __A, __m128i __B)
{
  __v2du rshift, shmask;
  const __v2du shmax = { 64, 64 };
  __v2du result;

  rshift = (__v2du) vec_splat ((__v2du)__B, 0);
  shmask = rshift < shmax;
  result = vec_vsrd ((__v2du) __A, rshift);
  result = (__v2du) vec_sel ((__v2du) shmask, (__v2du) result, (__v2du) shmask);

  return (__m128i) result;
}
#endif

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_and_pd (__m128d __A, __m128d __B)
{
  return (vec_and ((__v2df) __A, (__v2df) __B));
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_andnot_pd (__m128d __A, __m128d __B)
{
  return (vec_andc ((__v2df) __B, (__v2df) __A));
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_or_pd (__m128d __A, __m128d __B)
{
  return (vec_or ((__v2df) __A, (__v2df) __B));
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_xor_pd (__m128d __A, __m128d __B)
{
  return (vec_xor ((__v2df) __A, (__v2df) __B));
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_epi8 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_cmpeq ((__v16qi) __A, (__v16qi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_cmpeq ((__v8hi) __A, (__v8hi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_cmpeq ((__v4si) __A, (__v4si)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmplt_epi8 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_cmplt ((__v16qi) __A, (__v16qi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmplt_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_cmplt ((__v8hi) __A, (__v8hi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmplt_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_cmplt ((__v4si) __A, (__v4si)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_epi8 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_cmpgt ((__v16qi) __A, (__v16qi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_cmpgt ((__v8hi) __A, (__v8hi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_cmpgt ((__v4si) __A, (__v4si)__B);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_extract_epi16 (__m128i const __A, int const __N)
{
  return (unsigned short) ((__v8hi)__A)[__N & 7];
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_insert_epi16 (__m128i const __A, int const __D, int const __N)
{
  __v8hi result = (__v8hi)__A;

  result [(__N & 7)] = __D;

  return (__m128i) result;
}

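/* Illustrative sketch, not part of the original header and guarded out
   with "#if 0": __N is reduced modulo 8, and the extracted halfword is
   zero-extended, so a lane holding -1 reads back as 0xFFFF rather than
   -1.  Hypothetical example:  */
#if 0
static int
test_extract_insert (void)
{
  __m128i v = _mm_set1_epi16 (-1);
  int hi_lane = _mm_extract_epi16 (v, 7);       /* 0xFFFF, not -1.  */
  __m128i w = _mm_insert_epi16 (v, 42, 11);     /* 11 & 7 selects lane 3.  */
  return hi_lane == 0xFFFF && _mm_extract_epi16 (w, 3) == 42;
}
#endif
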
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_max ((__v8hi)__A, (__v8hi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_epu8 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_max ((__v16qu) __A, (__v16qu)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_min ((__v8hi) __A, (__v8hi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_epu8 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_min ((__v16qu) __A, (__v16qu)__B);
}

#ifdef _ARCH_PWR8
/* Intrinsic functions that require PowerISA 2.07 minimum.  */

/* Return a 16-bit mask formed from the most significant bit of each of
   the 16 byte elements of __A, zero extended to int.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movemask_epi8 (__m128i __A)
{
  __vector __m64 result;
  static const __vector unsigned char perm_mask =
    {
#ifdef __LITTLE_ENDIAN__
	0x78, 0x70, 0x68, 0x60, 0x58, 0x50, 0x48, 0x40,
	0x38, 0x30, 0x28, 0x20, 0x18, 0x10, 0x08, 0x00
#elif __BIG_ENDIAN__
	0x00, 0x08, 0x10, 0x18, 0x20, 0x28, 0x30, 0x38,
	0x40, 0x48, 0x50, 0x58, 0x60, 0x68, 0x70, 0x78
#endif
    };

  result = (__vector __m64) vec_vbpermq ((__vector unsigned char) __A,
					 (__vector unsigned char) perm_mask);

#ifdef __LITTLE_ENDIAN__
  return result[1];
#elif __BIG_ENDIAN__
  return result[0];
#endif
}
#endif /* _ARCH_PWR8 */

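/* Illustrative sketch, not part of the original header and guarded out
   with "#if 0" (like the intrinsic above it would need _ARCH_PWR8): a
   typical use of _mm_movemask_epi8 is turning a byte-compare result
   into a bit mask and scanning it, e.g. a strchr-style search over one
   16-byte block.  The helper name is hypothetical.  */
#if 0
static int
first_match_index (__m128i block, char needle)
{
  __m128i eq = _mm_cmpeq_epi8 (block, _mm_set1_epi8 (needle));
  int mask = _mm_movemask_epi8 (eq);   /* bit i set => byte i matched.  */
  return mask ? __builtin_ctz (mask) : -1;
}
#endif
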
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhi_epu16 (__m128i __A, __m128i __B)
{
  __v4su w0, w1;
  __v16qu xform1 = {
#ifdef __LITTLE_ENDIAN__
      0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17,
      0x0A, 0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
#elif __BIG_ENDIAN__
      0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15,
      0x08, 0x09, 0x18, 0x19, 0x0C, 0x0D, 0x1C, 0x1D
#endif
    };

  w0 = vec_vmuleuh ((__v8hu)__A, (__v8hu)__B);
  w1 = vec_vmulouh ((__v8hu)__A, (__v8hu)__B);
  return (__m128i) vec_perm (w0, w1, xform1);
}

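/* Illustrative sketch, not part of the original header and guarded out
   with "#if 0": the even/odd multiplies compute full 32-bit products
   and the permute keeps only the upper 16 bits of each, which is what
   PMULHUW returns.  Since 0xFFFF * 0xFFFF == 0xFFFE0001, every lane of
   the hypothetical check below should read back 0xFFFE.  */
#if 0
static int
test_mulhi_epu16 (void)
{
  __m128i a = _mm_set1_epi16 (-1);              /* 0xFFFF in every lane.  */
  __m128i r = _mm_mulhi_epu16 (a, a);
  return _mm_extract_epi16 (r, 0) == 0xFFFE;
}
#endif
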
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shufflehi_epi16 (__m128i __A, const int __mask)
{
  unsigned long element_selector_98 = __mask & 0x03;
  unsigned long element_selector_BA = (__mask >> 2) & 0x03;
  unsigned long element_selector_DC = (__mask >> 4) & 0x03;
  unsigned long element_selector_FE = (__mask >> 6) & 0x03;
  static const unsigned short permute_selectors[4] =
    {
#ifdef __LITTLE_ENDIAN__
	0x0908, 0x0B0A, 0x0D0C, 0x0F0E
#elif __BIG_ENDIAN__
	0x0607, 0x0405, 0x0203, 0x0001
#endif
    };
  __v2du pmask =
#ifdef __LITTLE_ENDIAN__
      { 0x1716151413121110UL, 0x1f1e1d1c1b1a1918UL};
#elif __BIG_ENDIAN__
      { 0x1011121314151617UL, 0x18191a1b1c1d1e1fUL};
#endif
  __m64_union t;
  __v2du a, r;

#ifdef __LITTLE_ENDIAN__
  t.as_short[0] = permute_selectors[element_selector_98];
  t.as_short[1] = permute_selectors[element_selector_BA];
  t.as_short[2] = permute_selectors[element_selector_DC];
  t.as_short[3] = permute_selectors[element_selector_FE];
#elif __BIG_ENDIAN__
  t.as_short[3] = permute_selectors[element_selector_98];
  t.as_short[2] = permute_selectors[element_selector_BA];
  t.as_short[1] = permute_selectors[element_selector_DC];
  t.as_short[0] = permute_selectors[element_selector_FE];
#endif
#ifdef __LITTLE_ENDIAN__
  pmask[1] = t.as_m64;
#elif __BIG_ENDIAN__
  pmask[0] = t.as_m64;
#endif
  a = (__v2du)__A;
  r = vec_perm (a, a, (__vector unsigned char)pmask);
  return (__m128i) r;
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shufflelo_epi16 (__m128i __A, const int __mask)
{
  unsigned long element_selector_10 = __mask & 0x03;
  unsigned long element_selector_32 = (__mask >> 2) & 0x03;
  unsigned long element_selector_54 = (__mask >> 4) & 0x03;
  unsigned long element_selector_76 = (__mask >> 6) & 0x03;
  static const unsigned short permute_selectors[4] =
    {
#ifdef __LITTLE_ENDIAN__
	0x0100, 0x0302, 0x0504, 0x0706
#elif __BIG_ENDIAN__
	0x0e0f, 0x0c0d, 0x0a0b, 0x0809
#endif
    };
  __v2du pmask = { 0x1011121314151617UL, 0x1f1e1d1c1b1a1918UL};
  __m64_union t;
  __v2du a, r;

#ifdef __LITTLE_ENDIAN__
  t.as_short[0] = permute_selectors[element_selector_10];
  t.as_short[1] = permute_selectors[element_selector_32];
  t.as_short[2] = permute_selectors[element_selector_54];
  t.as_short[3] = permute_selectors[element_selector_76];
#elif __BIG_ENDIAN__
  t.as_short[3] = permute_selectors[element_selector_10];
  t.as_short[2] = permute_selectors[element_selector_32];
  t.as_short[1] = permute_selectors[element_selector_54];
  t.as_short[0] = permute_selectors[element_selector_76];
#endif
#ifdef __LITTLE_ENDIAN__
  pmask[0] = t.as_m64;
#elif __BIG_ENDIAN__
  pmask[1] = t.as_m64;
#endif
  a = (__v2du)__A;
  r = vec_perm (a, a, (__vector unsigned char)pmask);
  return (__m128i) r;
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_epi32 (__m128i __A, const int __mask)
{
  unsigned long element_selector_10 = __mask & 0x03;
  unsigned long element_selector_32 = (__mask >> 2) & 0x03;
  unsigned long element_selector_54 = (__mask >> 4) & 0x03;
  unsigned long element_selector_76 = (__mask >> 6) & 0x03;
  static const unsigned int permute_selectors[4] =
    {
#ifdef __LITTLE_ENDIAN__
	0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
#elif __BIG_ENDIAN__
	0x0C0D0E0F, 0x08090A0B, 0x04050607, 0x00010203
#endif
    };
  __v4su t;

#ifdef __LITTLE_ENDIAN__
  t[0] = permute_selectors[element_selector_10];
  t[1] = permute_selectors[element_selector_32];
  t[2] = permute_selectors[element_selector_54] + 0x10101010;
  t[3] = permute_selectors[element_selector_76] + 0x10101010;
#elif __BIG_ENDIAN__
  t[3] = permute_selectors[element_selector_10] + 0x10101010;
  t[2] = permute_selectors[element_selector_32] + 0x10101010;
  t[1] = permute_selectors[element_selector_54];
  t[0] = permute_selectors[element_selector_76];
#endif
  return (__m128i)vec_perm ((__v4si) __A, (__v4si)__A, (__vector unsigned char)t);
}

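/* Illustrative sketch, not part of the original header and guarded out
   with "#if 0": each 2-bit field of __mask selects the source word for
   one destination word, so 0x1B (_MM_SHUFFLE (0, 1, 2, 3)) reverses the
   four 32-bit elements.  Hypothetical check:  */
#if 0
static int
test_shuffle_epi32_reverse (void)
{
  __m128i v = _mm_set_epi32 (3, 2, 1, 0);
  __m128i r = _mm_shuffle_epi32 (v, 0x1B);      /* reverse the lanes.  */
  return ((__v4si) r)[0] == 3 && ((__v4si) r)[3] == 0;
}
#endif
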
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskmoveu_si128 (__m128i __A, __m128i __B, char *__C)
{
  __v2du hibit = { 0x7f7f7f7f7f7f7f7fUL, 0x7f7f7f7f7f7f7f7fUL};
  __v16qu mask, tmp;
  __m128i *p = (__m128i*)__C;

  tmp = (__v16qu)_mm_loadu_si128(p);
  mask = (__v16qu)vec_cmpgt ((__v16qu)__B, (__v16qu)hibit);
  tmp = vec_sel (tmp, (__v16qu)__A, mask);
  _mm_storeu_si128 (p, (__m128i)tmp);
}

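/* Illustrative sketch, not part of the original header and guarded out
   with "#if 0": unlike the x86 instruction, which stores only the
   selected bytes, the emulation above performs a full 16-byte
   read-modify-write of the destination.  Hypothetical usage that
   overwrites just the low eight bytes:  */
#if 0
static void
store_low_half_only (__m128i data, char *dst)
{
  /* MSB set in the low eight mask bytes selects those bytes of DATA.  */
  __m128i mask = _mm_set_epi8 (0, 0, 0, 0, 0, 0, 0, 0,
			       -1, -1, -1, -1, -1, -1, -1, -1);
  _mm_maskmoveu_si128 (data, mask, dst);
}
#endif
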
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_avg_epu8 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_avg ((__v16qu)__A, (__v16qu)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_avg_epu16 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_avg ((__v8hu)__A, (__v8hu)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sad_epu8 (__m128i __A, __m128i __B)
{
  __v16qu a, b;
  __v16qu vmin, vmax, vabsdiff;
  __v4si vsum;
  const __v4su zero = { 0, 0, 0, 0 };
  __v4si result;

  a = (__v16qu) __A;
  b = (__v16qu) __B;
  vmin = vec_min (a, b);
  vmax = vec_max (a, b);
  vabsdiff = vec_sub (vmax, vmin);
  /* Sum four groups of bytes into integers.  */
  vsum = (__vector signed int) vec_sum4s (vabsdiff, zero);
  /* Sum across four integers with two integer results.  */
  result = vec_sum2s (vsum, (__vector signed int) zero);
  /* Rotate the sums into the correct position.  */
#ifdef __LITTLE_ENDIAN__
  result = vec_sld (result, result, 4);
#elif __BIG_ENDIAN__
  result = vec_sld (result, result, 6);
#endif
  return (__m128i) result;
}

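/* Illustrative sketch, not part of the original header and guarded out
   with "#if 0": as with PSADBW, the two partial sums of absolute
   differences should end up in the low 16 bits of each 64-bit half of
   the result.  With |a[i] - b[i]| == 1 for all sixteen bytes, each half
   should report 8.  Hypothetical check:  */
#if 0
static int
test_sad_epu8 (void)
{
  __m128i a = _mm_set1_epi8 (5);
  __m128i b = _mm_set1_epi8 (6);
  __m128i r = _mm_sad_epu8 (a, b);
  return _mm_extract_epi16 (r, 0) == 8 && _mm_extract_epi16 (r, 4) == 8;
}
#endif
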
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_stream_si32 (int *__A, int __B)
{
  /* Use the data cache block touch for store transient.  */
  __asm__ (
    "dcbtstt 0,%0"
    :
    : "b" (__A)
    : "memory"
    );
  *__A = __B;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_stream_si64 (long long int *__A, long long int __B)
{
  /* Use the data cache block touch for store transient.  */
  __asm__ (
    "dcbtstt 0,%0"
    :
    : "b" (__A)
    : "memory"
    );
  *__A = __B;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_stream_si128 (__m128i *__A, __m128i __B)
{
  /* Use the data cache block touch for store transient.  */
  __asm__ (
    "dcbtstt 0,%0"
    :
    : "b" (__A)
    : "memory"
    );
  *__A = __B;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_stream_pd (double *__A, __m128d __B)
{
  /* Use the data cache block touch for store transient.  */
  __asm__ (
    "dcbtstt 0,%0"
    :
    : "b" (__A)
    : "memory"
    );
  *(__m128d*)__A = __B;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_clflush (void const *__A)
{
  /* Use the data cache block flush.  */
  __asm__ (
    "dcbf 0,%0"
    :
    : "b" (__A)
    : "memory"
    );
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_lfence (void)
{
  /* Use light weight sync for load to load ordering.  */
  __atomic_thread_fence (__ATOMIC_RELEASE);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mfence (void)
{
  /* Use heavy weight sync for any to any ordering.  */
  __atomic_thread_fence (__ATOMIC_SEQ_CST);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi32_si128 (int __A)
{
  return _mm_set_epi32 (0, 0, 0, __A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64_si128 (long long __A)
{
  return __extension__ (__m128i)(__v2di){ __A, 0LL };
}

/* Microsoft intrinsic.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64x_si128 (long long __A)
{
  return __extension__ (__m128i)(__v2di){ __A, 0LL };
}

/* Casts between various SP, DP, INT vector types.  Note that these do no
   conversion of values, they just change the type.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_castpd_ps(__m128d __A)
{
  return (__m128) __A;
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_castpd_si128(__m128d __A)
{
  return (__m128i) __A;
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_castps_pd(__m128 __A)
{
  return (__m128d) __A;
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_castps_si128(__m128 __A)
{
  return (__m128i) __A;
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_castsi128_ps(__m128i __A)
{
  return (__m128) __A;
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_castsi128_pd(__m128i __A)
{
  return (__m128d) __A;
}

#endif /* EMMINTRIN_H_ */