1 /* Copyright (C) 2003-2024 Free Software Foundation, Inc.
3 This file is part of GCC.
5 GCC is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 3, or (at your option)
8 any later version.
10 GCC is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
15 Under Section 7 of GPL version 3, you are granted additional
16 permissions described in the GCC Runtime Library Exception, version
17 3.1, as published by the Free Software Foundation.
19 You should have received a copy of the GNU General Public License and
20 a copy of the GCC Runtime Library Exception along with this program;
21 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
22 <http://www.gnu.org/licenses/>. */
24 /* Implemented from the specification included in the Intel C++ Compiler
25 User Guide and Reference, version 9.0. */
27 #ifndef NO_WARN_X86_INTRINSICS
28 /* This header is distributed to simplify porting x86_64 code that
29 makes explicit use of Intel intrinsics to powerpc64le.
30 It is the user's responsibility to determine if the results are
31 acceptable and make additional changes as necessary.
32 Note that much code that uses Intel intrinsics can be rewritten in
33 standard C or GNU C extensions, which are more portable and better
34 optimized across multiple targets.
36 In the specific case of X86 SSE2 (__m128i, __m128d) intrinsics,
37 the PowerPC VMX/VSX ISA is a good match for vector double SIMD
38 operations. However, scalar double operations in vector (XMM)
39 registers require the POWER8 VSX ISA (2.07) level. There are also
40 important differences in the data format and placement of double
41 scalars in the vector register.
43 For PowerISA, scalar double is held in FPRs (the leftmost
44 64 bits of the low 32 VSRs), while X86_64 SSE2 uses the rightmost
45 64 bits of the XMM register. These differences require extra steps
46 on POWER to match the SSE2 scalar double semantics.
48 Most SSE2 scalar double intrinsic operations can be performed more
49 efficiently as C language double scalar operations or optimized to
50 use vector SIMD operations. We recommend this for new applications.
52 Another difference is the format and details of the X86_64 MXCSR vs
53 the PowerISA FPSCR / VSCR registers. We recommend applications
54 replace direct access to the MXCSR with the more portable <fenv.h>
55 POSIX APIs. */
56 #error "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error."
57 #endif
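/* As a rough sketch of the <fenv.h> approach suggested above (not part
   of this header): code that manipulates the MXCSR rounding mode or
   exception status flags can usually be replaced with the standard C99
   interfaces, for example:
     #include <fenv.h>
     fesetround (FE_TOWARDZERO);     -- rounding control
     feclearexcept (FE_ALL_EXCEPT);  -- exception status flags
   The exact replacement depends on which MXCSR fields the original
   code accessed.  */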
59 #ifndef EMMINTRIN_H_
60 #define EMMINTRIN_H_
62 #include <altivec.h>
63 #include <assert.h>
65 /* We need definitions from the SSE header files. */
66 #include <xmmintrin.h>
68 /* SSE2 */
69 typedef __vector double __v2df;
70 typedef __vector long long __v2di;
71 typedef __vector unsigned long long __v2du;
72 typedef __vector int __v4si;
73 typedef __vector unsigned int __v4su;
74 typedef __vector short __v8hi;
75 typedef __vector unsigned short __v8hu;
76 typedef __vector signed char __v16qi;
77 typedef __vector unsigned char __v16qu;
79 /* The Intel API is flexible enough that we must allow aliasing with other
80 vector types, and their scalar components. */
81 typedef long long __m128i __attribute__ ((__vector_size__ (16), __may_alias__));
82 typedef double __m128d __attribute__ ((__vector_size__ (16), __may_alias__));
84 /* Unaligned version of the same types. */
85 typedef long long __m128i_u __attribute__ ((__vector_size__ (16), __may_alias__, __aligned__ (1)));
86 typedef double __m128d_u __attribute__ ((__vector_size__ (16), __may_alias__, __aligned__ (1)));
88 /* Define two value permute mask. */
89 #define _MM_SHUFFLE2(x,y) (((x) << 1) | (y))
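/* For example, _MM_SHUFFLE2 (1, 0) == 2; used as the mask operand of
   _mm_shuffle_pd below, it selects element 0 of the first argument and
   element 1 of the second.  */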
91 /* Create a vector with element 0 as F and the rest zero. */
92 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
93 _mm_set_sd (double __F)
95 return __extension__ (__m128d){ __F, 0.0 };
98 /* Create a vector with both elements equal to F. */
99 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
100 _mm_set1_pd (double __F)
102 return __extension__ (__m128d){ __F, __F };
105 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
106 _mm_set_pd1 (double __F)
108 return _mm_set1_pd (__F);
111 /* Create a vector with the lower value X and upper value W. */
112 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
113 _mm_set_pd (double __W, double __X)
115 return __extension__ (__m128d){ __X, __W };
118 /* Create a vector with the lower value W and upper value X. */
119 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
120 _mm_setr_pd (double __W, double __X)
122 return __extension__ (__m128d){ __W, __X };
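/* For example, _mm_set_pd (1.0, 2.0) and _mm_setr_pd (2.0, 1.0) both
   produce the vector { 2.0, 1.0 }, i.e. element 0 is 2.0.  */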
125 /* Create an undefined vector. */
126 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
127 _mm_undefined_pd (void)
129 __m128d __Y = __Y;
130 return __Y;
133 /* Create a vector of zeros. */
134 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
135 _mm_setzero_pd (void)
137 return (__m128d) vec_splats (0);
140 /* Sets the low DPFP value of A from the low value of B. */
141 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
142 _mm_move_sd (__m128d __A, __m128d __B)
144 __v2df __result = (__v2df) __A;
145 __result [0] = ((__v2df) __B)[0];
146 return (__m128d) __result;
149 /* Load two DPFP values from P. The address must be 16-byte aligned. */
150 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
151 _mm_load_pd (double const *__P)
153 assert(((unsigned long)__P & 0xfUL) == 0UL);
154 return ((__m128d)vec_ld(0, (__v16qu*)__P));
157 /* Load two DPFP values from P. The address need not be 16-byte aligned. */
158 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
159 _mm_loadu_pd (double const *__P)
161 return (vec_vsx_ld(0, __P));
164 /* Create a vector with both elements equal to *P. */
165 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
166 _mm_load1_pd (double const *__P)
168 return (vec_splats (*__P));
171 /* Create a vector with element 0 as *P and the rest zero. */
172 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
173 _mm_load_sd (double const *__P)
175 return _mm_set_sd (*__P);
178 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
179 _mm_load_pd1 (double const *__P)
181 return _mm_load1_pd (__P);
184 /* Load two DPFP values in reverse order. The address must be aligned. */
185 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
186 _mm_loadr_pd (double const *__P)
188 __v2df __tmp = _mm_load_pd (__P);
189 return (__m128d)vec_xxpermdi (__tmp, __tmp, 2);
192 /* Store two DPFP values. The address must be 16-byte aligned. */
193 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
194 _mm_store_pd (double *__P, __m128d __A)
196 assert(((unsigned long)__P & 0xfUL) == 0UL);
197 vec_st((__v16qu)__A, 0, (__v16qu*)__P);
200 /* Store two DPFP values. The address need not be 16-byte aligned. */
201 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
202 _mm_storeu_pd (double *__P, __m128d __A)
204 *(__m128d_u *)__P = __A;
207 /* Stores the lower DPFP value. */
208 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
209 _mm_store_sd (double *__P, __m128d __A)
211 *__P = ((__v2df)__A)[0];
214 extern __inline double __attribute__((__gnu_inline__, __always_inline__, __artificial__))
215 _mm_cvtsd_f64 (__m128d __A)
217 return ((__v2df)__A)[0];
220 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
221 _mm_storel_pd (double *__P, __m128d __A)
223 _mm_store_sd (__P, __A);
226 /* Stores the upper DPFP value. */
227 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
228 _mm_storeh_pd (double *__P, __m128d __A)
230 *__P = ((__v2df)__A)[1];
232 /* Store the lower DPFP value across two words.
233 The address must be 16-byte aligned. */
234 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
235 _mm_store1_pd (double *__P, __m128d __A)
237 _mm_store_pd (__P, vec_splat (__A, 0));
240 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
241 _mm_store_pd1 (double *__P, __m128d __A)
243 _mm_store1_pd (__P, __A);
246 /* Store two DPFP values in reverse order. The address must be aligned. */
247 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
248 _mm_storer_pd (double *__P, __m128d __A)
250 _mm_store_pd (__P, vec_xxpermdi (__A, __A, 2));
253 /* Intel intrinsic. */
254 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
255 _mm_cvtsi128_si64 (__m128i __A)
257 return ((__v2di)__A)[0];
260 /* Microsoft intrinsic. */
261 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
262 _mm_cvtsi128_si64x (__m128i __A)
264 return ((__v2di)__A)[0];
267 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
268 _mm_add_pd (__m128d __A, __m128d __B)
270 return (__m128d) ((__v2df)__A + (__v2df)__B);
273 /* Add the lower double-precision (64-bit) floating-point element in
274 a and b, store the result in the lower element of dst, and copy
275 the upper element from a to the upper element of dst. */
276 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
277 _mm_add_sd (__m128d __A, __m128d __B)
279 __A[0] = __A[0] + __B[0];
280 return (__A);
283 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
284 _mm_sub_pd (__m128d __A, __m128d __B)
286 return (__m128d) ((__v2df)__A - (__v2df)__B);
289 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
290 _mm_sub_sd (__m128d __A, __m128d __B)
292 __A[0] = __A[0] - __B[0];
293 return (__A);
296 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
297 _mm_mul_pd (__m128d __A, __m128d __B)
299 return (__m128d) ((__v2df)__A * (__v2df)__B);
302 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
303 _mm_mul_sd (__m128d __A, __m128d __B)
305 __A[0] = __A[0] * __B[0];
306 return (__A);
309 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
310 _mm_div_pd (__m128d __A, __m128d __B)
312 return (__m128d) ((__v2df)__A / (__v2df)__B);
315 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
316 _mm_div_sd (__m128d __A, __m128d __B)
318 __A[0] = __A[0] / __B[0];
319 return (__A);
322 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
323 _mm_sqrt_pd (__m128d __A)
325 return (vec_sqrt (__A));
328 /* Return pair {sqrt (B[0]), A[1]}. */
329 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
330 _mm_sqrt_sd (__m128d __A, __m128d __B)
332 __v2df __c;
333 __c = vec_sqrt ((__v2df) _mm_set1_pd (__B[0]));
334 return (__m128d) _mm_setr_pd (__c[0], __A[1]);
337 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
338 _mm_min_pd (__m128d __A, __m128d __B)
340 return (vec_min (__A, __B));
343 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
344 _mm_min_sd (__m128d __A, __m128d __B)
346 __v2df __a, __b, __c;
347 __a = vec_splats (__A[0]);
348 __b = vec_splats (__B[0]);
349 __c = vec_min (__a, __b);
350 return (__m128d) _mm_setr_pd (__c[0], __A[1]);
353 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
354 _mm_max_pd (__m128d __A, __m128d __B)
356 return (vec_max (__A, __B));
359 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
360 _mm_max_sd (__m128d __A, __m128d __B)
362 __v2df __a, __b, __c;
363 __a = vec_splats (__A[0]);
364 __b = vec_splats (__B[0]);
365 __c = vec_max (__a, __b);
366 return (__m128d) _mm_setr_pd (__c[0], __A[1]);
369 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
370 _mm_cmpeq_pd (__m128d __A, __m128d __B)
372 return ((__m128d)vec_cmpeq ((__v2df) __A, (__v2df) __B));
375 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
376 _mm_cmplt_pd (__m128d __A, __m128d __B)
378 return ((__m128d)vec_cmplt ((__v2df) __A, (__v2df) __B));
381 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
382 _mm_cmple_pd (__m128d __A, __m128d __B)
384 return ((__m128d)vec_cmple ((__v2df) __A, (__v2df) __B));
387 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
388 _mm_cmpgt_pd (__m128d __A, __m128d __B)
390 return ((__m128d)vec_cmpgt ((__v2df) __A, (__v2df) __B));
393 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
394 _mm_cmpge_pd (__m128d __A, __m128d __B)
396 return ((__m128d)vec_cmpge ((__v2df) __A,(__v2df) __B));
399 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
400 _mm_cmpneq_pd (__m128d __A, __m128d __B)
402 __v2df __temp = (__v2df) vec_cmpeq ((__v2df) __A, (__v2df)__B);
403 return ((__m128d)vec_nor (__temp, __temp));
406 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
407 _mm_cmpnlt_pd (__m128d __A, __m128d __B)
409 return ((__m128d)vec_cmpge ((__v2df) __A, (__v2df) __B));
412 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
413 _mm_cmpnle_pd (__m128d __A, __m128d __B)
415 return ((__m128d)vec_cmpgt ((__v2df) __A, (__v2df) __B));
418 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
419 _mm_cmpngt_pd (__m128d __A, __m128d __B)
421 return ((__m128d)vec_cmple ((__v2df) __A, (__v2df) __B));
424 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
425 _mm_cmpnge_pd (__m128d __A, __m128d __B)
427 return ((__m128d)vec_cmplt ((__v2df) __A, (__v2df) __B));
430 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
431 _mm_cmpord_pd (__m128d __A, __m128d __B)
433 __v2du __c, __d;
434 /* Compare against self will return false (0's) if NAN. */
435 __c = (__v2du)vec_cmpeq (__A, __A);
436 __d = (__v2du)vec_cmpeq (__B, __B);
437 /* A != NAN and B != NAN. */
438 return ((__m128d)vec_and(__c, __d));
441 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
442 _mm_cmpunord_pd (__m128d __A, __m128d __B)
444 #if _ARCH_PWR8
445 __v2du __c, __d;
446 /* Compare against self will return false (0's) if NAN. */
447 __c = (__v2du)vec_cmpeq ((__v2df)__A, (__v2df)__A);
448 __d = (__v2du)vec_cmpeq ((__v2df)__B, (__v2df)__B);
449 /* A == NAN OR B == NAN converts to:
450 NOT(A != NAN) OR NOT(B != NAN). */
451 __c = vec_nor (__c, __c);
452 return ((__m128d)vec_orc(__c, __d));
453 #else
454 __v2du __c, __d;
455 /* Compare against self will return false (0's) if NAN. */
456 __c = (__v2du)vec_cmpeq ((__v2df)__A, (__v2df)__A);
457 __d = (__v2du)vec_cmpeq ((__v2df)__B, (__v2df)__B);
458 /* Convert so that true ('1's) marks the NAN elements. */
459 __c = vec_nor (__c, __c);
460 __d = vec_nor (__d, __d);
461 return ((__m128d)vec_or(__c, __d));
462 #endif
465 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
466 _mm_cmpeq_sd(__m128d __A, __m128d __B)
468 __v2df __a, __b, __c;
469 /* PowerISA VSX does not allow partial (for just lower double)
470 results. So to ensure we don't generate spurious exceptions
471 (from the upper double values) we splat the lower double
472 before we do the operation. */
473 __a = vec_splats (__A[0]);
474 __b = vec_splats (__B[0]);
475 __c = (__v2df) vec_cmpeq(__a, __b);
476 /* Then we merge the lower double result with the original upper
477 double from __A. */
478 return (__m128d) _mm_setr_pd (__c[0], __A[1]);
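/* The remaining scalar (_sd) compares below reuse this
   splat/compare/merge pattern for the same reason.  */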
481 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
482 _mm_cmplt_sd (__m128d __A, __m128d __B)
484 __v2df __a, __b, __c;
485 __a = vec_splats (__A[0]);
486 __b = vec_splats (__B[0]);
487 __c = (__v2df) vec_cmplt(__a, __b);
488 return (__m128d) _mm_setr_pd (__c[0], __A[1]);
491 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
492 _mm_cmple_sd (__m128d __A, __m128d __B)
494 __v2df __a, __b, __c;
495 __a = vec_splats (__A[0]);
496 __b = vec_splats (__B[0]);
497 __c = (__v2df) vec_cmple(__a, __b);
498 return (__m128d) _mm_setr_pd (__c[0], __A[1]);
501 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
502 _mm_cmpgt_sd (__m128d __A, __m128d __B)
504 __v2df __a, __b, __c;
505 __a = vec_splats (__A[0]);
506 __b = vec_splats (__B[0]);
507 __c = (__v2df) vec_cmpgt(__a, __b);
508 return (__m128d) _mm_setr_pd (__c[0], __A[1]);
511 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
512 _mm_cmpge_sd (__m128d __A, __m128d __B)
514 __v2df __a, __b, __c;
515 __a = vec_splats (__A[0]);
516 __b = vec_splats (__B[0]);
517 __c = (__v2df) vec_cmpge(__a, __b);
518 return (__m128d) _mm_setr_pd (__c[0], __A[1]);
521 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
522 _mm_cmpneq_sd (__m128d __A, __m128d __B)
524 __v2df __a, __b, __c;
525 __a = vec_splats (__A[0]);
526 __b = vec_splats (__B[0]);
527 __c = (__v2df) vec_cmpeq(__a, __b);
528 __c = vec_nor (__c, __c);
529 return (__m128d) _mm_setr_pd (__c[0], __A[1]);
532 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
533 _mm_cmpnlt_sd (__m128d __A, __m128d __B)
535 __v2df __a, __b, __c;
536 __a = vec_splats (__A[0]);
537 __b = vec_splats (__B[0]);
538 /* Not less than is just greater than or equal. */
539 __c = (__v2df) vec_cmpge(__a, __b);
540 return (__m128d) _mm_setr_pd (__c[0], __A[1]);
543 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
544 _mm_cmpnle_sd (__m128d __A, __m128d __B)
546 __v2df __a, __b, __c;
547 __a = vec_splats (__A[0]);
548 __b = vec_splats (__B[0]);
549 /* Not less than or equal is just greater than. */
550 __c = (__v2df) vec_cmpgt(__a, __b);
551 return (__m128d) _mm_setr_pd (__c[0], __A[1]);
554 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
555 _mm_cmpngt_sd (__m128d __A, __m128d __B)
557 __v2df __a, __b, __c;
558 __a = vec_splats (__A[0]);
559 __b = vec_splats (__B[0]);
560 /* Not greater than is just less than or equal. */
561 __c = (__v2df) vec_cmple(__a, __b);
562 return (__m128d) _mm_setr_pd (__c[0], __A[1]);
565 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
566 _mm_cmpnge_sd (__m128d __A, __m128d __B)
568 __v2df __a, __b, __c;
569 __a = vec_splats (__A[0]);
570 __b = vec_splats (__B[0]);
571 /* Not greater than or equal is just less than. */
572 __c = (__v2df) vec_cmplt(__a, __b);
573 return (__m128d) _mm_setr_pd (__c[0], __A[1]);
576 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
577 _mm_cmpord_sd (__m128d __A, __m128d __B)
579 __v2df __r;
580 __r = (__v2df)_mm_cmpord_pd (vec_splats (__A[0]), vec_splats (__B[0]));
581 return (__m128d) _mm_setr_pd (__r[0], ((__v2df)__A)[1]);
584 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
585 _mm_cmpunord_sd (__m128d __A, __m128d __B)
587 __v2df __r;
588 __r = _mm_cmpunord_pd (vec_splats (__A[0]), vec_splats (__B[0]));
589 return (__m128d) _mm_setr_pd (__r[0], __A[1]);
592 /* FIXME
593 The _mm_comi*_sd and _mm_ucomi*_sd implementations below are
594 exactly the same because GCC for PowerPC only generates unordered
595 compares (scalar and vector).
596 Technically _mm_comieq_sd et al. should be using the ordered
597 compare and signal for QNaNs. The _mm_ucomieq_sd et al. should
598 be OK. */
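/* The difference only shows up for NaN operands: the _mm_comi* forms
   are supposed to use a signaling (ordered) compare that raises the
   invalid-operation exception for QNaNs, while the _mm_ucomi* forms
   are quiet.  Both map to plain C comparisons here.  */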
599 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
600 _mm_comieq_sd (__m128d __A, __m128d __B)
602 return (__A[0] == __B[0]);
605 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
606 _mm_comilt_sd (__m128d __A, __m128d __B)
608 return (__A[0] < __B[0]);
611 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
612 _mm_comile_sd (__m128d __A, __m128d __B)
614 return (__A[0] <= __B[0]);
617 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
618 _mm_comigt_sd (__m128d __A, __m128d __B)
620 return (__A[0] > __B[0]);
623 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
624 _mm_comige_sd (__m128d __A, __m128d __B)
626 return (__A[0] >= __B[0]);
629 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
630 _mm_comineq_sd (__m128d __A, __m128d __B)
632 return (__A[0] != __B[0]);
635 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
636 _mm_ucomieq_sd (__m128d __A, __m128d __B)
638 return (__A[0] == __B[0]);
641 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
642 _mm_ucomilt_sd (__m128d __A, __m128d __B)
644 return (__A[0] < __B[0]);
647 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
648 _mm_ucomile_sd (__m128d __A, __m128d __B)
650 return (__A[0] <= __B[0]);
653 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
654 _mm_ucomigt_sd (__m128d __A, __m128d __B)
656 return (__A[0] > __B[0]);
659 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
660 _mm_ucomige_sd (__m128d __A, __m128d __B)
662 return (__A[0] >= __B[0]);
665 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
666 _mm_ucomineq_sd (__m128d __A, __m128d __B)
668 return (__A[0] != __B[0]);
671 /* Create a vector of Qi, where i is the element number. */
672 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
673 _mm_set_epi64x (long long __q1, long long __q0)
675 return __extension__ (__m128i)(__v2di){ __q0, __q1 };
678 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
679 _mm_set_epi64 (__m64 __q1, __m64 __q0)
681 return _mm_set_epi64x ((long long)__q1, (long long)__q0);
684 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
685 _mm_set_epi32 (int __q3, int __q2, int __q1, int __q0)
687 return __extension__ (__m128i)(__v4si){ __q0, __q1, __q2, __q3 };
690 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
691 _mm_set_epi16 (short __q7, short __q6, short __q5, short __q4,
692 short __q3, short __q2, short __q1, short __q0)
694 return __extension__ (__m128i)(__v8hi){
695 __q0, __q1, __q2, __q3, __q4, __q5, __q6, __q7 };
698 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
699 _mm_set_epi8 (char __q15, char __q14, char __q13, char __q12,
700 char __q11, char __q10, char __q09, char __q08,
701 char __q07, char __q06, char __q05, char __q04,
702 char __q03, char __q02, char __q01, char __q00)
704 return __extension__ (__m128i)(__v16qi){
705 __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07,
706 __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15
710 /* Set all of the elements of the vector to A. */
711 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
712 _mm_set1_epi64x (long long __A)
714 return _mm_set_epi64x (__A, __A);
717 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
718 _mm_set1_epi64 (__m64 __A)
720 return _mm_set_epi64 (__A, __A);
723 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
724 _mm_set1_epi32 (int __A)
726 return _mm_set_epi32 (__A, __A, __A, __A);
729 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
730 _mm_set1_epi16 (short __A)
732 return _mm_set_epi16 (__A, __A, __A, __A, __A, __A, __A, __A);
735 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
736 _mm_set1_epi8 (char __A)
738 return _mm_set_epi8 (__A, __A, __A, __A, __A, __A, __A, __A,
739 __A, __A, __A, __A, __A, __A, __A, __A);
742 /* Create a vector of Qi, where i is the element number.
743 The parameter order is reversed from the _mm_set_epi* functions. */
744 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
745 _mm_setr_epi64 (__m64 __q0, __m64 __q1)
747 return _mm_set_epi64 (__q1, __q0);
750 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
751 _mm_setr_epi32 (int __q0, int __q1, int __q2, int __q3)
753 return _mm_set_epi32 (__q3, __q2, __q1, __q0);
756 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
757 _mm_setr_epi16 (short __q0, short __q1, short __q2, short __q3,
758 short __q4, short __q5, short __q6, short __q7)
760 return _mm_set_epi16 (__q7, __q6, __q5, __q4, __q3, __q2, __q1, __q0);
763 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
764 _mm_setr_epi8 (char __q00, char __q01, char __q02, char __q03,
765 char __q04, char __q05, char __q06, char __q07,
766 char __q08, char __q09, char __q10, char __q11,
767 char __q12, char __q13, char __q14, char __q15)
769 return _mm_set_epi8 (__q15, __q14, __q13, __q12, __q11, __q10, __q09, __q08,
770 __q07, __q06, __q05, __q04, __q03, __q02, __q01, __q00);
773 /* Create a vector with element 0 as *P and the rest zero. */
774 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
775 _mm_load_si128 (__m128i const *__P)
777 return *__P;
780 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
781 _mm_loadu_si128 (__m128i_u const *__P)
783 return (__m128i) (vec_vsx_ld(0, (signed int const *)__P));
786 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
787 _mm_loadl_epi64 (__m128i_u const *__P)
789 return _mm_set_epi64 ((__m64)0LL, *(__m64 *)__P);
792 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
793 _mm_store_si128 (__m128i *__P, __m128i __B)
795 assert(((unsigned long )__P & 0xfUL) == 0UL);
796 vec_st ((__v16qu) __B, 0, (__v16qu*)__P);
799 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
800 _mm_storeu_si128 (__m128i_u *__P, __m128i __B)
802 *__P = __B;
805 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
806 _mm_storel_epi64 (__m128i_u *__P, __m128i __B)
808 *(long long *)__P = ((__v2di)__B)[0];
811 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
812 _mm_movepi64_pi64 (__m128i_u __B)
814 return (__m64) ((__v2di)__B)[0];
817 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
818 _mm_movpi64_epi64 (__m64 __A)
820 return _mm_set_epi64 ((__m64)0LL, __A);
823 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
824 _mm_move_epi64 (__m128i __A)
826 return _mm_set_epi64 ((__m64)0LL, (__m64)__A[0]);
829 /* Create an undefined vector. */
830 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
831 _mm_undefined_si128 (void)
833 __m128i __Y = __Y;
834 return __Y;
837 /* Create a vector of zeros. */
838 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
839 _mm_setzero_si128 (void)
841 return __extension__ (__m128i)(__v4si){ 0, 0, 0, 0 };
844 #ifdef _ARCH_PWR8
845 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
846 _mm_cvtepi32_pd (__m128i __A)
848 __v2di __val;
849 /* For LE we need Vector Unpack Low Signed Word, which
850 vec_unpackh generates here. */
851 __val = (__v2di)vec_unpackh ((__v4si)__A);
853 return (__m128d)vec_ctf (__val, 0);
855 #endif
857 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
858 _mm_cvtepi32_ps (__m128i __A)
860 return ((__m128)vec_ctf((__v4si)__A, 0));
863 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
864 _mm_cvtpd_epi32 (__m128d __A)
866 __v2df __rounded = vec_rint (__A);
867 __v4si __result, __temp;
868 const __v4si __vzero =
869 { 0, 0, 0, 0 };
871 /* VSX Vector truncate Double-Precision to integer and Convert to
872 Signed Integer Word format with Saturate. */
873 __asm__(
874 "xvcvdpsxws %x0,%x1"
875 : "=wa" (__temp)
876 : "wa" (__rounded)
877 : );
879 #ifdef _ARCH_PWR8
880 #ifdef __LITTLE_ENDIAN__
881 __temp = vec_mergeo (__temp, __temp);
882 #else
883 __temp = vec_mergee (__temp, __temp);
884 #endif
885 __result = (__v4si) vec_vpkudum ((__vector long long) __temp,
886 (__vector long long) __vzero);
887 #else
889 const __v16qu __pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
890 0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f };
891 __result = (__v4si) vec_perm ((__v16qu) __temp, (__v16qu) __vzero, __pkperm);
893 #endif
894 return (__m128i) __result;
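/* For example, under the default round-to-nearest-even mode,
   _mm_cvtpd_epi32 of { 1.5, 2.5 } yields { 2, 2, 0, 0 } (the upper two
   int elements are zeroed, as on x86).  */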
897 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
898 _mm_cvtpd_pi32 (__m128d __A)
900 __m128i __result = _mm_cvtpd_epi32(__A);
902 return (__m64) __result[0];
905 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
906 _mm_cvtpd_ps (__m128d __A)
908 __v4sf __result;
909 __v4si __temp;
910 const __v4si __vzero = { 0, 0, 0, 0 };
912 __asm__(
913 "xvcvdpsp %x0,%x1"
914 : "=wa" (__temp)
915 : "wa" (__A)
916 : );
918 #ifdef _ARCH_PWR8
919 #ifdef __LITTLE_ENDIAN__
920 __temp = vec_mergeo (__temp, __temp);
921 #else
922 __temp = vec_mergee (__temp, __temp);
923 #endif
924 __result = (__v4sf) vec_vpkudum ((__vector long long) __temp,
925 (__vector long long) __vzero);
926 #else
928 const __v16qu __pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
929 0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f };
930 __result = (__v4sf) vec_perm ((__v16qu) __temp, (__v16qu) __vzero, __pkperm);
932 #endif
933 return ((__m128)__result);
936 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
937 _mm_cvttpd_epi32 (__m128d __A)
939 __v4si __result;
940 __v4si __temp;
941 const __v4si __vzero = { 0, 0, 0, 0 };
943 /* VSX Vector truncate Double-Precision to integer and Convert to
944 Signed Integer Word format with Saturate. */
945 __asm__(
946 "xvcvdpsxws %x0,%x1"
947 : "=wa" (__temp)
948 : "wa" (__A)
949 : );
951 #ifdef _ARCH_PWR8
952 #ifdef __LITTLE_ENDIAN__
953 __temp = vec_mergeo (__temp, __temp);
954 #else
955 __temp = vec_mergee (__temp, __temp);
956 #endif
957 __result = (__v4si) vec_vpkudum ((__vector long long) __temp,
958 (__vector long long) __vzero);
959 #else
961 const __v16qu __pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
962 0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f };
963 __result = (__v4si) vec_perm ((__v16qu) __temp, (__v16qu) __vzero, __pkperm);
965 #endif
967 return ((__m128i) __result);
970 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
971 _mm_cvttpd_pi32 (__m128d __A)
973 __m128i __result = _mm_cvttpd_epi32 (__A);
975 return (__m64) __result[0];
978 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
979 _mm_cvtsi128_si32 (__m128i __A)
981 return ((__v4si)__A)[0];
984 #ifdef _ARCH_PWR8
985 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
986 _mm_cvtpi32_pd (__m64 __A)
988 __v4si __temp;
989 __v2di __tmp2;
990 __v2df __result;
992 __temp = (__v4si)vec_splats (__A);
993 __tmp2 = (__v2di)vec_unpackl (__temp);
994 __result = vec_ctf ((__vector signed long long) __tmp2, 0);
995 return (__m128d)__result;
997 #endif
999 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1000 _mm_cvtps_epi32 (__m128 __A)
1002 __v4sf __rounded;
1003 __v4si __result;
1005 __rounded = vec_rint((__v4sf) __A);
1006 __result = vec_cts (__rounded, 0);
1007 return (__m128i) __result;
1010 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1011 _mm_cvttps_epi32 (__m128 __A)
1013 __v4si __result;
1015 __result = vec_cts ((__v4sf) __A, 0);
1016 return (__m128i) __result;
1019 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1020 _mm_cvtps_pd (__m128 __A)
1022 /* Check if vec_doubleh is defined by <altivec.h>. If so use that. */
1023 #ifdef vec_doubleh
1024 return (__m128d) vec_doubleh ((__v4sf)__A);
1025 #else
1026 /* Otherwise the compiler is older and we need to generate the
1027 equivalent code ourselves. */
1028 __v4sf __a = (__v4sf)__A;
1029 __v4sf __temp;
1030 __v2df __result;
1031 #ifdef __LITTLE_ENDIAN__
1032 /* The input float values are in elements {[0], [1]} but the convert
1033 instruction needs them in elements {[1], [3]}, so we use two
1034 shift left double vector word immediates to get the elements
1035 lined up. */
1036 __temp = __builtin_vsx_xxsldwi (__a, __a, 3);
1037 __temp = __builtin_vsx_xxsldwi (__a, __temp, 2);
1038 #else
1039 /* The input float values are in elements {[0], [1]} but the convert
1040 instruction needs them in elements {[0], [2]}, so we duplicate
1041 each word with a merge-high to get the elements
1042 lined up. */
1043 __temp = vec_vmrghw (__a, __a);
1044 #endif
1045 __asm__(
1046 " xvcvspdp %x0,%x1"
1047 : "=wa" (__result)
1048 : "wa" (__temp)
1049 : );
1050 return (__m128d) __result;
1051 #endif
1054 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1055 _mm_cvtsd_si32 (__m128d __A)
1057 __v2df __rounded = vec_rint((__v2df) __A);
1058 int __result = ((__v2df)__rounded)[0];
1060 return __result;
1062 /* Intel intrinsic. */
1063 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1064 _mm_cvtsd_si64 (__m128d __A)
1066 __v2df __rounded = vec_rint ((__v2df) __A );
1067 long long __result = ((__v2df) __rounded)[0];
1069 return __result;
1072 /* Microsoft intrinsic. */
1073 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1074 _mm_cvtsd_si64x (__m128d __A)
1076 return _mm_cvtsd_si64 ((__v2df)__A);
1079 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1080 _mm_cvttsd_si32 (__m128d __A)
1082 int __result = ((__v2df)__A)[0];
1084 return __result;
1087 /* Intel intrinsic. */
1088 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1089 _mm_cvttsd_si64 (__m128d __A)
1091 long long __result = ((__v2df)__A)[0];
1093 return __result;
1096 /* Microsoft intrinsic. */
1097 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1098 _mm_cvttsd_si64x (__m128d __A)
1100 return _mm_cvttsd_si64 (__A);
1103 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1104 _mm_cvtsd_ss (__m128 __A, __m128d __B)
1106 __v4sf __result = (__v4sf)__A;
1108 #ifdef __LITTLE_ENDIAN__
1109 __v4sf __temp_s;
1110 /* Copy double element[0] to element [1] for conversion. */
1111 __v2df __temp_b = vec_splat((__v2df)__B, 0);
1113 /* Pre-rotate __A left 3 (logically right 1) elements. */
1114 __result = __builtin_vsx_xxsldwi (__result, __result, 3);
1115 /* Convert double to single float scalar in a vector. */
1116 __asm__(
1117 "xscvdpsp %x0,%x1"
1118 : "=wa" (__temp_s)
1119 : "wa" (__temp_b)
1120 : );
1121 /* Shift the resulting scalar into vector element [0]. */
1122 __result = __builtin_vsx_xxsldwi (__result, __temp_s, 1);
1123 #else
1124 __result [0] = ((__v2df)__B)[0];
1125 #endif
1126 return (__m128) __result;
1129 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1130 _mm_cvtsi32_sd (__m128d __A, int __B)
1132 __v2df __result = (__v2df)__A;
1133 double __db = __B;
1134 __result [0] = __db;
1135 return (__m128d)__result;
1138 /* Intel intrinsic. */
1139 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1140 _mm_cvtsi64_sd (__m128d __A, long long __B)
1142 __v2df __result = (__v2df)__A;
1143 double __db = __B;
1144 __result [0] = __db;
1145 return (__m128d)__result;
1148 /* Microsoft intrinsic. */
1149 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1150 _mm_cvtsi64x_sd (__m128d __A, long long __B)
1152 return _mm_cvtsi64_sd (__A, __B);
1155 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1156 _mm_cvtss_sd (__m128d __A, __m128 __B)
1158 #ifdef __LITTLE_ENDIAN__
1159 /* Use splat to move element [0] into position for the convert. */
1160 __v4sf __temp = vec_splat ((__v4sf)__B, 0);
1161 __v2df __res;
1162 /* Convert single float scalar to double in a vector. */
1163 __asm__(
1164 "xscvspdp %x0,%x1"
1165 : "=wa" (__res)
1166 : "wa" (__temp)
1167 : );
1168 return (__m128d) vec_mergel (__res, (__v2df)__A);
1169 #else
1170 __v2df __res = (__v2df)__A;
1171 __res [0] = ((__v4sf)__B) [0];
1172 return (__m128d) __res;
1173 #endif
1176 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1177 _mm_shuffle_pd(__m128d __A, __m128d __B, const int __mask)
1179 __vector double __result;
1180 const int __litmsk = __mask & 0x3;
1182 if (__litmsk == 0)
1183 __result = vec_mergeh (__A, __B);
1184 #if __GNUC__ < 6
1185 else if (__litmsk == 1)
1186 __result = vec_xxpermdi (__B, __A, 2);
1187 else if (__litmsk == 2)
1188 __result = vec_xxpermdi (__B, __A, 1);
1189 #else
1190 else if (__litmsk == 1)
1191 __result = vec_xxpermdi (__A, __B, 2);
1192 else if (__litmsk == 2)
1193 __result = vec_xxpermdi (__A, __B, 1);
1194 #endif
1195 else
1196 __result = vec_mergel (__A, __B);
1198 return __result;
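/* For example, _mm_shuffle_pd (__a, __b, _MM_SHUFFLE2 (1, 0)) returns
   { __a[0], __b[1] }, matching the SSE2 SHUFPD selection rules.  */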
1201 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1202 _mm_unpackhi_pd (__m128d __A, __m128d __B)
1204 return (__m128d) vec_mergel ((__v2df)__A, (__v2df)__B);
1207 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1208 _mm_unpacklo_pd (__m128d __A, __m128d __B)
1210 return (__m128d) vec_mergeh ((__v2df)__A, (__v2df)__B);
1213 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1214 _mm_loadh_pd (__m128d __A, double const *__B)
1216 __v2df __result = (__v2df)__A;
1217 __result [1] = *__B;
1218 return (__m128d)__result;
1221 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1222 _mm_loadl_pd (__m128d __A, double const *__B)
1224 __v2df __result = (__v2df)__A;
1225 __result [0] = *__B;
1226 return (__m128d)__result;
1229 #ifdef _ARCH_PWR8
1230 /* Intrinsic functions that require PowerISA 2.07 minimum. */
1232 /* Creates a 2-bit mask from the most significant bits of the DPFP values. */
1233 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1234 _mm_movemask_pd (__m128d __A)
1236 #ifdef _ARCH_PWR10
1237 return vec_extractm ((__v2du) __A);
1238 #else
1239 __vector unsigned long long __result;
1240 static const __vector unsigned int __perm_mask =
1242 #ifdef __LITTLE_ENDIAN__
1243 0x80800040, 0x80808080, 0x80808080, 0x80808080
1244 #else
1245 0x80808080, 0x80808080, 0x80808080, 0x80804000
1246 #endif
1249 __result = ((__vector unsigned long long)
1250 vec_vbpermq ((__vector unsigned char) __A,
1251 (__vector unsigned char) __perm_mask));
1253 #ifdef __LITTLE_ENDIAN__
1254 return __result[1];
1255 #else
1256 return __result[0];
1257 #endif
1258 #endif /* !_ARCH_PWR10 */
1260 #endif /* _ARCH_PWR8 */
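/* For example, _mm_movemask_pd of a vector whose element 0 is negative
   and element 1 is non-negative returns 0x1; bit i of the result is the
   sign bit of element i.  */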
1262 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1263 _mm_packs_epi16 (__m128i __A, __m128i __B)
1265 return (__m128i) vec_packs ((__v8hi) __A, (__v8hi)__B);
1268 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1269 _mm_packs_epi32 (__m128i __A, __m128i __B)
1271 return (__m128i) vec_packs ((__v4si)__A, (__v4si)__B);
1274 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1275 _mm_packus_epi16 (__m128i __A, __m128i __B)
1277 return (__m128i) vec_packsu ((__v8hi) __A, (__v8hi)__B);
1280 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1281 _mm_unpackhi_epi8 (__m128i __A, __m128i __B)
1283 return (__m128i) vec_mergel ((__v16qu)__A, (__v16qu)__B);
1286 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1287 _mm_unpackhi_epi16 (__m128i __A, __m128i __B)
1289 return (__m128i) vec_mergel ((__v8hu)__A, (__v8hu)__B);
1292 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1293 _mm_unpackhi_epi32 (__m128i __A, __m128i __B)
1295 return (__m128i) vec_mergel ((__v4su)__A, (__v4su)__B);
1298 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1299 _mm_unpackhi_epi64 (__m128i __A, __m128i __B)
1301 return (__m128i) vec_mergel ((__vector long long) __A,
1302 (__vector long long) __B);
1305 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1306 _mm_unpacklo_epi8 (__m128i __A, __m128i __B)
1308 return (__m128i) vec_mergeh ((__v16qu)__A, (__v16qu)__B);
1311 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1312 _mm_unpacklo_epi16 (__m128i __A, __m128i __B)
1314 return (__m128i) vec_mergeh ((__v8hi)__A, (__v8hi)__B);
1317 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1318 _mm_unpacklo_epi32 (__m128i __A, __m128i __B)
1320 return (__m128i) vec_mergeh ((__v4si)__A, (__v4si)__B);
1323 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1324 _mm_unpacklo_epi64 (__m128i __A, __m128i __B)
1326 return (__m128i) vec_mergeh ((__vector long long) __A,
1327 (__vector long long) __B);
1330 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1331 _mm_add_epi8 (__m128i __A, __m128i __B)
1333 return (__m128i) ((__v16qu)__A + (__v16qu)__B);
1336 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1337 _mm_add_epi16 (__m128i __A, __m128i __B)
1339 return (__m128i) ((__v8hu)__A + (__v8hu)__B);
1342 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1343 _mm_add_epi32 (__m128i __A, __m128i __B)
1345 return (__m128i) ((__v4su)__A + (__v4su)__B);
1348 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1349 _mm_add_epi64 (__m128i __A, __m128i __B)
1351 return (__m128i) ((__v2du)__A + (__v2du)__B);
1354 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1355 _mm_adds_epi8 (__m128i __A, __m128i __B)
1357 return (__m128i) vec_adds ((__v16qi)__A, (__v16qi)__B);
1360 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1361 _mm_adds_epi16 (__m128i __A, __m128i __B)
1363 return (__m128i) vec_adds ((__v8hi)__A, (__v8hi)__B);
1366 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1367 _mm_adds_epu8 (__m128i __A, __m128i __B)
1369 return (__m128i) vec_adds ((__v16qu)__A, (__v16qu)__B);
1372 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1373 _mm_adds_epu16 (__m128i __A, __m128i __B)
1375 return (__m128i) vec_adds ((__v8hu)__A, (__v8hu)__B);
1378 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1379 _mm_sub_epi8 (__m128i __A, __m128i __B)
1381 return (__m128i) ((__v16qu)__A - (__v16qu)__B);
1384 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1385 _mm_sub_epi16 (__m128i __A, __m128i __B)
1387 return (__m128i) ((__v8hu)__A - (__v8hu)__B);
1390 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1391 _mm_sub_epi32 (__m128i __A, __m128i __B)
1393 return (__m128i) ((__v4su)__A - (__v4su)__B);
1396 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1397 _mm_sub_epi64 (__m128i __A, __m128i __B)
1399 return (__m128i) ((__v2du)__A - (__v2du)__B);
1402 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1403 _mm_subs_epi8 (__m128i __A, __m128i __B)
1405 return (__m128i) vec_subs ((__v16qi)__A, (__v16qi)__B);
1408 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1409 _mm_subs_epi16 (__m128i __A, __m128i __B)
1411 return (__m128i) vec_subs ((__v8hi)__A, (__v8hi)__B);
1414 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1415 _mm_subs_epu8 (__m128i __A, __m128i __B)
1417 return (__m128i) vec_subs ((__v16qu)__A, (__v16qu)__B);
1420 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1421 _mm_subs_epu16 (__m128i __A, __m128i __B)
1423 return (__m128i) vec_subs ((__v8hu)__A, (__v8hu)__B);
1426 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1427 _mm_madd_epi16 (__m128i __A, __m128i __B)
1429 __vector signed int __zero = {0, 0, 0, 0};
1431 return (__m128i) vec_vmsumshm ((__v8hi)__A, (__v8hi)__B, __zero);
1434 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1435 _mm_mulhi_epi16 (__m128i __A, __m128i __B)
1437 __vector signed int __w0, __w1;
1439 __vector unsigned char __xform1 = {
1440 #ifdef __LITTLE_ENDIAN__
1441 0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17,
1442 0x0A, 0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
1443 #else
1444 0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15,
1445 0x08, 0x09, 0x18, 0x19, 0x0C, 0x0D, 0x1C, 0x1D
1446 #endif
1449 __w0 = vec_vmulesh ((__v8hi)__A, (__v8hi)__B);
1450 __w1 = vec_vmulosh ((__v8hi)__A, (__v8hi)__B);
1451 return (__m128i) vec_perm (__w0, __w1, __xform1);
1454 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1455 _mm_mullo_epi16 (__m128i __A, __m128i __B)
1457 return (__m128i) ((__v8hi)__A * (__v8hi)__B);
1460 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1461 _mm_mul_su32 (__m64 __A, __m64 __B)
1463 unsigned int __a = __A;
1464 unsigned int __b = __B;
1466 return ((__m64)__a * (__m64)__b);
1469 #ifdef _ARCH_PWR8
1470 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1471 _mm_mul_epu32 (__m128i __A, __m128i __B)
1473 #if __GNUC__ < 8
1474 __v2du __result;
1476 #ifdef __LITTLE_ENDIAN__
1477 /* VMX Vector Multiply Odd Unsigned Word. */
1478 __asm__(
1479 "vmulouw %0,%1,%2"
1480 : "=v" (__result)
1481 : "v" (__A), "v" (__B)
1482 : );
1483 #else
1484 /* VMX Vector Multiply Even Unsigned Word. */
1485 __asm__(
1486 "vmuleuw %0,%1,%2"
1487 : "=v" (__result)
1488 : "v" (__A), "v" (__B)
1489 : );
1490 #endif
1491 return (__m128i) __result;
1492 #else
1493 return (__m128i) vec_mule ((__v4su)__A, (__v4su)__B);
1494 #endif
1496 #endif
1498 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1499 _mm_slli_epi16 (__m128i __A, int __B)
1501 __v8hu __lshift;
1502 __v8hi __result = { 0, 0, 0, 0, 0, 0, 0, 0 };
1504 if (__B >= 0 && __B < 16)
1506 if (__builtin_constant_p(__B))
1507 __lshift = (__v8hu) vec_splat_s16(__B);
1508 else
1509 __lshift = vec_splats ((unsigned short) __B);
1511 __result = vec_sl ((__v8hi) __A, __lshift);
1514 return (__m128i) __result;
1517 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1518 _mm_slli_epi32 (__m128i __A, int __B)
1520 __v4su __lshift;
1521 __v4si __result = { 0, 0, 0, 0 };
1523 if (__B >= 0 && __B < 32)
1525 if (__builtin_constant_p(__B) && __B < 16)
1526 __lshift = (__v4su) vec_splat_s32(__B);
1527 else
1528 __lshift = vec_splats ((unsigned int) __B);
1530 __result = vec_sl ((__v4si) __A, __lshift);
1533 return (__m128i) __result;
1536 #ifdef _ARCH_PWR8
1537 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1538 _mm_slli_epi64 (__m128i __A, int __B)
1540 __v2du __lshift;
1541 __v2di __result = { 0, 0 };
1543 if (__B >= 0 && __B < 64)
1545 if (__builtin_constant_p(__B) && __B < 16)
1546 __lshift = (__v2du) vec_splat_s32(__B);
1547 else
1548 __lshift = (__v2du) vec_splats ((unsigned int) __B);
1550 __result = vec_sl ((__v2di) __A, __lshift);
1553 return (__m128i) __result;
1555 #endif
1557 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1558 _mm_srai_epi16 (__m128i __A, int __B)
1560 __v8hu __rshift = { 15, 15, 15, 15, 15, 15, 15, 15 };
1561 __v8hi __result;
1563 if (__B < 16)
1565 if (__builtin_constant_p(__B))
1566 __rshift = (__v8hu) vec_splat_s16(__B);
1567 else
1568 __rshift = vec_splats ((unsigned short) __B);
1570 __result = vec_sra ((__v8hi) __A, __rshift);
1572 return (__m128i) __result;
1575 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1576 _mm_srai_epi32 (__m128i __A, int __B)
1578 __v4su __rshift = { 31, 31, 31, 31 };
1579 __v4si __result;
1581 if (__B < 32)
1583 if (__builtin_constant_p(__B))
1585 if (__B < 16)
1586 __rshift = (__v4su) vec_splat_s32(__B);
1587 else
1588 __rshift = (__v4su) vec_splats((unsigned int)__B);
1590 else
1591 __rshift = vec_splats ((unsigned int) __B);
1593 __result = vec_sra ((__v4si) __A, __rshift);
1595 return (__m128i) __result;
1598 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1599 _mm_bslli_si128 (__m128i __A, const int __N)
1601 __v16qu __result;
1602 const __v16qu __zeros = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
1604 if (__N == 0)
1605 return __A;
1606 else if (__N > 0 && __N < 16)
1607 #ifdef __LITTLE_ENDIAN__
1608 __result = vec_sld ((__v16qu) __A, __zeros, __N);
1609 #else
1610 __result = vec_sld (__zeros, (__v16qu) __A, (16 - __N));
1611 #endif
1612 else
1613 __result = __zeros;
1615 return (__m128i) __result;
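/* Note that __N is a byte count, e.g. _mm_bslli_si128 (__A, 4) shifts
   __A left by 32 bits; counts of 16 or more produce a zero vector.  */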
1618 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1619 _mm_bsrli_si128 (__m128i __A, const int __N)
1621 __v16qu __result;
1622 const __v16qu __zeros = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
1624 if (__N == 0)
1625 return __A;
1626 else if (__N > 0 && __N < 16)
1627 #ifdef __LITTLE_ENDIAN__
1628 if (__builtin_constant_p(__N))
1629 /* Use Vector Shift Left Double by Octet Immediate here;
1630 the immediate form avoids loading the __N * 8 shift
1631 count into a separate VR. */
1632 __result = vec_sld (__zeros, (__v16qu) __A, (16 - __N));
1633 else
1634 #endif
1636 __v16qu __shift = vec_splats((unsigned char)(__N*8));
1637 #ifdef __LITTLE_ENDIAN__
1638 __result = vec_sro ((__v16qu)__A, __shift);
1639 #else
1640 __result = vec_slo ((__v16qu)__A, __shift);
1641 #endif
1643 else
1644 __result = __zeros;
1646 return (__m128i) __result;
1649 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1650 _mm_srli_si128 (__m128i __A, const int __N)
1652 return _mm_bsrli_si128 (__A, __N);
1655 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1656 _mm_slli_si128 (__m128i __A, const int __N)
1658 return _mm_bslli_si128 (__A, __N);
1661 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1663 _mm_srli_epi16 (__m128i __A, int __B)
1665 __v8hu __rshift;
1666 __v8hi __result = { 0, 0, 0, 0, 0, 0, 0, 0 };
1668 if (__B < 16)
1670 if (__builtin_constant_p(__B))
1671 __rshift = (__v8hu) vec_splat_s16(__B);
1672 else
1673 __rshift = vec_splats ((unsigned short) __B);
1675 __result = vec_sr ((__v8hi) __A, __rshift);
1678 return (__m128i) __result;
1681 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1682 _mm_srli_epi32 (__m128i __A, int __B)
1684 __v4su __rshift;
1685 __v4si __result = { 0, 0, 0, 0 };
1687 if (__B < 32)
1689 if (__builtin_constant_p(__B))
1691 if (__B < 16)
1692 __rshift = (__v4su) vec_splat_s32(__B);
1693 else
1694 __rshift = (__v4su) vec_splats((unsigned int)__B);
1696 else
1697 __rshift = vec_splats ((unsigned int) __B);
1699 __result = vec_sr ((__v4si) __A, __rshift);
1702 return (__m128i) __result;
1705 #ifdef _ARCH_PWR8
1706 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1707 _mm_srli_epi64 (__m128i __A, int __B)
1709 __v2du __rshift;
1710 __v2di __result = { 0, 0 };
1712 if (__B < 64)
1714 if (__builtin_constant_p(__B))
1716 if (__B < 16)
1717 __rshift = (__v2du) vec_splat_s32(__B);
1718 else
1719 __rshift = (__v2du) vec_splats((unsigned long long)__B);
1721 else
1722 __rshift = (__v2du) vec_splats ((unsigned int) __B);
1724 __result = vec_sr ((__v2di) __A, __rshift);
1727 return (__m128i) __result;
1729 #endif
1731 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1732 _mm_sll_epi16 (__m128i __A, __m128i __B)
1734 __v8hu __lshift;
1735 __vector __bool short __shmask;
1736 const __v8hu __shmax = { 15, 15, 15, 15, 15, 15, 15, 15 };
1737 __v8hu __result;
1739 #ifdef __LITTLE_ENDIAN__
1740 __lshift = vec_splat ((__v8hu) __B, 0);
1741 #else
1742 __lshift = vec_splat ((__v8hu) __B, 3);
1743 #endif
1744 __shmask = vec_cmple (__lshift, __shmax);
1745 __result = vec_sl ((__v8hu) __A, __lshift);
1746 __result = vec_sel ((__v8hu) __shmask, __result, __shmask);
1748 return (__m128i) __result;
1751 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1752 _mm_sll_epi32 (__m128i __A, __m128i __B)
1754 __v4su __lshift;
1755 __vector __bool int __shmask;
1756 const __v4su __shmax = { 32, 32, 32, 32 };
1757 __v4su __result;
1758 #ifdef __LITTLE_ENDIAN__
1759 __lshift = vec_splat ((__v4su) __B, 0);
1760 #else
1761 __lshift = vec_splat ((__v4su) __B, 1);
1762 #endif
1763 __shmask = vec_cmplt (__lshift, __shmax);
1764 __result = vec_sl ((__v4su) __A, __lshift);
1765 __result = vec_sel ((__v4su) __shmask, __result, __shmask);
1767 return (__m128i) __result;
1770 #ifdef _ARCH_PWR8
1771 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1772 _mm_sll_epi64 (__m128i __A, __m128i __B)
1774 __v2du __lshift;
1775 __vector __bool long long __shmask;
1776 const __v2du __shmax = { 64, 64 };
1777 __v2du __result;
1779 __lshift = vec_splat ((__v2du) __B, 0);
1780 __shmask = vec_cmplt (__lshift, __shmax);
1781 __result = vec_sl ((__v2du) __A, __lshift);
1782 __result = vec_sel ((__v2du) __shmask, __result, __shmask);
1784 return (__m128i) __result;
1786 #endif
1788 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1789 _mm_sra_epi16 (__m128i __A, __m128i __B)
1791 const __v8hu __rshmax = { 15, 15, 15, 15, 15, 15, 15, 15 };
1792 __v8hu __rshift;
1793 __v8hi __result;
1795 #ifdef __LITTLE_ENDIAN__
1796 __rshift = vec_splat ((__v8hu)__B, 0);
1797 #else
1798 __rshift = vec_splat ((__v8hu)__B, 3);
1799 #endif
1800 __rshift = vec_min (__rshift, __rshmax);
1801 __result = vec_sra ((__v8hi) __A, __rshift);
1803 return (__m128i) __result;
1806 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1807 _mm_sra_epi32 (__m128i __A, __m128i __B)
1809 const __v4su __rshmax = { 31, 31, 31, 31 };
1810 __v4su __rshift;
1811 __v4si __result;
1813 #ifdef __LITTLE_ENDIAN__
1814 __rshift = vec_splat ((__v4su)__B, 0);
1815 #else
1816 __rshift = vec_splat ((__v4su)__B, 1);
1817 #endif
1818 __rshift = vec_min (__rshift, __rshmax);
1819 __result = vec_sra ((__v4si) __A, __rshift);
1821 return (__m128i) __result;

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srl_epi16 (__m128i __A, __m128i __B)
{
  __v8hu __rshift;
  __vector __bool short __shmask;
  const __v8hu __shmax = { 15, 15, 15, 15, 15, 15, 15, 15 };
  __v8hu __result;

#ifdef __LITTLE_ENDIAN__
  __rshift = vec_splat ((__v8hu) __B, 0);
#else
  __rshift = vec_splat ((__v8hu) __B, 3);
#endif
  __shmask = vec_cmple (__rshift, __shmax);
  __result = vec_sr ((__v8hu) __A, __rshift);
  __result = vec_sel ((__v8hu) __shmask, __result, __shmask);

  return (__m128i) __result;
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srl_epi32 (__m128i __A, __m128i __B)
{
  __v4su __rshift;
  __vector __bool int __shmask;
  const __v4su __shmax = { 32, 32, 32, 32 };
  __v4su __result;

#ifdef __LITTLE_ENDIAN__
  __rshift = vec_splat ((__v4su) __B, 0);
#else
  __rshift = vec_splat ((__v4su) __B, 1);
#endif
  __shmask = vec_cmplt (__rshift, __shmax);
  __result = vec_sr ((__v4su) __A, __rshift);
  __result = vec_sel ((__v4su) __shmask, __result, __shmask);

  return (__m128i) __result;
}

#ifdef _ARCH_PWR8
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srl_epi64 (__m128i __A, __m128i __B)
{
  __v2du __rshift;
  __vector __bool long long __shmask;
  const __v2du __shmax = { 64, 64 };
  __v2du __result;

  __rshift = vec_splat ((__v2du) __B, 0);
  __shmask = vec_cmplt (__rshift, __shmax);
  __result = vec_sr ((__v2du) __A, __rshift);
  __result = vec_sel ((__v2du) __shmask, __result, __shmask);

  return (__m128i) __result;
}
#endif
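
/* Illustrative usage sketch (hypothetical values): the _mm_sll/_mm_srl/_mm_sra
   forms above take the shift count from the low 64 bits of __B.  The logical
   shifts produce zero once the count exceeds 15 (epi16), 31 (epi32) or 63
   (epi64), while the arithmetic shifts clamp the count with vec_min, matching
   the SSE2 behaviour:

     __m128i __v  = _mm_set1_epi16 (0x0010);
     __m128i __r1 = _mm_sll_epi16 (__v, _mm_cvtsi32_si128 (3));    // 0x0080 per lane
     __m128i __r2 = _mm_sll_epi16 (__v, _mm_cvtsi32_si128 (16));   // all zero
*/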

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_and_pd (__m128d __A, __m128d __B)
{
  return (vec_and ((__v2df) __A, (__v2df) __B));
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_andnot_pd (__m128d __A, __m128d __B)
{
  return (vec_andc ((__v2df) __B, (__v2df) __A));
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_or_pd (__m128d __A, __m128d __B)
{
  return (vec_or ((__v2df) __A, (__v2df) __B));
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_xor_pd (__m128d __A, __m128d __B)
{
  return (vec_xor ((__v2df) __A, (__v2df) __B));
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_and_si128 (__m128i __A, __m128i __B)
{
  return (__m128i)vec_and ((__v2di) __A, (__v2di) __B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_andnot_si128 (__m128i __A, __m128i __B)
{
  return (__m128i)vec_andc ((__v2di) __B, (__v2di) __A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_or_si128 (__m128i __A, __m128i __B)
{
  return (__m128i)vec_or ((__v2di) __A, (__v2di) __B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_xor_si128 (__m128i __A, __m128i __B)
{
  return (__m128i)vec_xor ((__v2di) __A, (__v2di) __B);
}
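
/* Usage note (sketch, operand values chosen only for illustration): as on
   x86, _mm_andnot_si128 (__A, __B) computes (~__A) & __B; the first operand
   is the one that gets complemented, which is why the implementations above
   pass __B first to vec_andc:

     __m128i __mask = _mm_set1_epi8 (0x0f);
     __m128i __data = _mm_set1_epi8 (0x5a);
     __m128i __hi   = _mm_andnot_si128 (__mask, __data);   // 0x50 per byte
*/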

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_epi8 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_cmpeq ((__v16qi) __A, (__v16qi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_cmpeq ((__v8hi) __A, (__v8hi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_cmpeq ((__v4si) __A, (__v4si)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmplt_epi8 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_cmplt ((__v16qi) __A, (__v16qi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmplt_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_cmplt ((__v8hi) __A, (__v8hi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmplt_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_cmplt ((__v4si) __A, (__v4si)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_epi8 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_cmpgt ((__v16qi) __A, (__v16qi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_cmpgt ((__v8hi) __A, (__v8hi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_cmpgt ((__v4si) __A, (__v4si)__B);
}
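
/* Illustrative usage sketch (hypothetical values): the integer compares
   return element-wise masks, all ones where the relation holds and zero
   where it does not, so the result can feed _mm_and_si128,
   _mm_movemask_epi8 and similar mask consumers:

     __m128i __x = _mm_set1_epi16 (5);
     __m128i __y = _mm_set1_epi16 (7);
     __m128i __m = _mm_cmplt_epi16 (__x, __y);   // 0xFFFF in every lane
*/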

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_extract_epi16 (__m128i const __A, int const __N)
{
  return (unsigned short) ((__v8hi)__A)[__N & 7];
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_insert_epi16 (__m128i const __A, int const __D, int const __N)
{
  __v8hi __result = (__v8hi)__A;

  __result [(__N & 7)] = __D;

  return (__m128i) __result;
}
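
/* Usage sketch (values invented for illustration): _mm_extract_epi16 returns
   the selected halfword zero-extended to int (hence the cast to unsigned
   short above), and _mm_insert_epi16 replaces one halfword while leaving the
   rest of the vector unchanged:

     __m128i __v = _mm_set1_epi16 (-1);
     int     __e = _mm_extract_epi16 (__v, 2);         // 0xFFFF, not -1
     __m128i __w = _mm_insert_epi16 (__v, 0x1234, 0);  // lane 0 becomes 0x1234
*/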

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_max ((__v8hi)__A, (__v8hi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_epu8 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_max ((__v16qu) __A, (__v16qu)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_min ((__v8hi) __A, (__v8hi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_epu8 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_min ((__v16qu) __A, (__v16qu)__B);
}

#ifdef _ARCH_PWR8
/* Intrinsic functions that require PowerISA 2.07 minimum.  */

/* Return a mask created from the most significant bit of each 8-bit
   element in A.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movemask_epi8 (__m128i __A)
{
#ifdef _ARCH_PWR10
  return vec_extractm ((__v16qu) __A);
#else
  __vector unsigned long long __result;
  static const __vector unsigned char __perm_mask =
    {
      0x78, 0x70, 0x68, 0x60, 0x58, 0x50, 0x48, 0x40,
      0x38, 0x30, 0x28, 0x20, 0x18, 0x10, 0x08, 0x00
    };

  __result = ((__vector unsigned long long)
              vec_vbpermq ((__vector unsigned char) __A,
                           (__vector unsigned char) __perm_mask));

#ifdef __LITTLE_ENDIAN__
  return __result[1];
#else
  return __result[0];
#endif
#endif /* !_ARCH_PWR10 */
}
#endif /* _ARCH_PWR8 */
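
/* Usage sketch (assumes _ARCH_PWR8; __a and __b stand for vectors defined
   elsewhere): the returned mask has one bit per byte element, element 0 in
   bit 0, so it is commonly used to summarise compare results:

     __m128i __eq  = _mm_cmpeq_epi8 (__a, __b);
     int     __any = _mm_movemask_epi8 (__eq) != 0;        // any byte equal?
     int     __all = _mm_movemask_epi8 (__eq) == 0xFFFF;   // all bytes equal?
*/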

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhi_epu16 (__m128i __A, __m128i __B)
{
  __v4su __w0, __w1;
  __v16qu __xform1 = {
#ifdef __LITTLE_ENDIAN__
      0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17,
      0x0A, 0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
#else
      0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15,
      0x08, 0x09, 0x18, 0x19, 0x0C, 0x0D, 0x1C, 0x1D
#endif
    };

  __w0 = vec_vmuleuh ((__v8hu)__A, (__v8hu)__B);
  __w1 = vec_vmulouh ((__v8hu)__A, (__v8hu)__B);
  return (__m128i) vec_perm (__w0, __w1, __xform1);
}
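
/* Illustrative sketch (hypothetical values): _mm_mulhi_epu16 keeps the high
   16 bits of each 32-bit unsigned product, which the vec_perm above gathers
   from the even/odd widening multiplies:

     __m128i __x = _mm_set1_epi16 ((short) 0x8000);
     __m128i __y = _mm_set1_epi16 (0x0004);
     __m128i __h = _mm_mulhi_epu16 (__x, __y);   // 0x0002 per lane (0x20000 >> 16)
*/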

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shufflehi_epi16 (__m128i __A, const int __mask)
{
  unsigned long __element_selector_98 = __mask & 0x03;
  unsigned long __element_selector_BA = (__mask >> 2) & 0x03;
  unsigned long __element_selector_DC = (__mask >> 4) & 0x03;
  unsigned long __element_selector_FE = (__mask >> 6) & 0x03;
  static const unsigned short __permute_selectors[4] =
    {
#ifdef __LITTLE_ENDIAN__
      0x0908, 0x0B0A, 0x0D0C, 0x0F0E
#else
      0x0809, 0x0A0B, 0x0C0D, 0x0E0F
#endif
    };
  __v2du __pmask =
#ifdef __LITTLE_ENDIAN__
      { 0x1716151413121110UL, 0UL};
#else
      { 0x1011121314151617UL, 0UL};
#endif
  __m64_union __t;
  __v2du __a, __r;

  __t.as_short[0] = __permute_selectors[__element_selector_98];
  __t.as_short[1] = __permute_selectors[__element_selector_BA];
  __t.as_short[2] = __permute_selectors[__element_selector_DC];
  __t.as_short[3] = __permute_selectors[__element_selector_FE];
  __pmask[1] = __t.as_m64;
  __a = (__v2du)__A;
  __r = vec_perm (__a, __a, (__vector unsigned char)__pmask);
  return (__m128i) __r;
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shufflelo_epi16 (__m128i __A, const int __mask)
{
  unsigned long __element_selector_10 = __mask & 0x03;
  unsigned long __element_selector_32 = (__mask >> 2) & 0x03;
  unsigned long __element_selector_54 = (__mask >> 4) & 0x03;
  unsigned long __element_selector_76 = (__mask >> 6) & 0x03;
  static const unsigned short __permute_selectors[4] =
    {
#ifdef __LITTLE_ENDIAN__
      0x0100, 0x0302, 0x0504, 0x0706
#else
      0x0001, 0x0203, 0x0405, 0x0607
#endif
    };
  __v2du __pmask =
#ifdef __LITTLE_ENDIAN__
      { 0UL, 0x1f1e1d1c1b1a1918UL};
#else
      { 0UL, 0x18191a1b1c1d1e1fUL};
#endif
  __m64_union __t;
  __v2du __a, __r;
  __t.as_short[0] = __permute_selectors[__element_selector_10];
  __t.as_short[1] = __permute_selectors[__element_selector_32];
  __t.as_short[2] = __permute_selectors[__element_selector_54];
  __t.as_short[3] = __permute_selectors[__element_selector_76];
  __pmask[0] = __t.as_m64;
  __a = (__v2du)__A;
  __r = vec_perm (__a, __a, (__vector unsigned char)__pmask);
  return (__m128i) __r;
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_epi32 (__m128i __A, const int __mask)
{
  unsigned long __element_selector_10 = __mask & 0x03;
  unsigned long __element_selector_32 = (__mask >> 2) & 0x03;
  unsigned long __element_selector_54 = (__mask >> 4) & 0x03;
  unsigned long __element_selector_76 = (__mask >> 6) & 0x03;
  static const unsigned int __permute_selectors[4] =
    {
#ifdef __LITTLE_ENDIAN__
      0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
#else
      0x00010203, 0x04050607, 0x08090A0B, 0x0C0D0E0F
#endif
    };
  __v4su __t;

  __t[0] = __permute_selectors[__element_selector_10];
  __t[1] = __permute_selectors[__element_selector_32];
  __t[2] = __permute_selectors[__element_selector_54] + 0x10101010;
  __t[3] = __permute_selectors[__element_selector_76] + 0x10101010;
  return (__m128i)vec_perm ((__v4si) __A, (__v4si)__A, (__vector unsigned char)__t);
}
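
/* Usage sketch (values are only for illustration): the 8-bit mask encodes
   four 2-bit element selectors, lowest selector in the lowest bits, exactly
   as the x86 _MM_SHUFFLE macro builds it:

     __m128i __v   = _mm_set_epi32 (3, 2, 1, 0);      // elements {0,1,2,3}
     __m128i __rev = _mm_shuffle_epi32 (__v, 0x1B);   // 0x1B == _MM_SHUFFLE(0,1,2,3)
     // __rev now holds elements {3,2,1,0}
*/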

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskmoveu_si128 (__m128i __A, __m128i __B, char *__C)
{
  __v2du __hibit = { 0x7f7f7f7f7f7f7f7fUL, 0x7f7f7f7f7f7f7f7fUL};
  __v16qu __mask, __tmp;
  __m128i_u *__p = (__m128i_u*)__C;

  __tmp = (__v16qu)_mm_loadu_si128(__p);
  __mask = (__v16qu)vec_cmpgt ((__v16qu)__B, (__v16qu)__hibit);
  __tmp = vec_sel (__tmp, (__v16qu)__A, __mask);
  _mm_storeu_si128 (__p, (__m128i)__tmp);
}
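
/* Note (an observation about the implementation above, not an added
   guarantee of the SSE2 intrinsic): the byte-masked store is emulated with a
   load, vec_sel and a full 16-byte store, so unlike the x86 MASKMOVDQU
   instruction it also reads and rewrites the unselected destination bytes.
   A scalar sketch of the intended effect, assuming __src, __mask and a byte
   pointer __dst:

     for (int __i = 0; __i < 16; __i++)
       if (((unsigned char *) &__mask)[__i] & 0x80)
         __dst[__i] = ((unsigned char *) &__src)[__i];
*/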

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_avg_epu8 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_avg ((__v16qu)__A, (__v16qu)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_avg_epu16 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_avg ((__v8hu)__A, (__v8hu)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sad_epu8 (__m128i __A, __m128i __B)
{
  __v16qu __a, __b;
  __v16qu __vabsdiff;
  __v4si __vsum;
  const __v4su __zero = { 0, 0, 0, 0 };
  __v4si __result;

  __a = (__v16qu) __A;
  __b = (__v16qu) __B;
#ifndef _ARCH_PWR9
  __v16qu __vmin = vec_min (__a, __b);
  __v16qu __vmax = vec_max (__a, __b);
  __vabsdiff = vec_sub (__vmax, __vmin);
#else
  __vabsdiff = vec_absd (__a, __b);
#endif
  /* Sum four groups of bytes into integers.  */
  __vsum = (__vector signed int) vec_sum4s (__vabsdiff, __zero);
#ifdef __LITTLE_ENDIAN__
  /* Sum across four integers with two integer results.  */
  __asm__ ("vsum2sws %0,%1,%2" : "=v" (__result) : "v" (__vsum), "v" (__zero));
  /* Note: vec_sum2s could be used here, but on little-endian, vector
     shifts are added that are not needed for this use-case.
     A vector shift to correctly position the 32-bit integer results
     (currently at [0] and [2]) to [1] and [3] would then need to be
     swapped back again since the desired results are two 64-bit
     integers ([1]|[0] and [3]|[2]).  Thus, no shift is performed.  */
#else
  /* Sum across four integers with two integer results.  */
  __result = vec_sum2s (__vsum, (__vector signed int) __zero);
  /* Rotate the sums into the correct position.  */
  __result = vec_sld (__result, __result, 6);
#endif
  return (__m128i) __result;
}
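
/* Illustrative sketch (hypothetical values): the two 16-bit sums of absolute
   differences land in the low bits of each 64-bit half of the result, with
   the upper bits zero, matching the x86 PSADBW layout:

     __m128i __a = _mm_set1_epi8 (10);
     __m128i __b = _mm_set1_epi8 (3);
     __m128i __s = _mm_sad_epu8 (__a, __b);   // each 64-bit element holds 8 * 7 = 56
*/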

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_stream_si32 (int *__A, int __B)
{
  /* Use the data cache block touch for store transient.  */
  __asm__ (
    "dcbtstt 0,%0"
    :
    : "b" (__A)
    : "memory"
    );
  *__A = __B;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_stream_si64 (long long int *__A, long long int __B)
{
  /* Use the data cache block touch for store transient.  */
  __asm__ (
    "	dcbtstt	0,%0"
    :
    : "b" (__A)
    : "memory"
    );
  *__A = __B;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_stream_si128 (__m128i *__A, __m128i __B)
{
  /* Use the data cache block touch for store transient.  */
  __asm__ (
    "dcbtstt 0,%0"
    :
    : "b" (__A)
    : "memory"
    );
  *__A = __B;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_stream_pd (double *__A, __m128d __B)
{
  /* Use the data cache block touch for store transient.  */
  __asm__ (
    "dcbtstt 0,%0"
    :
    : "b" (__A)
    : "memory"
    );
  *(__m128d*)__A = __B;
}
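
/* Note (behavioural observation from the code above, not an added
   guarantee): unlike the x86 non-temporal stores, these are an ordinary
   store preceded by a dcbtstt cache hint, so portable code should not rely
   on write-combining or other non-temporal semantics here.  Minimal usage
   sketch:

     int __buf[1];
     _mm_stream_si32 (__buf, 42);   // equivalent to __buf[0] = 42 plus a cache hint
*/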

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_clflush (void const *__A)
{
  /* Use the data cache block flush.  */
  __asm__ (
    "dcbf 0,%0"
    :
    : "b" (__A)
    : "memory"
    );
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_lfence (void)
{
  /* Use light weight sync for load to load ordering.  */
  __atomic_thread_fence (__ATOMIC_RELEASE);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mfence (void)
{
  /* Use heavy weight sync for any to any ordering.  */
  __atomic_thread_fence (__ATOMIC_SEQ_CST);
}
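
/* Note (assumption about typical code generation, not stated in this file):
   on POWER the release fence above normally expands to lwsync and the
   sequentially consistent fence to hwsync, which is how the lighter
   _mm_lfence and the full _mm_mfence barriers are approximated.  Usage
   sketch with hypothetical shared variables __flag and __data:

     __data = 1;
     _mm_mfence ();     // order the store above against later loads and stores
     __flag = 1;
*/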

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi32_si128 (int __A)
{
  return _mm_set_epi32 (0, 0, 0, __A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64_si128 (long long __A)
{
  return __extension__ (__m128i)(__v2di){ __A, 0LL };
}

/* Microsoft intrinsic.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64x_si128 (long long __A)
{
  return __extension__ (__m128i)(__v2di){ __A, 0LL };
}

/* Casts between various SP, DP, INT vector types.  Note that these do no
   conversion of values, they just change the type.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_castpd_ps(__m128d __A)
{
  return (__m128) __A;
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_castpd_si128(__m128d __A)
{
  return (__m128i) __A;
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_castps_pd(__m128 __A)
{
  return (__m128d) __A;
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_castps_si128(__m128 __A)
{
  return (__m128i) __A;
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_castsi128_ps(__m128i __A)
{
  return (__m128) __A;
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_castsi128_pd(__m128i __A)
{
  return (__m128d) __A;
}
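
/* Usage sketch (example values are illustrative): the casts only relabel the
   128-bit value for the type system; no instructions are generated and the
   bit pattern is unchanged:

     __m128d __d    = _mm_set1_pd (1.0);
     __m128i __bits = _mm_castpd_si128 (__d);     // raw bits, 0x3FF0000000000000 per element
     __m128d __back = _mm_castsi128_pd (__bits);  // identical to __d
*/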

#endif /* EMMINTRIN_H_ */