1 /* Copyright (C) 2003-2024 Free Software Foundation, Inc.
3 This file is part of GCC.
5 GCC is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 3, or (at your option)
10 GCC is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
15 Under Section 7 of GPL version 3, you are granted additional
16 permissions described in the GCC Runtime Library Exception, version
17 3.1, as published by the Free Software Foundation.
19 You should have received a copy of the GNU General Public License and
20 a copy of the GCC Runtime Library Exception along with this program;
21 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
22 <http://www.gnu.org/licenses/>. */
24 /* Implemented from the specification included in the Intel C++ Compiler
25 User Guide and Reference, version 9.0. */
27 #ifndef NO_WARN_X86_INTRINSICS
28 /* This header is distributed to simplify porting x86_64 code that
29 makes explicit use of Intel intrinsics to powerpc64le.
30 It is the user's responsibility to determine if the results are
31 acceptable and make additional changes as necessary.
32 Note that much code that uses Intel intrinsics can be rewritten in
33 standard C or GNU C extensions, which are more portable and better
34 optimized across multiple targets.
36 In the specific case of X86 SSE2 (__m128i, __m128d) intrinsics,
37 the PowerPC VMX/VSX ISA is a good match for vector double SIMD
38 operations. However scalar double operations in vector (XMM)
39 registers require the POWER8 VSX ISA (2.07) level. Also there are
40 important differences for data format and placement of double
41 scalars in the vector register.
43 For PowerISA Scalar double is in FPRs (left most 64-bits of the
44 low 32 VSRs), while X86_64 SSE2 uses the right most 64-bits of
45 the XMM. These differences require extra steps on POWER to match
46 the SSE2 scalar double semantics.
48 Most SSE2 scalar double intrinsic operations can be performed more
49 efficiently as C language double scalar operations or optimized to
50 use vector SIMD operations. We recommend this for new applications.
52 Another difference is the format and details of the X86_64 MXSCR vs
53 the PowerISA FPSCR / VSCR registers. We recommend applications
54 replace direct access to the MXSCR with the more portable <fenv.h>
56 #error "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error."
65 /* We need definitions from the SSE header files. */
66 #include <xmmintrin.h>
69 typedef __vector
double __v2df
;
70 typedef __vector
long long __v2di
;
71 typedef __vector
unsigned long long __v2du
;
72 typedef __vector
int __v4si
;
73 typedef __vector
unsigned int __v4su
;
74 typedef __vector
short __v8hi
;
75 typedef __vector
unsigned short __v8hu
;
76 typedef __vector
signed char __v16qi
;
77 typedef __vector
unsigned char __v16qu
;
79 /* The Intel API is flexible enough that we must allow aliasing with other
80 vector types, and their scalar components. */
81 typedef long long __m128i
__attribute__ ((__vector_size__ (16), __may_alias__
));
82 typedef double __m128d
__attribute__ ((__vector_size__ (16), __may_alias__
));
84 /* Unaligned version of the same types. */
85 typedef long long __m128i_u
__attribute__ ((__vector_size__ (16), __may_alias__
, __aligned__ (1)));
86 typedef double __m128d_u
__attribute__ ((__vector_size__ (16), __may_alias__
, __aligned__ (1)));
88 /* Define two value permute mask. */
89 #define _MM_SHUFFLE2(x,y) (((x) << 1) | (y))
91 /* Create a vector with element 0 as F and the rest zero. */
92 extern __inline __m128d
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
93 _mm_set_sd (double __F
)
95 return __extension__ (__m128d
){ __F
, 0.0 };
98 /* Create a vector with both elements equal to F. */
99 extern __inline __m128d
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
100 _mm_set1_pd (double __F
)
102 return __extension__ (__m128d
){ __F
, __F
};
105 extern __inline __m128d
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
106 _mm_set_pd1 (double __F
)
108 return _mm_set1_pd (__F
);
111 /* Create a vector with the lower value X and upper value W. */
112 extern __inline __m128d
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
113 _mm_set_pd (double __W
, double __X
)
115 return __extension__ (__m128d
){ __X
, __W
};
118 /* Create a vector with the lower value W and upper value X. */
119 extern __inline __m128d
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
120 _mm_setr_pd (double __W
, double __X
)
122 return __extension__ (__m128d
){ __W
, __X
};
125 /* Create an undefined vector. */
126 extern __inline __m128d
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
127 _mm_undefined_pd (void)
133 /* Create a vector of zeros. */
134 extern __inline __m128d
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
135 _mm_setzero_pd (void)
137 return (__m128d
) vec_splats (0);
140 /* Sets the low DPFP value of A from the low value of B. */
141 extern __inline __m128d
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
142 _mm_move_sd (__m128d __A
, __m128d __B
)
144 __v2df __result
= (__v2df
) __A
;
145 __result
[0] = ((__v2df
) __B
)[0];
146 return (__m128d
) __result
;
149 /* Load two DPFP values from P. The address must be 16-byte aligned. */
150 extern __inline __m128d
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
151 _mm_load_pd (double const *__P
)
153 assert(((unsigned long)__P
& 0xfUL
) == 0UL);
154 return ((__m128d
)vec_ld(0, (__v16qu
*)__P
));
157 /* Load two DPFP values from P. The address need not be 16-byte aligned. */
158 extern __inline __m128d
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
159 _mm_loadu_pd (double const *__P
)
161 return (vec_vsx_ld(0, __P
));
164 /* Create a vector with all two elements equal to *P. */
165 extern __inline __m128d
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
166 _mm_load1_pd (double const *__P
)
168 return (vec_splats (*__P
));
171 /* Create a vector with element 0 as *P and the rest zero. */
172 extern __inline __m128d
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
173 _mm_load_sd (double const *__P
)
175 return _mm_set_sd (*__P
);
178 extern __inline __m128d
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
179 _mm_load_pd1 (double const *__P
)
181 return _mm_load1_pd (__P
);
184 /* Load two DPFP values in reverse order. The address must be aligned. */
185 extern __inline __m128d
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
186 _mm_loadr_pd (double const *__P
)
188 __v2df __tmp
= _mm_load_pd (__P
);
189 return (__m128d
)vec_xxpermdi (__tmp
, __tmp
, 2);
192 /* Store two DPFP values. The address must be 16-byte aligned. */
193 extern __inline
void __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
194 _mm_store_pd (double *__P
, __m128d __A
)
196 assert(((unsigned long)__P
& 0xfUL
) == 0UL);
197 vec_st((__v16qu
)__A
, 0, (__v16qu
*)__P
);
200 /* Store two DPFP values. The address need not be 16-byte aligned. */
201 extern __inline
void __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
202 _mm_storeu_pd (double *__P
, __m128d __A
)
204 *(__m128d_u
*)__P
= __A
;
207 /* Stores the lower DPFP value. */
208 extern __inline
void __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
209 _mm_store_sd (double *__P
, __m128d __A
)
211 *__P
= ((__v2df
)__A
)[0];
214 extern __inline
double __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
215 _mm_cvtsd_f64 (__m128d __A
)
217 return ((__v2df
)__A
)[0];
220 extern __inline
void __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
221 _mm_storel_pd (double *__P
, __m128d __A
)
223 _mm_store_sd (__P
, __A
);
226 /* Stores the upper DPFP value. */
227 extern __inline
void __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
228 _mm_storeh_pd (double *__P
, __m128d __A
)
230 *__P
= ((__v2df
)__A
)[1];
232 /* Store the lower DPFP value across two words.
233 The address must be 16-byte aligned. */
234 extern __inline
void __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
235 _mm_store1_pd (double *__P
, __m128d __A
)
237 _mm_store_pd (__P
, vec_splat (__A
, 0));
240 extern __inline
void __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
241 _mm_store_pd1 (double *__P
, __m128d __A
)
243 _mm_store1_pd (__P
, __A
);
246 /* Store two DPFP values in reverse order. The address must be aligned. */
247 extern __inline
void __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
248 _mm_storer_pd (double *__P
, __m128d __A
)
250 _mm_store_pd (__P
, vec_xxpermdi (__A
, __A
, 2));
253 /* Intel intrinsic. */
254 extern __inline
long long __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
255 _mm_cvtsi128_si64 (__m128i __A
)
257 return ((__v2di
)__A
)[0];
260 /* Microsoft intrinsic. */
261 extern __inline
long long __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
262 _mm_cvtsi128_si64x (__m128i __A
)
264 return ((__v2di
)__A
)[0];
267 extern __inline __m128d
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
268 _mm_add_pd (__m128d __A
, __m128d __B
)
270 return (__m128d
) ((__v2df
)__A
+ (__v2df
)__B
);
273 /* Add the lower double-precision (64-bit) floating-point element in
274 a and b, store the result in the lower element of dst, and copy
275 the upper element from a to the upper element of dst. */
276 extern __inline __m128d
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
277 _mm_add_sd (__m128d __A
, __m128d __B
)
279 __A
[0] = __A
[0] + __B
[0];
283 extern __inline __m128d
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
284 _mm_sub_pd (__m128d __A
, __m128d __B
)
286 return (__m128d
) ((__v2df
)__A
- (__v2df
)__B
);
289 extern __inline __m128d
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
290 _mm_sub_sd (__m128d __A
, __m128d __B
)
292 __A
[0] = __A
[0] - __B
[0];
296 extern __inline __m128d
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
297 _mm_mul_pd (__m128d __A
, __m128d __B
)
299 return (__m128d
) ((__v2df
)__A
* (__v2df
)__B
);
302 extern __inline __m128d
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
303 _mm_mul_sd (__m128d __A
, __m128d __B
)
305 __A
[0] = __A
[0] * __B
[0];
309 extern __inline __m128d
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
310 _mm_div_pd (__m128d __A
, __m128d __B
)
312 return (__m128d
) ((__v2df
)__A
/ (__v2df
)__B
);
315 extern __inline __m128d
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
316 _mm_div_sd (__m128d __A
, __m128d __B
)
318 __A
[0] = __A
[0] / __B
[0];
322 extern __inline __m128d
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
323 _mm_sqrt_pd (__m128d __A
)
325 return (vec_sqrt (__A
));
328 /* Return pair {sqrt (B[0]), A[1]}. */
329 extern __inline __m128d
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
330 _mm_sqrt_sd (__m128d __A
, __m128d __B
)
333 __c
= vec_sqrt ((__v2df
) _mm_set1_pd (__B
[0]));
334 return (__m128d
) _mm_setr_pd (__c
[0], __A
[1]);
337 extern __inline __m128d
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
338 _mm_min_pd (__m128d __A
, __m128d __B
)
340 return (vec_min (__A
, __B
));
343 extern __inline __m128d
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
344 _mm_min_sd (__m128d __A
, __m128d __B
)
346 __v2df __a
, __b
, __c
;
347 __a
= vec_splats (__A
[0]);
348 __b
= vec_splats (__B
[0]);
349 __c
= vec_min (__a
, __b
);
350 return (__m128d
) _mm_setr_pd (__c
[0], __A
[1]);
353 extern __inline __m128d
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
354 _mm_max_pd (__m128d __A
, __m128d __B
)
356 return (vec_max (__A
, __B
));
359 extern __inline __m128d
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
360 _mm_max_sd (__m128d __A
, __m128d __B
)
362 __v2df __a
, __b
, __c
;
363 __a
= vec_splats (__A
[0]);
364 __b
= vec_splats (__B
[0]);
365 __c
= vec_max (__a
, __b
);
366 return (__m128d
) _mm_setr_pd (__c
[0], __A
[1]);
369 extern __inline __m128d
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
370 _mm_cmpeq_pd (__m128d __A
, __m128d __B
)
372 return ((__m128d
)vec_cmpeq ((__v2df
) __A
, (__v2df
) __B
));
375 extern __inline __m128d
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
376 _mm_cmplt_pd (__m128d __A
, __m128d __B
)
378 return ((__m128d
)vec_cmplt ((__v2df
) __A
, (__v2df
) __B
));
381 extern __inline __m128d
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
382 _mm_cmple_pd (__m128d __A
, __m128d __B
)
384 return ((__m128d
)vec_cmple ((__v2df
) __A
, (__v2df
) __B
));
387 extern __inline __m128d
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
388 _mm_cmpgt_pd (__m128d __A
, __m128d __B
)
390 return ((__m128d
)vec_cmpgt ((__v2df
) __A
, (__v2df
) __B
));
393 extern __inline __m128d
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
394 _mm_cmpge_pd (__m128d __A
, __m128d __B
)
396 return ((__m128d
)vec_cmpge ((__v2df
) __A
,(__v2df
) __B
));
399 extern __inline __m128d
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
400 _mm_cmpneq_pd (__m128d __A
, __m128d __B
)
402 __v2df __temp
= (__v2df
) vec_cmpeq ((__v2df
) __A
, (__v2df
)__B
);
403 return ((__m128d
)vec_nor (__temp
, __temp
));
406 extern __inline __m128d
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
407 _mm_cmpnlt_pd (__m128d __A
, __m128d __B
)
409 return ((__m128d
)vec_cmpge ((__v2df
) __A
, (__v2df
) __B
));
412 extern __inline __m128d
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
413 _mm_cmpnle_pd (__m128d __A
, __m128d __B
)
415 return ((__m128d
)vec_cmpgt ((__v2df
) __A
, (__v2df
) __B
));
418 extern __inline __m128d
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
419 _mm_cmpngt_pd (__m128d __A
, __m128d __B
)
421 return ((__m128d
)vec_cmple ((__v2df
) __A
, (__v2df
) __B
));
424 extern __inline __m128d
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
425 _mm_cmpnge_pd (__m128d __A
, __m128d __B
)
427 return ((__m128d
)vec_cmplt ((__v2df
) __A
, (__v2df
) __B
));
430 extern __inline __m128d
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
431 _mm_cmpord_pd (__m128d __A
, __m128d __B
)
434 /* Compare against self will return false (0's) if NAN. */
435 __c
= (__v2du
)vec_cmpeq (__A
, __A
);
436 __d
= (__v2du
)vec_cmpeq (__B
, __B
);
437 /* A != NAN and B != NAN. */
438 return ((__m128d
)vec_and(__c
, __d
));
441 extern __inline __m128d
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
442 _mm_cmpunord_pd (__m128d __A
, __m128d __B
)
446 /* Compare against self will return false (0's) if NAN. */
447 __c
= (__v2du
)vec_cmpeq ((__v2df
)__A
, (__v2df
)__A
);
448 __d
= (__v2du
)vec_cmpeq ((__v2df
)__B
, (__v2df
)__B
);
449 /* A == NAN OR B == NAN converts too:
450 NOT(A != NAN) OR NOT(B != NAN). */
451 __c
= vec_nor (__c
, __c
);
452 return ((__m128d
)vec_orc(__c
, __d
));
455 /* Compare against self will return false (0's) if NAN. */
456 __c
= (__v2du
)vec_cmpeq ((__v2df
)__A
, (__v2df
)__A
);
457 __d
= (__v2du
)vec_cmpeq ((__v2df
)__B
, (__v2df
)__B
);
458 /* Convert the true ('1's) is NAN. */
459 __c
= vec_nor (__c
, __c
);
460 __d
= vec_nor (__d
, __d
);
461 return ((__m128d
)vec_or(__c
, __d
));
465 extern __inline __m128d
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
466 _mm_cmpeq_sd(__m128d __A
, __m128d __B
)
468 __v2df __a
, __b
, __c
;
469 /* PowerISA VSX does not allow partial (for just lower double)
470 results. So to insure we don't generate spurious exceptions
471 (from the upper double values) we splat the lower double
472 before we do the operation. */
473 __a
= vec_splats (__A
[0]);
474 __b
= vec_splats (__B
[0]);
475 __c
= (__v2df
) vec_cmpeq(__a
, __b
);
476 /* Then we merge the lower double result with the original upper
478 return (__m128d
) _mm_setr_pd (__c
[0], __A
[1]);
481 extern __inline __m128d
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
482 _mm_cmplt_sd (__m128d __A
, __m128d __B
)
484 __v2df __a
, __b
, __c
;
485 __a
= vec_splats (__A
[0]);
486 __b
= vec_splats (__B
[0]);
487 __c
= (__v2df
) vec_cmplt(__a
, __b
);
488 return (__m128d
) _mm_setr_pd (__c
[0], __A
[1]);
491 extern __inline __m128d
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
492 _mm_cmple_sd (__m128d __A
, __m128d __B
)
494 __v2df __a
, __b
, __c
;
495 __a
= vec_splats (__A
[0]);
496 __b
= vec_splats (__B
[0]);
497 __c
= (__v2df
) vec_cmple(__a
, __b
);
498 return (__m128d
) _mm_setr_pd (__c
[0], __A
[1]);
501 extern __inline __m128d
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
502 _mm_cmpgt_sd (__m128d __A
, __m128d __B
)
504 __v2df __a
, __b
, __c
;
505 __a
= vec_splats (__A
[0]);
506 __b
= vec_splats (__B
[0]);
507 __c
= (__v2df
) vec_cmpgt(__a
, __b
);
508 return (__m128d
) _mm_setr_pd (__c
[0], __A
[1]);
511 extern __inline __m128d
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
512 _mm_cmpge_sd (__m128d __A
, __m128d __B
)
514 __v2df __a
, __b
, __c
;
515 __a
= vec_splats (__A
[0]);
516 __b
= vec_splats (__B
[0]);
517 __c
= (__v2df
) vec_cmpge(__a
, __b
);
518 return (__m128d
) _mm_setr_pd (__c
[0], __A
[1]);
521 extern __inline __m128d
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
522 _mm_cmpneq_sd (__m128d __A
, __m128d __B
)
524 __v2df __a
, __b
, __c
;
525 __a
= vec_splats (__A
[0]);
526 __b
= vec_splats (__B
[0]);
527 __c
= (__v2df
) vec_cmpeq(__a
, __b
);
528 __c
= vec_nor (__c
, __c
);
529 return (__m128d
) _mm_setr_pd (__c
[0], __A
[1]);
532 extern __inline __m128d
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
533 _mm_cmpnlt_sd (__m128d __A
, __m128d __B
)
535 __v2df __a
, __b
, __c
;
536 __a
= vec_splats (__A
[0]);
537 __b
= vec_splats (__B
[0]);
538 /* Not less than is just greater than or equal. */
539 __c
= (__v2df
) vec_cmpge(__a
, __b
);
540 return (__m128d
) _mm_setr_pd (__c
[0], __A
[1]);
543 extern __inline __m128d
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
544 _mm_cmpnle_sd (__m128d __A
, __m128d __B
)
546 __v2df __a
, __b
, __c
;
547 __a
= vec_splats (__A
[0]);
548 __b
= vec_splats (__B
[0]);
549 /* Not less than or equal is just greater than. */
550 __c
= (__v2df
) vec_cmpge(__a
, __b
);
551 return (__m128d
) _mm_setr_pd (__c
[0], __A
[1]);
554 extern __inline __m128d
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
555 _mm_cmpngt_sd (__m128d __A
, __m128d __B
)
557 __v2df __a
, __b
, __c
;
558 __a
= vec_splats (__A
[0]);
559 __b
= vec_splats (__B
[0]);
560 /* Not greater than is just less than or equal. */
561 __c
= (__v2df
) vec_cmple(__a
, __b
);
562 return (__m128d
) _mm_setr_pd (__c
[0], __A
[1]);
565 extern __inline __m128d
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
566 _mm_cmpnge_sd (__m128d __A
, __m128d __B
)
568 __v2df __a
, __b
, __c
;
569 __a
= vec_splats (__A
[0]);
570 __b
= vec_splats (__B
[0]);
571 /* Not greater than or equal is just less than. */
572 __c
= (__v2df
) vec_cmplt(__a
, __b
);
573 return (__m128d
) _mm_setr_pd (__c
[0], __A
[1]);
576 extern __inline __m128d
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
577 _mm_cmpord_sd (__m128d __A
, __m128d __B
)
580 __r
= (__v2df
)_mm_cmpord_pd (vec_splats (__A
[0]), vec_splats (__B
[0]));
581 return (__m128d
) _mm_setr_pd (__r
[0], ((__v2df
)__A
)[1]);
584 extern __inline __m128d
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
585 _mm_cmpunord_sd (__m128d __A
, __m128d __B
)
588 __r
= _mm_cmpunord_pd (vec_splats (__A
[0]), vec_splats (__B
[0]));
589 return (__m128d
) _mm_setr_pd (__r
[0], __A
[1]);
593 The __mm_comi??_sd and __mm_ucomi??_sd implementations below are
594 exactly the same because GCC for PowerPC only generates unordered
595 compares (scalar and vector).
596 Technically __mm_comieq_sp et all should be using the ordered
597 compare and signal for QNaNs. The __mm_ucomieq_sd et all should
599 extern __inline
int __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
600 _mm_comieq_sd (__m128d __A
, __m128d __B
)
602 return (__A
[0] == __B
[0]);
605 extern __inline
int __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
606 _mm_comilt_sd (__m128d __A
, __m128d __B
)
608 return (__A
[0] < __B
[0]);
611 extern __inline
int __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
612 _mm_comile_sd (__m128d __A
, __m128d __B
)
614 return (__A
[0] <= __B
[0]);
617 extern __inline
int __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
618 _mm_comigt_sd (__m128d __A
, __m128d __B
)
620 return (__A
[0] > __B
[0]);
623 extern __inline
int __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
624 _mm_comige_sd (__m128d __A
, __m128d __B
)
626 return (__A
[0] >= __B
[0]);
629 extern __inline
int __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
630 _mm_comineq_sd (__m128d __A
, __m128d __B
)
632 return (__A
[0] != __B
[0]);
635 extern __inline
int __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
636 _mm_ucomieq_sd (__m128d __A
, __m128d __B
)
638 return (__A
[0] == __B
[0]);
641 extern __inline
int __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
642 _mm_ucomilt_sd (__m128d __A
, __m128d __B
)
644 return (__A
[0] < __B
[0]);
647 extern __inline
int __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
648 _mm_ucomile_sd (__m128d __A
, __m128d __B
)
650 return (__A
[0] <= __B
[0]);
653 extern __inline
int __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
654 _mm_ucomigt_sd (__m128d __A
, __m128d __B
)
656 return (__A
[0] > __B
[0]);
659 extern __inline
int __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
660 _mm_ucomige_sd (__m128d __A
, __m128d __B
)
662 return (__A
[0] >= __B
[0]);
665 extern __inline
int __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
666 _mm_ucomineq_sd (__m128d __A
, __m128d __B
)
668 return (__A
[0] != __B
[0]);
671 /* Create a vector of Qi, where i is the element number. */
672 extern __inline __m128i
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
673 _mm_set_epi64x (long long __q1
, long long __q0
)
675 return __extension__ (__m128i
)(__v2di
){ __q0
, __q1
};
678 extern __inline __m128i
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
679 _mm_set_epi64 (__m64 __q1
, __m64 __q0
)
681 return _mm_set_epi64x ((long long)__q1
, (long long)__q0
);
684 extern __inline __m128i
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
685 _mm_set_epi32 (int __q3
, int __q2
, int __q1
, int __q0
)
687 return __extension__ (__m128i
)(__v4si
){ __q0
, __q1
, __q2
, __q3
};
690 extern __inline __m128i
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
691 _mm_set_epi16 (short __q7
, short __q6
, short __q5
, short __q4
,
692 short __q3
, short __q2
, short __q1
, short __q0
)
694 return __extension__ (__m128i
)(__v8hi
){
695 __q0
, __q1
, __q2
, __q3
, __q4
, __q5
, __q6
, __q7
};
698 extern __inline __m128i
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
699 _mm_set_epi8 (char __q15
, char __q14
, char __q13
, char __q12
,
700 char __q11
, char __q10
, char __q09
, char __q08
,
701 char __q07
, char __q06
, char __q05
, char __q04
,
702 char __q03
, char __q02
, char __q01
, char __q00
)
704 return __extension__ (__m128i
)(__v16qi
){
705 __q00
, __q01
, __q02
, __q03
, __q04
, __q05
, __q06
, __q07
,
706 __q08
, __q09
, __q10
, __q11
, __q12
, __q13
, __q14
, __q15
710 /* Set all of the elements of the vector to A. */
711 extern __inline __m128i
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
712 _mm_set1_epi64x (long long __A
)
714 return _mm_set_epi64x (__A
, __A
);
717 extern __inline __m128i
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
718 _mm_set1_epi64 (__m64 __A
)
720 return _mm_set_epi64 (__A
, __A
);
723 extern __inline __m128i
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
724 _mm_set1_epi32 (int __A
)
726 return _mm_set_epi32 (__A
, __A
, __A
, __A
);
729 extern __inline __m128i
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
730 _mm_set1_epi16 (short __A
)
732 return _mm_set_epi16 (__A
, __A
, __A
, __A
, __A
, __A
, __A
, __A
);
735 extern __inline __m128i
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
736 _mm_set1_epi8 (char __A
)
738 return _mm_set_epi8 (__A
, __A
, __A
, __A
, __A
, __A
, __A
, __A
,
739 __A
, __A
, __A
, __A
, __A
, __A
, __A
, __A
);
742 /* Create a vector of Qi, where i is the element number.
743 The parameter order is reversed from the _mm_set_epi* functions. */
744 extern __inline __m128i
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
745 _mm_setr_epi64 (__m64 __q0
, __m64 __q1
)
747 return _mm_set_epi64 (__q1
, __q0
);
750 extern __inline __m128i
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
751 _mm_setr_epi32 (int __q0
, int __q1
, int __q2
, int __q3
)
753 return _mm_set_epi32 (__q3
, __q2
, __q1
, __q0
);
756 extern __inline __m128i
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
757 _mm_setr_epi16 (short __q0
, short __q1
, short __q2
, short __q3
,
758 short __q4
, short __q5
, short __q6
, short __q7
)
760 return _mm_set_epi16 (__q7
, __q6
, __q5
, __q4
, __q3
, __q2
, __q1
, __q0
);
763 extern __inline __m128i
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
764 _mm_setr_epi8 (char __q00
, char __q01
, char __q02
, char __q03
,
765 char __q04
, char __q05
, char __q06
, char __q07
,
766 char __q08
, char __q09
, char __q10
, char __q11
,
767 char __q12
, char __q13
, char __q14
, char __q15
)
769 return _mm_set_epi8 (__q15
, __q14
, __q13
, __q12
, __q11
, __q10
, __q09
, __q08
,
770 __q07
, __q06
, __q05
, __q04
, __q03
, __q02
, __q01
, __q00
);
773 /* Create a vector with element 0 as *P and the rest zero. */
774 extern __inline __m128i
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
775 _mm_load_si128 (__m128i
const *__P
)
780 extern __inline __m128i
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
781 _mm_loadu_si128 (__m128i_u
const *__P
)
783 return (__m128i
) (vec_vsx_ld(0, (signed int const *)__P
));
786 extern __inline __m128i
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
787 _mm_loadl_epi64 (__m128i_u
const *__P
)
789 return _mm_set_epi64 ((__m64
)0LL, *(__m64
*)__P
);
792 extern __inline
void __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
793 _mm_store_si128 (__m128i
*__P
, __m128i __B
)
795 assert(((unsigned long )__P
& 0xfUL
) == 0UL);
796 vec_st ((__v16qu
) __B
, 0, (__v16qu
*)__P
);
799 extern __inline
void __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
800 _mm_storeu_si128 (__m128i_u
*__P
, __m128i __B
)
805 extern __inline
void __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
806 _mm_storel_epi64 (__m128i_u
*__P
, __m128i __B
)
808 *(long long *)__P
= ((__v2di
)__B
)[0];
811 extern __inline __m64
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
812 _mm_movepi64_pi64 (__m128i_u __B
)
814 return (__m64
) ((__v2di
)__B
)[0];
817 extern __inline __m128i
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
818 _mm_movpi64_epi64 (__m64 __A
)
820 return _mm_set_epi64 ((__m64
)0LL, __A
);
823 extern __inline __m128i
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
824 _mm_move_epi64 (__m128i __A
)
826 return _mm_set_epi64 ((__m64
)0LL, (__m64
)__A
[0]);
829 /* Create an undefined vector. */
830 extern __inline __m128i
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
831 _mm_undefined_si128 (void)
837 /* Create a vector of zeros. */
838 extern __inline __m128i
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
839 _mm_setzero_si128 (void)
841 return __extension__ (__m128i
)(__v4si
){ 0, 0, 0, 0 };
845 extern __inline __m128d
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
846 _mm_cvtepi32_pd (__m128i __A
)
849 /* For LE need to generate Vector Unpack Low Signed Word.
850 Which is generated from unpackh. */
851 __val
= (__v2di
)vec_unpackh ((__v4si
)__A
);
853 return (__m128d
)vec_ctf (__val
, 0);
857 extern __inline __m128
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
858 _mm_cvtepi32_ps (__m128i __A
)
860 return ((__m128
)vec_ctf((__v4si
)__A
, 0));
863 extern __inline __m128i
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
864 _mm_cvtpd_epi32 (__m128d __A
)
866 __v2df __rounded
= vec_rint (__A
);
867 __v4si __result
, __temp
;
868 const __v4si __vzero
=
871 /* VSX Vector truncate Double-Precision to integer and Convert to
872 Signed Integer Word format with Saturate. */
880 #ifdef __LITTLE_ENDIAN__
881 __temp
= vec_mergeo (__temp
, __temp
);
883 __temp
= vec_mergee (__temp
, __temp
);
885 __result
= (__v4si
) vec_vpkudum ((__vector
long long) __temp
,
886 (__vector
long long) __vzero
);
889 const __v16qu __pkperm
= {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
890 0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f };
891 __result
= (__v4si
) vec_perm ((__v16qu
) __temp
, (__v16qu
) __vzero
, __pkperm
);
894 return (__m128i
) __result
;
897 extern __inline __m64
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
898 _mm_cvtpd_pi32 (__m128d __A
)
900 __m128i __result
= _mm_cvtpd_epi32(__A
);
902 return (__m64
) __result
[0];
905 extern __inline __m128
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
906 _mm_cvtpd_ps (__m128d __A
)
910 const __v4si __vzero
= { 0, 0, 0, 0 };
919 #ifdef __LITTLE_ENDIAN__
920 __temp
= vec_mergeo (__temp
, __temp
);
922 __temp
= vec_mergee (__temp
, __temp
);
924 __result
= (__v4sf
) vec_vpkudum ((__vector
long long) __temp
,
925 (__vector
long long) __vzero
);
928 const __v16qu __pkperm
= {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
929 0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f };
930 __result
= (__v4sf
) vec_perm ((__v16qu
) __temp
, (__v16qu
) __vzero
, __pkperm
);
933 return ((__m128
)__result
);
936 extern __inline __m128i
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
937 _mm_cvttpd_epi32 (__m128d __A
)
941 const __v4si __vzero
= { 0, 0, 0, 0 };
943 /* VSX Vector truncate Double-Precision to integer and Convert to
944 Signed Integer Word format with Saturate. */
952 #ifdef __LITTLE_ENDIAN__
953 __temp
= vec_mergeo (__temp
, __temp
);
955 __temp
= vec_mergee (__temp
, __temp
);
957 __result
= (__v4si
) vec_vpkudum ((__vector
long long) __temp
,
958 (__vector
long long) __vzero
);
961 const __v16qu __pkperm
= {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
962 0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f };
963 __result
= (__v4si
) vec_perm ((__v16qu
) __temp
, (__v16qu
) __vzero
, __pkperm
);
967 return ((__m128i
) __result
);
970 extern __inline __m64
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
971 _mm_cvttpd_pi32 (__m128d __A
)
973 __m128i __result
= _mm_cvttpd_epi32 (__A
);
975 return (__m64
) __result
[0];
978 extern __inline
int __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
979 _mm_cvtsi128_si32 (__m128i __A
)
981 return ((__v4si
)__A
)[0];
985 extern __inline __m128d
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
986 _mm_cvtpi32_pd (__m64 __A
)
992 __temp
= (__v4si
)vec_splats (__A
);
993 __tmp2
= (__v2di
)vec_unpackl (__temp
);
994 __result
= vec_ctf ((__vector
signed long long) __tmp2
, 0);
995 return (__m128d
)__result
;
999 extern __inline __m128i
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1000 _mm_cvtps_epi32 (__m128 __A
)
1005 __rounded
= vec_rint((__v4sf
) __A
);
1006 __result
= vec_cts (__rounded
, 0);
1007 return (__m128i
) __result
;
1010 extern __inline __m128i
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1011 _mm_cvttps_epi32 (__m128 __A
)
1015 __result
= vec_cts ((__v4sf
) __A
, 0);
1016 return (__m128i
) __result
;
1019 extern __inline __m128d
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1020 _mm_cvtps_pd (__m128 __A
)
1022 /* Check if vec_doubleh is defined by <altivec.h>. If so use that. */
1024 return (__m128d
) vec_doubleh ((__v4sf
)__A
);
1026 /* Otherwise the compiler is not current and so need to generate the
1028 __v4sf __a
= (__v4sf
)__A
;
1031 #ifdef __LITTLE_ENDIAN__
1032 /* The input float values are in elements {[0], [1]} but the convert
1033 instruction needs them in elements {[1], [3]}, So we use two
1034 shift left double vector word immediates to get the elements
1036 __temp
= __builtin_vsx_xxsldwi (__a
, __a
, 3);
1037 __temp
= __builtin_vsx_xxsldwi (__a
, __temp
, 2);
1039 /* The input float values are in elements {[0], [1]} but the convert
1040 instruction needs them in elements {[0], [2]}, So we use two
1041 shift left double vector word immediates to get the elements
1043 __temp
= vec_vmrghw (__a
, __a
);
1050 return (__m128d
) __result
;
1054 extern __inline
int __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1055 _mm_cvtsd_si32 (__m128d __A
)
1057 __v2df __rounded
= vec_rint((__v2df
) __A
);
1058 int __result
= ((__v2df
)__rounded
)[0];
1062 /* Intel intrinsic. */
1063 extern __inline
long long __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1064 _mm_cvtsd_si64 (__m128d __A
)
1066 __v2df __rounded
= vec_rint ((__v2df
) __A
);
1067 long long __result
= ((__v2df
) __rounded
)[0];
1072 /* Microsoft intrinsic. */
1073 extern __inline
long long __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1074 _mm_cvtsd_si64x (__m128d __A
)
1076 return _mm_cvtsd_si64 ((__v2df
)__A
);
1079 extern __inline
int __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1080 _mm_cvttsd_si32 (__m128d __A
)
1082 int __result
= ((__v2df
)__A
)[0];
1087 /* Intel intrinsic. */
1088 extern __inline
long long __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1089 _mm_cvttsd_si64 (__m128d __A
)
1091 long long __result
= ((__v2df
)__A
)[0];
1096 /* Microsoft intrinsic. */
1097 extern __inline
long long __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1098 _mm_cvttsd_si64x (__m128d __A
)
1100 return _mm_cvttsd_si64 (__A
);
1103 extern __inline __m128
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1104 _mm_cvtsd_ss (__m128 __A
, __m128d __B
)
1106 __v4sf __result
= (__v4sf
)__A
;
1108 #ifdef __LITTLE_ENDIAN__
1110 /* Copy double element[0] to element [1] for conversion. */
1111 __v2df __temp_b
= vec_splat((__v2df
)__B
, 0);
1113 /* Pre-rotate __A left 3 (logically right 1) elements. */
1114 __result
= __builtin_vsx_xxsldwi (__result
, __result
, 3);
1115 /* Convert double to single float scalar in a vector. */
1121 /* Shift the resulting scalar into vector element [0]. */
1122 __result
= __builtin_vsx_xxsldwi (__result
, __temp_s
, 1);
1124 __result
[0] = ((__v2df
)__B
)[0];
1126 return (__m128
) __result
;
1129 extern __inline __m128d
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1130 _mm_cvtsi32_sd (__m128d __A
, int __B
)
1132 __v2df __result
= (__v2df
)__A
;
1134 __result
[0] = __db
;
1135 return (__m128d
)__result
;
1138 /* Intel intrinsic. */
1139 extern __inline __m128d
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1140 _mm_cvtsi64_sd (__m128d __A
, long long __B
)
1142 __v2df __result
= (__v2df
)__A
;
1144 __result
[0] = __db
;
1145 return (__m128d
)__result
;
1148 /* Microsoft intrinsic. */
1149 extern __inline __m128d
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1150 _mm_cvtsi64x_sd (__m128d __A
, long long __B
)
1152 return _mm_cvtsi64_sd (__A
, __B
);
1155 extern __inline __m128d
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1156 _mm_cvtss_sd (__m128d __A
, __m128 __B
)
1158 #ifdef __LITTLE_ENDIAN__
1159 /* Use splat to move element [0] into position for the convert. */
1160 __v4sf __temp
= vec_splat ((__v4sf
)__B
, 0);
1162 /* Convert single float scalar to double in a vector. */
1168 return (__m128d
) vec_mergel (__res
, (__v2df
)__A
);
1170 __v2df __res
= (__v2df
)__A
;
1171 __res
[0] = ((__v4sf
)__B
) [0];
1172 return (__m128d
) __res
;
1176 extern __inline __m128d
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1177 _mm_shuffle_pd(__m128d __A
, __m128d __B
, const int __mask
)
1179 __vector
double __result
;
1180 const int __litmsk
= __mask
& 0x3;
1183 __result
= vec_mergeh (__A
, __B
);
1185 else if (__litmsk
== 1)
1186 __result
= vec_xxpermdi (__B
, __A
, 2);
1187 else if (__litmsk
== 2)
1188 __result
= vec_xxpermdi (__B
, __A
, 1);
1190 else if (__litmsk
== 1)
1191 __result
= vec_xxpermdi (__A
, __B
, 2);
1192 else if (__litmsk
== 2)
1193 __result
= vec_xxpermdi (__A
, __B
, 1);
1196 __result
= vec_mergel (__A
, __B
);
1201 extern __inline __m128d
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1202 _mm_unpackhi_pd (__m128d __A
, __m128d __B
)
1204 return (__m128d
) vec_mergel ((__v2df
)__A
, (__v2df
)__B
);
1207 extern __inline __m128d
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1208 _mm_unpacklo_pd (__m128d __A
, __m128d __B
)
1210 return (__m128d
) vec_mergeh ((__v2df
)__A
, (__v2df
)__B
);
1213 extern __inline __m128d
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1214 _mm_loadh_pd (__m128d __A
, double const *__B
)
1216 __v2df __result
= (__v2df
)__A
;
1217 __result
[1] = *__B
;
1218 return (__m128d
)__result
;
1221 extern __inline __m128d
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1222 _mm_loadl_pd (__m128d __A
, double const *__B
)
1224 __v2df __result
= (__v2df
)__A
;
1225 __result
[0] = *__B
;
1226 return (__m128d
)__result
;
1230 /* Intrinsic functions that require PowerISA 2.07 minimum. */
1232 /* Creates a 2-bit mask from the most significant bits of the DPFP values. */
1233 extern __inline
int __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1234 _mm_movemask_pd (__m128d __A
)
1237 return vec_extractm ((__v2du
) __A
);
1239 __vector
unsigned long long __result
;
1240 static const __vector
unsigned int __perm_mask
=
1242 #ifdef __LITTLE_ENDIAN__
1243 0x80800040, 0x80808080, 0x80808080, 0x80808080
1245 0x80808080, 0x80808080, 0x80808080, 0x80804000
1249 __result
= ((__vector
unsigned long long)
1250 vec_vbpermq ((__vector
unsigned char) __A
,
1251 (__vector
unsigned char) __perm_mask
));
1253 #ifdef __LITTLE_ENDIAN__
1258 #endif /* !_ARCH_PWR10 */
1260 #endif /* _ARCH_PWR8 */
1262 extern __inline __m128i
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1263 _mm_packs_epi16 (__m128i __A
, __m128i __B
)
1265 return (__m128i
) vec_packs ((__v8hi
) __A
, (__v8hi
)__B
);
1268 extern __inline __m128i
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1269 _mm_packs_epi32 (__m128i __A
, __m128i __B
)
1271 return (__m128i
) vec_packs ((__v4si
)__A
, (__v4si
)__B
);
1274 extern __inline __m128i
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1275 _mm_packus_epi16 (__m128i __A
, __m128i __B
)
1277 return (__m128i
) vec_packsu ((__v8hi
) __A
, (__v8hi
)__B
);
1280 extern __inline __m128i
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1281 _mm_unpackhi_epi8 (__m128i __A
, __m128i __B
)
1283 return (__m128i
) vec_mergel ((__v16qu
)__A
, (__v16qu
)__B
);
1286 extern __inline __m128i
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1287 _mm_unpackhi_epi16 (__m128i __A
, __m128i __B
)
1289 return (__m128i
) vec_mergel ((__v8hu
)__A
, (__v8hu
)__B
);
1292 extern __inline __m128i
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1293 _mm_unpackhi_epi32 (__m128i __A
, __m128i __B
)
1295 return (__m128i
) vec_mergel ((__v4su
)__A
, (__v4su
)__B
);
1298 extern __inline __m128i
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1299 _mm_unpackhi_epi64 (__m128i __A
, __m128i __B
)
1301 return (__m128i
) vec_mergel ((__vector
long long) __A
,
1302 (__vector
long long) __B
);
1305 extern __inline __m128i
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1306 _mm_unpacklo_epi8 (__m128i __A
, __m128i __B
)
1308 return (__m128i
) vec_mergeh ((__v16qu
)__A
, (__v16qu
)__B
);
1311 extern __inline __m128i
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1312 _mm_unpacklo_epi16 (__m128i __A
, __m128i __B
)
1314 return (__m128i
) vec_mergeh ((__v8hi
)__A
, (__v8hi
)__B
);
1317 extern __inline __m128i
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1318 _mm_unpacklo_epi32 (__m128i __A
, __m128i __B
)
1320 return (__m128i
) vec_mergeh ((__v4si
)__A
, (__v4si
)__B
);
1323 extern __inline __m128i
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1324 _mm_unpacklo_epi64 (__m128i __A
, __m128i __B
)
1326 return (__m128i
) vec_mergeh ((__vector
long long) __A
,
1327 (__vector
long long) __B
);
1330 extern __inline __m128i
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1331 _mm_add_epi8 (__m128i __A
, __m128i __B
)
1333 return (__m128i
) ((__v16qu
)__A
+ (__v16qu
)__B
);
1336 extern __inline __m128i
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1337 _mm_add_epi16 (__m128i __A
, __m128i __B
)
1339 return (__m128i
) ((__v8hu
)__A
+ (__v8hu
)__B
);
1342 extern __inline __m128i
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1343 _mm_add_epi32 (__m128i __A
, __m128i __B
)
1345 return (__m128i
) ((__v4su
)__A
+ (__v4su
)__B
);
1348 extern __inline __m128i
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1349 _mm_add_epi64 (__m128i __A
, __m128i __B
)
1351 return (__m128i
) ((__v2du
)__A
+ (__v2du
)__B
);
1354 extern __inline __m128i
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1355 _mm_adds_epi8 (__m128i __A
, __m128i __B
)
1357 return (__m128i
) vec_adds ((__v16qi
)__A
, (__v16qi
)__B
);
1360 extern __inline __m128i
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1361 _mm_adds_epi16 (__m128i __A
, __m128i __B
)
1363 return (__m128i
) vec_adds ((__v8hi
)__A
, (__v8hi
)__B
);
1366 extern __inline __m128i
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1367 _mm_adds_epu8 (__m128i __A
, __m128i __B
)
1369 return (__m128i
) vec_adds ((__v16qu
)__A
, (__v16qu
)__B
);
1372 extern __inline __m128i
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1373 _mm_adds_epu16 (__m128i __A
, __m128i __B
)
1375 return (__m128i
) vec_adds ((__v8hu
)__A
, (__v8hu
)__B
);
1378 extern __inline __m128i
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1379 _mm_sub_epi8 (__m128i __A
, __m128i __B
)
1381 return (__m128i
) ((__v16qu
)__A
- (__v16qu
)__B
);
1384 extern __inline __m128i
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1385 _mm_sub_epi16 (__m128i __A
, __m128i __B
)
1387 return (__m128i
) ((__v8hu
)__A
- (__v8hu
)__B
);
1390 extern __inline __m128i
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1391 _mm_sub_epi32 (__m128i __A
, __m128i __B
)
1393 return (__m128i
) ((__v4su
)__A
- (__v4su
)__B
);
1396 extern __inline __m128i
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1397 _mm_sub_epi64 (__m128i __A
, __m128i __B
)
1399 return (__m128i
) ((__v2du
)__A
- (__v2du
)__B
);
1402 extern __inline __m128i
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1403 _mm_subs_epi8 (__m128i __A
, __m128i __B
)
1405 return (__m128i
) vec_subs ((__v16qi
)__A
, (__v16qi
)__B
);
1408 extern __inline __m128i
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1409 _mm_subs_epi16 (__m128i __A
, __m128i __B
)
1411 return (__m128i
) vec_subs ((__v8hi
)__A
, (__v8hi
)__B
);
1414 extern __inline __m128i
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1415 _mm_subs_epu8 (__m128i __A
, __m128i __B
)
1417 return (__m128i
) vec_subs ((__v16qu
)__A
, (__v16qu
)__B
);
1420 extern __inline __m128i
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1421 _mm_subs_epu16 (__m128i __A
, __m128i __B
)
1423 return (__m128i
) vec_subs ((__v8hu
)__A
, (__v8hu
)__B
);
1426 extern __inline __m128i
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1427 _mm_madd_epi16 (__m128i __A
, __m128i __B
)
1429 __vector
signed int __zero
= {0, 0, 0, 0};
1431 return (__m128i
) vec_vmsumshm ((__v8hi
)__A
, (__v8hi
)__B
, __zero
);
1434 extern __inline __m128i
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1435 _mm_mulhi_epi16 (__m128i __A
, __m128i __B
)
1437 __vector
signed int __w0
, __w1
;
1439 __vector
unsigned char __xform1
= {
1440 #ifdef __LITTLE_ENDIAN__
1441 0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17,
1442 0x0A, 0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
1444 0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15,
1445 0x08, 0x09, 0x18, 0x19, 0x0C, 0x0D, 0x1C, 0x1D
1449 __w0
= vec_vmulesh ((__v8hi
)__A
, (__v8hi
)__B
);
1450 __w1
= vec_vmulosh ((__v8hi
)__A
, (__v8hi
)__B
);
1451 return (__m128i
) vec_perm (__w0
, __w1
, __xform1
);
1454 extern __inline __m128i
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1455 _mm_mullo_epi16 (__m128i __A
, __m128i __B
)
1457 return (__m128i
) ((__v8hi
)__A
* (__v8hi
)__B
);
1460 extern __inline __m64
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1461 _mm_mul_su32 (__m64 __A
, __m64 __B
)
1463 unsigned int __a
= __A
;
1464 unsigned int __b
= __B
;
1466 return ((__m64
)__a
* (__m64
)__b
);
1470 extern __inline __m128i
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1471 _mm_mul_epu32 (__m128i __A
, __m128i __B
)
1476 #ifdef __LITTLE_ENDIAN__
1477 /* VMX Vector Multiply Odd Unsigned Word. */
1481 : "v" (__A
), "v" (__B
)
1484 /* VMX Vector Multiply Even Unsigned Word. */
1488 : "v" (__A
), "v" (__B
)
1491 return (__m128i
) __result
;
1493 return (__m128i
) vec_mule ((__v4su
)__A
, (__v4su
)__B
);
1498 extern __inline __m128i
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1499 _mm_slli_epi16 (__m128i __A
, int __B
)
1502 __v8hi __result
= { 0, 0, 0, 0, 0, 0, 0, 0 };
1504 if (__B
>= 0 && __B
< 16)
1506 if (__builtin_constant_p(__B
))
1507 __lshift
= (__v8hu
) vec_splat_s16(__B
);
1509 __lshift
= vec_splats ((unsigned short) __B
);
1511 __result
= vec_sl ((__v8hi
) __A
, __lshift
);
1514 return (__m128i
) __result
;
1517 extern __inline __m128i
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1518 _mm_slli_epi32 (__m128i __A
, int __B
)
1521 __v4si __result
= { 0, 0, 0, 0 };
1523 if (__B
>= 0 && __B
< 32)
1525 if (__builtin_constant_p(__B
) && __B
< 16)
1526 __lshift
= (__v4su
) vec_splat_s32(__B
);
1528 __lshift
= vec_splats ((unsigned int) __B
);
1530 __result
= vec_sl ((__v4si
) __A
, __lshift
);
1533 return (__m128i
) __result
;
1537 extern __inline __m128i
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1538 _mm_slli_epi64 (__m128i __A
, int __B
)
1541 __v2di __result
= { 0, 0 };
1543 if (__B
>= 0 && __B
< 64)
1545 if (__builtin_constant_p(__B
) && __B
< 16)
1546 __lshift
= (__v2du
) vec_splat_s32(__B
);
1548 __lshift
= (__v2du
) vec_splats ((unsigned int) __B
);
1550 __result
= vec_sl ((__v2di
) __A
, __lshift
);
1553 return (__m128i
) __result
;
1557 extern __inline __m128i
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1558 _mm_srai_epi16 (__m128i __A
, int __B
)
1560 __v8hu __rshift
= { 15, 15, 15, 15, 15, 15, 15, 15 };
1565 if (__builtin_constant_p(__B
))
1566 __rshift
= (__v8hu
) vec_splat_s16(__B
);
1568 __rshift
= vec_splats ((unsigned short) __B
);
1570 __result
= vec_sra ((__v8hi
) __A
, __rshift
);
1572 return (__m128i
) __result
;
1575 extern __inline __m128i
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1576 _mm_srai_epi32 (__m128i __A
, int __B
)
1578 __v4su __rshift
= { 31, 31, 31, 31 };
1583 if (__builtin_constant_p(__B
))
1586 __rshift
= (__v4su
) vec_splat_s32(__B
);
1588 __rshift
= (__v4su
) vec_splats((unsigned int)__B
);
1591 __rshift
= vec_splats ((unsigned int) __B
);
1593 __result
= vec_sra ((__v4si
) __A
, __rshift
);
1595 return (__m128i
) __result
;
1598 extern __inline __m128i
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1599 _mm_bslli_si128 (__m128i __A
, const int __N
)
1602 const __v16qu __zeros
= { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
1606 else if (__N
> 0 && __N
< 16)
1607 #ifdef __LITTLE_ENDIAN__
1608 __result
= vec_sld ((__v16qu
) __A
, __zeros
, __N
);
1610 __result
= vec_sld (__zeros
, (__v16qu
) __A
, (16 - __N
));
1615 return (__m128i
) __result
;
1618 extern __inline __m128i
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1619 _mm_bsrli_si128 (__m128i __A
, const int __N
)
1622 const __v16qu __zeros
= { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
1626 else if (__N
> 0 && __N
< 16)
1627 #ifdef __LITTLE_ENDIAN__
1628 if (__builtin_constant_p(__N
))
1629 /* Would like to use Vector Shift Left Double by Octet
1630 Immediate here to use the immediate form and avoid
1631 load of __N * 8 value into a separate VR. */
1632 __result
= vec_sld (__zeros
, (__v16qu
) __A
, (16 - __N
));
1636 __v16qu __shift
= vec_splats((unsigned char)(__N
*8));
1637 #ifdef __LITTLE_ENDIAN__
1638 __result
= vec_sro ((__v16qu
)__A
, __shift
);
1640 __result
= vec_slo ((__v16qu
)__A
, __shift
);
1646 return (__m128i
) __result
;
1649 extern __inline __m128i
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1650 _mm_srli_si128 (__m128i __A
, const int __N
)
1652 return _mm_bsrli_si128 (__A
, __N
);
1655 extern __inline __m128i
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1656 _mm_slli_si128 (__m128i __A
, const int __N
)
1658 return _mm_bslli_si128 (__A
, __N
);
1661 extern __inline __m128i
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1663 _mm_srli_epi16 (__m128i __A
, int __B
)
1666 __v8hi __result
= { 0, 0, 0, 0, 0, 0, 0, 0 };
1670 if (__builtin_constant_p(__B
))
1671 __rshift
= (__v8hu
) vec_splat_s16(__B
);
1673 __rshift
= vec_splats ((unsigned short) __B
);
1675 __result
= vec_sr ((__v8hi
) __A
, __rshift
);
1678 return (__m128i
) __result
;
1681 extern __inline __m128i
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1682 _mm_srli_epi32 (__m128i __A
, int __B
)
1685 __v4si __result
= { 0, 0, 0, 0 };
1689 if (__builtin_constant_p(__B
))
1692 __rshift
= (__v4su
) vec_splat_s32(__B
);
1694 __rshift
= (__v4su
) vec_splats((unsigned int)__B
);
1697 __rshift
= vec_splats ((unsigned int) __B
);
1699 __result
= vec_sr ((__v4si
) __A
, __rshift
);
1702 return (__m128i
) __result
;
1706 extern __inline __m128i
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1707 _mm_srli_epi64 (__m128i __A
, int __B
)
1710 __v2di __result
= { 0, 0 };
1714 if (__builtin_constant_p(__B
))
1717 __rshift
= (__v2du
) vec_splat_s32(__B
);
1719 __rshift
= (__v2du
) vec_splats((unsigned long long)__B
);
1722 __rshift
= (__v2du
) vec_splats ((unsigned int) __B
);
1724 __result
= vec_sr ((__v2di
) __A
, __rshift
);
1727 return (__m128i
) __result
;
1731 extern __inline __m128i
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1732 _mm_sll_epi16 (__m128i __A
, __m128i __B
)
1735 __vector __bool
short __shmask
;
1736 const __v8hu __shmax
= { 15, 15, 15, 15, 15, 15, 15, 15 };
1739 #ifdef __LITTLE_ENDIAN__
1740 __lshift
= vec_splat ((__v8hu
) __B
, 0);
1742 __lshift
= vec_splat ((__v8hu
) __B
, 3);
1744 __shmask
= vec_cmple (__lshift
, __shmax
);
1745 __result
= vec_sl ((__v8hu
) __A
, __lshift
);
1746 __result
= vec_sel ((__v8hu
) __shmask
, __result
, __shmask
);
1748 return (__m128i
) __result
;
1751 extern __inline __m128i
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1752 _mm_sll_epi32 (__m128i __A
, __m128i __B
)
1755 __vector __bool
int __shmask
;
1756 const __v4su __shmax
= { 32, 32, 32, 32 };
1758 #ifdef __LITTLE_ENDIAN__
1759 __lshift
= vec_splat ((__v4su
) __B
, 0);
1761 __lshift
= vec_splat ((__v4su
) __B
, 1);
1763 __shmask
= vec_cmplt (__lshift
, __shmax
);
1764 __result
= vec_sl ((__v4su
) __A
, __lshift
);
1765 __result
= vec_sel ((__v4su
) __shmask
, __result
, __shmask
);
1767 return (__m128i
) __result
;
1771 extern __inline __m128i
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1772 _mm_sll_epi64 (__m128i __A
, __m128i __B
)
1775 __vector __bool
long long __shmask
;
1776 const __v2du __shmax
= { 64, 64 };
1779 __lshift
= vec_splat ((__v2du
) __B
, 0);
1780 __shmask
= vec_cmplt (__lshift
, __shmax
);
1781 __result
= vec_sl ((__v2du
) __A
, __lshift
);
1782 __result
= vec_sel ((__v2du
) __shmask
, __result
, __shmask
);
1784 return (__m128i
) __result
;
1788 extern __inline __m128i
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1789 _mm_sra_epi16 (__m128i __A
, __m128i __B
)
1791 const __v8hu __rshmax
= { 15, 15, 15, 15, 15, 15, 15, 15 };
1795 #ifdef __LITTLE_ENDIAN__
1796 __rshift
= vec_splat ((__v8hu
)__B
, 0);
1798 __rshift
= vec_splat ((__v8hu
)__B
, 3);
1800 __rshift
= vec_min (__rshift
, __rshmax
);
1801 __result
= vec_sra ((__v8hi
) __A
, __rshift
);
1803 return (__m128i
) __result
;
1806 extern __inline __m128i
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1807 _mm_sra_epi32 (__m128i __A
, __m128i __B
)
1809 const __v4su __rshmax
= { 31, 31, 31, 31 };
1813 #ifdef __LITTLE_ENDIAN__
1814 __rshift
= vec_splat ((__v4su
)__B
, 0);
1816 __rshift
= vec_splat ((__v4su
)__B
, 1);
1818 __rshift
= vec_min (__rshift
, __rshmax
);
1819 __result
= vec_sra ((__v4si
) __A
, __rshift
);
1821 return (__m128i
) __result
;
1824 extern __inline __m128i
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1825 _mm_srl_epi16 (__m128i __A
, __m128i __B
)
1828 __vector __bool
short __shmask
;
1829 const __v8hu __shmax
= { 15, 15, 15, 15, 15, 15, 15, 15 };
1832 #ifdef __LITTLE_ENDIAN__
1833 __rshift
= vec_splat ((__v8hu
) __B
, 0);
1835 __rshift
= vec_splat ((__v8hu
) __B
, 3);
1837 __shmask
= vec_cmple (__rshift
, __shmax
);
1838 __result
= vec_sr ((__v8hu
) __A
, __rshift
);
1839 __result
= vec_sel ((__v8hu
) __shmask
, __result
, __shmask
);
1841 return (__m128i
) __result
;
1844 extern __inline __m128i
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1845 _mm_srl_epi32 (__m128i __A
, __m128i __B
)
1848 __vector __bool
int __shmask
;
1849 const __v4su __shmax
= { 32, 32, 32, 32 };
1852 #ifdef __LITTLE_ENDIAN__
1853 __rshift
= vec_splat ((__v4su
) __B
, 0);
1855 __rshift
= vec_splat ((__v4su
) __B
, 1);
1857 __shmask
= vec_cmplt (__rshift
, __shmax
);
1858 __result
= vec_sr ((__v4su
) __A
, __rshift
);
1859 __result
= vec_sel ((__v4su
) __shmask
, __result
, __shmask
);
1861 return (__m128i
) __result
;
1865 extern __inline __m128i
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1866 _mm_srl_epi64 (__m128i __A
, __m128i __B
)
1869 __vector __bool
long long __shmask
;
1870 const __v2du __shmax
= { 64, 64 };
1873 __rshift
= vec_splat ((__v2du
) __B
, 0);
1874 __shmask
= vec_cmplt (__rshift
, __shmax
);
1875 __result
= vec_sr ((__v2du
) __A
, __rshift
);
1876 __result
= vec_sel ((__v2du
) __shmask
, __result
, __shmask
);
1878 return (__m128i
) __result
;
1882 extern __inline __m128d
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1883 _mm_and_pd (__m128d __A
, __m128d __B
)
1885 return (vec_and ((__v2df
) __A
, (__v2df
) __B
));
1888 extern __inline __m128d
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1889 _mm_andnot_pd (__m128d __A
, __m128d __B
)
1891 return (vec_andc ((__v2df
) __B
, (__v2df
) __A
));
1894 extern __inline __m128d
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1895 _mm_or_pd (__m128d __A
, __m128d __B
)
1897 return (vec_or ((__v2df
) __A
, (__v2df
) __B
));
1900 extern __inline __m128d
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1901 _mm_xor_pd (__m128d __A
, __m128d __B
)
1903 return (vec_xor ((__v2df
) __A
, (__v2df
) __B
));
1906 extern __inline __m128i
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1907 _mm_and_si128 (__m128i __A
, __m128i __B
)
1909 return (__m128i
)vec_and ((__v2di
) __A
, (__v2di
) __B
);
1912 extern __inline __m128i
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1913 _mm_andnot_si128 (__m128i __A
, __m128i __B
)
1915 return (__m128i
)vec_andc ((__v2di
) __B
, (__v2di
) __A
);
1918 extern __inline __m128i
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1919 _mm_or_si128 (__m128i __A
, __m128i __B
)
1921 return (__m128i
)vec_or ((__v2di
) __A
, (__v2di
) __B
);
1924 extern __inline __m128i
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1925 _mm_xor_si128 (__m128i __A
, __m128i __B
)
1927 return (__m128i
)vec_xor ((__v2di
) __A
, (__v2di
) __B
);
1930 extern __inline __m128i
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1931 _mm_cmpeq_epi8 (__m128i __A
, __m128i __B
)
1933 return (__m128i
) vec_cmpeq ((__v16qi
) __A
, (__v16qi
)__B
);
1936 extern __inline __m128i
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1937 _mm_cmpeq_epi16 (__m128i __A
, __m128i __B
)
1939 return (__m128i
) vec_cmpeq ((__v8hi
) __A
, (__v8hi
)__B
);
1942 extern __inline __m128i
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1943 _mm_cmpeq_epi32 (__m128i __A
, __m128i __B
)
1945 return (__m128i
) vec_cmpeq ((__v4si
) __A
, (__v4si
)__B
);
1948 extern __inline __m128i
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1949 _mm_cmplt_epi8 (__m128i __A
, __m128i __B
)
1951 return (__m128i
) vec_cmplt ((__v16qi
) __A
, (__v16qi
)__B
);
1954 extern __inline __m128i
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1955 _mm_cmplt_epi16 (__m128i __A
, __m128i __B
)
1957 return (__m128i
) vec_cmplt ((__v8hi
) __A
, (__v8hi
)__B
);
1960 extern __inline __m128i
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1961 _mm_cmplt_epi32 (__m128i __A
, __m128i __B
)
1963 return (__m128i
) vec_cmplt ((__v4si
) __A
, (__v4si
)__B
);
1966 extern __inline __m128i
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1967 _mm_cmpgt_epi8 (__m128i __A
, __m128i __B
)
1969 return (__m128i
) vec_cmpgt ((__v16qi
) __A
, (__v16qi
)__B
);
1972 extern __inline __m128i
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1973 _mm_cmpgt_epi16 (__m128i __A
, __m128i __B
)
1975 return (__m128i
) vec_cmpgt ((__v8hi
) __A
, (__v8hi
)__B
);
1978 extern __inline __m128i
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1979 _mm_cmpgt_epi32 (__m128i __A
, __m128i __B
)
1981 return (__m128i
) vec_cmpgt ((__v4si
) __A
, (__v4si
)__B
);
1984 extern __inline
int __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1985 _mm_extract_epi16 (__m128i
const __A
, int const __N
)
1987 return (unsigned short) ((__v8hi
)__A
)[__N
& 7];
1990 extern __inline __m128i
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
1991 _mm_insert_epi16 (__m128i
const __A
, int const __D
, int const __N
)
1993 __v8hi __result
= (__v8hi
)__A
;
1995 __result
[(__N
& 7)] = __D
;
1997 return (__m128i
) __result
;
2000 extern __inline __m128i
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
2001 _mm_max_epi16 (__m128i __A
, __m128i __B
)
2003 return (__m128i
) vec_max ((__v8hi
)__A
, (__v8hi
)__B
);
2006 extern __inline __m128i
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
2007 _mm_max_epu8 (__m128i __A
, __m128i __B
)
2009 return (__m128i
) vec_max ((__v16qu
) __A
, (__v16qu
)__B
);
2012 extern __inline __m128i
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
2013 _mm_min_epi16 (__m128i __A
, __m128i __B
)
2015 return (__m128i
) vec_min ((__v8hi
) __A
, (__v8hi
)__B
);
2018 extern __inline __m128i
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
2019 _mm_min_epu8 (__m128i __A
, __m128i __B
)
2021 return (__m128i
) vec_min ((__v16qu
) __A
, (__v16qu
)__B
);
2026 /* Intrinsic functions that require PowerISA 2.07 minimum. */
2028 /* Return a mask created from the most significant bit of each 8-bit
2030 extern __inline
int __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
2031 _mm_movemask_epi8 (__m128i __A
)
2034 return vec_extractm ((__v16qu
) __A
);
2036 __vector
unsigned long long __result
;
2037 static const __vector
unsigned char __perm_mask
=
2039 0x78, 0x70, 0x68, 0x60, 0x58, 0x50, 0x48, 0x40,
2040 0x38, 0x30, 0x28, 0x20, 0x18, 0x10, 0x08, 0x00
2043 __result
= ((__vector
unsigned long long)
2044 vec_vbpermq ((__vector
unsigned char) __A
,
2045 (__vector
unsigned char) __perm_mask
));
2047 #ifdef __LITTLE_ENDIAN__
2052 #endif /* !_ARCH_PWR10 */
2054 #endif /* _ARCH_PWR8 */
2056 extern __inline __m128i
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
2057 _mm_mulhi_epu16 (__m128i __A
, __m128i __B
)
2060 __v16qu __xform1
= {
2061 #ifdef __LITTLE_ENDIAN__
2062 0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17,
2063 0x0A, 0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
2065 0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15,
2066 0x08, 0x09, 0x18, 0x19, 0x0C, 0x0D, 0x1C, 0x1D
2070 __w0
= vec_vmuleuh ((__v8hu
)__A
, (__v8hu
)__B
);
2071 __w1
= vec_vmulouh ((__v8hu
)__A
, (__v8hu
)__B
);
2072 return (__m128i
) vec_perm (__w0
, __w1
, __xform1
);
2075 extern __inline __m128i
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
2076 _mm_shufflehi_epi16 (__m128i __A
, const int __mask
)
2078 unsigned long __element_selector_98
= __mask
& 0x03;
2079 unsigned long __element_selector_BA
= (__mask
>> 2) & 0x03;
2080 unsigned long __element_selector_DC
= (__mask
>> 4) & 0x03;
2081 unsigned long __element_selector_FE
= (__mask
>> 6) & 0x03;
2082 static const unsigned short __permute_selectors
[4] =
2084 #ifdef __LITTLE_ENDIAN__
2085 0x0908, 0x0B0A, 0x0D0C, 0x0F0E
2087 0x0809, 0x0A0B, 0x0C0D, 0x0E0F
2091 #ifdef __LITTLE_ENDIAN__
2092 { 0x1716151413121110UL
, 0UL};
2094 { 0x1011121314151617UL
, 0UL};
2099 __t
.as_short
[0] = __permute_selectors
[__element_selector_98
];
2100 __t
.as_short
[1] = __permute_selectors
[__element_selector_BA
];
2101 __t
.as_short
[2] = __permute_selectors
[__element_selector_DC
];
2102 __t
.as_short
[3] = __permute_selectors
[__element_selector_FE
];
2103 __pmask
[1] = __t
.as_m64
;
2105 __r
= vec_perm (__a
, __a
, (__vector
unsigned char)__pmask
);
2106 return (__m128i
) __r
;
2109 extern __inline __m128i
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
2110 _mm_shufflelo_epi16 (__m128i __A
, const int __mask
)
2112 unsigned long __element_selector_10
= __mask
& 0x03;
2113 unsigned long __element_selector_32
= (__mask
>> 2) & 0x03;
2114 unsigned long __element_selector_54
= (__mask
>> 4) & 0x03;
2115 unsigned long __element_selector_76
= (__mask
>> 6) & 0x03;
2116 static const unsigned short __permute_selectors
[4] =
2118 #ifdef __LITTLE_ENDIAN__
2119 0x0100, 0x0302, 0x0504, 0x0706
2121 0x0001, 0x0203, 0x0405, 0x0607
2125 #ifdef __LITTLE_ENDIAN__
2126 { 0UL, 0x1f1e1d1c1b1a1918UL
};
2128 { 0UL, 0x18191a1b1c1d1e1fUL
};
2132 __t
.as_short
[0] = __permute_selectors
[__element_selector_10
];
2133 __t
.as_short
[1] = __permute_selectors
[__element_selector_32
];
2134 __t
.as_short
[2] = __permute_selectors
[__element_selector_54
];
2135 __t
.as_short
[3] = __permute_selectors
[__element_selector_76
];
2136 __pmask
[0] = __t
.as_m64
;
2138 __r
= vec_perm (__a
, __a
, (__vector
unsigned char)__pmask
);
2139 return (__m128i
) __r
;
2142 extern __inline __m128i
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
2143 _mm_shuffle_epi32 (__m128i __A
, const int __mask
)
2145 unsigned long __element_selector_10
= __mask
& 0x03;
2146 unsigned long __element_selector_32
= (__mask
>> 2) & 0x03;
2147 unsigned long __element_selector_54
= (__mask
>> 4) & 0x03;
2148 unsigned long __element_selector_76
= (__mask
>> 6) & 0x03;
2149 static const unsigned int __permute_selectors
[4] =
2151 #ifdef __LITTLE_ENDIAN__
2152 0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
2154 0x00010203, 0x04050607, 0x08090A0B, 0x0C0D0E0F
2159 __t
[0] = __permute_selectors
[__element_selector_10
];
2160 __t
[1] = __permute_selectors
[__element_selector_32
];
2161 __t
[2] = __permute_selectors
[__element_selector_54
] + 0x10101010;
2162 __t
[3] = __permute_selectors
[__element_selector_76
] + 0x10101010;
2163 return (__m128i
)vec_perm ((__v4si
) __A
, (__v4si
)__A
, (__vector
unsigned char)__t
);
2166 extern __inline
void __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
2167 _mm_maskmoveu_si128 (__m128i __A
, __m128i __B
, char *__C
)
2169 __v2du __hibit
= { 0x7f7f7f7f7f7f7f7fUL
, 0x7f7f7f7f7f7f7f7fUL
};
2170 __v16qu __mask
, __tmp
;
2171 __m128i_u
*__p
= (__m128i_u
*)__C
;
2173 __tmp
= (__v16qu
)_mm_loadu_si128(__p
);
2174 __mask
= (__v16qu
)vec_cmpgt ((__v16qu
)__B
, (__v16qu
)__hibit
);
2175 __tmp
= vec_sel (__tmp
, (__v16qu
)__A
, __mask
);
2176 _mm_storeu_si128 (__p
, (__m128i
)__tmp
);
2179 extern __inline __m128i
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
2180 _mm_avg_epu8 (__m128i __A
, __m128i __B
)
2182 return (__m128i
) vec_avg ((__v16qu
)__A
, (__v16qu
)__B
);
2185 extern __inline __m128i
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
2186 _mm_avg_epu16 (__m128i __A
, __m128i __B
)
2188 return (__m128i
) vec_avg ((__v8hu
)__A
, (__v8hu
)__B
);
2192 extern __inline __m128i
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
2193 _mm_sad_epu8 (__m128i __A
, __m128i __B
)
2198 const __v4su __zero
= { 0, 0, 0, 0 };
2201 __a
= (__v16qu
) __A
;
2202 __b
= (__v16qu
) __B
;
2204 __v16qu __vmin
= vec_min (__a
, __b
);
2205 __v16qu __vmax
= vec_max (__a
, __b
);
2206 __vabsdiff
= vec_sub (__vmax
, __vmin
);
2208 __vabsdiff
= vec_absd (__a
, __b
);
2210 /* Sum four groups of bytes into integers. */
2211 __vsum
= (__vector
signed int) vec_sum4s (__vabsdiff
, __zero
);
2212 #ifdef __LITTLE_ENDIAN__
2213 /* Sum across four integers with two integer results. */
2214 __asm__ ("vsum2sws %0,%1,%2" : "=v" (__result
) : "v" (__vsum
), "v" (__zero
));
2215 /* Note: vec_sum2s could be used here, but on little-endian, vector
2216 shifts are added that are not needed for this use-case.
2217 A vector shift to correctly position the 32-bit integer results
2218 (currently at [0] and [2]) to [1] and [3] would then need to be
2219 swapped back again since the desired results are two 64-bit
2220 integers ([1]|[0] and [3]|[2]). Thus, no shift is performed. */
2222 /* Sum across four integers with two integer results. */
2223 __result
= vec_sum2s (__vsum
, (__vector
signed int) __zero
);
2224 /* Rotate the sums into the correct position. */
2225 __result
= vec_sld (__result
, __result
, 6);
2227 return (__m128i
) __result
;
/* Non-temporal store of the 32-bit value __B to *__A.  PowerISA has
   no true non-temporal store; approximate the cache hint with a data
   cache block touch for store transient before the plain store.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_stream_si32 (int *__A, int __B)
{
  /* Use the data cache block touch for store transient.  */
  __asm__ (
    "dcbtstt 0,%0"
    :
    : "b" (__A)
    : "memory"
    );
  *__A = __B;
}
/* Non-temporal store of the 64-bit value __B to *__A, using a data
   cache block touch for store transient as the cache hint.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_stream_si64 (long long int *__A, long long int __B)
{
  /* Use the data cache block touch for store transient.  */
  __asm__ (
    "	dcbtstt	0,%0"
    :
    : "b" (__A)
    : "memory"
    );
  *__A = __B;
}
2256 extern __inline
void __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
2257 _mm_stream_si128 (__m128i
*__A
, __m128i __B
)
2259 /* Use the data cache block touch for store transient. */
2269 extern __inline
void __attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
2270 _mm_stream_pd (double *__A
, __m128d __B
)
2272 /* Use the data cache block touch for store transient. */
2279 *(__m128d
*)__A
= __B
;
/* Flush the cache line containing __A to memory (PowerISA dcbf
   stands in for the x86 clflush instruction).  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_clflush (void const *__A)
{
  /* Use the data cache block flush.  */
  __asm__ (
    "dcbf 0,%0"
    :
    : "b" (__A)
    : "memory"
    );
}
/* Load fence.  NOTE(review): the line carrying the function name was
   lost from this copy; upstream GCC names this _mm_lfence — confirm
   against gcc/config/rs6000/emmintrin.h.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_lfence (void)
{
  /* Use light weight sync for load to load ordering.  */
  __atomic_thread_fence (__ATOMIC_RELEASE);
}
/* Full memory fence.  NOTE(review): the line carrying the function
   name was lost from this copy; upstream GCC names this _mm_mfence —
   confirm against gcc/config/rs6000/emmintrin.h.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mfence (void)
{
  /* Use heavy weight sync for any to any ordering.  */
  __atomic_thread_fence (__ATOMIC_SEQ_CST);
}
2308 extern __inline __m128i
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
2309 _mm_cvtsi32_si128 (int __A
)
2311 return _mm_set_epi32 (0, 0, 0, __A
);
2314 extern __inline __m128i
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
2315 _mm_cvtsi64_si128 (long long __A
)
2317 return __extension__ (__m128i
)(__v2di
){ __A
, 0LL };
2320 /* Microsoft intrinsic. */
2321 extern __inline __m128i
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
2322 _mm_cvtsi64x_si128 (long long __A
)
2324 return __extension__ (__m128i
)(__v2di
){ __A
, 0LL };
2327 /* Casts between various SP, DP, INT vector types. Note that these do no
2328 conversion of values, they just change the type. */
2329 extern __inline __m128
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
2330 _mm_castpd_ps(__m128d __A
)
2332 return (__m128
) __A
;
2335 extern __inline __m128i
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
2336 _mm_castpd_si128(__m128d __A
)
2338 return (__m128i
) __A
;
2341 extern __inline __m128d
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
2342 _mm_castps_pd(__m128 __A
)
2344 return (__m128d
) __A
;
2347 extern __inline __m128i
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
2348 _mm_castps_si128(__m128 __A
)
2350 return (__m128i
) __A
;
2353 extern __inline __m128
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
2354 _mm_castsi128_ps(__m128i __A
)
2356 return (__m128
) __A
;
2359 extern __inline __m128d
__attribute__((__gnu_inline__
, __always_inline__
, __artificial__
))
2360 _mm_castsi128_pd(__m128i __A
)
2362 return (__m128d
) __A
;
2365 #endif /* EMMINTRIN_H_ */