/* Copyright (C) 2003-2024 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 9.0.  */

#ifndef NO_WARN_X86_INTRINSICS
/* This header is distributed to simplify porting x86_64 code that
   makes explicit use of Intel intrinsics to powerpc64le.
   It is the user's responsibility to determine if the results are
   acceptable and make additional changes as necessary.
   Note that much code that uses Intel intrinsics can be rewritten in
   standard C or GNU C extensions, which are more portable and better
   optimized across multiple targets.  */
#endif

#ifndef TMMINTRIN_H_
#define TMMINTRIN_H_

#include <altivec.h>
#include <assert.h>

/* We need definitions from the SSE header files.  */
#include <pmmintrin.h>
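
/* _mm_abs_epi{8,16,32}: absolute value of each signed element of the
   128-bit argument, mapped directly onto the vec_abs built-in.  */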
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_abs_epi16 (__m128i __A)
{
  return (__m128i) vec_abs ((__v8hi) __A);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_abs_epi32 (__m128i __A)
{
  return (__m128i) vec_abs ((__v4si) __A);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_abs_epi8 (__m128i __A)
{
  return (__m128i) vec_abs ((__v16qi) __A);
}
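
/* The __m64 variants splat the 64-bit operand into both halves of a
   vector register, reuse vec_abs, and return element 0 of the result.  */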
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_abs_pi16 (__m64 __A)
{
  __v8hi __B = (__v8hi) (__v2du) { __A, __A };
  return (__m64) ((__v2du) vec_abs (__B))[0];
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_abs_pi32 (__m64 __A)
{
  __v4si __B = (__v4si) (__v2du) { __A, __A };
  return (__m64) ((__v2du) vec_abs (__B))[0];
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_abs_pi8 (__m64 __A)
{
  __v16qi __B = (__v16qi) (__v2du) { __A, __A };
  return (__m64) ((__v2du) vec_abs (__B))[0];
}
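
/* _mm_alignr_epi8: concatenate __B (low half) and __A (high half) into a
   32-byte value, shift right by __count bytes, and return the low 16
   bytes.  A compile-time-constant __count below 16 maps onto vec_sld;
   other counts fall back to octet shifts of the individual operands.  */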
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_alignr_epi8 (__m128i __A, __m128i __B, const unsigned int __count)
{
  if (__builtin_constant_p (__count) && __count < 16)
    {
#ifdef __LITTLE_ENDIAN__
      __A = (__m128i) vec_reve ((__v16qu) __A);
      __B = (__m128i) vec_reve ((__v16qu) __B);
#endif
      __A = (__m128i) vec_sld ((__v16qu) __B, (__v16qu) __A, __count);
#ifdef __LITTLE_ENDIAN__
      __A = (__m128i) vec_reve ((__v16qu) __A);
#endif
      return __A;
    }

  if (__count == 0)
    return __B;

  if (__count >= 16)
    {
      if (__count >= 32)
        {
          const __v16qu __zero = { 0 };
          return (__m128i) __zero;
        }
      else
        {
          const __v16qu __shift =
            vec_splats ((unsigned char) ((__count - 16) * 8));
#ifdef __LITTLE_ENDIAN__
          return (__m128i) vec_sro ((__v16qu) __A, __shift);
#else
          return (__m128i) vec_slo ((__v16qu) __A, __shift);
#endif
        }
    }
  else
    {
      const __v16qu __shiftA =
        vec_splats ((unsigned char) ((16 - __count) * 8));
      const __v16qu __shiftB = vec_splats ((unsigned char) (__count * 8));
#ifdef __LITTLE_ENDIAN__
      __A = (__m128i) vec_slo ((__v16qu) __A, __shiftA);
      __B = (__m128i) vec_sro ((__v16qu) __B, __shiftB);
#else
      __A = (__m128i) vec_sro ((__v16qu) __A, __shiftA);
      __B = (__m128i) vec_slo ((__v16qu) __B, __shiftB);
#endif
      return (__m128i) vec_or ((__v16qu) __A, (__v16qu) __B);
    }
}
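
/* _mm_alignr_pi8: the 64-bit variant; counts of 16 bytes or more yield
   zero, matching x86 palignr behavior on 64-bit operands.  */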
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_alignr_pi8 (__m64 __A, __m64 __B, unsigned int __count)
{
  if (__count < 16)
    {
      __v2du __C = { __B, __A };
#ifdef __LITTLE_ENDIAN__
      const __v4su __shift = { __count << 3, 0, 0, 0 };
      __C = (__v2du) vec_sro ((__v16qu) __C, (__v16qu) __shift);
#else
      const __v4su __shift = { 0, 0, 0, __count << 3 };
      __C = (__v2du) vec_slo ((__v16qu) __C, (__v16qu) __shift);
#endif
      return (__m64) __C[0];
    }
  else
    {
      const __m64 __zero = { 0 };
      return __zero;
    }
}
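
/* _mm_hadd_epi{16,32}: horizontal add.  Even/odd permutes gather the
   first and second element of each adjacent pair from __A and __B, and a
   single vec_add produces all the pairwise sums.  */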
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadd_epi16 (__m128i __A, __m128i __B)
{
  const __v16qu __P =
    { 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
  const __v16qu __Q =
    { 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 };
  __v8hi __C = vec_perm ((__v8hi) __A, (__v8hi) __B, __P);
  __v8hi __D = vec_perm ((__v8hi) __A, (__v8hi) __B, __Q);
  return (__m128i) vec_add (__C, __D);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadd_epi32 (__m128i __A, __m128i __B)
{
  const __v16qu __P =
    { 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27 };
  const __v16qu __Q =
    { 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 };
  __v4si __C = vec_perm ((__v4si) __A, (__v4si) __B, __P);
  __v4si __D = vec_perm ((__v4si) __A, (__v4si) __B, __Q);
  return (__m128i) vec_add (__C, __D);
}
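
/* The __m64 horizontal adds pack both operands into one vector register,
   permute within it, and return the doubleword that holds the pairwise
   sums.  */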
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadd_pi16 (__m64 __A, __m64 __B)
{
  __v8hi __C = (__v8hi) (__v2du) { __A, __B };
  const __v16qu __P =
    { 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 4, 5, 8, 9, 12, 13 };
  const __v16qu __Q =
    { 2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15 };
  __v8hi __D = vec_perm (__C, __C, __Q);
  __C = vec_perm (__C, __C, __P);
  __C = vec_add (__C, __D);
  return (__m64) ((__v2du) __C)[1];
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadd_pi32 (__m64 __A, __m64 __B)
{
  __v4si __C = (__v4si) (__v2du) { __A, __B };
  const __v16qu __P =
    { 0, 1, 2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 8, 9, 10, 11 };
  const __v16qu __Q =
    { 4, 5, 6, 7, 12, 13, 14, 15, 4, 5, 6, 7, 12, 13, 14, 15 };
  __v4si __D = vec_perm (__C, __C, __Q);
  __C = vec_perm (__C, __C, __P);
  __C = vec_add (__C, __D);
  return (__m64) ((__v2du) __C)[1];
}
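
/* _mm_hadds_epi16 / _mm_hadds_pi16: horizontal add with signed
   saturation; vec_sum4s forms the pairwise sums and a saturating pack
   narrows them back to 16 bits.  */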
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadds_epi16 (__m128i __A, __m128i __B)
{
  __v4si __C = { 0 }, __D = { 0 };
  __C = vec_sum4s ((__v8hi) __A, __C);
  __D = vec_sum4s ((__v8hi) __B, __D);
  __C = (__v4si) vec_packs (__C, __D);
  return (__m128i) __C;
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadds_pi16 (__m64 __A, __m64 __B)
{
  const __v4si __zero = { 0 };
  __v8hi __C = (__v8hi) (__v2du) { __A, __B };
  __v4si __D = vec_sum4s (__C, __zero);
  __C = vec_packs (__D, __D);
  return (__m64) ((__v2du) __C)[1];
}
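
/* _mm_hsub_epi{16,32}: horizontal subtract; same even/odd permute scheme
   as the horizontal adds, with vec_sub producing first-minus-second for
   each pair.  */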
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsub_epi16 (__m128i __A, __m128i __B)
{
  const __v16qu __P =
    { 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
  const __v16qu __Q =
    { 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 };
  __v8hi __C = vec_perm ((__v8hi) __A, (__v8hi) __B, __P);
  __v8hi __D = vec_perm ((__v8hi) __A, (__v8hi) __B, __Q);
  return (__m128i) vec_sub (__C, __D);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsub_epi32 (__m128i __A, __m128i __B)
{
  const __v16qu __P =
    { 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27 };
  const __v16qu __Q =
    { 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 };
  __v4si __C = vec_perm ((__v4si) __A, (__v4si) __B, __P);
  __v4si __D = vec_perm ((__v4si) __A, (__v4si) __B, __Q);
  return (__m128i) vec_sub (__C, __D);
}
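
/* 64-bit horizontal subtracts, using the in-register permute scheme of
   the __m64 horizontal adds above.  */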
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsub_pi16 (__m64 __A, __m64 __B)
{
  const __v16qu __P =
    { 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 4, 5, 8, 9, 12, 13 };
  const __v16qu __Q =
    { 2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15 };
  __v8hi __C = (__v8hi) (__v2du) { __A, __B };
  __v8hi __D = vec_perm (__C, __C, __Q);
  __C = vec_perm (__C, __C, __P);
  __C = vec_sub (__C, __D);
  return (__m64) ((__v2du) __C)[1];
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsub_pi32 (__m64 __A, __m64 __B)
{
  const __v16qu __P =
    { 0, 1, 2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 8, 9, 10, 11 };
  const __v16qu __Q =
    { 4, 5, 6, 7, 12, 13, 14, 15, 4, 5, 6, 7, 12, 13, 14, 15 };
  __v4si __C = (__v4si) (__v2du) { __A, __B };
  __v4si __D = vec_perm (__C, __C, __Q);
  __C = vec_perm (__C, __C, __P);
  __C = vec_sub (__C, __D);
  return (__m64) ((__v2du) __C)[1];
}
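
/* Saturating horizontal subtracts (_mm_hsubs_epi16 / _mm_hsubs_pi16),
   with vec_subs providing the signed saturation.  */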
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsubs_epi16 (__m128i __A, __m128i __B)
{
  const __v16qu __P =
    { 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
  const __v16qu __Q =
    { 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 };
  __v8hi __C = vec_perm ((__v8hi) __A, (__v8hi) __B, __P);
  __v8hi __D = vec_perm ((__v8hi) __A, (__v8hi) __B, __Q);
  return (__m128i) vec_subs (__C, __D);
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsubs_pi16 (__m64 __A, __m64 __B)
{
  const __v16qu __P =
    { 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 4, 5, 8, 9, 12, 13 };
  const __v16qu __Q =
    { 2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15 };
  __v8hi __C = (__v8hi) (__v2du) { __A, __B };
  __v8hi __D = vec_perm (__C, __C, __P);
  __v8hi __E = vec_perm (__C, __C, __Q);
  __C = vec_subs (__D, __E);
  return (__m64) ((__v2du) __C)[1];
}
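
/* _mm_shuffle_epi8 / _mm_shuffle_pi8 (pshufb): select bytes of __A using
   the low bits of each byte of __B, writing zero wherever the
   corresponding byte of __B is negative.  vec_perm performs the byte
   selection and vec_sel applies the zeroing mask.  */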
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_epi8 (__m128i __A, __m128i __B)
{
  const __v16qi __zero = { 0 };
  __vector __bool char __select = vec_cmplt ((__v16qi) __B, __zero);
  __v16qi __C = vec_perm ((__v16qi) __A, (__v16qi) __A, (__v16qu) __B);
  return (__m128i) vec_sel (__C, __zero, __select);
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_pi8 (__m64 __A, __m64 __B)
{
  const __v16qi __zero = { 0 };
  __v16qi __C = (__v16qi) (__v2du) { __A, __A };
  __v16qi __D = (__v16qi) (__v2du) { __B, __B };
  __vector __bool char __select = vec_cmplt ((__v16qi) __D, __zero);
  __C = vec_perm ((__v16qi) __C, (__v16qi) __C, (__v16qu) __D);
  __C = vec_sel (__C, __zero, __select);
  return (__m64) ((__v2du) (__C))[0];
}
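
/* _mm_sign_epi{8,16,32} (psignb/w/d): negate, pass through, or zero each
   element of __A according to whether the corresponding element of __B is
   negative, positive, or zero.  The comparison masks are combined into a
   -1/0/+1 multiplier that is applied with vec_mul.  These are guarded by
   _ARCH_PWR8, matching the vec_neg and element-wise vec_mul built-ins
   they rely on.  */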
#ifdef _ARCH_PWR8
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_epi8 (__m128i __A, __m128i __B)
{
  const __v16qi __zero = { 0 };
  __v16qi __selectneg = (__v16qi) vec_cmplt ((__v16qi) __B, __zero);
  __v16qi __selectpos =
    (__v16qi) vec_neg ((__v16qi) vec_cmpgt ((__v16qi) __B, __zero));
  __v16qi __conv = vec_add (__selectneg, __selectpos);
  return (__m128i) vec_mul ((__v16qi) __A, (__v16qi) __conv);
}
#endif

#ifdef _ARCH_PWR8
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_epi16 (__m128i __A, __m128i __B)
{
  const __v8hi __zero = { 0 };
  __v8hi __selectneg = (__v8hi) vec_cmplt ((__v8hi) __B, __zero);
  __v8hi __selectpos =
    (__v8hi) vec_neg ((__v8hi) vec_cmpgt ((__v8hi) __B, __zero));
  __v8hi __conv = vec_add (__selectneg, __selectpos);
  return (__m128i) vec_mul ((__v8hi) __A, (__v8hi) __conv);
}
#endif

#ifdef _ARCH_PWR8
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_epi32 (__m128i __A, __m128i __B)
{
  const __v4si __zero = { 0 };
  __v4si __selectneg = (__v4si) vec_cmplt ((__v4si) __B, __zero);
  __v4si __selectpos =
    (__v4si) vec_neg ((__v4si) vec_cmpgt ((__v4si) __B, __zero));
  __v4si __conv = vec_add (__selectneg, __selectpos);
  return (__m128i) vec_mul ((__v4si) __A, (__v4si) __conv);
}
#endif
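
/* 64-bit psign variants, implemented by widening the operands to 128
   bits and reusing the _mm_sign_epi* routines above.  */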
#ifdef _ARCH_PWR8
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_pi8 (__m64 __A, __m64 __B)
{
  const __v16qi __zero = { 0 };
  __v16qi __C = (__v16qi) (__v2du) { __A, __A };
  __v16qi __D = (__v16qi) (__v2du) { __B, __B };
  __C = (__v16qi) _mm_sign_epi8 ((__m128i) __C, (__m128i) __D);
  return (__m64) ((__v2du) (__C))[0];
}
#endif

#ifdef _ARCH_PWR8
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_pi16 (__m64 __A, __m64 __B)
{
  const __v8hi __zero = { 0 };
  __v8hi __C = (__v8hi) (__v2du) { __A, __A };
  __v8hi __D = (__v8hi) (__v2du) { __B, __B };
  __C = (__v8hi) _mm_sign_epi16 ((__m128i) __C, (__m128i) __D);
  return (__m64) ((__v2du) (__C))[0];
}
#endif

#ifdef _ARCH_PWR8
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_pi32 (__m64 __A, __m64 __B)
{
  const __v4si __zero = { 0 };
  __v4si __C = (__v4si) (__v2du) { __A, __A };
  __v4si __D = (__v4si) (__v2du) { __B, __B };
  __C = (__v4si) _mm_sign_epi32 ((__m128i) __C, (__m128i) __D);
  return (__m64) ((__v2du) (__C))[0];
}
#endif
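
/* _mm_maddubs_epi16 (pmaddubsw): multiply the unsigned bytes of __A by
   the corresponding signed bytes of __B, then add adjacent 16-bit
   products with signed saturation.  The bytes are unpacked to halfwords
   (masking __A to keep it unsigned), multiplied, and the odd/even
   products recombined with vec_adds.  */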
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maddubs_epi16 (__m128i __A, __m128i __B)
{
  __v8hi __unsigned = vec_splats ((signed short) 0x00ff);
  __v8hi __C = vec_and (vec_unpackh ((__v16qi) __A), __unsigned);
  __v8hi __D = vec_and (vec_unpackl ((__v16qi) __A), __unsigned);
  __v8hi __E = vec_unpackh ((__v16qi) __B);
  __v8hi __F = vec_unpackl ((__v16qi) __B);
  __C = vec_mul (__C, __E);
  __D = vec_mul (__D, __F);
  const __v16qu __odds =
    { 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
  const __v16qu __evens =
    { 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 };
  __E = vec_perm (__C, __D, __odds);
  __F = vec_perm (__C, __D, __evens);
  return (__m128i) vec_adds (__E, __F);
}
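
/* The 64-bit pmaddubsw variant only needs one unpacked half of each
   operand.  */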
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maddubs_pi16 (__m64 __A, __m64 __B)
{
  __v8hi __C = (__v8hi) (__v2du) { __A, __A };
  __C = vec_unpackl ((__v16qi) __C);
  const __v8hi __unsigned = vec_splats ((signed short) 0x00ff);
  __C = vec_and (__C, __unsigned);
  __v8hi __D = (__v8hi) (__v2du) { __B, __B };
  __D = vec_unpackl ((__v16qi) __D);
  __D = vec_mul (__C, __D);
  const __v16qu __odds =
    { 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
  const __v16qu __evens =
    { 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 };
  __C = vec_perm (__D, __D, __odds);
  __D = vec_perm (__D, __D, __evens);
  __C = vec_adds (__C, __D);
  return (__m64) ((__v2du) (__C))[0];
}
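
/* _mm_mulhrs_epi16 (pmulhrsw): multiply signed 16-bit elements to 32-bit
   products, then scale with rounding, ((product >> 14) + 1) >> 1, which
   is equivalent to (product + 0x4000) >> 15, and pack the results back
   to 16 bits.  */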
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhrs_epi16 (__m128i __A, __m128i __B)
{
  __v4si __C = vec_unpackh ((__v8hi) __A);
  __v4si __D = vec_unpackh ((__v8hi) __B);
  __C = vec_mul (__C, __D);
  __D = vec_unpackl ((__v8hi) __A);
  __v4si __E = vec_unpackl ((__v8hi) __B);
  __D = vec_mul (__D, __E);
  const __v4su __shift = vec_splats ((unsigned int) 14);
  __C = vec_sr (__C, __shift);
  __D = vec_sr (__D, __shift);
  const __v4si __ones = vec_splats ((signed int) 1);
  __C = vec_add (__C, __ones);
  __C = vec_sr (__C, (__v4su) __ones);
  __D = vec_add (__D, __ones);
  __D = vec_sr (__D, (__v4su) __ones);
  return (__m128i) vec_pack (__C, __D);
}
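
/* The 64-bit pmulhrsw variant computes only four products, using a
   single unpack of each operand.  */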
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhrs_pi16 (__m64 __A, __m64 __B)
{
  __v4si __C = (__v4si) (__v2du) { __A, __A };
  __C = vec_unpackh ((__v8hi) __C);
  __v4si __D = (__v4si) (__v2du) { __B, __B };
  __D = vec_unpackh ((__v8hi) __D);
  __C = vec_mul (__C, __D);
  const __v4su __shift = vec_splats ((unsigned int) 14);
  __C = vec_sr (__C, __shift);
  const __v4si __ones = vec_splats ((signed int) 1);
  __C = vec_add (__C, __ones);
  __C = vec_sr (__C, (__v4su) __ones);
  __v8hi __E = vec_pack (__C, __D);
  return (__m64) ((__v2du) (__E))[0];
}

#endif