1 /* Copyright (C) 2002-2018 Free Software Foundation, Inc.
3 This file is part of GCC.
5 GCC is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 3, or (at your option)
8 any later version.
10 GCC is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
15 Under Section 7 of GPL version 3, you are granted additional
16 permissions described in the GCC Runtime Library Exception, version
17 3.1, as published by the Free Software Foundation.
19 You should have received a copy of the GNU General Public License and
20 a copy of the GCC Runtime Library Exception along with this program;
21 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
22 <http://www.gnu.org/licenses/>. */
24 /* Implemented from the specification included in the Intel C++ Compiler
25 User Guide and Reference, version 9.0. */
27 #ifndef NO_WARN_X86_INTRINSICS
28 /* This header is distributed to simplify porting x86_64 code that
29 makes explicit use of Intel intrinsics to powerpc64le.
30 It is the user's responsibility to determine if the results are
31 acceptable and make additional changes as necessary.
32 Note that much code that uses Intel intrinsics can be rewritten in
33 standard C or GNU C extensions, which are more portable and better
34 optimized across multiple targets.
36 In the specific case of X86 SSE (__m128) intrinsics, the PowerPC
37 VMX/VSX ISA is a good match for vector float SIMD operations.
However, scalar float operations in vector (XMM) registers require
the POWER8 VSX ISA (2.07) level.  There are also important
differences in the data format and placement of float scalars in the
vector register.  For PowerISA, scalar floats in FPRs (the leftmost
64 bits of the low 32 VSRs) are in double format, while X86_64 SSE
uses the rightmost 32 bits of the XMM register.  These differences
require extra steps on POWER to match the SSE scalar float semantics.
46 Most SSE scalar float intrinsic operations can be performed more
47 efficiently as C language float scalar operations or optimized to
48 use vector SIMD operations. We recommend this for new applications.
Another difference is the format and details of the X86_64 MXCSR vs
the PowerISA FPSCR / VSCR registers.  We recommend applications
replace direct access to the MXCSR with the more portable <fenv.h>
POSIX APIs. */
54 #error "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error."
55 #endif
57 #ifndef _XMMINTRIN_H_INCLUDED
58 #define _XMMINTRIN_H_INCLUDED
60 /* Define four value permute mask */
61 #define _MM_SHUFFLE(w,x,y,z) (((w) << 6) | ((x) << 4) | ((y) << 2) | (z))
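/* A minimal illustration (not part of the API): _MM_SHUFFLE packs four
   2-bit element selectors into one 8-bit immediate, with the selector for
   the highest result element in the most significant bits.  For example:

     _MM_SHUFFLE (3, 2, 1, 0)   expands to 0xE4  (the identity selector)
     _MM_SHUFFLE (0, 1, 2, 3)   expands to 0x1B  (reverse the elements)

   The resulting value is the MASK argument expected by _mm_shuffle_ps
   further below.  */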
63 #include <altivec.h>
65 /* Avoid collisions between altivec.h and strict adherence to C++ and
66 C11 standards. This should eventually be done inside altivec.h itself,
67 but only after testing a full distro build. */
68 #if defined(__STRICT_ANSI__) && (defined(__cplusplus) || \
69 (defined(__STDC_VERSION__) && \
70 __STDC_VERSION__ >= 201112L))
71 #undef vector
72 #undef pixel
73 #undef bool
74 #endif
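/* Illustrative note: once the context-sensitive keywords above are removed,
   strictly conforming translation units should spell the AltiVec types with
   the reserved identifiers, e.g.:

     __vector float vf = { 1.0f, 2.0f, 3.0f, 4.0f };
     __vector __bool int vb = vec_cmpeq (vf, vf);
 */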
76 #include <assert.h>
78 /* We need type definitions from the MMX header file. */
79 #include <mmintrin.h>
81 /* Get _mm_malloc () and _mm_free (). */
82 #include <mm_malloc.h>
84 /* The Intel API is flexible enough that we must allow aliasing with other
85 vector types, and their scalar components. */
86 typedef float __m128 __attribute__ ((__vector_size__ (16), __may_alias__));
88 /* Unaligned version of the same type. */
89 typedef float __m128_u __attribute__ ((__vector_size__ (16), __may_alias__,
90 __aligned__ (1)));
92 /* Internal data types for implementing the intrinsics. */
93 typedef float __v4sf __attribute__ ((__vector_size__ (16)));
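/* Because of __may_alias__, an __m128 can be reinterpreted as the internal
   __v4sf type (and vice versa) without violating strict aliasing.  A minimal
   sketch:

     __m128 x = _mm_set1_ps (1.0f);
     float lane0 = ((__v4sf) x)[0];   // per-element access via the GNU
                                      // vector extension
 */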
95 /* Create an undefined vector. */
96 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
97 _mm_undefined_ps (void)
99 __m128 __Y = __Y;
100 return __Y;
103 /* Create a vector of zeros. */
104 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
105 _mm_setzero_ps (void)
107 return __extension__ (__m128){ 0.0f, 0.0f, 0.0f, 0.0f };
110 /* Load four SPFP values from P. The address must be 16-byte aligned. */
111 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
112 _mm_load_ps (float const *__P)
114 assert(((unsigned long)__P & 0xfUL) == 0UL);
115 return ((__m128)vec_ld(0, (__v4sf*)__P));
118 /* Load four SPFP values from P. The address need not be 16-byte aligned. */
119 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
120 _mm_loadu_ps (float const *__P)
122 return (vec_vsx_ld(0, __P));
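/* Usage sketch: _mm_load_ps asserts 16-byte alignment (see above), while
   _mm_loadu_ps accepts any float pointer:

     float aligned_buf[4] __attribute__ ((aligned (16))) = { 1, 2, 3, 4 };
     float plain_buf[5] = { 0, 1, 2, 3, 4 };
     __m128 a = _mm_load_ps (aligned_buf);      // alignment required
     __m128 b = _mm_loadu_ps (plain_buf + 1);   // arbitrary alignment
 */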
125 /* Load four SPFP values in reverse order. The address must be aligned. */
126 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
127 _mm_loadr_ps (float const *__P)
129 __v4sf __tmp;
130 __m128 result;
131 static const __vector unsigned char permute_vector =
132 { 0x1C, 0x1D, 0x1E, 0x1F, 0x18, 0x19, 0x1A, 0x1B, 0x14, 0x15, 0x16,
133 0x17, 0x10, 0x11, 0x12, 0x13 };
135 __tmp = vec_ld (0, (__v4sf *) __P);
136 result = (__m128) vec_perm (__tmp, __tmp, permute_vector);
137 return result;
140 /* Create a vector with all four elements equal to F. */
141 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
142 _mm_set1_ps (float __F)
144 return __extension__ (__m128)(__v4sf){ __F, __F, __F, __F };
147 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
148 _mm_set_ps1 (float __F)
150 return _mm_set1_ps (__F);
153 /* Create the vector [Z Y X W]. */
154 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
155 _mm_set_ps (const float __Z, const float __Y, const float __X, const float __W)
157 return __extension__ (__m128)(__v4sf){ __W, __X, __Y, __Z };
160 /* Create the vector [W X Y Z]. */
161 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
162 _mm_setr_ps (float __Z, float __Y, float __X, float __W)
164 return __extension__ (__m128)(__v4sf){ __Z, __Y, __X, __W };
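/* Note the element order: _mm_set_ps lists arguments from the highest
   element down, _mm_setr_ps from element 0 up.  For example:

     __m128 a = _mm_set_ps  (3.0f, 2.0f, 1.0f, 0.0f);   // element 0 is 0.0f
     __m128 b = _mm_setr_ps (3.0f, 2.0f, 1.0f, 0.0f);   // element 0 is 3.0f
 */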
167 /* Store four SPFP values. The address must be 16-byte aligned. */
168 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
169 _mm_store_ps (float *__P, __m128 __A)
171 assert(((unsigned long)__P & 0xfUL) == 0UL);
172 vec_st((__v4sf)__A, 0, (__v4sf*)__P);
175 /* Store four SPFP values. The address need not be 16-byte aligned. */
176 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
177 _mm_storeu_ps (float *__P, __m128 __A)
179 *(__m128_u *)__P = __A;
182 /* Store four SPFP values in reverse order. The address must be aligned. */
183 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
184 _mm_storer_ps (float *__P, __m128 __A)
186 __v4sf __tmp;
187 static const __vector unsigned char permute_vector =
188 { 0x1C, 0x1D, 0x1E, 0x1F, 0x18, 0x19, 0x1A, 0x1B, 0x14, 0x15, 0x16,
189 0x17, 0x10, 0x11, 0x12, 0x13 };
191 __tmp = (__m128) vec_perm (__A, __A, permute_vector);
193 _mm_store_ps (__P, __tmp);
196 /* Store the lower SPFP value across four words. */
197 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
198 _mm_store1_ps (float *__P, __m128 __A)
200 __v4sf __va = vec_splat((__v4sf)__A, 0);
201 _mm_store_ps (__P, __va);
204 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
205 _mm_store_ps1 (float *__P, __m128 __A)
207 _mm_store1_ps (__P, __A);
210 /* Create a vector with element 0 as F and the rest zero. */
211 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
212 _mm_set_ss (float __F)
214 return __extension__ (__m128)(__v4sf){ __F, 0.0f, 0.0f, 0.0f };
217 /* Sets the low SPFP value of A from the low value of B. */
218 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
219 _mm_move_ss (__m128 __A, __m128 __B)
221 static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
223 return (vec_sel ((__v4sf)__A, (__v4sf)__B, mask));
226 /* Create a vector with element 0 as *P and the rest zero. */
227 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
228 _mm_load_ss (float const *__P)
230 return _mm_set_ss (*__P);
233 /* Stores the lower SPFP value. */
234 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
235 _mm_store_ss (float *__P, __m128 __A)
237 *__P = ((__v4sf)__A)[0];
240 /* Perform the respective operation on the lower SPFP (single-precision
241 floating-point) values of A and B; the upper three SPFP values are
242 passed through from A. */
244 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
245 _mm_add_ss (__m128 __A, __m128 __B)
247 #ifdef _ARCH_PWR7
248 __m128 a, b, c;
249 static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
/* PowerISA VSX does not allow partial (for just the lower float)
   results.  So to ensure we don't generate spurious exceptions
   (from the upper float values) we splat the lower float
   before we do the operation.  */
254 a = vec_splat (__A, 0);
255 b = vec_splat (__B, 0);
256 c = a + b;
257 /* Then we merge the lower float result with the original upper
258 float elements from __A. */
259 return (vec_sel (__A, c, mask));
260 #else
261 __A[0] = __A[0] + __B[0];
262 return (__A);
263 #endif
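/* A small example of the scalar-in-vector semantics: only element 0 is
   computed, the upper elements come from the first operand.

     __m128 a = _mm_set_ps (4.0f, 3.0f, 2.0f, 1.0f);   // a = {1, 2, 3, 4}
     __m128 b = _mm_set1_ps (10.0f);
     __m128 r = _mm_add_ss (a, b);                      // r = {11, 2, 3, 4}
 */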
266 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
267 _mm_sub_ss (__m128 __A, __m128 __B)
269 #ifdef _ARCH_PWR7
270 __m128 a, b, c;
271 static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
/* PowerISA VSX does not allow partial (for just the lower float)
   results.  So to ensure we don't generate spurious exceptions
   (from the upper float values) we splat the lower float
   before we do the operation.  */
276 a = vec_splat (__A, 0);
277 b = vec_splat (__B, 0);
278 c = a - b;
279 /* Then we merge the lower float result with the original upper
280 float elements from __A. */
281 return (vec_sel (__A, c, mask));
282 #else
283 __A[0] = __A[0] - __B[0];
284 return (__A);
285 #endif
288 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
289 _mm_mul_ss (__m128 __A, __m128 __B)
291 #ifdef _ARCH_PWR7
292 __m128 a, b, c;
293 static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
/* PowerISA VSX does not allow partial (for just the lower float)
   results.  So to ensure we don't generate spurious exceptions
   (from the upper float values) we splat the lower float
   before we do the operation.  */
298 a = vec_splat (__A, 0);
299 b = vec_splat (__B, 0);
300 c = a * b;
301 /* Then we merge the lower float result with the original upper
302 float elements from __A. */
303 return (vec_sel (__A, c, mask));
304 #else
305 __A[0] = __A[0] * __B[0];
306 return (__A);
307 #endif
310 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
311 _mm_div_ss (__m128 __A, __m128 __B)
313 #ifdef _ARCH_PWR7
314 __m128 a, b, c;
315 static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
/* PowerISA VSX does not allow partial (for just the lower float)
   results.  So to ensure we don't generate spurious exceptions
   (from the upper float values) we splat the lower float
   before we do the operation.  */
320 a = vec_splat (__A, 0);
321 b = vec_splat (__B, 0);
322 c = a / b;
323 /* Then we merge the lower float result with the original upper
324 float elements from __A. */
325 return (vec_sel (__A, c, mask));
326 #else
327 __A[0] = __A[0] / __B[0];
328 return (__A);
329 #endif
332 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
333 _mm_sqrt_ss (__m128 __A)
335 __m128 a, c;
336 static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
/* PowerISA VSX does not allow partial (for just the lower float)
 * results.  So to ensure we don't generate spurious exceptions
 * (from the upper float values) we splat the lower float
 * before we do the operation. */
341 a = vec_splat (__A, 0);
342 c = vec_sqrt (a);
343 /* Then we merge the lower float result with the original upper
344 * float elements from __A. */
345 return (vec_sel (__A, c, mask));
348 /* Perform the respective operation on the four SPFP values in A and B. */
349 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
350 _mm_add_ps (__m128 __A, __m128 __B)
352 return (__m128) ((__v4sf)__A + (__v4sf)__B);
355 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
356 _mm_sub_ps (__m128 __A, __m128 __B)
358 return (__m128) ((__v4sf)__A - (__v4sf)__B);
361 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
362 _mm_mul_ps (__m128 __A, __m128 __B)
364 return (__m128) ((__v4sf)__A * (__v4sf)__B);
367 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
368 _mm_div_ps (__m128 __A, __m128 __B)
370 return (__m128) ((__v4sf)__A / (__v4sf)__B);
373 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
374 _mm_sqrt_ps (__m128 __A)
376 return (vec_sqrt ((__v4sf)__A));
379 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
380 _mm_rcp_ps (__m128 __A)
382 return (vec_re ((__v4sf)__A));
385 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
386 _mm_rsqrt_ps (__m128 __A)
388 return (vec_rsqrte (__A));
391 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
392 _mm_rcp_ss (__m128 __A)
394 __m128 a, c;
395 static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
/* PowerISA VSX does not allow partial (for just the lower float)
 * results.  So to ensure we don't generate spurious exceptions
 * (from the upper float values) we splat the lower float
 * before we do the operation. */
400 a = vec_splat (__A, 0);
401 c = _mm_rcp_ps (a);
402 /* Then we merge the lower float result with the original upper
403 * float elements from __A. */
404 return (vec_sel (__A, c, mask));
407 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
408 _mm_rsqrt_ss (__m128 __A)
410 __m128 a, c;
411 static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
/* PowerISA VSX does not allow partial (for just the lower float)
 * results.  So to ensure we don't generate spurious exceptions
 * (from the upper float values) we splat the lower float
 * before we do the operation. */
416 a = vec_splat (__A, 0);
417 c = vec_rsqrte (a);
418 /* Then we merge the lower float result with the original upper
419 * float elements from __A. */
420 return (vec_sel (__A, c, mask));
423 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
424 _mm_min_ss (__m128 __A, __m128 __B)
426 __v4sf a, b, c;
427 static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
/* PowerISA VSX does not allow partial (for just the lower float)
 * results.  So to ensure we don't generate spurious exceptions
 * (from the upper float values) we splat the lower float
 * before we do the operation. */
432 a = vec_splat ((__v4sf)__A, 0);
433 b = vec_splat ((__v4sf)__B, 0);
434 c = vec_min (a, b);
435 /* Then we merge the lower float result with the original upper
436 * float elements from __A. */
437 return (vec_sel ((__v4sf)__A, c, mask));
440 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
441 _mm_max_ss (__m128 __A, __m128 __B)
443 __v4sf a, b, c;
444 static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
/* PowerISA VSX does not allow partial (for just the lower float)
 * results.  So to ensure we don't generate spurious exceptions
 * (from the upper float values) we splat the lower float
 * before we do the operation. */
449 a = vec_splat (__A, 0);
450 b = vec_splat (__B, 0);
451 c = vec_max (a, b);
452 /* Then we merge the lower float result with the original upper
453 * float elements from __A. */
454 return (vec_sel ((__v4sf)__A, c, mask));
457 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
458 _mm_min_ps (__m128 __A, __m128 __B)
460 __vector __bool int m = vec_cmpgt ((__v4sf) __B, (__v4sf) __A);
461 return vec_sel (__B, __A, m);
464 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
465 _mm_max_ps (__m128 __A, __m128 __B)
467 __vector __bool int m = vec_cmpgt ((__v4sf) __A, (__v4sf) __B);
468 return vec_sel (__B, __A, m);
471 /* Perform logical bit-wise operations on 128-bit values. */
472 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
473 _mm_and_ps (__m128 __A, __m128 __B)
475 return ((__m128)vec_and ((__v4sf)__A, (__v4sf)__B));
476 // return __builtin_ia32_andps (__A, __B);
479 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
480 _mm_andnot_ps (__m128 __A, __m128 __B)
482 return ((__m128)vec_andc ((__v4sf)__B, (__v4sf)__A));
485 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
486 _mm_or_ps (__m128 __A, __m128 __B)
488 return ((__m128)vec_or ((__v4sf)__A, (__v4sf)__B));
491 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
492 _mm_xor_ps (__m128 __A, __m128 __B)
494 return ((__m128)vec_xor ((__v4sf)__A, (__v4sf)__B));
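/* These bitwise operations are commonly combined into a branchless select,
   picking elements from one vector where a compare mask is set and from the
   other elsewhere.  A minimal sketch (x and y are placeholder __m128 values,
   not part of this header):

     __m128 mask = _mm_cmpgt_ps (x, y);
     __m128 r = _mm_or_ps (_mm_and_ps (mask, x), _mm_andnot_ps (mask, y));
 */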
497 /* Perform a comparison on the four SPFP values of A and B. For each
498 element, if the comparison is true, place a mask of all ones in the
499 result, otherwise a mask of zeros. */
500 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
501 _mm_cmpeq_ps (__m128 __A, __m128 __B)
503 return ((__m128)vec_cmpeq ((__v4sf)__A,(__v4sf) __B));
506 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
507 _mm_cmplt_ps (__m128 __A, __m128 __B)
509 return ((__m128)vec_cmplt ((__v4sf)__A, (__v4sf)__B));
512 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
513 _mm_cmple_ps (__m128 __A, __m128 __B)
515 return ((__m128)vec_cmple ((__v4sf)__A, (__v4sf)__B));
518 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
519 _mm_cmpgt_ps (__m128 __A, __m128 __B)
521 return ((__m128)vec_cmpgt ((__v4sf)__A, (__v4sf)__B));
524 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
525 _mm_cmpge_ps (__m128 __A, __m128 __B)
527 return ((__m128)vec_cmpge ((__v4sf)__A, (__v4sf)__B));
530 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
531 _mm_cmpneq_ps (__m128 __A, __m128 __B)
533 __v4sf temp = (__v4sf ) vec_cmpeq ((__v4sf) __A, (__v4sf)__B);
534 return ((__m128)vec_nor (temp, temp));
537 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
538 _mm_cmpnlt_ps (__m128 __A, __m128 __B)
540 return ((__m128)vec_cmpge ((__v4sf)__A, (__v4sf)__B));
543 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
544 _mm_cmpnle_ps (__m128 __A, __m128 __B)
546 return ((__m128)vec_cmpgt ((__v4sf)__A, (__v4sf)__B));
549 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
550 _mm_cmpngt_ps (__m128 __A, __m128 __B)
552 return ((__m128)vec_cmple ((__v4sf)__A, (__v4sf)__B));
555 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
556 _mm_cmpnge_ps (__m128 __A, __m128 __B)
558 return ((__m128)vec_cmplt ((__v4sf)__A, (__v4sf)__B));
561 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
562 _mm_cmpord_ps (__m128 __A, __m128 __B)
564 __vector unsigned int a, b;
565 __vector unsigned int c, d;
566 static const __vector unsigned int float_exp_mask =
567 { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 };
569 a = (__vector unsigned int) vec_abs ((__v4sf)__A);
570 b = (__vector unsigned int) vec_abs ((__v4sf)__B);
571 c = (__vector unsigned int) vec_cmpgt (float_exp_mask, a);
572 d = (__vector unsigned int) vec_cmpgt (float_exp_mask, b);
573 return ((__m128 ) vec_and (c, d));
576 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
577 _mm_cmpunord_ps (__m128 __A, __m128 __B)
579 __vector unsigned int a, b;
580 __vector unsigned int c, d;
581 static const __vector unsigned int float_exp_mask =
582 { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 };
584 a = (__vector unsigned int) vec_abs ((__v4sf)__A);
585 b = (__vector unsigned int) vec_abs ((__v4sf)__B);
586 c = (__vector unsigned int) vec_cmpgt (a, float_exp_mask);
587 d = (__vector unsigned int) vec_cmpgt (b, float_exp_mask);
588 return ((__m128 ) vec_or (c, d));
591 /* Perform a comparison on the lower SPFP values of A and B. If the
592 comparison is true, place a mask of all ones in the result, otherwise a
593 mask of zeros. The upper three SPFP values are passed through from A. */
594 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
595 _mm_cmpeq_ss (__m128 __A, __m128 __B)
597 static const __vector unsigned int mask =
598 { 0xffffffff, 0, 0, 0 };
599 __v4sf a, b, c;
/* PowerISA VMX does not allow partial (for just element 0)
 * results.  So to ensure we don't generate spurious exceptions
 * (from the upper elements) we splat the lower float
 * before we do the operation. */
604 a = vec_splat ((__v4sf) __A, 0);
605 b = vec_splat ((__v4sf) __B, 0);
606 c = (__v4sf) vec_cmpeq(a, b);
607 /* Then we merge the lower float result with the original upper
608 * float elements from __A. */
609 return ((__m128)vec_sel ((__v4sf)__A, c, mask));
612 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
613 _mm_cmplt_ss (__m128 __A, __m128 __B)
615 static const __vector unsigned int mask =
616 { 0xffffffff, 0, 0, 0 };
617 __v4sf a, b, c;
/* PowerISA VMX does not allow partial (for just element 0)
 * results.  So to ensure we don't generate spurious exceptions
 * (from the upper elements) we splat the lower float
 * before we do the operation. */
622 a = vec_splat ((__v4sf) __A, 0);
623 b = vec_splat ((__v4sf) __B, 0);
624 c = (__v4sf) vec_cmplt(a, b);
625 /* Then we merge the lower float result with the original upper
626 * float elements from __A. */
627 return ((__m128)vec_sel ((__v4sf)__A, c, mask));
630 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
631 _mm_cmple_ss (__m128 __A, __m128 __B)
633 static const __vector unsigned int mask =
634 { 0xffffffff, 0, 0, 0 };
635 __v4sf a, b, c;
/* PowerISA VMX does not allow partial (for just element 0)
 * results.  So to ensure we don't generate spurious exceptions
 * (from the upper elements) we splat the lower float
 * before we do the operation. */
640 a = vec_splat ((__v4sf) __A, 0);
641 b = vec_splat ((__v4sf) __B, 0);
642 c = (__v4sf) vec_cmple(a, b);
643 /* Then we merge the lower float result with the original upper
644 * float elements from __A. */
645 return ((__m128)vec_sel ((__v4sf)__A, c, mask));
648 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
649 _mm_cmpgt_ss (__m128 __A, __m128 __B)
651 static const __vector unsigned int mask =
652 { 0xffffffff, 0, 0, 0 };
653 __v4sf a, b, c;
/* PowerISA VMX does not allow partial (for just element 0)
 * results.  So to ensure we don't generate spurious exceptions
 * (from the upper elements) we splat the lower float
 * before we do the operation. */
658 a = vec_splat ((__v4sf) __A, 0);
659 b = vec_splat ((__v4sf) __B, 0);
660 c = (__v4sf) vec_cmpgt(a, b);
661 /* Then we merge the lower float result with the original upper
662 * float elements from __A. */
663 return ((__m128)vec_sel ((__v4sf)__A, c, mask));
666 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
667 _mm_cmpge_ss (__m128 __A, __m128 __B)
669 static const __vector unsigned int mask =
670 { 0xffffffff, 0, 0, 0 };
671 __v4sf a, b, c;
/* PowerISA VMX does not allow partial (for just element 0)
 * results.  So to ensure we don't generate spurious exceptions
 * (from the upper elements) we splat the lower float
 * before we do the operation. */
676 a = vec_splat ((__v4sf) __A, 0);
677 b = vec_splat ((__v4sf) __B, 0);
678 c = (__v4sf) vec_cmpge(a, b);
679 /* Then we merge the lower float result with the original upper
680 * float elements from __A. */
681 return ((__m128)vec_sel ((__v4sf)__A, c, mask));
684 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
685 _mm_cmpneq_ss (__m128 __A, __m128 __B)
687 static const __vector unsigned int mask =
688 { 0xffffffff, 0, 0, 0 };
689 __v4sf a, b, c;
/* PowerISA VMX does not allow partial (for just element 0)
 * results.  So to ensure we don't generate spurious exceptions
 * (from the upper elements) we splat the lower float
 * before we do the operation. */
694 a = vec_splat ((__v4sf) __A, 0);
695 b = vec_splat ((__v4sf) __B, 0);
696 c = (__v4sf) vec_cmpeq(a, b);
697 c = vec_nor (c, c);
698 /* Then we merge the lower float result with the original upper
699 * float elements from __A. */
700 return ((__m128)vec_sel ((__v4sf)__A, c, mask));
703 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
704 _mm_cmpnlt_ss (__m128 __A, __m128 __B)
706 static const __vector unsigned int mask =
707 { 0xffffffff, 0, 0, 0 };
708 __v4sf a, b, c;
/* PowerISA VMX does not allow partial (for just element 0)
 * results.  So to ensure we don't generate spurious exceptions
 * (from the upper elements) we splat the lower float
 * before we do the operation. */
713 a = vec_splat ((__v4sf) __A, 0);
714 b = vec_splat ((__v4sf) __B, 0);
715 c = (__v4sf) vec_cmpge(a, b);
716 /* Then we merge the lower float result with the original upper
717 * float elements from __A. */
718 return ((__m128)vec_sel ((__v4sf)__A, c, mask));
721 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
722 _mm_cmpnle_ss (__m128 __A, __m128 __B)
724 static const __vector unsigned int mask =
725 { 0xffffffff, 0, 0, 0 };
726 __v4sf a, b, c;
/* PowerISA VMX does not allow partial (for just element 0)
 * results.  So to ensure we don't generate spurious exceptions
 * (from the upper elements) we splat the lower float
 * before we do the operation. */
731 a = vec_splat ((__v4sf) __A, 0);
732 b = vec_splat ((__v4sf) __B, 0);
733 c = (__v4sf) vec_cmpgt(a, b);
734 /* Then we merge the lower float result with the original upper
735 * float elements from __A. */
736 return ((__m128)vec_sel ((__v4sf)__A, c, mask));
739 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
740 _mm_cmpngt_ss (__m128 __A, __m128 __B)
742 static const __vector unsigned int mask =
743 { 0xffffffff, 0, 0, 0 };
744 __v4sf a, b, c;
/* PowerISA VMX does not allow partial (for just element 0)
 * results.  So to ensure we don't generate spurious exceptions
 * (from the upper elements) we splat the lower float
 * before we do the operation. */
749 a = vec_splat ((__v4sf) __A, 0);
750 b = vec_splat ((__v4sf) __B, 0);
751 c = (__v4sf) vec_cmple(a, b);
752 /* Then we merge the lower float result with the original upper
753 * float elements from __A. */
754 return ((__m128)vec_sel ((__v4sf)__A, c, mask));
757 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
758 _mm_cmpnge_ss (__m128 __A, __m128 __B)
760 static const __vector unsigned int mask =
761 { 0xffffffff, 0, 0, 0 };
762 __v4sf a, b, c;
/* PowerISA VMX does not allow partial (for just element 0)
 * results.  So to ensure we don't generate spurious exceptions
 * (from the upper elements) we splat the lower float
 * before we do the operation. */
767 a = vec_splat ((__v4sf) __A, 0);
768 b = vec_splat ((__v4sf) __B, 0);
769 c = (__v4sf) vec_cmplt(a, b);
770 /* Then we merge the lower float result with the original upper
771 * float elements from __A. */
772 return ((__m128)vec_sel ((__v4sf)__A, c, mask));
775 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
776 _mm_cmpord_ss (__m128 __A, __m128 __B)
778 __vector unsigned int a, b;
779 __vector unsigned int c, d;
780 static const __vector unsigned int float_exp_mask =
781 { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 };
782 static const __vector unsigned int mask =
783 { 0xffffffff, 0, 0, 0 };
785 a = (__vector unsigned int) vec_abs ((__v4sf)__A);
786 b = (__vector unsigned int) vec_abs ((__v4sf)__B);
787 c = (__vector unsigned int) vec_cmpgt (float_exp_mask, a);
788 d = (__vector unsigned int) vec_cmpgt (float_exp_mask, b);
789 c = vec_and (c, d);
790 /* Then we merge the lower float result with the original upper
791 * float elements from __A. */
792 return ((__m128)vec_sel ((__v4sf)__A, (__v4sf)c, mask));
795 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
796 _mm_cmpunord_ss (__m128 __A, __m128 __B)
798 __vector unsigned int a, b;
799 __vector unsigned int c, d;
800 static const __vector unsigned int float_exp_mask =
801 { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 };
802 static const __vector unsigned int mask =
803 { 0xffffffff, 0, 0, 0 };
805 a = (__vector unsigned int) vec_abs ((__v4sf)__A);
806 b = (__vector unsigned int) vec_abs ((__v4sf)__B);
807 c = (__vector unsigned int) vec_cmpgt (a, float_exp_mask);
808 d = (__vector unsigned int) vec_cmpgt (b, float_exp_mask);
809 c = vec_or (c, d);
810 /* Then we merge the lower float result with the original upper
811 * float elements from __A. */
812 return ((__m128)vec_sel ((__v4sf)__A, (__v4sf)c, mask));
815 /* Compare the lower SPFP values of A and B and return 1 if true
816 and 0 if false. */
817 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
818 _mm_comieq_ss (__m128 __A, __m128 __B)
820 return (__A[0] == __B[0]);
823 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
824 _mm_comilt_ss (__m128 __A, __m128 __B)
826 return (__A[0] < __B[0]);
829 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
830 _mm_comile_ss (__m128 __A, __m128 __B)
832 return (__A[0] <= __B[0]);
835 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
836 _mm_comigt_ss (__m128 __A, __m128 __B)
838 return (__A[0] > __B[0]);
841 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
842 _mm_comige_ss (__m128 __A, __m128 __B)
844 return (__A[0] >= __B[0]);
847 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
848 _mm_comineq_ss (__m128 __A, __m128 __B)
850 return (__A[0] != __B[0]);
/* FIXME
 * The _mm_ucomi??_ss implementations below are exactly the same as
 * the _mm_comi??_ss ones because GCC for PowerPC only generates
 * unordered compares (scalar and vector).
 * Technically _mm_comieq_ss et al. should be using the ordered
 * compare and signal for QNaNs.
 * The _mm_ucomieq_ss et al. should be OK as is.
 */
861 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
862 _mm_ucomieq_ss (__m128 __A, __m128 __B)
864 return (__A[0] == __B[0]);
867 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
868 _mm_ucomilt_ss (__m128 __A, __m128 __B)
870 return (__A[0] < __B[0]);
873 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
874 _mm_ucomile_ss (__m128 __A, __m128 __B)
876 return (__A[0] <= __B[0]);
879 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
880 _mm_ucomigt_ss (__m128 __A, __m128 __B)
882 return (__A[0] > __B[0]);
885 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
886 _mm_ucomige_ss (__m128 __A, __m128 __B)
888 return (__A[0] >= __B[0]);
891 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
892 _mm_ucomineq_ss (__m128 __A, __m128 __B)
894 return (__A[0] != __B[0]);
897 extern __inline float __attribute__((__gnu_inline__, __always_inline__, __artificial__))
898 _mm_cvtss_f32 (__m128 __A)
900 return ((__v4sf)__A)[0];
903 /* Convert the lower SPFP value to a 32-bit integer according to the current
904 rounding mode. */
905 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
906 _mm_cvtss_si32 (__m128 __A)
908 __m64 res = 0;
909 #ifdef _ARCH_PWR8
910 __m128 vtmp;
911 double dtmp;
912 __asm__(
913 "xxsldwi %x1,%x3,%x3,3;\n"
914 "xscvspdp %x2,%x1;\n"
915 "fctiw %2,%2;\n"
916 "mfvsrd %0,%x2;\n"
917 : "=r" (res),
918 "=&wa" (vtmp),
919 "=f" (dtmp)
920 : "wa" (__A)
921 : );
922 #else
923 res = __builtin_rint(__A[0]);
924 #endif
925 return (res);
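/* The conversion honours the current rounding mode, so with the default
   round-to-nearest-even mode in effect:

     int i = _mm_cvtss_si32 (_mm_set_ss (2.5f));   // i == 2
     int j = _mm_cvtss_si32 (_mm_set_ss (3.5f));   // j == 4
 */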
928 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
929 _mm_cvt_ss2si (__m128 __A)
931 return _mm_cvtss_si32 (__A);
/* Convert the lower SPFP value to a 64-bit integer according to the
   current rounding mode.  */
937 /* Intel intrinsic. */
938 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
939 _mm_cvtss_si64 (__m128 __A)
941 __m64 res = 0;
942 #ifdef _ARCH_PWR8
943 __m128 vtmp;
944 double dtmp;
945 __asm__(
946 "xxsldwi %x1,%x3,%x3,3;\n"
947 "xscvspdp %x2,%x1;\n"
948 "fctid %2,%2;\n"
949 "mfvsrd %0,%x2;\n"
950 : "=r" (res),
951 "=&wa" (vtmp),
952 "=f" (dtmp)
953 : "wa" (__A)
954 : );
955 #else
956 res = __builtin_llrint(__A[0]);
957 #endif
958 return (res);
961 /* Microsoft intrinsic. */
962 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
963 _mm_cvtss_si64x (__m128 __A)
965 return _mm_cvtss_si64 ((__v4sf) __A);
968 /* Constants for use with _mm_prefetch. */
969 enum _mm_hint
/* _MM_HINT_ET is _MM_HINT_T with the 3rd bit set.  */
972 _MM_HINT_ET0 = 7,
973 _MM_HINT_ET1 = 6,
974 _MM_HINT_T0 = 3,
975 _MM_HINT_T1 = 2,
976 _MM_HINT_T2 = 1,
977 _MM_HINT_NTA = 0
980 /* Loads one cache line from address P to a location "closer" to the
981 processor. The selector I specifies the type of prefetch operation. */
982 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
983 _mm_prefetch (const void *__P, enum _mm_hint __I)
/* Current PowerPC hardware ignores the hint parameters.  */
986 __builtin_prefetch (__P);
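/* Typical usage is prefetching a line ahead of the current position; on
   PowerPC every hint currently maps to the same __builtin_prefetch call.
   A sketch, assuming a byte pointer p into the data being streamed:

     _mm_prefetch (p + 256, _MM_HINT_T0);
 */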
989 /* Convert the two lower SPFP values to 32-bit integers according to the
990 current rounding mode. Return the integers in packed form. */
991 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
992 _mm_cvtps_pi32 (__m128 __A)
995 __v4sf temp, rounded;
996 __vector unsigned long long result;
998 /* Splat two lower SPFP values to both halves. */
999 temp = (__v4sf) vec_splat ((__vector long long)__A, 0);
1000 rounded = vec_rint(temp);
1001 result = (__vector unsigned long long) vec_cts (rounded, 0);
1003 return (__m64) ((__vector long long) result)[0];
1006 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1007 _mm_cvt_ps2pi (__m128 __A)
1009 return _mm_cvtps_pi32 (__A);
1012 /* Truncate the lower SPFP value to a 32-bit integer. */
1013 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1014 _mm_cvttss_si32 (__m128 __A)
1016 /* Extract the lower float element. */
1017 float temp = __A[0];
1018 /* truncate to 32-bit integer and return. */
1019 return temp;
1022 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1023 _mm_cvtt_ss2si (__m128 __A)
1025 return _mm_cvttss_si32 (__A);
1028 /* Intel intrinsic. */
1029 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1030 _mm_cvttss_si64 (__m128 __A)
1032 /* Extract the lower float element. */
1033 float temp = __A[0];
/* Truncate to a 64-bit integer and return.  */
1035 return temp;
1038 /* Microsoft intrinsic. */
1039 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1040 _mm_cvttss_si64x (__m128 __A)
1042 /* Extract the lower float element. */
1043 float temp = __A[0];
/* Truncate to a 64-bit integer and return.  */
1045 return temp;
1048 /* Truncate the two lower SPFP values to 32-bit integers. Return the
1049 integers in packed form. */
1050 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1051 _mm_cvttps_pi32 (__m128 __A)
1053 __v4sf temp;
1054 __vector unsigned long long result;
1056 /* Splat two lower SPFP values to both halves. */
1057 temp = (__v4sf) vec_splat ((__vector long long)__A, 0);
1058 result = (__vector unsigned long long) vec_cts (temp, 0);
1060 return (__m64) ((__vector long long) result)[0];
1063 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1064 _mm_cvtt_ps2pi (__m128 __A)
1066 return _mm_cvttps_pi32 (__A);
1069 /* Convert B to a SPFP value and insert it as element zero in A. */
1070 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1071 _mm_cvtsi32_ss (__m128 __A, int __B)
1073 float temp = __B;
1074 __A[0] = temp;
1076 return __A;
1079 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1080 _mm_cvt_si2ss (__m128 __A, int __B)
1082 return _mm_cvtsi32_ss (__A, __B);
1085 /* Convert B to a SPFP value and insert it as element zero in A. */
1086 /* Intel intrinsic. */
1087 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1088 _mm_cvtsi64_ss (__m128 __A, long long __B)
1090 float temp = __B;
1091 __A[0] = temp;
1093 return __A;
1096 /* Microsoft intrinsic. */
1097 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1098 _mm_cvtsi64x_ss (__m128 __A, long long __B)
1100 return _mm_cvtsi64_ss (__A, __B);
1103 /* Convert the two 32-bit values in B to SPFP form and insert them
1104 as the two lower elements in A. */
1105 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1106 _mm_cvtpi32_ps (__m128 __A, __m64 __B)
1108 __vector signed int vm1;
1109 __vector float vf1;
1111 vm1 = (__vector signed int) (__vector unsigned long long) {__B, __B};
1112 vf1 = (__vector float) vec_ctf (vm1, 0);
1114 return ((__m128) (__vector unsigned long long)
1115 { ((__vector unsigned long long)vf1) [0],
1116 ((__vector unsigned long long)__A) [1]});
1119 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1120 _mm_cvt_pi2ps (__m128 __A, __m64 __B)
1122 return _mm_cvtpi32_ps (__A, __B);
1125 /* Convert the four signed 16-bit values in A to SPFP form. */
1126 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1127 _mm_cvtpi16_ps (__m64 __A)
1129 __vector signed short vs8;
1130 __vector signed int vi4;
1131 __vector float vf1;
1133 vs8 = (__vector signed short) (__vector unsigned long long) { __A, __A };
1134 vi4 = vec_vupklsh (vs8);
1135 vf1 = (__vector float) vec_ctf (vi4, 0);
1137 return (__m128) vf1;
1140 /* Convert the four unsigned 16-bit values in A to SPFP form. */
1141 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1142 _mm_cvtpu16_ps (__m64 __A)
1144 const __vector unsigned short zero =
1145 { 0, 0, 0, 0, 0, 0, 0, 0 };
1146 __vector unsigned short vs8;
1147 __vector unsigned int vi4;
1148 __vector float vf1;
1150 vs8 = (__vector unsigned short) (__vector unsigned long long) { __A, __A };
1151 vi4 = (__vector unsigned int) vec_vmrglh (vs8, zero);
1152 vf1 = (__vector float) vec_ctf (vi4, 0);
1154 return (__m128) vf1;
1157 /* Convert the low four signed 8-bit values in A to SPFP form. */
1158 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1159 _mm_cvtpi8_ps (__m64 __A)
1161 __vector signed char vc16;
1162 __vector signed short vs8;
1163 __vector signed int vi4;
1164 __vector float vf1;
1166 vc16 = (__vector signed char) (__vector unsigned long long) { __A, __A };
1167 vs8 = vec_vupkhsb (vc16);
1168 vi4 = vec_vupkhsh (vs8);
1169 vf1 = (__vector float) vec_ctf (vi4, 0);
1171 return (__m128) vf1;
1174 /* Convert the low four unsigned 8-bit values in A to SPFP form. */
1175 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1177 _mm_cvtpu8_ps (__m64 __A)
1179 const __vector unsigned char zero =
1180 { 0, 0, 0, 0, 0, 0, 0, 0 };
1181 __vector unsigned char vc16;
1182 __vector unsigned short vs8;
1183 __vector unsigned int vi4;
1184 __vector float vf1;
1186 vc16 = (__vector unsigned char) (__vector unsigned long long) { __A, __A };
1187 vs8 = (__vector unsigned short) vec_vmrglb (vc16, zero);
1188 vi4 = (__vector unsigned int) vec_vmrghh (vs8,
1189 (__vector unsigned short) zero);
1190 vf1 = (__vector float) vec_ctf (vi4, 0);
1192 return (__m128) vf1;
1195 /* Convert the four signed 32-bit values in A and B to SPFP form. */
1196 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1197 _mm_cvtpi32x2_ps (__m64 __A, __m64 __B)
1199 __vector signed int vi4;
1200 __vector float vf4;
1202 vi4 = (__vector signed int) (__vector unsigned long long) { __B, __A };
1203 vf4 = (__vector float) vec_ctf (vi4, 0);
1204 return (__m128) vf4;
1207 /* Convert the four SPFP values in A to four signed 16-bit integers. */
1208 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1209 _mm_cvtps_pi16 (__m128 __A)
1211 __v4sf rounded;
1212 __vector signed int temp;
1213 __vector unsigned long long result;
1215 rounded = vec_rint(__A);
1216 temp = vec_cts (rounded, 0);
1217 result = (__vector unsigned long long) vec_pack (temp, temp);
1219 return (__m64) ((__vector long long) result)[0];
1222 /* Convert the four SPFP values in A to four signed 8-bit integers. */
1223 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1224 _mm_cvtps_pi8 (__m128 __A)
1226 __v4sf rounded;
1227 __vector signed int tmp_i;
1228 static const __vector signed int zero = {0, 0, 0, 0};
1229 __vector signed short tmp_s;
1230 __vector signed char res_v;
1232 rounded = vec_rint(__A);
1233 tmp_i = vec_cts (rounded, 0);
1234 tmp_s = vec_pack (tmp_i, zero);
1235 res_v = vec_pack (tmp_s, tmp_s);
1236 return (__m64) ((__vector long long) res_v)[0];
1239 /* Selects four specific SPFP values from A and B based on MASK. */
1240 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1242 _mm_shuffle_ps (__m128 __A, __m128 __B, int const __mask)
1244 unsigned long element_selector_10 = __mask & 0x03;
1245 unsigned long element_selector_32 = (__mask >> 2) & 0x03;
1246 unsigned long element_selector_54 = (__mask >> 4) & 0x03;
1247 unsigned long element_selector_76 = (__mask >> 6) & 0x03;
1248 static const unsigned int permute_selectors[4] =
1250 #ifdef __LITTLE_ENDIAN__
1251 0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
1252 #elif __BIG_ENDIAN__
1253 0x0C0D0E0F, 0x08090A0B, 0x04050607, 0x00010203
1254 #endif
1256 __vector unsigned int t;
1258 #ifdef __LITTLE_ENDIAN__
1259 t[0] = permute_selectors[element_selector_10];
1260 t[1] = permute_selectors[element_selector_32];
1261 t[2] = permute_selectors[element_selector_54] + 0x10101010;
1262 t[3] = permute_selectors[element_selector_76] + 0x10101010;
1263 #elif __BIG_ENDIAN__
1264 t[3] = permute_selectors[element_selector_10] + 0x10101010;
1265 t[2] = permute_selectors[element_selector_32] + 0x10101010;
1266 t[1] = permute_selectors[element_selector_54];
1267 t[0] = permute_selectors[element_selector_76];
1268 #endif
1269 return vec_perm ((__v4sf) __A, (__v4sf)__B, (__vector unsigned char)t);
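/* Worked example of the selector encoding used above:

     __m128 a = _mm_set_ps (3.0f, 2.0f, 1.0f, 0.0f);   // a = {0, 1, 2, 3}
     __m128 b = _mm_set_ps (7.0f, 6.0f, 5.0f, 4.0f);   // b = {4, 5, 6, 7}
     // Elements 0-1 of the result come from A, elements 2-3 from B.
     __m128 r = _mm_shuffle_ps (a, b, _MM_SHUFFLE (3, 2, 0, 1));
     // r = {a[1], a[0], b[2], b[3]} = {1, 0, 6, 7}
 */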
1272 /* Selects and interleaves the upper two SPFP values from A and B. */
1273 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1274 _mm_unpackhi_ps (__m128 __A, __m128 __B)
1276 return (__m128) vec_vmrglw ((__v4sf) __A, (__v4sf)__B);
1279 /* Selects and interleaves the lower two SPFP values from A and B. */
1280 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1281 _mm_unpacklo_ps (__m128 __A, __m128 __B)
1283 return (__m128) vec_vmrghw ((__v4sf) __A, (__v4sf)__B);
1286 /* Sets the upper two SPFP values with 64-bits of data loaded from P;
1287 the lower two values are passed through from A. */
1288 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1289 _mm_loadh_pi (__m128 __A, __m64 const *__P)
1291 __vector unsigned long long __a = (__vector unsigned long long)__A;
1292 __vector unsigned long long __p = vec_splats(*__P);
1293 __a [1] = __p [1];
1295 return (__m128)__a;
1298 /* Stores the upper two SPFP values of A into P. */
1299 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1300 _mm_storeh_pi (__m64 *__P, __m128 __A)
1302 __vector unsigned long long __a = (__vector unsigned long long) __A;
1304 *__P = __a[1];
1307 /* Moves the upper two values of B into the lower two values of A. */
1308 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1309 _mm_movehl_ps (__m128 __A, __m128 __B)
1311 return (__m128) vec_mergel ((__vector unsigned long long)__B,
1312 (__vector unsigned long long)__A);
1315 /* Moves the lower two values of B into the upper two values of A. */
1316 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1317 _mm_movelh_ps (__m128 __A, __m128 __B)
1319 return (__m128) vec_mergeh ((__vector unsigned long long)__A,
1320 (__vector unsigned long long)__B);
1323 /* Sets the lower two SPFP values with 64-bits of data loaded from P;
1324 the upper two values are passed through from A. */
1325 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1326 _mm_loadl_pi (__m128 __A, __m64 const *__P)
1328 __vector unsigned long long __a = (__vector unsigned long long)__A;
1329 __vector unsigned long long __p = vec_splats(*__P);
1330 __a [0] = __p [0];
1332 return (__m128)__a;
1335 /* Stores the lower two SPFP values of A into P. */
1336 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1337 _mm_storel_pi (__m64 *__P, __m128 __A)
1339 __vector unsigned long long __a = (__vector unsigned long long) __A;
1341 *__P = __a[0];
1344 #ifdef _ARCH_PWR8
1345 /* Intrinsic functions that require PowerISA 2.07 minimum. */
1347 /* Creates a 4-bit mask from the most significant bits of the SPFP values. */
1348 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1349 _mm_movemask_ps (__m128 __A)
1351 __vector unsigned long long result;
1352 static const __vector unsigned int perm_mask =
1354 #ifdef __LITTLE_ENDIAN__
1355 0x00204060, 0x80808080, 0x80808080, 0x80808080
1356 #elif __BIG_ENDIAN__
1357 0x80808080, 0x80808080, 0x80808080, 0x00204060
1358 #endif
1361 result = ((__vector unsigned long long)
1362 vec_vbpermq ((__vector unsigned char) __A,
1363 (__vector unsigned char) perm_mask));
1365 #ifdef __LITTLE_ENDIAN__
1366 return result[1];
1367 #elif __BIG_ENDIAN__
1368 return result[0];
1369 #endif
1371 #endif /* _ARCH_PWR8 */
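/* _mm_movemask_ps is typically used to reduce a vector compare to a scalar
   test, e.g. "is any element of x negative?".  A minimal sketch (x is a
   placeholder __m128 value; the intrinsic requires _ARCH_PWR8, matching the
   guard above):

     int any_negative =
       _mm_movemask_ps (_mm_cmplt_ps (x, _mm_setzero_ps ())) != 0;
 */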
1373 /* Create a vector with all four elements equal to *P. */
1374 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1375 _mm_load1_ps (float const *__P)
1377 return _mm_set1_ps (*__P);
1380 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1381 _mm_load_ps1 (float const *__P)
1383 return _mm_load1_ps (__P);
1386 /* Extracts one of the four words of A. The selector N must be immediate. */
1387 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1388 _mm_extract_pi16 (__m64 const __A, int const __N)
1390 unsigned int shiftr = __N & 3;
1391 #ifdef __BIG_ENDIAN__
1392 shiftr = 3 - shiftr;
1393 #endif
1395 return ((__A >> (shiftr * 16)) & 0xffff);
1398 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1399 _m_pextrw (__m64 const __A, int const __N)
1401 return _mm_extract_pi16 (__A, __N);
1404 /* Inserts word D into one of four words of A. The selector N must be
1405 immediate. */
1406 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1407 _mm_insert_pi16 (__m64 const __A, int const __D, int const __N)
1409 const int shiftl = (__N & 3) * 16;
1410 const __m64 shiftD = (const __m64) __D << shiftl;
1411 const __m64 mask = 0xffffUL << shiftl;
1412 __m64 result = (__A & (~mask)) | (shiftD & mask);
1414 return (result);
1417 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1418 _m_pinsrw (__m64 const __A, int const __D, int const __N)
1420 return _mm_insert_pi16 (__A, __D, __N);
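/* Example (little-endian layout shown): extract word 2 of an __m64 and
   insert a replacement value in its place.

     __m64 v = 0x0004000300020001UL;          // halfwords {1, 2, 3, 4}
     int w = _mm_extract_pi16 (v, 2);         // w == 3
     __m64 u = _mm_insert_pi16 (v, 9, 2);     // u == 0x0004000900020001UL
 */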
1423 /* Compute the element-wise maximum of signed 16-bit values. */
1424 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1426 _mm_max_pi16 (__m64 __A, __m64 __B)
1428 #if _ARCH_PWR8
1429 __vector signed short a, b, r;
1430 __vector __bool short c;
1432 a = (__vector signed short)vec_splats (__A);
1433 b = (__vector signed short)vec_splats (__B);
1434 c = (__vector __bool short)vec_cmpgt (a, b);
1435 r = vec_sel (b, a, c);
1436 return (__m64) ((__vector long long) r)[0];
1437 #else
1438 __m64_union m1, m2, res;
1440 m1.as_m64 = __A;
1441 m2.as_m64 = __B;
1443 res.as_short[0] =
1444 (m1.as_short[0] > m2.as_short[0]) ? m1.as_short[0] : m2.as_short[0];
1445 res.as_short[1] =
1446 (m1.as_short[1] > m2.as_short[1]) ? m1.as_short[1] : m2.as_short[1];
1447 res.as_short[2] =
1448 (m1.as_short[2] > m2.as_short[2]) ? m1.as_short[2] : m2.as_short[2];
1449 res.as_short[3] =
1450 (m1.as_short[3] > m2.as_short[3]) ? m1.as_short[3] : m2.as_short[3];
1452 return (__m64) res.as_m64;
1453 #endif
1456 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1457 _m_pmaxsw (__m64 __A, __m64 __B)
1459 return _mm_max_pi16 (__A, __B);
1462 /* Compute the element-wise maximum of unsigned 8-bit values. */
1463 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1464 _mm_max_pu8 (__m64 __A, __m64 __B)
1466 #if _ARCH_PWR8
1467 __vector unsigned char a, b, r;
1468 __vector __bool char c;
1470 a = (__vector unsigned char)vec_splats (__A);
1471 b = (__vector unsigned char)vec_splats (__B);
1472 c = (__vector __bool char)vec_cmpgt (a, b);
1473 r = vec_sel (b, a, c);
1474 return (__m64) ((__vector long long) r)[0];
1475 #else
1476 __m64_union m1, m2, res;
1477 long i;
1479 m1.as_m64 = __A;
1480 m2.as_m64 = __B;
1483 for (i = 0; i < 8; i++)
1484 res.as_char[i] =
1485 ((unsigned char) m1.as_char[i] > (unsigned char) m2.as_char[i]) ?
1486 m1.as_char[i] : m2.as_char[i];
1488 return (__m64) res.as_m64;
1489 #endif
1492 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1493 _m_pmaxub (__m64 __A, __m64 __B)
1495 return _mm_max_pu8 (__A, __B);
1498 /* Compute the element-wise minimum of signed 16-bit values. */
1499 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1500 _mm_min_pi16 (__m64 __A, __m64 __B)
1502 #if _ARCH_PWR8
1503 __vector signed short a, b, r;
1504 __vector __bool short c;
1506 a = (__vector signed short)vec_splats (__A);
1507 b = (__vector signed short)vec_splats (__B);
1508 c = (__vector __bool short)vec_cmplt (a, b);
1509 r = vec_sel (b, a, c);
1510 return (__m64) ((__vector long long) r)[0];
1511 #else
1512 __m64_union m1, m2, res;
1514 m1.as_m64 = __A;
1515 m2.as_m64 = __B;
1517 res.as_short[0] =
1518 (m1.as_short[0] < m2.as_short[0]) ? m1.as_short[0] : m2.as_short[0];
1519 res.as_short[1] =
1520 (m1.as_short[1] < m2.as_short[1]) ? m1.as_short[1] : m2.as_short[1];
1521 res.as_short[2] =
1522 (m1.as_short[2] < m2.as_short[2]) ? m1.as_short[2] : m2.as_short[2];
1523 res.as_short[3] =
1524 (m1.as_short[3] < m2.as_short[3]) ? m1.as_short[3] : m2.as_short[3];
1526 return (__m64) res.as_m64;
1527 #endif
1530 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1531 _m_pminsw (__m64 __A, __m64 __B)
1533 return _mm_min_pi16 (__A, __B);
1536 /* Compute the element-wise minimum of unsigned 8-bit values. */
1537 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1538 _mm_min_pu8 (__m64 __A, __m64 __B)
1540 #if _ARCH_PWR8
1541 __vector unsigned char a, b, r;
1542 __vector __bool char c;
1544 a = (__vector unsigned char)vec_splats (__A);
1545 b = (__vector unsigned char)vec_splats (__B);
1546 c = (__vector __bool char)vec_cmplt (a, b);
1547 r = vec_sel (b, a, c);
1548 return (__m64) ((__vector long long) r)[0];
1549 #else
1550 __m64_union m1, m2, res;
1551 long i;
1553 m1.as_m64 = __A;
1554 m2.as_m64 = __B;
1557 for (i = 0; i < 8; i++)
1558 res.as_char[i] =
1559 ((unsigned char) m1.as_char[i] < (unsigned char) m2.as_char[i]) ?
1560 m1.as_char[i] : m2.as_char[i];
1562 return (__m64) res.as_m64;
1563 #endif
1566 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1567 _m_pminub (__m64 __A, __m64 __B)
1569 return _mm_min_pu8 (__A, __B);
1572 /* Create an 8-bit mask of the signs of 8-bit values. */
1573 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1574 _mm_movemask_pi8 (__m64 __A)
unsigned long long p = 0x0008101820283038UL; /* Permute control for the sign bits.  */
1578 return __builtin_bpermd (p, __A);
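/* The constant above is the bpermd bit-select control: its bytes 0x00,
   0x08, ..., 0x38 are PowerPC bit numbers (0 = most significant bit) that
   pick the sign bit of each byte of __A, so the gathered 8-bit result has
   bit i equal to the sign of byte i, matching the x86 semantics.  */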
1581 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1582 _m_pmovmskb (__m64 __A)
1584 return _mm_movemask_pi8 (__A);
1587 /* Multiply four unsigned 16-bit values in A by four unsigned 16-bit values
1588 in B and produce the high 16 bits of the 32-bit results. */
1589 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1590 _mm_mulhi_pu16 (__m64 __A, __m64 __B)
1592 __vector unsigned short a, b;
1593 __vector unsigned short c;
1594 __vector unsigned int w0, w1;
1595 __vector unsigned char xform1 = {
1596 0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17,
1597 0x0A, 0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
1600 a = (__vector unsigned short)vec_splats (__A);
1601 b = (__vector unsigned short)vec_splats (__B);
1603 w0 = vec_vmuleuh (a, b);
1604 w1 = vec_vmulouh (a, b);
1605 c = (__vector unsigned short)vec_perm (w0, w1, xform1);
1607 return (__m64) ((__vector long long) c)[0];
1610 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1611 _m_pmulhuw (__m64 __A, __m64 __B)
1613 return _mm_mulhi_pu16 (__A, __B);
1616 /* Return a combination of the four 16-bit values in A. The selector
1617 must be an immediate. */
1618 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1619 _mm_shuffle_pi16 (__m64 __A, int const __N)
1621 unsigned long element_selector_10 = __N & 0x03;
1622 unsigned long element_selector_32 = (__N >> 2) & 0x03;
1623 unsigned long element_selector_54 = (__N >> 4) & 0x03;
1624 unsigned long element_selector_76 = (__N >> 6) & 0x03;
1625 static const unsigned short permute_selectors[4] =
1627 #ifdef __LITTLE_ENDIAN__
1628 0x0908, 0x0B0A, 0x0D0C, 0x0F0E
1629 #elif __BIG_ENDIAN__
1630 0x0607, 0x0405, 0x0203, 0x0001
1631 #endif
1633 __m64_union t;
1634 __vector unsigned long long a, p, r;
1636 #ifdef __LITTLE_ENDIAN__
1637 t.as_short[0] = permute_selectors[element_selector_10];
1638 t.as_short[1] = permute_selectors[element_selector_32];
1639 t.as_short[2] = permute_selectors[element_selector_54];
1640 t.as_short[3] = permute_selectors[element_selector_76];
1641 #elif __BIG_ENDIAN__
1642 t.as_short[3] = permute_selectors[element_selector_10];
1643 t.as_short[2] = permute_selectors[element_selector_32];
1644 t.as_short[1] = permute_selectors[element_selector_54];
1645 t.as_short[0] = permute_selectors[element_selector_76];
1646 #endif
1647 p = vec_splats (t.as_m64);
1648 a = vec_splats (__A);
1649 r = vec_perm (a, a, (__vector unsigned char)p);
1650 return (__m64) ((__vector long long) r)[0];
1653 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1654 _m_pshufw (__m64 __A, int const __N)
1656 return _mm_shuffle_pi16 (__A, __N);
1659 /* Conditionally store byte elements of A into P. The high bit of each
1660 byte in the selector N determines whether the corresponding byte from
1661 A is stored. */
1662 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1663 _mm_maskmove_si64 (__m64 __A, __m64 __N, char *__P)
1665 __m64 hibit = 0x8080808080808080UL;
1666 __m64 mask, tmp;
1667 __m64 *p = (__m64*)__P;
1669 tmp = *p;
1670 mask = _mm_cmpeq_pi8 ((__N & hibit), hibit);
1671 tmp = (tmp & (~mask)) | (__A & mask);
1672 *p = tmp;
1675 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1676 _m_maskmovq (__m64 __A, __m64 __N, char *__P)
1678 _mm_maskmove_si64 (__A, __N, __P);
1681 /* Compute the rounded averages of the unsigned 8-bit values in A and B. */
1682 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1683 _mm_avg_pu8 (__m64 __A, __m64 __B)
1685 __vector unsigned char a, b, c;
1687 a = (__vector unsigned char)vec_splats (__A);
1688 b = (__vector unsigned char)vec_splats (__B);
1689 c = vec_avg (a, b);
1690 return (__m64) ((__vector long long) c)[0];
1693 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1694 _m_pavgb (__m64 __A, __m64 __B)
1696 return _mm_avg_pu8 (__A, __B);
1699 /* Compute the rounded averages of the unsigned 16-bit values in A and B. */
1700 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1701 _mm_avg_pu16 (__m64 __A, __m64 __B)
1703 __vector unsigned short a, b, c;
1705 a = (__vector unsigned short)vec_splats (__A);
1706 b = (__vector unsigned short)vec_splats (__B);
1707 c = vec_avg (a, b);
1708 return (__m64) ((__vector long long) c)[0];
1711 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1712 _m_pavgw (__m64 __A, __m64 __B)
1714 return _mm_avg_pu16 (__A, __B);
1717 /* Compute the sum of the absolute differences of the unsigned 8-bit
1718 values in A and B. Return the value in the lower 16-bit word; the
1719 upper words are cleared. */
1720 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1721 _mm_sad_pu8 (__m64 __A, __m64 __B)
1723 __vector unsigned char a, b;
1724 __vector unsigned char vmin, vmax, vabsdiff;
1725 __vector signed int vsum;
1726 const __vector unsigned int zero =
1727 { 0, 0, 0, 0 };
1728 unsigned short result;
1730 a = (__vector unsigned char) (__vector unsigned long long) { 0UL, __A };
1731 b = (__vector unsigned char) (__vector unsigned long long) { 0UL, __B };
1732 vmin = vec_min (a, b);
1733 vmax = vec_max (a, b);
1734 vabsdiff = vec_sub (vmax, vmin);
1735 /* Sum four groups of bytes into integers. */
1736 vsum = (__vector signed int) vec_sum4s (vabsdiff, zero);
1737 /* Sum across four integers with integer result. */
1738 vsum = vec_sums (vsum, (__vector signed int) zero);
/* The sum is in the rightmost 32 bits of the vector result.
   Transfer to a GPR and truncate to 16 bits.  */
1741 result = vsum[3];
1742 return (result);
1745 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1746 _m_psadbw (__m64 __A, __m64 __B)
1748 return _mm_sad_pu8 (__A, __B);
1751 /* Stores the data in A to the address P without polluting the caches. */
1752 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1753 _mm_stream_pi (__m64 *__P, __m64 __A)
1755 /* Use the data cache block touch for store transient. */
1756 __asm__ (
1757 " dcbtstt 0,%0"
1759 : "b" (__P)
1760 : "memory"
1762 *__P = __A;
1765 /* Likewise. The address must be 16-byte aligned. */
1766 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1767 _mm_stream_ps (float *__P, __m128 __A)
1769 /* Use the data cache block touch for store transient. */
1770 __asm__ (
1771 " dcbtstt 0,%0"
1773 : "b" (__P)
1774 : "memory"
1776 _mm_store_ps (__P, __A);
1779 /* Guarantees that every preceding store is globally visible before
1780 any subsequent store. */
1781 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1782 _mm_sfence (void)
/* Generate a lightweight sync.  */
1785 __atomic_thread_fence (__ATOMIC_RELEASE);
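/* A typical pairing with the non-temporal stores above: publish data with
   _mm_stream_ps, then fence before setting a flag another thread reads.
   A sketch, assuming out points to a 16-byte aligned buffer, v is the
   vector being stored, and ready is a shared flag:

     _mm_stream_ps (out, v);
     _mm_sfence ();
     *ready = 1;
 */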
1788 /* The execution of the next instruction is delayed by an implementation
1789 specific amount of time. The instruction does not modify the
1790 architectural state. This is after the pop_options pragma because
1791 it does not require SSE support in the processor--the encoding is a
1792 nop on processors that do not support it. */
1793 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1794 _mm_pause (void)
1796 /* There is no exact match with this construct, but the following is
1797 close to the desired effect. */
1798 #if _ARCH_PWR8
/* On power8 and later processors we can depend on Program Priority
   (PRI) and the associated "very low" PRI setting.  Since we don't
   know what PRI this thread is running at we: 1) save the current
   PRI from the PPR SPR into a local GPR, 2) set the PRI to "very
   low" via the special or 31,31,31 encoding, 3) issue an "isync" to
   ensure the PRI change takes effect before we execute any more
   instructions.
   Now we can execute a lwsync (release barrier) while we execute
   this thread at "very low" PRI.  Finally we restore the original
   PRI and continue execution. */
1809 unsigned long __PPR;
1811 __asm__ volatile (
1812 " mfppr %0;"
1813 " or 31,31,31;"
1814 " isync;"
1815 " lwsync;"
1816 " isync;"
1817 " mtppr %0;"
1818 : "=r" (__PPR)
1820 : "memory"
1822 #else
1823 /* For older processor where we may not even have Program Priority
1824 controls we can only depend on Heavy Weight Sync. */
1825 __atomic_thread_fence (__ATOMIC_SEQ_CST);
1826 #endif
1829 /* Transpose the 4x4 matrix composed of row[0-3]. */
1830 #define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
1831 do { \
1832 __v4sf __r0 = (row0), __r1 = (row1), __r2 = (row2), __r3 = (row3); \
1833 __v4sf __t0 = vec_vmrghw (__r0, __r1); \
1834 __v4sf __t1 = vec_vmrghw (__r2, __r3); \
1835 __v4sf __t2 = vec_vmrglw (__r0, __r1); \
1836 __v4sf __t3 = vec_vmrglw (__r2, __r3); \
1837 (row0) = (__v4sf)vec_mergeh ((__vector long long)__t0, \
1838 (__vector long long)__t1); \
1839 (row1) = (__v4sf)vec_mergel ((__vector long long)__t0, \
1840 (__vector long long)__t1); \
1841 (row2) = (__v4sf)vec_mergeh ((__vector long long)__t2, \
1842 (__vector long long)__t3); \
1843 (row3) = (__v4sf)vec_mergel ((__vector long long)__t2, \
1844 (__vector long long)__t3); \
1845 } while (0)
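/* Usage example: transpose four rows in place.

     __m128 r0 = _mm_set_ps ( 3.0f,  2.0f,  1.0f,  0.0f);
     __m128 r1 = _mm_set_ps ( 7.0f,  6.0f,  5.0f,  4.0f);
     __m128 r2 = _mm_set_ps (11.0f, 10.0f,  9.0f,  8.0f);
     __m128 r3 = _mm_set_ps (15.0f, 14.0f, 13.0f, 12.0f);
     _MM_TRANSPOSE4_PS (r0, r1, r2, r3);
     // Now r0 = {0, 4, 8, 12}, r1 = {1, 5, 9, 13},
     //     r2 = {2, 6, 10, 14}, r3 = {3, 7, 11, 15}.
 */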
1847 /* For backward source compatibility. */
1848 //# include <emmintrin.h>
1850 #endif /* _XMMINTRIN_H_INCLUDED */