gcc/config/rs6000/xmmintrin.h
1 /* Copyright (C) 2002-2017 Free Software Foundation, Inc.
3 This file is part of GCC.
5 GCC is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 3, or (at your option)
8 any later version.
10 GCC is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
15 Under Section 7 of GPL version 3, you are granted additional
16 permissions described in the GCC Runtime Library Exception, version
17 3.1, as published by the Free Software Foundation.
19 You should have received a copy of the GNU General Public License and
20 a copy of the GCC Runtime Library Exception along with this program;
21 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
22 <http://www.gnu.org/licenses/>. */
24 /* Implemented from the specification included in the Intel C++ Compiler
25 User Guide and Reference, version 9.0. */
27 #ifndef NO_WARN_X86_INTRINSICS
28 /* This header is distributed to simplify porting x86_64 code that
29 makes explicit use of Intel intrinsics to powerpc64le.
30 It is the user's responsibility to determine if the results are
31 acceptable and make additional changes as necessary.
32 Note that much code that uses Intel intrinsics can be rewritten in
33 standard C or GNU C extensions, which are more portable and better
34 optimized across multiple targets.
36 In the specific case of X86 SSE (__m128) intrinsics, the PowerPC
37 VMX/VSX ISA is a good match for vector float SIMD operations.
38 However, scalar float operations in vector (XMM) registers require
39 the POWER8 VSX ISA (2.07) level. Also, there are important
40 differences in the data format and placement of float scalars in the
41 vector register. For PowerISA, scalar floats in FPRs (the leftmost
42 64 bits of the low 32 VSRs) are kept in double format, while X86_64 SSE
43 uses the rightmost 32 bits of the XMM register. These differences require
44 extra steps on POWER to match the SSE scalar float semantics.
46 Most SSE scalar float intrinsic operations can be performed more
47 efficiently as C language float scalar operations or optimized to
48 use vector SIMD operations. We recommend this for new applications.
50 Another difference is the format and details of the X86_64 MXCSR vs
51 the PowerISA FPSCR / VSCR registers. We recommend applications
52 replace direct access to the MXCSR with the more portable <fenv.h>
53 POSIX APIs. */
54 #warning "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this warning."
55 #endif
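/* Usage sketch (kept under "#if 0" so it is never compiled): the portable
   <fenv.h> interface recommended above can replace direct MXCSR access for
   rounding-mode control.  The function name example_lrint_toward_zero is
   illustrative only.  */
#if 0
#include <fenv.h>
#include <math.h>

static long
example_lrint_toward_zero (float __f)
{
  int __old_mode = fegetround ();	/* Save the current rounding mode.  */
  long __result;

  fesetround (FE_TOWARDZERO);		/* Stands in for writing MXCSR.RC.  */
  __result = lrintf (__f);		/* Converts using the current mode.  */
  fesetround (__old_mode);		/* Restore the saved mode.  */
  return __result;
}
#endif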
57 #ifndef _XMMINTRIN_H_INCLUDED
58 #define _XMMINTRIN_H_INCLUDED
60 #include <altivec.h>
61 #include <assert.h>
63 /* We need type definitions from the MMX header file. */
64 #include <mmintrin.h>
66 /* Get _mm_malloc () and _mm_free (). */
67 #include <mm_malloc.h>
69 /* The Intel API is flexible enough that we must allow aliasing with other
70 vector types, and their scalar components. */
71 typedef float __m128 __attribute__ ((__vector_size__ (16), __may_alias__));
73 /* Internal data types for implementing the intrinsics. */
74 typedef float __v4sf __attribute__ ((__vector_size__ (16)));
76 /* Create an undefined vector. */
77 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
78 _mm_undefined_ps (void)
80 __m128 __Y = __Y;
81 return __Y;
84 /* Create a vector of zeros. */
85 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
86 _mm_setzero_ps (void)
88 return __extension__ (__m128){ 0.0f, 0.0f, 0.0f, 0.0f };
91 /* Load four SPFP values from P. The address must be 16-byte aligned. */
92 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
93 _mm_load_ps (float const *__P)
95 assert(((unsigned long)__P & 0xfUL) == 0UL);
96 return ((__m128)vec_ld(0, (__v4sf*)__P));
99 /* Load four SPFP values from P. The address need not be 16-byte aligned. */
100 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
101 _mm_loadu_ps (float const *__P)
103 return (vec_vsx_ld(0, __P));
106 /* Load four SPFP values in reverse order. The address must be aligned. */
107 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
108 _mm_loadr_ps (float const *__P)
110 __v4sf __tmp;
111 __m128 result;
112 static const __vector unsigned char permute_vector =
113 { 0x1C, 0x1D, 0x1E, 0x1F, 0x18, 0x19, 0x1A, 0x1B, 0x14, 0x15, 0x16,
114 0x17, 0x10, 0x11, 0x12, 0x13 };
116 __tmp = vec_ld (0, (__v4sf *) __P);
117 result = (__m128) vec_perm (__tmp, __tmp, permute_vector);
118 return result;
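/* Usage sketch (not compiled): _mm_load_ps asserts 16-byte alignment, while
   _mm_loadu_ps accepts any address via vec_vsx_ld.  The function name
   example_sum4 is illustrative only.  */
#if 0
static float
example_sum4 (const float *__p)
{
  /* __p need not be aligned; use the unaligned load.  */
  __m128 __v = _mm_loadu_ps (__p);
  /* A 16-byte aligned buffer satisfies the _mm_store_ps assertion.  */
  float __out[4] __attribute__ ((__aligned__ (16)));

  _mm_store_ps (__out, __v);
  return __out[0] + __out[1] + __out[2] + __out[3];
}
#endif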
121 /* Create a vector with all four elements equal to F. */
122 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
123 _mm_set1_ps (float __F)
125 return __extension__ (__m128)(__v4sf){ __F, __F, __F, __F };
128 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
129 _mm_set_ps1 (float __F)
131 return _mm_set1_ps (__F);
134 /* Create the vector [Z Y X W]. */
135 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
136 _mm_set_ps (const float __Z, const float __Y, const float __X, const float __W)
138 return __extension__ (__m128)(__v4sf){ __W, __X, __Y, __Z };
141 /* Create the vector [W X Y Z]. */
142 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
143 _mm_setr_ps (float __Z, float __Y, float __X, float __W)
145 return __extension__ (__m128)(__v4sf){ __Z, __Y, __X, __W };
148 /* Store four SPFP values. The address must be 16-byte aligned. */
149 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
150 _mm_store_ps (float *__P, __m128 __A)
152 assert(((unsigned long)__P & 0xfUL) == 0UL);
153 vec_st((__v4sf)__A, 0, (__v4sf*)__P);
156 /* Store four SPFP values. The address need not be 16-byte aligned. */
157 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
158 _mm_storeu_ps (float *__P, __m128 __A)
160 *(__m128 *)__P = __A;
163 /* Store four SPFP values in reverse order. The address must be aligned. */
164 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
165 _mm_storer_ps (float *__P, __m128 __A)
167 __v4sf __tmp;
168 static const __vector unsigned char permute_vector =
169 { 0x1C, 0x1D, 0x1E, 0x1F, 0x18, 0x19, 0x1A, 0x1B, 0x14, 0x15, 0x16,
170 0x17, 0x10, 0x11, 0x12, 0x13 };
172 __tmp = (__m128) vec_perm (__A, __A, permute_vector);
174 _mm_store_ps (__P, __tmp);
177 /* Store the lower SPFP value across four words. */
178 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
179 _mm_store1_ps (float *__P, __m128 __A)
181 __v4sf __va = vec_splat((__v4sf)__A, 0);
182 _mm_store_ps (__P, __va);
185 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
186 _mm_store_ps1 (float *__P, __m128 __A)
188 _mm_store1_ps (__P, __A);
191 /* Create a vector with element 0 as F and the rest zero. */
192 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
193 _mm_set_ss (float __F)
195 return __extension__ (__m128)(__v4sf){ __F, 0.0f, 0.0f, 0.0f };
198 /* Sets the low SPFP value of A from the low value of B. */
199 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
200 _mm_move_ss (__m128 __A, __m128 __B)
202 static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
204 return (vec_sel ((__v4sf)__A, (__v4sf)__B, mask));
207 /* Create a vector with element 0 as *P and the rest zero. */
208 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
209 _mm_load_ss (float const *__P)
211 return _mm_set_ss (*__P);
214 /* Stores the lower SPFP value. */
215 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
216 _mm_store_ss (float *__P, __m128 __A)
218 *__P = ((__v4sf)__A)[0];
221 /* Perform the respective operation on the lower SPFP (single-precision
222 floating-point) values of A and B; the upper three SPFP values are
223 passed through from A. */
225 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
226 _mm_add_ss (__m128 __A, __m128 __B)
228 #ifdef _ARCH_PWR7
229 __m128 a, b, c;
230 static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
231 /* PowerISA VSX does not allow partial (for just the lower float)
232 results. So to ensure we don't generate spurious exceptions
233 (from the upper float values) we splat the lower float
234 before we do the operation. */
235 a = vec_splat (__A, 0);
236 b = vec_splat (__B, 0);
237 c = a + b;
238 /* Then we merge the lower float result with the original upper
239 float elements from __A. */
240 return (vec_sel (__A, c, mask));
241 #else
242 __A[0] = __A[0] + __B[0];
243 return (__A);
244 #endif
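/* The pattern above (splat element 0 of both operands, do the full-vector
   operation, then select element 0 of the result back into __A) is shared
   by all of the *_ss arithmetic below.  For new code the same effect is a
   plain scalar C expression, as recommended in the header comment.  Sketch
   only (not compiled); example_add_ss_scalar is an illustrative name.  */
#if 0
static __m128
example_add_ss_scalar (__m128 __A, __m128 __B)
{
  /* Element 0 is updated; elements 1-3 of __A pass through unchanged.  */
  __A[0] = __A[0] + __B[0];
  return __A;
}
#endif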
247 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
248 _mm_sub_ss (__m128 __A, __m128 __B)
250 #ifdef _ARCH_PWR7
251 __m128 a, b, c;
252 static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
253 /* PowerISA VSX does not allow partial (for just the lower float)
254 results. So to ensure we don't generate spurious exceptions
255 (from the upper float values) we splat the lower float
256 before we do the operation. */
257 a = vec_splat (__A, 0);
258 b = vec_splat (__B, 0);
259 c = a - b;
260 /* Then we merge the lower float result with the original upper
261 float elements from __A. */
262 return (vec_sel (__A, c, mask));
263 #else
264 __A[0] = __A[0] - __B[0];
265 return (__A);
266 #endif
269 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
270 _mm_mul_ss (__m128 __A, __m128 __B)
272 #ifdef _ARCH_PWR7
273 __m128 a, b, c;
274 static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
275 /* PowerISA VSX does not allow partial (for just the lower float)
276 results. So to ensure we don't generate spurious exceptions
277 (from the upper float values) we splat the lower float
278 before we do the operation. */
279 a = vec_splat (__A, 0);
280 b = vec_splat (__B, 0);
281 c = a * b;
282 /* Then we merge the lower float result with the original upper
283 float elements from __A. */
284 return (vec_sel (__A, c, mask));
285 #else
286 __A[0] = __A[0] * __B[0];
287 return (__A);
288 #endif
291 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
292 _mm_div_ss (__m128 __A, __m128 __B)
294 #ifdef _ARCH_PWR7
295 __m128 a, b, c;
296 static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
297 /* PowerISA VSX does not allow partial (for just the lower float)
298 results. So to ensure we don't generate spurious exceptions
299 (from the upper float values) we splat the lower float
300 before we do the operation. */
301 a = vec_splat (__A, 0);
302 b = vec_splat (__B, 0);
303 c = a / b;
304 /* Then we merge the lower float result with the original upper
305 float elements from __A. */
306 return (vec_sel (__A, c, mask));
307 #else
308 __A[0] = __A[0] / __B[0];
309 return (__A);
310 #endif
313 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
314 _mm_sqrt_ss (__m128 __A)
316 __m128 a, c;
317 static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
318 /* PowerISA VSX does not allow partial (for just the lower float)
319 * results. So to ensure we don't generate spurious exceptions
320 * (from the upper float values) we splat the lower float
321 * before we do the operation. */
322 a = vec_splat (__A, 0);
323 c = vec_sqrt (a);
324 /* Then we merge the lower float result with the original upper
325 * float elements from __A. */
326 return (vec_sel (__A, c, mask));
329 /* Perform the respective operation on the four SPFP values in A and B. */
330 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
331 _mm_add_ps (__m128 __A, __m128 __B)
333 return (__m128) ((__v4sf)__A + (__v4sf)__B);
336 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
337 _mm_sub_ps (__m128 __A, __m128 __B)
339 return (__m128) ((__v4sf)__A - (__v4sf)__B);
342 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
343 _mm_mul_ps (__m128 __A, __m128 __B)
345 return (__m128) ((__v4sf)__A * (__v4sf)__B);
348 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
349 _mm_div_ps (__m128 __A, __m128 __B)
351 return (__m128) ((__v4sf)__A / (__v4sf)__B);
354 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
355 _mm_sqrt_ps (__m128 __A)
357 return (vec_sqrt ((__v4sf)__A));
360 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
361 _mm_rcp_ps (__m128 __A)
363 return (vec_re ((__v4sf)__A));
366 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
367 _mm_rsqrt_ps (__m128 __A)
369 return (vec_rsqrte (__A));
372 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
373 _mm_rcp_ss (__m128 __A)
375 __m128 a, c;
376 static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
377 /* PowerISA VSX does not allow partial (for just the lower float)
378 * results. So to ensure we don't generate spurious exceptions
379 * (from the upper float values) we splat the lower float
380 * before we do the operation. */
381 a = vec_splat (__A, 0);
382 c = _mm_rcp_ps (a);
383 /* Then we merge the lower float result with the original upper
384 * float elements from __A. */
385 return (vec_sel (__A, c, mask));
388 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
389 _mm_rsqrt_ss (__m128 __A)
391 __m128 a, c;
392 static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
393 /* PowerISA VSX does not allow partial (for just the lower float)
394 * results. So to ensure we don't generate spurious exceptions
395 * (from the upper float values) we splat the lower float
396 * before we do the operation. */
397 a = vec_splat (__A, 0);
398 c = vec_rsqrte (a);
399 /* Then we merge the lower float result with the original upper
400 * float elements from __A. */
401 return (vec_sel (__A, c, mask));
404 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
405 _mm_min_ss (__m128 __A, __m128 __B)
407 __v4sf a, b, c;
408 static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
409 /* PowerISA VSX does not allow partial (for just lower float)
410 * results. So to ensure we don't generate spurious exceptions
411 * (from the upper float values) we splat the lower float
412 * before we do the operation. */
413 a = vec_splat ((__v4sf)__A, 0);
414 b = vec_splat ((__v4sf)__B, 0);
415 c = vec_min (a, b);
416 /* Then we merge the lower float result with the original upper
417 * float elements from __A. */
418 return (vec_sel ((__v4sf)__A, c, mask));
421 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
422 _mm_max_ss (__m128 __A, __m128 __B)
424 __v4sf a, b, c;
425 static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
426 /* PowerISA VSX does not allow partial (for just lower float)
427 * results. So to ensure we don't generate spurious exceptions
428 * (from the upper float values) we splat the lower float
429 * before we do the operation. */
430 a = vec_splat (__A, 0);
431 b = vec_splat (__B, 0);
432 c = vec_max (a, b);
433 /* Then we merge the lower float result with the original upper
434 * float elements from __A. */
435 return (vec_sel ((__v4sf)__A, c, mask));
438 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
439 _mm_min_ps (__m128 __A, __m128 __B)
441 return ((__m128)vec_min ((__v4sf)__A,(__v4sf) __B));
444 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
445 _mm_max_ps (__m128 __A, __m128 __B)
447 return ((__m128)vec_max ((__v4sf)__A, (__v4sf)__B));
450 /* Perform logical bit-wise operations on 128-bit values. */
451 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
452 _mm_and_ps (__m128 __A, __m128 __B)
454 return ((__m128)vec_and ((__v4sf)__A, (__v4sf)__B));
455 // return __builtin_ia32_andps (__A, __B);
458 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
459 _mm_andnot_ps (__m128 __A, __m128 __B)
461 return ((__m128)vec_andc ((__v4sf)__B, (__v4sf)__A));
464 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
465 _mm_or_ps (__m128 __A, __m128 __B)
467 return ((__m128)vec_or ((__v4sf)__A, (__v4sf)__B));
470 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
471 _mm_xor_ps (__m128 __A, __m128 __B)
473 return ((__m128)vec_xor ((__v4sf)__A, (__v4sf)__B));
476 /* Perform a comparison on the four SPFP values of A and B. For each
477 element, if the comparison is true, place a mask of all ones in the
478 result, otherwise a mask of zeros. */
479 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
480 _mm_cmpeq_ps (__m128 __A, __m128 __B)
482 return ((__m128)vec_cmpeq ((__v4sf)__A,(__v4sf) __B));
485 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
486 _mm_cmplt_ps (__m128 __A, __m128 __B)
488 return ((__m128)vec_cmplt ((__v4sf)__A, (__v4sf)__B));
491 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
492 _mm_cmple_ps (__m128 __A, __m128 __B)
494 return ((__m128)vec_cmple ((__v4sf)__A, (__v4sf)__B));
497 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
498 _mm_cmpgt_ps (__m128 __A, __m128 __B)
500 return ((__m128)vec_cmpgt ((__v4sf)__A, (__v4sf)__B));
503 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
504 _mm_cmpge_ps (__m128 __A, __m128 __B)
506 return ((__m128)vec_cmpge ((__v4sf)__A, (__v4sf)__B));
509 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
510 _mm_cmpneq_ps (__m128 __A, __m128 __B)
512 __v4sf temp = (__v4sf ) vec_cmpeq ((__v4sf) __A, (__v4sf)__B);
513 return ((__m128)vec_nor (temp, temp));
516 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
517 _mm_cmpnlt_ps (__m128 __A, __m128 __B)
519 return ((__m128)vec_cmpge ((__v4sf)__A, (__v4sf)__B));
522 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
523 _mm_cmpnle_ps (__m128 __A, __m128 __B)
525 return ((__m128)vec_cmpgt ((__v4sf)__A, (__v4sf)__B));
528 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
529 _mm_cmpngt_ps (__m128 __A, __m128 __B)
531 return ((__m128)vec_cmple ((__v4sf)__A, (__v4sf)__B));
534 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
535 _mm_cmpnge_ps (__m128 __A, __m128 __B)
537 return ((__m128)vec_cmplt ((__v4sf)__A, (__v4sf)__B));
540 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
541 _mm_cmpord_ps (__m128 __A, __m128 __B)
543 __vector unsigned int a, b;
544 __vector unsigned int c, d;
545 static const __vector unsigned int float_exp_mask =
546 { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 };
548 a = (__vector unsigned int) vec_abs ((__v4sf)__A);
549 b = (__vector unsigned int) vec_abs ((__v4sf)__B);
550 c = (__vector unsigned int) vec_cmpgt (float_exp_mask, a);
551 d = (__vector unsigned int) vec_cmpgt (float_exp_mask, b);
552 return ((__m128 ) vec_and (c, d));
555 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
556 _mm_cmpunord_ps (__m128 __A, __m128 __B)
558 __vector unsigned int a, b;
559 __vector unsigned int c, d;
560 static const __vector unsigned int float_exp_mask =
561 { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 };
563 a = (__vector unsigned int) vec_abs ((__v4sf)__A);
564 b = (__vector unsigned int) vec_abs ((__v4sf)__B);
565 c = (__vector unsigned int) vec_cmpgt (a, float_exp_mask);
566 d = (__vector unsigned int) vec_cmpgt (b, float_exp_mask);
567 return ((__m128 ) vec_or (c, d));
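/* The two functions above rely on the IEEE-754 encoding: a float is a NaN
   exactly when its absolute value, viewed as an unsigned integer, is
   greater than the exponent mask 0x7f800000.  A scalar sketch of the same
   test (not compiled; example_is_nan is an illustrative name):  */
#if 0
static int
example_is_nan (float __x)
{
  union { float __f; unsigned int __u; } __v = { __x };

  /* Clear the sign bit, then compare against the exponent mask.
     0x7fc00000 (a quiet NaN) > 0x7f800000, while infinity (0x7f800000)
     and every finite value compare less than or equal.  */
  return (__v.__u & 0x7fffffffu) > 0x7f800000u;
}
#endif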
570 /* Perform a comparison on the lower SPFP values of A and B. If the
571 comparison is true, place a mask of all ones in the result, otherwise a
572 mask of zeros. The upper three SPFP values are passed through from A. */
573 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
574 _mm_cmpeq_ss (__m128 __A, __m128 __B)
576 static const __vector unsigned int mask =
577 { 0xffffffff, 0, 0, 0 };
578 __v4sf a, b, c;
579 /* PowerISA VMX does not allow partial (for just element 0)
580 * results. So to ensure we don't generate spurious exceptions
581 * (from the upper elements) we splat the lower float
582 * before we do the operation. */
583 a = vec_splat ((__v4sf) __A, 0);
584 b = vec_splat ((__v4sf) __B, 0);
585 c = (__v4sf) vec_cmpeq(a, b);
586 /* Then we merge the lower float result with the original upper
587 * float elements from __A. */
588 return ((__m128)vec_sel ((__v4sf)__A, c, mask));
591 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
592 _mm_cmplt_ss (__m128 __A, __m128 __B)
594 static const __vector unsigned int mask =
595 { 0xffffffff, 0, 0, 0 };
596 __v4sf a, b, c;
597 /* PowerISA VMX does not allow partial (for just element 0)
598 * results. So to ensure we don't generate spurious exceptions
599 * (from the upper elements) we splat the lower float
600 * before we do the operation. */
601 a = vec_splat ((__v4sf) __A, 0);
602 b = vec_splat ((__v4sf) __B, 0);
603 c = (__v4sf) vec_cmplt(a, b);
604 /* Then we merge the lower float result with the original upper
605 * float elements from __A. */
606 return ((__m128)vec_sel ((__v4sf)__A, c, mask));
609 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
610 _mm_cmple_ss (__m128 __A, __m128 __B)
612 static const __vector unsigned int mask =
613 { 0xffffffff, 0, 0, 0 };
614 __v4sf a, b, c;
615 /* PowerISA VMX does not allow partial (for just element 0)
616 * results. So to ensure we don't generate spurious exceptions
617 * (from the upper elements) we splat the lower float
618 * before we do the operation. */
619 a = vec_splat ((__v4sf) __A, 0);
620 b = vec_splat ((__v4sf) __B, 0);
621 c = (__v4sf) vec_cmple(a, b);
622 /* Then we merge the lower float result with the original upper
623 * float elements from __A. */
624 return ((__m128)vec_sel ((__v4sf)__A, c, mask));
627 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
628 _mm_cmpgt_ss (__m128 __A, __m128 __B)
630 static const __vector unsigned int mask =
631 { 0xffffffff, 0, 0, 0 };
632 __v4sf a, b, c;
633 /* PowerISA VMX does not allow partial (for just element 0)
634 * results. So to ensure we don't generate spurious exceptions
635 * (from the upper elements) we splat the lower float
636 * before we do the operation. */
637 a = vec_splat ((__v4sf) __A, 0);
638 b = vec_splat ((__v4sf) __B, 0);
639 c = (__v4sf) vec_cmpgt(a, b);
640 /* Then we merge the lower float result with the original upper
641 * float elements from __A. */
642 return ((__m128)vec_sel ((__v4sf)__A, c, mask));
645 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
646 _mm_cmpge_ss (__m128 __A, __m128 __B)
648 static const __vector unsigned int mask =
649 { 0xffffffff, 0, 0, 0 };
650 __v4sf a, b, c;
651 /* PowerISA VMX does not allow partial (for just element 0)
652 * results. So to ensure we don't generate spurious exceptions
653 * (from the upper elements) we splat the lower float
654 * before we do the operation. */
655 a = vec_splat ((__v4sf) __A, 0);
656 b = vec_splat ((__v4sf) __B, 0);
657 c = (__v4sf) vec_cmpge(a, b);
658 /* Then we merge the lower float result with the original upper
659 * float elements from __A. */
660 return ((__m128)vec_sel ((__v4sf)__A, c, mask));
663 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
664 _mm_cmpneq_ss (__m128 __A, __m128 __B)
666 static const __vector unsigned int mask =
667 { 0xffffffff, 0, 0, 0 };
668 __v4sf a, b, c;
669 /* PowerISA VMX does not allow partial (for just element 0)
670 * results. So to ensure we don't generate spurious exceptions
671 * (from the upper elements) we splat the lower float
672 * before we do the operation. */
673 a = vec_splat ((__v4sf) __A, 0);
674 b = vec_splat ((__v4sf) __B, 0);
675 c = (__v4sf) vec_cmpeq(a, b);
676 c = vec_nor (c, c);
677 /* Then we merge the lower float result with the original upper
678 * float elements from __A. */
679 return ((__m128)vec_sel ((__v4sf)__A, c, mask));
682 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
683 _mm_cmpnlt_ss (__m128 __A, __m128 __B)
685 static const __vector unsigned int mask =
686 { 0xffffffff, 0, 0, 0 };
687 __v4sf a, b, c;
688 /* PowerISA VMX does not allow partial (for just element 0)
689 * results. So to ensure we don't generate spurious exceptions
690 * (from the upper elements) we splat the lower float
691 * before we do the operation. */
692 a = vec_splat ((__v4sf) __A, 0);
693 b = vec_splat ((__v4sf) __B, 0);
694 c = (__v4sf) vec_cmpge(a, b);
695 /* Then we merge the lower float result with the original upper
696 * float elements from __A. */
697 return ((__m128)vec_sel ((__v4sf)__A, c, mask));
700 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
701 _mm_cmpnle_ss (__m128 __A, __m128 __B)
703 static const __vector unsigned int mask =
704 { 0xffffffff, 0, 0, 0 };
705 __v4sf a, b, c;
706 /* PowerISA VMX does not allow partial (for just element 0)
707 * results. So to ensure we don't generate spurious exceptions
708 * (from the upper elements) we splat the lower float
709 * before we do the operation. */
710 a = vec_splat ((__v4sf) __A, 0);
711 b = vec_splat ((__v4sf) __B, 0);
712 c = (__v4sf) vec_cmpgt(a, b);
713 /* Then we merge the lower float result with the original upper
714 * float elements from __A. */
715 return ((__m128)vec_sel ((__v4sf)__A, c, mask));
718 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
719 _mm_cmpngt_ss (__m128 __A, __m128 __B)
721 static const __vector unsigned int mask =
722 { 0xffffffff, 0, 0, 0 };
723 __v4sf a, b, c;
724 /* PowerISA VMX does not allow partial (for just element 0)
725 * results. So to ensure we don't generate spurious exceptions
726 * (from the upper elements) we splat the lower float
727 * before we do the operation. */
728 a = vec_splat ((__v4sf) __A, 0);
729 b = vec_splat ((__v4sf) __B, 0);
730 c = (__v4sf) vec_cmple(a, b);
731 /* Then we merge the lower float result with the original upper
732 * float elements from __A. */
733 return ((__m128)vec_sel ((__v4sf)__A, c, mask));
736 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
737 _mm_cmpnge_ss (__m128 __A, __m128 __B)
739 static const __vector unsigned int mask =
740 { 0xffffffff, 0, 0, 0 };
741 __v4sf a, b, c;
742 /* PowerISA VMX does not allow partial (for just element 0)
743 * results. So to ensure we don't generate spurious exceptions
744 * (from the upper elements) we splat the lower float
745 * before we do the operation. */
746 a = vec_splat ((__v4sf) __A, 0);
747 b = vec_splat ((__v4sf) __B, 0);
748 c = (__v4sf) vec_cmplt(a, b);
749 /* Then we merge the lower float result with the original upper
750 * float elements from __A. */
751 return ((__m128)vec_sel ((__v4sf)__A, c, mask));
754 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
755 _mm_cmpord_ss (__m128 __A, __m128 __B)
757 __vector unsigned int a, b;
758 __vector unsigned int c, d;
759 static const __vector unsigned int float_exp_mask =
760 { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 };
761 static const __vector unsigned int mask =
762 { 0xffffffff, 0, 0, 0 };
764 a = (__vector unsigned int) vec_abs ((__v4sf)__A);
765 b = (__vector unsigned int) vec_abs ((__v4sf)__B);
766 c = (__vector unsigned int) vec_cmpgt (float_exp_mask, a);
767 d = (__vector unsigned int) vec_cmpgt (float_exp_mask, b);
768 c = vec_and (c, d);
769 /* Then we merge the lower float result with the original upper
770 * float elements from __A. */
771 return ((__m128)vec_sel ((__v4sf)__A, (__v4sf)c, mask));
774 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
775 _mm_cmpunord_ss (__m128 __A, __m128 __B)
777 __vector unsigned int a, b;
778 __vector unsigned int c, d;
779 static const __vector unsigned int float_exp_mask =
780 { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 };
781 static const __vector unsigned int mask =
782 { 0xffffffff, 0, 0, 0 };
784 a = (__vector unsigned int) vec_abs ((__v4sf)__A);
785 b = (__vector unsigned int) vec_abs ((__v4sf)__B);
786 c = (__vector unsigned int) vec_cmpgt (a, float_exp_mask);
787 d = (__vector unsigned int) vec_cmpgt (b, float_exp_mask);
788 c = vec_or (c, d);
789 /* Then we merge the lower float result with the original upper
790 * float elements from __A. */
791 return ((__m128)vec_sel ((__v4sf)__A, (__v4sf)c, mask));
794 /* Compare the lower SPFP values of A and B and return 1 if true
795 and 0 if false. */
796 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
797 _mm_comieq_ss (__m128 __A, __m128 __B)
799 return (__A[0] == __B[0]);
802 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
803 _mm_comilt_ss (__m128 __A, __m128 __B)
805 return (__A[0] < __B[0]);
808 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
809 _mm_comile_ss (__m128 __A, __m128 __B)
811 return (__A[0] <= __B[0]);
814 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
815 _mm_comigt_ss (__m128 __A, __m128 __B)
817 return (__A[0] > __B[0]);
820 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
821 _mm_comige_ss (__m128 __A, __m128 __B)
823 return (__A[0] >= __B[0]);
826 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
827 _mm_comineq_ss (__m128 __A, __m128 __B)
829 return (__A[0] != __B[0]);
832 /* FIXME
833 * The _mm_ucomi??_ss implementations below are exactly the same as
834 * the _mm_comi??_ss implementations because GCC for PowerPC only
835 * generates unordered compares (scalar and vector).
836 * Technically _mm_comieq_ss et al. should use the ordered compare
837 * and signal on QNaNs.
838 * The _mm_ucomieq_ss et al. should be OK as is. */
840 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
841 _mm_ucomieq_ss (__m128 __A, __m128 __B)
843 return (__A[0] == __B[0]);
846 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
847 _mm_ucomilt_ss (__m128 __A, __m128 __B)
849 return (__A[0] < __B[0]);
852 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
853 _mm_ucomile_ss (__m128 __A, __m128 __B)
855 return (__A[0] <= __B[0]);
858 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
859 _mm_ucomigt_ss (__m128 __A, __m128 __B)
861 return (__A[0] > __B[0]);
864 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
865 _mm_ucomige_ss (__m128 __A, __m128 __B)
867 return (__A[0] >= __B[0]);
870 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
871 _mm_ucomineq_ss (__m128 __A, __m128 __B)
873 return (__A[0] != __B[0]);
876 extern __inline float __attribute__((__gnu_inline__, __always_inline__, __artificial__))
877 _mm_cvtss_f32 (__m128 __A)
879 return ((__v4sf)__A)[0];
882 /* Convert the lower SPFP value to a 32-bit integer according to the current
883 rounding mode. */
884 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
885 _mm_cvtss_si32 (__m128 __A)
887 __m64 res = 0;
888 #ifdef _ARCH_PWR8
889 __m128 vtmp;
890 __asm__(
891 "xxsldwi %x1,%x2,%x2,3;\n"
892 "xscvspdp %x1,%x1;\n"
893 "fctiw %1,%1;\n"
894 "mfvsrd %0,%x1;\n"
895 : "=r" (res),
896 "=&wi" (vtmp)
897 : "wa" (__A)
898 : );
899 #else
900 res = __builtin_rint(__A[0]);
901 #endif
902 return (res);
905 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
906 _mm_cvt_ss2si (__m128 __A)
908 return _mm_cvtss_si32 (__A);
911 /* Convert the lower SPFP value to a 64-bit integer according to the
912 current rounding mode. */
914 /* Intel intrinsic. */
915 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
916 _mm_cvtss_si64 (__m128 __A)
918 __m64 res = 0;
919 #ifdef _ARCH_PWR8
920 __m128 vtmp;
921 __asm__(
922 "xxsldwi %x1,%x2,%x2,3;\n"
923 "xscvspdp %x1,%x1;\n"
924 "fctid %1,%1;\n"
925 "mfvsrd %0,%x1;\n"
926 : "=r" (res),
927 "=&wi" (vtmp)
928 : "wa" (__A)
929 : );
930 #else
931 res = __builtin_llrint(__A[0]);
932 #endif
933 return (res);
936 /* Microsoft intrinsic. */
937 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
938 _mm_cvtss_si64x (__m128 __A)
940 return _mm_cvtss_si64 ((__v4sf) __A);
943 /* Constants for use with _mm_prefetch. */
944 enum _mm_hint
946 /* _MM_HINT_ET is _MM_HINT_T with the 3rd bit set. */
947 _MM_HINT_ET0 = 7,
948 _MM_HINT_ET1 = 6,
949 _MM_HINT_T0 = 3,
950 _MM_HINT_T1 = 2,
951 _MM_HINT_T2 = 1,
952 _MM_HINT_NTA = 0
955 /* Loads one cache line from address P to a location "closer" to the
956 processor. The selector I specifies the type of prefetch operation. */
957 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
958 _mm_prefetch (const void *__P, enum _mm_hint __I)
960 /* Current PowerPC ignores the hint parameter. */
961 __builtin_prefetch (__P);
964 /* Convert the two lower SPFP values to 32-bit integers according to the
965 current rounding mode. Return the integers in packed form. */
966 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
967 _mm_cvtps_pi32 (__m128 __A)
970 __v4sf temp, rounded;
971 __vector __m64 result;
973 /* Splat two lower SPFP values to both halves. */
974 temp = (__v4sf) vec_splat ((__vector long long)__A, 0);
975 rounded = vec_rint(temp);
976 result = (__vector __m64) vec_cts (rounded, 0);
978 return ((__m64) __builtin_unpack_vector_int128 ((__vector __int128)result, 0));
981 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
982 _mm_cvt_ps2pi (__m128 __A)
984 return _mm_cvtps_pi32 (__A);
987 /* Truncate the lower SPFP value to a 32-bit integer. */
988 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
989 _mm_cvttss_si32 (__m128 __A)
991 /* Extract the lower float element. */
992 float temp = __A[0];
993 /* truncate to 32-bit integer and return. */
994 return temp;
997 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
998 _mm_cvtt_ss2si (__m128 __A)
1000 return _mm_cvttss_si32 (__A);
1003 /* Intel intrinsic. */
1004 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1005 _mm_cvttss_si64 (__m128 __A)
1007 /* Extract the lower float element. */
1008 float temp = __A[0];
1009 /* Truncate to 64-bit integer and return. */
1010 return temp;
1013 /* Microsoft intrinsic. */
1014 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1015 _mm_cvttss_si64x (__m128 __A)
1017 /* Extract the lower float element. */
1018 float temp = __A[0];
1019 /* Truncate to 64-bit integer and return. */
1020 return temp;
1023 /* Truncate the two lower SPFP values to 32-bit integers. Return the
1024 integers in packed form. */
1025 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1026 _mm_cvttps_pi32 (__m128 __A)
1028 __v4sf temp;
1029 __vector __m64 result;
1031 /* Splat two lower SPFP values to both halves. */
1032 temp = (__v4sf) vec_splat ((__vector long long)__A, 0);
1033 result = (__vector __m64) vec_cts (temp, 0);
1035 return ((__m64) __builtin_unpack_vector_int128 ((__vector __int128)result, 0));
1038 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1039 _mm_cvtt_ps2pi (__m128 __A)
1041 return _mm_cvttps_pi32 (__A);
1044 /* Convert B to a SPFP value and insert it as element zero in A. */
1045 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1046 _mm_cvtsi32_ss (__m128 __A, int __B)
1048 float temp = __B;
1049 __A[0] = temp;
1051 return __A;
1054 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1055 _mm_cvt_si2ss (__m128 __A, int __B)
1057 return _mm_cvtsi32_ss (__A, __B);
1060 /* Convert B to a SPFP value and insert it as element zero in A. */
1061 /* Intel intrinsic. */
1062 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1063 _mm_cvtsi64_ss (__m128 __A, long long __B)
1065 float temp = __B;
1066 __A[0] = temp;
1068 return __A;
1071 /* Microsoft intrinsic. */
1072 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1073 _mm_cvtsi64x_ss (__m128 __A, long long __B)
1075 return _mm_cvtsi64_ss (__A, __B);
1078 /* Convert the two 32-bit values in B to SPFP form and insert them
1079 as the two lower elements in A. */
1080 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1081 _mm_cvtpi32_ps (__m128 __A, __m64 __B)
1083 __vector signed int vm1;
1084 __vector float vf1;
1086 vm1 = (__vector signed int) __builtin_pack_vector_int128 (__B, __B);
1087 vf1 = (__vector float) vec_ctf (vm1, 0);
1089 return ((__m128) (__vector __m64)
1090 { ((__vector __m64)vf1) [0], ((__vector __m64)__A) [1]});
1093 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1094 _mm_cvt_pi2ps (__m128 __A, __m64 __B)
1096 return _mm_cvtpi32_ps (__A, __B);
1099 /* Convert the four signed 16-bit values in A to SPFP form. */
1100 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1101 _mm_cvtpi16_ps (__m64 __A)
1103 __vector signed short vs8;
1104 __vector signed int vi4;
1105 __vector float vf1;
1107 vs8 = (__vector signed short) __builtin_pack_vector_int128 (__A, __A);
1108 vi4 = vec_vupklsh (vs8);
1109 vf1 = (__vector float) vec_ctf (vi4, 0);
1111 return (__m128) vf1;
1114 /* Convert the four unsigned 16-bit values in A to SPFP form. */
1115 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1116 _mm_cvtpu16_ps (__m64 __A)
1118 const __vector unsigned short zero =
1119 { 0, 0, 0, 0, 0, 0, 0, 0 };
1120 __vector unsigned short vs8;
1121 __vector unsigned int vi4;
1122 __vector float vf1;
1124 vs8 = (__vector unsigned short) __builtin_pack_vector_int128 (__A, __A);
1125 vi4 = (__vector unsigned int) vec_vmrglh (vs8, zero);
1126 vf1 = (__vector float) vec_ctf (vi4, 0);
1128 return (__m128) vf1;
1131 /* Convert the low four signed 8-bit values in A to SPFP form. */
1132 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1133 _mm_cvtpi8_ps (__m64 __A)
1135 __vector signed char vc16;
1136 __vector signed short vs8;
1137 __vector signed int vi4;
1138 __vector float vf1;
1140 vc16 = (__vector signed char) __builtin_pack_vector_int128 (__A, __A);
1141 vs8 = vec_vupkhsb (vc16);
1142 vi4 = vec_vupkhsh (vs8);
1143 vf1 = (__vector float) vec_ctf (vi4, 0);
1145 return (__m128) vf1;
1148 /* Convert the low four unsigned 8-bit values in A to SPFP form. */
1149 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1151 _mm_cvtpu8_ps (__m64 __A)
1153 const __vector unsigned char zero =
1154 { 0, 0, 0, 0, 0, 0, 0, 0 };
1155 __vector unsigned char vc16;
1156 __vector unsigned short vs8;
1157 __vector unsigned int vi4;
1158 __vector float vf1;
1160 vc16 = (__vector unsigned char) __builtin_pack_vector_int128 (__A, __A);
1161 vs8 = (__vector unsigned short) vec_vmrglb (vc16, zero);
1162 vi4 = (__vector unsigned int) vec_vmrghh (vs8,
1163 (__vector unsigned short) zero);
1164 vf1 = (__vector float) vec_ctf (vi4, 0);
1166 return (__m128) vf1;
1169 /* Convert the four signed 32-bit values in A and B to SPFP form. */
1170 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1171 _mm_cvtpi32x2_ps(__m64 __A, __m64 __B)
1173 __vector signed int vi4;
1174 __vector float vf4;
1176 vi4 = (__vector signed int) __builtin_pack_vector_int128 (__B, __A);
1177 vf4 = (__vector float) vec_ctf (vi4, 0);
1178 return (__m128) vf4;
1181 /* Convert the four SPFP values in A to four signed 16-bit integers. */
1182 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1183 _mm_cvtps_pi16(__m128 __A)
1185 __v4sf rounded;
1186 __vector signed int temp;
1187 __vector __m64 result;
1189 rounded = vec_rint(__A);
1190 temp = vec_cts (rounded, 0);
1191 result = (__vector __m64) vec_pack (temp, temp);
1193 return ((__m64) __builtin_unpack_vector_int128 ((__vector __int128)result, 0));
1196 /* Convert the four SPFP values in A to four signed 8-bit integers. */
1197 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1198 _mm_cvtps_pi8(__m128 __A)
1200 __v4sf rounded;
1201 __vector signed int tmp_i;
1202 static const __vector signed int zero = {0, 0, 0, 0};
1203 __vector signed short tmp_s;
1204 __vector signed char res_v;
1205 __m64 result;
1207 rounded = vec_rint(__A);
1208 tmp_i = vec_cts (rounded, 0);
1209 tmp_s = vec_pack (tmp_i, zero);
1210 res_v = vec_pack (tmp_s, tmp_s);
1211 result = (__m64) __builtin_unpack_vector_int128 ((__vector __int128)res_v, 0);
1213 return (result);
1216 /* Selects four specific SPFP values from A and B based on MASK. */
1217 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1219 _mm_shuffle_ps (__m128 __A, __m128 __B, int const __mask)
1221 unsigned long element_selector_10 = __mask & 0x03;
1222 unsigned long element_selector_32 = (__mask >> 2) & 0x03;
1223 unsigned long element_selector_54 = (__mask >> 4) & 0x03;
1224 unsigned long element_selector_76 = (__mask >> 6) & 0x03;
1225 static const unsigned int permute_selectors[4] =
1227 #ifdef __LITTLE_ENDIAN__
1228 0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
1229 #elif __BIG_ENDIAN__
1230 0x0C0D0E0F, 0x08090A0B, 0x04050607, 0x00010203
1231 #endif
1233 __vector unsigned int t;
1235 #ifdef __LITTLE_ENDIAN__
1236 t[0] = permute_selectors[element_selector_10];
1237 t[1] = permute_selectors[element_selector_32];
1238 t[2] = permute_selectors[element_selector_54] + 0x10101010;
1239 t[3] = permute_selectors[element_selector_76] + 0x10101010;
1240 #elif __BIG_ENDIAN__
1241 t[3] = permute_selectors[element_selector_10] + 0x10101010;
1242 t[2] = permute_selectors[element_selector_32] + 0x10101010;
1243 t[1] = permute_selectors[element_selector_54];
1244 t[0] = permute_selectors[element_selector_76];
1245 #endif
1246 return vec_perm ((__v4sf) __A, (__v4sf)__B, (__vector unsigned char)t);
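/* The 8-bit mask is decoded two bits at a time: bits 1:0 and 3:2 select
   elements of __A for result elements 0 and 1, and bits 5:4 and 7:6 select
   elements of __B for result elements 2 and 3.  Usage sketch (not compiled;
   example_shuffle is an illustrative name):  */
#if 0
static __m128
example_shuffle (__m128 __a, __m128 __b)
{
  /* Mask 0x44 (binary 01 00 01 00) selects { __a[0], __a[1], __b[0],
     __b[1] }, the same lanes _mm_movelh_ps (__a, __b) produces.  */
  return _mm_shuffle_ps (__a, __b, 0x44);
}
#endif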
1249 /* Selects and interleaves the upper two SPFP values from A and B. */
1250 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1251 _mm_unpackhi_ps (__m128 __A, __m128 __B)
1253 return (__m128) vec_vmrglw ((__v4sf) __A, (__v4sf)__B);
1256 /* Selects and interleaves the lower two SPFP values from A and B. */
1257 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1258 _mm_unpacklo_ps (__m128 __A, __m128 __B)
1260 return (__m128) vec_vmrghw ((__v4sf) __A, (__v4sf)__B);
1263 /* Sets the upper two SPFP values with 64-bits of data loaded from P;
1264 the lower two values are passed through from A. */
1265 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1266 _mm_loadh_pi (__m128 __A, __m64 const *__P)
1268 __vector __m64 __a = (__vector __m64)__A;
1269 __vector __m64 __p = vec_splats(*__P);
1270 __a [1] = __p [1];
1272 return (__m128)__a;
1275 /* Stores the upper two SPFP values of A into P. */
1276 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1277 _mm_storeh_pi (__m64 *__P, __m128 __A)
1279 __vector __m64 __a = (__vector __m64) __A;
1281 *__P = __a[1];
1284 /* Moves the upper two values of B into the lower two values of A. */
1285 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1286 _mm_movehl_ps (__m128 __A, __m128 __B)
1288 return (__m128) vec_mergel ((__vector __m64)__B, (__vector __m64)__A);
1291 /* Moves the lower two values of B into the upper two values of A. */
1292 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1293 _mm_movelh_ps (__m128 __A, __m128 __B)
1295 return (__m128) vec_mergeh ((__vector __m64)__A, (__vector __m64)__B);
1298 /* Sets the lower two SPFP values with 64-bits of data loaded from P;
1299 the upper two values are passed through from A. */
1300 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1301 _mm_loadl_pi (__m128 __A, __m64 const *__P)
1303 __vector __m64 __a = (__vector __m64)__A;
1304 __vector __m64 __p = vec_splats(*__P);
1305 __a [0] = __p [0];
1307 return (__m128)__a;
1310 /* Stores the lower two SPFP values of A into P. */
1311 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1312 _mm_storel_pi (__m64 *__P, __m128 __A)
1314 __vector __m64 __a = (__vector __m64) __A;
1316 *__P = __a[0];
1319 #ifdef _ARCH_PWR8
1320 /* Intrinsic functions that require PowerISA 2.07 minimum. */
1322 /* Creates a 4-bit mask from the most significant bits of the SPFP values. */
1323 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1324 _mm_movemask_ps (__m128 __A)
1326 __vector __m64 result;
1327 static const __vector unsigned int perm_mask =
1329 #ifdef __LITTLE_ENDIAN__
1330 0x00204060, 0x80808080, 0x80808080, 0x80808080
1331 #elif __BIG_ENDIAN__
1332 0x80808080, 0x80808080, 0x80808080, 0x00204060
1333 #endif
1336 result = (__vector __m64) vec_vbpermq ((__vector unsigned char) __A,
1337 (__vector unsigned char) perm_mask);
1339 #ifdef __LITTLE_ENDIAN__
1340 return result[1];
1341 #elif __BIG_ENDIAN__
1342 return result[0];
1343 #endif
1345 #endif /* _ARCH_PWR8 */
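/* Usage sketch for _mm_movemask_ps (not compiled; assumes _ARCH_PWR8 since
   the intrinsic above is only defined there; example_any_negative is an
   illustrative name).  Each result bit is the sign bit of one element.  */
#if 0
static int
example_any_negative (__m128 __v)
{
  /* Bits 0-3 hold the four sign bits; a nonzero value means at least one
     element is negative (or is negative zero, or a NaN with the sign bit
     set).  */
  return _mm_movemask_ps (__v) != 0;
}
#endif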
1347 /* Create a vector with all four elements equal to *P. */
1348 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1349 _mm_load1_ps (float const *__P)
1351 return _mm_set1_ps (*__P);
1354 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1355 _mm_load_ps1 (float const *__P)
1357 return _mm_load1_ps (__P);
1360 /* Extracts one of the four words of A. The selector N must be immediate. */
1361 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1362 _mm_extract_pi16 (__m64 const __A, int const __N)
1364 const int shiftr = (__N & 3) * 16;
1366 return ((__A >> shiftr) & 0xffff);
1369 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1370 _m_pextrw (__m64 const __A, int const __N)
1372 return _mm_extract_pi16 (__A, __N);
1375 /* Inserts word D into one of four words of A. The selector N must be
1376 immediate. */
1377 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1378 _mm_insert_pi16 (__m64 const __A, int const __D, int const __N)
1380 const int shiftl = (__N & 3) * 16;
1381 const __m64 shiftD = (const __m64) __D << shiftl;
1382 const __m64 mask = 0xffffUL << shiftl;
1383 __m64 result = (__A & (~mask)) | (shiftD & mask);
1385 return (result);
1388 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1389 _m_pinsrw (__m64 const __A, int const __D, int const __N)
1391 return _mm_insert_pi16 (__A, __D, __N);
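/* In the two routines above, element N of the __m64 value occupies bits
   16*N through 16*N+15, so extraction and insertion reduce to shifts and
   masks on the 64-bit scalar.  Round-trip sketch (not compiled;
   example_increment_word2 is an illustrative name):  */
#if 0
static __m64
example_increment_word2 (__m64 __a)
{
  /* Read 16-bit element 2, add one, and write it back.  */
  int __w = _mm_extract_pi16 (__a, 2);
  return _mm_insert_pi16 (__a, __w + 1, 2);
}
#endif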
1394 /* Compute the element-wise maximum of signed 16-bit values. */
1395 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1397 _mm_max_pi16 (__m64 __A, __m64 __B)
1399 #if _ARCH_PWR8
1400 __vector signed short a, b, r;
1401 __vector bool short c;
1403 a = (__vector signed short)vec_splats (__A);
1404 b = (__vector signed short)vec_splats (__B);
1405 c = (__vector bool short)vec_cmpgt (a, b);
1406 r = vec_sel (b, a, c);
1407 return (__builtin_unpack_vector_int128 ((__vector __int128_t)r, 0));
1408 #else
1409 __m64_union m1, m2, res;
1411 m1.as_m64 = __A;
1412 m2.as_m64 = __B;
1414 res.as_short[0] =
1415 (m1.as_short[0] > m2.as_short[0]) ? m1.as_short[0] : m2.as_short[0];
1416 res.as_short[1] =
1417 (m1.as_short[1] > m2.as_short[1]) ? m1.as_short[1] : m2.as_short[1];
1418 res.as_short[2] =
1419 (m1.as_short[2] > m2.as_short[2]) ? m1.as_short[2] : m2.as_short[2];
1420 res.as_short[3] =
1421 (m1.as_short[3] > m2.as_short[3]) ? m1.as_short[3] : m2.as_short[3];
1423 return (__m64) res.as_m64;
1424 #endif
1427 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1428 _m_pmaxsw (__m64 __A, __m64 __B)
1430 return _mm_max_pi16 (__A, __B);
1433 /* Compute the element-wise maximum of unsigned 8-bit values. */
1434 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1435 _mm_max_pu8 (__m64 __A, __m64 __B)
1437 #if _ARCH_PWR8
1438 __vector unsigned char a, b, r;
1439 __vector bool char c;
1441 a = (__vector unsigned char)vec_splats (__A);
1442 b = (__vector unsigned char)vec_splats (__B);
1443 c = (__vector bool char)vec_cmpgt (a, b);
1444 r = vec_sel (b, a, c);
1445 return (__builtin_unpack_vector_int128 ((__vector __int128_t)r, 0));
1446 #else
1447 __m64_union m1, m2, res;
1448 long i;
1450 m1.as_m64 = __A;
1451 m2.as_m64 = __B;
1454 for (i = 0; i < 8; i++)
1455 res.as_char[i] =
1456 ((unsigned char) m1.as_char[i] > (unsigned char) m2.as_char[i]) ?
1457 m1.as_char[i] : m2.as_char[i];
1459 return (__m64) res.as_m64;
1460 #endif
1463 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1464 _m_pmaxub (__m64 __A, __m64 __B)
1466 return _mm_max_pu8 (__A, __B);
1469 /* Compute the element-wise minimum of signed 16-bit values. */
1470 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1471 _mm_min_pi16 (__m64 __A, __m64 __B)
1473 #if _ARCH_PWR8
1474 __vector signed short a, b, r;
1475 __vector bool short c;
1477 a = (__vector signed short)vec_splats (__A);
1478 b = (__vector signed short)vec_splats (__B);
1479 c = (__vector bool short)vec_cmplt (a, b);
1480 r = vec_sel (b, a, c);
1481 return (__builtin_unpack_vector_int128 ((__vector __int128_t)r, 0));
1482 #else
1483 __m64_union m1, m2, res;
1485 m1.as_m64 = __A;
1486 m2.as_m64 = __B;
1488 res.as_short[0] =
1489 (m1.as_short[0] < m2.as_short[0]) ? m1.as_short[0] : m2.as_short[0];
1490 res.as_short[1] =
1491 (m1.as_short[1] < m2.as_short[1]) ? m1.as_short[1] : m2.as_short[1];
1492 res.as_short[2] =
1493 (m1.as_short[2] < m2.as_short[2]) ? m1.as_short[2] : m2.as_short[2];
1494 res.as_short[3] =
1495 (m1.as_short[3] < m2.as_short[3]) ? m1.as_short[3] : m2.as_short[3];
1497 return (__m64) res.as_m64;
1498 #endif
1501 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1502 _m_pminsw (__m64 __A, __m64 __B)
1504 return _mm_min_pi16 (__A, __B);
1507 /* Compute the element-wise minimum of unsigned 8-bit values. */
1508 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1509 _mm_min_pu8 (__m64 __A, __m64 __B)
1511 #if _ARCH_PWR8
1512 __vector unsigned char a, b, r;
1513 __vector bool char c;
1515 a = (__vector unsigned char)vec_splats (__A);
1516 b = (__vector unsigned char)vec_splats (__B);
1517 c = (__vector bool char)vec_cmplt (a, b);
1518 r = vec_sel (b, a, c);
1519 return (__builtin_unpack_vector_int128 ((__vector __int128_t)r, 0));
1520 #else
1521 __m64_union m1, m2, res;
1522 long i;
1524 m1.as_m64 = __A;
1525 m2.as_m64 = __B;
1528 for (i = 0; i < 8; i++)
1529 res.as_char[i] =
1530 ((unsigned char) m1.as_char[i] < (unsigned char) m2.as_char[i]) ?
1531 m1.as_char[i] : m2.as_char[i];
1533 return (__m64) res.as_m64;
1534 #endif
1537 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1538 _m_pminub (__m64 __A, __m64 __B)
1540 return _mm_min_pu8 (__A, __B);
1543 /* Create an 8-bit mask of the signs of 8-bit values. */
1544 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1545 _mm_movemask_pi8 (__m64 __A)
1547 unsigned long p = 0x0008101820283038UL; // permute control for sign bits
1549 return __builtin_bpermd (p, __A);
1552 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1553 _m_pmovmskb (__m64 __A)
1555 return _mm_movemask_pi8 (__A);
1558 /* Multiply four unsigned 16-bit values in A by four unsigned 16-bit values
1559 in B and produce the high 16 bits of the 32-bit results. */
1560 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1561 _mm_mulhi_pu16 (__m64 __A, __m64 __B)
1563 __vector unsigned short a, b;
1564 __vector unsigned short c;
1565 __vector unsigned int w0, w1;
1566 __vector unsigned char xform1 = {
1567 0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17,
1568 0x0A, 0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
1571 a = (__vector unsigned short)vec_splats (__A);
1572 b = (__vector unsigned short)vec_splats (__B);
1574 w0 = vec_vmuleuh (a, b);
1575 w1 = vec_vmulouh (a, b);
1576 c = (__vector unsigned short)vec_perm (w0, w1, xform1);
1578 return (__builtin_unpack_vector_int128 ((__vector __int128)c, 0));
1581 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1582 _m_pmulhuw (__m64 __A, __m64 __B)
1584 return _mm_mulhi_pu16 (__A, __B);
1587 /* Return a combination of the four 16-bit values in A. The selector
1588 must be an immediate. */
1589 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1590 _mm_shuffle_pi16 (__m64 __A, int const __N)
1592 unsigned long element_selector_10 = __N & 0x03;
1593 unsigned long element_selector_32 = (__N >> 2) & 0x03;
1594 unsigned long element_selector_54 = (__N >> 4) & 0x03;
1595 unsigned long element_selector_76 = (__N >> 6) & 0x03;
1596 static const unsigned short permute_selectors[4] =
1598 #ifdef __LITTLE_ENDIAN__
1599 0x0908, 0x0B0A, 0x0D0C, 0x0F0E
1600 #elif __BIG_ENDIAN__
1601 0x0607, 0x0405, 0x0203, 0x0001
1602 #endif
1604 __m64_union t;
1605 __vector __m64 a, p, r;
1607 #ifdef __LITTLE_ENDIAN__
1608 t.as_short[0] = permute_selectors[element_selector_10];
1609 t.as_short[1] = permute_selectors[element_selector_32];
1610 t.as_short[2] = permute_selectors[element_selector_54];
1611 t.as_short[3] = permute_selectors[element_selector_76];
1612 #elif __BIG_ENDIAN__
1613 t.as_short[3] = permute_selectors[element_selector_10];
1614 t.as_short[2] = permute_selectors[element_selector_32];
1615 t.as_short[1] = permute_selectors[element_selector_54];
1616 t.as_short[0] = permute_selectors[element_selector_76];
1617 #endif
1618 p = vec_splats (t.as_m64);
1619 a = vec_splats (__A);
1620 r = vec_perm (a, a, (__vector unsigned char)p);
1621 return (__builtin_unpack_vector_int128 ((__vector __int128)r, 0));
1624 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1625 _m_pshufw (__m64 __A, int const __N)
1627 return _mm_shuffle_pi16 (__A, __N);
1630 /* Conditionally store byte elements of A into P. The high bit of each
1631 byte in the selector N determines whether the corresponding byte from
1632 A is stored. */
1633 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1634 _mm_maskmove_si64 (__m64 __A, __m64 __N, char *__P)
1636 __m64 hibit = 0x8080808080808080UL;
1637 __m64 mask, tmp;
1638 __m64 *p = (__m64*)__P;
1640 tmp = *p;
1641 mask = _mm_cmpeq_pi8 ((__N & hibit), hibit);
1642 tmp = (tmp & (~mask)) | (__A & mask);
1643 *p = tmp;
1646 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1647 _m_maskmovq (__m64 __A, __m64 __N, char *__P)
1649 _mm_maskmove_si64 (__A, __N, __P);
1652 /* Compute the rounded averages of the unsigned 8-bit values in A and B. */
1653 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1654 _mm_avg_pu8 (__m64 __A, __m64 __B)
1656 __vector unsigned char a, b, c;
1658 a = (__vector unsigned char)vec_splats (__A);
1659 b = (__vector unsigned char)vec_splats (__B);
1660 c = vec_avg (a, b);
1661 return (__builtin_unpack_vector_int128 ((__vector __int128)c, 0));
1664 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1665 _m_pavgb (__m64 __A, __m64 __B)
1667 return _mm_avg_pu8 (__A, __B);
1670 /* Compute the rounded averages of the unsigned 16-bit values in A and B. */
1671 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1672 _mm_avg_pu16 (__m64 __A, __m64 __B)
1674 __vector unsigned short a, b, c;
1676 a = (__vector unsigned short)vec_splats (__A);
1677 b = (__vector unsigned short)vec_splats (__B);
1678 c = vec_avg (a, b);
1679 return (__builtin_unpack_vector_int128 ((__vector __int128)c, 0));
1682 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1683 _m_pavgw (__m64 __A, __m64 __B)
1685 return _mm_avg_pu16 (__A, __B);
1688 /* Compute the sum of the absolute differences of the unsigned 8-bit
1689 values in A and B. Return the value in the lower 16-bit word; the
1690 upper words are cleared. */
1691 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1692 _mm_sad_pu8 (__m64 __A, __m64 __B)
1694 __vector unsigned char a, b;
1695 __vector unsigned char vmin, vmax, vabsdiff;
1696 __vector signed int vsum;
1697 const __vector unsigned int zero =
1698 { 0, 0, 0, 0 };
1699 unsigned short result;
1701 a = (__vector unsigned char) __builtin_pack_vector_int128 (0UL, __A);
1702 b = (__vector unsigned char) __builtin_pack_vector_int128 (0UL, __B);
1703 vmin = vec_min (a, b);
1704 vmax = vec_max (a, b);
1705 vabsdiff = vec_sub (vmax, vmin);
1706 /* Sum four groups of bytes into integers. */
1707 vsum = (__vector signed int) vec_sum4s (vabsdiff, zero);
1708 /* Sum across four integers with integer result. */
1709 vsum = vec_sums (vsum, (__vector signed int) zero);
1710 /* The sum is in the rightmost 32 bits of the vector result.
1711 Transfer to a GPR and truncate to 16 bits. */
1712 result = vsum[3];
1713 return (result);
1716 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1717 _m_psadbw (__m64 __A, __m64 __B)
1719 return _mm_sad_pu8 (__A, __B);
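/* Usage sketch for _mm_sad_pu8 (not compiled): the sum of the eight byte
   absolute differences lands in the low 16 bits of the result, which makes
   it convenient for block-matching style metrics.  The name
   example_block_distance is illustrative only.  */
#if 0
static unsigned int
example_block_distance (const unsigned char *__x, const unsigned char *__y)
{
  __m64 __a, __b;

  /* Copy two 8-byte blocks into __m64 values and accumulate the sum of
     absolute byte differences.  */
  __builtin_memcpy (&__a, __x, sizeof (__a));
  __builtin_memcpy (&__b, __y, sizeof (__b));
  return (unsigned int) _mm_sad_pu8 (__a, __b);
}
#endif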
1722 /* Stores the data in A to the address P without polluting the caches. */
1723 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1724 _mm_stream_pi (__m64 *__P, __m64 __A)
1726 /* Use the data cache block touch for store transient. */
1727 __asm__ (
1728 " dcbtstt 0,%0"
1730 : "b" (__P)
1731 : "memory"
1733 *__P = __A;
1736 /* Likewise. The address must be 16-byte aligned. */
1737 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1738 _mm_stream_ps (float *__P, __m128 __A)
1740 /* Use the data cache block touch for store transient. */
1741 __asm__ (
1742 " dcbtstt 0,%0"
1744 : "b" (__P)
1745 : "memory"
1747 _mm_store_ps (__P, __A);
1750 /* Guarantees that every preceding store is globally visible before
1751 any subsequent store. */
1752 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1753 _mm_sfence (void)
1755 /* Generate a lightweight sync. */
1756 __atomic_thread_fence (__ATOMIC_RELEASE);
1759 /* The execution of the next instruction is delayed by an implementation
1760 specific amount of time. The instruction does not modify the
1761 architectural state. This is after the pop_options pragma because
1762 it does not require SSE support in the processor--the encoding is a
1763 nop on processors that do not support it. */
1764 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1765 _mm_pause (void)
1767 /* There is no exact match with this construct, but the following is
1768 close to the desired effect. */
1769 #if _ARCH_PWR8
1770 /* On power8 and later processors we can depend on Program Priority
1771 (PRI) and the associated "very low" PRI setting. Since we don't know
1772 what PRI this thread is running at, we: 1) save the current PRI
1773 from the PPR SPR into a local GPR, 2) set the PRI to "very low"
1774 via the special or 31,31,31 encoding, and 3) issue an "isync" to
1775 ensure the PRI change takes effect before we execute any more
1776 instructions.
1777 Now we can execute a lwsync (release barrier) while we execute
1778 this thread at "very low" PRI. Finally we restore the original
1779 PRI and continue execution. */
1780 unsigned long __PPR;
1782 __asm__ volatile (
1783 " mfppr %0;"
1784 " or 31,31,31;"
1785 " isync;"
1786 " lwsync;"
1787 " isync;"
1788 " mtppr %0;"
1789 : "=r" (__PPR)
1791 : "memory"
1793 #else
1794 /* For older processor where we may not even have Program Priority
1795 controls we can only depend on Heavy Weight Sync. */
1796 __atomic_thread_fence (__ATOMIC_SEQ_CST);
1797 #endif
1800 /* Transpose the 4x4 matrix composed of row[0-3]. */
1801 #define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
1802 do { \
1803 __v4sf __r0 = (row0), __r1 = (row1), __r2 = (row2), __r3 = (row3); \
1804 __v4sf __t0 = vec_vmrghw (__r0, __r1); \
1805 __v4sf __t1 = vec_vmrghw (__r2, __r3); \
1806 __v4sf __t2 = vec_vmrglw (__r0, __r1); \
1807 __v4sf __t3 = vec_vmrglw (__r2, __r3); \
1808 (row0) = (__v4sf)vec_mergeh ((__vector long long)__t0, \
1809 (__vector long long)__t1); \
1810 (row1) = (__v4sf)vec_mergel ((__vector long long)__t0, \
1811 (__vector long long)__t1); \
1812 (row2) = (__v4sf)vec_mergeh ((__vector long long)__t2, \
1813 (__vector long long)__t3); \
1814 (row3) = (__v4sf)vec_mergel ((__vector long long)__t2, \
1815 (__vector long long)__t3); \
1816 } while (0)
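/* Usage sketch for _MM_TRANSPOSE4_PS (not compiled): transpose a 4x4 float
   matrix held in memory, one row per __m128.  The name example_transpose
   is illustrative only.  */
#if 0
static void
example_transpose (float __mat[4][4])
{
  __m128 __v0 = _mm_loadu_ps (__mat[0]);
  __m128 __v1 = _mm_loadu_ps (__mat[1]);
  __m128 __v2 = _mm_loadu_ps (__mat[2]);
  __m128 __v3 = _mm_loadu_ps (__mat[3]);

  /* After the macro, __v0..__v3 hold the columns of the original rows.  */
  _MM_TRANSPOSE4_PS (__v0, __v1, __v2, __v3);

  _mm_storeu_ps (__mat[0], __v0);
  _mm_storeu_ps (__mat[1], __v1);
  _mm_storeu_ps (__mat[2], __v2);
  _mm_storeu_ps (__mat[3], __v3);
}
#endif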
1818 /* For backward source compatibility. */
1819 //# include <emmintrin.h>
1821 #endif /* _XMMINTRIN_H_INCLUDED */