[official-gcc.git] / gcc / config / rs6000 / xmmintrin.h
1 /* Copyright (C) 2002-2018 Free Software Foundation, Inc.
3 This file is part of GCC.
5 GCC is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 3, or (at your option)
8 any later version.
10 GCC is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
15 Under Section 7 of GPL version 3, you are granted additional
16 permissions described in the GCC Runtime Library Exception, version
17 3.1, as published by the Free Software Foundation.
19 You should have received a copy of the GNU General Public License and
20 a copy of the GCC Runtime Library Exception along with this program;
21 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
22 <http://www.gnu.org/licenses/>. */
24 /* Implemented from the specification included in the Intel C++ Compiler
25 User Guide and Reference, version 9.0. */
27 #ifndef NO_WARN_X86_INTRINSICS
28 /* This header is distributed to simplify porting x86_64 code that
29 makes explicit use of Intel intrinsics to powerpc64le.
30 It is the user's responsibility to determine if the results are
31 acceptable and make additional changes as necessary.
32 Note that much code that uses Intel intrinsics can be rewritten in
33 standard C or GNU C extensions, which are more portable and better
34 optimized across multiple targets.
36 In the specific case of X86 SSE (__m128) intrinsics, the PowerPC
37 VMX/VSX ISA is a good match for vector float SIMD operations.
38 However, scalar float operations in vector (XMM) registers require
39 the POWER8 VSX ISA (2.07) level. There are also important
40 differences in the data format and placement of float scalars in the
41 vector register. For PowerISA, scalar floats in FPRs (the leftmost
42 64 bits of the low 32 VSRs) are kept in double format, while X86_64 SSE
43 uses the rightmost 32 bits of the XMM register. These differences require
44 extra steps on POWER to match the SSE scalar float semantics.
46 Most SSE scalar float intrinsic operations can be performed more
47 efficiently as C language float scalar operations or optimized to
48 use vector SIMD operations. We recommend this for new applications.
50 Another difference is the format and details of the X86_64 MXCSR vs
51 the PowerISA FPSCR / VSCR registers. We recommend that applications
52 replace direct access to the MXCSR with the more portable <fenv.h>
53 POSIX APIs. */
54 #error "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error."
55 #endif
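/* A minimal usage sketch (the compile command below is illustrative only,
   not taken from this header): code ported from x86 normally includes this
   header with the warning above disabled, e.g.

     gcc -O2 -mcpu=power8 -DNO_WARN_X86_INTRINSICS -c ported_file.c

   Any VSX-capable -mcpu selection that provides the features used below
   should work equally well.  */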
57 #ifndef _XMMINTRIN_H_INCLUDED
58 #define _XMMINTRIN_H_INCLUDED
60 #include <altivec.h>
62 /* Avoid collisions between altivec.h and strict adherence to C++ and
63 C11 standards. This should eventually be done inside altivec.h itself,
64 but only after testing a full distro build. */
65 #if defined(__STRICT_ANSI__) && (defined(__cplusplus) || \
66 (defined(__STDC_VERSION__) && \
67 __STDC_VERSION__ >= 201112L))
68 #undef vector
69 #undef pixel
70 #undef bool
71 #endif
73 #include <assert.h>
75 /* We need type definitions from the MMX header file. */
76 #include <mmintrin.h>
78 /* Get _mm_malloc () and _mm_free (). */
79 #include <mm_malloc.h>
81 /* The Intel API is flexible enough that we must allow aliasing with other
82 vector types, and their scalar components. */
83 typedef float __m128 __attribute__ ((__vector_size__ (16), __may_alias__));
85 /* Internal data types for implementing the intrinsics. */
86 typedef float __v4sf __attribute__ ((__vector_size__ (16)));
88 /* Create an undefined vector. */
89 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
90 _mm_undefined_ps (void)
92 __m128 __Y = __Y;  /* Intentional self-initialization; the value is unspecified. */
93 return __Y;
96 /* Create a vector of zeros. */
97 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
98 _mm_setzero_ps (void)
100 return __extension__ (__m128){ 0.0f, 0.0f, 0.0f, 0.0f };
103 /* Load four SPFP values from P. The address must be 16-byte aligned. */
104 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
105 _mm_load_ps (float const *__P)
107 assert(((unsigned long)__P & 0xfUL) == 0UL);
108 return ((__m128)vec_ld(0, (__v4sf*)__P));
111 /* Load four SPFP values from P. The address need not be 16-byte aligned. */
112 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
113 _mm_loadu_ps (float const *__P)
115 return (vec_vsx_ld(0, __P));
118 /* Load four SPFP values in reverse order. The address must be aligned. */
119 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
120 _mm_loadr_ps (float const *__P)
122 __v4sf __tmp;
123 __m128 result;
124 static const __vector unsigned char permute_vector =
125 { 0x1C, 0x1D, 0x1E, 0x1F, 0x18, 0x19, 0x1A, 0x1B, 0x14, 0x15, 0x16,
126 0x17, 0x10, 0x11, 0x12, 0x13 };
128 __tmp = vec_ld (0, (__v4sf *) __P);
129 result = (__m128) vec_perm (__tmp, __tmp, permute_vector);
130 return result;
133 /* Create a vector with all four elements equal to F. */
134 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
135 _mm_set1_ps (float __F)
137 return __extension__ (__m128)(__v4sf){ __F, __F, __F, __F };
140 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
141 _mm_set_ps1 (float __F)
143 return _mm_set1_ps (__F);
146 /* Create the vector [Z Y X W]. */
147 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
148 _mm_set_ps (const float __Z, const float __Y, const float __X, const float __W)
150 return __extension__ (__m128)(__v4sf){ __W, __X, __Y, __Z };
153 /* Create the vector [W X Y Z]. */
154 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
155 _mm_setr_ps (float __Z, float __Y, float __X, float __W)
157 return __extension__ (__m128)(__v4sf){ __Z, __Y, __X, __W };
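/* A minimal sketch of the element ordering implied by the two set forms
   (values are illustrative only).  _mm_set_ps lists its arguments from
   element 3 down to element 0, while _mm_setr_ps lists them in memory order,
   so both of the following produce the vector { 0.0f, 1.0f, 2.0f, 3.0f }:

     __m128 a = _mm_set_ps  (3.0f, 2.0f, 1.0f, 0.0f);
     __m128 b = _mm_setr_ps (0.0f, 1.0f, 2.0f, 3.0f);
*/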
160 /* Store four SPFP values. The address must be 16-byte aligned. */
161 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
162 _mm_store_ps (float *__P, __m128 __A)
164 assert(((unsigned long)__P & 0xfUL) == 0UL);
165 vec_st((__v4sf)__A, 0, (__v4sf*)__P);
168 /* Store four SPFP values. The address need not be 16-byte aligned. */
169 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
170 _mm_storeu_ps (float *__P, __m128 __A)
172 *(__m128 *)__P = __A;
175 /* Store four SPFP values in reverse order. The address must be aligned. */
176 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
177 _mm_storer_ps (float *__P, __m128 __A)
179 __v4sf __tmp;
180 static const __vector unsigned char permute_vector =
181 { 0x1C, 0x1D, 0x1E, 0x1F, 0x18, 0x19, 0x1A, 0x1B, 0x14, 0x15, 0x16,
182 0x17, 0x10, 0x11, 0x12, 0x13 };
184 __tmp = (__m128) vec_perm (__A, __A, permute_vector);
186 _mm_store_ps (__P, __tmp);
189 /* Store the lower SPFP value across four words. */
190 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
191 _mm_store1_ps (float *__P, __m128 __A)
193 __v4sf __va = vec_splat((__v4sf)__A, 0);
194 _mm_store_ps (__P, __va);
197 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
198 _mm_store_ps1 (float *__P, __m128 __A)
200 _mm_store1_ps (__P, __A);
203 /* Create a vector with element 0 as F and the rest zero. */
204 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
205 _mm_set_ss (float __F)
207 return __extension__ (__m128)(__v4sf){ __F, 0.0f, 0.0f, 0.0f };
210 /* Sets the low SPFP value of A from the low value of B. */
211 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
212 _mm_move_ss (__m128 __A, __m128 __B)
214 static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
216 return (vec_sel ((__v4sf)__A, (__v4sf)__B, mask));
219 /* Create a vector with element 0 as *P and the rest zero. */
220 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
221 _mm_load_ss (float const *__P)
223 return _mm_set_ss (*__P);
226 /* Stores the lower SPFP value. */
227 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
228 _mm_store_ss (float *__P, __m128 __A)
230 *__P = ((__v4sf)__A)[0];
233 /* Perform the respective operation on the lower SPFP (single-precision
234 floating-point) values of A and B; the upper three SPFP values are
235 passed through from A. */
237 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
238 _mm_add_ss (__m128 __A, __m128 __B)
240 #ifdef _ARCH_PWR7
241 __m128 a, b, c;
242 static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
243 /* PowerISA VSX does not allow partial (for just the lower float)
244 results. So to ensure we don't generate spurious exceptions
245 (from the upper float values) we splat the lower float
246 before we do the operation. */
247 a = vec_splat (__A, 0);
248 b = vec_splat (__B, 0);
249 c = a + b;
250 /* Then we merge the lower float result with the original upper
251 float elements from __A. */
252 return (vec_sel (__A, c, mask));
253 #else
254 __A[0] = __A[0] + __B[0];
255 return (__A);
256 #endif
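/* A minimal sketch of the scalar (_ss) semantics (values are illustrative
   only): only element 0 is combined; elements 1-3 are passed through from
   the first operand, so z below is { 3.0f, 10.0f, 20.0f, 30.0f }:

     __m128 x = _mm_setr_ps (1.0f, 10.0f, 20.0f, 30.0f);
     __m128 y = _mm_setr_ps (2.0f, 99.0f, 99.0f, 99.0f);
     __m128 z = _mm_add_ss (x, y);
*/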
259 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
260 _mm_sub_ss (__m128 __A, __m128 __B)
262 #ifdef _ARCH_PWR7
263 __m128 a, b, c;
264 static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
265 /* PowerISA VSX does not allow partial (for just the lower float)
266 results. So to ensure we don't generate spurious exceptions
267 (from the upper float values) we splat the lower float
268 before we do the operation. */
269 a = vec_splat (__A, 0);
270 b = vec_splat (__B, 0);
271 c = a - b;
272 /* Then we merge the lower float result with the original upper
273 float elements from __A. */
274 return (vec_sel (__A, c, mask));
275 #else
276 __A[0] = __A[0] - __B[0];
277 return (__A);
278 #endif
281 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
282 _mm_mul_ss (__m128 __A, __m128 __B)
284 #ifdef _ARCH_PWR7
285 __m128 a, b, c;
286 static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
287 /* PowerISA VSX does not allow partial (for just the lower float)
288 results. So to ensure we don't generate spurious exceptions
289 (from the upper float values) we splat the lower float
290 before we do the operation. */
291 a = vec_splat (__A, 0);
292 b = vec_splat (__B, 0);
293 c = a * b;
294 /* Then we merge the lower float result with the original upper
295 float elements from __A. */
296 return (vec_sel (__A, c, mask));
297 #else
298 __A[0] = __A[0] * __B[0];
299 return (__A);
300 #endif
303 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
304 _mm_div_ss (__m128 __A, __m128 __B)
306 #ifdef _ARCH_PWR7
307 __m128 a, b, c;
308 static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
309 /* PowerISA VSX does not allow partial (for just the lower float)
310 results. So to ensure we don't generate spurious exceptions
311 (from the upper float values) we splat the lower float
312 before we do the operation. */
313 a = vec_splat (__A, 0);
314 b = vec_splat (__B, 0);
315 c = a / b;
316 /* Then we merge the lower float result with the original upper
317 float elements from __A. */
318 return (vec_sel (__A, c, mask));
319 #else
320 __A[0] = __A[0] / __B[0];
321 return (__A);
322 #endif
325 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
326 _mm_sqrt_ss (__m128 __A)
328 __m128 a, c;
329 static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
330 /* PowerISA VSX does not allow partial (for just the lower float)
331 * results. So to ensure we don't generate spurious exceptions
332 * (from the upper float values) we splat the lower float
333 * before we do the operation. */
334 a = vec_splat (__A, 0);
335 c = vec_sqrt (a);
336 /* Then we merge the lower float result with the original upper
337 * float elements from __A. */
338 return (vec_sel (__A, c, mask));
341 /* Perform the respective operation on the four SPFP values in A and B. */
342 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
343 _mm_add_ps (__m128 __A, __m128 __B)
345 return (__m128) ((__v4sf)__A + (__v4sf)__B);
348 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
349 _mm_sub_ps (__m128 __A, __m128 __B)
351 return (__m128) ((__v4sf)__A - (__v4sf)__B);
354 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
355 _mm_mul_ps (__m128 __A, __m128 __B)
357 return (__m128) ((__v4sf)__A * (__v4sf)__B);
360 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
361 _mm_div_ps (__m128 __A, __m128 __B)
363 return (__m128) ((__v4sf)__A / (__v4sf)__B);
366 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
367 _mm_sqrt_ps (__m128 __A)
369 return (vec_sqrt ((__v4sf)__A));
372 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
373 _mm_rcp_ps (__m128 __A)
375 return (vec_re ((__v4sf)__A));
378 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
379 _mm_rsqrt_ps (__m128 __A)
381 return (vec_rsqrte (__A));
384 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
385 _mm_rcp_ss (__m128 __A)
387 __m128 a, c;
388 static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
389 /* PowerISA VSX does not allow partial (for just the lower float)
390 * results. So to ensure we don't generate spurious exceptions
391 * (from the upper float values) we splat the lower float
392 * before we do the operation. */
393 a = vec_splat (__A, 0);
394 c = _mm_rcp_ps (a);
395 /* Then we merge the lower float result with the original upper
396 * float elements from __A. */
397 return (vec_sel (__A, c, mask));
400 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
401 _mm_rsqrt_ss (__m128 __A)
403 __m128 a, c;
404 static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
405 /* PowerISA VSX does not allow partial (for just the lower float)
406 * results. So to ensure we don't generate spurious exceptions
407 * (from the upper float values) we splat the lower float
408 * before we do the operation. */
409 a = vec_splat (__A, 0);
410 c = vec_rsqrte (a);
411 /* Then we merge the lower float result with the original upper
412 * float elements from __A. */
413 return (vec_sel (__A, c, mask));
416 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
417 _mm_min_ss (__m128 __A, __m128 __B)
419 __v4sf a, b, c;
420 static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
421 /* PowerISA VSX does not allow partial (for just lower float)
422 * results. So to ensure we don't generate spurious exceptions
423 * (from the upper float values) we splat the lower float
424 * before we do the operation. */
425 a = vec_splat ((__v4sf)__A, 0);
426 b = vec_splat ((__v4sf)__B, 0);
427 c = vec_min (a, b);
428 /* Then we merge the lower float result with the original upper
429 * float elements from __A. */
430 return (vec_sel ((__v4sf)__A, c, mask));
433 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
434 _mm_max_ss (__m128 __A, __m128 __B)
436 __v4sf a, b, c;
437 static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
438 /* PowerISA VSX does not allow partial (for just lower float)
439 * results. So to ensure we don't generate spurious exceptions
440 * (from the upper float values) we splat the lower float
441 * before we do the operation. */
442 a = vec_splat (__A, 0);
443 b = vec_splat (__B, 0);
444 c = vec_max (a, b);
445 /* Then we merge the lower float result with the original upper
446 * float elements from __A. */
447 return (vec_sel ((__v4sf)__A, c, mask));
450 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
451 _mm_min_ps (__m128 __A, __m128 __B)
453 __m128 m = (__m128) vec_vcmpgtfp ((__v4sf) __B, (__v4sf) __A);
454 return vec_sel (__B, __A, m);
457 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
458 _mm_max_ps (__m128 __A, __m128 __B)
460 __m128 m = (__m128) vec_vcmpgtfp ((__v4sf) __A, (__v4sf) __B);
461 return vec_sel (__B, __A, m);
464 /* Perform logical bit-wise operations on 128-bit values. */
465 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
466 _mm_and_ps (__m128 __A, __m128 __B)
468 return ((__m128)vec_and ((__v4sf)__A, (__v4sf)__B));
469 // return __builtin_ia32_andps (__A, __B);
472 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
473 _mm_andnot_ps (__m128 __A, __m128 __B)
475 return ((__m128)vec_andc ((__v4sf)__B, (__v4sf)__A));
478 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
479 _mm_or_ps (__m128 __A, __m128 __B)
481 return ((__m128)vec_or ((__v4sf)__A, (__v4sf)__B));
484 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
485 _mm_xor_ps (__m128 __A, __m128 __B)
487 return ((__m128)vec_xor ((__v4sf)__A, (__v4sf)__B));
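/* A minimal sketch of a common use of the bit-wise forms (the sign-mask
   trick shown is an assumption about typical usage, not part of this API):
   clearing the IEEE-754 sign bit gives a branch-free absolute value, so
   absv below is { 1.5f, 2.0f, 0.0f, 3.0f }:

     __m128 sign = _mm_set1_ps (-0.0f);
     __m128 v    = _mm_setr_ps (-1.5f, 2.0f, -0.0f, 3.0f);
     __m128 absv = _mm_andnot_ps (sign, v);
*/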
490 /* Perform a comparison on the four SPFP values of A and B. For each
491 element, if the comparison is true, place a mask of all ones in the
492 result, otherwise a mask of zeros. */
493 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
494 _mm_cmpeq_ps (__m128 __A, __m128 __B)
496 return ((__m128)vec_cmpeq ((__v4sf)__A,(__v4sf) __B));
499 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
500 _mm_cmplt_ps (__m128 __A, __m128 __B)
502 return ((__m128)vec_cmplt ((__v4sf)__A, (__v4sf)__B));
505 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
506 _mm_cmple_ps (__m128 __A, __m128 __B)
508 return ((__m128)vec_cmple ((__v4sf)__A, (__v4sf)__B));
511 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
512 _mm_cmpgt_ps (__m128 __A, __m128 __B)
514 return ((__m128)vec_cmpgt ((__v4sf)__A, (__v4sf)__B));
517 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
518 _mm_cmpge_ps (__m128 __A, __m128 __B)
520 return ((__m128)vec_cmpge ((__v4sf)__A, (__v4sf)__B));
523 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
524 _mm_cmpneq_ps (__m128 __A, __m128 __B)
526 __v4sf temp = (__v4sf ) vec_cmpeq ((__v4sf) __A, (__v4sf)__B);
527 return ((__m128)vec_nor (temp, temp));
530 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
531 _mm_cmpnlt_ps (__m128 __A, __m128 __B)
533 return ((__m128)vec_cmpge ((__v4sf)__A, (__v4sf)__B));
536 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
537 _mm_cmpnle_ps (__m128 __A, __m128 __B)
539 return ((__m128)vec_cmpgt ((__v4sf)__A, (__v4sf)__B));
542 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
543 _mm_cmpngt_ps (__m128 __A, __m128 __B)
545 return ((__m128)vec_cmple ((__v4sf)__A, (__v4sf)__B));
548 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
549 _mm_cmpnge_ps (__m128 __A, __m128 __B)
551 return ((__m128)vec_cmplt ((__v4sf)__A, (__v4sf)__B));
554 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
555 _mm_cmpord_ps (__m128 __A, __m128 __B)
557 __vector unsigned int a, b;
558 __vector unsigned int c, d;
559 static const __vector unsigned int float_exp_mask =
560 { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 };
562 a = (__vector unsigned int) vec_abs ((__v4sf)__A);
563 b = (__vector unsigned int) vec_abs ((__v4sf)__B);
564 c = (__vector unsigned int) vec_cmpgt (float_exp_mask, a);
565 d = (__vector unsigned int) vec_cmpgt (float_exp_mask, b);
566 return ((__m128 ) vec_and (c, d));
569 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
570 _mm_cmpunord_ps (__m128 __A, __m128 __B)
572 __vector unsigned int a, b;
573 __vector unsigned int c, d;
574 static const __vector unsigned int float_exp_mask =
575 { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 };
577 a = (__vector unsigned int) vec_abs ((__v4sf)__A);
578 b = (__vector unsigned int) vec_abs ((__v4sf)__B);
579 c = (__vector unsigned int) vec_cmpgt (a, float_exp_mask);
580 d = (__vector unsigned int) vec_cmpgt (b, float_exp_mask);
581 return ((__m128 ) vec_or (c, d));
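/* A minimal sketch of the ordered/unordered predicates (NaN generation via
   __builtin_nanf is illustrative only): a lane is "ordered" when neither
   input in that lane is a NaN, so o below has all-ones in lanes 0 and 3 and
   zeros in lanes 1 and 2, while u is the opposite:

     __m128 a = _mm_setr_ps (1.0f, __builtin_nanf (""), 2.0f, 3.0f);
     __m128 b = _mm_setr_ps (1.0f, 1.0f, __builtin_nanf (""), 3.0f);
     __m128 o = _mm_cmpord_ps (a, b);
     __m128 u = _mm_cmpunord_ps (a, b);
*/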
584 /* Perform a comparison on the lower SPFP values of A and B. If the
585 comparison is true, place a mask of all ones in the result, otherwise a
586 mask of zeros. The upper three SPFP values are passed through from A. */
587 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
588 _mm_cmpeq_ss (__m128 __A, __m128 __B)
590 static const __vector unsigned int mask =
591 { 0xffffffff, 0, 0, 0 };
592 __v4sf a, b, c;
593 /* PowerISA VMX does not allow partial (for just element 0)
594 * results. So to ensure we don't generate spurious exceptions
595 * (from the upper elements) we splat the lower float
596 * before we do the operation. */
597 a = vec_splat ((__v4sf) __A, 0);
598 b = vec_splat ((__v4sf) __B, 0);
599 c = (__v4sf) vec_cmpeq(a, b);
600 /* Then we merge the lower float result with the original upper
601 * float elements from __A. */
602 return ((__m128)vec_sel ((__v4sf)__A, c, mask));
605 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
606 _mm_cmplt_ss (__m128 __A, __m128 __B)
608 static const __vector unsigned int mask =
609 { 0xffffffff, 0, 0, 0 };
610 __v4sf a, b, c;
611 /* PowerISA VMX does not allow partial (for just element 0)
612 * results. So to ensure we don't generate spurious exceptions
613 * (from the upper elements) we splat the lower float
614 * before we do the operation. */
615 a = vec_splat ((__v4sf) __A, 0);
616 b = vec_splat ((__v4sf) __B, 0);
617 c = (__v4sf) vec_cmplt(a, b);
618 /* Then we merge the lower float result with the original upper
619 * float elements from __A. */
620 return ((__m128)vec_sel ((__v4sf)__A, c, mask));
623 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
624 _mm_cmple_ss (__m128 __A, __m128 __B)
626 static const __vector unsigned int mask =
627 { 0xffffffff, 0, 0, 0 };
628 __v4sf a, b, c;
629 /* PowerISA VMX does not allow partial (for just element 0)
630 * results. So to ensure we don't generate spurious exceptions
631 * (from the upper elements) we splat the lower float
632 * before we do the operation. */
633 a = vec_splat ((__v4sf) __A, 0);
634 b = vec_splat ((__v4sf) __B, 0);
635 c = (__v4sf) vec_cmple(a, b);
636 /* Then we merge the lower float result with the original upper
637 * float elements from __A. */
638 return ((__m128)vec_sel ((__v4sf)__A, c, mask));
641 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
642 _mm_cmpgt_ss (__m128 __A, __m128 __B)
644 static const __vector unsigned int mask =
645 { 0xffffffff, 0, 0, 0 };
646 __v4sf a, b, c;
647 /* PowerISA VMX does not allow partial (for just element 0)
648 * results. So to ensure we don't generate spurious exceptions
649 * (from the upper elements) we splat the lower float
650 * before we do the operation. */
651 a = vec_splat ((__v4sf) __A, 0);
652 b = vec_splat ((__v4sf) __B, 0);
653 c = (__v4sf) vec_cmpgt(a, b);
654 /* Then we merge the lower float result with the original upper
655 * float elements from __A. */
656 return ((__m128)vec_sel ((__v4sf)__A, c, mask));
659 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
660 _mm_cmpge_ss (__m128 __A, __m128 __B)
662 static const __vector unsigned int mask =
663 { 0xffffffff, 0, 0, 0 };
664 __v4sf a, b, c;
665 /* PowerISA VMX does not allow partial (for just element 0)
666 * results. So to ensure we don't generate spurious exceptions
667 * (from the upper elements) we splat the lower float
668 * before we do the operation. */
669 a = vec_splat ((__v4sf) __A, 0);
670 b = vec_splat ((__v4sf) __B, 0);
671 c = (__v4sf) vec_cmpge(a, b);
672 /* Then we merge the lower float result with the original upper
673 * float elements from __A. */
674 return ((__m128)vec_sel ((__v4sf)__A, c, mask));
677 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
678 _mm_cmpneq_ss (__m128 __A, __m128 __B)
680 static const __vector unsigned int mask =
681 { 0xffffffff, 0, 0, 0 };
682 __v4sf a, b, c;
683 /* PowerISA VMX does not allow partial (for just element 0)
684 * results. So to ensure we don't generate spurious exceptions
685 * (from the upper elements) we splat the lower float
686 * before we do the operation. */
687 a = vec_splat ((__v4sf) __A, 0);
688 b = vec_splat ((__v4sf) __B, 0);
689 c = (__v4sf) vec_cmpeq(a, b);
690 c = vec_nor (c, c);
691 /* Then we merge the lower float result with the original upper
692 * float elements from __A. */
693 return ((__m128)vec_sel ((__v4sf)__A, c, mask));
696 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
697 _mm_cmpnlt_ss (__m128 __A, __m128 __B)
699 static const __vector unsigned int mask =
700 { 0xffffffff, 0, 0, 0 };
701 __v4sf a, b, c;
702 /* PowerISA VMX does not allow partial (for just element 0)
703 * results. So to ensure we don't generate spurious exceptions
704 * (from the upper elements) we splat the lower float
705 * before we do the operation. */
706 a = vec_splat ((__v4sf) __A, 0);
707 b = vec_splat ((__v4sf) __B, 0);
708 c = (__v4sf) vec_cmpge(a, b);
709 /* Then we merge the lower float result with the original upper
710 * float elements from __A. */
711 return ((__m128)vec_sel ((__v4sf)__A, c, mask));
714 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
715 _mm_cmpnle_ss (__m128 __A, __m128 __B)
717 static const __vector unsigned int mask =
718 { 0xffffffff, 0, 0, 0 };
719 __v4sf a, b, c;
720 /* PowerISA VMX does not allow partial (for just element 0)
721 * results. So to ensure we don't generate spurious exceptions
722 * (from the upper elements) we splat the lower float
723 * before we do the operation. */
724 a = vec_splat ((__v4sf) __A, 0);
725 b = vec_splat ((__v4sf) __B, 0);
726 c = (__v4sf) vec_cmpgt(a, b);
727 /* Then we merge the lower float result with the original upper
728 * float elements from __A. */
729 return ((__m128)vec_sel ((__v4sf)__A, c, mask));
732 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
733 _mm_cmpngt_ss (__m128 __A, __m128 __B)
735 static const __vector unsigned int mask =
736 { 0xffffffff, 0, 0, 0 };
737 __v4sf a, b, c;
738 /* PowerISA VMX does not allow partial (for just element 0)
739 * results. So to ensure we don't generate spurious exceptions
740 * (from the upper elements) we splat the lower float
741 * before we do the operation. */
742 a = vec_splat ((__v4sf) __A, 0);
743 b = vec_splat ((__v4sf) __B, 0);
744 c = (__v4sf) vec_cmple(a, b);
745 /* Then we merge the lower float result with the original upper
746 * float elements from __A. */
747 return ((__m128)vec_sel ((__v4sf)__A, c, mask));
750 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
751 _mm_cmpnge_ss (__m128 __A, __m128 __B)
753 static const __vector unsigned int mask =
754 { 0xffffffff, 0, 0, 0 };
755 __v4sf a, b, c;
756 /* PowerISA VMX does not allow partial (for just element 0)
757 * results. So to ensure we don't generate spurious exceptions
758 * (from the upper elements) we splat the lower float
759 * before we do the operation. */
760 a = vec_splat ((__v4sf) __A, 0);
761 b = vec_splat ((__v4sf) __B, 0);
762 c = (__v4sf) vec_cmplt(a, b);
763 /* Then we merge the lower float result with the original upper
764 * float elements from __A. */
765 return ((__m128)vec_sel ((__v4sf)__A, c, mask));
768 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
769 _mm_cmpord_ss (__m128 __A, __m128 __B)
771 __vector unsigned int a, b;
772 __vector unsigned int c, d;
773 static const __vector unsigned int float_exp_mask =
774 { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 };
775 static const __vector unsigned int mask =
776 { 0xffffffff, 0, 0, 0 };
778 a = (__vector unsigned int) vec_abs ((__v4sf)__A);
779 b = (__vector unsigned int) vec_abs ((__v4sf)__B);
780 c = (__vector unsigned int) vec_cmpgt (float_exp_mask, a);
781 d = (__vector unsigned int) vec_cmpgt (float_exp_mask, b);
782 c = vec_and (c, d);
783 /* Then we merge the lower float result with the original upper
784 * float elements from __A. */
785 return ((__m128)vec_sel ((__v4sf)__A, (__v4sf)c, mask));
788 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
789 _mm_cmpunord_ss (__m128 __A, __m128 __B)
791 __vector unsigned int a, b;
792 __vector unsigned int c, d;
793 static const __vector unsigned int float_exp_mask =
794 { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 };
795 static const __vector unsigned int mask =
796 { 0xffffffff, 0, 0, 0 };
798 a = (__vector unsigned int) vec_abs ((__v4sf)__A);
799 b = (__vector unsigned int) vec_abs ((__v4sf)__B);
800 c = (__vector unsigned int) vec_cmpgt (a, float_exp_mask);
801 d = (__vector unsigned int) vec_cmpgt (b, float_exp_mask);
802 c = vec_or (c, d);
803 /* Then we merge the lower float result with the original upper
804 * float elements from __A. */
805 return ((__m128)vec_sel ((__v4sf)__A, (__v4sf)c, mask));
808 /* Compare the lower SPFP values of A and B and return 1 if true
809 and 0 if false. */
810 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
811 _mm_comieq_ss (__m128 __A, __m128 __B)
813 return (__A[0] == __B[0]);
816 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
817 _mm_comilt_ss (__m128 __A, __m128 __B)
819 return (__A[0] < __B[0]);
822 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
823 _mm_comile_ss (__m128 __A, __m128 __B)
825 return (__A[0] <= __B[0]);
828 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
829 _mm_comigt_ss (__m128 __A, __m128 __B)
831 return (__A[0] > __B[0]);
834 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
835 _mm_comige_ss (__m128 __A, __m128 __B)
837 return (__A[0] >= __B[0]);
840 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
841 _mm_comineq_ss (__m128 __A, __m128 __B)
843 return (__A[0] != __B[0]);
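/* A minimal sketch of the comi forms (values are illustrative only): unlike
   the _mm_cmp*_ss functions above, these compare only element 0 and return a
   plain int, so lt is 1 and gt is 0 here:

     __m128 a = _mm_set_ss (1.0f);
     __m128 b = _mm_set_ss (2.0f);
     int lt = _mm_comilt_ss (a, b);
     int gt = _mm_comigt_ss (a, b);
*/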
846 /* FIXME
847 * The _mm_ucomi??_ss implementations below are exactly the same as
848 * _mm_comi??_ss because GCC for PowerPC only generates unordered
849 * compares (scalar and vector).
850 * Technically _mm_comieq_ss et al. should be using the ordered
851 * compare and signal for QNaNs.
852 * The _mm_ucomieq_sd et al. should be OK, as is. */
854 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
855 _mm_ucomieq_ss (__m128 __A, __m128 __B)
857 return (__A[0] == __B[0]);
860 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
861 _mm_ucomilt_ss (__m128 __A, __m128 __B)
863 return (__A[0] < __B[0]);
866 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
867 _mm_ucomile_ss (__m128 __A, __m128 __B)
869 return (__A[0] <= __B[0]);
872 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
873 _mm_ucomigt_ss (__m128 __A, __m128 __B)
875 return (__A[0] > __B[0]);
878 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
879 _mm_ucomige_ss (__m128 __A, __m128 __B)
881 return (__A[0] >= __B[0]);
884 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
885 _mm_ucomineq_ss (__m128 __A, __m128 __B)
887 return (__A[0] != __B[0]);
890 extern __inline float __attribute__((__gnu_inline__, __always_inline__, __artificial__))
891 _mm_cvtss_f32 (__m128 __A)
893 return ((__v4sf)__A)[0];
896 /* Convert the lower SPFP value to a 32-bit integer according to the current
897 rounding mode. */
898 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
899 _mm_cvtss_si32 (__m128 __A)
901 __m64 res = 0;
902 #ifdef _ARCH_PWR8
903 __m128 vtmp;
904 __asm__(
905 "xxsldwi %x1,%x2,%x2,3;\n"
906 "xscvspdp %x1,%x1;\n"
907 "fctiw %1,%1;\n"
908 "mfvsrd %0,%x1;\n"
909 : "=r" (res),
910 "=&wi" (vtmp)
911 : "wa" (__A)
912 : );
913 #else
914 res = __builtin_rint(__A[0]);
915 #endif
916 return (res);
919 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
920 _mm_cvt_ss2si (__m128 __A)
922 return _mm_cvtss_si32 (__A);
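/* A minimal sketch of the difference from the truncating forms further below
   (values are illustrative only): _mm_cvtss_si32 honours the current
   rounding mode, while _mm_cvttss_si32 always truncates toward zero.  Under
   the default round-to-nearest-even mode both r and t are 2 for 2.5f; for
   2.7f, r would be 3 while t would remain 2:

     __m128 v = _mm_set_ss (2.5f);
     int r = _mm_cvtss_si32 (v);
     int t = _mm_cvttss_si32 (v);
*/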
925 /* Convert the lower SPFP value to a 64-bit integer according to the
926 current rounding mode. */
928 /* Intel intrinsic. */
929 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
930 _mm_cvtss_si64 (__m128 __A)
932 __m64 res = 0;
933 #ifdef _ARCH_PWR8
934 __m128 vtmp;
935 __asm__(
936 "xxsldwi %x1,%x2,%x2,3;\n"
937 "xscvspdp %x1,%x1;\n"
938 "fctid %1,%1;\n"
939 "mfvsrd %0,%x1;\n"
940 : "=r" (res),
941 "=&wi" (vtmp)
942 : "wa" (__A)
943 : );
944 #else
945 res = __builtin_llrint(__A[0]);
946 #endif
947 return (res);
950 /* Microsoft intrinsic. */
951 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
952 _mm_cvtss_si64x (__m128 __A)
954 return _mm_cvtss_si64 ((__v4sf) __A);
957 /* Constants for use with _mm_prefetch. */
958 enum _mm_hint
960 /* _MM_HINT_ET is _MM_HINT_T with the 3rd bit set. */
961 _MM_HINT_ET0 = 7,
962 _MM_HINT_ET1 = 6,
963 _MM_HINT_T0 = 3,
964 _MM_HINT_T1 = 2,
965 _MM_HINT_T2 = 1,
966 _MM_HINT_NTA = 0
969 /* Loads one cache line from address P to a location "closer" to the
970 processor. The selector I specifies the type of prefetch operation. */
971 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
972 _mm_prefetch (const void *__P, enum _mm_hint __I)
974 /* Currently, PowerPC ignores the hint parameters. */
975 __builtin_prefetch (__P);
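/* A minimal sketch of a typical prefetching loop (the function, array and
   prefetch distance are assumptions for illustration): the hint argument is
   accepted for source compatibility even though, as noted above, it is
   currently ignored on PowerPC.

     void scale (float *dst, const float *src, long n)
     {
       for (long i = 0; i < n; i += 4)
         {
           _mm_prefetch ((const char *) (src + i + 64), _MM_HINT_T0);
           _mm_storeu_ps (dst + i, _mm_mul_ps (_mm_loadu_ps (src + i),
                                               _mm_set1_ps (2.0f)));
         }
     }
*/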
978 /* Convert the two lower SPFP values to 32-bit integers according to the
979 current rounding mode. Return the integers in packed form. */
980 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
981 _mm_cvtps_pi32 (__m128 __A)
984 __v4sf temp, rounded;
985 __vector __m64 result;
987 /* Splat two lower SPFP values to both halves. */
988 temp = (__v4sf) vec_splat ((__vector long long)__A, 0);
989 rounded = vec_rint(temp);
990 result = (__vector __m64) vec_cts (rounded, 0);
992 return ((__m64) __builtin_unpack_vector_int128 ((__vector __int128)result, 0));
995 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
996 _mm_cvt_ps2pi (__m128 __A)
998 return _mm_cvtps_pi32 (__A);
1001 /* Truncate the lower SPFP value to a 32-bit integer. */
1002 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1003 _mm_cvttss_si32 (__m128 __A)
1005 /* Extract the lower float element. */
1006 float temp = __A[0];
1007 /* truncate to 32-bit integer and return. */
1008 return temp;
1011 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1012 _mm_cvtt_ss2si (__m128 __A)
1014 return _mm_cvttss_si32 (__A);
1017 /* Intel intrinsic. */
1018 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1019 _mm_cvttss_si64 (__m128 __A)
1021 /* Extract the lower float element. */
1022 float temp = __A[0];
1023 /* truncate to 64-bit integer and return. */
1024 return temp;
1027 /* Microsoft intrinsic. */
1028 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1029 _mm_cvttss_si64x (__m128 __A)
1031 /* Extract the lower float element. */
1032 float temp = __A[0];
1033 /* truncate to 64-bit integer and return. */
1034 return temp;
1037 /* Truncate the two lower SPFP values to 32-bit integers. Return the
1038 integers in packed form. */
1039 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1040 _mm_cvttps_pi32 (__m128 __A)
1042 __v4sf temp;
1043 __vector __m64 result;
1045 /* Splat two lower SPFP values to both halves. */
1046 temp = (__v4sf) vec_splat ((__vector long long)__A, 0);
1047 result = (__vector __m64) vec_cts (temp, 0);
1049 return ((__m64) __builtin_unpack_vector_int128 ((__vector __int128)result, 0));
1052 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1053 _mm_cvtt_ps2pi (__m128 __A)
1055 return _mm_cvttps_pi32 (__A);
1058 /* Convert B to a SPFP value and insert it as element zero in A. */
1059 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1060 _mm_cvtsi32_ss (__m128 __A, int __B)
1062 float temp = __B;
1063 __A[0] = temp;
1065 return __A;
1068 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1069 _mm_cvt_si2ss (__m128 __A, int __B)
1071 return _mm_cvtsi32_ss (__A, __B);
1074 /* Convert B to a SPFP value and insert it as element zero in A. */
1075 /* Intel intrinsic. */
1076 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1077 _mm_cvtsi64_ss (__m128 __A, long long __B)
1079 float temp = __B;
1080 __A[0] = temp;
1082 return __A;
1085 /* Microsoft intrinsic. */
1086 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1087 _mm_cvtsi64x_ss (__m128 __A, long long __B)
1089 return _mm_cvtsi64_ss (__A, __B);
1092 /* Convert the two 32-bit values in B to SPFP form and insert them
1093 as the two lower elements in A. */
1094 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1095 _mm_cvtpi32_ps (__m128 __A, __m64 __B)
1097 __vector signed int vm1;
1098 __vector float vf1;
1100 vm1 = (__vector signed int) __builtin_pack_vector_int128 (__B, __B);
1101 vf1 = (__vector float) vec_ctf (vm1, 0);
1103 return ((__m128) (__vector __m64)
1104 { ((__vector __m64)vf1) [0], ((__vector __m64)__A) [1]});
1107 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1108 _mm_cvt_pi2ps (__m128 __A, __m64 __B)
1110 return _mm_cvtpi32_ps (__A, __B);
1113 /* Convert the four signed 16-bit values in A to SPFP form. */
1114 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1115 _mm_cvtpi16_ps (__m64 __A)
1117 __vector signed short vs8;
1118 __vector signed int vi4;
1119 __vector float vf1;
1121 vs8 = (__vector signed short) __builtin_pack_vector_int128 (__A, __A);
1122 vi4 = vec_vupklsh (vs8);
1123 vf1 = (__vector float) vec_ctf (vi4, 0);
1125 return (__m128) vf1;
1128 /* Convert the four unsigned 16-bit values in A to SPFP form. */
1129 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1130 _mm_cvtpu16_ps (__m64 __A)
1132 const __vector unsigned short zero =
1133 { 0, 0, 0, 0, 0, 0, 0, 0 };
1134 __vector unsigned short vs8;
1135 __vector unsigned int vi4;
1136 __vector float vf1;
1138 vs8 = (__vector unsigned short) __builtin_pack_vector_int128 (__A, __A);
1139 vi4 = (__vector unsigned int) vec_vmrglh (vs8, zero);
1140 vf1 = (__vector float) vec_ctf (vi4, 0);
1142 return (__m128) vf1;
1145 /* Convert the low four signed 8-bit values in A to SPFP form. */
1146 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1147 _mm_cvtpi8_ps (__m64 __A)
1149 __vector signed char vc16;
1150 __vector signed short vs8;
1151 __vector signed int vi4;
1152 __vector float vf1;
1154 vc16 = (__vector signed char) __builtin_pack_vector_int128 (__A, __A);
1155 vs8 = vec_vupkhsb (vc16);
1156 vi4 = vec_vupkhsh (vs8);
1157 vf1 = (__vector float) vec_ctf (vi4, 0);
1159 return (__m128) vf1;
1162 /* Convert the low four unsigned 8-bit values in A to SPFP form. */
1163 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1165 _mm_cvtpu8_ps (__m64 __A)
1167 const __vector unsigned char zero =
1168 { 0, 0, 0, 0, 0, 0, 0, 0 };
1169 __vector unsigned char vc16;
1170 __vector unsigned short vs8;
1171 __vector unsigned int vi4;
1172 __vector float vf1;
1174 vc16 = (__vector unsigned char) __builtin_pack_vector_int128 (__A, __A);
1175 vs8 = (__vector unsigned short) vec_vmrglb (vc16, zero);
1176 vi4 = (__vector unsigned int) vec_vmrghh (vs8,
1177 (__vector unsigned short) zero);
1178 vf1 = (__vector float) vec_ctf (vi4, 0);
1180 return (__m128) vf1;
1183 /* Convert the four signed 32-bit values in A and B to SPFP form. */
1184 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1185 _mm_cvtpi32x2_ps(__m64 __A, __m64 __B)
1187 __vector signed int vi4;
1188 __vector float vf4;
1190 vi4 = (__vector signed int) __builtin_pack_vector_int128 (__B, __A);
1191 vf4 = (__vector float) vec_ctf (vi4, 0);
1192 return (__m128) vf4;
1195 /* Convert the four SPFP values in A to four signed 16-bit integers. */
1196 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1197 _mm_cvtps_pi16(__m128 __A)
1199 __v4sf rounded;
1200 __vector signed int temp;
1201 __vector __m64 result;
1203 rounded = vec_rint(__A);
1204 temp = vec_cts (rounded, 0);
1205 result = (__vector __m64) vec_pack (temp, temp);
1207 return ((__m64) __builtin_unpack_vector_int128 ((__vector __int128)result, 0));
1210 /* Convert the four SPFP values in A to four signed 8-bit integers. */
1211 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1212 _mm_cvtps_pi8(__m128 __A)
1214 __v4sf rounded;
1215 __vector signed int tmp_i;
1216 static const __vector signed int zero = {0, 0, 0, 0};
1217 __vector signed short tmp_s;
1218 __vector signed char res_v;
1219 __m64 result;
1221 rounded = vec_rint(__A);
1222 tmp_i = vec_cts (rounded, 0);
1223 tmp_s = vec_pack (tmp_i, zero);
1224 res_v = vec_pack (tmp_s, tmp_s);
1225 result = (__m64) __builtin_unpack_vector_int128 ((__vector __int128)res_v, 0);
1227 return (result);
1230 /* Selects four specific SPFP values from A and B based on MASK. */
1231 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1233 _mm_shuffle_ps (__m128 __A, __m128 __B, int const __mask)
1235 unsigned long element_selector_10 = __mask & 0x03;
1236 unsigned long element_selector_32 = (__mask >> 2) & 0x03;
1237 unsigned long element_selector_54 = (__mask >> 4) & 0x03;
1238 unsigned long element_selector_76 = (__mask >> 6) & 0x03;
1239 static const unsigned int permute_selectors[4] =
1241 #ifdef __LITTLE_ENDIAN__
1242 0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
1243 #elif __BIG_ENDIAN__
1244 0x0C0D0E0F, 0x08090A0B, 0x04050607, 0x00010203
1245 #endif
1247 __vector unsigned int t;
1249 #ifdef __LITTLE_ENDIAN__
1250 t[0] = permute_selectors[element_selector_10];
1251 t[1] = permute_selectors[element_selector_32];
1252 t[2] = permute_selectors[element_selector_54] + 0x10101010;
1253 t[3] = permute_selectors[element_selector_76] + 0x10101010;
1254 #elif __BIG_ENDIAN__
1255 t[3] = permute_selectors[element_selector_10] + 0x10101010;
1256 t[2] = permute_selectors[element_selector_32] + 0x10101010;
1257 t[1] = permute_selectors[element_selector_54];
1258 t[0] = permute_selectors[element_selector_76];
1259 #endif
1260 return vec_perm ((__v4sf) __A, (__v4sf)__B, (__vector unsigned char)t);
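/* A minimal sketch of the mask encoding (the _MM_SHUFFLE helper named here
   is the conventional x86 macro, an assumption rather than something defined
   at this point in the header): two bits per result element; the low four
   bits pick result elements 0-1 from __A and the high four bits pick result
   elements 2-3 from __B.  With mask 0x4E (equivalent to
   _MM_SHUFFLE (1, 0, 3, 2)) the result below is { 2.0f, 3.0f, 4.0f, 5.0f }:

     __m128 a = _mm_setr_ps (0.0f, 1.0f, 2.0f, 3.0f);
     __m128 b = _mm_setr_ps (4.0f, 5.0f, 6.0f, 7.0f);
     __m128 r = _mm_shuffle_ps (a, b, 0x4E);
*/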
1263 /* Selects and interleaves the upper two SPFP values from A and B. */
1264 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1265 _mm_unpackhi_ps (__m128 __A, __m128 __B)
1267 return (__m128) vec_vmrglw ((__v4sf) __A, (__v4sf)__B);
1270 /* Selects and interleaves the lower two SPFP values from A and B. */
1271 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1272 _mm_unpacklo_ps (__m128 __A, __m128 __B)
1274 return (__m128) vec_vmrghw ((__v4sf) __A, (__v4sf)__B);
1277 /* Sets the upper two SPFP values with 64-bits of data loaded from P;
1278 the lower two values are passed through from A. */
1279 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1280 _mm_loadh_pi (__m128 __A, __m64 const *__P)
1282 __vector __m64 __a = (__vector __m64)__A;
1283 __vector __m64 __p = vec_splats(*__P);
1284 __a [1] = __p [1];
1286 return (__m128)__a;
1289 /* Stores the upper two SPFP values of A into P. */
1290 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1291 _mm_storeh_pi (__m64 *__P, __m128 __A)
1293 __vector __m64 __a = (__vector __m64) __A;
1295 *__P = __a[1];
1298 /* Moves the upper two values of B into the lower two values of A. */
1299 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1300 _mm_movehl_ps (__m128 __A, __m128 __B)
1302 return (__m128) vec_mergel ((__vector __m64)__B, (__vector __m64)__A);
1305 /* Moves the lower two values of B into the upper two values of A. */
1306 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1307 _mm_movelh_ps (__m128 __A, __m128 __B)
1309 return (__m128) vec_mergeh ((__vector __m64)__A, (__vector __m64)__B);
1312 /* Sets the lower two SPFP values with 64-bits of data loaded from P;
1313 the upper two values are passed through from A. */
1314 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1315 _mm_loadl_pi (__m128 __A, __m64 const *__P)
1317 __vector __m64 __a = (__vector __m64)__A;
1318 __vector __m64 __p = vec_splats(*__P);
1319 __a [0] = __p [0];
1321 return (__m128)__a;
1324 /* Stores the lower two SPFP values of A into P. */
1325 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1326 _mm_storel_pi (__m64 *__P, __m128 __A)
1328 __vector __m64 __a = (__vector __m64) __A;
1330 *__P = __a[0];
1333 #ifdef _ARCH_PWR8
1334 /* Intrinsic functions that require PowerISA 2.07 minimum. */
1336 /* Creates a 4-bit mask from the most significant bits of the SPFP values. */
1337 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1338 _mm_movemask_ps (__m128 __A)
1340 __vector __m64 result;
1341 static const __vector unsigned int perm_mask =
1343 #ifdef __LITTLE_ENDIAN__
1344 0x00204060, 0x80808080, 0x80808080, 0x80808080
1345 #elif __BIG_ENDIAN__
1346 0x80808080, 0x80808080, 0x80808080, 0x00204060
1347 #endif
1350 result = (__vector __m64) vec_vbpermq ((__vector unsigned char) __A,
1351 (__vector unsigned char) perm_mask);
1353 #ifdef __LITTLE_ENDIAN__
1354 return result[1];
1355 #elif __BIG_ENDIAN__
1356 return result[0];
1357 #endif
1359 #endif /* _ARCH_PWR8 */
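/* A minimal sketch of the result layout (values are illustrative only): bit
   i of the return value is the sign bit of element i, so m below is 0x5
   (elements 0 and 2 are negative):

     __m128 v = _mm_setr_ps (-1.0f, 2.0f, -3.0f, 4.0f);
     int m = _mm_movemask_ps (v);
*/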
1361 /* Create a vector with all four elements equal to *P. */
1362 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1363 _mm_load1_ps (float const *__P)
1365 return _mm_set1_ps (*__P);
1368 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1369 _mm_load_ps1 (float const *__P)
1371 return _mm_load1_ps (__P);
1374 /* Extracts one of the four words of A. The selector N must be immediate. */
1375 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1376 _mm_extract_pi16 (__m64 const __A, int const __N)
1378 const int shiftr = (__N & 3) * 16;
1380 return ((__A >> shiftr) & 0xffff);
1383 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1384 _m_pextrw (__m64 const __A, int const __N)
1386 return _mm_extract_pi16 (__A, __N);
1389 /* Inserts word D into one of four words of A. The selector N must be
1390 immediate. */
1391 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1392 _mm_insert_pi16 (__m64 const __A, int const __D, int const __N)
1394 const int shiftl = (__N & 3) * 16;
1395 const __m64 shiftD = (const __m64) __D << shiftl;
1396 const __m64 mask = 0xffffUL << shiftl;
1397 __m64 result = (__A & (~mask)) | (shiftD & mask);
1399 return (result);
1402 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1403 _m_pinsrw (__m64 const __A, int const __D, int const __N)
1405 return _mm_insert_pi16 (__A, __D, __N);
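/* A minimal sketch of the lane numbering (values are illustrative only):
   lane 0 is the least-significant halfword of the 64-bit value, so w2 below
   is 0x0003 and v2 is 0x0004000300027fffUL:

     __m64 v  = 0x0004000300020001UL;
     int   w2 = _mm_extract_pi16 (v, 2);
     __m64 v2 = _mm_insert_pi16 (v, 0x7fff, 0);
*/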
1408 /* Compute the element-wise maximum of signed 16-bit values. */
1409 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1411 _mm_max_pi16 (__m64 __A, __m64 __B)
1413 #if _ARCH_PWR8
1414 __vector signed short a, b, r;
1415 __vector __bool short c;
1417 a = (__vector signed short)vec_splats (__A);
1418 b = (__vector signed short)vec_splats (__B);
1419 c = (__vector __bool short)vec_cmpgt (a, b);
1420 r = vec_sel (b, a, c);
1421 return (__builtin_unpack_vector_int128 ((__vector __int128_t)r, 0));
1422 #else
1423 __m64_union m1, m2, res;
1425 m1.as_m64 = __A;
1426 m2.as_m64 = __B;
1428 res.as_short[0] =
1429 (m1.as_short[0] > m2.as_short[0]) ? m1.as_short[0] : m2.as_short[0];
1430 res.as_short[1] =
1431 (m1.as_short[1] > m2.as_short[1]) ? m1.as_short[1] : m2.as_short[1];
1432 res.as_short[2] =
1433 (m1.as_short[2] > m2.as_short[2]) ? m1.as_short[2] : m2.as_short[2];
1434 res.as_short[3] =
1435 (m1.as_short[3] > m2.as_short[3]) ? m1.as_short[3] : m2.as_short[3];
1437 return (__m64) res.as_m64;
1438 #endif
1441 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1442 _m_pmaxsw (__m64 __A, __m64 __B)
1444 return _mm_max_pi16 (__A, __B);
1447 /* Compute the element-wise maximum of unsigned 8-bit values. */
1448 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1449 _mm_max_pu8 (__m64 __A, __m64 __B)
1451 #if _ARCH_PWR8
1452 __vector unsigned char a, b, r;
1453 __vector __bool char c;
1455 a = (__vector unsigned char)vec_splats (__A);
1456 b = (__vector unsigned char)vec_splats (__B);
1457 c = (__vector __bool char)vec_cmpgt (a, b);
1458 r = vec_sel (b, a, c);
1459 return (__builtin_unpack_vector_int128 ((__vector __int128_t)r, 0));
1460 #else
1461 __m64_union m1, m2, res;
1462 long i;
1464 m1.as_m64 = __A;
1465 m2.as_m64 = __B;
1468 for (i = 0; i < 8; i++)
1469 res.as_char[i] =
1470 ((unsigned char) m1.as_char[i] > (unsigned char) m2.as_char[i]) ?
1471 m1.as_char[i] : m2.as_char[i];
1473 return (__m64) res.as_m64;
1474 #endif
1477 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1478 _m_pmaxub (__m64 __A, __m64 __B)
1480 return _mm_max_pu8 (__A, __B);
1483 /* Compute the element-wise minimum of signed 16-bit values. */
1484 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1485 _mm_min_pi16 (__m64 __A, __m64 __B)
1487 #if _ARCH_PWR8
1488 __vector signed short a, b, r;
1489 __vector __bool short c;
1491 a = (__vector signed short)vec_splats (__A);
1492 b = (__vector signed short)vec_splats (__B);
1493 c = (__vector __bool short)vec_cmplt (a, b);
1494 r = vec_sel (b, a, c);
1495 return (__builtin_unpack_vector_int128 ((__vector __int128_t)r, 0));
1496 #else
1497 __m64_union m1, m2, res;
1499 m1.as_m64 = __A;
1500 m2.as_m64 = __B;
1502 res.as_short[0] =
1503 (m1.as_short[0] < m2.as_short[0]) ? m1.as_short[0] : m2.as_short[0];
1504 res.as_short[1] =
1505 (m1.as_short[1] < m2.as_short[1]) ? m1.as_short[1] : m2.as_short[1];
1506 res.as_short[2] =
1507 (m1.as_short[2] < m2.as_short[2]) ? m1.as_short[2] : m2.as_short[2];
1508 res.as_short[3] =
1509 (m1.as_short[3] < m2.as_short[3]) ? m1.as_short[3] : m2.as_short[3];
1511 return (__m64) res.as_m64;
1512 #endif
1515 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1516 _m_pminsw (__m64 __A, __m64 __B)
1518 return _mm_min_pi16 (__A, __B);
1521 /* Compute the element-wise minimum of unsigned 8-bit values. */
1522 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1523 _mm_min_pu8 (__m64 __A, __m64 __B)
1525 #if _ARCH_PWR8
1526 __vector unsigned char a, b, r;
1527 __vector __bool char c;
1529 a = (__vector unsigned char)vec_splats (__A);
1530 b = (__vector unsigned char)vec_splats (__B);
1531 c = (__vector __bool char)vec_cmplt (a, b);
1532 r = vec_sel (b, a, c);
1533 return (__builtin_unpack_vector_int128 ((__vector __int128_t)r, 0));
1534 #else
1535 __m64_union m1, m2, res;
1536 long i;
1538 m1.as_m64 = __A;
1539 m2.as_m64 = __B;
1542 for (i = 0; i < 8; i++)
1543 res.as_char[i] =
1544 ((unsigned char) m1.as_char[i] < (unsigned char) m2.as_char[i]) ?
1545 m1.as_char[i] : m2.as_char[i];
1547 return (__m64) res.as_m64;
1548 #endif
1551 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1552 _m_pminub (__m64 __A, __m64 __B)
1554 return _mm_min_pu8 (__A, __B);
1557 /* Create an 8-bit mask of the signs of 8-bit values. */
1558 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1559 _mm_movemask_pi8 (__m64 __A)
1561 unsigned long p = 0x0008101820283038UL; /* permute control for sign bits */
1563 return __builtin_bpermd (p, __A);
1566 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1567 _m_pmovmskb (__m64 __A)
1569 return _mm_movemask_pi8 (__A);
1572 /* Multiply four unsigned 16-bit values in A by four unsigned 16-bit values
1573 in B and produce the high 16 bits of the 32-bit results. */
1574 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1575 _mm_mulhi_pu16 (__m64 __A, __m64 __B)
1577 __vector unsigned short a, b;
1578 __vector unsigned short c;
1579 __vector unsigned int w0, w1;
1580 __vector unsigned char xform1 = {
1581 0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17,
1582 0x0A, 0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
1585 a = (__vector unsigned short)vec_splats (__A);
1586 b = (__vector unsigned short)vec_splats (__B);
1588 w0 = vec_vmuleuh (a, b);
1589 w1 = vec_vmulouh (a, b);
1590 c = (__vector unsigned short)vec_perm (w0, w1, xform1);
1592 return (__builtin_unpack_vector_int128 ((__vector __int128)c, 0));
1595 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1596 _m_pmulhuw (__m64 __A, __m64 __B)
1598 return _mm_mulhi_pu16 (__A, __B);
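/* A minimal sketch (values are illustrative only): each 16-bit result lane
   holds the high half of the corresponding 32-bit unsigned product, so lane
   0 of r below is (0x8000 * 0x0004) >> 16 = 0x0002:

     __m64 a = 0x0000000000008000UL;
     __m64 b = 0x0000000000000004UL;
     __m64 r = _mm_mulhi_pu16 (a, b);
*/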
1601 /* Return a combination of the four 16-bit values in A. The selector
1602 must be an immediate. */
1603 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1604 _mm_shuffle_pi16 (__m64 __A, int const __N)
1606 unsigned long element_selector_10 = __N & 0x03;
1607 unsigned long element_selector_32 = (__N >> 2) & 0x03;
1608 unsigned long element_selector_54 = (__N >> 4) & 0x03;
1609 unsigned long element_selector_76 = (__N >> 6) & 0x03;
1610 static const unsigned short permute_selectors[4] =
1612 #ifdef __LITTLE_ENDIAN__
1613 0x0908, 0x0B0A, 0x0D0C, 0x0F0E
1614 #elif __BIG_ENDIAN__
1615 0x0607, 0x0405, 0x0203, 0x0001
1616 #endif
1618 __m64_union t;
1619 __vector __m64 a, p, r;
1621 #ifdef __LITTLE_ENDIAN__
1622 t.as_short[0] = permute_selectors[element_selector_10];
1623 t.as_short[1] = permute_selectors[element_selector_32];
1624 t.as_short[2] = permute_selectors[element_selector_54];
1625 t.as_short[3] = permute_selectors[element_selector_76];
1626 #elif __BIG_ENDIAN__
1627 t.as_short[3] = permute_selectors[element_selector_10];
1628 t.as_short[2] = permute_selectors[element_selector_32];
1629 t.as_short[1] = permute_selectors[element_selector_54];
1630 t.as_short[0] = permute_selectors[element_selector_76];
1631 #endif
1632 p = vec_splats (t.as_m64);
1633 a = vec_splats (__A);
1634 r = vec_perm (a, a, (__vector unsigned char)p);
1635 return (__builtin_unpack_vector_int128 ((__vector __int128)r, 0));
1638 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1639 _m_pshufw (__m64 __A, int const __N)
1641 return _mm_shuffle_pi16 (__A, __N);
1644 /* Conditionally store byte elements of A into P. The high bit of each
1645 byte in the selector N determines whether the corresponding byte from
1646 A is stored. */
1647 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1648 _mm_maskmove_si64 (__m64 __A, __m64 __N, char *__P)
1650 __m64 hibit = 0x8080808080808080UL;
1651 __m64 mask, tmp;
1652 __m64 *p = (__m64*)__P;
1654 tmp = *p;
1655 mask = _mm_cmpeq_pi8 ((__N & hibit), hibit);
1656 tmp = (tmp & (~mask)) | (__A & mask);
1657 *p = tmp;
1660 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1661 _m_maskmovq (__m64 __A, __m64 __N, char *__P)
1663 _mm_maskmove_si64 (__A, __N, __P);
1666 /* Compute the rounded averages of the unsigned 8-bit values in A and B. */
1667 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1668 _mm_avg_pu8 (__m64 __A, __m64 __B)
1670 __vector unsigned char a, b, c;
1672 a = (__vector unsigned char)vec_splats (__A);
1673 b = (__vector unsigned char)vec_splats (__B);
1674 c = vec_avg (a, b);
1675 return (__builtin_unpack_vector_int128 ((__vector __int128)c, 0));
1678 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1679 _m_pavgb (__m64 __A, __m64 __B)
1681 return _mm_avg_pu8 (__A, __B);
1684 /* Compute the rounded averages of the unsigned 16-bit values in A and B. */
1685 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1686 _mm_avg_pu16 (__m64 __A, __m64 __B)
1688 __vector unsigned short a, b, c;
1690 a = (__vector unsigned short)vec_splats (__A);
1691 b = (__vector unsigned short)vec_splats (__B);
1692 c = vec_avg (a, b);
1693 return (__builtin_unpack_vector_int128 ((__vector __int128)c, 0));
1696 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1697 _m_pavgw (__m64 __A, __m64 __B)
1699 return _mm_avg_pu16 (__A, __B);
1702 /* Compute the sum of the absolute differences of the unsigned 8-bit
1703 values in A and B. Return the value in the lower 16-bit word; the
1704 upper words are cleared. */
1705 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1706 _mm_sad_pu8 (__m64 __A, __m64 __B)
1708 __vector unsigned char a, b;
1709 __vector unsigned char vmin, vmax, vabsdiff;
1710 __vector signed int vsum;
1711 const __vector unsigned int zero =
1712 { 0, 0, 0, 0 };
1713 unsigned short result;
1715 a = (__vector unsigned char) __builtin_pack_vector_int128 (0UL, __A);
1716 b = (__vector unsigned char) __builtin_pack_vector_int128 (0UL, __B);
1717 vmin = vec_min (a, b);
1718 vmax = vec_max (a, b);
1719 vabsdiff = vec_sub (vmax, vmin);
1720 /* Sum four groups of bytes into integers. */
1721 vsum = (__vector signed int) vec_sum4s (vabsdiff, zero);
1722 /* Sum across four integers with integer result. */
1723 vsum = vec_sums (vsum, (__vector signed int) zero);
1724 /* The sum is in the rightmost 32 bits of the vector result.
1725 Transfer to a GPR and truncate to 16 bits. */
1726 result = vsum[3];
1727 return (result);
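/* A minimal sketch (values are illustrative only): the result is the sum of
   the absolute byte differences, so s below is 1 + 2 + ... + 8 = 36:

     __m64 a = 0x0807060504030201UL;
     __m64 b = 0x0000000000000000UL;
     __m64 s = _mm_sad_pu8 (a, b);
*/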
1730 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1731 _m_psadbw (__m64 __A, __m64 __B)
1733 return _mm_sad_pu8 (__A, __B);
1736 /* Stores the data in A to the address P without polluting the caches. */
1737 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1738 _mm_stream_pi (__m64 *__P, __m64 __A)
1740 /* Use the data cache block touch for store transient. */
1741 __asm__ (
1742 " dcbtstt 0,%0"
1744 : "b" (__P)
1745 : "memory"
1747 *__P = __A;
1750 /* Likewise. The address must be 16-byte aligned. */
1751 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1752 _mm_stream_ps (float *__P, __m128 __A)
1754 /* Use the data cache block touch for store transient. */
1755 __asm__ (
1756 " dcbtstt 0,%0"
1758 : "b" (__P)
1759 : "memory"
1761 _mm_store_ps (__P, __A);
1764 /* Guarantees that every preceding store is globally visible before
1765 any subsequent store. */
1766 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1767 _mm_sfence (void)
1769 /* Generate a lightweight sync. */
1770 __atomic_thread_fence (__ATOMIC_RELEASE);
1773 /* The execution of the next instruction is delayed by an implementation
1774 specific amount of time. The instruction does not modify the
1775 architectural state. This is after the pop_options pragma because
1776 it does not require SSE support in the processor; the encoding is a
1777 nop on processors that do not support it. */
1778 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1779 _mm_pause (void)
1781 /* There is no exact match with this construct, but the following is
1782 close to the desired effect. */
1783 #if _ARCH_PWR8
1784 /* On power8 and later processors we can depend on Program Priority
1785 (PRI) and the associated "very low" PRI setting. Since we don't know
1786 what PRI this thread is running at, we: 1) save the current PRI
1787 from the PPR SPR into a local GPR, 2) set the PRI to "very low"
1788 via the special or 31,31,31 encoding, 3) issue an "isync" to
1789 ensure the PRI change takes effect before we execute any more
1790 instructions.
1791 Now we can execute a lwsync (release barrier) while we execute
1792 this thread at "very low" PRI. Finally we restore the original
1793 PRI and continue execution. */
1794 unsigned long __PPR;
1796 __asm__ volatile (
1797 " mfppr %0;"
1798 " or 31,31,31;"
1799 " isync;"
1800 " lwsync;"
1801 " isync;"
1802 " mtppr %0;"
1803 : "=r" (__PPR)
1805 : "memory"
1807 #else
1808 /* For older processors where we may not even have Program Priority
1809 controls we can only depend on Heavy Weight Sync. */
1810 __atomic_thread_fence (__ATOMIC_SEQ_CST);
1811 #endif
1814 /* Transpose the 4x4 matrix composed of row[0-3]. */
1815 #define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
1816 do { \
1817 __v4sf __r0 = (row0), __r1 = (row1), __r2 = (row2), __r3 = (row3); \
1818 __v4sf __t0 = vec_vmrghw (__r0, __r1); \
1819 __v4sf __t1 = vec_vmrghw (__r2, __r3); \
1820 __v4sf __t2 = vec_vmrglw (__r0, __r1); \
1821 __v4sf __t3 = vec_vmrglw (__r2, __r3); \
1822 (row0) = (__v4sf)vec_mergeh ((__vector long long)__t0, \
1823 (__vector long long)__t1); \
1824 (row1) = (__v4sf)vec_mergel ((__vector long long)__t0, \
1825 (__vector long long)__t1); \
1826 (row2) = (__v4sf)vec_mergeh ((__vector long long)__t2, \
1827 (__vector long long)__t3); \
1828 (row3) = (__v4sf)vec_mergel ((__vector long long)__t2, \
1829 (__vector long long)__t3); \
1830 } while (0)
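/* A minimal sketch of the transpose macro (values are illustrative only):
   after the call, r0 is { 0, 4, 8, 12 }, r1 is { 1, 5, 9, 13 }, r2 is
   { 2, 6, 10, 14 } and r3 is { 3, 7, 11, 15 }:

     __m128 r0 = _mm_setr_ps ( 0.0f,  1.0f,  2.0f,  3.0f);
     __m128 r1 = _mm_setr_ps ( 4.0f,  5.0f,  6.0f,  7.0f);
     __m128 r2 = _mm_setr_ps ( 8.0f,  9.0f, 10.0f, 11.0f);
     __m128 r3 = _mm_setr_ps (12.0f, 13.0f, 14.0f, 15.0f);
     _MM_TRANSPOSE4_PS (r0, r1, r2, r3);
*/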
1832 /* For backward source compatibility. */
1833 //# include <emmintrin.h>
1835 #endif /* _XMMINTRIN_H_INCLUDED */