/*===---- xmmintrin.h - SSE intrinsics -------------------------------------===
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 *
 *===-----------------------------------------------------------------------===
 */
/* 128-bit (16-byte) vector types used by the intrinsics in this header:
 * four signed ints, four floats (internal and public spelling). */
typedef int __v4si __attribute__((__vector_size__(16)));
typedef float __v4sf __attribute__((__vector_size__(16)));
typedef float __m128 __attribute__((__vector_size__(16)));

/* Unsigned variant: four unsigned ints in 16 bytes. */
typedef unsigned int __v4su __attribute__((__vector_size__(16)));
/* This header should only be included in a hosted environment as it depends on
 * a standard library to provide allocation routines. */
39 #include <mm_malloc.h>
/* Define the default attributes for the functions in this file: every
 * intrinsic is always inlined, marked nodebug, and built for the "sse"
 * target. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse")))
45 /// \brief Adds the 32-bit float values in the low-order bits of the operands.
47 /// \headerfile <x86intrin.h>
49 /// This intrinsic corresponds to the \c VADDSS / ADDSS instructions.
52 /// A 128-bit vector of [4 x float] containing one of the source operands.
53 /// The lower 32 bits of this operand are used in the calculation.
55 /// A 128-bit vector of [4 x float] containing one of the source operands.
56 /// The lower 32 bits of this operand are used in the calculation.
57 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the sum
58 /// of the lower 32 bits of both operands. The upper 96 bits are copied from
59 /// the upper 96 bits of the first source operand.
60 static __inline__ __m128 __DEFAULT_FN_ATTRS
61 _mm_add_ss(__m128 __a
, __m128 __b
)
67 /// \brief Adds two 128-bit vectors of [4 x float], and returns the results of
70 /// \headerfile <x86intrin.h>
72 /// This intrinsic corresponds to the \c VADDPS / ADDPS instructions.
75 /// A 128-bit vector of [4 x float] containing one of the source operands.
77 /// A 128-bit vector of [4 x float] containing one of the source operands.
78 /// \returns A 128-bit vector of [4 x float] containing the sums of both
80 static __inline__ __m128 __DEFAULT_FN_ATTRS
81 _mm_add_ps(__m128 __a
, __m128 __b
)
83 return (__m128
)((__v4sf
)__a
+ (__v4sf
)__b
);
86 /// \brief Subtracts the 32-bit float value in the low-order bits of the second
87 /// operand from the corresponding value in the first operand.
89 /// \headerfile <x86intrin.h>
91 /// This intrinsic corresponds to the \c VSUBSS / SUBSS instructions.
94 /// A 128-bit vector of [4 x float] containing the minuend. The lower 32 bits
95 /// of this operand are used in the calculation.
97 /// A 128-bit vector of [4 x float] containing the subtrahend. The lower 32
98 /// bits of this operand are used in the calculation.
99 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
100 /// difference of the lower 32 bits of both operands. The upper 96 bits are
101 /// copied from the upper 96 bits of the first source operand.
102 static __inline__ __m128 __DEFAULT_FN_ATTRS
103 _mm_sub_ss(__m128 __a
, __m128 __b
)
109 /// \brief Subtracts each of the values of the second operand from the first
110 /// operand, both of which are 128-bit vectors of [4 x float] and returns
111 /// the results of the subtraction.
113 /// \headerfile <x86intrin.h>
115 /// This intrinsic corresponds to the \c VSUBPS / SUBPS instructions.
118 /// A 128-bit vector of [4 x float] containing the minuend.
120 /// A 128-bit vector of [4 x float] containing the subtrahend.
121 /// \returns A 128-bit vector of [4 x float] containing the differences between
123 static __inline__ __m128 __DEFAULT_FN_ATTRS
124 _mm_sub_ps(__m128 __a
, __m128 __b
)
126 return (__m128
)((__v4sf
)__a
- (__v4sf
)__b
);
129 /// \brief Multiplies two 32-bit float values in the low-order bits of the
132 /// \headerfile <x86intrin.h>
134 /// This intrinsic corresponds to the \c VMULSS / MULSS instructions.
137 /// A 128-bit vector of [4 x float] containing one of the source operands.
138 /// The lower 32 bits of this operand are used in the calculation.
140 /// A 128-bit vector of [4 x float] containing one of the source operands.
141 /// The lower 32 bits of this operand are used in the calculation.
142 /// \returns A 128-bit vector of [4 x float] containing the product of the lower
143 /// 32 bits of both operands. The upper 96 bits are copied from the upper 96
144 /// bits of the first source operand.
145 static __inline__ __m128 __DEFAULT_FN_ATTRS
146 _mm_mul_ss(__m128 __a
, __m128 __b
)
152 /// \brief Multiplies two 128-bit vectors of [4 x float] and returns the
153 /// results of the multiplication.
155 /// \headerfile <x86intrin.h>
157 /// This intrinsic corresponds to the \c VMULPS / MULPS instructions.
160 /// A 128-bit vector of [4 x float] containing one of the source operands.
162 /// A 128-bit vector of [4 x float] containing one of the source operands.
163 /// \returns A 128-bit vector of [4 x float] containing the products of both
165 static __inline__ __m128 __DEFAULT_FN_ATTRS
166 _mm_mul_ps(__m128 __a
, __m128 __b
)
168 return (__m128
)((__v4sf
)__a
* (__v4sf
)__b
);
171 /// \brief Divides the value in the low-order 32 bits of the first operand by
172 /// the corresponding value in the second operand.
174 /// \headerfile <x86intrin.h>
176 /// This intrinsic corresponds to the \c VDIVSS / DIVSS instructions.
179 /// A 128-bit vector of [4 x float] containing the dividend. The lower 32
180 /// bits of this operand are used in the calculation.
182 /// A 128-bit vector of [4 x float] containing the divisor. The lower 32 bits
183 /// of this operand are used in the calculation.
184 /// \returns A 128-bit vector of [4 x float] containing the quotients of the
185 /// lower 32 bits of both operands. The upper 96 bits are copied from the
186 /// upper 96 bits of the first source operand.
187 static __inline__ __m128 __DEFAULT_FN_ATTRS
188 _mm_div_ss(__m128 __a
, __m128 __b
)
194 /// \brief Divides two 128-bit vectors of [4 x float].
196 /// \headerfile <x86intrin.h>
198 /// This intrinsic corresponds to the \c VDIVPS / DIVPS instructions.
201 /// A 128-bit vector of [4 x float] containing the dividend.
203 /// A 128-bit vector of [4 x float] containing the divisor.
204 /// \returns A 128-bit vector of [4 x float] containing the quotients of both
206 static __inline__ __m128 __DEFAULT_FN_ATTRS
207 _mm_div_ps(__m128 __a
, __m128 __b
)
209 return (__m128
)((__v4sf
)__a
/ (__v4sf
)__b
);
212 /// \brief Calculates the square root of the value stored in the low-order bits
213 /// of a 128-bit vector of [4 x float].
215 /// \headerfile <x86intrin.h>
217 /// This intrinsic corresponds to the \c VSQRTSS / SQRTSS instructions.
220 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
221 /// used in the calculation.
222 /// \returns A 128-bit vector of [4 x float] containing the square root of the
223 /// value in the low-order bits of the operand.
224 static __inline__ __m128 __DEFAULT_FN_ATTRS
225 _mm_sqrt_ss(__m128 __a
)
227 __m128 __c
= __builtin_ia32_sqrtss((__v4sf
)__a
);
228 return (__m128
) { __c
[0], __a
[1], __a
[2], __a
[3] };
231 /// \brief Calculates the square roots of the values stored in a 128-bit vector
234 /// \headerfile <x86intrin.h>
236 /// This intrinsic corresponds to the \c VSQRTPS / SQRTPS instructions.
239 /// A 128-bit vector of [4 x float].
240 /// \returns A 128-bit vector of [4 x float] containing the square roots of the
241 /// values in the operand.
242 static __inline__ __m128 __DEFAULT_FN_ATTRS
243 _mm_sqrt_ps(__m128 __a
)
245 return __builtin_ia32_sqrtps((__v4sf
)__a
);
248 /// \brief Calculates the approximate reciprocal of the value stored in the
249 /// low-order bits of a 128-bit vector of [4 x float].
251 /// \headerfile <x86intrin.h>
253 /// This intrinsic corresponds to the \c VRCPSS / RCPSS instructions.
256 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
257 /// used in the calculation.
258 /// \returns A 128-bit vector of [4 x float] containing the approximate
259 /// reciprocal of the value in the low-order bits of the operand.
260 static __inline__ __m128 __DEFAULT_FN_ATTRS
261 _mm_rcp_ss(__m128 __a
)
263 __m128 __c
= __builtin_ia32_rcpss((__v4sf
)__a
);
264 return (__m128
) { __c
[0], __a
[1], __a
[2], __a
[3] };
267 /// \brief Calculates the approximate reciprocals of the values stored in a
268 /// 128-bit vector of [4 x float].
270 /// \headerfile <x86intrin.h>
272 /// This intrinsic corresponds to the \c VRCPPS / RCPPS instructions.
275 /// A 128-bit vector of [4 x float].
276 /// \returns A 128-bit vector of [4 x float] containing the approximate
277 /// reciprocals of the values in the operand.
278 static __inline__ __m128 __DEFAULT_FN_ATTRS
279 _mm_rcp_ps(__m128 __a
)
281 return __builtin_ia32_rcpps((__v4sf
)__a
);
284 /// \brief Calculates the approximate reciprocal of the square root of the value
285 /// stored in the low-order bits of a 128-bit vector of [4 x float].
287 /// \headerfile <x86intrin.h>
289 /// This intrinsic corresponds to the \c VRSQRTSS / RSQRTSS instructions.
292 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
293 /// used in the calculation.
294 /// \returns A 128-bit vector of [4 x float] containing the approximate
295 /// reciprocal of the square root of the value in the low-order bits of the
297 static __inline__ __m128 __DEFAULT_FN_ATTRS
298 _mm_rsqrt_ss(__m128 __a
)
300 __m128 __c
= __builtin_ia32_rsqrtss((__v4sf
)__a
);
301 return (__m128
) { __c
[0], __a
[1], __a
[2], __a
[3] };
304 /// \brief Calculates the approximate reciprocals of the square roots of the
305 /// values stored in a 128-bit vector of [4 x float].
307 /// \headerfile <x86intrin.h>
309 /// This intrinsic corresponds to the \c VRSQRTPS / RSQRTPS instructions.
312 /// A 128-bit vector of [4 x float].
313 /// \returns A 128-bit vector of [4 x float] containing the approximate
314 /// reciprocals of the square roots of the values in the operand.
315 static __inline__ __m128 __DEFAULT_FN_ATTRS
316 _mm_rsqrt_ps(__m128 __a
)
318 return __builtin_ia32_rsqrtps((__v4sf
)__a
);
321 /// \brief Compares two 32-bit float values in the low-order bits of both
322 /// operands and returns the lesser value in the low-order bits of the
323 /// vector of [4 x float].
325 /// \headerfile <x86intrin.h>
327 /// This intrinsic corresponds to the \c VMINSS / MINSS instructions.
330 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
331 /// 32 bits of this operand are used in the comparison.
333 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
334 /// 32 bits of this operand are used in the comparison.
335 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
336 /// minimum value between both operands. The upper 96 bits are copied from
337 /// the upper 96 bits of the first source operand.
338 static __inline__ __m128 __DEFAULT_FN_ATTRS
339 _mm_min_ss(__m128 __a
, __m128 __b
)
341 return __builtin_ia32_minss((__v4sf
)__a
, (__v4sf
)__b
);
344 /// \brief Compares two 128-bit vectors of [4 x float] and returns the
345 /// lesser of each pair of values.
347 /// \headerfile <x86intrin.h>
349 /// This intrinsic corresponds to the \c VMINPS / MINPS instructions.
352 /// A 128-bit vector of [4 x float] containing one of the operands.
354 /// A 128-bit vector of [4 x float] containing one of the operands.
355 /// \returns A 128-bit vector of [4 x float] containing the minimum values
356 /// between both operands.
357 static __inline__ __m128 __DEFAULT_FN_ATTRS
358 _mm_min_ps(__m128 __a
, __m128 __b
)
360 return __builtin_ia32_minps((__v4sf
)__a
, (__v4sf
)__b
);
363 /// \brief Compares two 32-bit float values in the low-order bits of both
364 /// operands and returns the greater value in the low-order bits of
365 /// a vector [4 x float].
367 /// \headerfile <x86intrin.h>
369 /// This intrinsic corresponds to the \c VMAXSS / MAXSS instructions.
372 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
373 /// 32 bits of this operand are used in the comparison.
375 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
376 /// 32 bits of this operand are used in the comparison.
377 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
378 /// maximum value between both operands. The upper 96 bits are copied from
379 /// the upper 96 bits of the first source operand.
380 static __inline__ __m128 __DEFAULT_FN_ATTRS
381 _mm_max_ss(__m128 __a
, __m128 __b
)
383 return __builtin_ia32_maxss((__v4sf
)__a
, (__v4sf
)__b
);
386 /// \brief Compares two 128-bit vectors of [4 x float] and returns the greater
387 /// of each pair of values.
389 /// \headerfile <x86intrin.h>
391 /// This intrinsic corresponds to the \c VMAXPS / MAXPS instructions.
394 /// A 128-bit vector of [4 x float] containing one of the operands.
396 /// A 128-bit vector of [4 x float] containing one of the operands.
397 /// \returns A 128-bit vector of [4 x float] containing the maximum values
398 /// between both operands.
399 static __inline__ __m128 __DEFAULT_FN_ATTRS
400 _mm_max_ps(__m128 __a
, __m128 __b
)
402 return __builtin_ia32_maxps((__v4sf
)__a
, (__v4sf
)__b
);
405 /// \brief Performs a bitwise AND of two 128-bit vectors of [4 x float].
407 /// \headerfile <x86intrin.h>
409 /// This intrinsic corresponds to the \c VANDPS / ANDPS instructions.
412 /// A 128-bit vector containing one of the source operands.
414 /// A 128-bit vector containing one of the source operands.
415 /// \returns A 128-bit vector of [4 x float] containing the bitwise AND of the
416 /// values between both operands.
417 static __inline__ __m128 __DEFAULT_FN_ATTRS
418 _mm_and_ps(__m128 __a
, __m128 __b
)
420 return (__m128
)((__v4su
)__a
& (__v4su
)__b
);
423 /// \brief Performs a bitwise AND of two 128-bit vectors of [4 x float], using
424 /// the one's complement of the values contained in the first source
427 /// \headerfile <x86intrin.h>
429 /// This intrinsic corresponds to the \c VANDNPS / ANDNPS instructions.
432 /// A 128-bit vector of [4 x float] containing the first source operand. The
433 /// one's complement of this value is used in the bitwise AND.
435 /// A 128-bit vector of [4 x float] containing the second source operand.
436 /// \returns A 128-bit vector of [4 x float] containing the bitwise AND of the
437 /// one's complement of the first operand and the values in the second
439 static __inline__ __m128 __DEFAULT_FN_ATTRS
440 _mm_andnot_ps(__m128 __a
, __m128 __b
)
442 return (__m128
)(~(__v4su
)__a
& (__v4su
)__b
);
445 /// \brief Performs a bitwise OR of two 128-bit vectors of [4 x float].
447 /// \headerfile <x86intrin.h>
449 /// This intrinsic corresponds to the \c VORPS / ORPS instructions.
452 /// A 128-bit vector of [4 x float] containing one of the source operands.
454 /// A 128-bit vector of [4 x float] containing one of the source operands.
455 /// \returns A 128-bit vector of [4 x float] containing the bitwise OR of the
456 /// values between both operands.
457 static __inline__ __m128 __DEFAULT_FN_ATTRS
458 _mm_or_ps(__m128 __a
, __m128 __b
)
460 return (__m128
)((__v4su
)__a
| (__v4su
)__b
);
463 /// \brief Performs a bitwise exclusive OR of two 128-bit vectors of
466 /// \headerfile <x86intrin.h>
468 /// This intrinsic corresponds to the \c VXORPS / XORPS instructions.
471 /// A 128-bit vector of [4 x float] containing one of the source operands.
473 /// A 128-bit vector of [4 x float] containing one of the source operands.
474 /// \returns A 128-bit vector of [4 x float] containing the bitwise exclusive OR
475 /// of the values between both operands.
476 static __inline__ __m128 __DEFAULT_FN_ATTRS
477 _mm_xor_ps(__m128 __a
, __m128 __b
)
479 return (__m128
)((__v4su
)__a
^ (__v4su
)__b
);
482 /// \brief Compares two 32-bit float values in the low-order bits of both
483 /// operands for equality and returns the result of the comparison in the
484 /// low-order bits of a vector [4 x float].
486 /// \headerfile <x86intrin.h>
488 /// This intrinsic corresponds to the \c VCMPEQSS / CMPEQSS instructions.
491 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
492 /// 32 bits of this operand are used in the comparison.
494 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
495 /// 32 bits of this operand are used in the comparison.
496 /// \returns A 128-bit vector of [4 x float] containing the comparison results
497 /// in the low-order bits.
498 static __inline__ __m128 __DEFAULT_FN_ATTRS
499 _mm_cmpeq_ss(__m128 __a
, __m128 __b
)
501 return (__m128
)__builtin_ia32_cmpeqss((__v4sf
)__a
, (__v4sf
)__b
);
504 /// \brief Compares each of the corresponding 32-bit float values of the
505 /// 128-bit vectors of [4 x float] for equality.
507 /// \headerfile <x86intrin.h>
509 /// This intrinsic corresponds to the \c VCMPEQPS / CMPEQPS instructions.
512 /// A 128-bit vector of [4 x float].
514 /// A 128-bit vector of [4 x float].
515 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
516 static __inline__ __m128 __DEFAULT_FN_ATTRS
517 _mm_cmpeq_ps(__m128 __a
, __m128 __b
)
519 return (__m128
)__builtin_ia32_cmpeqps((__v4sf
)__a
, (__v4sf
)__b
);
522 /// \brief Compares two 32-bit float values in the low-order bits of both
523 /// operands to determine if the value in the first operand is less than the
524 /// corresponding value in the second operand and returns the result of the
525 /// comparison in the low-order bits of a vector of [4 x float].
527 /// \headerfile <x86intrin.h>
529 /// This intrinsic corresponds to the \c VCMPLTSS / CMPLTSS instructions.
532 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
533 /// 32 bits of this operand are used in the comparison.
535 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
536 /// 32 bits of this operand are used in the comparison.
537 /// \returns A 128-bit vector of [4 x float] containing the comparison results
538 /// in the low-order bits.
539 static __inline__ __m128 __DEFAULT_FN_ATTRS
540 _mm_cmplt_ss(__m128 __a
, __m128 __b
)
542 return (__m128
)__builtin_ia32_cmpltss((__v4sf
)__a
, (__v4sf
)__b
);
545 /// \brief Compares each of the corresponding 32-bit float values of the
546 /// 128-bit vectors of [4 x float] to determine if the values in the first
547 /// operand are less than those in the second operand.
549 /// \headerfile <x86intrin.h>
551 /// This intrinsic corresponds to the \c VCMPLTPS / CMPLTPS instructions.
554 /// A 128-bit vector of [4 x float].
556 /// A 128-bit vector of [4 x float].
557 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
558 static __inline__ __m128 __DEFAULT_FN_ATTRS
559 _mm_cmplt_ps(__m128 __a
, __m128 __b
)
561 return (__m128
)__builtin_ia32_cmpltps((__v4sf
)__a
, (__v4sf
)__b
);
564 /// \brief Compares two 32-bit float values in the low-order bits of both
565 /// operands to determine if the value in the first operand is less than or
566 /// equal to the corresponding value in the second operand and returns the
567 /// result of the comparison in the low-order bits of a vector of
570 /// \headerfile <x86intrin.h>
572 /// This intrinsic corresponds to the \c VCMPLESS / CMPLESS instructions.
575 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
576 /// 32 bits of this operand are used in the comparison.
578 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
579 /// 32 bits of this operand are used in the comparison.
580 /// \returns A 128-bit vector of [4 x float] containing the comparison results
581 /// in the low-order bits.
582 static __inline__ __m128 __DEFAULT_FN_ATTRS
583 _mm_cmple_ss(__m128 __a
, __m128 __b
)
585 return (__m128
)__builtin_ia32_cmpless((__v4sf
)__a
, (__v4sf
)__b
);
588 /// \brief Compares each of the corresponding 32-bit float values of the
589 /// 128-bit vectors of [4 x float] to determine if the values in the first
590 /// operand are less than or equal to those in the second operand.
592 /// \headerfile <x86intrin.h>
594 /// This intrinsic corresponds to the \c VCMPLEPS / CMPLEPS instructions.
597 /// A 128-bit vector of [4 x float].
599 /// A 128-bit vector of [4 x float].
600 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
601 static __inline__ __m128 __DEFAULT_FN_ATTRS
602 _mm_cmple_ps(__m128 __a
, __m128 __b
)
604 return (__m128
)__builtin_ia32_cmpleps((__v4sf
)__a
, (__v4sf
)__b
);
607 /// \brief Compares two 32-bit float values in the low-order bits of both
608 /// operands to determine if the value in the first operand is greater than
609 /// the corresponding value in the second operand and returns the result of
610 /// the comparison in the low-order bits of a vector of [4 x float].
612 /// \headerfile <x86intrin.h>
614 /// This intrinsic corresponds to the \c VCMPLTSS / CMPLTSS instructions.
617 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
618 /// 32 bits of this operand are used in the comparison.
620 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
621 /// 32 bits of this operand are used in the comparison.
622 /// \returns A 128-bit vector of [4 x float] containing the comparison results
623 /// in the low-order bits.
624 static __inline__ __m128 __DEFAULT_FN_ATTRS
625 _mm_cmpgt_ss(__m128 __a
, __m128 __b
)
627 return (__m128
)__builtin_shufflevector((__v4sf
)__a
,
628 (__v4sf
)__builtin_ia32_cmpltss((__v4sf
)__b
, (__v4sf
)__a
),
632 /// \brief Compares each of the corresponding 32-bit float values of the
633 /// 128-bit vectors of [4 x float] to determine if the values in the first
634 /// operand are greater than those in the second operand.
636 /// \headerfile <x86intrin.h>
638 /// This intrinsic corresponds to the \c VCMPLTPS / CMPLTPS instructions.
641 /// A 128-bit vector of [4 x float].
643 /// A 128-bit vector of [4 x float].
644 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
645 static __inline__ __m128 __DEFAULT_FN_ATTRS
646 _mm_cmpgt_ps(__m128 __a
, __m128 __b
)
648 return (__m128
)__builtin_ia32_cmpltps((__v4sf
)__b
, (__v4sf
)__a
);
651 /// \brief Compares two 32-bit float values in the low-order bits of both
652 /// operands to determine if the value in the first operand is greater than
653 /// or equal to the corresponding value in the second operand and returns
654 /// the result of the comparison in the low-order bits of a vector of
657 /// \headerfile <x86intrin.h>
659 /// This intrinsic corresponds to the \c VCMPLESS / CMPLESS instructions.
662 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
663 /// 32 bits of this operand are used in the comparison.
665 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
666 /// 32 bits of this operand are used in the comparison.
667 /// \returns A 128-bit vector of [4 x float] containing the comparison results
668 /// in the low-order bits.
669 static __inline__ __m128 __DEFAULT_FN_ATTRS
670 _mm_cmpge_ss(__m128 __a
, __m128 __b
)
672 return (__m128
)__builtin_shufflevector((__v4sf
)__a
,
673 (__v4sf
)__builtin_ia32_cmpless((__v4sf
)__b
, (__v4sf
)__a
),
677 /// \brief Compares each of the corresponding 32-bit float values of the
678 /// 128-bit vectors of [4 x float] to determine if the values in the first
679 /// operand are greater than or equal to those in the second operand.
681 /// \headerfile <x86intrin.h>
683 /// This intrinsic corresponds to the \c VCMPLEPS / CMPLEPS instructions.
686 /// A 128-bit vector of [4 x float].
688 /// A 128-bit vector of [4 x float].
689 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
690 static __inline__ __m128 __DEFAULT_FN_ATTRS
691 _mm_cmpge_ps(__m128 __a
, __m128 __b
)
693 return (__m128
)__builtin_ia32_cmpleps((__v4sf
)__b
, (__v4sf
)__a
);
696 /// \brief Compares two 32-bit float values in the low-order bits of both
697 /// operands for inequality and returns the result of the comparison in the
698 /// low-order bits of a vector of [4 x float].
700 /// \headerfile <x86intrin.h>
702 /// This intrinsic corresponds to the \c VCMPNEQSS / CMPNEQSS instructions.
705 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
706 /// 32 bits of this operand are used in the comparison.
708 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
709 /// 32 bits of this operand are used in the comparison.
710 /// \returns A 128-bit vector of [4 x float] containing the comparison results
711 /// in the low-order bits.
712 static __inline__ __m128 __DEFAULT_FN_ATTRS
713 _mm_cmpneq_ss(__m128 __a
, __m128 __b
)
715 return (__m128
)__builtin_ia32_cmpneqss((__v4sf
)__a
, (__v4sf
)__b
);
718 /// \brief Compares each of the corresponding 32-bit float values of the
719 /// 128-bit vectors of [4 x float] for inequality.
721 /// \headerfile <x86intrin.h>
723 /// This intrinsic corresponds to the \c VCMPNEQPS / CMPNEQPS instructions.
726 /// A 128-bit vector of [4 x float].
728 /// A 128-bit vector of [4 x float].
729 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
730 static __inline__ __m128 __DEFAULT_FN_ATTRS
731 _mm_cmpneq_ps(__m128 __a
, __m128 __b
)
733 return (__m128
)__builtin_ia32_cmpneqps((__v4sf
)__a
, (__v4sf
)__b
);
736 /// \brief Compares two 32-bit float values in the low-order bits of both
737 /// operands to determine if the value in the first operand is not less than
738 /// the corresponding value in the second operand and returns the result of
739 /// the comparison in the low-order bits of a vector of [4 x float].
741 /// \headerfile <x86intrin.h>
743 /// This intrinsic corresponds to the \c VCMPNLTSS / CMPNLTSS instructions.
746 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
747 /// 32 bits of this operand are used in the comparison.
749 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
750 /// 32 bits of this operand are used in the comparison.
751 /// \returns A 128-bit vector of [4 x float] containing the comparison results
752 /// in the low-order bits.
753 static __inline__ __m128 __DEFAULT_FN_ATTRS
754 _mm_cmpnlt_ss(__m128 __a
, __m128 __b
)
756 return (__m128
)__builtin_ia32_cmpnltss((__v4sf
)__a
, (__v4sf
)__b
);
759 /// \brief Compares each of the corresponding 32-bit float values of the
760 /// 128-bit vectors of [4 x float] to determine if the values in the first
761 /// operand are not less than those in the second operand.
763 /// \headerfile <x86intrin.h>
765 /// This intrinsic corresponds to the \c VCMPNLTPS / CMPNLTPS instructions.
768 /// A 128-bit vector of [4 x float].
770 /// A 128-bit vector of [4 x float].
771 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
772 static __inline__ __m128 __DEFAULT_FN_ATTRS
773 _mm_cmpnlt_ps(__m128 __a
, __m128 __b
)
775 return (__m128
)__builtin_ia32_cmpnltps((__v4sf
)__a
, (__v4sf
)__b
);
778 /// \brief Compares two 32-bit float values in the low-order bits of both
779 /// operands to determine if the value in the first operand is not less than
780 /// or equal to the corresponding value in the second operand and returns
781 /// the result of the comparison in the low-order bits of a vector of
784 /// \headerfile <x86intrin.h>
786 /// This intrinsic corresponds to the \c VCMPNLESS / CMPNLESS instructions.
789 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
790 /// 32 bits of this operand are used in the comparison.
792 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
793 /// 32 bits of this operand are used in the comparison.
794 /// \returns A 128-bit vector of [4 x float] containing the comparison results
795 /// in the low-order bits.
796 static __inline__ __m128 __DEFAULT_FN_ATTRS
797 _mm_cmpnle_ss(__m128 __a
, __m128 __b
)
799 return (__m128
)__builtin_ia32_cmpnless((__v4sf
)__a
, (__v4sf
)__b
);
802 /// \brief Compares each of the corresponding 32-bit float values of the
803 /// 128-bit vectors of [4 x float] to determine if the values in the first
804 /// operand are not less than or equal to those in the second operand.
806 /// \headerfile <x86intrin.h>
808 /// This intrinsic corresponds to the \c VCMPNLEPS / CMPNLEPS instructions.
811 /// A 128-bit vector of [4 x float].
813 /// A 128-bit vector of [4 x float].
814 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
815 static __inline__ __m128 __DEFAULT_FN_ATTRS
816 _mm_cmpnle_ps(__m128 __a
, __m128 __b
)
818 return (__m128
)__builtin_ia32_cmpnleps((__v4sf
)__a
, (__v4sf
)__b
);
821 /// \brief Compares two 32-bit float values in the low-order bits of both
822 /// operands to determine if the value in the first operand is not greater
823 /// than the corresponding value in the second operand and returns the
824 /// result of the comparison in the low-order bits of a vector of
827 /// \headerfile <x86intrin.h>
829 /// This intrinsic corresponds to the \c VCMPNLTSS / CMPNLTSS instructions.
832 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
833 /// 32 bits of this operand are used in the comparison.
835 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
836 /// 32 bits of this operand are used in the comparison.
837 /// \returns A 128-bit vector of [4 x float] containing the comparison results
838 /// in the low-order bits.
839 static __inline__ __m128 __DEFAULT_FN_ATTRS
840 _mm_cmpngt_ss(__m128 __a
, __m128 __b
)
842 return (__m128
)__builtin_shufflevector((__v4sf
)__a
,
843 (__v4sf
)__builtin_ia32_cmpnltss((__v4sf
)__b
, (__v4sf
)__a
),
847 /// \brief Compares each of the corresponding 32-bit float values of the
848 /// 128-bit vectors of [4 x float] to determine if the values in the first
849 /// operand are not greater than those in the second operand.
851 /// \headerfile <x86intrin.h>
853 /// This intrinsic corresponds to the \c VCMPNLTPS / CMPNLTPS instructions.
856 /// A 128-bit vector of [4 x float].
858 /// A 128-bit vector of [4 x float].
859 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
860 static __inline__ __m128 __DEFAULT_FN_ATTRS
861 _mm_cmpngt_ps(__m128 __a
, __m128 __b
)
863 return (__m128
)__builtin_ia32_cmpnltps((__v4sf
)__b
, (__v4sf
)__a
);
866 /// \brief Compares two 32-bit float values in the low-order bits of both
867 /// operands to determine if the value in the first operand is not greater
868 /// than or equal to the corresponding value in the second operand and
869 /// returns the result of the comparison in the low-order bits of a vector
872 /// \headerfile <x86intrin.h>
874 /// This intrinsic corresponds to the \c VCMPNLESS / CMPNLESS instructions.
877 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
878 /// 32 bits of this operand are used in the comparison.
880 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
881 /// 32 bits of this operand are used in the comparison.
882 /// \returns A 128-bit vector of [4 x float] containing the comparison results
883 /// in the low-order bits.
884 static __inline__ __m128 __DEFAULT_FN_ATTRS
885 _mm_cmpnge_ss(__m128 __a
, __m128 __b
)
887 return (__m128
)__builtin_shufflevector((__v4sf
)__a
,
888 (__v4sf
)__builtin_ia32_cmpnless((__v4sf
)__b
, (__v4sf
)__a
),
892 /// \brief Compares each of the corresponding 32-bit float values of the
893 /// 128-bit vectors of [4 x float] to determine if the values in the first
894 /// operand are not greater than or equal to those in the second operand.
896 /// \headerfile <x86intrin.h>
898 /// This intrinsic corresponds to the \c VCMPNLEPS / CMPNLEPS instructions.
901 /// A 128-bit vector of [4 x float].
903 /// A 128-bit vector of [4 x float].
904 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
905 static __inline__ __m128 __DEFAULT_FN_ATTRS
906 _mm_cmpnge_ps(__m128 __a
, __m128 __b
)
908 return (__m128
)__builtin_ia32_cmpnleps((__v4sf
)__b
, (__v4sf
)__a
);
911 /// \brief Compares two 32-bit float values in the low-order bits of both
912 /// operands to determine if the value in the first operand is ordered with
913 /// respect to the corresponding value in the second operand and returns the
914 /// result of the comparison in the low-order bits of a vector of
917 /// \headerfile <x86intrin.h>
919 /// This intrinsic corresponds to the \c VCMPORDSS / CMPORDSS instructions.
922 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
923 /// 32 bits of this operand are used in the comparison.
925 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
926 /// 32 bits of this operand are used in the comparison.
927 /// \returns A 128-bit vector of [4 x float] containing the comparison results
928 /// in the low-order bits.
929 static __inline__ __m128 __DEFAULT_FN_ATTRS
930 _mm_cmpord_ss(__m128 __a
, __m128 __b
)
932 return (__m128
)__builtin_ia32_cmpordss((__v4sf
)__a
, (__v4sf
)__b
);
935 /// \brief Compares each of the corresponding 32-bit float values of the
936 /// 128-bit vectors of [4 x float] to determine if the values in the first
937 /// operand are ordered with respect to those in the second operand.
939 /// \headerfile <x86intrin.h>
941 /// This intrinsic corresponds to the \c VCMPORDPS / CMPORDPS instructions.
944 /// A 128-bit vector of [4 x float].
946 /// A 128-bit vector of [4 x float].
947 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
948 static __inline__ __m128 __DEFAULT_FN_ATTRS
949 _mm_cmpord_ps(__m128 __a
, __m128 __b
)
951 return (__m128
)__builtin_ia32_cmpordps((__v4sf
)__a
, (__v4sf
)__b
);
954 /// \brief Compares two 32-bit float values in the low-order bits of both
955 /// operands to determine if the value in the first operand is unordered
956 /// with respect to the corresponding value in the second operand and
957 /// returns the result of the comparison in the low-order bits of a vector
960 /// \headerfile <x86intrin.h>
962 /// This intrinsic corresponds to the \c VCMPUNORDSS / CMPUNORDSS instructions.
965 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
966 /// 32 bits of this operand are used in the comparison.
968 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
969 /// 32 bits of this operand are used in the comparison.
970 /// \returns A 128-bit vector of [4 x float] containing the comparison results
971 /// in the low-order bits.
972 static __inline__ __m128 __DEFAULT_FN_ATTRS
973 _mm_cmpunord_ss(__m128 __a
, __m128 __b
)
975 return (__m128
)__builtin_ia32_cmpunordss((__v4sf
)__a
, (__v4sf
)__b
);
978 /// \brief Compares each of the corresponding 32-bit float values of the
979 /// 128-bit vectors of [4 x float] to determine if the values in the first
980 /// operand are unordered with respect to those in the second operand.
982 /// \headerfile <x86intrin.h>
984 /// This intrinsic corresponds to the \c VCMPUNORDPS / CMPUNORDPS instructions.
987 /// A 128-bit vector of [4 x float].
989 /// A 128-bit vector of [4 x float].
990 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
991 static __inline__ __m128 __DEFAULT_FN_ATTRS
992 _mm_cmpunord_ps(__m128 __a
, __m128 __b
)
994 return (__m128
)__builtin_ia32_cmpunordps((__v4sf
)__a
, (__v4sf
)__b
);
997 /// \brief Compares two 32-bit float values in the low-order bits of both
998 /// operands for equality and returns the result of the comparison.
1000 /// \headerfile <x86intrin.h>
1002 /// This intrinsic corresponds to the \c VCOMISS / COMISS instructions.
1005 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1006 /// used in the comparison.
1008 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1009 /// used in the comparison.
1010 /// \returns An integer containing the comparison results.
1011 static __inline__
int __DEFAULT_FN_ATTRS
1012 _mm_comieq_ss(__m128 __a
, __m128 __b
)
1014 return __builtin_ia32_comieq((__v4sf
)__a
, (__v4sf
)__b
);
1017 /// \brief Compares two 32-bit float values in the low-order bits of both
1018 /// operands to determine if the first operand is less than the second
1019 /// operand and returns the result of the comparison.
1021 /// \headerfile <x86intrin.h>
1023 /// This intrinsic corresponds to the \c VCOMISS / COMISS instructions.
1026 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1027 /// used in the comparison.
1029 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1030 /// used in the comparison.
1031 /// \returns An integer containing the comparison results.
1032 static __inline__
int __DEFAULT_FN_ATTRS
1033 _mm_comilt_ss(__m128 __a
, __m128 __b
)
1035 return __builtin_ia32_comilt((__v4sf
)__a
, (__v4sf
)__b
);
1038 /// \brief Compares two 32-bit float values in the low-order bits of both
1039 /// operands to determine if the first operand is less than or equal to the
1040 /// second operand and returns the result of the comparison.
1042 /// \headerfile <x86intrin.h>
1044 /// This intrinsic corresponds to the \c VCOMISS / COMISS instructions.
1047 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1048 /// used in the comparison.
1050 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1051 /// used in the comparison.
1052 /// \returns An integer containing the comparison results.
1053 static __inline__
int __DEFAULT_FN_ATTRS
1054 _mm_comile_ss(__m128 __a
, __m128 __b
)
1056 return __builtin_ia32_comile((__v4sf
)__a
, (__v4sf
)__b
);
1059 /// \brief Compares two 32-bit float values in the low-order bits of both
1060 /// operands to determine if the first operand is greater than the second
1061 /// operand and returns the result of the comparison.
1063 /// \headerfile <x86intrin.h>
1065 /// This intrinsic corresponds to the \c VCOMISS / COMISS instructions.
1068 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1069 /// used in the comparison.
1071 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1072 /// used in the comparison.
1073 /// \returns An integer containing the comparison results.
1074 static __inline__
int __DEFAULT_FN_ATTRS
1075 _mm_comigt_ss(__m128 __a
, __m128 __b
)
1077 return __builtin_ia32_comigt((__v4sf
)__a
, (__v4sf
)__b
);
1080 /// \brief Compares two 32-bit float values in the low-order bits of both
1081 /// operands to determine if the first operand is greater than or equal to
1082 /// the second operand and returns the result of the comparison.
1084 /// \headerfile <x86intrin.h>
1086 /// This intrinsic corresponds to the \c VCOMISS / COMISS instructions.
1089 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1090 /// used in the comparison.
1092 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1093 /// used in the comparison.
1094 /// \returns An integer containing the comparison results.
1095 static __inline__
int __DEFAULT_FN_ATTRS
1096 _mm_comige_ss(__m128 __a
, __m128 __b
)
1098 return __builtin_ia32_comige((__v4sf
)__a
, (__v4sf
)__b
);
1101 /// \brief Compares two 32-bit float values in the low-order bits of both
1102 /// operands to determine if the first operand is not equal to the second
1103 /// operand and returns the result of the comparison.
1105 /// \headerfile <x86intrin.h>
1107 /// This intrinsic corresponds to the \c VCOMISS / COMISS instructions.
1110 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1111 /// used in the comparison.
1113 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1114 /// used in the comparison.
1115 /// \returns An integer containing the comparison results.
1116 static __inline__
int __DEFAULT_FN_ATTRS
1117 _mm_comineq_ss(__m128 __a
, __m128 __b
)
1119 return __builtin_ia32_comineq((__v4sf
)__a
, (__v4sf
)__b
);
1122 /// \brief Performs an unordered comparison of two 32-bit float values using
1123 /// the low-order bits of both operands to determine equality and returns
1124 /// the result of the comparison.
1126 /// \headerfile <x86intrin.h>
1128 /// This intrinsic corresponds to the \c VUCOMISS / UCOMISS instructions.
1131 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1132 /// used in the comparison.
1134 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1135 /// used in the comparison.
1136 /// \returns An integer containing the comparison results.
1137 static __inline__
int __DEFAULT_FN_ATTRS
1138 _mm_ucomieq_ss(__m128 __a
, __m128 __b
)
1140 return __builtin_ia32_ucomieq((__v4sf
)__a
, (__v4sf
)__b
);
1143 /// \brief Performs an unordered comparison of two 32-bit float values using
1144 /// the low-order bits of both operands to determine if the first operand is
1145 /// less than the second operand and returns the result of the comparison.
1147 /// \headerfile <x86intrin.h>
1149 /// This intrinsic corresponds to the \c VUCOMISS / UCOMISS instructions.
1152 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1153 /// used in the comparison.
1155 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1156 /// used in the comparison.
1157 /// \returns An integer containing the comparison results.
1158 static __inline__
int __DEFAULT_FN_ATTRS
1159 _mm_ucomilt_ss(__m128 __a
, __m128 __b
)
1161 return __builtin_ia32_ucomilt((__v4sf
)__a
, (__v4sf
)__b
);
1164 /// \brief Performs an unordered comparison of two 32-bit float values using
1165 /// the low-order bits of both operands to determine if the first operand
1166 /// is less than or equal to the second operand and returns the result of
1169 /// \headerfile <x86intrin.h>
1171 /// This intrinsic corresponds to the \c VUCOMISS / UCOMISS instructions.
1174 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1175 /// used in the comparison.
1177 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1178 /// used in the comparison.
1179 /// \returns An integer containing the comparison results.
1180 static __inline__
int __DEFAULT_FN_ATTRS
1181 _mm_ucomile_ss(__m128 __a
, __m128 __b
)
1183 return __builtin_ia32_ucomile((__v4sf
)__a
, (__v4sf
)__b
);
1186 /// \brief Performs an unordered comparison of two 32-bit float values using
1187 /// the low-order bits of both operands to determine if the first operand
1188 /// is greater than the second operand and returns the result of the
1191 /// \headerfile <x86intrin.h>
1193 /// This intrinsic corresponds to the \c VUCOMISS / UCOMISS instructions.
1196 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1197 /// used in the comparison.
1199 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1200 /// used in the comparison.
1201 /// \returns An integer containing the comparison results.
1202 static __inline__
int __DEFAULT_FN_ATTRS
1203 _mm_ucomigt_ss(__m128 __a
, __m128 __b
)
1205 return __builtin_ia32_ucomigt((__v4sf
)__a
, (__v4sf
)__b
);
1208 /// \brief Performs an unordered comparison of two 32-bit float values using
1209 /// the low-order bits of both operands to determine if the first operand is
1210 /// greater than or equal to the second operand and returns the result of
1213 /// \headerfile <x86intrin.h>
1215 /// This intrinsic corresponds to the \c VUCOMISS / UCOMISS instructions.
1218 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1219 /// used in the comparison.
1221 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1222 /// used in the comparison.
1223 /// \returns An integer containing the comparison results.
1224 static __inline__
int __DEFAULT_FN_ATTRS
1225 _mm_ucomige_ss(__m128 __a
, __m128 __b
)
1227 return __builtin_ia32_ucomige((__v4sf
)__a
, (__v4sf
)__b
);
1230 /// \brief Performs an unordered comparison of two 32-bit float values using
1231 /// the low-order bits of both operands to determine inequality and returns
1232 /// the result of the comparison.
1234 /// \headerfile <x86intrin.h>
1236 /// This intrinsic corresponds to the \c VUCOMISS / UCOMISS instructions.
1239 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1240 /// used in the comparison.
1242 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1243 /// used in the comparison.
1244 /// \returns An integer containing the comparison results.
1245 static __inline__
int __DEFAULT_FN_ATTRS
1246 _mm_ucomineq_ss(__m128 __a
, __m128 __b
)
1248 return __builtin_ia32_ucomineq((__v4sf
)__a
, (__v4sf
)__b
);
1251 /// \brief Converts a float value contained in the lower 32 bits of a vector of
1252 /// [4 x float] into a 32-bit integer.
1254 /// \headerfile <x86intrin.h>
1256 /// This intrinsic corresponds to the \c VCVTSS2SI / CVTSS2SI instructions.
1259 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1260 /// used in the conversion.
1261 /// \returns A 32-bit integer containing the converted value.
1262 static __inline__
int __DEFAULT_FN_ATTRS
1263 _mm_cvtss_si32(__m128 __a
)
1265 return __builtin_ia32_cvtss2si((__v4sf
)__a
);
1268 /// \brief Converts a float value contained in the lower 32 bits of a vector of
1269 /// [4 x float] into a 32-bit integer.
1271 /// \headerfile <x86intrin.h>
1273 /// This intrinsic corresponds to the \c VCVTSS2SI / CVTSS2SI instructions.
1276 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1277 /// used in the conversion.
1278 /// \returns A 32-bit integer containing the converted value.
1279 static __inline__
int __DEFAULT_FN_ATTRS
1280 _mm_cvt_ss2si(__m128 __a
)
1282 return _mm_cvtss_si32(__a
);
1287 /// \brief Converts a float value contained in the lower 32 bits of a vector of
1288 /// [4 x float] into a 64-bit integer.
1290 /// \headerfile <x86intrin.h>
1292 /// This intrinsic corresponds to the \c VCVTSS2SI / CVTSS2SI instructions.
1295 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1296 /// used in the conversion.
1297 /// \returns A 64-bit integer containing the converted value.
1298 static __inline__
long long __DEFAULT_FN_ATTRS
1299 _mm_cvtss_si64(__m128 __a
)
1301 return __builtin_ia32_cvtss2si64((__v4sf
)__a
);
1306 /// \brief Converts two low-order float values in a 128-bit vector of
1307 /// [4 x float] into a 64-bit vector of [2 x i32].
1309 /// \headerfile <x86intrin.h>
1311 /// This intrinsic corresponds to the \c CVTPS2PI instruction.
1314 /// A 128-bit vector of [4 x float].
1315 /// \returns A 64-bit integer vector containing the converted values.
1316 static __inline__ __m64 __DEFAULT_FN_ATTRS
1317 _mm_cvtps_pi32(__m128 __a
)
1319 return (__m64
)__builtin_ia32_cvtps2pi((__v4sf
)__a
);
1322 /// \brief Converts two low-order float values in a 128-bit vector of
1323 /// [4 x float] into a 64-bit vector of [2 x i32].
1325 /// \headerfile <x86intrin.h>
1327 /// This intrinsic corresponds to the \c CVTPS2PI instruction.
1330 /// A 128-bit vector of [4 x float].
1331 /// \returns A 64-bit integer vector containing the converted values.
1332 static __inline__ __m64 __DEFAULT_FN_ATTRS
1333 _mm_cvt_ps2pi(__m128 __a
)
1335 return _mm_cvtps_pi32(__a
);
1338 /// \brief Converts a float value contained in the lower 32 bits of a vector of
1339 /// [4 x float] into a 32-bit integer, truncating the result when it is
/// inexact.
///
1342 /// \headerfile <x86intrin.h>
///
1344 /// This intrinsic corresponds to the \c VCVTTSS2SI / CVTTSS2SI instructions.
///
/// \param __a
1347 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1348 /// used in the conversion.
1349 /// \returns A 32-bit integer containing the converted value.
///
/// NOTE(review): the body of this function is not present in this copy of the
/// header; confirm the implementation against the complete file.
1350 static __inline__
int __DEFAULT_FN_ATTRS
1351 _mm_cvttss_si32(__m128 __a
)
1356 /// \brief Converts a float value contained in the lower 32 bits of a vector of
1357 /// [4 x float] into a 32-bit integer, truncating the result when it is
1360 /// \headerfile <x86intrin.h>
1362 /// This intrinsic corresponds to the \c VCVTTSS2SI / CVTTSS2SI instructions.
1365 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1366 /// used in the conversion.
1367 /// \returns A 32-bit integer containing the converted value.
1368 static __inline__
int __DEFAULT_FN_ATTRS
1369 _mm_cvtt_ss2si(__m128 __a
)
1371 return _mm_cvttss_si32(__a
);
1374 /// \brief Converts a float value contained in the lower 32 bits of a vector of
1375 /// [4 x float] into a 64-bit integer, truncating the result when it is
/// inexact.
///
1378 /// \headerfile <x86intrin.h>
///
1380 /// This intrinsic corresponds to the \c VCVTTSS2SI / CVTTSS2SI instructions.
///
/// \param __a
1383 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1384 /// used in the conversion.
1385 /// \returns A 64-bit integer containing the converted value.
///
/// NOTE(review): the body of this function is not present in this copy of the
/// header; confirm the implementation against the complete file.
1386 static __inline__
long long __DEFAULT_FN_ATTRS
1387 _mm_cvttss_si64(__m128 __a
)
1392 /// \brief Converts two low-order float values in a 128-bit vector of
1393 /// [4 x float] into a 64-bit vector of [2 x i32], truncating the result
1394 /// when it is inexact.
1396 /// \headerfile <x86intrin.h>
1398 /// This intrinsic corresponds to the \c CVTTPS2PI / VTTPS2PI instructions.
1401 /// A 128-bit vector of [4 x float].
1402 /// \returns A 64-bit integer vector containing the converted values.
1403 static __inline__ __m64 __DEFAULT_FN_ATTRS
1404 _mm_cvttps_pi32(__m128 __a
)
1406 return (__m64
)__builtin_ia32_cvttps2pi((__v4sf
)__a
);
1409 /// \brief Converts two low-order float values in a 128-bit vector of [4 x
1410 /// float] into a 64-bit vector of [2 x i32], truncating the result when it
1413 /// \headerfile <x86intrin.h>
1415 /// This intrinsic corresponds to the \c CVTTPS2PI instruction.
1418 /// A 128-bit vector of [4 x float].
1419 /// \returns A 64-bit integer vector containing the converted values.
1420 static __inline__ __m64 __DEFAULT_FN_ATTRS
1421 _mm_cvtt_ps2pi(__m128 __a
)
1423 return _mm_cvttps_pi32(__a
);
1426 /// \brief Converts a 32-bit signed integer value into a floating point value
1427 /// and writes it to the lower 32 bits of the destination. The remaining
1428 /// higher order elements of the destination vector are copied from the
1429 /// corresponding elements in the first operand.
///
1431 /// \headerfile <x86intrin.h>
///
1433 /// This intrinsic corresponds to the \c VCVTSI2SS / CVTSI2SS instruction.
///
/// \param __a
1436 /// A 128-bit vector of [4 x float].
/// \param __b
1438 /// A 32-bit signed integer operand containing the value to be converted.
1439 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
1440 /// converted value of the second operand. The upper 96 bits are copied from
1441 /// the upper 96 bits of the first operand.
///
/// NOTE(review): the body of this function is not present in this copy of the
/// header; confirm the implementation against the complete file.
1442 static __inline__ __m128 __DEFAULT_FN_ATTRS
1443 _mm_cvtsi32_ss(__m128 __a
, int __b
)
1449 /// \brief Converts a 32-bit signed integer value into a floating point value
1450 /// and writes it to the lower 32 bits of the destination. The remaining
1451 /// higher order elements of the destination are copied from the
1452 /// corresponding elements in the first operand.
1454 /// \headerfile <x86intrin.h>
1456 /// This intrinsic corresponds to the \c VCVTSI2SS / CVTSI2SS instruction.
1459 /// A 128-bit vector of [4 x float].
1461 /// A 32-bit signed integer operand containing the value to be converted.
1462 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
1463 /// converted value of the second operand. The upper 96 bits are copied from
1464 /// the upper 96 bits of the first operand.
1465 static __inline__ __m128 __DEFAULT_FN_ATTRS
1466 _mm_cvt_si2ss(__m128 __a
, int __b
)
1468 return _mm_cvtsi32_ss(__a
, __b
);
1473 /// \brief Converts a 64-bit signed integer value into a floating point value
1474 /// and writes it to the lower 32 bits of the destination. The remaining
1475 /// higher order elements of the destination are copied from the
1476 /// corresponding elements in the first operand.
///
1478 /// \headerfile <x86intrin.h>
///
1480 /// This intrinsic corresponds to the \c VCVTSI2SS / CVTSI2SS instruction.
///
/// \param __a
1483 /// A 128-bit vector of [4 x float].
/// \param __b
1485 /// A 64-bit signed integer operand containing the value to be converted.
1486 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
1487 /// converted value of the second operand. The upper 96 bits are copied from
1488 /// the upper 96 bits of the first operand.
///
/// NOTE(review): the body of this function is not present in this copy of the
/// header; confirm the implementation against the complete file.
1489 static __inline__ __m128 __DEFAULT_FN_ATTRS
1490 _mm_cvtsi64_ss(__m128 __a
, long long __b
)
1498 /// \brief Converts two elements of a 64-bit vector of [2 x i32] into two
1499 /// floating point values and writes them to the lower 64-bits of the
1500 /// destination. The remaining higher order elements of the destination are
1501 /// copied from the corresponding elements in the first operand.
1503 /// \headerfile <x86intrin.h>
1505 /// This intrinsic corresponds to the \c CVTPI2PS instruction.
1508 /// A 128-bit vector of [4 x float].
1510 /// A 64-bit vector of [2 x i32]. The elements in this vector are converted
1511 /// and written to the corresponding low-order elements in the destination.
1512 /// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
1513 /// converted value of the second operand. The upper 64 bits are copied from
1514 /// the upper 64 bits of the first operand.
1515 static __inline__ __m128 __DEFAULT_FN_ATTRS
1516 _mm_cvtpi32_ps(__m128 __a
, __m64 __b
)
1518 return __builtin_ia32_cvtpi2ps((__v4sf
)__a
, (__v2si
)__b
);
1521 /// \brief Converts two elements of a 64-bit vector of [2 x i32] into two
1522 /// floating point values and writes them to the lower 64-bits of the
1523 /// destination. The remaining higher order elements of the destination are
1524 /// copied from the corresponding elements in the first operand.
1526 /// \headerfile <x86intrin.h>
1528 /// This intrinsic corresponds to the \c CVTPI2PS instruction.
1531 /// A 128-bit vector of [4 x float].
1533 /// A 64-bit vector of [2 x i32]. The elements in this vector are converted
1534 /// and written to the corresponding low-order elements in the destination.
1535 /// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
1536 /// converted value from the second operand. The upper 64 bits are copied
1537 /// from the upper 64 bits of the first operand.
1538 static __inline__ __m128 __DEFAULT_FN_ATTRS
1539 _mm_cvt_pi2ps(__m128 __a
, __m64 __b
)
1541 return _mm_cvtpi32_ps(__a
, __b
);
1544 /// \brief Extracts a float value contained in the lower 32 bits of a vector of
/// [4 x float].
///
1547 /// \headerfile <x86intrin.h>
///
1549 /// This intrinsic corresponds to the \c VMOVSS / MOVSS instruction.
///
/// \param __a
1552 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1553 /// used in the extraction.
1554 /// \returns A 32-bit float containing the extracted value.
///
/// NOTE(review): the body of this function is not present in this copy of the
/// header; confirm the implementation against the complete file.
1555 static __inline__
float __DEFAULT_FN_ATTRS
1556 _mm_cvtss_f32(__m128 __a
)
1561 /// \brief Loads two packed float values from the address __p into the
1562 /// high-order bits of a 128-bit vector of [4 x float]. The low-order bits
1563 /// are copied from the low-order bits of the first operand.
1565 /// \headerfile <x86intrin.h>
1567 /// This intrinsic corresponds to the \c VMOVHPD / MOVHPD instruction.
1570 /// A 128-bit vector of [4 x float]. Bits [63:0] are written to bits [63:0]
1571 /// of the destination.
1573 /// A pointer to two packed float values. Bits [63:0] are written to bits
1574 /// [127:64] of the destination.
1575 /// \returns A 128-bit vector of [4 x float] containing the moved values.
1576 static __inline__ __m128 __DEFAULT_FN_ATTRS
1577 _mm_loadh_pi(__m128 __a
, const __m64
*__p
)
1579 typedef float __mm_loadh_pi_v2f32
__attribute__((__vector_size__(8)));
1580 struct __mm_loadh_pi_struct
{
1581 __mm_loadh_pi_v2f32 __u
;
1582 } __attribute__((__packed__
, __may_alias__
));
1583 __mm_loadh_pi_v2f32 __b
= ((struct __mm_loadh_pi_struct
*)__p
)->__u
;
1584 __m128 __bb
= __builtin_shufflevector(__b
, __b
, 0, 1, 0, 1);
1585 return __builtin_shufflevector(__a
, __bb
, 0, 1, 4, 5);
1588 /// \brief Loads two packed float values from the address __p into the low-order
1589 /// bits of a 128-bit vector of [4 x float]. The high-order bits are copied
1590 /// from the high-order bits of the first operand.
1592 /// \headerfile <x86intrin.h>
1594 /// This intrinsic corresponds to the \c VMOVLPD / MOVLPD instruction.
1597 /// A 128-bit vector of [4 x float]. Bits [127:64] are written to bits
1598 /// [127:64] of the destination.
1600 /// A pointer to two packed float values. Bits [63:0] are written to bits
1601 /// [63:0] of the destination.
1602 /// \returns A 128-bit vector of [4 x float] containing the moved values.
1603 static __inline__ __m128 __DEFAULT_FN_ATTRS
1604 _mm_loadl_pi(__m128 __a
, const __m64
*__p
)
1606 typedef float __mm_loadl_pi_v2f32
__attribute__((__vector_size__(8)));
1607 struct __mm_loadl_pi_struct
{
1608 __mm_loadl_pi_v2f32 __u
;
1609 } __attribute__((__packed__
, __may_alias__
));
1610 __mm_loadl_pi_v2f32 __b
= ((struct __mm_loadl_pi_struct
*)__p
)->__u
;
1611 __m128 __bb
= __builtin_shufflevector(__b
, __b
, 0, 1, 0, 1);
1612 return __builtin_shufflevector(__a
, __bb
, 4, 5, 2, 3);
1615 /// \brief Constructs a 128-bit floating-point vector of [4 x float]. The lower
1616 /// 32 bits of the vector are initialized with the single-precision
1617 /// floating-point value loaded from a specified memory location. The upper
1618 /// 96 bits are set to zero.
1620 /// \headerfile <x86intrin.h>
1622 /// This intrinsic corresponds to the \c VMOVSS / MOVSS instruction.
1625 /// A pointer to a 32-bit memory location containing a single-precision
1626 /// floating-point value.
1627 /// \returns An initialized 128-bit floating-point vector of [4 x float]. The
1628 /// lower 32 bits contain the value loaded from the memory location. The
1629 /// upper 96 bits are set to zero.
1630 static __inline__ __m128 __DEFAULT_FN_ATTRS
1631 _mm_load_ss(const float *__p
)
1633 struct __mm_load_ss_struct
{
1635 } __attribute__((__packed__
, __may_alias__
));
1636 float __u
= ((struct __mm_load_ss_struct
*)__p
)->__u
;
1637 return (__m128
){ __u
, 0, 0, 0 };
1640 /// \brief Loads a 32-bit float value and duplicates it to all four vector
1641 /// elements of a 128-bit vector of [4 x float].
1643 /// \headerfile <x86intrin.h>
1645 /// This intrinsic corresponds to the \c VMOVSS / MOVSS + \c shuffling
1649 /// A pointer to a float value to be loaded and duplicated.
1650 /// \returns A 128-bit vector of [4 x float] containing the loaded
1651 /// and duplicated values.
1652 static __inline__ __m128 __DEFAULT_FN_ATTRS
1653 _mm_load1_ps(const float *__p
)
1655 struct __mm_load1_ps_struct
{
1657 } __attribute__((__packed__
, __may_alias__
));
1658 float __u
= ((struct __mm_load1_ps_struct
*)__p
)->__u
;
1659 return (__m128
){ __u
, __u
, __u
, __u
};
1662 #define _mm_load_ps1(p) _mm_load1_ps(p)
1664 /// \brief Loads a 128-bit floating-point vector of [4 x float] from an aligned
1665 /// memory location.
1667 /// \headerfile <x86intrin.h>
1669 /// This intrinsic corresponds to the \c VMOVAPS / MOVAPS instruction.
1672 /// A pointer to a 128-bit memory location. The address of the memory
1673 /// location has to be 128-bit aligned.
1674 /// \returns A 128-bit vector of [4 x float] containing the loaded valus.
1675 static __inline__ __m128 __DEFAULT_FN_ATTRS
1676 _mm_load_ps(const float *__p
)
1678 return *(__m128
*)__p
;
1681 /// \brief Loads a 128-bit floating-point vector of [4 x float] from an
1682 /// unaligned memory location.
1684 /// \headerfile <x86intrin.h>
1686 /// This intrinsic corresponds to the \c VMOVUPS / MOVUPS instruction.
1689 /// A pointer to a 128-bit memory location. The address of the memory
1690 /// location does not have to be aligned.
1691 /// \returns A 128-bit vector of [4 x float] containing the loaded values.
1692 static __inline__ __m128 __DEFAULT_FN_ATTRS
1693 _mm_loadu_ps(const float *__p
)
1697 } __attribute__((__packed__
, __may_alias__
));
1698 return ((struct __loadu_ps
*)__p
)->__v
;
1701 /// \brief Loads four packed float values, in reverse order, from an aligned
1702 /// memory location to 32-bit elements in a 128-bit vector of [4 x float].
1704 /// \headerfile <x86intrin.h>
1706 /// This intrinsic corresponds to the \c VMOVAPS / MOVAPS + \c shuffling
1710 /// A pointer to a 128-bit memory location. The address of the memory
1711 /// location has to be 128-bit aligned.
1712 /// \returns A 128-bit vector of [4 x float] containing the moved values, loaded
1713 /// in reverse order.
1714 static __inline__ __m128 __DEFAULT_FN_ATTRS
1715 _mm_loadr_ps(const float *__p
)
1717 __m128 __a
= _mm_load_ps(__p
);
1718 return __builtin_shufflevector((__v4sf
)__a
, (__v4sf
)__a
, 3, 2, 1, 0);
1721 /// \brief Create a 128-bit vector of [4 x float] with undefined values.
1723 /// \headerfile <x86intrin.h>
1725 /// This intrinsic has no corresponding instruction.
1727 /// \returns A 128-bit vector of [4 x float] containing undefined values.
1729 static __inline__ __m128 __DEFAULT_FN_ATTRS
1730 _mm_undefined_ps(void)
1732 return (__m128
)__builtin_ia32_undef128();
1735 /// \brief Constructs a 128-bit floating-point vector of [4 x float]. The lower
1736 /// 32 bits of the vector are initialized with the specified single-precision
1737 /// floating-point value. The upper 96 bits are set to zero.
1739 /// \headerfile <x86intrin.h>
1741 /// This intrinsic corresponds to the \c VMOVSS / MOVSS instruction.
1744 /// A single-precision floating-point value used to initialize the lower 32
1745 /// bits of the result.
1746 /// \returns An initialized 128-bit floating-point vector of [4 x float]. The
1747 /// lower 32 bits contain the value provided in the source operand. The
1748 /// upper 96 bits are set to zero.
1749 static __inline__ __m128 __DEFAULT_FN_ATTRS
1750 _mm_set_ss(float __w
)
1752 return (__m128
){ __w
, 0, 0, 0 };
1755 /// \brief Constructs a 128-bit floating-point vector of [4 x float], with each
1756 /// of the four single-precision floating-point vector elements set to the
1757 /// specified single-precision floating-point value.
1759 /// \headerfile <x86intrin.h>
1761 /// This intrinsic corresponds to the \c VPERMILPS / PERMILPS instruction.
1764 /// A single-precision floating-point value used to initialize each vector
1765 /// element of the result.
1766 /// \returns An initialized 128-bit floating-point vector of [4 x float].
1767 static __inline__ __m128 __DEFAULT_FN_ATTRS
1768 _mm_set1_ps(float __w
)
1770 return (__m128
){ __w
, __w
, __w
, __w
};
1773 /* Microsoft specific. */
1774 /// \brief Constructs a 128-bit floating-point vector of [4 x float], with each
1775 /// of the four single-precision floating-point vector elements set to the
1776 /// specified single-precision floating-point value.
1778 /// \headerfile <x86intrin.h>
1780 /// This intrinsic corresponds to the \c VPERMILPS / PERMILPS instruction.
1783 /// A single-precision floating-point value used to initialize each vector
1784 /// element of the result.
1785 /// \returns An initialized 128-bit floating-point vector of [4 x float].
1786 static __inline__ __m128 __DEFAULT_FN_ATTRS
1787 _mm_set_ps1(float __w
)
1789 return _mm_set1_ps(__w
);
1792 /// \brief Constructs a 128-bit floating-point vector of [4 x float]
1793 /// initialized with the specified single-precision floating-point values.
1795 /// \headerfile <x86intrin.h>
1797 /// This intrinsic is a utility function and does not correspond to a specific
1801 /// A single-precision floating-point value used to initialize bits [127:96]
1804 /// A single-precision floating-point value used to initialize bits [95:64]
1807 /// A single-precision floating-point value used to initialize bits [63:32]
1810 /// A single-precision floating-point value used to initialize bits [31:0]
1812 /// \returns An initialized 128-bit floating-point vector of [4 x float].
1813 static __inline__ __m128 __DEFAULT_FN_ATTRS
1814 _mm_set_ps(float __z
, float __y
, float __x
, float __w
)
1816 return (__m128
){ __w
, __x
, __y
, __z
};
1819 /// \brief Constructs a 128-bit floating-point vector of [4 x float],
1820 /// initialized in reverse order with the specified 32-bit single-precision
1821 /// float-point values.
1823 /// \headerfile <x86intrin.h>
1825 /// This intrinsic is a utility function and does not correspond to a specific
1829 /// A single-precision floating-point value used to initialize bits [31:0]
1832 /// A single-precision floating-point value used to initialize bits [63:32]
1835 /// A single-precision floating-point value used to initialize bits [95:64]
1838 /// A single-precision floating-point value used to initialize bits [127:96]
1840 /// \returns An initialized 128-bit floating-point vector of [4 x float].
1841 static __inline__ __m128 __DEFAULT_FN_ATTRS
1842 _mm_setr_ps(float __z
, float __y
, float __x
, float __w
)
1844 return (__m128
){ __z
, __y
, __x
, __w
};
1847 /// \brief Constructs a 128-bit floating-point vector of [4 x float] initialized
1850 /// \headerfile <x86intrin.h>
1852 /// This intrinsic corresponds to the \c VXORPS / XORPS instruction.
1854 /// \returns An initialized 128-bit floating-point vector of [4 x float] with
1855 /// all elements set to zero.
1856 static __inline__ __m128 __DEFAULT_FN_ATTRS
1857 _mm_setzero_ps(void)
1859 return (__m128
){ 0, 0, 0, 0 };
1862 /// \brief Stores the upper 64 bits of a 128-bit vector of [4 x float] to a
1863 /// memory location.
1865 /// \headerfile <x86intrin.h>
1867 /// This intrinsic corresponds to the \c VPEXTRQ / MOVQ instruction.
1870 /// A pointer to a 64-bit memory location.
1872 /// A 128-bit vector of [4 x float] containing the values to be stored.
1873 static __inline__
void __DEFAULT_FN_ATTRS
1874 _mm_storeh_pi(__m64
*__p
, __m128 __a
)
1876 __builtin_ia32_storehps((__v2si
*)__p
, (__v4sf
)__a
);
1879 /// \brief Stores the lower 64 bits of a 128-bit vector of [4 x float] to a
1880 /// memory location.
1882 /// \headerfile <x86intrin.h>
1884 /// This intrinsic corresponds to the \c VMOVLPS / MOVLPS instruction.
1887 /// A pointer to a memory location that will receive the float values.
1889 /// A 128-bit vector of [4 x float] containing the values to be stored.
1890 static __inline__
void __DEFAULT_FN_ATTRS
1891 _mm_storel_pi(__m64
*__p
, __m128 __a
)
1893 __builtin_ia32_storelps((__v2si
*)__p
, (__v4sf
)__a
);
1896 /// \brief Stores the lower 32 bits of a 128-bit vector of [4 x float] to a
1897 /// memory location.
1899 /// \headerfile <x86intrin.h>
1901 /// This intrinsic corresponds to the \c VMOVSS / MOVSS instruction.
1904 /// A pointer to a 32-bit memory location.
1906 /// A 128-bit vector of [4 x float] containing the value to be stored.
1907 static __inline__
void __DEFAULT_FN_ATTRS
1908 _mm_store_ss(float *__p
, __m128 __a
)
1910 struct __mm_store_ss_struct
{
1912 } __attribute__((__packed__
, __may_alias__
));
1913 ((struct __mm_store_ss_struct
*)__p
)->__u
= __a
[0];
1916 /// \brief Stores float values from a 128-bit vector of [4 x float] to an
1917 /// unaligned memory location.
1919 /// \headerfile <x86intrin.h>
1921 /// This intrinsic corresponds to the \c VMOVUPS / MOVUPS instruction.
1924 /// A pointer to a 128-bit memory location. The address of the memory
1925 /// location does not have to be aligned.
1927 /// A 128-bit vector of [4 x float] containing the values to be stored.
1928 static __inline__
void __DEFAULT_FN_ATTRS
1929 _mm_storeu_ps(float *__p
, __m128 __a
)
1931 struct __storeu_ps
{
1933 } __attribute__((__packed__
, __may_alias__
));
1934 ((struct __storeu_ps
*)__p
)->__v
= __a
;
1937 /// \brief Stores the lower 32 bits of a 128-bit vector of [4 x float] into
1938 /// four contiguous elements in an aligned memory location.
1940 /// \headerfile <x86intrin.h>
1942 /// This intrinsic corresponds to \c VMOVAPS / MOVAPS + \c shuffling
1946 /// A pointer to a 128-bit memory location.
1948 /// A 128-bit vector of [4 x float] whose lower 32 bits are stored to each
1949 /// of the four contiguous elements pointed by __p.
1950 static __inline__
void __DEFAULT_FN_ATTRS
1951 _mm_store_ps(float *__p
, __m128 __a
)
1953 *(__m128
*)__p
= __a
;
1956 /// \brief Stores the lower 32 bits of a 128-bit vector of [4 x float] into
1957 /// four contiguous elements in an aligned memory location.
1959 /// \headerfile <x86intrin.h>
1961 /// This intrinsic corresponds to \c VMOVAPS / MOVAPS + \c shuffling
1965 /// A pointer to a 128-bit memory location.
1967 /// A 128-bit vector of [4 x float] whose lower 32 bits are stored to each
1968 /// of the four contiguous elements pointed by __p.
1969 static __inline__
void __DEFAULT_FN_ATTRS
1970 _mm_store1_ps(float *__p
, __m128 __a
)
1972 __a
= __builtin_shufflevector((__v4sf
)__a
, (__v4sf
)__a
, 0, 0, 0, 0);
1973 _mm_store_ps(__p
, __a
);
1976 /// \brief Stores float values from a 128-bit vector of [4 x float] to an
1977 /// aligned memory location.
1979 /// \headerfile <x86intrin.h>
1981 /// This intrinsic corresponds to the \c VMOVAPS / MOVAPS instruction.
1984 /// A pointer to a 128-bit memory location. The address of the memory
1985 /// location has to be 128-bit aligned.
1987 /// A 128-bit vector of [4 x float] containing the values to be stored.
1988 static __inline__
void __DEFAULT_FN_ATTRS
1989 _mm_store_ps1(float *__p
, __m128 __a
)
1991 return _mm_store1_ps(__p
, __a
);
1994 /// \brief Stores float values from a 128-bit vector of [4 x float] to an
1995 /// aligned memory location in reverse order.
1997 /// \headerfile <x86intrin.h>
1999 /// This intrinsic corresponds to the \c VMOVAPS / MOVAPS + \c shuffling
2003 /// A pointer to a 128-bit memory location. The address of the memory
2004 /// location has to be 128-bit aligned.
2006 /// A 128-bit vector of [4 x float] containing the values to be stored.
2007 static __inline__
void __DEFAULT_FN_ATTRS
2008 _mm_storer_ps(float *__p
, __m128 __a
)
2010 __a
= __builtin_shufflevector((__v4sf
)__a
, (__v4sf
)__a
, 3, 2, 1, 0);
2011 _mm_store_ps(__p
, __a
);
/* Locality hints accepted as the second argument of _mm_prefetch(), listed in
   ascending numeric order. */
#define _MM_HINT_NTA 0
#define _MM_HINT_T2  1
#define _MM_HINT_T1  2
#define _MM_HINT_T0  3

/* FIXME: We have to #define this because "sel" must be a constant integer, and
   Sema doesn't do any form of constant propagation yet. */

/// \brief Loads one cache line of data from the specified address to a location
///    closer to the processor.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// void _mm_prefetch(const void * a, const int sel);
/// \endcode
///
/// This intrinsic corresponds to the \c PREFETCHNTA, \c PREFETCHT0,
///    \c PREFETCHT1 or \c PREFETCHT2 instruction, depending on \a sel.
///
/// \param a
///    A pointer to a memory location containing a cache line of data.
/// \param sel
///    A predefined integer constant specifying the type of prefetch operation:
///    _MM_HINT_NTA: Move data using the non-temporal access (NTA) hint.
///    The PREFETCHNTA instruction will be generated.
///    _MM_HINT_T0: Move data using the T0 hint. The PREFETCHT0 instruction will
///    be generated.
///    _MM_HINT_T1: Move data using the T1 hint. The PREFETCHT1 instruction will
///    be generated.
///    _MM_HINT_T2: Move data using the T2 hint. The PREFETCHT2 instruction will
///    be generated.
/* The cast keeps the const qualifier so that prefetching through a pointer to
   const data does not discard it. The second builtin argument (0) requests a
   prefetch for reading. */
#define _mm_prefetch(a, sel) \
  (__builtin_prefetch((const void *)(a), 0, (sel)))

2049 /// \brief Stores a 64-bit integer in the specified aligned memory location. To
2050 /// minimize caching, the data is flagged as non-temporal (unlikely to be
2051 /// used again soon).
2053 /// \headerfile <x86intrin.h>
2055 /// This intrinsic corresponds to the \c MOVNTQ instruction.
2058 /// A pointer to an aligned memory location used to store the register value.
2060 /// A 64-bit integer containing the value to be stored.
2061 static __inline__
void __DEFAULT_FN_ATTRS
2062 _mm_stream_pi(__m64
*__p
, __m64 __a
)
2064 __builtin_ia32_movntq(__p
, __a
);
2067 /// \brief Moves packed float values from a 128-bit vector of [4 x float] to a
2068 /// 128-bit aligned memory location. To minimize caching, the data is flagged
2069 /// as non-temporal (unlikely to be used again soon).
2071 /// \headerfile <x86intrin.h>
2073 /// This intrinsic corresponds to the \c VMOVNTPS / MOVNTPS instruction.
2076 /// A pointer to a 128-bit aligned memory location that will receive the
2079 /// A 128-bit vector of [4 x float] containing the values to be moved.
2080 static __inline__
void __DEFAULT_FN_ATTRS
2081 _mm_stream_ps(float *__p
, __m128 __a
)
2083 __builtin_nontemporal_store((__v4sf
)__a
, (__v4sf
*)__p
);
2086 /// \brief Forces strong memory ordering (serialization) between store
2087 /// instructions preceding this instruction and store instructions following
2088 /// this instruction, ensuring the system completes all previous stores
2089 /// before executing subsequent stores.
2091 /// \headerfile <x86intrin.h>
2093 /// This intrinsic corresponds to the \c SFENCE instruction.
2095 static __inline__
void __DEFAULT_FN_ATTRS
2098 __builtin_ia32_sfence();
/// \brief Extracts 16-bit element from a 64-bit vector of [4 x i16] and
///    returns it, as specified by the immediate integer operand.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c VPEXTRW / PEXTRW instruction.
///
/// \param a
///    A 64-bit vector of [4 x i16].
/// \param n
///    An immediate integer operand that determines which bits are extracted:
///    0: Bits [15:0] are copied to the destination.
///    1: Bits [31:16] are copied to the destination.
///    2: Bits [47:32] are copied to the destination.
///    3: Bits [63:48] are copied to the destination.
/// \returns A 16-bit integer containing the extracted 16 bits of packed data.
/* Both arguments are parenthesized so that expressions such as `x + y` or
   `c ? 1 : 2` are cast as a whole rather than having the cast bind to their
   first operand only. */
#define _mm_extract_pi16(a, n) __extension__ ({ \
  (int)__builtin_ia32_vec_ext_v4hi((__v4hi)(__m64)(a), (int)(n)); })

/// \brief Copies data from the 64-bit vector of [4 x i16] to the destination,
///    and inserts the lower 16-bits of an integer operand at the 16-bit offset
///    specified by the immediate operand \a n.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c VPINSRW / PINSRW instruction.
///
/// \param a
///    A 64-bit vector of [4 x i16].
/// \param d
///    An integer. The lower 16-bit value from this operand is written to the
///    destination at the offset specified by operand \a n.
/// \param n
///    An immediate integer operand that determines which bits of the
///    destination receive the value from \a d:
///    0: Bits [15:0] are copied to the destination.
///    1: Bits [31:16] are copied to the destination.
///    2: Bits [47:32] are copied to the destination.
///    3: Bits [63:48] are copied to the destination.
///    The remaining bits in the destination are copied from the corresponding
///    bits in operand \a a.
/// \returns A 64-bit integer vector containing the copied packed data from the
///    operands.
/* Every argument is parenthesized so that compound expressions are cast as a
   whole rather than having the cast bind to their first operand only. */
#define _mm_insert_pi16(a, d, n) __extension__ ({ \
  (__m64)__builtin_ia32_vec_set_v4hi((__v4hi)(__m64)(a), (int)(d), \
                                     (int)(n)); })

2147 /// \brief Compares each of the corresponding packed 16-bit integer values of
2148 /// the 64-bit integer vectors, and writes the greater value to the
2149 /// corresponding bits in the destination.
2151 /// \headerfile <x86intrin.h>
2153 /// This intrinsic corresponds to the \c PMAXSW instruction.
2156 /// A 64-bit integer vector containing one of the source operands.
2158 /// A 64-bit integer vector containing one of the source operands.
2159 /// \returns A 64-bit integer vector containing the comparison results.
2160 static __inline__ __m64 __DEFAULT_FN_ATTRS
2161 _mm_max_pi16(__m64 __a
, __m64 __b
)
2163 return (__m64
)__builtin_ia32_pmaxsw((__v4hi
)__a
, (__v4hi
)__b
);
2166 /// \brief Compares each of the corresponding packed 8-bit unsigned integer
2167 /// values of the 64-bit integer vectors, and writes the greater value to the
2168 /// corresponding bits in the destination.
2170 /// \headerfile <x86intrin.h>
2172 /// This intrinsic corresponds to the \c PMAXUB instruction.
2175 /// A 64-bit integer vector containing one of the source operands.
2177 /// A 64-bit integer vector containing one of the source operands.
2178 /// \returns A 64-bit integer vector containing the comparison results.
2179 static __inline__ __m64 __DEFAULT_FN_ATTRS
2180 _mm_max_pu8(__m64 __a
, __m64 __b
)
2182 return (__m64
)__builtin_ia32_pmaxub((__v8qi
)__a
, (__v8qi
)__b
);
2185 /// \brief Compares each of the corresponding packed 16-bit integer values of
2186 /// the 64-bit integer vectors, and writes the lesser value to the
2187 /// corresponding bits in the destination.
2189 /// \headerfile <x86intrin.h>
2191 /// This intrinsic corresponds to the \c PMINSW instruction.
2194 /// A 64-bit integer vector containing one of the source operands.
2196 /// A 64-bit integer vector containing one of the source operands.
2197 /// \returns A 64-bit integer vector containing the comparison results.
2198 static __inline__ __m64 __DEFAULT_FN_ATTRS
2199 _mm_min_pi16(__m64 __a
, __m64 __b
)
2201 return (__m64
)__builtin_ia32_pminsw((__v4hi
)__a
, (__v4hi
)__b
);
2204 /// \brief Compares each of the corresponding packed 8-bit unsigned integer
2205 /// values of the 64-bit integer vectors, and writes the lesser value to the
2206 /// corresponding bits in the destination.
2208 /// \headerfile <x86intrin.h>
2210 /// This intrinsic corresponds to the \c PMINUB instruction.
2213 /// A 64-bit integer vector containing one of the source operands.
2215 /// A 64-bit integer vector containing one of the source operands.
2216 /// \returns A 64-bit integer vector containing the comparison results.
2217 static __inline__ __m64 __DEFAULT_FN_ATTRS
2218 _mm_min_pu8(__m64 __a
, __m64 __b
)
2220 return (__m64
)__builtin_ia32_pminub((__v8qi
)__a
, (__v8qi
)__b
);
2223 /// \brief Takes the most significant bit from each 8-bit element in a 64-bit
2224 /// integer vector to create a 16-bit mask value. Zero-extends the value to
2225 /// 32-bit integer and writes it to the destination.
2227 /// \headerfile <x86intrin.h>
2229 /// This intrinsic corresponds to the \c PMOVMSKB instruction.
2232 /// A 64-bit integer vector containing the values with bits to be extracted.
2233 /// \returns The most significant bit from each 8-bit element in the operand,
2234 /// written to bits [15:0].
2235 static __inline__
int __DEFAULT_FN_ATTRS
2236 _mm_movemask_pi8(__m64 __a
)
2238 return __builtin_ia32_pmovmskb((__v8qi
)__a
);
2241 /// \brief Multiplies packed 16-bit unsigned integer values and writes the
2242 /// high-order 16 bits of each 32-bit product to the corresponding bits in
2243 /// the destination.
2245 /// \headerfile <x86intrin.h>
2247 /// This intrinsic corresponds to the \c PMULHUW instruction.
2250 /// A 64-bit integer vector containing one of the source operands.
2252 /// A 64-bit integer vector containing one of the source operands.
2253 /// \returns A 64-bit integer vector containing the products of both operands.
2254 static __inline__ __m64 __DEFAULT_FN_ATTRS
2255 _mm_mulhi_pu16(__m64 __a
, __m64 __b
)
2257 return (__m64
)__builtin_ia32_pmulhuw((__v4hi
)__a
, (__v4hi
)__b
);
/// \brief Shuffles the 4 16-bit integers from a 64-bit integer vector to the
///    destination, as specified by the immediate value operand.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c PSHUFW instruction.
///
/// \code
/// __m64 _mm_shuffle_pi16(__m64 a, const int n);
/// \endcode
///
/// \param a
///    A 64-bit integer vector containing the values to be shuffled.
/// \param n
///    An immediate value containing an 8-bit value specifying which elements to
///    copy from a. The destinations within the 64-bit destination are assigned
///    values as follows:
///    Bits [1:0] are used to assign values to bits [15:0] in the destination.
///    Bits [3:2] are used to assign values to bits [31:16] in the destination.
///    Bits [5:4] are used to assign values to bits [47:32] in the destination.
///    Bits [7:6] are used to assign values to bits [63:48] in the destination.
///    Bit value assignments:
///    00: assigned from bits [15:0] of a.
///    01: assigned from bits [31:16] of a.
///    10: assigned from bits [47:32] of a.
///    11: assigned from bits [63:48] of a.
/// \returns A 64-bit integer vector containing the shuffled values.
#define _mm_shuffle_pi16(a, n)                                \
  __extension__ ({                                            \
    (__m64)__builtin_ia32_pshufw((__v4hi)(__m64)(a), (n));    \
  })

2290 /// \brief Conditionally copies the values from each 8-bit element in the first
2291 /// 64-bit integer vector operand to the specified memory location, as
2292 /// specified by the most significant bit in the corresponding element in the
2293 /// second 64-bit integer vector operand. To minimize caching, the data is
2294 /// flagged as non-temporal (unlikely to be used again soon).
2296 /// \headerfile <x86intrin.h>
2298 /// This intrinsic corresponds to the \c MASKMOVQ instruction.
2301 /// A 64-bit integer vector containing the values with elements to be copied.
2303 /// A 64-bit integer vector operand. The most significant bit from each 8-bit
2304 /// element determines whether the corresponding element in operand __d is
2305 /// copied. If the most significant bit of a given element is 1, the
2306 /// corresponding element in operand __d is copied.
2308 /// A pointer to a 64-bit memory location that will receive the conditionally
2309 /// copied integer values. The address of the memory location does not have
2311 static __inline__
void __DEFAULT_FN_ATTRS
2312 _mm_maskmove_si64(__m64 __d
, __m64 __n
, char *__p
)
2314 __builtin_ia32_maskmovq((__v8qi
)__d
, (__v8qi
)__n
, __p
);
2317 /// \brief Computes the rounded averages of the packed unsigned 8-bit integer
2318 /// values and writes the averages to the corresponding bits in the
2321 /// \headerfile <x86intrin.h>
2323 /// This intrinsic corresponds to the \c PAVGB instruction.
2326 /// A 64-bit integer vector containing one of the source operands.
2328 /// A 64-bit integer vector containing one of the source operands.
2329 /// \returns A 64-bit integer vector containing the averages of both operands.
2330 static __inline__ __m64 __DEFAULT_FN_ATTRS
2331 _mm_avg_pu8(__m64 __a
, __m64 __b
)
2333 return (__m64
)__builtin_ia32_pavgb((__v8qi
)__a
, (__v8qi
)__b
);
2336 /// \brief Computes the rounded averages of the packed unsigned 16-bit integer
2337 /// values and writes the averages to the corresponding bits in the
2340 /// \headerfile <x86intrin.h>
2342 /// This intrinsic corresponds to the \c PAVGW instruction.
2345 /// A 64-bit integer vector containing one of the source operands.
2347 /// A 64-bit integer vector containing one of the source operands.
2348 /// \returns A 64-bit integer vector containing the averages of both operands.
2349 static __inline__ __m64 __DEFAULT_FN_ATTRS
2350 _mm_avg_pu16(__m64 __a
, __m64 __b
)
2352 return (__m64
)__builtin_ia32_pavgw((__v4hi
)__a
, (__v4hi
)__b
);
2355 /// \brief Subtracts the corresponding 8-bit unsigned integer values of the two
2356 /// 64-bit vector operands and computes the absolute value for each of the
2357 /// difference. Then sum of the 8 absolute differences is written to the
2358 /// bits [15:0] of the destination; the remaining bits [63:16] are cleared.
2360 /// \headerfile <x86intrin.h>
2362 /// This intrinsic corresponds to the \c PSADBW instruction.
2365 /// A 64-bit integer vector containing one of the source operands.
2367 /// A 64-bit integer vector containing one of the source operands.
2368 /// \returns A 64-bit integer vector whose lower 16 bits contain the sums of the
2369 /// sets of absolute differences between both operands. The upper bits are
2371 static __inline__ __m64 __DEFAULT_FN_ATTRS
2372 _mm_sad_pu8(__m64 __a
, __m64 __b
)
2374 return (__m64
)__builtin_ia32_psadbw((__v8qi
)__a
, (__v8qi
)__b
);
2377 /// \brief Returns the contents of the MXCSR register as a 32-bit unsigned
2378 /// integer value. There are several groups of macros associated with this
2379 /// intrinsic, including:
2380 /// * For checking exception states: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO,
2381 /// _MM_EXCEPT_DENORM, _MM_EXCEPT_OVERFLOW, _MM_EXCEPT_UNDERFLOW,
2382 /// _MM_EXCEPT_INEXACT. There is a convenience wrapper
2383 /// _MM_GET_EXCEPTION_STATE().
2384 /// * For checking exception masks: _MM_MASK_UNDERFLOW, _MM_MASK_OVERFLOW,
2385 /// _MM_MASK_INVALID, _MM_MASK_DENORM, _MM_MASK_DIV_ZERO, _MM_MASK_INEXACT.
2386 /// There is a convenience wrapper _MM_GET_EXCEPTION_MASK().
2387 /// * For checking rounding modes: _MM_ROUND_NEAREST, _MM_ROUND_DOWN,
2388 /// _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO. There is a convenience wrapper
2389 /// _MM_GET_ROUNDING_MODE(x) where x is one of these macros.
2390 /// * For checking flush-to-zero mode: _MM_FLUSH_ZERO_ON, _MM_FLUSH_ZERO_OFF.
2391 /// There is a convenience wrapper _MM_GET_FLUSH_ZERO_MODE().
2392 /// * For checking denormals-are-zero mode: _MM_DENORMALS_ZERO_ON,
2393 /// _MM_DENORMALS_ZERO_OFF. There is a convenience wrapper
2394 /// _MM_GET_DENORMALS_ZERO_MODE().
2396 /// For example, the expression below checks if an overflow exception has
2398 /// ( _mm_getcsr() & _MM_EXCEPT_OVERFLOW )
2400 /// The following example gets the current rounding mode:
2401 /// _MM_GET_ROUNDING_MODE()
2403 /// \headerfile <x86intrin.h>
2405 /// This intrinsic corresponds to the \c VSTMXCSR / STMXCSR instruction.
2407 /// \returns A 32-bit unsigned integer containing the contents of the MXCSR
2409 static __inline__
unsigned int __DEFAULT_FN_ATTRS
2412 return __builtin_ia32_stmxcsr();
2415 /// \brief Sets the MXCSR register with the 32-bit unsigned integer value. There
2416 /// are several groups of macros associated with this intrinsic, including:
2417 /// * For setting exception states: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO,
2418 /// _MM_EXCEPT_DENORM, _MM_EXCEPT_OVERFLOW, _MM_EXCEPT_UNDERFLOW,
2419 /// _MM_EXCEPT_INEXACT. There is a convenience wrapper
2420 /// _MM_SET_EXCEPTION_STATE(x) where x is one of these macros.
2421 /// * For setting exception masks: _MM_MASK_UNDERFLOW, _MM_MASK_OVERFLOW,
2422 /// _MM_MASK_INVALID, _MM_MASK_DENORM, _MM_MASK_DIV_ZERO, _MM_MASK_INEXACT.
2423 /// There is a convenience wrapper _MM_SET_EXCEPTION_MASK(x) where x is one
2424 /// of these macros.
2425 /// * For setting rounding modes: _MM_ROUND_NEAREST, _MM_ROUND_DOWN,
2426 /// _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO. There is a convenience wrapper
2427 /// _MM_SET_ROUNDING_MODE(x) where x is one of these macros.
2428 /// * For setting flush-to-zero mode: _MM_FLUSH_ZERO_ON, _MM_FLUSH_ZERO_OFF.
2429 /// There is a convenience wrapper _MM_SET_FLUSH_ZERO_MODE(x) where x is
2430 /// one of these macros.
2431 /// * For setting denormals-are-zero mode: _MM_DENORMALS_ZERO_ON,
2432 /// _MM_DENORMALS_ZERO_OFF. There is a convenience wrapper
2433 /// _MM_SET_DENORMALS_ZERO_MODE(x) where x is one of these macros.
2435 /// For example, the following expression causes subsequent floating-point
2436 /// operations to round up:
2437 /// _mm_setcsr(_mm_getcsr() | _MM_ROUND_UP)
2439 /// The following example sets the DAZ and FTZ flags:
2440 /// void setFlags() {
2441 /// _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON)
2442 /// _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON)
2445 /// \headerfile <x86intrin.h>
2447 /// This intrinsic corresponds to the \c VLDMXCSR / LDMXCSR instruction.
2450 /// A 32-bit unsigned integer value to be written to the MXCSR register.
2451 static __inline__
void __DEFAULT_FN_ATTRS
2452 _mm_setcsr(unsigned int __i
)
2454 __builtin_ia32_ldmxcsr(__i
);
/// \brief Selects 4 float values from the 128-bit operands of [4 x float], as
///    specified by the immediate value operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_shuffle_ps(__m128 a, __m128 b, const int mask);
/// \endcode
///
/// This intrinsic corresponds to the \c VSHUFPS / SHUFPS instruction.
///
/// \param a
///    A 128-bit vector of [4 x float].
/// \param b
///    A 128-bit vector of [4 x float].
/// \param mask
///    An immediate value containing an 8-bit value specifying which elements to
///    copy from a and b.
///    Bits [3:0] specify the values copied from operand a.
///    Bits [7:4] specify the values copied from operand b. The destinations
///    within the 128-bit destination are assigned values as follows:
///    Bits [1:0] are used to assign values to bits [31:0] in the destination.
///    Bits [3:2] are used to assign values to bits [63:32] in the destination.
///    Bits [5:4] are used to assign values to bits [95:64] in the destination.
///    Bits [7:6] are used to assign values to bits [127:96] in the destination.
///    Bit value assignments:
///    00: Bits [31:0] copied from the specified operand.
///    01: Bits [63:32] copied from the specified operand.
///    10: Bits [95:64] copied from the specified operand.
///    11: Bits [127:96] copied from the specified operand.
/// \returns A 128-bit vector of [4 x float] containing the shuffled values.
/* Shuffle indices 0-3 select from a and 4-7 select from b, hence the "+ 4" on
   the two upper result elements. */
#define _mm_shuffle_ps(a, b, mask)                                        \
  __extension__ ({                                                        \
    (__m128)__builtin_shufflevector((__v4sf)(__m128)(a),                  \
                                    (__v4sf)(__m128)(b),                  \
                                    ((mask) & 0x3),                       \
                                    (((mask) >> 2) & 0x3),                \
                                    (((mask) >> 4) & 0x3) + 4,            \
                                    (((mask) >> 6) & 0x3) + 4);           \
  })

2495 /// \brief Unpacks the high-order (index 2,3) values from two 128-bit vectors of
2496 /// [4 x float] and interleaves them into a 128-bit vector of [4 x
2499 /// \headerfile <x86intrin.h>
2501 /// This intrinsic corresponds to the \c VUNPCKHPS / UNPCKHPS instruction.
2504 /// A 128-bit vector of [4 x float].
2505 /// Bits [95:64] are written to bits [31:0] of the destination.
2506 /// Bits [127:96] are written to bits [95:64] of the destination.
2508 /// A 128-bit vector of [4 x float].
2509 /// Bits [95:64] are written to bits [63:32] of the destination.
2510 /// Bits [127:96] are written to bits [127:96] of the destination.
2511 /// \returns A 128-bit vector of [4 x float] containing the interleaved values.
2512 static __inline__ __m128 __DEFAULT_FN_ATTRS
2513 _mm_unpackhi_ps(__m128 __a
, __m128 __b
)
2515 return __builtin_shufflevector((__v4sf
)__a
, (__v4sf
)__b
, 2, 6, 3, 7);
2518 /// \brief Unpacks the low-order (index 0,1) values from two 128-bit vectors of
2519 /// [4 x float] and interleaves them into a 128-bit vector of [4 x
2522 /// \headerfile <x86intrin.h>
2524 /// This intrinsic corresponds to the \c VUNPCKLPS / UNPCKLPS instruction.
2527 /// A 128-bit vector of [4 x float].
2528 /// Bits [31:0] are written to bits [31:0] of the destination.
2529 /// Bits [63:32] are written to bits [95:64] of the destination.
2531 /// A 128-bit vector of [4 x float].
2532 /// Bits [31:0] are written to bits [63:32] of the destination.
2533 /// Bits [63:32] are written to bits [127:96] of the destination.
2534 /// \returns A 128-bit vector of [4 x float] containing the interleaved values.
2535 static __inline__ __m128 __DEFAULT_FN_ATTRS
2536 _mm_unpacklo_ps(__m128 __a
, __m128 __b
)
2538 return __builtin_shufflevector((__v4sf
)__a
, (__v4sf
)__b
, 0, 4, 1, 5);
2541 /// \brief Constructs a 128-bit floating-point vector of [4 x float]. The lower
2542 /// 32 bits are set to the lower 32 bits of the second parameter. The upper
2543 /// 96 bits are set to the upper 96 bits of the first parameter.
2545 /// \headerfile <x86intrin.h>
2547 /// This intrinsic corresponds to the \c VMOVSS / MOVSS instruction.
2550 /// A 128-bit floating-point vector of [4 x float]. The upper 96 bits are
2551 /// written to the upper 96 bits of the result.
2553 /// A 128-bit floating-point vector of [4 x float]. The lower 32 bits are
2554 /// written to the lower 32 bits of the result.
2555 /// \returns A 128-bit floating-point vector of [4 x float].
2556 static __inline__ __m128 __DEFAULT_FN_ATTRS
2557 _mm_move_ss(__m128 __a
, __m128 __b
)
2559 return __builtin_shufflevector((__v4sf
)__a
, (__v4sf
)__b
, 4, 1, 2, 3);
2562 /// \brief Constructs a 128-bit floating-point vector of [4 x float]. The lower
2563 /// 64 bits are set to the upper 64 bits of the second parameter. The upper
2564 /// 64 bits are set to the upper 64 bits of the first parameter.
2566 /// \headerfile <x86intrin.h>
2568 /// This intrinsic corresponds to the \c VUNPCKHPD / UNPCKHPD instruction.
2571 /// A 128-bit floating-point vector of [4 x float]. The upper 64 bits are
2572 /// written to the upper 64 bits of the result.
2574 /// A 128-bit floating-point vector of [4 x float]. The upper 64 bits are
2575 /// written to the lower 64 bits of the result.
2576 /// \returns A 128-bit floating-point vector of [4 x float].
2577 static __inline__ __m128 __DEFAULT_FN_ATTRS
2578 _mm_movehl_ps(__m128 __a
, __m128 __b
)
2580 return __builtin_shufflevector((__v4sf
)__a
, (__v4sf
)__b
, 6, 7, 2, 3);
2583 /// \brief Constructs a 128-bit floating-point vector of [4 x float]. The lower
2584 /// 64 bits are set to the lower 64 bits of the first parameter. The upper
2585 /// 64 bits are set to the lower 64 bits of the second parameter.
2587 /// \headerfile <x86intrin.h>
2589 /// This intrinsic corresponds to the \c VUNPCKLPD / UNPCKLPD instruction.
2592 /// A 128-bit floating-point vector of [4 x float]. The lower 64 bits are
2593 /// written to the lower 64 bits of the result.
2595 /// A 128-bit floating-point vector of [4 x float]. The lower 64 bits are
2596 /// written to the upper 64 bits of the result.
2597 /// \returns A 128-bit floating-point vector of [4 x float].
2598 static __inline__ __m128 __DEFAULT_FN_ATTRS
2599 _mm_movelh_ps(__m128 __a
, __m128 __b
)
2601 return __builtin_shufflevector((__v4sf
)__a
, (__v4sf
)__b
, 0, 1, 4, 5);
2604 /// \brief Converts a 64-bit vector of [4 x i16] into a 128-bit vector of [4 x
2607 /// \headerfile <x86intrin.h>
2609 /// This intrinsic corresponds to the \c CVTPI2PS + \c COMPOSITE instruction.
2612 /// A 64-bit vector of [4 x i16]. The elements of the destination are copied
2613 /// from the corresponding elements in this operand.
2614 /// \returns A 128-bit vector of [4 x float] containing the copied and converted
2615 /// values from the operand.
2616 static __inline__ __m128 __DEFAULT_FN_ATTRS
2617 _mm_cvtpi16_ps(__m64 __a
)
2622 __b
= _mm_setzero_si64();
2623 __b
= _mm_cmpgt_pi16(__b
, __a
);
2624 __c
= _mm_unpackhi_pi16(__a
, __b
);
2625 __r
= _mm_setzero_ps();
2626 __r
= _mm_cvtpi32_ps(__r
, __c
);
2627 __r
= _mm_movelh_ps(__r
, __r
);
2628 __c
= _mm_unpacklo_pi16(__a
, __b
);
2629 __r
= _mm_cvtpi32_ps(__r
, __c
);
2634 /// \brief Converts a 64-bit vector of 16-bit unsigned integer values into a
2635 /// 128-bit vector of [4 x float].
2637 /// \headerfile <x86intrin.h>
2639 /// This intrinsic corresponds to the \c CVTPI2PS + \c COMPOSITE instruction.
2642 /// A 64-bit vector of 16-bit unsigned integer values. The elements of the
2643 /// destination are copied from the corresponding elements in this operand.
2644 /// \returns A 128-bit vector of [4 x float] containing the copied and converted
2645 /// values from the operand.
2646 static __inline__ __m128 __DEFAULT_FN_ATTRS
2647 _mm_cvtpu16_ps(__m64 __a
)
2652 __b
= _mm_setzero_si64();
2653 __c
= _mm_unpackhi_pi16(__a
, __b
);
2654 __r
= _mm_setzero_ps();
2655 __r
= _mm_cvtpi32_ps(__r
, __c
);
2656 __r
= _mm_movelh_ps(__r
, __r
);
2657 __c
= _mm_unpacklo_pi16(__a
, __b
);
2658 __r
= _mm_cvtpi32_ps(__r
, __c
);
2663 /// \brief Converts the lower four 8-bit values from a 64-bit vector of [8 x i8]
2664 /// into a 128-bit vector of [4 x float].
2666 /// \headerfile <x86intrin.h>
2668 /// This intrinsic corresponds to the \c CVTPI2PS + \c COMPOSITE instruction.
2671 /// A 64-bit vector of [8 x i8]. The elements of the destination are copied
2672 /// from the corresponding lower 4 elements in this operand.
2673 /// \returns A 128-bit vector of [4 x float] containing the copied and converted
2674 /// values from the operand.
2675 static __inline__ __m128 __DEFAULT_FN_ATTRS
2676 _mm_cvtpi8_ps(__m64 __a
)
2680 __b
= _mm_setzero_si64();
2681 __b
= _mm_cmpgt_pi8(__b
, __a
);
2682 __b
= _mm_unpacklo_pi8(__a
, __b
);
2684 return _mm_cvtpi16_ps(__b
);
2687 /// \brief Converts the lower four unsigned 8-bit integer values from a 64-bit
2688 /// vector of [8 x u8] into a 128-bit vector of [4 x float].
2690 /// \headerfile <x86intrin.h>
2692 /// This intrinsic corresponds to the \c CVTPI2PS + \c COMPOSITE instruction.
2695 /// A 64-bit vector of unsigned 8-bit integer values. The elements of the
2696 /// destination are copied from the corresponding lower 4 elements in this
2698 /// \returns A 128-bit vector of [4 x float] containing the copied and converted
2699 /// values from the source operand.
2700 static __inline__ __m128 __DEFAULT_FN_ATTRS
2701 _mm_cvtpu8_ps(__m64 __a
)
2705 __b
= _mm_setzero_si64();
2706 __b
= _mm_unpacklo_pi8(__a
, __b
);
2708 return _mm_cvtpi16_ps(__b
);
2711 /// \brief Converts the two 32-bit signed integer values from each 64-bit vector
2712 /// operand of [2 x i32] into a 128-bit vector of [4 x float].
2714 /// \headerfile <x86intrin.h>
2716 /// This intrinsic corresponds to the \c CVTPI2PS + \c COMPOSITE instruction.
2719 /// A 64-bit vector of [2 x i32]. The lower elements of the destination are
2720 /// copied from the elements in this operand.
2722 /// A 64-bit vector of [2 x i32]. The upper elements of the destination are
2723 /// copied from the elements in this operand.
2724 /// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
2725 /// copied and converted values from the first operand. The upper 64 bits
2726 /// contain the copied and converted values from the second operand.
2727 static __inline__ __m128 __DEFAULT_FN_ATTRS
2728 _mm_cvtpi32x2_ps(__m64 __a
, __m64 __b
)
2732 __c
= _mm_setzero_ps();
2733 __c
= _mm_cvtpi32_ps(__c
, __b
);
2734 __c
= _mm_movelh_ps(__c
, __c
);
2736 return _mm_cvtpi32_ps(__c
, __a
);
2739 /// \brief Converts each single-precision floating-point element of a 128-bit
2740 /// floating-point vector of [4 x float] into a 16-bit signed integer, and
2741 /// packs the results into a 64-bit integer vector of [4 x i16]. If the
2742 /// floating-point element is NaN or infinity, or if the floating-point
2743 /// element is greater than 0x7FFFFFFF or less than -0x8000, it is converted
2744 /// to 0x8000. Otherwise if the floating-point element is greater
2745 /// than 0x7FFF, it is converted to 0x7FFF.
2747 /// \headerfile <x86intrin.h>
2749 /// This intrinsic corresponds to the \c CVTPS2PI + \c COMPOSITE instruction.
2752 /// A 128-bit floating-point vector of [4 x float].
2753 /// \returns A 64-bit integer vector of [4 x i16] containing the converted
2755 static __inline__ __m64 __DEFAULT_FN_ATTRS
2756 _mm_cvtps_pi16(__m128 __a
)
2760 __b
= _mm_cvtps_pi32(__a
);
2761 __a
= _mm_movehl_ps(__a
, __a
);
2762 __c
= _mm_cvtps_pi32(__a
);
2764 return _mm_packs_pi32(__b
, __c
);
2767 /// \brief Converts each single-precision floating-point element of a 128-bit
2768 /// floating-point vector of [4 x float] into an 8-bit signed integer, and
2769 /// packs the results into the lower 32 bits of a 64-bit integer vector of
2770 /// [8 x i8]. The upper 32 bits of the vector are set to 0. If the
2771 /// floating-point element is NaN or infinity, or if the floating-point
2772 /// element is greater than 0x7FFFFFFF or less than -0x80, it is converted
2773 /// to 0x80. Otherwise if the floating-point element is greater
2774 /// than 0x7F, it is converted to 0x7F.
2776 /// \headerfile <x86intrin.h>
2778 /// This intrinsic corresponds to the \c CVTPS2PI + \c COMPOSITE instruction.
2781 /// 128-bit floating-point vector of [4 x float].
2782 /// \returns A 64-bit integer vector of [8 x i8]. The lower 32 bits contain the
2783 /// converted values and the uppper 32 bits are set to zero.
2784 static __inline__ __m64 __DEFAULT_FN_ATTRS
2785 _mm_cvtps_pi8(__m128 __a
)
2789 __b
= _mm_cvtps_pi16(__a
);
2790 __c
= _mm_setzero_si64();
2792 return _mm_packs_pi16(__b
, __c
);
2795 /// \brief Extracts the sign bits from each single-precision floating-point
2796 /// element of a 128-bit floating-point vector of [4 x float] and returns the
2797 /// sign bits in bits [0:3] of the result. Bits [31:4] of the result are set
2800 /// \headerfile <x86intrin.h>
2802 /// This intrinsic corresponds to the \c VMOVMSKPS / MOVMSKPS instruction.
2805 /// A 128-bit floating-point vector of [4 x float].
2806 /// \returns A 32-bit integer value. Bits [3:0] contain the sign bits from each
2807 /// single-precision floating-point element of the parameter. Bits [31:4] are
2809 static __inline__
int __DEFAULT_FN_ATTRS
2810 _mm_movemask_ps(__m128 __a
)
2812 return __builtin_ia32_movmskps((__v4sf
)__a
);
/* Attribute that requests 16-byte alignment for the object it is applied to. */
2816 #define _MM_ALIGN16 __attribute__((aligned(16)))
/* Packs four 2-bit selectors into a single 8-bit immediate: z goes to bits
 * [7:6], y to bits [5:4], x to bits [3:2] and w to bits [1:0]. The arguments
 * are not masked, so a value larger than 3 spills into the neighbouring
 * field. */
2818 #define _MM_SHUFFLE(z, y, x, w) (((z) << 6) | ((y) << 4) | ((x) << 2) | (w))
/* Exception status flags: bits [5:0] of the value handled by _mm_getcsr() /
 * _mm_setcsr(). _MM_EXCEPT_MASK is the OR of all six flags. */
2820 #define _MM_EXCEPT_INVALID (0x0001)
2821 #define _MM_EXCEPT_DENORM (0x0002)
2822 #define _MM_EXCEPT_DIV_ZERO (0x0004)
2823 #define _MM_EXCEPT_OVERFLOW (0x0008)
2824 #define _MM_EXCEPT_UNDERFLOW (0x0010)
2825 #define _MM_EXCEPT_INEXACT (0x0020)
2826 #define _MM_EXCEPT_MASK (0x003f)
/* Exception mask bits: bits [12:7]. Each value equals the corresponding
 * _MM_EXCEPT_* flag shifted left by 7; _MM_MASK_MASK is the OR of all six. */
2828 #define _MM_MASK_INVALID (0x0080)
2829 #define _MM_MASK_DENORM (0x0100)
2830 #define _MM_MASK_DIV_ZERO (0x0200)
2831 #define _MM_MASK_OVERFLOW (0x0400)
2832 #define _MM_MASK_UNDERFLOW (0x0800)
2833 #define _MM_MASK_INEXACT (0x1000)
2834 #define _MM_MASK_MASK (0x1f80)
/* Rounding-mode field: bits [14:13]. _MM_ROUND_MASK covers both bits and has
 * the same value as _MM_ROUND_TOWARD_ZERO. */
2836 #define _MM_ROUND_NEAREST (0x0000)
2837 #define _MM_ROUND_DOWN (0x2000)
2838 #define _MM_ROUND_UP (0x4000)
2839 #define _MM_ROUND_TOWARD_ZERO (0x6000)
2840 #define _MM_ROUND_MASK (0x6000)
/* Flush-to-zero control: bit 15. */
2842 #define _MM_FLUSH_ZERO_MASK (0x8000)
2843 #define _MM_FLUSH_ZERO_ON (0x8000)
2844 #define _MM_FLUSH_ZERO_OFF (0x0000)
/* Read accessors: each returns one field of _mm_getcsr(), masked but not
 * shifted, so the result can be compared directly with the matching
 * _MM_MASK_*, _MM_EXCEPT_*, _MM_FLUSH_ZERO_* or _MM_ROUND_* constants. */
2846 #define _MM_GET_EXCEPTION_MASK() (_mm_getcsr() & _MM_MASK_MASK)
2847 #define _MM_GET_EXCEPTION_STATE() (_mm_getcsr() & _MM_EXCEPT_MASK)
2848 #define _MM_GET_FLUSH_ZERO_MODE() (_mm_getcsr() & _MM_FLUSH_ZERO_MASK)
2849 #define _MM_GET_ROUNDING_MODE() (_mm_getcsr() & _MM_ROUND_MASK)
/* Write accessors: read-modify-write through _mm_getcsr()/_mm_setcsr(). The
 * target field is cleared and (x) is ORed in without masking, so (x) must
 * contain only bits that belong to that field; any other bits set in (x)
 * would change unrelated settings. */
2851 #define _MM_SET_EXCEPTION_MASK(x) (_mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | (x)))
2852 #define _MM_SET_EXCEPTION_STATE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | (x)))
2853 #define _MM_SET_FLUSH_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | (x)))
2854 #define _MM_SET_ROUNDING_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | (x)))
/* Transposes, in place, the 4x4 matrix of floats whose rows are held in the
 * four __m128 lvalues row0..row3: afterwards rowN holds what was element N of
 * each original row, in row order.
 *
 * The unpack step interleaves pairs of rows (tmp0/tmp1 from rows 0 and 1,
 * tmp2/tmp3 from rows 2 and 3); the movelh/movehl step then joins the
 * matching 64-bit halves into the transposed rows.
 *
 * Each argument is evaluated more than once and is assigned to, so it must be
 * a side-effect-free lvalue, and it must not be named tmp0..tmp3 (those names
 * are declared inside the macro). The do { } while (0) wrapper keeps the
 * temporaries in their own scope and lets the macro be used as a single
 * statement followed by a semicolon. */
#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
do { \
  __m128 tmp3, tmp2, tmp1, tmp0; \
  tmp0 = _mm_unpacklo_ps((row0), (row1)); \
  tmp2 = _mm_unpacklo_ps((row2), (row3)); \
  tmp1 = _mm_unpackhi_ps((row0), (row1)); \
  tmp3 = _mm_unpackhi_ps((row2), (row3)); \
  (row0) = _mm_movelh_ps(tmp0, tmp2); \
  (row1) = _mm_movehl_ps(tmp2, tmp0); \
  (row2) = _mm_movelh_ps(tmp1, tmp3); \
  (row3) = _mm_movehl_ps(tmp3, tmp1); \
} while (0)
2869 /* Aliases for compatibility. */
/* Each _m_* name below is an object-like macro that expands to the _mm_*
 * identifier on its right, so a use of the alias becomes a use of that
 * intrinsic with the same arguments. */
2870 #define _m_pextrw _mm_extract_pi16
2871 #define _m_pinsrw _mm_insert_pi16
2872 #define _m_pmaxsw _mm_max_pi16
2873 #define _m_pmaxub _mm_max_pu8
2874 #define _m_pminsw _mm_min_pi16
2875 #define _m_pminub _mm_min_pu8
2876 #define _m_pmovmskb _mm_movemask_pi8
2877 #define _m_pmulhuw _mm_mulhi_pu16
2878 #define _m_pshufw _mm_shuffle_pi16
2879 #define _m_maskmovq _mm_maskmove_si64
2880 #define _m_pavgb _mm_avg_pu8
2881 #define _m_pavgw _mm_avg_pu16
2882 #define _m_psadbw _mm_sad_pu8
2886 #undef __DEFAULT_FN_ATTRS
2888 /* Ugly hack for backwards-compatibility (compatible with gcc) */
2889 #if defined(__SSE2__) && !__building_module(_Builtin_intrinsics)
2890 #include <emmintrin.h>
2893 #endif /* __XMMINTRIN_H */