1 /* Cell BEA specific SPU intrinsics to PPU/VMX intrinsics
2 Copyright (C) 2007 Free Software Foundation, Inc.
4 This file is free software; you can redistribute it and/or modify it under
5 the terms of the GNU General Public License as published by the Free
6 Software Foundation; either version 2 of the License, or (at your option)
9 This file is distributed in the hope that it will be useful, but WITHOUT
10 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 You should have received a copy of the GNU General Public License
15 along with this file; see the file COPYING. If not, write to the Free
16 Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
19 /* As a special exception, if you include this header file into source files
20 compiled by GCC, this header file does not by itself cause the resulting
21 executable to be covered by the GNU General Public License. This exception
22 does not however invalidate any other reasons why the executable file might be
23 covered by the GNU General Public License. */
31 #include <vec_types.h>
/* Specify a default halt action for spu_hcmpeq and spu_hcmpgt intrinsics.
 * Users can override the action by defining it prior to including this
 * header.
 */
#ifndef SPU_HALT_ACTION
#define SPU_HALT_ACTION         abort()
#endif
/* Specify a default stop action for the spu_stop intrinsic.
 * Users can override the action by defining it prior to including this
 * header.
 */
#ifndef SPU_STOP_ACTION
#define SPU_STOP_ACTION         abort()
#endif
/* Specify a default action for unsupported intrinsics.
 * Users can override the action by defining it prior to including this
 * header.
 */
#ifndef SPU_UNSUPPORTED_ACTION
#define SPU_UNSUPPORTED_ACTION  abort()
#endif
60 /* Casting intrinsics - from scalar to quadword
63 static __inline qword
si_from_uchar(unsigned char c
) {
72 static __inline qword
si_from_char(signed char c
) {
81 static __inline qword
si_from_ushort(unsigned short s
) {
90 static __inline qword
si_from_short(short s
) {
100 static __inline qword
si_from_uint(unsigned int i
) {
109 static __inline qword
si_from_int(int i
) {
118 static __inline qword
si_from_ullong(unsigned long long l
) {
121 unsigned long long l
[2];
127 static __inline qword
si_from_llong(long long l
) {
136 static __inline qword
si_from_float(float f
) {
145 static __inline qword
si_from_double(double d
) {
154 static __inline qword
si_from_ptr(void *ptr
) {
164 /* Casting intrinsics - from quadword to scalar
166 static __inline
unsigned char si_to_uchar(qword q
) {
175 static __inline
signed char si_to_char(qword q
) {
184 static __inline
unsigned short si_to_ushort(qword q
) {
193 static __inline
short si_to_short(qword q
) {
202 static __inline
unsigned int si_to_uint(qword q
) {
211 static __inline
int si_to_int(qword q
) {
220 static __inline
unsigned long long si_to_ullong(qword q
) {
223 unsigned long long l
[2];
229 static __inline
long long si_to_llong(qword q
) {
238 static __inline
float si_to_float(qword q
) {
247 static __inline
double si_to_double(qword q
) {
256 static __inline
void * si_to_ptr(qword q
) {
266 /* Absolute difference
268 static __inline qword
si_absdb(qword a
, qword b
)
270 vec_uchar16 ac
, bc
, dc
;
272 ac
= (vec_uchar16
)(a
);
273 bc
= (vec_uchar16
)(b
);
274 dc
= vec_sel(vec_sub(bc
, ac
), vec_sub(ac
, bc
), vec_cmpgt(ac
, bc
));
276 return ((qword
)(dc
));
/* Add: element-wise modular addition of 32-bit words (si_a) and of
 * 16-bit halfwords (si_ah).
 */
#define si_a(_a, _b)            ((qword)(vec_add((vec_uint4)(_a), (vec_uint4)(_b))))

#define si_ah(_a, _b)           ((qword)(vec_add((vec_ushort8)(_a), (vec_ushort8)(_b))))
285 static __inline qword
si_ai(qword a
, int b
)
287 return ((qword
)(vec_add((vec_int4
)(a
),
288 vec_splat((vec_int4
)(si_from_int(b
)), 0))));
292 static __inline qword
si_ahi(qword a
, short b
)
294 return ((qword
)(vec_add((vec_short8
)(a
),
295 vec_splat((vec_short8
)(si_from_short(b
)), 1))));
/* Floating add: element-wise single-precision addition. */
#define si_fa(_a, _b)           ((qword)(vec_add((vec_float4)(_a), (vec_float4)(_b))))
302 static __inline qword
si_dfa(qword a
, qword b
)
309 ad
.v
= (vec_double2
)(a
);
310 bd
.v
= (vec_double2
)(b
);
311 dd
.d
[0] = ad
.d
[0] + bd
.d
[0];
312 dd
.d
[1] = ad
.d
[1] + bd
.d
[1];
314 return ((qword
)(dd
.v
));
/* Add extended: per 32-bit word, _a + _b plus bit 0 of _c. */
#define si_addx(_a, _b, _c)     ((qword)(vec_add(vec_add((vec_uint4)(_a), (vec_uint4)(_b)), \
                                                 vec_and((vec_uint4)(_c), vec_splat_u32(1)))))
/* Bit-wise AND of two quadwords. */
#define si_and(_a, _b)          ((qword)(vec_and((vec_uint4)(_a), (vec_uint4)(_b))))
328 static __inline qword
si_andbi(qword a
, signed char b
)
330 return ((qword
)(vec_and((vec_char16
)(a
),
331 vec_splat((vec_char16
)(si_from_char(b
)), 3))));
334 static __inline qword
si_andhi(qword a
, signed short b
)
336 return ((qword
)(vec_and((vec_short8
)(a
),
337 vec_splat((vec_short8
)(si_from_short(b
)), 1))));
341 static __inline qword
si_andi(qword a
, signed int b
)
343 return ((qword
)(vec_and((vec_int4
)(a
),
344 vec_splat((vec_int4
)(si_from_int(b
)), 0))));
/* Bit-wise AND with complement: _a & ~_b.
 */
#define si_andc(_a, _b)         ((qword)(vec_andc((vec_uchar16)(_a), (vec_uchar16)(_b))))
/* Average byte vectors: per-byte unsigned average via vec_avg.
 */
#define si_avgb(_a, _b)         ((qword)(vec_avg((vec_uchar16)(_a), (vec_uchar16)(_b))))
/* Branch indirect and set link on external data.
 * These have no VMX equivalent and expand to nothing.
 */
#define si_bisled(_func)        /* not mappable */
#define si_bisledd(_func)       /* not mappable */
#define si_bislede(_func)       /* not mappable */
/* Borrow generate: per word, the vec_subc carry-out of _b - _a. */
#define si_bg(_a, _b)           ((qword)(vec_subc((vec_uint4)(_b), (vec_uint4)(_a))))
/* Borrow generate extended: per word, yields 1 when _b > _a (unsigned),
 * or when _b == _a and bit 0 of _c is set; otherwise 0.
 */
#define si_bgx(_a, _b, _c)      ((qword)(vec_and(vec_or(vec_cmpgt((vec_uint4)(_b), (vec_uint4)(_a)),       \
                                                        vec_and(vec_cmpeq((vec_uint4)(_b), (vec_uint4)(_a)), \
                                                                (vec_uint4)(_c))), vec_splat_u32(1))))
373 /* Compare absolute equal
375 static __inline qword
si_fcmeq(qword a
, qword b
)
377 vec_float4 msb
= (vec_float4
)((vec_uint4
){0x80000000, 0x80000000, 0x80000000, 0x80000000});
379 return ((qword
)(vec_cmpeq(vec_andc((vec_float4
)(a
), msb
),
380 vec_andc((vec_float4
)(b
), msb
))));
383 static __inline qword
si_dfcmeq(qword a
, qword b
)
385 vec_uint4 sign_mask
= (vec_uint4
) { 0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF };
386 vec_uint4 nan_mask
= (vec_uint4
) { 0x7FF00000, 0x00000000, 0x7FF00000, 0x00000000 };
387 vec_uchar16 hihi_promote
= (vec_uchar16
) { 0,1,2,3, 16,17,18,19, 8,9,10,11, 24,25,26,27};
405 /* Mask out sign bits */
406 aabs
= vec_and((vec_uint4
)a
,sign_mask
);
407 babs
= vec_and((vec_uint4
)b
,sign_mask
);
409 /* A) Check for bit equality, store in high word */
410 biteq
= (vec_uint4
) vec_cmpeq((vec_uint4
)aabs
,(vec_uint4
)babs
);
411 biteq
= vec_and(biteq
,(vec_uint4
)vec_slo((vec_uchar16
)biteq
,x
.v
));
414 B) Check if a is NaN, store in high word
416 B1) If the high word is greater than max_exp (indicates a NaN)
417 B2) If the low word is greater than 0
419 a_gt
= (vec_uint4
)vec_cmpgt(aabs
,nan_mask
);
421 /* B3) Check if the high word is equal to the inf exponent */
422 ahi_inf
= (vec_uint4
)vec_cmpeq(aabs
,nan_mask
);
424 /* anan = B1[hi] or (B2[lo] and B3[hi]) */
425 anan
= (vec_uint4
)vec_or(a_gt
,vec_and((vec_uint4
)vec_slo((vec_uchar16
)a_gt
,x
.v
),ahi_inf
));
427 /* result = A and not B */
428 result
= vec_andc(biteq
, anan
);
430 /* Promote high words to 64 bits and return */
431 return ((qword
)(vec_perm((vec_uchar16
)result
, (vec_uchar16
)result
, hihi_promote
)));
435 /* Compare absolute greater than
437 static __inline qword
si_fcmgt(qword a
, qword b
)
439 vec_float4 msb
= (vec_float4
)((vec_uint4
){0x80000000, 0x80000000, 0x80000000, 0x80000000});
441 return ((qword
)(vec_cmpgt(vec_andc((vec_float4
)(a
), msb
),
442 vec_andc((vec_float4
)(b
), msb
))));
445 static __inline qword
si_dfcmgt(qword a
, qword b
)
447 vec_uchar16 splat_hi
= (vec_uchar16
) { 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 };
448 vec_uint4 nan_mask
= (vec_uint4
) { 0x7FF00000, 0x0, 0x7FF00000, 0x0 };
449 vec_uint4 sign_mask
= (vec_uint4
) { 0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF };
459 // absolute value of a,b
460 vec_uint4 aabs
= vec_and((vec_uint4
)a
, sign_mask
);
461 vec_uint4 babs
= vec_and((vec_uint4
)b
, sign_mask
);
464 vec_uint4 a_inf
= (vec_uint4
)vec_cmpeq(aabs
, nan_mask
);
465 vec_uint4 a_nan
= (vec_uint4
)vec_cmpgt(aabs
, nan_mask
);
466 a_nan
= vec_or(a_nan
, vec_and((vec_uint4
)vec_slo((vec_uchar16
)a_nan
,x
.v
),a_inf
));
467 a_nan
= (vec_uint4
)vec_perm((vec_uchar16
)a_nan
, (vec_uchar16
)a_nan
, splat_hi
);
470 vec_uint4 b_inf
= (vec_uint4
)vec_cmpeq(babs
, nan_mask
);
471 vec_uint4 b_nan
= (vec_uint4
)vec_cmpgt(babs
, nan_mask
);
472 b_nan
= vec_or(b_nan
, vec_and((vec_uint4
)vec_slo((vec_uchar16
)b_nan
,x
.v
),b_inf
));
473 b_nan
= (vec_uint4
)vec_perm((vec_uchar16
)b_nan
, (vec_uchar16
)b_nan
, splat_hi
);
475 // A) Check if the exponents are different
476 vec_uint4 gt_hi
= (vec_uint4
)vec_cmpgt(aabs
,babs
);
478 // B) Check if high word equal, and low word greater
479 vec_uint4 gt_lo
= (vec_uint4
)vec_cmpgt((vec_uint4
)aabs
, (vec_uint4
)babs
);
480 vec_uint4 eq
= (vec_uint4
)vec_cmpeq(aabs
, babs
);
481 vec_uint4 eqgt
= vec_and(eq
,vec_slo(gt_lo
,x
.v
));
483 // If either A or B is true, return true (unless NaNs detected)
484 vec_uint4 r
= vec_or(gt_hi
, eqgt
);
486 // splat the high words of the comparison step
487 r
= (vec_uint4
)vec_perm((vec_uchar16
)r
,(vec_uchar16
)r
,splat_hi
);
489 // correct for NaNs in input
490 return ((qword
)vec_andc(r
,vec_or(a_nan
,b_nan
)));
496 static __inline qword
si_ceqb(qword a
, qword b
)
498 return ((qword
)(vec_cmpeq((vec_uchar16
)(a
), (vec_uchar16
)(b
))));
501 static __inline qword
si_ceqh(qword a
, qword b
)
503 return ((qword
)(vec_cmpeq((vec_ushort8
)(a
), (vec_ushort8
)(b
))));
506 static __inline qword
si_ceq(qword a
, qword b
)
508 return ((qword
)(vec_cmpeq((vec_uint4
)(a
), (vec_uint4
)(b
))));
511 static __inline qword
si_fceq(qword a
, qword b
)
513 return ((qword
)(vec_cmpeq((vec_float4
)(a
), (vec_float4
)(b
))));
516 static __inline qword
si_ceqbi(qword a
, signed char b
)
518 return ((qword
)(vec_cmpeq((vec_char16
)(a
),
519 vec_splat((vec_char16
)(si_from_char(b
)), 3))));
522 static __inline qword
si_ceqhi(qword a
, signed short b
)
524 return ((qword
)(vec_cmpeq((vec_short8
)(a
),
525 vec_splat((vec_short8
)(si_from_short(b
)), 1))));
528 static __inline qword
si_ceqi(qword a
, signed int b
)
530 return ((qword
)(vec_cmpeq((vec_int4
)(a
),
531 vec_splat((vec_int4
)(si_from_int(b
)), 0))));
534 static __inline qword
si_dfceq(qword a
, qword b
)
536 vec_uint4 sign_mask
= (vec_uint4
) { 0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF };
537 vec_uint4 nan_mask
= (vec_uint4
) { 0x7FF00000, 0x00000000, 0x7FF00000, 0x00000000 };
538 vec_uchar16 hihi_promote
= (vec_uchar16
) { 0,1,2,3, 16,17,18,19, 8,9,10,11, 24,25,26,27};
557 /* A) Check for bit equality, store in high word */
558 biteq
= (vec_uint4
) vec_cmpeq((vec_uint4
)a
,(vec_uint4
)b
);
559 biteq
= vec_and(biteq
,(vec_uint4
)vec_slo((vec_uchar16
)biteq
,x
.v
));
561 /* Mask out sign bits */
562 aabs
= vec_and((vec_uint4
)a
,sign_mask
);
563 babs
= vec_and((vec_uint4
)b
,sign_mask
);
566 B) Check if a is NaN, store in high word
568 B1) If the high word is greater than max_exp (indicates a NaN)
569 B2) If the low word is greater than 0
571 a_gt
= (vec_uint4
)vec_cmpgt(aabs
,nan_mask
);
573 /* B3) Check if the high word is equal to the inf exponent */
574 ahi_inf
= (vec_uint4
)vec_cmpeq(aabs
,nan_mask
);
576 /* anan = B1[hi] or (B2[lo] and B3[hi]) */
577 anan
= (vec_uint4
)vec_or(a_gt
,vec_and((vec_uint4
)vec_slo((vec_uchar16
)a_gt
,x
.v
),ahi_inf
));
579 /* C) Check for 0 = -0 special case */
580 iszero
=(vec_uint4
)vec_cmpeq((vec_uint4
)vec_or(aabs
,babs
),(vec_uint4
)vec_splat_u32(0));
581 iszero
= vec_and(iszero
,(vec_uint4
)vec_slo((vec_uchar16
)iszero
,x
.v
));
583 /* result = (A or C) and not B */
584 result
= vec_or(biteq
,iszero
);
585 result
= vec_andc(result
, anan
);
587 /* Promote high words to 64 bits and return */
588 return ((qword
)(vec_perm((vec_uchar16
)result
, (vec_uchar16
)result
, hihi_promote
)));
592 /* Compare greater than
594 static __inline qword
si_cgtb(qword a
, qword b
)
596 return ((qword
)(vec_cmpgt((vec_char16
)(a
), (vec_char16
)(b
))));
599 static __inline qword
si_cgth(qword a
, qword b
)
601 return ((qword
)(vec_cmpgt((vec_short8
)(a
), (vec_short8
)(b
))));
604 static __inline qword
si_cgt(qword a
, qword b
)
606 return ((qword
)(vec_cmpgt((vec_int4
)(a
), (vec_int4
)(b
))));
609 static __inline qword
si_clgtb(qword a
, qword b
)
611 return ((qword
)(vec_cmpgt((vec_uchar16
)(a
), (vec_uchar16
)(b
))));
614 static __inline qword
si_clgth(qword a
, qword b
)
616 return ((qword
)(vec_cmpgt((vec_ushort8
)(a
), (vec_ushort8
)(b
))));
619 static __inline qword
si_clgt(qword a
, qword b
)
621 return ((qword
)(vec_cmpgt((vec_uint4
)(a
), (vec_uint4
)(b
))));
624 static __inline qword
si_fcgt(qword a
, qword b
)
626 return ((qword
)(vec_cmpgt((vec_float4
)(a
), (vec_float4
)(b
))));
629 static __inline qword
si_dfcgt(qword a
, qword b
)
631 vec_uchar16 splat_hi
= (vec_uchar16
) { 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 };
632 vec_uchar16 borrow_shuffle
= (vec_uchar16
) { 4,5,6,7, 192,192,192,192, 12,13,14,15, 192,192,192,192 };
633 vec_uint4 nan_mask
= (vec_uint4
) { 0x7FF00000, 0x0, 0x7FF00000, 0x0 };
634 vec_uint4 sign_mask
= (vec_uint4
) { 0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF };
644 // absolute value of a,b
645 vec_uint4 aabs
= vec_and((vec_uint4
)a
, sign_mask
);
646 vec_uint4 babs
= vec_and((vec_uint4
)b
, sign_mask
);
649 vec_uint4 a_inf
= (vec_uint4
)vec_cmpeq(aabs
, nan_mask
);
650 vec_uint4 a_nan
= (vec_uint4
)vec_cmpgt(aabs
, nan_mask
);
651 a_nan
= vec_or(a_nan
, vec_and((vec_uint4
)vec_slo((vec_uchar16
)a_nan
,x
.v
),a_inf
));
652 a_nan
= (vec_uint4
)vec_perm((vec_uchar16
)a_nan
, (vec_uchar16
)a_nan
, splat_hi
);
655 vec_uint4 b_inf
= (vec_uint4
)vec_cmpeq(babs
, nan_mask
);
656 vec_uint4 b_nan
= (vec_uint4
)vec_cmpgt(babs
, nan_mask
);
657 b_nan
= vec_or(b_nan
, vec_and((vec_uint4
)vec_slo((vec_uchar16
)b_nan
,x
.v
),b_inf
));
658 b_nan
= (vec_uint4
)vec_perm((vec_uchar16
)b_nan
, (vec_uchar16
)b_nan
, splat_hi
);
661 vec_uint4 asel
= (vec_uint4
)vec_sra((vec_int4
)(a
), (vec_uint4
)vec_splat(((vec_uint4
)si_from_int(31)), 0));
662 asel
= (vec_uint4
)vec_perm((vec_uchar16
)asel
,(vec_uchar16
)asel
,splat_hi
);
665 vec_uint4 bsel
= (vec_uint4
)vec_sra((vec_int4
)(b
), (vec_uint4
)vec_splat(((vec_uint4
)si_from_int(31)), 0));
666 bsel
= (vec_uint4
)vec_perm((vec_uchar16
)bsel
,(vec_uchar16
)bsel
,splat_hi
);
669 vec_uint4 abor
= vec_subc((vec_uint4
)vec_splat_u32(0), aabs
);
670 vec_uchar16 pat
= vec_sel(((vec_uchar16
){0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15}), vec_sr(borrow_shuffle
, vec_splat_u8(3)), vec_sra(borrow_shuffle
, vec_splat_u8(7)));
671 abor
= (vec_uint4
)(vec_perm(vec_perm((vec_uchar16
)abor
, (vec_uchar16
)abor
, borrow_shuffle
),((vec_uchar16
){0, 0, 0, 0, 0, 0, 0, 0, 0xFF, 0xFF, 0xFF, 0xFF, 0x80, 0x80, 0x80, 0x80}),pat
));
672 vec_uint4 aneg
= vec_add(vec_add(vec_splat_u32(0), vec_nor(aabs
, aabs
)), vec_and(abor
, vec_splat_u32(1)));
674 // pick the one we want
675 vec_int4 aval
= (vec_int4
)vec_sel((vec_uchar16
)aabs
, (vec_uchar16
)aneg
, (vec_uchar16
)asel
);
678 vec_uint4 bbor
= vec_subc((vec_uint4
)vec_splat_u32(0), babs
);
679 bbor
= (vec_uint4
)(vec_perm(vec_perm((vec_uchar16
)bbor
, (vec_uchar16
)bbor
, borrow_shuffle
),((vec_uchar16
){0, 0, 0, 0, 0, 0, 0, 0, 0xFF, 0xFF, 0xFF, 0xFF, 0x80, 0x80, 0x80, 0x80}),pat
));
680 vec_uint4 bneg
= vec_add(vec_nor(babs
, babs
), vec_and(bbor
, vec_splat_u32(1)));
682 // pick the one we want
683 vec_int4 bval
=(vec_int4
)vec_sel((vec_uchar16
)babs
, (vec_uchar16
)bneg
, (vec_uchar16
)bsel
);
685 // A) Check if the exponents are different
686 vec_uint4 gt_hi
= (vec_uint4
)vec_cmpgt(aval
,bval
);
688 // B) Check if high word equal, and low word greater
689 vec_uint4 gt_lo
= (vec_uint4
)vec_cmpgt((vec_uint4
)aval
, (vec_uint4
)bval
);
690 vec_uint4 eq
= (vec_uint4
)vec_cmpeq(aval
, bval
);
691 vec_uint4 eqgt
= vec_and(eq
,vec_slo(gt_lo
,x
.v
));
693 // If either A or B is true, return true (unless NaNs detected)
694 vec_uint4 r
= vec_or(gt_hi
, eqgt
);
696 // splat the high words of the comparison step
697 r
= (vec_uint4
)vec_perm((vec_uchar16
)r
,(vec_uchar16
)r
,splat_hi
);
699 // correct for NaNs in input
700 return ((qword
)vec_andc(r
,vec_or(a_nan
,b_nan
)));
703 static __inline qword
si_cgtbi(qword a
, signed char b
)
705 return ((qword
)(vec_cmpgt((vec_char16
)(a
),
706 vec_splat((vec_char16
)(si_from_char(b
)), 3))));
709 static __inline qword
si_cgthi(qword a
, signed short b
)
711 return ((qword
)(vec_cmpgt((vec_short8
)(a
),
712 vec_splat((vec_short8
)(si_from_short(b
)), 1))));
715 static __inline qword
si_cgti(qword a
, signed int b
)
717 return ((qword
)(vec_cmpgt((vec_int4
)(a
),
718 vec_splat((vec_int4
)(si_from_int(b
)), 0))));
721 static __inline qword
si_clgtbi(qword a
, unsigned char b
)
723 return ((qword
)(vec_cmpgt((vec_uchar16
)(a
),
724 vec_splat((vec_uchar16
)(si_from_uchar(b
)), 3))));
727 static __inline qword
si_clgthi(qword a
, unsigned short b
)
729 return ((qword
)(vec_cmpgt((vec_ushort8
)(a
),
730 vec_splat((vec_ushort8
)(si_from_ushort(b
)), 1))));
733 static __inline qword
si_clgti(qword a
, unsigned int b
)
735 return ((qword
)(vec_cmpgt((vec_uint4
)(a
),
736 vec_splat((vec_uint4
)(si_from_uint(b
)), 0))));
739 static __inline qword
si_dftsv(qword a
, char b
)
741 vec_uchar16 splat_hi
= (vec_uchar16
) { 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 };
742 vec_uint4 sign_mask
= (vec_uint4
) { 0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF };
743 vec_uint4 result
= (vec_uint4
){0};
744 vec_uint4 sign
= (vec_uint4
)vec_sra((vec_int4
)(a
), (vec_uint4
)vec_splat(((vec_uint4
)si_from_int(31)), 0));
745 sign
= (vec_uint4
)vec_perm((vec_uchar16
)sign
,(vec_uchar16
)sign
,splat_hi
);
746 vec_uint4 aabs
= vec_and((vec_uint4
)a
,sign_mask
);
756 /* Nan or +inf or -inf */
759 vec_uint4 nan_mask
= (vec_uint4
) { 0x7FF00000, 0x0, 0x7FF00000, 0x0 };
760 vec_uint4 a_inf
= (vec_uint4
)vec_cmpeq(aabs
, nan_mask
);
764 vec_uint4 a_nan
= (vec_uint4
)vec_cmpgt(aabs
, nan_mask
);
765 a_nan
= vec_or(a_nan
, vec_and((vec_uint4
)vec_slo((vec_uchar16
)a_nan
,x
.v
),a_inf
));
766 a_nan
= (vec_uint4
)vec_perm((vec_uchar16
)a_nan
, (vec_uchar16
)a_nan
, splat_hi
);
767 result
= vec_or(result
, a_nan
);
772 a_inf
= vec_and((vec_uint4
)vec_slo((vec_uchar16
)a_inf
,x
.v
), a_inf
);
773 a_inf
= (vec_uint4
)vec_perm((vec_uchar16
)a_inf
, (vec_uchar16
)a_inf
, splat_hi
);
776 result
= vec_or(vec_andc(a_inf
, sign
), result
);
779 result
= vec_or(vec_and(a_inf
, sign
), result
);
785 vec_uint4 iszero
=(vec_uint4
)vec_cmpeq(aabs
,(vec_uint4
)vec_splat_u32(0));
786 iszero
= vec_and(iszero
,(vec_uint4
)vec_slo((vec_uchar16
)iszero
,x
.v
));
790 vec_uint4 denorm_mask
= (vec_uint4
){0xFFFFF, 0xFFFFF, 0xFFFFF, 0xFFFFF};
791 vec_uint4 isdenorm
= vec_nor((vec_uint4
)vec_cmpgt(aabs
, denorm_mask
), iszero
);
792 isdenorm
= (vec_uint4
)vec_perm((vec_uchar16
)isdenorm
, (vec_uchar16
)isdenorm
, splat_hi
);
795 result
= vec_or(vec_andc(isdenorm
, sign
), result
);
798 result
= vec_or(vec_and(isdenorm
, sign
), result
);
803 iszero
= (vec_uint4
)vec_perm((vec_uchar16
)iszero
, (vec_uchar16
)iszero
, splat_hi
);
806 result
= vec_or(vec_andc(iszero
, sign
), result
);
809 result
= vec_or(vec_and(iszero
, sign
), result
);
812 return ((qword
)result
);
/* Carry generate: per word, the vec_addc carry-out of _a + _b. */
#define si_cg(_a, _b)           ((qword)(vec_addc((vec_uint4)(_a), (vec_uint4)(_b))))
/* Carry generate extended: per word, the carry-out of
 * _a + _b + (_c & 1), formed as the OR of the carry from _a + _b and the
 * carry from adding the carry-in bit to that sum.
 */
#define si_cgx(_a, _b, _c)      ((qword)(vec_or(vec_addc((vec_uint4)(_a), (vec_uint4)(_b)),          \
                                                vec_addc(vec_add((vec_uint4)(_a), (vec_uint4)(_b)),  \
                                                         vec_and((vec_uint4)(_c), vec_splat_u32(1))))))
825 /* Count ones for bytes
827 static __inline qword
si_cntb(qword a
)
829 vec_uchar16 nib_cnt
= (vec_uchar16
){0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4};
830 vec_uchar16 four
= { 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 };
833 av
= (vec_uchar16
)(a
);
835 return ((qword
)(vec_add(vec_perm(nib_cnt
, nib_cnt
, av
),
836 vec_perm(nib_cnt
, nib_cnt
, vec_sr (av
, four
)))));
839 /* Count ones for bytes
841 static __inline qword
si_clz(qword a
)
844 vec_uchar16 cnt_hi
, cnt_lo
, cnt
, tmp1
, tmp2
, tmp3
;
845 vec_uchar16 four
= vec_splat_u8(4);
846 vec_uchar16 nib_cnt
= (vec_uchar16
){4, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0};
847 vec_uchar16 eight
= vec_splat_u8(8);
848 vec_uchar16 sixteen
= (vec_uchar16
){16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16};
849 vec_uchar16 twentyfour
= (vec_uchar16
){24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24};
851 av
= (vec_uchar16
)(a
);
853 cnt_hi
= vec_perm(nib_cnt
, nib_cnt
, vec_sr(av
, four
));
854 cnt_lo
= vec_perm(nib_cnt
, nib_cnt
, av
);
856 cnt
= vec_add(cnt_hi
, vec_and(cnt_lo
, vec_cmpeq(cnt_hi
, four
)));
858 tmp1
= (vec_uchar16
)vec_sl((vec_uint4
)(cnt
), (vec_uint4
)(eight
));
859 tmp2
= (vec_uchar16
)vec_sl((vec_uint4
)(cnt
), (vec_uint4
)(sixteen
));
860 tmp3
= (vec_uchar16
)vec_sl((vec_uint4
)(cnt
), (vec_uint4
)(twentyfour
));
862 cnt
= vec_add(cnt
, vec_and(tmp1
, vec_cmpeq(cnt
, eight
)));
863 cnt
= vec_add(cnt
, vec_and(tmp2
, vec_cmpeq(cnt
, sixteen
)));
864 cnt
= vec_add(cnt
, vec_and(tmp3
, vec_cmpeq(cnt
, twentyfour
)));
866 return (qword
)((vec_sr((vec_uint4
)(cnt
), (vec_uint4
)(twentyfour
))));
/* Convert to float: unsigned (si_cuflt) or signed (si_csflt) words are
 * converted with vec_ctf; _b is passed through as its scale argument.
 */
#define si_cuflt(_a, _b)        ((qword)(vec_ctf((vec_uint4)(_a), _b)))
#define si_csflt(_a, _b)        ((qword)(vec_ctf((vec_int4)(_a), _b)))
/* Convert to signed int: floats are converted with vec_cts; _b is
 * passed through as its scale argument.
 */
#define si_cflts(_a, _b)        ((qword)(vec_cts((vec_float4)(_a), _b)))
/* Convert to unsigned int: floats are converted with vec_ctu; _b is
 * passed through as its scale argument.
 */
#define si_cfltu(_a, _b)        ((qword)(vec_ctu((vec_float4)(_a), _b)))
/* Synchronization intrinsics: these expand to nothing in this mapping. */
#define si_dsync()              /* do nothing */
#define si_sync()               /* do nothing */
#define si_syncc()              /* do nothing */
891 static __inline qword
si_eqv(qword a
, qword b
)
895 d
= vec_xor((vec_uchar16
)(a
), (vec_uchar16
)(b
));
896 return ((qword
)(vec_nor(d
, d
)));
901 static __inline qword
si_xsbh(qword a
)
905 av
= (vec_char16
)(a
);
906 return ((qword
)(vec_unpackh(vec_perm(av
, av
, ((vec_uchar16
){1, 3, 5, 7, 9,11,13,15,
907 0, 0, 0, 0, 0, 0, 0, 0})))));
910 static __inline qword
si_xshw(qword a
)
914 av
= (vec_short8
)(a
);
915 return ((qword
)(vec_unpackh(vec_perm(av
, av
, ((vec_uchar16
){2, 3, 6, 7,
921 static __inline qword
si_xswd(qword a
)
926 return ((qword
)(vec_perm(av
, vec_sra(av
, ((vec_uint4
){31,31,31,31})),
927 ((vec_uchar16
){20, 21, 22, 23,
933 static __inline qword
si_fesd(qword a
)
944 in
.vf
= (vec_float4
)(a
);
945 out
.d
[0] = (double)(in
.f
[0]);
946 out
.d
[1] = (double)(in
.f
[2]);
947 return ((qword
)(out
.vd
));
952 static __inline qword
si_gbb(qword a
)
957 bits
= vec_sl(vec_and((vec_uchar16
)(a
), vec_splat_u8(1)), ((vec_uchar16
){7, 6, 5, 4, 3, 2, 1, 0,
958 7, 6, 5, 4, 3, 2, 1, 0}));
959 bytes
= (vec_uint4
)vec_sum2s((vec_int4
)(vec_sum4s(bits
, ((vec_uint4
){0}))), ((vec_int4
){0}));
961 return ((qword
)(vec_perm(bytes
, bytes
, ((vec_uchar16
){0, 0, 7,15, 0, 0, 0, 0,
962 0, 0, 0, 0, 0, 0, 0, 0}))));
966 static __inline qword
si_gbh(qword a
)
971 bits
= vec_sl(vec_and((vec_ushort8
)(a
), vec_splat_u16(1)), ((vec_ushort8
){7, 6, 5, 4, 3, 2, 1, 0}));
973 bytes
= (vec_uint4
)vec_sums((vec_int4
)(vec_sum4s((vec_short8
)(bits
), (vec_int4
){0})), (vec_int4
){0});
975 return ((qword
)(vec_sld(bytes
, bytes
, 12)));
978 static __inline qword
si_gb(qword a
)
983 bits
= vec_sl(vec_and((vec_uint4
)(a
), vec_splat_u32(1)), ((vec_uint4
){3, 2, 1, 0}));
984 bytes
= (vec_uint4
)vec_sums((vec_int4
)(bits
), ((vec_int4
){0}));
985 return ((qword
)(vec_sld(bytes
, bytes
, 12)));
991 static __inline
void si_heq(qword a
, qword b
)
994 vector
unsigned int v
;
998 aa
.v
= (vector
unsigned int)(a
);
999 bb
.v
= (vector
unsigned int)(b
);
1001 if (aa
.i
[0] == bb
.i
[0]) { SPU_HALT_ACTION
; };
1004 static __inline
void si_heqi(qword a
, unsigned int b
)
1007 vector
unsigned int v
;
1011 aa
.v
= (vector
unsigned int)(a
);
1013 if (aa
.i
[0] == b
) { SPU_HALT_ACTION
; };
1016 static __inline
void si_hgt(qword a
, qword b
)
1019 vector
signed int v
;
1023 aa
.v
= (vector
signed int)(a
);
1024 bb
.v
= (vector
signed int)(b
);
1026 if (aa
.i
[0] > bb
.i
[0]) { SPU_HALT_ACTION
; };
1029 static __inline
void si_hgti(qword a
, signed int b
)
1032 vector
signed int v
;
1036 aa
.v
= (vector
signed int)(a
);
1038 if (aa
.i
[0] > b
) { SPU_HALT_ACTION
; };
1041 static __inline
void si_hlgt(qword a
, qword b
)
1044 vector
unsigned int v
;
1048 aa
.v
= (vector
unsigned int)(a
);
1049 bb
.v
= (vector
unsigned int)(b
);
1051 if (aa
.i
[0] > bb
.i
[0]) { SPU_HALT_ACTION
; };
1054 static __inline
void si_hlgti(qword a
, unsigned int b
)
1057 vector
unsigned int v
;
1061 aa
.v
= (vector
unsigned int)(a
);
1063 if (aa
.i
[0] > b
) { SPU_HALT_ACTION
; };
1069 static __inline qword
si_mpya(qword a
, qword b
, qword c
)
1071 return ((qword
)(vec_msum(vec_and((vec_short8
)(a
),
1072 ((vec_short8
){0, -1, 0, -1, 0, -1, 0, -1})),
1073 (vec_short8
)(b
), (vec_int4
)(c
))));
1076 static __inline qword
si_fma(qword a
, qword b
, qword c
)
1078 return ((qword
)(vec_madd((vec_float4
)(a
), (vec_float4
)(b
), (vec_float4
)(c
))));
1081 static __inline qword
si_dfma(qword a
, qword b
, qword c
)
1088 aa
.v
= (vec_double2
)(a
);
1089 bb
.v
= (vec_double2
)(b
);
1090 cc
.v
= (vec_double2
)(c
);
1091 dd
.d
[0] = aa
.d
[0] * bb
.d
[0] + cc
.d
[0];
1092 dd
.d
[1] = aa
.d
[1] * bb
.d
[1] + cc
.d
[1];
1093 return ((qword
)(dd
.v
));
/* Form select mask for bytes, immediate: converts the scalar with
 * si_from_int and defers to si_fsmb.
 */
#define si_fsmbi(_a)    si_fsmb(si_from_int(_a))
1100 static __inline qword
si_fsmb(qword a
)
1105 in
= (vec_ushort8
)(a
);
1106 mask
= (vec_char16
)(vec_perm(in
, in
, ((vec_uchar16
){2, 2, 2, 2, 2, 2, 2, 2,
1107 3, 3, 3, 3, 3, 3, 3, 3})));
1108 return ((qword
)(vec_sra(vec_sl(mask
, ((vec_uchar16
){0, 1, 2, 3, 4, 5, 6, 7,
1109 0, 1, 2, 3, 4, 5, 6, 7})),
1114 static __inline qword
si_fsmh(qword a
)
1119 in
= (vec_uchar16
)(a
);
1120 mask
= (vec_short8
)(vec_splat(in
, 3));
1121 return ((qword
)(vec_sra(vec_sl(mask
, ((vec_ushort8
){0, 1, 2, 3, 4, 5, 6, 7})),
1122 vec_splat_u16(15))));
1125 static __inline qword
si_fsm(qword a
)
1130 in
= (vec_uchar16
)(a
);
1131 mask
= (vec_int4
)(vec_splat(in
, 3));
1132 return ((qword
)(vec_sra(vec_sl(mask
, ((vec_uint4
){28, 29, 30, 31})),
1133 ((vec_uint4
){31,31,31,31}))));
/* Move from/to registers.
 * The reads yield an all-zero quadword; the writes expand to nothing.
 */
#define si_fscrrd()             ((qword)((vec_uint4){0}))
#define si_fscrwr(_a)

#define si_mfspr(_reg)          ((qword)((vec_uint4){0}))
#define si_mtspr(_reg, _a)
1144 /* Multiply High High Add
1146 static __inline qword
si_mpyhha(qword a
, qword b
, qword c
)
1148 return ((qword
)(vec_add(vec_mule((vec_short8
)(a
), (vec_short8
)(b
)), (vec_int4
)(c
))));
1151 static __inline qword
si_mpyhhau(qword a
, qword b
, qword c
)
1153 return ((qword
)(vec_add(vec_mule((vec_ushort8
)(a
), (vec_ushort8
)(b
)), (vec_uint4
)(c
))));
1156 /* Multiply Subtract
1158 static __inline qword
si_fms(qword a
, qword b
, qword c
)
1160 return ((qword
)(vec_madd((vec_float4
)(a
), (vec_float4
)(b
),
1161 vec_sub(((vec_float4
){0.0f
}), (vec_float4
)(c
)))));
1164 static __inline qword
si_dfms(qword a
, qword b
, qword c
)
1171 aa
.v
= (vec_double2
)(a
);
1172 bb
.v
= (vec_double2
)(b
);
1173 cc
.v
= (vec_double2
)(c
);
1174 dd
.d
[0] = aa
.d
[0] * bb
.d
[0] - cc
.d
[0];
1175 dd
.d
[1] = aa
.d
[1] * bb
.d
[1] - cc
.d
[1];
1176 return ((qword
)(dd
.v
));
1181 static __inline qword
si_fm(qword a
, qword b
)
1183 return ((qword
)(vec_madd((vec_float4
)(a
), (vec_float4
)(b
), ((vec_float4
){0.0f
}))));
1186 static __inline qword
si_dfm(qword a
, qword b
)
1193 aa
.v
= (vec_double2
)(a
);
1194 bb
.v
= (vec_double2
)(b
);
1195 dd
.d
[0] = aa
.d
[0] * bb
.d
[0];
1196 dd
.d
[1] = aa
.d
[1] * bb
.d
[1];
1197 return ((qword
)(dd
.v
));
1202 static __inline qword
si_mpyh(qword a
, qword b
)
1204 vec_uint4 sixteen
= (vec_uint4
){16, 16, 16, 16};
1206 return ((qword
)(vec_sl(vec_mule((vec_short8
)(a
), (vec_short8
)(vec_sl((vec_uint4
)(b
), sixteen
))), sixteen
)));
1210 /* Multiply High High
1212 static __inline qword
si_mpyhh(qword a
, qword b
)
1214 return ((qword
)(vec_mule((vec_short8
)(a
), (vec_short8
)(b
))));
1217 static __inline qword
si_mpyhhu(qword a
, qword b
)
1219 return ((qword
)(vec_mule((vec_ushort8
)(a
), (vec_ushort8
)(b
))));
1224 static __inline qword
si_mpy(qword a
, qword b
)
1226 return ((qword
)(vec_mulo((vec_short8
)(a
), (vec_short8
)(b
))));
1229 static __inline qword
si_mpyu(qword a
, qword b
)
1231 return ((qword
)(vec_mulo((vec_ushort8
)(a
), (vec_ushort8
)(b
))));
1234 static __inline qword
si_mpyi(qword a
, short b
)
1236 return ((qword
)(vec_mulo((vec_short8
)(a
),
1237 vec_splat((vec_short8
)(si_from_short(b
)), 1))));
1240 static __inline qword
si_mpyui(qword a
, unsigned short b
)
1242 return ((qword
)(vec_mulo((vec_ushort8
)(a
),
1243 vec_splat((vec_ushort8
)(si_from_ushort(b
)), 1))));
1246 /* Multiply and Shift Right
1248 static __inline qword
si_mpys(qword a
, qword b
)
1250 return ((qword
)(vec_sra(vec_mulo((vec_short8
)(a
), (vec_short8
)(b
)), ((vec_uint4
){16,16,16,16}))));
1255 static __inline qword
si_nand(qword a
, qword b
)
1259 d
= vec_and((vec_uchar16
)(a
), (vec_uchar16
)(b
));
1260 return ((qword
)(vec_nor(d
, d
)));
1263 /* Negative Multiply Add
1265 static __inline qword
si_dfnma(qword a
, qword b
, qword c
)
1272 aa
.v
= (vec_double2
)(a
);
1273 bb
.v
= (vec_double2
)(b
);
1274 cc
.v
= (vec_double2
)(c
);
1275 dd
.d
[0] = -cc
.d
[0] - aa
.d
[0] * bb
.d
[0];
1276 dd
.d
[1] = -cc
.d
[1] - aa
.d
[1] * bb
.d
[1];
1277 return ((qword
)(dd
.v
));
1280 /* Negative Multiply and Subtract
1282 static __inline qword
si_fnms(qword a
, qword b
, qword c
)
1284 return ((qword
)(vec_nmsub((vec_float4
)(a
), (vec_float4
)(b
), (vec_float4
)(c
))));
1287 static __inline qword
si_dfnms(qword a
, qword b
, qword c
)
1294 aa
.v
= (vec_double2
)(a
);
1295 bb
.v
= (vec_double2
)(b
);
1296 cc
.v
= (vec_double2
)(c
);
1297 dd
.d
[0] = cc
.d
[0] - aa
.d
[0] * bb
.d
[0];
1298 dd
.d
[1] = cc
.d
[1] - aa
.d
[1] * bb
.d
[1];
1299 return ((qword
)(dd
.v
));
1304 static __inline qword
si_nor(qword a
, qword b
)
1306 return ((qword
)(vec_nor((vec_uchar16
)(a
), (vec_uchar16
)(b
))));
1311 static __inline qword
si_or(qword a
, qword b
)
1313 return ((qword
)(vec_or((vec_uchar16
)(a
), (vec_uchar16
)(b
))));
1316 static __inline qword
si_orbi(qword a
, unsigned char b
)
1318 return ((qword
)(vec_or((vec_uchar16
)(a
),
1319 vec_splat((vec_uchar16
)(si_from_uchar(b
)), 3))));
1322 static __inline qword
si_orhi(qword a
, unsigned short b
)
1324 return ((qword
)(vec_or((vec_ushort8
)(a
),
1325 vec_splat((vec_ushort8
)(si_from_ushort(b
)), 1))));
1328 static __inline qword
si_ori(qword a
, unsigned int b
)
1330 return ((qword
)(vec_or((vec_uint4
)(a
),
1331 vec_splat((vec_uint4
)(si_from_uint(b
)), 0))));
1336 static __inline qword
si_orc(qword a
, qword b
)
1338 return ((qword
)(vec_or((vec_uchar16
)(a
), vec_nor((vec_uchar16
)(b
), (vec_uchar16
)(b
)))));
1344 static __inline qword
si_orx(qword a
)
1347 tmp
= (vec_uchar16
)(a
);
1348 tmp
= vec_or(tmp
, vec_sld(tmp
, tmp
, 8));
1349 tmp
= vec_or(tmp
, vec_sld(tmp
, tmp
, 4));
1350 return ((qword
)(vec_and(tmp
, ((vec_uchar16
){0xFF,0xFF,0xFF,0xFF, 0x00,0x00,0x00,0x00,
1351 0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00}))));
1357 static __inline qword
si_frest(qword a
)
1359 return ((qword
)(vec_re((vec_float4
)(a
))));
1362 static __inline qword
si_frsqest(qword a
)
1364 return ((qword
)(vec_rsqrte((vec_float4
)(a
))));
/* Floating interpolate: yields its second argument unchanged; the first
 * argument is ignored.
 */
#define si_fi(_a, _d)           (_d)
/* Channel Read and Write.
 * Not mappable: the reads yield an all-zero quadword and the write
 * expands to nothing.
 */
#define si_rdch(_channel)       ((qword)(vec_splat_u8(0)))      /* not mappable */
#define si_rchcnt(_channel)     ((qword)(vec_splat_u8(0)))      /* not mappable */
#define si_wrch(_channel, _a)   /* not mappable */
1377 static __inline qword
si_roth(qword a
, qword b
)
1379 return ((qword
)(vec_rl((vec_ushort8
)(a
), (vec_ushort8
)(b
))));
1382 static __inline qword
si_rot(qword a
, qword b
)
1384 return ((qword
)(vec_rl((vec_uint4
)(a
), (vec_uint4
)(b
))));
1387 static __inline qword
si_rothi(qword a
, int b
)
1389 return ((qword
)(vec_rl((vec_ushort8
)(a
),
1390 vec_splat((vec_ushort8
)(si_from_int(b
)), 1))));
1393 static __inline qword
si_roti(qword a
, int b
)
1395 return ((qword
)(vec_rl((vec_uint4
)(a
),
1396 vec_splat((vec_uint4
)(si_from_int(b
)), 0))));
/* Rotate Left with Mask */
1401 static __inline qword
si_rothm(qword a
, qword b
)
1406 neg_b
= (vec_ushort8
)vec_sub(vec_splat_s16(0), (vec_short8
)(b
));
1407 mask
= vec_sra(vec_sl(neg_b
, vec_splat_u16(11)), vec_splat_u16(15));
1408 return ((qword
)(vec_andc(vec_sr((vec_ushort8
)(a
), neg_b
), mask
)));
1411 static __inline qword
si_rotm(qword a
, qword b
)
1416 neg_b
= (vec_uint4
)vec_sub(vec_splat_s32(0), (vec_int4
)(b
));
1417 mask
= vec_sra(vec_sl(neg_b
, ((vec_uint4
){26,26,26,26})), ((vec_uint4
){31,31,31,31}));
1418 return ((qword
)(vec_andc(vec_sr((vec_uint4
)(a
), neg_b
), mask
)));
1421 static __inline qword
si_rothmi(qword a
, int b
)
1426 neg_b
= vec_splat((vec_ushort8
)(si_from_int(-b
)), 1);
1427 mask
= vec_sra(vec_sl(neg_b
, vec_splat_u16(11)), vec_splat_u16(15));
1428 return ((qword
)(vec_andc(vec_sr((vec_ushort8
)(a
), neg_b
), mask
)));
1431 static __inline qword
si_rotmi(qword a
, int b
)
1436 neg_b
= vec_splat((vec_uint4
)(si_from_int(-b
)), 0);
1437 mask
= vec_sra(vec_sl(neg_b
, ((vec_uint4
){26,26,26,26})), ((vec_uint4
){31,31,31,31}));
1438 return ((qword
)(vec_andc(vec_sr((vec_uint4
)(a
), neg_b
), mask
)));
/* Rotate Left Algebraic with Mask */
1444 static __inline qword
si_rotmah(qword a
, qword b
)
1449 neg_b
= (vec_ushort8
)vec_sub(vec_splat_s16(0), (vec_short8
)(b
));
1450 mask
= vec_sra(vec_sl(neg_b
, vec_splat_u16(11)), vec_splat_u16(15));
1451 return ((qword
)(vec_sra((vec_short8
)(a
), (vec_ushort8
)vec_or(neg_b
, mask
))));
1454 static __inline qword
si_rotma(qword a
, qword b
)
1459 neg_b
= (vec_uint4
)vec_sub(vec_splat_s32(0), (vec_int4
)(b
));
1460 mask
= vec_sra(vec_sl(neg_b
, ((vec_uint4
){26,26,26,26})), ((vec_uint4
){31,31,31,31}));
1461 return ((qword
)(vec_sra((vec_int4
)(a
), (vec_uint4
)vec_or(neg_b
, mask
))));
1465 static __inline qword
si_rotmahi(qword a
, int b
)
1470 neg_b
= vec_splat((vec_ushort8
)(si_from_int(-b
)), 1);
1471 mask
= vec_sra(vec_sl(neg_b
, vec_splat_u16(11)), vec_splat_u16(15));
1472 return ((qword
)(vec_sra((vec_short8
)(a
), (vec_ushort8
)vec_or(neg_b
, mask
))));
1475 static __inline qword
si_rotmai(qword a
, int b
)
1480 neg_b
= vec_splat((vec_uint4
)(si_from_int(-b
)), 0);
1481 mask
= vec_sra(vec_sl(neg_b
, ((vec_uint4
){26,26,26,26})), ((vec_uint4
){31,31,31,31}));
1482 return ((qword
)(vec_sra((vec_int4
)(a
), (vec_uint4
)vec_or(neg_b
, mask
))));
/* Rotate Left Quadword by Bytes with Mask */
/* Shifts quadword A right with vec_sro, using COUNT multiplied by eight
   (stored in element 3 of x.i) as the shift control, and returns all zeros
   when bit 0x10 of COUNT is set.
   NOTE(review): the declarations of `x' and `mask', and any statement that
   adjusts COUNT before its first use, are not visible in this excerpt --
   confirm against the complete header.  */
1488 static __inline qword
si_rotqmbyi(qword a
, int count
)
1497 x
.i
[3] = count
<< 3;
/* Select an all-zero or all-ones byte mask from bit 0x10 of the count.  */
1498 mask
= (count
& 0x10) ? vec_splat_u8(0) : vec_splat_u8(-1);
1500 return ((qword
)(vec_and(vec_sro((vec_uchar16
)(a
), x
.v
), mask
)));
/* Shifts quadword A right with vec_sro by a count derived from COUNT:
   element 0 of x.i is negated and multiplied by eight, byte 3 of the result
   is replicated as the shift control, and the output is forced to zero when
   bit 0x80 of that value is set.
   NOTE(review): the declarations of `x', `cnt' and `mask' are not visible
   in this excerpt -- confirm their types against the complete header.  */
1504 static __inline qword
si_rotqmby(qword a
, qword count
)
1513 x
.v
= (vec_uchar16
)(count
);
/* Negate the count and convert it to a multiple of eight.  */
1514 x
.i
[0] = cnt
= (0 - x
.i
[0]) << 3;
1516 x
.v
= vec_splat(x
.v
, 3);
1517 mask
= (cnt
& 0x80) ? vec_splat_u8(0) : vec_splat_u8(-1);
1519 return ((qword
)(vec_and(vec_sro((vec_uchar16
)(a
), x
.v
), mask
)));
/* Rotate Left Quadword by Bytes */
/* Combines a vec_slo of A (controlled by left.v) with a vec_sro of A
   (controlled by right.v, whose element 3 is the negated count) using
   vec_or.
   NOTE(review): the declarations of `left' and `right' and the statements
   that initialise `left' (and possibly rescale COUNT) are not visible in
   this excerpt -- confirm against the complete header.  */
1525 static __inline qword
si_rotqbyi(qword a
, int count
)
1534 right
.i
[3] = 0 - count
;
1535 return ((qword
)(vec_or(vec_slo((vec_uchar16
)(a
), left
.v
), vec_sro((vec_uchar16
)(a
), right
.v
))));
1538 static __inline qword
si_rotqby(qword a
, qword count
)
1540 vec_uchar16 left
, right
;
1542 left
= vec_sl(vec_splat((vec_uchar16
)(count
), 3), vec_splat_u8(3));
1543 right
= vec_sub(vec_splat_u8(0), left
);
1544 return ((qword
)(vec_or(vec_slo((vec_uchar16
)(a
), left
), vec_sro((vec_uchar16
)(a
), right
))));
/* Rotate Left Quadword by Bytes Bit Count */
1549 static __inline qword
si_rotqbybi(qword a
, qword count
)
1551 vec_uchar16 left
, right
;
1553 left
= vec_splat((vec_uchar16
)(count
), 3);
1554 right
= vec_sub(vec_splat_u8(7), left
);
1555 return ((qword
)(vec_or(vec_slo((vec_uchar16
)(a
), left
), vec_sro((vec_uchar16
)(a
), right
))));
/* Rotate Left Quadword by Bits */
1561 static __inline qword
si_rotqbii(qword a
, int count
)
1566 x
= vec_splat((vec_uchar16
)(si_from_int(count
& 7)), 3);
1567 y
= (vec_uchar16
)(vec_sr((vec_uint4
)vec_sro((vec_uchar16
)(a
), ((vec_uchar16
)((vec_uint4
){0,0,0,120}))),
1568 (vec_uint4
)vec_sub(vec_splat_u8(8), x
)));
1569 result
= vec_or(vec_sll((qword
)(a
), x
), y
);
1570 return ((qword
)(result
));
1573 static __inline qword
si_rotqbi(qword a
, qword count
)
1578 x
= vec_and(vec_splat((vec_uchar16
)(count
), 3), vec_splat_u8(7));
1579 y
= (vec_uchar16
)(vec_sr((vec_uint4
)vec_sro((vec_uchar16
)(a
), ((vec_uchar16
)((vec_uint4
){0,0,0,120}))),
1580 (vec_uint4
)vec_sub(vec_splat_u8(8), x
)));
1582 result
= vec_or(vec_sll((qword
)(a
), x
), y
);
1583 return ((qword
)(result
));
/* Rotate Left Quadword and Mask by Bits */
1589 static __inline qword
si_rotqmbii(qword a
, int count
)
1591 return ((qword
)(vec_srl((vec_uchar16
)(a
), vec_splat((vec_uchar16
)(si_from_int(0 - count
)), 3))));
1594 static __inline qword
si_rotqmbi(qword a
, qword count
)
1596 return ((qword
)(vec_srl((vec_uchar16
)(a
), vec_sub(vec_splat_u8(0), vec_splat((vec_uchar16
)(count
), 3)))));
/* Rotate Left Quadword and Mask by Bytes with Bit Count */
/* Shifts quadword A right with vec_sro by a count derived from COUNT:
   element 0 of x.i has its low three bits cleared and is negated, byte 3 of
   the result is replicated as the shift control, and the output is forced
   to zero when bit 0x80 of that value is set.
   NOTE(review): the declarations of `x', `cnt' and `mask' are not visible
   in this excerpt -- confirm their types against the complete header.  */
1602 static __inline qword
si_rotqmbybi(qword a
, qword count
)
1611 x
.v
= (vec_uchar16
)(count
);
/* Drop the sub-byte part of the bit count, then negate it.  */
1612 x
.i
[0] = cnt
= 0 - (x
.i
[0] & ~7);
1613 x
.v
= vec_splat(x
.v
, 3);
1614 mask
= (cnt
& 0x80) ? vec_splat_u8(0) : vec_splat_u8(-1);
1616 return ((qword
)(vec_and(vec_sro((vec_uchar16
)(a
), x
.v
), mask
)));
/* Round Double to Float */
1624 static __inline qword
si_frds(qword a
)
1635 in
.v
= (vec_double2
)(a
);
1636 d
.v
= (vec_float4
){0.0f
};
1637 d
.f
[0] = (float)in
.d
[0];
1638 d
.f
[2] = (float)in
.d
[1];
1640 return ((qword
)(d
.v
));
1645 static __inline qword
si_selb(qword a
, qword b
, qword c
)
1647 return ((qword
)(vec_sel((vec_uchar16
)(a
), (vec_uchar16
)(b
), (vec_uchar16
)(c
))));
/* Byte shuffle of A and B under control of PATTERN.  A secondary control
   `pat' is built with vec_sel from the identity permutation 0..15, PATTERN
   shifted right by 3, and a selector made by arithmetic-shifting PATTERN
   right by 7.  The result is a vec_perm whose first operand is
   vec_perm (a, b, pattern) and whose second is a constant vector.
   NOTE(review): the declaration of `pat' and the final operand of the outer
   vec_perm are not visible in this excerpt -- confirm against the complete
   header.  */
1653 static __inline qword
si_shufb(qword a
, qword b
, qword pattern
)
1657 pat
= vec_sel(((vec_uchar16
){0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15}),
1658 vec_sr((vec_uchar16
)(pattern
), vec_splat_u8(3)),
1659 vec_sra((vec_uchar16
)(pattern
), vec_splat_u8(7)));
1660 return ((qword
)(vec_perm(vec_perm(a
, b
, pattern
),
1661 ((vec_uchar16
){0, 0, 0, 0, 0, 0, 0, 0,
1662 0xFF, 0xFF, 0xFF, 0xFF, 0x80, 0x80, 0x80, 0x80}),
1669 static __inline qword
si_shlh(qword a
, qword b
)
1673 mask
= (vec_ushort8
)vec_sra(vec_sl((vec_ushort8
)(b
), vec_splat_u16(11)), vec_splat_u16(15));
1674 return ((qword
)(vec_andc(vec_sl((vec_ushort8
)(a
), (vec_ushort8
)(b
)), mask
)));
1677 static __inline qword
si_shl(qword a
, qword b
)
1681 mask
= (vec_uint4
)vec_sra(vec_sl((vec_uint4
)(b
), ((vec_uint4
){26,26,26,26})), ((vec_uint4
){31,31,31,31}));
1682 return ((qword
)(vec_andc(vec_sl((vec_uint4
)(a
), (vec_uint4
)(b
)), mask
)));
1686 static __inline qword
si_shlhi(qword a
, unsigned int b
)
1691 bv
= vec_splat((vec_ushort8
)(si_from_int(b
)), 1);
1692 mask
= (vec_ushort8
)vec_sra(vec_sl(bv
, vec_splat_u16(11)), vec_splat_u16(15));
1693 return ((qword
)(vec_andc(vec_sl((vec_ushort8
)(a
), bv
), mask
)));
1696 static __inline qword
si_shli(qword a
, unsigned int b
)
1701 bv
= vec_splat((vec_uint4
)(si_from_uint(b
)), 0);
1702 mask
= (vec_uint4
)vec_sra(vec_sl(bv
, ((vec_uint4
){26,26,26,26})), ((vec_uint4
){31,31,31,31}));
1703 return ((qword
)(vec_andc(vec_sl((vec_uint4
)(a
), bv
), mask
)));
/* Shift Left Quadword */
1709 static __inline qword
si_shlqbii(qword a
, unsigned int count
)
1713 x
= vec_splat((vec_uchar16
)(si_from_uint(count
)), 3);
1714 return ((qword
)(vec_sll((vec_uchar16
)(a
), x
)));
1717 static __inline qword
si_shlqbi(qword a
, qword count
)
1721 x
= vec_splat((vec_uchar16
)(count
), 3);
1722 return ((qword
)(vec_sll((vec_uchar16
)(a
), x
)));
/* Shift Left Quadword by Bytes */
/* Shifts quadword A left with vec_slo, using COUNT multiplied by eight
   (stored in element 3 of x.i) as the shift control, and returns all zeros
   when bit 0x10 of COUNT is set.
   NOTE(review): the declarations of `x' and `mask' are not visible in this
   excerpt -- confirm against the complete header.  */
1728 static __inline qword
si_shlqbyi(qword a
, unsigned int count
)
1736 x
.i
[3] = count
<< 3;
/* Select an all-zero or all-ones byte mask from bit 0x10 of the count.  */
1737 mask
= (count
& 0x10) ? vec_splat_u8(0) : vec_splat_u8(-1);
1738 return ((qword
)(vec_and(vec_slo((vec_uchar16
)(a
), x
.v
), mask
)));
/* Shifts quadword A left with vec_slo.  The control is byte 3 of COUNT,
   replicated and multiplied by eight; the output is forced to zero when
   bit 0x80 of `cnt' is set.
   NOTE(review): the declarations of `x', `cnt' and `mask', and the
   statement that assigns `cnt', are not visible in this excerpt -- confirm
   against the complete header.  */
1741 static __inline qword
si_shlqby(qword a
, qword count
)
1750 x
.v
= vec_sl(vec_splat((vec_uchar16
)(count
), 3), vec_splat_u8(3));
1752 mask
= (cnt
& 0x80) ? vec_splat_u8(0) : vec_splat_u8(-1);
1753 return ((qword
)(vec_and(vec_slo((vec_uchar16
)(a
), x
.v
), mask
)));
/* Shift Left Quadword by Bytes with Bit Count */
/* Shifts quadword A left with vec_slo.  The control is byte 3 of COUNT,
   replicated to every byte; the output is forced to zero when bit 0x80 of
   `cnt' is set.
   NOTE(review): the declarations of `x', `cnt' and `mask', and the
   statement that assigns `cnt', are not visible in this excerpt -- confirm
   against the complete header.  */
1758 static __inline qword
si_shlqbybi(qword a
, qword count
)
1767 x
.v
= vec_splat((vec_uchar16
)(count
), 3);
1769 mask
= (cnt
& 0x80) ? vec_splat_u8(0) : vec_splat_u8(-1);
1770 return ((qword
)(vec_and(vec_slo((vec_uchar16
)(a
), x
.v
), mask
)));
/* Both stop forms ignore their arguments and expand to SPU_STOP_ACTION.  */
#define si_stop(_type)          SPU_STOP_ACTION
#define si_stopd(a, b, c)       SPU_STOP_ACTION
1782 static __inline qword
si_sfh(qword a
, qword b
)
1784 return ((qword
)(vec_sub((vec_ushort8
)(b
), (vec_ushort8
)(a
))));
1787 static __inline qword
si_sf(qword a
, qword b
)
1789 return ((qword
)(vec_sub((vec_uint4
)(b
), (vec_uint4
)(a
))));
1792 static __inline qword
si_fs(qword a
, qword b
)
1794 return ((qword
)(vec_sub((vec_float4
)(a
), (vec_float4
)(b
))));
1797 static __inline qword
si_dfs(qword a
, qword b
)
1804 aa
.v
= (vec_double2
)(a
);
1805 bb
.v
= (vec_double2
)(b
);
1806 dd
.d
[0] = aa
.d
[0] - bb
.d
[0];
1807 dd
.d
[1] = aa
.d
[1] - bb
.d
[1];
1808 return ((qword
)(dd
.v
));
/* Halfword subtract with an immediate: the minuend passed to vec_sub is
   halfword element 1 of si_from_short (b), replicated to every element.
   NOTE(review): the second operand of vec_sub and the end of the statement
   are not visible in this excerpt -- presumably it is A viewed as
   halfwords; confirm against the complete header.  */
1811 static __inline qword
si_sfhi(qword a
, short b
)
1813 return ((qword
)(vec_sub(vec_splat((vec_short8
)(si_from_short(b
)), 1),
/* Word subtract with an immediate: the minuend passed to vec_sub is word
   element 0 of si_from_int (b), replicated to every element.
   NOTE(review): the second operand of vec_sub and the end of the statement
   are not visible in this excerpt -- presumably it is A viewed as words;
   confirm against the complete header.  */
1817 static __inline qword
si_sfi(qword a
, int b
)
1819 return ((qword
)(vec_sub(vec_splat((vec_int4
)(si_from_int(b
)), 0),
/* Subtract word extended */
/* Extended subtract on words: _b + ~_a + (_c & 1), where the complement of
   _a is formed with vec_nor and only the low bit of each word of _c is
   used.  */
#define si_sfx(_a, _b, _c)                                              \
  ((qword)(vec_add(vec_add((vec_uint4)(_b),                             \
                           vec_nor((vec_uint4)(_a), (vec_uint4)(_a))),  \
                   vec_and((vec_uint4)(_c), vec_splat_u32(1)))))
/* Sum Bytes into Shorts */
1832 static __inline qword
si_sumb(qword a
, qword b
)
1834 vec_uint4 zero
= (vec_uint4
){0};
1835 vec_ushort8 sum_a
, sum_b
;
1837 sum_a
= (vec_ushort8
)vec_sum4s((vec_uchar16
)(a
), zero
);
1838 sum_b
= (vec_ushort8
)vec_sum4s((vec_uchar16
)(b
), zero
);
1840 return ((qword
)(vec_perm(sum_a
, sum_b
, ((vec_uchar16
){18, 19, 2, 3, 22, 23, 6, 7,
1841 26, 27, 10, 11, 30, 31, 14, 15}))));
1846 static __inline qword
si_xor(qword a
, qword b
)
1848 return ((qword
)(vec_xor((vec_uchar16
)(a
), (vec_uchar16
)(b
))));
1851 static __inline qword
si_xorbi(qword a
, unsigned char b
)
1853 return ((qword
)(vec_xor((vec_uchar16
)(a
),
1854 vec_splat((vec_uchar16
)(si_from_uchar(b
)), 3))));
1857 static __inline qword
si_xorhi(qword a
, unsigned short b
)
1859 return ((qword
)(vec_xor((vec_ushort8
)(a
),
1860 vec_splat((vec_ushort8
)(si_from_ushort(b
)), 1))));
1863 static __inline qword
si_xori(qword a
, unsigned int b
)
1865 return ((qword
)(vec_xor((vec_uint4
)(a
),
1866 vec_splat((vec_uint4
)(si_from_uint(b
)), 0))));
/* Generate Controls for Sub-Quadword Insertion */
1872 static __inline qword
si_cbd(qword a
, int imm
)
1876 unsigned char c
[16];
1879 shmask
.v
= ((vec_uint4
){0x10111213, 0x14151617, 0x18191A1B, 0x1C1D1E1F});
1880 shmask
.c
[(si_to_uint(a
) + (unsigned int)(imm
)) & 0xF] = 0x03;
1881 return ((qword
)(shmask
.v
));
1884 static __inline qword
si_cdd(qword a
, int imm
)
1888 unsigned long long ll
[2];
1891 shmask
.v
= ((vec_uint4
){0x10111213, 0x14151617, 0x18191A1B, 0x1C1D1E1F});
1892 shmask
.ll
[((si_to_uint(a
) + (unsigned int)(imm
)) >> 3) & 0x1] = 0x0001020304050607ULL
;
1893 return ((qword
)(shmask
.v
));
1896 static __inline qword
si_chd(qword a
, int imm
)
1900 unsigned short s
[8];
1903 shmask
.v
= ((vec_uint4
){0x10111213, 0x14151617, 0x18191A1B, 0x1C1D1E1F});
1904 shmask
.s
[((si_to_uint(a
) + (unsigned int)(imm
)) >> 1) & 0x7] = 0x0203;
1905 return ((qword
)(shmask
.v
));
1908 static __inline qword
si_cwd(qword a
, int imm
)
1915 shmask
.v
= ((vec_uint4
){0x10111213, 0x14151617, 0x18191A1B, 0x1C1D1E1F});
1916 shmask
.i
[((si_to_uint(a
) + (unsigned int)(imm
)) >> 2) & 0x3] = 0x00010203;
1917 return ((qword
)(shmask
.v
));
1920 static __inline qword
si_cbx(qword a
, qword b
)
1924 unsigned char c
[16];
1927 shmask
.v
= ((vec_uint4
){0x10111213, 0x14151617, 0x18191A1B, 0x1C1D1E1F});
1928 shmask
.c
[si_to_uint((qword
)(vec_add((vec_uint4
)(a
), (vec_uint4
)(b
)))) & 0xF] = 0x03;
1929 return ((qword
)(shmask
.v
));
1933 static __inline qword
si_cdx(qword a
, qword b
)
1937 unsigned long long ll
[2];
1940 shmask
.v
= ((vec_uint4
){0x10111213, 0x14151617, 0x18191A1B, 0x1C1D1E1F});
1941 shmask
.ll
[(si_to_uint((qword
)(vec_add((vec_uint4
)(a
), (vec_uint4
)(b
)))) >> 3) & 0x1] = 0x0001020304050607ULL
;
1942 return ((qword
)(shmask
.v
));
1945 static __inline qword
si_chx(qword a
, qword b
)
1949 unsigned short s
[8];
1952 shmask
.v
= ((vec_uint4
){0x10111213, 0x14151617, 0x18191A1B, 0x1C1D1E1F});
1953 shmask
.s
[(si_to_uint((qword
)(vec_add((vec_uint4
)(a
), (vec_uint4
)(b
)))) >> 1) & 0x7] = 0x0203;
1954 return ((qword
)(shmask
.v
));
1957 static __inline qword
si_cwx(qword a
, qword b
)
1964 shmask
.v
= ((vec_uint4
){0x10111213, 0x14151617, 0x18191A1B, 0x1C1D1E1F});
1965 shmask
.i
[(si_to_uint((qword
)(vec_add((vec_uint4
)(a
), (vec_uint4
)(b
)))) >> 2) & 0x3] = 0x00010203;
1966 return ((qword
)(shmask
.v
));
/* Constant Formation */
1972 static __inline qword
si_il(signed short imm
)
1974 return ((qword
)(vec_splat((vec_int4
)(si_from_int((signed int)(imm
))), 0)));
1978 static __inline qword
si_ila(unsigned int imm
)
1980 return ((qword
)(vec_splat((vec_uint4
)(si_from_uint(imm
)), 0)));
1983 static __inline qword
si_ilh(signed short imm
)
1985 return ((qword
)(vec_splat((vec_short8
)(si_from_short(imm
)), 1)));
1988 static __inline qword
si_ilhu(signed short imm
)
1990 return ((qword
)(vec_splat((vec_uint4
)(si_from_uint((unsigned int)(imm
) << 16)), 0)));
1993 static __inline qword
si_iohl(qword a
, unsigned short imm
)
1995 return ((qword
)(vec_or((vec_uint4
)(a
), vec_splat((vec_uint4
)(si_from_uint((unsigned int)(imm
))), 0))));
/* Both no-op forms expand to nothing.  */
#define si_lnop()               /* do nothing */
#define si_nop()                /* do nothing */
/* Memory Load and Store */
2006 static __inline qword
si_lqa(unsigned int imm
)
2008 return ((qword
)(vec_ld(0, (vector
unsigned char *)(imm
))));
2011 static __inline qword
si_lqd(qword a
, unsigned int imm
)
2013 return ((qword
)(vec_ld(si_to_uint(a
) & ~0xF, (vector
unsigned char *)(imm
))));
2016 static __inline qword
si_lqr(unsigned int imm
)
2018 return ((qword
)(vec_ld(0, (vector
unsigned char *)(imm
))));
2021 static __inline qword
si_lqx(qword a
, qword b
)
2023 return ((qword
)(vec_ld(si_to_uint((qword
)(vec_add((vec_uint4
)(a
), (vec_uint4
)(b
)))), (vector
unsigned char *)(0))));
2026 static __inline
void si_stqa(qword a
, unsigned int imm
)
2028 vec_st((vec_uchar16
)(a
), 0, (vector
unsigned char *)(imm
));
2031 static __inline
void si_stqd(qword a
, qword b
, unsigned int imm
)
2033 vec_st((vec_uchar16
)(a
), si_to_uint(b
) & ~0xF, (vector
unsigned char *)(imm
));
2036 static __inline
void si_stqr(qword a
, unsigned int imm
)
2038 vec_st((vec_uchar16
)(a
), 0, (vector
unsigned char *)(imm
));
2041 static __inline
void si_stqx(qword a
, qword b
, qword c
)
2043 vec_st((vec_uchar16
)(a
),
2044 si_to_uint((qword
)(vec_add((vec_uint4
)(b
), (vec_uint4
)(c
)))),
2045 (vector
unsigned char *)(0));
2048 #endif /* !__SPU__ */
2049 #endif /* !_SI2VMX_H_ */