/* Cell BEA specific SPU intrinsics to PPU/VMX intrinsics
   Copyright (C) 2007, 2009 Free Software Foundation, Inc.

   This file is free software; you can redistribute it and/or modify it under
   the terms of the GNU General Public License as published by the Free
   Software Foundation; either version 3 of the License, or (at your option)
   any later version.

   This file is distributed in the hope that it will be useful, but WITHOUT
   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
   FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
   for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */
#ifndef _SI2VMX_H_
#define _SI2VMX_H_	1

#ifndef __SPU__

#include <stdlib.h>
#include <vec_types.h>
/* Specify a default halt action for spu_hcmpeq and spu_hcmpgt intrinsics.
 * Users can override the action by defining it prior to including this
 * header file.
 */
#ifndef SPU_HALT_ACTION
#define SPU_HALT_ACTION		abort()
#endif

/* Specify a default stop action for the spu_stop intrinsic.
 * Users can override the action by defining it prior to including this
 * header file.
 */
#ifndef SPU_STOP_ACTION
#define SPU_STOP_ACTION		abort()
#endif

/* Specify a default action for unsupported intrinsic.
 * Users can override the action by defining it prior to including this
 * header file.
 */
#ifndef SPU_UNSUPPORTED_ACTION
#define SPU_UNSUPPORTED_ACTION	abort()
#endif
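
/* Usage sketch (hypothetical user code): any of the three actions can be
 * replaced before this header is included, for example:
 *
 *   #define SPU_HALT_ACTION  fprintf(stderr, "halt taken\n"); abort()
 *   #include <si2vmx.h>
 *
 * SPU_HALT_ACTION is expanded inside a braced if-body below, so a
 * two-statement sequence like the above is accepted there.
 */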
/* Casting intrinsics - from scalar to quadword
 */

static __inline qword
si_from_uchar(unsigned char c) {
  union { qword q; unsigned char c[16]; } x;
  x.c[3] = c;
  return (x.q);
}

static __inline qword
si_from_char(signed char c) {
  union { qword q; signed char c[16]; } x;
  x.c[3] = c;
  return (x.q);
}

static __inline qword
si_from_ushort(unsigned short s) {
  union { qword q; unsigned short s[8]; } x;
  x.s[1] = s;
  return (x.q);
}

static __inline qword
si_from_short(short s) {
  union { qword q; short s[8]; } x;
  x.s[1] = s;
  return (x.q);
}

static __inline qword
si_from_uint(unsigned int i) {
  union { qword q; unsigned int i[4]; } x;
  x.i[0] = i;
  return (x.q);
}

static __inline qword
si_from_int(int i) {
  union { qword q; int i[4]; } x;
  x.i[0] = i;
  return (x.q);
}

static __inline qword
si_from_ullong(unsigned long long l) {
  union { qword q; unsigned long long l[2]; } x;
  x.l[0] = l;
  return (x.q);
}

static __inline qword
si_from_llong(long long l) {
  union { qword q; long long l[2]; } x;
  x.l[0] = l;
  return (x.q);
}

static __inline qword
si_from_float(float f) {
  union { qword q; float f[4]; } x;
  x.f[0] = f;
  return (x.q);
}

static __inline qword
si_from_double(double d) {
  union { qword q; double d[2]; } x;
  x.d[0] = d;
  return (x.q);
}

static __inline qword
si_from_ptr(void *ptr) {
  union { qword q; void *p; } x;
  x.p = ptr;
  return (x.q);
}
/* Casting intrinsics - from quadword to scalar
 */
static __inline unsigned char
si_to_uchar(qword q) {
  union { qword q; unsigned char c[16]; } x;
  x.q = q;
  return (x.c[3]);
}

static __inline signed char
si_to_char(qword q) {
  union { qword q; signed char c[16]; } x;
  x.q = q;
  return (x.c[3]);
}

static __inline unsigned short
si_to_ushort(qword q) {
  union { qword q; unsigned short s[8]; } x;
  x.q = q;
  return (x.s[1]);
}

static __inline short
si_to_short(qword q) {
  union { qword q; short s[8]; } x;
  x.q = q;
  return (x.s[1]);
}

static __inline unsigned int
si_to_uint(qword q) {
  union { qword q; unsigned int i[4]; } x;
  x.q = q;
  return (x.i[0]);
}

static __inline int
si_to_int(qword q) {
  union { qword q; int i[4]; } x;
  x.q = q;
  return (x.i[0]);
}

static __inline unsigned long long
si_to_ullong(qword q) {
  union { qword q; unsigned long long l[2]; } x;
  x.q = q;
  return (x.l[0]);
}

static __inline long long
si_to_llong(qword q) {
  union { qword q; long long l[2]; } x;
  x.q = q;
  return (x.l[0]);
}

static __inline float
si_to_float(qword q) {
  union { qword q; float f[4]; } x;
  x.q = q;
  return (x.f[0]);
}

static __inline double
si_to_double(qword q) {
  union { qword q; double d[2]; } x;
  x.q = q;
  return (x.d[0]);
}

static __inline void *
si_to_ptr(qword q) {
  union { qword q; void *p; } x;
  x.q = q;
  return (x.p);
}
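
/* Illustrative round trip through the scalar "preferred slot" (values
 * assumed, not part of the original interface):
 *
 *   unsigned int u = 0xDEADBEEF;
 *   qword q = si_from_uint(u);       // placed in word element 0
 *   unsigned int v = si_to_uint(q);  // v == 0xDEADBEEF
 *
 * Bytes live in element 3, halfwords in element 1, and words, doublewords
 * and pointers in element 0; the splat indices used by the immediate-form
 * intrinsics below match these positions.
 */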
/* Absolute difference
 */
static __inline qword
si_absdb(qword a, qword b)
{
  vec_uchar16 ac, bc, dc;

  ac = (vec_uchar16)(a);
  bc = (vec_uchar16)(b);
  dc = vec_sel(vec_sub(bc, ac), vec_sub(ac, bc), vec_cmpgt(ac, bc));

  return ((qword)(dc));
}
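
/* Worked example for si_absdb (values assumed): with
 *
 *   qword a = si_ilh(0x1030);  // bytes alternate 0x10, 0x30
 *   qword b = si_ilh(0x3010);  // bytes alternate 0x30, 0x10
 *
 * every byte of si_absdb(a, b) is 0x20: vec_cmpgt picks, per byte, the
 * subtraction whose result is non-negative.
 */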
/* Add intrinsics
 */
#define si_a(_a, _b)	((qword)(vec_add((vec_uint4)(_a), (vec_uint4)(_b))))

#define si_ah(_a, _b)	((qword)(vec_add((vec_ushort8)(_a), (vec_ushort8)(_b))))
static __inline qword
si_ai(qword a, int b)
{
  return ((qword)(vec_add((vec_int4)(a),
                          vec_splat((vec_int4)(si_from_int(b)), 0))));
}

static __inline qword
si_ahi(qword a, short b)
{
  return ((qword)(vec_add((vec_short8)(a),
                          vec_splat((vec_short8)(si_from_short(b)), 1))));
}
#define si_fa(_a, _b)	((qword)(vec_add((vec_float4)(_a), (vec_float4)(_b))))
static __inline qword
si_dfa(qword a, qword b)
{
  union { vec_double2 v; double d[2]; } ad, bd, dd;

  ad.v = (vec_double2)(a);
  bd.v = (vec_double2)(b);
  dd.d[0] = ad.d[0] + bd.d[0];
  dd.d[1] = ad.d[1] + bd.d[1];

  return ((qword)(dd.v));
}
/* Add word extended
 */
#define si_addx(_a, _b, _c)	((qword)(vec_add(vec_add((vec_uint4)(_a), (vec_uint4)(_b)),	\
						 vec_and((vec_uint4)(_c), vec_splat_u32(1)))))

/* Bit-wise AND
 */
#define si_and(_a, _b)	((qword)(vec_and((vec_uint4)(_a), (vec_uint4)(_b))))
static __inline qword
si_andbi(qword a, signed char b)
{
  return ((qword)(vec_and((vec_char16)(a),
                          vec_splat((vec_char16)(si_from_char(b)), 3))));
}

static __inline qword
si_andhi(qword a, signed short b)
{
  return ((qword)(vec_and((vec_short8)(a),
                          vec_splat((vec_short8)(si_from_short(b)), 1))));
}

static __inline qword
si_andi(qword a, signed int b)
{
  return ((qword)(vec_and((vec_int4)(a),
                          vec_splat((vec_int4)(si_from_int(b)), 0))));
}
/* Bit-wise AND with complement
 */
#define si_andc(_a, _b)	((qword)(vec_andc((vec_uchar16)(_a), (vec_uchar16)(_b))))

/* Average byte vectors
 */
#define si_avgb(_a, _b)	((qword)(vec_avg((vec_uchar16)(_a), (vec_uchar16)(_b))))

/* Branch indirect and set link on external data
 */
#define si_bisled(_func)	/* not mappable */
#define si_bisledd(_func)	/* not mappable */
#define si_bislede(_func)	/* not mappable */
/* Borrow generate
 */
#define si_bg(_a, _b)	((qword)(vec_subc((vec_uint4)(_b), (vec_uint4)(_a))))

#define si_bgx(_a, _b, _c)	((qword)(vec_and(vec_or(vec_cmpgt((vec_uint4)(_b), (vec_uint4)(_a)),		\
							vec_and(vec_cmpeq((vec_uint4)(_b), (vec_uint4)(_a)),	\
								(vec_uint4)(_c))), vec_splat_u32(1))))
/* Compare absolute equal
 */
static __inline qword
si_fcmeq(qword a, qword b)
{
  vec_float4 msb = (vec_float4)((vec_uint4){0x80000000, 0x80000000, 0x80000000, 0x80000000});

  return ((qword)(vec_cmpeq(vec_andc((vec_float4)(a), msb),
                            vec_andc((vec_float4)(b), msb))));
}
static __inline qword
si_dfcmeq(qword a, qword b)
{
  vec_uint4 sign_mask = (vec_uint4) { 0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF };
  vec_uint4 nan_mask = (vec_uint4) { 0x7FF00000, 0x00000000, 0x7FF00000, 0x00000000 };
  vec_uchar16 hihi_promote = (vec_uchar16) { 0,1,2,3, 16,17,18,19, 8,9,10,11, 24,25,26,27};

  vec_uint4 biteq, aabs, babs, a_gt, ahi_inf, anan, result;

  union { vec_uchar16 v; int i[4]; } x;

  /* Shift 4 bytes  */
  x.i[3] = 4 << 3;

  /* Mask out sign bits */
  aabs = vec_and((vec_uint4)a, sign_mask);
  babs = vec_and((vec_uint4)b, sign_mask);

  /* A) Check for bit equality, store in high word */
  biteq = (vec_uint4) vec_cmpeq((vec_uint4)aabs, (vec_uint4)babs);
  biteq = vec_and(biteq, (vec_uint4)vec_slo((vec_uchar16)biteq, x.v));

  /* B) Check if a is NaN, store in high word:
     B1) If the high word is greater than max_exp (indicates a NaN)
     B2) If the low word is greater than 0  */
  a_gt = (vec_uint4)vec_cmpgt(aabs, nan_mask);

  /* B3) Check if the high word is equal to the inf exponent */
  ahi_inf = (vec_uint4)vec_cmpeq(aabs, nan_mask);

  /* anan = B1[hi] or (B2[lo] and B3[hi]) */
  anan = (vec_uint4)vec_or(a_gt, vec_and((vec_uint4)vec_slo((vec_uchar16)a_gt, x.v), ahi_inf));

  /* result = A and not B */
  result = vec_andc(biteq, anan);

  /* Promote high words to 64 bits and return */
  return ((qword)(vec_perm((vec_uchar16)result, (vec_uchar16)result, hihi_promote)));
}
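
/* A sketch of the word-wise emulation above: VMX has no 64-bit integer
 * compares, so each double is handled as a (hi,lo) word pair.  vec_slo by
 * x.v (4 bytes) slides each lane's low-word result under its high word,
 * so after the vec_and the high-word lane is true only if both 32-bit
 * halves matched.  The NaN test computes, per doubleword,
 *
 *   anan = (hi > 0x7FF00000) || (hi == 0x7FF00000 && lo > 0)
 *
 * and hihi_promote then copies each surviving high-word mask over both
 * words of its doubleword, yielding a full 64-bit result mask.
 */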
/* Compare absolute greater than
 */
static __inline qword
si_fcmgt(qword a, qword b)
{
  vec_float4 msb = (vec_float4)((vec_uint4){0x80000000, 0x80000000, 0x80000000, 0x80000000});

  return ((qword)(vec_cmpgt(vec_andc((vec_float4)(a), msb),
                            vec_andc((vec_float4)(b), msb))));
}
static __inline qword
si_dfcmgt(qword a, qword b)
{
  vec_uchar16 splat_hi = (vec_uchar16) { 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 };
  vec_uint4 nan_mask = (vec_uint4) { 0x7FF00000, 0x0, 0x7FF00000, 0x0 };
  vec_uint4 sign_mask = (vec_uint4) { 0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF };

  union { vec_uchar16 v; int i[4]; } x;

  /* Shift 4 bytes  */
  x.i[3] = 4 << 3;

  // absolute value of a,b
  vec_uint4 aabs = vec_and((vec_uint4)a, sign_mask);
  vec_uint4 babs = vec_and((vec_uint4)b, sign_mask);

  // check a for NaN
  vec_uint4 a_inf = (vec_uint4)vec_cmpeq(aabs, nan_mask);
  vec_uint4 a_nan = (vec_uint4)vec_cmpgt(aabs, nan_mask);
  a_nan = vec_or(a_nan, vec_and((vec_uint4)vec_slo((vec_uchar16)a_nan, x.v), a_inf));
  a_nan = (vec_uint4)vec_perm((vec_uchar16)a_nan, (vec_uchar16)a_nan, splat_hi);

  // check b for NaN
  vec_uint4 b_inf = (vec_uint4)vec_cmpeq(babs, nan_mask);
  vec_uint4 b_nan = (vec_uint4)vec_cmpgt(babs, nan_mask);
  b_nan = vec_or(b_nan, vec_and((vec_uint4)vec_slo((vec_uchar16)b_nan, x.v), b_inf));
  b_nan = (vec_uint4)vec_perm((vec_uchar16)b_nan, (vec_uchar16)b_nan, splat_hi);

  // A) Check if the exponents are different
  vec_uint4 gt_hi = (vec_uint4)vec_cmpgt(aabs, babs);

  // B) Check if high word equal, and low word greater
  vec_uint4 gt_lo = (vec_uint4)vec_cmpgt((vec_uint4)aabs, (vec_uint4)babs);
  vec_uint4 eq = (vec_uint4)vec_cmpeq(aabs, babs);
  vec_uint4 eqgt = vec_and(eq, vec_slo(gt_lo, x.v));

  // If either A or B is true, return true (unless NaNs detected)
  vec_uint4 r = vec_or(gt_hi, eqgt);

  // splat the high words of the comparison step
  r = (vec_uint4)vec_perm((vec_uchar16)r, (vec_uchar16)r, splat_hi);

  // correct for NaNs in input
  return ((qword)vec_andc(r, vec_or(a_nan, b_nan)));
}
/* Compare equal
 */
static __inline qword
si_ceqb(qword a, qword b)
{
  return ((qword)(vec_cmpeq((vec_uchar16)(a), (vec_uchar16)(b))));
}

static __inline qword
si_ceqh(qword a, qword b)
{
  return ((qword)(vec_cmpeq((vec_ushort8)(a), (vec_ushort8)(b))));
}

static __inline qword
si_ceq(qword a, qword b)
{
  return ((qword)(vec_cmpeq((vec_uint4)(a), (vec_uint4)(b))));
}

static __inline qword
si_fceq(qword a, qword b)
{
  return ((qword)(vec_cmpeq((vec_float4)(a), (vec_float4)(b))));
}

static __inline qword
si_ceqbi(qword a, signed char b)
{
  return ((qword)(vec_cmpeq((vec_char16)(a),
                            vec_splat((vec_char16)(si_from_char(b)), 3))));
}

static __inline qword
si_ceqhi(qword a, signed short b)
{
  return ((qword)(vec_cmpeq((vec_short8)(a),
                            vec_splat((vec_short8)(si_from_short(b)), 1))));
}

static __inline qword
si_ceqi(qword a, signed int b)
{
  return ((qword)(vec_cmpeq((vec_int4)(a),
                            vec_splat((vec_int4)(si_from_int(b)), 0))));
}
static __inline qword
si_dfceq(qword a, qword b)
{
  vec_uint4 sign_mask = (vec_uint4) { 0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF };
  vec_uint4 nan_mask = (vec_uint4) { 0x7FF00000, 0x00000000, 0x7FF00000, 0x00000000 };
  vec_uchar16 hihi_promote = (vec_uchar16) { 0,1,2,3, 16,17,18,19, 8,9,10,11, 24,25,26,27};

  vec_uint4 biteq, aabs, babs, a_gt, ahi_inf, anan, iszero, result;

  union { vec_uchar16 v; int i[4]; } x;

  /* Shift 4 bytes  */
  x.i[3] = 4 << 3;

  /* A) Check for bit equality, store in high word */
  biteq = (vec_uint4) vec_cmpeq((vec_uint4)a, (vec_uint4)b);
  biteq = vec_and(biteq, (vec_uint4)vec_slo((vec_uchar16)biteq, x.v));

  /* Mask out sign bits */
  aabs = vec_and((vec_uint4)a, sign_mask);
  babs = vec_and((vec_uint4)b, sign_mask);

  /* B) Check if a is NaN, store in high word:
     B1) If the high word is greater than max_exp (indicates a NaN)
     B2) If the low word is greater than 0  */
  a_gt = (vec_uint4)vec_cmpgt(aabs, nan_mask);

  /* B3) Check if the high word is equal to the inf exponent */
  ahi_inf = (vec_uint4)vec_cmpeq(aabs, nan_mask);

  /* anan = B1[hi] or (B2[lo] and B3[hi]) */
  anan = (vec_uint4)vec_or(a_gt, vec_and((vec_uint4)vec_slo((vec_uchar16)a_gt, x.v), ahi_inf));

  /* C) Check for 0 = -0 special case */
  iszero = (vec_uint4)vec_cmpeq((vec_uint4)vec_or(aabs, babs), (vec_uint4)vec_splat_u32(0));
  iszero = vec_and(iszero, (vec_uint4)vec_slo((vec_uchar16)iszero, x.v));

  /* result = (A or C) and not B */
  result = vec_or(biteq, iszero);
  result = vec_andc(result, anan);

  /* Promote high words to 64 bits and return */
  return ((qword)(vec_perm((vec_uchar16)result, (vec_uchar16)result, hihi_promote)));
}
/* Compare greater than
 */
static __inline qword
si_cgtb(qword a, qword b)
{
  return ((qword)(vec_cmpgt((vec_char16)(a), (vec_char16)(b))));
}

static __inline qword
si_cgth(qword a, qword b)
{
  return ((qword)(vec_cmpgt((vec_short8)(a), (vec_short8)(b))));
}

static __inline qword
si_cgt(qword a, qword b)
{
  return ((qword)(vec_cmpgt((vec_int4)(a), (vec_int4)(b))));
}

static __inline qword
si_clgtb(qword a, qword b)
{
  return ((qword)(vec_cmpgt((vec_uchar16)(a), (vec_uchar16)(b))));
}

static __inline qword
si_clgth(qword a, qword b)
{
  return ((qword)(vec_cmpgt((vec_ushort8)(a), (vec_ushort8)(b))));
}

static __inline qword
si_clgt(qword a, qword b)
{
  return ((qword)(vec_cmpgt((vec_uint4)(a), (vec_uint4)(b))));
}

static __inline qword
si_fcgt(qword a, qword b)
{
  return ((qword)(vec_cmpgt((vec_float4)(a), (vec_float4)(b))));
}
static __inline qword
si_dfcgt(qword a, qword b)
{
  vec_uchar16 splat_hi = (vec_uchar16) { 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 };
  vec_uchar16 borrow_shuffle = (vec_uchar16) { 4,5,6,7, 192,192,192,192, 12,13,14,15, 192,192,192,192 };
  vec_uint4 nan_mask = (vec_uint4) { 0x7FF00000, 0x0, 0x7FF00000, 0x0 };
  vec_uint4 sign_mask = (vec_uint4) { 0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF };

  union { vec_uchar16 v; int i[4]; } x;

  /* Shift 4 bytes  */
  x.i[3] = 4 << 3;

  // absolute value of a,b
  vec_uint4 aabs = vec_and((vec_uint4)a, sign_mask);
  vec_uint4 babs = vec_and((vec_uint4)b, sign_mask);

  // check a for NaN
  vec_uint4 a_inf = (vec_uint4)vec_cmpeq(aabs, nan_mask);
  vec_uint4 a_nan = (vec_uint4)vec_cmpgt(aabs, nan_mask);
  a_nan = vec_or(a_nan, vec_and((vec_uint4)vec_slo((vec_uchar16)a_nan, x.v), a_inf));
  a_nan = (vec_uint4)vec_perm((vec_uchar16)a_nan, (vec_uchar16)a_nan, splat_hi);

  // check b for NaN
  vec_uint4 b_inf = (vec_uint4)vec_cmpeq(babs, nan_mask);
  vec_uint4 b_nan = (vec_uint4)vec_cmpgt(babs, nan_mask);
  b_nan = vec_or(b_nan, vec_and((vec_uint4)vec_slo((vec_uchar16)b_nan, x.v), b_inf));
  b_nan = (vec_uint4)vec_perm((vec_uchar16)b_nan, (vec_uchar16)b_nan, splat_hi);

  // sign of a
  vec_uint4 asel = (vec_uint4)vec_sra((vec_int4)(a), (vec_uint4)vec_splat(((vec_uint4)si_from_int(31)), 0));
  asel = (vec_uint4)vec_perm((vec_uchar16)asel, (vec_uchar16)asel, splat_hi);

  // sign of b
  vec_uint4 bsel = (vec_uint4)vec_sra((vec_int4)(b), (vec_uint4)vec_splat(((vec_uint4)si_from_int(31)), 0));
  bsel = (vec_uint4)vec_perm((vec_uchar16)bsel, (vec_uchar16)bsel, splat_hi);

  // negative a
  vec_uint4 abor = vec_subc((vec_uint4)vec_splat_u32(0), aabs);
  vec_uchar16 pat = vec_sel(((vec_uchar16){0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15}), vec_sr(borrow_shuffle, vec_splat_u8(3)), vec_sra(borrow_shuffle, vec_splat_u8(7)));
  abor = (vec_uint4)(vec_perm(vec_perm((vec_uchar16)abor, (vec_uchar16)abor, borrow_shuffle), ((vec_uchar16){0, 0, 0, 0, 0, 0, 0, 0, 0xFF, 0xFF, 0xFF, 0xFF, 0x80, 0x80, 0x80, 0x80}), pat));
  vec_uint4 aneg = vec_add(vec_add(vec_splat_u32(0), vec_nor(aabs, aabs)), vec_and(abor, vec_splat_u32(1)));

  // pick the one we want
  vec_int4 aval = (vec_int4)vec_sel((vec_uchar16)aabs, (vec_uchar16)aneg, (vec_uchar16)asel);

  // negative b
  vec_uint4 bbor = vec_subc((vec_uint4)vec_splat_u32(0), babs);
  bbor = (vec_uint4)(vec_perm(vec_perm((vec_uchar16)bbor, (vec_uchar16)bbor, borrow_shuffle), ((vec_uchar16){0, 0, 0, 0, 0, 0, 0, 0, 0xFF, 0xFF, 0xFF, 0xFF, 0x80, 0x80, 0x80, 0x80}), pat));
  vec_uint4 bneg = vec_add(vec_nor(babs, babs), vec_and(bbor, vec_splat_u32(1)));

  // pick the one we want
  vec_int4 bval = (vec_int4)vec_sel((vec_uchar16)babs, (vec_uchar16)bneg, (vec_uchar16)bsel);

  // A) Check if the exponents are different
  vec_uint4 gt_hi = (vec_uint4)vec_cmpgt(aval, bval);

  // B) Check if high word equal, and low word greater
  vec_uint4 gt_lo = (vec_uint4)vec_cmpgt((vec_uint4)aval, (vec_uint4)bval);
  vec_uint4 eq = (vec_uint4)vec_cmpeq(aval, bval);
  vec_uint4 eqgt = vec_and(eq, vec_slo(gt_lo, x.v));

  // If either A or B is true, return true (unless NaNs detected)
  vec_uint4 r = vec_or(gt_hi, eqgt);

  // splat the high words of the comparison step
  r = (vec_uint4)vec_perm((vec_uchar16)r, (vec_uchar16)r, splat_hi);

  // correct for NaNs in input
  return ((qword)vec_andc(r, vec_or(a_nan, b_nan)));
}
static __inline qword
si_cgtbi(qword a, signed char b)
{
  return ((qword)(vec_cmpgt((vec_char16)(a),
                            vec_splat((vec_char16)(si_from_char(b)), 3))));
}

static __inline qword
si_cgthi(qword a, signed short b)
{
  return ((qword)(vec_cmpgt((vec_short8)(a),
                            vec_splat((vec_short8)(si_from_short(b)), 1))));
}

static __inline qword
si_cgti(qword a, signed int b)
{
  return ((qword)(vec_cmpgt((vec_int4)(a),
                            vec_splat((vec_int4)(si_from_int(b)), 0))));
}

static __inline qword
si_clgtbi(qword a, unsigned char b)
{
  return ((qword)(vec_cmpgt((vec_uchar16)(a),
                            vec_splat((vec_uchar16)(si_from_uchar(b)), 3))));
}

static __inline qword
si_clgthi(qword a, unsigned short b)
{
  return ((qword)(vec_cmpgt((vec_ushort8)(a),
                            vec_splat((vec_ushort8)(si_from_ushort(b)), 1))));
}

static __inline qword
si_clgti(qword a, unsigned int b)
{
  return ((qword)(vec_cmpgt((vec_uint4)(a),
                            vec_splat((vec_uint4)(si_from_uint(b)), 0))));
}
static __inline qword
si_dftsv(qword a, char b)
{
  vec_uchar16 splat_hi = (vec_uchar16) { 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 };
  vec_uint4 sign_mask = (vec_uint4) { 0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF };
  vec_uint4 result = (vec_uint4){0};
  vec_uint4 sign = (vec_uint4)vec_sra((vec_int4)(a), (vec_uint4)vec_splat(((vec_uint4)si_from_int(31)), 0));
  sign = (vec_uint4)vec_perm((vec_uchar16)sign, (vec_uchar16)sign, splat_hi);
  vec_uint4 aabs = vec_and((vec_uint4)a, sign_mask);

  union { vec_uchar16 v; int i[4]; } x;

  /* Shift 4 bytes  */
  x.i[3] = 4 << 3;

  /* Nan or +inf or -inf  */
  if (b & 0x70)
  {
    vec_uint4 nan_mask = (vec_uint4) { 0x7FF00000, 0x0, 0x7FF00000, 0x0 };
    vec_uint4 a_inf = (vec_uint4)vec_cmpeq(aabs, nan_mask);

    /* NaN  */
    if (b & 0x40)
    {
      vec_uint4 a_nan = (vec_uint4)vec_cmpgt(aabs, nan_mask);
      a_nan = vec_or(a_nan, vec_and((vec_uint4)vec_slo((vec_uchar16)a_nan, x.v), a_inf));
      a_nan = (vec_uint4)vec_perm((vec_uchar16)a_nan, (vec_uchar16)a_nan, splat_hi);
      result = vec_or(result, a_nan);
    }

    /* inf  */
    if (b & 0x30)
    {
      a_inf = vec_and((vec_uint4)vec_slo((vec_uchar16)a_inf, x.v), a_inf);
      a_inf = (vec_uint4)vec_perm((vec_uchar16)a_inf, (vec_uchar16)a_inf, splat_hi);

      /* +inf  */
      if (b & 0x20)
        result = vec_or(vec_andc(a_inf, sign), result);
      /* -inf  */
      if (b & 0x10)
        result = vec_or(vec_and(a_inf, sign), result);
    }
  }

  /* 0 or denorm  */
  if (b & 0xF)
  {
    vec_uint4 iszero = (vec_uint4)vec_cmpeq(aabs, (vec_uint4)vec_splat_u32(0));
    iszero = vec_and(iszero, (vec_uint4)vec_slo((vec_uchar16)iszero, x.v));

    /* denorm  */
    if (b & 0x3)
    {
      vec_uint4 denorm_mask = (vec_uint4){0xFFFFF, 0xFFFFF, 0xFFFFF, 0xFFFFF};
      vec_uint4 isdenorm = vec_nor((vec_uint4)vec_cmpgt(aabs, denorm_mask), iszero);
      isdenorm = (vec_uint4)vec_perm((vec_uchar16)isdenorm, (vec_uchar16)isdenorm, splat_hi);

      /* +denorm  */
      if (b & 0x2)
        result = vec_or(vec_andc(isdenorm, sign), result);
      /* -denorm  */
      if (b & 0x1)
        result = vec_or(vec_and(isdenorm, sign), result);
    }

    /* 0  */
    if (b & 0xC)
    {
      iszero = (vec_uint4)vec_perm((vec_uchar16)iszero, (vec_uchar16)iszero, splat_hi);

      /* +0  */
      if (b & 0x8)
        result = vec_or(vec_andc(iszero, sign), result);
      /* -0  */
      if (b & 0x4)
        result = vec_or(vec_and(iszero, sign), result);
    }
  }
  return ((qword)result);
}
/* Carry generate
 */
#define si_cg(_a, _b)	((qword)(vec_addc((vec_uint4)(_a), (vec_uint4)(_b))))

#define si_cgx(_a, _b, _c)	((qword)(vec_or(vec_addc((vec_uint4)(_a), (vec_uint4)(_b)),		\
						vec_addc(vec_add((vec_uint4)(_a), (vec_uint4)(_b)),	\
							 vec_and((vec_uint4)(_c), vec_splat_u32(1))))))
/* Count ones for bytes
 */
static __inline qword
si_cntb(qword a)
{
  vec_uchar16 nib_cnt = (vec_uchar16){0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4};
  vec_uchar16 four = { 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 };
  vec_uchar16 av;

  av = (vec_uchar16)(a);

  return ((qword)(vec_add(vec_perm(nib_cnt, nib_cnt, av),
                          vec_perm(nib_cnt, nib_cnt, vec_sr (av, four)))));
}
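
/* The table trick above, briefly: nib_cnt is a 16-entry table of nibble
 * population counts, and vec_perm only examines the low bits of each
 * index byte, so it acts as a per-byte table lookup.  For a source byte
 * of 0xB7 (value assumed) the result byte is table[0x7] + table[0xB]
 * = 3 + 3 = 6 set bits.
 */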
/* Count leading zeros
 */
static __inline qword
si_clz(qword a)
{
  vec_uchar16 av;
  vec_uchar16 cnt_hi, cnt_lo, cnt, tmp1, tmp2, tmp3;
  vec_uchar16 four = vec_splat_u8(4);
  vec_uchar16 nib_cnt = (vec_uchar16){4, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0};
  vec_uchar16 eight = vec_splat_u8(8);
  vec_uchar16 sixteen = (vec_uchar16){16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16};
  vec_uchar16 twentyfour = (vec_uchar16){24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24};

  av = (vec_uchar16)(a);

  cnt_hi = vec_perm(nib_cnt, nib_cnt, vec_sr(av, four));
  cnt_lo = vec_perm(nib_cnt, nib_cnt, av);

  cnt = vec_add(cnt_hi, vec_and(cnt_lo, vec_cmpeq(cnt_hi, four)));

  tmp1 = (vec_uchar16)vec_sl((vec_uint4)(cnt), (vec_uint4)(eight));
  tmp2 = (vec_uchar16)vec_sl((vec_uint4)(cnt), (vec_uint4)(sixteen));
  tmp3 = (vec_uchar16)vec_sl((vec_uint4)(cnt), (vec_uint4)(twentyfour));

  cnt = vec_add(cnt, vec_and(tmp1, vec_cmpeq(cnt, eight)));
  cnt = vec_add(cnt, vec_and(tmp2, vec_cmpeq(cnt, sixteen)));
  cnt = vec_add(cnt, vec_and(tmp3, vec_cmpeq(cnt, twentyfour)));

  return (qword)((vec_sr((vec_uint4)(cnt), (vec_uint4)(twentyfour))));
}
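
/* Worked example (value assumed): for a word element 0x00012345 the
 * nibble table yields per-byte leading-zero counts of 8, 7, ...; the
 * chained vec_cmpeq/vec_add steps let a saturated byte count absorb the
 * count of the byte to its right, and the final shift leaves 15, the
 * word's leading-zero count, in each word element.
 */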
/* Convert to float
 */
#define si_cuflt(_a, _b)	((qword)(vec_ctf((vec_uint4)(_a), _b)))
#define si_csflt(_a, _b)	((qword)(vec_ctf((vec_int4)(_a), _b)))

/* Convert to signed int
 */
#define si_cflts(_a, _b)	((qword)(vec_cts((vec_float4)(_a), _b)))

/* Convert to unsigned int
 */
#define si_cfltu(_a, _b)	((qword)(vec_ctu((vec_float4)(_a), _b)))

/* Synchronize
 */
#define si_dsync()		/* do nothing */
#define si_sync()		/* do nothing */
#define si_syncc()		/* do nothing */
/* Equivalence
 */
static __inline qword
si_eqv(qword a, qword b)
{
  vec_uchar16 d;

  d = vec_xor((vec_uchar16)(a), (vec_uchar16)(b));
  return ((qword)(vec_nor(d, d)));
}
/* Extend
 */
static __inline qword
si_xsbh(qword a)
{
  vec_char16 av;

  av = (vec_char16)(a);
  return ((qword)(vec_unpackh(vec_perm(av, av, ((vec_uchar16){1, 3, 5, 7, 9,11,13,15,
                                                              0, 0, 0, 0, 0, 0, 0, 0})))));
}

static __inline qword
si_xshw(qword a)
{
  vec_short8 av;

  av = (vec_short8)(a);
  return ((qword)(vec_unpackh(vec_perm(av, av, ((vec_uchar16){2, 3, 6, 7, 10,11,14,15,
                                                              0, 0, 0, 0,  0, 0, 0, 0})))));
}
static __inline qword
si_xswd(qword a)
{
  vec_int4 av;

  av = (vec_int4)(a);
  return ((qword)(vec_perm(av, vec_sra(av, ((vec_uint4){31,31,31,31})),
                           ((vec_uchar16){20, 21, 22, 23,  4,  5,  6,  7,
                                          28, 29, 30, 31, 12, 13, 14, 15}))));
}
static __inline qword
si_fesd(qword a)
{
  union { float f[4]; vec_float4 vf; } in;
  union { double d[2]; vec_double2 vd; } out;

  in.vf = (vec_float4)(a);
  out.d[0] = (double)(in.f[0]);
  out.d[1] = (double)(in.f[2]);
  return ((qword)(out.vd));
}
/* Gather bits
 */
static __inline qword
si_gbb(qword a)
{
  vec_uchar16 bits;
  vec_uint4 bytes;

  bits = vec_sl(vec_and((vec_uchar16)(a), vec_splat_u8(1)), ((vec_uchar16){7, 6, 5, 4, 3, 2, 1, 0,
                                                                           7, 6, 5, 4, 3, 2, 1, 0}));
  bytes = (vec_uint4)vec_sum2s((vec_int4)(vec_sum4s(bits, ((vec_uint4){0}))), ((vec_int4){0}));

  return ((qword)(vec_perm(bytes, bytes, ((vec_uchar16){0, 0, 7,15, 0, 0, 0, 0,
                                                        0, 0, 0, 0, 0, 0, 0, 0}))));
}
static __inline qword
si_gbh(qword a)
{
  vec_ushort8 bits;
  vec_uint4 bytes;

  bits = vec_sl(vec_and((vec_ushort8)(a), vec_splat_u16(1)), ((vec_ushort8){7, 6, 5, 4, 3, 2, 1, 0}));

  bytes = (vec_uint4)vec_sums((vec_int4)(vec_sum4s((vec_short8)(bits), (vec_int4){0})), (vec_int4){0});

  return ((qword)(vec_sld(bytes, bytes, 12)));
}
static __inline qword
si_gb(qword a)
{
  vec_uint4 bits;
  vec_uint4 bytes;

  bits = vec_sl(vec_and((vec_uint4)(a), vec_splat_u32(1)), ((vec_uint4){3, 2, 1, 0}));
  bytes = (vec_uint4)vec_sums((vec_int4)(bits), ((vec_int4){0}));
  return ((qword)(vec_sld(bytes, bytes, 12)));
}
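
/* Semantics sketch for the gather family: si_gb packs the four word LSBs
 * into a 4-bit field, si_gbh the eight halfword LSBs into 8 bits, and
 * si_gbb the sixteen byte LSBs into 16 bits, leftmost element to most
 * significant bit, with the field placed in the preferred word slot.
 * E.g. word LSBs {1,0,1,1} (values assumed) gather to 0xB.
 */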
/* Halt
 */
static __inline void
si_heq(qword a, qword b)
{
  union { vector unsigned int v; unsigned int i[4]; } aa, bb;

  aa.v = (vector unsigned int)(a);
  bb.v = (vector unsigned int)(b);

  if (aa.i[0] == bb.i[0]) { SPU_HALT_ACTION; };
}

static __inline void
si_heqi(qword a, unsigned int b)
{
  union { vector unsigned int v; unsigned int i[4]; } aa;

  aa.v = (vector unsigned int)(a);

  if (aa.i[0] == b) { SPU_HALT_ACTION; };
}

static __inline void
si_hgt(qword a, qword b)
{
  union { vector signed int v; signed int i[4]; } aa, bb;

  aa.v = (vector signed int)(a);
  bb.v = (vector signed int)(b);

  if (aa.i[0] > bb.i[0]) { SPU_HALT_ACTION; };
}

static __inline void
si_hgti(qword a, signed int b)
{
  union { vector signed int v; signed int i[4]; } aa;

  aa.v = (vector signed int)(a);

  if (aa.i[0] > b) { SPU_HALT_ACTION; };
}

static __inline void
si_hlgt(qword a, qword b)
{
  union { vector unsigned int v; unsigned int i[4]; } aa, bb;

  aa.v = (vector unsigned int)(a);
  bb.v = (vector unsigned int)(b);

  if (aa.i[0] > bb.i[0]) { SPU_HALT_ACTION; };
}

static __inline void
si_hlgti(qword a, unsigned int b)
{
  union { vector unsigned int v; unsigned int i[4]; } aa;

  aa.v = (vector unsigned int)(a);

  if (aa.i[0] > b) { SPU_HALT_ACTION; };
}
/* Multiply and add
 */
static __inline qword
si_mpya(qword a, qword b, qword c)
{
  return ((qword)(vec_msum(vec_and((vec_short8)(a),
                                   ((vec_short8){0, -1, 0, -1, 0, -1, 0, -1})),
                           (vec_short8)(b), (vec_int4)(c))));
}
static __inline qword
si_fma(qword a, qword b, qword c)
{
  return ((qword)(vec_madd((vec_float4)(a), (vec_float4)(b), (vec_float4)(c))));
}
static __inline qword
si_dfma(qword a, qword b, qword c)
{
  union { vec_double2 v; double d[2]; } aa, bb, cc, dd;

  aa.v = (vec_double2)(a);
  bb.v = (vec_double2)(b);
  cc.v = (vec_double2)(c);
  dd.d[0] = aa.d[0] * bb.d[0] + cc.d[0];
  dd.d[1] = aa.d[1] * bb.d[1] + cc.d[1];
  return ((qword)(dd.v));
}
/* Form Mask
 */
#define si_fsmbi(_a)	si_fsmb(si_from_int(_a))

static __inline qword
si_fsmb(qword a)
{
  vec_char16 mask;
  vec_ushort8 in;

  in = (vec_ushort8)(a);
  mask = (vec_char16)(vec_perm(in, in, ((vec_uchar16){2, 2, 2, 2, 2, 2, 2, 2,
                                                      3, 3, 3, 3, 3, 3, 3, 3})));
  return ((qword)(vec_sra(vec_sl(mask, ((vec_uchar16){0, 1, 2, 3, 4, 5, 6, 7,
                                                      0, 1, 2, 3, 4, 5, 6, 7})),
                          vec_splat_u8(7))));
}
static __inline qword
si_fsmh(qword a)
{
  vec_uchar16 in;
  vec_short8 mask;

  in = (vec_uchar16)(a);
  mask = (vec_short8)(vec_splat(in, 3));
  return ((qword)(vec_sra(vec_sl(mask, ((vec_ushort8){0, 1, 2, 3, 4, 5, 6, 7})),
                          vec_splat_u16(15))));
}
static __inline qword
si_fsm(qword a)
{
  vec_uchar16 in;
  vec_int4 mask;

  in = (vec_uchar16)(a);
  mask = (vec_int4)(vec_splat(in, 3));
  return ((qword)(vec_sra(vec_sl(mask, ((vec_uint4){28, 29, 30, 31})),
                          ((vec_uint4){31,31,31,31}))));
}
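
/* Worked example (value assumed): the form-mask family expands low-order
 * bits of the preferred slot into per-element masks, MSB to the leftmost
 * element:
 *
 *   qword m = si_fsm(si_from_int(0xA));  // binary 1010
 *   // word elements of m: 0xFFFFFFFF, 0, 0xFFFFFFFF, 0
 */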
/* Move from/to registers
 */
#define si_fscrrd()		((qword)((vec_uint4){0}))
#define si_fscrwr(_a)

#define si_mfspr(_reg)		((qword)((vec_uint4){0}))
#define si_mtspr(_reg, _a)
/* Multiply High High Add
 */
static __inline qword
si_mpyhha(qword a, qword b, qword c)
{
  return ((qword)(vec_add(vec_mule((vec_short8)(a), (vec_short8)(b)), (vec_int4)(c))));
}

static __inline qword
si_mpyhhau(qword a, qword b, qword c)
{
  return ((qword)(vec_add(vec_mule((vec_ushort8)(a), (vec_ushort8)(b)), (vec_uint4)(c))));
}
/* Multiply Subtract
 */
static __inline qword
si_fms(qword a, qword b, qword c)
{
  return ((qword)(vec_madd((vec_float4)(a), (vec_float4)(b),
                           vec_sub(((vec_float4){0.0f}), (vec_float4)(c)))));
}

static __inline qword
si_dfms(qword a, qword b, qword c)
{
  union { vec_double2 v; double d[2]; } aa, bb, cc, dd;

  aa.v = (vec_double2)(a);
  bb.v = (vec_double2)(b);
  cc.v = (vec_double2)(c);
  dd.d[0] = aa.d[0] * bb.d[0] - cc.d[0];
  dd.d[1] = aa.d[1] * bb.d[1] - cc.d[1];
  return ((qword)(dd.v));
}
/* Multiply
 */
static __inline qword
si_fm(qword a, qword b)
{
  return ((qword)(vec_madd((vec_float4)(a), (vec_float4)(b), ((vec_float4){0.0f}))));
}

static __inline qword
si_dfm(qword a, qword b)
{
  union { vec_double2 v; double d[2]; } aa, bb, dd;

  aa.v = (vec_double2)(a);
  bb.v = (vec_double2)(b);
  dd.d[0] = aa.d[0] * bb.d[0];
  dd.d[1] = aa.d[1] * bb.d[1];
  return ((qword)(dd.v));
}
/* Multiply High
 */
static __inline qword
si_mpyh(qword a, qword b)
{
  vec_uint4 sixteen = (vec_uint4){16, 16, 16, 16};

  return ((qword)(vec_sl(vec_mule((vec_short8)(a), (vec_short8)(vec_sl((vec_uint4)(b), sixteen))), sixteen)));
}
/* Multiply High High
 */
static __inline qword
si_mpyhh(qword a, qword b)
{
  return ((qword)(vec_mule((vec_short8)(a), (vec_short8)(b))));
}

static __inline qword
si_mpyhhu(qword a, qword b)
{
  return ((qword)(vec_mule((vec_ushort8)(a), (vec_ushort8)(b))));
}

/* Multiply
 */
static __inline qword
si_mpy(qword a, qword b)
{
  return ((qword)(vec_mulo((vec_short8)(a), (vec_short8)(b))));
}

static __inline qword
si_mpyu(qword a, qword b)
{
  return ((qword)(vec_mulo((vec_ushort8)(a), (vec_ushort8)(b))));
}

static __inline qword
si_mpyi(qword a, short b)
{
  return ((qword)(vec_mulo((vec_short8)(a),
                           vec_splat((vec_short8)(si_from_short(b)), 1))));
}

static __inline qword
si_mpyui(qword a, unsigned short b)
{
  return ((qword)(vec_mulo((vec_ushort8)(a),
                           vec_splat((vec_ushort8)(si_from_ushort(b)), 1))));
}
/* Multiply and Shift Right
 */
static __inline qword
si_mpys(qword a, qword b)
{
  return ((qword)(vec_sra(vec_mulo((vec_short8)(a), (vec_short8)(b)), ((vec_uint4){16,16,16,16}))));
}
/* Nand
 */
static __inline qword
si_nand(qword a, qword b)
{
  vec_uchar16 d;

  d = vec_and((vec_uchar16)(a), (vec_uchar16)(b));
  return ((qword)(vec_nor(d, d)));
}
/* Negative Multiply Add
 */
static __inline qword
si_dfnma(qword a, qword b, qword c)
{
  union { vec_double2 v; double d[2]; } aa, bb, cc, dd;

  aa.v = (vec_double2)(a);
  bb.v = (vec_double2)(b);
  cc.v = (vec_double2)(c);
  dd.d[0] = -cc.d[0] - aa.d[0] * bb.d[0];
  dd.d[1] = -cc.d[1] - aa.d[1] * bb.d[1];
  return ((qword)(dd.v));
}
/* Negative Multiply and Subtract
 */
static __inline qword
si_fnms(qword a, qword b, qword c)
{
  return ((qword)(vec_nmsub((vec_float4)(a), (vec_float4)(b), (vec_float4)(c))));
}

static __inline qword
si_dfnms(qword a, qword b, qword c)
{
  union { vec_double2 v; double d[2]; } aa, bb, cc, dd;

  aa.v = (vec_double2)(a);
  bb.v = (vec_double2)(b);
  cc.v = (vec_double2)(c);
  dd.d[0] = cc.d[0] - aa.d[0] * bb.d[0];
  dd.d[1] = cc.d[1] - aa.d[1] * bb.d[1];
  return ((qword)(dd.v));
}
/* Nor
 */
static __inline qword
si_nor(qword a, qword b)
{
  return ((qword)(vec_nor((vec_uchar16)(a), (vec_uchar16)(b))));
}
/* Or
 */
static __inline qword
si_or(qword a, qword b)
{
  return ((qword)(vec_or((vec_uchar16)(a), (vec_uchar16)(b))));
}

static __inline qword
si_orbi(qword a, unsigned char b)
{
  return ((qword)(vec_or((vec_uchar16)(a),
                         vec_splat((vec_uchar16)(si_from_uchar(b)), 3))));
}

static __inline qword
si_orhi(qword a, unsigned short b)
{
  return ((qword)(vec_or((vec_ushort8)(a),
                         vec_splat((vec_ushort8)(si_from_ushort(b)), 1))));
}

static __inline qword
si_ori(qword a, unsigned int b)
{
  return ((qword)(vec_or((vec_uint4)(a),
                         vec_splat((vec_uint4)(si_from_uint(b)), 0))));
}
/* Or Complement
 */
static __inline qword
si_orc(qword a, qword b)
{
  return ((qword)(vec_or((vec_uchar16)(a), vec_nor((vec_uchar16)(b), (vec_uchar16)(b)))));
}
/* Or Across
 */
static __inline qword
si_orx(qword a)
{
  vec_uchar16 tmp;

  tmp = (vec_uchar16)(a);
  tmp = vec_or(tmp, vec_sld(tmp, tmp, 8));
  tmp = vec_or(tmp, vec_sld(tmp, tmp, 4));
  return ((qword)(vec_and(tmp, ((vec_uchar16){0xFF,0xFF,0xFF,0xFF, 0x00,0x00,0x00,0x00,
                                              0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00}))));
}
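
/* Usage sketch: si_orx is a horizontal reduction.  With word elements
 * {0x1, 0x20, 0x300, 0x4000} (values assumed) the result carries 0x4321
 * in the preferred word slot and zeros elsewhere.
 */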
/* Estimates
 */
static __inline qword
si_frest(qword a)
{
  return ((qword)(vec_re((vec_float4)(a))));
}

static __inline qword
si_frsqest(qword a)
{
  return ((qword)(vec_rsqrte((vec_float4)(a))));
}

#define si_fi(_a, _d)		(_d)
/* Channel Read and Write
 */
#define si_rdch(_channel)		((qword)(vec_splat_u8(0)))	/* not mappable */
#define si_rchcnt(_channel)		((qword)(vec_splat_u8(0)))	/* not mappable */
#define si_wrch(_channel, _a)		/* not mappable */
/* Rotate Left
 */
static __inline qword
si_roth(qword a, qword b)
{
  return ((qword)(vec_rl((vec_ushort8)(a), (vec_ushort8)(b))));
}

static __inline qword
si_rot(qword a, qword b)
{
  return ((qword)(vec_rl((vec_uint4)(a), (vec_uint4)(b))));
}

static __inline qword
si_rothi(qword a, int b)
{
  return ((qword)(vec_rl((vec_ushort8)(a),
                         vec_splat((vec_ushort8)(si_from_int(b)), 1))));
}

static __inline qword
si_roti(qword a, int b)
{
  return ((qword)(vec_rl((vec_uint4)(a),
                         vec_splat((vec_uint4)(si_from_int(b)), 0))));
}
/* Rotate Left with Mask
 */
static __inline qword
si_rothm(qword a, qword b)
{
  vec_ushort8 neg_b, mask;

  neg_b = (vec_ushort8)vec_sub(vec_splat_s16(0), (vec_short8)(b));
  mask = vec_sra(vec_sl(neg_b, vec_splat_u16(11)), vec_splat_u16(15));
  return ((qword)(vec_andc(vec_sr((vec_ushort8)(a), neg_b), mask)));
}

static __inline qword
si_rotm(qword a, qword b)
{
  vec_uint4 neg_b, mask;

  neg_b = (vec_uint4)vec_sub(vec_splat_s32(0), (vec_int4)(b));
  mask = vec_sra(vec_sl(neg_b, ((vec_uint4){26,26,26,26})), ((vec_uint4){31,31,31,31}));
  return ((qword)(vec_andc(vec_sr((vec_uint4)(a), neg_b), mask)));
}

static __inline qword
si_rothmi(qword a, int b)
{
  vec_ushort8 neg_b, mask;

  neg_b = vec_splat((vec_ushort8)(si_from_int(-b)), 1);
  mask = vec_sra(vec_sl(neg_b, vec_splat_u16(11)), vec_splat_u16(15));
  return ((qword)(vec_andc(vec_sr((vec_ushort8)(a), neg_b), mask)));
}

static __inline qword
si_rotmi(qword a, int b)
{
  vec_uint4 neg_b, mask;

  neg_b = vec_splat((vec_uint4)(si_from_int(-b)), 0);
  mask = vec_sra(vec_sl(neg_b, ((vec_uint4){26,26,26,26})), ((vec_uint4){31,31,31,31}));
  return ((qword)(vec_andc(vec_sr((vec_uint4)(a), neg_b), mask)));
}
/* Rotate Left Algebraic with Mask
 */
static __inline qword
si_rotmah(qword a, qword b)
{
  vec_ushort8 neg_b, mask;

  neg_b = (vec_ushort8)vec_sub(vec_splat_s16(0), (vec_short8)(b));
  mask = vec_sra(vec_sl(neg_b, vec_splat_u16(11)), vec_splat_u16(15));
  return ((qword)(vec_sra((vec_short8)(a), (vec_ushort8)vec_or(neg_b, mask))));
}

static __inline qword
si_rotma(qword a, qword b)
{
  vec_uint4 neg_b, mask;

  neg_b = (vec_uint4)vec_sub(vec_splat_s32(0), (vec_int4)(b));
  mask = vec_sra(vec_sl(neg_b, ((vec_uint4){26,26,26,26})), ((vec_uint4){31,31,31,31}));
  return ((qword)(vec_sra((vec_int4)(a), (vec_uint4)vec_or(neg_b, mask))));
}

static __inline qword
si_rotmahi(qword a, int b)
{
  vec_ushort8 neg_b, mask;

  neg_b = vec_splat((vec_ushort8)(si_from_int(-b)), 1);
  mask = vec_sra(vec_sl(neg_b, vec_splat_u16(11)), vec_splat_u16(15));
  return ((qword)(vec_sra((vec_short8)(a), (vec_ushort8)vec_or(neg_b, mask))));
}

static __inline qword
si_rotmai(qword a, int b)
{
  vec_uint4 neg_b, mask;

  neg_b = vec_splat((vec_uint4)(si_from_int(-b)), 0);
  mask = vec_sra(vec_sl(neg_b, ((vec_uint4){26,26,26,26})), ((vec_uint4){31,31,31,31}));
  return ((qword)(vec_sra((vec_int4)(a), (vec_uint4)vec_or(neg_b, mask))));
}
/* Rotate Left Quadword by Bytes with Mask
 */
static __inline qword
si_rotqmbyi(qword a, int count)
{
  union { vec_uchar16 v; int i[4]; } x;
  vec_uchar16 mask;

  count = 0 - count;
  x.i[3] = count << 3;
  mask = (count & 0x10) ? vec_splat_u8(0) : vec_splat_u8(-1);

  return ((qword)(vec_and(vec_sro((vec_uchar16)(a), x.v), mask)));
}

static __inline qword
si_rotqmby(qword a, qword count)
{
  union { vec_uchar16 v; int i[4]; } x;
  int cnt;
  vec_uchar16 mask;

  x.v = (vec_uchar16)(count);
  x.i[0] = cnt = (0 - x.i[0]) << 3;

  x.v = vec_splat(x.v, 3);
  mask = (cnt & 0x80) ? vec_splat_u8(0) : vec_splat_u8(-1);

  return ((qword)(vec_and(vec_sro((vec_uchar16)(a), x.v), mask)));
}
/* Rotate Left Quadword by Bytes
 */
static __inline qword
si_rotqbyi(qword a, int count)
{
  union { vec_uchar16 v; int i[4]; } left, right;

  count <<= 3;
  left.i[3] = count;
  right.i[3] = 0 - count;
  return ((qword)(vec_or(vec_slo((vec_uchar16)(a), left.v), vec_sro((vec_uchar16)(a), right.v))));
}

static __inline qword
si_rotqby(qword a, qword count)
{
  vec_uchar16 left, right;

  left = vec_sl(vec_splat((vec_uchar16)(count), 3), vec_splat_u8(3));
  right = vec_sub(vec_splat_u8(0), left);
  return ((qword)(vec_or(vec_slo((vec_uchar16)(a), left), vec_sro((vec_uchar16)(a), right))));
}
/* Rotate Left Quadword by Bytes Bit Count
 */
static __inline qword
si_rotqbybi(qword a, qword count)
{
  vec_uchar16 left, right;

  left = vec_splat((vec_uchar16)(count), 3);
  right = vec_sub(vec_splat_u8(7), left);
  return ((qword)(vec_or(vec_slo((vec_uchar16)(a), left), vec_sro((vec_uchar16)(a), right))));
}
/* Rotate Left Quadword by Bits
 */
static __inline qword
si_rotqbii(qword a, int count)
{
  vec_uchar16 x, y;
  vec_uchar16 result;

  x = vec_splat((vec_uchar16)(si_from_int(count & 7)), 3);
  y = (vec_uchar16)(vec_sr((vec_uint4)vec_sro((vec_uchar16)(a), ((vec_uchar16)((vec_uint4){0,0,0,120}))),
                           (vec_uint4)vec_sub(vec_splat_u8(8), x)));
  result = vec_or(vec_sll((qword)(a), x), y);
  return ((qword)(result));
}

static __inline qword
si_rotqbi(qword a, qword count)
{
  vec_uchar16 x, y;
  vec_uchar16 result;

  x = vec_and(vec_splat((vec_uchar16)(count), 3), vec_splat_u8(7));
  y = (vec_uchar16)(vec_sr((vec_uint4)vec_sro((vec_uchar16)(a), ((vec_uchar16)((vec_uint4){0,0,0,120}))),
                           (vec_uint4)vec_sub(vec_splat_u8(8), x)));

  result = vec_or(vec_sll((qword)(a), x), y);
  return ((qword)(result));
}
/* Rotate Left Quadword and Mask by Bits
 */
static __inline qword
si_rotqmbii(qword a, int count)
{
  return ((qword)(vec_srl((vec_uchar16)(a), vec_splat((vec_uchar16)(si_from_int(0 - count)), 3))));
}

static __inline qword
si_rotqmbi(qword a, qword count)
{
  return ((qword)(vec_srl((vec_uchar16)(a), vec_sub(vec_splat_u8(0), vec_splat((vec_uchar16)(count), 3)))));
}
/* Rotate Left Quadword and Mask by Bytes with Bit Count
 */
static __inline qword
si_rotqmbybi(qword a, qword count)
{
  union { vec_uchar16 v; int i[4]; } x;
  int cnt;
  vec_uchar16 mask;

  x.v = (vec_uchar16)(count);
  x.i[0] = cnt = 0 - (x.i[0] & ~7);
  x.v = vec_splat(x.v, 3);
  mask = (cnt & 0x80) ? vec_splat_u8(0) : vec_splat_u8(-1);

  return ((qword)(vec_and(vec_sro((vec_uchar16)(a), x.v), mask)));
}
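
/* Usage sketch for the quadword rotates (values assumed): the byte forms
 * take their count in bytes, the bit forms use the count modulo 8, and
 * the "and Mask" (qm*) forms shift right by the negated count with zero
 * fill instead of rotating:
 *
 *   qword r = si_rotqbyi(q, 4);    // rotate left one word slot
 *   qword s = si_rotqmbyi(q, -4);  // shift right one word slot, zero fill
 */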
/* Round Double to Float
 */
static __inline qword
si_frds(qword a)
{
  union { vec_float4 v; float f[4]; } d;
  union { vec_double2 v; double d[2]; } in;

  in.v = (vec_double2)(a);
  d.v = (vec_float4){0.0f};
  d.f[0] = (float)in.d[0];
  d.f[2] = (float)in.d[1];

  return ((qword)(d.v));
}
/* Select bits
 */
static __inline qword
si_selb(qword a, qword b, qword c)
{
  return ((qword)(vec_sel((vec_uchar16)(a), (vec_uchar16)(b), (vec_uchar16)(c))));
}
/* Shuffle bytes
 */
static __inline qword
si_shufb(qword a, qword b, qword pattern)
{
  vec_uchar16 pat;

  pat = vec_sel(((vec_uchar16){0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15}),
                vec_sr((vec_uchar16)(pattern), vec_splat_u8(3)),
                vec_sra((vec_uchar16)(pattern), vec_splat_u8(7)));
  return ((qword)(vec_perm(vec_perm(a, b, pattern),
                           ((vec_uchar16){0, 0, 0, 0, 0, 0, 0, 0,
                                          0xFF, 0xFF, 0xFF, 0xFF, 0x80, 0x80, 0x80, 0x80}),
                           pat)));
}
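
/* The double vec_perm above supplies the SPU shuffle's special control
 * values, which plain vec_perm lacks: pattern bytes of the form 10xxxxxx
 * must produce 0x00, 110xxxxx must produce 0xFF, and 111xxxxx must
 * produce 0x80.  The vec_sra of bit 7 selects, for those bytes only, a
 * second lookup (pattern >> 3) into the constant holding 0x00, 0xFF and
 * 0x80 at the corresponding positions.
 */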
/* Shift Left
 */
static __inline qword
si_shlh(qword a, qword b)
{
  vec_ushort8 mask;

  mask = (vec_ushort8)vec_sra(vec_sl((vec_ushort8)(b), vec_splat_u16(11)), vec_splat_u16(15));
  return ((qword)(vec_andc(vec_sl((vec_ushort8)(a), (vec_ushort8)(b)), mask)));
}

static __inline qword
si_shl(qword a, qword b)
{
  vec_uint4 mask;

  mask = (vec_uint4)vec_sra(vec_sl((vec_uint4)(b), ((vec_uint4){26,26,26,26})), ((vec_uint4){31,31,31,31}));
  return ((qword)(vec_andc(vec_sl((vec_uint4)(a), (vec_uint4)(b)), mask)));
}

static __inline qword
si_shlhi(qword a, unsigned int b)
{
  vec_ushort8 bv, mask;

  bv = vec_splat((vec_ushort8)(si_from_int(b)), 1);
  mask = (vec_ushort8)vec_sra(vec_sl(bv, vec_splat_u16(11)), vec_splat_u16(15));
  return ((qword)(vec_andc(vec_sl((vec_ushort8)(a), bv), mask)));
}

static __inline qword
si_shli(qword a, unsigned int b)
{
  vec_uint4 bv, mask;

  bv = vec_splat((vec_uint4)(si_from_uint(b)), 0);
  mask = (vec_uint4)vec_sra(vec_sl(bv, ((vec_uint4){26,26,26,26})), ((vec_uint4){31,31,31,31}));
  return ((qword)(vec_andc(vec_sl((vec_uint4)(a), bv), mask)));
}
/* Shift Left Quadword
 */
static __inline qword
si_shlqbii(qword a, unsigned int count)
{
  vec_uchar16 x;

  x = vec_splat((vec_uchar16)(si_from_uint(count)), 3);
  return ((qword)(vec_sll((vec_uchar16)(a), x)));
}

static __inline qword
si_shlqbi(qword a, qword count)
{
  vec_uchar16 x;

  x = vec_splat((vec_uchar16)(count), 3);
  return ((qword)(vec_sll((vec_uchar16)(a), x)));
}
1726 static __inline qword
si_shlqbyi(qword a
, unsigned int count
)
1734 x
.i
[3] = count
<< 3;
1735 mask
= (count
& 0x10) ? vec_splat_u8(0) : vec_splat_u8(-1);
1736 return ((qword
)(vec_and(vec_slo((vec_uchar16
)(a
), x
.v
), mask
)));
1739 static __inline qword
si_shlqby(qword a
, qword count
)
1748 x
.v
= vec_sl(vec_splat((vec_uchar16
)(count
), 3), vec_splat_u8(3));
1750 mask
= (cnt
& 0x80) ? vec_splat_u8(0) : vec_splat_u8(-1);
1751 return ((qword
)(vec_and(vec_slo((vec_uchar16
)(a
), x
.v
), mask
)));
1754 /* Shift Left Quadword by Bytes with Bit Count
1756 static __inline qword
si_shlqbybi(qword a
, qword count
)
1765 x
.v
= vec_splat((vec_uchar16
)(count
), 3);
1767 mask
= (cnt
& 0x80) ? vec_splat_u8(0) : vec_splat_u8(-1);
1768 return ((qword
)(vec_and(vec_slo((vec_uchar16
)(a
), x
.v
), mask
)));
/* Stop
 */
#define si_stop(_type)		SPU_STOP_ACTION
#define si_stopd(a, b, c)	SPU_STOP_ACTION
/* Subtract
 */
static __inline qword
si_sfh(qword a, qword b)
{
  return ((qword)(vec_sub((vec_ushort8)(b), (vec_ushort8)(a))));
}

static __inline qword
si_sf(qword a, qword b)
{
  return ((qword)(vec_sub((vec_uint4)(b), (vec_uint4)(a))));
}

static __inline qword
si_fs(qword a, qword b)
{
  return ((qword)(vec_sub((vec_float4)(a), (vec_float4)(b))));
}

static __inline qword
si_dfs(qword a, qword b)
{
  union { vec_double2 v; double d[2]; } aa, bb, dd;

  aa.v = (vec_double2)(a);
  bb.v = (vec_double2)(b);
  dd.d[0] = aa.d[0] - bb.d[0];
  dd.d[1] = aa.d[1] - bb.d[1];
  return ((qword)(dd.v));
}
static __inline qword
si_sfhi(qword a, short b)
{
  return ((qword)(vec_sub(vec_splat((vec_short8)(si_from_short(b)), 1),
                          (vec_short8)(a))));
}

static __inline qword
si_sfi(qword a, int b)
{
  return ((qword)(vec_sub(vec_splat((vec_int4)(si_from_int(b)), 0),
                          (vec_int4)(a))));
}
/* Subtract word extended
 */
#define si_sfx(_a, _b, _c)	((qword)(vec_add(vec_add((vec_uint4)(_b),				\
							 vec_nor((vec_uint4)(_a), (vec_uint4)(_a))),	\
						 vec_and((vec_uint4)(_c), vec_splat_u32(1)))))
/* Sum Bytes into Shorts
 */
static __inline qword
si_sumb(qword a, qword b)
{
  vec_uint4 zero = (vec_uint4){0};
  vec_ushort8 sum_a, sum_b;

  sum_a = (vec_ushort8)vec_sum4s((vec_uchar16)(a), zero);
  sum_b = (vec_ushort8)vec_sum4s((vec_uchar16)(b), zero);

  return ((qword)(vec_perm(sum_a, sum_b, ((vec_uchar16){18, 19,  2,  3, 22, 23,  6,  7,
                                                        26, 27, 10, 11, 30, 31, 14, 15}))));
}
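
/* Semantics sketch (values assumed): each pair of result halfwords holds
 * the byte sums of one word slot, b's sum on the left and a's on the
 * right.  With every byte of a equal to 1 and every byte of b equal to
 * 2, the result halfwords alternate 8 and 4.
 */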
/* Exclusive OR
 */
static __inline qword
si_xor(qword a, qword b)
{
  return ((qword)(vec_xor((vec_uchar16)(a), (vec_uchar16)(b))));
}

static __inline qword
si_xorbi(qword a, unsigned char b)
{
  return ((qword)(vec_xor((vec_uchar16)(a),
                          vec_splat((vec_uchar16)(si_from_uchar(b)), 3))));
}

static __inline qword
si_xorhi(qword a, unsigned short b)
{
  return ((qword)(vec_xor((vec_ushort8)(a),
                          vec_splat((vec_ushort8)(si_from_ushort(b)), 1))));
}

static __inline qword
si_xori(qword a, unsigned int b)
{
  return ((qword)(vec_xor((vec_uint4)(a),
                          vec_splat((vec_uint4)(si_from_uint(b)), 0))));
}
/* Generate Controls for Sub-Quadword Insertion
 */
static __inline qword
si_cbd(qword a, int imm)
{
  union { vec_uint4 v; unsigned char c[16]; } shmask;

  shmask.v = ((vec_uint4){0x10111213, 0x14151617, 0x18191A1B, 0x1C1D1E1F});
  shmask.c[(si_to_uint(a) + (unsigned int)(imm)) & 0xF] = 0x03;
  return ((qword)(shmask.v));
}
static __inline qword
si_cdd(qword a, int imm)
{
  union { vec_uint4 v; unsigned long long ll[2]; } shmask;

  shmask.v = ((vec_uint4){0x10111213, 0x14151617, 0x18191A1B, 0x1C1D1E1F});
  shmask.ll[((si_to_uint(a) + (unsigned int)(imm)) >> 3) & 0x1] = 0x0001020304050607ULL;
  return ((qword)(shmask.v));
}
static __inline qword
si_chd(qword a, int imm)
{
  union { vec_uint4 v; unsigned short s[8]; } shmask;

  shmask.v = ((vec_uint4){0x10111213, 0x14151617, 0x18191A1B, 0x1C1D1E1F});
  shmask.s[((si_to_uint(a) + (unsigned int)(imm)) >> 1) & 0x7] = 0x0203;
  return ((qword)(shmask.v));
}
static __inline qword
si_cwd(qword a, int imm)
{
  union { vec_uint4 v; unsigned int i[4]; } shmask;

  shmask.v = ((vec_uint4){0x10111213, 0x14151617, 0x18191A1B, 0x1C1D1E1F});
  shmask.i[((si_to_uint(a) + (unsigned int)(imm)) >> 2) & 0x3] = 0x00010203;
  return ((qword)(shmask.v));
}
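
/* Usage sketch (hypothetical variables): the generate-controls family is
 * meant to be paired with si_shufb to insert a scalar into a quadword,
 * e.g. replacing the word that address p selects within its containing
 * quadword:
 *
 *   qword mask = si_cwd(si_from_ptr(p), 0);
 *   qword out  = si_shufb(si_from_int(n), old, mask);  // old, word swapped
 */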
static __inline qword
si_cbx(qword a, qword b)
{
  union { vec_uint4 v; unsigned char c[16]; } shmask;

  shmask.v = ((vec_uint4){0x10111213, 0x14151617, 0x18191A1B, 0x1C1D1E1F});
  shmask.c[si_to_uint((qword)(vec_add((vec_uint4)(a), (vec_uint4)(b)))) & 0xF] = 0x03;
  return ((qword)(shmask.v));
}

static __inline qword
si_cdx(qword a, qword b)
{
  union { vec_uint4 v; unsigned long long ll[2]; } shmask;

  shmask.v = ((vec_uint4){0x10111213, 0x14151617, 0x18191A1B, 0x1C1D1E1F});
  shmask.ll[(si_to_uint((qword)(vec_add((vec_uint4)(a), (vec_uint4)(b)))) >> 3) & 0x1] = 0x0001020304050607ULL;
  return ((qword)(shmask.v));
}

static __inline qword
si_chx(qword a, qword b)
{
  union { vec_uint4 v; unsigned short s[8]; } shmask;

  shmask.v = ((vec_uint4){0x10111213, 0x14151617, 0x18191A1B, 0x1C1D1E1F});
  shmask.s[(si_to_uint((qword)(vec_add((vec_uint4)(a), (vec_uint4)(b)))) >> 1) & 0x7] = 0x0203;
  return ((qword)(shmask.v));
}

static __inline qword
si_cwx(qword a, qword b)
{
  union { vec_uint4 v; unsigned int i[4]; } shmask;

  shmask.v = ((vec_uint4){0x10111213, 0x14151617, 0x18191A1B, 0x1C1D1E1F});
  shmask.i[(si_to_uint((qword)(vec_add((vec_uint4)(a), (vec_uint4)(b)))) >> 2) & 0x3] = 0x00010203;
  return ((qword)(shmask.v));
}
/* Constant Formation
 */
static __inline qword
si_il(signed short imm)
{
  return ((qword)(vec_splat((vec_int4)(si_from_int((signed int)(imm))), 0)));
}

static __inline qword
si_ila(unsigned int imm)
{
  return ((qword)(vec_splat((vec_uint4)(si_from_uint(imm)), 0)));
}

static __inline qword
si_ilh(signed short imm)
{
  return ((qword)(vec_splat((vec_short8)(si_from_short(imm)), 1)));
}

static __inline qword
si_ilhu(signed short imm)
{
  return ((qword)(vec_splat((vec_uint4)(si_from_uint((unsigned int)(imm) << 16)), 0)));
}

static __inline qword
si_iohl(qword a, unsigned short imm)
{
  return ((qword)(vec_or((vec_uint4)(a), vec_splat((vec_uint4)(si_from_uint((unsigned int)(imm))), 0))));
}
/* No Operation
 */
#define si_lnop()		/* do nothing */
#define si_nop()		/* do nothing */
/* Memory Load and Store
 */
static __inline qword
si_lqa(unsigned int imm)
{
  return ((qword)(vec_ld(0, (vector unsigned char *)(imm))));
}

static __inline qword
si_lqd(qword a, unsigned int imm)
{
  return ((qword)(vec_ld(si_to_uint(a) & ~0xF, (vector unsigned char *)(imm))));
}

static __inline qword
si_lqr(unsigned int imm)
{
  return ((qword)(vec_ld(0, (vector unsigned char *)(imm))));
}

static __inline qword
si_lqx(qword a, qword b)
{
  return ((qword)(vec_ld(si_to_uint((qword)(vec_add((vec_uint4)(a), (vec_uint4)(b)))), (vector unsigned char *)(0))));
}

static __inline void
si_stqa(qword a, unsigned int imm)
{
  vec_st((vec_uchar16)(a), 0, (vector unsigned char *)(imm));
}

static __inline void
si_stqd(qword a, qword b, unsigned int imm)
{
  vec_st((vec_uchar16)(a), si_to_uint(b) & ~0xF, (vector unsigned char *)(imm));
}

static __inline void
si_stqr(qword a, unsigned int imm)
{
  vec_st((vec_uchar16)(a), 0, (vector unsigned char *)(imm));
}

static __inline void
si_stqx(qword a, qword b, qword c)
{
  vec_st((vec_uchar16)(a),
         si_to_uint((qword)(vec_add((vec_uint4)(b), (vec_uint4)(c)))),
         (vector unsigned char *)(0));
}
#endif /* !__SPU__ */
#endif /* !_SI2VMX_H_ */