[13/77] Make floatn_mode return an opt_scalar_float_mode
[official-gcc.git] / gcc / config / powerpcspe / si2vmx.h
blob6b8cca783c74998b5403ef169783e77d422a206a
1 /* Cell BEA specific SPU intrinsics to PPU/VMX intrinsics
2 Copyright (C) 2007-2017 Free Software Foundation, Inc.
4 This file is free software; you can redistribute it and/or modify it under
5 the terms of the GNU General Public License as published by the Free
6 Software Foundation; either version 3 of the License, or (at your option)
7 any later version.
9 This file is distributed in the hope that it will be useful, but WITHOUT
10 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
12 for more details.
14 Under Section 7 of GPL version 3, you are granted additional
15 permissions described in the GCC Runtime Library Exception, version
16 3.1, as published by the Free Software Foundation.
18 You should have received a copy of the GNU General Public License and
19 a copy of the GCC Runtime Library Exception along with this program;
20 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
21 <http://www.gnu.org/licenses/>. */
23 #ifndef _SI2VMX_H_
24 #define _SI2VMX_H_ 1
26 #ifndef __SPU__
28 #include <stdlib.h>
29 #include <vec_types.h>
/* Specify a default halt action for the spu_hcmpeq and spu_hcmpgt
   intrinsics.  Users can override the action by defining it prior to
   including this header file.  */
/* Fallback halt action: abort() unless the includer already chose one.  */
#if !defined (SPU_HALT_ACTION)
# define SPU_HALT_ACTION abort()
#endif
/* Specify a default stop action for the spu_stop intrinsic.  Users can
   override the action by defining it prior to including this header
   file.  */
/* Fallback stop action: abort() unless the includer already chose one.  */
#if !defined (SPU_STOP_ACTION)
# define SPU_STOP_ACTION abort()
#endif
/* Specify a default action for unsupported intrinsics.  Users can
   override the action by defining it prior to including this header
   file.  */
/* Fallback action for unsupported intrinsics: abort() unless the includer
   already chose one.  */
#if !defined (SPU_UNSUPPORTED_ACTION)
# define SPU_UNSUPPORTED_ACTION abort()
#endif
/* Casting intrinsics - from scalar to quadword.  */
61 static __inline qword si_from_uchar(unsigned char c) {
62 union {
63 qword q;
64 unsigned char c[16];
65 } x;
66 x.c[3] = c;
67 return (x.q);
70 static __inline qword si_from_char(signed char c) {
71 union {
72 qword q;
73 signed char c[16];
74 } x;
75 x.c[3] = c;
76 return (x.q);
79 static __inline qword si_from_ushort(unsigned short s) {
80 union {
81 qword q;
82 unsigned short s[8];
83 } x;
84 x.s[1] = s;
85 return (x.q);
88 static __inline qword si_from_short(short s) {
89 union {
90 qword q;
91 short s[8];
92 } x;
93 x.s[1] = s;
94 return (x.q);
98 static __inline qword si_from_uint(unsigned int i) {
99 union {
100 qword q;
101 unsigned int i[4];
102 } x;
103 x.i[0] = i;
104 return (x.q);
107 static __inline qword si_from_int(int i) {
108 union {
109 qword q;
110 int i[4];
111 } x;
112 x.i[0] = i;
113 return (x.q);
116 static __inline qword si_from_ullong(unsigned long long l) {
117 union {
118 qword q;
119 unsigned long long l[2];
120 } x;
121 x.l[0] = l;
122 return (x.q);
125 static __inline qword si_from_llong(long long l) {
126 union {
127 qword q;
128 long long l[2];
129 } x;
130 x.l[0] = l;
131 return (x.q);
134 static __inline qword si_from_float(float f) {
135 union {
136 qword q;
137 float f[4];
138 } x;
139 x.f[0] = f;
140 return (x.q);
143 static __inline qword si_from_double(double d) {
144 union {
145 qword q;
146 double d[2];
147 } x;
148 x.d[0] = d;
149 return (x.q);
152 static __inline qword si_from_ptr(void *ptr) {
153 union {
154 qword q;
155 void *p;
156 } x;
157 x.p = ptr;
158 return (x.q);
/* Casting intrinsics - from quadword to scalar.  */
164 static __inline unsigned char si_to_uchar(qword q) {
165 union {
166 qword q;
167 unsigned char c[16];
168 } x;
169 x.q = q;
170 return (x.c[3]);
173 static __inline signed char si_to_char(qword q) {
174 union {
175 qword q;
176 signed char c[16];
177 } x;
178 x.q = q;
179 return (x.c[3]);
182 static __inline unsigned short si_to_ushort(qword q) {
183 union {
184 qword q;
185 unsigned short s[8];
186 } x;
187 x.q = q;
188 return (x.s[1]);
191 static __inline short si_to_short(qword q) {
192 union {
193 qword q;
194 short s[8];
195 } x;
196 x.q = q;
197 return (x.s[1]);
200 static __inline unsigned int si_to_uint(qword q) {
201 union {
202 qword q;
203 unsigned int i[4];
204 } x;
205 x.q = q;
206 return (x.i[0]);
209 static __inline int si_to_int(qword q) {
210 union {
211 qword q;
212 int i[4];
213 } x;
214 x.q = q;
215 return (x.i[0]);
218 static __inline unsigned long long si_to_ullong(qword q) {
219 union {
220 qword q;
221 unsigned long long l[2];
222 } x;
223 x.q = q;
224 return (x.l[0]);
227 static __inline long long si_to_llong(qword q) {
228 union {
229 qword q;
230 long long l[2];
231 } x;
232 x.q = q;
233 return (x.l[0]);
236 static __inline float si_to_float(qword q) {
237 union {
238 qword q;
239 float f[4];
240 } x;
241 x.q = q;
242 return (x.f[0]);
245 static __inline double si_to_double(qword q) {
246 union {
247 qword q;
248 double d[2];
249 } x;
250 x.q = q;
251 return (x.d[0]);
254 static __inline void * si_to_ptr(qword q) {
255 union {
256 qword q;
257 void *p;
258 } x;
259 x.q = q;
260 return (x.p);
/* Absolute difference.  */
266 static __inline qword si_absdb(qword a, qword b)
268 vec_uchar16 ac, bc, dc;
270 ac = (vec_uchar16)(a);
271 bc = (vec_uchar16)(b);
272 dc = vec_sel(vec_sub(bc, ac), vec_sub(ac, bc), vec_cmpgt(ac, bc));
274 return ((qword)(dc));
/* Add intrinsics.  */
/* Element-wise add on four unsigned words.  */
#define si_a(_a, _b) \
  ((qword)(vec_add((vec_uint4)(_a), (vec_uint4)(_b))))

/* Element-wise add on eight unsigned halfwords.  */
#define si_ah(_a, _b) \
  ((qword)(vec_add((vec_ushort8)(_a), (vec_ushort8)(_b))))
283 static __inline qword si_ai(qword a, int b)
285 return ((qword)(vec_add((vec_int4)(a),
286 vec_splat((vec_int4)(si_from_int(b)), 0))));
290 static __inline qword si_ahi(qword a, short b)
292 return ((qword)(vec_add((vec_short8)(a),
293 vec_splat((vec_short8)(si_from_short(b)), 1))));
/* Element-wise add on four single-precision floats.  */
#define si_fa(_a, _b) \
  ((qword)(vec_add((vec_float4)(_a), (vec_float4)(_b))))
300 static __inline qword si_dfa(qword a, qword b)
302 union {
303 vec_double2 v;
304 double d[2];
305 } ad, bd, dd;
307 ad.v = (vec_double2)(a);
308 bd.v = (vec_double2)(b);
309 dd.d[0] = ad.d[0] + bd.d[0];
310 dd.d[1] = ad.d[1] + bd.d[1];
312 return ((qword)(dd.v));
/* Add word extended.  */
/* Word-wise _a + _b plus the least significant bit of each word of _c.  */
#define si_addx(_a, _b, _c) \
  ((qword)(vec_add(vec_add((vec_uint4)(_a), (vec_uint4)(_b)), \
		   vec_and((vec_uint4)(_c), vec_splat_u32(1)))))
/* Bit-wise AND.  */
/* Bit-wise AND of two quadwords.  */
#define si_and(_a, _b) \
  ((qword)(vec_and((vec_uint4)(_a), (vec_uint4)(_b))))
326 static __inline qword si_andbi(qword a, signed char b)
328 return ((qword)(vec_and((vec_char16)(a),
329 vec_splat((vec_char16)(si_from_char(b)), 3))));
332 static __inline qword si_andhi(qword a, signed short b)
334 return ((qword)(vec_and((vec_short8)(a),
335 vec_splat((vec_short8)(si_from_short(b)), 1))));
339 static __inline qword si_andi(qword a, signed int b)
341 return ((qword)(vec_and((vec_int4)(a),
342 vec_splat((vec_int4)(si_from_int(b)), 0))));
/* Bit-wise AND with complement.  */
/* Maps onto vec_andc applied to the operands viewed as unsigned bytes.  */
#define si_andc(_a, _b) \
  ((qword)(vec_andc((vec_uchar16)(_a), (vec_uchar16)(_b))))
/* Average byte vectors.  */
/* Maps onto vec_avg applied to the operands viewed as unsigned bytes.  */
#define si_avgb(_a, _b) \
  ((qword)(vec_avg((vec_uchar16)(_a), (vec_uchar16)(_b))))
/* Branch indirect and set link on external data.  */
/* These three have no VMX mapping and expand to nothing.  */
#define si_bisled(_func)	/* not mappable */
#define si_bisledd(_func)	/* not mappable */
#define si_bislede(_func)	/* not mappable */
/* Borrow generate.  */
/* Maps onto vec_subc with the operands swapped: vec_subc (_b, _a).  */
#define si_bg(_a, _b) \
  ((qword)(vec_subc((vec_uint4)(_b), (vec_uint4)(_a))))
/* Per word: least significant bit of (_b > _a) | ((_b == _a) & _c),
   using unsigned comparisons.  */
#define si_bgx(_a, _b, _c) \
  ((qword)(vec_and(vec_or(vec_cmpgt((vec_uint4)(_b), (vec_uint4)(_a)), \
			  vec_and(vec_cmpeq((vec_uint4)(_b), (vec_uint4)(_a)), \
				  (vec_uint4)(_c))), \
		   vec_splat_u32(1))))
/* Compare absolute equal.  */
373 static __inline qword si_fcmeq(qword a, qword b)
375 vec_float4 msb = (vec_float4)((vec_uint4){0x80000000, 0x80000000, 0x80000000, 0x80000000});
377 return ((qword)(vec_cmpeq(vec_andc((vec_float4)(a), msb),
378 vec_andc((vec_float4)(b), msb))));
381 static __inline qword si_dfcmeq(qword a, qword b)
383 vec_uint4 sign_mask= (vec_uint4) { 0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF };
384 vec_uint4 nan_mask = (vec_uint4) { 0x7FF00000, 0x00000000, 0x7FF00000, 0x00000000 };
385 vec_uchar16 hihi_promote = (vec_uchar16) { 0,1,2,3, 16,17,18,19, 8,9,10,11, 24,25,26,27};
387 vec_uint4 biteq;
388 vec_uint4 aabs;
389 vec_uint4 babs;
390 vec_uint4 a_gt;
391 vec_uint4 ahi_inf;
392 vec_uint4 anan;
393 vec_uint4 result;
395 union {
396 vec_uchar16 v;
397 int i[4];
398 } x;
400 /* Shift 4 bytes */
401 x.i[3] = 4 << 3;
403 /* Mask out sign bits */
404 aabs = vec_and((vec_uint4)a,sign_mask);
405 babs = vec_and((vec_uint4)b,sign_mask);
407 /* A) Check for bit equality, store in high word */
408 biteq = (vec_uint4) vec_cmpeq((vec_uint4)aabs,(vec_uint4)babs);
409 biteq = vec_and(biteq,(vec_uint4)vec_slo((vec_uchar16)biteq,x.v));
412 B) Check if a is NaN, store in high word
414 B1) If the high word is greater than max_exp (indicates a NaN)
415 B2) If the low word is greater than 0
417 a_gt = (vec_uint4)vec_cmpgt(aabs,nan_mask);
419 /* B3) Check if the high word is equal to the inf exponent */
420 ahi_inf = (vec_uint4)vec_cmpeq(aabs,nan_mask);
422 /* anan = B1[hi] or (B2[lo] and B3[hi]) */
423 anan = (vec_uint4)vec_or(a_gt,vec_and((vec_uint4)vec_slo((vec_uchar16)a_gt,x.v),ahi_inf));
425 /* result = A and not B */
426 result = vec_andc(biteq, anan);
428 /* Promote high words to 64 bits and return */
429 return ((qword)(vec_perm((vec_uchar16)result, (vec_uchar16)result, hihi_promote)));
/* Compare absolute greater than.  */
435 static __inline qword si_fcmgt(qword a, qword b)
437 vec_float4 msb = (vec_float4)((vec_uint4){0x80000000, 0x80000000, 0x80000000, 0x80000000});
439 return ((qword)(vec_cmpgt(vec_andc((vec_float4)(a), msb),
440 vec_andc((vec_float4)(b), msb))));
443 static __inline qword si_dfcmgt(qword a, qword b)
445 vec_uchar16 splat_hi = (vec_uchar16) { 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 };
446 vec_uint4 nan_mask = (vec_uint4) { 0x7FF00000, 0x0, 0x7FF00000, 0x0 };
447 vec_uint4 sign_mask = (vec_uint4) { 0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF };
449 union {
450 vec_uchar16 v;
451 int i[4];
452 } x;
454 /* Shift 4 bytes */
455 x.i[3] = 4 << 3;
457 // absolute value of a,b
458 vec_uint4 aabs = vec_and((vec_uint4)a, sign_mask);
459 vec_uint4 babs = vec_and((vec_uint4)b, sign_mask);
461 // check if a is nan
462 vec_uint4 a_inf = (vec_uint4)vec_cmpeq(aabs, nan_mask);
463 vec_uint4 a_nan = (vec_uint4)vec_cmpgt(aabs, nan_mask);
464 a_nan = vec_or(a_nan, vec_and((vec_uint4)vec_slo((vec_uchar16)a_nan,x.v),a_inf));
465 a_nan = (vec_uint4)vec_perm((vec_uchar16)a_nan, (vec_uchar16)a_nan, splat_hi);
467 // check if b is nan
468 vec_uint4 b_inf = (vec_uint4)vec_cmpeq(babs, nan_mask);
469 vec_uint4 b_nan = (vec_uint4)vec_cmpgt(babs, nan_mask);
470 b_nan = vec_or(b_nan, vec_and((vec_uint4)vec_slo((vec_uchar16)b_nan,x.v),b_inf));
471 b_nan = (vec_uint4)vec_perm((vec_uchar16)b_nan, (vec_uchar16)b_nan, splat_hi);
473 // A) Check if the exponents are different
474 vec_uint4 gt_hi = (vec_uint4)vec_cmpgt(aabs,babs);
476 // B) Check if high word equal, and low word greater
477 vec_uint4 gt_lo = (vec_uint4)vec_cmpgt((vec_uint4)aabs, (vec_uint4)babs);
478 vec_uint4 eq = (vec_uint4)vec_cmpeq(aabs, babs);
479 vec_uint4 eqgt = vec_and(eq,vec_slo(gt_lo,x.v));
481 // If either A or B is true, return true (unless NaNs detected)
482 vec_uint4 r = vec_or(gt_hi, eqgt);
484 // splat the high words of the comparison step
485 r = (vec_uint4)vec_perm((vec_uchar16)r,(vec_uchar16)r,splat_hi);
487 // correct for NaNs in input
488 return ((qword)vec_andc(r,vec_or(a_nan,b_nan)));
/* Compare equal.  */
494 static __inline qword si_ceqb(qword a, qword b)
496 return ((qword)(vec_cmpeq((vec_uchar16)(a), (vec_uchar16)(b))));
499 static __inline qword si_ceqh(qword a, qword b)
501 return ((qword)(vec_cmpeq((vec_ushort8)(a), (vec_ushort8)(b))));
504 static __inline qword si_ceq(qword a, qword b)
506 return ((qword)(vec_cmpeq((vec_uint4)(a), (vec_uint4)(b))));
509 static __inline qword si_fceq(qword a, qword b)
511 return ((qword)(vec_cmpeq((vec_float4)(a), (vec_float4)(b))));
514 static __inline qword si_ceqbi(qword a, signed char b)
516 return ((qword)(vec_cmpeq((vec_char16)(a),
517 vec_splat((vec_char16)(si_from_char(b)), 3))));
520 static __inline qword si_ceqhi(qword a, signed short b)
522 return ((qword)(vec_cmpeq((vec_short8)(a),
523 vec_splat((vec_short8)(si_from_short(b)), 1))));
526 static __inline qword si_ceqi(qword a, signed int b)
528 return ((qword)(vec_cmpeq((vec_int4)(a),
529 vec_splat((vec_int4)(si_from_int(b)), 0))));
532 static __inline qword si_dfceq(qword a, qword b)
534 vec_uint4 sign_mask= (vec_uint4) { 0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF };
535 vec_uint4 nan_mask = (vec_uint4) { 0x7FF00000, 0x00000000, 0x7FF00000, 0x00000000 };
536 vec_uchar16 hihi_promote = (vec_uchar16) { 0,1,2,3, 16,17,18,19, 8,9,10,11, 24,25,26,27};
538 vec_uint4 biteq;
539 vec_uint4 aabs;
540 vec_uint4 babs;
541 vec_uint4 a_gt;
542 vec_uint4 ahi_inf;
543 vec_uint4 anan;
544 vec_uint4 iszero;
545 vec_uint4 result;
547 union {
548 vec_uchar16 v;
549 int i[4];
550 } x;
552 /* Shift 4 bytes */
553 x.i[3] = 4 << 3;
555 /* A) Check for bit equality, store in high word */
556 biteq = (vec_uint4) vec_cmpeq((vec_uint4)a,(vec_uint4)b);
557 biteq = vec_and(biteq,(vec_uint4)vec_slo((vec_uchar16)biteq,x.v));
559 /* Mask out sign bits */
560 aabs = vec_and((vec_uint4)a,sign_mask);
561 babs = vec_and((vec_uint4)b,sign_mask);
564 B) Check if a is NaN, store in high word
566 B1) If the high word is greater than max_exp (indicates a NaN)
567 B2) If the low word is greater than 0
569 a_gt = (vec_uint4)vec_cmpgt(aabs,nan_mask);
571 /* B3) Check if the high word is equal to the inf exponent */
572 ahi_inf = (vec_uint4)vec_cmpeq(aabs,nan_mask);
574 /* anan = B1[hi] or (B2[lo] and B3[hi]) */
575 anan = (vec_uint4)vec_or(a_gt,vec_and((vec_uint4)vec_slo((vec_uchar16)a_gt,x.v),ahi_inf));
577 /* C) Check for 0 = -0 special case */
578 iszero =(vec_uint4)vec_cmpeq((vec_uint4)vec_or(aabs,babs),(vec_uint4)vec_splat_u32(0));
579 iszero = vec_and(iszero,(vec_uint4)vec_slo((vec_uchar16)iszero,x.v));
581 /* result = (A or C) and not B */
582 result = vec_or(biteq,iszero);
583 result = vec_andc(result, anan);
585 /* Promote high words to 64 bits and return */
586 return ((qword)(vec_perm((vec_uchar16)result, (vec_uchar16)result, hihi_promote)));
/* Compare greater than.  */
592 static __inline qword si_cgtb(qword a, qword b)
594 return ((qword)(vec_cmpgt((vec_char16)(a), (vec_char16)(b))));
597 static __inline qword si_cgth(qword a, qword b)
599 return ((qword)(vec_cmpgt((vec_short8)(a), (vec_short8)(b))));
602 static __inline qword si_cgt(qword a, qword b)
604 return ((qword)(vec_cmpgt((vec_int4)(a), (vec_int4)(b))));
607 static __inline qword si_clgtb(qword a, qword b)
609 return ((qword)(vec_cmpgt((vec_uchar16)(a), (vec_uchar16)(b))));
612 static __inline qword si_clgth(qword a, qword b)
614 return ((qword)(vec_cmpgt((vec_ushort8)(a), (vec_ushort8)(b))));
617 static __inline qword si_clgt(qword a, qword b)
619 return ((qword)(vec_cmpgt((vec_uint4)(a), (vec_uint4)(b))));
622 static __inline qword si_fcgt(qword a, qword b)
624 return ((qword)(vec_cmpgt((vec_float4)(a), (vec_float4)(b))));
627 static __inline qword si_dfcgt(qword a, qword b)
629 vec_uchar16 splat_hi = (vec_uchar16) { 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 };
630 vec_uchar16 borrow_shuffle = (vec_uchar16) { 4,5,6,7, 192,192,192,192, 12,13,14,15, 192,192,192,192 };
631 vec_uint4 nan_mask = (vec_uint4) { 0x7FF00000, 0x0, 0x7FF00000, 0x0 };
632 vec_uint4 sign_mask = (vec_uint4) { 0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF };
634 union {
635 vec_uchar16 v;
636 int i[4];
637 } x;
639 /* Shift 4 bytes */
640 x.i[3] = 4 << 3;
642 // absolute value of a,b
643 vec_uint4 aabs = vec_and((vec_uint4)a, sign_mask);
644 vec_uint4 babs = vec_and((vec_uint4)b, sign_mask);
646 // check if a is nan
647 vec_uint4 a_inf = (vec_uint4)vec_cmpeq(aabs, nan_mask);
648 vec_uint4 a_nan = (vec_uint4)vec_cmpgt(aabs, nan_mask);
649 a_nan = vec_or(a_nan, vec_and((vec_uint4)vec_slo((vec_uchar16)a_nan,x.v),a_inf));
650 a_nan = (vec_uint4)vec_perm((vec_uchar16)a_nan, (vec_uchar16)a_nan, splat_hi);
652 // check if b is nan
653 vec_uint4 b_inf = (vec_uint4)vec_cmpeq(babs, nan_mask);
654 vec_uint4 b_nan = (vec_uint4)vec_cmpgt(babs, nan_mask);
655 b_nan = vec_or(b_nan, vec_and((vec_uint4)vec_slo((vec_uchar16)b_nan,x.v),b_inf));
656 b_nan = (vec_uint4)vec_perm((vec_uchar16)b_nan, (vec_uchar16)b_nan, splat_hi);
658 // sign of a
659 vec_uint4 asel = (vec_uint4)vec_sra((vec_int4)(a), (vec_uint4)vec_splat(((vec_uint4)si_from_int(31)), 0));
660 asel = (vec_uint4)vec_perm((vec_uchar16)asel,(vec_uchar16)asel,splat_hi);
662 // sign of b
663 vec_uint4 bsel = (vec_uint4)vec_sra((vec_int4)(b), (vec_uint4)vec_splat(((vec_uint4)si_from_int(31)), 0));
664 bsel = (vec_uint4)vec_perm((vec_uchar16)bsel,(vec_uchar16)bsel,splat_hi);
666 // negative a
667 vec_uint4 abor = vec_subc((vec_uint4)vec_splat_u32(0), aabs);
668 vec_uchar16 pat = vec_sel(((vec_uchar16){0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15}), vec_sr(borrow_shuffle, vec_splat_u8(3)), vec_sra(borrow_shuffle, vec_splat_u8(7)));
669 abor = (vec_uint4)(vec_perm(vec_perm((vec_uchar16)abor, (vec_uchar16)abor, borrow_shuffle),((vec_uchar16){0, 0, 0, 0, 0, 0, 0, 0, 0xFF, 0xFF, 0xFF, 0xFF, 0x80, 0x80, 0x80, 0x80}),pat));
670 vec_uint4 aneg = vec_add(vec_add(vec_splat_u32(0), vec_nor(aabs, aabs)), vec_and(abor, vec_splat_u32(1)));
672 // pick the one we want
673 vec_int4 aval = (vec_int4)vec_sel((vec_uchar16)aabs, (vec_uchar16)aneg, (vec_uchar16)asel);
675 // negative b
676 vec_uint4 bbor = vec_subc((vec_uint4)vec_splat_u32(0), babs);
677 bbor = (vec_uint4)(vec_perm(vec_perm((vec_uchar16)bbor, (vec_uchar16)bbor, borrow_shuffle),((vec_uchar16){0, 0, 0, 0, 0, 0, 0, 0, 0xFF, 0xFF, 0xFF, 0xFF, 0x80, 0x80, 0x80, 0x80}),pat));
678 vec_uint4 bneg = vec_add(vec_nor(babs, babs), vec_and(bbor, vec_splat_u32(1)));
680 // pick the one we want
681 vec_int4 bval=(vec_int4)vec_sel((vec_uchar16)babs, (vec_uchar16)bneg, (vec_uchar16)bsel);
683 // A) Check if the exponents are different
684 vec_uint4 gt_hi = (vec_uint4)vec_cmpgt(aval,bval);
686 // B) Check if high word equal, and low word greater
687 vec_uint4 gt_lo = (vec_uint4)vec_cmpgt((vec_uint4)aval, (vec_uint4)bval);
688 vec_uint4 eq = (vec_uint4)vec_cmpeq(aval, bval);
689 vec_uint4 eqgt = vec_and(eq,vec_slo(gt_lo,x.v));
691 // If either A or B is true, return true (unless NaNs detected)
692 vec_uint4 r = vec_or(gt_hi, eqgt);
694 // splat the high words of the comparison step
695 r = (vec_uint4)vec_perm((vec_uchar16)r,(vec_uchar16)r,splat_hi);
697 // correct for NaNs in input
698 return ((qword)vec_andc(r,vec_or(a_nan,b_nan)));
701 static __inline qword si_cgtbi(qword a, signed char b)
703 return ((qword)(vec_cmpgt((vec_char16)(a),
704 vec_splat((vec_char16)(si_from_char(b)), 3))));
707 static __inline qword si_cgthi(qword a, signed short b)
709 return ((qword)(vec_cmpgt((vec_short8)(a),
710 vec_splat((vec_short8)(si_from_short(b)), 1))));
713 static __inline qword si_cgti(qword a, signed int b)
715 return ((qword)(vec_cmpgt((vec_int4)(a),
716 vec_splat((vec_int4)(si_from_int(b)), 0))));
719 static __inline qword si_clgtbi(qword a, unsigned char b)
721 return ((qword)(vec_cmpgt((vec_uchar16)(a),
722 vec_splat((vec_uchar16)(si_from_uchar(b)), 3))));
725 static __inline qword si_clgthi(qword a, unsigned short b)
727 return ((qword)(vec_cmpgt((vec_ushort8)(a),
728 vec_splat((vec_ushort8)(si_from_ushort(b)), 1))));
731 static __inline qword si_clgti(qword a, unsigned int b)
733 return ((qword)(vec_cmpgt((vec_uint4)(a),
734 vec_splat((vec_uint4)(si_from_uint(b)), 0))));
737 static __inline qword si_dftsv(qword a, char b)
739 vec_uchar16 splat_hi = (vec_uchar16) { 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 };
740 vec_uint4 sign_mask = (vec_uint4) { 0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF };
741 vec_uint4 result = (vec_uint4){0};
742 vec_uint4 sign = (vec_uint4)vec_sra((vec_int4)(a), (vec_uint4)vec_splat(((vec_uint4)si_from_int(31)), 0));
743 sign = (vec_uint4)vec_perm((vec_uchar16)sign,(vec_uchar16)sign,splat_hi);
744 vec_uint4 aabs = vec_and((vec_uint4)a,sign_mask);
746 union {
747 vec_uchar16 v;
748 int i[4];
749 } x;
751 /* Shift 4 bytes */
752 x.i[3] = 4 << 3;
754 /* Nan or +inf or -inf */
755 if (b & 0x70)
757 vec_uint4 nan_mask = (vec_uint4) { 0x7FF00000, 0x0, 0x7FF00000, 0x0 };
758 vec_uint4 a_inf = (vec_uint4)vec_cmpeq(aabs, nan_mask);
759 /* NaN */
760 if (b & 0x40)
762 vec_uint4 a_nan = (vec_uint4)vec_cmpgt(aabs, nan_mask);
763 a_nan = vec_or(a_nan, vec_and((vec_uint4)vec_slo((vec_uchar16)a_nan,x.v),a_inf));
764 a_nan = (vec_uint4)vec_perm((vec_uchar16)a_nan, (vec_uchar16)a_nan, splat_hi);
765 result = vec_or(result, a_nan);
767 /* inf */
768 if (b & 0x30)
770 a_inf = vec_and((vec_uint4)vec_slo((vec_uchar16)a_inf,x.v), a_inf);
771 a_inf = (vec_uint4)vec_perm((vec_uchar16)a_inf, (vec_uchar16)a_inf, splat_hi);
772 /* +inf */
773 if (b & 0x20)
774 result = vec_or(vec_andc(a_inf, sign), result);
775 /* -inf */
776 if (b & 0x10)
777 result = vec_or(vec_and(a_inf, sign), result);
780 /* 0 or denorm */
781 if (b & 0xF)
783 vec_uint4 iszero =(vec_uint4)vec_cmpeq(aabs,(vec_uint4)vec_splat_u32(0));
784 iszero = vec_and(iszero,(vec_uint4)vec_slo((vec_uchar16)iszero,x.v));
785 /* denorm */
786 if (b & 0x3)
788 vec_uint4 denorm_mask = (vec_uint4){0xFFFFF, 0xFFFFF, 0xFFFFF, 0xFFFFF};
789 vec_uint4 isdenorm = vec_nor((vec_uint4)vec_cmpgt(aabs, denorm_mask), iszero);
790 isdenorm = (vec_uint4)vec_perm((vec_uchar16)isdenorm, (vec_uchar16)isdenorm, splat_hi);
791 /* +denorm */
792 if (b & 0x2)
793 result = vec_or(vec_andc(isdenorm, sign), result);
794 /* -denorm */
795 if (b & 0x1)
796 result = vec_or(vec_and(isdenorm, sign), result);
798 /* 0 */
799 if (b & 0xC)
801 iszero = (vec_uint4)vec_perm((vec_uchar16)iszero, (vec_uchar16)iszero, splat_hi);
802 /* +0 */
803 if (b & 0x8)
804 result = vec_or(vec_andc(iszero, sign), result);
805 /* -0 */
806 if (b & 0x4)
807 result = vec_or(vec_and(iszero, sign), result);
810 return ((qword)result);
/* Carry generate.  */
/* Maps onto vec_addc applied to the operands viewed as unsigned words.  */
#define si_cg(_a, _b) \
  ((qword)(vec_addc((vec_uint4)(_a), (vec_uint4)(_b))))
/* Per word: carry out of _a + _b, OR'ed with the carry out of adding the
   least significant bit of _c to that sum.  */
#define si_cgx(_a, _b, _c) \
  ((qword)(vec_or(vec_addc((vec_uint4)(_a), (vec_uint4)(_b)), \
		  vec_addc(vec_add((vec_uint4)(_a), (vec_uint4)(_b)), \
			   vec_and((vec_uint4)(_c), vec_splat_u32(1))))))
/* Count ones for bytes.  */
825 static __inline qword si_cntb(qword a)
827 vec_uchar16 nib_cnt = (vec_uchar16){0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4};
828 vec_uchar16 four = { 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 };
829 vec_uchar16 av;
831 av = (vec_uchar16)(a);
833 return ((qword)(vec_add(vec_perm(nib_cnt, nib_cnt, av),
834 vec_perm(nib_cnt, nib_cnt, vec_sr (av, four)))));
/* Count leading zeros.  */
839 static __inline qword si_clz(qword a)
841 vec_uchar16 av;
842 vec_uchar16 cnt_hi, cnt_lo, cnt, tmp1, tmp2, tmp3;
843 vec_uchar16 four = vec_splat_u8(4);
844 vec_uchar16 nib_cnt = (vec_uchar16){4, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0};
845 vec_uchar16 eight = vec_splat_u8(8);
846 vec_uchar16 sixteen = (vec_uchar16){16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16};
847 vec_uchar16 twentyfour = (vec_uchar16){24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24};
849 av = (vec_uchar16)(a);
851 cnt_hi = vec_perm(nib_cnt, nib_cnt, vec_sr(av, four));
852 cnt_lo = vec_perm(nib_cnt, nib_cnt, av);
854 cnt = vec_add(cnt_hi, vec_and(cnt_lo, vec_cmpeq(cnt_hi, four)));
856 tmp1 = (vec_uchar16)vec_sl((vec_uint4)(cnt), (vec_uint4)(eight));
857 tmp2 = (vec_uchar16)vec_sl((vec_uint4)(cnt), (vec_uint4)(sixteen));
858 tmp3 = (vec_uchar16)vec_sl((vec_uint4)(cnt), (vec_uint4)(twentyfour));
860 cnt = vec_add(cnt, vec_and(tmp1, vec_cmpeq(cnt, eight)));
861 cnt = vec_add(cnt, vec_and(tmp2, vec_cmpeq(cnt, sixteen)));
862 cnt = vec_add(cnt, vec_and(tmp3, vec_cmpeq(cnt, twentyfour)));
864 return (qword)((vec_sr((vec_uint4)(cnt), (vec_uint4)(twentyfour))));
/* Convert to float.  */
/* Unsigned words to float via vec_ctf; _b is passed through as the
   second vec_ctf argument.  */
#define si_cuflt(_a, _b) \
  ((qword)(vec_ctf((vec_uint4)(_a), _b)))

/* Signed words to float via vec_ctf; _b is passed through as the second
   vec_ctf argument.  */
#define si_csflt(_a, _b) \
  ((qword)(vec_ctf((vec_int4)(_a), _b)))
/* Convert to signed int.  */
/* Floats to signed words via vec_cts; _b is passed through as the
   second vec_cts argument.  */
#define si_cflts(_a, _b) \
  ((qword)(vec_cts((vec_float4)(_a), _b)))
/* Convert to unsigned int.  */
/* Floats to unsigned words via vec_ctu; _b is passed through as the
   second vec_ctu argument.  */
#define si_cfltu(_a, _b) \
  ((qword)(vec_ctu((vec_float4)(_a), _b)))
/* Synchronize.  */
/* The synchronization intrinsics expand to nothing here.  */
#define si_dsync()	/* do nothing */
#define si_sync()	/* do nothing */
#define si_syncc()	/* do nothing */
/* Equivalence.  */
889 static __inline qword si_eqv(qword a, qword b)
891 vec_uchar16 d;
893 d = vec_xor((vec_uchar16)(a), (vec_uchar16)(b));
894 return ((qword)(vec_nor(d, d)));
/* Extend.  */
899 static __inline qword si_xsbh(qword a)
901 vec_char16 av;
903 av = (vec_char16)(a);
904 return ((qword)(vec_unpackh(vec_perm(av, av, ((vec_uchar16){1, 3, 5, 7, 9,11,13,15,
905 0, 0, 0, 0, 0, 0, 0, 0})))));
908 static __inline qword si_xshw(qword a)
910 vec_short8 av;
912 av = (vec_short8)(a);
913 return ((qword)(vec_unpackh(vec_perm(av, av, ((vec_uchar16){2, 3, 6, 7,
914 10,11,14,15,
915 0, 0, 0, 0,
916 0, 0, 0, 0})))));
919 static __inline qword si_xswd(qword a)
921 vec_int4 av;
923 av = (vec_int4)(a);
924 return ((qword)(vec_perm(av, vec_sra(av, ((vec_uint4){31,31,31,31})),
925 ((vec_uchar16){20, 21, 22, 23,
926 4, 5, 6, 7,
927 28, 29, 30, 31,
928 12, 13, 14, 15}))));
931 static __inline qword si_fesd(qword a)
933 union {
934 double d[2];
935 vec_double2 vd;
936 } out;
937 union {
938 float f[4];
939 vec_float4 vf;
940 } in;
942 in.vf = (vec_float4)(a);
943 out.d[0] = (double)(in.f[0]);
944 out.d[1] = (double)(in.f[2]);
945 return ((qword)(out.vd));
/* Gather.  */
950 static __inline qword si_gbb(qword a)
952 vec_uchar16 bits;
953 vec_uint4 bytes;
955 bits = vec_sl(vec_and((vec_uchar16)(a), vec_splat_u8(1)), ((vec_uchar16){7, 6, 5, 4, 3, 2, 1, 0,
956 7, 6, 5, 4, 3, 2, 1, 0}));
957 bytes = (vec_uint4)vec_sum2s((vec_int4)(vec_sum4s(bits, ((vec_uint4){0}))), ((vec_int4){0}));
959 return ((qword)(vec_perm(bytes, bytes, ((vec_uchar16){0, 0, 7,15, 0, 0, 0, 0,
960 0, 0, 0, 0, 0, 0, 0, 0}))));
964 static __inline qword si_gbh(qword a)
966 vec_ushort8 bits;
967 vec_uint4 bytes;
969 bits = vec_sl(vec_and((vec_ushort8)(a), vec_splat_u16(1)), ((vec_ushort8){7, 6, 5, 4, 3, 2, 1, 0}));
971 bytes = (vec_uint4)vec_sums((vec_int4)(vec_sum4s((vec_short8)(bits), (vec_int4){0})), (vec_int4){0});
973 return ((qword)(vec_sld(bytes, bytes, 12)));
976 static __inline qword si_gb(qword a)
978 vec_uint4 bits;
979 vec_uint4 bytes;
981 bits = vec_sl(vec_and((vec_uint4)(a), vec_splat_u32(1)), ((vec_uint4){3, 2, 1, 0}));
982 bytes = (vec_uint4)vec_sums((vec_int4)(bits), ((vec_int4){0}));
983 return ((qword)(vec_sld(bytes, bytes, 12)));
/* Compare and halt.  */
989 static __inline void si_heq(qword a, qword b)
991 union {
992 vector unsigned int v;
993 unsigned int i[4];
994 } aa, bb;
996 aa.v = (vector unsigned int)(a);
997 bb.v = (vector unsigned int)(b);
999 if (aa.i[0] == bb.i[0]) { SPU_HALT_ACTION; };
1002 static __inline void si_heqi(qword a, unsigned int b)
1004 union {
1005 vector unsigned int v;
1006 unsigned int i[4];
1007 } aa;
1009 aa.v = (vector unsigned int)(a);
1011 if (aa.i[0] == b) { SPU_HALT_ACTION; };
1014 static __inline void si_hgt(qword a, qword b)
1016 union {
1017 vector signed int v;
1018 signed int i[4];
1019 } aa, bb;
1021 aa.v = (vector signed int)(a);
1022 bb.v = (vector signed int)(b);
1024 if (aa.i[0] > bb.i[0]) { SPU_HALT_ACTION; };
1027 static __inline void si_hgti(qword a, signed int b)
1029 union {
1030 vector signed int v;
1031 signed int i[4];
1032 } aa;
1034 aa.v = (vector signed int)(a);
1036 if (aa.i[0] > b) { SPU_HALT_ACTION; };
1039 static __inline void si_hlgt(qword a, qword b)
1041 union {
1042 vector unsigned int v;
1043 unsigned int i[4];
1044 } aa, bb;
1046 aa.v = (vector unsigned int)(a);
1047 bb.v = (vector unsigned int)(b);
1049 if (aa.i[0] > bb.i[0]) { SPU_HALT_ACTION; };
1052 static __inline void si_hlgti(qword a, unsigned int b)
1054 union {
1055 vector unsigned int v;
1056 unsigned int i[4];
1057 } aa;
1059 aa.v = (vector unsigned int)(a);
1061 if (aa.i[0] > b) { SPU_HALT_ACTION; };
/* Multiply and Add.  */
1067 static __inline qword si_mpya(qword a, qword b, qword c)
1069 return ((qword)(vec_msum(vec_and((vec_short8)(a),
1070 ((vec_short8){0, -1, 0, -1, 0, -1, 0, -1})),
1071 (vec_short8)(b), (vec_int4)(c))));
1074 static __inline qword si_fma(qword a, qword b, qword c)
1076 return ((qword)(vec_madd((vec_float4)(a), (vec_float4)(b), (vec_float4)(c))));
1079 static __inline qword si_dfma(qword a, qword b, qword c)
1081 union {
1082 vec_double2 v;
1083 double d[2];
1084 } aa, bb, cc, dd;
1086 aa.v = (vec_double2)(a);
1087 bb.v = (vec_double2)(b);
1088 cc.v = (vec_double2)(c);
1089 dd.d[0] = aa.d[0] * bb.d[0] + cc.d[0];
1090 dd.d[1] = aa.d[1] * bb.d[1] + cc.d[1];
1091 return ((qword)(dd.v));
/* Form Mask.  */
/* Immediate form: wrap the scalar with si_from_int and defer to si_fsmb.  */
#define si_fsmbi(_a) \
  si_fsmb(si_from_int(_a))
1098 static __inline qword si_fsmb(qword a)
1100 vec_char16 mask;
1101 vec_ushort8 in;
1103 in = (vec_ushort8)(a);
1104 mask = (vec_char16)(vec_perm(in, in, ((vec_uchar16){2, 2, 2, 2, 2, 2, 2, 2,
1105 3, 3, 3, 3, 3, 3, 3, 3})));
1106 return ((qword)(vec_sra(vec_sl(mask, ((vec_uchar16){0, 1, 2, 3, 4, 5, 6, 7,
1107 0, 1, 2, 3, 4, 5, 6, 7})),
1108 vec_splat_u8(7))));
1112 static __inline qword si_fsmh(qword a)
1114 vec_uchar16 in;
1115 vec_short8 mask;
1117 in = (vec_uchar16)(a);
1118 mask = (vec_short8)(vec_splat(in, 3));
1119 return ((qword)(vec_sra(vec_sl(mask, ((vec_ushort8){0, 1, 2, 3, 4, 5, 6, 7})),
1120 vec_splat_u16(15))));
1123 static __inline qword si_fsm(qword a)
1125 vec_uchar16 in;
1126 vec_int4 mask;
1128 in = (vec_uchar16)(a);
1129 mask = (vec_int4)(vec_splat(in, 3));
1130 return ((qword)(vec_sra(vec_sl(mask, ((vec_uint4){28, 29, 30, 31})),
1131 ((vec_uint4){31,31,31,31}))));
/* Move from/to registers.  */
/* Register reads yield an all-zero quadword; register writes expand to
   nothing.  */
#define si_fscrrd()		((qword)((vec_uint4){0}))
#define si_fscrwr(_a)

#define si_mfspr(_reg)		((qword)((vec_uint4){0}))
#define si_mtspr(_reg, _a)
/* Multiply High High Add.  */
1144 static __inline qword si_mpyhha(qword a, qword b, qword c)
1146 return ((qword)(vec_add(vec_mule((vec_short8)(a), (vec_short8)(b)), (vec_int4)(c))));
1149 static __inline qword si_mpyhhau(qword a, qword b, qword c)
1151 return ((qword)(vec_add(vec_mule((vec_ushort8)(a), (vec_ushort8)(b)), (vec_uint4)(c))));
/* Multiply Subtract.  */
1156 static __inline qword si_fms(qword a, qword b, qword c)
1158 return ((qword)(vec_madd((vec_float4)(a), (vec_float4)(b),
1159 vec_sub(((vec_float4){0.0f}), (vec_float4)(c)))));
1162 static __inline qword si_dfms(qword a, qword b, qword c)
1164 union {
1165 vec_double2 v;
1166 double d[2];
1167 } aa, bb, cc, dd;
1169 aa.v = (vec_double2)(a);
1170 bb.v = (vec_double2)(b);
1171 cc.v = (vec_double2)(c);
1172 dd.d[0] = aa.d[0] * bb.d[0] - cc.d[0];
1173 dd.d[1] = aa.d[1] * bb.d[1] - cc.d[1];
1174 return ((qword)(dd.v));
/* Multiply.  */
1179 static __inline qword si_fm(qword a, qword b)
1181 return ((qword)(vec_madd((vec_float4)(a), (vec_float4)(b), ((vec_float4){0.0f}))));
1184 static __inline qword si_dfm(qword a, qword b)
1186 union {
1187 vec_double2 v;
1188 double d[2];
1189 } aa, bb, dd;
1191 aa.v = (vec_double2)(a);
1192 bb.v = (vec_double2)(b);
1193 dd.d[0] = aa.d[0] * bb.d[0];
1194 dd.d[1] = aa.d[1] * bb.d[1];
1195 return ((qword)(dd.v));
/* Multiply High.  */
1200 static __inline qword si_mpyh(qword a, qword b)
1202 vec_uint4 sixteen = (vec_uint4){16, 16, 16, 16};
1204 return ((qword)(vec_sl(vec_mule((vec_short8)(a), (vec_short8)(vec_sl((vec_uint4)(b), sixteen))), sixteen)));
/* Multiply High High.  */
1210 static __inline qword si_mpyhh(qword a, qword b)
1212 return ((qword)(vec_mule((vec_short8)(a), (vec_short8)(b))));
1215 static __inline qword si_mpyhhu(qword a, qword b)
1217 return ((qword)(vec_mule((vec_ushort8)(a), (vec_ushort8)(b))));
/* Multiply Odd */
1222 static __inline qword si_mpy(qword a, qword b)
1224 return ((qword)(vec_mulo((vec_short8)(a), (vec_short8)(b))));
1227 static __inline qword si_mpyu(qword a, qword b)
1229 return ((qword)(vec_mulo((vec_ushort8)(a), (vec_ushort8)(b))));
1232 static __inline qword si_mpyi(qword a, short b)
1234 return ((qword)(vec_mulo((vec_short8)(a),
1235 vec_splat((vec_short8)(si_from_short(b)), 1))));
1238 static __inline qword si_mpyui(qword a, unsigned short b)
1240 return ((qword)(vec_mulo((vec_ushort8)(a),
1241 vec_splat((vec_ushort8)(si_from_ushort(b)), 1))));
/* Multiply and Shift Right */
1246 static __inline qword si_mpys(qword a, qword b)
1248 return ((qword)(vec_sra(vec_mulo((vec_short8)(a), (vec_short8)(b)), ((vec_uint4){16,16,16,16}))));
/* Nand */
1253 static __inline qword si_nand(qword a, qword b)
1255 vec_uchar16 d;
1257 d = vec_and((vec_uchar16)(a), (vec_uchar16)(b));
1258 return ((qword)(vec_nor(d, d)));
/* Negative Multiply Add */
1263 static __inline qword si_dfnma(qword a, qword b, qword c)
1265 union {
1266 vec_double2 v;
1267 double d[2];
1268 } aa, bb, cc, dd;
1270 aa.v = (vec_double2)(a);
1271 bb.v = (vec_double2)(b);
1272 cc.v = (vec_double2)(c);
1273 dd.d[0] = -cc.d[0] - aa.d[0] * bb.d[0];
1274 dd.d[1] = -cc.d[1] - aa.d[1] * bb.d[1];
1275 return ((qword)(dd.v));
/* Negative Multiply and Subtract */
1280 static __inline qword si_fnms(qword a, qword b, qword c)
1282 return ((qword)(vec_nmsub((vec_float4)(a), (vec_float4)(b), (vec_float4)(c))));
1285 static __inline qword si_dfnms(qword a, qword b, qword c)
1287 union {
1288 vec_double2 v;
1289 double d[2];
1290 } aa, bb, cc, dd;
1292 aa.v = (vec_double2)(a);
1293 bb.v = (vec_double2)(b);
1294 cc.v = (vec_double2)(c);
1295 dd.d[0] = cc.d[0] - aa.d[0] * bb.d[0];
1296 dd.d[1] = cc.d[1] - aa.d[1] * bb.d[1];
1297 return ((qword)(dd.v));
/* Nor */
1302 static __inline qword si_nor(qword a, qword b)
1304 return ((qword)(vec_nor((vec_uchar16)(a), (vec_uchar16)(b))));
/* Or */
1309 static __inline qword si_or(qword a, qword b)
1311 return ((qword)(vec_or((vec_uchar16)(a), (vec_uchar16)(b))));
1314 static __inline qword si_orbi(qword a, unsigned char b)
1316 return ((qword)(vec_or((vec_uchar16)(a),
1317 vec_splat((vec_uchar16)(si_from_uchar(b)), 3))));
1320 static __inline qword si_orhi(qword a, unsigned short b)
1322 return ((qword)(vec_or((vec_ushort8)(a),
1323 vec_splat((vec_ushort8)(si_from_ushort(b)), 1))));
1326 static __inline qword si_ori(qword a, unsigned int b)
1328 return ((qword)(vec_or((vec_uint4)(a),
1329 vec_splat((vec_uint4)(si_from_uint(b)), 0))));
/* Or Complement */
1334 static __inline qword si_orc(qword a, qword b)
1336 return ((qword)(vec_or((vec_uchar16)(a), vec_nor((vec_uchar16)(b), (vec_uchar16)(b)))));
/* Or Across */
1342 static __inline qword si_orx(qword a)
1344 vec_uchar16 tmp;
1345 tmp = (vec_uchar16)(a);
1346 tmp = vec_or(tmp, vec_sld(tmp, tmp, 8));
1347 tmp = vec_or(tmp, vec_sld(tmp, tmp, 4));
1348 return ((qword)(vec_and(tmp, ((vec_uchar16){0xFF,0xFF,0xFF,0xFF, 0x00,0x00,0x00,0x00,
1349 0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00}))));
/* Estimates */
1355 static __inline qword si_frest(qword a)
1357 return ((qword)(vec_re((vec_float4)(a))));
1360 static __inline qword si_frsqest(qword a)
1362 return ((qword)(vec_rsqrte((vec_float4)(a))));
/* Expands to its second argument unchanged; _a is not evaluated.  */
#define si_fi(_a, _d)		(_d)
/* Channel Read and Write */
/* Channel operations are not mappable: the two read forms expand to an
   all-zero quadword and the write form expands to nothing.  */
#define si_rdch(_channel)		((qword)(vec_splat_u8(0)))	/* not mappable */
#define si_rchcnt(_channel)		((qword)(vec_splat_u8(0)))	/* not mappable */
#define si_wrch(_channel, _a)		/* not mappable */
/* Rotate Left */
1375 static __inline qword si_roth(qword a, qword b)
1377 return ((qword)(vec_rl((vec_ushort8)(a), (vec_ushort8)(b))));
1380 static __inline qword si_rot(qword a, qword b)
1382 return ((qword)(vec_rl((vec_uint4)(a), (vec_uint4)(b))));
1385 static __inline qword si_rothi(qword a, int b)
1387 return ((qword)(vec_rl((vec_ushort8)(a),
1388 vec_splat((vec_ushort8)(si_from_int(b)), 1))));
1391 static __inline qword si_roti(qword a, int b)
1393 return ((qword)(vec_rl((vec_uint4)(a),
1394 vec_splat((vec_uint4)(si_from_int(b)), 0))));
/* Rotate Left with Mask */
1399 static __inline qword si_rothm(qword a, qword b)
1401 vec_ushort8 neg_b;
1402 vec_ushort8 mask;
1404 neg_b = (vec_ushort8)vec_sub(vec_splat_s16(0), (vec_short8)(b));
1405 mask = vec_sra(vec_sl(neg_b, vec_splat_u16(11)), vec_splat_u16(15));
1406 return ((qword)(vec_andc(vec_sr((vec_ushort8)(a), neg_b), mask)));
1409 static __inline qword si_rotm(qword a, qword b)
1411 vec_uint4 neg_b;
1412 vec_uint4 mask;
1414 neg_b = (vec_uint4)vec_sub(vec_splat_s32(0), (vec_int4)(b));
1415 mask = vec_sra(vec_sl(neg_b, ((vec_uint4){26,26,26,26})), ((vec_uint4){31,31,31,31}));
1416 return ((qword)(vec_andc(vec_sr((vec_uint4)(a), neg_b), mask)));
1419 static __inline qword si_rothmi(qword a, int b)
1421 vec_ushort8 neg_b;
1422 vec_ushort8 mask;
1424 neg_b = vec_splat((vec_ushort8)(si_from_int(-b)), 1);
1425 mask = vec_sra(vec_sl(neg_b, vec_splat_u16(11)), vec_splat_u16(15));
1426 return ((qword)(vec_andc(vec_sr((vec_ushort8)(a), neg_b), mask)));
1429 static __inline qword si_rotmi(qword a, int b)
1431 vec_uint4 neg_b;
1432 vec_uint4 mask;
1434 neg_b = vec_splat((vec_uint4)(si_from_int(-b)), 0);
1435 mask = vec_sra(vec_sl(neg_b, ((vec_uint4){26,26,26,26})), ((vec_uint4){31,31,31,31}));
1436 return ((qword)(vec_andc(vec_sr((vec_uint4)(a), neg_b), mask)));
/* Rotate Left Algebraic with Mask */
1442 static __inline qword si_rotmah(qword a, qword b)
1444 vec_ushort8 neg_b;
1445 vec_ushort8 mask;
1447 neg_b = (vec_ushort8)vec_sub(vec_splat_s16(0), (vec_short8)(b));
1448 mask = vec_sra(vec_sl(neg_b, vec_splat_u16(11)), vec_splat_u16(15));
1449 return ((qword)(vec_sra((vec_short8)(a), (vec_ushort8)vec_or(neg_b, mask))));
1452 static __inline qword si_rotma(qword a, qword b)
1454 vec_uint4 neg_b;
1455 vec_uint4 mask;
1457 neg_b = (vec_uint4)vec_sub(vec_splat_s32(0), (vec_int4)(b));
1458 mask = vec_sra(vec_sl(neg_b, ((vec_uint4){26,26,26,26})), ((vec_uint4){31,31,31,31}));
1459 return ((qword)(vec_sra((vec_int4)(a), (vec_uint4)vec_or(neg_b, mask))));
1463 static __inline qword si_rotmahi(qword a, int b)
1465 vec_ushort8 neg_b;
1466 vec_ushort8 mask;
1468 neg_b = vec_splat((vec_ushort8)(si_from_int(-b)), 1);
1469 mask = vec_sra(vec_sl(neg_b, vec_splat_u16(11)), vec_splat_u16(15));
1470 return ((qword)(vec_sra((vec_short8)(a), (vec_ushort8)vec_or(neg_b, mask))));
1473 static __inline qword si_rotmai(qword a, int b)
1475 vec_uint4 neg_b;
1476 vec_uint4 mask;
1478 neg_b = vec_splat((vec_uint4)(si_from_int(-b)), 0);
1479 mask = vec_sra(vec_sl(neg_b, ((vec_uint4){26,26,26,26})), ((vec_uint4){31,31,31,31}));
1480 return ((qword)(vec_sra((vec_int4)(a), (vec_uint4)vec_or(neg_b, mask))));
/* Rotate Left Quadword by Bytes with Mask */
1486 static __inline qword si_rotqmbyi(qword a, int count)
1488 union {
1489 vec_uchar16 v;
1490 int i[4];
1491 } x;
1492 vec_uchar16 mask;
1494 count = 0 - count;
1495 x.i[3] = count << 3;
1496 mask = (count & 0x10) ? vec_splat_u8(0) : vec_splat_u8(-1);
1498 return ((qword)(vec_and(vec_sro((vec_uchar16)(a), x.v), mask)));
1502 static __inline qword si_rotqmby(qword a, qword count)
1504 union {
1505 vec_uchar16 v;
1506 int i[4];
1507 } x;
1508 int cnt;
1509 vec_uchar16 mask;
1511 x.v = (vec_uchar16)(count);
1512 x.i[0] = cnt = (0 - x.i[0]) << 3;
1514 x.v = vec_splat(x.v, 3);
1515 mask = (cnt & 0x80) ? vec_splat_u8(0) : vec_splat_u8(-1);
1517 return ((qword)(vec_and(vec_sro((vec_uchar16)(a), x.v), mask)));
/* Rotate Left Quadword by Bytes */
1523 static __inline qword si_rotqbyi(qword a, int count)
1525 union {
1526 vec_uchar16 v;
1527 int i[4];
1528 } left, right;
1530 count <<= 3;
1531 left.i[3] = count;
1532 right.i[3] = 0 - count;
1533 return ((qword)(vec_or(vec_slo((vec_uchar16)(a), left.v), vec_sro((vec_uchar16)(a), right.v))));
1536 static __inline qword si_rotqby(qword a, qword count)
1538 vec_uchar16 left, right;
1540 left = vec_sl(vec_splat((vec_uchar16)(count), 3), vec_splat_u8(3));
1541 right = vec_sub(vec_splat_u8(0), left);
1542 return ((qword)(vec_or(vec_slo((vec_uchar16)(a), left), vec_sro((vec_uchar16)(a), right))));
/* Rotate Left Quadword by Bytes Bit Count */
1547 static __inline qword si_rotqbybi(qword a, qword count)
1549 vec_uchar16 left, right;
1551 left = vec_splat((vec_uchar16)(count), 3);
1552 right = vec_sub(vec_splat_u8(7), left);
1553 return ((qword)(vec_or(vec_slo((vec_uchar16)(a), left), vec_sro((vec_uchar16)(a), right))));
/* Rotate Left Quadword by Bits */
1559 static __inline qword si_rotqbii(qword a, int count)
1561 vec_uchar16 x, y;
1562 vec_uchar16 result;
1564 x = vec_splat((vec_uchar16)(si_from_int(count & 7)), 3);
1565 y = (vec_uchar16)(vec_sr((vec_uint4)vec_sro((vec_uchar16)(a), ((vec_uchar16)((vec_uint4){0,0,0,120}))),
1566 (vec_uint4)vec_sub(vec_splat_u8(8), x)));
1567 result = vec_or(vec_sll((qword)(a), x), y);
1568 return ((qword)(result));
1571 static __inline qword si_rotqbi(qword a, qword count)
1573 vec_uchar16 x, y;
1574 vec_uchar16 result;
1576 x = vec_and(vec_splat((vec_uchar16)(count), 3), vec_splat_u8(7));
1577 y = (vec_uchar16)(vec_sr((vec_uint4)vec_sro((vec_uchar16)(a), ((vec_uchar16)((vec_uint4){0,0,0,120}))),
1578 (vec_uint4)vec_sub(vec_splat_u8(8), x)));
1580 result = vec_or(vec_sll((qword)(a), x), y);
1581 return ((qword)(result));
/* Rotate Left Quadword and Mask by Bits */
1587 static __inline qword si_rotqmbii(qword a, int count)
1589 return ((qword)(vec_srl((vec_uchar16)(a), vec_splat((vec_uchar16)(si_from_int(0 - count)), 3))));
1592 static __inline qword si_rotqmbi(qword a, qword count)
1594 return ((qword)(vec_srl((vec_uchar16)(a), vec_sub(vec_splat_u8(0), vec_splat((vec_uchar16)(count), 3)))));
/* Rotate Left Quadword and Mask by Bytes with Bit Count */
1600 static __inline qword si_rotqmbybi(qword a, qword count)
1602 union {
1603 vec_uchar16 v;
1604 int i[4];
1605 } x;
1606 int cnt;
1607 vec_uchar16 mask;
1609 x.v = (vec_uchar16)(count);
1610 x.i[0] = cnt = 0 - (x.i[0] & ~7);
1611 x.v = vec_splat(x.v, 3);
1612 mask = (cnt & 0x80) ? vec_splat_u8(0) : vec_splat_u8(-1);
1614 return ((qword)(vec_and(vec_sro((vec_uchar16)(a), x.v), mask)));
/* Round Double to Float */
1622 static __inline qword si_frds(qword a)
1624 union {
1625 vec_float4 v;
1626 float f[4];
1627 } d;
1628 union {
1629 vec_double2 v;
1630 double d[2];
1631 } in;
1633 in.v = (vec_double2)(a);
1634 d.v = (vec_float4){0.0f};
1635 d.f[0] = (float)in.d[0];
1636 d.f[2] = (float)in.d[1];
1638 return ((qword)(d.v));
/* Select Bits */
1643 static __inline qword si_selb(qword a, qword b, qword c)
1645 return ((qword)(vec_sel((vec_uchar16)(a), (vec_uchar16)(b), (vec_uchar16)(c))));
/* Shuffle Bytes */
1651 static __inline qword si_shufb(qword a, qword b, qword pattern)
1653 vec_uchar16 pat;
1655 pat = vec_sel(((vec_uchar16){0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15}),
1656 vec_sr((vec_uchar16)(pattern), vec_splat_u8(3)),
1657 vec_sra((vec_uchar16)(pattern), vec_splat_u8(7)));
1658 return ((qword)(vec_perm(vec_perm(a, b, pattern),
1659 ((vec_uchar16){0, 0, 0, 0, 0, 0, 0, 0,
1660 0xFF, 0xFF, 0xFF, 0xFF, 0x80, 0x80, 0x80, 0x80}),
1661 pat)));
/* Shift Left */
1667 static __inline qword si_shlh(qword a, qword b)
1669 vec_ushort8 mask;
1671 mask = (vec_ushort8)vec_sra(vec_sl((vec_ushort8)(b), vec_splat_u16(11)), vec_splat_u16(15));
1672 return ((qword)(vec_andc(vec_sl((vec_ushort8)(a), (vec_ushort8)(b)), mask)));
1675 static __inline qword si_shl(qword a, qword b)
1677 vec_uint4 mask;
1679 mask = (vec_uint4)vec_sra(vec_sl((vec_uint4)(b), ((vec_uint4){26,26,26,26})), ((vec_uint4){31,31,31,31}));
1680 return ((qword)(vec_andc(vec_sl((vec_uint4)(a), (vec_uint4)(b)), mask)));
1684 static __inline qword si_shlhi(qword a, unsigned int b)
1686 vec_ushort8 mask;
1687 vec_ushort8 bv;
1689 bv = vec_splat((vec_ushort8)(si_from_int(b)), 1);
1690 mask = (vec_ushort8)vec_sra(vec_sl(bv, vec_splat_u16(11)), vec_splat_u16(15));
1691 return ((qword)(vec_andc(vec_sl((vec_ushort8)(a), bv), mask)));
1694 static __inline qword si_shli(qword a, unsigned int b)
1696 vec_uint4 bv;
1697 vec_uint4 mask;
1699 bv = vec_splat((vec_uint4)(si_from_uint(b)), 0);
1700 mask = (vec_uint4)vec_sra(vec_sl(bv, ((vec_uint4){26,26,26,26})), ((vec_uint4){31,31,31,31}));
1701 return ((qword)(vec_andc(vec_sl((vec_uint4)(a), bv), mask)));
/* Shift Left Quadword */
1707 static __inline qword si_shlqbii(qword a, unsigned int count)
1709 vec_uchar16 x;
1711 x = vec_splat((vec_uchar16)(si_from_uint(count)), 3);
1712 return ((qword)(vec_sll((vec_uchar16)(a), x)));
1715 static __inline qword si_shlqbi(qword a, qword count)
1717 vec_uchar16 x;
1719 x = vec_splat((vec_uchar16)(count), 3);
1720 return ((qword)(vec_sll((vec_uchar16)(a), x)));
/* Shift Left Quadword by Bytes */
1726 static __inline qword si_shlqbyi(qword a, unsigned int count)
1728 union {
1729 vec_uchar16 v;
1730 int i[4];
1731 } x;
1732 vec_uchar16 mask;
1734 x.i[3] = count << 3;
1735 mask = (count & 0x10) ? vec_splat_u8(0) : vec_splat_u8(-1);
1736 return ((qword)(vec_and(vec_slo((vec_uchar16)(a), x.v), mask)));
1739 static __inline qword si_shlqby(qword a, qword count)
1741 union {
1742 vec_uchar16 v;
1743 unsigned int i[4];
1744 } x;
1745 unsigned int cnt;
1746 vec_uchar16 mask;
1748 x.v = vec_sl(vec_splat((vec_uchar16)(count), 3), vec_splat_u8(3));
1749 cnt = x.i[0];
1750 mask = (cnt & 0x80) ? vec_splat_u8(0) : vec_splat_u8(-1);
1751 return ((qword)(vec_and(vec_slo((vec_uchar16)(a), x.v), mask)));
/* Shift Left Quadword by Bytes with Bit Count */
1756 static __inline qword si_shlqbybi(qword a, qword count)
1758 union {
1759 vec_uchar16 v;
1760 int i[4];
1761 } x;
1762 unsigned int cnt;
1763 vec_uchar16 mask;
1765 x.v = vec_splat((vec_uchar16)(count), 3);
1766 cnt = x.i[0];
1767 mask = (cnt & 0x80) ? vec_splat_u8(0) : vec_splat_u8(-1);
1768 return ((qword)(vec_and(vec_slo((vec_uchar16)(a), x.v), mask)));
/* Stop and Signal */
/* Both stop forms expand to SPU_STOP_ACTION; their arguments are not
   evaluated.  */
#define si_stop(_type)		SPU_STOP_ACTION
#define si_stopd(a, b, c)	SPU_STOP_ACTION
/* Subtract */
1780 static __inline qword si_sfh(qword a, qword b)
1782 return ((qword)(vec_sub((vec_ushort8)(b), (vec_ushort8)(a))));
1785 static __inline qword si_sf(qword a, qword b)
1787 return ((qword)(vec_sub((vec_uint4)(b), (vec_uint4)(a))));
1790 static __inline qword si_fs(qword a, qword b)
1792 return ((qword)(vec_sub((vec_float4)(a), (vec_float4)(b))));
1795 static __inline qword si_dfs(qword a, qword b)
1797 union {
1798 vec_double2 v;
1799 double d[2];
1800 } aa, bb, dd;
1802 aa.v = (vec_double2)(a);
1803 bb.v = (vec_double2)(b);
1804 dd.d[0] = aa.d[0] - bb.d[0];
1805 dd.d[1] = aa.d[1] - bb.d[1];
1806 return ((qword)(dd.v));
1809 static __inline qword si_sfhi(qword a, short b)
1811 return ((qword)(vec_sub(vec_splat((vec_short8)(si_from_short(b)), 1),
1812 (vec_short8)(a))));
1815 static __inline qword si_sfi(qword a, int b)
1817 return ((qword)(vec_sub(vec_splat((vec_int4)(si_from_int(b)), 0),
1818 (vec_int4)(a))));
/* Subtract word extended */
/* Per word: _b + ~_a + (_c & 1).  The complement of _a is formed by
   NOR-ing it with itself, so _a appears twice in the expansion and is
   evaluated twice.  */
#define si_sfx(_a, _b, _c)	((qword)(vec_add(vec_add((vec_uint4)(_b),				\
							 vec_nor((vec_uint4)(_a), (vec_uint4)(_a))),	\
						 vec_and((vec_uint4)(_c), vec_splat_u32(1)))))
/* Sum Bytes into Shorts */
1830 static __inline qword si_sumb(qword a, qword b)
1832 vec_uint4 zero = (vec_uint4){0};
1833 vec_ushort8 sum_a, sum_b;
1835 sum_a = (vec_ushort8)vec_sum4s((vec_uchar16)(a), zero);
1836 sum_b = (vec_ushort8)vec_sum4s((vec_uchar16)(b), zero);
1838 return ((qword)(vec_perm(sum_a, sum_b, ((vec_uchar16){18, 19, 2, 3, 22, 23, 6, 7,
1839 26, 27, 10, 11, 30, 31, 14, 15}))));
/* Exclusive OR */
1844 static __inline qword si_xor(qword a, qword b)
1846 return ((qword)(vec_xor((vec_uchar16)(a), (vec_uchar16)(b))));
1849 static __inline qword si_xorbi(qword a, unsigned char b)
1851 return ((qword)(vec_xor((vec_uchar16)(a),
1852 vec_splat((vec_uchar16)(si_from_uchar(b)), 3))));
1855 static __inline qword si_xorhi(qword a, unsigned short b)
1857 return ((qword)(vec_xor((vec_ushort8)(a),
1858 vec_splat((vec_ushort8)(si_from_ushort(b)), 1))));
1861 static __inline qword si_xori(qword a, unsigned int b)
1863 return ((qword)(vec_xor((vec_uint4)(a),
1864 vec_splat((vec_uint4)(si_from_uint(b)), 0))));
/* Generate Controls for Sub-Quadword Insertion */
1870 static __inline qword si_cbd(qword a, int imm)
1872 union {
1873 vec_uint4 v;
1874 unsigned char c[16];
1875 } shmask;
1877 shmask.v = ((vec_uint4){0x10111213, 0x14151617, 0x18191A1B, 0x1C1D1E1F});
1878 shmask.c[(si_to_uint(a) + (unsigned int)(imm)) & 0xF] = 0x03;
1879 return ((qword)(shmask.v));
1882 static __inline qword si_cdd(qword a, int imm)
1884 union {
1885 vec_uint4 v;
1886 unsigned long long ll[2];
1887 } shmask;
1889 shmask.v = ((vec_uint4){0x10111213, 0x14151617, 0x18191A1B, 0x1C1D1E1F});
1890 shmask.ll[((si_to_uint(a) + (unsigned int)(imm)) >> 3) & 0x1] = 0x0001020304050607ULL;
1891 return ((qword)(shmask.v));
1894 static __inline qword si_chd(qword a, int imm)
1896 union {
1897 vec_uint4 v;
1898 unsigned short s[8];
1899 } shmask;
1901 shmask.v = ((vec_uint4){0x10111213, 0x14151617, 0x18191A1B, 0x1C1D1E1F});
1902 shmask.s[((si_to_uint(a) + (unsigned int)(imm)) >> 1) & 0x7] = 0x0203;
1903 return ((qword)(shmask.v));
1906 static __inline qword si_cwd(qword a, int imm)
1908 union {
1909 vec_uint4 v;
1910 unsigned int i[4];
1911 } shmask;
1913 shmask.v = ((vec_uint4){0x10111213, 0x14151617, 0x18191A1B, 0x1C1D1E1F});
1914 shmask.i[((si_to_uint(a) + (unsigned int)(imm)) >> 2) & 0x3] = 0x00010203;
1915 return ((qword)(shmask.v));
1918 static __inline qword si_cbx(qword a, qword b)
1920 union {
1921 vec_uint4 v;
1922 unsigned char c[16];
1923 } shmask;
1925 shmask.v = ((vec_uint4){0x10111213, 0x14151617, 0x18191A1B, 0x1C1D1E1F});
1926 shmask.c[si_to_uint((qword)(vec_add((vec_uint4)(a), (vec_uint4)(b)))) & 0xF] = 0x03;
1927 return ((qword)(shmask.v));
1931 static __inline qword si_cdx(qword a, qword b)
1933 union {
1934 vec_uint4 v;
1935 unsigned long long ll[2];
1936 } shmask;
1938 shmask.v = ((vec_uint4){0x10111213, 0x14151617, 0x18191A1B, 0x1C1D1E1F});
1939 shmask.ll[(si_to_uint((qword)(vec_add((vec_uint4)(a), (vec_uint4)(b)))) >> 3) & 0x1] = 0x0001020304050607ULL;
1940 return ((qword)(shmask.v));
1943 static __inline qword si_chx(qword a, qword b)
1945 union {
1946 vec_uint4 v;
1947 unsigned short s[8];
1948 } shmask;
1950 shmask.v = ((vec_uint4){0x10111213, 0x14151617, 0x18191A1B, 0x1C1D1E1F});
1951 shmask.s[(si_to_uint((qword)(vec_add((vec_uint4)(a), (vec_uint4)(b)))) >> 1) & 0x7] = 0x0203;
1952 return ((qword)(shmask.v));
1955 static __inline qword si_cwx(qword a, qword b)
1957 union {
1958 vec_uint4 v;
1959 unsigned int i[4];
1960 } shmask;
1962 shmask.v = ((vec_uint4){0x10111213, 0x14151617, 0x18191A1B, 0x1C1D1E1F});
1963 shmask.i[(si_to_uint((qword)(vec_add((vec_uint4)(a), (vec_uint4)(b)))) >> 2) & 0x3] = 0x00010203;
1964 return ((qword)(shmask.v));
/* Constant Formation */
1970 static __inline qword si_il(signed short imm)
1972 return ((qword)(vec_splat((vec_int4)(si_from_int((signed int)(imm))), 0)));
1976 static __inline qword si_ila(unsigned int imm)
1978 return ((qword)(vec_splat((vec_uint4)(si_from_uint(imm)), 0)));
1981 static __inline qword si_ilh(signed short imm)
1983 return ((qword)(vec_splat((vec_short8)(si_from_short(imm)), 1)));
1986 static __inline qword si_ilhu(signed short imm)
1988 return ((qword)(vec_splat((vec_uint4)(si_from_uint((unsigned int)(imm) << 16)), 0)));
1991 static __inline qword si_iohl(qword a, unsigned short imm)
1993 return ((qword)(vec_or((vec_uint4)(a), vec_splat((vec_uint4)(si_from_uint((unsigned int)(imm))), 0))));
/* No Operation */
/* Both no-op forms expand to nothing.  */
#define si_lnop()		/* do nothing */
#define si_nop()		/* do nothing */
/* Memory Load and Store */
2004 static __inline qword si_lqa(unsigned int imm)
2006 return ((qword)(vec_ld(0, (vector unsigned char *)(imm))));
2009 static __inline qword si_lqd(qword a, unsigned int imm)
2011 return ((qword)(vec_ld(si_to_uint(a) & ~0xF, (vector unsigned char *)(imm))));
2014 static __inline qword si_lqr(unsigned int imm)
2016 return ((qword)(vec_ld(0, (vector unsigned char *)(imm))));
2019 static __inline qword si_lqx(qword a, qword b)
2021 return ((qword)(vec_ld(si_to_uint((qword)(vec_add((vec_uint4)(a), (vec_uint4)(b)))), (vector unsigned char *)(0))));
2024 static __inline void si_stqa(qword a, unsigned int imm)
2026 vec_st((vec_uchar16)(a), 0, (vector unsigned char *)(imm));
2029 static __inline void si_stqd(qword a, qword b, unsigned int imm)
2031 vec_st((vec_uchar16)(a), si_to_uint(b) & ~0xF, (vector unsigned char *)(imm));
2034 static __inline void si_stqr(qword a, unsigned int imm)
2036 vec_st((vec_uchar16)(a), 0, (vector unsigned char *)(imm));
2039 static __inline void si_stqx(qword a, qword b, qword c)
2041 vec_st((vec_uchar16)(a),
2042 si_to_uint((qword)(vec_add((vec_uint4)(b), (vec_uint4)(c)))),
2043 (vector unsigned char *)(0));
2046 #endif /* !__SPU__ */
2047 #endif /* !_SI2VMX_H_ */