Merged with mainline at revision 128810.
[official-gcc.git] / gcc / config / rs6000 / si2vmx.h
blob2de3438707f4e97bb9fb92beb0486cbe6138cd6e
1 /* Cell BEA specific SPU intrinsics to PPU/VMX intrinsics
2 Copyright (C) 2007 Free Software Foundation, Inc.
4 This file is free software; you can redistribute it and/or modify it under
5 the terms of the GNU General Public License as published by the Free
6 Software Foundation; either version 2 of the License, or (at your option)
7 any later version.
9 This file is distributed in the hope that it will be useful, but WITHOUT
10 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
12 for more details.
14 You should have received a copy of the GNU General Public License
15 along with this file; see the file COPYING. If not, write to the Free
16 Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
17 02110-1301, USA. */
19 /* As a special exception, if you include this header file into source files
20 compiled by GCC, this header file does not by itself cause the resulting
21 executable to be covered by the GNU General Public License. This exception
22 does not however invalidate any other reasons why the executable file might be
23 covered by the GNU General Public License. */
25 #ifndef _SI2VMX_H_
26 #define _SI2VMX_H_ 1
28 #ifndef __SPU__
30 #include <stdlib.h>
31 #include <vec_types.h>
34 /* Specify a default halt action for spu_hcmpeq and spu_hcmpgt intrinsics.
35 * Users can override the action by defining it prior to including this
36  * header file.  */
38 #ifndef SPU_HALT_ACTION
39 #define SPU_HALT_ACTION abort()
40 #endif
42 /* Specify a default stop action for the spu_stop intrinsic.
43 * Users can override the action by defining it prior to including this
44  * header file.  */
46 #ifndef SPU_STOP_ACTION
47 #define SPU_STOP_ACTION abort()
48 #endif
51 /* Specify a default action for unsupported intrinsic.
52 * Users can override the action by defining it prior to including this
53  * header file.  */
55 #ifndef SPU_UNSUPPORTED_ACTION
56 #define SPU_UNSUPPORTED_ACTION abort()
57 #endif
60 /* Casting intrinsics - from scalar to quadword.  */
63 static __inline qword si_from_uchar(unsigned char c) {
64 union {
65 qword q;
66 unsigned char c[16];
67 } x;
68 x.c[3] = c;
69 return (x.q);
72 static __inline qword si_from_char(signed char c) {
73 union {
74 qword q;
75 signed char c[16];
76 } x;
77 x.c[3] = c;
78 return (x.q);
81 static __inline qword si_from_ushort(unsigned short s) {
82 union {
83 qword q;
84 unsigned short s[8];
85 } x;
86 x.s[1] = s;
87 return (x.q);
90 static __inline qword si_from_short(short s) {
91 union {
92 qword q;
93 short s[8];
94 } x;
95 x.s[1] = s;
96 return (x.q);
100 static __inline qword si_from_uint(unsigned int i) {
101 union {
102 qword q;
103 unsigned int i[4];
104 } x;
105 x.i[0] = i;
106 return (x.q);
109 static __inline qword si_from_int(int i) {
110 union {
111 qword q;
112 int i[4];
113 } x;
114 x.i[0] = i;
115 return (x.q);
118 static __inline qword si_from_ullong(unsigned long long l) {
119 union {
120 qword q;
121 unsigned long long l[2];
122 } x;
123 x.l[0] = l;
124 return (x.q);
127 static __inline qword si_from_llong(long long l) {
128 union {
129 qword q;
130 long long l[2];
131 } x;
132 x.l[0] = l;
133 return (x.q);
136 static __inline qword si_from_float(float f) {
137 union {
138 qword q;
139 float f[4];
140 } x;
141 x.f[0] = f;
142 return (x.q);
145 static __inline qword si_from_double(double d) {
146 union {
147 qword q;
148 double d[2];
149 } x;
150 x.d[0] = d;
151 return (x.q);
154 static __inline qword si_from_ptr(void *ptr) {
155 union {
156 qword q;
157 void *p;
158 } x;
159 x.p = ptr;
160 return (x.q);
164 /* Casting intrinsics - from quadword to scalar.  */
166 static __inline unsigned char si_to_uchar(qword q) {
167 union {
168 qword q;
169 unsigned char c[16];
170 } x;
171 x.q = q;
172 return (x.c[3]);
175 static __inline signed char si_to_char(qword q) {
176 union {
177 qword q;
178 signed char c[16];
179 } x;
180 x.q = q;
181 return (x.c[3]);
184 static __inline unsigned short si_to_ushort(qword q) {
185 union {
186 qword q;
187 unsigned short s[8];
188 } x;
189 x.q = q;
190 return (x.s[1]);
193 static __inline short si_to_short(qword q) {
194 union {
195 qword q;
196 short s[8];
197 } x;
198 x.q = q;
199 return (x.s[1]);
202 static __inline unsigned int si_to_uint(qword q) {
203 union {
204 qword q;
205 unsigned int i[4];
206 } x;
207 x.q = q;
208 return (x.i[0]);
211 static __inline int si_to_int(qword q) {
212 union {
213 qword q;
214 int i[4];
215 } x;
216 x.q = q;
217 return (x.i[0]);
220 static __inline unsigned long long si_to_ullong(qword q) {
221 union {
222 qword q;
223 unsigned long long l[2];
224 } x;
225 x.q = q;
226 return (x.l[0]);
229 static __inline long long si_to_llong(qword q) {
230 union {
231 qword q;
232 long long l[2];
233 } x;
234 x.q = q;
235 return (x.l[0]);
238 static __inline float si_to_float(qword q) {
239 union {
240 qword q;
241 float f[4];
242 } x;
243 x.q = q;
244 return (x.f[0]);
247 static __inline double si_to_double(qword q) {
248 union {
249 qword q;
250 double d[2];
251 } x;
252 x.q = q;
253 return (x.d[0]);
256 static __inline void * si_to_ptr(qword q) {
257 union {
258 qword q;
259 void *p;
260 } x;
261 x.q = q;
262 return (x.p);
266 /* Absolute difference
268 static __inline qword si_absdb(qword a, qword b)
270 vec_uchar16 ac, bc, dc;
272 ac = (vec_uchar16)(a);
273 bc = (vec_uchar16)(b);
274 dc = vec_sel(vec_sub(bc, ac), vec_sub(ac, bc), vec_cmpgt(ac, bc));
276 return ((qword)(dc));
279 /* Add intrinsics
281 #define si_a(_a, _b) ((qword)(vec_add((vec_uint4)(_a), (vec_uint4)(_b))))
283 #define si_ah(_a, _b) ((qword)(vec_add((vec_ushort8)(_a), (vec_ushort8)(_b))))
285 static __inline qword si_ai(qword a, int b)
287 return ((qword)(vec_add((vec_int4)(a),
288 vec_splat((vec_int4)(si_from_int(b)), 0))));
292 static __inline qword si_ahi(qword a, short b)
294 return ((qword)(vec_add((vec_short8)(a),
295 vec_splat((vec_short8)(si_from_short(b)), 1))));
299 #define si_fa(_a, _b) ((qword)(vec_add((vec_float4)(_a), (vec_float4)(_b))))
302 static __inline qword si_dfa(qword a, qword b)
304 union {
305 vec_double2 v;
306 double d[2];
307 } ad, bd, dd;
309 ad.v = (vec_double2)(a);
310 bd.v = (vec_double2)(b);
311 dd.d[0] = ad.d[0] + bd.d[0];
312 dd.d[1] = ad.d[1] + bd.d[1];
314 return ((qword)(dd.v));
317 /* Add word extended
319 #define si_addx(_a, _b, _c) ((qword)(vec_add(vec_add((vec_uint4)(_a), (vec_uint4)(_b)), \
320 vec_and((vec_uint4)(_c), vec_splat_u32(1)))))
323 /* Bit-wise AND
325 #define si_and(_a, _b) ((qword)(vec_and((vec_uint4)(_a), (vec_uint4)(_b))))
328 static __inline qword si_andbi(qword a, signed char b)
330 return ((qword)(vec_and((vec_char16)(a),
331 vec_splat((vec_char16)(si_from_char(b)), 3))));
334 static __inline qword si_andhi(qword a, signed short b)
336 return ((qword)(vec_and((vec_short8)(a),
337 vec_splat((vec_short8)(si_from_short(b)), 1))));
341 static __inline qword si_andi(qword a, signed int b)
343 return ((qword)(vec_and((vec_int4)(a),
344 vec_splat((vec_int4)(si_from_int(b)), 0))));
348 /* Bit-wise AND with complement
350 #define si_andc(_a, _b) ((qword)(vec_andc((vec_uchar16)(_a), (vec_uchar16)(_b))))
353 /* Average byte vectors
355 #define si_avgb(_a, _b) ((qword)(vec_avg((vec_uchar16)(_a), (vec_uchar16)(_b))))
358 /* Branch indirect and set link on external data
360 #define si_bisled(_func) /* not mappable */
361 #define si_bisledd(_func) /* not mappable */
362 #define si_bislede(_func) /* not mappable */
365 /* Borrow generate
367 #define si_bg(_a, _b) ((qword)(vec_subc((vec_uint4)(_b), (vec_uint4)(_a))))
369 #define si_bgx(_a, _b, _c) ((qword)(vec_and(vec_or(vec_cmpgt((vec_uint4)(_b), (vec_uint4)(_a)), \
370 vec_and(vec_cmpeq((vec_uint4)(_b), (vec_uint4)(_a)), \
371 (vec_uint4)(_c))), vec_splat_u32(1))))
373 /* Compare absolute equal
375 static __inline qword si_fcmeq(qword a, qword b)
377 vec_float4 msb = (vec_float4)((vec_uint4){0x80000000, 0x80000000, 0x80000000, 0x80000000});
379 return ((qword)(vec_cmpeq(vec_andc((vec_float4)(a), msb),
380 vec_andc((vec_float4)(b), msb))));
383 static __inline qword si_dfcmeq(qword a, qword b)
385 vec_uint4 sign_mask= (vec_uint4) { 0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF };
386 vec_uint4 nan_mask = (vec_uint4) { 0x7FF00000, 0x00000000, 0x7FF00000, 0x00000000 };
387 vec_uchar16 hihi_promote = (vec_uchar16) { 0,1,2,3, 16,17,18,19, 8,9,10,11, 24,25,26,27};
389 vec_uint4 biteq;
390 vec_uint4 aabs;
391 vec_uint4 babs;
392 vec_uint4 a_gt;
393 vec_uint4 ahi_inf;
394 vec_uint4 anan;
395 vec_uint4 result;
397 union {
398 vec_uchar16 v;
399 int i[4];
400 } x;
402 /* Shift 4 bytes */
403 x.i[3] = 4 << 3;
405 /* Mask out sign bits */
406 aabs = vec_and((vec_uint4)a,sign_mask);
407 babs = vec_and((vec_uint4)b,sign_mask);
409 /* A) Check for bit equality, store in high word */
410 biteq = (vec_uint4) vec_cmpeq((vec_uint4)aabs,(vec_uint4)babs);
411 biteq = vec_and(biteq,(vec_uint4)vec_slo((vec_uchar16)biteq,x.v));
414 B) Check if a is NaN, store in high word
416 B1) If the high word is greater than max_exp (indicates a NaN)
417 B2) If the low word is greater than 0
419 a_gt = (vec_uint4)vec_cmpgt(aabs,nan_mask);
421 /* B3) Check if the high word is equal to the inf exponent */
422 ahi_inf = (vec_uint4)vec_cmpeq(aabs,nan_mask);
424 /* anan = B1[hi] or (B2[lo] and B3[hi]) */
425 anan = (vec_uint4)vec_or(a_gt,vec_and((vec_uint4)vec_slo((vec_uchar16)a_gt,x.v),ahi_inf));
427 /* result = A and not B */
428 result = vec_andc(biteq, anan);
430 /* Promote high words to 64 bits and return */
431 return ((qword)(vec_perm((vec_uchar16)result, (vec_uchar16)result, hihi_promote)));
435 /* Compare absolute greater than
437 static __inline qword si_fcmgt(qword a, qword b)
439 vec_float4 msb = (vec_float4)((vec_uint4){0x80000000, 0x80000000, 0x80000000, 0x80000000});
441 return ((qword)(vec_cmpgt(vec_andc((vec_float4)(a), msb),
442 vec_andc((vec_float4)(b), msb))));
445 static __inline qword si_dfcmgt(qword a, qword b)
447 vec_uchar16 splat_hi = (vec_uchar16) { 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 };
448 vec_uint4 nan_mask = (vec_uint4) { 0x7FF00000, 0x0, 0x7FF00000, 0x0 };
449 vec_uint4 sign_mask = (vec_uint4) { 0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF };
451 union {
452 vec_uchar16 v;
453 int i[4];
454 } x;
456 /* Shift 4 bytes */
457 x.i[3] = 4 << 3;
459 // absolute value of a,b
460 vec_uint4 aabs = vec_and((vec_uint4)a, sign_mask);
461 vec_uint4 babs = vec_and((vec_uint4)b, sign_mask);
463 // check if a is nan
464 vec_uint4 a_inf = (vec_uint4)vec_cmpeq(aabs, nan_mask);
465 vec_uint4 a_nan = (vec_uint4)vec_cmpgt(aabs, nan_mask);
466 a_nan = vec_or(a_nan, vec_and((vec_uint4)vec_slo((vec_uchar16)a_nan,x.v),a_inf));
467 a_nan = (vec_uint4)vec_perm((vec_uchar16)a_nan, (vec_uchar16)a_nan, splat_hi);
469 // check if b is nan
470 vec_uint4 b_inf = (vec_uint4)vec_cmpeq(babs, nan_mask);
471 vec_uint4 b_nan = (vec_uint4)vec_cmpgt(babs, nan_mask);
472 b_nan = vec_or(b_nan, vec_and((vec_uint4)vec_slo((vec_uchar16)b_nan,x.v),b_inf));
473 b_nan = (vec_uint4)vec_perm((vec_uchar16)b_nan, (vec_uchar16)b_nan, splat_hi);
475 // A) Check if the exponents are different
476 vec_uint4 gt_hi = (vec_uint4)vec_cmpgt(aabs,babs);
478 // B) Check if high word equal, and low word greater
479 vec_uint4 gt_lo = (vec_uint4)vec_cmpgt((vec_uint4)aabs, (vec_uint4)babs);
480 vec_uint4 eq = (vec_uint4)vec_cmpeq(aabs, babs);
481 vec_uint4 eqgt = vec_and(eq,vec_slo(gt_lo,x.v));
483 // If either A or B is true, return true (unless NaNs detected)
484 vec_uint4 r = vec_or(gt_hi, eqgt);
486 // splat the high words of the comparison step
487 r = (vec_uint4)vec_perm((vec_uchar16)r,(vec_uchar16)r,splat_hi);
489 // correct for NaNs in input
490 return ((qword)vec_andc(r,vec_or(a_nan,b_nan)));
494 /* Compare equal
496 static __inline qword si_ceqb(qword a, qword b)
498 return ((qword)(vec_cmpeq((vec_uchar16)(a), (vec_uchar16)(b))));
501 static __inline qword si_ceqh(qword a, qword b)
503 return ((qword)(vec_cmpeq((vec_ushort8)(a), (vec_ushort8)(b))));
506 static __inline qword si_ceq(qword a, qword b)
508 return ((qword)(vec_cmpeq((vec_uint4)(a), (vec_uint4)(b))));
511 static __inline qword si_fceq(qword a, qword b)
513 return ((qword)(vec_cmpeq((vec_float4)(a), (vec_float4)(b))));
516 static __inline qword si_ceqbi(qword a, signed char b)
518 return ((qword)(vec_cmpeq((vec_char16)(a),
519 vec_splat((vec_char16)(si_from_char(b)), 3))));
522 static __inline qword si_ceqhi(qword a, signed short b)
524 return ((qword)(vec_cmpeq((vec_short8)(a),
525 vec_splat((vec_short8)(si_from_short(b)), 1))));
528 static __inline qword si_ceqi(qword a, signed int b)
530 return ((qword)(vec_cmpeq((vec_int4)(a),
531 vec_splat((vec_int4)(si_from_int(b)), 0))));
534 static __inline qword si_dfceq(qword a, qword b)
536 vec_uint4 sign_mask= (vec_uint4) { 0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF };
537 vec_uint4 nan_mask = (vec_uint4) { 0x7FF00000, 0x00000000, 0x7FF00000, 0x00000000 };
538 vec_uchar16 hihi_promote = (vec_uchar16) { 0,1,2,3, 16,17,18,19, 8,9,10,11, 24,25,26,27};
540 vec_uint4 biteq;
541 vec_uint4 aabs;
542 vec_uint4 babs;
543 vec_uint4 a_gt;
544 vec_uint4 ahi_inf;
545 vec_uint4 anan;
546 vec_uint4 iszero;
547 vec_uint4 result;
549 union {
550 vec_uchar16 v;
551 int i[4];
552 } x;
554 /* Shift 4 bytes */
555 x.i[3] = 4 << 3;
557 /* A) Check for bit equality, store in high word */
558 biteq = (vec_uint4) vec_cmpeq((vec_uint4)a,(vec_uint4)b);
559 biteq = vec_and(biteq,(vec_uint4)vec_slo((vec_uchar16)biteq,x.v));
561 /* Mask out sign bits */
562 aabs = vec_and((vec_uint4)a,sign_mask);
563 babs = vec_and((vec_uint4)b,sign_mask);
566 B) Check if a is NaN, store in high word
568 B1) If the high word is greater than max_exp (indicates a NaN)
569 B2) If the low word is greater than 0
571 a_gt = (vec_uint4)vec_cmpgt(aabs,nan_mask);
573 /* B3) Check if the high word is equal to the inf exponent */
574 ahi_inf = (vec_uint4)vec_cmpeq(aabs,nan_mask);
576 /* anan = B1[hi] or (B2[lo] and B3[hi]) */
577 anan = (vec_uint4)vec_or(a_gt,vec_and((vec_uint4)vec_slo((vec_uchar16)a_gt,x.v),ahi_inf));
579 /* C) Check for 0 = -0 special case */
580 iszero =(vec_uint4)vec_cmpeq((vec_uint4)vec_or(aabs,babs),(vec_uint4)vec_splat_u32(0));
581 iszero = vec_and(iszero,(vec_uint4)vec_slo((vec_uchar16)iszero,x.v));
583 /* result = (A or C) and not B */
584 result = vec_or(biteq,iszero);
585 result = vec_andc(result, anan);
587 /* Promote high words to 64 bits and return */
588 return ((qword)(vec_perm((vec_uchar16)result, (vec_uchar16)result, hihi_promote)));
592 /* Compare greater than
594 static __inline qword si_cgtb(qword a, qword b)
596 return ((qword)(vec_cmpgt((vec_char16)(a), (vec_char16)(b))));
599 static __inline qword si_cgth(qword a, qword b)
601 return ((qword)(vec_cmpgt((vec_short8)(a), (vec_short8)(b))));
604 static __inline qword si_cgt(qword a, qword b)
606 return ((qword)(vec_cmpgt((vec_int4)(a), (vec_int4)(b))));
609 static __inline qword si_clgtb(qword a, qword b)
611 return ((qword)(vec_cmpgt((vec_uchar16)(a), (vec_uchar16)(b))));
614 static __inline qword si_clgth(qword a, qword b)
616 return ((qword)(vec_cmpgt((vec_ushort8)(a), (vec_ushort8)(b))));
619 static __inline qword si_clgt(qword a, qword b)
621 return ((qword)(vec_cmpgt((vec_uint4)(a), (vec_uint4)(b))));
624 static __inline qword si_fcgt(qword a, qword b)
626 return ((qword)(vec_cmpgt((vec_float4)(a), (vec_float4)(b))));
629 static __inline qword si_dfcgt(qword a, qword b)
631 vec_uchar16 splat_hi = (vec_uchar16) { 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 };
632 vec_uchar16 borrow_shuffle = (vec_uchar16) { 4,5,6,7, 192,192,192,192, 12,13,14,15, 192,192,192,192 };
633 vec_uint4 nan_mask = (vec_uint4) { 0x7FF00000, 0x0, 0x7FF00000, 0x0 };
634 vec_uint4 sign_mask = (vec_uint4) { 0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF };
636 union {
637 vec_uchar16 v;
638 int i[4];
639 } x;
641 /* Shift 4 bytes */
642 x.i[3] = 4 << 3;
644 // absolute value of a,b
645 vec_uint4 aabs = vec_and((vec_uint4)a, sign_mask);
646 vec_uint4 babs = vec_and((vec_uint4)b, sign_mask);
648 // check if a is nan
649 vec_uint4 a_inf = (vec_uint4)vec_cmpeq(aabs, nan_mask);
650 vec_uint4 a_nan = (vec_uint4)vec_cmpgt(aabs, nan_mask);
651 a_nan = vec_or(a_nan, vec_and((vec_uint4)vec_slo((vec_uchar16)a_nan,x.v),a_inf));
652 a_nan = (vec_uint4)vec_perm((vec_uchar16)a_nan, (vec_uchar16)a_nan, splat_hi);
654 // check if b is nan
655 vec_uint4 b_inf = (vec_uint4)vec_cmpeq(babs, nan_mask);
656 vec_uint4 b_nan = (vec_uint4)vec_cmpgt(babs, nan_mask);
657 b_nan = vec_or(b_nan, vec_and((vec_uint4)vec_slo((vec_uchar16)b_nan,x.v),b_inf));
658 b_nan = (vec_uint4)vec_perm((vec_uchar16)b_nan, (vec_uchar16)b_nan, splat_hi);
660 // sign of a
661 vec_uint4 asel = (vec_uint4)vec_sra((vec_int4)(a), (vec_uint4)vec_splat(((vec_uint4)si_from_int(31)), 0));
662 asel = (vec_uint4)vec_perm((vec_uchar16)asel,(vec_uchar16)asel,splat_hi);
664 // sign of b
665 vec_uint4 bsel = (vec_uint4)vec_sra((vec_int4)(b), (vec_uint4)vec_splat(((vec_uint4)si_from_int(31)), 0));
666 bsel = (vec_uint4)vec_perm((vec_uchar16)bsel,(vec_uchar16)bsel,splat_hi);
668 // negative a
669 vec_uint4 abor = vec_subc((vec_uint4)vec_splat_u32(0), aabs);
670 vec_uchar16 pat = vec_sel(((vec_uchar16){0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15}), vec_sr(borrow_shuffle, vec_splat_u8(3)), vec_sra(borrow_shuffle, vec_splat_u8(7)));
671 abor = (vec_uint4)(vec_perm(vec_perm((vec_uchar16)abor, (vec_uchar16)abor, borrow_shuffle),((vec_uchar16){0, 0, 0, 0, 0, 0, 0, 0, 0xFF, 0xFF, 0xFF, 0xFF, 0x80, 0x80, 0x80, 0x80}),pat));
672 vec_uint4 aneg = vec_add(vec_add(vec_splat_u32(0), vec_nor(aabs, aabs)), vec_and(abor, vec_splat_u32(1)));
674 // pick the one we want
675 vec_int4 aval = (vec_int4)vec_sel((vec_uchar16)aabs, (vec_uchar16)aneg, (vec_uchar16)asel);
677 // negative b
678 vec_uint4 bbor = vec_subc((vec_uint4)vec_splat_u32(0), babs);
679 bbor = (vec_uint4)(vec_perm(vec_perm((vec_uchar16)bbor, (vec_uchar16)bbor, borrow_shuffle),((vec_uchar16){0, 0, 0, 0, 0, 0, 0, 0, 0xFF, 0xFF, 0xFF, 0xFF, 0x80, 0x80, 0x80, 0x80}),pat));
680 vec_uint4 bneg = vec_add(vec_nor(babs, babs), vec_and(bbor, vec_splat_u32(1)));
682 // pick the one we want
683 vec_int4 bval=(vec_int4)vec_sel((vec_uchar16)babs, (vec_uchar16)bneg, (vec_uchar16)bsel);
685 // A) Check if the exponents are different
686 vec_uint4 gt_hi = (vec_uint4)vec_cmpgt(aval,bval);
688 // B) Check if high word equal, and low word greater
689 vec_uint4 gt_lo = (vec_uint4)vec_cmpgt((vec_uint4)aval, (vec_uint4)bval);
690 vec_uint4 eq = (vec_uint4)vec_cmpeq(aval, bval);
691 vec_uint4 eqgt = vec_and(eq,vec_slo(gt_lo,x.v));
693 // If either A or B is true, return true (unless NaNs detected)
694 vec_uint4 r = vec_or(gt_hi, eqgt);
696 // splat the high words of the comparison step
697 r = (vec_uint4)vec_perm((vec_uchar16)r,(vec_uchar16)r,splat_hi);
699 // correct for NaNs in input
700 return ((qword)vec_andc(r,vec_or(a_nan,b_nan)));
703 static __inline qword si_cgtbi(qword a, signed char b)
705 return ((qword)(vec_cmpgt((vec_char16)(a),
706 vec_splat((vec_char16)(si_from_char(b)), 3))));
709 static __inline qword si_cgthi(qword a, signed short b)
711 return ((qword)(vec_cmpgt((vec_short8)(a),
712 vec_splat((vec_short8)(si_from_short(b)), 1))));
715 static __inline qword si_cgti(qword a, signed int b)
717 return ((qword)(vec_cmpgt((vec_int4)(a),
718 vec_splat((vec_int4)(si_from_int(b)), 0))));
721 static __inline qword si_clgtbi(qword a, unsigned char b)
723 return ((qword)(vec_cmpgt((vec_uchar16)(a),
724 vec_splat((vec_uchar16)(si_from_uchar(b)), 3))));
727 static __inline qword si_clgthi(qword a, unsigned short b)
729 return ((qword)(vec_cmpgt((vec_ushort8)(a),
730 vec_splat((vec_ushort8)(si_from_ushort(b)), 1))));
733 static __inline qword si_clgti(qword a, unsigned int b)
735 return ((qword)(vec_cmpgt((vec_uint4)(a),
736 vec_splat((vec_uint4)(si_from_uint(b)), 0))));
739 static __inline qword si_dftsv(qword a, char b)
741 vec_uchar16 splat_hi = (vec_uchar16) { 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 };
742 vec_uint4 sign_mask = (vec_uint4) { 0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF };
743 vec_uint4 result = (vec_uint4){0};
744 vec_uint4 sign = (vec_uint4)vec_sra((vec_int4)(a), (vec_uint4)vec_splat(((vec_uint4)si_from_int(31)), 0));
745 sign = (vec_uint4)vec_perm((vec_uchar16)sign,(vec_uchar16)sign,splat_hi);
746 vec_uint4 aabs = vec_and((vec_uint4)a,sign_mask);
748 union {
749 vec_uchar16 v;
750 int i[4];
751 } x;
753 /* Shift 4 bytes */
754 x.i[3] = 4 << 3;
756 /* Nan or +inf or -inf */
757 if (b & 0x70)
759 vec_uint4 nan_mask = (vec_uint4) { 0x7FF00000, 0x0, 0x7FF00000, 0x0 };
760 vec_uint4 a_inf = (vec_uint4)vec_cmpeq(aabs, nan_mask);
761 /* NaN */
762 if (b & 0x40)
764 vec_uint4 a_nan = (vec_uint4)vec_cmpgt(aabs, nan_mask);
765 a_nan = vec_or(a_nan, vec_and((vec_uint4)vec_slo((vec_uchar16)a_nan,x.v),a_inf));
766 a_nan = (vec_uint4)vec_perm((vec_uchar16)a_nan, (vec_uchar16)a_nan, splat_hi);
767 result = vec_or(result, a_nan);
769 /* inf */
770 if (b & 0x30)
772 a_inf = vec_and((vec_uint4)vec_slo((vec_uchar16)a_inf,x.v), a_inf);
773 a_inf = (vec_uint4)vec_perm((vec_uchar16)a_inf, (vec_uchar16)a_inf, splat_hi);
774 /* +inf */
775 if (b & 0x20)
776 result = vec_or(vec_andc(a_inf, sign), result);
777 /* -inf */
778 if (b & 0x10)
779 result = vec_or(vec_and(a_inf, sign), result);
782 /* 0 or denorm */
783 if (b & 0xF)
785 vec_uint4 iszero =(vec_uint4)vec_cmpeq(aabs,(vec_uint4)vec_splat_u32(0));
786 iszero = vec_and(iszero,(vec_uint4)vec_slo((vec_uchar16)iszero,x.v));
787 /* denorm */
788 if (b & 0x3)
790 vec_uint4 denorm_mask = (vec_uint4){0xFFFFF, 0xFFFFF, 0xFFFFF, 0xFFFFF};
791 vec_uint4 isdenorm = vec_nor((vec_uint4)vec_cmpgt(aabs, denorm_mask), iszero);
792 isdenorm = (vec_uint4)vec_perm((vec_uchar16)isdenorm, (vec_uchar16)isdenorm, splat_hi);
793 /* +denorm */
794 if (b & 0x2)
795 result = vec_or(vec_andc(isdenorm, sign), result);
796 /* -denorm */
797 if (b & 0x1)
798 result = vec_or(vec_and(isdenorm, sign), result);
800 /* 0 */
801 if (b & 0xC)
803 iszero = (vec_uint4)vec_perm((vec_uchar16)iszero, (vec_uchar16)iszero, splat_hi);
804 /* +0 */
805 if (b & 0x8)
806 result = vec_or(vec_andc(iszero, sign), result);
807 /* -0 */
808 if (b & 0x4)
809 result = vec_or(vec_and(iszero, sign), result);
812 return ((qword)result);
816 /* Carry generate
818 #define si_cg(_a, _b) ((qword)(vec_addc((vec_uint4)(_a), (vec_uint4)(_b))))
820 #define si_cgx(_a, _b, _c) ((qword)(vec_or(vec_addc((vec_uint4)(_a), (vec_uint4)(_b)), \
821 vec_addc(vec_add((vec_uint4)(_a), (vec_uint4)(_b)), \
822 vec_and((vec_uint4)(_c), vec_splat_u32(1))))))
825 /* Count ones for bytes
827 static __inline qword si_cntb(qword a)
829 vec_uchar16 nib_cnt = (vec_uchar16){0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4};
830 vec_uchar16 four = { 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 };
831 vec_uchar16 av;
833 av = (vec_uchar16)(a);
835 return ((qword)(vec_add(vec_perm(nib_cnt, nib_cnt, av),
836 vec_perm(nib_cnt, nib_cnt, vec_sr (av, four)))));
839 /* Count leading zeros.  */
841 static __inline qword si_clz(qword a)
843 vec_uchar16 av;
844 vec_uchar16 cnt_hi, cnt_lo, cnt, tmp1, tmp2, tmp3;
845 vec_uchar16 four = vec_splat_u8(4);
846 vec_uchar16 nib_cnt = (vec_uchar16){4, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0};
847 vec_uchar16 eight = vec_splat_u8(8);
848 vec_uchar16 sixteen = (vec_uchar16){16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16};
849 vec_uchar16 twentyfour = (vec_uchar16){24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24};
851 av = (vec_uchar16)(a);
853 cnt_hi = vec_perm(nib_cnt, nib_cnt, vec_sr(av, four));
854 cnt_lo = vec_perm(nib_cnt, nib_cnt, av);
856 cnt = vec_add(cnt_hi, vec_and(cnt_lo, vec_cmpeq(cnt_hi, four)));
858 tmp1 = (vec_uchar16)vec_sl((vec_uint4)(cnt), (vec_uint4)(eight));
859 tmp2 = (vec_uchar16)vec_sl((vec_uint4)(cnt), (vec_uint4)(sixteen));
860 tmp3 = (vec_uchar16)vec_sl((vec_uint4)(cnt), (vec_uint4)(twentyfour));
862 cnt = vec_add(cnt, vec_and(tmp1, vec_cmpeq(cnt, eight)));
863 cnt = vec_add(cnt, vec_and(tmp2, vec_cmpeq(cnt, sixteen)));
864 cnt = vec_add(cnt, vec_and(tmp3, vec_cmpeq(cnt, twentyfour)));
866 return (qword)((vec_sr((vec_uint4)(cnt), (vec_uint4)(twentyfour))));
869 /* Convert to float
871 #define si_cuflt(_a, _b) ((qword)(vec_ctf((vec_uint4)(_a), _b)))
872 #define si_csflt(_a, _b) ((qword)(vec_ctf((vec_int4)(_a), _b)))
874 /* Convert to signed int
876 #define si_cflts(_a, _b) ((qword)(vec_cts((vec_float4)(_a), _b)))
878 /* Convert to unsigned int
880 #define si_cfltu(_a, _b) ((qword)(vec_ctu((vec_float4)(_a), _b)))
882 /* Synchronize
884 #define si_dsync() /* do nothing */
885 #define si_sync() /* do nothing */
886 #define si_syncc() /* do nothing */
889 /* Equivalence
891 static __inline qword si_eqv(qword a, qword b)
893 vec_uchar16 d;
895 d = vec_xor((vec_uchar16)(a), (vec_uchar16)(b));
896 return ((qword)(vec_nor(d, d)));
899 /* Extend
901 static __inline qword si_xsbh(qword a)
903 vec_char16 av;
905 av = (vec_char16)(a);
906 return ((qword)(vec_unpackh(vec_perm(av, av, ((vec_uchar16){1, 3, 5, 7, 9,11,13,15,
907 0, 0, 0, 0, 0, 0, 0, 0})))));
910 static __inline qword si_xshw(qword a)
912 vec_short8 av;
914 av = (vec_short8)(a);
915 return ((qword)(vec_unpackh(vec_perm(av, av, ((vec_uchar16){2, 3, 6, 7,
916 10,11,14,15,
917 0, 0, 0, 0,
918 0, 0, 0, 0})))));
921 static __inline qword si_xswd(qword a)
923 vec_int4 av;
925 av = (vec_int4)(a);
926 return ((qword)(vec_perm(av, vec_sra(av, ((vec_uint4){31,31,31,31})),
927 ((vec_uchar16){20, 21, 22, 23,
928 4, 5, 6, 7,
929 28, 29, 30, 31,
930 12, 13, 14, 15}))));
933 static __inline qword si_fesd(qword a)
935 union {
936 double d[2];
937 vec_double2 vd;
938 } out;
939 union {
940 float f[4];
941 vec_float4 vf;
942 } in;
944 in.vf = (vec_float4)(a);
945 out.d[0] = (double)(in.f[0]);
946 out.d[1] = (double)(in.f[2]);
947 return ((qword)(out.vd));
950 /* Gather
952 static __inline qword si_gbb(qword a)
954 vec_uchar16 bits;
955 vec_uint4 bytes;
957 bits = vec_sl(vec_and((vec_uchar16)(a), vec_splat_u8(1)), ((vec_uchar16){7, 6, 5, 4, 3, 2, 1, 0,
958 7, 6, 5, 4, 3, 2, 1, 0}));
959 bytes = (vec_uint4)vec_sum2s((vec_int4)(vec_sum4s(bits, ((vec_uint4){0}))), ((vec_int4){0}));
961 return ((qword)(vec_perm(bytes, bytes, ((vec_uchar16){0, 0, 7,15, 0, 0, 0, 0,
962 0, 0, 0, 0, 0, 0, 0, 0}))));
966 static __inline qword si_gbh(qword a)
968 vec_ushort8 bits;
969 vec_uint4 bytes;
971 bits = vec_sl(vec_and((vec_ushort8)(a), vec_splat_u16(1)), ((vec_ushort8){7, 6, 5, 4, 3, 2, 1, 0}));
973 bytes = (vec_uint4)vec_sums((vec_int4)(vec_sum4s((vec_short8)(bits), (vec_int4){0})), (vec_int4){0});
975 return ((qword)(vec_sld(bytes, bytes, 12)));
978 static __inline qword si_gb(qword a)
980 vec_uint4 bits;
981 vec_uint4 bytes;
983 bits = vec_sl(vec_and((vec_uint4)(a), vec_splat_u32(1)), ((vec_uint4){3, 2, 1, 0}));
984 bytes = (vec_uint4)vec_sums((vec_int4)(bits), ((vec_int4){0}));
985 return ((qword)(vec_sld(bytes, bytes, 12)));
989 /* Compare and halt
991 static __inline void si_heq(qword a, qword b)
993 union {
994 vector unsigned int v;
995 unsigned int i[4];
996 } aa, bb;
998 aa.v = (vector unsigned int)(a);
999 bb.v = (vector unsigned int)(b);
1001 if (aa.i[0] == bb.i[0]) { SPU_HALT_ACTION; };
1004 static __inline void si_heqi(qword a, unsigned int b)
1006 union {
1007 vector unsigned int v;
1008 unsigned int i[4];
1009 } aa;
1011 aa.v = (vector unsigned int)(a);
1013 if (aa.i[0] == b) { SPU_HALT_ACTION; };
1016 static __inline void si_hgt(qword a, qword b)
1018 union {
1019 vector signed int v;
1020 signed int i[4];
1021 } aa, bb;
1023 aa.v = (vector signed int)(a);
1024 bb.v = (vector signed int)(b);
1026 if (aa.i[0] > bb.i[0]) { SPU_HALT_ACTION; };
1029 static __inline void si_hgti(qword a, signed int b)
1031 union {
1032 vector signed int v;
1033 signed int i[4];
1034 } aa;
1036 aa.v = (vector signed int)(a);
1038 if (aa.i[0] > b) { SPU_HALT_ACTION; };
1041 static __inline void si_hlgt(qword a, qword b)
1043 union {
1044 vector unsigned int v;
1045 unsigned int i[4];
1046 } aa, bb;
1048 aa.v = (vector unsigned int)(a);
1049 bb.v = (vector unsigned int)(b);
1051 if (aa.i[0] > bb.i[0]) { SPU_HALT_ACTION; };
1054 static __inline void si_hlgti(qword a, unsigned int b)
1056 union {
1057 vector unsigned int v;
1058 unsigned int i[4];
1059 } aa;
1061 aa.v = (vector unsigned int)(a);
1063 if (aa.i[0] > b) { SPU_HALT_ACTION; };
1067 /* Multiply and Add
1069 static __inline qword si_mpya(qword a, qword b, qword c)
1071 return ((qword)(vec_msum(vec_and((vec_short8)(a),
1072 ((vec_short8){0, -1, 0, -1, 0, -1, 0, -1})),
1073 (vec_short8)(b), (vec_int4)(c))));
1076 static __inline qword si_fma(qword a, qword b, qword c)
1078 return ((qword)(vec_madd((vec_float4)(a), (vec_float4)(b), (vec_float4)(c))));
1081 static __inline qword si_dfma(qword a, qword b, qword c)
1083 union {
1084 vec_double2 v;
1085 double d[2];
1086 } aa, bb, cc, dd;
1088 aa.v = (vec_double2)(a);
1089 bb.v = (vec_double2)(b);
1090 cc.v = (vec_double2)(c);
1091 dd.d[0] = aa.d[0] * bb.d[0] + cc.d[0];
1092 dd.d[1] = aa.d[1] * bb.d[1] + cc.d[1];
1093 return ((qword)(dd.v));
1096 /* Form Mask
1098 #define si_fsmbi(_a) si_fsmb(si_from_int(_a))
1100 static __inline qword si_fsmb(qword a)
1102 vec_char16 mask;
1103 vec_ushort8 in;
1105 in = (vec_ushort8)(a);
1106 mask = (vec_char16)(vec_perm(in, in, ((vec_uchar16){2, 2, 2, 2, 2, 2, 2, 2,
1107 3, 3, 3, 3, 3, 3, 3, 3})));
1108 return ((qword)(vec_sra(vec_sl(mask, ((vec_uchar16){0, 1, 2, 3, 4, 5, 6, 7,
1109 0, 1, 2, 3, 4, 5, 6, 7})),
1110 vec_splat_u8(7))));
1114 static __inline qword si_fsmh(qword a)
1116 vec_uchar16 in;
1117 vec_short8 mask;
1119 in = (vec_uchar16)(a);
1120 mask = (vec_short8)(vec_splat(in, 3));
1121 return ((qword)(vec_sra(vec_sl(mask, ((vec_ushort8){0, 1, 2, 3, 4, 5, 6, 7})),
1122 vec_splat_u16(15))));
1125 static __inline qword si_fsm(qword a)
1127 vec_uchar16 in;
1128 vec_int4 mask;
1130 in = (vec_uchar16)(a);
1131 mask = (vec_int4)(vec_splat(in, 3));
1132 return ((qword)(vec_sra(vec_sl(mask, ((vec_uint4){28, 29, 30, 31})),
1133 ((vec_uint4){31,31,31,31}))));
1136 /* Move from/to registers
1138 #define si_fscrrd() ((qword)((vec_uint4){0}))
1139 #define si_fscrwr(_a)
1141 #define si_mfspr(_reg) ((qword)((vec_uint4){0}))
1142 #define si_mtspr(_reg, _a)
1144 /* Multiply High High Add
1146 static __inline qword si_mpyhha(qword a, qword b, qword c)
1148 return ((qword)(vec_add(vec_mule((vec_short8)(a), (vec_short8)(b)), (vec_int4)(c))));
1151 static __inline qword si_mpyhhau(qword a, qword b, qword c)
1153 return ((qword)(vec_add(vec_mule((vec_ushort8)(a), (vec_ushort8)(b)), (vec_uint4)(c))));
1156 /* Multiply Subtract
1158 static __inline qword si_fms(qword a, qword b, qword c)
1160 return ((qword)(vec_madd((vec_float4)(a), (vec_float4)(b),
1161 vec_sub(((vec_float4){0.0f}), (vec_float4)(c)))));
1164 static __inline qword si_dfms(qword a, qword b, qword c)
1166 union {
1167 vec_double2 v;
1168 double d[2];
1169 } aa, bb, cc, dd;
1171 aa.v = (vec_double2)(a);
1172 bb.v = (vec_double2)(b);
1173 cc.v = (vec_double2)(c);
1174 dd.d[0] = aa.d[0] * bb.d[0] - cc.d[0];
1175 dd.d[1] = aa.d[1] * bb.d[1] - cc.d[1];
1176 return ((qword)(dd.v));
1179 /* Multiply
1181 static __inline qword si_fm(qword a, qword b)
1183 return ((qword)(vec_madd((vec_float4)(a), (vec_float4)(b), ((vec_float4){0.0f}))));
1186 static __inline qword si_dfm(qword a, qword b)
1188 union {
1189 vec_double2 v;
1190 double d[2];
1191 } aa, bb, dd;
1193 aa.v = (vec_double2)(a);
1194 bb.v = (vec_double2)(b);
1195 dd.d[0] = aa.d[0] * bb.d[0];
1196 dd.d[1] = aa.d[1] * bb.d[1];
1197 return ((qword)(dd.v));
/* Multiply High.  */

/* si_mpyh: multiply the high 16 bits of each word of A by the low 16 bits
   of each word of B, with the product shifted left 16 bits (SPU mpyh).
   Shifting B left 16 first moves its low halfwords into the even (high)
   positions where vec_mule picks them up.  */
static __inline qword si_mpyh(qword a, qword b)
{
  vec_uint4 sixteen = (vec_uint4){16, 16, 16, 16};

  return ((qword)(vec_sl(vec_mule((vec_short8)(a), (vec_short8)(vec_sl((vec_uint4)(b), sixteen))), sixteen)));
}

/* Multiply High High.  */

/* si_mpyhh: signed 16x16->32 product of the high halfwords of each word.  */
static __inline qword si_mpyhh(qword a, qword b)
{
  return ((qword)(vec_mule((vec_short8)(a), (vec_short8)(b))));
}

/* si_mpyhhu: unsigned variant of si_mpyhh.  */
static __inline qword si_mpyhhu(qword a, qword b)
{
  return ((qword)(vec_mule((vec_ushort8)(a), (vec_ushort8)(b))));
}
/* Multiply Odd.  */

/* si_mpy: signed 16x16->32 product of the low (odd-indexed) halfword of
   each word, matching SPU mpy which uses only the low 16 bits.  */
static __inline qword si_mpy(qword a, qword b)
{
  return ((qword)(vec_mulo((vec_short8)(a), (vec_short8)(b))));
}

/* si_mpyu: unsigned variant of si_mpy.  */
static __inline qword si_mpyu(qword a, qword b)
{
  return ((qword)(vec_mulo((vec_ushort8)(a), (vec_ushort8)(b))));
}

/* si_mpyi: multiply the low halfword of each word of A by the immediate
   B, splatted into halfword slot 1 (low half of word 0) of every word.  */
static __inline qword si_mpyi(qword a, short b)
{
  return ((qword)(vec_mulo((vec_short8)(a),
			   vec_splat((vec_short8)(si_from_short(b)), 1))));
}

/* si_mpyui: unsigned variant of si_mpyi.  */
static __inline qword si_mpyui(qword a, unsigned short b)
{
  return ((qword)(vec_mulo((vec_ushort8)(a),
			   vec_splat((vec_ushort8)(si_from_ushort(b)), 1))));
}
/* Multiply and Shift Right.  */

/* si_mpys: signed product of the low halfwords, with each 32-bit product
   shifted right arithmetically by 16 (SPU mpys).  */
static __inline qword si_mpys(qword a, qword b)
{
  return ((qword)(vec_sra(vec_mulo((vec_short8)(a), (vec_short8)(b)), ((vec_uint4){16,16,16,16}))));
}
1253 /* Nand
1255 static __inline qword si_nand(qword a, qword b)
1257 vec_uchar16 d;
1259 d = vec_and((vec_uchar16)(a), (vec_uchar16)(b));
1260 return ((qword)(vec_nor(d, d)));
/* Negative Multiply Add.  */

/* si_dfnma: double-precision -(a*b + c), computed lane-by-lane in scalar
   through a union as -c - a*b (no VMX double-precision vectors).  */
static __inline qword si_dfnma(qword a, qword b, qword c)
{
  union {
    vec_double2 v;
    double d[2];
  } aa, bb, cc, dd;

  aa.v = (vec_double2)(a);
  bb.v = (vec_double2)(b);
  cc.v = (vec_double2)(c);
  dd.d[0] = -cc.d[0] - aa.d[0] * bb.d[0];
  dd.d[1] = -cc.d[1] - aa.d[1] * bb.d[1];
  return ((qword)(dd.v));
}
/* Negative Multiply and Subtract.  */

/* si_fnms: single-precision c - a*b, mapped directly to vec_nmsub
   (which computes -(a*b - c)).  */
static __inline qword si_fnms(qword a, qword b, qword c)
{
  return ((qword)(vec_nmsub((vec_float4)(a), (vec_float4)(b), (vec_float4)(c))));
}

/* si_dfnms: double-precision c - a*b, lane-by-lane in scalar through a
   union (no VMX double-precision vectors).  */
static __inline qword si_dfnms(qword a, qword b, qword c)
{
  union {
    vec_double2 v;
    double d[2];
  } aa, bb, cc, dd;

  aa.v = (vec_double2)(a);
  bb.v = (vec_double2)(b);
  cc.v = (vec_double2)(c);
  dd.d[0] = cc.d[0] - aa.d[0] * bb.d[0];
  dd.d[1] = cc.d[1] - aa.d[1] * bb.d[1];
  return ((qword)(dd.v));
}
1302 /* Nor
1304 static __inline qword si_nor(qword a, qword b)
1306 return ((qword)(vec_nor((vec_uchar16)(a), (vec_uchar16)(b))));
1309 /* Or
1311 static __inline qword si_or(qword a, qword b)
1313 return ((qword)(vec_or((vec_uchar16)(a), (vec_uchar16)(b))));
/* si_orbi: OR the immediate byte B into every byte of A.  The scalar is
   splatted from byte slot 3, where si_from_uchar places it.  */
static __inline qword si_orbi(qword a, unsigned char b)
{
  return ((qword)(vec_or((vec_uchar16)(a),
			 vec_splat((vec_uchar16)(si_from_uchar(b)), 3))));
}

/* si_orhi: OR the immediate halfword B into every halfword of A.  */
static __inline qword si_orhi(qword a, unsigned short b)
{
  return ((qword)(vec_or((vec_ushort8)(a),
			 vec_splat((vec_ushort8)(si_from_ushort(b)), 1))));
}

/* si_ori: OR the immediate word B into every word of A.  */
static __inline qword si_ori(qword a, unsigned int b)
{
  return ((qword)(vec_or((vec_uint4)(a),
			 vec_splat((vec_uint4)(si_from_uint(b)), 0))));
}
/* Or Complement.  */

/* si_orc: A | ~B.  The complement of B is formed with a self-NOR since
   VMX has no vector NOT.  */
static __inline qword si_orc(qword a, qword b)
{
  return ((qword)(vec_or((vec_uchar16)(a), vec_nor((vec_uchar16)(b), (vec_uchar16)(b)))));
}
/* Or Across.  */

/* si_orx: OR the four words of A together, leaving the result in word 0
   and zero in words 1-3.  The two vec_sld folds OR word pairs 8 and 4
   bytes apart into word 0; the final AND clears the other words.  */
static __inline qword si_orx(qword a)
{
  vec_uchar16 tmp;
  tmp = (vec_uchar16)(a);
  tmp = vec_or(tmp, vec_sld(tmp, tmp, 8));
  tmp = vec_or(tmp, vec_sld(tmp, tmp, 4));
  return ((qword)(vec_and(tmp, ((vec_uchar16){0xFF,0xFF,0xFF,0xFF, 0x00,0x00,0x00,0x00,
					      0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00}))));
}
/* Estimates.  */

/* si_frest: single-precision reciprocal estimate.  */
static __inline qword si_frest(qword a)
{
  return ((qword)(vec_re((vec_float4)(a))));
}

/* si_frsqest: single-precision reciprocal square-root estimate.  */
static __inline qword si_frsqest(qword a)
{
  return ((qword)(vec_rsqrte((vec_float4)(a))));
}

/* si_fi: the SPU interpolate step is already folded into the VMX
   estimates above, so just pass the estimate operand through.  */
#define si_fi(_a, _d)		(_d)

/* Channel Read and Write.  */

/* SPU channels have no PPU equivalent: reads yield zero, counts yield
   zero, and writes are discarded.  */
#define si_rdch(_channel)		((qword)(vec_splat_u8(0)))	/* not mappable */
#define si_rchcnt(_channel)		((qword)(vec_splat_u8(0)))	/* not mappable */
#define si_wrch(_channel, _a)		/* not mappable */
/* Rotate Left.  */

/* si_roth: rotate each halfword of A left by the count in the matching
   halfword of B (vec_rl rotates modulo the element width).  */
static __inline qword si_roth(qword a, qword b)
{
  return ((qword)(vec_rl((vec_ushort8)(a), (vec_ushort8)(b))));
}

/* si_rot: rotate each word of A left by the count in the matching word
   of B.  */
static __inline qword si_rot(qword a, qword b)
{
  return ((qword)(vec_rl((vec_uint4)(a), (vec_uint4)(b))));
}

/* si_rothi: rotate each halfword of A left by the immediate B.  */
static __inline qword si_rothi(qword a, int b)
{
  return ((qword)(vec_rl((vec_ushort8)(a),
			 vec_splat((vec_ushort8)(si_from_int(b)), 1))));
}

/* si_roti: rotate each word of A left by the immediate B.  */
static __inline qword si_roti(qword a, int b)
{
  return ((qword)(vec_rl((vec_uint4)(a),
			 vec_splat((vec_uint4)(si_from_int(b)), 0))));
}
/* Rotate Left with Mask, i.e. logical shift right by the negated count.  */

/* si_rothm: shift each halfword of A right by -B.  vec_sr shifts modulo
   16, so MASK -- all ones exactly when bit 4 (value 16) of the negated
   count is set, extracted by the shl-11/sra-15 pair -- clears elements
   whose shift count reaches the element width, as the SPU does.  */
static __inline qword si_rothm(qword a, qword b)
{
  vec_ushort8 neg_b;
  vec_ushort8 mask;

  neg_b = (vec_ushort8)vec_sub(vec_splat_s16(0), (vec_short8)(b));
  mask = vec_sra(vec_sl(neg_b, vec_splat_u16(11)), vec_splat_u16(15));
  return ((qword)(vec_andc(vec_sr((vec_ushort8)(a), neg_b), mask)));
}

/* si_rotm: word variant of si_rothm; bit 5 (value 32) selects the
   clearing mask.  */
static __inline qword si_rotm(qword a, qword b)
{
  vec_uint4 neg_b;
  vec_uint4 mask;

  neg_b = (vec_uint4)vec_sub(vec_splat_s32(0), (vec_int4)(b));
  mask = vec_sra(vec_sl(neg_b, ((vec_uint4){26,26,26,26})), ((vec_uint4){31,31,31,31}));
  return ((qword)(vec_andc(vec_sr((vec_uint4)(a), neg_b), mask)));
}

/* si_rothmi: immediate-count variant of si_rothm (B is negated on the
   scalar side before splatting).  */
static __inline qword si_rothmi(qword a, int b)
{
  vec_ushort8 neg_b;
  vec_ushort8 mask;

  neg_b = vec_splat((vec_ushort8)(si_from_int(-b)), 1);
  mask = vec_sra(vec_sl(neg_b, vec_splat_u16(11)), vec_splat_u16(15));
  return ((qword)(vec_andc(vec_sr((vec_ushort8)(a), neg_b), mask)));
}

/* si_rotmi: immediate-count variant of si_rotm.  */
static __inline qword si_rotmi(qword a, int b)
{
  vec_uint4 neg_b;
  vec_uint4 mask;

  neg_b = vec_splat((vec_uint4)(si_from_int(-b)), 0);
  mask = vec_sra(vec_sl(neg_b, ((vec_uint4){26,26,26,26})), ((vec_uint4){31,31,31,31}));
  return ((qword)(vec_andc(vec_sr((vec_uint4)(a), neg_b), mask)));
}
/* Rotate Left Algebraic with Mask, i.e. arithmetic shift right by the
   negated count.  */

/* si_rotmah: shift each halfword of A right arithmetically by -B.
   Unlike the logical form, an over-wide count must replicate the sign
   bit: OR-ing the all-ones MASK (set when the count is >= 16) into the
   count makes vec_sra, which shifts modulo 16, shift by 15.  */
static __inline qword si_rotmah(qword a, qword b)
{
  vec_ushort8 neg_b;
  vec_ushort8 mask;

  neg_b = (vec_ushort8)vec_sub(vec_splat_s16(0), (vec_short8)(b));
  mask = vec_sra(vec_sl(neg_b, vec_splat_u16(11)), vec_splat_u16(15));
  return ((qword)(vec_sra((vec_short8)(a), (vec_ushort8)vec_or(neg_b, mask))));
}

/* si_rotma: word variant of si_rotmah (mask set when count >= 32,
   saturating the vec_sra count at 31).  */
static __inline qword si_rotma(qword a, qword b)
{
  vec_uint4 neg_b;
  vec_uint4 mask;

  neg_b = (vec_uint4)vec_sub(vec_splat_s32(0), (vec_int4)(b));
  mask = vec_sra(vec_sl(neg_b, ((vec_uint4){26,26,26,26})), ((vec_uint4){31,31,31,31}));
  return ((qword)(vec_sra((vec_int4)(a), (vec_uint4)vec_or(neg_b, mask))));
}

/* si_rotmahi: immediate-count variant of si_rotmah.  */
static __inline qword si_rotmahi(qword a, int b)
{
  vec_ushort8 neg_b;
  vec_ushort8 mask;

  neg_b = vec_splat((vec_ushort8)(si_from_int(-b)), 1);
  mask = vec_sra(vec_sl(neg_b, vec_splat_u16(11)), vec_splat_u16(15));
  return ((qword)(vec_sra((vec_short8)(a), (vec_ushort8)vec_or(neg_b, mask))));
}

/* si_rotmai: immediate-count variant of si_rotma.  */
static __inline qword si_rotmai(qword a, int b)
{
  vec_uint4 neg_b;
  vec_uint4 mask;

  neg_b = vec_splat((vec_uint4)(si_from_int(-b)), 0);
  mask = vec_sra(vec_sl(neg_b, ((vec_uint4){26,26,26,26})), ((vec_uint4){31,31,31,31}));
  return ((qword)(vec_sra((vec_int4)(a), (vec_uint4)vec_or(neg_b, mask))));
}
/* Rotate Left Quadword by Bytes with Mask.  The SPU rotqm* family takes
   a NEGATIVE count: a count of -n shifts the quadword right n bytes.  */

/* si_rotqmbyi: shift the quadword right by -COUNT bytes, zeroing the
   whole result once the byte count reaches 16.  NOTE(review): only
   x.i[3] is initialized; vec_sro reads just bits 121:124 of the shift
   vector, which live in that word, so the rest may stay undefined.  */
static __inline qword si_rotqmbyi(qword a, int count)
{
  union {
    vec_uchar16 v;
    int i[4];
  } x;
  vec_uchar16 mask;

  count = 0 - count;		/* Make the SPU-style negative count positive.  */
  x.i[3] = count << 3;		/* Bytes -> bits for vec_sro.  */
  mask = (count & 0x10) ? vec_splat_u8(0) : vec_splat_u8(-1);

  return ((qword)(vec_and(vec_sro((vec_uchar16)(a), x.v), mask)));
}

/* si_rotqmby: register-count form of si_rotqmbyi.  The (negative) byte
   count in word 0 of COUNT is negated and converted to bits, then byte 3
   (the low byte of word 0 -- assumes big-endian layout) is splatted for
   vec_sro.  CNT & 0x80 detects a shift of 16 or more bytes.  */
static __inline qword si_rotqmby(qword a, qword count)
{
  union {
    vec_uchar16 v;
    int i[4];
  } x;
  int cnt;
  vec_uchar16 mask;

  x.v = (vec_uchar16)(count);
  x.i[0] = cnt = (0 - x.i[0]) << 3;

  x.v = vec_splat(x.v, 3);
  mask = (cnt & 0x80) ? vec_splat_u8(0) : vec_splat_u8(-1);

  return ((qword)(vec_and(vec_sro((vec_uchar16)(a), x.v), mask)));
}
/* Rotate Left Quadword by Bytes.  */

/* si_rotqbyi: rotate the quadword left by COUNT bytes, composed from a
   left byte shift and the complementary right byte shift.  NOTE(review):
   only word 3 of each shift vector is initialized; vec_slo/vec_sro read
   just bits 121:124, which live in that word.  */
static __inline qword si_rotqbyi(qword a, int count)
{
  union {
    vec_uchar16 v;
    int i[4];
  } left, right;

  count <<= 3;			/* Bytes -> bits for slo/sro.  */
  left.i[3] = count;
  right.i[3] = 0 - count;
  return ((qword)(vec_or(vec_slo((vec_uchar16)(a), left.v), vec_sro((vec_uchar16)(a), right.v))));
}

/* si_rotqby: register-count form; the byte count in byte 3 of COUNT is
   splatted and scaled to bits, and the negated count drives the
   complementary right shift.  */
static __inline qword si_rotqby(qword a, qword count)
{
  vec_uchar16 left, right;

  left = vec_sl(vec_splat((vec_uchar16)(count), 3), vec_splat_u8(3));
  right = vec_sub(vec_splat_u8(0), left);
  return ((qword)(vec_or(vec_slo((vec_uchar16)(a), left), vec_sro((vec_uchar16)(a), right))));
}
/* Rotate Left Quadword by Bytes from Bit Count.  */

/* si_rotqbybi: rotate left by (COUNT / 8) bytes, where COUNT (byte 3)
   is a bit count.  vec_slo uses bits 121:124 of LEFT, i.e. count/8 mod
   16 octets; RIGHT = 7 - LEFT gives, in those same bits, the
   complementary 16 - count/8 octets for the right half of the rotate.  */
static __inline qword si_rotqbybi(qword a, qword count)
{
  vec_uchar16 left, right;

  left = vec_splat((vec_uchar16)(count), 3);
  right = vec_sub(vec_splat_u8(7), left);
  return ((qword)(vec_or(vec_slo((vec_uchar16)(a), left), vec_sro((vec_uchar16)(a), right))));
}
/* Rotate Left Quadword by Bits (0-7).  */

/* si_rotqbii: rotate the whole quadword left by the immediate bit count
   COUNT & 7.  The bits rotated out the top are recovered in Y: A is
   byte-shifted right 120 bits so byte 0 lands in byte 15, then that word
   is shifted right 8 - count bits; OR-ing Y with the 128-bit left shift
   completes the rotate.  */
static __inline qword si_rotqbii(qword a, int count)
{
  vec_uchar16 x, y;
  vec_uchar16 result;

  x = vec_splat((vec_uchar16)(si_from_int(count & 7)), 3);
  y = (vec_uchar16)(vec_sr((vec_uint4)vec_sro((vec_uchar16)(a), ((vec_uchar16)((vec_uint4){0,0,0,120}))),
			   (vec_uint4)vec_sub(vec_splat_u8(8), x)));
  result = vec_or(vec_sll((qword)(a), x), y);
  return ((qword)(result));
}

/* si_rotqbi: register-count form of si_rotqbii; the bit count comes from
   byte 3 of COUNT, masked to 0-7.  */
static __inline qword si_rotqbi(qword a, qword count)
{
  vec_uchar16 x, y;
  vec_uchar16 result;

  x = vec_and(vec_splat((vec_uchar16)(count), 3), vec_splat_u8(7));
  y = (vec_uchar16)(vec_sr((vec_uint4)vec_sro((vec_uchar16)(a), ((vec_uchar16)((vec_uint4){0,0,0,120}))),
			   (vec_uint4)vec_sub(vec_splat_u8(8), x)));

  result = vec_or(vec_sll((qword)(a), x), y);
  return ((qword)(result));
}
/* Rotate Left Quadword and Mask by Bits, i.e. shift the quadword right
   by 0-7 bits (the rotqm* count is negative by SPU convention).  */

/* si_rotqmbii: shift right by -COUNT bits; vec_srl uses only the low
   three bits of byte 15 of the splatted negated count.  */
static __inline qword si_rotqmbii(qword a, int count)
{
  return ((qword)(vec_srl((vec_uchar16)(a), vec_splat((vec_uchar16)(si_from_int(0 - count)), 3))));
}

/* si_rotqmbi: register-count form; byte 3 of COUNT is negated and
   splatted for vec_srl.  */
static __inline qword si_rotqmbi(qword a, qword count)
{
  return ((qword)(vec_srl((vec_uchar16)(a), vec_sub(vec_splat_u8(0), vec_splat((vec_uchar16)(count), 3)))));
}
/* Rotate Left Quadword and Mask by Bytes with Bit Count.  */

/* si_rotqmbybi: shift the quadword right by whole bytes taken from the
   (negative, SPU-convention) bit count in word 0 of COUNT.  The low
   three bits are cleared before negation so only whole octets remain;
   CNT & 0x80 detects a shift of 16 bytes or more and zeroes the result.
   Assumes big-endian word layout (byte 3 is the low byte of word 0) --
   as elsewhere in this header.  */
static __inline qword si_rotqmbybi(qword a, qword count)
{
  union {
    vec_uchar16 v;
    int i[4];
  } x;
  int cnt;
  vec_uchar16 mask;

  x.v = (vec_uchar16)(count);
  x.i[0] = cnt = 0 - (x.i[0] & ~7);
  x.v = vec_splat(x.v, 3);
  mask = (cnt & 0x80) ? vec_splat_u8(0) : vec_splat_u8(-1);

  return ((qword)(vec_and(vec_sro((vec_uchar16)(a), x.v), mask)));
}
/* Round Double to Float.  */

/* si_frds: convert the two doubles of A to single precision, placing
   them in word slots 0 and 2 with slots 1 and 3 zeroed, matching the
   SPU frds result layout.  Done in scalar through unions since VMX has
   no double-precision conversions.  */
static __inline qword si_frds(qword a)
{
  union {
    vec_float4 v;
    float f[4];
  } d;
  union {
    vec_double2 v;
    double d[2];
  } in;

  in.v = (vec_double2)(a);
  d.v = (vec_float4){0.0f};
  d.f[0] = (float)in.d[0];
  d.f[2] = (float)in.d[1];

  return ((qword)(d.v));
}
1643 /* Select Bits
1645 static __inline qword si_selb(qword a, qword b, qword c)
1647 return ((qword)(vec_sel((vec_uchar16)(a), (vec_uchar16)(b), (vec_uchar16)(c))));
/* Shuffle Bytes.  */

/* si_shufb: SPU shufb on top of vec_perm.  Pattern bytes with the high
   bit clear select from A/B exactly as vec_perm does.  Pattern bytes
   with the high bit set are SPU special codes that produce constants
   instead: PAT redirects them (via pattern >> 3) into the constant
   vector of the outer perm, so 0x80-0xBF yields 0x00, 0xC0-0xDF yields
   0xFF and 0xE0-0xFF yields 0x80.  */
static __inline qword si_shufb(qword a, qword b, qword pattern)
{
  vec_uchar16 pat;

  pat = vec_sel(((vec_uchar16){0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15}),
		vec_sr((vec_uchar16)(pattern), vec_splat_u8(3)),
		vec_sra((vec_uchar16)(pattern), vec_splat_u8(7)));
  return ((qword)(vec_perm(vec_perm(a, b, pattern),
			   ((vec_uchar16){0, 0, 0, 0, 0, 0, 0, 0,
					  0xFF, 0xFF, 0xFF, 0xFF, 0x80, 0x80, 0x80, 0x80}),
			   pat)));
}
/* Shift Left.  */

/* si_shlh: shift each halfword of A left by the count in B.  vec_sl
   shifts modulo 16, so MASK (all ones when bit 4 of the count is set)
   clears elements whose count reaches the element width, per SPU
   semantics.  */
static __inline qword si_shlh(qword a, qword b)
{
  vec_ushort8 mask;

  mask = (vec_ushort8)vec_sra(vec_sl((vec_ushort8)(b), vec_splat_u16(11)), vec_splat_u16(15));
  return ((qword)(vec_andc(vec_sl((vec_ushort8)(a), (vec_ushort8)(b)), mask)));
}

/* si_shl: word variant of si_shlh (bit 5 of the count selects the
   clearing mask).  */
static __inline qword si_shl(qword a, qword b)
{
  vec_uint4 mask;

  mask = (vec_uint4)vec_sra(vec_sl((vec_uint4)(b), ((vec_uint4){26,26,26,26})), ((vec_uint4){31,31,31,31}));
  return ((qword)(vec_andc(vec_sl((vec_uint4)(a), (vec_uint4)(b)), mask)));
}

/* si_shlhi: immediate-count variant of si_shlh.  */
static __inline qword si_shlhi(qword a, unsigned int b)
{
  vec_ushort8 mask;
  vec_ushort8 bv;

  bv = vec_splat((vec_ushort8)(si_from_int(b)), 1);
  mask = (vec_ushort8)vec_sra(vec_sl(bv, vec_splat_u16(11)), vec_splat_u16(15));
  return ((qword)(vec_andc(vec_sl((vec_ushort8)(a), bv), mask)));
}

/* si_shli: immediate-count variant of si_shl.  */
static __inline qword si_shli(qword a, unsigned int b)
{
  vec_uint4 bv;
  vec_uint4 mask;

  bv = vec_splat((vec_uint4)(si_from_uint(b)), 0);
  mask = (vec_uint4)vec_sra(vec_sl(bv, ((vec_uint4){26,26,26,26})), ((vec_uint4){31,31,31,31}));
  return ((qword)(vec_andc(vec_sl((vec_uint4)(a), bv), mask)));
}
/* Shift Left Quadword by Bits (0-7).  */

/* si_shlqbii: shift the whole quadword left by COUNT bits; vec_sll uses
   only the low three bits of byte 15 of the splatted count.  */
static __inline qword si_shlqbii(qword a, unsigned int count)
{
  vec_uchar16 x;

  x = vec_splat((vec_uchar16)(si_from_uint(count)), 3);
  return ((qword)(vec_sll((vec_uchar16)(a), x)));
}

/* si_shlqbi: register-count form; the bit count comes from byte 3 of
   COUNT.  */
static __inline qword si_shlqbi(qword a, qword count)
{
  vec_uchar16 x;

  x = vec_splat((vec_uchar16)(count), 3);
  return ((qword)(vec_sll((vec_uchar16)(a), x)));
}
/* Shift Left Quadword by Bytes.  */

/* si_shlqbyi: shift the quadword left by COUNT bytes, zeroing the whole
   result once the count reaches 16.  NOTE(review): only x.i[3] is
   initialized; vec_slo reads just bits 121:124 of the shift vector,
   which live in that word.  */
static __inline qword si_shlqbyi(qword a, unsigned int count)
{
  union {
    vec_uchar16 v;
    int i[4];
  } x;
  vec_uchar16 mask;

  x.i[3] = count << 3;		/* Bytes -> bits for vec_slo.  */
  mask = (count & 0x10) ? vec_splat_u8(0) : vec_splat_u8(-1);
  return ((qword)(vec_and(vec_slo((vec_uchar16)(a), x.v), mask)));
}

/* si_shlqby: register-count form.  Byte 3 of COUNT is splatted and
   scaled to bits; CNT & 0x80 (count >= 16 bytes) selects the zeroing
   mask.  */
static __inline qword si_shlqby(qword a, qword count)
{
  union {
    vec_uchar16 v;
    unsigned int i[4];
  } x;
  unsigned int cnt;
  vec_uchar16 mask;

  x.v = vec_sl(vec_splat((vec_uchar16)(count), 3), vec_splat_u8(3));
  cnt = x.i[0];
  mask = (cnt & 0x80) ? vec_splat_u8(0) : vec_splat_u8(-1);
  return ((qword)(vec_and(vec_slo((vec_uchar16)(a), x.v), mask)));
}
/* Shift Left Quadword by Bytes with Bit Count.  */

/* si_shlqbybi: shift the quadword left by (COUNT / 8) bytes, where byte
   3 of COUNT holds a bit count that vec_slo interprets through its
   octet field (bits 121:124).  CNT & 0x80 (128 bits = 16 bytes) selects
   the zeroing mask.  */
static __inline qword si_shlqbybi(qword a, qword count)
{
  union {
    vec_uchar16 v;
    int i[4];
  } x;
  unsigned int cnt;
  vec_uchar16 mask;

  x.v = vec_splat((vec_uchar16)(count), 3);
  cnt = x.i[0];
  mask = (cnt & 0x80) ? vec_splat_u8(0) : vec_splat_u8(-1);
  return ((qword)(vec_and(vec_slo((vec_uchar16)(a), x.v), mask)));
}
/* Stop and Signal.  */

/* SPU stop-and-signal has no PPU equivalent; invoke the user-overridable
   SPU_STOP_ACTION (abort() unless redefined before including this
   header).  */
#define si_stop(_type)		SPU_STOP_ACTION
#define si_stopd(a, b, c)	SPU_STOP_ACTION
/* Subtract.  Note the SPU "subtract from" operand order: the result is
   B - A (second operand minus first).  */

/* si_sfh: halfword B - A.  */
static __inline qword si_sfh(qword a, qword b)
{
  return ((qword)(vec_sub((vec_ushort8)(b), (vec_ushort8)(a))));
}

/* si_sf: word B - A.  */
static __inline qword si_sf(qword a, qword b)
{
  return ((qword)(vec_sub((vec_uint4)(b), (vec_uint4)(a))));
}

/* si_fs: single-precision A - B (floating subtract uses normal order).  */
static __inline qword si_fs(qword a, qword b)
{
  return ((qword)(vec_sub((vec_float4)(a), (vec_float4)(b))));
}

/* si_dfs: double-precision A - B, lane-by-lane in scalar through a
   union (no VMX double-precision vectors).  */
static __inline qword si_dfs(qword a, qword b)
{
  union {
    vec_double2 v;
    double d[2];
  } aa, bb, dd;

  aa.v = (vec_double2)(a);
  bb.v = (vec_double2)(b);
  dd.d[0] = aa.d[0] - bb.d[0];
  dd.d[1] = aa.d[1] - bb.d[1];
  return ((qword)(dd.v));
}

/* si_sfhi: halfword immediate B - A.  */
static __inline qword si_sfhi(qword a, short b)
{
  return ((qword)(vec_sub(vec_splat((vec_short8)(si_from_short(b)), 1),
			  (vec_short8)(a))));
}

/* si_sfi: word immediate B - A.  */
static __inline qword si_sfi(qword a, int b)
{
  return ((qword)(vec_sub(vec_splat((vec_int4)(si_from_int(b)), 0),
			  (vec_int4)(a))));
}
/* Subtract word extended: _b + ~_a + borrow, where the borrow is bit 0
   of each word of _c (the SPU carry/borrow convention).  ~_a is formed
   with a self-NOR.  */
#define si_sfx(_a, _b, _c)	((qword)(vec_add(vec_add((vec_uint4)(_b),		\
							 vec_nor((vec_uint4)(_a), (vec_uint4)(_a))), \
						 vec_and((vec_uint4)(_c), vec_splat_u32(1)))))
/* Sum Bytes into Shorts.  */

/* si_sumb: sum the four bytes of each word of A and of B (vec_sum4s
   with a zero accumulator), then interleave the low halfwords of the B
   sums and A sums to produce the SPU sumb result layout (B sum, A sum
   per word).  */
static __inline qword si_sumb(qword a, qword b)
{
  vec_uint4 zero = (vec_uint4){0};
  vec_ushort8 sum_a, sum_b;

  sum_a = (vec_ushort8)vec_sum4s((vec_uchar16)(a), zero);
  sum_b = (vec_ushort8)vec_sum4s((vec_uchar16)(b), zero);

  return ((qword)(vec_perm(sum_a, sum_b, ((vec_uchar16){18, 19,  2,  3, 22, 23,  6,  7,
							26, 27, 10, 11, 30, 31, 14, 15}))));
}
/* Exclusive OR.  */

/* si_xor: bitwise XOR of A and B.  */
static __inline qword si_xor(qword a, qword b)
{
  return ((qword)(vec_xor((vec_uchar16)(a), (vec_uchar16)(b))));
}

/* si_xorbi: XOR the immediate byte B into every byte of A.  */
static __inline qword si_xorbi(qword a, unsigned char b)
{
  return ((qword)(vec_xor((vec_uchar16)(a),
			  vec_splat((vec_uchar16)(si_from_uchar(b)), 3))));
}

/* si_xorhi: XOR the immediate halfword B into every halfword of A.  */
static __inline qword si_xorhi(qword a, unsigned short b)
{
  return ((qword)(vec_xor((vec_ushort8)(a),
			  vec_splat((vec_ushort8)(si_from_ushort(b)), 1))));
}

/* si_xori: XOR the immediate word B into every word of A.  */
static __inline qword si_xori(qword a, unsigned int b)
{
  return ((qword)(vec_xor((vec_uint4)(a),
			  vec_splat((vec_uint4)(si_from_uint(b)), 0))));
}
/* Generate Controls for Sub-Quadword Insertion.  Each of these builds a
   shuffle-control quadword: the default pattern 0x10..0x1F passes the
   second shuffle operand through unchanged, and the element selected by
   the effective address (A + IMM) is overwritten with indices that pull
   the scalar from its preferred slot in the first operand.  */

/* si_cbd: control for inserting a byte; the target byte gets 0x03 (the
   byte's preferred-slot index).  */
static __inline qword si_cbd(qword a, int imm)
{
  union {
    vec_uint4 v;
    unsigned char c[16];
  } shmask;

  shmask.v = ((vec_uint4){0x10111213, 0x14151617, 0x18191A1B, 0x1C1D1E1F});
  shmask.c[(si_to_uint(a) + (unsigned int)(imm)) & 0xF] = 0x03;
  return ((qword)(shmask.v));
}

/* si_cdd: control for inserting a doubleword (indices 0x00-0x07).  */
static __inline qword si_cdd(qword a, int imm)
{
  union {
    vec_uint4 v;
    unsigned long long ll[2];
  } shmask;

  shmask.v = ((vec_uint4){0x10111213, 0x14151617, 0x18191A1B, 0x1C1D1E1F});
  shmask.ll[((si_to_uint(a) + (unsigned int)(imm)) >> 3) & 0x1] = 0x0001020304050607ULL;
  return ((qword)(shmask.v));
}

/* si_chd: control for inserting a halfword (indices 0x02-0x03).  */
static __inline qword si_chd(qword a, int imm)
{
  union {
    vec_uint4 v;
    unsigned short s[8];
  } shmask;

  shmask.v = ((vec_uint4){0x10111213, 0x14151617, 0x18191A1B, 0x1C1D1E1F});
  shmask.s[((si_to_uint(a) + (unsigned int)(imm)) >> 1) & 0x7] = 0x0203;
  return ((qword)(shmask.v));
}

/* si_cwd: control for inserting a word (indices 0x00-0x03).  */
static __inline qword si_cwd(qword a, int imm)
{
  union {
    vec_uint4 v;
    unsigned int i[4];
  } shmask;

  shmask.v = ((vec_uint4){0x10111213, 0x14151617, 0x18191A1B, 0x1C1D1E1F});
  shmask.i[((si_to_uint(a) + (unsigned int)(imm)) >> 2) & 0x3] = 0x00010203;
  return ((qword)(shmask.v));
}
/* Indexed (x-form) variants of the insertion-control generators: same as
   si_cbd/si_cdd/si_chd/si_cwd, but the effective address is the word sum
   A + B instead of A + immediate.  */

/* si_cbx: byte-insertion control at address A + B.  */
static __inline qword si_cbx(qword a, qword b)
{
  union {
    vec_uint4 v;
    unsigned char c[16];
  } shmask;

  shmask.v = ((vec_uint4){0x10111213, 0x14151617, 0x18191A1B, 0x1C1D1E1F});
  shmask.c[si_to_uint((qword)(vec_add((vec_uint4)(a), (vec_uint4)(b)))) & 0xF] = 0x03;
  return ((qword)(shmask.v));
}

/* si_cdx: doubleword-insertion control at address A + B.  */
static __inline qword si_cdx(qword a, qword b)
{
  union {
    vec_uint4 v;
    unsigned long long ll[2];
  } shmask;

  shmask.v = ((vec_uint4){0x10111213, 0x14151617, 0x18191A1B, 0x1C1D1E1F});
  shmask.ll[(si_to_uint((qword)(vec_add((vec_uint4)(a), (vec_uint4)(b)))) >> 3) & 0x1] = 0x0001020304050607ULL;
  return ((qword)(shmask.v));
}

/* si_chx: halfword-insertion control at address A + B.  */
static __inline qword si_chx(qword a, qword b)
{
  union {
    vec_uint4 v;
    unsigned short s[8];
  } shmask;

  shmask.v = ((vec_uint4){0x10111213, 0x14151617, 0x18191A1B, 0x1C1D1E1F});
  shmask.s[(si_to_uint((qword)(vec_add((vec_uint4)(a), (vec_uint4)(b)))) >> 1) & 0x7] = 0x0203;
  return ((qword)(shmask.v));
}

/* si_cwx: word-insertion control at address A + B.  */
static __inline qword si_cwx(qword a, qword b)
{
  union {
    vec_uint4 v;
    unsigned int i[4];
  } shmask;

  shmask.v = ((vec_uint4){0x10111213, 0x14151617, 0x18191A1B, 0x1C1D1E1F});
  shmask.i[(si_to_uint((qword)(vec_add((vec_uint4)(a), (vec_uint4)(b)))) >> 2) & 0x3] = 0x00010203;
  return ((qword)(shmask.v));
}
/* Constant Formation.  */

/* si_il: splat the sign-extended 16-bit immediate into every word.  */
static __inline qword si_il(signed short imm)
{
  return ((qword)(vec_splat((vec_int4)(si_from_int((signed int)(imm))), 0)));
}

/* si_ila: splat the (up to 18-bit on SPU) unsigned immediate into every
   word.  */
static __inline qword si_ila(unsigned int imm)
{
  return ((qword)(vec_splat((vec_uint4)(si_from_uint(imm)), 0)));
}

/* si_ilh: splat the immediate into every halfword.  */
static __inline qword si_ilh(signed short imm)
{
  return ((qword)(vec_splat((vec_short8)(si_from_short(imm)), 1)));
}

/* si_ilhu: splat the immediate, shifted into the upper halfword, into
   every word.  */
static __inline qword si_ilhu(signed short imm)
{
  return ((qword)(vec_splat((vec_uint4)(si_from_uint((unsigned int)(imm) << 16)), 0)));
}

/* si_iohl: OR the 16-bit immediate into the low half of every word of
   A (immediate-or-halfword-lower).  */
static __inline qword si_iohl(qword a, unsigned short imm)
{
  return ((qword)(vec_or((vec_uint4)(a), vec_splat((vec_uint4)(si_from_uint((unsigned int)(imm))), 0))));
}
/* No Operation.  SPU scheduling nops are meaningless on the PPU; expand
   to nothing.  */
#define si_lnop()		/* do nothing */
#define si_nop()		/* do nothing */
/* Memory Load and Store.  vec_ld/vec_st compute the effective address as
   offset + pointer and ignore its low four bits, so the 16-byte
   alignment truncation of the SPU lq/stq forms comes for free.  */

/* si_lqa: load the quadword at absolute address IMM.  */
static __inline qword si_lqa(unsigned int imm)
{
  return ((qword)(vec_ld(0, (vector unsigned char *)(imm))));
}

/* si_lqd: load the quadword at (A & ~0xF) + IMM; the address register is
   passed as the vec_ld offset and the immediate as the pointer.  */
static __inline qword si_lqd(qword a, unsigned int imm)
{
  return ((qword)(vec_ld(si_to_uint(a) & ~0xF, (vector unsigned char *)(imm))));
}

/* si_lqr: the PC-relative load degenerates to an absolute load of IMM.  */
static __inline qword si_lqr(unsigned int imm)
{
  return ((qword)(vec_ld(0, (vector unsigned char *)(imm))));
}

/* si_lqx: load the quadword at address A + B (word-0 sum).  */
static __inline qword si_lqx(qword a, qword b)
{
  return ((qword)(vec_ld(si_to_uint((qword)(vec_add((vec_uint4)(a), (vec_uint4)(b)))), (vector unsigned char *)(0))));
}

/* si_stqa: store A to absolute address IMM.  */
static __inline void si_stqa(qword a, unsigned int imm)
{
  vec_st((vec_uchar16)(a), 0, (vector unsigned char *)(imm));
}

/* si_stqd: store A to (B & ~0xF) + IMM.  */
static __inline void si_stqd(qword a, qword b, unsigned int imm)
{
  vec_st((vec_uchar16)(a), si_to_uint(b) & ~0xF, (vector unsigned char *)(imm));
}

/* si_stqr: the PC-relative store degenerates to an absolute store to
   IMM.  */
static __inline void si_stqr(qword a, unsigned int imm)
{
  vec_st((vec_uchar16)(a), 0, (vector unsigned char *)(imm));
}

/* si_stqx: store A to address B + C (word-0 sum).  */
static __inline void si_stqx(qword a, qword b, qword c)
{
  vec_st((vec_uchar16)(a),
	 si_to_uint((qword)(vec_add((vec_uint4)(b), (vec_uint4)(c)))),
	 (vector unsigned char *)(0));
}
2048 #endif /* !__SPU__ */
2049 #endif /* !_SI2VMX_H_ */