/* Copyright (C) 2006, 2007 Free Software Foundation, Inc.

   This file is free software; you can redistribute it and/or modify it under
   the terms of the GNU General Public License as published by the Free
   Software Foundation; either version 2 of the License, or (at your option)
   any later version.

   This file is distributed in the hope that it will be useful, but WITHOUT
   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
   FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
   for more details.

   You should have received a copy of the GNU General Public License
   along with this file; see the file COPYING.  If not, write to the Free
   Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
   02110-1301, USA.  */

/* As a special exception, if you include this header file into source files
   compiled by GCC, this header file does not by itself cause the resulting
   executable to be covered by the GNU General Public License.  This exception
   does not however invalidate any other reasons why the executable file might be
   covered by the GNU General Public License.  */

#ifndef _VMX2SPU_H_
#define _VMX2SPU_H_ 1

#ifdef __cplusplus

#ifdef __SPU__

#include <spu_intrinsics.h>
#include <vec_types.h>

/* This file maps generic VMX intrinsics and predicates to the SPU using
 * overloaded C++ functions.
 */
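
/* Usage sketch (illustrative only, not part of the original header): when
 * this file is included from C++ code compiled for the SPU, VMX/AltiVec-style
 * source such as
 *
 *   static vec_float4 fma4(vec_float4 a, vec_float4 b, vec_float4 c)
 *   {
 *     return vec_madd(a, b, c);   // resolves to spu_madd below
 *   }
 *
 * builds unchanged, because each vec_* name is mapped onto the corresponding
 * spu_* intrinsic or an equivalent SPU instruction sequence.
 */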

/************************************************************************
 *                          INTRINSICS
 ************************************************************************/

/* vec_abs (vector absolute value)
 * =======
 */
static inline vec_char16 vec_abs(vec_char16 a)
{
  vec_char16 minus_a;

  minus_a = (vec_char16)(spu_add((vec_ushort8)(spu_and(spu_xor(a, 0xFF), 0x7F)), 0x101));
  return (spu_sel(minus_a, a, spu_cmpgt(a, -1)));
}

static inline vec_short8 vec_abs(vec_short8 a)
{
  return (spu_sel(spu_sub(0, a), a, spu_cmpgt(a, -1)));
}

static inline vec_int4 vec_abs(vec_int4 a)
{
  return (spu_sel(spu_sub(0, a), a, spu_cmpgt(a, -1)));
}

static inline vec_float4 vec_abs(vec_float4 a)
{
  return ((vec_float4)(spu_rlmask(spu_sl((vec_uint4)(a), 1), -1)));
}

/* vec_abss (vector absolute value saturate)
 * ========
 */
static inline vec_char16 vec_abss(vec_char16 a)
{
  vec_char16 minus_a;

  minus_a = (vec_char16)spu_add((vec_short8)(spu_xor(a, -1)),
                                (vec_short8)(spu_and(spu_cmpgt((vec_uchar16)(a), 0x80), 1)));
  return (spu_sel(minus_a, a, spu_cmpgt(a, -1)));
}

static inline vec_short8 vec_abss(vec_short8 a)
{
  vec_short8 minus_a;

  minus_a = spu_add(spu_sub(0, a), (vec_short8)(spu_cmpeq(a, ((vec_short8){0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000}))));
  return (spu_sel(minus_a, a, spu_cmpgt(a, -1)));
}

static inline vec_int4 vec_abss(vec_int4 a)
{
  vec_int4 minus_a;

  minus_a = spu_add(spu_sub(0, a), (vec_int4)(spu_cmpeq(a, ((vec_int4){0x80000000,0x80000000,0x80000000,0x80000000}))));
  return (spu_sel(minus_a, a, spu_cmpgt(a, -1)));
}

/* vec_add (vector add)
 * =======
 */
static inline vec_uchar16 vec_add(vec_uchar16 a, vec_uchar16 b)
{
  return ((vec_uchar16)(spu_sel(spu_add((vec_ushort8)(a), (vec_ushort8)(b)),
                                spu_add(spu_and((vec_ushort8)(a), 0xFF00), spu_and((vec_ushort8)(b), 0xFF00)),
                                spu_splats((unsigned short)(0xFF00)))));
}

static inline vec_char16 vec_add(vec_char16 a, vec_char16 b)
{
  return ((vec_char16)vec_add((vec_uchar16)(a), (vec_uchar16)(b)));
}

static inline vec_char16 vec_add(vec_bchar16 a, vec_char16 b)
{
  return ((vec_char16)vec_add((vec_uchar16)(a), (vec_uchar16)(b)));
}

static inline vec_char16 vec_add(vec_char16 a, vec_bchar16 b)
{
  return ((vec_char16)vec_add((vec_uchar16)(a), (vec_uchar16)(b)));
}

static inline vec_ushort8 vec_add(vec_ushort8 a, vec_ushort8 b)
{
  return (spu_add(a, b));
}

static inline vec_short8 vec_add(vec_short8 a, vec_short8 b)
{
  return (spu_add(a, b));
}

static inline vec_short8 vec_add(vec_bshort8 a, vec_short8 b)
{
  return (spu_add((vec_short8)(a), b));
}

static inline vec_short8 vec_add(vec_short8 a, vec_bshort8 b)
{
  return (spu_add(a, (vec_short8)(b)));
}

static inline vec_uint4 vec_add(vec_uint4 a, vec_uint4 b)
{
  return (spu_add(a, b));
}

static inline vec_int4 vec_add(vec_int4 a, vec_int4 b)
{
  return (spu_add(a, b));
}

static inline vec_int4 vec_add(vec_bint4 a, vec_int4 b)
{
  return (spu_add((vec_int4)(a), b));
}

static inline vec_int4 vec_add(vec_int4 a, vec_bint4 b)
{
  return (spu_add(a, (vec_int4)(b)));
}

static inline vec_float4 vec_add(vec_float4 a, vec_float4 b)
{
  return (spu_add(a, b));
}

/* vec_addc (vector add carryout unsigned word)
 * ========
 */
#define vec_addc(_a, _b) spu_genc(_a, _b)

/* vec_adds (vector add saturated)
 * ========
 */
static inline vec_uchar16 vec_adds(vec_uchar16 a, vec_uchar16 b)
{
  vec_uchar16 s1, s2, s, d;

  s1 = (vec_uchar16)(spu_add(spu_rlmask((vec_ushort8)(a), -8), spu_rlmask((vec_ushort8)(b), -8)));
  s2 = (vec_uchar16)(spu_add(spu_and((vec_ushort8)(a), 0xFF), spu_and((vec_ushort8)(b), 0xFF)));
  s  = spu_shuffle(s1, s2, ((vec_uchar16){0, 16, 2, 18, 4, 20, 6, 22,
                                          8, 24, 10, 26, 12, 28, 14, 30}));
  d  = spu_shuffle(s1, s2, ((vec_uchar16){1, 17, 3, 19, 5, 21, 7, 23,
                                          9, 25, 11, 27, 13, 29, 15, 31}));
  return (spu_or(d, spu_cmpeq(s, 1)));
}

static inline vec_char16 vec_adds(vec_char16 a, vec_char16 b)
{
  vec_uchar16 s1, s2, s, d;

  s1 = (vec_uchar16)(spu_add(spu_rlmask((vec_ushort8)(a), -8), spu_rlmask((vec_ushort8)(b), -8)));
  s2 = (vec_uchar16)(spu_add(spu_and((vec_ushort8)(a), 0xFF), spu_and((vec_ushort8)(b), 0xFF)));
  s  = spu_shuffle(s1, s2, ((vec_uchar16){1, 17, 3, 19, 5, 21, 7, 23,
                                          9, 25, 11, 27, 13, 29, 15, 31}));
  d  = spu_sel(s, spu_splats((unsigned char)0x7F), spu_cmpgt(spu_and(s, (vec_uchar16)(spu_nor(a, b))), 0x7F));
  d  = spu_sel(d, spu_splats((unsigned char)0x80), spu_cmpgt(spu_nor(s, (vec_uchar16)(spu_nand(a, b))), 0x7F));
  return ((vec_char16)(d));
}

static inline vec_char16 vec_adds(vec_bchar16 a, vec_char16 b)
{
  return (vec_adds((vec_char16)(a), b));
}

static inline vec_char16 vec_adds(vec_char16 a, vec_bchar16 b)
{
  return (vec_adds(a, (vec_char16)(b)));
}

static inline vec_ushort8 vec_adds(vec_ushort8 a, vec_ushort8 b)
{
  vec_ushort8 s, d;

  s = spu_add(a, b);
  d = spu_or(s, spu_rlmaska(spu_sel(spu_xor(s, -1), a, spu_eqv(a, b)), -15));
  return (d);
}

static inline vec_short8 vec_adds(vec_short8 a, vec_short8 b)
{
  vec_short8 s, d;

  s = spu_add(a, b);
  d = spu_sel(s, spu_splats((signed short)0x7FFF), (vec_ushort8)(spu_rlmaska(spu_and(s, spu_nor(a, b)), -15)));
  d = spu_sel(d, spu_splats((signed short)0x8000), (vec_ushort8)(spu_rlmaska(spu_nor(s, spu_nand(a, b)), -15)));
  return (d);
}

static inline vec_short8 vec_adds(vec_bshort8 a, vec_short8 b)
{
  return (vec_adds((vec_short8)(a), b));
}

static inline vec_short8 vec_adds(vec_short8 a, vec_bshort8 b)
{
  return (vec_adds(a, (vec_short8)(b)));
}

static inline vec_uint4 vec_adds(vec_uint4 a, vec_uint4 b)
{
  return (spu_or(spu_add(a, b), spu_rlmaska(spu_sl(spu_genc(a, b), 31), -31)));
}

static inline vec_int4 vec_adds(vec_int4 a, vec_int4 b)
{
  vec_int4 s, d;

  s = spu_add(a, b);
  d = spu_sel(s, spu_splats((signed int)0x7FFFFFFF), (vec_uint4)spu_rlmaska(spu_and(s, spu_nor(a, b)), -31));
  d = spu_sel(d, spu_splats((signed int)0x80000000), (vec_uint4)spu_rlmaska(spu_nor(s, spu_nand(a, b)), -31));
  return (d);
}

static inline vec_int4 vec_adds(vec_bint4 a, vec_int4 b)
{
  return (vec_adds((vec_int4)(a), b));
}

static inline vec_int4 vec_adds(vec_int4 a, vec_bint4 b)
{
  return (vec_adds(a, (vec_int4)(b)));
}

/* vec_and (vector logical and)
 * =======
 */
static inline vec_uchar16 vec_and(vec_uchar16 a, vec_uchar16 b)
{
  return (spu_and(a, b));
}

static inline vec_char16 vec_and(vec_char16 a, vec_char16 b)
{
  return (spu_and(a, b));
}

static inline vec_char16 vec_and(vec_bchar16 a, vec_char16 b)
{
  return (spu_and((vec_char16)(a), b));
}

static inline vec_char16 vec_and(vec_char16 a, vec_bchar16 b)
{
  return (spu_and(a, (vec_char16)(b)));
}

static inline vec_ushort8 vec_and(vec_ushort8 a, vec_ushort8 b)
{
  return (spu_and(a, b));
}

static inline vec_short8 vec_and(vec_short8 a, vec_short8 b)
{
  return (spu_and(a, b));
}

static inline vec_short8 vec_and(vec_bshort8 a, vec_short8 b)
{
  return (spu_and((vec_short8)(a), b));
}

static inline vec_short8 vec_and(vec_short8 a, vec_bshort8 b)
{
  return (spu_and(a, (vec_short8)(b)));
}

static inline vec_uint4 vec_and(vec_uint4 a, vec_uint4 b)
{
  return (spu_and(a, b));
}

static inline vec_int4 vec_and(vec_int4 a, vec_int4 b)
{
  return (spu_and(a, b));
}

static inline vec_int4 vec_and(vec_bint4 a, vec_int4 b)
{
  return (spu_and((vec_int4)(a), b));
}

static inline vec_int4 vec_and(vec_int4 a, vec_bint4 b)
{
  return (spu_and(a, (vec_int4)(b)));
}

static inline vec_float4 vec_and(vec_float4 a, vec_float4 b)
{
  return (spu_and(a, b));
}

static inline vec_float4 vec_and(vec_bint4 a, vec_float4 b)
{
  return (spu_and((vec_float4)(a), b));
}

static inline vec_float4 vec_and(vec_float4 a, vec_bint4 b)
{
  return (spu_and(a, (vec_float4)(b)));
}

/* vec_andc (vector logical and with complement)
 * ========
 */
static inline vec_uchar16 vec_andc(vec_uchar16 a, vec_uchar16 b)
{
  return (spu_andc(a, b));
}

static inline vec_char16 vec_andc(vec_char16 a, vec_char16 b)
{
  return (spu_andc(a, b));
}

static inline vec_char16 vec_andc(vec_bchar16 a, vec_char16 b)
{
  return (spu_andc((vec_char16)(a), b));
}

static inline vec_char16 vec_andc(vec_char16 a, vec_bchar16 b)
{
  return (spu_andc(a, (vec_char16)(b)));
}

static inline vec_ushort8 vec_andc(vec_ushort8 a, vec_ushort8 b)
{
  return (spu_andc(a, b));
}

static inline vec_short8 vec_andc(vec_short8 a, vec_short8 b)
{
  return (spu_andc(a, b));
}

static inline vec_short8 vec_andc(vec_bshort8 a, vec_short8 b)
{
  return (spu_andc((vec_short8)(a), b));
}

static inline vec_short8 vec_andc(vec_short8 a, vec_bshort8 b)
{
  return (spu_andc(a, (vec_short8)(b)));
}

static inline vec_uint4 vec_andc(vec_uint4 a, vec_uint4 b)
{
  return (spu_andc(a, b));
}

static inline vec_int4 vec_andc(vec_int4 a, vec_int4 b)
{
  return (spu_andc(a, b));
}

static inline vec_int4 vec_andc(vec_bint4 a, vec_int4 b)
{
  return (spu_andc((vec_int4)(a), b));
}

static inline vec_int4 vec_andc(vec_int4 a, vec_bint4 b)
{
  return (spu_andc(a, (vec_int4)(b)));
}

static inline vec_float4 vec_andc(vec_float4 a, vec_float4 b)
{
  return (spu_andc(a, b));
}

static inline vec_float4 vec_andc(vec_bint4 a, vec_float4 b)
{
  return (spu_andc((vec_float4)(a), b));
}

static inline vec_float4 vec_andc(vec_float4 a, vec_bint4 b)
{
  return (spu_andc(a, (vec_float4)(b)));
}

/* vec_avg (vector average)
 * =======
 */
static inline vec_uchar16 vec_avg(vec_uchar16 a, vec_uchar16 b)
{
  return (spu_avg(a, b));
}

static inline vec_char16 vec_avg(vec_char16 a, vec_char16 b)
{
  return ((vec_char16)(spu_xor(spu_avg((vec_uchar16)(a), (vec_uchar16)(b)),
                               (vec_uchar16)(spu_and(spu_xor(a, b), 0x80)))));
}

static inline vec_ushort8 vec_avg(vec_ushort8 a, vec_ushort8 b)
{
  return (spu_add(spu_add(spu_rlmask(a, -1), spu_rlmask(b, -1)),
                  spu_and(spu_or(a, b), 1)));
}

static inline vec_short8 vec_avg(vec_short8 a, vec_short8 b)
{
  return (spu_add(spu_add(spu_rlmaska(a, -1), spu_rlmaska(b, -1)),
                  spu_and(spu_or(a, b), 1)));
}

static inline vec_uint4 vec_avg(vec_uint4 a, vec_uint4 b)
{
  return (spu_add(spu_add(spu_rlmask(a, -1), spu_rlmask(b, -1)),
                  spu_and(spu_or(a, b), 1)));
}

static inline vec_int4 vec_avg(vec_int4 a, vec_int4 b)
{
  return (spu_add(spu_add(spu_rlmaska(a, -1), spu_rlmaska(b, -1)),
                  spu_and(spu_or(a, b), 1)));
}

/* vec_ceil (vector ceiling)
 * ========
 */
static inline vec_float4 vec_ceil(vec_float4 a)
{
  vec_int4  exp;
  vec_uint4 mask;

  a = spu_add(a, (vec_float4)(spu_and(spu_xor(spu_rlmaska((vec_int4)a, -31), -1), spu_splats((signed int)0x3F7FFFFF))));
  exp = spu_sub(127, (vec_int4)(spu_and(spu_rlmask((vec_uint4)(a), -23), 0xFF)));
  mask = spu_rlmask(spu_splats((unsigned int)0x7FFFFF), exp);
  mask = spu_sel(spu_splats((unsigned int)0), mask, spu_cmpgt(exp, -31));
  mask = spu_or(mask, spu_xor((vec_uint4)(spu_rlmaska(spu_add(exp, -1), -31)), -1));

  return ((vec_float4)(spu_andc((vec_uint4)(a), mask)));
}

/* vec_cmpb (vector compare bounds floating-point)
 * ========
 */
static inline vec_int4 vec_cmpb(vec_float4 a, vec_float4 b)
{
  vec_int4 b0 = (vec_int4)spu_splats(0x80000000);
  vec_int4 b1 = (vec_int4)spu_splats(0x40000000);

  return (spu_or(spu_and((vec_int4)spu_cmpgt(a, b), b0),
                 spu_and((vec_int4)spu_cmpgt(spu_xor(b, (vec_float4)(b0)), a), b1)));
}

/* vec_cmpeq (vector compare equal)
 * =========
 */
#define vec_cmpeq(_a, _b) spu_cmpeq(_a, _b)

/* vec_cmpge (vector compare greater than or equal)
 * =========
 */
static inline vec_bint4 vec_cmpge(vec_float4 a, vec_float4 b)
{
  return (spu_xor(spu_cmpgt(b, a), -1));
}

/* vec_cmpgt (vector compare greater than)
 * =========
 */
#define vec_cmpgt(_a, _b) spu_cmpgt(_a, _b)

/* vec_cmple (vector compare less than or equal)
 * =========
 */
static inline vec_bint4 vec_cmple(vec_float4 a, vec_float4 b)
{
  return (spu_xor(spu_cmpgt(a, b), -1));
}

/* vec_cmplt (vector compare less than)
 * =========
 */
#define vec_cmplt(_a, _b) spu_cmpgt(_b, _a)

/* vec_ctf (vector convert from fixed-point word)
 * =======
 */
#define vec_ctf(_a, _b) spu_convtf(_a, _b)

/* vec_cts (vector convert to signed fixed-point word saturate)
 * =======
 */
#define vec_cts(_a, _b) spu_convts(_a, _b)

/* vec_ctu (vector convert to unsigned fixed-point word saturate)
 * =======
 */
#define vec_ctu(_a, _b) spu_convtu(_a, _b)

/* vec_dss (vector data stream stop)
 * =======
 */
#define vec_dss(_a)

/* vec_dssall (vector data stream stop all)
 * ==========
 */
#define vec_dssall()

/* vec_dst (vector data stream touch)
 * =======
 */
#define vec_dst(_a, _b, _c)

/* vec_dstst (vector data stream touch for store)
 * =========
 */
#define vec_dstst(_a, _b, _c)

/* vec_dststt (vector data stream touch for store transient)
 * ==========
 */
#define vec_dststt(_a, _b, _c)

/* vec_dstt (vector data stream touch transient)
 * ========
 */
#define vec_dstt(_a, _b, _c)

/* vec_expte (vector 2 raised to the exponent estimate floating-point)
 * =========
 */
static inline vec_float4 vec_expte(vec_float4 a)
{
  vec_float4 bias, frac, exp;
  vec_int4 ia;

  bias = (vec_float4)(spu_andc(spu_splats((signed int)0x3F7FFFFF), spu_rlmaska((vec_int4)(a), -31)));
  ia   = spu_convts(spu_add(a, bias), 0);
  frac = spu_sub(spu_convtf(ia, 0), a);
  exp  = (vec_float4)(spu_sl(spu_add(ia, 127), 23));

  return (spu_mul(spu_madd(spu_madd(spu_splats(0.17157287f), frac, spu_splats(-0.67157287f)),
                           frac, spu_splats(1.0f)), exp));
}

/* vec_floor (vector floor)
 * =========
 */
static inline vec_float4 vec_floor(vec_float4 a)
{
  vec_int4  exp;
  vec_uint4 mask;

  a = spu_sub(a, (vec_float4)(spu_and(spu_rlmaska((vec_int4)a, -31), spu_splats((signed int)0x3F7FFFFF))));
  exp = spu_sub(127, (vec_int4)(spu_and(spu_rlmask((vec_uint4)(a), -23), 0xFF)));
  mask = spu_rlmask(spu_splats((unsigned int)0x7FFFFF), exp);
  mask = spu_sel(spu_splats((unsigned int)0), mask, spu_cmpgt(exp, -31));
  mask = spu_or(mask, spu_xor((vec_uint4)(spu_rlmaska(spu_add(exp, -1), -31)), -1));

  return ((vec_float4)(spu_andc((vec_uint4)(a), mask)));
}

/* vec_ld (vector load indexed)
 * ======
 */
static inline vec_uchar16 vec_ld(int a, unsigned char *b)
{
  return (*((vec_uchar16 *)(b+a)));
}

static inline vec_uchar16 vec_ld(int a, vec_uchar16 *b)
{
  return (*((vec_uchar16 *)((unsigned char *)(b)+a)));
}

static inline vec_char16 vec_ld(int a, signed char *b)
{
  return (*((vec_char16 *)(b+a)));
}

static inline vec_char16 vec_ld(int a, vec_char16 *b)
{
  return (*((vec_char16 *)((signed char *)(b)+a)));
}

static inline vec_ushort8 vec_ld(int a, unsigned short *b)
{
  return (*((vec_ushort8 *)((unsigned char *)(b)+a)));
}

static inline vec_ushort8 vec_ld(int a, vec_ushort8 *b)
{
  return (*((vec_ushort8 *)((unsigned char *)(b)+a)));
}

static inline vec_short8 vec_ld(int a, signed short *b)
{
  return (*((vec_short8 *)((unsigned char *)(b)+a)));
}

static inline vec_short8 vec_ld(int a, vec_short8 *b)
{
  return (*((vec_short8 *)((signed char *)(b)+a)));
}

static inline vec_uint4 vec_ld(int a, unsigned int *b)
{
  return (*((vec_uint4 *)((unsigned char *)(b)+a)));
}

static inline vec_uint4 vec_ld(int a, vec_uint4 *b)
{
  return (*((vec_uint4 *)((unsigned char *)(b)+a)));
}

static inline vec_int4 vec_ld(int a, signed int *b)
{
  return (*((vec_int4 *)((unsigned char *)(b)+a)));
}

static inline vec_int4 vec_ld(int a, vec_int4 *b)
{
  return (*((vec_int4 *)((signed char *)(b)+a)));
}

static inline vec_float4 vec_ld(int a, float *b)
{
  return (*((vec_float4 *)((unsigned char *)(b)+a)));
}

static inline vec_float4 vec_ld(int a, vec_float4 *b)
{
  return (*((vec_float4 *)((unsigned char *)(b)+a)));
}

/* vec_lde (vector load element indexed)
 * =======
 */
static inline vec_uchar16 vec_lde(int a, unsigned char *b)
{
  return (*((vec_uchar16 *)(b+a)));
}

static inline vec_char16 vec_lde(int a, signed char *b)
{
  return (*((vec_char16 *)(b+a)));
}

static inline vec_ushort8 vec_lde(int a, unsigned short *b)
{
  return (*((vec_ushort8 *)((unsigned char *)(b)+a)));
}

static inline vec_short8 vec_lde(int a, signed short *b)
{
  return (*((vec_short8 *)((unsigned char *)(b)+a)));
}

static inline vec_uint4 vec_lde(int a, unsigned int *b)
{
  return (*((vec_uint4 *)((unsigned char *)(b)+a)));
}

static inline vec_int4 vec_lde(int a, signed int *b)
{
  return (*((vec_int4 *)((unsigned char *)(b)+a)));
}

static inline vec_float4 vec_lde(int a, float *b)
{
  return (*((vec_float4 *)((unsigned char *)(b)+a)));
}

/* vec_ldl (vector load indexed LRU)
 * =======
 */
#define vec_ldl(_a, _b) vec_ld(_a, _b)

/* vec_loge (vector log2 estimate floating-point)
 * ========
 */
static inline vec_float4 vec_loge(vec_float4 a)
{
  vec_int4 exp;
  vec_float4 frac;

  exp  = spu_add((vec_int4)(spu_and(spu_rlmask((vec_uint4)(a), -23), 0xFF)), -127);
  frac = (vec_float4)(spu_sub((vec_int4)(a), spu_sl(exp, 23)));

  return (spu_madd(spu_madd(spu_splats(-0.33985f), frac, spu_splats(2.01955f)),
                   frac, spu_sub(spu_convtf(exp, 0), spu_splats(1.6797f))));
}

/* vec_lvsl (vector load for shift left)
 * ========
 */
static inline vec_uchar16 vec_lvsl(int a, unsigned char *b)
{
  return ((vec_uchar16)spu_add((vec_ushort8)(spu_splats((unsigned char)((a + (int)(b)) & 0xF))),
                               ((vec_ushort8){0x0001, 0x0203, 0x0405, 0x0607,
                                              0x0809, 0x0A0B, 0x0C0D, 0x0E0F})));
}

static inline vec_uchar16 vec_lvsl(int a, signed char *b)
{
  return (vec_lvsl(a, (unsigned char *)b));
}

static inline vec_uchar16 vec_lvsl(int a, unsigned short *b)
{
  return (vec_lvsl(a, (unsigned char *)b));
}

static inline vec_uchar16 vec_lvsl(int a, short *b)
{
  return (vec_lvsl(a, (unsigned char *)b));
}

static inline vec_uchar16 vec_lvsl(int a, unsigned int *b)
{
  return (vec_lvsl(a, (unsigned char *)b));
}

static inline vec_uchar16 vec_lvsl(int a, int *b)
{
  return (vec_lvsl(a, (unsigned char *)b));
}

static inline vec_uchar16 vec_lvsl(int a, float *b)
{
  return (vec_lvsl(a, (unsigned char *)b));
}

/* vec_lvsr (vector load for shift right)
 * ========
 */
static inline vec_uchar16 vec_lvsr(int a, unsigned char *b)
{
  return ((vec_uchar16)(spu_sub(((vec_ushort8){0x1011, 0x1213, 0x1415, 0x1617,
                                               0x1819, 0x1A1B, 0x1C1D, 0x1E1F}),
                                (vec_ushort8)(spu_splats((unsigned char)((a + (int)(b)) & 0xF))))));
}

static inline vec_uchar16 vec_lvsr(int a, signed char *b)
{
  return (vec_lvsr(a, (unsigned char *)b));
}

static inline vec_uchar16 vec_lvsr(int a, unsigned short *b)
{
  return (vec_lvsr(a, (unsigned char *)b));
}

static inline vec_uchar16 vec_lvsr(int a, short *b)
{
  return (vec_lvsr(a, (unsigned char *)b));
}

static inline vec_uchar16 vec_lvsr(int a, unsigned int *b)
{
  return (vec_lvsr(a, (unsigned char *)b));
}

static inline vec_uchar16 vec_lvsr(int a, int *b)
{
  return (vec_lvsr(a, (unsigned char *)b));
}

static inline vec_uchar16 vec_lvsr(int a, float *b)
{
  return (vec_lvsr(a, (unsigned char *)b));
}

/* vec_madd (vector multiply add)
 * ========
 */
#define vec_madd(_a, _b, _c) spu_madd(_a, _b, _c)

/* vec_madds (vector multiply add saturate)
 * =========
 */
static inline vec_short8 vec_madds(vec_short8 a, vec_short8 b, vec_short8 c)
{
  return (vec_adds(c, spu_sel((vec_short8)(spu_sl(spu_mule(a, b), 1)),
                              (vec_short8)(spu_rlmask(spu_mulo(a, b), -15)),
                              ((vec_ushort8){0, 0xFFFF, 0, 0xFFFF, 0, 0xFFFF, 0, 0xFFFF}))));
}

/* vec_max (vector maximum)
 * =======
 */
static inline vec_uchar16 vec_max(vec_uchar16 a, vec_uchar16 b)
{
  return (spu_sel(b, a, spu_cmpgt(a, b)));
}

static inline vec_char16 vec_max(vec_char16 a, vec_char16 b)
{
  return (spu_sel(b, a, spu_cmpgt(a, b)));
}

static inline vec_char16 vec_max(vec_bchar16 a, vec_char16 b)
{
  return (spu_sel(b, (vec_char16)(a), spu_cmpgt((vec_char16)(a), b)));
}

static inline vec_char16 vec_max(vec_char16 a, vec_bchar16 b)
{
  return (spu_sel((vec_char16)(b), a, spu_cmpgt(a, (vec_char16)(b))));
}

static inline vec_ushort8 vec_max(vec_ushort8 a, vec_ushort8 b)
{
  return (spu_sel(b, a, spu_cmpgt(a, b)));
}

static inline vec_short8 vec_max(vec_short8 a, vec_short8 b)
{
  return (spu_sel(b, a, spu_cmpgt(a, b)));
}

static inline vec_short8 vec_max(vec_bshort8 a, vec_short8 b)
{
  return (spu_sel(b, (vec_short8)(a), spu_cmpgt((vec_short8)(a), b)));
}

static inline vec_short8 vec_max(vec_short8 a, vec_bshort8 b)
{
  return (spu_sel((vec_short8)(b), a, spu_cmpgt(a, (vec_short8)(b))));
}

static inline vec_uint4 vec_max(vec_uint4 a, vec_uint4 b)
{
  return (spu_sel(b, a, spu_cmpgt(a, b)));
}

static inline vec_int4 vec_max(vec_int4 a, vec_int4 b)
{
  return (spu_sel(b, a, spu_cmpgt(a, b)));
}

static inline vec_int4 vec_max(vec_bint4 a, vec_int4 b)
{
  return (spu_sel(b, (vec_int4)(a), spu_cmpgt((vec_int4)(a), b)));
}

static inline vec_int4 vec_max(vec_int4 a, vec_bint4 b)
{
  return (spu_sel((vec_int4)(b), a, spu_cmpgt(a, (vec_int4)(b))));
}

static inline vec_float4 vec_max(vec_float4 a, vec_float4 b)
{
  return (spu_sel(b, a, spu_cmpgt(a, b)));
}

/* vec_mergeh (vector merge high)
 * ==========
 */
static inline vec_uchar16 vec_mergeh(vec_uchar16 a, vec_uchar16 b)
{
  return (spu_shuffle(a, b, ((vec_uchar16){0, 16, 1, 17, 2, 18, 3, 19,
                                           4, 20, 5, 21, 6, 22, 7, 23})));
}

static inline vec_char16 vec_mergeh(vec_char16 a, vec_char16 b)
{
  return (spu_shuffle(a, b, ((vec_uchar16){0, 16, 1, 17, 2, 18, 3, 19,
                                           4, 20, 5, 21, 6, 22, 7, 23})));
}

static inline vec_ushort8 vec_mergeh(vec_ushort8 a, vec_ushort8 b)
{
  return (spu_shuffle(a, b, ((vec_uchar16){0, 1, 16, 17, 2, 3, 18, 19,
                                           4, 5, 20, 21, 6, 7, 22, 23})));
}

static inline vec_short8 vec_mergeh(vec_short8 a, vec_short8 b)
{
  return (spu_shuffle(a, b, ((vec_uchar16){0, 1, 16, 17, 2, 3, 18, 19,
                                           4, 5, 20, 21, 6, 7, 22, 23})));
}

static inline vec_uint4 vec_mergeh(vec_uint4 a, vec_uint4 b)
{
  return (spu_shuffle(a, b, ((vec_uchar16){0, 1, 2, 3, 16, 17, 18, 19,
                                           4, 5, 6, 7, 20, 21, 22, 23})));
}

static inline vec_int4 vec_mergeh(vec_int4 a, vec_int4 b)
{
  return (spu_shuffle(a, b, ((vec_uchar16){0, 1, 2, 3, 16, 17, 18, 19,
                                           4, 5, 6, 7, 20, 21, 22, 23})));
}

static inline vec_float4 vec_mergeh(vec_float4 a, vec_float4 b)
{
  return (spu_shuffle(a, b, ((vec_uchar16){0, 1, 2, 3, 16, 17, 18, 19,
                                           4, 5, 6, 7, 20, 21, 22, 23})));
}

/* vec_mergel (vector merge low)
 * ==========
 */
static inline vec_uchar16 vec_mergel(vec_uchar16 a, vec_uchar16 b)
{
  return (spu_shuffle(a, b, ((vec_uchar16){ 8, 24,  9, 25, 10, 26, 11, 27,
                                           12, 28, 13, 29, 14, 30, 15, 31})));
}

static inline vec_char16 vec_mergel(vec_char16 a, vec_char16 b)
{
  return (spu_shuffle(a, b, ((vec_uchar16){ 8, 24,  9, 25, 10, 26, 11, 27,
                                           12, 28, 13, 29, 14, 30, 15, 31})));
}

static inline vec_ushort8 vec_mergel(vec_ushort8 a, vec_ushort8 b)
{
  return (spu_shuffle(a, b, ((vec_uchar16){ 8,  9, 24, 25, 10, 11, 26, 27,
                                           12, 13, 28, 29, 14, 15, 30, 31})));
}

static inline vec_short8 vec_mergel(vec_short8 a, vec_short8 b)
{
  return (spu_shuffle(a, b, ((vec_uchar16){ 8,  9, 24, 25, 10, 11, 26, 27,
                                           12, 13, 28, 29, 14, 15, 30, 31})));
}

static inline vec_uint4 vec_mergel(vec_uint4 a, vec_uint4 b)
{
  return (spu_shuffle(a, b, ((vec_uchar16){ 8,  9, 10, 11, 24, 25, 26, 27,
                                           12, 13, 14, 15, 28, 29, 30, 31})));
}

static inline vec_int4 vec_mergel(vec_int4 a, vec_int4 b)
{
  return (spu_shuffle(a, b, ((vec_uchar16){ 8,  9, 10, 11, 24, 25, 26, 27,
                                           12, 13, 14, 15, 28, 29, 30, 31})));
}

static inline vec_float4 vec_mergel(vec_float4 a, vec_float4 b)
{
  return (spu_shuffle(a, b, ((vec_uchar16){ 8,  9, 10, 11, 24, 25, 26, 27,
                                           12, 13, 14, 15, 28, 29, 30, 31})));
}

/* vec_mfvscr (vector move from vector status and control register)
 * ==========
 */
static inline vec_ushort8 vec_mfvscr()
{
  return ((vec_ushort8)spu_splats(0));   /* not supported */
}

/* vec_min (vector minimum)
 * =======
 */
static inline vec_uchar16 vec_min(vec_uchar16 a, vec_uchar16 b)
{
  return (spu_sel(a, b, spu_cmpgt(a, b)));
}

static inline vec_char16 vec_min(vec_char16 a, vec_char16 b)
{
  return (spu_sel(a, b, spu_cmpgt(a, b)));
}

static inline vec_char16 vec_min(vec_bchar16 a, vec_char16 b)
{
  return (spu_sel((vec_char16)(a), b, spu_cmpgt((vec_char16)(a), b)));
}

static inline vec_char16 vec_min(vec_char16 a, vec_bchar16 b)
{
  return (spu_sel(a, (vec_char16)(b), spu_cmpgt(a, (vec_char16)(b))));
}

static inline vec_ushort8 vec_min(vec_ushort8 a, vec_ushort8 b)
{
  return (spu_sel(a, b, spu_cmpgt(a, b)));
}

static inline vec_short8 vec_min(vec_short8 a, vec_short8 b)
{
  return (spu_sel(a, b, spu_cmpgt(a, b)));
}

static inline vec_short8 vec_min(vec_bshort8 a, vec_short8 b)
{
  return (spu_sel((vec_short8)(a), b, spu_cmpgt((vec_short8)(a), b)));
}

static inline vec_short8 vec_min(vec_short8 a, vec_bshort8 b)
{
  return (spu_sel(a, (vec_short8)(b), spu_cmpgt(a, (vec_short8)(b))));
}

static inline vec_uint4 vec_min(vec_uint4 a, vec_uint4 b)
{
  return (spu_sel(a, b, spu_cmpgt(a, b)));
}

static inline vec_int4 vec_min(vec_int4 a, vec_int4 b)
{
  return (spu_sel(a, b, spu_cmpgt(a, b)));
}

static inline vec_int4 vec_min(vec_bint4 a, vec_int4 b)
{
  return (spu_sel((vec_int4)(a), b, spu_cmpgt((vec_int4)(a), b)));
}

static inline vec_int4 vec_min(vec_int4 a, vec_bint4 b)
{
  return (spu_sel(a, (vec_int4)(b), spu_cmpgt(a, (vec_int4)(b))));
}

static inline vec_float4 vec_min(vec_float4 a, vec_float4 b)
{
  return (spu_sel(a, b, spu_cmpgt(a, b)));
}

/* vec_mladd (vector multiply low and add unsigned half word)
 * =========
 */
static inline vec_short8 vec_mladd(vec_short8 a, vec_short8 b, vec_short8 c)
{
  return ((vec_short8)(spu_shuffle(spu_madd((vec_short8)(spu_rl((vec_uint4)(a), -16)),
                                            (vec_short8)(spu_rl((vec_uint4)(b), -16)),
                                            (vec_int4)(spu_rl((vec_uint4)(c), -16))),
                                   spu_madd(a, b, spu_extend(c)),
                                   ((vec_uchar16){ 2,  3, 18, 19,  6,  7, 22, 23,
                                                  10, 11, 26, 27, 14, 15, 30, 31}))));
}

static inline vec_ushort8 vec_mladd(vec_ushort8 a, vec_ushort8 b, vec_ushort8 c)
{
  return ((vec_ushort8)(vec_mladd((vec_short8)(a), (vec_short8)(b), (vec_short8)(c))));
}

static inline vec_short8 vec_mladd(vec_ushort8 a, vec_short8 b, vec_short8 c)
{
  return (vec_mladd((vec_short8)(a), b, c));
}

static inline vec_short8 vec_mladd(vec_short8 a, vec_ushort8 b, vec_ushort8 c)
{
  return (vec_mladd(a, (vec_short8)(b), (vec_short8)(c)));
}

/* vec_mradds (vector multiply round and add saturate)
 * ==========
 */
static inline vec_short8 vec_mradds(vec_short8 a, vec_short8 b, vec_short8 c)
{
  vec_int4 round = (vec_int4)spu_splats(0x4000);
  vec_short8 hi, lo;

  hi = (vec_short8)(spu_sl(spu_add(spu_mule(a, b), round), 1));
  lo = (vec_short8)(spu_rlmask(spu_add(spu_mulo(a, b), round), -15));

  return (vec_adds(spu_sel(hi, lo, ((vec_ushort8){0, 0xFFFF, 0, 0xFFFF, 0, 0xFFFF, 0, 0xFFFF})), c));
}

/* vec_msum (vector multiply sum)
 * ========
 */
static inline vec_uint4 vec_msum(vec_uchar16 a, vec_uchar16 b, vec_uint4 c)
{
  vec_ushort8 a1, a2, b1, b2;
  vec_uint4 p1, p2;

  a1 = spu_and((vec_ushort8)(a), 0xFF);
  a2 = spu_rlmask((vec_ushort8)(a), -8);
  b1 = spu_and((vec_ushort8)(b), 0xFF);
  b2 = spu_rlmask((vec_ushort8)(b), -8);

  p1 = spu_add(spu_mulo(a1, b1), spu_mulo(spu_rlqwbyte(a1, -2), spu_rlqwbyte(b1, -2)));
  p2 = spu_add(spu_mulo(a2, b2), spu_mulo(spu_rlqwbyte(a2, -2), spu_rlqwbyte(b2, -2)));
  return (spu_add(p2, spu_add(p1, c)));
}

static inline vec_int4 vec_msum(vec_char16 a, vec_uchar16 b, vec_int4 c)
{
  vec_short8 a1, a2, b1, b2;
  vec_int4 p1, p2;

  a1 = (vec_short8)(spu_extend(a));
  a2 = spu_rlmaska((vec_short8)(a), -8);
  b1 = (vec_short8)(spu_and((vec_ushort8)(b), 0xFF));
  b2 = (vec_short8)spu_rlmask((vec_ushort8)(b), -8);

  p1 = spu_add(spu_mulo(a1, b1), spu_mulo(spu_rlqwbyte(a1, -2), spu_rlqwbyte(b1, -2)));
  p2 = spu_add(spu_mulo(a2, b2), spu_mulo(spu_rlqwbyte(a2, -2), spu_rlqwbyte(b2, -2)));
  return (spu_add(p2, spu_add(p1, c)));
}

static inline vec_uint4 vec_msum(vec_ushort8 a, vec_ushort8 b, vec_uint4 c)
{
  return (spu_add(spu_add(spu_mulo(a, b), spu_mulo(spu_rlqwbyte(a, -2), spu_rlqwbyte(b, -2))), c));
}

static inline vec_int4 vec_msum(vec_short8 a, vec_short8 b, vec_int4 c)
{
  return (spu_add(spu_add(spu_mulo(a, b), spu_mulo(spu_rlqwbyte(a, -2), spu_rlqwbyte(b, -2))), c));
}

/* vec_msums (vector multiply sum saturate)
 * =========
 */
static inline vec_uint4 vec_msums(vec_ushort8 a, vec_ushort8 b, vec_uint4 c)
{
  vec_uint4 p1, p2;

  p1 = spu_mulo(a, b);
  p2 = spu_mulo(spu_rlqwbyte(a, -2), spu_rlqwbyte(b, -2));

  return (vec_adds(p2, vec_adds(p1, c)));
}

static inline vec_int4 vec_msums(vec_short8 a, vec_short8 b, vec_int4 c)
{
  return (vec_adds(spu_add(spu_mulo(a, b), spu_mulo(spu_rlqwbyte(a, -2), spu_rlqwbyte(b, -2))), c));
}

/* vec_mtvscr (vector move to vector status and control register)
 * ==========
 */
#define vec_mtvscr(_a)   /* not supported */

/* vec_mule (vector multiply even)
 * ========
 */
static inline vec_ushort8 vec_mule(vec_uchar16 a, vec_uchar16 b)
{
  vec_ushort8 hi, lo;

  hi = (vec_ushort8)spu_mulo((vec_ushort8)(spu_rlmask((vec_uint4)(a), -24)),
                             (vec_ushort8)(spu_rlmask((vec_uint4)(b), -24)));
  lo = (vec_ushort8)spu_mulo((vec_ushort8)(spu_rlmask((vec_short8)(a), -8)),
                             (vec_ushort8)(spu_rlmask((vec_short8)(b), -8)));

  return (spu_shuffle(hi, lo, ((vec_uchar16){ 2,  3, 18, 19,  6,  7, 22, 23,
                                             10, 11, 26, 27, 14, 15, 30, 31})));
}

static inline vec_short8 vec_mule(vec_char16 a, vec_char16 b)
{
  vec_short8 hi, lo;

  hi = (vec_short8)spu_mulo((vec_short8)(spu_rlmaska((vec_uint4)(a), -24)),
                            (vec_short8)(spu_rlmaska((vec_uint4)(b), -24)));
  lo = (vec_short8)spu_mulo((vec_short8)(spu_rlmaska((vec_short8)(a), -8)),
                            (vec_short8)(spu_rlmaska((vec_short8)(b), -8)));

  return (spu_shuffle(hi, lo, ((vec_uchar16){ 2,  3, 18, 19,  6,  7, 22, 23,
                                             10, 11, 26, 27, 14, 15, 30, 31})));
}

static inline vec_uint4 vec_mule(vec_ushort8 a, vec_ushort8 b)
{
  return (spu_mulo((vec_ushort8)spu_rlmask((vec_uint4)(a), -16),
                   (vec_ushort8)spu_rlmask((vec_uint4)(b), -16)));
}

static inline vec_int4 vec_mule(vec_short8 a, vec_short8 b)
{
  return (spu_mulo((vec_short8)spu_rlmaska((vec_int4)(a), -16),
                   (vec_short8)spu_rlmaska((vec_int4)(b), -16)));
}

/* vec_mulo (vector multiply odd)
 * ========
 */
static inline vec_ushort8 vec_mulo(vec_uchar16 a, vec_uchar16 b)
{
  vec_ushort8 hi, lo;

  hi = (vec_ushort8)spu_mulo((vec_ushort8)(spu_and(spu_rlmask((vec_uint4)(a), -16), 0xFF)),
                             (vec_ushort8)(spu_and(spu_rlmask((vec_uint4)(b), -16), 0xFF)));
  lo = (vec_ushort8)spu_mulo(spu_and((vec_ushort8)(a), 0xFF), spu_and((vec_ushort8)(b), 0xFF));

  return (spu_shuffle(hi, lo, ((vec_uchar16){ 2,  3, 18, 19,  6,  7, 22, 23,
                                             10, 11, 26, 27, 14, 15, 30, 31})));
}

static inline vec_short8 vec_mulo(vec_char16 a, vec_char16 b)
{
  vec_short8 aa, bb, hi, lo;

  aa = spu_extend(a);
  bb = spu_extend(b);

  hi = (vec_short8)spu_mulo((vec_short8)(spu_rlmaska((vec_uint4)(aa), -16)),
                            (vec_short8)(spu_rlmaska((vec_uint4)(bb), -16)));
  lo = (vec_short8)spu_mulo(aa, bb);
  return (spu_shuffle(hi, lo, ((vec_uchar16){ 2,  3, 18, 19,  6,  7, 22, 23,
                                             10, 11, 26, 27, 14, 15, 30, 31})));
}

static inline vec_uint4 vec_mulo(vec_ushort8 a, vec_ushort8 b)
{
  return (spu_mulo(a, b));
}

static inline vec_int4 vec_mulo(vec_short8 a, vec_short8 b)
{
  return (spu_mulo(a, b));
}

/* vec_nmsub (vector negative multiply subtract)
 * =========
 */
#define vec_nmsub(_a, _b, _c) spu_nmsub(_a, _b, _c)

/* vec_nor (vector logical nor)
 * =======
 */
#define vec_nor(_a, _b) spu_nor(_a, _b)

/* vec_or (vector logical or)
 * ======
 */
static inline vec_uchar16 vec_or(vec_uchar16 a, vec_uchar16 b)
{
  return (spu_or(a, b));
}

static inline vec_char16 vec_or(vec_char16 a, vec_char16 b)
{
  return (spu_or(a, b));
}

static inline vec_char16 vec_or(vec_bchar16 a, vec_char16 b)
{
  return (spu_or((vec_char16)(a), b));
}

static inline vec_char16 vec_or(vec_char16 a, vec_bchar16 b)
{
  return (spu_or(a, (vec_char16)(b)));
}

static inline vec_ushort8 vec_or(vec_ushort8 a, vec_ushort8 b)
{
  return (spu_or(a, b));
}

static inline vec_short8 vec_or(vec_short8 a, vec_short8 b)
{
  return (spu_or(a, b));
}

static inline vec_short8 vec_or(vec_bshort8 a, vec_short8 b)
{
  return (spu_or((vec_short8)(a), b));
}

static inline vec_short8 vec_or(vec_short8 a, vec_bshort8 b)
{
  return (spu_or(a, (vec_short8)(b)));
}

static inline vec_uint4 vec_or(vec_uint4 a, vec_uint4 b)
{
  return (spu_or(a, b));
}

static inline vec_int4 vec_or(vec_int4 a, vec_int4 b)
{
  return (spu_or(a, b));
}

static inline vec_int4 vec_or(vec_bint4 a, vec_int4 b)
{
  return (spu_or((vec_int4)(a), b));
}

static inline vec_int4 vec_or(vec_int4 a, vec_bint4 b)
{
  return (spu_or(a, (vec_int4)(b)));
}

static inline vec_float4 vec_or(vec_float4 a, vec_float4 b)
{
  return (spu_or(a, b));
}

static inline vec_float4 vec_or(vec_bint4 a, vec_float4 b)
{
  return (spu_or((vec_float4)(a), b));
}

static inline vec_float4 vec_or(vec_float4 a, vec_bint4 b)
{
  return (spu_or(a, (vec_float4)(b)));
}

/* vec_pack (vector pack)
 * ========
 */
static inline vec_uchar16 vec_pack(vec_ushort8 a, vec_ushort8 b)
{
  return ((vec_uchar16)spu_shuffle(a, b, ((vec_uchar16){ 1,  3,  5,  7,  9, 11, 13, 15,
                                                        17, 19, 21, 23, 25, 27, 29, 31})));
}

static inline vec_char16 vec_pack(vec_short8 a, vec_short8 b)
{
  return ((vec_char16)spu_shuffle(a, b, ((vec_uchar16){ 1,  3,  5,  7,  9, 11, 13, 15,
                                                       17, 19, 21, 23, 25, 27, 29, 31})));
}

static inline vec_ushort8 vec_pack(vec_uint4 a, vec_uint4 b)
{
  return ((vec_ushort8)spu_shuffle(a, b, ((vec_uchar16){ 2,  3,  6,  7, 10, 11, 14, 15,
                                                        18, 19, 22, 23, 26, 27, 30, 31})));
}

static inline vec_short8 vec_pack(vec_int4 a, vec_int4 b)
{
  return ((vec_short8)spu_shuffle(a, b, ((vec_uchar16){ 2,  3,  6,  7, 10, 11, 14, 15,
                                                       18, 19, 22, 23, 26, 27, 30, 31})));
}

/* vec_packpx (vector pack pixel)
 * ==========
 */
static inline vec_pixel8 vec_packpx(vec_uint4 a, vec_uint4 b)
{
  vec_uint4 x03FF = (vec_uint4)(spu_splats((unsigned short)0x03FF));
  vec_uint4 x001F = (vec_uint4)(spu_splats((unsigned short)0x001F));

  return ((vec_pixel8)(spu_shuffle(spu_sel(spu_sel(spu_sl(a, 7), spu_sl(a, 10), x03FF),
                                           spu_sl(a, 13), x001F),
                                   spu_sel(spu_sel(spu_sl(b, 7), spu_sl(b, 10), x03FF),
                                           spu_sl(b, 13), x001F),
                                   ((vec_uchar16){ 0,  1,  4,  5,  8,  9, 12, 13,
                                                  16, 17, 20, 21, 24, 25, 28, 29}))));
}

/* vec_packs (vector pack saturate)
 * =========
 */
static inline vec_uchar16 vec_packs(vec_ushort8 a, vec_ushort8 b)
{
  vec_ushort8 max = spu_splats((unsigned short)0x00FF);

  return ((vec_uchar16)(spu_shuffle(spu_sel(a, max, spu_cmpgt(a, 255)),
                                    spu_sel(b, max, spu_cmpgt(b, 255)),
                                    ((vec_uchar16){ 1,  3,  5,  7,  9, 11, 13, 15,
                                                   17, 19, 21, 23, 25, 27, 29, 31}))));
}

static inline vec_char16 vec_packs(vec_short8 a, vec_short8 b)
{
  vec_short8 max = spu_splats((signed short)0x007F);
  vec_short8 min = spu_splats((signed short)0xFF80);

  return ((vec_char16)(spu_shuffle(spu_sel(min, spu_sel(a, max, spu_cmpgt(a, 127)), spu_cmpgt(a, -128)),
                                   spu_sel(min, spu_sel(b, max, spu_cmpgt(b, 127)), spu_cmpgt(b, -128)),
                                   ((vec_uchar16){ 1,  3,  5,  7,  9, 11, 13, 15,
                                                  17, 19, 21, 23, 25, 27, 29, 31}))));
}

static inline vec_ushort8 vec_packs(vec_uint4 a, vec_uint4 b)
{
  vec_uint4 max = spu_splats((unsigned int)0x0000FFFF);

  return ((vec_ushort8)(spu_shuffle(spu_sel(a, max, spu_cmpgt(a, max)),
                                    spu_sel(b, max, spu_cmpgt(b, max)),
                                    ((vec_uchar16){ 2,  3,  6,  7, 10, 11, 14, 15,
                                                   18, 19, 22, 23, 26, 27, 30, 31}))));
}

static inline vec_short8 vec_packs(vec_int4 a, vec_int4 b)
{
  vec_int4 max = spu_splats((signed int)0x00007FFF);
  vec_int4 min = spu_splats((signed int)0xFFFF8000);

  return ((vec_short8)(spu_shuffle(spu_sel(min, spu_sel(a, max, spu_cmpgt(a, max)), spu_cmpgt(a, min)),
                                   spu_sel(min, spu_sel(b, max, spu_cmpgt(b, max)), spu_cmpgt(b, min)),
                                   ((vec_uchar16){ 2,  3,  6,  7, 10, 11, 14, 15,
                                                  18, 19, 22, 23, 26, 27, 30, 31}))));
}

/* vec_packsu (vector pack saturate unsigned)
 * ==========
 */
static inline vec_uchar16 vec_packsu(vec_ushort8 a, vec_ushort8 b)
{
  return ((vec_uchar16)spu_shuffle(spu_or(a, (vec_ushort8)(spu_cmpgt(a, 255))),
                                   spu_or(b, (vec_ushort8)(spu_cmpgt(b, 255))),
                                   ((vec_uchar16){ 1,  3,  5,  7,  9, 11, 13, 15,
                                                  17, 19, 21, 23, 25, 27, 29, 31})));
}

static inline vec_uchar16 vec_packsu(vec_short8 a, vec_short8 b)
{
  vec_short8 max = spu_splats((signed short)0x00FF);
  vec_short8 min = spu_splats((signed short)0x0000);

  return ((vec_uchar16)(spu_shuffle(spu_sel(min, spu_sel(a, max, spu_cmpgt(a, 255)), spu_cmpgt(a, 0)),
                                    spu_sel(min, spu_sel(b, max, spu_cmpgt(b, 255)), spu_cmpgt(b, 0)),
                                    ((vec_uchar16){ 1,  3,  5,  7,  9, 11, 13, 15,
                                                   17, 19, 21, 23, 25, 27, 29, 31}))));

  return (vec_packsu((vec_ushort8)(a), (vec_ushort8)(b)));
}

static inline vec_ushort8 vec_packsu(vec_uint4 a, vec_uint4 b)
{
  vec_uint4 max = spu_splats((unsigned int)0xFFFF);

  return ((vec_ushort8)spu_shuffle(spu_or(a, (vec_uint4)(spu_cmpgt(a, max))),
                                   spu_or(b, (vec_uint4)(spu_cmpgt(b, max))),
                                   ((vec_uchar16){ 2,  3,  6,  7, 10, 11, 14, 15,
                                                  18, 19, 22, 23, 26, 27, 30, 31})));
}

static inline vec_ushort8 vec_packsu(vec_int4 a, vec_int4 b)
{
  vec_int4 max = spu_splats((signed int)0x0000FFFF);
  vec_int4 min = spu_splats((signed int)0x00000000);

  return ((vec_ushort8)(spu_shuffle(spu_sel(min, spu_sel(a, max, spu_cmpgt(a, max)), spu_cmpgt(a, min)),
                                    spu_sel(min, spu_sel(b, max, spu_cmpgt(b, max)), spu_cmpgt(b, min)),
                                    ((vec_uchar16){ 2,  3,  6,  7, 10, 11, 14, 15,
                                                   18, 19, 22, 23, 26, 27, 30, 31}))));
}

/* vec_perm (vector permute)
 * ========
 */
static inline vec_uchar16 vec_perm(vec_uchar16 a, vec_uchar16 b, vec_uchar16 c)
{
  return (spu_shuffle(a, b, spu_and(c, 0x1F)));
}

static inline vec_char16 vec_perm(vec_char16 a, vec_char16 b, vec_uchar16 c)
{
  return ((vec_char16)(vec_perm((vec_uchar16)(a), (vec_uchar16)(b), c)));
}

static inline vec_ushort8 vec_perm(vec_ushort8 a, vec_ushort8 b, vec_uchar16 c)
{
  return ((vec_ushort8)(vec_perm((vec_uchar16)(a), (vec_uchar16)(b), c)));
}

static inline vec_short8 vec_perm(vec_short8 a, vec_short8 b, vec_uchar16 c)
{
  return ((vec_short8)(vec_perm((vec_uchar16)(a), (vec_uchar16)(b), c)));
}

static inline vec_uint4 vec_perm(vec_uint4 a, vec_uint4 b, vec_uchar16 c)
{
  return ((vec_uint4)(vec_perm((vec_uchar16)(a), (vec_uchar16)(b), c)));
}

static inline vec_int4 vec_perm(vec_int4 a, vec_int4 b, vec_uchar16 c)
{
  return ((vec_int4)(vec_perm((vec_uchar16)(a), (vec_uchar16)(b), c)));
}

static inline vec_float4 vec_perm(vec_float4 a, vec_float4 b, vec_uchar16 c)
{
  return ((vec_float4)(vec_perm((vec_uchar16)(a), (vec_uchar16)(b), c)));
}

/* vec_re (vector reciprocal estimate)
 * ======
 */
#define vec_re(_a) spu_re(_a)

/* vec_rl (vector rotate left)
 * ======
 */
static inline vec_uchar16 vec_rl(vec_uchar16 a, vec_uchar16 b)
{
  vec_ushort8 r1, r2;

  r1 = spu_rl(spu_and((vec_ushort8)(a), 0xFF), (vec_short8)spu_and((vec_ushort8)(b), 7));
  r2 = spu_rl(spu_and((vec_ushort8)(a), -256), (vec_short8)spu_and(spu_rlmask((vec_ushort8)(b), -8), 7));
  return ((vec_uchar16)(spu_sel(spu_or(r2, spu_sl(r2, 8)), spu_or(r1, spu_rlmask(r1, -8)), spu_splats((unsigned short)0xFF))));
}

static inline vec_char16 vec_rl(vec_char16 a, vec_uchar16 b)
{
  return ((vec_char16)(vec_rl((vec_uchar16)(a), b)));
}

static inline vec_ushort8 vec_rl(vec_ushort8 a, vec_ushort8 b)
{
  return (spu_rl(a, (vec_short8)(b)));
}

static inline vec_short8 vec_rl(vec_short8 a, vec_ushort8 b)
{
  return (spu_rl(a, (vec_short8)(b)));
}

static inline vec_uint4 vec_rl(vec_uint4 a, vec_uint4 b)
{
  return (spu_rl(a, (vec_int4)(b)));
}

static inline vec_int4 vec_rl(vec_int4 a, vec_uint4 b)
{
  return (spu_rl(a, (vec_int4)(b)));
}

/* vec_round (vector round)
 * =========
 */
static inline vec_float4 vec_round(vec_float4 a)
{
  vec_float4 s_half, s_one, d;
  vec_uint4 odd;
  vec_uint4 msb = spu_splats((unsigned int)0x80000000);
  vec_float4 half = spu_splats(0.5f);
  vec_int4 exp;
  vec_uint4 mask;

  s_half = (vec_float4)(spu_sel((vec_uint4)(half), (vec_uint4)(a), msb));
  a = spu_add(a, s_half);
  s_one = spu_add(s_half, s_half);
  exp  = spu_sub(127, (vec_int4)(spu_and(spu_rlmask((vec_uint4)(a), -23), 0xFF)));
  mask = spu_rlmask(spu_splats((unsigned int)0x7FFFFF), exp);
  mask = spu_sel(spu_splats((unsigned int)0), mask, spu_cmpgt(exp, -31));
  mask = spu_or(mask, spu_xor((vec_uint4)(spu_rlmaska(spu_add(exp, -1), -31)), -1));

  odd = spu_and((vec_uint4)(spu_convts(a, 0)), 1);
  s_one = spu_andc(s_one, (vec_float4)spu_cmpeq(mask, 0));
  s_one = spu_and(s_one, spu_and((vec_float4)spu_cmpeq(spu_and((vec_uint4)(a), mask), 0),
                                 (vec_float4)spu_cmpeq(odd, 1)));
  d = spu_andc(a, (vec_float4)(mask));
  d = spu_sub(d, s_one);
  return (d);
}

/* vec_rsqrte (vector reciprocal square root estimate)
 * ==========
 */
#define vec_rsqrte(_a) spu_rsqrte(_a)

/* vec_sel (vector select)
 * =======
 */
#define vec_sel(_a, _b, _c) spu_sel(_a, _b, _c)

/* vec_sl (vector shift left)
 * ======
 */
static inline vec_uchar16 vec_sl(vec_uchar16 a, vec_uchar16 b)
{
  vec_ushort8 hi, lo;

  lo = spu_and(spu_sl((vec_ushort8)(a), spu_and((vec_ushort8)(b), 7)), 0xFF);
  hi = spu_sl(spu_and((vec_ushort8)(a), -256), spu_and(spu_rlmask((vec_ushort8)(b), -8), 7));

  return ((vec_uchar16)(spu_or(hi, lo)));
}

static inline vec_char16 vec_sl(vec_char16 a, vec_uchar16 b)
{
  return ((vec_char16)(vec_sl((vec_uchar16)(a), b)));
}

static inline vec_ushort8 vec_sl(vec_ushort8 a, vec_ushort8 b)
{
  return (spu_sl(a, spu_and(b, 15)));
}

static inline vec_short8 vec_sl(vec_short8 a, vec_ushort8 b)
{
  return (spu_sl(a, spu_and((vec_ushort8)(b), 15)));
}

static inline vec_uint4 vec_sl(vec_uint4 a, vec_uint4 b)
{
  return (spu_sl(a, spu_and(b, 31)));
}

static inline vec_int4 vec_sl(vec_int4 a, vec_uint4 b)
{
  return (spu_sl(a, spu_and(b, 31)));
}

/* vec_sld (vector shift left double)
 * =======
 */
#define vec_sld(_a, _b, _c) spu_shuffle(_a, _b, ((vec_uchar16){ 0+(_c),  1+(_c),  2+(_c),  3+(_c), \
                                                                4+(_c),  5+(_c),  6+(_c),  7+(_c), \
                                                                8+(_c),  9+(_c), 10+(_c), 11+(_c), \
                                                               12+(_c), 13+(_c), 14+(_c), 15+(_c)}))

/* vec_sll (vector shift left long)
 * =======
 */
#define vec_sll(_a, _b) spu_slqw(_a, spu_extract((vec_uint4)(_b), 0))

/* vec_slo (vector shift left by octet)
 * =======
 */
#define vec_slo(_a, _b) spu_slqwbytebc(_a, spu_extract((vec_uint4)(_b), 3) & 0x7F)

/* vec_splat (vector splat)
 * =========
 */
#define vec_splat(_a, _b) spu_splats(spu_extract(_a, _b))

/* vec_splat_s8 (vector splat signed byte)
 * ============
 */
#define vec_splat_s8(_a) spu_splats((signed char)(_a))

/* vec_splat_s16 (vector splat signed half-word)
 * =============
 */
#define vec_splat_s16(_a) spu_splats((signed short)(_a))

/* vec_splat_s32 (vector splat signed word)
 * =============
 */
#define vec_splat_s32(_a) spu_splats((signed int)(_a))

/* vec_splat_u8 (vector splat unsigned byte)
 * ============
 */
#define vec_splat_u8(_a) spu_splats((unsigned char)(_a))

/* vec_splat_u16 (vector splat unsigned half-word)
 * =============
 */
#define vec_splat_u16(_a) spu_splats((unsigned short)(_a))

/* vec_splat_u32 (vector splat unsigned word)
 * =============
 */
#define vec_splat_u32(_a) spu_splats((unsigned int)(_a))

/* vec_sr (vector shift right)
 * ======
 */
static inline vec_uchar16 vec_sr(vec_uchar16 a, vec_uchar16 b)
{
  vec_ushort8 hi, lo;

  lo = spu_rlmask(spu_and((vec_ushort8)(a), 0xFF), spu_sub(0, (vec_short8)(spu_and((vec_ushort8)(b), 7))));
  hi = spu_and(spu_rlmask((vec_ushort8)(a), spu_sub(0, (vec_short8)(spu_and(spu_rlmask((vec_ushort8)(b), -8), 7)))), -256);

  return ((vec_uchar16)(spu_or(hi, lo)));
}

static inline vec_char16 vec_sr(vec_char16 a, vec_uchar16 b)
{
  return ((vec_char16)(vec_sr((vec_uchar16)(a), b)));
}

static inline vec_ushort8 vec_sr(vec_ushort8 a, vec_ushort8 b)
{
  return (spu_rlmask(a, spu_sub(0, (vec_short8)(spu_and(b, 15)))));
}

static inline vec_short8 vec_sr(vec_short8 a, vec_ushort8 b)
{
  return ((vec_short8)(vec_sr((vec_ushort8)(a), b)));
}

static inline vec_uint4 vec_sr(vec_uint4 a, vec_uint4 b)
{
  return (spu_rlmask(a, spu_sub(0, (vec_int4)(spu_and(b, 31)))));
}

static inline vec_int4 vec_sr(vec_int4 a, vec_uint4 b)
{
  return ((vec_int4)(vec_sr((vec_uint4)(a), b)));
}

/* vec_sra (vector shift right algebraic)
 * =======
 */
static inline vec_char16 vec_sra(vec_char16 a, vec_uchar16 b)
{
  vec_short8 hi, lo;

  lo = spu_and(spu_rlmaska(spu_extend(a), spu_sub(0, (vec_short8)(spu_and((vec_ushort8)(b), 7)))), 0xFF);
  hi = spu_and(spu_rlmaska((vec_short8)(a), spu_sub(0, (vec_short8)(spu_and(spu_rlmask((vec_ushort8)(b), -8), 7)))), -256);

  return ((vec_char16)(spu_or(hi, lo)));
}

static inline vec_uchar16 vec_sra(vec_uchar16 a, vec_uchar16 b)
{
  return ((vec_uchar16)(vec_sra((vec_char16)(a), b)));
}

static inline vec_short8 vec_sra(vec_short8 a, vec_ushort8 b)
{
  return (spu_rlmaska(a, spu_sub(0, (vec_short8)(spu_and(b, 15)))));
}

static inline vec_ushort8 vec_sra(vec_ushort8 a, vec_ushort8 b)
{
  return ((vec_ushort8)(vec_sra((vec_short8)(a), b)));
}

static inline vec_int4 vec_sra(vec_int4 a, vec_uint4 b)
{
  return (spu_rlmaska(a, spu_sub(0, (vec_int4)(spu_and(b, 31)))));
}

static inline vec_uint4 vec_sra(vec_uint4 a, vec_uint4 b)
{
  return ((vec_uint4)(vec_sra((vec_int4)(a), b)));
}

/* vec_srl (vector shift right long)
 * =======
 */
#define vec_srl(_a, _b) spu_rlmaskqw(_a, 0-spu_extract((vec_int4)(_b), 3))

/* vec_sro (vector shift right by octet)
 * =======
 */
#define vec_sro(_a, _b) spu_rlmaskqwbyte(_a, 0 - ((spu_extract((vec_int4)(_b), 3) >> 3) & 0xF))

/* vec_st (vector store indexed)
 * ======
 */
static inline void vec_st(vec_uchar16 a, int b, unsigned char *c)
{
  *((vec_uchar16 *)(c+b)) = a;
}

static inline void vec_st(vec_uchar16 a, int b, vec_uchar16 *c)
{
  *((vec_uchar16 *)((unsigned char *)(c)+b)) = a;
}

static inline void vec_st(vec_char16 a, int b, signed char *c)
{
  *((vec_char16 *)(c+b)) = a;
}

static inline void vec_st(vec_char16 a, int b, vec_char16 *c)
{
  *((vec_char16 *)((signed char *)(c)+b)) = a;
}

static inline void vec_st(vec_bchar16 a, int b, signed char *c)
{
  *((vec_bchar16 *)((signed char *)(c)+b)) = a;
}

static inline void vec_st(vec_ushort8 a, int b, unsigned short *c)
{
  *((vec_ushort8 *)((unsigned char *)(c)+b)) = a;
}

static inline void vec_st(vec_ushort8 a, int b, vec_ushort8 *c)
{
  *((vec_ushort8 *)((unsigned char *)(c)+b)) = a;
}

static inline void vec_st(vec_short8 a, int b, signed short *c)
{
  *((vec_short8 *)((unsigned char *)(c)+b)) = a;
}

static inline void vec_st(vec_short8 a, int b, vec_short8 *c)
{
  *((vec_short8 *)((signed char *)(c)+b)) = a;
}

static inline void vec_st(vec_bshort8 a, int b, signed short *c)
{
  *((vec_bshort8 *)((signed char *)(c)+b)) = a;
}

static inline void vec_st(vec_uint4 a, int b, unsigned int *c)
{
  *((vec_uint4 *)((unsigned char *)(c)+b)) = a;
}

static inline void vec_st(vec_uint4 a, int b, vec_uint4 *c)
{
  *((vec_uint4 *)((unsigned char *)(c)+b)) = a;
}

static inline void vec_st(vec_int4 a, int b, signed int *c)
{
  *((vec_int4 *)((unsigned char *)(c)+b)) = a;
}

static inline void vec_st(vec_int4 a, int b, vec_int4 *c)
{
  *((vec_int4 *)((signed char *)(c)+b)) = a;
}

static inline void vec_st(vec_bint4 a, int b, signed int *c)
{
  *((vec_bint4 *)((signed char *)(c)+b)) = a;
}

static inline void vec_st(vec_float4 a, int b, float *c)
{
  *((vec_float4 *)((unsigned char *)(c)+b)) = a;
}

static inline void vec_st(vec_float4 a, int b, vec_float4 *c)
{
  *((vec_float4 *)((unsigned char *)(c)+b)) = a;
}

/* vec_ste (vector store element indexed)
 * =======
 */
static inline void vec_ste(vec_uchar16 a, int b, unsigned char *c)
{
  unsigned char *ptr;

  ptr = c + b;
  *ptr = spu_extract(a, (int)(ptr) & 15);
}

static inline void vec_ste(vec_char16 a, int b, signed char *c)
{
  vec_ste((vec_uchar16)(a), b, (unsigned char *)(c));
}

static inline void vec_ste(vec_bchar16 a, int b, signed char *c)
{
  vec_ste((vec_uchar16)(a), b, (unsigned char *)(c));
}

static inline void vec_ste(vec_ushort8 a, int b, unsigned short *c)
{
  unsigned short *ptr;

  ptr = (unsigned short *)(((unsigned int)(c) + b) & ~1);
  *ptr = spu_extract(a, ((int)(ptr) >> 1) & 7);
}

static inline void vec_ste(vec_short8 a, int b, signed short *c)
{
  vec_ste((vec_ushort8)(a), b, (unsigned short *)(c));
}

static inline void vec_ste(vec_bshort8 a, int b, signed short *c)
{
  vec_ste((vec_ushort8)(a), b, (unsigned short *)(c));
}

static inline void vec_ste(vec_uint4 a, int b, unsigned int *c)
{
  unsigned int *ptr;

  ptr = (unsigned int *)(((unsigned int)(c) + b) & ~3);
  *ptr = spu_extract(a, ((int)(ptr) >> 2) & 3);
}

static inline void vec_ste(vec_int4 a, int b, signed int *c)
{
  vec_ste((vec_uint4)(a), b, (unsigned int *)(c));
}

static inline void vec_ste(vec_bint4 a, int b, signed int *c)
{
  vec_ste((vec_uint4)(a), b, (unsigned int *)(c));
}

static inline void vec_ste(vec_float4 a, int b, float *c)
{
  vec_ste((vec_uint4)(a), b, (unsigned int *)(c));
}

/* vec_stl (vector store indexed LRU)
 * =======
 */
#define vec_stl(_a, _b, _c) vec_st(_a, _b, _c)

/* vec_sub (vector subtract)
 * =======
 */
static inline vec_uchar16 vec_sub(vec_uchar16 a, vec_uchar16 b)
{
  return ((vec_uchar16)(spu_sel(spu_sub((vec_ushort8)(a), (vec_ushort8)(b)),
                                spu_sub(spu_and((vec_ushort8)(a), -256), spu_and((vec_ushort8)(b), -256)),
                                spu_splats((unsigned short)0xFF00))));
}

static inline vec_char16 vec_sub(vec_char16 a, vec_char16 b)
{
  return ((vec_char16)(vec_sub((vec_uchar16)(a), (vec_uchar16)(b))));
}

static inline vec_char16 vec_sub(vec_bchar16 a, vec_char16 b)
{
  return ((vec_char16)(vec_sub((vec_uchar16)(a), (vec_uchar16)(b))));
}

static inline vec_char16 vec_sub(vec_char16 a, vec_bchar16 b)
{
  return ((vec_char16)(vec_sub((vec_uchar16)(a), (vec_uchar16)(b))));
}

static inline vec_ushort8 vec_sub(vec_ushort8 a, vec_ushort8 b)
{
  return (spu_sub(a, b));
}

static inline vec_short8 vec_sub(vec_short8 a, vec_short8 b)
{
  return (spu_sub(a, b));
}

static inline vec_short8 vec_sub(vec_bshort8 a, vec_short8 b)
{
  return (spu_sub((vec_short8)(a), b));
}

static inline vec_short8 vec_sub(vec_short8 a, vec_bshort8 b)
{
  return (spu_sub(a, (vec_short8)(b)));
}

static inline vec_uint4 vec_sub(vec_uint4 a, vec_uint4 b)
{
  return (spu_sub(a, b));
}

static inline vec_int4 vec_sub(vec_int4 a, vec_int4 b)
{
  return (spu_sub(a, b));
}

static inline vec_int4 vec_sub(vec_bint4 a, vec_int4 b)
{
  return (spu_sub((vec_int4)(a), b));
}

static inline vec_int4 vec_sub(vec_int4 a, vec_bint4 b)
{
  return (spu_sub(a, (vec_int4)(b)));
}

static inline vec_float4 vec_sub(vec_float4 a, vec_float4 b)
{
  return (spu_sub(a, b));
}

/* vec_subc (vector subtract carryout)
 * ========
 */
#define vec_subc(_a, _b) spu_genb(_a, _b)

/* vec_subs (vector subtract saturate)
 * ========
 */
static inline vec_uchar16 vec_subs(vec_uchar16 a, vec_uchar16 b)
{
  vec_ushort8 s1, s2;
  vec_uchar16 s, d;

  s1 = spu_sub(spu_rlmask((vec_ushort8)(a), -8), spu_rlmask((vec_ushort8)(b), -8));
  s2 = spu_sub(spu_and((vec_ushort8)(a), 0xFF), spu_and((vec_ushort8)(b), 0xFF));
  s  = (vec_uchar16)(spu_shuffle(s1, s2, ((vec_uchar16){0, 16, 2, 18, 4, 20, 6, 22,
                                                        8, 24, 10, 26, 12, 28, 14, 30})));
  d  = (vec_uchar16)(spu_shuffle(s1, s2, ((vec_uchar16){1, 17, 3, 19, 5, 21, 7, 23,
                                                        9, 25, 11, 27, 13, 29, 15, 31})));
  return (spu_andc(d, s));
}

static inline vec_char16 vec_subs(vec_char16 a, vec_char16 b)
{
  vec_ushort8 s1, s2;
  vec_uchar16 s, d;

  s1 = spu_sub(spu_rlmask((vec_ushort8)(a), -8), spu_rlmask((vec_ushort8)(b), -8));
  s2 = spu_sub(spu_and((vec_ushort8)(a), 0xFF), spu_and((vec_ushort8)(b), 0xFF));
  s  = (vec_uchar16)(spu_shuffle(s1, s2, ((vec_uchar16){1, 17, 3, 19, 5, 21, 7, 23,
                                                        9, 25, 11, 27, 13, 29, 15, 31})));
  d  = spu_sel(s, spu_splats((unsigned char)0x7F), spu_cmpgt(spu_nor((vec_uchar16)(a), spu_nand(s, (vec_uchar16)(b))), 0x7F));
  d  = spu_sel(d, spu_splats((unsigned char)0x80), spu_cmpgt(spu_and((vec_uchar16)(a), spu_nor(s, (vec_uchar16)(b))), 0x7F));

  return ((vec_char16)(d));
}

static inline vec_char16 vec_subs(vec_bchar16 a, vec_char16 b)
{
  return (vec_subs((vec_char16)(a), b));
}

static inline vec_char16 vec_subs(vec_char16 a, vec_bchar16 b)
{
  return (vec_subs(a, (vec_char16)(b)));
}

static inline vec_ushort8 vec_subs(vec_ushort8 a, vec_ushort8 b)
{
  return (spu_andc(spu_sub(a, b), spu_cmpgt(b, a)));
}

static inline vec_short8 vec_subs(vec_short8 a, vec_short8 b)
{
  vec_short8 s;
  vec_short8 d;

  s = spu_sub(a, b);
  d = spu_sel(s, spu_splats((signed short)0x7FFF), (vec_ushort8)(spu_rlmaska(spu_nor(a, spu_nand(s, b)), -15)));
  d = spu_sel(d, spu_splats((signed short)0x8000), (vec_ushort8)(spu_rlmaska(spu_and(a, spu_nor(s, b)), -15)));

  return (d);
}

static inline vec_short8 vec_subs(vec_bshort8 a, vec_short8 b)
{
  return ((vec_short8)(vec_subs((vec_short8)(a), b)));
}

static inline vec_short8 vec_subs(vec_short8 a, vec_bshort8 b)
{
  return ((vec_short8)(vec_subs(a, (vec_short8)(b))));
}

static inline vec_uint4 vec_subs(vec_uint4 a, vec_uint4 b)
{
  return (spu_andc(spu_sub(a, b), spu_cmpgt(b, a)));
}

static inline vec_int4 vec_subs(vec_int4 a, vec_int4 b)
{
  vec_int4 s;
  vec_int4 d;

  s = spu_sub(a, b);
  d = spu_sel(s, spu_splats((signed int)0x7FFFFFFF), (vec_uint4)(spu_rlmaska(spu_nor(a, spu_nand(s, b)), -31)));
  d = spu_sel(d, spu_splats((signed int)0x80000000), (vec_uint4)(spu_rlmaska(spu_and(a, spu_nor(s, b)), -31)));

  return (d);
}

static inline vec_int4 vec_subs(vec_bint4 a, vec_int4 b)
{
  return ((vec_int4)(vec_subs((vec_int4)(a), b)));
}

static inline vec_int4 vec_subs(vec_int4 a, vec_bint4 b)
{
  return ((vec_int4)(vec_subs(a, (vec_int4)(b))));
}
2158 /* vec_sum4s (vector sum across partial (1/4) saturated)
2159 * =========
2161 static inline vec_uint4 vec_sum4s(vec_uchar16 a, vec_uint4 b)
2163 vec_uint4 a01_23, a0123;
2165 a01_23 = (vec_uint4)(spu_add(spu_rlmask((vec_ushort8)(a), -8),
2166 spu_and((vec_ushort8)(a), 0xFF)));
2167 a0123 = spu_add(spu_rlmask(a01_23, -16), spu_and(a01_23, 0x1FF));
2168 return (vec_adds(a0123, b));
2171 static inline vec_int4 vec_sum4s(vec_char16 a, vec_int4 b)
2173 vec_int4 a01_23, a0123;
2175 a01_23 = (vec_int4)(spu_add(spu_rlmaska((vec_short8)(a), -8),
2176 spu_extend(a)));
2177 a0123 = spu_add(spu_rlmaska(a01_23, -16), spu_extend((vec_short8)(a01_23)));
2178 return (vec_adds(a0123, b));
2181 static inline vec_int4 vec_sum4s(vec_short8 a, vec_int4 b)
2183 vec_int4 a0123;
2185 a0123 = spu_add(spu_rlmaska((vec_int4)(a), -16), spu_extend(a));
2186 return (vec_adds(a0123, b));
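/* Sketch of vec_sum4s as implemented above: each word of the result is the
 * sum of the four bytes (or two halfwords) of that word of the first operand,
 * added to the corresponding word of the second operand with saturation.
 * Illustrative values only:
 *
 *   vec_uchar16 bytes = spu_splats((unsigned char)10);
 *   vec_uint4   acc   = spu_splats((unsigned int)0);
 *   vec_uint4   sums  = vec_sum4s(bytes, acc);   // every word is 10+10+10+10 = 40
 */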
2190 /* vec_sum2s (vector sum across partial (1/2) saturated)
2191 * =========
2193 static inline vec_int4 vec_sum2s(vec_int4 a, vec_int4 b)
2195 vec_int4 c, d;
2196 vec_int4 sign1, sign2, sign3;
2197 vec_int4 carry, sum_l, sum_h, sat, sat_val;
2199 sign1 = spu_rlmaska(a, -31);
2200 sign2 = spu_rlmaska(b, -31);
2202 c = spu_rlqwbyte(a, -4);
2203 sign3 = spu_rlqwbyte(sign1, -4);
2205 carry = spu_genc(a, b);
2206 sum_l = spu_add(a, b);
2207 sum_h = spu_addx(sign1, sign2, carry);
2209 carry = spu_genc(sum_l, c);
2210 sum_l = spu_add(sum_l, c);
2211 sum_h = spu_addx(sum_h, sign3, carry);
2213 sign1 = spu_rlmaska(sum_l, -31);
2214 sign2 = spu_rlmaska(sum_h, -31);
2216 sat_val = spu_xor(sign2, spu_splats((signed int)0x7FFFFFFF));
2218 sat = spu_orc(spu_xor(sign1, sign2), (vec_int4)spu_cmpeq(sum_h, sign2));
2220 d = spu_and(spu_sel(sum_l, sat_val, (vec_uint4)(sat)), (vec_int4){0, -1, 0, -1});
2222 return (d);
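/* Sketch of vec_sum2s: words 1 and 3 of the result hold the saturated sums of
 * the adjacent element pairs of a plus the matching word of b; words 0 and 2
 * are zeroed by the final mask.  Illustrative values only:
 *
 *   vec_int4 a = {1, 2, 3, 4};
 *   vec_int4 b = {0, 10, 0, 20};
 *   vec_int4 r = vec_sum2s(a, b);   // r = {0, 1+2+10, 0, 3+4+20} = {0, 13, 0, 27}
 */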
2226 /* vec_sums (vector sum saturated)
2227 * ========
2229 static inline vec_int4 vec_sums(vec_int4 a, vec_int4 b)
2231 vec_int4 a0, a1, a2, c0, c1, c2, d;
2232 vec_int4 sign_a, sign_b, sign_l, sign_h;
2233 vec_int4 sum_l, sum_h, sat, sat_val;
2235 sign_a = spu_rlmaska(a, -31);
2236 sign_b = spu_rlmaska(b, -31);
2238 a0 = spu_rlqwbyte(a, -12);
2239 a1 = spu_rlqwbyte(a, -8);
2240 a2 = spu_rlqwbyte(a, -4);
2242 sum_l = spu_add(a, b);
2243 sum_h = spu_addx(sign_a, sign_b, spu_genc(a, b));
2245 c2 = spu_genc(sum_l, a2);
2246 sum_l = spu_add(sum_l, a2);
2247 sum_h = spu_addx(sum_h, spu_rlqwbyte(sign_a, -4), c2);
2249 c1 = spu_genc(sum_l, a1);
2250 sum_l = spu_add(sum_l, a1);
2251 sum_h = spu_addx(sum_h, spu_rlqwbyte(sign_a, -8), c1);
2253 c0 = spu_genc(sum_l, a0);
2254 sum_l = spu_add(sum_l, a0);
2255 sum_h = spu_addx(sum_h, spu_rlqwbyte(sign_a, -12), c0);
2257 sign_l = spu_rlmaska(sum_l, -31);
2258 sign_h = spu_rlmaska(sum_h, -31);
2260 sat_val = spu_xor(sign_h, spu_splats((signed int)0x7FFFFFFF));
2262 sat = spu_orc(spu_xor(sign_l, sign_h), (vec_int4)spu_cmpeq(sum_h, sign_h));
2264 d = spu_and(spu_sel(sum_l, sat_val, (vec_uint4)(sat)), ((vec_int4){0, 0, 0, -1}));
2266 return (d);
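/* Sketch of vec_sums: all four words of a plus word 3 of b are accumulated
 * with 32-bit saturation and the total lands in word 3; the other words are
 * cleared by the final mask.  Illustrative values only:
 *
 *   vec_int4 a = {1, 2, 3, 4};
 *   vec_int4 b = {0, 0, 0, 100};
 *   vec_int4 r = vec_sums(a, b);   // r = {0, 0, 0, 110}
 */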
2270 /* vec_trunc (vector truncate)
2271 * =========
2273 static inline vec_float4 vec_trunc(vec_float4 a)
2275 vec_int4 exp;
2276 vec_uint4 mask;
2278 exp = spu_sub(127, (vec_int4)(spu_and(spu_rlmask((vec_uint4)(a), -23), 0xFF)));
2279 mask = spu_rlmask(spu_splats((unsigned int)0x7FFFFF), exp);
2280 mask = spu_sel(spu_splats((unsigned int)0), mask, spu_cmpgt(exp, -31));
2281 mask = spu_or(mask, spu_xor((vec_uint4)(spu_rlmaska(spu_add(exp, -1), -31)), -1));
2282 return (spu_andc(a, (vec_float4)(mask)));
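/* Sketch of vec_trunc: the exponent is used to build a mask of the fraction
 * bits below the binary point, which are then cleared, i.e. rounding toward
 * zero.  Illustrative values only:
 *
 *   vec_float4 v = {1.9f, -1.9f, 10.5f, 2.0f};
 *   vec_float4 t = vec_trunc(v);   // {1.0f, -1.0f, 10.0f, 2.0f}
 */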
2285 /* vec_unpackh (vector unpack high element)
2286 * ===========
2288 static inline vec_short8 vec_unpackh(vec_char16 a)
2290 return (spu_extend(spu_shuffle(a, a, ((vec_uchar16){0, 0, 1, 1, 2, 2, 3, 3,
2291 4, 4, 5, 5, 6, 6, 7, 7}))));
2294 static inline vec_bshort8 vec_unpackh(vec_bchar16 a)
2296 return ((vec_bshort8)(vec_unpackh((vec_char16)(a))));
2299 static inline vec_int4 vec_unpackh(vec_short8 a)
2301 return (spu_extend(spu_shuffle(a, a, ((vec_uchar16){0, 0, 0, 1, 0, 0, 2, 3,
2302 0, 0, 4, 5, 0, 0, 6, 7}))));
2305 #ifdef SUPPORT_UNPACK_PIXEL
2306 /* Due to type conflicts, unpacking of pixel types and boolean shorts
2307 * cannot simultaneously be supported. By default, the boolean short is
2308 * supported.
2310 static inline vec_uint4 vec_unpackh(vec_pixel8 a)
2312 vec_ushort8 p1, p2;
2314 p1 = spu_shuffle((vec_ushort8)(spu_rlmaska((vec_short8)(a.p), -7)),
2315 spu_and((vec_ushort8)(a.p), 0x1F),
2316 ((vec_uchar16){ 0, 128, 128, 17, 2, 128, 128, 19,
2317 4, 128, 128, 21, 6, 128, 128, 23}));
2318 p2 = spu_shuffle(spu_and(spu_rlmask((vec_ushort8)(a.p), -5), 0x1F),
2319 spu_and(spu_rlmask((vec_ushort8)(a.p), -10), 0x1F),
2320 ((vec_uchar16){ 128, 17, 1, 128, 128, 19, 3, 128,
2321 128, 21, 5, 128, 128, 23, 7, 128}));
2322 return ((vec_uint4)(spu_or(p1, p2)));
2325 #else
2327 static inline vec_bint4 vec_unpackh(vec_bshort8 a)
2329 return ((vec_bint4)(vec_unpackh((vec_short8)(a))));
2331 #endif
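/* Sketch of the unpack semantics: vec_unpackh sign-extends the elements from
 * the high (left) half of the source, vec_unpackl (defined below) the low
 * half; the char16 overloads widen to shorts the same way.  Illustrative
 * values only:
 *
 *   vec_short8 s  = {-1, 2, -3, 4, 5, 6, 7, 8};
 *   vec_int4   hi = vec_unpackh(s);   // {-1, 2, -3, 4}
 *   vec_int4   lo = vec_unpackl(s);   // { 5, 6,  7, 8}
 */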
2337 /* vec_unpackl (vector unpack low element)
2338 * ===========
2340 static inline vec_short8 vec_unpackl(vec_char16 a)
2342 return (spu_extend(spu_shuffle(a, a, ((vec_uchar16){8, 8, 9, 9, 10, 10, 11, 11,
2343 12, 12, 13, 13, 14, 14, 15, 15}))));
2346 static inline vec_bshort8 vec_unpackl(vec_bchar16 a)
2348 return ((vec_bshort8)(vec_unpackl((vec_char16)(a))));
2352 static inline vec_int4 vec_unpackl(vec_short8 a)
2354 return (spu_extend(spu_shuffle(a, a, ((vec_uchar16){0, 0, 8, 9, 0, 0, 10, 11,
2355 0, 0,12,13, 0, 0, 14, 15}))));
2359 #ifdef SUPPORT_UNPACK_PIXEL
2360 /* Due to type conflicts, unpacking of pixel types and boolean shorts
2361 * cannot simultaneously be supported. By default, the boolean short is
2362 * supported.
2364 static inline vec_uint4 vec_unpackl(vec_pixel8 a)
2366 vec_ushort8 p1, p2;
2368 p1 = spu_shuffle((vec_ushort8)(spu_rlmaska((vec_short8)(a.p), -7)),
2369 spu_and((vec_ushort8)(a.p), 0x1F),
2370 ((vec_uchar16){ 8, 128, 128, 25, 10, 128, 128, 27,
2371 12, 128, 128, 29, 14, 128, 128, 31}));
2372 p2 = spu_shuffle(spu_and(spu_rlmask((vec_ushort8)(a.p), -5), 0x1F),
2373 spu_and(spu_rlmask((vec_ushort8)(a.p), -10), 0x1F),
2374 ((vec_uchar16){ 128, 25, 9, 128, 128, 27, 11, 128,
2375 128, 29, 13, 128, 128, 31, 15, 128}));
2376 return ((vec_uint4)(spu_or(p1, p2)));
2379 #else
2381 static inline vec_bint4 vec_unpackl(vec_bshort8 a)
2383 return ((vec_bint4)(vec_unpackl((vec_short8)(a))));
2386 #endif
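/* To get the vec_pixel8 overloads of vec_unpackh/vec_unpackl instead of the
 * boolean-short ones, define SUPPORT_UNPACK_PIXEL before this header is
 * reached.  A sketch, assuming the header is included directly (the actual
 * include path depends on how your build pulls it in):
 *
 *   #define SUPPORT_UNPACK_PIXEL
 *   #include <vmx2spu.h>             // vec_bshort8 unpacks are then unavailable
 */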
2390 /* vec_xor (vector logical xor)
2391 * =======
2393 static inline vec_uchar16 vec_xor(vec_uchar16 a, vec_uchar16 b)
2395 return (spu_xor(a, b));
2398 static inline vec_char16 vec_xor(vec_char16 a, vec_char16 b)
2400 return (spu_xor(a, b));
2403 static inline vec_char16 vec_xor(vec_bchar16 a, vec_char16 b)
2405 return (spu_xor((vec_char16)(a), b));
2408 static inline vec_char16 vec_xor(vec_char16 a, vec_bchar16 b)
2410 return (spu_xor(a, (vec_char16)(b)));
2413 static inline vec_ushort8 vec_xor(vec_ushort8 a, vec_ushort8 b)
2415 return (spu_xor(a, b));
2418 static inline vec_short8 vec_xor(vec_short8 a, vec_short8 b)
2420 return (spu_xor(a, b));
2423 static inline vec_short8 vec_xor(vec_bshort8 a, vec_short8 b)
2425 return (spu_xor((vec_short8)(a), b));
2428 static inline vec_short8 vec_xor(vec_short8 a, vec_bshort8 b)
2430 return (spu_xor(a, (vec_short8)(b)));
2433 static inline vec_uint4 vec_xor(vec_uint4 a, vec_uint4 b)
2435 return (spu_xor(a, b));
2438 static inline vec_int4 vec_xor(vec_int4 a, vec_int4 b)
2440 return (spu_xor(a, b));
2443 static inline vec_int4 vec_xor(vec_bint4 a, vec_int4 b)
2445 return (spu_xor((vec_int4)(a), b));
2448 static inline vec_int4 vec_xor(vec_int4 a, vec_bint4 b)
2450 return (spu_xor(a, (vec_int4)(b)));
2453 static inline vec_float4 vec_xor(vec_float4 a, vec_float4 b)
2455 return (spu_xor(a, b));
2458 static inline vec_float4 vec_xor(vec_bint4 a, vec_float4 b)
2460 return (spu_xor((vec_float4)(a),b));
2463 static inline vec_float4 vec_xor(vec_float4 a, vec_bint4 b)
2465 return (spu_xor(a, (vec_float4)(b)));
2468 /************************************************************************
2469 * PREDICATES
2470 ************************************************************************/
2472 /* vec_all_eq (all elements equal)
2473 * ==========
2475 static inline int vec_all_eq(vec_uchar16 a, vec_uchar16 b)
2477 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0xFFFF));
2480 static inline int vec_all_eq(vec_char16 a, vec_char16 b)
2482 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0xFFFF));
2485 static inline int vec_all_eq(vec_bchar16 a, vec_char16 b)
2487 return ((int)(spu_extract(spu_gather(spu_cmpeq((vec_char16)(a), b)), 0) == 0xFFFF));
2490 static inline int vec_all_eq(vec_char16 a, vec_bchar16 b)
2492 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, (vec_char16)(b))), 0) == 0xFFFF));
2495 static inline int vec_all_eq(vec_ushort8 a, vec_ushort8 b)
2497 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0xFF));
2500 static inline int vec_all_eq(vec_short8 a, vec_short8 b)
2502 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0xFF));
2505 static inline int vec_all_eq(vec_bshort8 a, vec_short8 b)
2507 return ((int)(spu_extract(spu_gather(spu_cmpeq((vec_short8)(a), b)), 0) == 0xFF));
2510 static inline int vec_all_eq(vec_short8 a, vec_bshort8 b)
2512 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, (vec_short8)(b))), 0) == 0xFF));
2515 static inline int vec_all_eq(vec_uint4 a, vec_uint4 b)
2517 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0xF));
2520 static inline int vec_all_eq(vec_int4 a, vec_int4 b)
2522 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0xF));
2525 static inline int vec_all_eq(vec_bint4 a, vec_int4 b)
2527 return ((int)(spu_extract(spu_gather(spu_cmpeq((vec_int4)(a), b)), 0) == 0xF));
2530 static inline int vec_all_eq(vec_int4 a, vec_bint4 b)
2532 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, (vec_int4)(b))), 0) == 0xF));
2535 static inline int vec_all_eq(vec_float4 a, vec_float4 b)
2537 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0xF));
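/* The predicate idiom used above: spu_gather packs the least-significant bit
 * of each comparison element into the preferred word, so comparing against
 * 0xFFFF, 0xFF or 0xF asks whether all 16, 8 or 4 elements passed the test.
 * A sketch with illustrative values:
 *
 *   vec_uint4 a = {1, 2, 3, 4};
 *   vec_uint4 b = {1, 2, 3, 5};
 *   int all = vec_all_eq(a, b);   // 0: element 3 differs
 *   int any = vec_any_eq(a, b);   // 1: elements 0..2 match (vec_any_eq is defined below)
 */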
2541 /* vec_all_ge (all elements greater than or equal)
2542 * ==========
2544 static inline int vec_all_ge(vec_uchar16 a, vec_uchar16 b)
2546 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0));
2549 static inline int vec_all_ge(vec_char16 a, vec_char16 b)
2551 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0));
2554 static inline int vec_all_ge(vec_bchar16 a, vec_char16 b)
2556 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, (vec_char16)(a))), 0) == 0));
2559 static inline int vec_all_ge(vec_char16 a, vec_bchar16 b)
2561 return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_char16)(b), a)), 0) == 0));
2564 static inline int vec_all_ge(vec_ushort8 a, vec_ushort8 b)
2566 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0));
2569 static inline int vec_all_ge(vec_short8 a, vec_short8 b)
2571 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0));
2574 static inline int vec_all_ge(vec_bshort8 a, vec_short8 b)
2576 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, (vec_short8)(a))), 0) == 0));
2579 static inline int vec_all_ge(vec_short8 a, vec_bshort8 b)
2581 return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_short8)(b), a)), 0) == 0));
2584 static inline int vec_all_ge(vec_uint4 a, vec_uint4 b)
2586 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0));
2589 static inline int vec_all_ge(vec_int4 a, vec_int4 b)
2591 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0));
2594 static inline int vec_all_ge(vec_bint4 a, vec_int4 b)
2596 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, (vec_int4)(a))), 0) == 0));
2599 static inline int vec_all_ge(vec_int4 a, vec_bint4 b)
2601 return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_int4)(b), a)), 0) == 0));
2604 static inline int vec_all_ge(vec_float4 a, vec_float4 b)
2606 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0));
2610 /* vec_all_gt (all elements greater than)
2611 * ==========
2613 static inline int vec_all_gt(vec_uchar16 a, vec_uchar16 b)
2615 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0xFFFF));
2618 static inline int vec_all_gt(vec_char16 a, vec_char16 b)
2620 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0xFFFF));
2623 static inline int vec_all_gt(vec_bchar16 a, vec_char16 b)
2625 return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_char16)(a), b)), 0) == 0xFFFF));
2628 static inline int vec_all_gt(vec_char16 a, vec_bchar16 b)
2630 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, (vec_char16)(b))), 0) == 0xFFFF));
2633 static inline int vec_all_gt(vec_ushort8 a, vec_ushort8 b)
2635 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0xFF));
2638 static inline int vec_all_gt(vec_short8 a, vec_short8 b)
2640 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0xFF));
2643 static inline int vec_all_gt(vec_bshort8 a, vec_short8 b)
2645 return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_short8)(a), b)), 0) == 0xFF));
2648 static inline int vec_all_gt(vec_short8 a, vec_bshort8 b)
2650 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, (vec_short8)(b))), 0) == 0xFF));
2653 static inline int vec_all_gt(vec_uint4 a, vec_uint4 b)
2655 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0xF));
2658 static inline int vec_all_gt(vec_int4 a, vec_int4 b)
2660 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0xF));
2663 static inline int vec_all_gt(vec_bint4 a, vec_int4 b)
2665 return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_int4)(a), b)), 0) == 0xF));
2668 static inline int vec_all_gt(vec_int4 a, vec_bint4 b)
2670 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, (vec_int4)(b))), 0) == 0xF));
2673 static inline int vec_all_gt(vec_float4 a, vec_float4 b)
2675 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0xF));
2679 /* vec_all_in (all elements in bounds)
2680 * ==========
2682 static inline int vec_all_in(vec_float4 a, vec_float4 b)
2684 return (spu_extract(spu_gather(spu_nor(spu_cmpabsgt(a, b), (vec_uint4)(spu_rlmaska((vec_int4)(b), -31)))), 0) == 0xF);
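/* Sketch of vec_all_in: true when every element of a lies in [-b, b] and b is
 * non-negative, tested via spu_cmpabsgt and the sign bits of b.  Illustrative
 * values only:
 *
 *   vec_float4 v     = {0.5f, -0.25f, 0.75f, -1.0f};
 *   vec_float4 bound = spu_splats(1.0f);
 *   int inside = vec_all_in(v, bound);   // 1: |v[i]| <= 1.0f for all i
 */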
2688 /* vec_all_le (all elements less than or equal)
2689 * ==========
2691 static inline int vec_all_le(vec_uchar16 a, vec_uchar16 b)
2693 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0));
2696 static inline int vec_all_le(vec_char16 a, vec_char16 b)
2698 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0));
2701 static inline int vec_all_le(vec_bchar16 a, vec_char16 b)
2703 return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_char16)(a), b)), 0) == 0));
2706 static inline int vec_all_le(vec_char16 a, vec_bchar16 b)
2708 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, (vec_char16)(b))), 0) == 0));
2711 static inline int vec_all_le(vec_ushort8 a, vec_ushort8 b)
2713 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0));
2716 static inline int vec_all_le(vec_short8 a, vec_short8 b)
2718 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0));
2721 static inline int vec_all_le(vec_bshort8 a, vec_short8 b)
2723 return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_short8)(a), b)), 0) == 0));
2726 static inline int vec_all_le(vec_short8 a, vec_bshort8 b)
2728 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, (vec_short8)(b))), 0) == 0));
2731 static inline int vec_all_le(vec_uint4 a, vec_uint4 b)
2733 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0));
2736 static inline int vec_all_le(vec_int4 a, vec_int4 b)
2738 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0));
2741 static inline int vec_all_le(vec_bint4 a, vec_int4 b)
2743 return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_int4)(a), b)), 0) == 0));
2746 static inline int vec_all_le(vec_int4 a, vec_bint4 b)
2748 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, (vec_int4)(b))), 0) == 0));
2751 static inline int vec_all_le(vec_float4 a, vec_float4 b)
2753 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0));
2757 /* vec_all_lt (all elements less than)
2758 * ==========
2760 static inline int vec_all_lt(vec_uchar16 a, vec_uchar16 b)
2762 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0xFFFF));
2765 static inline int vec_all_lt(vec_char16 a, vec_char16 b)
2767 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0xFFFF));
2770 static inline int vec_all_lt(vec_bchar16 a, vec_char16 b)
2772 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, (vec_char16)(a))), 0) == 0xFFFF));
2775 static inline int vec_all_lt(vec_char16 a, vec_bchar16 b)
2777 return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_char16)(b), a)), 0) == 0xFFFF));
2780 static inline int vec_all_lt(vec_ushort8 a, vec_ushort8 b)
2782 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0xFF));
2785 static inline int vec_all_lt(vec_short8 a, vec_short8 b)
2787 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0xFF));
2790 static inline int vec_all_lt(vec_bshort8 a, vec_short8 b)
2792 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, (vec_short8)(a))), 0) == 0xFF));
2795 static inline int vec_all_lt(vec_short8 a, vec_bshort8 b)
2797 return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_short8)(b), a)), 0) == 0xFF));
2800 static inline int vec_all_lt(vec_uint4 a, vec_uint4 b)
2802 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0xF));
2805 static inline int vec_all_lt(vec_int4 a, vec_int4 b)
2807 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0xF));
2810 static inline int vec_all_lt(vec_bint4 a, vec_int4 b)
2812 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, (vec_int4)(a))), 0) == 0xF));
2815 static inline int vec_all_lt(vec_int4 a, vec_bint4 b)
2817 return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_int4)(b), a)), 0) == 0xF));
2820 static inline int vec_all_lt(vec_float4 a, vec_float4 b)
2822 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0xF));
2826 /* vec_all_nan (all elements not a number)
2827 * ===========
2829 static inline int vec_all_nan(vec_float4 a)
2831 vec_uint4 exp, man;
2832 vec_uint4 exp_mask = spu_splats((unsigned int)0x7F800000);
2834 exp = spu_and((vec_uint4)(a), exp_mask);
2835 man = spu_and((vec_uint4)(a), spu_splats((unsigned int)0x007FFFFF));
2836 return ((int)(spu_extract(spu_gather(spu_andc(spu_cmpeq(exp, exp_mask),
2837 spu_cmpeq(man, 0))), 0) == 0xF));
2840 #define vec_all_nan(_a) (0)
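/* Note: because vec_all_nan is also defined as a function-like macro, later
 * calls spelled vec_all_nan(x) expand to the constant 0 rather than invoking
 * the inline overload above; SPU single-precision arithmetic never produces
 * NaNs, which presumably motivates the constant form.  #undef vec_all_nan
 * restores the inline overload if the bit-pattern test is wanted.
 */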
2843 /* vec_all_ne (all elements not equal)
2844 * ==========
2846 static inline int vec_all_ne(vec_uchar16 a, vec_uchar16 b)
2848 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0));
2851 static inline int vec_all_ne(vec_char16 a, vec_char16 b)
2853 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0));
2856 static inline int vec_all_ne(vec_bchar16 a, vec_char16 b)
2858 return ((int)(spu_extract(spu_gather(spu_cmpeq((vec_char16)(a), b)), 0) == 0));
2861 static inline int vec_all_ne(vec_char16 a, vec_bchar16 b)
2863 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, (vec_char16)(b))), 0) == 0));
2866 static inline int vec_all_ne(vec_ushort8 a, vec_ushort8 b)
2868 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0));
2871 static inline int vec_all_ne(vec_short8 a, vec_short8 b)
2873 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0));
2876 static inline int vec_all_ne(vec_bshort8 a, vec_short8 b)
2878 return ((int)(spu_extract(spu_gather(spu_cmpeq((vec_short8)(a), b)), 0) == 0));
2881 static inline int vec_all_ne(vec_short8 a, vec_bshort8 b)
2883 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, (vec_short8)(b))), 0) == 0));
2886 static inline int vec_all_ne(vec_uint4 a, vec_uint4 b)
2888 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0));
2891 static inline int vec_all_ne(vec_int4 a, vec_int4 b)
2893 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0));
2896 static inline int vec_all_ne(vec_bint4 a, vec_int4 b)
2898 return ((int)(spu_extract(spu_gather(spu_cmpeq((vec_int4)(a), b)), 0) == 0));
2901 static inline int vec_all_ne(vec_int4 a, vec_bint4 b)
2903 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, (vec_int4)(b))), 0) == 0));
2906 static inline int vec_all_ne(vec_float4 a, vec_float4 b)
2908 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0));
2912 /* vec_all_nge (all elements not greater than or equal)
2913 * ===========
2915 static inline int vec_all_nge(vec_float4 a, vec_float4 b)
2917 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0xF));
2921 /* vec_all_ngt (all elements not greater than)
2922 * ===========
2924 static inline int vec_all_ngt(vec_float4 a, vec_float4 b)
2926 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0));
2930 /* vec_all_nle (all elements not less than or equal)
2931 * ===========
2933 static inline int vec_all_nle(vec_float4 a, vec_float4 b)
2935 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0xF));
2939 /* vec_all_nlt (all elements not less than)
2940 * ===========
2942 static inline int vec_all_nlt(vec_float4 a, vec_float4 b)
2944 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0));
2948 /* vec_all_numeric (all elements numeric)
2949 * ===============
2951 static inline int vec_all_numeric(vec_float4 a)
2953 vec_uint4 exp;
2955 exp = spu_and(spu_rlmask((vec_uint4)(a), -23), 0xFF);
2956 return ((int)(spu_extract(spu_gather(spu_cmpeq(exp, 255)), 0) == 0));
2961 /* vec_any_eq (any elements equal)
2962 * ==========
2964 static inline int vec_any_eq(vec_uchar16 a, vec_uchar16 b)
2966 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) != 0));
2969 static inline int vec_any_eq(vec_char16 a, vec_char16 b)
2971 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) != 0));
2974 static inline int vec_any_eq(vec_bchar16 a, vec_char16 b)
2976 return ((int)(spu_extract(spu_gather(spu_cmpeq((vec_char16)(a), b)), 0) != 0));
2979 static inline int vec_any_eq(vec_char16 a, vec_bchar16 b)
2981 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, (vec_char16)(b))), 0) != 0));
2984 static inline int vec_any_eq(vec_ushort8 a, vec_ushort8 b)
2986 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) != 0));
2989 static inline int vec_any_eq(vec_short8 a, vec_short8 b)
2991 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) != 0));
2994 static inline int vec_any_eq(vec_bshort8 a, vec_short8 b)
2996 return ((int)(spu_extract(spu_gather(spu_cmpeq((vec_short8)(a), b)), 0) != 0));
2999 static inline int vec_any_eq(vec_short8 a, vec_bshort8 b)
3001 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, (vec_short8)(b))), 0) != 0));
3004 static inline int vec_any_eq(vec_uint4 a, vec_uint4 b)
3006 return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpeq(a, b), -31)), 0)));
3009 static inline int vec_any_eq(vec_int4 a, vec_int4 b)
3011 return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpeq(a, b), -31)), 0)));
3014 static inline int vec_any_eq(vec_bint4 a, vec_int4 b)
3016 return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpeq((vec_int4)(a), b), -31)), 0)));
3019 static inline int vec_any_eq(vec_int4 a, vec_bint4 b)
3021 return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpeq(a, (vec_int4)(b)), -31)), 0)));
3024 static inline int vec_any_eq(vec_float4 a, vec_float4 b)
3026 return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpeq(a, b), -31)), 0)));
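/* For 32-bit elements the "any" predicates above fold the comparison result
 * with spu_orx instead of spu_gather: spu_rlmask(cmp, -31) leaves 0 or 1 in
 * each word and the OR-across puts the boolean answer directly in element 0.
 * A hand-written sketch of the same test, illustrative values only:
 *
 *   vec_uint4 a  = spu_splats(1u), b = spu_splats(2u);
 *   vec_uint4 eq = spu_cmpeq(a, b);
 *   int any_equal = spu_extract(spu_orx(spu_rlmask(eq, -31)), 0);   // 0: no element matches
 */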
3029 /* vec_any_ge (any elements greater than or equal)
3030 * ==========
3032 static inline int vec_any_ge(vec_uchar16 a, vec_uchar16 b)
3034 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) != 0xFFFF));
3037 static inline int vec_any_ge(vec_char16 a, vec_char16 b)
3039 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) != 0xFFFF));
3042 static inline int vec_any_ge(vec_bchar16 a, vec_char16 b)
3044 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, (vec_char16)(a))), 0) != 0xFFFF));
3047 static inline int vec_any_ge(vec_char16 a, vec_bchar16 b)
3049 return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_char16)(b), a)), 0) != 0xFFFF));
3052 static inline int vec_any_ge(vec_ushort8 a, vec_ushort8 b)
3054 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) != 0xFF));
3057 static inline int vec_any_ge(vec_short8 a, vec_short8 b)
3059 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) != 0xFF));
3062 static inline int vec_any_ge(vec_bshort8 a, vec_short8 b)
3064 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, (vec_short8)(a))), 0) != 0xFF));
3067 static inline int vec_any_ge(vec_short8 a, vec_bshort8 b)
3069 return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_short8)(b), a)), 0) != 0xFF));
3072 static inline int vec_any_ge(vec_uint4 a, vec_uint4 b)
3074 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) != 0xF));
3077 static inline int vec_any_ge(vec_int4 a, vec_int4 b)
3079 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) != 0xF));
3082 static inline int vec_any_ge(vec_bint4 a, vec_int4 b)
3084 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, (vec_int4)(a))), 0) != 0xF));
3087 static inline int vec_any_ge(vec_int4 a, vec_bint4 b)
3089 return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_int4)(b), a)), 0) != 0xF));
3092 static inline int vec_any_ge(vec_float4 a, vec_float4 b)
3094 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) != 0xF));
3098 /* vec_any_gt (any elements greater than)
3099 * ==========
3101 static inline int vec_any_gt(vec_uchar16 a, vec_uchar16 b)
3103 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) != 0));
3106 static inline int vec_any_gt(vec_char16 a, vec_char16 b)
3108 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) != 0));
3111 static inline int vec_any_gt(vec_bchar16 a, vec_char16 b)
3113 return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_char16)(a), b)), 0) != 0));
3116 static inline int vec_any_gt(vec_char16 a, vec_bchar16 b)
3118 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, (vec_char16)(b))), 0) != 0));
3121 static inline int vec_any_gt(vec_ushort8 a, vec_ushort8 b)
3123 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) != 0));
3126 static inline int vec_any_gt(vec_short8 a, vec_short8 b)
3128 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) != 0));
3131 static inline int vec_any_gt(vec_bshort8 a, vec_short8 b)
3133 return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_short8)(a), b)), 0) != 0));
3136 static inline int vec_any_gt(vec_short8 a, vec_bshort8 b)
3138 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, (vec_short8)(b))), 0) != 0));
3142 static inline int vec_any_gt(vec_uint4 a, vec_uint4 b)
3144 return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpgt(a, b), -31)), 0)));
3147 static inline int vec_any_gt(vec_int4 a, vec_int4 b)
3149 return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpgt(a, b), -31)), 0)));
3152 static inline int vec_any_gt(vec_bint4 a, vec_int4 b)
3154 return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpgt((vec_int4)(a), b), -31)), 0)));
3157 static inline int vec_any_gt(vec_int4 a, vec_bint4 b)
3159 return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpgt(a, (vec_int4)(b)), -31)), 0)));
3162 static inline int vec_any_gt(vec_float4 a, vec_float4 b)
3164 return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpgt(a, b), -31)), 0)));
3167 /* vec_any_le (any elements less than or equal)
3168 * ==========
3170 static inline int vec_any_le(vec_uchar16 a, vec_uchar16 b)
3172 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) != 0xFFFF));
3175 static inline int vec_any_le(vec_char16 a, vec_char16 b)
3177 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) != 0xFFFF));
3180 static inline int vec_any_le(vec_bchar16 a, vec_char16 b)
3182 return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_char16)(a), b)), 0) != 0xFFFF));
3185 static inline int vec_any_le(vec_char16 a, vec_bchar16 b)
3187 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, (vec_char16)(b))), 0) != 0xFFFF));
3190 static inline int vec_any_le(vec_ushort8 a, vec_ushort8 b)
3192 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) != 0xFF));
3195 static inline int vec_any_le(vec_short8 a, vec_short8 b)
3197 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) != 0xFF));
3200 static inline int vec_any_le(vec_bshort8 a, vec_short8 b)
3202 return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_short8)(a), b)), 0) != 0xFF));
3205 static inline int vec_any_le(vec_short8 a, vec_bshort8 b)
3207 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, (vec_short8)(b))), 0) != 0xFF));
3210 static inline int vec_any_le(vec_uint4 a, vec_uint4 b)
3212 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) != 0xF));
3215 static inline int vec_any_le(vec_int4 a, vec_int4 b)
3217 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) != 0xF));
3220 static inline int vec_any_le(vec_bint4 a, vec_int4 b)
3222 return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_int4)(a), b)), 0) != 0xF));
3225 static inline int vec_any_le(vec_int4 a, vec_bint4 b)
3227 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, (vec_int4)(b))), 0) != 0xF));
3230 static inline int vec_any_le(vec_float4 a, vec_float4 b)
3232 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) != 0xF));
3236 /* vec_any_lt (any elements less than)
3237 * ==========
3239 static inline int vec_any_lt(vec_uchar16 a, vec_uchar16 b)
3241 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) != 0));
3244 static inline int vec_any_lt(vec_char16 a, vec_char16 b)
3246 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) != 0));
3249 static inline int vec_any_lt(vec_bchar16 a, vec_char16 b)
3251 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, (vec_char16)(a))), 0) != 0));
3254 static inline int vec_any_lt(vec_char16 a, vec_bchar16 b)
3256 return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_char16)(b), a)), 0) != 0));
3259 static inline int vec_any_lt(vec_ushort8 a, vec_ushort8 b)
3261 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) != 0));
3264 static inline int vec_any_lt(vec_short8 a, vec_short8 b)
3266 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) != 0));
3269 static inline int vec_any_lt(vec_bshort8 a, vec_short8 b)
3271 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, (vec_short8)(a))), 0) != 0));
3274 static inline int vec_any_lt(vec_short8 a, vec_bshort8 b)
3276 return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_short8)(b), a)), 0) != 0));
3279 static inline int vec_any_lt(vec_uint4 a, vec_uint4 b)
3281 return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpgt(b, a), -31)), 0)));
3284 static inline int vec_any_lt(vec_int4 a, vec_int4 b)
3286 return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpgt(b, a), -31)), 0)));
3289 static inline int vec_any_lt(vec_bint4 a, vec_int4 b)
3291 return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpgt(b, (vec_int4)(a)), -31)), 0)));
3294 static inline int vec_any_lt(vec_int4 a, vec_bint4 b)
3296 return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpgt((vec_int4)(b), a), -31)), 0)));
3299 static inline int vec_any_lt(vec_float4 a, vec_float4 b)
3301 return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpgt(b, a), -31)), 0)));
3304 /* vec_any_nan (any elements not a number)
3305 * ===========
3307 static inline int vec_any_nan(vec_float4 a)
3309 vec_uint4 exp, man;
3310 vec_uint4 exp_mask = spu_splats((unsigned int)0x7F800000);
3312 exp = spu_and((vec_uint4)(a), exp_mask);
3313 man = spu_and((vec_uint4)(a), spu_splats((unsigned int)0x007FFFFF));
3314 return ((int)(spu_extract(spu_gather(spu_andc(spu_cmpeq(exp, exp_mask),
3315 spu_cmpeq(man, 0))), 0) != 0));
3319 /* vec_any_ne (any elements not equal)
3320 * ==========
3322 static inline int vec_any_ne(vec_uchar16 a, vec_uchar16 b)
3324 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) != 0xFFFF));
3327 static inline int vec_any_ne(vec_char16 a, vec_char16 b)
3329 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) != 0xFFFF));
3332 static inline int vec_any_ne(vec_bchar16 a, vec_char16 b)
3334 return ((int)(spu_extract(spu_gather(spu_cmpeq((vec_char16)(a), b)), 0) != 0xFFFF));
3337 static inline int vec_any_ne(vec_char16 a, vec_bchar16 b)
3339 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, (vec_char16)(b))), 0) != 0xFFFF));
3342 static inline int vec_any_ne(vec_ushort8 a, vec_ushort8 b)
3344 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) != 0xFF));
3347 static inline int vec_any_ne(vec_short8 a, vec_short8 b)
3349 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) != 0xFF));
3352 static inline int vec_any_ne(vec_bshort8 a, vec_short8 b)
3354 return ((int)(spu_extract(spu_gather(spu_cmpeq((vec_short8)(a), b)), 0) != 0xFF));
3357 static inline int vec_any_ne(vec_short8 a, vec_bshort8 b)
3359 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, (vec_short8)(b))), 0) != 0xFF));
3362 static inline int vec_any_ne(vec_uint4 a, vec_uint4 b)
3364 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) != 0xF));
3367 static inline int vec_any_ne(vec_int4 a, vec_int4 b)
3369 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) != 0xF));
3372 static inline int vec_any_ne(vec_bint4 a, vec_int4 b)
3374 return ((int)(spu_extract(spu_gather(spu_cmpeq((vec_int4)(a), b)), 0) != 0xF));
3377 static inline int vec_any_ne(vec_int4 a, vec_bint4 b)
3379 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, (vec_int4)(b))), 0) != 0xF));
3382 static inline int vec_any_ne(vec_float4 a, vec_float4 b)
3384 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) != 0xF));
3388 /* vec_any_nge (any elements not greater than or equal)
3389 * ===========
3391 static inline int vec_any_nge(vec_float4 a, vec_float4 b)
3393 return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpgt(b, a), -31)), 0)));
3396 /* vec_any_ngt (any elements not greater than)
3397 * ===========
3399 static inline int vec_any_ngt(vec_float4 a, vec_float4 b)
3401 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) != 0xF));
3405 /* vec_any_nle (any elements not less than or equal)
3406 * ===========
3408 static inline int vec_any_nle(vec_float4 a, vec_float4 b)
3410 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) != 0));
3414 /* vec_any_nlt (any elements not less than)
3415 * ===========
3417 static inline int vec_any_nlt(vec_float4 a, vec_float4 b)
3419 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) != 0xF));
3423 /* vec_any_numeric (any elements numeric)
3424 * ===============
3426 static inline int vec_any_numeric(vec_float4 a)
3428 vec_uint4 exp;
3430 exp = spu_and(spu_rlmask((vec_uint4)(a), -23), 0xFF);
3431 return ((int)(spu_extract(spu_gather(spu_cmpeq(exp, 255)), 0) != 0xF));
3435 /* vec_any_out (any elements out of bounds)
3436 * ===========
3438 static inline int vec_any_out(vec_float4 a, vec_float4 b)
3440 return (spu_extract(spu_gather(spu_nor(spu_cmpabsgt(a, b), (vec_uint4)(spu_rlmaska((vec_int4)(b), -31)))), 0) != 0xF);
3444 /* CBE Language Extension Intrinsics
3447 /* vec_extract (extract element from vector)
3448 * ===========
3450 #define vec_extract(_a, _element) spu_extract(_a, _element)
3453 /* vec_insert (insert scalar into specified vector element)
3454 * ==========
3456 #define vec_insert(_a, _b, _element) spu_insert(_a, _b, _element)
3458 /* vec_lvlx (load vector left indexed)
3459 * ========
3461 static inline vec_uchar16 vec_lvlx(int a, unsigned char *b)
3463 vec_uchar16 *p = (vec_uchar16 *)((unsigned char *)(b) + a);
3464 return(spu_slqwbyte(*p, (unsigned int)p & 0xF));
3467 static inline vec_uchar16 vec_lvlx(int a, vec_uchar16 *b)
3469 vec_uchar16 *p = (vec_uchar16 *)((unsigned char *)(b) + a);
3470 return(spu_slqwbyte(*p, (unsigned int)p & 0xF));
3473 static inline vec_char16 vec_lvlx(int a, signed char *b)
3475 vec_char16 *p = (vec_char16 *)((unsigned char *)(b) + a);
3476 return(spu_slqwbyte(*p, (unsigned int)p & 0xF));
3479 static inline vec_char16 vec_lvlx(int a, vec_char16 *b)
3481 vec_char16 *p = (vec_char16 *)((unsigned char *)(b) + a);
3482 return(spu_slqwbyte(*p, (unsigned int)p & 0xF));
3485 static inline vec_ushort8 vec_lvlx(int a, unsigned short *b)
3487 vec_ushort8 *p = (vec_ushort8 *)((unsigned char *)(b) + a);
3488 return(spu_slqwbyte(*p, (unsigned int)p & 0xF));
3491 static inline vec_ushort8 vec_lvlx(int a, vec_ushort8 *b)
3493 vec_ushort8 *p = (vec_ushort8 *)((unsigned char *)(b) + a);
3494 return(spu_slqwbyte(*p, (unsigned int)p & 0xF));
3497 static inline vec_short8 vec_lvlx(int a, signed short *b)
3499 vec_short8 *p = (vec_short8 *)((unsigned char *)(b) + a);
3500 return(spu_slqwbyte(*p, (unsigned int)p & 0xF));
3503 static inline vec_short8 vec_lvlx(int a, vec_short8 *b)
3505 vec_short8 *p = (vec_short8 *)((unsigned char *)(b) + a);
3506 return(spu_slqwbyte(*p, (unsigned int)p & 0xF));
3509 static inline vec_uint4 vec_lvlx(int a, unsigned int *b)
3511 vec_uint4 *p = (vec_uint4 *)((unsigned char *)(b) + a);
3512 return(spu_slqwbyte(*p, (unsigned int)p & 0xF));
3515 static inline vec_uint4 vec_lvlx(int a, vec_uint4 *b)
3517 vec_uint4 *p = (vec_uint4 *)((unsigned char *)(b) + a);
3518 return(spu_slqwbyte(*p, (unsigned int)p & 0xF));
3521 static inline vec_int4 vec_lvlx(int a, signed int *b)
3523 vec_int4 *p = (vec_int4 *)((unsigned char *)(b) + a);
3524 return(spu_slqwbyte(*p, (unsigned int)p & 0xF));
3527 static inline vec_int4 vec_lvlx(int a, vec_int4 *b)
3529 vec_int4 *p = (vec_int4 *)((unsigned char *)(b) + a);
3530 return(spu_slqwbyte(*p, (unsigned int)p & 0xF));
3533 static inline vec_float4 vec_lvlx(int a, float *b)
3535 vec_float4 *p = (vec_float4 *)((unsigned char *)(b) + a);
3536 return(spu_slqwbyte(*p, (unsigned int)p & 0xF));
3539 static inline vec_float4 vec_lvlx(int a, vec_float4 *b)
3541 vec_float4 *p = (vec_float4 *)((unsigned char *)(b) + a);
3542 return(spu_slqwbyte(*p, (unsigned int)p & 0xF));
3546 /* vec_lvlxl (load vector left indexed last)
3547 * =========
3549 #define vec_lvlxl(_a, _b) vec_lvlx(_a, _b)
3552 /* vec_lvrx (load vector right indexed)
3553 * ========
3555 static inline vec_uchar16 vec_lvrx(int a, unsigned char *b)
3557 vec_uchar16 *p = (vec_uchar16 *)((unsigned char *)(b) + a);
3558 return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16));
3561 static inline vec_uchar16 vec_lvrx(int a, vec_uchar16 *b)
3563 vec_uchar16 *p = (vec_uchar16 *)((unsigned char *)(b) + a);
3564 return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16));
3567 static inline vec_char16 vec_lvrx(int a, signed char *b)
3569 vec_char16 *p = (vec_char16 *)((unsigned char *)(b) + a);
3570 return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16));
3573 static inline vec_char16 vec_lvrx(int a, vec_char16 *b)
3575 vec_char16 *p = (vec_char16 *)((unsigned char *)(b) + a);
3576 return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16));
3579 static inline vec_ushort8 vec_lvrx(int a, unsigned short *b)
3581 vec_ushort8 *p = (vec_ushort8 *)((unsigned char *)(b) + a);
3582 return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16));
3585 static inline vec_ushort8 vec_lvrx(int a, vec_ushort8 *b)
3587 vec_ushort8 *p = (vec_ushort8 *)((unsigned char *)(b) + a);
3588 return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16));
3591 static inline vec_short8 vec_lvrx(int a, signed short *b)
3593 vec_short8 *p = (vec_short8 *)((unsigned char *)(b) + a);
3594 return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16));
3597 static inline vec_short8 vec_lvrx(int a, vec_short8 *b)
3599 vec_short8 *p = (vec_short8 *)((unsigned char *)(b) + a);
3600 return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16));
3603 static inline vec_uint4 vec_lvrx(int a, unsigned int *b)
3605 vec_uint4 *p = (vec_uint4 *)((unsigned char *)(b) + a);
3606 return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16));
3609 static inline vec_uint4 vec_lvrx(int a, vec_uint4 *b)
3611 vec_uint4 *p = (vec_uint4 *)((unsigned char *)(b) + a);
3612 return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16));
3615 static inline vec_int4 vec_lvrx(int a, signed int *b)
3617 vec_int4 *p = (vec_int4 *)((unsigned char *)(b) + a);
3618 return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16));
3621 static inline vec_int4 vec_lvrx(int a, vec_int4 *b)
3623 vec_int4 *p = (vec_int4 *)((unsigned char *)(b) + a);
3624 return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16));
3627 static inline vec_float4 vec_lvrx(int a, float *b)
3629 vec_float4 *p = (vec_float4 *)((unsigned char *)(b) + a);
3630 return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16));
3633 static inline vec_float4 vec_lvrx(int a, vec_float4 *b)
3635 vec_float4 *p = (vec_float4 *)((unsigned char *)(b) + a);
3636 return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16));
3641 /* vec_lvrxl (load vector right indexed last)
3642 * =========
3644 #define vec_lvrxl(_a, _b) vec_lvrx(_a, _b)
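/* The lvlx/lvrx pair can be combined in the conventional way to load 16 bytes
 * from an unaligned address: vec_lvlx returns the bytes from the address to
 * the end of its quadword shifted to the left of the register, vec_lvrx with
 * an offset of 16 returns the leading bytes of the following quadword shifted
 * to the right, and OR-ing the two reassembles the data.  A sketch (note that
 * both containing quadwords are read):
 *
 *   static inline vec_uchar16 load_unaligned_sketch(unsigned char *src)
 *   {
 *     vec_uchar16 left  = vec_lvlx(0, src);
 *     vec_uchar16 right = vec_lvrx(16, src);
 *     return (spu_or(left, right));
 *   }
 */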
3647 /* vec_promote (promote scalar to a vector)
3648 * ===========
3650 #define vec_promote(_a, _element) spu_promote(_a, _element)
3653 /* vec_splats (splat scalar to a vector)
3654 * ==========
3656 #define vec_splats(_a) spu_splats(_a)
3659 /* vec_stvlx (store vector left indexed)
3660 * =========
3662 static inline void vec_stvlx(vec_uchar16 a, int b, unsigned char *c)
3664 int shift;
3665 vec_uchar16 *p = (vec_uchar16 *)((unsigned char *)(c) + b);
3667 shift = -((int)p & 0xF);
3668 *p = spu_sel(*p,
3669 spu_rlmaskqwbyte(a, shift),
3670 spu_rlmaskqwbyte(spu_splats((unsigned char)0xFF), shift));
3673 static inline void vec_stvlx(vec_uchar16 a, int b, vec_uchar16 *c)
3675 int shift;
3676 vec_uchar16 *p = (vec_uchar16 *)((unsigned char *)(c) + b);
3678 shift = -((int)p & 0xF);
3679 *p = spu_sel(*p,
3680 spu_rlmaskqwbyte(a, shift),
3681 spu_rlmaskqwbyte(spu_splats((unsigned char)0xFF), shift));
3684 static inline void vec_stvlx(vec_char16 a, int b, signed char *c)
3686 int shift;
3687 vec_char16 *p = (vec_char16 *)((unsigned char *)(c) + b);
3689 shift = -((int)p & 0xF);
3690 *p = spu_sel(*p,
3691 spu_rlmaskqwbyte(a, shift),
3692 spu_rlmaskqwbyte(spu_splats((unsigned char)0xFF), shift));
3695 static inline void vec_stvlx(vec_char16 a, int b, vec_char16 *c)
3697 int shift;
3698 vec_char16 *p = (vec_char16 *)((unsigned char *)(c) + b);
3700 shift = -((int)p & 0xF);
3701 *p = spu_sel(*p,
3702 spu_rlmaskqwbyte(a, shift),
3703 spu_rlmaskqwbyte(spu_splats((unsigned char)0xFF), shift));
3706 static inline void vec_stvlx(vec_ushort8 a, int b, unsigned short *c)
3708 int shift;
3709 vec_ushort8 *p = (vec_ushort8 *)((unsigned char *)(c) + b);
3711 shift = -((int)p & 0xF);
3712 *p = spu_sel(*p,
3713 spu_rlmaskqwbyte(a, shift),
3714 spu_rlmaskqwbyte(spu_splats((unsigned short)0xFFFF), shift));
3717 static inline void vec_stvlx(vec_ushort8 a, int b, vec_ushort8 *c)
3719 int shift;
3720 vec_ushort8 *p = (vec_ushort8 *)((unsigned char *)(c) + b);
3722 shift = -((int)p & 0xF);
3723 *p = spu_sel(*p,
3724 spu_rlmaskqwbyte(a, shift),
3725 spu_rlmaskqwbyte(spu_splats((unsigned short)0xFFFF), shift));
3728 static inline void vec_stvlx(vec_short8 a, int b, signed short *c)
3730 int shift;
3731 vec_short8 *p = (vec_short8 *)((unsigned char *)(c) + b);
3733 shift = -((int)p & 0xF);
3734 *p = spu_sel(*p,
3735 spu_rlmaskqwbyte(a, shift),
3736 spu_rlmaskqwbyte(spu_splats((unsigned short)0xFFFF), shift));
3739 static inline void vec_stvlx(vec_short8 a, int b, vec_short8 *c)
3741 int shift;
3742 vec_short8 *p = (vec_short8 *)((unsigned char *)(c) + b);
3744 shift = -((int)p & 0xF);
3745 *p = spu_sel(*p,
3746 spu_rlmaskqwbyte(a, shift),
3747 spu_rlmaskqwbyte(spu_splats((unsigned short)0xFFFF), shift));
3750 static inline void vec_stvlx(vec_uint4 a, int b, unsigned int *c)
3752 int shift;
3753 vec_uint4 *p = (vec_uint4 *)((unsigned char *)(c) + b);
3755 shift = -((int)p & 0xF);
3756 *p = spu_sel(*p,
3757 spu_rlmaskqwbyte(a, shift),
3758 spu_rlmaskqwbyte(spu_splats((unsigned int)0xFFFFFFFF), shift));
3761 static inline void vec_stvlx(vec_uint4 a, int b, vec_uint4 *c)
3763 int shift;
3764 vec_uint4 *p = (vec_uint4 *)((unsigned char *)(c) + b);
3766 shift = -((int)p & 0xF);
3767 *p = spu_sel(*p,
3768 spu_rlmaskqwbyte(a, shift),
3769 spu_rlmaskqwbyte(spu_splats((unsigned int)0xFFFFFFFF), shift));
3772 static inline void vec_stvlx(vec_int4 a, int b, signed int *c)
3774 int shift;
3775 vec_int4 *p = (vec_int4 *)((unsigned char *)(c) + b);
3777 shift = -((int)p & 0xF);
3778 *p = spu_sel(*p,
3779 spu_rlmaskqwbyte(a, shift),
3780 spu_rlmaskqwbyte(spu_splats((unsigned int)0xFFFFFFFF), shift));
3783 static inline void vec_stvlx(vec_int4 a, int b, vec_int4 *c)
3785 int shift;
3786 vec_int4 *p = (vec_int4 *)((unsigned char *)(c) + b);
3788 shift = -((int)p & 0xF);
3789 *p = spu_sel(*p,
3790 spu_rlmaskqwbyte(a, shift),
3791 spu_rlmaskqwbyte(spu_splats((unsigned int)0xFFFFFFFF), shift));
3794 static inline void vec_stvlx(vec_float4 a, int b, float *c)
3796 int shift;
3797 vec_float4 *p = (vec_float4 *)((unsigned char *)(c) + b);
3799 shift = -((int)p & 0xF);
3800 *p = spu_sel(*p,
3801 spu_rlmaskqwbyte(a, shift),
3802 spu_rlmaskqwbyte(spu_splats((unsigned int)0xFFFFFFFF), shift));
3805 static inline void vec_stvlx(vec_float4 a, int b, vec_float4 *c)
3807 int shift;
3808 vec_float4 *p = (vec_float4 *)((unsigned char *)(c) + b);
3810 shift = -((int)p & 0xF);
3811 *p = spu_sel(*p,
3812 spu_rlmaskqwbyte(a, shift),
3813 spu_rlmaskqwbyte(spu_splats((unsigned int)0xFFFFFFFF), shift));
3816 /* vec_stvlxl (store vector left indexed last)
3817 * ==========
3819 #define vec_stvlxl(_a, _b, _c) vec_stvlx(_a, _b, _c)
3822 /* vec_stvrx (store vector right indexed)
3823 * =========
3825 static inline void vec_stvrx(vec_uchar16 a, int b, unsigned char *c)
3827 int shift;
3828 vec_uchar16 *p = (vec_uchar16 *)((unsigned char *)(c) + b);
3830 shift = 16-((int)p & 0xF);
3831 *p = spu_sel(*p,
3832 spu_slqwbyte(a, shift),
3833 spu_slqwbyte(spu_splats((unsigned char)0xFF), shift));
3836 static inline void vec_stvrx(vec_uchar16 a, int b, vec_uchar16 *c)
3838 int shift;
3839 vec_uchar16 *p = (vec_uchar16 *)((unsigned char *)(c) + b);
3841 shift = 16-((int)p & 0xF);
3842 *p = spu_sel(*p,
3843 spu_slqwbyte(a, shift),
3844 spu_slqwbyte(spu_splats((unsigned char)0xFF), shift));
3847 static inline void vec_stvrx(vec_char16 a, int b, signed char *c)
3849 int shift;
3850 vec_char16 *p = (vec_char16 *)((unsigned char *)(c) + b);
3852 shift = 16-((int)p & 0xF);
3853 *p = spu_sel(*p,
3854 spu_slqwbyte(a, shift),
3855 spu_slqwbyte(spu_splats((unsigned char)0xFF), shift));
3858 static inline void vec_stvrx(vec_char16 a, int b, vec_char16 *c)
3860 int shift;
3861 vec_char16 *p = (vec_char16 *)((unsigned char *)(c) + b);
3863 shift = 16-((int)p & 0xF);
3864 *p = spu_sel(*p,
3865 spu_slqwbyte(a, shift),
3866 spu_slqwbyte(spu_splats((unsigned char)0xFF), shift));
3869 static inline void vec_stvrx(vec_ushort8 a, int b, unsigned short *c)
3871 int shift;
3872 vec_ushort8 *p = (vec_ushort8 *)((unsigned char *)(c) + b);
3874 shift = 16-((int)p & 0xF);
3875 *p = spu_sel(*p,
3876 spu_slqwbyte(a, shift),
3877 spu_slqwbyte(spu_splats((unsigned short)0xFFFF), shift));
3880 static inline void vec_stvrx(vec_ushort8 a, int b, vec_ushort8 *c)
3882 int shift;
3883 vec_ushort8 *p = (vec_ushort8 *)((unsigned char *)(c) + b);
3885 shift = 16-((int)p & 0xF);
3886 *p = spu_sel(*p,
3887 spu_slqwbyte(a, shift),
3888 spu_slqwbyte(spu_splats((unsigned short)0xFFFF), shift));
3891 static inline void vec_stvrx(vec_short8 a, int b, signed short *c)
3893 int shift;
3894 vec_short8 *p = (vec_short8 *)((unsigned char *)(c) + b);
3896 shift = 16-((int)p & 0xF);
3897 *p = spu_sel(*p,
3898 spu_slqwbyte(a, shift),
3899 spu_slqwbyte(spu_splats((unsigned short)0xFFFF), shift));
3902 static inline void vec_stvrx(vec_short8 a, int b, vec_short8 *c)
3904 int shift;
3905 vec_short8 *p = (vec_short8 *)((unsigned char *)(c) + b);
3907 shift = 16-((int)p & 0xF);
3908 *p = spu_sel(*p,
3909 spu_slqwbyte(a, shift),
3910 spu_slqwbyte(spu_splats((unsigned short)0xFFFF), shift));
3913 static inline void vec_stvrx(vec_uint4 a, int b, unsigned int *c)
3915 int shift;
3916 vec_uint4 *p = (vec_uint4 *)((unsigned char *)(c) + b);
3918 shift = 16-((int)p & 0xF);
3919 *p = spu_sel(*p,
3920 spu_slqwbyte(a, shift),
3921 spu_slqwbyte(spu_splats((unsigned int)0xFFFFFFFF), shift));
3924 static inline void vec_stvrx(vec_uint4 a, int b, vec_uint4 *c)
3926 int shift;
3927 vec_uint4 *p = (vec_uint4 *)((unsigned char *)(c) + b);
3929 shift = 16-((int)p & 0xF);
3930 *p = spu_sel(*p,
3931 spu_slqwbyte(a, shift),
3932 spu_slqwbyte(spu_splats((unsigned int)0xFFFFFFFF), shift));
3935 static inline void vec_stvrx(vec_int4 a, int b, signed int *c)
3937 int shift;
3938 vec_int4 *p = (vec_int4 *)((unsigned char *)(c) + b);
3940 shift = 16-((int)p & 0xF);
3941 *p = spu_sel(*p,
3942 spu_slqwbyte(a, shift),
3943 spu_slqwbyte(spu_splats((unsigned int)0xFFFFFFFF), shift));
3946 static inline void vec_stvrx(vec_int4 a, int b, vec_int4 *c)
3948 int shift;
3949 vec_int4 *p = (vec_int4 *)((unsigned char *)(c) + b);
3951 shift = 16-((int)p & 0xF);
3952 *p = spu_sel(*p,
3953 spu_slqwbyte(a, shift),
3954 spu_slqwbyte(spu_splats((unsigned int)0xFFFFFFFF), shift));
3957 static inline void vec_stvrx(vec_float4 a, int b, float *c)
3959 int shift;
3960 vec_float4 *p = (vec_float4 *)((unsigned char *)(c) + b);
3962 shift = 16-((int)p & 0xF);
3963 *p = spu_sel(*p,
3964 spu_slqwbyte(a, shift),
3965 spu_slqwbyte(spu_splats((unsigned int)0xFFFFFFFF), shift));
3968 static inline void vec_stvrx(vec_float4 a, int b, vec_float4 *c)
3970 int shift;
3971 vec_float4 *p = (vec_float4 *)((unsigned char *)(c) + b);
3973 shift = 16-((int)p & 0xF);
3974 *p = spu_sel(*p,
3975 spu_slqwbyte(a, shift),
3976 spu_slqwbyte(spu_splats((unsigned int)0xFFFFFFFF), shift));
3979 /* vec_stvrxl (store vector right indexed last)
3980 * ==========
3982 #define vec_stvrxl(_a, _b, _c) vec_stvrx(_a, _b, _c)
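/* vec_stvlx/vec_stvrx perform a read-modify-write of the containing quadword
 * through spu_sel, so bytes outside the addressed range are preserved.  The
 * pair can be combined to store 16 bytes to an unaligned address, mirroring
 * the vec_lvlx/vec_lvrx load idiom sketched earlier (when the address happens
 * to be aligned, the second call rewrites the following quadword unchanged):
 *
 *   static inline void store_unaligned_sketch(vec_uchar16 v, unsigned char *dst)
 *   {
 *     vec_stvlx(v, 0, dst);    // the leading bytes of v, into dst's quadword
 *     vec_stvrx(v, 16, dst);   // the remaining bytes, into the next quadword
 *   }
 */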
3985 #endif /* __SPU__ */
3986 #endif /* __cplusplus */
3987 #endif /* !_VMX2SPU_H_ */