2009-06-03 Richard Guenther <rguenther@suse.de>
[official-gcc.git] / gcc / config / spu / vmx2spu.h
blob409d73f78350e47d2454e313834ce122bb77c689
1 /* Copyright (C) 2006, 2007, 2009 Free Software Foundation, Inc.
3 This file is free software; you can redistribute it and/or modify it under
4 the terms of the GNU General Public License as published by the Free
5 Software Foundation; either version 3 of the License, or (at your option)
6 any later version.
8 This file is distributed in the hope that it will be useful, but WITHOUT
9 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
11 for more details.
13 Under Section 7 of GPL version 3, you are granted additional
14 permissions described in the GCC Runtime Library Exception, version
15 3.1, as published by the Free Software Foundation.
17 You should have received a copy of the GNU General Public License and
18 a copy of the GCC Runtime Library Exception along with this program;
19 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
20 <http://www.gnu.org/licenses/>. */
22 #ifndef _VMX2SPU_H_
23 #define _VMX2SPU_H_ 1
25 #ifdef __cplusplus
27 #ifdef __SPU__
29 #include <spu_intrinsics.h>
30 #include <vec_types.h>
32 /* This file maps generic VMX intrinsics and predicates to the SPU using
33 * overloaded C++ functions.
36 /************************************************************************
37 * INTRINSICS
38 ************************************************************************/
40 /* vec_abs (vector absolute value)
41 * =======
43 static inline vec_char16 vec_abs(vec_char16 a)
45 vec_char16 minus_a;
47 minus_a = (vec_char16)(spu_add((vec_ushort8)(spu_and(spu_xor(a, 0xFF), 0x7F)), 0x101));
48 return (spu_sel(minus_a, a, spu_cmpgt(a, -1)));
51 static inline vec_short8 vec_abs(vec_short8 a)
53 return (spu_sel(spu_sub(0, a), a, spu_cmpgt(a, -1)));
56 static inline vec_int4 vec_abs(vec_int4 a)
58 return (spu_sel(spu_sub(0, a), a, spu_cmpgt(a, -1)));
61 static inline vec_float4 vec_abs(vec_float4 a)
63 return ((vec_float4)(spu_rlmask(spu_sl((vec_uint4)(a), 1), -1)));
66 /* vec_abss (vector absolute value saturate)
67 * ========
69 static inline vec_char16 vec_abss(vec_char16 a)
71 vec_char16 minus_a;
73 minus_a = (vec_char16)spu_add((vec_short8)(spu_xor(a, -1)),
74 (vec_short8)(spu_and(spu_cmpgt((vec_uchar16)(a), 0x80), 1)));
75 return (spu_sel(minus_a, a, spu_cmpgt(a, -1)));
78 static inline vec_short8 vec_abss(vec_short8 a)
80 vec_short8 minus_a;
82 minus_a = spu_add(spu_sub(0, a), (vec_short8)(spu_cmpeq(a, ((vec_short8){0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000}))));
83 return (spu_sel(minus_a, a, spu_cmpgt(a, -1)));
86 static inline vec_int4 vec_abss(vec_int4 a)
88 vec_int4 minus_a;
90 minus_a = spu_add(spu_sub(0, a), (vec_int4)(spu_cmpeq(a, ((vec_int4){0x80000000,0x80000000,0x80000000,0x80000000}))));
91 return (spu_sel(minus_a, a, spu_cmpgt(a, -1)));
95 /* vec_add (vector add)
96 * =======
98 static inline vec_uchar16 vec_add(vec_uchar16 a, vec_uchar16 b)
100 return ((vec_uchar16)(spu_sel(spu_add((vec_ushort8)(a), (vec_ushort8)(b)),
101 spu_add(spu_and((vec_ushort8)(a), 0xFF00), spu_and((vec_ushort8)(b), 0xFF00)),
102 spu_splats((unsigned short)(0xFF00)))));
105 static inline vec_char16 vec_add(vec_char16 a, vec_char16 b)
107 return ((vec_char16)vec_add((vec_uchar16)(a), (vec_uchar16)(b)));
110 static inline vec_char16 vec_add(vec_bchar16 a, vec_char16 b)
112 return ((vec_char16)vec_add((vec_uchar16)(a), (vec_uchar16)(b)));
115 static inline vec_char16 vec_add(vec_char16 a, vec_bchar16 b)
117 return ((vec_char16)vec_add((vec_uchar16)(a), (vec_uchar16)(b)));
120 static inline vec_ushort8 vec_add(vec_ushort8 a, vec_ushort8 b)
122 return (spu_add(a, b));
125 static inline vec_short8 vec_add(vec_short8 a, vec_short8 b)
127 return (spu_add(a, b));
130 static inline vec_short8 vec_add(vec_bshort8 a, vec_short8 b)
132 return (spu_add((vec_short8)(a), b));
135 static inline vec_short8 vec_add(vec_short8 a, vec_bshort8 b)
137 return (spu_add(a, (vec_short8)(b)));
140 static inline vec_uint4 vec_add(vec_uint4 a, vec_uint4 b)
142 return (spu_add(a, b));
145 static inline vec_int4 vec_add(vec_int4 a, vec_int4 b)
147 return (spu_add(a, b));
150 static inline vec_int4 vec_add(vec_bint4 a, vec_int4 b)
152 return (spu_add((vec_int4)(a), b));
155 static inline vec_int4 vec_add(vec_int4 a, vec_bint4 b)
157 return (spu_add(a, (vec_int4)(b)));
160 static inline vec_float4 vec_add(vec_float4 a, vec_float4 b)
162 return (spu_add(a, b));
165 /* vec_addc (vector add carryout unsigned word)
166 * ========
168 #define vec_addc(_a, _b) spu_genc(_a, _b)
170 /* vec_adds (vector add saturated)
171 * ========
173 static inline vec_uchar16 vec_adds(vec_uchar16 a, vec_uchar16 b)
175 vec_uchar16 s1, s2, s, d;
177 s1 = (vec_uchar16)(spu_add(spu_rlmask((vec_ushort8)(a), -8), spu_rlmask((vec_ushort8)(b), -8)));
178 s2 = (vec_uchar16)(spu_add(spu_and((vec_ushort8)(a), 0xFF), spu_and((vec_ushort8)(b), 0xFF)));
179 s = spu_shuffle(s1, s2, ((vec_uchar16){0, 16, 2, 18, 4, 20, 6, 22,
180 8, 24, 10, 26, 12, 28, 14, 30}));
181 d = spu_shuffle(s1, s2, ((vec_uchar16){1, 17, 3, 19, 5, 21, 7, 23,
182 9, 25, 11, 27, 13, 29, 15, 31}));
183 return (spu_or(d, spu_cmpeq(s, 1)));
186 static inline vec_char16 vec_adds(vec_char16 a, vec_char16 b)
188 vec_uchar16 s1, s2, s, d;
190 s1 = (vec_uchar16)(spu_add(spu_rlmask((vec_ushort8)(a), -8), spu_rlmask((vec_ushort8)(b), -8)));
191 s2 = (vec_uchar16)(spu_add(spu_and((vec_ushort8)(a), 0xFF), spu_and((vec_ushort8)(b), 0xFF)));
192 s = spu_shuffle(s1, s2, ((vec_uchar16){1, 17, 3, 19, 5, 21, 7, 23,
193 9, 25, 11, 27, 13, 29, 15, 31}));
194 d = spu_sel(s, spu_splats((unsigned char)0x7F), spu_cmpgt(spu_and(s, (vec_uchar16)(spu_nor(a, b))), 0x7F));
195 d = spu_sel(d, spu_splats((unsigned char)0x80), spu_cmpgt(spu_nor(s, (vec_uchar16)(spu_nand(a, b))), 0x7F));
196 return ((vec_char16)(d));
199 static inline vec_char16 vec_adds(vec_bchar16 a, vec_char16 b)
201 return (vec_adds((vec_char16)(a), b));
204 static inline vec_char16 vec_adds(vec_char16 a, vec_bchar16 b)
206 return (vec_adds(a, (vec_char16)(b)));
209 static inline vec_ushort8 vec_adds(vec_ushort8 a, vec_ushort8 b)
211 vec_ushort8 s, d;
213 s = spu_add(a, b);
214 d = spu_or(s, spu_rlmaska(spu_sel(spu_xor(s, -1), a, spu_eqv(a, b)), -15));
215 return (d);
218 static inline vec_short8 vec_adds(vec_short8 a, vec_short8 b)
220 vec_short8 s, d;
222 s = spu_add(a, b);
223 d = spu_sel(s, spu_splats((signed short)0x7FFF), (vec_ushort8)(spu_rlmaska(spu_and(s, spu_nor(a, b)), -15)));
224 d = spu_sel(d, spu_splats((signed short)0x8000), (vec_ushort8)(spu_rlmaska(spu_nor(s, spu_nand(a, b)), -15)));
225 return (d);
228 static inline vec_short8 vec_adds(vec_bshort8 a, vec_short8 b)
230 return (vec_adds((vec_short8)(a), b));
233 static inline vec_short8 vec_adds(vec_short8 a, vec_bshort8 b)
235 return (vec_adds(a, (vec_short8)(b)));
238 static inline vec_uint4 vec_adds(vec_uint4 a, vec_uint4 b)
240 return (spu_or(spu_add(a, b), spu_rlmaska(spu_sl(spu_genc(a, b), 31), -31)));
243 static inline vec_int4 vec_adds(vec_int4 a, vec_int4 b)
245 vec_int4 s, d;
247 s = spu_add(a, b);
248 d = spu_sel(s, spu_splats((signed int)0x7FFFFFFF), (vec_uint4)spu_rlmaska(spu_and(s, spu_nor(a, b)), -31));
249 d = spu_sel(d, spu_splats((signed int)0x80000000), (vec_uint4)spu_rlmaska(spu_nor(s, spu_nand(a, b)), -31));
250 return (d);
253 static inline vec_int4 vec_adds(vec_bint4 a, vec_int4 b)
255 return (vec_adds((vec_int4)(a), b));
258 static inline vec_int4 vec_adds(vec_int4 a, vec_bint4 b)
260 return (vec_adds(a, (vec_int4)(b)));
263 /* vec_and (vector logical and)
264 * =======
266 static inline vec_uchar16 vec_and(vec_uchar16 a, vec_uchar16 b)
268 return (spu_and(a, b));
271 static inline vec_char16 vec_and(vec_char16 a, vec_char16 b)
273 return (spu_and(a, b));
276 static inline vec_char16 vec_and(vec_bchar16 a, vec_char16 b)
278 return (spu_and((vec_char16)(a), b));
281 static inline vec_char16 vec_and(vec_char16 a, vec_bchar16 b)
283 return (spu_and(a, (vec_char16)(b)));
286 static inline vec_ushort8 vec_and(vec_ushort8 a, vec_ushort8 b)
288 return (spu_and(a, b));
291 static inline vec_short8 vec_and(vec_short8 a, vec_short8 b)
293 return (spu_and(a, b));
296 static inline vec_short8 vec_and(vec_bshort8 a, vec_short8 b)
298 return (spu_and((vec_short8)(a), b));
301 static inline vec_short8 vec_and(vec_short8 a, vec_bshort8 b)
303 return (spu_and(a, (vec_short8)(b)));
306 static inline vec_uint4 vec_and(vec_uint4 a, vec_uint4 b)
308 return (spu_and(a, b));
311 static inline vec_int4 vec_and(vec_int4 a, vec_int4 b)
313 return (spu_and(a, b));
316 static inline vec_int4 vec_and(vec_bint4 a, vec_int4 b)
318 return (spu_and((vec_int4)(a), b));
321 static inline vec_int4 vec_and(vec_int4 a, vec_bint4 b)
323 return (spu_and(a, (vec_int4)(b)));
326 static inline vec_float4 vec_and(vec_float4 a, vec_float4 b)
328 return (spu_and(a, b));
331 static inline vec_float4 vec_and(vec_bint4 a, vec_float4 b)
333 return (spu_and((vec_float4)(a),b));
336 static inline vec_float4 vec_and(vec_float4 a, vec_bint4 b)
338 return (spu_and(a, (vec_float4)(b)));
342 /* vec_andc (vector logical and with complement)
343 * ========
345 static inline vec_uchar16 vec_andc(vec_uchar16 a, vec_uchar16 b)
347 return (spu_andc(a, b));
350 static inline vec_char16 vec_andc(vec_char16 a, vec_char16 b)
352 return (spu_andc(a, b));
355 static inline vec_char16 vec_andc(vec_bchar16 a, vec_char16 b)
357 return (spu_andc((vec_char16)(a), b));
360 static inline vec_char16 vec_andc(vec_char16 a, vec_bchar16 b)
362 return (spu_andc(a, (vec_char16)(b)));
365 static inline vec_ushort8 vec_andc(vec_ushort8 a, vec_ushort8 b)
367 return (spu_andc(a, b));
370 static inline vec_short8 vec_andc(vec_short8 a, vec_short8 b)
372 return (spu_andc(a, b));
375 static inline vec_short8 vec_andc(vec_bshort8 a, vec_short8 b)
377 return (spu_andc((vec_short8)(a), b));
380 static inline vec_short8 vec_andc(vec_short8 a, vec_bshort8 b)
382 return (spu_andc(a, (vec_short8)(b)));
385 static inline vec_uint4 vec_andc(vec_uint4 a, vec_uint4 b)
387 return (spu_andc(a, b));
390 static inline vec_int4 vec_andc(vec_int4 a, vec_int4 b)
392 return (spu_andc(a, b));
395 static inline vec_int4 vec_andc(vec_bint4 a, vec_int4 b)
397 return (spu_andc((vec_int4)(a), b));
400 static inline vec_int4 vec_andc(vec_int4 a, vec_bint4 b)
402 return (spu_andc(a, (vec_int4)(b)));
405 static inline vec_float4 vec_andc(vec_float4 a, vec_float4 b)
407 return (spu_andc(a,b));
410 static inline vec_float4 vec_andc(vec_bint4 a, vec_float4 b)
412 return (spu_andc((vec_float4)(a),b));
415 static inline vec_float4 vec_andc(vec_float4 a, vec_bint4 b)
417 return (spu_andc(a, (vec_float4)(b)));
420 /* vec_avg (vector average)
421 * =======
423 static inline vec_uchar16 vec_avg(vec_uchar16 a, vec_uchar16 b)
425 return (spu_avg(a, b));
428 static inline vec_char16 vec_avg(vec_char16 a, vec_char16 b)
430 return ((vec_char16)(spu_xor(spu_avg((vec_uchar16)(a), (vec_uchar16)(b)),
431 (vec_uchar16)(spu_and(spu_xor(a,b), 0x80)))));
434 static inline vec_ushort8 vec_avg(vec_ushort8 a, vec_ushort8 b)
436 return (spu_add(spu_add(spu_rlmask(a, -1), spu_rlmask(b, -1)),
437 spu_and(spu_or(a, b), 1)));
440 static inline vec_short8 vec_avg(vec_short8 a, vec_short8 b)
442 return (spu_add(spu_add(spu_rlmaska(a, -1), spu_rlmaska(b, -1)),
443 spu_and(spu_or(a, b), 1)));
446 static inline vec_uint4 vec_avg(vec_uint4 a, vec_uint4 b)
448 return (spu_add(spu_add(spu_rlmask(a, -1), spu_rlmask(b, -1)),
449 spu_and(spu_or(a, b), 1)));
452 static inline vec_int4 vec_avg(vec_int4 a, vec_int4 b)
454 return (spu_add(spu_add(spu_rlmaska(a, -1), spu_rlmaska(b, -1)),
455 spu_and(spu_or(a, b), 1)));
459 /* vec_ceil (vector ceiling)
460 * ========
462 static inline vec_float4 vec_ceil(vec_float4 a)
464 vec_int4 exp;
465 vec_uint4 mask;
467 a = spu_add(a, (vec_float4)(spu_and(spu_xor(spu_rlmaska((vec_int4)a, -31), -1), spu_splats((signed int)0x3F7FFFFF))));
468 exp = spu_sub(127, (vec_int4)(spu_and(spu_rlmask((vec_uint4)(a), -23), 0xFF)));
469 mask = spu_rlmask(spu_splats((unsigned int)0x7FFFFF), exp);
470 mask = spu_sel(spu_splats((unsigned int)0), mask, spu_cmpgt(exp, -31));
471 mask = spu_or(mask, spu_xor((vec_uint4)(spu_rlmaska(spu_add(exp, -1), -31)), -1));
473 return ((vec_float4)(spu_andc((vec_uint4)(a), mask)));
477 /* vec_cmpb (vector compare bounds floating-point)
478 * ========
480 static inline vec_int4 vec_cmpb(vec_float4 a, vec_float4 b)
482 vec_int4 b0 = (vec_int4)spu_splats(0x80000000);
483 vec_int4 b1 = (vec_int4)spu_splats(0x40000000);
485 return (spu_or(spu_and((vec_int4)spu_cmpgt(a, b), b0),
486 spu_and((vec_int4)spu_cmpgt(spu_xor(b, (vec_float4)(b0)), a), b1)));
489 /* vec_cmpeq (vector compare equal)
490 * =========
492 #define vec_cmpeq(_a, _b) spu_cmpeq(_a, _b)
495 /* vec_cmpge (vector compare greater than or equal)
496 * =========
498 static inline vec_bint4 vec_cmpge(vec_float4 a, vec_float4 b)
500 return (spu_xor(spu_cmpgt(b, a), -1));
504 /* vec_cmpgt (vector compare greater than)
505 * =========
507 #define vec_cmpgt(_a, _b) spu_cmpgt(_a, _b)
510 /* vec_cmple (vector compare less than or equal)
511 * =========
513 static inline vec_bint4 vec_cmple(vec_float4 a, vec_float4 b)
515 return (spu_xor(spu_cmpgt(a, b), -1));
519 /* vec_cmplt (vector compare less than)
520 * =========
522 #define vec_cmplt(_a, _b) spu_cmpgt(_b, _a)
525 /* vec_ctf (vector convert from fixed-point word)
526 * =======
528 #define vec_ctf(_a, _b) spu_convtf(_a, _b)
531 /* vec_cts (vector convert to signed fixed-point word saturate)
532 * =======
534 #define vec_cts(_a, _b) spu_convts(_a, _b)
537 /* vec_ctu (vector convert to unsigned fixed-point word saturate)
538 * =======
540 #define vec_ctu(_a, _b) spu_convtu(_a, _b)
543 /* vec_dss (vector data stream stop)
544 * =======
546 #define vec_dss(_a)
549 /* vec_dssall (vector data stream stop all)
550 * ==========
552 #define vec_dssall()
555 /* vec_dst (vector data stream touch)
556 * =======
558 #define vec_dst(_a, _b, _c)
561 /* vec_dstst (vector data stream touch for store)
562 * =========
564 #define vec_dstst(_a, _b, _c)
567 /* vec_dststt (vector data stream touch for store transient)
568 * ==========
570 #define vec_dststt(_a, _b, _c)
573 /* vec_dstt (vector data stream touch transient)
574 * ========
576 #define vec_dstt(_a, _b, _c)
579 /* vec_expte (vector is 2 raised tp the exponent estimate floating-point)
580 * =========
582 static inline vec_float4 vec_expte(vec_float4 a)
584 vec_float4 bias, frac, exp;
585 vec_int4 ia;
587 bias = (vec_float4)(spu_andc(spu_splats((signed int)0x3F7FFFFF), spu_rlmaska((vec_int4)(a), -31)));
588 ia = spu_convts(spu_add(a, bias), 0);
589 frac = spu_sub(spu_convtf(ia, 0), a);
590 exp = (vec_float4)(spu_sl(spu_add(ia, 127), 23));
592 return (spu_mul(spu_madd(spu_madd(spu_splats(0.17157287f), frac, spu_splats(-0.67157287f)),
593 frac, spu_splats(1.0f)), exp));
597 /* vec_floor (vector floor)
598 * =========
600 static inline vec_float4 vec_floor(vec_float4 a)
602 vec_int4 exp;
603 vec_uint4 mask;
605 a = spu_sub(a, (vec_float4)(spu_and(spu_rlmaska((vec_int4)a, -31), spu_splats((signed int)0x3F7FFFFF))));
606 exp = spu_sub(127, (vec_int4)(spu_and(spu_rlmask((vec_uint4)(a), -23), 0xFF)));
607 mask = spu_rlmask(spu_splats((unsigned int)0x7FFFFF), exp);
608 mask = spu_sel(spu_splats((unsigned int)0), mask, spu_cmpgt(exp, -31));
609 mask = spu_or(mask, spu_xor((vec_uint4)(spu_rlmaska(spu_add(exp, -1), -31)), -1));
611 return ((vec_float4)(spu_andc((vec_uint4)(a), mask)));
615 /* vec_ld (vector load indexed)
616 * ======
618 static inline vec_uchar16 vec_ld(int a, unsigned char *b)
620 return (*((vec_uchar16 *)(b+a)));
623 static inline vec_uchar16 vec_ld(int a, vec_uchar16 *b)
625 return (*((vec_uchar16 *)((unsigned char *)(b)+a)));
628 static inline vec_char16 vec_ld(int a, signed char *b)
630 return (*((vec_char16 *)(b+a)));
633 static inline vec_char16 vec_ld(int a, vec_char16 *b)
635 return (*((vec_char16 *)((signed char *)(b)+a)));
638 static inline vec_ushort8 vec_ld(int a, unsigned short *b)
640 return (*((vec_ushort8 *)((unsigned char *)(b)+a)));
643 static inline vec_ushort8 vec_ld(int a, vec_ushort8 *b)
645 return (*((vec_ushort8 *)((unsigned char *)(b)+a)));
648 static inline vec_short8 vec_ld(int a, signed short *b)
650 return (*((vec_short8 *)((unsigned char *)(b)+a)));
653 static inline vec_short8 vec_ld(int a, vec_short8 *b)
655 return (*((vec_short8 *)((signed char *)(b)+a)));
658 static inline vec_uint4 vec_ld(int a, unsigned int *b)
660 return (*((vec_uint4 *)((unsigned char *)(b)+a)));
663 static inline vec_uint4 vec_ld(int a, vec_uint4 *b)
665 return (*((vec_uint4 *)((unsigned char *)(b)+a)));
668 static inline vec_int4 vec_ld(int a, signed int *b)
670 return (*((vec_int4 *)((unsigned char *)(b)+a)));
673 static inline vec_int4 vec_ld(int a, vec_int4 *b)
675 return (*((vec_int4 *)((signed char *)(b)+a)));
678 static inline vec_float4 vec_ld(int a, float *b)
680 return (*((vec_float4 *)((unsigned char *)(b)+a)));
683 static inline vec_float4 vec_ld(int a, vec_float4 *b)
685 return (*((vec_float4 *)((unsigned char *)(b)+a)));
688 /* vec_lde (vector load element indexed)
689 * =======
691 static inline vec_uchar16 vec_lde(int a, unsigned char *b)
693 return (*((vec_uchar16 *)(b+a)));
696 static inline vec_char16 vec_lde(int a, signed char *b)
698 return (*((vec_char16 *)(b+a)));
701 static inline vec_ushort8 vec_lde(int a, unsigned short *b)
703 return (*((vec_ushort8 *)((unsigned char *)(b)+a)));
706 static inline vec_short8 vec_lde(int a, signed short *b)
708 return (*((vec_short8 *)((unsigned char *)(b)+a)));
712 static inline vec_uint4 vec_lde(int a, unsigned int *b)
714 return (*((vec_uint4 *)((unsigned char *)(b)+a)));
717 static inline vec_int4 vec_lde(int a, signed int *b)
719 return (*((vec_int4 *)((unsigned char *)(b)+a)));
723 static inline vec_float4 vec_lde(int a, float *b)
725 return (*((vec_float4 *)((unsigned char *)(b)+a)));
728 /* vec_ldl (vector load indexed LRU)
729 * =======
731 #define vec_ldl(_a, _b) vec_ld(_a, _b)
734 /* vec_loge (vector log2 estimate floating-point)
735 * ========
737 static inline vec_float4 vec_loge(vec_float4 a)
739 vec_int4 exp;
740 vec_float4 frac;
742 exp = spu_add((vec_int4)(spu_and(spu_rlmask((vec_uint4)(a), -23), 0xFF)), -127);
743 frac = (vec_float4)(spu_sub((vec_int4)(a), spu_sl(exp, 23)));
745 return (spu_madd(spu_madd(spu_splats(-0.33985f), frac, spu_splats(2.01955f)),
746 frac, spu_sub(spu_convtf(exp, 0), spu_splats(1.6797f))));
750 /* vec_lvsl (vector load for shift left)
751 * ========
753 static inline vec_uchar16 vec_lvsl(int a, unsigned char *b)
755 return ((vec_uchar16)spu_add((vec_ushort8)(spu_splats((unsigned char)((a + (int)(b)) & 0xF))),
756 ((vec_ushort8){0x0001, 0x0203, 0x0405, 0x0607,
757 0x0809, 0x0A0B, 0x0C0D, 0x0E0F})));
760 static inline vec_uchar16 vec_lvsl(int a, signed char *b)
762 return (vec_lvsl(a, (unsigned char *)b));
765 static inline vec_uchar16 vec_lvsl(int a, unsigned short *b)
767 return (vec_lvsl(a, (unsigned char *)b));
770 static inline vec_uchar16 vec_lvsl(int a, short *b)
772 return (vec_lvsl(a, (unsigned char *)b));
775 static inline vec_uchar16 vec_lvsl(int a, unsigned int *b)
777 return (vec_lvsl(a, (unsigned char *)b));
780 static inline vec_uchar16 vec_lvsl(int a, int *b)
782 return (vec_lvsl(a, (unsigned char *)b));
785 static inline vec_uchar16 vec_lvsl(int a, float *b)
787 return (vec_lvsl(a, (unsigned char *)b));
791 /* vec_lvsr (vector load for shift right)
792 * ========
794 static inline vec_uchar16 vec_lvsr(int a, unsigned char *b)
796 return ((vec_uchar16)(spu_sub(((vec_ushort8){0x1011, 0x1213, 0x1415, 0x1617,
797 0x1819, 0x1A1B, 0x1C1D, 0x1E1F}),
798 (vec_ushort8)(spu_splats((unsigned char)((a + (int)(b)) & 0xF))))));
801 static inline vec_uchar16 vec_lvsr(int a, signed char *b)
803 return (vec_lvsr(a, (unsigned char *)b));
806 static inline vec_uchar16 vec_lvsr(int a, unsigned short *b)
808 return (vec_lvsr(a, (unsigned char *)b));
811 static inline vec_uchar16 vec_lvsr(int a, short *b)
813 return (vec_lvsr(a, (unsigned char *)b));
816 static inline vec_uchar16 vec_lvsr(int a, unsigned int *b)
818 return (vec_lvsr(a, (unsigned char *)b));
821 static inline vec_uchar16 vec_lvsr(int a, int *b)
823 return (vec_lvsr(a, (unsigned char *)b));
826 static inline vec_uchar16 vec_lvsr(int a, float *b)
828 return (vec_lvsr(a, (unsigned char *)b));
831 /* vec_madd (vector multiply add)
832 * ========
834 #define vec_madd(_a, _b, _c) spu_madd(_a, _b, _c)
838 /* vec_madds (vector multiply add saturate)
839 * =========
841 static inline vec_short8 vec_madds(vec_short8 a, vec_short8 b, vec_short8 c)
843 return (vec_adds(c, spu_sel((vec_short8)(spu_sl(spu_mule(a, b), 1)),
844 (vec_short8)(spu_rlmask(spu_mulo(a, b), -15)),
845 ((vec_ushort8){0, 0xFFFF, 0, 0xFFFF, 0, 0xFFFF, 0, 0xFFFF}))));
848 /* vec_max (vector maximum)
849 * =======
851 static inline vec_uchar16 vec_max(vec_uchar16 a, vec_uchar16 b)
853 return (spu_sel(b, a, spu_cmpgt(a, b)));
856 static inline vec_char16 vec_max(vec_char16 a, vec_char16 b)
858 return (spu_sel(b, a, spu_cmpgt(a, b)));
861 static inline vec_char16 vec_max(vec_bchar16 a, vec_char16 b)
863 return (spu_sel(b, (vec_char16)(a), spu_cmpgt((vec_char16)(a), b)));
866 static inline vec_char16 vec_max(vec_char16 a, vec_bchar16 b)
868 return (spu_sel((vec_char16)(b), a, spu_cmpgt(a, (vec_char16)(b))));
871 static inline vec_ushort8 vec_max(vec_ushort8 a, vec_ushort8 b)
873 return (spu_sel(b, a, spu_cmpgt(a, b)));
876 static inline vec_short8 vec_max(vec_short8 a, vec_short8 b)
878 return (spu_sel(b, a, spu_cmpgt(a, b)));
881 static inline vec_short8 vec_max(vec_bshort8 a, vec_short8 b)
883 return (spu_sel(b, (vec_short8)(a), spu_cmpgt((vec_short8)(a), b)));
886 static inline vec_short8 vec_max(vec_short8 a, vec_bshort8 b)
888 return (spu_sel((vec_short8)(b), a, spu_cmpgt(a, (vec_short8)(b))));
891 static inline vec_uint4 vec_max(vec_uint4 a, vec_uint4 b)
893 return (spu_sel(b, a, spu_cmpgt(a, b)));
896 static inline vec_int4 vec_max(vec_int4 a, vec_int4 b)
898 return (spu_sel(b, a, spu_cmpgt(a, b)));
901 static inline vec_int4 vec_max(vec_bint4 a, vec_int4 b)
903 return (spu_sel(b, (vec_int4)(a), spu_cmpgt((vec_int4)(a), b)));
906 static inline vec_int4 vec_max(vec_int4 a, vec_bint4 b)
908 return (spu_sel((vec_int4)(b), a, spu_cmpgt(a, (vec_int4)(b))));
911 static inline vec_float4 vec_max(vec_float4 a, vec_float4 b)
913 return (spu_sel(b, a, spu_cmpgt(a, b)));
917 /* vec_mergeh (vector merge high)
918 * ==========
920 static inline vec_uchar16 vec_mergeh(vec_uchar16 a, vec_uchar16 b)
922 return (spu_shuffle(a, b, ((vec_uchar16){0, 16, 1, 17, 2, 18, 3, 19,
923 4, 20, 5, 21, 6, 22, 7, 23})));
926 static inline vec_char16 vec_mergeh(vec_char16 a, vec_char16 b)
928 return (spu_shuffle(a, b, ((vec_uchar16){0, 16, 1, 17, 2, 18, 3, 19,
929 4, 20, 5, 21, 6, 22, 7, 23})));
932 static inline vec_ushort8 vec_mergeh(vec_ushort8 a, vec_ushort8 b)
934 return (spu_shuffle(a, b, ((vec_uchar16){0, 1, 16, 17, 2, 3, 18, 19,
935 4, 5, 20, 21, 6, 7, 22, 23})));
938 static inline vec_short8 vec_mergeh(vec_short8 a, vec_short8 b)
940 return (spu_shuffle(a, b, ((vec_uchar16){0, 1, 16, 17, 2, 3, 18, 19,
941 4, 5, 20, 21, 6, 7, 22, 23})));
944 static inline vec_uint4 vec_mergeh(vec_uint4 a, vec_uint4 b)
946 return (spu_shuffle(a, b, ((vec_uchar16){0, 1, 2, 3, 16, 17, 18, 19,
947 4, 5, 6, 7, 20, 21, 22, 23})));
950 static inline vec_int4 vec_mergeh(vec_int4 a, vec_int4 b)
952 return (spu_shuffle(a, b, ((vec_uchar16){0, 1, 2, 3, 16, 17, 18, 19,
953 4, 5, 6, 7, 20, 21, 22, 23})));
956 static inline vec_float4 vec_mergeh(vec_float4 a, vec_float4 b)
958 return (spu_shuffle(a, b, ((vec_uchar16){0, 1, 2, 3, 16, 17, 18, 19,
959 4, 5, 6, 7, 20, 21, 22, 23})));
962 /* vec_mergel (vector merge low)
963 * ==========
965 static inline vec_uchar16 vec_mergel(vec_uchar16 a, vec_uchar16 b)
967 return (spu_shuffle(a, b, ((vec_uchar16){ 8, 24, 9, 25, 10, 26, 11, 27,
968 12, 28, 13, 29, 14, 30, 15, 31})));
971 static inline vec_char16 vec_mergel(vec_char16 a, vec_char16 b)
973 return (spu_shuffle(a, b, ((vec_uchar16){ 8, 24, 9, 25, 10, 26, 11, 27,
974 12, 28, 13, 29, 14, 30, 15, 31})));
977 static inline vec_ushort8 vec_mergel(vec_ushort8 a, vec_ushort8 b)
979 return (spu_shuffle(a, b, ((vec_uchar16){ 8, 9, 24, 25, 10, 11, 26, 27,
980 12, 13, 28, 29, 14, 15, 30, 31})));
983 static inline vec_short8 vec_mergel(vec_short8 a, vec_short8 b)
985 return (spu_shuffle(a, b, ((vec_uchar16){ 8, 9, 24, 25, 10, 11, 26, 27,
986 12, 13, 28, 29, 14, 15, 30, 31})));
989 static inline vec_uint4 vec_mergel(vec_uint4 a, vec_uint4 b)
991 return (spu_shuffle(a, b, ((vec_uchar16){ 8, 9, 10, 11, 24, 25, 26, 27,
992 12, 13, 14, 15, 28, 29, 30, 31})));
995 static inline vec_int4 vec_mergel(vec_int4 a, vec_int4 b)
997 return (spu_shuffle(a, b, ((vec_uchar16){ 8, 9, 10, 11, 24, 25, 26, 27,
998 12, 13, 14, 15, 28, 29, 30, 31})));
1001 static inline vec_float4 vec_mergel(vec_float4 a, vec_float4 b)
1003 return (spu_shuffle(a, b, ((vec_uchar16){ 8, 9, 10, 11, 24, 25, 26, 27,
1004 12, 13, 14, 15, 28, 29, 30, 31})));
1007 /* vec_mfvscr (vector move from vector status and control register)
1008 * ==========
1010 static inline vec_ushort8 vec_mfvscr()
1012 return ((vec_ushort8)spu_splats(0)); /* not supported */
1016 /* vec_min (vector minimum)
1017 * =======
1019 static inline vec_uchar16 vec_min(vec_uchar16 a, vec_uchar16 b)
1021 return (spu_sel(a, b, spu_cmpgt(a, b)));
1024 static inline vec_char16 vec_min(vec_char16 a, vec_char16 b)
1026 return (spu_sel(a, b, spu_cmpgt(a, b)));
1029 static inline vec_char16 vec_min(vec_bchar16 a, vec_char16 b)
1031 return (spu_sel((vec_char16)(a), b, spu_cmpgt((vec_char16)(a), b)));
1034 static inline vec_char16 vec_min(vec_char16 a, vec_bchar16 b)
1036 return (spu_sel(a, (vec_char16)(b), spu_cmpgt(a, (vec_char16)(b))));
1039 static inline vec_ushort8 vec_min(vec_ushort8 a, vec_ushort8 b)
1041 return (spu_sel(a, b, spu_cmpgt(a, b)));
1044 static inline vec_short8 vec_min(vec_short8 a, vec_short8 b)
1046 return (spu_sel(a, b, spu_cmpgt(a, b)));
1049 static inline vec_short8 vec_min(vec_bshort8 a, vec_short8 b)
1051 return (spu_sel((vec_short8)(a), b, spu_cmpgt((vec_short8)(a), b)));
1054 static inline vec_short8 vec_min(vec_short8 a, vec_bshort8 b)
1056 return (spu_sel(a, (vec_short8)(b), spu_cmpgt(a, (vec_short8)(b))));
1059 static inline vec_uint4 vec_min(vec_uint4 a, vec_uint4 b)
1061 return (spu_sel(a, b, spu_cmpgt(a, b)));
1064 static inline vec_int4 vec_min(vec_int4 a, vec_int4 b)
1066 return (spu_sel(a, b, spu_cmpgt(a, b)));
1069 static inline vec_int4 vec_min(vec_bint4 a, vec_int4 b)
1071 return (spu_sel((vec_int4)(a), b, spu_cmpgt((vec_int4)(a), b)));
1074 static inline vec_int4 vec_min(vec_int4 a, vec_bint4 b)
1076 return (spu_sel(a, (vec_int4)(b), spu_cmpgt(a, (vec_int4)(b))));
1079 static inline vec_float4 vec_min(vec_float4 a, vec_float4 b)
1081 return (spu_sel(a, b, spu_cmpgt(a, b)));
1084 /* vec_mladd (vector multiply low and add unsigned half word)
1085 * =========
1087 static inline vec_short8 vec_mladd(vec_short8 a, vec_short8 b, vec_short8 c)
1089 return ((vec_short8)(spu_shuffle(spu_madd((vec_short8)(spu_rl((vec_uint4)(a), -16)),
1090 (vec_short8)(spu_rl((vec_uint4)(b), -16)),
1091 (vec_int4)(spu_rl((vec_uint4)(c), -16))),
1092 spu_madd(a, b, spu_extend(c)),
1093 ((vec_uchar16){ 2, 3, 18, 19, 6, 7, 22, 23,
1094 10, 11, 26, 27, 14, 15, 30, 31}))));
1098 static inline vec_ushort8 vec_mladd(vec_ushort8 a, vec_ushort8 b, vec_ushort8 c)
1100 return ((vec_ushort8)(vec_mladd((vec_short8)(a), (vec_short8)(b), (vec_short8)(c))));
1103 static inline vec_short8 vec_mladd(vec_ushort8 a, vec_short8 b, vec_short8 c)
1105 return (vec_mladd((vec_short8)(a), b, c));
1108 static inline vec_short8 vec_mladd(vec_short8 a, vec_ushort8 b, vec_ushort8 c)
1110 return (vec_mladd(a, (vec_short8)(b), (vec_short8)(c)));
1114 /* vec_mradds (vector multiply round and add saturate)
1115 * ==========
1117 static inline vec_short8 vec_mradds(vec_short8 a, vec_short8 b, vec_short8 c)
1119 vec_int4 round = (vec_int4)spu_splats(0x4000);
1120 vec_short8 hi, lo;
1122 hi = (vec_short8)(spu_sl(spu_add(spu_mule(a, b), round), 1));
1123 lo = (vec_short8)(spu_rlmask(spu_add(spu_mulo(a, b), round), -15));
1125 return (vec_adds(spu_sel(hi, lo, ((vec_ushort8){0, 0xFFFF, 0, 0xFFFF, 0, 0xFFFF, 0, 0xFFFF})), c));
1129 /* vec_msum (vector multiply sum)
1130 * ========
1132 static inline vec_uint4 vec_msum(vec_uchar16 a, vec_uchar16 b, vec_uint4 c)
1134 vec_ushort8 a1, a2, b1, b2;
1135 vec_uint4 p1, p2;
1137 a1 = spu_and((vec_ushort8)(a), 0xFF);
1138 a2 = spu_rlmask((vec_ushort8)(a), -8);
1139 b1 = spu_and((vec_ushort8)(b), 0xFF);
1140 b2 = spu_rlmask((vec_ushort8)(b), -8);
1142 p1 = spu_add(spu_mulo(a1, b1), spu_mulo(spu_rlqwbyte(a1, -2), spu_rlqwbyte(b1, -2)));
1143 p2 = spu_add(spu_mulo(a2, b2), spu_mulo(spu_rlqwbyte(a2, -2), spu_rlqwbyte(b2, -2)));
1144 return (spu_add(p2, spu_add(p1, c)));
1147 static inline vec_int4 vec_msum(vec_char16 a, vec_uchar16 b, vec_int4 c)
1149 vec_short8 a1, a2, b1, b2;
1150 vec_int4 p1, p2;
1152 a1 = (vec_short8)(spu_extend(a));
1153 a2 = spu_rlmaska((vec_short8)(a), -8);
1154 b1 = (vec_short8)(spu_and((vec_ushort8)(b), 0xFF));
1155 b2 = (vec_short8)spu_rlmask((vec_ushort8)(b), -8);
1157 p1 = spu_add(spu_mulo(a1, b1), spu_mulo(spu_rlqwbyte(a1, -2), spu_rlqwbyte(b1, -2)));
1158 p2 = spu_add(spu_mulo(a2, b2), spu_mulo(spu_rlqwbyte(a2, -2), spu_rlqwbyte(b2, -2)));
1159 return (spu_add(p2, spu_add(p1, c)));
1162 static inline vec_uint4 vec_msum(vec_ushort8 a, vec_ushort8 b, vec_uint4 c)
1164 return (spu_add(spu_add(spu_mulo(a, b), spu_mulo(spu_rlqwbyte(a, -2), spu_rlqwbyte(b, -2))), c));
1167 static inline vec_int4 vec_msum(vec_short8 a, vec_short8 b, vec_int4 c)
1169 return (spu_add(spu_add(spu_mulo(a, b), spu_mulo(spu_rlqwbyte(a, -2), spu_rlqwbyte(b, -2))), c));
1173 /* vec_msums (vector multiply sum saturate)
1174 * ========
1176 static inline vec_uint4 vec_msums(vec_ushort8 a, vec_ushort8 b, vec_uint4 c)
1178 vec_uint4 p1, p2;
1180 p1 = spu_mulo(a, b);
1181 p2 = spu_mulo(spu_rlqwbyte(a, -2), spu_rlqwbyte(b, -2));
1183 return (vec_adds(p2, vec_adds(p1, c)));
1186 static inline vec_int4 vec_msums(vec_short8 a, vec_short8 b, vec_int4 c)
1188 return (vec_adds(spu_add(spu_mulo(a, b), spu_mulo(spu_rlqwbyte(a, -2), spu_rlqwbyte(b, -2))), c));
1191 /* vec_mtvscr (vector move to vector status and control register)
1192 * ==========
1194 #define vec_mtvscr(_a) /* not supported */
1197 /* vec_mule (vector multiply even)
1198 * ========
1200 static inline vec_ushort8 vec_mule(vec_uchar16 a, vec_uchar16 b)
1202 vec_ushort8 hi, lo;
1204 hi = (vec_ushort8)spu_mulo((vec_ushort8)(spu_rlmask((vec_uint4)(a), -24)),
1205 (vec_ushort8)(spu_rlmask((vec_uint4)(b), -24)));
1206 lo = (vec_ushort8)spu_mulo((vec_ushort8)(spu_rlmask((vec_short8)(a), -8)),
1207 (vec_ushort8)(spu_rlmask((vec_short8)(b), -8)));
1209 return (spu_shuffle(hi, lo, ((vec_uchar16){ 2, 3, 18, 19, 6, 7, 22, 23,
1210 10, 11, 26, 27, 14, 15, 30, 31})));
1213 static inline vec_short8 vec_mule(vec_char16 a, vec_char16 b)
1215 vec_short8 hi, lo;
1217 hi = (vec_short8)spu_mulo((vec_short8)(spu_rlmaska((vec_uint4)(a), -24)),
1218 (vec_short8)(spu_rlmaska((vec_uint4)(b), -24)));
1219 lo = (vec_short8)spu_mulo((vec_short8)(spu_rlmaska((vec_short8)(a), -8)),
1220 (vec_short8)(spu_rlmaska((vec_short8)(b), -8)));
1222 return (spu_shuffle(hi, lo, ((vec_uchar16){ 2, 3, 18, 19, 6, 7, 22, 23,
1223 10, 11, 26, 27, 14, 15, 30, 31})));
1226 static inline vec_uint4 vec_mule(vec_ushort8 a, vec_ushort8 b)
1228 return (spu_mulo((vec_ushort8)spu_rlmask((vec_uint4)(a), -16),
1229 (vec_ushort8)spu_rlmask((vec_uint4)(b), -16)));
1233 static inline vec_int4 vec_mule(vec_short8 a, vec_short8 b)
1235 return (spu_mulo((vec_short8)spu_rlmaska((vec_int4)(a), -16),
1236 (vec_short8)spu_rlmaska((vec_int4)(b), -16)));
1240 /* vec_mulo (vector multiply odd)
1241 * ========
1243 static inline vec_ushort8 vec_mulo(vec_uchar16 a, vec_uchar16 b)
1245 vec_ushort8 hi, lo;
1247 hi = (vec_ushort8)spu_mulo((vec_ushort8)(spu_and(spu_rlmask((vec_uint4)(a), -16), 0xFF)),
1248 (vec_ushort8)(spu_and(spu_rlmask((vec_uint4)(b), -16), 0xFF)));
1249 lo = (vec_ushort8)spu_mulo(spu_and((vec_ushort8)(a), 0xFF), spu_and((vec_ushort8)(b), 0xFF));
1251 return (spu_shuffle(hi, lo, ((vec_uchar16){ 2, 3, 18, 19, 6, 7, 22, 23,
1252 10, 11, 26, 27, 14, 15, 30, 31})));
1255 static inline vec_short8 vec_mulo(vec_char16 a, vec_char16 b)
1257 vec_short8 aa, bb, hi, lo;
1259 aa = spu_extend(a);
1260 bb = spu_extend(b);
1262 hi = (vec_short8)spu_mulo((vec_short8)(spu_rlmaska((vec_uint4)(aa), -16)),
1263 (vec_short8)(spu_rlmaska((vec_uint4)(bb), -16)));
1264 lo = (vec_short8)spu_mulo(aa, bb);
1265 return (spu_shuffle(hi, lo, ((vec_uchar16){ 2, 3, 18, 19, 6, 7, 22, 23,
1266 10, 11, 26, 27, 14, 15, 30, 31})));
1269 static inline vec_uint4 vec_mulo(vec_ushort8 a, vec_ushort8 b)
1271 return (spu_mulo(a, b));
1275 static inline vec_int4 vec_mulo(vec_short8 a, vec_short8 b)
1277 return (spu_mulo(a, b));
1281 /* vec_nmsub (vector negative multiply subtract)
1282 * =========
1284 #define vec_nmsub(_a, _b, _c) spu_nmsub(_a, _b, _c)
1287 /* vec_nor (vector logical nor)
1288 * =======
1290 #define vec_nor(_a, _b) spu_nor(_a, _b)
1293 /* vec_or (vector logical or)
1294 * ======
1296 static inline vec_uchar16 vec_or(vec_uchar16 a, vec_uchar16 b)
1298 return (spu_or(a, b));
1301 static inline vec_char16 vec_or(vec_char16 a, vec_char16 b)
1303 return (spu_or(a, b));
1306 static inline vec_char16 vec_or(vec_bchar16 a, vec_char16 b)
1308 return (spu_or((vec_char16)(a), b));
1311 static inline vec_char16 vec_or(vec_char16 a, vec_bchar16 b)
1313 return (spu_or(a, (vec_char16)(b)));
1316 static inline vec_ushort8 vec_or(vec_ushort8 a, vec_ushort8 b)
1318 return (spu_or(a, b));
1321 static inline vec_short8 vec_or(vec_short8 a, vec_short8 b)
1323 return (spu_or(a, b));
1326 static inline vec_short8 vec_or(vec_bshort8 a, vec_short8 b)
1328 return (spu_or((vec_short8)(a), b));
1331 static inline vec_short8 vec_or(vec_short8 a, vec_bshort8 b)
1333 return (spu_or(a, (vec_short8)(b)));
1336 static inline vec_uint4 vec_or(vec_uint4 a, vec_uint4 b)
1338 return (spu_or(a, b));
1341 static inline vec_int4 vec_or(vec_int4 a, vec_int4 b)
1343 return (spu_or(a, b));
1346 static inline vec_int4 vec_or(vec_bint4 a, vec_int4 b)
1348 return (spu_or((vec_int4)(a), b));
1351 static inline vec_int4 vec_or(vec_int4 a, vec_bint4 b)
1353 return (spu_or(a, (vec_int4)(b)));
1356 static inline vec_float4 vec_or(vec_float4 a, vec_float4 b)
1358 return (spu_or(a, b));
1361 static inline vec_float4 vec_or(vec_bint4 a, vec_float4 b)
1363 return (spu_or((vec_float4)(a),b));
1366 static inline vec_float4 vec_or(vec_float4 a, vec_bint4 b)
1368 return (spu_or(a, (vec_float4)(b)));
1372 /* vec_pack (vector pack)
1373 * ========
1375 static inline vec_uchar16 vec_pack(vec_ushort8 a, vec_ushort8 b)
1377 return ((vec_uchar16)spu_shuffle(a, b, ((vec_uchar16){ 1, 3, 5, 7, 9, 11, 13, 15,
1378 17, 19, 21, 23, 25, 27, 29, 31})));
1381 static inline vec_char16 vec_pack(vec_short8 a, vec_short8 b)
1383 return ((vec_char16)spu_shuffle(a, b, ((vec_uchar16){ 1, 3, 5, 7, 9, 11, 13, 15,
1384 17, 19, 21, 23, 25, 27, 29, 31})));
1387 static inline vec_ushort8 vec_pack(vec_uint4 a, vec_uint4 b)
1389 return ((vec_ushort8)spu_shuffle(a, b, ((vec_uchar16){ 2, 3, 6, 7, 10, 11, 14, 15,
1390 18, 19, 22, 23, 26, 27, 30, 31})));
1393 static inline vec_short8 vec_pack(vec_int4 a, vec_int4 b)
1395 return ((vec_short8)spu_shuffle(a, b, ((vec_uchar16){ 2, 3, 6, 7, 10, 11, 14, 15,
1396 18, 19, 22, 23, 26, 27, 30, 31})));
1400 /* vec_packpx (vector pack pixel)
1401 * ==========
1403 static inline vec_pixel8 vec_packpx(vec_uint4 a, vec_uint4 b)
1405 vec_uint4 x03FF = (vec_uint4)(spu_splats((unsigned short)0x03FF));
1406 vec_uint4 x001F = (vec_uint4)(spu_splats((unsigned short)0x001F));
1408 return ((vec_pixel8)(spu_shuffle(spu_sel(spu_sel(spu_sl(a, 7), spu_sl(a, 10), x03FF),
1409 spu_sl(a, 13), x001F),
1410 spu_sel(spu_sel(spu_sl(b, 7), spu_sl(b, 10), x03FF),
1411 spu_sl(b, 13), x001F),
1412 ((vec_uchar16){ 0, 1, 4, 5, 8, 9, 12, 13,
1413 16, 17, 20, 21, 24, 25, 28, 29}))));
1417 /* vec_packs (vector pack saturate)
1418 * =========
1420 static inline vec_uchar16 vec_packs(vec_ushort8 a, vec_ushort8 b)
1422 vec_ushort8 max = spu_splats((unsigned short)0x00FF);
1424 return ((vec_uchar16)(spu_shuffle(spu_sel(a, max, spu_cmpgt(a, 255)),
1425 spu_sel(b, max, spu_cmpgt(b, 255)),
1426 ((vec_uchar16){ 1, 3, 5, 7, 9, 11, 13, 15,
1427 17, 19, 21, 23, 25, 27, 29, 31}))));
1430 static inline vec_char16 vec_packs(vec_short8 a, vec_short8 b)
1432 vec_short8 max = spu_splats((signed short)0x007F);
1433 vec_short8 min = spu_splats((signed short)0xFF80);
1435 return ((vec_char16)(spu_shuffle(spu_sel(min, spu_sel(a, max, spu_cmpgt(a, 127)), spu_cmpgt(a, -128)),
1436 spu_sel(min, spu_sel(b, max, spu_cmpgt(b, 127)), spu_cmpgt(b, -128)),
1437 ((vec_uchar16){ 1, 3, 5, 7, 9, 11, 13, 15,
1438 17, 19, 21, 23, 25, 27, 29, 31}))));
1441 static inline vec_ushort8 vec_packs(vec_uint4 a, vec_uint4 b)
1443 vec_uint4 max = spu_splats((unsigned int)0x0000FFFF);
1445 return ((vec_ushort8)(spu_shuffle(spu_sel(a, max, spu_cmpgt(a, max)),
1446 spu_sel(b, max, spu_cmpgt(b, max)),
1447 ((vec_uchar16){ 2, 3, 6, 7, 10, 11, 14, 15,
1448 18, 19, 22, 23, 26, 27, 30, 31}))));
1451 static inline vec_short8 vec_packs(vec_int4 a, vec_int4 b)
1453 vec_int4 max = spu_splats((signed int)0x00007FFF);
1454 vec_int4 min = spu_splats((signed int)0xFFFF8000);
1456 return ((vec_short8)(spu_shuffle(spu_sel(min, spu_sel(a, max, spu_cmpgt(a, max)), spu_cmpgt(a, min)),
1457 spu_sel(min, spu_sel(b, max, spu_cmpgt(b, max)), spu_cmpgt(b, min)),
1458 ((vec_uchar16){ 2, 3, 6, 7, 10, 11, 14, 15,
1459 18, 19, 22, 23, 26, 27, 30, 31}))));
1463 /* vec_packsu (vector pack saturate unsigned)
1464 * ==========
1466 static inline vec_uchar16 vec_packsu(vec_ushort8 a, vec_ushort8 b)
1468 return ((vec_uchar16)spu_shuffle(spu_or(a, (vec_ushort8)(spu_cmpgt(a, 255))),
1469 spu_or(b, (vec_ushort8)(spu_cmpgt(b, 255))),
1470 ((vec_uchar16){ 1, 3, 5, 7, 9, 11, 13, 15,
1471 17, 19, 21, 23, 25, 27, 29, 31})));
1474 static inline vec_uchar16 vec_packsu(vec_short8 a, vec_short8 b)
1476 vec_short8 max = spu_splats((signed short)0x00FF);
1477 vec_short8 min = spu_splats((signed short)0x0000);
1479 return ((vec_uchar16)(spu_shuffle(spu_sel(min, spu_sel(a, max, spu_cmpgt(a, 255)), spu_cmpgt(a, 0)),
1480 spu_sel(min, spu_sel(b, max, spu_cmpgt(b, 255)), spu_cmpgt(b, 0)),
1481 ((vec_uchar16){ 1, 3, 5, 7, 9, 11, 13, 15,
1482 17, 19, 21, 23, 25, 27, 29, 31}))));
1484 return (vec_packsu((vec_ushort8)(a), (vec_ushort8)(b)));
1487 static inline vec_ushort8 vec_packsu(vec_uint4 a, vec_uint4 b)
1489 vec_uint4 max = spu_splats((unsigned int)0xFFFF);
1491 return ((vec_ushort8)spu_shuffle(spu_or(a, (vec_uint4)(spu_cmpgt(a, max))),
1492 spu_or(b, (vec_uint4)(spu_cmpgt(b, max))),
1493 ((vec_uchar16){ 2, 3, 6, 7, 10, 11, 14, 15,
1494 18, 19, 22, 23, 26, 27, 30, 31})));
1497 static inline vec_ushort8 vec_packsu(vec_int4 a, vec_int4 b)
1499 vec_int4 max = spu_splats((signed int)0x0000FFFF);
1500 vec_int4 min = spu_splats((signed int)0x00000000);
1502 return ((vec_ushort8)(spu_shuffle(spu_sel(min, spu_sel(a, max, spu_cmpgt(a, max)), spu_cmpgt(a, min)),
1503 spu_sel(min, spu_sel(b, max, spu_cmpgt(b, max)), spu_cmpgt(b, min)),
1504 ((vec_uchar16){ 2, 3, 6, 7, 10, 11, 14, 15,
1505 18, 19, 22, 23, 26, 27, 30, 31}))));
1509 /* vec_perm (vector permute)
1510 * ========
1512 static inline vec_uchar16 vec_perm(vec_uchar16 a, vec_uchar16 b, vec_uchar16 c)
1514 return (spu_shuffle(a, b, spu_and(c, 0x1F)));
1517 static inline vec_char16 vec_perm(vec_char16 a, vec_char16 b, vec_uchar16 c)
1519 return ((vec_char16)(vec_perm((vec_uchar16)(a), (vec_uchar16)(b), c)));
1522 static inline vec_ushort8 vec_perm(vec_ushort8 a, vec_ushort8 b, vec_uchar16 c)
1524 return ((vec_ushort8)(vec_perm((vec_uchar16)(a), (vec_uchar16)(b), c)));
1527 static inline vec_short8 vec_perm(vec_short8 a, vec_short8 b, vec_uchar16 c)
1529 return ((vec_short8)(vec_perm((vec_uchar16)(a), (vec_uchar16)(b), c)));
1532 static inline vec_uint4 vec_perm(vec_uint4 a, vec_uint4 b, vec_uchar16 c)
1534 return ((vec_uint4)(vec_perm((vec_uchar16)(a), (vec_uchar16)(b), c)));
1537 static inline vec_int4 vec_perm(vec_int4 a, vec_int4 b, vec_uchar16 c)
1539 return ((vec_int4)(vec_perm((vec_uchar16)(a), (vec_uchar16)(b), c)));
1542 static inline vec_float4 vec_perm(vec_float4 a, vec_float4 b, vec_uchar16 c)
1544 return ((vec_float4)(vec_perm((vec_uchar16)(a), (vec_uchar16)(b), c)));
1548 /* vec_re (vector reciprocal estimate)
1549 * ======
1551 #define vec_re(_a) spu_re(_a)
1554 /* vec_rl (vector rotate left)
1555 * ======
1557 static inline vec_uchar16 vec_rl(vec_uchar16 a, vec_uchar16 b)
1559 vec_ushort8 r1, r2;
1561 r1 = spu_rl(spu_and((vec_ushort8)(a), 0xFF), (vec_short8)spu_and((vec_ushort8)(b), 7));
1562 r2 = spu_rl(spu_and((vec_ushort8)(a), -256), (vec_short8)spu_and(spu_rlmask((vec_ushort8)(b), -8), 7));
1563 return ((vec_uchar16)(spu_sel(spu_or(r2, spu_sl(r2, 8)), spu_or(r1, spu_rlmask(r1, -8)), spu_splats((unsigned short)0xFF))));
1566 static inline vec_char16 vec_rl(vec_char16 a, vec_uchar16 b)
1568 return ((vec_char16)(vec_rl((vec_uchar16)(a), b)));
1571 static inline vec_ushort8 vec_rl(vec_ushort8 a, vec_ushort8 b)
1573 return (spu_rl(a, (vec_short8)(b)));
1576 static inline vec_short8 vec_rl(vec_short8 a, vec_ushort8 b)
1578 return (spu_rl(a, (vec_short8)(b)));
1581 static inline vec_uint4 vec_rl(vec_uint4 a, vec_uint4 b)
1583 return (spu_rl(a, (vec_int4)(b)));
1586 static inline vec_int4 vec_rl(vec_int4 a, vec_uint4 b)
1588 return (spu_rl(a, (vec_int4)(b)));
1592 /* vec_round (vector round)
1593 * =========
1595 static inline vec_float4 vec_round(vec_float4 a)
1597 vec_float4 s_half, s_one, d;
1598 vec_uint4 odd;
1599 vec_uint4 msb = spu_splats((unsigned int)0x80000000);
1600 vec_float4 half = spu_splats(0.5f);
1601 vec_int4 exp;
1602 vec_uint4 mask;
1604 s_half = (vec_float4)(spu_sel((vec_uint4)(half), (vec_uint4)(a), msb));
1605 a = spu_add(a, s_half);
1606 s_one = spu_add(s_half, s_half);
1607 exp = spu_sub(127, (vec_int4)(spu_and(spu_rlmask((vec_uint4)(a), -23), 0xFF)));
1608 mask = spu_rlmask(spu_splats((unsigned int)0x7FFFFF), exp);
1609 mask = spu_sel(spu_splats((unsigned int)0), mask, spu_cmpgt(exp, -31));
1610 mask = spu_or(mask, spu_xor((vec_uint4)(spu_rlmaska(spu_add(exp, -1), -31)), -1));
1612 odd = spu_and((vec_uint4)(spu_convts(a, 0)), 1);
1613 s_one = spu_andc(s_one, (vec_float4)spu_cmpeq(mask, 0));
1614 s_one = spu_and(s_one, spu_and((vec_float4)spu_cmpeq(spu_and((vec_uint4)(a), mask), 0),
1615 (vec_float4)spu_cmpeq(odd, 1)));
1616 d = spu_andc(a, (vec_float4)(mask));
1617 d = spu_sub(d, s_one);
1618 return (d);
1621 /* vec_rsqrte (vector reciprocal square root estimate)
1622 * ==========
1624 #define vec_rsqrte(_a) spu_rsqrte(_a)
1627 /* vec_sel (vector select)
1628 * =======
1630 #define vec_sel(_a, _b, _c) spu_sel(_a, _b, _c)
1633 /* vec_sl (vector shift left)
1634 * ======
1636 static inline vec_uchar16 vec_sl(vec_uchar16 a, vec_uchar16 b)
1638 vec_ushort8 hi, lo;
1640 lo = spu_and(spu_sl((vec_ushort8)(a), spu_and((vec_ushort8)(b), 7)), 0xFF);
1641 hi = spu_sl(spu_and((vec_ushort8)(a), -256), spu_and(spu_rlmask((vec_ushort8)(b), -8), 7));
1643 return ((vec_uchar16)(spu_or(hi, lo)));
1646 static inline vec_char16 vec_sl(vec_char16 a, vec_uchar16 b)
1648 return ((vec_char16)(vec_sl((vec_uchar16)(a), b)));
1651 static inline vec_ushort8 vec_sl(vec_ushort8 a, vec_ushort8 b)
1653 return (spu_sl(a, spu_and(b, 15)));
1656 static inline vec_short8 vec_sl(vec_short8 a, vec_ushort8 b)
1658 return (spu_sl(a, spu_and((vec_ushort8)(b), 15)));
1661 static inline vec_uint4 vec_sl(vec_uint4 a, vec_uint4 b)
1663 return (spu_sl(a, spu_and(b, 31)));
1666 static inline vec_int4 vec_sl(vec_int4 a, vec_uint4 b)
1668 return (spu_sl(a, spu_and(b, 31)));
1672 /* vec_sld (vector shift left double)
1673 * =======
1675 #define vec_sld(_a, _b, _c) spu_shuffle(_a, _b, ((vec_uchar16){ 0+(_c), 1+(_c), 2+(_c), 3+(_c), \
1676 4+(_c), 5+(_c), 6+(_c), 7+(_c), \
1677 8+(_c), 9+(_c), 10+(_c), 11+(_c), \
1678 12+(_c), 13+(_c), 14+(_c), 15+(_c)}))
1681 /* vec_sll (vector shift left long)
1682 * =======
1684 #define vec_sll(_a, _b) spu_slqw(_a, spu_extract((vec_uint4)(_b), 0))
1687 /* vec_slo (vector shift left by octet)
1688 * =======
1690 #define vec_slo(_a, _b) spu_slqwbytebc(_a, spu_extract((vec_uint4)(_b), 3) & 0x7F)
1693 /* vec_splat (vector splat)
1694 * =========
1696 #define vec_splat(_a, _b) spu_splats(spu_extract(_a, _b))
1699 /* vec_splat_s8 (vector splat signed byte)
1700 * ============
1702 #define vec_splat_s8(_a) spu_splats((signed char)(_a))
1705 /* vec_splat_s16 (vector splat signed half-word)
1706 * =============
1708 #define vec_splat_s16(_a) spu_splats((signed short)(_a))
1711 /* vec_splat_s32 (vector splat signed word)
1712 * =============
1714 #define vec_splat_s32(_a) spu_splats((signed int)(_a))
1717 /* vec_splat_u8 (vector splat unsigned byte)
1718 * ============
1720 #define vec_splat_u8(_a) spu_splats((unsigned char)(_a))
1723 /* vec_splat_u16 (vector splat unsigned half-word)
1724 * =============
1726 #define vec_splat_u16(_a) spu_splats((unsigned short)(_a))
1729 /* vec_splat_u32 (vector splat unsigned word)
1730 * =============
1732 #define vec_splat_u32(_a) spu_splats((unsigned int)(_a))
1735 /* vec_sr (vector shift right)
1736 * ======
1738 static inline vec_uchar16 vec_sr(vec_uchar16 a, vec_uchar16 b)
1740 vec_ushort8 hi, lo;
1742 lo = spu_rlmask(spu_and((vec_ushort8)(a), 0xFF), spu_sub(0, (vec_short8)(spu_and((vec_ushort8)(b), 7))));
1743 hi = spu_and(spu_rlmask((vec_ushort8)(a), spu_sub(0, (vec_short8)(spu_and(spu_rlmask((vec_ushort8)(b), -8), 7)))), -256);
1745 return ((vec_uchar16)(spu_or(hi, lo)));
1748 static inline vec_char16 vec_sr(vec_char16 a, vec_uchar16 b)
1750 return ((vec_char16)(vec_sr((vec_uchar16)(a), b)));
1753 static inline vec_ushort8 vec_sr(vec_ushort8 a, vec_ushort8 b)
1755 return (spu_rlmask(a, spu_sub(0, (vec_short8)(spu_and(b, 15)))));
1758 static inline vec_short8 vec_sr(vec_short8 a, vec_ushort8 b)
1760 return ((vec_short8)(vec_sr((vec_ushort8)(a), b)));
1763 static inline vec_uint4 vec_sr(vec_uint4 a, vec_uint4 b)
1765 return (spu_rlmask(a, spu_sub(0, (vec_int4)(spu_and(b, 31)))));
1768 static inline vec_int4 vec_sr(vec_int4 a, vec_uint4 b)
1770 return ((vec_int4)(vec_sr((vec_uint4)(a), b)));
1774 /* vec_sra (vector shift right algebraic)
1775 * =======
1777 static inline vec_char16 vec_sra(vec_char16 a, vec_uchar16 b)
1779 vec_short8 hi, lo;
1781 lo = spu_and(spu_rlmaska(spu_extend(a), spu_sub(0, (vec_short8)(spu_and((vec_ushort8)(b), 7)))), 0xFF);
1782 hi = spu_and(spu_rlmaska((vec_short8)(a), spu_sub(0, (vec_short8)(spu_and(spu_rlmask((vec_ushort8)(b), -8), 7)))), -256);
1784 return ((vec_char16)(spu_or(hi, lo)));
1787 static inline vec_uchar16 vec_sra(vec_uchar16 a, vec_uchar16 b)
1789 return ((vec_uchar16)(vec_sra((vec_char16)(a), b)));
1792 static inline vec_short8 vec_sra(vec_short8 a, vec_ushort8 b)
1794 return (spu_rlmaska(a, spu_sub(0, (vec_short8)(spu_and(b, 15)))));
1797 static inline vec_ushort8 vec_sra(vec_ushort8 a, vec_ushort8 b)
1799 return ((vec_ushort8)(vec_sra((vec_short8)(a), b)));
1802 static inline vec_int4 vec_sra(vec_int4 a, vec_uint4 b)
1804 return (spu_rlmaska(a, spu_sub(0, (vec_int4)(spu_and(b, 31)))));
1807 static inline vec_uint4 vec_sra(vec_uint4 a, vec_uint4 b)
1809 return ((vec_uint4)(vec_sra((vec_int4)(a), b)));
1813 /* vec_srl (vector shift right long)
1814 * =======
1816 #define vec_srl(_a, _b) spu_rlmaskqw(_a, 0-spu_extract((vec_int4)(_b), 3))
1819 /* vec_sro (vector shift right by octet)
1820 * =======
1822 #define vec_sro(_a, _b) spu_rlmaskqwbyte(_a, 0 - ((spu_extract((vec_int4)(_b), 3) >> 3) & 0xF))
1824 /* vec_st (vector store indexed)
1825 * ======
1827 static inline void vec_st(vec_uchar16 a, int b, unsigned char *c)
1829 *((vec_uchar16 *)(c+b)) = a;
1832 static inline void vec_st(vec_uchar16 a, int b, vec_uchar16 *c)
1834 *((vec_uchar16 *)((unsigned char *)(c)+b)) = a;
1837 static inline void vec_st(vec_char16 a, int b, signed char *c)
1839 *((vec_char16 *)(c+b)) = a;
1842 static inline void vec_st(vec_char16 a, int b, vec_char16 *c)
1844 *((vec_char16 *)((signed char *)(c)+b)) = a;
1847 static inline void vec_st(vec_bchar16 a, int b, signed char *c)
1849 *((vec_bchar16 *)((signed char *)(c)+b)) = a;
1852 static inline void vec_st(vec_ushort8 a, int b, unsigned short *c)
1854 *((vec_ushort8 *)((unsigned char *)(c)+b)) = a;
1857 static inline void vec_st(vec_ushort8 a, int b, vec_ushort8 *c)
1859 *((vec_ushort8 *)((unsigned char *)(c)+b)) = a;
1862 static inline void vec_st(vec_short8 a, int b, signed short *c)
1864 *((vec_short8 *)((unsigned char *)(c)+b)) = a;
1867 static inline void vec_st(vec_short8 a, int b, vec_short8 *c)
1869 *((vec_short8 *)((signed char *)(c)+b)) = a;
1872 static inline void vec_st(vec_bshort8 a, int b, signed short *c)
1874 *((vec_bshort8 *)((signed char *)(c)+b)) = a;
1877 static inline void vec_st(vec_uint4 a, int b, unsigned int *c)
1879 *((vec_uint4 *)((unsigned char *)(c)+b)) = a;
1882 static inline void vec_st(vec_uint4 a, int b, vec_uint4 *c)
1884 *((vec_uint4 *)((unsigned char *)(c)+b)) = a;
1887 static inline void vec_st(vec_int4 a, int b, signed int *c)
1889 *((vec_int4 *)((unsigned char *)(c)+b)) = a;
1892 static inline void vec_st(vec_int4 a, int b, vec_int4 *c)
1894 *((vec_int4 *)((signed char *)(c)+b)) = a;
1897 static inline void vec_st(vec_bint4 a, int b, signed int *c)
1899 *((vec_bint4 *)((signed char *)(c)+b)) = a;
1902 static inline void vec_st(vec_float4 a, int b, float *c)
1904 *((vec_float4 *)((unsigned char *)(c)+b)) = a;
1907 static inline void vec_st(vec_float4 a, int b, vec_float4 *c)
1909 *((vec_float4 *)((unsigned char *)(c)+b)) = a;
1913 /* vec_ste (vector store element indexed)
1914 * =======
1916 static inline void vec_ste(vec_uchar16 a, int b, unsigned char *c)
1918 unsigned char *ptr;
1920 ptr = c + b;
1921 *ptr = spu_extract(a, (int)(ptr) & 15);
1924 static inline void vec_ste(vec_char16 a, int b, signed char *c)
1926 vec_ste((vec_uchar16)(a), b, (unsigned char *)(c));
1929 static inline void vec_ste(vec_bchar16 a, int b, signed char *c)
1931 vec_ste((vec_uchar16)(a), b, (unsigned char *)(c));
1934 static inline void vec_ste(vec_ushort8 a, int b, unsigned short *c)
1936 unsigned short *ptr;
1938 ptr = (unsigned short *)(((unsigned int)(c) + b) & ~1);
1939 *ptr = spu_extract(a, ((int)(ptr) >> 1) & 7);
1942 static inline void vec_ste(vec_short8 a, int b, signed short *c)
1944 vec_ste((vec_ushort8)(a), b, (unsigned short *)(c));
1947 static inline void vec_ste(vec_bshort8 a, int b, signed short *c)
1949 vec_ste((vec_ushort8)(a), b, (unsigned short *)(c));
1952 static inline void vec_ste(vec_uint4 a, int b, unsigned int *c)
1954 unsigned int *ptr;
1956 ptr = (unsigned int *)(((unsigned int)(c) + b) & ~3);
1957 *ptr = spu_extract(a, ((int)(ptr) >> 2) & 3);
1960 static inline void vec_ste(vec_int4 a, int b, signed int *c)
1962 vec_ste((vec_uint4)(a), b, (unsigned int *)(c));
1965 static inline void vec_ste(vec_bint4 a, int b, signed int *c)
1967 vec_ste((vec_uint4)(a), b, (unsigned int *)(c));
1970 static inline void vec_ste(vec_float4 a, int b, float *c)
1972 vec_ste((vec_uint4)(a), b, (unsigned int *)(c));
1976 /* vec_stl (vector store indexed LRU)
1977 * =======
1979 #define vec_stl(_a, _b, _c) vec_st(_a, _b, _c)
1982 /* vec_sub (vector subtract)
1983 * =======
1985 static inline vec_uchar16 vec_sub(vec_uchar16 a, vec_uchar16 b)
1987 return ((vec_uchar16)(spu_sel(spu_sub((vec_ushort8)(a), (vec_ushort8)(b)),
1988 spu_sub(spu_and((vec_ushort8)(a), -256), spu_and((vec_ushort8)(b), -256)),
1989 spu_splats((unsigned short)0xFF00))));
1992 static inline vec_char16 vec_sub(vec_char16 a, vec_char16 b)
1994 return ((vec_char16)(vec_sub((vec_uchar16)(a), (vec_uchar16)(b))));
1997 static inline vec_char16 vec_sub(vec_bchar16 a, vec_char16 b)
1999 return ((vec_char16)(vec_sub((vec_uchar16)(a), (vec_uchar16)(b))));
2002 static inline vec_char16 vec_sub(vec_char16 a, vec_bchar16 b)
2004 return ((vec_char16)(vec_sub((vec_uchar16)(a), (vec_uchar16)(b))));
2007 static inline vec_ushort8 vec_sub(vec_ushort8 a, vec_ushort8 b)
2009 return (spu_sub(a, b));
2012 static inline vec_short8 vec_sub(vec_short8 a, vec_short8 b)
2014 return (spu_sub(a, b));
2017 static inline vec_short8 vec_sub(vec_bshort8 a, vec_short8 b)
2019 return (spu_sub((vec_short8)(a), b));
2022 static inline vec_short8 vec_sub(vec_short8 a, vec_bshort8 b)
2024 return (spu_sub(a, (vec_short8)(b)));
2027 static inline vec_uint4 vec_sub(vec_uint4 a, vec_uint4 b)
2029 return (spu_sub(a, b));
2032 static inline vec_int4 vec_sub(vec_int4 a, vec_int4 b)
2034 return (spu_sub(a, b));
2037 static inline vec_int4 vec_sub(vec_bint4 a, vec_int4 b)
2039 return (spu_sub((vec_int4)(a), b));
2042 static inline vec_int4 vec_sub(vec_int4 a, vec_bint4 b)
2044 return (spu_sub(a, (vec_int4)(b)));
2047 static inline vec_float4 vec_sub(vec_float4 a, vec_float4 b)
2049 return (spu_sub(a, b));
2053 /* vec_subc (vector subtract carryout)
2054 * ========
2056 #define vec_subc(_a, _b) spu_genb(_a, _b)
2059 /* vec_subs (vector subtract saturate)
2060 * ========
2062 static inline vec_uchar16 vec_subs(vec_uchar16 a, vec_uchar16 b)
2064 vec_ushort8 s1, s2;
2065 vec_uchar16 s, d;
2067 s1 = spu_sub(spu_rlmask((vec_ushort8)(a), -8), spu_rlmask((vec_ushort8)(b), -8));
2068 s2 = spu_sub(spu_and((vec_ushort8)(a), 0xFF), spu_and((vec_ushort8)(b), 0xFF));
2069 s = (vec_uchar16)(spu_shuffle(s1, s2, ((vec_uchar16){0, 16, 2, 18, 4, 20, 6, 22,
2070 8, 24, 10, 26, 12, 28, 14, 30})));
2071 d = (vec_uchar16)(spu_shuffle(s1, s2, ((vec_uchar16){1, 17, 3, 19, 5, 21, 7, 23,
2072 9, 25, 11, 27, 13, 29, 15, 31})));
2073 return (spu_andc(d, s));
2076 static inline vec_char16 vec_subs(vec_char16 a, vec_char16 b)
2078 vec_ushort8 s1, s2;
2079 vec_uchar16 s, d;
2081 s1 = spu_sub(spu_rlmask((vec_ushort8)(a), -8), spu_rlmask((vec_ushort8)(b), -8));
2082 s2 = spu_sub(spu_and((vec_ushort8)(a), 0xFF), spu_and((vec_ushort8)(b), 0xFF));
2083 s = (vec_uchar16)(spu_shuffle(s1, s2, ((vec_uchar16){1, 17, 3, 19, 5, 21, 7, 23,
2084 9, 25, 11, 27, 13, 29, 15, 31})));
2085 d = spu_sel(s, spu_splats((unsigned char)0x7F), spu_cmpgt(spu_nor((vec_uchar16)(a), spu_nand(s, (vec_uchar16)(b))), 0x7F));
2086 d = spu_sel(d, spu_splats((unsigned char)0x80), spu_cmpgt(spu_and((vec_uchar16)(a), spu_nor(s, (vec_uchar16)(b))), 0x7F));
2088 return ((vec_char16)(d));
2091 static inline vec_char16 vec_subs(vec_bchar16 a, vec_char16 b)
2093 return (vec_subs((vec_char16)(a), b));
2096 static inline vec_char16 vec_subs(vec_char16 a, vec_bchar16 b)
2098 return (vec_subs(a, (vec_char16)(b)));
2101 static inline vec_ushort8 vec_subs(vec_ushort8 a, vec_ushort8 b)
2103 return (spu_andc(spu_sub(a, b), spu_cmpgt(b, a)));
2106 static inline vec_short8 vec_subs(vec_short8 a, vec_short8 b)
2108 vec_short8 s;
2109 vec_short8 d;
2111 s = spu_sub(a, b);
2112 d = spu_sel(s, spu_splats((signed short)0x7FFF), (vec_ushort8)(spu_rlmaska(spu_nor(a, spu_nand(s, b)), -15)));
2113 d = spu_sel(d, spu_splats((signed short)0x8000), (vec_ushort8)(spu_rlmaska(spu_and(a, spu_nor(s, b)), -15)));
2115 return (d);
2118 static inline vec_short8 vec_subs(vec_bshort8 a, vec_short8 b)
2120 return ((vec_short8)(vec_subs((vec_short8)(a), b)));
2123 static inline vec_short8 vec_subs(vec_short8 a, vec_bshort8 b)
2125 return ((vec_short8)(vec_subs(a, (vec_short8)(b))));
2128 static inline vec_uint4 vec_subs(vec_uint4 a, vec_uint4 b)
2130 return (spu_andc(spu_sub(a, b), spu_cmpgt(b, a)));
2133 static inline vec_int4 vec_subs(vec_int4 a, vec_int4 b)
2135 vec_int4 s;
2136 vec_int4 d;
2138 s = spu_sub(a, b);
2139 d = spu_sel(s, spu_splats((signed int)0x7FFFFFFF), (vec_uint4)(spu_rlmaska(spu_nor(a, spu_nand(s, b)), -31)));
2140 d = spu_sel(d, spu_splats((signed int)0x80000000), (vec_uint4)(spu_rlmaska(spu_and(a, spu_nor(s, b)), -31)));
2142 return (d);
2145 static inline vec_int4 vec_subs(vec_bint4 a, vec_int4 b)
2147 return ((vec_int4)(vec_subs((vec_int4)(a), b)));
2150 static inline vec_int4 vec_subs(vec_int4 a, vec_bint4 b)
2152 return ((vec_int4)(vec_subs(a, (vec_int4)(b))));
2156 /* vec_sum4s (vector sum across partial (1/4) saturated)
2157 * =========
2159 static inline vec_uint4 vec_sum4s(vec_uchar16 a, vec_uint4 b)
2161 vec_uint4 a01_23, a0123;
2163 a01_23 = (vec_uint4)(spu_add(spu_rlmask((vec_ushort8)(a), -8),
2164 spu_and((vec_ushort8)(a), 0xFF)));
2165 a0123 = spu_add(spu_rlmask(a01_23, -16), spu_and(a01_23, 0x1FF));
2166 return (vec_adds(a0123, b));
2169 static inline vec_int4 vec_sum4s(vec_char16 a, vec_int4 b)
2171 vec_int4 a01_23, a0123;
2173 a01_23 = (vec_int4)(spu_add(spu_rlmaska((vec_short8)(a), -8),
2174 spu_extend(a)));
2175 a0123 = spu_add(spu_rlmaska(a01_23, -16), spu_extend((vec_short8)(a01_23)));
2176 return (vec_adds(a0123, b));
2179 static inline vec_int4 vec_sum4s(vec_short8 a, vec_int4 b)
2181 vec_int4 a0123;
2183 a0123 = spu_add(spu_rlmaska((vec_int4)(a), -16), spu_extend(a));
2184 return (vec_adds(a0123, b));
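/* Usage sketch (illustrative only; the helper name example_sum4s is
 * hypothetical). Each word of the result is the saturated sum of four
 * adjacent elements of a plus the corresponding word of b.
 */
static inline int example_sum4s(void)
{
  vec_uchar16 a = spu_splats((unsigned char)1);  /* each group of four bytes sums to 4 */
  vec_uint4 b = spu_splats((unsigned int)10);
  vec_uint4 r = vec_sum4s(a, b);                 /* every word is 4 + 10 = 14 */
  return (spu_extract(r, 0) == 14);
}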
2188 /* vec_sum2s (vector sum across partial (1/2) saturated)
2189 * =========
2191 static inline vec_int4 vec_sum2s(vec_int4 a, vec_int4 b)
2193 vec_int4 c, d;
2194 vec_int4 sign1, sign2, sign3;
2195 vec_int4 carry, sum_l, sum_h, sat, sat_val;
2197 sign1 = spu_rlmaska(a, -31);
2198 sign2 = spu_rlmaska(b, -31);
2200 c = spu_rlqwbyte(a, -4);
2201 sign3 = spu_rlqwbyte(sign1, -4);
2203 carry = spu_genc(a, b);
2204 sum_l = spu_add(a, b);
2205 sum_h = spu_addx(sign1, sign2, carry);
2207 carry = spu_genc(sum_l, c);
2208 sum_l = spu_add(sum_l, c);
2209 sum_h = spu_addx(sum_h, sign3, carry);
2211 sign1 = spu_rlmaska(sum_l, -31);
2212 sign2 = spu_rlmaska(sum_h, -31);
2214 sat_val = spu_xor(sign2, spu_splats((signed int)0x7FFFFFFF));
2216 sat = spu_orc(spu_xor(sign1, sign2), (vec_int4)spu_cmpeq(sum_h, sign2));
2218 d = spu_and(spu_sel(sum_l, sat_val, (vec_uint4)(sat)), (vec_int4){0, -1, 0, -1});
2220 return (d);
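/* Usage sketch (illustrative only; the helper name example_sum2s is
 * hypothetical). Words 1 and 3 of the result hold the saturated sums
 * a[0]+a[1]+b[1] and a[2]+a[3]+b[3]; words 0 and 2 are cleared by the
 * final mask above.
 */
static inline int example_sum2s(void)
{
  vec_int4 a = { 1, 2, 3, 4 };
  vec_int4 b = { 0, 10, 0, 20 };
  vec_int4 r = vec_sum2s(a, b);   /* r = { 0, 13, 0, 27 } */
  return ((spu_extract(r, 1) == 13) && (spu_extract(r, 3) == 27));
}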
2224 /* vec_sums (vector sum saturated)
2225 * ========
2227 static inline vec_int4 vec_sums(vec_int4 a, vec_int4 b)
2229 vec_int4 a0, a1, a2, c0, c1, c2, d;
2230 vec_int4 sign_a, sign_b, sign_l, sign_h;
2231 vec_int4 sum_l, sum_h, sat, sat_val;
2233 sign_a = spu_rlmaska(a, -31);
2234 sign_b = spu_rlmaska(b, -31);
2236 a0 = spu_rlqwbyte(a, -12);
2237 a1 = spu_rlqwbyte(a, -8);
2238 a2 = spu_rlqwbyte(a, -4);
2240 sum_l = spu_add(a, b);
2241 sum_h = spu_addx(sign_a, sign_b, spu_genc(a, b));
2243 c2 = spu_genc(sum_l, a2);
2244 sum_l = spu_add(sum_l, a2);
2245 sum_h = spu_addx(sum_h, spu_rlqwbyte(sign_a, -4), c2);
2247 c1 = spu_genc(sum_l, a1);
2248 sum_l = spu_add(sum_l, a1);
2249 sum_h = spu_addx(sum_h, spu_rlqwbyte(sign_a, -8), c1);
2251 c0 = spu_genc(sum_l, a0);
2252 sum_l = spu_add(sum_l, a0);
2253 sum_h = spu_addx(sum_h, spu_rlqwbyte(sign_a, -12), c0);
2255 sign_l = spu_rlmaska(sum_l, -31);
2256 sign_h = spu_rlmaska(sum_h, -31);
2258 sat_val = spu_xor(sign_h, spu_splats((signed int)0x7FFFFFFF));
2260 sat = spu_orc(spu_xor(sign_l, sign_h), (vec_int4)spu_cmpeq(sum_h, sign_h));
2262 d = spu_and(spu_sel(sum_l, sat_val, (vec_uint4)(sat)), ((vec_int4){0, 0, 0, -1}));
2264 return (d);
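/* Usage sketch (illustrative only; the helper name example_sums is
 * hypothetical). Word 3 of the result holds the saturated sum of all four
 * words of a plus word 3 of b; the other words are cleared by the final
 * mask above.
 */
static inline int example_sums(void)
{
  vec_int4 a = { 1, 2, 3, 4 };
  vec_int4 b = { 0, 0, 0, 100 };
  vec_int4 r = vec_sums(a, b);    /* r = { 0, 0, 0, 110 } */
  return (spu_extract(r, 3) == 110);
}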
2268 /* vec_trunc (vector truncate)
2269 * =========
2271 static inline vec_float4 vec_trunc(vec_float4 a)
2273 vec_int4 exp;
2274 vec_uint4 mask;
2276 exp = spu_sub(127, (vec_int4)(spu_and(spu_rlmask((vec_uint4)(a), -23), 0xFF)));
2277 mask = spu_rlmask(spu_splats((unsigned int)0x7FFFFF), exp);
2278 mask = spu_sel(spu_splats((unsigned int)0), mask, spu_cmpgt(exp, -31));
2279 mask = spu_or(mask, spu_xor((vec_uint4)(spu_rlmaska(spu_add(exp, -1), -31)), -1));
2280 return (spu_andc(a, (vec_float4)(mask)));
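/* Usage sketch (illustrative only; the helper name example_trunc is
 * hypothetical). The mask built above clears the fractional mantissa bits,
 * so values round toward zero.
 */
static inline int example_trunc(void)
{
  vec_float4 v = { 2.75f, -2.75f, 0.5f, -0.5f };
  vec_float4 r = vec_trunc(v);    /* { 2.0f, -2.0f, 0.0f, -0.0f } */
  return ((spu_extract(r, 0) == 2.0f) && (spu_extract(r, 1) == -2.0f));
}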
2283 /* vec_unpackh (vector unpack high element)
2284 * ===========
2286 static inline vec_short8 vec_unpackh(vec_char16 a)
2288 return (spu_extend(spu_shuffle(a, a, ((vec_uchar16){0, 0, 1, 1, 2, 2, 3, 3,
2289 4, 4, 5, 5, 6, 6, 7, 7}))));
2292 static inline vec_bshort8 vec_unpackh(vec_bchar16 a)
2294 return ((vec_bshort8)(vec_unpackh((vec_char16)(a))));
2297 static inline vec_int4 vec_unpackh(vec_short8 a)
2299 return (spu_extend(spu_shuffle(a, a, ((vec_uchar16){0, 0, 0, 1, 0, 0, 2, 3,
2300 0, 0, 4, 5, 0, 0, 6, 7}))));
2303 #ifdef SUPPORT_UNPACK_PIXEL
2304 /* Due to type conflicts, unpacking of pixel types and boolean shorts
2305 * cannot simultaneously be supported. By default, the boolean short is
2306 * supported.
2308 static inline vec_uint4 vec_unpackh(vec_pixel8 a)
2310 vec_ushort8 p1, p2;
2312 p1 = spu_shuffle((vec_ushort8)(spu_rlmaska((vec_short8)(a.p), -7)),
2313 spu_and((vec_ushort8)(a.p), 0x1F),
2314 ((vec_uchar16){ 0, 128, 128, 17, 2, 128, 128, 19,
2315 4, 128, 128, 21, 6, 128, 128, 23}));
2316 p2 = spu_shuffle(spu_and(spu_rlmask((vec_ushort8)(a.p), -5), 0x1F),
2317 spu_and(spu_rlmask((vec_ushort8)(a.p), -10), 0x1F),
2318 ((vec_uchar16){ 128, 17, 1, 128, 128, 19, 3, 128,
2319 128, 21, 5, 128, 128, 23, 7, 128}));
2320 return ((vec_uint4)(spu_or(p1, p2)));
2323 #else
2325 static inline vec_bint4 vec_unpackh(vec_bshort8 a)
2327 return ((vec_bint4)(vec_unpackh((vec_short8)(a))));
2329 #endif
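/* Usage sketch (illustrative only; the helper name example_unpackh is
 * hypothetical). vec_unpackh sign-extends the high (leftmost) eight char
 * elements to shorts; vec_unpackl below does the same for the low eight.
 */
static inline int example_unpackh(void)
{
  vec_char16 c = spu_splats((signed char)-3);
  vec_short8 h = vec_unpackh(c);
  return (spu_extract(h, 0) == -3);   /* element 0 is the sign-extended c[0] */
}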
2335 /* vec_unpackl (vector unpack low element)
2336 * ===========
2338 static inline vec_short8 vec_unpackl(vec_char16 a)
2340 return (spu_extend(spu_shuffle(a, a, ((vec_uchar16){8, 8, 9, 9, 10, 10, 11, 11,
2341 12, 12, 13, 13, 14, 14, 15, 15}))));
2344 static inline vec_bshort8 vec_unpackl(vec_bchar16 a)
2346 return ((vec_bshort8)(vec_unpackl((vec_char16)(a))));
2350 static inline vec_int4 vec_unpackl(vec_short8 a)
2352 return (spu_extend(spu_shuffle(a, a, ((vec_uchar16){0, 0, 8, 9, 0, 0, 10, 11,
2353 0, 0, 12, 13, 0, 0, 14, 15}))));
2357 #ifdef SUPPORT_UNPACK_PIXEL
2358 /* Due to type conflicts, unpacking of pixel types and boolean shorts
2359 * cannot simultaneously be supported. By default, the boolean short is
2360 * supported.
2362 static inline vec_uint4 vec_unpackl(vec_pixel8 a)
2364 vec_ushort8 p1, p2;
2366 p1 = spu_shuffle((vec_ushort8)(spu_rlmaska((vec_short8)(a.p), -7)),
2367 spu_and((vec_ushort8)(a.p), 0x1F),
2368 ((vec_uchar16){ 8, 128, 128, 25, 10, 128, 128, 27,
2369 12, 128, 128, 29, 14, 128, 128, 31}));
2370 p2 = spu_shuffle(spu_and(spu_rlmask((vec_ushort8)(a.p), -5), 0x1F),
2371 spu_and(spu_rlmask((vec_ushort8)(a.p), -10), 0x1F),
2372 ((vec_uchar16){ 128, 25, 9, 128, 128, 27, 11, 128,
2373 128, 29, 13, 128, 128, 31, 15, 128}));
2374 return ((vec_uint4)(spu_or(p1, p2)));
2377 #else
2379 static inline vec_bint4 vec_unpackl(vec_bshort8 a)
2381 return ((vec_bint4)(vec_unpackl((vec_short8)(a))));
2384 #endif
2388 /* vec_xor (vector logical xor)
2389 * =======
2391 static inline vec_uchar16 vec_xor(vec_uchar16 a, vec_uchar16 b)
2393 return (spu_xor(a, b));
2396 static inline vec_char16 vec_xor(vec_char16 a, vec_char16 b)
2398 return (spu_xor(a, b));
2401 static inline vec_char16 vec_xor(vec_bchar16 a, vec_char16 b)
2403 return (spu_xor((vec_char16)(a), b));
2406 static inline vec_char16 vec_xor(vec_char16 a, vec_bchar16 b)
2408 return (spu_xor(a, (vec_char16)(b)));
2411 static inline vec_ushort8 vec_xor(vec_ushort8 a, vec_ushort8 b)
2413 return (spu_xor(a, b));
2416 static inline vec_short8 vec_xor(vec_short8 a, vec_short8 b)
2418 return (spu_xor(a, b));
2421 static inline vec_short8 vec_xor(vec_bshort8 a, vec_short8 b)
2423 return (spu_xor((vec_short8)(a), b));
2426 static inline vec_short8 vec_xor(vec_short8 a, vec_bshort8 b)
2428 return (spu_xor(a, (vec_short8)(b)));
2431 static inline vec_uint4 vec_xor(vec_uint4 a, vec_uint4 b)
2433 return (spu_xor(a, b));
2436 static inline vec_int4 vec_xor(vec_int4 a, vec_int4 b)
2438 return (spu_xor(a, b));
2441 static inline vec_int4 vec_xor(vec_bint4 a, vec_int4 b)
2443 return (spu_xor((vec_int4)(a), b));
2446 static inline vec_int4 vec_xor(vec_int4 a, vec_bint4 b)
2448 return (spu_xor(a, (vec_int4)(b)));
2451 static inline vec_float4 vec_xor(vec_float4 a, vec_float4 b)
2453 return (spu_xor(a, b));
2456 static inline vec_float4 vec_xor(vec_bint4 a, vec_float4 b)
2458 return (spu_xor((vec_float4)(a), b));
2461 static inline vec_float4 vec_xor(vec_float4 a, vec_bint4 b)
2463 return (spu_xor(a, (vec_float4)(b)));
2466 /************************************************************************
2467 * PREDICATES
2468 ************************************************************************/
2470 /* vec_all_eq (all elements equal)
2471 * ==========
2473 static inline int vec_all_eq(vec_uchar16 a, vec_uchar16 b)
2475 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0xFFFF));
2478 static inline int vec_all_eq(vec_char16 a, vec_char16 b)
2480 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0xFFFF));
2483 static inline int vec_all_eq(vec_bchar16 a, vec_char16 b)
2485 return ((int)(spu_extract(spu_gather(spu_cmpeq((vec_char16)(a), b)), 0) == 0xFFFF));
2488 static inline int vec_all_eq(vec_char16 a, vec_bchar16 b)
2490 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, (vec_char16)(b))), 0) == 0xFFFF));
2493 static inline int vec_all_eq(vec_ushort8 a, vec_ushort8 b)
2495 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0xFF));
2498 static inline int vec_all_eq(vec_short8 a, vec_short8 b)
2500 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0xFF));
2503 static inline int vec_all_eq(vec_bshort8 a, vec_short8 b)
2505 return ((int)(spu_extract(spu_gather(spu_cmpeq((vec_short8)(a), b)), 0) == 0xFF));
2508 static inline int vec_all_eq(vec_short8 a, vec_bshort8 b)
2510 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, (vec_short8)(b))), 0) == 0xFF));
2513 static inline int vec_all_eq(vec_uint4 a, vec_uint4 b)
2515 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0xF));
2518 static inline int vec_all_eq(vec_int4 a, vec_int4 b)
2520 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0xF));
2523 static inline int vec_all_eq(vec_bint4 a, vec_int4 b)
2525 return ((int)(spu_extract(spu_gather(spu_cmpeq((vec_int4)(a), b)), 0) == 0xF));
2528 static inline int vec_all_eq(vec_int4 a, vec_bint4 b)
2530 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, (vec_int4)(b))), 0) == 0xF));
2533 static inline int vec_all_eq(vec_float4 a, vec_float4 b)
2535 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0xF));
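/* Usage sketch (illustrative only; the helper name example_all_eq is
 * hypothetical). Most predicates in this section follow the same pattern:
 * spu_gather packs the most significant bit of each comparison element into
 * a mask, which is compared against the all-elements value (0xFFFF, 0xFF or
 * 0xF, depending on element count) or against 0; a few of the four-element
 * forms use spu_orx instead.
 */
static inline int example_all_eq(void)
{
  vec_uint4 a = spu_splats((unsigned int)7);
  vec_uint4 b = spu_splats((unsigned int)7);
  return vec_all_eq(a, b);   /* 1: all four words compare equal (mask == 0xF) */
}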
2539 /* vec_all_ge (all elements greater than or equal)
2540 * ==========
2542 static inline int vec_all_ge(vec_uchar16 a, vec_uchar16 b)
2544 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0));
2547 static inline int vec_all_ge(vec_char16 a, vec_char16 b)
2549 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0));
2552 static inline int vec_all_ge(vec_bchar16 a, vec_char16 b)
2554 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, (vec_char16)(a))), 0) == 0));
2557 static inline int vec_all_ge(vec_char16 a, vec_bchar16 b)
2559 return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_char16)(b), a)), 0) == 0));
2562 static inline int vec_all_ge(vec_ushort8 a, vec_ushort8 b)
2564 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0));
2567 static inline int vec_all_ge(vec_short8 a, vec_short8 b)
2569 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0));
2572 static inline int vec_all_ge(vec_bshort8 a, vec_short8 b)
2574 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, (vec_short8)(a))), 0) == 0));
2577 static inline int vec_all_ge(vec_short8 a, vec_bshort8 b)
2579 return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_short8)(b), a)), 0) == 0));
2582 static inline int vec_all_ge(vec_uint4 a, vec_uint4 b)
2584 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0));
2587 static inline int vec_all_ge(vec_int4 a, vec_int4 b)
2589 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0));
2592 static inline int vec_all_ge(vec_bint4 a, vec_int4 b)
2594 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, (vec_int4)(a))), 0) == 0));
2597 static inline int vec_all_ge(vec_int4 a, vec_bint4 b)
2599 return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_int4)(b), a)), 0) == 0));
2602 static inline int vec_all_ge(vec_float4 a, vec_float4 b)
2604 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0));
2608 /* vec_all_gt (all elements greater than)
2609 * ==========
2611 static inline int vec_all_gt(vec_uchar16 a, vec_uchar16 b)
2613 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0xFFFF));
2616 static inline int vec_all_gt(vec_char16 a, vec_char16 b)
2618 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0xFFFF));
2621 static inline int vec_all_gt(vec_bchar16 a, vec_char16 b)
2623 return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_char16)(a), b)), 0) == 0xFFFF));
2626 static inline int vec_all_gt(vec_char16 a, vec_bchar16 b)
2628 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, (vec_char16)(b))), 0) == 0xFFFF));
2631 static inline int vec_all_gt(vec_ushort8 a, vec_ushort8 b)
2633 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0xFF));
2636 static inline int vec_all_gt(vec_short8 a, vec_short8 b)
2638 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0xFF));
2641 static inline int vec_all_gt(vec_bshort8 a, vec_short8 b)
2643 return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_short8)(a), b)), 0) == 0xFF));
2646 static inline int vec_all_gt(vec_short8 a, vec_bshort8 b)
2648 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, (vec_short8)(b))), 0) == 0xFF));
2651 static inline int vec_all_gt(vec_uint4 a, vec_uint4 b)
2653 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0xF));
2656 static inline int vec_all_gt(vec_int4 a, vec_int4 b)
2658 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0xF));
2661 static inline int vec_all_gt(vec_bint4 a, vec_int4 b)
2663 return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_int4)(a), b)), 0) == 0xF));
2666 static inline int vec_all_gt(vec_int4 a, vec_bint4 b)
2668 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, (vec_int4)(b))), 0) == 0xF));
2671 static inline int vec_all_gt(vec_float4 a, vec_float4 b)
2673 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0xF));
2677 /* vec_all_in (all elements in bounds)
2678 * ==========
2680 static inline int vec_all_in(vec_float4 a, vec_float4 b)
2682 return (spu_extract(spu_gather(spu_nor(spu_cmpabsgt(a, b), (vec_uint4)(spu_rlmaska((vec_int4)(b), -31)))), 0) == 0xF);
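/* Usage sketch (illustrative only; the helper name example_all_in is
 * hypothetical). An element is "in bounds" when |a[i]| <= b[i] and b[i] is
 * non-negative, which is what the spu_nor of the two tests above expresses.
 */
static inline int example_all_in(void)
{
  vec_float4 a = { 0.5f, -1.0f, 0.0f, 1.0f };
  vec_float4 bound = spu_splats(1.0f);
  return vec_all_in(a, bound);   /* 1: every |a[i]| <= 1.0f and the bound is non-negative */
}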
2686 /* vec_all_le (all elements less than or equal)
2687 * ==========
2689 static inline int vec_all_le(vec_uchar16 a, vec_uchar16 b)
2691 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0));
2694 static inline int vec_all_le(vec_char16 a, vec_char16 b)
2696 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0));
2699 static inline int vec_all_le(vec_bchar16 a, vec_char16 b)
2701 return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_char16)(a), b)), 0) == 0));
2704 static inline int vec_all_le(vec_char16 a, vec_bchar16 b)
2706 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, (vec_char16)(b))), 0) == 0));
2709 static inline int vec_all_le(vec_ushort8 a, vec_ushort8 b)
2711 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0));
2714 static inline int vec_all_le(vec_short8 a, vec_short8 b)
2716 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0));
2719 static inline int vec_all_le(vec_bshort8 a, vec_short8 b)
2721 return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_short8)(a), b)), 0) == 0));
2724 static inline int vec_all_le(vec_short8 a, vec_bshort8 b)
2726 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, (vec_short8)(b))), 0) == 0));
2729 static inline int vec_all_le(vec_uint4 a, vec_uint4 b)
2731 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0));
2734 static inline int vec_all_le(vec_int4 a, vec_int4 b)
2736 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0));
2739 static inline int vec_all_le(vec_bint4 a, vec_int4 b)
2741 return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_int4)(a), b)), 0) == 0));
2744 static inline int vec_all_le(vec_int4 a, vec_bint4 b)
2746 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, (vec_int4)(b))), 0) == 0));
2749 static inline int vec_all_le(vec_float4 a, vec_float4 b)
2751 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0));
2755 /* vec_all_lt (all elements less than)
2756 * ==========
2758 static inline int vec_all_lt(vec_uchar16 a, vec_uchar16 b)
2760 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0xFFFF));
2763 static inline int vec_all_lt(vec_char16 a, vec_char16 b)
2765 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0xFFFF));
2768 static inline int vec_all_lt(vec_bchar16 a, vec_char16 b)
2770 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, (vec_char16)(a))), 0) == 0xFFFF));
2773 static inline int vec_all_lt(vec_char16 a, vec_bchar16 b)
2775 return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_char16)(b), a)), 0) == 0xFFFF));
2778 static inline int vec_all_lt(vec_ushort8 a, vec_ushort8 b)
2780 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0xFF));
2783 static inline int vec_all_lt(vec_short8 a, vec_short8 b)
2785 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0xFF));
2788 static inline int vec_all_lt(vec_bshort8 a, vec_short8 b)
2790 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, (vec_short8)(a))), 0) == 0xFF));
2793 static inline int vec_all_lt(vec_short8 a, vec_bshort8 b)
2795 return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_short8)(b), a)), 0) == 0xFF));
2798 static inline int vec_all_lt(vec_uint4 a, vec_uint4 b)
2800 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0xF));
2803 static inline int vec_all_lt(vec_int4 a, vec_int4 b)
2805 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0xF));
2808 static inline int vec_all_lt(vec_bint4 a, vec_int4 b)
2810 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, (vec_int4)(a))), 0) == 0xF));
2813 static inline int vec_all_lt(vec_int4 a, vec_bint4 b)
2815 return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_int4)(b), a)), 0) == 0xF));
2818 static inline int vec_all_lt(vec_float4 a, vec_float4 b)
2820 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0xF));
2824 /* vec_all_nan (all elements not a number)
2825 * ===========
2827 static inline int vec_all_nan(vec_float4 a)
2829 vec_uint4 exp, man;
2830 vec_uint4 exp_mask = spu_splats((unsigned int)0x7F800000);
2832 exp = spu_and((vec_uint4)(a), exp_mask);
2833 man = spu_and((vec_uint4)(a), spu_splats((unsigned int)0x007FFFFF));
2834 return ((int)(spu_extract(spu_gather(spu_andc(spu_cmpeq(exp, exp_mask),
2835 spu_cmpeq(man, 0))), 0) == 0xF));
2838 #define vec_all_nan(_a) (0)
2841 /* vec_all_ne (all elements not equal)
2842 * ==========
2844 static inline int vec_all_ne(vec_uchar16 a, vec_uchar16 b)
2846 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0));
2849 static inline int vec_all_ne(vec_char16 a, vec_char16 b)
2851 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0));
2854 static inline int vec_all_ne(vec_bchar16 a, vec_char16 b)
2856 return ((int)(spu_extract(spu_gather(spu_cmpeq((vec_char16)(a), b)), 0) == 0));
2859 static inline int vec_all_ne(vec_char16 a, vec_bchar16 b)
2861 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, (vec_char16)(b))), 0) == 0));
2864 static inline int vec_all_ne(vec_ushort8 a, vec_ushort8 b)
2866 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0));
2869 static inline int vec_all_ne(vec_short8 a, vec_short8 b)
2871 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0));
2874 static inline int vec_all_ne(vec_bshort8 a, vec_short8 b)
2876 return ((int)(spu_extract(spu_gather(spu_cmpeq((vec_short8)(a), b)), 0) == 0));
2879 static inline int vec_all_ne(vec_short8 a, vec_bshort8 b)
2881 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, (vec_short8)(b))), 0) == 0));
2884 static inline int vec_all_ne(vec_uint4 a, vec_uint4 b)
2886 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0));
2889 static inline int vec_all_ne(vec_int4 a, vec_int4 b)
2891 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0));
2894 static inline int vec_all_ne(vec_bint4 a, vec_int4 b)
2896 return ((int)(spu_extract(spu_gather(spu_cmpeq((vec_int4)(a), b)), 0) == 0));
2899 static inline int vec_all_ne(vec_int4 a, vec_bint4 b)
2901 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, (vec_int4)(b))), 0) == 0));
2904 static inline int vec_all_ne(vec_float4 a, vec_float4 b)
2906 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0));
2910 /* vec_all_nge (all elements not greater than or equal)
2911 * ===========
2913 static inline int vec_all_nge(vec_float4 a, vec_float4 b)
2915 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0xF));
2919 /* vec_all_ngt (all elements not greater than)
2920 * ===========
2922 static inline int vec_all_ngt(vec_float4 a, vec_float4 b)
2924 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0));
2928 /* vec_all_nle (all elements not less than or equal)
2929 * ===========
2931 static inline int vec_all_nle(vec_float4 a, vec_float4 b)
2933 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0xF));
2937 /* vec_all_nlt (all elements not less than)
2938 * ===========
2940 static inline int vec_all_nlt(vec_float4 a, vec_float4 b)
2942 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0));
2946 /* vec_all_numeric (all elements numeric)
2947 * ===============
2949 static inline int vec_all_numeric(vec_float4 a)
2951 vec_uint4 exp;
2953 exp = spu_and(spu_rlmask((vec_uint4)(a), -23), 0xFF);
2954 return ((int)(spu_extract(spu_gather(spu_cmpeq(exp, 255)), 0) == 0));
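/* Usage sketch (illustrative only; the helper name example_all_numeric is
 * hypothetical). An element is numeric when its exponent field is not all
 * ones (255), i.e. it encodes neither an infinity nor a NaN.
 */
static inline int example_all_numeric(void)
{
  vec_float4 v = { 1.0f, -2.5f, 0.0f, 3.0e9f };
  return vec_all_numeric(v);   /* 1: no element has an all-ones exponent */
}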
2959 /* vec_any_eq (any elements equal)
2960 * ==========
2962 static inline int vec_any_eq(vec_uchar16 a, vec_uchar16 b)
2964 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) != 0));
2967 static inline int vec_any_eq(vec_char16 a, vec_char16 b)
2969 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) != 0));
2972 static inline int vec_any_eq(vec_bchar16 a, vec_char16 b)
2974 return ((int)(spu_extract(spu_gather(spu_cmpeq((vec_char16)(a), b)), 0) != 0));
2977 static inline int vec_any_eq(vec_char16 a, vec_bchar16 b)
2979 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, (vec_char16)(b))), 0) != 0));
2982 static inline int vec_any_eq(vec_ushort8 a, vec_ushort8 b)
2984 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) != 0));
2987 static inline int vec_any_eq(vec_short8 a, vec_short8 b)
2989 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) != 0));
2992 static inline int vec_any_eq(vec_bshort8 a, vec_short8 b)
2994 return ((int)(spu_extract(spu_gather(spu_cmpeq((vec_short8)(a), b)), 0) != 0));
2997 static inline int vec_any_eq(vec_short8 a, vec_bshort8 b)
2999 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, (vec_short8)(b))), 0) != 0));
3002 static inline int vec_any_eq(vec_uint4 a, vec_uint4 b)
3004 return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpeq(a, b), -31)), 0)));
3007 static inline int vec_any_eq(vec_int4 a, vec_int4 b)
3009 return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpeq(a, b), -31)), 0)));
3012 static inline int vec_any_eq(vec_bint4 a, vec_int4 b)
3014 return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpeq((vec_int4)(a), b), -31)), 0)));
3017 static inline int vec_any_eq(vec_int4 a, vec_bint4 b)
3019 return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpeq(a, (vec_int4)(b)), -31)), 0)));
3022 static inline int vec_any_eq(vec_float4 a, vec_float4 b)
3024 return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpeq(a, b), -31)), 0)));
3027 /* vec_any_ge (any elements greater than or equal)
3028 * ==========
3030 static inline int vec_any_ge(vec_uchar16 a, vec_uchar16 b)
3032 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) != 0xFFFF));
3035 static inline int vec_any_ge(vec_char16 a, vec_char16 b)
3037 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) != 0xFFFF));
3040 static inline int vec_any_ge(vec_bchar16 a, vec_char16 b)
3042 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, (vec_char16)(a))), 0) != 0xFFFF));
3045 static inline int vec_any_ge(vec_char16 a, vec_bchar16 b)
3047 return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_char16)(b), a)), 0) != 0xFFFF));
3050 static inline int vec_any_ge(vec_ushort8 a, vec_ushort8 b)
3052 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) != 0xFF));
3055 static inline int vec_any_ge(vec_short8 a, vec_short8 b)
3057 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) != 0xFF));
3060 static inline int vec_any_ge(vec_bshort8 a, vec_short8 b)
3062 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, (vec_short8)(a))), 0) != 0xFF));
3065 static inline int vec_any_ge(vec_short8 a, vec_bshort8 b)
3067 return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_short8)(b), a)), 0) != 0xFF));
3070 static inline int vec_any_ge(vec_uint4 a, vec_uint4 b)
3072 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) != 0xF));
3075 static inline int vec_any_ge(vec_int4 a, vec_int4 b)
3077 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) != 0xF));
3080 static inline int vec_any_ge(vec_bint4 a, vec_int4 b)
3082 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, (vec_int4)(a))), 0) != 0xF));
3085 static inline int vec_any_ge(vec_int4 a, vec_bint4 b)
3087 return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_int4)(b), a)), 0) != 0xF));
3090 static inline int vec_any_ge(vec_float4 a, vec_float4 b)
3092 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) != 0xF));
3096 /* vec_any_gt (any elements greater than)
3097 * ==========
3099 static inline int vec_any_gt(vec_uchar16 a, vec_uchar16 b)
3101 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) != 0));
3104 static inline int vec_any_gt(vec_char16 a, vec_char16 b)
3106 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) != 0));
3109 static inline int vec_any_gt(vec_bchar16 a, vec_char16 b)
3111 return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_char16)(a), b)), 0) != 0));
3114 static inline int vec_any_gt(vec_char16 a, vec_bchar16 b)
3116 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, (vec_char16)(b))), 0) != 0));
3119 static inline int vec_any_gt(vec_ushort8 a, vec_ushort8 b)
3121 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) != 0));
3124 static inline int vec_any_gt(vec_short8 a, vec_short8 b)
3126 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) != 0));
3129 static inline int vec_any_gt(vec_bshort8 a, vec_short8 b)
3131 return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_short8)(a), b)), 0) != 0));
3134 static inline int vec_any_gt(vec_short8 a, vec_bshort8 b)
3136 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, (vec_short8)(b))), 0) != 0));
3140 static inline int vec_any_gt(vec_uint4 a, vec_uint4 b)
3142 return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpgt(a, b), -31)), 0)));
3145 static inline int vec_any_gt(vec_int4 a, vec_int4 b)
3147 return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpgt(a, b), -31)), 0)));
3150 static inline int vec_any_gt(vec_bint4 a, vec_int4 b)
3152 return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpgt((vec_int4)(a), b), -31)), 0)));
3155 static inline int vec_any_gt(vec_int4 a, vec_bint4 b)
3157 return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpgt(a, (vec_int4)(b)), -31)), 0)));
3160 static inline int vec_any_gt(vec_float4 a, vec_float4 b)
3162 return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpgt(a, b), -31)), 0)));
3165 /* vec_any_le (any elements less than or equal)
3166 * ==========
3168 static inline int vec_any_le(vec_uchar16 a, vec_uchar16 b)
3170 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) != 0xFFFF));
3173 static inline int vec_any_le(vec_char16 a, vec_char16 b)
3175 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) != 0xFFFF));
3178 static inline int vec_any_le(vec_bchar16 a, vec_char16 b)
3180 return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_char16)(a), b)), 0) != 0xFFFF));
3183 static inline int vec_any_le(vec_char16 a, vec_bchar16 b)
3185 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, (vec_char16)(b))), 0) != 0xFFFF));
3188 static inline int vec_any_le(vec_ushort8 a, vec_ushort8 b)
3190 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) != 0xFF));
3193 static inline int vec_any_le(vec_short8 a, vec_short8 b)
3195 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) != 0xFF));
3198 static inline int vec_any_le(vec_bshort8 a, vec_short8 b)
3200 return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_short8)(a), b)), 0) != 0xFF));
3203 static inline int vec_any_le(vec_short8 a, vec_bshort8 b)
3205 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, (vec_short8)(b))), 0) != 0xFF));
3208 static inline int vec_any_le(vec_uint4 a, vec_uint4 b)
3210 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) != 0xF));
3213 static inline int vec_any_le(vec_int4 a, vec_int4 b)
3215 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) != 0xF));
3218 static inline int vec_any_le(vec_bint4 a, vec_int4 b)
3220 return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_int4)(a), b)), 0) != 0xF));
3223 static inline int vec_any_le(vec_int4 a, vec_bint4 b)
3225 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, (vec_int4)(b))), 0) != 0xF));
3228 static inline int vec_any_le(vec_float4 a, vec_float4 b)
3230 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) != 0xF));
3234 /* vec_any_lt (any elements less than)
3235 * ==========
3237 static inline int vec_any_lt(vec_uchar16 a, vec_uchar16 b)
3239 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) != 0));
3242 static inline int vec_any_lt(vec_char16 a, vec_char16 b)
3244 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) != 0));
3247 static inline int vec_any_lt(vec_bchar16 a, vec_char16 b)
3249 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, (vec_char16)(a))), 0) != 0));
3252 static inline int vec_any_lt(vec_char16 a, vec_bchar16 b)
3254 return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_char16)(b), a)), 0) != 0));
3257 static inline int vec_any_lt(vec_ushort8 a, vec_ushort8 b)
3259 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) != 0));
3262 static inline int vec_any_lt(vec_short8 a, vec_short8 b)
3264 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) != 0));
3267 static inline int vec_any_lt(vec_bshort8 a, vec_short8 b)
3269 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, (vec_short8)(a))), 0) != 0));
3272 static inline int vec_any_lt(vec_short8 a, vec_bshort8 b)
3274 return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_short8)(b), a)), 0) != 0));
3277 static inline int vec_any_lt(vec_uint4 a, vec_uint4 b)
3279 return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpgt(b, a), -31)), 0)));
3282 static inline int vec_any_lt(vec_int4 a, vec_int4 b)
3284 return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpgt(b, a), -31)), 0)));
3287 static inline int vec_any_lt(vec_bint4 a, vec_int4 b)
3289 return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpgt(b, (vec_int4)(a)), -31)), 0)));
3292 static inline int vec_any_lt(vec_int4 a, vec_bint4 b)
3294 return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpgt((vec_int4)(b), a), -31)), 0)));
3297 static inline int vec_any_lt(vec_float4 a, vec_float4 b)
3299 return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpgt(b, a), -31)), 0)));
3302 /* vec_any_nan (any elements not a number)
3303 * ===========
3305 static inline int vec_any_nan(vec_float4 a)
3307 vec_uint4 exp, man;
3308 vec_uint4 exp_mask = spu_splats((unsigned int)0x7F800000);
3310 exp = spu_and((vec_uint4)(a), exp_mask);
3311 man = spu_and((vec_uint4)(a), spu_splats((unsigned int)0x007FFFFF));
3312 return ((int)(spu_extract(spu_gather(spu_andc(spu_cmpeq(exp, exp_mask),
3313 spu_cmpeq(man, 0))), 0) != 0));
3317 /* vec_any_ne (any elements not equal)
3318 * ==========
3320 static inline int vec_any_ne(vec_uchar16 a, vec_uchar16 b)
3322 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) != 0xFFFF));
3325 static inline int vec_any_ne(vec_char16 a, vec_char16 b)
3327 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) != 0xFFFF));
3330 static inline int vec_any_ne(vec_bchar16 a, vec_char16 b)
3332 return ((int)(spu_extract(spu_gather(spu_cmpeq((vec_char16)(a), b)), 0) != 0xFFFF));
3335 static inline int vec_any_ne(vec_char16 a, vec_bchar16 b)
3337 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, (vec_char16)(b))), 0) != 0xFFFF));
3340 static inline int vec_any_ne(vec_ushort8 a, vec_ushort8 b)
3342 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) != 0xFF));
3345 static inline int vec_any_ne(vec_short8 a, vec_short8 b)
3347 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) != 0xFF));
3350 static inline int vec_any_ne(vec_bshort8 a, vec_short8 b)
3352 return ((int)(spu_extract(spu_gather(spu_cmpeq((vec_short8)(a), b)), 0) != 0xFF));
3355 static inline int vec_any_ne(vec_short8 a, vec_bshort8 b)
3357 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, (vec_short8)(b))), 0) != 0xFF));
3360 static inline int vec_any_ne(vec_uint4 a, vec_uint4 b)
3362 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) != 0xF));
3365 static inline int vec_any_ne(vec_int4 a, vec_int4 b)
3367 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) != 0xF));
3370 static inline int vec_any_ne(vec_bint4 a, vec_int4 b)
3372 return ((int)(spu_extract(spu_gather(spu_cmpeq((vec_int4)(a), b)), 0) != 0xF));
3375 static inline int vec_any_ne(vec_int4 a, vec_bint4 b)
3377 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, (vec_int4)(b))), 0) != 0xF));
3380 static inline int vec_any_ne(vec_float4 a, vec_float4 b)
3382 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) != 0xF));
3386 /* vec_any_nge (any elements not greater than or equal)
3387 * ===========
3389 static inline int vec_any_nge(vec_float4 a, vec_float4 b)
3391 return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpgt(b, a), -31)), 0)));
3394 /* vec_any_ngt (any elements not greater than)
3395 * ===========
3397 static inline int vec_any_ngt(vec_float4 a, vec_float4 b)
3399 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) != 0xF));
3403 /* vec_any_nle (any elements not less than or equal)
3404 * ===========
3406 static inline int vec_any_nle(vec_float4 a, vec_float4 b)
3408 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) != 0));
3412 /* vec_any_nlt (any elements not less than)
3413 * ===========
3415 static inline int vec_any_nlt(vec_float4 a, vec_float4 b)
3417 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) != 0xF));
3421 /* vec_any_numeric (any elements numeric)
3422 * ===============
3424 static inline int vec_any_numeric(vec_float4 a)
3426 vec_uint4 exp;
3428 exp = spu_and(spu_rlmask((vec_uint4)(a), -23), 0xFF);
3429 return ((int)(spu_extract(spu_gather(spu_cmpeq(exp, 255)), 0) != 0xF));
3433 /* vec_any_out (any elements out of bounds)
3434 * ===========
3436 static inline int vec_any_out(vec_float4 a, vec_float4 b)
3438 return (spu_extract(spu_gather(spu_nor(spu_cmpabsgt(a, b), (vec_uint4)(spu_rlmaska((vec_int4)(b), -31)))), 0) != 0xF);
3442 /* CBE Language Extension Intrinsics
3445 /* vec_extract (extract element from vector)
3446 * ===========
3448 #define vec_extract(_a, _element) spu_extract(_a, _element)
3451 /* vec_insert (insert scalar into specified vector element)
3452 * ==========
3454 #define vec_insert(_a, _b, _element) spu_insert(_a, _b, _element)
3456 /* vec_lvlx (load vector left indexed)
3457 * ========
3459 static inline vec_uchar16 vec_lvlx(int a, unsigned char *b)
3461 vec_uchar16 *p = (vec_uchar16 *)((unsigned char *)(b) + a);
3462 return(spu_slqwbyte(*p, (unsigned int)p & 0xF));
3465 static inline vec_uchar16 vec_lvlx(int a, vec_uchar16 *b)
3467 vec_uchar16 *p = (vec_uchar16 *)((unsigned char *)(b) + a);
3468 return(spu_slqwbyte(*p, (unsigned int)p & 0xF));
3471 static inline vec_char16 vec_lvlx(int a, signed char *b)
3473 vec_char16 *p = (vec_char16 *)((unsigned char *)(b) + a);
3474 return(spu_slqwbyte(*p, (unsigned int)p & 0xF));
3477 static inline vec_char16 vec_lvlx(int a, vec_char16 *b)
3479 vec_char16 *p = (vec_char16 *)((unsigned char *)(b) + a);
3480 return(spu_slqwbyte(*p, (unsigned int)p & 0xF));
3483 static inline vec_ushort8 vec_lvlx(int a, unsigned short *b)
3485 vec_ushort8 *p = (vec_ushort8 *)((unsigned char *)(b) + a);
3486 return(spu_slqwbyte(*p, (unsigned int)p & 0xF));
3489 static inline vec_ushort8 vec_lvlx(int a, vec_ushort8 *b)
3491 vec_ushort8 *p = (vec_ushort8 *)((unsigned char *)(b) + a);
3492 return(spu_slqwbyte(*p, (unsigned int)p & 0xF));
3495 static inline vec_short8 vec_lvlx(int a, signed short *b)
3497 vec_short8 *p = (vec_short8 *)((unsigned char *)(b) + a);
3498 return(spu_slqwbyte(*p, (unsigned int)p & 0xF));
3501 static inline vec_short8 vec_lvlx(int a, vec_short8 *b)
3503 vec_short8 *p = (vec_short8 *)((unsigned char *)(b) + a);
3504 return(spu_slqwbyte(*p, (unsigned int)p & 0xF));
3507 static inline vec_uint4 vec_lvlx(int a, unsigned int *b)
3509 vec_uint4 *p = (vec_uint4 *)((unsigned char *)(b) + a);
3510 return(spu_slqwbyte(*p, (unsigned int)p & 0xF));
3513 static inline vec_uint4 vec_lvlx(int a, vec_uint4 *b)
3515 vec_uint4 *p = (vec_uint4 *)((unsigned char *)(b) + a);
3516 return(spu_slqwbyte(*p, (unsigned int)p & 0xF));
3519 static inline vec_int4 vec_lvlx(int a, signed int *b)
3521 vec_int4 *p = (vec_int4 *)((unsigned char *)(b) + a);
3522 return(spu_slqwbyte(*p, (unsigned int)p & 0xF));
3525 static inline vec_int4 vec_lvlx(int a, vec_int4 *b)
3527 vec_int4 *p = (vec_int4 *)((unsigned char *)(b) + a);
3528 return(spu_slqwbyte(*p, (unsigned int)p & 0xF));
3531 static inline vec_float4 vec_lvlx(int a, float *b)
3533 vec_float4 *p = (vec_float4 *)((unsigned char *)(b) + a);
3534 return(spu_slqwbyte(*p, (unsigned int)p & 0xF));
3537 static inline vec_float4 vec_lvlx(int a, vec_float4 *b)
3539 vec_float4 *p = (vec_float4 *)((unsigned char *)(b) + a);
3540 return(spu_slqwbyte(*p, (unsigned int)p & 0xF));
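/* Usage sketch (illustrative only; the names example_lvlx_first_byte, src
 * and ofs are hypothetical). vec_lvlx loads the quadword containing the
 * (possibly unaligned) effective address and shifts it left by the byte
 * offset, so the addressed byte ends up in element 0 and the bytes past the
 * end of that quadword are zero.
 */
static inline unsigned char example_lvlx_first_byte(unsigned char *src, int ofs)
{
  vec_uchar16 left = vec_lvlx(ofs, src);
  return spu_extract(left, 0);   /* the byte stored at src + ofs */
}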
3544 /* vec_lvlxl (load vector left indexed last)
3545 * =========
3547 #define vec_lvlxl(_a, _b) vec_lvlx(_a, _b)
3550 /* vec_lvrx (load vector right indexed)
3551 * ========
3553 static inline vec_uchar16 vec_lvrx(int a, unsigned char *b)
3555 vec_uchar16 *p = (vec_uchar16 *)((unsigned char *)(b) + a);
3556 return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16));
3559 static inline vec_uchar16 vec_lvrx(int a, vec_uchar16 *b)
3561 vec_uchar16 *p = (vec_uchar16 *)((unsigned char *)(b) + a);
3562 return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16));
3565 static inline vec_char16 vec_lvrx(int a, signed char *b)
3567 vec_char16 *p = (vec_char16 *)((unsigned char *)(b) + a);
3568 return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16));
3571 static inline vec_char16 vec_lvrx(int a, vec_char16 *b)
3573 vec_char16 *p = (vec_char16 *)((unsigned char *)(b) + a);
3574 return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16));
3577 static inline vec_ushort8 vec_lvrx(int a, unsigned short *b)
3579 vec_ushort8 *p = (vec_ushort8 *)((unsigned char *)(b) + a);
3580 return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16));
3583 static inline vec_ushort8 vec_lvrx(int a, vec_ushort8 *b)
3585 vec_ushort8 *p = (vec_ushort8 *)((unsigned char *)(b) + a);
3586 return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16));
3589 static inline vec_short8 vec_lvrx(int a, signed short *b)
3591 vec_short8 *p = (vec_short8 *)((unsigned char *)(b) + a);
3592 return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16));
3595 static inline vec_short8 vec_lvrx(int a, vec_short8 *b)
3597 vec_short8 *p = (vec_short8 *)((unsigned char *)(b) + a);
3598 return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16));
3601 static inline vec_uint4 vec_lvrx(int a, unsigned int *b)
3603 vec_uint4 *p = (vec_uint4 *)((unsigned char *)(b) + a);
3604 return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16));
3607 static inline vec_uint4 vec_lvrx(int a, vec_uint4 *b)
3609 vec_uint4 *p = (vec_uint4 *)((unsigned char *)(b) + a);
3610 return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16));
3613 static inline vec_int4 vec_lvrx(int a, signed int *b)
3615 vec_int4 *p = (vec_int4 *)((unsigned char *)(b) + a);
3616 return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16));
3619 static inline vec_int4 vec_lvrx(int a, vec_int4 *b)
3621 vec_int4 *p = (vec_int4 *)((unsigned char *)(b) + a);
3622 return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16));
3625 static inline vec_float4 vec_lvrx(int a, float *b)
3627 vec_float4 *p = (vec_float4 *)((unsigned char *)(b) + a);
3628 return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16));
3631 static inline vec_float4 vec_lvrx(int a, vec_float4 *b)
3633 vec_float4 *p = (vec_float4 *)((unsigned char *)(b) + a);
3634 return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16));
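/* Usage sketch (illustrative only; the names example_load_unaligned, src and
 * ofs are hypothetical). vec_lvrx right-justifies the bytes that precede the
 * effective address within its quadword, so OR-ing a vec_lvlx of an address
 * with a vec_lvrx of that address plus 16 reassembles the 16 unaligned bytes
 * starting there.
 */
static inline vec_uchar16 example_load_unaligned(unsigned char *src, int ofs)
{
  vec_uchar16 left = vec_lvlx(ofs, src);        /* bytes from src+ofs to the end of its quadword */
  vec_uchar16 right = vec_lvrx(ofs + 16, src);  /* the remaining bytes from the next quadword */
  return (spu_or(left, right));
}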
3639 /* vec_lvrxl (load vector right indexed last)
3640 * =========
3642 #define vec_lvrxl(_a, _b) vec_lvrx(_a, _b)
3645 /* vec_promote (promote scalar to a vector)
3646 * ===========
3648 #define vec_promote(_a, _element) spu_promote(_a, _element)
3651 /* vec_splats (splat scalar to a vector)
3652 * ==========
3654 #define vec_splats(_a) spu_splats(_a)
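/* Usage sketch (illustrative only; the helper name example_scalar_helpers is
 * hypothetical). It exercises the scalar/vector conversion macros above,
 * which map directly onto their SPU counterparts.
 */
static inline int example_scalar_helpers(void)
{
  vec_int4 v = vec_splats(3);          /* { 3, 3, 3, 3 } */
  v = vec_insert(7, v, 2);             /* { 3, 3, 7, 3 } */
  vec_int4 p = vec_promote(1, 0);      /* element 0 is 1, the others are undefined */
  return (vec_extract(v, 2) + vec_extract(p, 0));   /* 7 + 1 */
}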
3657 /* vec_stvlx (store vector left indexed)
3658 * =========
3660 static inline void vec_stvlx(vec_uchar16 a, int b, unsigned char *c)
3662 int shift;
3663 vec_uchar16 *p = (vec_uchar16 *)((unsigned char *)(c) + b);
3665 shift = -((int)p & 0xF);
3666 *p = spu_sel(*p,
3667 spu_rlmaskqwbyte(a, shift),
3668 spu_rlmaskqwbyte(spu_splats((unsigned char)0xFF), shift));
3671 static inline void vec_stvlx(vec_uchar16 a, int b, vec_uchar16 *c)
3673 int shift;
3674 vec_uchar16 *p = (vec_uchar16 *)((unsigned char *)(c) + b);
3676 shift = -((int)p & 0xF);
3677 *p = spu_sel(*p,
3678 spu_rlmaskqwbyte(a, shift),
3679 spu_rlmaskqwbyte(spu_splats((unsigned char)0xFF), shift));
3682 static inline void vec_stvlx(vec_char16 a, int b, signed char *c)
3684 int shift;
3685 vec_char16 *p = (vec_char16 *)((unsigned char *)(c) + b);
3687 shift = -((int)p & 0xF);
3688 *p = spu_sel(*p,
3689 spu_rlmaskqwbyte(a, shift),
3690 spu_rlmaskqwbyte(spu_splats((unsigned char)0xFF), shift));
3693 static inline void vec_stvlx(vec_char16 a, int b, vec_char16 *c)
3695 int shift;
3696 vec_char16 *p = (vec_char16 *)((unsigned char *)(c) + b);
3698 shift = -((int)p & 0xF);
3699 *p = spu_sel(*p,
3700 spu_rlmaskqwbyte(a, shift),
3701 spu_rlmaskqwbyte(spu_splats((unsigned char)0xFF), shift));
3704 static inline void vec_stvlx(vec_ushort8 a, int b, unsigned short *c)
3706 int shift;
3707 vec_ushort8 *p = (vec_ushort8 *)((unsigned char *)(c) + b);
3709 shift = -((int)p & 0xF);
3710 *p = spu_sel(*p,
3711 spu_rlmaskqwbyte(a, shift),
3712 spu_rlmaskqwbyte(spu_splats((unsigned short)0xFFFF), shift));
3715 static inline void vec_stvlx(vec_ushort8 a, int b, vec_ushort8 *c)
3717 int shift;
3718 vec_ushort8 *p = (vec_ushort8 *)((unsigned char *)(c) + b);
3720 shift = -((int)p & 0xF);
3721 *p = spu_sel(*p,
3722 spu_rlmaskqwbyte(a, shift),
3723 spu_rlmaskqwbyte(spu_splats((unsigned short)0xFFFF), shift));
3726 static inline void vec_stvlx(vec_short8 a, int b, signed short *c)
3728 int shift;
3729 vec_short8 *p = (vec_short8 *)((unsigned char *)(c) + b);
3731 shift = -((int)p & 0xF);
3732 *p = spu_sel(*p,
3733 spu_rlmaskqwbyte(a, shift),
3734 spu_rlmaskqwbyte(spu_splats((unsigned short)0xFFFF), shift));
3737 static inline void vec_stvlx(vec_short8 a, int b, vec_short8 *c)
3739 int shift;
3740 vec_short8 *p = (vec_short8 *)((unsigned char *)(c) + b);
3742 shift = -((int)p & 0xF);
3743 *p = spu_sel(*p,
3744 spu_rlmaskqwbyte(a, shift),
3745 spu_rlmaskqwbyte(spu_splats((unsigned short)0xFFFF), shift));
3748 static inline void vec_stvlx(vec_uint4 a, int b, unsigned int *c)
3750 int shift;
3751 vec_uint4 *p = (vec_uint4 *)((unsigned char *)(c) + b);
3753 shift = -((int)p & 0xF);
3754 *p = spu_sel(*p,
3755 spu_rlmaskqwbyte(a, shift),
3756 spu_rlmaskqwbyte(spu_splats((unsigned int)0xFFFFFFFF), shift));
3759 static inline void vec_stvlx(vec_uint4 a, int b, vec_uint4 *c)
3761 int shift;
3762 vec_uint4 *p = (vec_uint4 *)((unsigned char *)(c) + b);
3764 shift = -((int)p & 0xF);
3765 *p = spu_sel(*p,
3766 spu_rlmaskqwbyte(a, shift),
3767 spu_rlmaskqwbyte(spu_splats((unsigned int)0xFFFFFFFF), shift));
3770 static inline void vec_stvlx(vec_int4 a, int b, signed int *c)
3772 int shift;
3773 vec_int4 *p = (vec_int4 *)((unsigned char *)(c) + b);
3775 shift = -((int)p & 0xF);
3776 *p = spu_sel(*p,
3777 spu_rlmaskqwbyte(a, shift),
3778 spu_rlmaskqwbyte(spu_splats((unsigned int)0xFFFFFFFF), shift));
3781 static inline void vec_stvlx(vec_int4 a, int b, vec_int4 *c)
3783 int shift;
3784 vec_int4 *p = (vec_int4 *)((unsigned char *)(c) + b);
3786 shift = -((int)p & 0xF);
3787 *p = spu_sel(*p,
3788 spu_rlmaskqwbyte(a, shift),
3789 spu_rlmaskqwbyte(spu_splats((unsigned int)0xFFFFFFFF), shift));
3792 static inline void vec_stvlx(vec_float4 a, int b, float *c)
3794 int shift;
3795 vec_float4 *p = (vec_float4 *)((unsigned char *)(c) + b);
3797 shift = -((int)p & 0xF);
3798 *p = spu_sel(*p,
3799 spu_rlmaskqwbyte(a, shift),
3800 spu_rlmaskqwbyte(spu_splats((unsigned int)0xFFFFFFFF), shift));
3803 static inline void vec_stvlx(vec_float4 a, int b, vec_float4 *c)
3805 int shift;
3806 vec_float4 *p = (vec_float4 *)((unsigned char *)(c) + b);
3808 shift = -((int)p & 0xF);
3809 *p = spu_sel(*p,
3810 spu_rlmaskqwbyte(a, shift),
3811 spu_rlmaskqwbyte(spu_splats((unsigned int)0xFFFFFFFF), shift));
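/* Usage sketch (illustrative only; the names example_stvlx_tail, dst and ofs
 * are hypothetical). Each vec_stvlx overload is a read-modify-write of one
 * quadword, as the spu_sel mask above shows.
 */
static inline void example_stvlx_tail(vec_uchar16 v, unsigned char *dst, int ofs)
{
  /* stores the leading bytes of v from dst+ofs through the end of that
   * 16-byte quadword; the bytes before the address are left unchanged */
  vec_stvlx(v, ofs, dst);
}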
3814 /* vec_stvlxl (store vector left indexed last)
3815 * ==========
3817 #define vec_stvlxl(_a, _b, _c) vec_stvlx(_a, _b, _c)
3820 /* vec_stvrx (store vector right indexed)
3821 * =========
3823 static inline void vec_stvrx(vec_uchar16 a, int b, unsigned char *c)
3825 int shift;
3826 vec_uchar16 *p = (vec_uchar16 *)((unsigned char *)(c) + b);
3828 shift = 16-((int)p & 0xF);
3829 *p = spu_sel(*p,
3830 spu_slqwbyte(a, shift),
3831 spu_slqwbyte(spu_splats((unsigned char)0xFF), shift));
3834 static inline void vec_stvrx(vec_uchar16 a, int b, vec_uchar16 *c)
3836 int shift;
3837 vec_uchar16 *p = (vec_uchar16 *)((unsigned char *)(c) + b);
3839 shift = 16-((int)p & 0xF);
3840 *p = spu_sel(*p,
3841 spu_slqwbyte(a, shift),
3842 spu_slqwbyte(spu_splats((unsigned char)0xFF), shift));
3845 static inline void vec_stvrx(vec_char16 a, int b, signed char *c)
3847 int shift;
3848 vec_char16 *p = (vec_char16 *)((unsigned char *)(c) + b);
3850 shift = 16-((int)p & 0xF);
3851 *p = spu_sel(*p,
3852 spu_slqwbyte(a, shift),
3853 spu_slqwbyte(spu_splats((unsigned char)0xFF), shift));
3856 static inline void vec_stvrx(vec_char16 a, int b, vec_char16 *c)
3858 int shift;
3859 vec_char16 *p = (vec_char16 *)((unsigned char *)(c) + b);
3861 shift = 16-((int)p & 0xF);
3862 *p = spu_sel(*p,
3863 spu_slqwbyte(a, shift),
3864 spu_slqwbyte(spu_splats((unsigned char)0xFF), shift));
3867 static inline void vec_stvrx(vec_ushort8 a, int b, unsigned short *c)
3869 int shift;
3870 vec_ushort8 *p = (vec_ushort8 *)((unsigned char *)(c) + b);
3872 shift = 16-((int)p & 0xF);
3873 *p = spu_sel(*p,
3874 spu_slqwbyte(a, shift),
3875 spu_slqwbyte(spu_splats((unsigned short)0xFFFF), shift));
3878 static inline void vec_stvrx(vec_ushort8 a, int b, vec_ushort8 *c)
3880 int shift;
3881 vec_ushort8 *p = (vec_ushort8 *)((unsigned char *)(c) + b);
3883 shift = 16-((int)p & 0xF);
3884 *p = spu_sel(*p,
3885 spu_slqwbyte(a, shift),
3886 spu_slqwbyte(spu_splats((unsigned short)0xFFFF), shift));
3889 static inline void vec_stvrx(vec_short8 a, int b, signed short *c)
3891 int shift;
3892 vec_short8 *p = (vec_short8 *)((unsigned char *)(c) + b);
3894 shift = 16-((int)p & 0xF);
3895 *p = spu_sel(*p,
3896 spu_slqwbyte(a, shift),
3897 spu_slqwbyte(spu_splats((unsigned short)0xFFFF), shift));
3900 static inline void vec_stvrx(vec_short8 a, int b, vec_short8 *c)
3902 int shift;
3903 vec_short8 *p = (vec_short8 *)((unsigned char *)(c) + b);
3905 shift = 16-((int)p & 0xF);
3906 *p = spu_sel(*p,
3907 spu_slqwbyte(a, shift),
3908 spu_slqwbyte(spu_splats((unsigned short)0xFFFF), shift));
3911 static inline void vec_stvrx(vec_uint4 a, int b, unsigned int *c)
3913 int shift;
3914 vec_uint4 *p = (vec_uint4 *)((unsigned char *)(c) + b);
3916 shift = 16-((int)p & 0xF);
3917 *p = spu_sel(*p,
3918 spu_slqwbyte(a, shift),
3919 spu_slqwbyte(spu_splats((unsigned int)0xFFFFFFFF), shift));
3922 static inline void vec_stvrx(vec_uint4 a, int b, vec_uint4 *c)
3924 int shift;
3925 vec_uint4 *p = (vec_uint4 *)((unsigned char *)(c) + b);
3927 shift = 16-((int)p & 0xF);
3928 *p = spu_sel(*p,
3929 spu_slqwbyte(a, shift),
3930 spu_slqwbyte(spu_splats((unsigned int)0xFFFFFFFF), shift));
3933 static inline void vec_stvrx(vec_int4 a, int b, signed int *c)
3935 int shift;
3936 vec_int4 *p = (vec_int4 *)((unsigned char *)(c) + b);
3938 shift = 16-((int)p & 0xF);
3939 *p = spu_sel(*p,
3940 spu_slqwbyte(a, shift),
3941 spu_slqwbyte(spu_splats((unsigned int)0xFFFFFFFF), shift));
3944 static inline void vec_stvrx(vec_int4 a, int b, vec_int4 *c)
3946 int shift;
3947 vec_int4 *p = (vec_int4 *)((unsigned char *)(c) + b);
3949 shift = 16-((int)p & 0xF);
3950 *p = spu_sel(*p,
3951 spu_slqwbyte(a, shift),
3952 spu_slqwbyte(spu_splats((unsigned int)0xFFFFFFFF), shift));
3955 static inline void vec_stvrx(vec_float4 a, int b, float *c)
3957 int shift;
3958 vec_float4 *p = (vec_float4 *)((unsigned char *)(c) + b);
3960 shift = 16-((int)p & 0xF);
3961 *p = spu_sel(*p,
3962 spu_slqwbyte(a, shift),
3963 spu_slqwbyte(spu_splats((unsigned int)0xFFFFFFFF), shift));
3966 static inline void vec_stvrx(vec_float4 a, int b, vec_float4 *c)
3968 int shift;
3969 vec_float4 *p = (vec_float4 *)((unsigned char *)(c) + b);
3971 shift = 16-((int)p & 0xF);
3972 *p = spu_sel(*p,
3973 spu_slqwbyte(a, shift),
3974 spu_slqwbyte(spu_splats((unsigned int)0xFFFFFFFF), shift));
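/* Usage sketch (illustrative only; the names example_store_unaligned, dst
 * and ofs are hypothetical). Pairing a vec_stvlx at an address with a
 * vec_stvrx at that address plus 16 stores all 16 bytes of v to an unaligned
 * location while preserving the surrounding bytes of both quadwords.
 */
static inline void example_store_unaligned(vec_uchar16 v, unsigned char *dst, int ofs)
{
  vec_stvlx(v, ofs, dst);        /* leading bytes of v, into the tail of the first quadword */
  vec_stvrx(v, ofs + 16, dst);   /* trailing bytes of v, into the head of the next quadword */
}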
3977 /* vec_stvrxl (store vector right indexed last)
3978 * ==========
3980 #define vec_stvrxl(_a, _b, _c) vec_stvrx(_a, _b, _c)
3983 #endif /* __SPU__ */
3984 #endif /* __cplusplus */
3985 #endif /* !_VMX2SPU_H_ */