#ifndef __SIMDIA_H__
#define __SIMDIA_H__

#if defined(__SSE2__) && !defined(_CRAYC)
#include "emmintrin.h"
#endif

#if CMK_CELL_SPE != 0
#include "spu_intrinsics.h"
#else
#include "math.h"
#endif

#if defined(__VEC__)
#include "altivec.h"
#ifdef pixel
#undef pixel
#endif
#ifdef bool
#undef bool
#endif
#endif

/* Solaris does not support sqrtf (float), so just map it to sqrt (double) instead */
#if !CMK_HAS_SQRTF
#define sqrtf(a) ((float)(sqrt((double)(a))))
#endif

/* Flags to force architecture specific SIMD instructions off */
#define SIMDIA_FORCE_NO_SSE      (0)
#define SIMDIA_FORCE_NO_ALTIVEC  (0)
#define SIMDIA_FORCE_NO_SPE_SIMD (0)

/***** Math Constants *****/
#define SIMDIA_CONSTANT_PI     (3.141592653589793)
#define SIMDIA_CONSTANT_E      (2.718281828459045)
#define SIMDIA_CONSTANT_SQRT_2 (1.414213562373095)

/* TODO | FIXME - Find platform independent way of ensuring alignment
 * (using __attribute__((aligned(XXX))) doesn't seem to work in netlrts-win and
 * netlrts-sol builds). Just to be safe since compilers should do this anyway.
 */

/* TODO | FIXME - Add a function that will test the functionality of the
 * various operations defined by these abstractions and somehow tie this test
 * into the nightly build to ensure these operations give correct results.
 */

/*******************************************************************************
 *******************************************************************************
 ***** Generic C Implementation
 *******************************************************************************
 *******************************************************************************/

/*@{*/

/* NOTE: This is declared first so any architecture specific implementations
 * can simply use the generic functions for specific data types or operations
 * that they do not implement.
 */

/***** Data Types *****/
/* NOTE (DMK): Since this is the generic implementation, arbitrarily choosing a 128 bit "vector" size. */
typedef struct __simdia_vec_i  { int    v0, v1, v2, v3; } __simdia_veci;
typedef struct __simdia_vec_f  { float  v0, v1, v2, v3; } __simdia_vecf;
typedef struct __simdia_vec_lf { double v0, v1;         } __simdia_veclf;
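
/* Illustrative example (not part of the header): each generic type behaves as
 * a small struct of lanes that the __simdia_* functions below operate on
 * element-wise, e.g.
 *   __simdia_vecf a = __simdia_vsetf(1.5f);    // {1.5f, 1.5f, 1.5f, 1.5f}
 *   __simdia_vecf b = __simdia_vaddf(a, a);    // {3.0f, 3.0f, 3.0f, 3.0f}
 *   float b0 = __simdia_vextractf(b, 0);       // 3.0f
 */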

/***** Insert *****/
inline __simdia_veci  __simdia_vinserti( __simdia_veci  v, const int    s, const int i) { __simdia_veci  r = v; int*    rPtr = (   int*)(&r); rPtr[i] = s; return r; }
inline __simdia_vecf  __simdia_vinsertf( __simdia_vecf  v, const float  s, const int i) { __simdia_vecf  r = v; float*  rPtr = ( float*)(&r); rPtr[i] = s; return r; }
inline __simdia_veclf __simdia_vinsertlf(__simdia_veclf v, const double s, const int i) { __simdia_veclf r = v; double* rPtr = (double*)(&r); rPtr[i] = s; return r; }

/***** Extract *****/
inline int    __simdia_vextracti( __simdia_veci  v, const int i) { int*    vPtr = (   int*)(&v); return vPtr[i]; }
inline float  __simdia_vextractf( __simdia_vecf  v, const int i) { float*  vPtr = ( float*)(&v); return vPtr[i]; }
inline double __simdia_vextractlf(__simdia_veclf v, const int i) { double* vPtr = (double*)(&v); return vPtr[i]; }

/***** Set *****/
inline __simdia_veci  __simdia_vseti(const int    a) { __simdia_veci  r; r.v0 = r.v1 = r.v2 = r.v3 = a; return r; }
inline __simdia_vecf  __simdia_vsetf(const float  a) { __simdia_vecf  r; r.v0 = r.v1 = r.v2 = r.v3 = a; return r; }
inline __simdia_veclf __simdia_vsetlf(const double a) { __simdia_veclf r; r.v0 = r.v1 = a; return r; }

/* NOTE: Would it be better to generate the constants instead of read them from memory in the generic version? */

/***** Constant Zero *****/
const __simdia_veci  __simdia_const_vzeroi  = {    0,    0,    0,    0 };
const __simdia_vecf  __simdia_const_vzerof  = { 0.0f, 0.0f, 0.0f, 0.0f };
const __simdia_veclf __simdia_const_vzerolf = {  0.0,  0.0 };

/***** Constant One *****/
const __simdia_veci  __simdia_const_vonei  = {    1,    1,    1,    1 };
const __simdia_vecf  __simdia_const_vonef  = { 1.0f, 1.0f, 1.0f, 1.0f };
const __simdia_veclf __simdia_const_vonelf = {  1.0,  1.0 };

/***** Constant Two *****/
const __simdia_veci  __simdia_const_vtwoi  = {    2,    2,    2,    2 };
const __simdia_vecf  __simdia_const_vtwof  = { 2.0f, 2.0f, 2.0f, 2.0f };
const __simdia_veclf __simdia_const_vtwolf = {  2.0,  2.0 };

/***** Constant Negative One *****/
const __simdia_veci  __simdia_const_vnegonei  = {    -1,    -1,    -1,    -1 };
const __simdia_vecf  __simdia_const_vnegonef  = { -1.0f, -1.0f, -1.0f, -1.0f };
const __simdia_veclf __simdia_const_vnegonelf = {  -1.0,  -1.0 };

/* TODO | FIXME - Try to create constants such that accessing them does not
 * require a memory operation (like the SSE constants).
 */

/***** Rotate *****/
inline __simdia_veci  __simdia_vrothi( const __simdia_veci  a, int s) { __simdia_veci  b; int*    a_ptr = (   int*)(&a); int*    b_ptr = (   int*)(&b); s &= 0x3; b_ptr[0] = a_ptr[(0-s)&0x3]; b_ptr[1] = a_ptr[(1-s)&0x3]; b_ptr[2] = a_ptr[(2-s)&0x3]; b_ptr[3] = a_ptr[(3-s)&0x3]; return b; }
inline __simdia_vecf  __simdia_vrothf( const __simdia_vecf  a, int s) { __simdia_vecf  b; float*  a_ptr = ( float*)(&a); float*  b_ptr = ( float*)(&b); s &= 0x3; b_ptr[0] = a_ptr[(0-s)&0x3]; b_ptr[1] = a_ptr[(1-s)&0x3]; b_ptr[2] = a_ptr[(2-s)&0x3]; b_ptr[3] = a_ptr[(3-s)&0x3]; return b; }
inline __simdia_veclf __simdia_vrothlf(const __simdia_veclf a, int s) { __simdia_veclf b; double* a_ptr = (double*)(&a); double* b_ptr = (double*)(&b); s &= 0x1; b_ptr[0] = a_ptr[(0-s)&0x1]; b_ptr[1] = a_ptr[(1-s)&0x1]; return b; }
inline __simdia_veci  __simdia_vrotli( const __simdia_veci  a, int s) { __simdia_veci  b; int*    a_ptr = (   int*)(&a); int*    b_ptr = (   int*)(&b); s &= 0x3; b_ptr[0] = a_ptr[(0+s)&0x3]; b_ptr[1] = a_ptr[(1+s)&0x3]; b_ptr[2] = a_ptr[(2+s)&0x3]; b_ptr[3] = a_ptr[(3+s)&0x3]; return b; }
inline __simdia_vecf  __simdia_vrotlf( const __simdia_vecf  a, int s) { __simdia_vecf  b; float*  a_ptr = ( float*)(&a); float*  b_ptr = ( float*)(&b); s &= 0x3; b_ptr[0] = a_ptr[(0+s)&0x3]; b_ptr[1] = a_ptr[(1+s)&0x3]; b_ptr[2] = a_ptr[(2+s)&0x3]; b_ptr[3] = a_ptr[(3+s)&0x3]; return b; }
inline __simdia_veclf __simdia_vrotllf(const __simdia_veclf a, int s) { __simdia_veclf b; double* a_ptr = (double*)(&a); double* b_ptr = (double*)(&b); s &= 0x1; b_ptr[0] = a_ptr[(0+s)&0x1]; b_ptr[1] = a_ptr[(1+s)&0x1]; return b; }
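
/* Worked example (illustrative): for a = {a0, a1, a2, a3},
 *   __simdia_vrotli(a, 1) -> {a1, a2, a3, a0}   (elements move toward index 0)
 *   __simdia_vrothi(a, 1) -> {a3, a0, a1, a2}   (elements move toward index 3)
 */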

/***** Addition *****/
inline __simdia_veci  __simdia_vaddi( const __simdia_veci  a, const __simdia_veci  b) { __simdia_veci  r; r.v0 = a.v0 + b.v0; r.v1 = a.v1 + b.v1; r.v2 = a.v2 + b.v2; r.v3 = a.v3 + b.v3; return r; }
inline __simdia_vecf  __simdia_vaddf( const __simdia_vecf  a, const __simdia_vecf  b) { __simdia_vecf  r; r.v0 = a.v0 + b.v0; r.v1 = a.v1 + b.v1; r.v2 = a.v2 + b.v2; r.v3 = a.v3 + b.v3; return r; }
inline __simdia_veclf __simdia_vaddlf(const __simdia_veclf a, const __simdia_veclf b) { __simdia_veclf r; r.v0 = a.v0 + b.v0; r.v1 = a.v1 + b.v1; return r; }

/***** Subtraction *****/
inline __simdia_veci  __simdia_vsubi( const __simdia_veci  a, const __simdia_veci  b) { __simdia_veci  r; r.v0 = a.v0 - b.v0; r.v1 = a.v1 - b.v1; r.v2 = a.v2 - b.v2; r.v3 = a.v3 - b.v3; return r; }
inline __simdia_vecf  __simdia_vsubf( const __simdia_vecf  a, const __simdia_vecf  b) { __simdia_vecf  r; r.v0 = a.v0 - b.v0; r.v1 = a.v1 - b.v1; r.v2 = a.v2 - b.v2; r.v3 = a.v3 - b.v3; return r; }
inline __simdia_veclf __simdia_vsublf(const __simdia_veclf a, const __simdia_veclf b) { __simdia_veclf r; r.v0 = a.v0 - b.v0; r.v1 = a.v1 - b.v1; return r; }

/***** Multiplication *****/
inline __simdia_veci  __simdia_vmuli( const __simdia_veci  a, const __simdia_veci  b) { __simdia_veci  r; r.v0 = a.v0 * b.v0; r.v1 = a.v1 * b.v1; r.v2 = a.v2 * b.v2; r.v3 = a.v3 * b.v3; return r; }
inline __simdia_vecf  __simdia_vmulf( const __simdia_vecf  a, const __simdia_vecf  b) { __simdia_vecf  r; r.v0 = a.v0 * b.v0; r.v1 = a.v1 * b.v1; r.v2 = a.v2 * b.v2; r.v3 = a.v3 * b.v3; return r; }
inline __simdia_veclf __simdia_vmullf(const __simdia_veclf a, const __simdia_veclf b) { __simdia_veclf r; r.v0 = a.v0 * b.v0; r.v1 = a.v1 * b.v1; return r; }

/***** Division *****/
inline __simdia_veci  __simdia_vdivi( const __simdia_veci  a, const __simdia_veci  b) { __simdia_veci  r; r.v0 = a.v0 / b.v0; r.v1 = a.v1 / b.v1; r.v2 = a.v2 / b.v2; r.v3 = a.v3 / b.v3; return r; }
inline __simdia_vecf  __simdia_vdivf( const __simdia_vecf  a, const __simdia_vecf  b) { __simdia_vecf  r; r.v0 = a.v0 / b.v0; r.v1 = a.v1 / b.v1; r.v2 = a.v2 / b.v2; r.v3 = a.v3 / b.v3; return r; }
inline __simdia_veclf __simdia_vdivlf(const __simdia_veclf a, const __simdia_veclf b) { __simdia_veclf r; r.v0 = a.v0 / b.v0; r.v1 = a.v1 / b.v1; return r; }

/***** Fused Multiply Add *****/
inline __simdia_veci  __simdia_vmaddi( const __simdia_veci  a, const __simdia_veci  b, const __simdia_veci  c) { __simdia_veci  r; r.v0 = a.v0 * b.v0 + c.v0; r.v1 = a.v1 * b.v1 + c.v1; r.v2 = a.v2 * b.v2 + c.v2; r.v3 = a.v3 * b.v3 + c.v3; return r; }
inline __simdia_vecf  __simdia_vmaddf( const __simdia_vecf  a, const __simdia_vecf  b, const __simdia_vecf  c) { __simdia_vecf  r; r.v0 = a.v0 * b.v0 + c.v0; r.v1 = a.v1 * b.v1 + c.v1; r.v2 = a.v2 * b.v2 + c.v2; r.v3 = a.v3 * b.v3 + c.v3; return r; }
inline __simdia_veclf __simdia_vmaddlf(const __simdia_veclf a, const __simdia_veclf b, const __simdia_veclf c) { __simdia_veclf r; r.v0 = a.v0 * b.v0 + c.v0; r.v1 = a.v1 * b.v1 + c.v1; return r; }
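
/* NOTE: this generic multiply-add performs a separate multiply and add, so it
 * is not actually fused; rounding may differ slightly from hardware FMA
 * instructions used by the architecture specific sections below. */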

/***** Reciprocal *****/
/* TODO | FIXME - See if there is a better way to do this (few cycles and avoid the memory load) */
inline __simdia_vecf  __simdia_vrecipf( const __simdia_vecf  a) { __simdia_vecf  r; r.v0 = 1.0f / a.v0; r.v1 = 1.0f / a.v1; r.v2 = 1.0f / a.v2; r.v3 = 1.0f / a.v3; return r; }
inline __simdia_veclf __simdia_vreciplf(const __simdia_veclf a) { __simdia_veclf r; r.v0 = 1.0 / a.v0; r.v1 = 1.0 / a.v1; return r; }

/***** Square Root *****/
inline __simdia_vecf  __simdia_vsqrtf( const __simdia_vecf  a) { __simdia_vecf  r; r.v0 = sqrtf(a.v0); r.v1 = sqrtf(a.v1); r.v2 = sqrtf(a.v2); r.v3 = sqrtf(a.v3); return r; }
inline __simdia_veclf __simdia_vsqrtlf(const __simdia_veclf a) { __simdia_veclf r; r.v0 = sqrt(a.v0); r.v1 = sqrt(a.v1); return r; }

/***** Reciprocal Square Root *****/
inline __simdia_vecf  __simdia_vrsqrtf( const __simdia_vecf  a) { __simdia_vecf  r; r.v0 = 1.0f / sqrtf(a.v0); r.v1 = 1.0f / sqrtf(a.v1); r.v2 = 1.0f / sqrtf(a.v2); r.v3 = 1.0f / sqrtf(a.v3); return r; }
inline __simdia_veclf __simdia_vrsqrtlf(const __simdia_veclf a) { __simdia_veclf r; r.v0 = 1.0 / sqrt(a.v0); r.v1 = 1.0 / sqrt(a.v1); return r; }

/***** Not *****/
inline __simdia_veci  __simdia_vnoti( const __simdia_veci  a) { __simdia_veci  r; int* rPtr = (int*)(&r); int* aPtr = (int*)(&a); rPtr[0] = aPtr[0] ^ -1; rPtr[1] = aPtr[1] ^ -1; rPtr[2] = aPtr[2] ^ -1; rPtr[3] = aPtr[3] ^ -1; return r; }
inline __simdia_vecf  __simdia_vnotf( const __simdia_vecf  a) { __simdia_vecf  r; int* rPtr = (int*)(&r); int* aPtr = (int*)(&a); rPtr[0] = aPtr[0] ^ -1; rPtr[1] = aPtr[1] ^ -1; rPtr[2] = aPtr[2] ^ -1; rPtr[3] = aPtr[3] ^ -1; return r; }
inline __simdia_veclf __simdia_vnotlf(const __simdia_veclf a) { __simdia_veclf r; int* rPtr = (int*)(&r); int* aPtr = (int*)(&a); rPtr[0] = aPtr[0] ^ -1; rPtr[1] = aPtr[1] ^ -1; rPtr[2] = aPtr[2] ^ -1; rPtr[3] = aPtr[3] ^ -1; return r; }

/***** Or *****/
inline __simdia_veci  __simdia_vori( const __simdia_veci  a, const __simdia_veci  b) { __simdia_veci  r; int* rPtr = (int*)(&r); int* aPtr = (int*)(&a); int* bPtr = (int*)(&b); rPtr[0] = aPtr[0] | bPtr[0]; rPtr[1] = aPtr[1] | bPtr[1]; rPtr[2] = aPtr[2] | bPtr[2]; rPtr[3] = aPtr[3] | bPtr[3]; return r; }
inline __simdia_vecf  __simdia_vorf( const __simdia_vecf  a, const __simdia_vecf  b) { __simdia_vecf  r; int* rPtr = (int*)(&r); int* aPtr = (int*)(&a); int* bPtr = (int*)(&b); rPtr[0] = aPtr[0] | bPtr[0]; rPtr[1] = aPtr[1] | bPtr[1]; rPtr[2] = aPtr[2] | bPtr[2]; rPtr[3] = aPtr[3] | bPtr[3]; return r; }
inline __simdia_veclf __simdia_vorlf(const __simdia_veclf a, const __simdia_veclf b) { __simdia_veclf r; int* rPtr = (int*)(&r); int* aPtr = (int*)(&a); int* bPtr = (int*)(&b); rPtr[0] = aPtr[0] | bPtr[0]; rPtr[1] = aPtr[1] | bPtr[1]; rPtr[2] = aPtr[2] | bPtr[2]; rPtr[3] = aPtr[3] | bPtr[3]; return r; }

/***** Nor *****/
inline __simdia_veci  __simdia_vnori( const __simdia_veci  a, const __simdia_veci  b) { __simdia_veci  r; int* rPtr = (int*)(&r); int* aPtr = (int*)(&a); int* bPtr = (int*)(&b); rPtr[0] = (aPtr[0] | bPtr[0]) ^ -1; rPtr[1] = (aPtr[1] | bPtr[1]) ^ -1; rPtr[2] = (aPtr[2] | bPtr[2]) ^ -1; rPtr[3] = (aPtr[3] | bPtr[3]) ^ -1; return r; }
inline __simdia_vecf  __simdia_vnorf( const __simdia_vecf  a, const __simdia_vecf  b) { __simdia_vecf  r; int* rPtr = (int*)(&r); int* aPtr = (int*)(&a); int* bPtr = (int*)(&b); rPtr[0] = (aPtr[0] | bPtr[0]) ^ -1; rPtr[1] = (aPtr[1] | bPtr[1]) ^ -1; rPtr[2] = (aPtr[2] | bPtr[2]) ^ -1; rPtr[3] = (aPtr[3] | bPtr[3]) ^ -1; return r; }
inline __simdia_veclf __simdia_vnorlf(const __simdia_veclf a, const __simdia_veclf b) { __simdia_veclf r; int* rPtr = (int*)(&r); int* aPtr = (int*)(&a); int* bPtr = (int*)(&b); rPtr[0] = (aPtr[0] | bPtr[0]) ^ -1; rPtr[1] = (aPtr[1] | bPtr[1]) ^ -1; rPtr[2] = (aPtr[2] | bPtr[2]) ^ -1; rPtr[3] = (aPtr[3] | bPtr[3]) ^ -1; return r; }

/***** And *****/
inline __simdia_veci  __simdia_vandi( const __simdia_veci  a, const __simdia_veci  b) { __simdia_veci  r; int* rPtr = (int*)(&r); int* aPtr = (int*)(&a); int* bPtr = (int*)(&b); rPtr[0] = aPtr[0] & bPtr[0]; rPtr[1] = aPtr[1] & bPtr[1]; rPtr[2] = aPtr[2] & bPtr[2]; rPtr[3] = aPtr[3] & bPtr[3]; return r; }
inline __simdia_vecf  __simdia_vandf( const __simdia_vecf  a, const __simdia_vecf  b) { __simdia_vecf  r; int* rPtr = (int*)(&r); int* aPtr = (int*)(&a); int* bPtr = (int*)(&b); rPtr[0] = aPtr[0] & bPtr[0]; rPtr[1] = aPtr[1] & bPtr[1]; rPtr[2] = aPtr[2] & bPtr[2]; rPtr[3] = aPtr[3] & bPtr[3]; return r; }
inline __simdia_veclf __simdia_vandlf(const __simdia_veclf a, const __simdia_veclf b) { __simdia_veclf r; int* rPtr = (int*)(&r); int* aPtr = (int*)(&a); int* bPtr = (int*)(&b); rPtr[0] = aPtr[0] & bPtr[0]; rPtr[1] = aPtr[1] & bPtr[1]; rPtr[2] = aPtr[2] & bPtr[2]; rPtr[3] = aPtr[3] & bPtr[3]; return r; }

/***** Nand *****/
inline __simdia_veci  __simdia_vnandi( const __simdia_veci  a, const __simdia_veci  b) { __simdia_veci  r; int* rPtr = (int*)(&r); int* aPtr = (int*)(&a); int* bPtr = (int*)(&b); rPtr[0] = (aPtr[0] & bPtr[0]) ^ -1; rPtr[1] = (aPtr[1] & bPtr[1]) ^ -1; rPtr[2] = (aPtr[2] & bPtr[2]) ^ -1; rPtr[3] = (aPtr[3] & bPtr[3]) ^ -1; return r; }
inline __simdia_vecf  __simdia_vnandf( const __simdia_vecf  a, const __simdia_vecf  b) { __simdia_vecf  r; int* rPtr = (int*)(&r); int* aPtr = (int*)(&a); int* bPtr = (int*)(&b); rPtr[0] = (aPtr[0] & bPtr[0]) ^ -1; rPtr[1] = (aPtr[1] & bPtr[1]) ^ -1; rPtr[2] = (aPtr[2] & bPtr[2]) ^ -1; rPtr[3] = (aPtr[3] & bPtr[3]) ^ -1; return r; }
inline __simdia_veclf __simdia_vnandlf(const __simdia_veclf a, const __simdia_veclf b) { __simdia_veclf r; int* rPtr = (int*)(&r); int* aPtr = (int*)(&a); int* bPtr = (int*)(&b); rPtr[0] = (aPtr[0] & bPtr[0]) ^ -1; rPtr[1] = (aPtr[1] & bPtr[1]) ^ -1; rPtr[2] = (aPtr[2] & bPtr[2]) ^ -1; rPtr[3] = (aPtr[3] & bPtr[3]) ^ -1; return r; }

/***** Xor *****/
inline __simdia_veci  __simdia_vxori( const __simdia_veci  a, const __simdia_veci  b) { __simdia_veci  r; int* rPtr = (int*)(&r); int* aPtr = (int*)(&a); int* bPtr = (int*)(&b); rPtr[0] = aPtr[0] ^ bPtr[0]; rPtr[1] = aPtr[1] ^ bPtr[1]; rPtr[2] = aPtr[2] ^ bPtr[2]; rPtr[3] = aPtr[3] ^ bPtr[3]; return r; }
inline __simdia_vecf  __simdia_vxorf( const __simdia_vecf  a, const __simdia_vecf  b) { __simdia_vecf  r; int* rPtr = (int*)(&r); int* aPtr = (int*)(&a); int* bPtr = (int*)(&b); rPtr[0] = aPtr[0] ^ bPtr[0]; rPtr[1] = aPtr[1] ^ bPtr[1]; rPtr[2] = aPtr[2] ^ bPtr[2]; rPtr[3] = aPtr[3] ^ bPtr[3]; return r; }
inline __simdia_veclf __simdia_vxorlf(const __simdia_veclf a, const __simdia_veclf b) { __simdia_veclf r; int* rPtr = (int*)(&r); int* aPtr = (int*)(&a); int* bPtr = (int*)(&b); rPtr[0] = aPtr[0] ^ bPtr[0]; rPtr[1] = aPtr[1] ^ bPtr[1]; rPtr[2] = aPtr[2] ^ bPtr[2]; rPtr[3] = aPtr[3] ^ bPtr[3]; return r; }

/***** Nxor *****/
inline __simdia_veci  __simdia_vnxori( const __simdia_veci  a, const __simdia_veci  b) { __simdia_veci  r; int* rPtr = (int*)(&r); int* aPtr = (int*)(&a); int* bPtr = (int*)(&b); rPtr[0] = (aPtr[0] ^ bPtr[0]) ^ -1; rPtr[1] = (aPtr[1] ^ bPtr[1]) ^ -1; rPtr[2] = (aPtr[2] ^ bPtr[2]) ^ -1; rPtr[3] = (aPtr[3] ^ bPtr[3]) ^ -1; return r; }
inline __simdia_vecf  __simdia_vnxorf( const __simdia_vecf  a, const __simdia_vecf  b) { __simdia_vecf  r; int* rPtr = (int*)(&r); int* aPtr = (int*)(&a); int* bPtr = (int*)(&b); rPtr[0] = (aPtr[0] ^ bPtr[0]) ^ -1; rPtr[1] = (aPtr[1] ^ bPtr[1]) ^ -1; rPtr[2] = (aPtr[2] ^ bPtr[2]) ^ -1; rPtr[3] = (aPtr[3] ^ bPtr[3]) ^ -1; return r; }
inline __simdia_veclf __simdia_vnxorlf(const __simdia_veclf a, const __simdia_veclf b) { __simdia_veclf r; int* rPtr = (int*)(&r); int* aPtr = (int*)(&a); int* bPtr = (int*)(&b); rPtr[0] = (aPtr[0] ^ bPtr[0]) ^ -1; rPtr[1] = (aPtr[1] ^ bPtr[1]) ^ -1; rPtr[2] = (aPtr[2] ^ bPtr[2]) ^ -1; rPtr[3] = (aPtr[3] ^ bPtr[3]) ^ -1; return r; }

/* TODO | FIXME - Try to do the comparisons in a branchless way */

/***** Equal To *****/
inline __simdia_veci __simdia_vcmpeqi( const __simdia_veci  a, const __simdia_veci  b) { __simdia_veci r; r.v0 = ((a.v0 == b.v0) ? (0xFFFFFFFF) : (0x0)); r.v1 = ((a.v1 == b.v1) ? (0xFFFFFFFF) : (0x0)); r.v2 = ((a.v2 == b.v2) ? (0xFFFFFFFF) : (0x0)); r.v3 = ((a.v3 == b.v3) ? (0xFFFFFFFF) : (0x0)); return r; }
inline __simdia_veci __simdia_vcmpeqf( const __simdia_vecf  a, const __simdia_vecf  b) { __simdia_veci r; r.v0 = ((a.v0 == b.v0) ? (0xFFFFFFFF) : (0x0)); r.v1 = ((a.v1 == b.v1) ? (0xFFFFFFFF) : (0x0)); r.v2 = ((a.v2 == b.v2) ? (0xFFFFFFFF) : (0x0)); r.v3 = ((a.v3 == b.v3) ? (0xFFFFFFFF) : (0x0)); return r; }
inline __simdia_veci __simdia_vcmpeqlf(const __simdia_veclf a, const __simdia_veclf b) { __simdia_veci r; r.v0 = r.v1 = ((a.v0 == b.v0) ? (0xFFFFFFFF) : (0x0)); r.v2 = r.v3 = ((a.v1 == b.v1) ? (0xFFFFFFFF) : (0x0)); return r; }

/***** Greater Than *****/
inline __simdia_veci __simdia_vcmpgti( const __simdia_veci  a, const __simdia_veci  b) { __simdia_veci r; r.v0 = ((a.v0 > b.v0) ? (0xFFFFFFFF) : (0x0)); r.v1 = ((a.v1 > b.v1) ? (0xFFFFFFFF) : (0x0)); r.v2 = ((a.v2 > b.v2) ? (0xFFFFFFFF) : (0x0)); r.v3 = ((a.v3 > b.v3) ? (0xFFFFFFFF) : (0x0)); return r; }
inline __simdia_veci __simdia_vcmpgtf( const __simdia_vecf  a, const __simdia_vecf  b) { __simdia_veci r; r.v0 = ((a.v0 > b.v0) ? (0xFFFFFFFF) : (0x0)); r.v1 = ((a.v1 > b.v1) ? (0xFFFFFFFF) : (0x0)); r.v2 = ((a.v2 > b.v2) ? (0xFFFFFFFF) : (0x0)); r.v3 = ((a.v3 > b.v3) ? (0xFFFFFFFF) : (0x0)); return r; }
inline __simdia_veci __simdia_vcmpgtlf(const __simdia_veclf a, const __simdia_veclf b) { __simdia_veci r; r.v0 = r.v1 = ((a.v0 > b.v0) ? (0xFFFFFFFF) : (0x0)); r.v2 = r.v3 = ((a.v1 > b.v1) ? (0xFFFFFFFF) : (0x0)); return r; }

/***** Greater Than Or Equal To *****/
inline __simdia_veci __simdia_vcmpgei( const __simdia_veci  a, const __simdia_veci  b) { __simdia_veci r; r.v0 = ((a.v0 >= b.v0) ? (0xFFFFFFFF) : (0x0)); r.v1 = ((a.v1 >= b.v1) ? (0xFFFFFFFF) : (0x0)); r.v2 = ((a.v2 >= b.v2) ? (0xFFFFFFFF) : (0x0)); r.v3 = ((a.v3 >= b.v3) ? (0xFFFFFFFF) : (0x0)); return r; }
inline __simdia_veci __simdia_vcmpgef( const __simdia_vecf  a, const __simdia_vecf  b) { __simdia_veci r; r.v0 = ((a.v0 >= b.v0) ? (0xFFFFFFFF) : (0x0)); r.v1 = ((a.v1 >= b.v1) ? (0xFFFFFFFF) : (0x0)); r.v2 = ((a.v2 >= b.v2) ? (0xFFFFFFFF) : (0x0)); r.v3 = ((a.v3 >= b.v3) ? (0xFFFFFFFF) : (0x0)); return r; }
inline __simdia_veci __simdia_vcmpgelf(const __simdia_veclf a, const __simdia_veclf b) { __simdia_veci r; r.v0 = r.v1 = ((a.v0 >= b.v0) ? (0xFFFFFFFF) : (0x0)); r.v2 = r.v3 = ((a.v1 >= b.v1) ? (0xFFFFFFFF) : (0x0)); return r; }

/***** Less Than *****/
inline __simdia_veci __simdia_vcmplti( const __simdia_veci  a, const __simdia_veci  b) { __simdia_veci r; r.v0 = ((a.v0 < b.v0) ? (0xFFFFFFFF) : (0x0)); r.v1 = ((a.v1 < b.v1) ? (0xFFFFFFFF) : (0x0)); r.v2 = ((a.v2 < b.v2) ? (0xFFFFFFFF) : (0x0)); r.v3 = ((a.v3 < b.v3) ? (0xFFFFFFFF) : (0x0)); return r; }
inline __simdia_veci __simdia_vcmpltf( const __simdia_vecf  a, const __simdia_vecf  b) { __simdia_veci r; r.v0 = ((a.v0 < b.v0) ? (0xFFFFFFFF) : (0x0)); r.v1 = ((a.v1 < b.v1) ? (0xFFFFFFFF) : (0x0)); r.v2 = ((a.v2 < b.v2) ? (0xFFFFFFFF) : (0x0)); r.v3 = ((a.v3 < b.v3) ? (0xFFFFFFFF) : (0x0)); return r; }
inline __simdia_veci __simdia_vcmpltlf(const __simdia_veclf a, const __simdia_veclf b) { __simdia_veci r; r.v0 = r.v1 = ((a.v0 < b.v0) ? (0xFFFFFFFF) : (0x0)); r.v2 = r.v3 = ((a.v1 < b.v1) ? (0xFFFFFFFF) : (0x0)); return r; }

/***** Less Than Or Equal To *****/
inline __simdia_veci __simdia_vcmplei( const __simdia_veci  a, const __simdia_veci  b) { __simdia_veci r; r.v0 = ((a.v0 <= b.v0) ? (0xFFFFFFFF) : (0x0)); r.v1 = ((a.v1 <= b.v1) ? (0xFFFFFFFF) : (0x0)); r.v2 = ((a.v2 <= b.v2) ? (0xFFFFFFFF) : (0x0)); r.v3 = ((a.v3 <= b.v3) ? (0xFFFFFFFF) : (0x0)); return r; }
inline __simdia_veci __simdia_vcmplef( const __simdia_vecf  a, const __simdia_vecf  b) { __simdia_veci r; r.v0 = ((a.v0 <= b.v0) ? (0xFFFFFFFF) : (0x0)); r.v1 = ((a.v1 <= b.v1) ? (0xFFFFFFFF) : (0x0)); r.v2 = ((a.v2 <= b.v2) ? (0xFFFFFFFF) : (0x0)); r.v3 = ((a.v3 <= b.v3) ? (0xFFFFFFFF) : (0x0)); return r; }
inline __simdia_veci __simdia_vcmplelf(const __simdia_veclf a, const __simdia_veclf b) { __simdia_veci r; r.v0 = r.v1 = ((a.v0 <= b.v0) ? (0xFFFFFFFF) : (0x0)); r.v2 = r.v3 = ((a.v1 <= b.v1) ? (0xFFFFFFFF) : (0x0)); return r; }
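
/* The comparisons return per-element masks (all ones for true, all zeros for
 * false), so results can be selected without branches.  Minimal sketch
 * (illustrative only, not part of the API):
 *   __simdia_veci mask = __simdia_vcmpgti(a, b);
 *   __simdia_veci max  = __simdia_vori(__simdia_vandi(mask, a),
 *                                      __simdia_vandi(__simdia_vnoti(mask), b));
 */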

/*******************************************************************************
 ***** C++ Operators for Generic Implementation
 *******************************************************************************/
#if defined(__cplusplus)
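
/* Example (illustrative): with the overloads below, generic vector code can be
 * written in a scalar-like style:
 *   __simdia_vecf x = __simdia_vsetf(2.0f);
 *   __simdia_vecf y = __simdia_vsetf(3.0f);
 *   x += y;           // element-wise add
 *   x  = x * 0.5f;    // the scalar operand is broadcast via __simdia_vsetf
 */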

/***** Addition *****/
inline __simdia_veci  operator+(const __simdia_veci  &a, const __simdia_veci  &b) { return __simdia_vaddi(a, b); }
inline __simdia_vecf  operator+(const __simdia_vecf  &a, const __simdia_vecf  &b) { return __simdia_vaddf(a, b); }
inline __simdia_veclf operator+(const __simdia_veclf &a, const __simdia_veclf &b) { return __simdia_vaddlf(a, b); }
inline __simdia_veci  operator+=( __simdia_veci  &a, const __simdia_veci  &b) { a = __simdia_vaddi(a, b); return a; }
inline __simdia_vecf  operator+=( __simdia_vecf  &a, const __simdia_vecf  &b) { a = __simdia_vaddf(a, b); return a; }
inline __simdia_veclf operator+=(__simdia_veclf &a, const __simdia_veclf &b) { a = __simdia_vaddlf(a, b); return a; }

inline __simdia_veci  operator+(const __simdia_veci  &a, const int    &b) { return __simdia_vaddi(a, __simdia_vseti(b)); }
inline __simdia_vecf  operator+(const __simdia_vecf  &a, const float  &b) { return __simdia_vaddf(a, __simdia_vsetf(b)); }
inline __simdia_veclf operator+(const __simdia_veclf &a, const double &b) { return __simdia_vaddlf(a, __simdia_vsetlf(b)); }
inline __simdia_veci  operator+=( __simdia_veci  &a, const int    &b) { a = __simdia_vaddi(a, __simdia_vseti(b)); return a; }
inline __simdia_vecf  operator+=( __simdia_vecf  &a, const float  &b) { a = __simdia_vaddf(a, __simdia_vsetf(b)); return a; }
inline __simdia_veclf operator+=(__simdia_veclf &a, const double &b) { a = __simdia_vaddlf(a, __simdia_vsetlf(b)); return a; }

/***** Subtraction *****/
inline __simdia_veci  operator-(const __simdia_veci  &a, const __simdia_veci  &b) { return __simdia_vsubi(a, b); }
inline __simdia_vecf  operator-(const __simdia_vecf  &a, const __simdia_vecf  &b) { return __simdia_vsubf(a, b); }
inline __simdia_veclf operator-(const __simdia_veclf &a, const __simdia_veclf &b) { return __simdia_vsublf(a, b); }
inline __simdia_veci  operator-=( __simdia_veci  &a, const __simdia_veci  &b) { a = __simdia_vsubi(a, b); return a; }
inline __simdia_vecf  operator-=( __simdia_vecf  &a, const __simdia_vecf  &b) { a = __simdia_vsubf(a, b); return a; }
inline __simdia_veclf operator-=(__simdia_veclf &a, const __simdia_veclf &b) { a = __simdia_vsublf(a, b); return a; }

inline __simdia_veci  operator-(const __simdia_veci  &a, const int    &b) { return __simdia_vsubi(a, __simdia_vseti(b)); }
inline __simdia_vecf  operator-(const __simdia_vecf  &a, const float  &b) { return __simdia_vsubf(a, __simdia_vsetf(b)); }
inline __simdia_veclf operator-(const __simdia_veclf &a, const double &b) { return __simdia_vsublf(a, __simdia_vsetlf(b)); }
inline __simdia_veci  operator-=( __simdia_veci  &a, const int    &b) { a = __simdia_vsubi(a, __simdia_vseti(b)); return a; }
inline __simdia_vecf  operator-=( __simdia_vecf  &a, const float  &b) { a = __simdia_vsubf(a, __simdia_vsetf(b)); return a; }
inline __simdia_veclf operator-=(__simdia_veclf &a, const double &b) { a = __simdia_vsublf(a, __simdia_vsetlf(b)); return a; }

/***** Multiplication *****/
inline __simdia_vecf  operator*(const __simdia_vecf  &a, const __simdia_vecf  &b) { return __simdia_vmulf(a, b); }
inline __simdia_veclf operator*(const __simdia_veclf &a, const __simdia_veclf &b) { return __simdia_vmullf(a, b); }
inline __simdia_vecf  operator*=( __simdia_vecf  &a, const __simdia_vecf  &b) { a = __simdia_vmulf(a, b); return a; }
inline __simdia_veclf operator*=(__simdia_veclf &a, const __simdia_veclf &b) { a = __simdia_vmullf(a, b); return a; }

inline __simdia_vecf  operator*(const __simdia_vecf  &a, const float  &b) { return __simdia_vmulf(a, __simdia_vsetf(b)); }
inline __simdia_veclf operator*(const __simdia_veclf &a, const double &b) { return __simdia_vmullf(a, __simdia_vsetlf(b)); }
inline __simdia_vecf  operator*=( __simdia_vecf  &a, const float  &b) { a = __simdia_vmulf(a, __simdia_vsetf(b)); return a; }
inline __simdia_veclf operator*=(__simdia_veclf &a, const double &b) { a = __simdia_vmullf(a, __simdia_vsetlf(b)); return a; }

/***** Division *****/
inline __simdia_vecf  operator/(const __simdia_vecf  &a, const __simdia_vecf  &b) { return __simdia_vdivf(a, b); }
inline __simdia_veclf operator/(const __simdia_veclf &a, const __simdia_veclf &b) { return __simdia_vdivlf(a, b); }
inline __simdia_vecf  operator/=( __simdia_vecf  &a, const __simdia_vecf  &b) { a = __simdia_vdivf(a, b); return a; }
inline __simdia_veclf operator/=(__simdia_veclf &a, const __simdia_veclf &b) { a = __simdia_vdivlf(a, b); return a; }

inline __simdia_vecf  operator/(const __simdia_vecf  &a, const float  &b) { return __simdia_vdivf(a, __simdia_vsetf(b)); }
inline __simdia_veclf operator/(const __simdia_veclf &a, const double &b) { return __simdia_vdivlf(a, __simdia_vsetlf(b)); }
inline __simdia_vecf  operator/=( __simdia_vecf  &a, const float  &b) { a = __simdia_vdivf(a, __simdia_vsetf(b)); return a; }
inline __simdia_veclf operator/=(__simdia_veclf &a, const double &b) { a = __simdia_vdivlf(a, __simdia_vsetlf(b)); return a; }

/***** Or *****/
inline __simdia_veci  operator|(const __simdia_veci  &a, const __simdia_veci  &b) { return __simdia_vori(a, b); }
inline __simdia_vecf  operator|(const __simdia_vecf  &a, const __simdia_vecf  &b) { return __simdia_vorf(a, b); }
inline __simdia_veclf operator|(const __simdia_veclf &a, const __simdia_veclf &b) { return __simdia_vorlf(a, b); }
inline __simdia_veci  operator|=( __simdia_veci  &a, const __simdia_veci  &b) { a = __simdia_vori(a, b); return a; }
inline __simdia_vecf  operator|=( __simdia_vecf  &a, const __simdia_vecf  &b) { a = __simdia_vorf(a, b); return a; }
inline __simdia_veclf operator|=(__simdia_veclf &a, const __simdia_veclf &b) { a = __simdia_vorlf(a, b); return a; }

inline __simdia_veci  operator|(const __simdia_veci  &a, const int    &b) { return __simdia_vori(a, __simdia_vseti(b)); }
inline __simdia_vecf  operator|(const __simdia_vecf  &a, const float  &b) { return __simdia_vorf(a, __simdia_vsetf(b)); }
inline __simdia_veclf operator|(const __simdia_veclf &a, const double &b) { return __simdia_vorlf(a, __simdia_vsetlf(b)); }
inline __simdia_veci  operator|=( __simdia_veci  &a, const int    &b) { a = __simdia_vori(a, __simdia_vseti(b)); return a; }
inline __simdia_vecf  operator|=( __simdia_vecf  &a, const float  &b) { a = __simdia_vorf(a, __simdia_vsetf(b)); return a; }
inline __simdia_veclf operator|=(__simdia_veclf &a, const double &b) { a = __simdia_vorlf(a, __simdia_vsetlf(b)); return a; }

/***** And *****/
inline __simdia_veci  operator&(const __simdia_veci  &a, const __simdia_veci  &b) { return __simdia_vandi(a, b); }
inline __simdia_vecf  operator&(const __simdia_vecf  &a, const __simdia_vecf  &b) { return __simdia_vandf(a, b); }
inline __simdia_veclf operator&(const __simdia_veclf &a, const __simdia_veclf &b) { return __simdia_vandlf(a, b); }
inline __simdia_veci  operator&=( __simdia_veci  &a, const __simdia_veci  &b) { a = __simdia_vandi(a, b); return a; }
inline __simdia_vecf  operator&=( __simdia_vecf  &a, const __simdia_vecf  &b) { a = __simdia_vandf(a, b); return a; }
inline __simdia_veclf operator&=(__simdia_veclf &a, const __simdia_veclf &b) { a = __simdia_vandlf(a, b); return a; }

inline __simdia_veci  operator&(const __simdia_veci  &a, const int    &b) { return __simdia_vandi(a, __simdia_vseti(b)); }
inline __simdia_vecf  operator&(const __simdia_vecf  &a, const float  &b) { return __simdia_vandf(a, __simdia_vsetf(b)); }
inline __simdia_veclf operator&(const __simdia_veclf &a, const double &b) { return __simdia_vandlf(a, __simdia_vsetlf(b)); }
inline __simdia_veci  operator&=( __simdia_veci  &a, const int    &b) { a = __simdia_vandi(a, __simdia_vseti(b)); return a; }
inline __simdia_vecf  operator&=( __simdia_vecf  &a, const float  &b) { a = __simdia_vandf(a, __simdia_vsetf(b)); return a; }
inline __simdia_veclf operator&=(__simdia_veclf &a, const double &b) { a = __simdia_vandlf(a, __simdia_vsetlf(b)); return a; }

/***** Xor *****/
inline __simdia_veci  operator^(const __simdia_veci  &a, const __simdia_veci  &b) { return __simdia_vxori(a, b); }
inline __simdia_vecf  operator^(const __simdia_vecf  &a, const __simdia_vecf  &b) { return __simdia_vxorf(a, b); }
inline __simdia_veclf operator^(const __simdia_veclf &a, const __simdia_veclf &b) { return __simdia_vxorlf(a, b); }
inline __simdia_veci  operator^=( __simdia_veci  &a, const __simdia_veci  &b) { a = __simdia_vxori(a, b); return a; }
inline __simdia_vecf  operator^=( __simdia_vecf  &a, const __simdia_vecf  &b) { a = __simdia_vxorf(a, b); return a; }
inline __simdia_veclf operator^=(__simdia_veclf &a, const __simdia_veclf &b) { a = __simdia_vxorlf(a, b); return a; }

inline __simdia_veci  operator^(const __simdia_veci  &a, const int    &b) { return __simdia_vxori(a, __simdia_vseti(b)); }
inline __simdia_vecf  operator^(const __simdia_vecf  &a, const float  &b) { return __simdia_vxorf(a, __simdia_vsetf(b)); }
inline __simdia_veclf operator^(const __simdia_veclf &a, const double &b) { return __simdia_vxorlf(a, __simdia_vsetlf(b)); }
inline __simdia_veci  operator^=( __simdia_veci  &a, const int    &b) { a = __simdia_vxori(a, __simdia_vseti(b)); return a; }
inline __simdia_vecf  operator^=( __simdia_vecf  &a, const float  &b) { a = __simdia_vxorf(a, __simdia_vsetf(b)); return a; }
inline __simdia_veclf operator^=(__simdia_veclf &a, const double &b) { a = __simdia_vxorlf(a, __simdia_vsetlf(b)); return a; }

#endif /* defined(__cplusplus) */

/*@}*/

/*******************************************************************************
 *******************************************************************************
 ***** SSE Support
 *******************************************************************************
 *******************************************************************************/
#if defined(__SSE2__) && (!(SIMDIA_FORCE_NO_SSE)) && !defined(_CRAYC)

/* NOTE | TODO | FIXME : Add checks for various versions of SSE.  For now, only
 * SSE2 is supported and assumed to be the minimum level available.
 */

/***** Data Types *****/
typedef __m128i simdia_veci;
typedef __m128  simdia_vecf;
typedef __m128d simdia_veclf;

/***** Insert *****/
/* TODO | FIXME - Try to make these functions not reference memory so values stay in registers */
inline simdia_veci  simdia_vinserti( simdia_veci  v, const int    s, const int i) { simdia_veci  r = v; int*    rPtr = (   int*)(&r); rPtr[i] = s; return r; }
inline simdia_vecf  simdia_vinsertf( simdia_vecf  v, const float  s, const int i) { simdia_vecf  r = v; float*  rPtr = ( float*)(&r); rPtr[i] = s; return r; }
inline simdia_veclf simdia_vinsertlf(simdia_veclf v, const double s, const int i) { simdia_veclf r = v; double* rPtr = (double*)(&r); rPtr[i] = s; return r; }

/***** Extract *****/
/* TODO | FIXME - Try to make these functions not reference memory so values stay in registers */
inline int    simdia_vextracti( simdia_veci  v, const int i) { return ((   int*)(&v))[i]; }
inline float  simdia_vextractf( simdia_vecf  v, const int i) { return (( float*)(&v))[i]; }
inline double simdia_vextractlf(simdia_veclf v, const int i) { return ((double*)(&v))[i]; }

/***** Set *****/
#define simdia_vseti(a)  (_mm_set1_epi32((int)(a)))
#define simdia_vsetf(a)  (_mm_set1_ps((float)(a)))
#define simdia_vsetlf(a) (_mm_set1_pd((double)(a)))

/***** Constant Zero *****/
#define simdia_const_vzeroi  (_mm_setzero_si128())
#define simdia_const_vzerof  (_mm_setzero_ps())
#define simdia_const_vzerolf (_mm_setzero_pd())

/***** Constant One *****/
#define simdia_const_vonei  (simdia_vseti(1))
#define simdia_const_vonef  (simdia_vsetf(1.0f))
#define simdia_const_vonelf (simdia_vsetlf(1.0))

/***** Constant Two *****/
#define simdia_const_vtwoi  (simdia_vseti(2))
#define simdia_const_vtwof  (simdia_vsetf(2.0f))
#define simdia_const_vtwolf (simdia_vsetlf(2.0))

/***** Constant Negative One *****/
#define simdia_const_vnegonei  (simdia_vseti(-1))
#define simdia_const_vnegonef  (simdia_vsetf(-1.0f))
#define simdia_const_vnegonelf (simdia_vsetlf(-1.0))

/***** Rotate *****/
/* TODO : FIXME - Find a better way to do Rotate in SSE */
inline simdia_veci  simdia_vrothi( const simdia_veci  &a, int s) { simdia_veci  b; int*    a_ptr = (   int*)(&a); int*    b_ptr = (   int*)(&b); s &= 0x3; b_ptr[0] = a_ptr[(0-s)&0x3]; b_ptr[1] = a_ptr[(1-s)&0x3]; b_ptr[2] = a_ptr[(2-s)&0x3]; b_ptr[3] = a_ptr[(3-s)&0x3]; return b; }
inline simdia_vecf  simdia_vrothf( const simdia_vecf  &a, int s) { simdia_vecf  b; float*  a_ptr = ( float*)(&a); float*  b_ptr = ( float*)(&b); s &= 0x3; b_ptr[0] = a_ptr[(0-s)&0x3]; b_ptr[1] = a_ptr[(1-s)&0x3]; b_ptr[2] = a_ptr[(2-s)&0x3]; b_ptr[3] = a_ptr[(3-s)&0x3]; return b; }
inline simdia_veclf simdia_vrothlf(const simdia_veclf &a, int s) { simdia_veclf b; double* a_ptr = (double*)(&a); double* b_ptr = (double*)(&b); s &= 0x1; b_ptr[0] = a_ptr[(0-s)&0x1]; b_ptr[1] = a_ptr[(1-s)&0x1]; return b; }
inline simdia_veci  simdia_vrotli( const simdia_veci  &a, int s) { simdia_veci  b; int*    a_ptr = (   int*)(&a); int*    b_ptr = (   int*)(&b); s &= 0x3; b_ptr[0] = a_ptr[(0+s)&0x3]; b_ptr[1] = a_ptr[(1+s)&0x3]; b_ptr[2] = a_ptr[(2+s)&0x3]; b_ptr[3] = a_ptr[(3+s)&0x3]; return b; }
inline simdia_vecf  simdia_vrotlf( const simdia_vecf  &a, int s) { simdia_vecf  b; float*  a_ptr = ( float*)(&a); float*  b_ptr = ( float*)(&b); s &= 0x3; b_ptr[0] = a_ptr[(0+s)&0x3]; b_ptr[1] = a_ptr[(1+s)&0x3]; b_ptr[2] = a_ptr[(2+s)&0x3]; b_ptr[3] = a_ptr[(3+s)&0x3]; return b; }
inline simdia_veclf simdia_vrotllf(const simdia_veclf &a, int s) { simdia_veclf b; double* a_ptr = (double*)(&a); double* b_ptr = (double*)(&b); s &= 0x1; b_ptr[0] = a_ptr[(0+s)&0x1]; b_ptr[1] = a_ptr[(1+s)&0x1]; return b; }

/***** Addition *****/
#define simdia_vaddi(a, b)  (_mm_add_epi32((a), (b)))
#define simdia_vaddf(a, b)  (_mm_add_ps((a), (b)))
#define simdia_vaddlf(a, b) (_mm_add_pd((a), (b)))

/***** Subtraction *****/
#define simdia_vsubi(a, b)  (_mm_sub_epi32((a), (b)))
#define simdia_vsubf(a, b)  (_mm_sub_ps((a), (b)))
#define simdia_vsublf(a, b) (_mm_sub_pd((a), (b)))

/***** Multiplication *****/
#define simdia_vmulf(a, b)  (_mm_mul_ps((a), (b)))
#define simdia_vmullf(a, b) (_mm_mul_pd((a), (b)))

/***** Division *****/
#define simdia_vdivf(a, b)  (_mm_div_ps((a), (b)))
#define simdia_vdivlf(a, b) (_mm_div_pd((a), (b)))

/***** Fused Multiply Add *****/
#define simdia_vmaddf(a, b, c)  (simdia_vaddf( simdia_vmulf( (a), (b)), (c)))
#define simdia_vmaddlf(a, b, c) (simdia_vaddlf(simdia_vmullf((a), (b)), (c)))

/***** Reciprocal *****/
#define simdia_vrecipf(a) (_mm_rcp_ps(a))
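/* NOTE: _mm_rcp_ps is an approximate reciprocal (roughly 12 bits of
 * precision), unlike the exact division used by the generic version. */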
inline simdia_veclf simdia_vreciplf(const simdia_veclf a) { simdia_veclf r; double* a_ptr = (double*)(&a); double* r_ptr = (double*)(&r); r_ptr[0] = 1.0 / a_ptr[0]; r_ptr[1] = 1.0 / a_ptr[1]; return r; }

/***** Square Root *****/
#define simdia_vsqrtf(a)  (_mm_sqrt_ps(a))
#define simdia_vsqrtlf(a) (_mm_sqrt_pd(a))

/***** Reciprocal Square Root *****/
#define simdia_vrsqrtf(a) (_mm_rsqrt_ps(a))
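/* NOTE: _mm_rsqrt_ps is likewise only an estimate (roughly 12 bits of
 * precision); refine it (e.g. with a Newton-Raphson step) if full single
 * precision accuracy is required. */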
#define simdia_vrsqrtlf(a) (simdia_vreciplf(simdia_vsqrtlf(a)))

/***** Not *****/
#define simdia_vnoti(a)  (_mm_xor_si128((a), simdia_const_vnegonei))
#define simdia_vnotf(a)  (_mm_xor_ps((a), _mm_castsi128_ps(simdia_const_vnegonei)))
#define simdia_vnotlf(a) (_mm_xor_pd((a), _mm_castsi128_pd(simdia_const_vnegonei)))

/***** Or *****/
#define simdia_vori(a, b)  (_mm_or_si128((a), (b)))
#define simdia_vorf(a, b)  (_mm_or_ps((a), (b)))
#define simdia_vorlf(a, b) (_mm_or_pd((a), (b)))

/***** Nor *****/
#define simdia_vnori(a, b)  ( simdia_vnoti( simdia_vori((a), (b))))
#define simdia_vnorf(a, b)  ( simdia_vnotf( simdia_vorf((a), (b))))
#define simdia_vnorlf(a, b) (simdia_vnotlf(simdia_vorlf((a), (b))))

/***** And *****/
#define simdia_vandi(a, b)  (_mm_and_si128((a), (b)))
#define simdia_vandf(a, b)  (_mm_and_ps((a), (b)))
#define simdia_vandlf(a, b) (_mm_and_pd((a), (b)))

/***** Nand *****/
#define simdia_vnandi(a, b)  ( simdia_vnoti( simdia_vandi((a), (b))))
#define simdia_vnandf(a, b)  ( simdia_vnotf( simdia_vandf((a), (b))))
#define simdia_vnandlf(a, b) (simdia_vnotlf(simdia_vandlf((a), (b))))

/***** Xor *****/
#define simdia_vxori(a, b)  (_mm_xor_si128((a), (b)))
#define simdia_vxorf(a, b)  (_mm_xor_ps((a), (b)))
#define simdia_vxorlf(a, b) (_mm_xor_pd((a), (b)))

/***** Nxor *****/
#define simdia_vnxori(a, b)  ( simdia_vnoti( simdia_vxori((a), (b))))
#define simdia_vnxorf(a, b)  ( simdia_vnotf( simdia_vxorf((a), (b))))
#define simdia_vnxorlf(a, b) (simdia_vnotlf(simdia_vxorlf((a), (b))))

/***** Equal To *****/
#define simdia_vcmpeqi(a, b)  (_mm_cmpeq_epi32((a), (b)))
#define simdia_vcmpeqf(a, b)  (_mm_castps_si128(_mm_cmpeq_ps((a), (b))))
#define simdia_vcmpeqlf(a, b) (_mm_castpd_si128(_mm_cmpeq_pd((a), (b))))

/***** Greater Than *****/
#define simdia_vcmpgti(a, b)  (_mm_cmpgt_epi32((a), (b)))
#define simdia_vcmpgtf(a, b)  (_mm_castps_si128(_mm_cmpgt_ps((a), (b))))
#define simdia_vcmpgtlf(a, b) (_mm_castpd_si128(_mm_cmpgt_pd((a), (b))))

/***** Greater Than Or Equal To *****/
/* NOTE: SSE2 has no integer >= or <= compare, so they are built from < and > */
#define simdia_vcmpgei(a, b)  (simdia_vnoti(_mm_cmplt_epi32((a), (b))))
#define simdia_vcmpgef(a, b)  (_mm_castps_si128(_mm_cmpge_ps((a), (b))))
#define simdia_vcmpgelf(a, b) (_mm_castpd_si128(_mm_cmpge_pd((a), (b))))

/***** Less Than *****/
#define simdia_vcmplti(a, b)  (_mm_cmplt_epi32((a), (b)))
#define simdia_vcmpltf(a, b)  (_mm_castps_si128(_mm_cmplt_ps((a), (b))))
#define simdia_vcmpltlf(a, b) (_mm_castpd_si128(_mm_cmplt_pd((a), (b))))

/***** Less Than Or Equal To *****/
#define simdia_vcmplei(a, b)  (simdia_vnoti(_mm_cmpgt_epi32((a), (b))))
#define simdia_vcmplef(a, b)  (_mm_castps_si128(_mm_cmple_ps((a), (b))))
#define simdia_vcmplelf(a, b) (_mm_castpd_si128(_mm_cmple_pd((a), (b))))

/*******************************************************************************
 *******************************************************************************
 ***** SPE SIMD Instructions
 *******************************************************************************
 *******************************************************************************/
/* TODO | FIXME : Find a more general check for this (this is Charm++ specific) */
#elif (CMK_CELL_SPE != 0) && (!(SIMDIA_FORCE_NO_SPE_SIMD))

/***** Data Types *****/
typedef vector signed int simdia_veci;
typedef vector float      simdia_vecf;
typedef vector double     simdia_veclf;

/***** Insert *****/
#define simdia_vinserti(v, s, i)  (spu_insert((s), (v), (i)))
#define simdia_vinsertf(v, s, i)  (spu_insert((s), (v), (i)))
#define simdia_vinsertlf(v, s, i) (spu_insert((s), (v), (i)))

/***** Extract *****/
#define simdia_vextracti(v, i)  (spu_extract((v), (i)))
#define simdia_vextractf(v, i)  (spu_extract((v), (i)))
#define simdia_vextractlf(v, i) (spu_extract((v), (i)))

/***** Set *****/
#define simdia_vseti(a)  (spu_splats((int)(a)))
#define simdia_vsetf(a)  (spu_splats((float)(a)))
#define simdia_vsetlf(a) (spu_splats((double)(a)))

/***** Constant Zero *****/
#define simdia_const_vzeroi  (simdia_vseti(0))
#define simdia_const_vzerof  (simdia_vsetf(0.0f))
#define simdia_const_vzerolf (simdia_vsetlf(0.0))

/***** Constant One *****/
#define simdia_const_vonei  (simdia_vseti(1))
#define simdia_const_vonef  (simdia_vsetf(1.0f))
#define simdia_const_vonelf (simdia_vsetlf(1.0))

/***** Constant Two *****/
#define simdia_const_vtwoi  (simdia_vseti(2))
#define simdia_const_vtwof  (simdia_vsetf(2.0f))
#define simdia_const_vtwolf (simdia_vsetlf(2.0))

/***** Constant Negative One *****/
#define simdia_const_vnegonei  (simdia_vseti(-1))
#define simdia_const_vnegonef  (simdia_vsetf(-1.0f))
#define simdia_const_vnegonelf (simdia_vsetlf(-1.0))

/***** Rotate *****/
#define simdia_vrothi(a, s)  (spu_rlqwbyte((a), (0x10 - (((s) & 0x3) << 2))))
#define simdia_vrothf(a, s)  (spu_rlqwbyte((a), (0x10 - (((s) & 0x3) << 2))))
#define simdia_vrothlf(a, s) (spu_rlqwbyte((a), (((s) & 0x1) << 3)))
#define simdia_vrotli(a, s)  (spu_rlqwbyte((a), ((s) & 0x3) << 2))
#define simdia_vrotlf(a, s)  (spu_rlqwbyte((a), ((s) & 0x3) << 2))
#define simdia_vrotllf(a, s) (spu_rlqwbyte((a), ((s) & 0x1) << 3))

/***** Addition *****/
#define simdia_vaddi(a, b)  (spu_add((a), (b)))
#define simdia_vaddf(a, b)  (spu_add((a), (b)))
#define simdia_vaddlf(a, b) (spu_add((a), (b)))

/***** Subtraction *****/
#define simdia_vsubi(a, b)  (spu_sub((a), (b)))
#define simdia_vsubf(a, b)  (spu_sub((a), (b)))
#define simdia_vsublf(a, b) (spu_sub((a), (b)))

/***** Multiplication *****/
#define simdia_vmulf(a, b)  (spu_mul((a), (b)))
#define simdia_vmullf(a, b) (spu_mul((a), (b)))

/***** Division *****/
#define simdia_vdivf(a, b) (spu_mul((a), spu_re(b)))
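/* NOTE: spu_re only gives a reciprocal estimate, so this single precision
 * division is approximate. */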
inline simdia_veclf simdia_vdivlf(const simdia_veclf a, const simdia_veclf b) { simdia_veclf r = { 0.0, 0.0 }; r = spu_insert((spu_extract(a, 0) / spu_extract(b, 0)), r, 0); r = spu_insert((spu_extract(a, 1) / spu_extract(b, 1)), r, 1); return r; }

/***** Fused Multiply Add *****/
#define simdia_vmaddf(a, b, c)  (spu_madd((a), (b), (c)))
#define simdia_vmaddlf(a, b, c) (spu_madd((a), (b), (c)))

/***** Reciprocal *****/
#define simdia_vrecipf(a) (spu_re(a))
inline simdia_veclf simdia_vreciplf(const simdia_veclf a) { simdia_veclf r = { 0.0, 0.0 }; r = spu_insert((1.0 / spu_extract(a, 0)), r, 0); r = spu_insert((1.0 / spu_extract(a, 1)), r, 1); return r; }

/***** Square Root *****/
#define simdia_vsqrtf(a) (spu_re(spu_rsqrte(a)))
inline simdia_veclf simdia_vsqrtlf(const simdia_veclf a) { simdia_veclf r = { 0.0, 0.0 }; r = spu_insert(sqrt(spu_extract(a, 0)), r, 0); r = spu_insert(sqrt(spu_extract(a, 1)), r, 1); return r; }

/***** Reciprocal Square Root *****/
#define simdia_vrsqrtf(a) (spu_rsqrte(a))
inline simdia_veclf simdia_vrsqrtlf(const simdia_veclf a) { simdia_veclf r = { 0.0, 0.0 }; r = spu_insert((1.0 / sqrt(spu_extract(a, 0))), r, 0); r = spu_insert((1.0 / sqrt(spu_extract(a, 1))), r, 1); return r; }

/***** Not *****/
#define simdia_vnoti(a)  (spu_nor((a), (a)))
#define simdia_vnotf(a)  (spu_nor((a), (a)))
#define simdia_vnotlf(a) (spu_nor((a), (a)))

/***** Or *****/
#define simdia_vori(a, b)  (spu_or((a), (b)))
#define simdia_vorf(a, b)  (spu_or((a), (b)))
#define simdia_vorlf(a, b) (spu_or((a), (b)))

/***** Nor *****/
#define simdia_vnori(a, b)  (spu_nor((a), (b)))
#define simdia_vnorf(a, b)  (spu_nor((a), (b)))
#define simdia_vnorlf(a, b) (spu_nor((a), (b)))

/***** And *****/
#define simdia_vandi(a, b)  (spu_and((a), (b)))
#define simdia_vandf(a, b)  (spu_and((a), (b)))
#define simdia_vandlf(a, b) (spu_and((a), (b)))

/***** Nand *****/
#define simdia_vnandi(a, b)  (spu_nand((a), (b)))
#define simdia_vnandf(a, b)  (spu_nand((a), (b)))
#define simdia_vnandlf(a, b) (spu_nand((a), (b)))

/***** Xor *****/
#define simdia_vxori(a, b)  (spu_xor((a), (b)))
#define simdia_vxorf(a, b)  (spu_xor((a), (b)))
#define simdia_vxorlf(a, b) (spu_xor((a), (b)))

/***** Nxor *****/
#define simdia_vnxori(a, b)  ( simdia_vnoti( simdia_vxori((a), (b))))
#define simdia_vnxorf(a, b)  ( simdia_vnotf( simdia_vxorf((a), (b))))
#define simdia_vnxorlf(a, b) (simdia_vnotlf(simdia_vxorlf((a), (b))))

/***** Equal To *****/
#define simdia_vcmpeqi(a, b)  ((simdia_veci)(spu_cmpeq((a), (b))))
#define simdia_vcmpeqf(a, b)  ((simdia_veci)(spu_cmpeq((a), (b))))
#define simdia_vcmpeqlf(a, b) ((simdia_veci)(spu_cmpeq((a), (b))))

/***** Greater Than *****/
#define simdia_vcmpgti(a, b)  ((simdia_veci)(spu_cmpgt((a), (b))))
#define simdia_vcmpgtf(a, b)  ((simdia_veci)(spu_cmpgt((a), (b))))
#define simdia_vcmpgtlf(a, b) ((simdia_veci)(spu_cmpgt((a), (b))))

// NOTE : Try to create versions of >= and < that do not double evaluate their inputs

/***** Greater Than or Equal To *****/
#define simdia_vcmpgei(a, b)  (spu_or( simdia_vcmpeqi((a), (b)),  simdia_vcmpgti((a), (b))))
#define simdia_vcmpgef(a, b)  (spu_or( simdia_vcmpeqf((a), (b)),  simdia_vcmpgtf((a), (b))))
#define simdia_vcmpgelf(a, b) (spu_or(simdia_vcmpeqlf((a), (b)), simdia_vcmpgtlf((a), (b))))

/***** Less Than *****/
#define simdia_vcmplti(a, b)  (spu_nor( simdia_vcmpgti((a), (b)),  simdia_vcmpeqi((a), (b))))
#define simdia_vcmpltf(a, b)  (spu_nor( simdia_vcmpgtf((a), (b)),  simdia_vcmpeqf((a), (b))))
#define simdia_vcmpltlf(a, b) (spu_nor(simdia_vcmpgtlf((a), (b)), simdia_vcmpeqlf((a), (b))))

/***** Less Than or Equal To *****/
#define simdia_vcmplei(a, b)  (spu_nor( simdia_vcmpgti((a), (b)), simdia_const_vzeroi))
#define simdia_vcmplef(a, b)  (spu_nor( simdia_vcmpgtf((a), (b)), simdia_const_vzerof))
#define simdia_vcmplelf(a, b) (spu_nor(simdia_vcmpgtlf((a), (b)), simdia_const_vzerolf))
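
/* NOTE: the compound comparisons above follow De Morgan's laws:
 *   a <  b  ==  !((a > b) || (a == b))   -> spu_nor(cmpgt, cmpeq)
 *   a <= b  ==  !((a > b) || false)      -> spu_nor(cmpgt, zero)
 */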

/*******************************************************************************
 *******************************************************************************
 ***** AltiVec
 *******************************************************************************
 *******************************************************************************/
#elif defined(__VEC__) && (!(SIMDIA_FORCE_NO_ALTIVEC))

/***** Data Types *****/
typedef vector signed int simdia_veci;
typedef vector float      simdia_vecf;
#ifdef _ARCH_PWR7
/** Power 7 VSX supports 64 bit operands, and it also includes VMX support,
 * which means that things like vec_div, vec_insert, etc. work for ints,
 * floats, and doubles.  These intrinsics also require a suitably new version
 * of the compiler on Power 7.  If you are somehow using a Power 7 with an old
 * compiler, please do not hesitate to open a can of whoopass on whoever
 * installed the tool chain, because that kind of stupidity should not be
 * tolerated.
 */
typedef vector double simdia_veclf;
#else
typedef __simdia_veclf simdia_veclf;
#endif

/***** Insert *****/
/* TODO | FIXME - Try to make these functions not reference memory
   so values stay in registers */
#ifdef _ARCH_PWR7
// swap argument order (vec_insert takes the scalar first)
#define simdia_vinserti(a, b, c)  (vec_insert((b), (a), (c)))
#define simdia_vinsertf(a, b, c)  (vec_insert((b), (a), (c)))
#define simdia_vinsertlf(a, b, c) (vec_insert((b), (a), (c)))
#else
inline simdia_veci simdia_vinserti( simdia_veci v, const int   s, const int i) { simdia_veci r = v; int*   rPtr = (  int*)(&r); rPtr[i] = s; return r; }
inline simdia_vecf simdia_vinsertf( simdia_vecf v, const float s, const int i) { simdia_vecf r = v; float* rPtr = (float*)(&r); rPtr[i] = s; return r; }
#define simdia_vinsertlf __simdia_vinsertlf
#endif

/***** Extract *****/
#ifdef _ARCH_PWR7
#define simdia_vextracti(a, b)  (vec_extract((a), (b)))
#define simdia_vextractf(a, b)  (vec_extract((a), (b)))
#define simdia_vextractlf(a, b) (vec_extract((a), (b)))
#else
/* TODO | FIXME - Try to make these functions not reference memory so values stay in registers */
inline int   simdia_vextracti( simdia_veci v, const int i) { int*   vPtr = (  int*)(&v); return vPtr[i]; }
inline float simdia_vextractf( simdia_vecf v, const int i) { float* vPtr = (float*)(&v); return vPtr[i]; }
#define simdia_vextractlf __simdia_vextractlf
#endif

/***** Set *****/
#ifdef _ARCH_PWR7
#define simdia_vseti(a)  (vec_splats((int)(a)))
#define simdia_vsetf(a)  (vec_splats((float)(a)))
#define simdia_vsetlf(a) (vec_splats((double)(a)))
#else
/* TODO : FIXME - There must be a better way to do this, but it
   seems the only way to convert scalar to vector is to go through
   memory instructions.

   EJB: converting between scalar and vector is the sort of thing you
   want to avoid doing on altivec.  Better to rethink and find a way to
   stay in the vector engine if at all possible.
 */
inline simdia_veci simdia_vseti(const int   a) { __simdia_veci r; r.v0 = a; return vec_splat(*((simdia_veci*)(&r)), 0); }
inline simdia_vecf simdia_vsetf(const float a) { __simdia_vecf r; r.v0 = a; return vec_splat(*((simdia_vecf*)(&r)), 0); }
#define simdia_vsetlf __simdia_vsetlf
#endif
/* NOTE: Declare one for unsigned char vector also (required by rotate functions) */
inline vector unsigned char simdia_vset16uc(const unsigned char c) { vector unsigned char r __attribute__((aligned(16))); ((unsigned char*)(&r))[0] = c; return vec_splat(r, 0); }

/***** Constant Zero *****/
#define simdia_const_vzeroi (vec_splat_s32(0))
#define simdia_const_vzerof (vec_ctf(vec_splat_s32(0), 0))
#ifdef _ARCH_PWR7
#define simdia_const_vzerolf (vec_splats(0.0))
#else
#define simdia_const_vzerolf (__simdia_const_vzerolf)
#endif

/***** Constant One *****/
#define simdia_const_vonei (vec_splat_s32(1))
#define simdia_const_vonef (vec_ctf(vec_splat_s32(1), 0))
#ifdef _ARCH_PWR7
#define simdia_const_vonelf (vec_splats(1.0))
#else
#define simdia_const_vonelf (__simdia_const_vonelf)
#endif

/***** Constant Two *****/
#define simdia_const_vtwoi (vec_splat_s32(2))
#define simdia_const_vtwof (vec_ctf(vec_splat_s32(2), 0))
#ifdef _ARCH_PWR7
#define simdia_const_vtwolf (vec_splats(2.0))
#else
#define simdia_const_vtwolf (__simdia_const_vtwolf)
#endif

/***** Constant Negative One *****/
#define simdia_const_vnegonei (vec_splat_s32(-1))
#define simdia_const_vnegonef (vec_ctf(vec_splat_s32(-1), 0))
#ifdef _ARCH_PWR7
#define simdia_const_vnegonelf (vec_splats(-1.0))
#else
#define simdia_const_vnegonelf (__simdia_const_vnegonelf)
#endif

/***** Rotate *****/
#define __simdia_vrotlbytes(a, s) (vec_or(vec_slo((a), simdia_vset16uc(((s) & 0xf) << 3)), vec_sro((a), simdia_vset16uc((16 - ((s) & 0xf)) << 3))))
#define __simdia_vrotrbytes(a, s) (vec_or(vec_sro((a), simdia_vset16uc(((s) & 0xf) << 3)), vec_slo((a), simdia_vset16uc((16 - ((s) & 0xf)) << 3))))
#define simdia_vrotli(a, s)  __simdia_vrotlbytes((a), ((s) << 2))
#define simdia_vrotlf(a, s)  __simdia_vrotlbytes((a), ((s) << 2))
#define simdia_vrotllf(a, s) __simdia_vrotlbytes((a), ((s) << 3))
#define simdia_vrothi(a, s)  __simdia_vrotrbytes((a), ((s) << 2))
#define simdia_vrothf(a, s)  __simdia_vrotrbytes((a), ((s) << 2))
#define simdia_vrothlf(a, s) __simdia_vrotrbytes((a), ((s) << 3))

/***** Addition *****/
#define simdia_vaddi(a, b) (vec_add((a), (b)))
#define simdia_vaddf(a, b) (vec_add((a), (b)))
#ifdef _ARCH_PWR7
#define simdia_vaddlf(a, b) (vec_add((a), (b)))
#else
#define simdia_vaddlf __simdia_vaddlf
#endif

/***** Subtraction *****/
#define simdia_vsubi(a, b) (vec_sub((a), (b)))
#define simdia_vsubf(a, b) (vec_sub((a), (b)))
#ifdef _ARCH_PWR7
#define simdia_vsublf(a, b) (vec_sub((a), (b)))
#else
#define simdia_vsublf __simdia_vsublf
#endif

/***** Multiplication *****/
// NOTE: Try to find a way to do this without double evaluating a
#ifdef _ARCH_PWR7
#define simdia_vmulf(a, b)  (vec_mul((a), (b)))
#define simdia_vmullf(a, b) (vec_mul((a), (b)))
#else
#define simdia_vmulf(a, b) (vec_madd((a), (b), vec_xor((a), (a))))
#define simdia_vmullf __simdia_vmullf
#endif

/***** Division *****/
#ifdef _ARCH_PWR7
#define simdia_vdivf(a, b)  (vec_div((a), (b)))
#define simdia_vdivlf(a, b) (vec_div((a), (b)))
#else
#define simdia_vdivf(a, b) (simdia_vmulf((a), vec_re(b)))
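/* NOTE: vec_re only gives a reciprocal estimate, so this non-VSX single
 * precision division is approximate. */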
#define simdia_vdivlf __simdia_vdivlf
#endif

/***** Fused Multiply Add *****/
#define simdia_vmaddf(a, b, c) (vec_madd((a), (b), (c)))
#ifdef _ARCH_PWR7
#define simdia_vmaddlf(a, b, c) (vec_madd((a), (b), (c)))
#else
#define simdia_vmaddlf __simdia_vmaddlf
#endif

/***** Reciprocal *****/
#define simdia_vrecipf(a) (vec_re(a))
#ifdef _ARCH_PWR7
#define simdia_vreciplf(a) (vec_re(a))
#else
#define simdia_vreciplf __simdia_vreciplf
#endif

/***** Square Root *****/
#define simdia_vsqrtf(a) (vec_re(vec_rsqrte(a)))
#ifdef _ARCH_PWR7
#define simdia_vsqrtlf(a) (vec_sqrt(a))
#else
#define simdia_vsqrtlf __simdia_vsqrtlf
#endif

/***** Reciprocal Square Root *****/
#define simdia_vrsqrtf(a) (vec_rsqrte(a))
#ifdef _ARCH_PWR7
#define simdia_vrsqrtlf(a) (vec_rsqrte(a))
#else
#define simdia_vrsqrtlf __simdia_vrsqrtlf
#endif

/***** Not *****/
#ifdef _ARCH_PWR7
// NOTE: bitwise not (vec_neg would be an arithmetic negation, which is not the same thing)
#define simdia_vnoti(a)  (vec_nor((a), (a)))
#define simdia_vnotf(a)  (vec_nor((a), (a)))
#define simdia_vnotlf(a) (vec_nor((a), (a)))
#else
#define simdia_vnoti(a) (vec_xor((a), simdia_const_vnegonei))
#define simdia_vnotf(a) (vec_xor((a), simdia_const_vnegonei))
#define simdia_vnotlf __simdia_vnotlf
#endif

/***** Or *****/
#define simdia_vori(a, b) (vec_or((a), (b)))
#define simdia_vorf(a, b) (vec_or((a), (b)))
#ifdef _ARCH_PWR7
#define simdia_vorlf(a, b) (vec_or((a), (b)))
#else
#define simdia_vorlf __simdia_vorlf
#endif

/***** Nor *****/
#define simdia_vnori(a, b) (vec_nor((a), (b)))
#define simdia_vnorf(a, b) (vec_nor((a), (b)))
#ifdef _ARCH_PWR7
#define simdia_vnorlf(a, b) (vec_nor((a), (b)))
#else
#define simdia_vnorlf __simdia_vnorlf
#endif

/***** And *****/
#define simdia_vandi(a, b) (vec_and((a), (b)))
#define simdia_vandf(a, b) (vec_and((a), (b)))
#ifdef _ARCH_PWR7
#define simdia_vandlf(a, b) (vec_and((a), (b)))
#else
#define simdia_vandlf __simdia_vandlf
#endif

/***** Nand *****/
#define simdia_vnandi(a, b) (simdia_vnoti(simdia_vandi((a), (b))))
#define simdia_vnandf(a, b) (simdia_vnotf(simdia_vandf((a), (b))))
#ifdef _ARCH_PWR7
#define simdia_vnandlf(a, b) (simdia_vnotlf(simdia_vandlf((a), (b))))
#else
#define simdia_vnandlf __simdia_vnandlf
#endif

/***** Xor *****/
#define simdia_vxori(a, b) (vec_xor((a), (b)))
#define simdia_vxorf(a, b) (vec_xor((a), (b)))
#ifdef _ARCH_PWR7
#define simdia_vxorlf(a, b) (vec_xor((a), (b)))
#else
#define simdia_vxorlf __simdia_vxorlf
#endif

/***** Nxor *****/
#define simdia_vnxori(a, b) (simdia_vnoti(simdia_vxori((a), (b))))
#define simdia_vnxorf(a, b) (simdia_vnotf(simdia_vxorf((a), (b))))
#ifdef _ARCH_PWR7
#define simdia_vnxorlf(a, b) (simdia_vnotlf(simdia_vxorlf((a), (b))))
#else
#define simdia_vnxorlf __simdia_vnxorlf
#endif
895 /***** Equal To *****/
896 #define simdia_vcmpeqi(a, b) ((simdia_veci)(vec_cmpeq((a), (b))))
897 #define simdia_vcmpeqf(a, b) ((simdia_veci)(vec_cmpeq((a), (b))))
898 #ifdef _ARCH_PWR7
899 #define simdia_vcmpeqlf(a, b) ((simdia_veci)(vec_cmpeq((a), (b))))
900 #else
901 #define simdia_vcmpeqlf __simdia_vcmpeqlf
902 #endif
904 /***** Greater Than *****/
905 #define simdia_vcmpgti(a, b) ((simdia_veci)(vec_cmpgt((a), (b))))
906 #define simdia_vcmpgtf(a, b) ((simdia_veci)(vec_cmpgt((a), (b))))
907 #ifdef _ARCH_PWR7
908 #define simdia_vcmpgtlf(a, b) ((simdia_veci)(vec_cmpgt((a), (b))))
909 #else
910 #define simdia_vcmpgtlf __simdia_vcmpgtlf
911 #endif
913 /***** Greater Than Or Equal To *****/
914 #define simdia_vcmpgei(a, b) ((simdia_veci)(vec_cmpge((a), (b))))
915 #define simdia_vcmpgef(a, b) ((simdia_veci)(vec_cmpge((a), (b))))
916 #ifdef _ARCH_PWR7
917 #define simdia_vcmpgelf(a, b) ((simdia_veci)(vec_cmpge((a), (b))))
918 #else
919 #define simdia_vcmpgelf __simdia_vcmpgelf
920 #endif
922 /***** Less Than *****/
923 #define simdia_vcmplti(a, b) ((simdia_veci)(vec_cmplt((a), (b))))
924 #define simdia_vcmpltf(a, b) ((simdia_veci)(vec_cmplt((a), (b))))
925 #ifdef _ARCH_PWR7
926 #define simdia_vcmpltlf(a, b) ((simdia_veci)(vec_cmplt((a), (b))))
927 #else
928 #define simdia_vcmpltlf __simdia_vcmpltlf
929 #endif
931 /***** Less Than Or Equal To *****/
932 #define simdia_vcmplei(a, b) ((simdia_veci)(vec_cmple((a), (b))))
933 #define simdia_vcmplef(a, b) ((simdia_veci)(vec_cmple((a), (b))))
934 #ifdef _ARCH_PWR7
935 #define simdia_vcmplelf(a, b) ((simdia_veci)(vec_cmple((a), (b))))
936 // NOTE: vec_cmple is not listed in Calin's wiki page of PWR7 built-ins,
937 //       but the compiler's headers do provide a definition for it
938 #else
939 #define simdia_vcmplelf __simdia_vcmplelf
940 #endif
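/* A minimal sketch, not part of this header's API: the comparison macros above
 * return full-lane masks (all ones in lanes where the predicate holds), which
 * combine with the bitwise operations to build branch-free selects.  The
 * helper name is hypothetical and assumes simdia_veci is the AltiVec
 * "vector signed int" type in this branch; AltiVec's own vec_max or vec_sel
 * would normally be used directly, and this form only illustrates how the
 * masks compose.
 */
static inline simdia_veci __simdia_example_vmaxi(simdia_veci a, simdia_veci b) {
  simdia_veci m = simdia_vcmpgti(a, b);   /* all ones in lanes where a > b */
  /* keep a where the mask is set, b elsewhere */
  return simdia_vori(simdia_vandi(m, a), simdia_vandi(simdia_vnoti(m), b));
}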
942 /*******************************************************************************
943 *******************************************************************************
944 ***** Mapping to Generic C Implementation
945 *******************************************************************************
946 *******************************************************************************/
947 #else
949 /***** Data Types *****/
950 typedef __simdia_veci simdia_veci;
951 typedef __simdia_vecf simdia_vecf;
952 typedef __simdia_veclf simdia_veclf;
954 /***** Insert *****/
955 #define simdia_vinserti __simdia_vinserti
956 #define simdia_vinsertf __simdia_vinsertf
957 #define simdia_vinsertlf __simdia_vinsertlf
959 /***** Extract *****/
960 #define simdia_vextracti __simdia_vextracti
961 #define simdia_vextractf __simdia_vextractf
962 #define simdia_vextractlf __simdia_vextractlf
964 /***** Set *****/
965 #define simdia_vseti __simdia_vseti
966 #define simdia_vsetf __simdia_vsetf
967 #define simdia_vsetlf __simdia_vsetlf
969 /***** Constant Zero *****/
970 #define simdia_const_vzeroi __simdia_const_vzeroi
971 #define simdia_const_vzerof __simdia_const_vzerof
972 #define simdia_const_vzerolf __simdia_const_vzerolf
974 /***** Constant One *****/
975 #define simdia_const_vonei __simdia_const_vonei
976 #define simdia_const_vonef __simdia_const_vonef
977 #define simdia_const_vonelf __simdia_const_vonelf
979 /***** Constant Two *****/
980 #define simdia_const_vtwoi __simdia_const_vtwoi
981 #define simdia_const_vtwof __simdia_const_vtwof
982 #define simdia_const_vtwolf __simdia_const_vtwolf
984 /***** Constant Negative One *****/
985 #define simdia_const_vnegonei __simdia_const_vnegonei
986 #define simdia_const_vnegonef __simdia_const_vnegonef
987 #define simdia_const_vnegonelf __simdia_const_vnegonelf
989 /***** Rotate *****/
990 #define simdia_vrothi __simdia_vrothi
991 #define simdia_vrothf __simdia_vrothf
992 #define simdia_vrothlf __simdia_vrothlf
993 #define simdia_vrotli __simdia_vrotli
994 #define simdia_vrotlf __simdia_vrotlf
995 #define simdia_vrotllf __simdia_vrotllf
997 /***** Addition *****/
998 #define simdia_vaddi __simdia_vaddi
999 #define simdia_vaddf __simdia_vaddf
1000 #define simdia_vaddlf __simdia_vaddlf
1002 /***** Subtraction *****/
1003 #define simdia_vsubi __simdia_vsubi
1004 #define simdia_vsubf __simdia_vsubf
1005 #define simdia_vsublf __simdia_vsublf
1007 /***** Multiplication *****/
1008 #define simdia_vmulf __simdia_vmulf
1009 #define simdia_vmullf __simdia_vmullf
1011 /***** Division *****/
1012 #define simdia_vdivf __simdia_vdivf
1013 #define simdia_vdivlf __simdia_vdivlf
1015 /***** Fused Multiply Add *****/
1016 #define simdia_vmaddf __simdia_vmaddf
1017 #define simdia_vmaddlf __simdia_vmaddlf
1019 /***** Reciprocal *****/
1020 #define simdia_vrecipf __simdia_vrecipf
1021 #define simdia_vreciplf __simdia_vreciplf
1023 /***** Square Root *****/
1024 #define simdia_vsqrtf __simdia_vsqrtf
1025 #define simdia_vsqrtlf __simdia_vsqrtlf
1027 /***** Reciprocal Square Root *****/
1028 #define simdia_vrsqrtf __simdia_vrsqrtf
1029 #define simdia_vrsqrtlf __simdia_vrsqrtlf
1031 /***** Not *****/
1032 #define simdia_vnoti __simdia_vnoti
1033 #define simdia_vnotf __simdia_vnotf
1034 #define simdia_vnotlf __simdia_vnotlf
1036 /***** Or *****/
1037 #define simdia_vori __simdia_vori
1038 #define simdia_vorf __simdia_vorf
1039 #define simdia_vorlf __simdia_vorlf
1041 /***** Nor *****/
1042 #define simdia_vnori __simdia_vnori
1043 #define simdia_vnorf __simdia_vnorf
1044 #define simdia_vnorlf __simdia_vnorlf
1046 /***** And *****/
1047 #define simdia_vandi __simdia_vandi
1048 #define simdia_vandf __simdia_vandf
1049 #define simdia_vandlf __simdia_vandlf
1051 /***** Nand *****/
1052 #define simdia_vnandi __simdia_vnandi
1053 #define simdia_vnandf __simdia_vnandf
1054 #define simdia_vnandlf __simdia_vnandlf
1056 /***** Xor *****/
1057 #define simdia_vxori __simdia_vxori
1058 #define simdia_vxorf __simdia_vxorf
1059 #define simdia_vxorlf __simdia_vxorlf
1061 /***** Nxor *****/
1062 #define simdia_vnxori __simdia_vnxori
1063 #define simdia_vnxorf __simdia_vnxorf
1064 #define simdia_vnxorlf __simdia_vnxorlf
1066 /***** Equal To *****/
1067 #define simdia_vcmpeqi __simdia_vcmpeqi
1068 #define simdia_vcmpeqf __simdia_vcmpeqf
1069 #define simdia_vcmpeqlf __simdia_vcmpeqlf
1071 /***** Greater Than *****/
1072 #define simdia_vcmpgti __simdia_vcmpgti
1073 #define simdia_vcmpgtf __simdia_vcmpgtf
1074 #define simdia_vcmpgtlf __simdia_vcmpgtlf
1076 /***** Greater Than Or Equal To *****/
1077 #define simdia_vcmpgei __simdia_vcmpgei
1078 #define simdia_vcmpgef __simdia_vcmpgef
1079 #define simdia_vcmpgelf __simdia_vcmpgelf
1081 /***** Less Than *****/
1082 #define simdia_vcmplti __simdia_vcmplti
1083 #define simdia_vcmpltf __simdia_vcmpltf
1084 #define simdia_vcmpltlf __simdia_vcmpltlf
1086 /***** Less Than Or Equal To *****/
1087 #define simdia_vcmplei __simdia_vcmplei
1088 #define simdia_vcmplef __simdia_vcmplef
1089 #define simdia_vcmplelf __simdia_vcmplelf
1092 #endif
1095 /*******************************************************************************
1096 *******************************************************************************
1097 ***** Shared Combinations
1098 *******************************************************************************
1099 *******************************************************************************/
1101 /* NOTE: If any architecture specific implementation can do any of these
1102 * operations faster, then move them up to the architecture specific areas and
1103 * make individual definitions. This area is just meant to declare commonly
1104 * used combinations so that they don't have to be repeated many times over. */
1107 /***** Number of Elements per Vector Type *****/
1108 #define simdia_veci_numElems (sizeof( simdia_veci)/sizeof( int))
1109 #define simdia_vecf_numElems (sizeof( simdia_vecf)/sizeof( float))
1110 #define simdia_veclf_numElems (sizeof(simdia_veclf)/sizeof(double))
1112 /***** Spread (Duplicate functionality of 'Set' by another name) *****/
1113 #define simdia_vspreadi(a) ( simdia_vseti(a))
1114 #define simdia_vspreadf(a) ( simdia_vsetf(a))
1115 #define simdia_vspreadlf(a) (simdia_vsetlf(a))
/***** Is Finite *****/
1117 #define simdia_visfinitef(a) (isfinite(simdia_vextractf((a),0)) && isfinite(simdia_vextractf((a),1)) && isfinite(simdia_vextractf((a),2)) && isfinite(simdia_vextractf((a),3)))
1118 #define simdia_visfinitelf(a) (isfinite(simdia_vextractlf((a),0)) && isfinite(simdia_vextractlf((a),1)))
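/* These return a scalar true/false for the whole vector (every lane must be
 * finite), so they are meant for checks such as detecting overflow to
 * infinity or NaN before committing a block of results.  Note that the
 * argument is expanded once per lane, so it should not carry side effects.
 */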
1120 /***** Add to Scalar *****/
1121 #define simdia_vaddis(a, b) ( simdia_vaddi((a), simdia_vseti(b)))
1122 #define simdia_vaddfs(a, b) ( simdia_vaddf((a), simdia_vsetf(b)))
1123 #define simdia_vaddlfs(a, b) (simdia_vaddlf((a), simdia_vsetlf(b)))
1125 /***** Subtract a Scalar *****/
1126 #define simdia_vsubis(a, b) ( simdia_vsubi((a), simdia_vseti(b)))
1127 #define simdia_vsubfs(a, b) ( simdia_vsubf((a), simdia_vsetf(b)))
1128 #define simdia_vsublfs(a, b) (simdia_vsublf((a), simdia_vsetlf(b)))
1130 /***** Multiply by Scalar *****/
1131 #define simdia_vmulfs(a, b) ( simdia_vmulf((a), simdia_vsetf(b)))
1132 #define simdia_vmullfs(a, b) (simdia_vmullf((a), simdia_vsetlf(b)))
1134 /***** Divide by Scalar *****/
1135 #define simdia_vdivfs(a, b) ( simdia_vdivf((a), simdia_vsetf(b)))
1136 #define simdia_vdivlfs(a, b) (simdia_vdivlf((a), simdia_vsetlf(b)))
1138 /***** Fused Multiply(Vector) Add(Scalar) *****/
1139 #define simdia_vmaddfs(a, b, c) ( simdia_vmaddf((a), (b), simdia_vsetf(c)))
1140 #define simdia_vmaddlfs(a, b, c) (simdia_vmaddlf((a), (b), simdia_vsetlf(c)))
1142 /***** Fused Multiply(Scalar) Add(Scalar) *****/
1143 #define simdia_vmaddfss(a, b, c) ( simdia_vmaddf((a), simdia_vsetf(b), simdia_vsetf(c)))
1144 #define simdia_vmaddlfss(a, b, c) (simdia_vmaddlf((a), simdia_vsetlf(b), simdia_vsetlf(c)))
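/* A minimal usage sketch, not part of this header's API: process an array in
 * portable vector-width blocks using the element counts and vector+scalar
 * helpers above.  The function name is hypothetical, and moving scalars in
 * and out through simdia_vinsertf/simdia_vextractf is only for illustration;
 * real code would normally use whatever load/store path the target provides.
 */
static inline void __simdia_example_add_scalar(float *x, int n, float s) {
  const int W = (int)simdia_vecf_numElems;   /* lanes per simdia_vecf on this target */
  int i, j;
  for (i = 0; i + W <= n; i += W) {
    simdia_vecf v = simdia_vsetf(0.0f);
    for (j = 0; j < W; j++) { v = simdia_vinsertf(v, x[i + j], j); }  /* gather one block */
    v = simdia_vaddfs(v, s);                                          /* vector + scalar */
    for (j = 0; j < W; j++) { x[i + j] = simdia_vextractf(v, j); }    /* store it back */
  }
  for (; i < n; i++) { x[i] += s; }          /* scalar remainder */
}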
1146 #if defined(__VEC__)
1147 #ifdef vector
1148 #undef vector
1149 #endif
1150 #endif
1152 #endif //__SIMDIA_H__