5 #if defined(__SSE2__) && !defined(_CRAYC)
10 #include "spu_intrinsics.h"
/* Solaris provides no float-precision sqrtf(), so emulate it: widen the
 * argument to double, call sqrt(), and narrow the result back to float.
 */
#define sqrtf(a) ((float)(sqrt((double)(a))))
/* Switches for forcing architecture-specific SIMD back ends off.  The SSE
 * section's #if tests !(SIMDIA_FORCE_NO_SSE), so a non-zero value disables it;
 * the AltiVec and SPE switches are presumably consumed the same way (their
 * users are not visible here -- TODO confirm).
 */
#define SIMDIA_FORCE_NO_SSE      (0)
#define SIMDIA_FORCE_NO_ALTIVEC  (0)
#define SIMDIA_FORCE_NO_SPE_SIMD (0)
/***** Math Constants *****/
/* Double-precision literals, parenthesised so that they expand safely inside
 * larger expressions.
 */
#define SIMDIA_CONSTANT_PI     (3.141592653589793)  /* pi      */
#define SIMDIA_CONSTANT_E      (2.718281828459045)  /* e       */
#define SIMDIA_CONSTANT_SQRT_2 (1.414213562373095)  /* sqrt(2) */
/* TODO | FIXME - Find a platform-independent way of ensuring alignment
 * (using __attribute__((aligned(XXX))) doesn't seem to work in netlrts-win and
 * netlrts-sol builds). Just to be safe since compilers should do this anyway.
 */
/* TODO | FIXME - Add a function that will test the functionality of the
 * various operations defined by these abstractions and somehow tie this test
 * into the nightly build to ensure these operations give correct results.
 */
56 /*******************************************************************************
57 *******************************************************************************
58 ***** Generic C Implementation
59 *******************************************************************************
60 *******************************************************************************/
/* NOTE: This is declared first so any architecture-specific implementations
 * can simply use the generic functions for specific data types or operations
 * that they do not implement.
 */
69 /***** Data Types *****/
/* NOTE (DMK): Since this is the generic implementation, arbitrarily choosing a 128-bit (16-byte) "vector" size. */
/* Generic integer vector: four int lanes, v0 being lane 0. */
typedef struct __simdia_vec_i
{
  int v0;
  int v1;
  int v2;
  int v3;
} __simdia_veci;
/* Generic single-precision vector: four float lanes, v0 being lane 0. */
typedef struct __simdia_vec_f
{
  float v0;
  float v1;
  float v2;
  float v3;
} __simdia_vecf;
/* Generic double-precision vector: two double lanes, v0 being lane 0. */
typedef struct __simdia_vec_lf
{
  double v0;
  double v1;
} __simdia_veclf;
77 inline __simdia_veci
__simdia_vinserti( __simdia_veci v
, const int s
, const int i
) { __simdia_veci r
= v
; int* rPtr
= ( int*)(&r
); rPtr
[i
] = s
; return r
; }
78 inline __simdia_vecf
__simdia_vinsertf( __simdia_vecf v
, const float s
, const int i
) { __simdia_vecf r
= v
; float* rPtr
= ( float*)(&r
); rPtr
[i
] = s
; return r
; }
79 inline __simdia_veclf
__simdia_vinsertlf(__simdia_veclf v
, const double s
, const int i
) { __simdia_veclf r
= v
; double* rPtr
= (double*)(&r
); rPtr
[i
] = s
; return r
; }
82 inline int __simdia_vextracti( __simdia_veci v
, const int i
) { int* vPtr
= ( int*)(&v
); return vPtr
[i
]; }
83 inline float __simdia_vextractf( __simdia_vecf v
, const int i
) { float* vPtr
= ( float*)(&v
); return vPtr
[i
]; }
84 inline double __simdia_vextractlf(__simdia_veclf v
, const int i
) { double* vPtr
= (double*)(&v
); return vPtr
[i
]; }
87 inline __simdia_veci
__simdia_vseti(const int a
) { __simdia_veci r
; r
.v0
= r
.v1
= r
.v2
= r
.v3
= a
; return r
; }
88 inline __simdia_vecf
__simdia_vsetf(const float a
) { __simdia_vecf r
; r
.v0
= r
.v1
= r
.v2
= r
.v3
= a
; return r
; }
89 inline __simdia_veclf
__simdia_vsetlf(const double a
) { __simdia_veclf r
; r
.v0
= r
.v1
= a
; return r
; }
/* NOTE: Would it be better to generate the constants instead of reading them from memory in the generic version? */
93 /***** Constant Zero *****/
94 const __simdia_veci __simdia_const_vzeroi
= { 0 , 0 , 0 , 0 };
95 const __simdia_vecf __simdia_const_vzerof
= { 0.0f
, 0.0f
, 0.0f
, 0.0f
};
96 const __simdia_veclf __simdia_const_vzerolf
= { 0.0 , 0.0 };
98 /***** Constant One *****/
99 const __simdia_veci __simdia_const_vonei
= { 1 , 1 , 1 , 1 };
100 const __simdia_vecf __simdia_const_vonef
= { 1.0f
, 1.0f
, 1.0f
, 1.0f
};
101 const __simdia_veclf __simdia_const_vonelf
= { 1.0 , 1.0 };
103 /***** Constant Two *****/
104 const __simdia_veci __simdia_const_vtwoi
= { 2 , 2 , 2 , 2 };
105 const __simdia_vecf __simdia_const_vtwof
= { 2.0f
, 2.0f
, 2.0f
, 2.0f
};
106 const __simdia_veclf __simdia_const_vtwolf
= { 2.0 , 2.0 };
108 /***** Constant Negative One *****/
109 const __simdia_veci __simdia_const_vnegonei
= { -1 , -1 , -1 , -1 };
110 const __simdia_vecf __simdia_const_vnegonef
= { -1.0f
, -1.0f
, -1.0f
, -1.0f
};
111 const __simdia_veclf __simdia_const_vnegonelf
= { -1.0 , -1.0 };
/* TODO | FIXME - Try to create the constants such that accessing them does not
 * require a memory operation (like the SSE constants).
 */
118 inline __simdia_veci
__simdia_vrothi(const __simdia_veci a
, int s
) { __simdia_veci b
; int* a_ptr
= ( int*)(&a
); int* b_ptr
= ( int*)(&b
); s
&= 0x3; b_ptr
[0] = a_ptr
[(0-s
)&0x3]; b_ptr
[1] = a_ptr
[(1-s
)&0x3]; b_ptr
[2] = a_ptr
[(2-s
)&0x3]; b_ptr
[3] = a_ptr
[(3-s
)&0x3]; return b
; }
119 inline __simdia_vecf
__simdia_vrothf(const __simdia_vecf a
, int s
) { __simdia_vecf b
; float* a_ptr
= ( float*)(&a
); float* b_ptr
= ( float*)(&b
); s
&= 0x3; b_ptr
[0] = a_ptr
[(0-s
)&0x3]; b_ptr
[1] = a_ptr
[(1-s
)&0x3]; b_ptr
[2] = a_ptr
[(2-s
)&0x3]; b_ptr
[3] = a_ptr
[(3-s
)&0x3]; return b
; }
120 inline __simdia_veclf
__simdia_vrothlf(const __simdia_veclf a
, int s
) { __simdia_veclf b
; double* a_ptr
= (double*)(&a
); double* b_ptr
= (double*)(&b
); s
&= 0x1; b_ptr
[0] = a_ptr
[(0-s
)&0x1]; b_ptr
[1] = a_ptr
[(1-s
)&0x1]; return b
; }
121 inline __simdia_veci
__simdia_vrotli(const __simdia_veci a
, int s
) { __simdia_veci b
; int* a_ptr
= ( int*)(&a
); int* b_ptr
= ( int*)(&b
); s
&= 0x3; b_ptr
[0] = a_ptr
[(0+s
)&0x3]; b_ptr
[1] = a_ptr
[(1+s
)&0x3]; b_ptr
[2] = a_ptr
[(2+s
)&0x3]; b_ptr
[3] = a_ptr
[(3+s
)&0x3]; return b
; }
122 inline __simdia_vecf
__simdia_vrotlf(const __simdia_vecf a
, int s
) { __simdia_vecf b
; float* a_ptr
= ( float*)(&a
); float* b_ptr
= ( float*)(&b
); s
&= 0x3; b_ptr
[0] = a_ptr
[(0+s
)&0x3]; b_ptr
[1] = a_ptr
[(1+s
)&0x3]; b_ptr
[2] = a_ptr
[(2+s
)&0x3]; b_ptr
[3] = a_ptr
[(3+s
)&0x3]; return b
; }
123 inline __simdia_veclf
__simdia_vrotllf(const __simdia_veclf a
, int s
) { __simdia_veclf b
; double* a_ptr
= (double*)(&a
); double* b_ptr
= (double*)(&b
); s
&= 0x1; b_ptr
[0] = a_ptr
[(0+s
)&0x1]; b_ptr
[1] = a_ptr
[(1+s
)&0x1]; return b
; }
125 /***** Addition *****/
126 inline __simdia_veci
__simdia_vaddi(const __simdia_veci a
, const __simdia_veci b
) { __simdia_veci r
; r
.v0
= a
.v0
+ b
.v0
; r
.v1
= a
.v1
+ b
.v1
; r
.v2
= a
.v2
+ b
.v2
; r
.v3
= a
.v3
+ b
.v3
; return r
; }
127 inline __simdia_vecf
__simdia_vaddf(const __simdia_vecf a
, const __simdia_vecf b
) { __simdia_vecf r
; r
.v0
= a
.v0
+ b
.v0
; r
.v1
= a
.v1
+ b
.v1
; r
.v2
= a
.v2
+ b
.v2
; r
.v3
= a
.v3
+ b
.v3
; return r
; }
128 inline __simdia_veclf
__simdia_vaddlf(const __simdia_veclf a
, const __simdia_veclf b
) { __simdia_veclf r
; r
.v0
= a
.v0
+ b
.v0
; r
.v1
= a
.v1
+ b
.v1
; return r
; }
130 /***** Subtraction *****/
131 inline __simdia_veci
__simdia_vsubi(const __simdia_veci a
, const __simdia_veci b
) { __simdia_veci r
; r
.v0
= a
.v0
- b
.v0
; r
.v1
= a
.v1
- b
.v1
; r
.v2
= a
.v2
- b
.v2
; r
.v3
= a
.v3
- b
.v3
; return r
; }
132 inline __simdia_vecf
__simdia_vsubf(const __simdia_vecf a
, const __simdia_vecf b
) { __simdia_vecf r
; r
.v0
= a
.v0
- b
.v0
; r
.v1
= a
.v1
- b
.v1
; r
.v2
= a
.v2
- b
.v2
; r
.v3
= a
.v3
- b
.v3
; return r
; }
133 inline __simdia_veclf
__simdia_vsublf(const __simdia_veclf a
, const __simdia_veclf b
) { __simdia_veclf r
; r
.v0
= a
.v0
- b
.v0
; r
.v1
= a
.v1
- b
.v1
; return r
; }
135 /***** Multiplication *****/
136 inline __simdia_veci
__simdia_vmuli(const __simdia_veci a
, const __simdia_veci b
) { __simdia_veci r
; r
.v0
= a
.v0
* b
.v0
; r
.v1
= a
.v1
* b
.v1
; r
.v2
= a
.v2
* b
.v2
; r
.v3
= a
.v3
* b
.v3
; return r
; }
137 inline __simdia_vecf
__simdia_vmulf(const __simdia_vecf a
, const __simdia_vecf b
) { __simdia_vecf r
; r
.v0
= a
.v0
* b
.v0
; r
.v1
= a
.v1
* b
.v1
; r
.v2
= a
.v2
* b
.v2
; r
.v3
= a
.v3
* b
.v3
; return r
; }
138 inline __simdia_veclf
__simdia_vmullf(const __simdia_veclf a
, const __simdia_veclf b
) { __simdia_veclf r
; r
.v0
= a
.v0
* b
.v0
; r
.v1
= a
.v1
* b
.v1
; return r
; }
140 /***** Division *****/
141 inline __simdia_veci
__simdia_vdivi(const __simdia_veci a
, const __simdia_veci b
) { __simdia_veci r
; r
.v0
= a
.v0
/ b
.v0
; r
.v1
= a
.v1
/ b
.v1
; r
.v2
= a
.v2
/ b
.v2
; r
.v3
= a
.v3
/ b
.v3
; return r
; }
142 inline __simdia_vecf
__simdia_vdivf(const __simdia_vecf a
, const __simdia_vecf b
) { __simdia_vecf r
; r
.v0
= a
.v0
/ b
.v0
; r
.v1
= a
.v1
/ b
.v1
; r
.v2
= a
.v2
/ b
.v2
; r
.v3
= a
.v3
/ b
.v3
; return r
; }
143 inline __simdia_veclf
__simdia_vdivlf(const __simdia_veclf a
, const __simdia_veclf b
) { __simdia_veclf r
; r
.v0
= a
.v0
/ b
.v0
; r
.v1
= a
.v1
/ b
.v1
; return r
; }
145 /***** Fused Multiply Add *****/
146 inline __simdia_veci
__simdia_vmaddi(const __simdia_veci a
, const __simdia_veci b
, const __simdia_veci c
) { __simdia_veci r
; r
.v0
= a
.v0
* b
.v0
+ c
.v0
; r
.v1
= a
.v1
* b
.v1
+ c
.v1
; r
.v2
= a
.v2
* b
.v2
+ c
.v2
; r
.v3
= a
.v3
* b
.v3
+ c
.v3
; return r
; }
147 inline __simdia_vecf
__simdia_vmaddf(const __simdia_vecf a
, const __simdia_vecf b
, const __simdia_vecf c
) { __simdia_vecf r
; r
.v0
= a
.v0
* b
.v0
+ c
.v0
; r
.v1
= a
.v1
* b
.v1
+ c
.v1
; r
.v2
= a
.v2
* b
.v2
+ c
.v2
; r
.v3
= a
.v3
* b
.v3
+ c
.v3
; return r
; }
148 inline __simdia_veclf
__simdia_vmaddlf(const __simdia_veclf a
, const __simdia_veclf b
, const __simdia_veclf c
) { __simdia_veclf r
; r
.v0
= a
.v0
* b
.v0
+ c
.v0
; r
.v1
= a
.v1
* b
.v1
+ c
.v1
; return r
; }
150 /***** Reciprocal *****/
151 /* TODO | FIXME - See if there is a better way to do this (few cycles and avoid the memory load) */
152 inline __simdia_vecf
__simdia_vrecipf(const __simdia_vecf a
) { __simdia_vecf r
; r
.v0
= 1.0f
/ a
.v0
; r
.v1
= 1.0f
/ a
.v1
; r
.v2
= 1.0f
/ a
.v2
; r
.v3
= 1.0f
/ a
.v3
; return r
; }
153 inline __simdia_veclf
__simdia_vreciplf(const __simdia_veclf a
) { __simdia_veclf r
; r
.v0
= 1.0f
/ a
.v0
; r
.v1
= 1.0f
/ a
.v1
; return r
; }
155 /***** Square Root *****/
156 inline __simdia_vecf
__simdia_vsqrtf(const __simdia_vecf a
) { __simdia_vecf r
; r
.v0
= sqrtf(a
.v0
); r
.v1
= sqrtf(a
.v1
); r
.v2
= sqrtf(a
.v2
); r
.v3
= sqrtf(a
.v3
); return r
; }
157 inline __simdia_veclf
__simdia_vsqrtlf(const __simdia_veclf a
) { __simdia_veclf r
; r
.v0
= sqrt(a
.v0
); r
.v1
= sqrt(a
.v1
); return r
; }
159 /***** Reciprocal Square Root *****/
160 inline __simdia_vecf
__simdia_vrsqrtf(const __simdia_vecf a
) { __simdia_vecf r
; r
.v0
= 1.0f
/ sqrtf(a
.v0
); r
.v1
= 1.0f
/ sqrtf(a
.v1
); r
.v2
= 1.0f
/ sqrtf(a
.v2
); r
.v3
= 1.0f
/ sqrtf(a
.v3
); return r
; }
161 inline __simdia_veclf
__simdia_vrsqrtlf(const __simdia_veclf a
) { __simdia_veclf r
; r
.v0
= 1.0 / sqrt(a
.v0
); r
.v1
= 1.0 / sqrt(a
.v1
); return r
; }
164 inline __simdia_veci
__simdia_vnoti(const __simdia_veci a
) { __simdia_veci r
; int* rPtr
= (int*)(&r
); int* aPtr
= (int*)(&a
); rPtr
[0] = aPtr
[0] ^ -1; rPtr
[1] = aPtr
[1] ^ -1; rPtr
[2] = aPtr
[2] ^ -1; rPtr
[3] = aPtr
[3] ^ -1; return r
; }
165 inline __simdia_vecf
__simdia_vnotf(const __simdia_vecf a
) { __simdia_vecf r
; int* rPtr
= (int*)(&r
); int* aPtr
= (int*)(&a
); rPtr
[0] = aPtr
[0] ^ -1; rPtr
[1] = aPtr
[1] ^ -1; rPtr
[2] = aPtr
[2] ^ -1; rPtr
[3] = aPtr
[3] ^ -1; return r
; }
166 inline __simdia_veclf
__simdia_vnotlf(const __simdia_veclf a
) { __simdia_veclf r
; int* rPtr
= (int*)(&r
); int* aPtr
= (int*)(&a
); rPtr
[0] = aPtr
[0] ^ -1; rPtr
[1] = aPtr
[1] ^ -1; rPtr
[2] = aPtr
[2] ^ -1; rPtr
[3] = aPtr
[3] ^ -1; return r
; }
169 inline __simdia_veci
__simdia_vori(const __simdia_veci a
, const __simdia_veci b
) { __simdia_veci r
; int* rPtr
= (int*)(&r
); int* aPtr
= (int*)(&a
); int* bPtr
= (int*)(&b
); rPtr
[0] = aPtr
[0] | bPtr
[0]; rPtr
[1] = aPtr
[1] | bPtr
[1]; rPtr
[2] = aPtr
[2] | bPtr
[2]; rPtr
[3] = aPtr
[3] | bPtr
[3]; return r
; }
170 inline __simdia_vecf
__simdia_vorf(const __simdia_vecf a
, const __simdia_vecf b
) { __simdia_vecf r
; int* rPtr
= (int*)(&r
); int* aPtr
= (int*)(&a
); int* bPtr
= (int*)(&b
); rPtr
[0] = aPtr
[0] | bPtr
[0]; rPtr
[1] = aPtr
[1] | bPtr
[1]; rPtr
[2] = aPtr
[2] | bPtr
[2]; rPtr
[3] = aPtr
[3] | bPtr
[3]; return r
; }
171 inline __simdia_veclf
__simdia_vorlf(const __simdia_veclf a
, const __simdia_veclf b
) { __simdia_veclf r
; int* rPtr
= (int*)(&r
); int* aPtr
= (int*)(&a
); int* bPtr
= (int*)(&b
); rPtr
[0] = aPtr
[0] | bPtr
[0]; rPtr
[1] = aPtr
[1] | bPtr
[1]; rPtr
[2] = aPtr
[2] | bPtr
[2]; rPtr
[3] = aPtr
[3] | bPtr
[3]; return r
; }
174 inline __simdia_veci
__simdia_vnori(const __simdia_veci a
, const __simdia_veci b
) { __simdia_veci r
; int* rPtr
= (int*)(&r
); int* aPtr
= (int*)(&a
); int* bPtr
= (int*)(&b
); rPtr
[0] = (aPtr
[0] | bPtr
[0]) ^ -1; rPtr
[1] = (aPtr
[1] | bPtr
[1]) ^ -1; rPtr
[2] = (aPtr
[2] | bPtr
[2]) ^ -1; rPtr
[3] = (aPtr
[3] | bPtr
[3]) ^ -1; return r
; }
175 inline __simdia_vecf
__simdia_vnorf(const __simdia_vecf a
, const __simdia_vecf b
) { __simdia_vecf r
; int* rPtr
= (int*)(&r
); int* aPtr
= (int*)(&a
); int* bPtr
= (int*)(&b
); rPtr
[0] = (aPtr
[0] | bPtr
[0]) ^ -1; rPtr
[1] = (aPtr
[1] | bPtr
[1]) ^ -1; rPtr
[2] = (aPtr
[2] | bPtr
[2]) ^ -1; rPtr
[3] = (aPtr
[3] | bPtr
[3]) ^ -1; return r
; }
176 inline __simdia_veclf
__simdia_vnorlf(const __simdia_veclf a
, const __simdia_veclf b
) { __simdia_veclf r
; int* rPtr
= (int*)(&r
); int* aPtr
= (int*)(&a
); int* bPtr
= (int*)(&b
); rPtr
[0] = (aPtr
[0] | bPtr
[0]) ^ -1; rPtr
[1] = (aPtr
[1] | bPtr
[1]) ^ -1; rPtr
[2] = (aPtr
[2] | bPtr
[2]) ^ -1; rPtr
[3] = (aPtr
[3] | bPtr
[3]) ^ -1; return r
; }
179 inline __simdia_veci
__simdia_vandi(const __simdia_veci a
, const __simdia_veci b
) { __simdia_veci r
; int* rPtr
= (int*)(&r
); int* aPtr
= (int*)(&a
); int* bPtr
= (int*)(&b
); rPtr
[0] = aPtr
[0] & bPtr
[0]; rPtr
[1] = aPtr
[1] & bPtr
[1]; rPtr
[2] = aPtr
[2] & bPtr
[2]; rPtr
[3] = aPtr
[3] & bPtr
[3]; return r
; }
180 inline __simdia_vecf
__simdia_vandf(const __simdia_vecf a
, const __simdia_vecf b
) { __simdia_vecf r
; int* rPtr
= (int*)(&r
); int* aPtr
= (int*)(&a
); int* bPtr
= (int*)(&b
); rPtr
[0] = aPtr
[0] & bPtr
[0]; rPtr
[1] = aPtr
[1] & bPtr
[1]; rPtr
[2] = aPtr
[2] & bPtr
[2]; rPtr
[3] = aPtr
[3] & bPtr
[3]; return r
; }
181 inline __simdia_veclf
__simdia_vandlf(const __simdia_veclf a
, const __simdia_veclf b
) { __simdia_veclf r
; int* rPtr
= (int*)(&r
); int* aPtr
= (int*)(&a
); int* bPtr
= (int*)(&b
); rPtr
[0] = aPtr
[0] & bPtr
[0]; rPtr
[1] = aPtr
[1] & bPtr
[1]; rPtr
[2] = aPtr
[2] & bPtr
[2]; rPtr
[3] = aPtr
[3] & bPtr
[3]; return r
; }
184 inline __simdia_veci
__simdia_vnandi(const __simdia_veci a
, const __simdia_veci b
) { __simdia_veci r
; int* rPtr
= (int*)(&r
); int* aPtr
= (int*)(&a
); int* bPtr
= (int*)(&b
); rPtr
[0] = (aPtr
[0] & bPtr
[0]) ^ -1; rPtr
[1] = (aPtr
[1] & bPtr
[1]) ^ -1; rPtr
[2] = (aPtr
[2] & bPtr
[2]) ^ -1; rPtr
[3] = (aPtr
[3] & bPtr
[3]) ^ -1; return r
; }
185 inline __simdia_vecf
__simdia_vnandf(const __simdia_vecf a
, const __simdia_vecf b
) { __simdia_vecf r
; int* rPtr
= (int*)(&r
); int* aPtr
= (int*)(&a
); int* bPtr
= (int*)(&b
); rPtr
[0] = (aPtr
[0] & bPtr
[0]) ^ -1; rPtr
[1] = (aPtr
[1] & bPtr
[1]) ^ -1; rPtr
[2] = (aPtr
[2] & bPtr
[2]) ^ -1; rPtr
[3] = (aPtr
[3] & bPtr
[3]) ^ -1; return r
; }
186 inline __simdia_veclf
__simdia_vnandlf(const __simdia_veclf a
, const __simdia_veclf b
) { __simdia_veclf r
; int* rPtr
= (int*)(&r
); int* aPtr
= (int*)(&a
); int* bPtr
= (int*)(&b
); rPtr
[0] = (aPtr
[0] & bPtr
[0]) ^ -1; rPtr
[1] = (aPtr
[1] & bPtr
[1]) ^ -1; rPtr
[2] = (aPtr
[2] & bPtr
[2]) ^ -1; rPtr
[3] = (aPtr
[3] & bPtr
[3]) ^ -1; return r
; }
189 inline __simdia_veci
__simdia_vxori(const __simdia_veci a
, const __simdia_veci b
) { __simdia_veci r
; int* rPtr
= (int*)(&r
); int* aPtr
= (int*)(&a
); int* bPtr
= (int*)(&b
); rPtr
[0] = aPtr
[0] ^ bPtr
[0]; rPtr
[1] = aPtr
[1] ^ bPtr
[1]; rPtr
[2] = aPtr
[2] ^ bPtr
[2]; rPtr
[3] = aPtr
[3] ^ bPtr
[3]; return r
; }
190 inline __simdia_vecf
__simdia_vxorf(const __simdia_vecf a
, const __simdia_vecf b
) { __simdia_vecf r
; int* rPtr
= (int*)(&r
); int* aPtr
= (int*)(&a
); int* bPtr
= (int*)(&b
); rPtr
[0] = aPtr
[0] ^ bPtr
[0]; rPtr
[1] = aPtr
[1] ^ bPtr
[1]; rPtr
[2] = aPtr
[2] ^ bPtr
[2]; rPtr
[3] = aPtr
[3] ^ bPtr
[3]; return r
; }
191 inline __simdia_veclf
__simdia_vxorlf(const __simdia_veclf a
, const __simdia_veclf b
) { __simdia_veclf r
; int* rPtr
= (int*)(&r
); int* aPtr
= (int*)(&a
); int* bPtr
= (int*)(&b
); rPtr
[0] = aPtr
[0] ^ bPtr
[0]; rPtr
[1] = aPtr
[1] ^ bPtr
[1]; rPtr
[2] = aPtr
[2] ^ bPtr
[2]; rPtr
[3] = aPtr
[3] ^ bPtr
[3]; return r
; }
194 inline __simdia_veci
__simdia_vnxori(const __simdia_veci a
, const __simdia_veci b
) { __simdia_veci r
; int* rPtr
= (int*)(&r
); int* aPtr
= (int*)(&a
); int* bPtr
= (int*)(&b
); rPtr
[0] = (aPtr
[0] ^ bPtr
[0]) ^ -1; rPtr
[1] = (aPtr
[1] ^ bPtr
[1]) ^ -1; rPtr
[2] = (aPtr
[2] ^ bPtr
[2]) ^ -1; rPtr
[3] = (aPtr
[3] ^ bPtr
[3]) ^ -1; return r
; }
195 inline __simdia_vecf
__simdia_vnxorf(const __simdia_vecf a
, const __simdia_vecf b
) { __simdia_vecf r
; int* rPtr
= (int*)(&r
); int* aPtr
= (int*)(&a
); int* bPtr
= (int*)(&b
); rPtr
[0] = (aPtr
[0] ^ bPtr
[0]) ^ -1; rPtr
[1] = (aPtr
[1] ^ bPtr
[1]) ^ -1; rPtr
[2] = (aPtr
[2] ^ bPtr
[2]) ^ -1; rPtr
[3] = (aPtr
[3] ^ bPtr
[3]) ^ -1; return r
; }
196 inline __simdia_veclf
__simdia_vnxorlf(const __simdia_veclf a
, const __simdia_veclf b
) { __simdia_veclf r
; int* rPtr
= (int*)(&r
); int* aPtr
= (int*)(&a
); int* bPtr
= (int*)(&b
); rPtr
[0] = (aPtr
[0] ^ bPtr
[0]) ^ -1; rPtr
[1] = (aPtr
[1] ^ bPtr
[1]) ^ -1; rPtr
[2] = (aPtr
[2] ^ bPtr
[2]) ^ -1; rPtr
[3] = (aPtr
[3] ^ bPtr
[3]) ^ -1; return r
; }
198 /* TODO | FIXME - Try to do the comparisons in a branchless way */
200 /***** Equal To *****/
201 inline __simdia_veci
__simdia_vcmpeqi(const __simdia_veci a
, const __simdia_veci b
) { __simdia_veci r
; r
.v0
= ((a
.v0
== b
.v0
) ? (0xFFFFFFFF) : (0x0)); r
.v1
= ((a
.v1
== b
.v1
) ? (0xFFFFFFFF) : (0x0)); r
.v2
= ((a
.v2
== b
.v2
) ? (0xFFFFFFFF) : (0x0)); r
.v3
= ((a
.v3
== b
.v3
) ? (0xFFFFFFFF) : (0x0)); return r
; }
202 inline __simdia_veci
__simdia_vcmpeqf(const __simdia_vecf a
, const __simdia_vecf b
) { __simdia_veci r
; r
.v0
= ((a
.v0
== b
.v0
) ? (0xFFFFFFFF) : (0x0)); r
.v1
= ((a
.v1
== b
.v1
) ? (0xFFFFFFFF) : (0x0)); r
.v2
= ((a
.v2
== b
.v2
) ? (0xFFFFFFFF) : (0x0)); r
.v3
= ((a
.v3
== b
.v3
) ? (0xFFFFFFFF) : (0x0)); return r
; }
203 inline __simdia_veci
__simdia_vcmpeqlf(const __simdia_vecf a
, const __simdia_vecf b
) { __simdia_veci r
; r
.v0
= r
.v1
= ((a
.v0
== b
.v0
) ? (0xFFFFFFFF) : (0x0)); r
.v2
= r
.v3
= ((a
.v1
== b
.v1
) ? (0xFFFFFFFF) : (0x0)); return r
; }
205 /***** Greater Than *****/
206 inline __simdia_veci
__simdia_vcmpgti(const __simdia_veci a
, const __simdia_veci b
) { __simdia_veci r
; r
.v0
= ((a
.v0
> b
.v0
) ? (0xFFFFFFFF) : (0x0)); r
.v1
= ((a
.v1
> b
.v1
) ? (0xFFFFFFFF) : (0x0)); r
.v2
= ((a
.v2
> b
.v2
) ? (0xFFFFFFFF) : (0x0)); r
.v3
= ((a
.v3
> b
.v3
) ? (0xFFFFFFFF) : (0x0)); return r
; }
207 inline __simdia_veci
__simdia_vcmpgtf(const __simdia_vecf a
, const __simdia_vecf b
) { __simdia_veci r
; r
.v0
= ((a
.v0
> b
.v0
) ? (0xFFFFFFFF) : (0x0)); r
.v1
= ((a
.v1
> b
.v1
) ? (0xFFFFFFFF) : (0x0)); r
.v2
= ((a
.v2
> b
.v2
) ? (0xFFFFFFFF) : (0x0)); r
.v3
= ((a
.v3
> b
.v3
) ? (0xFFFFFFFF) : (0x0)); return r
; }
208 inline __simdia_veci
__simdia_vcmpgtlf(const __simdia_vecf a
, const __simdia_vecf b
) { __simdia_veci r
; r
.v0
= r
.v1
= ((a
.v0
> b
.v0
) ? (0xFFFFFFFF) : (0x0)); r
.v2
= r
.v3
= ((a
.v1
> b
.v1
) ? (0xFFFFFFFF) : (0x0)); return r
; }
210 /***** Greater Than Or Equal To *****/
211 inline __simdia_veci
__simdia_vcmpgei(const __simdia_veci a
, const __simdia_veci b
) { __simdia_veci r
; r
.v0
= ((a
.v0
>= b
.v0
) ? (0xFFFFFFFF) : (0x0)); r
.v1
= ((a
.v1
>= b
.v1
) ? (0xFFFFFFFF) : (0x0)); r
.v2
= ((a
.v2
>= b
.v2
) ? (0xFFFFFFFF) : (0x0)); r
.v3
= ((a
.v3
>= b
.v3
) ? (0xFFFFFFFF) : (0x0)); return r
; }
212 inline __simdia_veci
__simdia_vcmpgef(const __simdia_vecf a
, const __simdia_vecf b
) { __simdia_veci r
; r
.v0
= ((a
.v0
>= b
.v0
) ? (0xFFFFFFFF) : (0x0)); r
.v1
= ((a
.v1
>= b
.v1
) ? (0xFFFFFFFF) : (0x0)); r
.v2
= ((a
.v2
>= b
.v2
) ? (0xFFFFFFFF) : (0x0)); r
.v3
= ((a
.v3
>= b
.v3
) ? (0xFFFFFFFF) : (0x0)); return r
; }
213 inline __simdia_veci
__simdia_vcmpgelf(const __simdia_vecf a
, const __simdia_vecf b
) { __simdia_veci r
; r
.v0
= r
.v1
= ((a
.v0
>= b
.v0
) ? (0xFFFFFFFF) : (0x0)); r
.v2
= r
.v3
= ((a
.v1
>= b
.v1
) ? (0xFFFFFFFF) : (0x0)); return r
; }
215 /***** Less Than *****/
216 inline __simdia_veci
__simdia_vcmplti(const __simdia_veci a
, const __simdia_veci b
) { __simdia_veci r
; r
.v0
= ((a
.v0
< b
.v0
) ? (0xFFFFFFFF) : (0x0)); r
.v1
= ((a
.v1
< b
.v1
) ? (0xFFFFFFFF) : (0x0)); r
.v2
= ((a
.v2
< b
.v2
) ? (0xFFFFFFFF) : (0x0)); r
.v3
= ((a
.v3
< b
.v3
) ? (0xFFFFFFFF) : (0x0)); return r
; }
217 inline __simdia_veci
__simdia_vcmpltf(const __simdia_vecf a
, const __simdia_vecf b
) { __simdia_veci r
; r
.v0
= ((a
.v0
< b
.v0
) ? (0xFFFFFFFF) : (0x0)); r
.v1
= ((a
.v1
< b
.v1
) ? (0xFFFFFFFF) : (0x0)); r
.v2
= ((a
.v2
< b
.v2
) ? (0xFFFFFFFF) : (0x0)); r
.v3
= ((a
.v3
< b
.v3
) ? (0xFFFFFFFF) : (0x0)); return r
; }
218 inline __simdia_veci
__simdia_vcmpltlf(const __simdia_vecf a
, const __simdia_vecf b
) { __simdia_veci r
; r
.v0
= r
.v1
= ((a
.v0
< b
.v0
) ? (0xFFFFFFFF) : (0x0)); r
.v2
= r
.v3
= ((a
.v1
< b
.v1
) ? (0xFFFFFFFF) : (0x0)); return r
; }
220 /***** Less Than Or Equal To *****/
221 inline __simdia_veci
__simdia_vcmplei(const __simdia_veci a
, const __simdia_veci b
) { __simdia_veci r
; r
.v0
= ((a
.v0
<= b
.v0
) ? (0xFFFFFFFF) : (0x0)); r
.v1
= ((a
.v1
<= b
.v1
) ? (0xFFFFFFFF) : (0x0)); r
.v2
= ((a
.v2
<= b
.v2
) ? (0xFFFFFFFF) : (0x0)); r
.v3
= ((a
.v3
<= b
.v3
) ? (0xFFFFFFFF) : (0x0)); return r
; }
222 inline __simdia_veci
__simdia_vcmplef(const __simdia_vecf a
, const __simdia_vecf b
) { __simdia_veci r
; r
.v0
= ((a
.v0
<= b
.v0
) ? (0xFFFFFFFF) : (0x0)); r
.v1
= ((a
.v1
<= b
.v1
) ? (0xFFFFFFFF) : (0x0)); r
.v2
= ((a
.v2
<= b
.v2
) ? (0xFFFFFFFF) : (0x0)); r
.v3
= ((a
.v3
<= b
.v3
) ? (0xFFFFFFFF) : (0x0)); return r
; }
223 inline __simdia_veci
__simdia_vcmplelf(const __simdia_vecf a
, const __simdia_vecf b
) { __simdia_veci r
; r
.v0
= r
.v1
= ((a
.v0
<= b
.v0
) ? (0xFFFFFFFF) : (0x0)); r
.v2
= r
.v3
= ((a
.v1
<= b
.v1
) ? (0xFFFFFFFF) : (0x0)); return r
; }
226 /*******************************************************************************
227 ***** C++ Operators for Generic Implementation
228 *******************************************************************************/
229 #if defined(__cplusplus)
231 /***** Addition *****/
232 inline __simdia_veci
operator+(const __simdia_veci
&a
, const __simdia_veci
&b
) { return __simdia_vaddi(a
, b
); }
233 inline __simdia_vecf
operator+(const __simdia_vecf
&a
, const __simdia_vecf
&b
) { return __simdia_vaddf(a
, b
); }
234 inline __simdia_veclf
operator+(const __simdia_veclf
&a
, const __simdia_veclf
&b
) { return __simdia_vaddlf(a
, b
); }
235 inline __simdia_veci
operator+=( __simdia_veci
&a
, const __simdia_veci
&b
) { a
= __simdia_vaddi(a
, b
); return a
; }
236 inline __simdia_vecf
operator+=( __simdia_vecf
&a
, const __simdia_vecf
&b
) { a
= __simdia_vaddf(a
, b
); return a
; }
237 inline __simdia_veclf
operator+=(__simdia_veclf
&a
, const __simdia_veclf
&b
) { a
= __simdia_vaddlf(a
, b
); return a
; }
239 inline __simdia_veci
operator+(const __simdia_veci
&a
, const int &b
) { return __simdia_vaddi(a
, __simdia_vseti(b
)); }
240 inline __simdia_vecf
operator+(const __simdia_vecf
&a
, const float &b
) { return __simdia_vaddf(a
, __simdia_vsetf(b
)); }
241 inline __simdia_veclf
operator+(const __simdia_veclf
&a
, const double &b
) { return __simdia_vaddlf(a
, __simdia_vsetlf(b
)); }
242 inline __simdia_veci
operator+=( __simdia_veci
&a
, const int &b
) { a
= __simdia_vaddi(a
, __simdia_vseti(b
)); return a
; }
243 inline __simdia_vecf
operator+=( __simdia_vecf
&a
, const float &b
) { a
= __simdia_vaddf(a
, __simdia_vsetf(b
)); return a
; }
244 inline __simdia_veclf
operator+=(__simdia_veclf
&a
, const double &b
) { a
= __simdia_vaddlf(a
, __simdia_vsetlf(b
)); return a
; }
246 /***** Subtraction *****/
247 inline __simdia_veci
operator-(const __simdia_veci
&a
, const __simdia_veci
&b
) { return __simdia_vsubi(a
, b
); }
248 inline __simdia_vecf
operator-(const __simdia_vecf
&a
, const __simdia_vecf
&b
) { return __simdia_vsubf(a
, b
); }
249 inline __simdia_veclf
operator-(const __simdia_veclf
&a
, const __simdia_veclf
&b
) { return __simdia_vsublf(a
, b
); }
250 inline __simdia_veci
operator-=( __simdia_veci
&a
, const __simdia_veci
&b
) { a
= __simdia_vsubi(a
, b
); return a
; }
251 inline __simdia_vecf
operator-=( __simdia_vecf
&a
, const __simdia_vecf
&b
) { a
= __simdia_vsubf(a
, b
); return a
; }
252 inline __simdia_veclf
operator-=(__simdia_veclf
&a
, const __simdia_veclf
&b
) { a
= __simdia_vsublf(a
, b
); return a
; }
254 inline __simdia_veci
operator-(const __simdia_veci
&a
, const int &b
) { return __simdia_vsubi(a
, __simdia_vseti(b
)); }
255 inline __simdia_vecf
operator-(const __simdia_vecf
&a
, const float &b
) { return __simdia_vsubf(a
, __simdia_vsetf(b
)); }
256 inline __simdia_veclf
operator-(const __simdia_veclf
&a
, const double &b
) { return __simdia_vsublf(a
, __simdia_vsetlf(b
)); }
257 inline __simdia_veci
operator-=( __simdia_veci
&a
, const int &b
) { a
= __simdia_vsubi(a
, __simdia_vseti(b
)); return a
; }
258 inline __simdia_vecf
operator-=( __simdia_vecf
&a
, const float &b
) { a
= __simdia_vsubf(a
, __simdia_vsetf(b
)); return a
; }
259 inline __simdia_veclf
operator-=(__simdia_veclf
&a
, const double &b
) { a
= __simdia_vsublf(a
, __simdia_vsetlf(b
)); return a
; }
261 /***** Multiplication *****/
262 inline __simdia_vecf
operator*(const __simdia_vecf
&a
, const __simdia_vecf
&b
) { return __simdia_vmulf(a
, b
); }
263 inline __simdia_veclf
operator*(const __simdia_veclf
&a
, const __simdia_veclf
&b
) { return __simdia_vmullf(a
, b
); }
264 inline __simdia_vecf
operator*=( __simdia_vecf
&a
, const __simdia_vecf
&b
) { a
= __simdia_vmulf(a
, b
); return a
; }
265 inline __simdia_veclf
operator*=(__simdia_veclf
&a
, const __simdia_veclf
&b
) { a
= __simdia_vmullf(a
, b
); return a
; }
267 inline __simdia_vecf
operator*(const __simdia_vecf
&a
, const float &b
) { return __simdia_vmulf(a
, __simdia_vsetf(b
)); }
268 inline __simdia_veclf
operator*(const __simdia_veclf
&a
, const double &b
) { return __simdia_vmullf(a
, __simdia_vsetlf(b
)); }
269 inline __simdia_vecf
operator*=( __simdia_vecf
&a
, const float &b
) { a
= __simdia_vmulf(a
, __simdia_vsetf(b
)); return a
; }
270 inline __simdia_veclf
operator*=(__simdia_veclf
&a
, const double &b
) { a
= __simdia_vmullf(a
, __simdia_vsetlf(b
)); return a
; }
272 /***** Division *****/
273 inline __simdia_vecf
operator/(const __simdia_vecf
&a
, const __simdia_vecf
&b
) { return __simdia_vdivf(a
, b
); }
274 inline __simdia_veclf
operator/(const __simdia_veclf
&a
, const __simdia_veclf
&b
) { return __simdia_vdivlf(a
, b
); }
275 inline __simdia_vecf
operator/=( __simdia_vecf
&a
, const __simdia_vecf
&b
) { a
= __simdia_vdivf(a
, b
); return a
; }
276 inline __simdia_veclf
operator/=(__simdia_veclf
&a
, const __simdia_veclf
&b
) { a
= __simdia_vdivlf(a
, b
); return a
; }
278 inline __simdia_vecf
operator/(const __simdia_vecf
&a
, const float &b
) { return __simdia_vdivf(a
, __simdia_vsetf(b
)); }
279 inline __simdia_veclf
operator/(const __simdia_veclf
&a
, const double &b
) { return __simdia_vdivlf(a
, __simdia_vsetlf(b
)); }
280 inline __simdia_vecf
operator/=( __simdia_vecf
&a
, const float &b
) { a
= __simdia_vdivf(a
, __simdia_vsetf(b
)); return a
; }
281 inline __simdia_veclf
operator/=(__simdia_veclf
&a
, const double &b
) { a
= __simdia_vdivlf(a
, __simdia_vsetlf(b
)); return a
; }
284 inline __simdia_veci
operator|(const __simdia_veci
&a
, const __simdia_veci
&b
) { return __simdia_vori(a
, b
); }
285 inline __simdia_vecf
operator|(const __simdia_vecf
&a
, const __simdia_vecf
&b
) { return __simdia_vorf(a
, b
); }
286 inline __simdia_veclf
operator|(const __simdia_veclf
&a
, const __simdia_veclf
&b
) { return __simdia_vorlf(a
, b
); }
287 inline __simdia_veci
operator|=( __simdia_veci
&a
, const __simdia_veci
&b
) { a
= __simdia_vori(a
, b
); return a
; }
288 inline __simdia_vecf
operator|=( __simdia_vecf
&a
, const __simdia_vecf
&b
) { a
= __simdia_vorf(a
, b
); return a
; }
289 inline __simdia_veclf
operator|=(__simdia_veclf
&a
, const __simdia_veclf
&b
) { a
= __simdia_vorlf(a
, b
); return a
; }
291 inline __simdia_veci
operator|(const __simdia_veci
&a
, const int &b
) { return __simdia_vori(a
, __simdia_vseti(b
)); }
292 inline __simdia_vecf
operator|(const __simdia_vecf
&a
, const float &b
) { return __simdia_vorf(a
, __simdia_vsetf(b
)); }
293 inline __simdia_veclf
operator|(const __simdia_veclf
&a
, const double &b
) { return __simdia_vorlf(a
, __simdia_vsetlf(b
)); }
294 inline __simdia_veci
operator|=( __simdia_veci
&a
, const int &b
) { a
= __simdia_vori(a
, __simdia_vseti(b
)); return a
; }
295 inline __simdia_vecf
operator|=( __simdia_vecf
&a
, const float &b
) { a
= __simdia_vorf(a
, __simdia_vsetf(b
)); return a
; }
296 inline __simdia_veclf
operator|=(__simdia_veclf
&a
, const double &b
) { a
= __simdia_vorlf(a
, __simdia_vsetlf(b
)); return a
; }
299 inline __simdia_veci
operator&(const __simdia_veci
&a
, const __simdia_veci
&b
) { return __simdia_vandi(a
, b
); }
300 inline __simdia_vecf
operator&(const __simdia_vecf
&a
, const __simdia_vecf
&b
) { return __simdia_vandf(a
, b
); }
301 inline __simdia_veclf
operator&(const __simdia_veclf
&a
, const __simdia_veclf
&b
) { return __simdia_vandlf(a
, b
); }
302 inline __simdia_veci
operator&=( __simdia_veci
&a
, const __simdia_veci
&b
) { a
= __simdia_vandi(a
, b
); return a
; }
303 inline __simdia_vecf
operator&=( __simdia_vecf
&a
, const __simdia_vecf
&b
) { a
= __simdia_vandf(a
, b
); return a
; }
304 inline __simdia_veclf
operator&=(__simdia_veclf
&a
, const __simdia_veclf
&b
) { a
= __simdia_vandlf(a
, b
); return a
; }
306 inline __simdia_veci
operator&(const __simdia_veci
&a
, const int &b
) { return __simdia_vandi(a
, __simdia_vseti(b
)); }
307 inline __simdia_vecf
operator&(const __simdia_vecf
&a
, const float &b
) { return __simdia_vandf(a
, __simdia_vsetf(b
)); }
308 inline __simdia_veclf
operator&(const __simdia_veclf
&a
, const double &b
) { return __simdia_vandlf(a
, __simdia_vsetlf(b
)); }
309 inline __simdia_veci
operator&=( __simdia_veci
&a
, const int &b
) { a
= __simdia_vandi(a
, __simdia_vseti(b
)); return a
; }
310 inline __simdia_vecf
operator&=( __simdia_vecf
&a
, const float &b
) { a
= __simdia_vandf(a
, __simdia_vsetf(b
)); return a
; }
311 inline __simdia_veclf
operator&=(__simdia_veclf
&a
, const double &b
) { a
= __simdia_vandlf(a
, __simdia_vsetlf(b
)); return a
; }
314 inline __simdia_veci
operator^(const __simdia_veci
&a
, const __simdia_veci
&b
) { return __simdia_vxori(a
, b
); }
315 inline __simdia_vecf
operator^(const __simdia_vecf
&a
, const __simdia_vecf
&b
) { return __simdia_vxorf(a
, b
); }
316 inline __simdia_veclf
operator^(const __simdia_veclf
&a
, const __simdia_veclf
&b
) { return __simdia_vxorlf(a
, b
); }
317 inline __simdia_veci
operator^=( __simdia_veci
&a
, const __simdia_veci
&b
) { a
= __simdia_vxori(a
, b
); return a
; }
318 inline __simdia_vecf
operator^=( __simdia_vecf
&a
, const __simdia_vecf
&b
) { a
= __simdia_vxorf(a
, b
); return a
; }
319 inline __simdia_veclf
operator^=(__simdia_veclf
&a
, const __simdia_veclf
&b
) { a
= __simdia_vxorlf(a
, b
); return a
; }
321 inline __simdia_veci
operator^(const __simdia_veci
&a
, const int &b
) { return __simdia_vxori(a
, __simdia_vseti(b
)); }
322 inline __simdia_vecf
operator^(const __simdia_vecf
&a
, const float &b
) { return __simdia_vxorf(a
, __simdia_vsetf(b
)); }
323 inline __simdia_veclf
operator^(const __simdia_veclf
&a
, const double &b
) { return __simdia_vxorlf(a
, __simdia_vsetlf(b
)); }
324 inline __simdia_veci
operator^=( __simdia_veci
&a
, const int &b
) { a
= __simdia_vxori(a
, __simdia_vseti(b
)); return a
; }
325 inline __simdia_vecf
operator^=( __simdia_vecf
&a
, const float &b
) { a
= __simdia_vxorf(a
, __simdia_vsetf(b
)); return a
; }
326 inline __simdia_veclf
operator^=(__simdia_veclf
&a
, const double &b
) { a
= __simdia_vxorlf(a
, __simdia_vsetlf(b
)); return a
; }
328 #endif /* defined(__cplusplus) */
333 /*******************************************************************************
334 *******************************************************************************
336 *******************************************************************************
337 *******************************************************************************/
338 #if defined(__SSE2__) && (!(SIMDIA_FORCE_NO_SSE)) && !defined(_CRAYC)
340 /* NOTE | TODO | FIXME : Add checks for various version of SSE. For now, only
341 * support and assume that minimum level SSE2.
344 /***** Data Types *****/
/* SSE2 vector types: each maps directly onto a 128-bit SSE register type. */
typedef __m128i simdia_veci;   /* integer elements (used as 4 x int below) */
typedef __m128  simdia_vecf;   /* 4 x float */
typedef __m128d simdia_veclf;  /* 2 x double */
350 /* TODO | FIXME - Try to make these functions not reference memory so values stay in registers */
351 inline simdia_veci
simdia_vinserti( simdia_veci v
, const int s
, const int i
) { simdia_veci r
= v
; int* rPtr
= ( int*)(&r
); rPtr
[i
] = s
; return r
; }
352 inline simdia_vecf
simdia_vinsertf( simdia_vecf v
, const float s
, const int i
) { simdia_vecf r
= v
; float* rPtr
= ( float*)(&r
); rPtr
[i
] = s
; return r
; }
353 inline simdia_veclf
simdia_vinsertlf(simdia_veclf v
, const double s
, const int i
) { simdia_veclf r
= v
; double* rPtr
= (double*)(&r
); rPtr
[i
] = s
; return r
; }
355 /***** Extract *****/
356 /* TODO | FIXME - Try to make these functions not reference memory so values stay in registers */
357 inline int vextracti( simdia_veci v
, const int i
) { return (( int*)(&v
))[i
]; }
358 inline float vextractf( simdia_vecf v
, const int i
) { return (( float*)(&v
))[i
]; }
359 inline double vextractlf(simdia_veclf v
, const int i
) { return ((double*)(&v
))[i
]; }
/* Set: cast the scalar to the element type, then broadcast it to every
 * element with the matching _mm_set1_* intrinsic. */
#define simdia_vseti(x)   (_mm_set1_epi32((int)(x)))
#define simdia_vsetf(x)   (_mm_set1_ps((float)(x)))
#define simdia_vsetlf(x)  (_mm_set1_pd((double)(x)))
366 /***** Constant Zero *****/
/* All-zero vectors come straight from the dedicated setzero intrinsics. */
#define simdia_const_vzeroi     (_mm_setzero_si128())
#define simdia_const_vzerof     (_mm_setzero_ps())
#define simdia_const_vzerolf    (_mm_setzero_pd())

/***** Constant One *****/
/* The remaining constants broadcast a literal to every element. */
#define simdia_const_vonei      (_mm_set1_epi32((int)(1)))
#define simdia_const_vonef      (_mm_set1_ps((float)(1.0f)))
#define simdia_const_vonelf     (_mm_set1_pd((double)(1.0)))

/***** Constant Two *****/
#define simdia_const_vtwoi      (_mm_set1_epi32((int)(2)))
#define simdia_const_vtwof      (_mm_set1_ps((float)(2.0f)))
#define simdia_const_vtwolf     (_mm_set1_pd((double)(2.0)))

/***** Constant Negative One *****/
#define simdia_const_vnegonei   (_mm_set1_epi32((int)(-1)))
#define simdia_const_vnegonef   (_mm_set1_ps((float)(-1.0f)))
#define simdia_const_vnegonelf  (_mm_set1_pd((double)(-1.0)))
387 /* TODO : FIXME - Find a better way to do Rotate in SSE */
388 inline simdia_veci
simdia_vrothi(const simdia_veci
&a
, int s
) { simdia_veci b
; int* a_ptr
= ( int*)(&a
); int* b_ptr
= ( int*)(&b
); s
&= 0x3; b_ptr
[0] = a_ptr
[(0-s
)&0x3]; b_ptr
[1] = a_ptr
[(1-s
)&0x3]; b_ptr
[2] = a_ptr
[(2-s
)&0x3]; b_ptr
[3] = a_ptr
[(3-s
)&0x3]; return b
; }
389 inline simdia_vecf
simdia_vrothf(const simdia_vecf
&a
, int s
) { simdia_vecf b
; float* a_ptr
= ( float*)(&a
); float* b_ptr
= ( float*)(&b
); s
&= 0x3; b_ptr
[0] = a_ptr
[(0-s
)&0x3]; b_ptr
[1] = a_ptr
[(1-s
)&0x3]; b_ptr
[2] = a_ptr
[(2-s
)&0x3]; b_ptr
[3] = a_ptr
[(3-s
)&0x3]; return b
; }
390 inline simdia_veclf
simdia_vrothlf(const simdia_veclf
&a
, int s
) { simdia_veclf b
; double* a_ptr
= (double*)(&a
); double* b_ptr
= (double*)(&b
); s
&= 0x1; b_ptr
[0] = a_ptr
[(0-s
)&0x1]; b_ptr
[1] = a_ptr
[(1-s
)&0x1]; return b
; }
391 inline simdia_veci
simdia_vrotli(const simdia_veci
&a
, int s
) { simdia_veci b
; int* a_ptr
= ( int*)(&a
); int* b_ptr
= ( int*)(&b
); s
&= 0x3; b_ptr
[0] = a_ptr
[(0+s
)&0x3]; b_ptr
[1] = a_ptr
[(1+s
)&0x3]; b_ptr
[2] = a_ptr
[(2+s
)&0x3]; b_ptr
[3] = a_ptr
[(3+s
)&0x3]; return b
; }
392 inline simdia_vecf
simdia_vrotlf(const simdia_vecf
&a
, int s
) { simdia_vecf b
; float* a_ptr
= ( float*)(&a
); float* b_ptr
= ( float*)(&b
); s
&= 0x3; b_ptr
[0] = a_ptr
[(0+s
)&0x3]; b_ptr
[1] = a_ptr
[(1+s
)&0x3]; b_ptr
[2] = a_ptr
[(2+s
)&0x3]; b_ptr
[3] = a_ptr
[(3+s
)&0x3]; return b
; }
393 inline simdia_veclf
simdia_vrotllf(const simdia_veclf
&a
, int s
) { simdia_veclf b
; double* a_ptr
= (double*)(&a
); double* b_ptr
= (double*)(&b
); s
&= 0x1; b_ptr
[0] = a_ptr
[(0+s
)&0x1]; b_ptr
[1] = a_ptr
[(1+s
)&0x1]; return b
; }
395 /***** Addition *****/
/* Element-wise x + y, forwarded to the SSE/SSE2 add intrinsics. */
#define simdia_vaddi(x, y)   (_mm_add_epi32((x), (y)))
#define simdia_vaddf(x, y)   (_mm_add_ps((x), (y)))
#define simdia_vaddlf(x, y)  (_mm_add_pd((x), (y)))

/***** Subtraction *****/
/* Element-wise x - y. */
#define simdia_vsubi(x, y)   (_mm_sub_epi32((x), (y)))
#define simdia_vsubf(x, y)   (_mm_sub_ps((x), (y)))
#define simdia_vsublf(x, y)  (_mm_sub_pd((x), (y)))

/***** Multiplication *****/
/* Element-wise x * y (floating point only; no integer variant is defined here). */
#define simdia_vmulf(x, y)   (_mm_mul_ps((x), (y)))
#define simdia_vmullf(x, y)  (_mm_mul_pd((x), (y)))

/***** Division *****/
/* Element-wise x / y (floating point only). */
#define simdia_vdivf(x, y)   (_mm_div_ps((x), (y)))
#define simdia_vdivlf(x, y)  (_mm_div_pd((x), (y)))
413 /***** Fused Multiply Add *****/
/* Multiply-add, (a * b) + c, composed from the separate multiply and add
 * macros of this section (so the product is rounded before the add).
 * The helpers are referenced by their simdia_-prefixed names, which are the
 * names this section actually defines. */
#define simdia_vmaddf(a, b, c)   (simdia_vaddf(simdia_vmulf((a), (b)), (c)))
#define simdia_vmaddlf(a, b, c)  (simdia_vaddlf(simdia_vmullf((a), (b)), (c)))
417 /***** Reciprocal *****/
/* Per-element reciprocal of a float vector via _mm_rcp_ps, which returns an
 * approximation rather than an exactly rounded 1/x. */
#define simdia_vrecipf(x)  (_mm_rcp_ps((x)))
419 inline simdia_veclf
simdia_vreciplf(const simdia_veclf a
) { simdia_veclf r
; double* a_ptr
= (double*)(&a
); double* r_ptr
= (double*)(&r
); r_ptr
[0] = 1.0f
/ a_ptr
[0]; r_ptr
[1] = 1.0f
/ a_ptr
[1]; return r
; }
421 /***** Square Root *****/
/* Per-element square root, forwarded to the packed sqrt intrinsics. */
#define simdia_vsqrtf(x)   (_mm_sqrt_ps((x)))
#define simdia_vsqrtlf(x)  (_mm_sqrt_pd((x)))
425 /***** Reciprocal Square Root *****/
/* Reciprocal square root.
 * Float: _mm_rsqrt_ps, which returns an approximation.
 * Double: composed as reciprocal(sqrt(a)) from this section's own helpers,
 * referenced by the simdia_-prefixed names this section defines. */
#define simdia_vrsqrtf(a)   (_mm_rsqrt_ps(a))
#define simdia_vrsqrtlf(a)  (simdia_vreciplf(simdia_vsqrtlf(a)))
/* Bitwise NOT: XOR with an all-ones mask.
 * simdia_const_vnegonei is an integer vector (every int is -1, i.e. all bits
 * set), so it is reinterpreted with the cast intrinsics before being passed
 * to _mm_xor_ps / _mm_xor_pd, which only accept float / double vectors.
 * The casts reinterpret bits; no value conversion takes place. */
#define simdia_vnoti(a)   (_mm_xor_si128((a), simdia_const_vnegonei))
#define simdia_vnotf(a)   (_mm_xor_ps((a), _mm_castsi128_ps(simdia_const_vnegonei)))
#define simdia_vnotlf(a)  (_mm_xor_pd((a), _mm_castsi128_pd(simdia_const_vnegonei)))
/* Bitwise OR. */
#define simdia_vori(x, y)     (_mm_or_si128((x), (y)))
#define simdia_vorf(x, y)     (_mm_or_ps((x), (y)))
#define simdia_vorlf(x, y)    (_mm_or_pd((x), (y)))

/* Bitwise NOR: NOT applied to the OR of the operands. */
#define simdia_vnori(x, y)    (simdia_vnoti(simdia_vori((x), (y))))
#define simdia_vnorf(x, y)    (simdia_vnotf(simdia_vorf((x), (y))))
#define simdia_vnorlf(x, y)   (simdia_vnotlf(simdia_vorlf((x), (y))))

/* Bitwise AND. */
#define simdia_vandi(x, y)    (_mm_and_si128((x), (y)))
#define simdia_vandf(x, y)    (_mm_and_ps((x), (y)))
#define simdia_vandlf(x, y)   (_mm_and_pd((x), (y)))

/* Bitwise NAND: NOT applied to the AND of the operands. */
#define simdia_vnandi(x, y)   (simdia_vnoti(simdia_vandi((x), (y))))
#define simdia_vnandf(x, y)   (simdia_vnotf(simdia_vandf((x), (y))))
#define simdia_vnandlf(x, y)  (simdia_vnotlf(simdia_vandlf((x), (y))))

/* Bitwise XOR. */
#define simdia_vxori(x, y)    (_mm_xor_si128((x), (y)))
#define simdia_vxorf(x, y)    (_mm_xor_ps((x), (y)))
#define simdia_vxorlf(x, y)   (_mm_xor_pd((x), (y)))

/* Bitwise NXOR: NOT applied to the XOR of the operands. */
#define simdia_vnxori(x, y)   (simdia_vnoti(simdia_vxori((x), (y))))
#define simdia_vnxorf(x, y)   (simdia_vnotf(simdia_vxorf((x), (y))))
#define simdia_vnxorlf(x, y)  (simdia_vnotlf(simdia_vxorlf((x), (y))))
464 /***** Equal To *****/
/* Every comparison yields a simdia_veci mask: all bits set in an element
 * where the relation holds, all bits clear where it does not. Float and
 * double results are reinterpreted (bits unchanged) with the cast intrinsics
 * rather than a C-style vector cast, which not every compiler accepts. */
#define simdia_vcmpeqi(a, b)   (_mm_cmpeq_epi32((a), (b)))
#define simdia_vcmpeqf(a, b)   (_mm_castps_si128(_mm_cmpeq_ps((a), (b))))
#define simdia_vcmpeqlf(a, b)  (_mm_castpd_si128(_mm_cmpeq_pd((a), (b))))

/***** Greater Than *****/
#define simdia_vcmpgti(a, b)   (_mm_cmpgt_epi32((a), (b)))
#define simdia_vcmpgtf(a, b)   (_mm_castps_si128(_mm_cmpgt_ps((a), (b))))
#define simdia_vcmpgtlf(a, b)  (_mm_castpd_si128(_mm_cmpgt_pd((a), (b))))

/***** Greater Than Or Equal To *****/
/* SSE2 provides no _mm_cmpge_epi32, so the integer form is the bitwise
 * complement of (a < b); each argument is still evaluated exactly once. */
#define simdia_vcmpgei(a, b)   (_mm_xor_si128(_mm_cmplt_epi32((a), (b)), _mm_set1_epi32(-1)))
#define simdia_vcmpgef(a, b)   (_mm_castps_si128(_mm_cmpge_ps((a), (b))))
#define simdia_vcmpgelf(a, b)  (_mm_castpd_si128(_mm_cmpge_pd((a), (b))))

/***** Less Than *****/
#define simdia_vcmplti(a, b)   (_mm_cmplt_epi32((a), (b)))
#define simdia_vcmpltf(a, b)   (_mm_castps_si128(_mm_cmplt_ps((a), (b))))
#define simdia_vcmpltlf(a, b)  (_mm_castpd_si128(_mm_cmplt_pd((a), (b))))

/***** Less Than Or Equal To *****/
/* SSE2 provides no _mm_cmple_epi32 either: complement of (a > b). */
#define simdia_vcmplei(a, b)   (_mm_xor_si128(_mm_cmpgt_epi32((a), (b)), _mm_set1_epi32(-1)))
#define simdia_vcmplef(a, b)   (_mm_castps_si128(_mm_cmple_ps((a), (b))))
#define simdia_vcmplelf(a, b)  (_mm_castpd_si128(_mm_cmple_pd((a), (b))))
490 /*******************************************************************************
491 *******************************************************************************
492 ***** SPE SIMD Instructions
493 *******************************************************************************
494 *******************************************************************************/
495 /* TODO | FIXME : Find a more general check for this (this is Charm++ specific) */
496 #elif (CMK_CELL_SPE != 0) && (!(SIMDIA_FORCE_NO_SPE_SIMD))
498 /***** Data Types *****/
499 typedef vector
signed int simdia_veci
;
500 typedef vector
float simdia_vecf
;
501 typedef vector
double simdia_veclf
;
/* Insert: (vec, scalar, idx) is forwarded to spu_insert, which takes the
 * scalar first and the vector second. */
#define simdia_vinserti(vec, scalar, idx)   (spu_insert((scalar), (vec), (idx)))
#define simdia_vinsertf(vec, scalar, idx)   (spu_insert((scalar), (vec), (idx)))
#define simdia_vinsertlf(vec, scalar, idx)  (spu_insert((scalar), (vec), (idx)))

/***** Extract *****/
/* Extract: (vec, idx) is forwarded unchanged to spu_extract. */
#define simdia_vextracti(vec, idx)   (spu_extract((vec), (idx)))
#define simdia_vextractf(vec, idx)   (spu_extract((vec), (idx)))
#define simdia_vextractlf(vec, idx)  (spu_extract((vec), (idx)))
/* Set: cast the scalar to the element type and pass it to spu_splats. */
#define simdia_vseti(x)   (spu_splats((int)(x)))
#define simdia_vsetf(x)   (spu_splats((float)(x)))
#define simdia_vsetlf(x)  (spu_splats((double)(x)))
518 /***** Constant Zero *****/
/* Constant vectors, each built by passing a literal to this section's set
 * macros. The set macros are referenced by their simdia_-prefixed names,
 * which are the names this section defines. */
#define simdia_const_vzeroi     (simdia_vseti(0))
#define simdia_const_vzerof     (simdia_vsetf(0.0f))
#define simdia_const_vzerolf    (simdia_vsetlf(0.0))

/***** Constant One *****/
#define simdia_const_vonei      (simdia_vseti(1))
#define simdia_const_vonef      (simdia_vsetf(1.0f))
#define simdia_const_vonelf     (simdia_vsetlf(1.0))

/***** Constant Two *****/
#define simdia_const_vtwoi      (simdia_vseti(2))
#define simdia_const_vtwof      (simdia_vsetf(2.0f))
#define simdia_const_vtwolf     (simdia_vsetlf(2.0))

/***** Constant Negative One *****/
#define simdia_const_vnegonei   (simdia_vseti(-1))
#define simdia_const_vnegonef   (simdia_vsetf(-1.0f))
#define simdia_const_vnegonelf  (simdia_vsetlf(-1.0))
/* Rotate: all six forms call spu_rlqwbyte with a byte count derived from the
 * element count n. For 4-byte elements n is masked to 0..3 and scaled by 4;
 * the "h" forms pass 16 minus that amount. For 8-byte elements n is masked
 * to 0..1 and scaled by 8 in both directions. */
#define simdia_vrothi(v, n)   (spu_rlqwbyte((v), (16 - (((n) & 3) << 2))))
#define simdia_vrothf(v, n)   (spu_rlqwbyte((v), (16 - (((n) & 3) << 2))))
#define simdia_vrothlf(v, n)  (spu_rlqwbyte((v), (((n) & 1) << 3)))
#define simdia_vrotli(v, n)   (spu_rlqwbyte((v), (((n) & 3) << 2)))
#define simdia_vrotlf(v, n)   (spu_rlqwbyte((v), (((n) & 3) << 2)))
#define simdia_vrotllf(v, n)  (spu_rlqwbyte((v), (((n) & 1) << 3)))
546 /***** Addition *****/
/* Addition: forwarded to the type-generic spu_add. */
#define simdia_vaddi(x, y)   (spu_add((x), (y)))
#define simdia_vaddf(x, y)   (spu_add((x), (y)))
#define simdia_vaddlf(x, y)  (spu_add((x), (y)))

/***** Subtraction *****/
#define simdia_vsubi(x, y)   (spu_sub((x), (y)))
#define simdia_vsubf(x, y)   (spu_sub((x), (y)))
#define simdia_vsublf(x, y)  (spu_sub((x), (y)))

/***** Multiplication *****/
#define simdia_vmulf(x, y)   (spu_mul((x), (y)))
#define simdia_vmullf(x, y)  (spu_mul((x), (y)))
560 /***** Division *****/
/* Float division expressed as x * spu_re(y), i.e. a multiply by the value
 * spu_re returns for the divisor. */
#define simdia_vdivf(x, y)  (spu_mul((x), spu_re((y))))
562 inline simdia_veclf
simdia_vdivlf(const simdia_veclf a
, const simdia_veclf b
) { simdia_veclf r
= { 0.0, 0.0 }; spu_insert((spu_extract(a
, 0) / spu_extract(b
, 0)), r
, 0); spu_insert((spu_extract(a
, 1) / spu_extract(b
, 1)), r
, 1); return r
; }
564 /***** Fused Multiply Add *****/
/* Multiply-add: forwarded to spu_madd with the arguments in the same order. */
#define simdia_vmaddf(x, y, z)   (spu_madd((x), (y), (z)))
#define simdia_vmaddlf(x, y, z)  (spu_madd((x), (y), (z)))

/***** Reciprocal *****/
/* Float reciprocal: forwarded to spu_re. */
#define simdia_vrecipf(x)  (spu_re((x)))
570 inline simdia_veclf
simdia_vreciplf(const simdia_veclf a
, const simdia_veclf b
) { simdia_veclf r
= { 0.0, 0.0 }; spu_insert((1.0f
/ spu_extract(a
, 0)), r
, 0); spu_insert((1.0f
/ spu_extract(a
, 1)), r
, 1); return r
; }
572 /***** Square Root *****/
/* Float square root expressed as spu_re(spu_rsqrte(x)), i.e. the reciprocal
 * of the reciprocal-square-root value. */
#define simdia_vsqrtf(x)  (spu_re(spu_rsqrte((x))))
574 inline simdia_veclf
simdia_vsqrtlf(const simdia_veclf a
, const simdia_veclf b
) { simdia_veclf r
= { 0.0, 0.0 }; spu_insert(sqrt(spu_extract(a
, 0)), r
, 0); spu_insert(sqrt(spu_extract(a
, 1)), r
, 1); return r
; }
576 /***** Reciprocal Square Root *****/
/* Float reciprocal square root: forwarded to spu_rsqrte. */
#define simdia_vrsqrtf(x)  (spu_rsqrte((x)))
578 inline simdia_veclf
simdia_vrsqrtlf(const simdia_veclf a
, const simdia_veclf b
) { simdia_veclf r
= { 0.0, 0.0 }; spu_insert((1.0f
/ sqrt(spu_extract(a
, 0))), r
, 0); spu_insert((1.0f
/ sqrt(spu_extract(a
, 1))), r
, 1); return r
; }
/* Bitwise NOT, written as NOR of the operand with itself (the macro argument
 * therefore appears twice in the expansion). */
#define simdia_vnoti(x)       (spu_nor((x), (x)))
#define simdia_vnotf(x)       (spu_nor((x), (x)))
#define simdia_vnotlf(x)      (spu_nor((x), (x)))

/* Bitwise OR. */
#define simdia_vori(x, y)     (spu_or((x), (y)))
#define simdia_vorf(x, y)     (spu_or((x), (y)))
#define simdia_vorlf(x, y)    (spu_or((x), (y)))

/* Bitwise NOR. */
#define simdia_vnori(x, y)    (spu_nor((x), (y)))
#define simdia_vnorf(x, y)    (spu_nor((x), (y)))
#define simdia_vnorlf(x, y)   (spu_nor((x), (y)))

/* Bitwise AND. */
#define simdia_vandi(x, y)    (spu_and((x), (y)))
#define simdia_vandf(x, y)    (spu_and((x), (y)))
#define simdia_vandlf(x, y)   (spu_and((x), (y)))

/* Bitwise NAND. */
#define simdia_vnandi(x, y)   (spu_nand((x), (y)))
#define simdia_vnandf(x, y)   (spu_nand((x), (y)))
#define simdia_vnandlf(x, y)  (spu_nand((x), (y)))

/* Bitwise XOR. */
#define simdia_vxori(x, y)    (spu_xor((x), (y)))
#define simdia_vxorf(x, y)    (spu_xor((x), (y)))
#define simdia_vxorlf(x, y)   (spu_xor((x), (y)))

/* Bitwise NXOR: NOT applied to the XOR of the operands. */
#define simdia_vnxori(x, y)   (simdia_vnoti(simdia_vxori((x), (y))))
#define simdia_vnxorf(x, y)   (simdia_vnotf(simdia_vxorf((x), (y))))
#define simdia_vnxorlf(x, y)  (simdia_vnotlf(simdia_vxorlf((x), (y))))
615 /***** Equal To *****/
/* Equality: the spu_cmpeq result is cast to simdia_veci. */
#define simdia_vcmpeqi(x, y)   ((simdia_veci)(spu_cmpeq((x), (y))))
#define simdia_vcmpeqf(x, y)   ((simdia_veci)(spu_cmpeq((x), (y))))
#define simdia_vcmpeqlf(x, y)  ((simdia_veci)(spu_cmpeq((x), (y))))

/***** Greater Than *****/
/* Greater-than: the spu_cmpgt result is cast to simdia_veci. */
#define simdia_vcmpgti(x, y)   ((simdia_veci)(spu_cmpgt((x), (y))))
#define simdia_vcmpgtf(x, y)   ((simdia_veci)(spu_cmpgt((x), (y))))
#define simdia_vcmpgtlf(x, y)  ((simdia_veci)(spu_cmpgt((x), (y))))
625 // NOTE : Try to create versions of >= and < that do not double evaluate their inputs
627 /***** Greater Than or Equal To *****/
/* >= is (== OR >). Both arguments appear twice in the expansion, so they are
 * evaluated twice. */
#define simdia_vcmpgei(x, y)   (spu_or(simdia_vcmpeqi((x), (y)), simdia_vcmpgti((x), (y))))
#define simdia_vcmpgef(x, y)   (spu_or(simdia_vcmpeqf((x), (y)), simdia_vcmpgtf((x), (y))))
#define simdia_vcmpgelf(x, y)  (spu_or(simdia_vcmpeqlf((x), (y)), simdia_vcmpgtlf((x), (y))))

/***** Less Than *****/
/* < is NOT(> OR ==). Both arguments are evaluated twice here as well. */
#define simdia_vcmplti(x, y)   (spu_nor(simdia_vcmpgti((x), (y)), simdia_vcmpeqi((x), (y))))
#define simdia_vcmpltf(x, y)   (spu_nor(simdia_vcmpgtf((x), (y)), simdia_vcmpeqf((x), (y))))
#define simdia_vcmpltlf(x, y)  (spu_nor(simdia_vcmpgtlf((x), (y)), simdia_vcmpeqlf((x), (y))))
637 /***** Less Than or Equal To *****/
/* <= is NOT(>), written as NOR of the greater-than mask with a zero vector so
 * each argument is evaluated only once.
 * The simdia_vcmpgt* macros all produce a simdia_veci, so the zero operand is
 * the integer zero vector in every variant; this keeps both spu_nor operands
 * the same type (the float and double variants previously paired the integer
 * mask with a float / double zero vector). */
#define simdia_vcmplei(a, b)   (spu_nor(simdia_vcmpgti((a), (b)), simdia_const_vzeroi))
#define simdia_vcmplef(a, b)   (spu_nor(simdia_vcmpgtf((a), (b)), simdia_const_vzeroi))
#define simdia_vcmplelf(a, b)  (spu_nor(simdia_vcmpgtlf((a), (b)), simdia_const_vzeroi))
643 /*******************************************************************************
644 *******************************************************************************
646 *******************************************************************************
647 *******************************************************************************/
648 #elif defined(__VEC__) && (!(SIMDIA_FORCE_NO_ALTIVEC))
650 /***** Data Types *****/
651 typedef vector
signed int simdia_veci
;
652 typedef vector
float simdia_vecf
;
/** POWER7 VSX supports 64-bit operands and also includes VMX support,
 * which means that intrinsics such as vec_div, vec_insert, etc. work for
 * ints, floats, and doubles. These intrinsics also require a sufficiently
 * recent version of the compiler on POWER7. If you are building on a
 * POWER7 system with an older compiler, ask whoever maintains the tool
 * chain to upgrade it, since this code path will not build correctly
 * without that support.
662 typedef vector
double simdia_veclf
;
664 typedef __simdia_veclf simdia_veclf
;
668 /* TODO | FIXME - Try to make these functions not reference memory
669 so values stay in registers */
671 // swap argument order
/* Insert scalar b into vector a at index c.
 * vec_insert takes (scalar, vector, index), so the first two arguments are
 * swapped relative to this macro's own parameter order. All three arguments
 * must sit inside the single vec_insert(...) call; closing the call after
 * the first argument would turn the rest into comma-operator operands. */
#define simdia_vinserti(a, b, c)   (vec_insert((b), (a), (c)))
#define simdia_vinsertf(a, b, c)   (vec_insert((b), (a), (c)))
#define simdia_vinsertlf(a, b, c)  (vec_insert((b), (a), (c)))
676 inline simdia_veci
simdia_vinserti( simdia_veci v
, const int s
, const int i
) { simdia_veci r
= v
; int* rPtr
= ( int*)(&r
); rPtr
[i
] = s
; return r
; }
677 inline simdia_vecf
simdia_vinsertf( simdia_vecf v
, const float s
, const int i
) { simdia_vecf r
= v
; float* rPtr
= ( float*)(&r
); rPtr
[i
] = s
; return r
; }
678 #define simdia_vinsertlf __simdia_vinsertlf
681 /***** Extract *****/
/* Extract: (vec, idx) is forwarded unchanged to vec_extract. */
#define simdia_vextracti(vec, idx)   (vec_extract((vec), (idx)))
#define simdia_vextractf(vec, idx)   (vec_extract((vec), (idx)))
#define simdia_vextractlf(vec, idx)  (vec_extract((vec), (idx)))
687 /* TODO | FIXME - Try to make these functions not reference memory so values stay in registers */
688 inline int simdia_vextracti( simdia_veci v
, const int i
) { int* vPtr
= ( int*)(&v
); return vPtr
[i
]; }
689 inline float simdia_vextractf( simdia_vecf v
, const int i
) { float* vPtr
= ( float*)(&v
); return vPtr
[i
]; }
690 #define simdia_vextractlf __simdia_vextractlf
/* Set: broadcast a scalar to every element of the vector.
 * vec_splats replicates its argument into all elements, whereas
 * vec_promote(a, 0) only defines element 0 and leaves the others undefined,
 * which does not match what the other back ends' set macros provide.
 * The scalar is cast first so the element type is chosen by the macro name
 * rather than by the type of the argument expression, as in the SSE and SPE
 * versions. */
#define simdia_vseti(a)   (vec_splats((int)(a)))
#define simdia_vsetf(a)   (vec_splats((float)(a)))
#define simdia_vsetlf(a)  (vec_splats((double)(a)))
699 /* TODO : FIXME - There must be a better way to do this, but it
700 seems the only way to convert scalar to vector is to go through
703 EJB: converting between scalar and vector is the sort of thing you
704 want to avoid doing on altivec. Better to rethink and find a way to
705 stay in the vector engine if at all possible.
708 inline simdia_veci
simdia_vseti(const int a
) { __simdia_veci r
; r
.v0
= a
; return vec_splat(*((simdia_veci
*)(&r
)), 0); }
709 inline simdia_vecf
simdia_vsetf(const float a
) { __simdia_vecf r
; r
.v0
= a
; return vec_splat(*((simdia_vecf
*)(&r
)), 0); }
710 #define simdia_vsetlf __simdia_vsetlf
712 /* NOTE: Declare one for unsigned char vector also (required by rotate functions) */
713 inline vector
unsigned char simdia_vset16uc(const unsigned char c
) { vector
unsigned char r
__attribute__((aligned(16))); ((unsigned char*)(&r
))[0] = c
; return vec_splat(r
, 0); }
715 /***** Constant Zero *****/
/* Pattern shared by the four constant groups below: the int constant is
 * vec_splat_s32(k); the float constant converts that int vector with
 * vec_ctf(..., 0); the double constant has two definitions back to back
 * (vec_splats(k), then the __simdia_const_* name). The preprocessor
 * conditional that presumably selects between the two is not visible here.
 * NOTE(review): vec_splats(k) is given an int literal although the result is
 * used as a double vector; confirm the intended element type. */
716 #define simdia_const_vzeroi (vec_splat_s32(0))
717 #define simdia_const_vzerof (vec_ctf(vec_splat_s32(0), 0))
719 #define simdia_const_vzerolf (vec_splats(0))
721 #define simdia_const_vzerolf (__simdia_const_vzerolf)
724 /***** Constant One *****/
725 #define simdia_const_vonei (vec_splat_s32(1))
726 #define simdia_const_vonef (vec_ctf(vec_splat_s32(1), 0))
728 #define simdia_const_vonelf (vec_splats(1))
730 #define simdia_const_vonelf (__simdia_const_vonelf)
733 /***** Constant Two *****/
734 #define simdia_const_vtwoi (vec_splat_s32(2))
735 #define simdia_const_vtwof (vec_ctf(vec_splat_s32(2), 0))
737 #define simdia_const_vtwolf (vec_splats(2))
739 #define simdia_const_vtwolf (__simdia_const_vtwolf)
742 /***** Constant Negative One *****/
743 #define simdia_const_vnegonei (vec_splat_s32(-1))
744 #define simdia_const_vnegonef (vec_ctf(vec_splat_s32(-1), 0))
746 #define simdia_const_vnegonelf (vec_splats(-1))
/* Fallback double-precision "negative one" constant. It names
 * __simdia_const_vnegonelf, following the same pattern as the zero/one/two
 * fallbacks and the generic mapping table; the previous target,
 * __const_veclf, does not follow that naming. */
#define simdia_const_vnegonelf (__simdia_const_vnegonelf)
/* Rotate the 16-byte vector a by s bytes (s is reduced modulo 16).
 * The result ORs two octet shifts: one by s bytes in the primary direction
 * and one by (16 - s) bytes in the opposite direction. Shift counts are
 * passed as bit counts (bytes << 3) in an unsigned char vector built by
 * simdia_vset16uc; both shift operands now use that helper (the second one
 * previously used the misspelled name simdia_set16uc).
 * Note that both a and s are expanded twice. */
#define __simdia_vrotlbytes(a, s) (vec_or(vec_slo((a), simdia_vset16uc(((s) & 0xf) << 3)), vec_sro((a), simdia_vset16uc((16 - ((s) & 0xf)) << 3))))
#define __simdia_vrotrbytes(a, s) (vec_or(vec_sro((a), simdia_vset16uc(((s) & 0xf) << 3)), vec_slo((a), simdia_vset16uc((16 - ((s) & 0xf)) << 3))))
/* Lane rotates expressed as byte rotates: the lane count s is scaled by 4
 * (s << 2) for the int/float variants and by 8 (s << 3) for the double
 * variants. The "l" names use __simdia_vrotlbytes, the "h" names use
 * __simdia_vrotrbytes. */
754 #define simdia_vrotli(a, s) __simdia_vrotlbytes((a), ((s) << 2))
755 #define simdia_vrotlf(a, s) __simdia_vrotlbytes((a), ((s) << 2))
756 #define simdia_vrotllf(a, s) __simdia_vrotlbytes((a), ((s) << 3))
757 #define simdia_vrothi(a, s) __simdia_vrotrbytes((a), ((s) << 2))
758 #define simdia_vrothf(a, s) __simdia_vrotrbytes((a), ((s) << 2))
759 #define simdia_vrothlf(a, s) __simdia_vrotrbytes((a), ((s) << 3))
761 /***** Addition *****/
/* int/float add map to vec_add. The double variant has two definitions back
 * to back (vec_add, then the __simdia_vaddlf name); the conditional that
 * presumably selects between them is not visible in this excerpt. The same
 * layout repeats for the operations that follow. */
762 #define simdia_vaddi(a, b) (vec_add((a), (b)))
763 #define simdia_vaddf(a, b) (vec_add((a), (b)))
765 #define simdia_vaddlf(a, b) (vec_add((a), (b)))
767 #define simdia_vaddlf __simdia_vaddlf
770 /***** Subtraction *****/
771 #define simdia_vsubi(a, b) (vec_sub((a), (b)))
772 #define simdia_vsubf(a, b) (vec_sub((a), (b)))
774 #define simdia_vsublf(a, b) (vec_sub((a), (b)))
776 #define simdia_vsublf __simdia_vsublf
779 /***** Multiplication *****/
780 // NOTE: Try to find a way to do this without double evaluating a
/* First pair: direct vec_mul. Second pair: float multiply written as a
 * multiply-add whose addend is vec_xor(a, a) (all bits zero), which is why
 * the argument a is expanded more than once. */
782 #define simdia_vmulf(a, b) (vec_mul((a), (b)))
783 #define simdia_vmullf(a, b) (vec_mul((a), (b)))
785 #define simdia_vmulf(a, b) (vec_madd((a), (b), vec_xor((a), (a))))
786 #define simdia_vmullf __simdia_vmullf
789 /***** Division *****/
/* Intrinsic-based divide: element-wise a / b through vec_div.
 * Both operands must sit inside the one vec_div(...) call: the previous
 * expansion closed the call after the first argument, which left b as a
 * comma-operator operand. */
#define simdia_vdivf(a, b) (vec_div((a), (b)))
#define simdia_vdivlf(a, b) (vec_div((a), (b)))
/* Alternative float divide: a multiplied by vec_re(b). vec_re is a reciprocal
 * estimate, so the quotient is approximate. The double variant is aliased to
 * the __simdia_vdivlf name. */
794 #define simdia_vdivf(a, b) (simdia_vmulf((a), vec_re(b)))
795 #define simdia_vdivlf __simdia_vdivlf
798 /***** Fused Multiply Add *****/
/* (a * b) + c through vec_madd; the double variant is defined twice
 * (intrinsic form, then the __simdia_vmaddlf name). */
799 #define simdia_vmaddf(a, b, c) (vec_madd((a), (b), (c)))
801 #define simdia_vmaddlf(a, b, c) (vec_madd((a), (b), (c)))
803 #define simdia_vmaddlf __simdia_vmaddlf
806 /***** Reciprocal *****/
/* vec_re is an estimate instruction; results are approximate. */
807 #define simdia_vrecipf(a) (vec_re(a))
809 #define simdia_vreciplf(a) (vec_re(a))
811 #define simdia_vreciplf __simdia_vreciplf
814 /***** Square Root *****/
/* The float square root is composed from two estimates: the reciprocal of the
 * reciprocal-square-root estimate. The double variant uses vec_sqrt or the
 * __simdia_vsqrtlf name. */
815 #define simdia_vsqrtf(a) (vec_re(vec_rsqrte(a)))
817 #define simdia_vsqrtlf(a) (vec_sqrt(a))
819 #define simdia_vsqrtlf __simdia_vsqrtlf
822 /***** Reciprocal Square Root *****/
/* vec_rsqrte is an estimate instruction; results are approximate. */
823 #define simdia_vrsqrtf(a) (vec_rsqrte(a))
825 #define simdia_vrsqrtlf(a) (vec_rsqrte(a))
827 #define simdia_vrsqrtlf __simdia_vrsqrtlf
/* "Not" has two sets of definitions back to back; the conditional that
 * presumably selects between them is not visible in this excerpt.
 * NOTE(review): the first set uses vec_neg, which the vector built-in
 * references describe as arithmetic negation rather than a bitwise
 * complement; confirm this is the intended meaning of "not".
 * NOTE(review): in the second set, simdia_vnotf XORs a float vector with the
 * integer all-ones constant simdia_const_vnegonei; confirm the operand types
 * are accepted by vec_xor. */
832 #define simdia_vnoti(a) (vec_neg(a))
833 #define simdia_vnotf(a) (vec_neg(a))
834 #define simdia_vnotlf(a) (vec_neg(a))
836 #define simdia_vnoti(a) (vec_xor((a), simdia_const_vnegonei))
837 #define simdia_vnotf(a) (vec_xor((a), simdia_const_vnegonei))
838 #define simdia_vnotlf __simdia_vnotlf
/* Bitwise OR via vec_or; the double variant is defined twice (intrinsic form,
 * then the __simdia_vorlf name). */
842 #define simdia_vori(a, b) (vec_or((a), (b)))
843 #define simdia_vorf(a, b) (vec_or((a), (b)))
845 #define simdia_vorlf(a, b) (vec_or((a), (b)))
847 #define simdia_vorlf __simdia_vorlf
/* Bitwise NOR via vec_nor; same layout as OR. */
851 #define simdia_vnori(a, b) (vec_nor((a), (b)))
852 #define simdia_vnorf(a, b) (vec_nor((a), (b)))
854 #define simdia_vnorlf(a, b) (vec_nor((a), (b)))
856 #define simdia_vnorlf __simdia_vnorlf
/* Bitwise AND via vec_and; same layout as OR. */
860 #define simdia_vandi(a, b) (vec_and((a), (b)))
861 #define simdia_vandf(a, b) (vec_and((a), (b)))
863 #define simdia_vandlf(a, b) (vec_and((a), (b)))
865 #define simdia_vandlf __simdia_vandlf
/* Bitwise NAND, composed as NOT(AND(a, b)) from the same-typed helpers.
 * The double variant now uses the double helpers (simdia_vnotlf and
 * simdia_vandlf) instead of borrowing the float ones, so each variant only
 * depends on helpers of its own element type. */
#define simdia_vnandi(a, b) (simdia_vnoti(simdia_vandi((a), (b))))
#define simdia_vnandf(a, b) (simdia_vnotf(simdia_vandf((a), (b))))
#define simdia_vnandlf(a, b) (simdia_vnotlf(simdia_vandlf((a), (b))))
/* Alternative double-precision NAND: aliased to the __simdia_vnandlf name. */
874 #define simdia_vnandlf __simdia_vnandlf
/* Bitwise XOR via vec_xor; the double variant is defined twice (intrinsic
 * form, then the __simdia_vxorlf name). */
878 #define simdia_vxori(a, b) (vec_xor((a), (b)))
879 #define simdia_vxorf(a, b) (vec_xor((a), (b)))
881 #define simdia_vxorlf(a, b) (vec_xor((a), (b)))
883 #define simdia_vxorlf __simdia_vxorlf
/* Bitwise NXOR, composed as NOT(XOR(a, b)) from the same-typed helpers.
 * The double variant now pairs simdia_vnotlf with simdia_vxorlf; it used to
 * mix the double NOT with the float XOR. */
#define simdia_vnxori(a, b) (simdia_vnoti(simdia_vxori((a), (b))))
#define simdia_vnxorf(a, b) (simdia_vnotf(simdia_vxorf((a), (b))))
#define simdia_vnxorlf(a, b) (simdia_vnotlf(simdia_vxorlf((a), (b))))
/* Alternative double-precision NXOR: aliased to the __simdia_vnxorlf name. */
892 #define simdia_vnxorlf __simdia_vnxorlf
895 /***** Equal To *****/
/* Every comparison below casts the intrinsic's result to simdia_veci, so all
 * variants (int, float, double) yield the int vector type. Each double variant
 * is defined twice (intrinsic form, then the __simdia_* name); the
 * conditional that presumably selects between them is not visible here. */
896 #define simdia_vcmpeqi(a, b) ((simdia_veci)(vec_cmpeq((a), (b))))
897 #define simdia_vcmpeqf(a, b) ((simdia_veci)(vec_cmpeq((a), (b))))
899 #define simdia_vcmpeqlf(a, b) ((simdia_veci)(vec_cmpeq((a), (b))))
901 #define simdia_vcmpeqlf __simdia_vcmpeqlf
904 /***** Greater Than *****/
905 #define simdia_vcmpgti(a, b) ((simdia_veci)(vec_cmpgt((a), (b))))
906 #define simdia_vcmpgtf(a, b) ((simdia_veci)(vec_cmpgt((a), (b))))
908 #define simdia_vcmpgtlf(a, b) ((simdia_veci)(vec_cmpgt((a), (b))))
910 #define simdia_vcmpgtlf __simdia_vcmpgtlf
913 /***** Greater Than Or Equal To *****/
914 #define simdia_vcmpgei(a, b) ((simdia_veci)(vec_cmpge((a), (b))))
915 #define simdia_vcmpgef(a, b) ((simdia_veci)(vec_cmpge((a), (b))))
917 #define simdia_vcmpgelf(a, b) ((simdia_veci)(vec_cmpge((a), (b))))
919 #define simdia_vcmpgelf __simdia_vcmpgelf
922 /***** Less Than *****/
923 #define simdia_vcmplti(a, b) ((simdia_veci)(vec_cmplt((a), (b))))
924 #define simdia_vcmpltf(a, b) ((simdia_veci)(vec_cmplt((a), (b))))
926 #define simdia_vcmpltlf(a, b) ((simdia_veci)(vec_cmplt((a), (b))))
928 #define simdia_vcmpltlf __simdia_vcmpltlf
931 /***** Less Than Or Equal To *****/
932 #define simdia_vcmplei(a, b) ((simdia_veci)(vec_cmple((a), (b))))
933 #define simdia_vcmplef(a, b) ((simdia_veci)(vec_cmple((a), (b))))
935 #define simdia_vcmplelf(a, b) ((simdia_veci)(vec_cmple((a), (b))))
936 // NOTE: vec_cmple not listed in Calin's wiki page of builtins for
937 // PWR7, but has a header definition in the compiler
939 #define simdia_vcmplelf __simdia_vcmplelf
942 /*******************************************************************************
943 *******************************************************************************
944 ***** Mapping to Generic C Implementation
945 *******************************************************************************
946 *******************************************************************************/
949 /***** Data Types *****/
/* Generic mapping: the public vector type names are aliases of the generic
 * __simdia_vec* struct types (int, float and double element variants). */
950 typedef __simdia_veci simdia_veci
;
951 typedef __simdia_vecf simdia_vecf
;
952 typedef __simdia_veclf simdia_veclf
;
/* Generic mapping table: every public simdia_* name below is a plain
 * object-like alias of the identically named __simdia_* symbol. No arguments
 * are wrapped or re-parenthesised, so a use of the public name expands to the
 * generic name followed by the caller's own argument list. */
/***** Insert *****/
955 #define simdia_vinserti __simdia_vinserti
956 #define simdia_vinsertf __simdia_vinsertf
957 #define simdia_vinsertlf __simdia_vinsertlf
959 /***** Extract *****/
960 #define simdia_vextracti __simdia_vextracti
961 #define simdia_vextractf __simdia_vextractf
962 #define simdia_vextractlf __simdia_vextractlf
/***** Set *****/
965 #define simdia_vseti __simdia_vseti
966 #define simdia_vsetf __simdia_vsetf
967 #define simdia_vsetlf __simdia_vsetlf
969 /***** Constant Zero *****/
970 #define simdia_const_vzeroi __simdia_const_vzeroi
971 #define simdia_const_vzerof __simdia_const_vzerof
972 #define simdia_const_vzerolf __simdia_const_vzerolf
974 /***** Constant One *****/
975 #define simdia_const_vonei __simdia_const_vonei
976 #define simdia_const_vonef __simdia_const_vonef
977 #define simdia_const_vonelf __simdia_const_vonelf
979 /***** Constant Two *****/
980 #define simdia_const_vtwoi __simdia_const_vtwoi
981 #define simdia_const_vtwof __simdia_const_vtwof
982 #define simdia_const_vtwolf __simdia_const_vtwolf
984 /***** Constant Negative One *****/
985 #define simdia_const_vnegonei __simdia_const_vnegonei
986 #define simdia_const_vnegonef __simdia_const_vnegonef
987 #define simdia_const_vnegonelf __simdia_const_vnegonelf
/***** Rotate *****/
990 #define simdia_vrothi __simdia_vrothi
991 #define simdia_vrothf __simdia_vrothf
992 #define simdia_vrothlf __simdia_vrothlf
993 #define simdia_vrotli __simdia_vrotli
994 #define simdia_vrotlf __simdia_vrotlf
995 #define simdia_vrotllf __simdia_vrotllf
997 /***** Addition *****/
998 #define simdia_vaddi __simdia_vaddi
999 #define simdia_vaddf __simdia_vaddf
1000 #define simdia_vaddlf __simdia_vaddlf
1002 /***** Subtraction *****/
1003 #define simdia_vsubi __simdia_vsubi
1004 #define simdia_vsubf __simdia_vsubf
1005 #define simdia_vsublf __simdia_vsublf
1007 /***** Multiplication *****/
1008 #define simdia_vmulf __simdia_vmulf
1009 #define simdia_vmullf __simdia_vmullf
1011 /***** Division *****/
1012 #define simdia_vdivf __simdia_vdivf
1013 #define simdia_vdivlf __simdia_vdivlf
1015 /***** Fused Multiply Add *****/
1016 #define simdia_vmaddf __simdia_vmaddf
1017 #define simdia_vmaddlf __simdia_vmaddlf
1019 /***** Reciprocal *****/
1020 #define simdia_vrecipf __simdia_vrecipf
1021 #define simdia_vreciplf __simdia_vreciplf
1023 /***** Square Root *****/
1024 #define simdia_vsqrtf __simdia_vsqrtf
1025 #define simdia_vsqrtlf __simdia_vsqrtlf
1027 /***** Reciprocal Square Root *****/
1028 #define simdia_vrsqrtf __simdia_vrsqrtf
1029 #define simdia_vrsqrtlf __simdia_vrsqrtlf
/***** Not *****/
1032 #define simdia_vnoti __simdia_vnoti
1033 #define simdia_vnotf __simdia_vnotf
1034 #define simdia_vnotlf __simdia_vnotlf
/***** Or *****/
1037 #define simdia_vori __simdia_vori
1038 #define simdia_vorf __simdia_vorf
1039 #define simdia_vorlf __simdia_vorlf
/***** Nor *****/
1042 #define simdia_vnori __simdia_vnori
1043 #define simdia_vnorf __simdia_vnorf
1044 #define simdia_vnorlf __simdia_vnorlf
/***** And *****/
1047 #define simdia_vandi __simdia_vandi
1048 #define simdia_vandf __simdia_vandf
1049 #define simdia_vandlf __simdia_vandlf
/***** Nand *****/
1052 #define simdia_vnandi __simdia_vnandi
1053 #define simdia_vnandf __simdia_vnandf
1054 #define simdia_vnandlf __simdia_vnandlf
/***** Xor *****/
1057 #define simdia_vxori __simdia_vxori
1058 #define simdia_vxorf __simdia_vxorf
1059 #define simdia_vxorlf __simdia_vxorlf
/***** Nxor *****/
1062 #define simdia_vnxori __simdia_vnxori
1063 #define simdia_vnxorf __simdia_vnxorf
1064 #define simdia_vnxorlf __simdia_vnxorlf
1066 /***** Equal To *****/
1067 #define simdia_vcmpeqi __simdia_vcmpeqi
1068 #define simdia_vcmpeqf __simdia_vcmpeqf
1069 #define simdia_vcmpeqlf __simdia_vcmpeqlf
1071 /***** Greater Than *****/
1072 #define simdia_vcmpgti __simdia_vcmpgti
1073 #define simdia_vcmpgtf __simdia_vcmpgtf
1074 #define simdia_vcmpgtlf __simdia_vcmpgtlf
1076 /***** Greater Than Or Equal To *****/
1077 #define simdia_vcmpgei __simdia_vcmpgei
1078 #define simdia_vcmpgef __simdia_vcmpgef
1079 #define simdia_vcmpgelf __simdia_vcmpgelf
1081 /***** Less Than *****/
1082 #define simdia_vcmplti __simdia_vcmplti
1083 #define simdia_vcmpltf __simdia_vcmpltf
1084 #define simdia_vcmpltlf __simdia_vcmpltlf
1086 /***** Less Than Or Equal To *****/
1087 #define simdia_vcmplei __simdia_vcmplei
1088 #define simdia_vcmplef __simdia_vcmplef
1089 #define simdia_vcmplelf __simdia_vcmplelf
1095 /*******************************************************************************
1096 *******************************************************************************
1097 ***** Shared Combinations
1098 *******************************************************************************
1099 *******************************************************************************/
1101 /* NOTE: If any architecture specific implementation can do any of these
1102 * operations faster, then move them up to the architecture specific areas and
1103 * make individual definitions. This area is just meant to declare commonly
1104 * used combinations so that they don't have to be repeated many times over.
1107 /***** Number of Elements per Vector Type *****/
/* Lane counts derived from sizeof, so they follow whichever vector types the
 * selected implementation defined. */
1108 #define simdia_veci_numElems (sizeof( simdia_veci)/sizeof( int))
1109 #define simdia_vecf_numElems (sizeof( simdia_vecf)/sizeof( float))
1110 #define simdia_veclf_numElems (sizeof(simdia_veclf)/sizeof(double))
1112 /***** Spread (Duplicate functionality of 'Set' by another name) *****/
1113 #define simdia_vspreadi(a) ( simdia_vseti(a))
1114 #define simdia_vspreadf(a) ( simdia_vsetf(a))
1115 #define simdia_vspreadlf(a) (simdia_vsetlf(a))
/* Finite check: true only when isfinite() holds for every lane (4 float
 * lanes, 2 double lanes). The argument a is expanded once per lane, so it
 * should be free of side effects. isfinite is assumed to come from <math.h>
 * included elsewhere -- TODO confirm. */
1117 #define simdia_visfinitef(a) (isfinite(simdia_vextractf((a),0)) && isfinite(simdia_vextractf((a),1)) && isfinite(simdia_vextractf((a),2)) && isfinite(simdia_vextractf((a),3)))
1118 #define simdia_visfinitelf(a) (isfinite(simdia_vextractlf((a),0)) && isfinite(simdia_vextractlf((a),1)))
/* Scalar-operand forms: in each macro below the scalar argument is first
 * turned into a vector with simdia_vset* and then fed to the corresponding
 * vector operation. */
1120 /***** Add to Scalar *****/
1121 #define simdia_vaddis(a, b) ( simdia_vaddi((a), simdia_vseti(b)))
1122 #define simdia_vaddfs(a, b) ( simdia_vaddf((a), simdia_vsetf(b)))
1123 #define simdia_vaddlfs(a, b) (simdia_vaddlf((a), simdia_vsetlf(b)))
1125 /***** Subtract a Scalar *****/
1126 #define simdia_vsubis(a, b) ( simdia_vsubi((a), simdia_vseti(b)))
1127 #define simdia_vsubfs(a, b) ( simdia_vsubf((a), simdia_vsetf(b)))
1128 #define simdia_vsublfs(a, b) (simdia_vsublf((a), simdia_vsetlf(b)))
1130 /***** Multiply by Scalar *****/
1131 #define simdia_vmulfs(a, b) ( simdia_vmulf((a), simdia_vsetf(b)))
1132 #define simdia_vmullfs(a, b) (simdia_vmullf((a), simdia_vsetlf(b)))
1134 /***** Divide by Scalar *****/
1135 #define simdia_vdivfs(a, b) ( simdia_vdivf((a), simdia_vsetf(b)))
1136 #define simdia_vdivlfs(a, b) (simdia_vdivlf((a), simdia_vsetlf(b)))
1138 /***** Fused Multiply(Vector) Add(Scalar) *****/
/* a and b are vectors, c is the scalar addend. */
1139 #define simdia_vmaddfs(a, b, c) ( simdia_vmaddf((a), (b), simdia_vsetf(c)))
1140 #define simdia_vmaddlfs(a, b, c) (simdia_vmaddlf((a), (b), simdia_vsetlf(c)))
1142 /***** Fused Multiply(Scalar) Add(Scalar) *****/
/* a is a vector; b (multiplier) and c (addend) are scalars. */
1143 #define simdia_vmaddfss(a, b, c) ( simdia_vmaddf((a), simdia_vsetf(b), simdia_vsetf(c)))
1144 #define simdia_vmaddlfss(a, b, c) (simdia_vmaddlf((a), simdia_vsetlf(b), simdia_vsetlf(c)))
1146 #if defined(__VEC__)
1152 #endif //__SIMDIA_H__