Added static_assert from Loki
[ustl.git] / simd.h
blob79895f7f3f6ea210a694c1a1efc3877d6dfde405
1 // This file is part of the ustl library, an STL implementation.
2 //
3 // Copyright (C) 2005 by Mike Sharov <msharov@users.sourceforge.net>
4 // This file is free software, distributed under the MIT License.
5 //
6 /// \file simd.h
7 /// \brief SIMD-type algorithms, with hardware acceleration, if available.
8 ///
9 /// All algorithms are container-based because iterator syntax is just too
10 /// damn verbose and because the specializations need to be able to tell
11 /// how many elements are in the container in order to choose proper SIMD
12 /// instruction set (i.e.: 4 floats select SSE, while 2 floats select 3dNow!)
13 /// Specializations are only for the tuple template because the container
14 /// must be of a fixed and compile-time-known size for the compiler to be
15 /// able to choose the specialization.
16 ///
18 #ifndef SIMD_H_39BE2D970DF4BD00508CCFFB482496F9
19 #define SIMD_H_39BE2D970DF4BD00508CCFFB482496F9
21 #include "ulimits.h"
22 #if HAVE_MATH_H
23 #include <math.h>
24 #endif
26 namespace ustl {
27 namespace simd {
29 //----------------------------------------------------------------------
30 // Generic algorithms
31 //----------------------------------------------------------------------
/// Invokes \p op once on every element of \p op1, walking from begin() to end().
/// Whatever \p op returns is discarded, so \p op has to modify the element
/// through its argument for the call to have any effect on \p op1.
template <typename Ctr, typename UnaryOperation>
inline void packop (Ctr& op1, UnaryOperation op)
{
    typename Ctr::iterator cur = op1.begin();
    while (cur != op1.end()) {
        op (*cur);
        ++cur;
    }
}
/// Folds \p op1 into \p op2 element by element: every element of \p op2 is
/// replaced with op (op2[i], op1[i]). \p op1 must hold at least as many
/// elements as \p op2; this is only checked with assert.
template <typename Ctr, typename BinaryOperation>
inline void packop (const Ctr& op1, Ctr& op2, BinaryOperation op)
{
    assert (op2.size() <= op1.size());
    typename Ctr::const_iterator src (op1.begin());
    typename Ctr::iterator dst (op2.begin());
    while (dst != op2.end()) {
        *dst = op (*dst, *src);
        ++src;
        ++dst;
    }
}
// passign is defined further down in this file. It is declared here so that
// the call inside the three-operand packop resolves during ordinary lookup;
// argument-dependent lookup alone cannot find it for container types that
// live outside namespace ustl::simd.
template <typename Ctr>
inline void passign (const Ctr& op1, Ctr& result);

/// Applies \p op to corresponding elements in \p op1 and \p op2 and stores in \p result.
///
/// Implemented as "copy \p op1 into \p result, then fold \p op2 into \p result
/// with the two-operand packop", so result[i] = op (op1[i], op2[i]) and any
/// specializations of passign and of the two-operand packop are reused.
/// \p op1 must not be larger than \p op2 or \p result (checked with assert).
template <typename Ctr, typename BinaryOperation>
inline void packop (const Ctr& op1, const Ctr& op2, Ctr& result, BinaryOperation op)
{
    assert (op1.size() <= op2.size() && op1.size() <= result.size());
    passign (op1, result);
    // The operation must be forwarded; without it the call cannot select the
    // binary overload and the requested operation is never applied.
    packop (op2, result, op);
}
/// Copies every element of \p op1 into the leading elements of \p result.
/// \p result must be at least as large as \p op1 (checked with assert);
/// elements of \p result past op1.size() are left untouched.
template <typename Ctr>
inline void passign (const Ctr& op1, Ctr& result)
{
    assert (op1.size() <= result.size());
    typename Ctr::const_iterator src = op1.begin();
    typename Ctr::iterator dst (result.begin());
    for (; src != op1.end(); ++src, ++dst)
        *dst = *src;
}
/// Fills \p result with result.size() consecutive elements read from \p op1.
/// The caller must make sure that many elements are readable from \p op1;
/// nothing here checks it.
template <typename Ctr>
inline void ipassign (typename Ctr::const_iterator op1, Ctr& result)
{
    for (typename Ctr::iterator dst = result.begin(); dst != result.end(); ++dst, ++op1)
        *dst = *op1;
}
/// Stores f (op1[i]) into op2[i] for every element of \p op1.
/// \p op2 must be at least as large as \p op1 (checked with assert);
/// elements of \p op2 past op1.size() are left untouched.
template <typename Ctr1, typename Ctr2, typename ConvertFunction>
inline void pconvert (const Ctr1& op1, Ctr2& op2, ConvertFunction f)
{
    assert (op1.size() <= op2.size());
    typename Ctr2::iterator dst (op2.begin());
    typename Ctr1::const_iterator src (op1.begin());
    while (src != op1.end()) {
        *dst = f (*src);
        ++src;
        ++dst;
    }
}
// Functor templates consumed by the packed-operation wrappers below.
// STD_BINARY_FUNCTOR, STD_UNARY_FUNCTOR and STD_CONVERSION_FUNCTOR are defined
// outside this file. Judging by the explicit specializations further down,
// each one produces a class template whose operator() evaluates the given
// expression on const-reference argument(s) named a (and b).
89 // Functionoids for SIMD operations, like saturation arithmetic, shifts, etc.
// fpadds / fpsubs: add / subtract, clamping to numeric_limits<T>::max() / min().
// NOTE(review): the bound tests compute max() - a and min() + b, which can
// themselves overflow for signed T when a or b is negative - confirm intended
// use is unsigned or non-negative operands.
90 STD_BINARY_FUNCTOR (fpadds, T, ((b > numeric_limits<T>::max() - a) ? numeric_limits<T>::max() : a + b))
91 STD_BINARY_FUNCTOR (fpsubs, T, ((a < numeric_limits<T>::min() + b) ? numeric_limits<T>::min() : a - b))
// fpshl / fpshr: shift a by b. fpmin / fpmax: smaller / larger of a and b.
92 STD_BINARY_FUNCTOR (fpshl, T, (a << b))
93 STD_BINARY_FUNCTOR (fpshr, T, (a >> b))
94 STD_BINARY_FUNCTOR (fpmin, T, (min (a, b)))
95 STD_BINARY_FUNCTOR (fpmax, T, (max (a, b)))
// fpavg: (a + b + 1) / 2; float and double get specializations
// at the end of this block that use (a + b) / 2 instead.
96 STD_BINARY_FUNCTOR (fpavg, T, ((a + b + 1) / 2))
// fcast: converts its argument with a D(a) cast.
97 STD_CONVERSION_FUNCTOR (fcast, (D(a)))
98 #if HAVE_MATH_H
// Math functors. All but fpreciprocal call reset_mmx() before the libm call;
// reset_mmx() is defined elsewhere - presumably it leaves MMX state so x87
// floating point is usable again (TODO confirm).
99 STD_UNARY_FUNCTOR (fpreciprocal,T, (1 / a))
100 STD_UNARY_FUNCTOR (fpsqrt, T, (reset_mmx(), T (sqrt (a))))
101 STD_UNARY_FUNCTOR (fprecipsqrt, T, (reset_mmx(), 1 / T(sqrt (a))))
102 STD_UNARY_FUNCTOR (fsin, T, (reset_mmx(), T (sin (a))))
103 STD_UNARY_FUNCTOR (fcos, T, (reset_mmx(), T (cos (a))))
104 STD_UNARY_FUNCTOR (ftan, T, (reset_mmx(), T (tan (a))))
// fround: rounds with rintf() when HAVE_RINTF is set, with rint() otherwise,
// then converts the result to D.
105 #if HAVE_RINTF
106 STD_CONVERSION_FUNCTOR (fround, (reset_mmx(), D(rintf(a))))
107 #else
108 STD_CONVERSION_FUNCTOR (fround, (reset_mmx(), D(rint(a))))
109 #endif
// double -> int32_t always goes through rint(), regardless of HAVE_RINTF.
110 template <> inline int32_t fround<double,int32_t>::operator()(const double& a) const { reset_mmx(); return (int32_t(rint(a))); }
111 #endif
// Floating-point averages: plain (a + b) / 2, without the +1 used by the generic fpavg.
112 template <> inline float fpavg<float>::operator()(const float& a, const float& b) const { return ((a + b) / 2); }
113 template <> inline double fpavg<double>::operator()(const double& a, const double& b) const { return ((a + b) / 2); }
// Generator macros for the public wrappers. In each of them `operation` is a
// functor template that gets instantiated on the container's value_type
// (or on T for the scalar form) and default-constructed.

/// Defines name (op1): runs the unary packop on \p op1 with operation<value_type>.
#define SIMD_PACKEDOP1(name, operation)				\
template <typename Ctr>						\
inline void name (Ctr& op1)					\
{								\
    packop (op1, operation<typename Ctr::value_type>());	\
}

/// Defines name (op1, op2): runs the two-operand packop with operation<value_type>.
#define SIMD_PACKEDOP2(name, operation)				\
template <typename Ctr>						\
inline void name (const Ctr& op1, Ctr& op2)			\
{								\
    packop (op1, op2, operation<typename Ctr::value_type>());	\
}

/// Defines name (op1, op2, result): runs the three-operand packop with operation<value_type>.
#define SIMD_PACKEDOP3(name, operation)					\
template <typename Ctr>							\
inline void name (const Ctr& op1, const Ctr& op2, Ctr& result)		\
{									\
    packop (op1, op2, result, operation<typename Ctr::value_type>());	\
}

/// Defines the scalar helper T name (T op), which returns operation<T>() (op).
#define SIMD_SINGLEOP1(name, operation)		\
template <typename T>				\
inline T name (T op)				\
{						\
    return (operation<T>() (op));		\
}

/// Defines name (op1, op2): runs pconvert with operation<Ctr1::value_type, Ctr2::value_type>.
#define SIMD_CONVERTOP(name, operation)					\
template <typename Ctr1, typename Ctr2>					\
inline void name (const Ctr1& op1, Ctr2& op2)				\
{									\
    pconvert (op1, op2, operation<typename Ctr1::value_type,		\
                                  typename Ctr2::value_type>());	\
}
152 SIMD_PACKEDOP2 (padd, plus)
153 SIMD_PACKEDOP2 (psub, minus)
154 SIMD_PACKEDOP2 (pmul, multiplies)
155 SIMD_PACKEDOP2 (pdiv, divides)
156 SIMD_PACKEDOP2 (pand, bitwise_and)
157 SIMD_PACKEDOP2 (por, bitwise_or)
158 SIMD_PACKEDOP2 (pxor, bitwise_xor)
159 SIMD_PACKEDOP2 (pshl, fpshl)
160 SIMD_PACKEDOP2 (pshr, fpshr)
161 SIMD_PACKEDOP2 (psubs, fpsubs)
162 SIMD_PACKEDOP2 (pmin, fpmin)
163 SIMD_PACKEDOP2 (pmax, fpmax)
164 SIMD_PACKEDOP2 (pavg, fpavg)
// Three-operand wrappers. Each name (op1, op2, result) forwards to
// packop (op1, op2, result, functor<value_type>()), leaving op1 and op2 unmodified.
166 SIMD_PACKEDOP3 (padd, plus)
167 SIMD_PACKEDOP3 (psub, minus)
168 SIMD_PACKEDOP3 (pmul, multiplies)
169 SIMD_PACKEDOP3 (pdiv, divides)
170 SIMD_PACKEDOP3 (pand, bitwise_and)
171 SIMD_PACKEDOP3 (por, bitwise_or)
172 SIMD_PACKEDOP3 (pxor, bitwise_xor)
173 SIMD_PACKEDOP3 (pshl, fpshl)
174 SIMD_PACKEDOP3 (pshr, fpshr)
// Saturating add / subtract (see fpadds / fpsubs).
175 SIMD_PACKEDOP3 (padds, fpadds)
176 SIMD_PACKEDOP3 (psubs, fpsubs)
177 SIMD_PACKEDOP3 (pmin, fpmin)
178 SIMD_PACKEDOP3 (pmax, fpmax)
179 SIMD_PACKEDOP3 (pavg, fpavg)
181 #if HAVE_MATH_H
// In-place container wrappers for the math functors: name (op1) calls
// packop (op1, functor<value_type>()).
182 SIMD_PACKEDOP1 (precip, fpreciprocal)
183 SIMD_PACKEDOP1 (psqrt, fpsqrt)
184 SIMD_PACKEDOP1 (precipsqrt, fprecipsqrt)
185 SIMD_PACKEDOP1 (psin, fsin)
186 SIMD_PACKEDOP1 (pcos, fcos)
187 SIMD_PACKEDOP1 (ptan, ftan)
// Scalar counterparts: T name (T op) returns functor<T>() applied to op.
189 SIMD_SINGLEOP1 (srecip, fpreciprocal)
190 SIMD_SINGLEOP1 (ssqrt, fpsqrt)
191 SIMD_SINGLEOP1 (srecipsqrt, fprecipsqrt)
192 SIMD_SINGLEOP1 (ssin, fsin)
193 SIMD_SINGLEOP1 (scos, fcos)
194 SIMD_SINGLEOP1 (stan, ftan)
// pround (op1, op2): element-wise rounding conversion from op1 into op2 via fround.
196 SIMD_CONVERTOP (pround, fround)
198 template <typename T> inline int32_t sround (T op) { fround<T,int32_t> obj; return (obj (op)); }
199 #endif
// The generator macros are implementation details of this header; remove all
// of them so none leaks into files that include it. SIMD_CONVERTOP was
// previously left defined.
#undef SIMD_CONVERTOP
#undef SIMD_SINGLEOP1
#undef SIMD_PACKEDOP3
#undef SIMD_PACKEDOP2
#undef SIMD_PACKEDOP1
206 //----------------------------------------------------------------------
207 // Vector types to cast tuple data to
208 //----------------------------------------------------------------------
// VECTOR_ATTRIBUTE expands to GCC's vector_size attribute (size in bytes)
// when HAVE_VECTOR_EXTENSIONS is set and __GNUC__ >= 4, and to nothing
// otherwise, in which case the typedefs below are plain scalar aliases.
// The `mode` argument is not used by either expansion.
210 #if HAVE_VECTOR_EXTENSIONS && __GNUC__ >= 4
211 #define VECTOR_ATTRIBUTE(mode,vs) __attribute__((vector_size(vs)))
212 #else
213 #define VECTOR_ATTRIBUTE(mode,vs)
214 #endif
// 8-byte and 16-byte vectors of 8/16/32-bit unsigned integers.
215 typedef uint8_t v8qi_t VECTOR_ATTRIBUTE (V8QI,8);
216 typedef uint16_t v4hi_t VECTOR_ATTRIBUTE (V4HI,8);
217 typedef uint16_t v8hi_t VECTOR_ATTRIBUTE (V8HI,16);
218 typedef uint32_t v2si_t VECTOR_ATTRIBUTE (V2SI,8);
219 typedef uint32_t v4si_t VECTOR_ATTRIBUTE (V4SI,16);
// The 64-bit element type is only declared when HAVE_INT64_T is set.
220 #if HAVE_INT64_T
221 typedef uint64_t v1di_t VECTOR_ATTRIBUTE (V1DI,8);
222 #endif
// Floating-point vectors: 2 or 4 floats, 2 doubles.
223 typedef float v2sf_t VECTOR_ATTRIBUTE (V2SF,8);
224 typedef float v4sf_t VECTOR_ATTRIBUTE (V4SF,16);
225 typedef double v2df_t VECTOR_ATTRIBUTE (V2DF,16);
226 #undef VECTOR_ATTRIBUTE
// Inline-asm operand shorthands for element n of the oin / oout arguments:
// R* expand to a read ("m") memory operand, W* to a write ("=m") memory operand.
// Nothing in the visible part of this file uses them before they are #undef'd.
228 #define SIMDA_RI(n) "m"(oin[n])
229 #define SIMDA_RO(n) "m"(oout[n])
230 #define SIMDA_WI(n) "=m"(oin[n])
231 #define SIMDA_WO(n) "=m"(oout[n])
233 //----------------------------------------------------------------------
234 // Hardware accelerated specializations
235 //----------------------------------------------------------------------
// Signature-only macros: each expands to the head of an explicit
// specialization for tuple<n,type>, naming the arguments oin and oout.
// The user of the macro supplies the function body.

// Two-operand packop specialization for operation optype<type>.
237 #define SIMD_PKOP2_SPEC(n, type, optype) \
238 template <> \
239 inline void packop (const tuple<n,type>& oin, tuple<n,type>& oout, optype<type>)
// passign specialization (tuple to tuple copy).
240 #define SIMD_PASSIGN_SPEC(n, type) \
241 template <> \
242 inline void passign (const tuple<n,type>& oin, tuple<n,type>& oout)
// ipassign specialization (iterator to tuple copy); oin is a tuple const_iterator.
243 #define SIMD_IPASSIGN_SPEC(n, type) \
244 template <> \
245 inline void ipassign (tuple<n,type>::const_iterator oin, tuple<n,type>& oout)
// pconvert specialization from tuple<n,type1> to tuple<n,type2> for optype<type1,type2>.
246 #define SIMD_CONVERT_SPEC(n, type1, type2, optype) \
247 template <> \
248 inline void pconvert (const tuple<n,type1>& oin, tuple<n,type2>& oout, optype<type1,type2>)
250 #if CPU_HAS_MMX
// Operand lists for the MMX asm bodies below.
// STD_MMX_ARGS: %0 = oout[0] (written), %1 = oin[0] (read); declares mm0, st and memory as clobbered.
// DBL_MMX_ARGS: %0 = oout[0], %1 = oout[2] (written), %2 = oin[0], %3 = oin[2] (read);
// declares mm0, mm1, st, st(1) and memory as clobbered. Used for 4-element tuples
// processed as two halves.
251 #define STD_MMX_ARGS "=m"(oout[0]) : "m"(oin[0]) : "mm0", "st", "memory"
252 #define DBL_MMX_ARGS "=m"(oout[0]), "=m"(oout[2]) : "m"(oin[0]), "m"(oin[2]) : "mm0", "mm1", "st", "st(1)", "memory"
// packop specialization: loads oout into mm0, applies `instruction` with oin
// as the source operand, stores mm0 back to oout, then calls reset_mmx().
// Note that %0 is read although it is declared write-only ("=m").
253 #define MMX_PKOP2_SPEC(n,type,optype,instruction) \
254 SIMD_PKOP2_SPEC(n,type,optype) \
255 { asm ("movq %0, %%mm0\n\t" #instruction " %1, %%mm0\n\tmovq %%mm0, %0" : STD_MMX_ARGS); reset_mmx(); }
// Same operation carried out on two 8-byte halves using mm0 and mm1.
256 #define MMX_DBL_PKOP2_SPEC(n,type,optype,instruction) \
257 SIMD_PKOP2_SPEC(n,type,optype) \
258 { asm ("movq %0, %%mm0\n\tmovq %1, %%mm1\n\t" #instruction " %2, %%mm0\n\t" #instruction " %3, %%mm1\n\tmovq %%mm0, %0\n\tmovq %%mm1, %1" : DBL_MMX_ARGS); reset_mmx(); }
// passign specialization: one 8-byte copy from oin to oout through mm0.
259 #define MMX_PASSIGN_SPEC(n,type) \
260 SIMD_PASSIGN_SPEC(n,type) \
261 { asm ("movq %1, %%mm0\n\tmovq %%mm0, %0" : STD_MMX_ARGS); reset_mmx(); }
// passign specialization: two 8-byte copies through mm0 and mm1.
262 #define MMX_DBL_PASSIGN_SPEC(n,type) \
263 SIMD_PASSIGN_SPEC(n,type) \
264 { asm ("movq %2, %%mm0\n\tmovq %3, %%mm1\n\tmovq %%mm0, %0\n\tmovq %%mm1, %1" : DBL_MMX_ARGS); reset_mmx(); }
// ipassign specializations: same copies, with oin being an iterator instead of a tuple.
265 #define MMX_IPASSIGN_SPEC(n,type) \
266 SIMD_IPASSIGN_SPEC(n,type) \
267 { asm ("movq %1, %%mm0\n\tmovq %%mm0, %0" : STD_MMX_ARGS); reset_mmx(); }
268 #define MMX_DBL_IPASSIGN_SPEC(n,type) \
269 SIMD_IPASSIGN_SPEC(n,type) \
270 { asm ("movq %2, %%mm0\n\tmovq %3, %%mm1\n\tmovq %%mm0, %0\n\tmovq %%mm1, %1" : DBL_MMX_ARGS); reset_mmx(); }
// MMX specializations. Each line instantiates one of the helper macros for a
// tuple that occupies exactly 8 bytes (or 16 bytes for the MMX_DBL_ variants);
// the last macro argument is the MMX instruction that implements the functor.

// 8 x uint8_t; saturating ops use the unsigned-saturation instructions.
272 MMX_PASSIGN_SPEC(8,uint8_t)
273 MMX_PKOP2_SPEC(8,uint8_t,plus,paddb)
274 MMX_PKOP2_SPEC(8,uint8_t,minus,psubb)
275 MMX_PKOP2_SPEC(8,uint8_t,bitwise_and,pand)
276 MMX_PKOP2_SPEC(8,uint8_t,bitwise_or,por)
277 MMX_PKOP2_SPEC(8,uint8_t,bitwise_xor,pxor)
278 MMX_PKOP2_SPEC(8,uint8_t,fpadds,paddusb)
279 MMX_PKOP2_SPEC(8,uint8_t,fpsubs,psubusb)
// 8 x int8_t; saturating ops use the signed-saturation instructions.
281 MMX_PASSIGN_SPEC(8,int8_t)
282 MMX_PKOP2_SPEC(8,int8_t,plus,paddb)
283 MMX_PKOP2_SPEC(8,int8_t,minus,psubb)
284 MMX_PKOP2_SPEC(8,int8_t,bitwise_and,pand)
285 MMX_PKOP2_SPEC(8,int8_t,bitwise_or,por)
286 MMX_PKOP2_SPEC(8,int8_t,bitwise_xor,pxor)
287 MMX_PKOP2_SPEC(8,int8_t,fpadds,paddsb)
288 MMX_PKOP2_SPEC(8,int8_t,fpsubs,psubsb)
// 4 x uint16_t. The shift specializations are disabled (see the \todo below).
290 MMX_PASSIGN_SPEC(4,uint16_t)
291 MMX_PKOP2_SPEC(4,uint16_t,plus,paddw)
292 MMX_PKOP2_SPEC(4,uint16_t,minus,psubw)
293 MMX_PKOP2_SPEC(4,uint16_t,bitwise_and,pand)
294 MMX_PKOP2_SPEC(4,uint16_t,bitwise_or,por)
295 MMX_PKOP2_SPEC(4,uint16_t,bitwise_xor,pxor)
296 /// \todo psllw does not work like other operations, it uses the first element for shift count.
297 //MMX_PKOP2_SPEC(4,uint16_t,fpshl,psllw)
298 //MMX_PKOP2_SPEC(4,uint16_t,fpshr,psrlw)
299 MMX_PKOP2_SPEC(4,uint16_t,fpadds,paddusw)
300 MMX_PKOP2_SPEC(4,uint16_t,fpsubs,psubusw)
// 4 x int16_t.
302 MMX_PASSIGN_SPEC(4,int16_t)
303 MMX_PKOP2_SPEC(4,int16_t,plus,paddw)
304 MMX_PKOP2_SPEC(4,int16_t,minus,psubw)
305 MMX_PKOP2_SPEC(4,int16_t,bitwise_and,pand)
306 MMX_PKOP2_SPEC(4,int16_t,bitwise_or,por)
307 MMX_PKOP2_SPEC(4,int16_t,bitwise_xor,pxor)
308 //MMX_PKOP2_SPEC(4,int16_t,fpshl,psllw)
309 //MMX_PKOP2_SPEC(4,int16_t,fpshr,psrlw)
310 MMX_PKOP2_SPEC(4,int16_t,fpadds,paddsw)
311 MMX_PKOP2_SPEC(4,int16_t,fpsubs,psubsw)
// 2 x uint32_t; no saturating specializations are provided for 32-bit elements.
313 MMX_PASSIGN_SPEC(2,uint32_t)
314 MMX_PKOP2_SPEC(2,uint32_t,plus,paddd)
315 MMX_PKOP2_SPEC(2,uint32_t,minus,psubd)
316 MMX_PKOP2_SPEC(2,uint32_t,bitwise_and,pand)
317 MMX_PKOP2_SPEC(2,uint32_t,bitwise_or,por)
318 MMX_PKOP2_SPEC(2,uint32_t,bitwise_xor,pxor)
319 //MMX_PKOP2_SPEC(2,uint32_t,fpshl,pslld)
320 //MMX_PKOP2_SPEC(2,uint32_t,fpshr,psrld)
// 2 x int32_t.
322 MMX_PASSIGN_SPEC(2,int32_t)
323 MMX_PKOP2_SPEC(2,int32_t,plus,paddd)
324 MMX_PKOP2_SPEC(2,int32_t,minus,psubd)
325 MMX_PKOP2_SPEC(2,int32_t,bitwise_and,pand)
326 MMX_PKOP2_SPEC(2,int32_t,bitwise_or,por)
327 MMX_PKOP2_SPEC(2,int32_t,bitwise_xor,pxor)
328 //MMX_PKOP2_SPEC(2,int32_t,fpshl,pslld)
329 //MMX_PKOP2_SPEC(2,int32_t,fpshr,psrld)
// 4 x uint32_t, handled as two 8-byte halves.
331 MMX_DBL_PKOP2_SPEC(4,uint32_t,plus,paddd)
332 MMX_DBL_PKOP2_SPEC(4,uint32_t,minus,psubd)
333 MMX_DBL_PKOP2_SPEC(4,uint32_t,bitwise_and,pand)
334 MMX_DBL_PKOP2_SPEC(4,uint32_t,bitwise_or,por)
335 MMX_DBL_PKOP2_SPEC(4,uint32_t,bitwise_xor,pxor)
336 //MMX_DBL_PKOP2_SPEC(2,uint32_t,fpshl,pslld)
337 //MMX_DBL_PKOP2_SPEC(2,uint32_t,fpshr,psrld)
// 4 x int32_t, handled as two 8-byte halves.
339 MMX_DBL_PKOP2_SPEC(4,int32_t,plus,paddd)
340 MMX_DBL_PKOP2_SPEC(4,int32_t,minus,psubd)
341 MMX_DBL_PKOP2_SPEC(4,int32_t,bitwise_and,pand)
342 MMX_DBL_PKOP2_SPEC(4,int32_t,bitwise_or,por)
343 MMX_DBL_PKOP2_SPEC(4,int32_t,bitwise_xor,pxor)
344 //MMX_DBL_PKOP2_SPEC(2,int32_t,fpshl,pslld)
345 //MMX_DBL_PKOP2_SPEC(2,int32_t,fpshr,psrld)
#if CPU_HAS_SSE || CPU_HAS_3DNOW
// Integer average / min / max instructions that are only used when SSE or
// 3DNow! support is enabled.
//
// pavgb and pavgw average *unsigned* lanes, so they are only correct for the
// unsigned element types. The former int8_t / int16_t specializations gave
// wrong results whenever the operands had different signs (e.g. the bytes
// -2 and 2 averaged to -128); signed tuples now use the generic fpavg.
MMX_PKOP2_SPEC(8,uint8_t,fpavg,pavgb)
MMX_PKOP2_SPEC(4,uint16_t,fpavg,pavgw)
// Unsigned byte and signed word min / max.
MMX_PKOP2_SPEC(8,uint8_t,fpmin,pminub)
MMX_PKOP2_SPEC(8,uint8_t,fpmax,pmaxub)
MMX_PKOP2_SPEC(4,int16_t,fpmax,pmaxsw)
MMX_PKOP2_SPEC(4,int16_t,fpmin,pminsw)
#endif // CPU_HAS_SSE || CPU_HAS_3DNOW
#if CPU_HAS_3DNOW
// 3DNow! packed-float specializations for 2 x float.
MMX_PASSIGN_SPEC(2,float)
MMX_PKOP2_SPEC(2,float,plus,pfadd)
MMX_PKOP2_SPEC(2,float,minus,pfsub)
MMX_PKOP2_SPEC(2,float,multiplies,pfmul)
MMX_PKOP2_SPEC(2,float,fpmin,pfmin)
MMX_PKOP2_SPEC(2,float,fpmax,pfmax)
// 4 x float as two 3DNow! halves, but only when the SSE section is not going
// to define the same specializations. Tested by value, like every other
// CPU_HAS_* check in this file, so a CPU_HAS_SSE defined as 0 enables
// this fallback instead of leaving 4 x float unaccelerated.
#if !CPU_HAS_SSE
MMX_DBL_PKOP2_SPEC(4,float,plus,pfadd)
MMX_DBL_PKOP2_SPEC(4,float,minus,pfsub)
MMX_DBL_PKOP2_SPEC(4,float,multiplies,pfmul)
MMX_DBL_PKOP2_SPEC(4,float,fpmin,pfmin)
MMX_DBL_PKOP2_SPEC(4,float,fpmax,pfmax)
#endif
#endif // CPU_HAS_3DNOW
374 MMX_IPASSIGN_SPEC(8,uint8_t)
375 MMX_IPASSIGN_SPEC(4,uint16_t)
376 MMX_IPASSIGN_SPEC(2,uint32_t)
377 MMX_IPASSIGN_SPEC(2,float)
379 #ifndef CPU_HAS_SSE
380 MMX_DBL_PASSIGN_SPEC(4,float)
381 MMX_DBL_PASSIGN_SPEC(4,uint32_t)
382 MMX_DBL_PASSIGN_SPEC(4,int32_t)
383 MMX_DBL_IPASSIGN_SPEC(4,float)
384 MMX_DBL_IPASSIGN_SPEC(4,uint32_t)
385 MMX_DBL_IPASSIGN_SPEC(4,int32_t)
386 #endif
// Remove the MMX helper macros, including the MMX_DBL_ variants that were
// previously left defined. DBL_MMX_ARGS is deliberately kept: the SSE
// float -> int32 pconvert specialization further down still expands it.
#undef MMX_DBL_IPASSIGN_SPEC
#undef MMX_DBL_PASSIGN_SPEC
#undef MMX_DBL_PKOP2_SPEC
#undef MMX_IPASSIGN_SPEC
#undef MMX_PASSIGN_SPEC
#undef MMX_PKOP2_SPEC
#undef STD_MMX_ARGS
392 #endif // CPU_HAS_MMX
394 #if CPU_HAS_SSE
// Operand list shared by the SSE asm bodies: %0 = oout[0] (written),
// %1 = oin[0] (read); declares xmm0 and memory as clobbered.
#define STD_SSE_ARGS	"=m"(oout[0]) : "m"(oin[0]) : "xmm0", "memory"
// packop specialization: unaligned loads of oout into xmm0 and oin into xmm1,
// `instruction` applied as xmm0 = xmm0 <op> xmm1, unaligned store to oout.
// xmm1 is overwritten here, so it is appended to the clobber list; without
// that the compiler may keep a live value in xmm1 across the asm statement.
#define SSE_PKOP2_SPEC(n,type,optype,instruction)	\
SIMD_PKOP2_SPEC(n,type,optype)		\
{ asm ("movups %0, %%xmm0\n\tmovups %1, %%xmm1\n\t" #instruction " %%xmm1, %%xmm0\n\tmovups %%xmm0, %0" : STD_SSE_ARGS, "xmm1");}
// passign specialization: one unaligned 16-byte copy through xmm0.
#define SSE_PASSIGN_SPEC(n,type)		\
SIMD_PASSIGN_SPEC(n,type)		\
{ asm ("movups %1, %%xmm0\n\tmovups %%xmm0, %0" : STD_SSE_ARGS);}
// ipassign specialization: the same copy with an iterator as the source.
#define SSE_IPASSIGN_SPEC(n,type)		\
SIMD_IPASSIGN_SPEC(n,type)		\
{ asm ("movups %1, %%xmm0\n\tmovups %%xmm0, %0" : STD_SSE_ARGS);}
// SSE specializations for 16-byte tuples. Copies are provided for float and
// 32-bit integers; arithmetic is provided for float only.
405 SSE_PASSIGN_SPEC(4,float)
406 SSE_PASSIGN_SPEC(4,int32_t)
407 SSE_PASSIGN_SPEC(4,uint32_t)
// 4 x float arithmetic, bitwise and min / max operations.
408 SSE_PKOP2_SPEC(4,float,plus,addps)
409 SSE_PKOP2_SPEC(4,float,minus,subps)
410 SSE_PKOP2_SPEC(4,float,multiplies,mulps)
411 SSE_PKOP2_SPEC(4,float,divides,divps)
412 SSE_PKOP2_SPEC(4,float,bitwise_and,andps)
413 SSE_PKOP2_SPEC(4,float,bitwise_or,orps)
414 SSE_PKOP2_SPEC(4,float,bitwise_xor,xorps)
415 SSE_PKOP2_SPEC(4,float,fpmax,maxps)
416 SSE_PKOP2_SPEC(4,float,fpmin,minps)
// pconvert specialization for fround from 4 x float to 4 x int32_t.
// Operands come from DBL_MMX_ARGS: %2 / %3 are oin[0] / oin[2] and %0 / %1
// are oout[0] / oout[2]. Each cvtps2pi converts one pair of floats into an
// MMX register, which is then stored to the matching half of oout.
// reset_mmx() is called afterwards because MMX registers were used.
418 SIMD_CONVERT_SPEC(4,float,int32_t,fround) {
419 asm ("cvtps2pi %2, %%mm0\n\t"
420 "cvtps2pi %3, %%mm1\n\t"
421 "movq %%mm0, %0\n\t"
422 "movq %%mm1, %1"
423 : DBL_MMX_ARGS);
424 reset_mmx();
// pconvert specialization for fround from 4 x int32_t to 4 x float.
// Operands: %0 = oout[0] (written), %1 = oin[0], %2 = oin[2] (read).
// The upper pair (oin[2..3]) is converted into the low half of xmm0 first,
// shufps $0x4E swaps the two halves of xmm0, then the lower pair (oin[0..1])
// is converted into the low half and all four floats are stored to oout.
426 SIMD_CONVERT_SPEC(4,int32_t,float,fround) {
427 asm ("cvtpi2ps %2, %%xmm0\n\t"
428 "shufps $0x4E,%%xmm0,%%xmm0\n\t"
429 "cvtpi2ps %1, %%xmm0\n\t"
430 "movups %%xmm0, %0"
431 : "=m"(oout[0]) : "m"(oin[0]), "m"(oin[2]) : "xmm0", "memory");
433 template <> inline int32_t fround<float,int32_t>::operator()(const float& a) const {
434 register int32_t rv;
435 asm ("movss %1, %%xmm0\n\t"
436 "cvtss2si %%xmm0, %0"
437 : "=r"(rv) : "m"(a) : "xmm0" );
438 return (rv);
440 template <> inline uint32_t fround<float,uint32_t>::operator()(const float& a) const {
441 register uint32_t rv;
442 asm ("movss %1, %%xmm0\n\t"
443 "cvtss2si %%xmm0, %0"
444 : "=r"(rv) : "m"(a) : "xmm0" );
445 return (rv);
// Iterator-source 16-byte copies for the 4-element tuple types.
448 SSE_IPASSIGN_SPEC(4,float)
449 SSE_IPASSIGN_SPEC(4,int32_t)
450 SSE_IPASSIGN_SPEC(4,uint32_t)
// The SSE helper macros are not needed past this point.
452 #undef SSE_IPASSIGN_SPEC
453 #undef SSE_PASSIGN_SPEC
454 #undef SSE_PKOP2_SPEC
455 #undef STD_SSE_ARGS
456 #endif // CPU_HAS_SSE
// Final cleanup of the helper macros defined by this header.
#undef SIMDA_RI
#undef SIMDA_RO
#undef SIMDA_WI
#undef SIMDA_WO
// The specialization-signature macros actually defined above are
// SIMD_PKOP2_SPEC, SIMD_PASSIGN_SPEC, SIMD_IPASSIGN_SPEC and
// SIMD_CONVERT_SPEC. They replace the former #undef of SIMD_PACKEDOP_SPEC,
// a name that is never defined here.
#undef SIMD_PKOP2_SPEC
#undef SIMD_PASSIGN_SPEC
#undef SIMD_IPASSIGN_SPEC
#undef SIMD_CONVERT_SPEC
// DBL_MMX_ARGS has to outlive the MMX section because the SSE conversion
// specialization expands it, so it is removed here.
#undef DBL_MMX_ARGS
464 } // namespace simd
465 } // namespace ustl
467 #endif