memcpy: hide some memory latencies
[nova-simd.git] / simd_utils.hpp
blobc576ffe7fec91fcc8e4f56456cbbef45c620d69e
1 // utilities for the simd implementation
2 // Copyright (C) 2008, 2009 Tim Blechmann
3 //
4 // This program is free software; you can redistribute it and/or modify
5 // it under the terms of the GNU General Public License as published by
6 // the Free Software Foundation; either version 2 of the License, or
7 // (at your option) any later version.
8 //
9 // This program is distributed in the hope that it will be useful,
10 // but WITHOUT ANY WARRANTY; without even the implied warranty of
11 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 // GNU General Public License for more details.
14 // You should have received a copy of the GNU General Public License
15 // along with this program; see the file COPYING. If not, write to
16 // the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
17 // Boston, MA 02111-1307, USA.
19 #ifndef SIMD_UTILS_HPP
20 #define SIMD_UTILS_HPP
#include <cstring>

#include <xmmintrin.h>

#ifdef __SSE2__
#include <emmintrin.h>
#endif /* __SSE2__ */

#ifdef __SSE4_1__
#include <smmintrin.h>
#endif /* __SSE4_1__ */
33 namespace nova
35 namespace detail
38 #ifdef __SSE2__
/* 0x80000000 in every lane (the float sign bit), built entirely in
 * registers: pcmpeq yields all-ones, then >>31 <<31 leaves only bit 31.
 *
 * _mm_castsi128_ps is a zero-cost reinterpretation; unlike the former
 * C-style (__m128) cast it is portable (MSVC rejects direct casts
 * between vector types). */
inline __m128 gen_sign_mask(void)
{
    __m128i zero = _mm_setzero_si128();
    __m128i ones = _mm_cmpeq_epi32(zero, zero); /* all bits set */
    return _mm_castsi128_ps(_mm_slli_epi32(_mm_srli_epi32(ones, 31), 31));
}
/* 0x7fffffff in every lane (all bits but the sign bit), built in
 * registers: all-ones <<1 >>1 clears only bit 31.
 *
 * _mm_castsi128_ps replaces the non-portable C-style (__m128) cast
 * (MSVC rejects direct casts between vector types). */
inline __m128 gen_abs_mask(void)
{
    __m128i zero = _mm_setzero_si128();
    __m128i ones = _mm_cmpeq_epi32(zero, zero); /* all bits set */
    return _mm_castsi128_ps(_mm_srli_epi32(_mm_slli_epi32(ones, 1), 1));
}
/* 1.0f in every lane, built in registers: all-ones >>25 gives 0x7f,
 * <<23 places it in the exponent field -> 0x3f800000 == 1.0f.
 *
 * _mm_castsi128_ps replaces the non-portable C-style (__m128) cast
 * (MSVC rejects direct casts between vector types). */
inline __m128 gen_one(void)
{
    __m128i zero = _mm_setzero_si128();
    __m128i ones = _mm_cmpeq_epi32(zero, zero); /* all bits set */
    return _mm_castsi128_ps(_mm_slli_epi32(_mm_srli_epi32(ones, 25), 23));
}
/* 0.5f in every lane, built in registers: all-ones >>26 gives 0x3f,
 * <<24 -> 0x3f000000 == 0.5f.
 *
 * _mm_castsi128_ps replaces the non-portable C-style (__m128) cast
 * (MSVC rejects direct casts between vector types). */
inline __m128 gen_05(void)
{
    __m128i zero = _mm_setzero_si128();
    __m128i ones = _mm_cmpeq_epi32(zero, zero); /* all bits set */
    return _mm_castsi128_ps(_mm_slli_epi32(_mm_srli_epi32(ones, 26), 24));
}
67 #else
69 /* SSE fallback */
/* SSE1 fallback: 0x80000000 (the float sign bit) in every lane.
 * std::memcpy copies the bit pattern into a float; the former
 * (float*)(&int) dereference violated strict aliasing and is UB
 * under optimizing compilers. */
inline __m128 gen_sign_mask(void)
{
    const unsigned int sign_mask = 0x80000000u;
    float f;
    std::memcpy(&f, &sign_mask, sizeof f);
    return _mm_set_ps1(f);
}
/* SSE1 fallback: 0x7fffffff (all bits but the sign) in every lane.
 * std::memcpy copies the bit pattern into a float; the former
 * (float*)(&int) dereference violated strict aliasing and is UB
 * under optimizing compilers. */
inline __m128 gen_abs_mask(void)
{
    const unsigned int abs_mask = 0x7fffffffu;
    float f;
    std::memcpy(&f, &abs_mask, sizeof f);
    return _mm_set_ps1(f);
}
/* SSE1 fallback: 1.0f broadcast into all four lanes. */
inline __m128 gen_one(void)
{
    return _mm_set1_ps(1.f);
}
/* SSE1 fallback: 0.5f broadcast into all four lanes. */
inline __m128 gen_05(void)
{
    return _mm_set1_ps(0.5f);
}
95 #endif
/* 0.25f broadcast into all four lanes (no integer-path variant exists
 * for this constant). */
inline __m128 gen_025(void)
{
    return _mm_set1_ps(0.25f);
}
/* Return lane 0 of the vector as a scalar float. */
inline float extract_0(__m128 arg)
{
    float scalar;
    _mm_store_ss(&scalar, arg);
    return scalar;
}
/* Return lane 3 of the vector as a scalar float. */
inline float extract_3(__m128 arg)
{
    /* broadcast lane 3 into every position, then read lane 0 */
    const __m128 bcast = _mm_shuffle_ps(arg, arg, _MM_SHUFFLE(3, 3, 3, 3));
    float scalar;
    _mm_store_ss(&scalar, bcast);
    return scalar;
}
/* Horizontal minimum across all four lanes.
 *
 * BUGFIX: the previous reduction broadcast lane 2 via
 * _MM_SHUFFLE(2,2,2,2), so lane 3 never entered the comparison and the
 * result was min(e0, e1, e2) only.  Folding the high pair down with
 * movhlps includes every lane. */
inline float horizontal_min(__m128 args)
{
    __m128 accum = args;
    /* fold {e2, e3} onto {e0, e1} */
    __m128 folded = _mm_movehl_ps(accum, accum);
    accum = _mm_min_ps(accum, folded);  /* lane0 = min(e0,e2), lane1 = min(e1,e3) */
    /* fold lane 1 onto lane 0 */
    folded = _mm_shuffle_ps(accum, accum, _MM_SHUFFLE(1, 1, 1, 1));
    accum = _mm_min_ss(accum, folded);
    float result;
    _mm_store_ss(&result, accum);
    return result;
}
/* Horizontal maximum across all four lanes.
 *
 * BUGFIX: the previous reduction broadcast lane 2 via
 * _MM_SHUFFLE(2,2,2,2), so lane 3 never entered the comparison and the
 * result was max(e0, e1, e2) only.  Folding the high pair down with
 * movhlps includes every lane. */
inline float horizontal_max(__m128 args)
{
    __m128 accum = args;
    /* fold {e2, e3} onto {e0, e1} */
    __m128 folded = _mm_movehl_ps(accum, accum);
    accum = _mm_max_ps(accum, folded);  /* lane0 = max(e0,e2), lane1 = max(e1,e3) */
    /* fold lane 1 onto lane 0 */
    folded = _mm_shuffle_ps(accum, accum, _MM_SHUFFLE(1, 1, 1, 1));
    accum = _mm_max_ss(accum, folded);
    float result;
    _mm_store_ss(&result, accum);
    return result;
}
139 #ifdef __SSE4_1__
/* Per-element select via SSE4.1 blendv.
 *
 * NOTE(review): _mm_blendv_ps keys off only the SIGN bit (bit 31) of
 * each element of sel, while the pre-SSE4.1 fallback below uses every
 * bit of the mask.  The two agree only for the usual all-ones /
 * all-zeros masks produced by the compare intrinsics — do not pass
 * partial bitmasks. */
inline __m128 select_vector(__m128 val0, __m128 val1, __m128 sel)
{
    /* if bitmask is set, return value in val1, else value in val0 */
    return _mm_blendv_ps(val0, val1, sel);
}
147 #else
/* Per-element bitwise select (pre-SSE4.1 path): bits set in sel take
 * the corresponding bits of val1, clear bits take val0. */
inline __m128 select_vector(__m128 val0, __m128 val1, __m128 sel)
{
    /* if bitmask is set, return value in val1, else value in val0 */
    const __m128 from_val1 = _mm_and_ps(sel, val1);
    const __m128 from_val0 = _mm_andnot_ps(sel, val0);
    return _mm_or_ps(from_val1, from_val0);
}
156 #endif
158 } /* namespace detail */
159 } /* namespace nova */
161 #endif /* SIMD_UTILS_HPP */