Improve workload data structures' docs
[gromacs.git] / src / gromacs / simd / impl_x86_sse2 / impl_x86_sse2_simd4_float.h
bloba2af83a127f1d392921599e380ae665c7c5e78c1
1 /*
2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2014,2015, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
35 #ifndef GMX_SIMD_IMPL_X86_SSE2_SIMD4_FLOAT_H
36 #define GMX_SIMD_IMPL_X86_SSE2_SIMD4_FLOAT_H
38 #include "config.h"
40 #include <cassert>
41 #include <cstddef>
43 #include <emmintrin.h>
45 namespace gmx
48 class Simd4Float
50 public:
51 Simd4Float() {}
53 Simd4Float(float f) : simdInternal_(_mm_set1_ps(f)) {}
55 // Internal utility constructor to simplify return statements
56 Simd4Float(__m128 simd) : simdInternal_(simd) {}
58 __m128 simdInternal_;
61 class Simd4FBool
63 public:
64 Simd4FBool() {}
66 //! \brief Construct from scalar bool
67 Simd4FBool(bool b) : simdInternal_(_mm_castsi128_ps(_mm_set1_epi32( b ? 0xFFFFFFFF : 0))) {}
69 // Internal utility constructor to simplify return statements
70 Simd4FBool(__m128 simd) : simdInternal_(simd) {}
72 __m128 simdInternal_;
75 static inline Simd4Float gmx_simdcall
76 load4(const float *m)
78 assert(size_t(m) % 16 == 0);
79 return {
80 _mm_load_ps(m)
84 static inline void gmx_simdcall
85 store4(float *m, Simd4Float a)
87 assert(size_t(m) % 16 == 0);
88 _mm_store_ps(m, a.simdInternal_);
91 static inline Simd4Float gmx_simdcall
92 load4U(const float *m)
94 return {
95 _mm_loadu_ps(m)
99 static inline void gmx_simdcall
100 store4U(float *m, Simd4Float a)
102 _mm_storeu_ps(m, a.simdInternal_);
105 static inline Simd4Float gmx_simdcall
106 simd4SetZeroF()
108 return {
109 _mm_setzero_ps()
113 static inline Simd4Float gmx_simdcall
114 operator&(Simd4Float a, Simd4Float b)
116 return {
117 _mm_and_ps(a.simdInternal_, b.simdInternal_)
121 static inline Simd4Float gmx_simdcall
122 andNot(Simd4Float a, Simd4Float b)
124 return {
125 _mm_andnot_ps(a.simdInternal_, b.simdInternal_)
129 static inline Simd4Float gmx_simdcall
130 operator|(Simd4Float a, Simd4Float b)
132 return {
133 _mm_or_ps(a.simdInternal_, b.simdInternal_)
137 static inline Simd4Float gmx_simdcall
138 operator^(Simd4Float a, Simd4Float b)
140 return {
141 _mm_xor_ps(a.simdInternal_, b.simdInternal_)
145 static inline Simd4Float gmx_simdcall
146 operator+(Simd4Float a, Simd4Float b)
148 return {
149 _mm_add_ps(a.simdInternal_, b.simdInternal_)
153 static inline Simd4Float gmx_simdcall
154 operator-(Simd4Float a, Simd4Float b)
156 return {
157 _mm_sub_ps(a.simdInternal_, b.simdInternal_)
161 static inline Simd4Float gmx_simdcall
162 operator-(Simd4Float x)
164 return {
165 _mm_xor_ps(x.simdInternal_, _mm_set1_ps(GMX_FLOAT_NEGZERO))
169 static inline Simd4Float gmx_simdcall
170 operator*(Simd4Float a, Simd4Float b)
172 return {
173 _mm_mul_ps(a.simdInternal_, b.simdInternal_)
177 // Override for AVX-128-FMA and higher
178 #if GMX_SIMD_X86_SSE2 || GMX_SIMD_X86_SSE4_1
179 static inline Simd4Float gmx_simdcall
180 fma(Simd4Float a, Simd4Float b, Simd4Float c)
182 return {
183 _mm_add_ps(_mm_mul_ps(a.simdInternal_, b.simdInternal_), c.simdInternal_)
187 static inline Simd4Float gmx_simdcall
188 fms(Simd4Float a, Simd4Float b, Simd4Float c)
190 return {
191 _mm_sub_ps(_mm_mul_ps(a.simdInternal_, b.simdInternal_), c.simdInternal_)
195 static inline Simd4Float gmx_simdcall
196 fnma(Simd4Float a, Simd4Float b, Simd4Float c)
198 return {
199 _mm_sub_ps(c.simdInternal_, _mm_mul_ps(a.simdInternal_, b.simdInternal_))
203 static inline Simd4Float gmx_simdcall
204 fnms(Simd4Float a, Simd4Float b, Simd4Float c)
206 return {
207 _mm_sub_ps(_mm_setzero_ps(), _mm_add_ps(_mm_mul_ps(a.simdInternal_, b.simdInternal_), c.simdInternal_))
210 #endif
212 static inline Simd4Float gmx_simdcall
213 rsqrt(Simd4Float x)
215 return {
216 _mm_rsqrt_ps(x.simdInternal_)
220 static inline Simd4Float gmx_simdcall
221 abs(Simd4Float x)
223 return {
224 _mm_andnot_ps( _mm_set1_ps(GMX_FLOAT_NEGZERO), x.simdInternal_ )
228 static inline Simd4Float gmx_simdcall
229 max(Simd4Float a, Simd4Float b)
231 return {
232 _mm_max_ps(a.simdInternal_, b.simdInternal_)
236 static inline Simd4Float gmx_simdcall
237 min(Simd4Float a, Simd4Float b)
239 return {
240 _mm_min_ps(a.simdInternal_, b.simdInternal_)
244 // Override for SSE4.1 and higher
245 #if GMX_SIMD_X86_SSE2
246 static inline Simd4Float gmx_simdcall
247 round(Simd4Float x)
249 return {
250 _mm_cvtepi32_ps( _mm_cvtps_epi32(x.simdInternal_) )
254 static inline Simd4Float gmx_simdcall
255 trunc(Simd4Float x)
257 return {
258 _mm_cvtepi32_ps( _mm_cvttps_epi32(x.simdInternal_) )
262 static inline float gmx_simdcall
263 dotProduct(Simd4Float a, Simd4Float b)
265 __m128 c, d;
266 c = _mm_mul_ps(a.simdInternal_, b.simdInternal_);
267 d = _mm_add_ps(c, _mm_shuffle_ps(c, c, _MM_SHUFFLE(2, 1, 2, 1)));
268 d = _mm_add_ps(d, _mm_shuffle_ps(c, c, _MM_SHUFFLE(3, 2, 3, 2)));
269 return *reinterpret_cast<float *>(&d);
271 #endif
273 static inline void gmx_simdcall
274 transpose(Simd4Float * v0, Simd4Float * v1,
275 Simd4Float * v2, Simd4Float * v3)
277 _MM_TRANSPOSE4_PS(v0->simdInternal_, v1->simdInternal_, v2->simdInternal_, v3->simdInternal_);
280 static inline Simd4FBool gmx_simdcall
281 operator==(Simd4Float a, Simd4Float b)
283 return {
284 _mm_cmpeq_ps(a.simdInternal_, b.simdInternal_)
288 static inline Simd4FBool gmx_simdcall
289 operator!=(Simd4Float a, Simd4Float b)
291 return {
292 _mm_cmpneq_ps(a.simdInternal_, b.simdInternal_)
296 static inline Simd4FBool gmx_simdcall
297 operator<(Simd4Float a, Simd4Float b)
299 return {
300 _mm_cmplt_ps(a.simdInternal_, b.simdInternal_)
304 static inline Simd4FBool gmx_simdcall
305 operator<=(Simd4Float a, Simd4Float b)
307 return {
308 _mm_cmple_ps(a.simdInternal_, b.simdInternal_)
312 static inline Simd4FBool gmx_simdcall
313 operator&&(Simd4FBool a, Simd4FBool b)
315 return {
316 _mm_and_ps(a.simdInternal_, b.simdInternal_)
320 static inline Simd4FBool gmx_simdcall
321 operator||(Simd4FBool a, Simd4FBool b)
323 return {
324 _mm_or_ps(a.simdInternal_, b.simdInternal_)
328 static inline bool gmx_simdcall
329 anyTrue(Simd4FBool a) { return _mm_movemask_ps(a.simdInternal_) != 0; }
331 static inline Simd4Float gmx_simdcall
332 selectByMask(Simd4Float a, Simd4FBool mask)
334 return {
335 _mm_and_ps(a.simdInternal_, mask.simdInternal_)
339 static inline Simd4Float gmx_simdcall
340 selectByNotMask(Simd4Float a, Simd4FBool mask)
342 return {
343 _mm_andnot_ps(mask.simdInternal_, a.simdInternal_)
347 // Override for SSE4.1 and higher
348 #if GMX_SIMD_X86_SSE2
349 static inline Simd4Float gmx_simdcall
350 blend(Simd4Float a, Simd4Float b, Simd4FBool sel)
352 return {
353 _mm_or_ps(_mm_andnot_ps(sel.simdInternal_, a.simdInternal_), _mm_and_ps(sel.simdInternal_, b.simdInternal_))
356 #endif
358 // Override for AVX-128-FMA and higher
359 #if GMX_SIMD_X86_SSE2 || GMX_SIMD_X86_SSE4_1
360 static inline float gmx_simdcall
361 reduce(Simd4Float a)
363 __m128 b;
364 b = _mm_add_ps(a.simdInternal_, _mm_shuffle_ps(a.simdInternal_, a.simdInternal_, _MM_SHUFFLE(1, 0, 3, 2)));
365 b = _mm_add_ss(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(0, 3, 2, 1)));
366 return *reinterpret_cast<float *>(&b);
368 #endif
370 } // namespace gmx
372 #endif // GMX_SIMD_IMPL_X86_SSE2_SIMD4_FLOAT_H