src/gromacs/simd/impl_x86_avx_256/impl_x86_avx_256_simd4_double.h
/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2014,2015,2017, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
#ifndef GMX_SIMD_IMPL_X86_AVX_256_SIMD4_DOUBLE_H
#define GMX_SIMD_IMPL_X86_AVX_256_SIMD4_DOUBLE_H

#include "config.h"

#include <cassert>
#include <cstddef>

#include <immintrin.h>
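
/* AVX (256-bit) implementation of the GROMACS SIMD4 interface for double
 * precision: each Simd4Double holds four doubles in one __m256d register.
 * A minimal usage sketch (illustrative only; the buffers x, y, z and out
 * are hypothetical and assumed to be 32-byte aligned):
 *
 *     gmx::Simd4Double a = gmx::load4(x);
 *     gmx::Simd4Double b = gmx::load4(y);
 *     gmx::store4(out, gmx::fma(a, b, gmx::load4(z)));  // out[i] = x[i]*y[i] + z[i]
 */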

namespace gmx
{

class Simd4Double
{
    public:
        Simd4Double() {}

        Simd4Double(double d) : simdInternal_(_mm256_set1_pd(d)) {}

        // Internal utility constructor to simplify return statements
        Simd4Double(__m256d simd) : simdInternal_(simd) {}

        __m256d  simdInternal_;
};
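
// Boolean type for comparisons on Simd4Double. Each 64-bit element is
// either all ones (true) or all zeros (false).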
class Simd4DBool
{
    public:
        Simd4DBool() {}

        //! \brief Construct from scalar bool
        Simd4DBool(bool b) : simdInternal_(_mm256_castsi256_pd(_mm256_set1_epi32( b ? 0xFFFFFFFF : 0))) {}

        // Internal utility constructor to simplify return statements
        Simd4DBool(__m256d simd) : simdInternal_(simd) {}

        __m256d  simdInternal_;
};
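
// load4/store4 require 32-byte-aligned addresses (asserted below); the
// load4U/store4U variants accept unaligned addresses.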
static inline Simd4Double gmx_simdcall
load4(const double *m)
{
    assert(std::size_t(m) % 32 == 0);
    return {
               _mm256_load_pd(m)
    };
}

static inline void gmx_simdcall
store4(double *m, Simd4Double a)
{
    assert(std::size_t(m) % 32 == 0);
    _mm256_store_pd(m, a.simdInternal_);
}

static inline Simd4Double gmx_simdcall
load4U(const double *m)
{
    return {
               _mm256_loadu_pd(m)
    };
}

static inline void gmx_simdcall
store4U(double *m, Simd4Double a)
{
    _mm256_storeu_pd(m, a.simdInternal_);
}

static inline Simd4Double gmx_simdcall
simd4SetZeroD()
{
    return {
               _mm256_setzero_pd()
    };
}

static inline Simd4Double gmx_simdcall
operator&(Simd4Double a, Simd4Double b)
{
    return {
               _mm256_and_pd(a.simdInternal_, b.simdInternal_)
    };
}

static inline Simd4Double gmx_simdcall
andNot(Simd4Double a, Simd4Double b)
{
    return {
               _mm256_andnot_pd(a.simdInternal_, b.simdInternal_)
    };
}

static inline Simd4Double gmx_simdcall
operator|(Simd4Double a, Simd4Double b)
{
    return {
               _mm256_or_pd(a.simdInternal_, b.simdInternal_)
    };
}

static inline Simd4Double gmx_simdcall
operator^(Simd4Double a, Simd4Double b)
{
    return {
               _mm256_xor_pd(a.simdInternal_, b.simdInternal_)
    };
}

static inline Simd4Double gmx_simdcall
operator+(Simd4Double a, Simd4Double b)
{
    return {
               _mm256_add_pd(a.simdInternal_, b.simdInternal_)
    };
}

static inline Simd4Double gmx_simdcall
operator-(Simd4Double a, Simd4Double b)
{
    return {
               _mm256_sub_pd(a.simdInternal_, b.simdInternal_)
    };
}
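
// Unary minus: flip the sign bit of each element by XOR with -0.0
// (GMX_DOUBLE_NEGZERO, defined elsewhere in GROMACS).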
static inline Simd4Double gmx_simdcall
operator-(Simd4Double x)
{
    return {
               _mm256_xor_pd(x.simdInternal_, _mm256_set1_pd(GMX_DOUBLE_NEGZERO))
    };
}

static inline Simd4Double gmx_simdcall
operator*(Simd4Double a, Simd4Double b)
{
    return {
               _mm256_mul_pd(a.simdInternal_, b.simdInternal_)
    };
}
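
// Plain AVX lacks FMA instructions, so the fused operations below are
// emulated with separate multiply and add/subtract steps.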
// Override for AVX2 and higher
#if GMX_SIMD_X86_AVX_256
static inline Simd4Double gmx_simdcall
fma(Simd4Double a, Simd4Double b, Simd4Double c)
{
    return {
               _mm256_add_pd(_mm256_mul_pd(a.simdInternal_, b.simdInternal_), c.simdInternal_)
    };
}

static inline Simd4Double gmx_simdcall
fms(Simd4Double a, Simd4Double b, Simd4Double c)
{
    return {
               _mm256_sub_pd(_mm256_mul_pd(a.simdInternal_, b.simdInternal_), c.simdInternal_)
    };
}

static inline Simd4Double gmx_simdcall
fnma(Simd4Double a, Simd4Double b, Simd4Double c)
{
    return {
               _mm256_sub_pd(c.simdInternal_, _mm256_mul_pd(a.simdInternal_, b.simdInternal_))
    };
}

static inline Simd4Double gmx_simdcall
fnms(Simd4Double a, Simd4Double b, Simd4Double c)
{
    return {
               _mm256_sub_pd(_mm256_setzero_pd(), _mm256_add_pd(_mm256_mul_pd(a.simdInternal_, b.simdInternal_), c.simdInternal_))
    };
}
#endif
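
// Approximate 1/sqrt(x) by converting to single precision and using the
// hardware rsqrt approximation (roughly 11 bits of accuracy); callers are
// expected to refine the result with Newton-Raphson iterations.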
static inline Simd4Double gmx_simdcall
rsqrt(Simd4Double x)
{
    return {
               _mm256_cvtps_pd(_mm_rsqrt_ps(_mm256_cvtpd_ps(x.simdInternal_)))
    };
}
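
// abs: clear the sign bit of each element by AND-NOT with -0.0.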
static inline Simd4Double gmx_simdcall
abs(Simd4Double x)
{
    return {
               _mm256_andnot_pd( _mm256_set1_pd(GMX_DOUBLE_NEGZERO), x.simdInternal_ )
    };
}

static inline Simd4Double gmx_simdcall
max(Simd4Double a, Simd4Double b)
{
    return {
               _mm256_max_pd(a.simdInternal_, b.simdInternal_)
    };
}

static inline Simd4Double gmx_simdcall
min(Simd4Double a, Simd4Double b)
{
    return {
               _mm256_min_pd(a.simdInternal_, b.simdInternal_)
    };
}

static inline Simd4Double gmx_simdcall
round(Simd4Double x)
{
    return {
               _mm256_round_pd(x.simdInternal_, _MM_FROUND_NINT)
    };
}

static inline Simd4Double gmx_simdcall
trunc(Simd4Double x)
{
    return {
               _mm256_round_pd(x.simdInternal_, _MM_FROUND_TRUNC)
    };
}
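
// Returns the dot product of the first three elements (the fourth is
// ignored), matching the SIMD4 use for 3D vectors.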
static inline double gmx_simdcall
dotProduct(Simd4Double a, Simd4Double b)
{
    __m128d tmp1, tmp2;
    a.simdInternal_ = _mm256_mul_pd(a.simdInternal_, b.simdInternal_);
    tmp1 = _mm256_castpd256_pd128(a.simdInternal_);
    tmp2 = _mm256_extractf128_pd(a.simdInternal_, 0x1);

    tmp1 = _mm_add_pd(tmp1, _mm_permute_pd(tmp1, _MM_SHUFFLE2(0, 1)));
    tmp1 = _mm_add_pd(tmp1, tmp2);
    return _mm_cvtsd_f64(tmp1);
}
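
// In-register 4x4 transpose: unpack low/high pairs within each 128-bit
// lane, then swap lanes with _mm256_permute2f128_pd.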
static inline void gmx_simdcall
transpose(Simd4Double * v0, Simd4Double * v1,
          Simd4Double * v2, Simd4Double * v3)
{
    __m256d t1, t2, t3, t4;
    t1                = _mm256_unpacklo_pd(v0->simdInternal_, v1->simdInternal_);
    t2                = _mm256_unpackhi_pd(v0->simdInternal_, v1->simdInternal_);
    t3                = _mm256_unpacklo_pd(v2->simdInternal_, v3->simdInternal_);
    t4                = _mm256_unpackhi_pd(v2->simdInternal_, v3->simdInternal_);
    v0->simdInternal_ = _mm256_permute2f128_pd(t1, t3, 0x20);
    v1->simdInternal_ = _mm256_permute2f128_pd(t2, t4, 0x20);
    v2->simdInternal_ = _mm256_permute2f128_pd(t1, t3, 0x31);
    v3->simdInternal_ = _mm256_permute2f128_pd(t2, t4, 0x31);
}

static inline Simd4DBool gmx_simdcall
operator==(Simd4Double a, Simd4Double b)
{
    return {
               _mm256_cmp_pd(a.simdInternal_, b.simdInternal_, _CMP_EQ_OQ)
    };
}

static inline Simd4DBool gmx_simdcall
operator!=(Simd4Double a, Simd4Double b)
{
    return {
               _mm256_cmp_pd(a.simdInternal_, b.simdInternal_, _CMP_NEQ_OQ)
    };
}

static inline Simd4DBool gmx_simdcall
operator<(Simd4Double a, Simd4Double b)
{
    return {
               _mm256_cmp_pd(a.simdInternal_, b.simdInternal_, _CMP_LT_OQ)
    };
}

static inline Simd4DBool gmx_simdcall
operator<=(Simd4Double a, Simd4Double b)
{
    return {
               _mm256_cmp_pd(a.simdInternal_, b.simdInternal_, _CMP_LE_OQ)
    };
}

static inline Simd4DBool gmx_simdcall
operator&&(Simd4DBool a, Simd4DBool b)
{
    return {
               _mm256_and_pd(a.simdInternal_, b.simdInternal_)
    };
}

static inline Simd4DBool gmx_simdcall
operator||(Simd4DBool a, Simd4DBool b)
{
    return {
               _mm256_or_pd(a.simdInternal_, b.simdInternal_)
    };
}

static inline bool gmx_simdcall
anyTrue(Simd4DBool a) { return _mm256_movemask_pd(a.simdInternal_) != 0; }
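
// selectByMask returns a where the mask is true and zero elsewhere;
// selectByNotMask is the complement.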
static inline Simd4Double gmx_simdcall
selectByMask(Simd4Double a, Simd4DBool mask)
{
    return {
               _mm256_and_pd(a.simdInternal_, mask.simdInternal_)
    };
}

static inline Simd4Double gmx_simdcall
selectByNotMask(Simd4Double a, Simd4DBool mask)
{
    return {
               _mm256_andnot_pd(mask.simdInternal_, a.simdInternal_)
    };
}

static inline Simd4Double gmx_simdcall
blend(Simd4Double a, Simd4Double b, Simd4DBool sel)
{
    return {
               _mm256_blendv_pd(a.simdInternal_, b.simdInternal_, sel.simdInternal_)
    };
}
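
// Returns the sum of all four elements.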
static inline double gmx_simdcall
reduce(Simd4Double a)
{
    __m128d a0, a1;
    // test with shuffle & add as an alternative to hadd later
    a.simdInternal_ = _mm256_hadd_pd(a.simdInternal_, a.simdInternal_);
    a0 = _mm256_castpd256_pd128(a.simdInternal_);
    a1 = _mm256_extractf128_pd(a.simdInternal_, 0x1);
    a0 = _mm_add_sd(a0, a1);
    return _mm_cvtsd_f64(a0);
}

}      // namespace gmx

#endif // GMX_SIMD_IMPL_X86_AVX_256_SIMD4_DOUBLE_H