2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2014,2015,2017, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 #ifndef GMX_SIMD_IMPL_X86_AVX_256_SIMD4_DOUBLE_H
37 #define GMX_SIMD_IMPL_X86_AVX_256_SIMD4_DOUBLE_H
44 #include <immintrin.h>
54 Simd4Double(double d
) : simdInternal_(_mm256_set1_pd(d
)) {}
56 // Internal utility constructor to simplify return statements
57 Simd4Double(__m256d simd
) : simdInternal_(simd
) {}
//! \brief Underlying 256-bit AVX register (four double-precision lanes).
__m256d simdInternal_;
67 //! \brief Construct from scalar bool
68 Simd4DBool(bool b
) : simdInternal_(_mm256_castsi256_pd(_mm256_set1_epi32( b
? 0xFFFFFFFF : 0))) {}
70 // Internal utility constructor to simplify return statements
71 Simd4DBool(__m256d simd
) : simdInternal_(simd
) {}
//! \brief Underlying 256-bit AVX register holding the per-lane mask bits.
__m256d simdInternal_;
76 static inline Simd4Double gmx_simdcall
77 load4(const double *m
)
79 assert(std::size_t(m
) % 32 == 0);
85 static inline void gmx_simdcall
86 store4(double *m
, Simd4Double a
)
88 assert(std::size_t(m
) % 32 == 0);
89 _mm256_store_pd(m
, a
.simdInternal_
);
92 static inline Simd4Double gmx_simdcall
93 load4U(const double *m
)
100 static inline void gmx_simdcall
101 store4U(double *m
, Simd4Double a
)
103 _mm256_storeu_pd(m
, a
.simdInternal_
);
106 static inline Simd4Double gmx_simdcall
114 static inline Simd4Double gmx_simdcall
115 operator&(Simd4Double a
, Simd4Double b
)
118 _mm256_and_pd(a
.simdInternal_
, b
.simdInternal_
)
122 static inline Simd4Double gmx_simdcall
123 andNot(Simd4Double a
, Simd4Double b
)
126 _mm256_andnot_pd(a
.simdInternal_
, b
.simdInternal_
)
130 static inline Simd4Double gmx_simdcall
131 operator|(Simd4Double a
, Simd4Double b
)
134 _mm256_or_pd(a
.simdInternal_
, b
.simdInternal_
)
138 static inline Simd4Double gmx_simdcall
139 operator^(Simd4Double a
, Simd4Double b
)
142 _mm256_xor_pd(a
.simdInternal_
, b
.simdInternal_
)
146 static inline Simd4Double gmx_simdcall
147 operator+(Simd4Double a
, Simd4Double b
)
150 _mm256_add_pd(a
.simdInternal_
, b
.simdInternal_
)
154 static inline Simd4Double gmx_simdcall
155 operator-(Simd4Double a
, Simd4Double b
)
158 _mm256_sub_pd(a
.simdInternal_
, b
.simdInternal_
)
162 static inline Simd4Double gmx_simdcall
163 operator-(Simd4Double x
)
166 _mm256_xor_pd(x
.simdInternal_
, _mm256_set1_pd(GMX_DOUBLE_NEGZERO
))
170 static inline Simd4Double gmx_simdcall
171 operator*(Simd4Double a
, Simd4Double b
)
174 _mm256_mul_pd(a
.simdInternal_
, b
.simdInternal_
)
// Multiply-add helpers built from separate mul/add/sub instructions (not fused).
// Override for AVX2 and higher.
179 #if GMX_SIMD_X86_AVX_256
180 static inline Simd4Double gmx_simdcall
181 fma(Simd4Double a
, Simd4Double b
, Simd4Double c
)
184 _mm256_add_pd(_mm256_mul_pd(a
.simdInternal_
, b
.simdInternal_
), c
.simdInternal_
)
188 static inline Simd4Double gmx_simdcall
189 fms(Simd4Double a
, Simd4Double b
, Simd4Double c
)
192 _mm256_sub_pd(_mm256_mul_pd(a
.simdInternal_
, b
.simdInternal_
), c
.simdInternal_
)
196 static inline Simd4Double gmx_simdcall
197 fnma(Simd4Double a
, Simd4Double b
, Simd4Double c
)
200 _mm256_sub_pd(c
.simdInternal_
, _mm256_mul_pd(a
.simdInternal_
, b
.simdInternal_
))
204 static inline Simd4Double gmx_simdcall
205 fnms(Simd4Double a
, Simd4Double b
, Simd4Double c
)
208 _mm256_sub_pd(_mm256_setzero_pd(), _mm256_add_pd(_mm256_mul_pd(a
.simdInternal_
, b
.simdInternal_
), c
.simdInternal_
))
213 static inline Simd4Double gmx_simdcall
217 _mm256_cvtps_pd(_mm_rsqrt_ps(_mm256_cvtpd_ps(x
.simdInternal_
)))
221 static inline Simd4Double gmx_simdcall
225 _mm256_andnot_pd( _mm256_set1_pd(GMX_DOUBLE_NEGZERO
), x
.simdInternal_
)
229 static inline Simd4Double gmx_simdcall
230 max(Simd4Double a
, Simd4Double b
)
233 _mm256_max_pd(a
.simdInternal_
, b
.simdInternal_
)
237 static inline Simd4Double gmx_simdcall
238 min(Simd4Double a
, Simd4Double b
)
241 _mm256_min_pd(a
.simdInternal_
, b
.simdInternal_
)
245 static inline Simd4Double gmx_simdcall
249 _mm256_round_pd(x
.simdInternal_
, _MM_FROUND_NINT
)
253 static inline Simd4Double gmx_simdcall
257 _mm256_round_pd(x
.simdInternal_
, _MM_FROUND_TRUNC
)
261 static inline double gmx_simdcall
262 dotProduct(Simd4Double a
, Simd4Double b
)
265 a
.simdInternal_
= _mm256_mul_pd(a
.simdInternal_
, b
.simdInternal_
);
266 tmp1
= _mm256_castpd256_pd128(a
.simdInternal_
);
267 tmp2
= _mm256_extractf128_pd(a
.simdInternal_
, 0x1);
269 tmp1
= _mm_add_pd(tmp1
, _mm_permute_pd(tmp1
, _MM_SHUFFLE2(0, 1)));
270 tmp1
= _mm_add_pd(tmp1
, tmp2
);
271 return *reinterpret_cast<double *>(&tmp1
);
274 static inline void gmx_simdcall
275 transpose(Simd4Double
* v0
, Simd4Double
* v1
,
276 Simd4Double
* v2
, Simd4Double
* v3
)
278 __m256d t1
, t2
, t3
, t4
;
279 t1
= _mm256_unpacklo_pd(v0
->simdInternal_
, v1
->simdInternal_
);
280 t2
= _mm256_unpackhi_pd(v0
->simdInternal_
, v1
->simdInternal_
);
281 t3
= _mm256_unpacklo_pd(v2
->simdInternal_
, v3
->simdInternal_
);
282 t4
= _mm256_unpackhi_pd(v2
->simdInternal_
, v3
->simdInternal_
);
283 v0
->simdInternal_
= _mm256_permute2f128_pd(t1
, t3
, 0x20);
284 v1
->simdInternal_
= _mm256_permute2f128_pd(t2
, t4
, 0x20);
285 v2
->simdInternal_
= _mm256_permute2f128_pd(t1
, t3
, 0x31);
286 v3
->simdInternal_
= _mm256_permute2f128_pd(t2
, t4
, 0x31);
289 static inline Simd4DBool gmx_simdcall
290 operator==(Simd4Double a
, Simd4Double b
)
293 _mm256_cmp_pd(a
.simdInternal_
, b
.simdInternal_
, _CMP_EQ_OQ
)
297 static inline Simd4DBool gmx_simdcall
298 operator!=(Simd4Double a
, Simd4Double b
)
301 _mm256_cmp_pd(a
.simdInternal_
, b
.simdInternal_
, _CMP_NEQ_OQ
)
305 static inline Simd4DBool gmx_simdcall
306 operator<(Simd4Double a
, Simd4Double b
)
309 _mm256_cmp_pd(a
.simdInternal_
, b
.simdInternal_
, _CMP_LT_OQ
)
313 static inline Simd4DBool gmx_simdcall
314 operator<=(Simd4Double a
, Simd4Double b
)
317 _mm256_cmp_pd(a
.simdInternal_
, b
.simdInternal_
, _CMP_LE_OQ
)
321 static inline Simd4DBool gmx_simdcall
322 operator&&(Simd4DBool a
, Simd4DBool b
)
325 _mm256_and_pd(a
.simdInternal_
, b
.simdInternal_
)
329 static inline Simd4DBool gmx_simdcall
330 operator||(Simd4DBool a
, Simd4DBool b
)
333 _mm256_or_pd(a
.simdInternal_
, b
.simdInternal_
)
337 static inline bool gmx_simdcall
338 anyTrue(Simd4DBool a
) { return _mm256_movemask_pd(a
.simdInternal_
) != 0; }
340 static inline Simd4Double gmx_simdcall
341 selectByMask(Simd4Double a
, Simd4DBool mask
)
344 _mm256_and_pd(a
.simdInternal_
, mask
.simdInternal_
)
348 static inline Simd4Double gmx_simdcall
349 selectByNotMask(Simd4Double a
, Simd4DBool mask
)
352 _mm256_andnot_pd(mask
.simdInternal_
, a
.simdInternal_
)
356 static inline Simd4Double gmx_simdcall
357 blend(Simd4Double a
, Simd4Double b
, Simd4DBool sel
)
360 _mm256_blendv_pd(a
.simdInternal_
, b
.simdInternal_
, sel
.simdInternal_
)
364 static inline double gmx_simdcall
365 reduce(Simd4Double a
)
368 // test with shuffle & add as an alternative to hadd later
369 a
.simdInternal_
= _mm256_hadd_pd(a
.simdInternal_
, a
.simdInternal_
);
370 a0
= _mm256_castpd256_pd128(a
.simdInternal_
);
371 a1
= _mm256_extractf128_pd(a
.simdInternal_
, 0x1);
372 a0
= _mm_add_sd(a0
, a1
);
373 return *reinterpret_cast<double *>(&a0
);
378 #endif // GMX_SIMD_IMPL_X86_AVX_256_SIMD4_DOUBLE_H