/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2014,2015,2016,2017,2019,2020, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
#ifndef GMX_SIMD_IMPL_X86_SSE2_SIMD_DOUBLE_H
#define GMX_SIMD_IMPL_X86_SSE2_SIMD_DOUBLE_H

#include "config.h"

#include <cassert>
#include <cstddef>
#include <cstdint>

#include <emmintrin.h>

#include "gromacs/math/utilities.h"

#include "impl_x86_sse2_simd_float.h"

namespace gmx
{

class SimdDouble
{
public:
    SimdDouble() {}

    SimdDouble(double d) : simdInternal_(_mm_set1_pd(d)) {}

    // Internal utility constructor to simplify return statements
    SimdDouble(__m128d simd) : simdInternal_(simd) {}

    __m128d simdInternal_;
};

class SimdDInt32
{
public:
    SimdDInt32() {}

    SimdDInt32(std::int32_t i) : simdInternal_(_mm_set1_epi32(i)) {}

    // Internal utility constructor to simplify return statements
    SimdDInt32(__m128i simd) : simdInternal_(simd) {}

    __m128i simdInternal_;
};

class SimdDBool
{
public:
    SimdDBool() {}

    SimdDBool(bool b) : simdInternal_(_mm_castsi128_pd(_mm_set1_epi32(b ? 0xFFFFFFFF : 0))) {}

    // Internal utility constructor to simplify return statements
    SimdDBool(__m128d simd) : simdInternal_(simd) {}

    __m128d simdInternal_;
};

class SimdDIBool
{
public:
    SimdDIBool() {}

    SimdDIBool(bool b) : simdInternal_(_mm_set1_epi32(b ? 0xFFFFFFFF : 0)) {}

    // Internal utility constructor to simplify return statements
    SimdDIBool(__m128i simd) : simdInternal_(simd) {}

    __m128i simdInternal_;
};

static inline SimdDouble gmx_simdcall simdLoad(const double* m, SimdDoubleTag = {})
{
    assert(std::size_t(m) % 16 == 0);
    return { _mm_load_pd(m) };
}

static inline void gmx_simdcall store(double* m, SimdDouble a)
{
    assert(std::size_t(m) % 16 == 0);
    _mm_store_pd(m, a.simdInternal_);
}

static inline SimdDouble gmx_simdcall simdLoadU(const double* m, SimdDoubleTag = {})
{
    return { _mm_loadu_pd(m) };
}

static inline void gmx_simdcall storeU(double* m, SimdDouble a)
{
    _mm_storeu_pd(m, a.simdInternal_);
}

static inline SimdDouble gmx_simdcall setZeroD()
{
    return { _mm_setzero_pd() };
}

static inline SimdDInt32 gmx_simdcall simdLoad(const std::int32_t* m, SimdDInt32Tag)
{
    assert(std::size_t(m) % 8 == 0);
    return { _mm_loadl_epi64(reinterpret_cast<const __m128i*>(m)) };
}

static inline void gmx_simdcall store(std::int32_t* m, SimdDInt32 a)
{
    assert(std::size_t(m) % 8 == 0);
    _mm_storel_epi64(reinterpret_cast<__m128i*>(m), a.simdInternal_);
}

static inline SimdDInt32 gmx_simdcall simdLoadU(const std::int32_t* m, SimdDInt32Tag)
{
    return { _mm_loadl_epi64(reinterpret_cast<const __m128i*>(m)) };
}

static inline void gmx_simdcall storeU(std::int32_t* m, SimdDInt32 a)
{
    _mm_storel_epi64(reinterpret_cast<__m128i*>(m), a.simdInternal_);
}

static inline SimdDInt32 gmx_simdcall setZeroDI()
{
    return { _mm_setzero_si128() };
}

// Override for SSE4.1 and higher
#if GMX_SIMD_X86_SSE2
template<int index>
static inline std::int32_t gmx_simdcall extract(SimdDInt32 a)
{
    return _mm_cvtsi128_si32(_mm_srli_si128(a.simdInternal_, 4 * index));
}
#endif
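
/* Note: _mm_extract_epi32() requires SSE4.1, so on plain SSE2 the extract()
 * above shifts the requested element down to lane 0 with the byte-granular
 * _mm_srli_si128() and reads it back with _mm_cvtsi128_si32(). */
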
static inline SimdDouble gmx_simdcall operator&(SimdDouble a, SimdDouble b)
{
    return { _mm_and_pd(a.simdInternal_, b.simdInternal_) };
}

static inline SimdDouble gmx_simdcall andNot(SimdDouble a, SimdDouble b)
{
    return { _mm_andnot_pd(a.simdInternal_, b.simdInternal_) };
}

static inline SimdDouble gmx_simdcall operator|(SimdDouble a, SimdDouble b)
{
    return { _mm_or_pd(a.simdInternal_, b.simdInternal_) };
}

static inline SimdDouble gmx_simdcall operator^(SimdDouble a, SimdDouble b)
{
    return { _mm_xor_pd(a.simdInternal_, b.simdInternal_) };
}

static inline SimdDouble gmx_simdcall operator+(SimdDouble a, SimdDouble b)
{
    return { _mm_add_pd(a.simdInternal_, b.simdInternal_) };
}

static inline SimdDouble gmx_simdcall operator-(SimdDouble a, SimdDouble b)
{
    return { _mm_sub_pd(a.simdInternal_, b.simdInternal_) };
}

static inline SimdDouble gmx_simdcall operator-(SimdDouble x)
{
    return { _mm_xor_pd(x.simdInternal_, _mm_set1_pd(GMX_DOUBLE_NEGZERO)) };
}

static inline SimdDouble gmx_simdcall operator*(SimdDouble a, SimdDouble b)
{
    return { _mm_mul_pd(a.simdInternal_, b.simdInternal_) };
}

// Override for AVX-128-FMA and higher
#if GMX_SIMD_X86_SSE2 || GMX_SIMD_X86_SSE4_1
static inline SimdDouble gmx_simdcall fma(SimdDouble a, SimdDouble b, SimdDouble c)
{
    return { _mm_add_pd(_mm_mul_pd(a.simdInternal_, b.simdInternal_), c.simdInternal_) };
}

static inline SimdDouble gmx_simdcall fms(SimdDouble a, SimdDouble b, SimdDouble c)
{
    return { _mm_sub_pd(_mm_mul_pd(a.simdInternal_, b.simdInternal_), c.simdInternal_) };
}

static inline SimdDouble gmx_simdcall fnma(SimdDouble a, SimdDouble b, SimdDouble c)
{
    return { _mm_sub_pd(c.simdInternal_, _mm_mul_pd(a.simdInternal_, b.simdInternal_)) };
}

static inline SimdDouble gmx_simdcall fnms(SimdDouble a, SimdDouble b, SimdDouble c)
{
    return { _mm_sub_pd(_mm_setzero_pd(),
                        _mm_add_pd(_mm_mul_pd(a.simdInternal_, b.simdInternal_), c.simdInternal_)) };
}
#endif
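
/* Note: SSE2 has no fused multiply-add instructions, so the four helpers
 * above are composed from separate multiply and add/subtract operations.
 * The intermediate product is therefore rounded before the addition, unlike
 * true hardware FMA, which rounds only the final result. */
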
static inline SimdDouble gmx_simdcall rsqrt(SimdDouble x)
{
    return { _mm_cvtps_pd(_mm_rsqrt_ps(_mm_cvtpd_ps(x.simdInternal_))) };
}

static inline SimdDouble gmx_simdcall rcp(SimdDouble x)
{
    return { _mm_cvtps_pd(_mm_rcp_ps(_mm_cvtpd_ps(x.simdInternal_))) };
}
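
/* Note: rsqrt() and rcp() above only deliver the roughly 11-12 bits of
 * accuracy of the single-precision _mm_rsqrt_ps()/_mm_rcp_ps() table
 * lookups. Callers that need full double precision are expected to refine
 * the estimate with Newton-Raphson iterations in higher-level math code. */
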
static inline SimdDouble gmx_simdcall maskAdd(SimdDouble a, SimdDouble b, SimdDBool m)
{
    return { _mm_add_pd(a.simdInternal_, _mm_and_pd(b.simdInternal_, m.simdInternal_)) };
}

static inline SimdDouble gmx_simdcall maskzMul(SimdDouble a, SimdDouble b, SimdDBool m)
{
    return { _mm_and_pd(_mm_mul_pd(a.simdInternal_, b.simdInternal_), m.simdInternal_) };
}

static inline SimdDouble gmx_simdcall maskzFma(SimdDouble a, SimdDouble b, SimdDouble c, SimdDBool m)
{
    return { _mm_and_pd(_mm_add_pd(_mm_mul_pd(a.simdInternal_, b.simdInternal_), c.simdInternal_),
                        m.simdInternal_) };
}

// Override for SSE4.1 and higher
#if GMX_SIMD_X86_SSE2
static inline SimdDouble gmx_simdcall maskzRsqrt(SimdDouble x, SimdDBool m)
{
    // The result will always be correct since we mask the result with m, but
    // for debug builds we also want to make sure not to generate FP exceptions
#ifndef NDEBUG
    x.simdInternal_ = _mm_or_pd(_mm_andnot_pd(m.simdInternal_, _mm_set1_pd(1.0)),
                                _mm_and_pd(m.simdInternal_, x.simdInternal_));
#endif
    return { _mm_and_pd(_mm_cvtps_pd(_mm_rsqrt_ps(_mm_cvtpd_ps(x.simdInternal_))), m.simdInternal_) };
}

static inline SimdDouble gmx_simdcall maskzRcp(SimdDouble x, SimdDBool m)
{
    // The result will always be correct since we mask the result with m, but
    // for debug builds we also want to make sure not to generate FP exceptions
#ifndef NDEBUG
    x.simdInternal_ = _mm_or_pd(_mm_andnot_pd(m.simdInternal_, _mm_set1_pd(1.0)),
                                _mm_and_pd(m.simdInternal_, x.simdInternal_));
#endif
    return { _mm_and_pd(_mm_cvtps_pd(_mm_rcp_ps(_mm_cvtpd_ps(x.simdInternal_))), m.simdInternal_) };
}
#endif

static inline SimdDouble gmx_simdcall abs(SimdDouble x)
{
    return { _mm_andnot_pd(_mm_set1_pd(GMX_DOUBLE_NEGZERO), x.simdInternal_) };
}

static inline SimdDouble gmx_simdcall max(SimdDouble a, SimdDouble b)
{
    return { _mm_max_pd(a.simdInternal_, b.simdInternal_) };
}

static inline SimdDouble gmx_simdcall min(SimdDouble a, SimdDouble b)
{
    return { _mm_min_pd(a.simdInternal_, b.simdInternal_) };
}

// Override for SSE4.1 and higher
#if GMX_SIMD_X86_SSE2
static inline SimdDouble gmx_simdcall round(SimdDouble x)
{
    return { _mm_cvtepi32_pd(_mm_cvtpd_epi32(x.simdInternal_)) };
}

static inline SimdDouble gmx_simdcall trunc(SimdDouble x)
{
    return { _mm_cvtepi32_pd(_mm_cvttpd_epi32(x.simdInternal_)) };
}
#endif
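
/* Note: _mm_round_pd() is an SSE4.1 instruction, so on plain SSE2 the
 * round() and trunc() above go through 32-bit integer conversions. This is
 * exact only for values that fit in a 32-bit signed integer. */
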
template<MathOptimization opt = MathOptimization::Safe>
static inline SimdDouble frexp(SimdDouble value, SimdDInt32* exponent)
{
    // Don't use _mm_set1_epi64x() - on MSVC it is only supported for 64-bit builds
    const __m128d exponentMask =
            _mm_castsi128_pd(_mm_set_epi32(0x7FF00000, 0x00000000, 0x7FF00000, 0x00000000));
    const __m128d mantissaMask =
            _mm_castsi128_pd(_mm_set_epi32(0x800FFFFF, 0xFFFFFFFF, 0x800FFFFF, 0xFFFFFFFF));
    const __m128i exponentBias = _mm_set1_epi32(1022); // add 1 to make our definition identical to frexp()
    const __m128d half         = _mm_set1_pd(0.5);

    __m128i iExponent = _mm_castpd_si128(_mm_and_pd(value.simdInternal_, exponentMask));
    iExponent         = _mm_sub_epi32(_mm_srli_epi64(iExponent, 52), exponentBias);

    __m128d result = _mm_or_pd(_mm_and_pd(value.simdInternal_, mantissaMask), half);

    if (opt == MathOptimization::Safe)
    {
        __m128d valueIsZero = _mm_cmpeq_pd(_mm_setzero_pd(), value.simdInternal_);
        // Set the upper/lower 64-bit-fields of "iExponent" to 0-bits if the corresponding input value was +-0.0
        iExponent = _mm_andnot_si128(_mm_castpd_si128(valueIsZero), iExponent);
        // Set result to +-0 if the corresponding input value was +-0
        result = _mm_or_pd(_mm_andnot_pd(valueIsZero, result),
                           _mm_and_pd(valueIsZero, value.simdInternal_));
    }

    // Shuffle exponent so that 32-bit-fields 0 & 1 contain the relevant exponent values to return
    exponent->simdInternal_ = _mm_shuffle_epi32(iExponent, _MM_SHUFFLE(3, 1, 2, 0));

    return { result };
}
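
/* Usage sketch (hypothetical caller code, not part of this header): frexp()
 * above and ldexp() below are inverses, with the fraction magnitude in
 * [0.5, 1):
 *
 *     SimdDInt32 e;
 *     SimdDouble f = frexp(SimdDouble(6.0), &e); // f = {0.75, 0.75}, e holds 3
 *     SimdDouble v = ldexp(f, e);                // v = {6.0, 6.0}
 */
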
// Override for SSE4.1
#if GMX_SIMD_X86_SSE2
template<MathOptimization opt = MathOptimization::Safe>
static inline SimdDouble ldexp(SimdDouble value, SimdDInt32 exponent)
{
    const __m128i exponentBias = _mm_set1_epi32(1023);
    __m128i       iExponent    = _mm_add_epi32(exponent.simdInternal_, exponentBias);

    if (opt == MathOptimization::Safe)
    {
        // Make sure biased argument is not negative
        iExponent = _mm_and_si128(iExponent, _mm_cmpgt_epi32(iExponent, _mm_setzero_si128()));
    }

    // After conversion integers will be in slot 0,1. Move them to 0,2 so
    // we can do a 64-bit shift and get them to the dp exponents.
    iExponent = _mm_shuffle_epi32(iExponent, _MM_SHUFFLE(3, 1, 2, 0));
    iExponent = _mm_slli_epi64(iExponent, 52);

    return { _mm_mul_pd(value.simdInternal_, _mm_castsi128_pd(iExponent)) };
}
#endif
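
/* Note: in the Safe variant above, a biased exponent that is not positive is
 * masked to all-zero bits, so the multiplier becomes +0.0 and results that
 * would underflow are flushed to zero instead of being scaled by a garbage
 * exponent field. */
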
// Override for AVX-128-FMA and higher
#if GMX_SIMD_X86_SSE2 || GMX_SIMD_X86_SSE4_1
static inline double gmx_simdcall reduce(SimdDouble a)
{
    __m128d b = _mm_add_sd(a.simdInternal_,
                           _mm_shuffle_pd(a.simdInternal_, a.simdInternal_, _MM_SHUFFLE2(1, 1)));
    return *reinterpret_cast<double*>(&b);
}
#endif
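
/* Note: reduce() above performs the horizontal add by shuffling the upper
 * element of the pair into the lower lane, adding with the scalar
 * _mm_add_sd(), and reading lane 0 back as a double. */
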
static inline SimdDBool gmx_simdcall operator==(SimdDouble a, SimdDouble b)
{
    return { _mm_cmpeq_pd(a.simdInternal_, b.simdInternal_) };
}

static inline SimdDBool gmx_simdcall operator!=(SimdDouble a, SimdDouble b)
{
    return { _mm_cmpneq_pd(a.simdInternal_, b.simdInternal_) };
}

static inline SimdDBool gmx_simdcall operator<(SimdDouble a, SimdDouble b)
{
    return { _mm_cmplt_pd(a.simdInternal_, b.simdInternal_) };
}

static inline SimdDBool gmx_simdcall operator<=(SimdDouble a, SimdDouble b)
{
    return { _mm_cmple_pd(a.simdInternal_, b.simdInternal_) };
}

// Override for SSE4.1 and higher
#if GMX_SIMD_X86_SSE2
static inline SimdDBool gmx_simdcall testBits(SimdDouble a)
{
    __m128i ia  = _mm_castpd_si128(a.simdInternal_);
    __m128i res = _mm_andnot_si128(_mm_cmpeq_epi32(ia, _mm_setzero_si128()), _mm_cmpeq_epi32(ia, ia));

    // set each 64-bit element if low or high 32-bit part is set
    res = _mm_or_si128(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(2, 3, 0, 1)));

    return { _mm_castsi128_pd(res) };
}
#endif
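
/* Note: a 64-bit integer compare (_mm_cmpeq_epi64()) only arrived with
 * SSE4.1, so testBits() above tests the two 32-bit halves of each element
 * separately and then ORs each half with its neighbour to set the full
 * 64-bit mask. */
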
static inline SimdDBool gmx_simdcall operator&&(SimdDBool a, SimdDBool b)
{
    return { _mm_and_pd(a.simdInternal_, b.simdInternal_) };
}

static inline SimdDBool gmx_simdcall operator||(SimdDBool a, SimdDBool b)
{
    return { _mm_or_pd(a.simdInternal_, b.simdInternal_) };
}

static inline bool gmx_simdcall anyTrue(SimdDBool a)
{
    return _mm_movemask_pd(a.simdInternal_) != 0;
}

static inline SimdDouble gmx_simdcall selectByMask(SimdDouble a, SimdDBool mask)
{
    return { _mm_and_pd(a.simdInternal_, mask.simdInternal_) };
}

static inline SimdDouble gmx_simdcall selectByNotMask(SimdDouble a, SimdDBool mask)
{
    return { _mm_andnot_pd(mask.simdInternal_, a.simdInternal_) };
}

// Override for SSE4.1 and higher
#if GMX_SIMD_X86_SSE2
static inline SimdDouble gmx_simdcall blend(SimdDouble a, SimdDouble b, SimdDBool sel)
{
    return { _mm_or_pd(_mm_andnot_pd(sel.simdInternal_, a.simdInternal_),
                       _mm_and_pd(sel.simdInternal_, b.simdInternal_)) };
}
#endif
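
/* Note: the variable blend instruction _mm_blendv_pd() requires SSE4.1, so
 * on plain SSE2 blend() is emulated with the andnot/and/or sequence above;
 * the integer blend() further down uses the same pattern. */
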
static inline SimdDInt32 gmx_simdcall operator&(SimdDInt32 a, SimdDInt32 b)
{
    return { _mm_and_si128(a.simdInternal_, b.simdInternal_) };
}

static inline SimdDInt32 gmx_simdcall andNot(SimdDInt32 a, SimdDInt32 b)
{
    return { _mm_andnot_si128(a.simdInternal_, b.simdInternal_) };
}

static inline SimdDInt32 gmx_simdcall operator|(SimdDInt32 a, SimdDInt32 b)
{
    return { _mm_or_si128(a.simdInternal_, b.simdInternal_) };
}

static inline SimdDInt32 gmx_simdcall operator^(SimdDInt32 a, SimdDInt32 b)
{
    return { _mm_xor_si128(a.simdInternal_, b.simdInternal_) };
}

static inline SimdDInt32 gmx_simdcall operator+(SimdDInt32 a, SimdDInt32 b)
{
    return { _mm_add_epi32(a.simdInternal_, b.simdInternal_) };
}

static inline SimdDInt32 gmx_simdcall operator-(SimdDInt32 a, SimdDInt32 b)
{
    return { _mm_sub_epi32(a.simdInternal_, b.simdInternal_) };
}

// Override for SSE4.1 and higher
#if GMX_SIMD_X86_SSE2
static inline SimdDInt32 gmx_simdcall operator*(SimdDInt32 a, SimdDInt32 b)
{
    __m128i tmpA = _mm_unpacklo_epi32(a.simdInternal_, _mm_setzero_si128()); // 0 a[1] 0 a[0]
    __m128i tmpB = _mm_unpacklo_epi32(b.simdInternal_, _mm_setzero_si128()); // 0 b[1] 0 b[0]

    __m128i tmpC = _mm_mul_epu32(tmpA, tmpB); // 0 a[1]*b[1] 0 a[0]*b[0]

    return { _mm_shuffle_epi32(tmpC, _MM_SHUFFLE(3, 1, 2, 0)) };
}
#endif
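
/* Note: _mm_mullo_epi32() requires SSE4.1. Plain SSE2 only provides
 * _mm_mul_epu32(), which multiplies the even 32-bit lanes into 64-bit
 * results, so operator* above first widens both operands, multiplies, and
 * then compacts the low 32 bits of each product back into adjacent lanes. */
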
static inline SimdDIBool gmx_simdcall operator==(SimdDInt32 a, SimdDInt32 b)
{
    return { _mm_cmpeq_epi32(a.simdInternal_, b.simdInternal_) };
}

static inline SimdDIBool gmx_simdcall testBits(SimdDInt32 a)
{
    __m128i x   = a.simdInternal_;
    __m128i res = _mm_andnot_si128(_mm_cmpeq_epi32(x, _mm_setzero_si128()), _mm_cmpeq_epi32(x, x));

    return { res };
}

static inline SimdDIBool gmx_simdcall operator<(SimdDInt32 a, SimdDInt32 b)
{
    return { _mm_cmplt_epi32(a.simdInternal_, b.simdInternal_) };
}

static inline SimdDIBool gmx_simdcall operator&&(SimdDIBool a, SimdDIBool b)
{
    return { _mm_and_si128(a.simdInternal_, b.simdInternal_) };
}

static inline SimdDIBool gmx_simdcall operator||(SimdDIBool a, SimdDIBool b)
{
    return { _mm_or_si128(a.simdInternal_, b.simdInternal_) };
}

static inline bool gmx_simdcall anyTrue(SimdDIBool a)
{
    return _mm_movemask_epi8(_mm_shuffle_epi32(a.simdInternal_, _MM_SHUFFLE(1, 0, 1, 0))) != 0;
}

static inline SimdDInt32 gmx_simdcall selectByMask(SimdDInt32 a, SimdDIBool mask)
{
    return { _mm_and_si128(a.simdInternal_, mask.simdInternal_) };
}

static inline SimdDInt32 gmx_simdcall selectByNotMask(SimdDInt32 a, SimdDIBool mask)
{
    return { _mm_andnot_si128(mask.simdInternal_, a.simdInternal_) };
}

// Override for SSE4.1 and higher
#if GMX_SIMD_X86_SSE2
static inline SimdDInt32 gmx_simdcall blend(SimdDInt32 a, SimdDInt32 b, SimdDIBool sel)
{
    return { _mm_or_si128(_mm_andnot_si128(sel.simdInternal_, a.simdInternal_),
                          _mm_and_si128(sel.simdInternal_, b.simdInternal_)) };
}
#endif

static inline SimdDInt32 gmx_simdcall cvtR2I(SimdDouble a)
{
    return { _mm_cvtpd_epi32(a.simdInternal_) };
}

static inline SimdDInt32 gmx_simdcall cvttR2I(SimdDouble a)
{
    return { _mm_cvttpd_epi32(a.simdInternal_) };
}

static inline SimdDouble gmx_simdcall cvtI2R(SimdDInt32 a)
{
    return { _mm_cvtepi32_pd(a.simdInternal_) };
}

static inline SimdDIBool gmx_simdcall cvtB2IB(SimdDBool a)
{
    return { _mm_shuffle_epi32(_mm_castpd_si128(a.simdInternal_), _MM_SHUFFLE(2, 0, 2, 0)) };
}

static inline SimdDBool gmx_simdcall cvtIB2B(SimdDIBool a)
{
    return { _mm_castsi128_pd(_mm_shuffle_epi32(a.simdInternal_, _MM_SHUFFLE(1, 1, 0, 0))) };
}

static inline void gmx_simdcall cvtF2DD(SimdFloat f, SimdDouble* d0, SimdDouble* d1)
{
    d0->simdInternal_ = _mm_cvtps_pd(f.simdInternal_);
    d1->simdInternal_ = _mm_cvtps_pd(_mm_movehl_ps(f.simdInternal_, f.simdInternal_));
}

static inline SimdFloat gmx_simdcall cvtDD2F(SimdDouble d0, SimdDouble d1)
{
    return { _mm_movelh_ps(_mm_cvtpd_ps(d0.simdInternal_), _mm_cvtpd_ps(d1.simdInternal_)) };
}
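
/* Note: an SSE register holds four floats but only two doubles, so one
 * SimdFloat corresponds to a pair of SimdDouble variables here: cvtF2DD()
 * splits the low and high float halves into d0 and d1, and cvtDD2F() packs
 * them back into a single SimdFloat. */
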
} // namespace gmx

#endif // GMX_SIMD_IMPL_X86_SSE2_SIMD_DOUBLE_H