2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2014,2015,2016,2017,2019,2020, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 #ifndef GMX_SIMD_IMPL_ARM_NEON_ASIMD_SIMD_DOUBLE_H
37 #define GMX_SIMD_IMPL_ARM_NEON_ASIMD_SIMD_DOUBLE_H
45 #include "gromacs/math/utilities.h"
47 #include "impl_arm_neon_asimd_simd_float.h"
57 SimdDouble(double d
) : simdInternal_(vdupq_n_f64(d
)) {}
59 // Internal utility constructor to simplify return statements
60 SimdDouble(float64x2_t simd
) : simdInternal_(simd
) {}
62 float64x2_t simdInternal_
;
70 SimdDInt32(std::int32_t i
) : simdInternal_(vdup_n_s32(i
)) {}
72 // Internal utility constructor to simplify return statements
73 SimdDInt32(int32x2_t simd
) : simdInternal_(simd
) {}
75 int32x2_t simdInternal_
;
83 SimdDBool(bool b
) : simdInternal_(vdupq_n_u64(b
? 0xFFFFFFFFFFFFFFFF : 0)) {}
85 // Internal utility constructor to simplify return statements
86 SimdDBool(uint64x2_t simd
) : simdInternal_(simd
) {}
88 uint64x2_t simdInternal_
;
96 SimdDIBool(bool b
) : simdInternal_(vdup_n_u32(b
? 0xFFFFFFFF : 0)) {}
98 // Internal utility constructor to simplify return statements
99 SimdDIBool(uint32x2_t simd
) : simdInternal_(simd
) {}
101 uint32x2_t simdInternal_
;
104 static inline SimdDouble gmx_simdcall
simdLoad(const double* m
, SimdDoubleTag
= {})
106 assert(std::size_t(m
) % 16 == 0);
107 return { vld1q_f64(m
) };
110 static inline void gmx_simdcall
store(double* m
, SimdDouble a
)
112 assert(std::size_t(m
) % 16 == 0);
113 vst1q_f64(m
, a
.simdInternal_
);
116 static inline SimdDouble gmx_simdcall
simdLoadU(const double* m
, SimdDoubleTag
= {})
118 return { vld1q_f64(m
) };
121 static inline void gmx_simdcall
storeU(double* m
, SimdDouble a
)
123 vst1q_f64(m
, a
.simdInternal_
);
126 static inline SimdDouble gmx_simdcall
setZeroD()
128 return { vdupq_n_f64(0.0) };
131 static inline SimdDInt32 gmx_simdcall
simdLoad(const std::int32_t* m
, SimdDInt32Tag
)
133 assert(std::size_t(m
) % 8 == 0);
134 return { vld1_s32(m
) };
137 static inline void gmx_simdcall
store(std::int32_t* m
, SimdDInt32 a
)
139 assert(std::size_t(m
) % 8 == 0);
140 vst1_s32(m
, a
.simdInternal_
);
143 static inline SimdDInt32 gmx_simdcall
simdLoadU(const std::int32_t* m
, SimdDInt32Tag
)
145 return { vld1_s32(m
) };
148 static inline void gmx_simdcall
storeU(std::int32_t* m
, SimdDInt32 a
)
150 vst1_s32(m
, a
.simdInternal_
);
153 static inline SimdDInt32 gmx_simdcall
setZeroDI()
155 return { vdup_n_s32(0) };
159 gmx_simdcall
static inline std::int32_t extract(SimdDInt32 a
)
161 return vget_lane_s32(a
.simdInternal_
, index
);
164 static inline SimdDouble gmx_simdcall
operator&(SimdDouble a
, SimdDouble b
)
166 return { float64x2_t(vandq_s64(int64x2_t(a
.simdInternal_
), int64x2_t(b
.simdInternal_
))) };
169 static inline SimdDouble gmx_simdcall
andNot(SimdDouble a
, SimdDouble b
)
171 return { float64x2_t(vbicq_s64(int64x2_t(b
.simdInternal_
), int64x2_t(a
.simdInternal_
))) };
174 static inline SimdDouble gmx_simdcall
operator|(SimdDouble a
, SimdDouble b
)
176 return { float64x2_t(vorrq_s64(int64x2_t(a
.simdInternal_
), int64x2_t(b
.simdInternal_
))) };
179 static inline SimdDouble gmx_simdcall
operator^(SimdDouble a
, SimdDouble b
)
181 return { float64x2_t(veorq_s64(int64x2_t(a
.simdInternal_
), int64x2_t(b
.simdInternal_
))) };
184 static inline SimdDouble gmx_simdcall
operator+(SimdDouble a
, SimdDouble b
)
186 return { vaddq_f64(a
.simdInternal_
, b
.simdInternal_
) };
189 static inline SimdDouble gmx_simdcall
operator-(SimdDouble a
, SimdDouble b
)
191 return { vsubq_f64(a
.simdInternal_
, b
.simdInternal_
) };
194 static inline SimdDouble gmx_simdcall
operator-(SimdDouble x
)
196 return { vnegq_f64(x
.simdInternal_
) };
199 static inline SimdDouble gmx_simdcall
operator*(SimdDouble a
, SimdDouble b
)
201 return { vmulq_f64(a
.simdInternal_
, b
.simdInternal_
) };
204 static inline SimdDouble gmx_simdcall
fma(SimdDouble a
, SimdDouble b
, SimdDouble c
)
206 return { vfmaq_f64(c
.simdInternal_
, b
.simdInternal_
, a
.simdInternal_
) };
209 static inline SimdDouble gmx_simdcall
fms(SimdDouble a
, SimdDouble b
, SimdDouble c
)
211 return { vnegq_f64(vfmsq_f64(c
.simdInternal_
, b
.simdInternal_
, a
.simdInternal_
)) };
214 static inline SimdDouble gmx_simdcall
fnma(SimdDouble a
, SimdDouble b
, SimdDouble c
)
216 return { vfmsq_f64(c
.simdInternal_
, b
.simdInternal_
, a
.simdInternal_
) };
219 static inline SimdDouble gmx_simdcall
fnms(SimdDouble a
, SimdDouble b
, SimdDouble c
)
221 return { vnegq_f64(vfmaq_f64(c
.simdInternal_
, b
.simdInternal_
, a
.simdInternal_
)) };
224 static inline SimdDouble gmx_simdcall
rsqrt(SimdDouble x
)
226 return { vrsqrteq_f64(x
.simdInternal_
) };
229 static inline SimdDouble gmx_simdcall
rsqrtIter(SimdDouble lu
, SimdDouble x
)
231 return { vmulq_f64(lu
.simdInternal_
,
232 vrsqrtsq_f64(vmulq_f64(lu
.simdInternal_
, lu
.simdInternal_
), x
.simdInternal_
)) };
235 static inline SimdDouble gmx_simdcall
rcp(SimdDouble x
)
237 return { vrecpeq_f64(x
.simdInternal_
) };
240 static inline SimdDouble gmx_simdcall
rcpIter(SimdDouble lu
, SimdDouble x
)
242 return { vmulq_f64(lu
.simdInternal_
, vrecpsq_f64(lu
.simdInternal_
, x
.simdInternal_
)) };
245 static inline SimdDouble gmx_simdcall
maskAdd(SimdDouble a
, SimdDouble b
, SimdDBool m
)
247 float64x2_t addend
= float64x2_t(vandq_u64(uint64x2_t(b
.simdInternal_
), m
.simdInternal_
));
249 return { vaddq_f64(a
.simdInternal_
, addend
) };
252 static inline SimdDouble gmx_simdcall
maskzMul(SimdDouble a
, SimdDouble b
, SimdDBool m
)
254 float64x2_t prod
= vmulq_f64(a
.simdInternal_
, b
.simdInternal_
);
255 return { float64x2_t(vandq_u64(uint64x2_t(prod
), m
.simdInternal_
)) };
258 static inline SimdDouble gmx_simdcall
maskzFma(SimdDouble a
, SimdDouble b
, SimdDouble c
, SimdDBool m
)
260 float64x2_t prod
= vfmaq_f64(c
.simdInternal_
, b
.simdInternal_
, a
.simdInternal_
);
262 return { float64x2_t(vandq_u64(uint64x2_t(prod
), m
.simdInternal_
)) };
265 static inline SimdDouble gmx_simdcall
maskzRsqrt(SimdDouble x
, SimdDBool m
)
267 // The result will always be correct since we mask the result with m, but
268 // for debug builds we also want to make sure not to generate FP exceptions
270 x
.simdInternal_
= vbslq_f64(m
.simdInternal_
, x
.simdInternal_
, vdupq_n_f64(1.0));
272 return { float64x2_t(vandq_u64(uint64x2_t(vrsqrteq_f64(x
.simdInternal_
)), m
.simdInternal_
)) };
275 static inline SimdDouble gmx_simdcall
maskzRcp(SimdDouble x
, SimdDBool m
)
277 // The result will always be correct since we mask the result with m, but
278 // for debug builds we also want to make sure not to generate FP exceptions
280 x
.simdInternal_
= vbslq_f64(m
.simdInternal_
, x
.simdInternal_
, vdupq_n_f64(1.0));
282 return { float64x2_t(vandq_u64(uint64x2_t(vrecpeq_f64(x
.simdInternal_
)), m
.simdInternal_
)) };
285 static inline SimdDouble gmx_simdcall
abs(SimdDouble x
)
287 return { vabsq_f64(x
.simdInternal_
) };
290 static inline SimdDouble gmx_simdcall
max(SimdDouble a
, SimdDouble b
)
292 return { vmaxq_f64(a
.simdInternal_
, b
.simdInternal_
) };
295 static inline SimdDouble gmx_simdcall
min(SimdDouble a
, SimdDouble b
)
297 return { vminq_f64(a
.simdInternal_
, b
.simdInternal_
) };
300 static inline SimdDouble gmx_simdcall
round(SimdDouble x
)
302 return { vrndnq_f64(x
.simdInternal_
) };
305 static inline SimdDouble gmx_simdcall
trunc(SimdDouble x
)
307 return { vrndq_f64(x
.simdInternal_
) };
310 template<MathOptimization opt
= MathOptimization::Safe
>
311 static inline SimdDouble
frexp(SimdDouble value
, SimdDInt32
* exponent
)
313 const float64x2_t exponentMask
= float64x2_t(vdupq_n_s64(0x7FF0000000000000LL
));
314 const float64x2_t mantissaMask
= float64x2_t(vdupq_n_s64(0x800FFFFFFFFFFFFFLL
));
316 const int64x2_t exponentBias
= vdupq_n_s64(1022); // add 1 to make our definition identical to frexp()
317 const float64x2_t half
= vdupq_n_f64(0.5);
320 iExponent
= vandq_s64(int64x2_t(value
.simdInternal_
), int64x2_t(exponentMask
));
321 iExponent
= vsubq_s64(vshrq_n_s64(iExponent
, 52), exponentBias
);
322 exponent
->simdInternal_
= vmovn_s64(iExponent
);
324 return { float64x2_t(vorrq_s64(
325 vandq_s64(int64x2_t(value
.simdInternal_
), int64x2_t(mantissaMask
)), int64x2_t(half
))) };
328 template<MathOptimization opt
= MathOptimization::Safe
>
329 static inline SimdDouble
ldexp(SimdDouble value
, SimdDInt32 exponent
)
331 const int32x2_t exponentBias
= vdup_n_s32(1023);
332 int32x2_t iExponent
= vadd_s32(exponent
.simdInternal_
, exponentBias
);
333 int64x2_t iExponent64
;
335 if (opt
== MathOptimization::Safe
)
337 // Make sure biased argument is not negative
338 iExponent
= vmax_s32(iExponent
, vdup_n_s32(0));
341 iExponent64
= vmovl_s32(iExponent
);
342 iExponent64
= vshlq_n_s64(iExponent64
, 52);
344 return { vmulq_f64(value
.simdInternal_
, float64x2_t(iExponent64
)) };
347 static inline double gmx_simdcall
reduce(SimdDouble a
)
349 float64x2_t b
= vpaddq_f64(a
.simdInternal_
, a
.simdInternal_
);
350 return vgetq_lane_f64(b
, 0);
353 static inline SimdDBool gmx_simdcall
operator==(SimdDouble a
, SimdDouble b
)
355 return { vceqq_f64(a
.simdInternal_
, b
.simdInternal_
) };
358 static inline SimdDBool gmx_simdcall
operator!=(SimdDouble a
, SimdDouble b
)
360 return { vreinterpretq_u64_u32(
361 vmvnq_u32(vreinterpretq_u32_u64(vceqq_f64(a
.simdInternal_
, b
.simdInternal_
)))) };
364 static inline SimdDBool gmx_simdcall
operator<(SimdDouble a
, SimdDouble b
)
366 return { vcltq_f64(a
.simdInternal_
, b
.simdInternal_
) };
369 static inline SimdDBool gmx_simdcall
operator<=(SimdDouble a
, SimdDouble b
)
371 return { vcleq_f64(a
.simdInternal_
, b
.simdInternal_
) };
374 static inline SimdDBool gmx_simdcall
testBits(SimdDouble a
)
376 return { vtstq_s64(int64x2_t(a
.simdInternal_
), int64x2_t(a
.simdInternal_
)) };
379 static inline SimdDBool gmx_simdcall
operator&&(SimdDBool a
, SimdDBool b
)
381 return { vandq_u64(a
.simdInternal_
, b
.simdInternal_
) };
384 static inline SimdDBool gmx_simdcall
operator||(SimdDBool a
, SimdDBool b
)
386 return { vorrq_u64(a
.simdInternal_
, b
.simdInternal_
) };
389 static inline bool gmx_simdcall
anyTrue(SimdDBool a
)
391 return (vmaxvq_u32((uint32x4_t
)(a
.simdInternal_
)) != 0);
394 static inline SimdDouble gmx_simdcall
selectByMask(SimdDouble a
, SimdDBool m
)
396 return { float64x2_t(vandq_u64(uint64x2_t(a
.simdInternal_
), m
.simdInternal_
)) };
399 static inline SimdDouble gmx_simdcall
selectByNotMask(SimdDouble a
, SimdDBool m
)
401 return { float64x2_t(vbicq_u64(uint64x2_t(a
.simdInternal_
), m
.simdInternal_
)) };
404 static inline SimdDouble gmx_simdcall
blend(SimdDouble a
, SimdDouble b
, SimdDBool sel
)
406 return { vbslq_f64(sel
.simdInternal_
, b
.simdInternal_
, a
.simdInternal_
) };
409 static inline SimdDInt32 gmx_simdcall
operator&(SimdDInt32 a
, SimdDInt32 b
)
411 return { vand_s32(a
.simdInternal_
, b
.simdInternal_
) };
414 static inline SimdDInt32 gmx_simdcall
andNot(SimdDInt32 a
, SimdDInt32 b
)
416 return { vbic_s32(b
.simdInternal_
, a
.simdInternal_
) };
419 static inline SimdDInt32 gmx_simdcall
operator|(SimdDInt32 a
, SimdDInt32 b
)
421 return { vorr_s32(a
.simdInternal_
, b
.simdInternal_
) };
424 static inline SimdDInt32 gmx_simdcall
operator^(SimdDInt32 a
, SimdDInt32 b
)
426 return { veor_s32(a
.simdInternal_
, b
.simdInternal_
) };
429 static inline SimdDInt32 gmx_simdcall
operator+(SimdDInt32 a
, SimdDInt32 b
)
431 return { vadd_s32(a
.simdInternal_
, b
.simdInternal_
) };
434 static inline SimdDInt32 gmx_simdcall
operator-(SimdDInt32 a
, SimdDInt32 b
)
436 return { vsub_s32(a
.simdInternal_
, b
.simdInternal_
) };
439 static inline SimdDInt32 gmx_simdcall
operator*(SimdDInt32 a
, SimdDInt32 b
)
441 return { vmul_s32(a
.simdInternal_
, b
.simdInternal_
) };
444 static inline SimdDIBool gmx_simdcall
operator==(SimdDInt32 a
, SimdDInt32 b
)
446 return { vceq_s32(a
.simdInternal_
, b
.simdInternal_
) };
449 static inline SimdDIBool gmx_simdcall
testBits(SimdDInt32 a
)
451 return { vtst_s32(a
.simdInternal_
, a
.simdInternal_
) };
454 static inline SimdDIBool gmx_simdcall
operator<(SimdDInt32 a
, SimdDInt32 b
)
456 return { vclt_s32(a
.simdInternal_
, b
.simdInternal_
) };
459 static inline SimdDIBool gmx_simdcall
operator&&(SimdDIBool a
, SimdDIBool b
)
461 return { vand_u32(a
.simdInternal_
, b
.simdInternal_
) };
464 static inline SimdDIBool gmx_simdcall
operator||(SimdDIBool a
, SimdDIBool b
)
466 return { vorr_u32(a
.simdInternal_
, b
.simdInternal_
) };
469 static inline bool gmx_simdcall
anyTrue(SimdDIBool a
)
471 return (vmaxv_u32(a
.simdInternal_
) != 0);
474 static inline SimdDInt32 gmx_simdcall
selectByMask(SimdDInt32 a
, SimdDIBool m
)
476 return { vand_s32(a
.simdInternal_
, vreinterpret_s32_u32(m
.simdInternal_
)) };
479 static inline SimdDInt32 gmx_simdcall
selectByNotMask(SimdDInt32 a
, SimdDIBool m
)
481 return { vbic_s32(a
.simdInternal_
, vreinterpret_s32_u32(m
.simdInternal_
)) };
484 static inline SimdDInt32 gmx_simdcall
blend(SimdDInt32 a
, SimdDInt32 b
, SimdDIBool sel
)
486 return { vbsl_s32(sel
.simdInternal_
, b
.simdInternal_
, a
.simdInternal_
) };
489 static inline SimdDInt32 gmx_simdcall
cvtR2I(SimdDouble a
)
491 return { vmovn_s64(vcvtnq_s64_f64(a
.simdInternal_
)) };
494 static inline SimdDInt32 gmx_simdcall
cvttR2I(SimdDouble a
)
496 return { vmovn_s64(vcvtq_s64_f64(a
.simdInternal_
)) };
499 static inline SimdDouble gmx_simdcall
cvtI2R(SimdDInt32 a
)
501 return { vcvtq_f64_s64(vmovl_s32(a
.simdInternal_
)) };
504 static inline SimdDIBool gmx_simdcall
cvtB2IB(SimdDBool a
)
506 return { vqmovn_u64(a
.simdInternal_
) };
509 static inline SimdDBool gmx_simdcall
cvtIB2B(SimdDIBool a
)
511 return { vorrq_u64(vmovl_u32(a
.simdInternal_
), vshlq_n_u64(vmovl_u32(a
.simdInternal_
), 32)) };
514 static inline void gmx_simdcall
cvtF2DD(SimdFloat f
, SimdDouble
* d0
, SimdDouble
* d1
)
516 d0
->simdInternal_
= vcvt_f64_f32(vget_low_f32(f
.simdInternal_
));
517 d1
->simdInternal_
= vcvt_high_f64_f32(f
.simdInternal_
);
520 static inline SimdFloat gmx_simdcall
cvtDD2F(SimdDouble d0
, SimdDouble d1
)
522 return { vcvt_high_f32_f64(vcvt_f32_f64(d0
.simdInternal_
), d1
.simdInternal_
) };
527 #endif // GMX_SIMD_IMPL_ARM_NEON_ASIMD_SIMD_DOUBLE_H