2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2014,2015,2016,2017,2018 by the GROMACS development team.
5 * Copyright (c) 2019,2020, by the GROMACS development team, led by
6 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
7 * and including many others, as listed in the AUTHORS file in the
8 * top-level source directory and at http://www.gromacs.org.
10 * GROMACS is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public License
12 * as published by the Free Software Foundation; either version 2.1
13 * of the License, or (at your option) any later version.
15 * GROMACS is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * Lesser General Public License for more details.
20 * You should have received a copy of the GNU Lesser General Public
21 * License along with GROMACS; if not, see
22 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
23 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
25 * If you want to redistribute modifications to GROMACS, please
26 * consider that scientific software is very special. Version
27 * control is crucial - bugs must be traceable. We will be happy to
28 * consider code for inclusion in the official distribution, but
29 * derived work must not be called official GROMACS. Details are found
30 * in the README & COPYING files - if they are missing, get the
31 * official version at http://www.gromacs.org.
33 * To help us fund GROMACS development, we humbly ask that you cite
34 * the research papers on the package. Check out http://www.gromacs.org.
37 #ifndef GMX_SIMD_IMPL_X86_AVX_256_SIMD_DOUBLE_H
38 #define GMX_SIMD_IMPL_X86_AVX_256_SIMD_DOUBLE_H
46 #include <immintrin.h>
48 #include "gromacs/math/utilities.h"
50 #include "impl_x86_avx_256_simd_float.h"
59 MSVC_DIAGNOSTIC_IGNORE(26495) // simdInternal_ is not being initialized!
62 SimdDouble(double d
) : simdInternal_(_mm256_set1_pd(d
)) {}
64 // Internal utility constructor to simplify return statements
65 SimdDouble(__m256d simd
) : simdInternal_(simd
) {}
67 __m256d simdInternal_
;
73 MSVC_DIAGNOSTIC_IGNORE(26495) // simdInternal_ is not being initialized!
76 SimdDInt32(std::int32_t i
) : simdInternal_(_mm_set1_epi32(i
)) {}
78 // Internal utility constructor to simplify return statements
79 SimdDInt32(__m128i simd
) : simdInternal_(simd
) {}
81 __m128i simdInternal_
;
87 MSVC_DIAGNOSTIC_IGNORE(26495) // simdInternal_ is not being initialized!
90 SimdDBool(bool b
) : simdInternal_(_mm256_castsi256_pd(_mm256_set1_epi32(b
? 0xFFFFFFFF : 0))) {}
92 // Internal utility constructor to simplify return statements
93 SimdDBool(__m256d simd
) : simdInternal_(simd
) {}
95 __m256d simdInternal_
;
101 MSVC_DIAGNOSTIC_IGNORE(26495) // simdInternal_ is not being initialized!
103 MSVC_DIAGNOSTIC_RESET
104 SimdDIBool(bool b
) : simdInternal_(_mm_set1_epi32(b
? 0xFFFFFFFF : 0)) {}
106 // Internal utility constructor to simplify return statements
107 SimdDIBool(__m128i simd
) : simdInternal_(simd
) {}
109 __m128i simdInternal_
;
113 static inline SimdDouble gmx_simdcall
simdLoad(const double* m
, SimdDoubleTag
/*unused*/ = {})
115 assert(std::size_t(m
) % 32 == 0);
116 return { _mm256_load_pd(m
) };
119 static inline void gmx_simdcall
store(double* m
, SimdDouble a
)
121 assert(std::size_t(m
) % 32 == 0);
122 _mm256_store_pd(m
, a
.simdInternal_
);
125 static inline SimdDouble gmx_simdcall
simdLoadU(const double* m
, SimdDoubleTag
/*unused*/ = {})
127 return { _mm256_loadu_pd(m
) };
130 static inline void gmx_simdcall
storeU(double* m
, SimdDouble a
)
132 _mm256_storeu_pd(m
, a
.simdInternal_
);
135 static inline SimdDouble gmx_simdcall
setZeroD()
137 return { _mm256_setzero_pd() };
140 static inline SimdDInt32 gmx_simdcall
simdLoad(const std::int32_t* m
, SimdDInt32Tag
/*unused*/)
142 assert(std::size_t(m
) % 16 == 0);
143 return { _mm_load_si128(reinterpret_cast<const __m128i
*>(m
)) };
146 static inline void gmx_simdcall
store(std::int32_t* m
, SimdDInt32 a
)
148 assert(std::size_t(m
) % 16 == 0);
149 _mm_store_si128(reinterpret_cast<__m128i
*>(m
), a
.simdInternal_
);
152 static inline SimdDInt32 gmx_simdcall
simdLoadU(const std::int32_t* m
, SimdDInt32Tag
/*unused*/)
154 return { _mm_loadu_si128(reinterpret_cast<const __m128i
*>(m
)) };
157 static inline void gmx_simdcall
storeU(std::int32_t* m
, SimdDInt32 a
)
159 _mm_storeu_si128(reinterpret_cast<__m128i
*>(m
), a
.simdInternal_
);
162 static inline SimdDInt32 gmx_simdcall
setZeroDI()
164 return { _mm_setzero_si128() };
168 static inline std::int32_t gmx_simdcall
extract(SimdDInt32 a
)
170 return _mm_extract_epi32(a
.simdInternal_
, index
);
173 static inline SimdDouble gmx_simdcall
operator&(SimdDouble a
, SimdDouble b
)
175 return { _mm256_and_pd(a
.simdInternal_
, b
.simdInternal_
) };
178 static inline SimdDouble gmx_simdcall
andNot(SimdDouble a
, SimdDouble b
)
180 return { _mm256_andnot_pd(a
.simdInternal_
, b
.simdInternal_
) };
183 static inline SimdDouble gmx_simdcall
operator|(SimdDouble a
, SimdDouble b
)
185 return { _mm256_or_pd(a
.simdInternal_
, b
.simdInternal_
) };
188 static inline SimdDouble gmx_simdcall
operator^(SimdDouble a
, SimdDouble b
)
190 return { _mm256_xor_pd(a
.simdInternal_
, b
.simdInternal_
) };
193 static inline SimdDouble gmx_simdcall
operator+(SimdDouble a
, SimdDouble b
)
195 return { _mm256_add_pd(a
.simdInternal_
, b
.simdInternal_
) };
198 static inline SimdDouble gmx_simdcall
operator-(SimdDouble a
, SimdDouble b
)
200 return { _mm256_sub_pd(a
.simdInternal_
, b
.simdInternal_
) };
203 static inline SimdDouble gmx_simdcall
operator-(SimdDouble x
)
205 return { _mm256_xor_pd(x
.simdInternal_
, _mm256_set1_pd(GMX_DOUBLE_NEGZERO
)) };
208 static inline SimdDouble gmx_simdcall
operator*(SimdDouble a
, SimdDouble b
)
210 return { _mm256_mul_pd(a
.simdInternal_
, b
.simdInternal_
) };
213 // Override for AVX2 and higher
214 #if GMX_SIMD_X86_AVX_256
215 static inline SimdDouble gmx_simdcall
fma(SimdDouble a
, SimdDouble b
, SimdDouble c
)
217 return { _mm256_add_pd(_mm256_mul_pd(a
.simdInternal_
, b
.simdInternal_
), c
.simdInternal_
) };
220 static inline SimdDouble gmx_simdcall
fms(SimdDouble a
, SimdDouble b
, SimdDouble c
)
222 return { _mm256_sub_pd(_mm256_mul_pd(a
.simdInternal_
, b
.simdInternal_
), c
.simdInternal_
) };
225 static inline SimdDouble gmx_simdcall
fnma(SimdDouble a
, SimdDouble b
, SimdDouble c
)
227 return { _mm256_sub_pd(c
.simdInternal_
, _mm256_mul_pd(a
.simdInternal_
, b
.simdInternal_
)) };
230 static inline SimdDouble gmx_simdcall
fnms(SimdDouble a
, SimdDouble b
, SimdDouble c
)
232 return { _mm256_sub_pd(_mm256_setzero_pd(),
233 _mm256_add_pd(_mm256_mul_pd(a
.simdInternal_
, b
.simdInternal_
), c
.simdInternal_
)) };
237 static inline SimdDouble gmx_simdcall
rsqrt(SimdDouble x
)
239 return { _mm256_cvtps_pd(_mm_rsqrt_ps(_mm256_cvtpd_ps(x
.simdInternal_
))) };
242 static inline SimdDouble gmx_simdcall
rcp(SimdDouble x
)
244 return { _mm256_cvtps_pd(_mm_rcp_ps(_mm256_cvtpd_ps(x
.simdInternal_
))) };
247 static inline SimdDouble gmx_simdcall
maskAdd(SimdDouble a
, SimdDouble b
, SimdDBool m
)
249 return { _mm256_add_pd(a
.simdInternal_
, _mm256_and_pd(b
.simdInternal_
, m
.simdInternal_
)) };
252 static inline SimdDouble gmx_simdcall
maskzMul(SimdDouble a
, SimdDouble b
, SimdDBool m
)
254 return { _mm256_and_pd(_mm256_mul_pd(a
.simdInternal_
, b
.simdInternal_
), m
.simdInternal_
) };
257 static inline SimdDouble
maskzFma(SimdDouble a
, SimdDouble b
, SimdDouble c
, SimdDBool m
)
259 return { _mm256_and_pd(_mm256_add_pd(_mm256_mul_pd(a
.simdInternal_
, b
.simdInternal_
), c
.simdInternal_
),
263 static inline SimdDouble
maskzRsqrt(SimdDouble x
, SimdDBool m
)
266 x
.simdInternal_
= _mm256_blendv_pd(_mm256_set1_pd(1.0), x
.simdInternal_
, m
.simdInternal_
);
268 return { _mm256_and_pd(_mm256_cvtps_pd(_mm_rsqrt_ps(_mm256_cvtpd_ps(x
.simdInternal_
))), m
.simdInternal_
) };
271 static inline SimdDouble
maskzRcp(SimdDouble x
, SimdDBool m
)
274 x
.simdInternal_
= _mm256_blendv_pd(_mm256_set1_pd(1.0), x
.simdInternal_
, m
.simdInternal_
);
276 return { _mm256_and_pd(_mm256_cvtps_pd(_mm_rcp_ps(_mm256_cvtpd_ps(x
.simdInternal_
))), m
.simdInternal_
) };
279 static inline SimdDouble gmx_simdcall
abs(SimdDouble x
)
281 return { _mm256_andnot_pd(_mm256_set1_pd(GMX_DOUBLE_NEGZERO
), x
.simdInternal_
) };
284 static inline SimdDouble gmx_simdcall
max(SimdDouble a
, SimdDouble b
)
286 return { _mm256_max_pd(a
.simdInternal_
, b
.simdInternal_
) };
289 static inline SimdDouble gmx_simdcall
min(SimdDouble a
, SimdDouble b
)
291 return { _mm256_min_pd(a
.simdInternal_
, b
.simdInternal_
) };
294 static inline SimdDouble gmx_simdcall
round(SimdDouble x
)
296 return { _mm256_round_pd(x
.simdInternal_
, _MM_FROUND_NINT
) };
299 static inline SimdDouble gmx_simdcall
trunc(SimdDouble x
)
301 return { _mm256_round_pd(x
.simdInternal_
, _MM_FROUND_TRUNC
) };
304 // Override for AVX2 and higher
305 #if GMX_SIMD_X86_AVX_256
306 template<MathOptimization opt
= MathOptimization::Safe
>
307 static inline SimdDouble
frexp(SimdDouble value
, SimdDInt32
* exponent
)
309 const __m256d exponentMask
= _mm256_castsi256_pd(_mm256_set1_epi64x(0x7FF0000000000000LL
));
310 const __m256d mantissaMask
= _mm256_castsi256_pd(_mm256_set1_epi64x(0x800FFFFFFFFFFFFFLL
));
311 const __m256d half
= _mm256_set1_pd(0.5);
312 const __m128i exponentBias
= _mm_set1_epi32(1022); // add 1 to make our definition identical to frexp()
314 __m256i iExponent
= _mm256_castpd_si256(_mm256_and_pd(value
.simdInternal_
, exponentMask
));
315 __m128i iExponentHigh
= _mm256_extractf128_si256(iExponent
, 0x1);
316 __m128i iExponentLow
= _mm256_castsi256_si128(iExponent
);
317 iExponentLow
= _mm_srli_epi64(iExponentLow
, 52);
318 iExponentHigh
= _mm_srli_epi64(iExponentHigh
, 52);
319 iExponentLow
= _mm_shuffle_epi32(iExponentLow
, _MM_SHUFFLE(1, 1, 2, 0));
320 iExponentHigh
= _mm_shuffle_epi32(iExponentHigh
, _MM_SHUFFLE(2, 0, 1, 1));
321 // We need to store the return in a 128-bit integer variable, so reuse iExponentLow for both
322 iExponentLow
= _mm_or_si128(iExponentLow
, iExponentHigh
);
323 iExponentLow
= _mm_sub_epi32(iExponentLow
, exponentBias
);
325 __m256d result
= _mm256_or_pd(_mm256_and_pd(value
.simdInternal_
, mantissaMask
), half
);
327 if (opt
== MathOptimization::Safe
)
329 __m256d valueIsZero
= _mm256_cmp_pd(_mm256_setzero_pd(), value
.simdInternal_
, _CMP_EQ_OQ
);
330 // This looks more complex than it is: the valueIsZero variable contains 4x 64-bit double
331 // precision fields, but a bit below we'll need a corresponding integer variable with 4x
332 // 32-bit fields. Since AVX1 does not support shuffling across the upper/lower 128-bit
333 // lanes, we need to extract those first, and then shuffle between two 128-bit variables.
334 __m128i iValueIsZero
= _mm_castps_si128(_mm_shuffle_ps(
335 _mm256_extractf128_ps(_mm256_castpd_ps(valueIsZero
), 0x0),
336 _mm256_extractf128_ps(_mm256_castpd_ps(valueIsZero
), 0x1), _MM_SHUFFLE(2, 0, 2, 0)));
338 // Set exponent to 0 when input value was zero
339 iExponentLow
= _mm_andnot_si128(iValueIsZero
, iExponentLow
);
341 // Set result to +-0 if the corresponding input value was +-0
342 result
= _mm256_blendv_pd(result
, value
.simdInternal_
, valueIsZero
);
344 exponent
->simdInternal_
= iExponentLow
;
349 template<MathOptimization opt
= MathOptimization::Safe
>
350 static inline SimdDouble
ldexp(SimdDouble value
, SimdDInt32 exponent
)
352 const __m128i exponentBias
= _mm_set1_epi32(1023);
353 __m128i iExponentLow
, iExponentHigh
;
356 iExponentLow
= _mm_add_epi32(exponent
.simdInternal_
, exponentBias
);
358 if (opt
== MathOptimization::Safe
)
360 // Make sure biased argument is not negative
361 iExponentLow
= _mm_max_epi32(iExponentLow
, _mm_setzero_si128());
364 iExponentHigh
= _mm_shuffle_epi32(iExponentLow
, _MM_SHUFFLE(3, 3, 2, 2));
365 iExponentLow
= _mm_shuffle_epi32(iExponentLow
, _MM_SHUFFLE(1, 1, 0, 0));
366 iExponentHigh
= _mm_slli_epi64(iExponentHigh
, 52);
367 iExponentLow
= _mm_slli_epi64(iExponentLow
, 52);
368 fExponent
= _mm256_castsi256_pd(
369 _mm256_insertf128_si256(_mm256_castsi128_si256(iExponentLow
), iExponentHigh
, 0x1));
370 return { _mm256_mul_pd(value
.simdInternal_
, fExponent
) };
374 static inline double gmx_simdcall
reduce(SimdDouble a
)
377 a
.simdInternal_
= _mm256_add_pd(a
.simdInternal_
, _mm256_permute_pd(a
.simdInternal_
, 0b0101));
378 a0
= _mm256_castpd256_pd128(a
.simdInternal_
);
379 a1
= _mm256_extractf128_pd(a
.simdInternal_
, 0x1);
380 a0
= _mm_add_sd(a0
, a1
);
382 return *reinterpret_cast<double*>(&a0
);
385 static inline SimdDBool gmx_simdcall
operator==(SimdDouble a
, SimdDouble b
)
387 return { _mm256_cmp_pd(a
.simdInternal_
, b
.simdInternal_
, _CMP_EQ_OQ
) };
390 static inline SimdDBool gmx_simdcall
operator!=(SimdDouble a
, SimdDouble b
)
392 return { _mm256_cmp_pd(a
.simdInternal_
, b
.simdInternal_
, _CMP_NEQ_OQ
) };
395 static inline SimdDBool gmx_simdcall
operator<(SimdDouble a
, SimdDouble b
)
397 return { _mm256_cmp_pd(a
.simdInternal_
, b
.simdInternal_
, _CMP_LT_OQ
) };
400 static inline SimdDBool gmx_simdcall
operator<=(SimdDouble a
, SimdDouble b
)
402 return { _mm256_cmp_pd(a
.simdInternal_
, b
.simdInternal_
, _CMP_LE_OQ
) };
405 // Override for AVX2 and higher
406 #if GMX_SIMD_X86_AVX_256
407 static inline SimdDBool gmx_simdcall
testBits(SimdDouble a
)
409 // Do an or of the low/high 32 bits of each double (so the data is replicated),
410 // and then use the same algorithm as we use for single precision.
411 __m256 tst
= _mm256_castpd_ps(a
.simdInternal_
);
413 tst
= _mm256_or_ps(tst
, _mm256_permute_ps(tst
, _MM_SHUFFLE(2, 3, 0, 1)));
414 tst
= _mm256_cvtepi32_ps(_mm256_castps_si256(tst
));
416 return { _mm256_castps_pd(_mm256_cmp_ps(tst
, _mm256_setzero_ps(), _CMP_NEQ_OQ
)) };
420 static inline SimdDBool gmx_simdcall
operator&&(SimdDBool a
, SimdDBool b
)
422 return { _mm256_and_pd(a
.simdInternal_
, b
.simdInternal_
) };
425 static inline SimdDBool gmx_simdcall
operator||(SimdDBool a
, SimdDBool b
)
427 return { _mm256_or_pd(a
.simdInternal_
, b
.simdInternal_
) };
430 static inline bool gmx_simdcall
anyTrue(SimdDBool a
)
432 return _mm256_movemask_pd(a
.simdInternal_
) != 0;
435 static inline SimdDouble gmx_simdcall
selectByMask(SimdDouble a
, SimdDBool mask
)
437 return { _mm256_and_pd(a
.simdInternal_
, mask
.simdInternal_
) };
440 static inline SimdDouble gmx_simdcall
selectByNotMask(SimdDouble a
, SimdDBool mask
)
442 return { _mm256_andnot_pd(mask
.simdInternal_
, a
.simdInternal_
) };
445 static inline SimdDouble gmx_simdcall
blend(SimdDouble a
, SimdDouble b
, SimdDBool sel
)
447 return { _mm256_blendv_pd(a
.simdInternal_
, b
.simdInternal_
, sel
.simdInternal_
) };
450 static inline SimdDInt32 gmx_simdcall
operator&(SimdDInt32 a
, SimdDInt32 b
)
452 return { _mm_and_si128(a
.simdInternal_
, b
.simdInternal_
) };
455 static inline SimdDInt32 gmx_simdcall
andNot(SimdDInt32 a
, SimdDInt32 b
)
457 return { _mm_andnot_si128(a
.simdInternal_
, b
.simdInternal_
) };
460 static inline SimdDInt32 gmx_simdcall
operator|(SimdDInt32 a
, SimdDInt32 b
)
462 return { _mm_or_si128(a
.simdInternal_
, b
.simdInternal_
) };
465 static inline SimdDInt32 gmx_simdcall
operator^(SimdDInt32 a
, SimdDInt32 b
)
467 return { _mm_xor_si128(a
.simdInternal_
, b
.simdInternal_
) };
470 static inline SimdDInt32 gmx_simdcall
operator+(SimdDInt32 a
, SimdDInt32 b
)
472 return { _mm_add_epi32(a
.simdInternal_
, b
.simdInternal_
) };
475 static inline SimdDInt32 gmx_simdcall
operator-(SimdDInt32 a
, SimdDInt32 b
)
477 return { _mm_sub_epi32(a
.simdInternal_
, b
.simdInternal_
) };
480 static inline SimdDInt32 gmx_simdcall
operator*(SimdDInt32 a
, SimdDInt32 b
)
482 return { _mm_mullo_epi32(a
.simdInternal_
, b
.simdInternal_
) };
485 static inline SimdDIBool gmx_simdcall
operator==(SimdDInt32 a
, SimdDInt32 b
)
487 return { _mm_cmpeq_epi32(a
.simdInternal_
, b
.simdInternal_
) };
490 static inline SimdDIBool gmx_simdcall
operator<(SimdDInt32 a
, SimdDInt32 b
)
492 return { _mm_cmplt_epi32(a
.simdInternal_
, b
.simdInternal_
) };
495 static inline SimdDIBool gmx_simdcall
testBits(SimdDInt32 a
)
497 __m128i x
= a
.simdInternal_
;
498 __m128i res
= _mm_andnot_si128(_mm_cmpeq_epi32(x
, _mm_setzero_si128()), _mm_cmpeq_epi32(x
, x
));
503 static inline SimdDIBool gmx_simdcall
operator&&(SimdDIBool a
, SimdDIBool b
)
505 return { _mm_and_si128(a
.simdInternal_
, b
.simdInternal_
) };
508 static inline SimdDIBool gmx_simdcall
operator||(SimdDIBool a
, SimdDIBool b
)
510 return { _mm_or_si128(a
.simdInternal_
, b
.simdInternal_
) };
513 static inline bool gmx_simdcall
anyTrue(SimdDIBool a
)
515 return _mm_movemask_epi8(a
.simdInternal_
) != 0;
518 static inline SimdDInt32 gmx_simdcall
selectByMask(SimdDInt32 a
, SimdDIBool mask
)
520 return { _mm_and_si128(a
.simdInternal_
, mask
.simdInternal_
) };
523 static inline SimdDInt32 gmx_simdcall
selectByNotMask(SimdDInt32 a
, SimdDIBool mask
)
525 return { _mm_andnot_si128(mask
.simdInternal_
, a
.simdInternal_
) };
528 static inline SimdDInt32 gmx_simdcall
blend(SimdDInt32 a
, SimdDInt32 b
, SimdDIBool sel
)
530 return { _mm_blendv_epi8(a
.simdInternal_
, b
.simdInternal_
, sel
.simdInternal_
) };
533 static inline SimdDInt32 gmx_simdcall
cvtR2I(SimdDouble a
)
535 return { _mm256_cvtpd_epi32(a
.simdInternal_
) };
538 static inline SimdDInt32 gmx_simdcall
cvttR2I(SimdDouble a
)
540 return { _mm256_cvttpd_epi32(a
.simdInternal_
) };
543 static inline SimdDouble gmx_simdcall
cvtI2R(SimdDInt32 a
)
545 return { _mm256_cvtepi32_pd(a
.simdInternal_
) };
548 static inline SimdDIBool gmx_simdcall
cvtB2IB(SimdDBool a
)
550 __m128i a1
= _mm256_extractf128_si256(_mm256_castpd_si256(a
.simdInternal_
), 0x1);
551 __m128i a0
= _mm256_castsi256_si128(_mm256_castpd_si256(a
.simdInternal_
));
552 a0
= _mm_shuffle_epi32(a0
, _MM_SHUFFLE(2, 0, 2, 0));
553 a1
= _mm_shuffle_epi32(a1
, _MM_SHUFFLE(2, 0, 2, 0));
555 return { _mm_blend_epi16(a0
, a1
, 0xF0) };
558 static inline SimdDBool gmx_simdcall
cvtIB2B(SimdDIBool a
)
560 __m128d lo
= _mm_castsi128_pd(_mm_unpacklo_epi32(a
.simdInternal_
, a
.simdInternal_
));
561 __m128d hi
= _mm_castsi128_pd(_mm_unpackhi_epi32(a
.simdInternal_
, a
.simdInternal_
));
563 return { _mm256_insertf128_pd(_mm256_castpd128_pd256(lo
), hi
, 0x1) };
566 static inline void gmx_simdcall
cvtF2DD(SimdFloat f
, SimdDouble
* d0
, SimdDouble
* d1
)
568 d0
->simdInternal_
= _mm256_cvtps_pd(_mm256_castps256_ps128(f
.simdInternal_
));
569 d1
->simdInternal_
= _mm256_cvtps_pd(_mm256_extractf128_ps(f
.simdInternal_
, 0x1));
572 static inline SimdFloat gmx_simdcall
cvtDD2F(SimdDouble d0
, SimdDouble d1
)
574 __m128 f0
= _mm256_cvtpd_ps(d0
.simdInternal_
);
575 __m128 f1
= _mm256_cvtpd_ps(d1
.simdInternal_
);
576 return { _mm256_insertf128_ps(_mm256_castps128_ps256(f0
), f1
, 0x1) };
581 #endif // GMX_SIMD_IMPL_X86_AVX_256_SIMD_DOUBLE_H