Make sure frexp() returns correct results for argument 0.0
[gromacs.git] / src / gromacs / simd / impl_x86_avx_256 / impl_x86_avx_256_simd_double.h

/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2014,2015,2016,2017,2018 by the GROMACS development team.
 * Copyright (c) 2019,2020, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
#ifndef GMX_SIMD_IMPL_X86_AVX_256_SIMD_DOUBLE_H
#define GMX_SIMD_IMPL_X86_AVX_256_SIMD_DOUBLE_H

#include "config.h"

#include <cassert>
#include <cstddef>
#include <cstdint>

#include <immintrin.h>

#include "gromacs/math/utilities.h"

#include "impl_x86_avx_256_simd_float.h"

namespace gmx
{

class SimdDouble
{
public:
    MSVC_DIAGNOSTIC_IGNORE(26495) // simdInternal_ is not being initialized!
    SimdDouble() {}
    MSVC_DIAGNOSTIC_RESET
    SimdDouble(double d) : simdInternal_(_mm256_set1_pd(d)) {}

    // Internal utility constructor to simplify return statements
    SimdDouble(__m256d simd) : simdInternal_(simd) {}

    __m256d simdInternal_;
};
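
// Integer support for double-precision SIMD: four packed 32-bit integers in a
// 128-bit register, matching the four doubles in a 256-bit register.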
class SimdDInt32
{
public:
    MSVC_DIAGNOSTIC_IGNORE(26495) // simdInternal_ is not being initialized!
    SimdDInt32() {}
    MSVC_DIAGNOSTIC_RESET
    SimdDInt32(std::int32_t i) : simdInternal_(_mm_set1_epi32(i)) {}

    // Internal utility constructor to simplify return statements
    SimdDInt32(__m128i simd) : simdInternal_(simd) {}

    __m128i simdInternal_;
};

class SimdDBool
{
public:
    MSVC_DIAGNOSTIC_IGNORE(26495) // simdInternal_ is not being initialized!
    SimdDBool() {}
    MSVC_DIAGNOSTIC_RESET
    SimdDBool(bool b) : simdInternal_(_mm256_castsi256_pd(_mm256_set1_epi32(b ? 0xFFFFFFFF : 0))) {}

    // Internal utility constructor to simplify return statements
    SimdDBool(__m256d simd) : simdInternal_(simd) {}

    __m256d simdInternal_;
};

class SimdDIBool
{
public:
    MSVC_DIAGNOSTIC_IGNORE(26495) // simdInternal_ is not being initialized!
    SimdDIBool() {}
    MSVC_DIAGNOSTIC_RESET
    SimdDIBool(bool b) : simdInternal_(_mm_set1_epi32(b ? 0xFFFFFFFF : 0)) {}

    // Internal utility constructor to simplify return statements
    SimdDIBool(__m128i simd) : simdInternal_(simd) {}

    __m128i simdInternal_;
};
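
// The aligned load/store variants below require the memory address to be
// aligned to the full register width: 32 bytes for the 256-bit double
// registers and 16 bytes for the 128-bit integer registers. This is checked
// with assertions in debug builds.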
static inline SimdDouble gmx_simdcall simdLoad(const double* m, SimdDoubleTag /*unused*/ = {})
{
    assert(std::size_t(m) % 32 == 0);
    return { _mm256_load_pd(m) };
}

static inline void gmx_simdcall store(double* m, SimdDouble a)
{
    assert(std::size_t(m) % 32 == 0);
    _mm256_store_pd(m, a.simdInternal_);
}

static inline SimdDouble gmx_simdcall simdLoadU(const double* m, SimdDoubleTag /*unused*/ = {})
{
    return { _mm256_loadu_pd(m) };
}

static inline void gmx_simdcall storeU(double* m, SimdDouble a)
{
    _mm256_storeu_pd(m, a.simdInternal_);
}

static inline SimdDouble gmx_simdcall setZeroD()
{
    return { _mm256_setzero_pd() };
}

static inline SimdDInt32 gmx_simdcall simdLoad(const std::int32_t* m, SimdDInt32Tag /*unused*/)
{
    assert(std::size_t(m) % 16 == 0);
    return { _mm_load_si128(reinterpret_cast<const __m128i*>(m)) };
}

static inline void gmx_simdcall store(std::int32_t* m, SimdDInt32 a)
{
    assert(std::size_t(m) % 16 == 0);
    _mm_store_si128(reinterpret_cast<__m128i*>(m), a.simdInternal_);
}

static inline SimdDInt32 gmx_simdcall simdLoadU(const std::int32_t* m, SimdDInt32Tag /*unused*/)
{
    return { _mm_loadu_si128(reinterpret_cast<const __m128i*>(m)) };
}

static inline void gmx_simdcall storeU(std::int32_t* m, SimdDInt32 a)
{
    _mm_storeu_si128(reinterpret_cast<__m128i*>(m), a.simdInternal_);
}

static inline SimdDInt32 gmx_simdcall setZeroDI()
{
    return { _mm_setzero_si128() };
}

template<int index>
static inline std::int32_t gmx_simdcall extract(SimdDInt32 a)
{
    return _mm_extract_epi32(a.simdInternal_, index);
}
static inline SimdDouble gmx_simdcall operator&(SimdDouble a, SimdDouble b)
{
    return { _mm256_and_pd(a.simdInternal_, b.simdInternal_) };
}

static inline SimdDouble gmx_simdcall andNot(SimdDouble a, SimdDouble b)
{
    return { _mm256_andnot_pd(a.simdInternal_, b.simdInternal_) };
}

static inline SimdDouble gmx_simdcall operator|(SimdDouble a, SimdDouble b)
{
    return { _mm256_or_pd(a.simdInternal_, b.simdInternal_) };
}

static inline SimdDouble gmx_simdcall operator^(SimdDouble a, SimdDouble b)
{
    return { _mm256_xor_pd(a.simdInternal_, b.simdInternal_) };
}

static inline SimdDouble gmx_simdcall operator+(SimdDouble a, SimdDouble b)
{
    return { _mm256_add_pd(a.simdInternal_, b.simdInternal_) };
}

static inline SimdDouble gmx_simdcall operator-(SimdDouble a, SimdDouble b)
{
    return { _mm256_sub_pd(a.simdInternal_, b.simdInternal_) };
}

static inline SimdDouble gmx_simdcall operator-(SimdDouble x)
{
    return { _mm256_xor_pd(x.simdInternal_, _mm256_set1_pd(GMX_DOUBLE_NEGZERO)) };
}

static inline SimdDouble gmx_simdcall operator*(SimdDouble a, SimdDouble b)
{
    return { _mm256_mul_pd(a.simdInternal_, b.simdInternal_) };
}

// Override for AVX2 and higher
#if GMX_SIMD_X86_AVX_256
static inline SimdDouble gmx_simdcall fma(SimdDouble a, SimdDouble b, SimdDouble c)
{
    return { _mm256_add_pd(_mm256_mul_pd(a.simdInternal_, b.simdInternal_), c.simdInternal_) };
}

static inline SimdDouble gmx_simdcall fms(SimdDouble a, SimdDouble b, SimdDouble c)
{
    return { _mm256_sub_pd(_mm256_mul_pd(a.simdInternal_, b.simdInternal_), c.simdInternal_) };
}

static inline SimdDouble gmx_simdcall fnma(SimdDouble a, SimdDouble b, SimdDouble c)
{
    return { _mm256_sub_pd(c.simdInternal_, _mm256_mul_pd(a.simdInternal_, b.simdInternal_)) };
}

static inline SimdDouble gmx_simdcall fnms(SimdDouble a, SimdDouble b, SimdDouble c)
{
    return { _mm256_sub_pd(_mm256_setzero_pd(),
                           _mm256_add_pd(_mm256_mul_pd(a.simdInternal_, b.simdInternal_), c.simdInternal_)) };
}
#endif
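
// AVX has no double-precision rsqrt/rcp instructions, so we convert to single
// precision, use the hardware approximation instructions (roughly 11 bits of
// accuracy), and convert back. Callers refine the result with Newton-Raphson
// iterations where full precision is required.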
static inline SimdDouble gmx_simdcall rsqrt(SimdDouble x)
{
    return { _mm256_cvtps_pd(_mm_rsqrt_ps(_mm256_cvtpd_ps(x.simdInternal_))) };
}

static inline SimdDouble gmx_simdcall rcp(SimdDouble x)
{
    return { _mm256_cvtps_pd(_mm_rcp_ps(_mm256_cvtpd_ps(x.simdInternal_))) };
}

static inline SimdDouble gmx_simdcall maskAdd(SimdDouble a, SimdDouble b, SimdDBool m)
{
    return { _mm256_add_pd(a.simdInternal_, _mm256_and_pd(b.simdInternal_, m.simdInternal_)) };
}

static inline SimdDouble gmx_simdcall maskzMul(SimdDouble a, SimdDouble b, SimdDBool m)
{
    return { _mm256_and_pd(_mm256_mul_pd(a.simdInternal_, b.simdInternal_), m.simdInternal_) };
}

static inline SimdDouble maskzFma(SimdDouble a, SimdDouble b, SimdDouble c, SimdDBool m)
{
    return { _mm256_and_pd(_mm256_add_pd(_mm256_mul_pd(a.simdInternal_, b.simdInternal_), c.simdInternal_),
                           m.simdInternal_) };
}
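
// In debug builds, masked-out elements are replaced with 1.0 before the
// approximation below, so that we never evaluate rsqrt/rcp for arguments
// (e.g. zero) that could raise floating-point exceptions; the result for
// those elements is zeroed by the mask anyway.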
static inline SimdDouble maskzRsqrt(SimdDouble x, SimdDBool m)
{
#ifndef NDEBUG
    x.simdInternal_ = _mm256_blendv_pd(_mm256_set1_pd(1.0), x.simdInternal_, m.simdInternal_);
#endif
    return { _mm256_and_pd(_mm256_cvtps_pd(_mm_rsqrt_ps(_mm256_cvtpd_ps(x.simdInternal_))), m.simdInternal_) };
}

static inline SimdDouble maskzRcp(SimdDouble x, SimdDBool m)
{
#ifndef NDEBUG
    x.simdInternal_ = _mm256_blendv_pd(_mm256_set1_pd(1.0), x.simdInternal_, m.simdInternal_);
#endif
    return { _mm256_and_pd(_mm256_cvtps_pd(_mm_rcp_ps(_mm256_cvtpd_ps(x.simdInternal_))), m.simdInternal_) };
}

static inline SimdDouble gmx_simdcall abs(SimdDouble x)
{
    return { _mm256_andnot_pd(_mm256_set1_pd(GMX_DOUBLE_NEGZERO), x.simdInternal_) };
}

static inline SimdDouble gmx_simdcall max(SimdDouble a, SimdDouble b)
{
    return { _mm256_max_pd(a.simdInternal_, b.simdInternal_) };
}

static inline SimdDouble gmx_simdcall min(SimdDouble a, SimdDouble b)
{
    return { _mm256_min_pd(a.simdInternal_, b.simdInternal_) };
}

static inline SimdDouble gmx_simdcall round(SimdDouble x)
{
    return { _mm256_round_pd(x.simdInternal_, _MM_FROUND_NINT) };
}

static inline SimdDouble gmx_simdcall trunc(SimdDouble x)
{
    return { _mm256_round_pd(x.simdInternal_, _MM_FROUND_TRUNC) };
}

// Override for AVX2 and higher
#if GMX_SIMD_X86_AVX_256
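// frexp(value) returns a fraction in [0.5, 1) and an integer exponent such
// that value = fraction * 2^exponent. We extract the biased IEEE-754 exponent
// field, subtract 1022 (subtracting the full bias of 1023 would instead give
// a fraction in [1, 2)), and set the exponent bits of the result to those of 0.5.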
template<MathOptimization opt = MathOptimization::Safe>
static inline SimdDouble frexp(SimdDouble value, SimdDInt32* exponent)
{
    const __m256d exponentMask = _mm256_castsi256_pd(_mm256_set1_epi64x(0x7FF0000000000000LL));
    const __m256d mantissaMask = _mm256_castsi256_pd(_mm256_set1_epi64x(0x800FFFFFFFFFFFFFLL));
    const __m256d half         = _mm256_set1_pd(0.5);
    const __m128i exponentBias = _mm_set1_epi32(1022); // add 1 to make our definition identical to frexp()

    __m256i iExponent     = _mm256_castpd_si256(_mm256_and_pd(value.simdInternal_, exponentMask));
    __m128i iExponentHigh = _mm256_extractf128_si256(iExponent, 0x1);
    __m128i iExponentLow  = _mm256_castsi256_si128(iExponent);
    iExponentLow          = _mm_srli_epi64(iExponentLow, 52);
    iExponentHigh         = _mm_srli_epi64(iExponentHigh, 52);
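    // The shifted exponents now sit in the low 32 bits of each 64-bit element.
    // Shuffle those of the low lane into elements 0-1 and those of the high
    // lane into elements 2-3 (the other positions are zero after the shift),
    // so a single OR yields all four 32-bit exponents in one 128-bit register.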
    iExponentLow  = _mm_shuffle_epi32(iExponentLow, _MM_SHUFFLE(1, 1, 2, 0));
    iExponentHigh = _mm_shuffle_epi32(iExponentHigh, _MM_SHUFFLE(2, 0, 1, 1));
    // We need to store the return in a 128-bit integer variable, so reuse iExponentLow for both
    iExponentLow = _mm_or_si128(iExponentLow, iExponentHigh);
    iExponentLow = _mm_sub_epi32(iExponentLow, exponentBias);

    __m256d result = _mm256_or_pd(_mm256_and_pd(value.simdInternal_, mantissaMask), half);

    if (opt == MathOptimization::Safe)
    {
        __m256d valueIsZero = _mm256_cmp_pd(_mm256_setzero_pd(), value.simdInternal_, _CMP_EQ_OQ);
        // This looks more complex than it is: the valueIsZero variable contains 4x 64-bit double
        // precision fields, but a bit below we'll need a corresponding integer variable with 4x
        // 32-bit fields. Since AVX1 does not support shuffling across the upper/lower 128-bit
        // lanes, we need to extract those first, and then shuffle between two 128-bit variables.
        __m128i iValueIsZero = _mm_castps_si128(_mm_shuffle_ps(
                _mm256_extractf128_ps(_mm256_castpd_ps(valueIsZero), 0x0),
                _mm256_extractf128_ps(_mm256_castpd_ps(valueIsZero), 0x1), _MM_SHUFFLE(2, 0, 2, 0)));

        // Set exponent to 0 when input value was zero
        iExponentLow = _mm_andnot_si128(iValueIsZero, iExponentLow);

        // Set result to +-0 if the corresponding input value was +-0
        result = _mm256_blendv_pd(result, value.simdInternal_, valueIsZero);
    }

    exponent->simdInternal_ = iExponentLow;

    return { result };
}
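
// ldexp(value, e) scales by 2^e by constructing the double 2^e directly in
// the exponent bit field and multiplying. In the Safe variant the biased
// exponent is clamped at zero, so a very negative e yields 2^e == 0.0 and the
// product correctly becomes zero instead of the exponent field wrapping around.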
template<MathOptimization opt = MathOptimization::Safe>
static inline SimdDouble ldexp(SimdDouble value, SimdDInt32 exponent)
{
    const __m128i exponentBias = _mm_set1_epi32(1023);
    __m128i       iExponentLow, iExponentHigh;
    __m256d       fExponent;

    iExponentLow = _mm_add_epi32(exponent.simdInternal_, exponentBias);

    if (opt == MathOptimization::Safe)
    {
        // Make sure biased argument is not negative
        iExponentLow = _mm_max_epi32(iExponentLow, _mm_setzero_si128());
    }

    iExponentHigh = _mm_shuffle_epi32(iExponentLow, _MM_SHUFFLE(3, 3, 2, 2));
    iExponentLow  = _mm_shuffle_epi32(iExponentLow, _MM_SHUFFLE(1, 1, 0, 0));
    iExponentHigh = _mm_slli_epi64(iExponentHigh, 52);
    iExponentLow  = _mm_slli_epi64(iExponentLow, 52);
    fExponent     = _mm256_castsi256_pd(
            _mm256_insertf128_si256(_mm256_castsi128_si256(iExponentLow), iExponentHigh, 0x1));
    return { _mm256_mul_pd(value.simdInternal_, fExponent) };
}
#endif
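
// Horizontal reduction: first add each element to its in-lane neighbour, then
// add the low and high 128-bit lanes; the total ends up in element 0.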
static inline double gmx_simdcall reduce(SimdDouble a)
{
    __m128d a0, a1;
    a.simdInternal_ = _mm256_add_pd(a.simdInternal_, _mm256_permute_pd(a.simdInternal_, 0b0101));
    a0              = _mm256_castpd256_pd128(a.simdInternal_);
    a1              = _mm256_extractf128_pd(a.simdInternal_, 0x1);
    a0              = _mm_add_sd(a0, a1);

    return *reinterpret_cast<double*>(&a0);
}

static inline SimdDBool gmx_simdcall operator==(SimdDouble a, SimdDouble b)
{
    return { _mm256_cmp_pd(a.simdInternal_, b.simdInternal_, _CMP_EQ_OQ) };
}

static inline SimdDBool gmx_simdcall operator!=(SimdDouble a, SimdDouble b)
{
    return { _mm256_cmp_pd(a.simdInternal_, b.simdInternal_, _CMP_NEQ_OQ) };
}

static inline SimdDBool gmx_simdcall operator<(SimdDouble a, SimdDouble b)
{
    return { _mm256_cmp_pd(a.simdInternal_, b.simdInternal_, _CMP_LT_OQ) };
}

static inline SimdDBool gmx_simdcall operator<=(SimdDouble a, SimdDouble b)
{
    return { _mm256_cmp_pd(a.simdInternal_, b.simdInternal_, _CMP_LE_OQ) };
}

// Override for AVX2 and higher
#if GMX_SIMD_X86_AVX_256
static inline SimdDBool gmx_simdcall testBits(SimdDouble a)
{
    // Do an or of the low/high 32 bits of each double (so the data is replicated),
    // and then use the same algorithm as we use for single precision.
    __m256 tst = _mm256_castpd_ps(a.simdInternal_);

    tst = _mm256_or_ps(tst, _mm256_permute_ps(tst, _MM_SHUFFLE(2, 3, 0, 1)));
    tst = _mm256_cvtepi32_ps(_mm256_castps_si256(tst));

    return { _mm256_castps_pd(_mm256_cmp_ps(tst, _mm256_setzero_ps(), _CMP_NEQ_OQ)) };
}
#endif

static inline SimdDBool gmx_simdcall operator&&(SimdDBool a, SimdDBool b)
{
    return { _mm256_and_pd(a.simdInternal_, b.simdInternal_) };
}

static inline SimdDBool gmx_simdcall operator||(SimdDBool a, SimdDBool b)
{
    return { _mm256_or_pd(a.simdInternal_, b.simdInternal_) };
}

static inline bool gmx_simdcall anyTrue(SimdDBool a)
{
    return _mm256_movemask_pd(a.simdInternal_) != 0;
}

static inline SimdDouble gmx_simdcall selectByMask(SimdDouble a, SimdDBool mask)
{
    return { _mm256_and_pd(a.simdInternal_, mask.simdInternal_) };
}

static inline SimdDouble gmx_simdcall selectByNotMask(SimdDouble a, SimdDBool mask)
{
    return { _mm256_andnot_pd(mask.simdInternal_, a.simdInternal_) };
}

static inline SimdDouble gmx_simdcall blend(SimdDouble a, SimdDouble b, SimdDBool sel)
{
    return { _mm256_blendv_pd(a.simdInternal_, b.simdInternal_, sel.simdInternal_) };
}

static inline SimdDInt32 gmx_simdcall operator&(SimdDInt32 a, SimdDInt32 b)
{
    return { _mm_and_si128(a.simdInternal_, b.simdInternal_) };
}

static inline SimdDInt32 gmx_simdcall andNot(SimdDInt32 a, SimdDInt32 b)
{
    return { _mm_andnot_si128(a.simdInternal_, b.simdInternal_) };
}

static inline SimdDInt32 gmx_simdcall operator|(SimdDInt32 a, SimdDInt32 b)
{
    return { _mm_or_si128(a.simdInternal_, b.simdInternal_) };
}

static inline SimdDInt32 gmx_simdcall operator^(SimdDInt32 a, SimdDInt32 b)
{
    return { _mm_xor_si128(a.simdInternal_, b.simdInternal_) };
}

static inline SimdDInt32 gmx_simdcall operator+(SimdDInt32 a, SimdDInt32 b)
{
    return { _mm_add_epi32(a.simdInternal_, b.simdInternal_) };
}

static inline SimdDInt32 gmx_simdcall operator-(SimdDInt32 a, SimdDInt32 b)
{
    return { _mm_sub_epi32(a.simdInternal_, b.simdInternal_) };
}

static inline SimdDInt32 gmx_simdcall operator*(SimdDInt32 a, SimdDInt32 b)
{
    return { _mm_mullo_epi32(a.simdInternal_, b.simdInternal_) };
}

static inline SimdDIBool gmx_simdcall operator==(SimdDInt32 a, SimdDInt32 b)
{
    return { _mm_cmpeq_epi32(a.simdInternal_, b.simdInternal_) };
}

static inline SimdDIBool gmx_simdcall operator<(SimdDInt32 a, SimdDInt32 b)
{
    return { _mm_cmplt_epi32(a.simdInternal_, b.simdInternal_) };
}
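
// Set all bits of an element if the input element is nonzero: (x == x) gives
// all ones, and the andnot clears the elements where x == 0.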
static inline SimdDIBool gmx_simdcall testBits(SimdDInt32 a)
{
    __m128i x   = a.simdInternal_;
    __m128i res = _mm_andnot_si128(_mm_cmpeq_epi32(x, _mm_setzero_si128()), _mm_cmpeq_epi32(x, x));

    return { res };
}

static inline SimdDIBool gmx_simdcall operator&&(SimdDIBool a, SimdDIBool b)
{
    return { _mm_and_si128(a.simdInternal_, b.simdInternal_) };
}

static inline SimdDIBool gmx_simdcall operator||(SimdDIBool a, SimdDIBool b)
{
    return { _mm_or_si128(a.simdInternal_, b.simdInternal_) };
}

static inline bool gmx_simdcall anyTrue(SimdDIBool a)
{
    return _mm_movemask_epi8(a.simdInternal_) != 0;
}

static inline SimdDInt32 gmx_simdcall selectByMask(SimdDInt32 a, SimdDIBool mask)
{
    return { _mm_and_si128(a.simdInternal_, mask.simdInternal_) };
}

static inline SimdDInt32 gmx_simdcall selectByNotMask(SimdDInt32 a, SimdDIBool mask)
{
    return { _mm_andnot_si128(mask.simdInternal_, a.simdInternal_) };
}

static inline SimdDInt32 gmx_simdcall blend(SimdDInt32 a, SimdDInt32 b, SimdDIBool sel)
{
    return { _mm_blendv_epi8(a.simdInternal_, b.simdInternal_, sel.simdInternal_) };
}

static inline SimdDInt32 gmx_simdcall cvtR2I(SimdDouble a)
{
    return { _mm256_cvtpd_epi32(a.simdInternal_) };
}

static inline SimdDInt32 gmx_simdcall cvttR2I(SimdDouble a)
{
    return { _mm256_cvttpd_epi32(a.simdInternal_) };
}

static inline SimdDouble gmx_simdcall cvtI2R(SimdDInt32 a)
{
    return { _mm256_cvtepi32_pd(a.simdInternal_) };
}
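
// Narrow a 4x64-bit double boolean to a 4x32-bit integer boolean: pick one
// 32-bit half of each 64-bit mask from the low and high lanes and blend them
// into a single 128-bit register.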
static inline SimdDIBool gmx_simdcall cvtB2IB(SimdDBool a)
{
    __m128i a1 = _mm256_extractf128_si256(_mm256_castpd_si256(a.simdInternal_), 0x1);
    __m128i a0 = _mm256_castsi256_si128(_mm256_castpd_si256(a.simdInternal_));
    a0         = _mm_shuffle_epi32(a0, _MM_SHUFFLE(2, 0, 2, 0));
    a1         = _mm_shuffle_epi32(a1, _MM_SHUFFLE(2, 0, 2, 0));

    return { _mm_blend_epi16(a0, a1, 0xF0) };
}
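
// The inverse: widen a 4x32-bit integer boolean to a 4x64-bit double boolean
// by duplicating each 32-bit mask into both halves of a 64-bit element and
// recombining the two 128-bit halves into a 256-bit register.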
static inline SimdDBool gmx_simdcall cvtIB2B(SimdDIBool a)
{
    __m128d lo = _mm_castsi128_pd(_mm_unpacklo_epi32(a.simdInternal_, a.simdInternal_));
    __m128d hi = _mm_castsi128_pd(_mm_unpackhi_epi32(a.simdInternal_, a.simdInternal_));

    return { _mm256_insertf128_pd(_mm256_castpd128_pd256(lo), hi, 0x1) };
}
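
// A 256-bit float register holds 8 floats but only 4 doubles, so float/double
// conversions operate on one float register and a pair of double variables.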
static inline void gmx_simdcall cvtF2DD(SimdFloat f, SimdDouble* d0, SimdDouble* d1)
{
    d0->simdInternal_ = _mm256_cvtps_pd(_mm256_castps256_ps128(f.simdInternal_));
    d1->simdInternal_ = _mm256_cvtps_pd(_mm256_extractf128_ps(f.simdInternal_, 0x1));
}

static inline SimdFloat gmx_simdcall cvtDD2F(SimdDouble d0, SimdDouble d1)
{
    __m128 f0 = _mm256_cvtpd_ps(d0.simdInternal_);
    __m128 f1 = _mm256_cvtpd_ps(d1.simdInternal_);
    return { _mm256_insertf128_ps(_mm256_castps128_ps256(f0), f1, 0x1) };
}

} // namespace gmx

#endif // GMX_SIMD_IMPL_X86_AVX_256_SIMD_DOUBLE_H