Improve accuracy of SIMD exp for small args
[gromacs.git] / src / gromacs / simd / impl_reference / impl_reference_util_double.h
bloba7256854b52b7f3c70062230b3d4c6dad5090337
1 /*
2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2014,2015, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 #ifndef GMX_SIMD_IMPL_REFERENCE_UTIL_DOUBLE_H
37 #define GMX_SIMD_IMPL_REFERENCE_UTIL_DOUBLE_H
39 /*! \libinternal \file
41 * \brief Reference impl., higher-level double prec. SIMD utility functions
43 * \author Erik Lindahl <erik.lindahl@scilifelab.se>
45 * \ingroup module_simd
48 /* Avoid adding dependencies on the rest of GROMACS here (e.g. gmxassert.h)
49 * since we want to be able run the low-level SIMD implementations independently
50 * in simulators for new hardware.
53 #include "config.h"
55 #include <cassert>
56 #include <cstddef>
57 #include <cstdint>
59 #include <algorithm>
61 #include "impl_reference_definitions.h"
62 #include "impl_reference_simd_double.h"
64 namespace gmx
67 /*! \cond libapi */
68 /*! \addtogroup module_simd */
69 /*! \{ */
71 /* \name Higher-level SIMD utility functions, double precision.
73 * These include generic functions to work with triplets of data, typically
74 * coordinates, and a few utility functions to load and update data in the
75 * nonbonded kernels. These functions should be available on all implementations.
77 * \{
80 /*! \brief Load 4 consecutive double from each of GMX_SIMD_DOUBLE_WIDTH offsets,
81 * and transpose into 4 SIMD double variables.
83 * \tparam align Alignment of the memory from which we read, i.e. distance
84 * (measured in elements, not bytes) between index points.
85 * When this is identical to the number of SIMD variables
86 * (i.e., 4 for this routine) the input data is packed without
87 * padding in memory. See the SIMD parameters for exactly
88 * what memory positions are loaded.
89 * \param base Pointer to the start of the memory area
90 * \param offset Array with offsets to the start of each data point.
91 * \param[out] v0 1st component of data, base[align*offset[i]] for each i.
92 * \param[out] v1 2nd component of data, base[align*offset[i] + 1] for each i.
93 * \param[out] v2 3rd component of data, base[align*offset[i] + 2] for each i.
94 * \param[out] v3 4th component of data, base[align*offset[i] + 3] for each i.
96 * The floating-point memory locations must be aligned, but only to the smaller
97 * of four elements and the floating-point SIMD width.
99 * The offset memory must be aligned to GMX_SIMD_DINT32_WIDTH.
101 * \note You should NOT scale offsets before calling this routine; it is
102 * done internally by using the alignment template parameter instead.
104 template <int align>
105 static inline void gmx_simdcall
106 gatherLoadTranspose(const double * base,
107 const std::int32_t offset[],
108 SimdDouble * v0,
109 SimdDouble * v1,
110 SimdDouble * v2,
111 SimdDouble * v3)
113 // Offset list must be aligned for SIMD DINT32
114 assert(std::size_t(offset) % (GMX_SIMD_DINT32_WIDTH*sizeof(std::int32_t)) == 0);
115 // Base pointer must be aligned to the smaller of 4 elements and double SIMD width
116 assert(std::size_t(base) % (std::min(GMX_SIMD_DOUBLE_WIDTH, 4)*sizeof(double)) == 0);
117 // align parameter must also be a multiple of the above alignment requirement
118 assert(align % std::min(GMX_SIMD_DOUBLE_WIDTH, 4) == 0);
120 for (std::size_t i = 0; i < v0->simdInternal_.size(); i++)
122 v0->simdInternal_[i] = base[align * offset[i]];
123 v1->simdInternal_[i] = base[align * offset[i] + 1];
124 v2->simdInternal_[i] = base[align * offset[i] + 2];
125 v3->simdInternal_[i] = base[align * offset[i] + 3];
130 /*! \brief Load 2 consecutive double from each of GMX_SIMD_DOUBLE_WIDTH offsets,
131 * and transpose into 2 SIMD double variables.
133 * \tparam align Alignment of the memory from which we read, i.e. distance
134 * (measured in elements, not bytes) between index points.
135 * When this is identical to the number of SIMD variables
136 * (i.e., 2 for this routine) the input data is packed without
137 * padding in memory. See the SIMD parameters for exactly
138 * what memory positions are loaded.
139 * \param base Pointer to the start of the memory area
140 * \param offset Array with offsets to the start of each data point.
141 * \param[out] v0 1st component of data, base[align*offset[i]] for each i.
142 * \param[out] v1 2nd component of data, base[align*offset[i] + 1] for each i.
144 * The floating-point memory locations must be aligned, but only to the smaller
145 * of two elements and the floating-point SIMD width.
147 * The offset memory must be aligned to GMX_SIMD_DINT32_WIDTH.
149 * \note You should NOT scale offsets before calling this routine; it is
150 * done internally by using the alignment template parameter instead.
152 template <int align>
153 static inline void gmx_simdcall
154 gatherLoadTranspose(const double * base,
155 const std::int32_t offset[],
156 SimdDouble * v0,
157 SimdDouble * v1)
159 // Offset list must be aligned for SIMD DINT32
160 assert(std::size_t(offset) % (GMX_SIMD_DINT32_WIDTH*sizeof(std::int32_t)) == 0);
161 // Base pointer must be aligned to the smaller of 2 elements and double SIMD width
162 assert(std::size_t(base) % (std::min(GMX_SIMD_DOUBLE_WIDTH, 2)*sizeof(double)) == 0);
163 // align parameter must also be a multiple of the above alignment requirement
164 assert(align % std::min(GMX_SIMD_DOUBLE_WIDTH, 2) == 0);
166 for (std::size_t i = 0; i < v0->simdInternal_.size(); i++)
168 v0->simdInternal_[i] = base[align * offset[i]];
169 v1->simdInternal_[i] = base[align * offset[i] + 1];
174 /*! \brief Best alignment to use for aligned pairs of double data.
176 * \copydetails c_simdBestPairAlignmentFloat
178 static const int c_simdBestPairAlignmentDouble = 2;
181 /*! \brief Load 3 consecutive doubles from each of GMX_SIMD_DOUBLE_WIDTH offsets,
182 * and transpose into 3 SIMD double variables.
184 * \tparam align Alignment of the memory from which we read, i.e. distance
185 * (measured in elements, not bytes) between index points.
186 * When this is identical to the number of SIMD variables
187 * (i.e., 3 for this routine) the input data is packed without
188 * padding in memory. See the SIMD parameters for exactly
189 * what memory positions are loaded.
190 * \param base Pointer to the start of the memory area
191 * \param offset Array with offsets to the start of each data point.
192 * \param[out] v0 1st component of data, base[align*offset[i]] for each i.
193 * \param[out] v1 2nd component of data, base[align*offset[i] + 1] for each i.
194 * \param[out] v2 3rd component of data, base[align*offset[i] + 2] for each i.
196 * This function can work with both aligned (better performance) and unaligned
197 * memory. When the align parameter is not a power-of-two (align==3 would be normal
198 * for packed atomic coordinates) the memory obviously cannot be aligned, and
199 * we account for this.
200 * However, in the case where align is a power-of-two, we assume the base pointer
201 * also has the same alignment, which will enable many platforms to use faster
202 * aligned memory load operations.
203 * An easy way to think of this is that each triplet of data in memory must be
204 * aligned to the align parameter you specify when it's a power-of-two.
206 * The offset memory must always be aligned to GMX_SIMD_FINT32_WIDTH, since this
207 * enables us to use SIMD loads and gather operations on platforms that support it.
209 * \note You should NOT scale offsets before calling this routine; it is
210 * done internally by using the alignment template parameter instead.
211 * \note This routine uses a normal array for the offsets, since we typically
212 * load this data from memory. On the architectures we have tested this
213 * is faster even when a SIMD integer datatype is present.
214 * \note To improve performance, this function might use full-SIMD-width
215 * unaligned loads. This means you need to ensure the memory is padded
216 * at the end, so we always can load GMX_SIMD_REAL_WIDTH elements
217 * starting at the last offset. If you use the Gromacs aligned memory
218 * allocation routines this will always be the case.
220 template <int align>
221 static inline void gmx_simdcall
222 gatherLoadUTranspose(const double * base,
223 const std::int32_t offset[],
224 SimdDouble * v0,
225 SimdDouble * v1,
226 SimdDouble * v2)
228 // Offset list must be aligned for SIMD DINT32
229 assert(std::size_t(offset) % (GMX_SIMD_DINT32_WIDTH*sizeof(std::int32_t)) == 0);
231 for (std::size_t i = 0; i < v0->simdInternal_.size(); i++)
233 v0->simdInternal_[i] = base[align * offset[i]];
234 v1->simdInternal_[i] = base[align * offset[i] + 1];
235 v2->simdInternal_[i] = base[align * offset[i] + 2];
239 /*! \brief Transpose and store 3 SIMD doubles to 3 consecutive addresses at
240 * GMX_SIMD_DOUBLE_WIDTH offsets.
242 * \tparam align Alignment of the memory to which we write, i.e. distance
243 * (measured in elements, not bytes) between index points.
244 * When this is identical to the number of SIMD variables
245 * (i.e., 3 for this routine) the output data is packed without
246 * padding in memory. See the SIMD parameters for exactly
247 * what memory positions are written.
248 * \param[out] base Pointer to the start of the memory area
249 * \param offset Aligned array with offsets to the start of each triplet.
250 * \param v0 1st component of triplets, written to base[align*offset[i]].
251 * \param v1 2nd component of triplets, written to base[align*offset[i] + 1].
252 * \param v2 3rd component of triplets, written to base[align*offset[i] + 2].
254 * This function can work with both aligned (better performance) and unaligned
255 * memory. When the align parameter is not a power-of-two (align==3 would be normal
256 * for packed atomic coordinates) the memory obviously cannot be aligned, and
257 * we account for this.
258 * However, in the case where align is a power-of-two, we assume the base pointer
259 * also has the same alignment, which will enable many platforms to use faster
260 * aligned memory store operations.
261 * An easy way to think of this is that each triplet of data in memory must be
262 * aligned to the align parameter you specify when it's a power-of-two.
264 * The offset memory must always be aligned to GMX_SIMD_FINT32_WIDTH, since this
265 * enables us to use SIMD loads and gather operations on platforms that support it.
267 * \note You should NOT scale offsets before calling this routine; it is
268 * done internally by using the alignment template parameter instead.
269 * \note This routine uses a normal array for the offsets, since we typically
270 * load the data from memory. On the architectures we have tested this
271 * is faster even when a SIMD integer datatype is present.
273 template <int align>
274 static inline void gmx_simdcall
275 transposeScatterStoreU(double * base,
276 const std::int32_t offset[],
277 SimdDouble v0,
278 SimdDouble v1,
279 SimdDouble v2)
281 // Offset list must be aligned for SIMD DINT32
282 assert(std::size_t(offset) % (GMX_SIMD_DINT32_WIDTH*sizeof(std::int32_t)) == 0);
284 for (std::size_t i = 0; i < v0.simdInternal_.size(); i++)
286 base[align * offset[i]] = v0.simdInternal_[i];
287 base[align * offset[i] + 1] = v1.simdInternal_[i];
288 base[align * offset[i] + 2] = v2.simdInternal_[i];
293 /*! \brief Transpose and add 3 SIMD doubles to 3 consecutive addresses at
294 * GMX_SIMD_DOUBLE_WIDTH offsets.
296 * \tparam align Alignment of the memory to which we write, i.e. distance
297 * (measured in elements, not bytes) between index points.
298 * When this is identical to the number of SIMD variables
299 * (i.e., 3 for this routine) the output data is packed without
300 * padding in memory. See the SIMD parameters for exactly
301 * what memory positions are incremented.
302 * \param[out] base Pointer to the start of the memory area
303 * \param offset Aligned array with offsets to the start of each triplet.
304 * \param v0 1st component of triplets, added to base[align*offset[i]].
305 * \param v1 2nd component of triplets, added to base[align*offset[i] + 1].
306 * \param v2 3rd component of triplets, added to base[align*offset[i] + 2].
308 * This function can work with both aligned (better performance) and unaligned
309 * memory. When the align parameter is not a power-of-two (align==3 would be normal
310 * for packed atomic coordinates) the memory obviously cannot be aligned, and
311 * we account for this.
312 * However, in the case where align is a power-of-two, we assume the base pointer
313 * also has the same alignment, which will enable many platforms to use faster
314 * aligned memory load/store operations.
315 * An easy way to think of this is that each triplet of data in memory must be
316 * aligned to the align parameter you specify when it's a power-of-two.
318 * The offset memory must always be aligned to GMX_SIMD_FINT32_WIDTH, since this
319 * enables us to use SIMD loads and gather operations on platforms that support it.
321 * \note You should NOT scale offsets before calling this routine; it is
322 * done internally by using the alignment template parameter instead.
323 * \note This routine uses a normal array for the offsets, since we typically
324 * load the data from memory. On the architectures we have tested this
325 * is faster even when a SIMD integer datatype is present.
326 * \note To improve performance, this function might use full-SIMD-width
327 * unaligned load/store, and add 0.0 to the extra elements.
328 * This means you need to ensure the memory is padded
329 * at the end, so we always can load GMX_SIMD_REAL_WIDTH elements
330 * starting at the last offset. If you use the Gromacs aligned memory
331 * allocation routines this will always be the case.
333 template <int align>
334 static inline void gmx_simdcall
335 transposeScatterIncrU(double * base,
336 const std::int32_t offset[],
337 SimdDouble v0,
338 SimdDouble v1,
339 SimdDouble v2)
341 // Offset list must be aligned for SIMD DINT32
342 assert(std::size_t(offset) % (GMX_SIMD_DINT32_WIDTH*sizeof(std::int32_t)) == 0);
344 for (std::size_t i = 0; i < v0.simdInternal_.size(); i++)
346 base[align * offset[i]] += v0.simdInternal_[i];
347 base[align * offset[i] + 1] += v1.simdInternal_[i];
348 base[align * offset[i] + 2] += v2.simdInternal_[i];
352 /*! \brief Transpose and subtract 3 SIMD doubles to 3 consecutive addresses at
353 * GMX_SIMD_DOUBLE_WIDTH offsets.
355 * \tparam align Alignment of the memory to which we write, i.e. distance
356 * (measured in elements, not bytes) between index points.
357 * When this is identical to the number of SIMD variables
358 * (i.e., 3 for this routine) the output data is packed without
359 * padding in memory. See the SIMD parameters for exactly
360 * what memory positions are decremented.
361 * \param[out] base Pointer to start of memory.
362 * \param offset Aligned array with offsets to the start of each triplet.
363 * \param v0 1st component, subtracted from base[align*offset[i]]
364 * \param v1 2nd component, subtracted from base[align*offset[i]+1]
365 * \param v2 3rd component, subtracted from base[align*offset[i]+2]
367 * This function can work with both aligned (better performance) and unaligned
368 * memory. When the align parameter is not a power-of-two (align==3 would be normal
369 * for packed atomic coordinates) the memory obviously cannot be aligned, and
370 * we account for this.
371 * However, in the case where align is a power-of-two, we assume the base pointer
372 * also has the same alignment, which will enable many platforms to use faster
373 * aligned memory load/store operations.
374 * An easy way to think of this is that each triplet of data in memory must be
375 * aligned to the align parameter you specify when it's a power-of-two.
377 * The offset memory must always be aligned to GMX_SIMD_FINT32_WIDTH, since this
378 * enables us to use SIMD loads and gather operations on platforms that support it.
380 * \note You should NOT scale offsets before calling this routine; it is
381 * done internally by using the alignment template parameter instead.
382 * \note This routine uses a normal array for the offsets, since we typically
383 * load the data from memory. On the architectures we have tested this
384 * is faster even when a SIMD integer datatype is present.
385 * \note To improve performance, this function might use full-SIMD-width
386 * unaligned load/store, and subtract 0.0 from the extra elements.
387 * This means you need to ensure the memory is padded
388 * at the end, so we always can load GMX_SIMD_REAL_WIDTH elements
389 * starting at the last offset. If you use the Gromacs aligned memory
390 * allocation routines this will always be the case.
392 template <int align>
393 static inline void gmx_simdcall
394 transposeScatterDecrU(double * base,
395 const std::int32_t offset[],
396 SimdDouble v0,
397 SimdDouble v1,
398 SimdDouble v2)
400 // Offset list must be aligned for SIMD DINT32
401 assert(std::size_t(offset) % (GMX_SIMD_DINT32_WIDTH*sizeof(std::int32_t)) == 0);
403 for (std::size_t i = 0; i < v0.simdInternal_.size(); i++)
405 base[align * offset[i]] -= v0.simdInternal_[i];
406 base[align * offset[i] + 1] -= v1.simdInternal_[i];
407 base[align * offset[i] + 2] -= v2.simdInternal_[i];
412 /*! \brief Expand each element of double SIMD variable into three identical
413 * consecutive elements in three SIMD outputs.
415 * \param scalar Floating-point input, e.g. [s0 s1 s2 s3] if width=4.
416 * \param[out] triplets0 First output, e.g. [s0 s0 s0 s1] if width=4.
417 * \param[out] triplets1 Second output, e.g. [s1 s1 s2 s2] if width=4.
418 * \param[out] triplets2 Third output, e.g. [s2 s3 s3 s3] if width=4.
420 * This routine is meant to use for things like scalar-vector multiplication,
421 * where the vectors are stored in a merged format like [x0 y0 z0 x1 y1 z1 ...],
422 * while the scalars are stored as [s0 s1 s2...], and the data cannot easily
423 * be changed to SIMD-friendly layout.
425 * In this case, load 3 full-width SIMD variables from the vector array (This
426 * will always correspond to GMX_SIMD_DOUBLE_WIDTH triplets),
427 * load a single full-width variable from the scalar array, and
428 * call this routine to expand the data. You can then simply multiply the
429 * first, second and third pair of SIMD variables, and store the three
430 * results back into a suitable vector-format array.
432 static inline void gmx_simdcall
433 expandScalarsToTriplets(SimdDouble scalar,
434 SimdDouble * triplets0,
435 SimdDouble * triplets1,
436 SimdDouble * triplets2)
438 for (std::size_t i = 0; i < scalar.simdInternal_.size(); i++)
440 triplets0->simdInternal_[i] = scalar.simdInternal_[i / 3];
441 triplets1->simdInternal_[i] = scalar.simdInternal_[(i + scalar.simdInternal_.size()) / 3];
442 triplets2->simdInternal_[i] = scalar.simdInternal_[(i + 2 * scalar.simdInternal_.size()) / 3];
447 /*! \brief Load 4 consecutive doubles from each of GMX_SIMD_DOUBLE_WIDTH offsets
448 * specified by a SIMD integer, transpose into 4 SIMD double variables.
450 * \tparam align Alignment of the memory from which we read, i.e. distance
451 * (measured in elements, not bytes) between index points.
452 * When this is identical to the number of SIMD variables
453 * (i.e., 4 for this routine) the input data is packed without
454 * padding in memory. See the SIMD parameters for exactly
455 * what memory positions are loaded.
456 * \param base Aligned pointer to the start of the memory.
457 * \param offset SIMD integer type with offsets to the start of each triplet.
458 * \param[out] v0 First component, base[align*offset[i]] for each i.
459 * \param[out] v1 Second component, base[align*offset[i] + 1] for each i.
460 * \param[out] v2 Third component, base[align*offset[i] + 2] for each i.
461 * \param[out] v3 Fourth component, base[align*offset[i] + 3] for each i.
463 * The floating-point memory locations must be aligned, but only to the smaller
464 * of four elements and the floating-point SIMD width.
466 * \note You should NOT scale offsets before calling this routine; it is
467 * done internally by using the alignment template parameter instead.
468 * \note This is a special routine primarily intended for loading Gromacs
469 * table data as efficiently as possible - this is the reason for using
470 * a SIMD offset index, since the result of the real-to-integer conversion
471 * is present in a SIMD register just before calling this routine.
473 template <int align>
474 static inline void gmx_simdcall
475 gatherLoadBySimdIntTranspose(const double * base,
476 SimdDInt32 offset,
477 SimdDouble * v0,
478 SimdDouble * v1,
479 SimdDouble * v2,
480 SimdDouble * v3)
482 // Base pointer must be aligned to the smaller of 4 elements and double SIMD width
483 assert(std::size_t(base) % (std::min(GMX_SIMD_DOUBLE_WIDTH, 4)*sizeof(double)) == 0);
484 // align parameter must also be a multiple of the above alignment requirement
485 assert(align % std::min(GMX_SIMD_DOUBLE_WIDTH, 4) == 0);
487 for (std::size_t i = 0; i < v0->simdInternal_.size(); i++)
489 v0->simdInternal_[i] = base[align * offset.simdInternal_[i]];
490 v1->simdInternal_[i] = base[align * offset.simdInternal_[i] + 1];
491 v2->simdInternal_[i] = base[align * offset.simdInternal_[i] + 2];
492 v3->simdInternal_[i] = base[align * offset.simdInternal_[i] + 3];
497 /*! \brief Load 2 consecutive doubles from each of GMX_SIMD_DOUBLE_WIDTH offsets
498 * (unaligned) specified by SIMD integer, transpose into 2 SIMD doubles.
500 * \tparam align Alignment of the memory from which we read, i.e. distance
501 * (measured in elements, not bytes) between index points.
502 * When this is identical to the number of SIMD variables
503 * (i.e., 2 for this routine) the input data is packed without
504 * padding in memory. See the SIMD parameters for exactly
505 * what memory positions are loaded.
506 * \param base Pointer to the start of the memory.
507 * \param offset SIMD integer type with offsets to the start of each triplet.
508 * \param[out] v0 First component, base[align*offset[i]] for each i.
509 * \param[out] v1 Second component, base[align*offset[i] + 1] for each i.
511 * Since some SIMD architectures cannot handle any unaligned loads, this routine
512 * is only available if GMX_SIMD_HAVE_GATHER_LOADU_BYSIMDINT_TRANSPOSE is 1.
514 * \note You should NOT scale offsets before calling this routine; it is
515 * done internally by using the alignment template parameter instead.
516 * \note This is a special routine primarily intended for loading Gromacs
517 * table data as efficiently as possible - this is the reason for using
518 * a SIMD offset index, since the result of the real-to-integer conversion
519 * is present in a SIMD register just before calling this routine.
521 template <int align>
522 static inline void gmx_simdcall
523 gatherLoadUBySimdIntTranspose(const double * base,
524 SimdDInt32 offset,
525 SimdDouble * v0,
526 SimdDouble * v1)
528 for (std::size_t i = 0; i < v0->simdInternal_.size(); i++)
530 v0->simdInternal_[i] = base[align * offset.simdInternal_[i]];
531 v1->simdInternal_[i] = base[align * offset.simdInternal_[i] + 1];
535 /*! \brief Load 2 consecutive doubles from each of GMX_SIMD_DOUBLE_WIDTH offsets
536 * specified by a SIMD integer, transpose into 2 SIMD double variables.
538 * \tparam align Alignment of the memory from which we read, i.e. distance
539 * (measured in elements, not bytes) between index points.
540 * When this is identical to the number of SIMD variables
541 * (i.e., 2 for this routine) the input data is packed without
542 * padding in memory. See the SIMD parameters for exactly
543 * what memory positions are loaded.
544 * \param base Aligned pointer to the start of the memory.
545 * \param offset SIMD integer type with offsets to the start of each triplet.
546 * \param[out] v0 First component, base[align*offset[i]] for each i.
547 * \param[out] v1 Second component, base[align*offset[i] + 1] for each i.
549 * The floating-point memory locations must be aligned, but only to the smaller
550 * of two elements and the floating-point SIMD width.
552 * \note You should NOT scale offsets before calling this routine; it is
553 * done internally by using the alignment template parameter instead.
554 * \note This is a special routine primarily intended for loading Gromacs
555 * table data as efficiently as possible - this is the reason for using
556 * a SIMD offset index, since the result of the real-to-integer conversion
557 * is present in a SIMD register just before calling this routine.
559 template <int align>
560 static inline void gmx_simdcall
561 gatherLoadBySimdIntTranspose(const double * base,
562 SimdDInt32 offset,
563 SimdDouble * v0,
564 SimdDouble * v1)
566 // Base pointer must be aligned to the smaller of 2 elements and double SIMD width
567 assert(std::size_t(base) % (std::min(GMX_SIMD_DOUBLE_WIDTH, 2)*sizeof(double)) == 0);
568 // align parameter must also be a multiple of the above alignment requirement
569 assert(align % std::min(GMX_SIMD_DOUBLE_WIDTH, 2) == 0);
571 for (std::size_t i = 0; i < v0->simdInternal_.size(); i++)
573 v0->simdInternal_[i] = base[align * offset.simdInternal_[i]];
574 v1->simdInternal_[i] = base[align * offset.simdInternal_[i] + 1];
579 /*! \brief Reduce each of four SIMD doubles, add those values to four consecutive
580 * doubles in memory, return sum.
582 * \param m Pointer to memory where four doubles should be incremented
583 * \param v0 SIMD variable whose sum should be added to m[0]
584 * \param v1 SIMD variable whose sum should be added to m[1]
585 * \param v2 SIMD variable whose sum should be added to m[2]
586 * \param v3 SIMD variable whose sum should be added to m[3]
588 * \return Sum of all elements in the four SIMD variables.
590 * The pointer m must be aligned to the smaller of four elements and the
591 * floating-point SIMD width.
593 * \note This is a special routine intended for the Gromacs nonbonded kernels.
594 * It is used in the epilogue of the outer loop, where the variables will
595 * contain unrolled forces for one outer-loop-particle each, corresponding to
596 * a single coordinate (i.e, say, four x-coordinate force variables). These
597 * should be summed and added to the force array in memory. Since we always work
598 * with contiguous SIMD-layout , we can use efficient aligned loads/stores.
599 * When calculating the virial, we also need the total sum of all forces for
600 * each coordinate. This is provided as the return value. For routines that
601 * do not need these, this extra code will be optimized away completely if you
602 * just ignore the return value (Checked with gcc-4.9.1 and clang-3.6 for AVX).
604 static inline double gmx_simdcall
605 reduceIncr4ReturnSum(double * m,
606 SimdDouble v0,
607 SimdDouble v1,
608 SimdDouble v2,
609 SimdDouble v3)
611 double sum[4]; // Note that the 4 here corresponds to the 4 m-elements, not any SIMD width
613 // Make sure the memory pointer is aligned to the smaller of 4 elements and double SIMD width
614 assert(std::size_t(m) % (std::min(GMX_SIMD_DOUBLE_WIDTH, 4)*sizeof(double)) == 0);
616 sum[0] = reduce(v0);
617 sum[1] = reduce(v1);
618 sum[2] = reduce(v2);
619 sum[3] = reduce(v3);
621 m[0] += sum[0];
622 m[1] += sum[1];
623 m[2] += sum[2];
624 m[3] += sum[3];
626 return sum[0] + sum[1] + sum[2] + sum[3];
630 /*! \}
632 * \name Higher-level SIMD utilities accessing partial (half-width) SIMD doubles.
634 * See the single-precision versions for documentation. Since double precision
635 * is typically half the width of single, this double version is likely only
636 * useful with 512-bit and larger implementations.
638 * \{
641 /*! \brief Load low & high parts of SIMD double from different locations.
643 * \param m0 Pointer to memory aligned to half SIMD width.
644 * \param m1 Pointer to memory aligned to half SIMD width.
646 * \return SIMD variable with low part loaded from m0, high from m1.
648 * Available if \ref GMX_SIMD_HAVE_HSIMD_UTIL_DOUBLE is 1.
650 static inline SimdDouble gmx_simdcall
651 loadDualHsimd(const double * m0,
652 const double * m1)
654 SimdDouble a;
656 // Make sure the memory pointers are aligned to half double SIMD width
657 assert(std::size_t(m0) % (GMX_SIMD_DOUBLE_WIDTH/2*sizeof(double)) == 0);
658 assert(std::size_t(m1) % (GMX_SIMD_DOUBLE_WIDTH/2*sizeof(double)) == 0);
660 for (std::size_t i = 0; i < a.simdInternal_.size()/2; i++)
662 a.simdInternal_[i] = m0[i];
663 a.simdInternal_[a.simdInternal_.size()/2 + i] = m1[i];
665 return a;
668 /*! \brief Load half-SIMD-width double data, spread to both halves.
670 * \param m Pointer to memory aligned to half SIMD width.
672 * \return SIMD variable with both halves loaded from m..
674 * Available if \ref GMX_SIMD_HAVE_HSIMD_UTIL_DOUBLE is 1.
676 static inline SimdDouble gmx_simdcall
677 loadDuplicateHsimd(const double * m)
679 SimdDouble a;
681 // Make sure the memory pointer is aligned
682 assert(std::size_t(m) % (GMX_SIMD_DOUBLE_WIDTH/2*sizeof(double)) == 0);
684 for (std::size_t i = 0; i < a.simdInternal_.size()/2; i++)
686 a.simdInternal_[i] = m[i];
687 a.simdInternal_[a.simdInternal_.size()/2 + i] = a.simdInternal_[i];
689 return a;
692 /*! \brief Load two doubles, spread 1st in low half, 2nd in high half.
694 * \param m Pointer to two adjacent double values.
696 * \return SIMD variable where all elements in the low half have been set
697 * to m[0], and all elements in high half to m[1].
699 * \note This routine always loads two values and sets the halves separately.
700 * If you want to set all elements to the same value, simply use
701 * the standard (non-half-SIMD) operations.
703 * Available if \ref GMX_SIMD_HAVE_HSIMD_UTIL_DOUBLE is 1.
705 static inline SimdDouble gmx_simdcall
706 load1DualHsimd(const double * m)
708 SimdDouble a;
710 for (std::size_t i = 0; i < a.simdInternal_.size()/2; i++)
712 a.simdInternal_[i] = m[0];
713 a.simdInternal_[a.simdInternal_.size()/2 + i] = m[1];
715 return a;
719 /*! \brief Store low & high parts of SIMD double to different locations.
721 * \param m0 Pointer to memory aligned to half SIMD width.
722 * \param m1 Pointer to memory aligned to half SIMD width.
723 * \param a SIMD variable. Low half should be stored to m0, high to m1.
725 * Available if \ref GMX_SIMD_HAVE_HSIMD_UTIL_DOUBLE is 1.
727 static inline void gmx_simdcall
728 storeDualHsimd(double * m0,
729 double * m1,
730 SimdDouble a)
732 // Make sure the memory pointers are aligned to half double SIMD width
733 assert(std::size_t(m0) % (GMX_SIMD_DOUBLE_WIDTH/2*sizeof(double)) == 0);
734 assert(std::size_t(m1) % (GMX_SIMD_DOUBLE_WIDTH/2*sizeof(double)) == 0);
736 for (std::size_t i = 0; i < a.simdInternal_.size()/2; i++)
738 m0[i] = a.simdInternal_[i];
739 m1[i] = a.simdInternal_[a.simdInternal_.size()/2 + i];
743 /*! \brief Add each half of SIMD variable to separate memory adresses
745 * \param m0 Pointer to memory aligned to half SIMD width.
746 * \param m1 Pointer to memory aligned to half SIMD width.
747 * \param a SIMD variable. Lower half will be added to m0, upper half to m1.
749 * The memory must be aligned to half SIMD width.
751 * \note The updated m0 value is written before m1 is read from memory, so
752 * the result will be correct even if the memory regions overlap.
754 * Available if \ref GMX_SIMD_HAVE_HSIMD_UTIL_DOUBLE is 1.
756 static inline void gmx_simdcall
757 incrDualHsimd(double * m0,
758 double * m1,
759 SimdDouble a)
761 // Make sure the memory pointer is aligned to half double SIMD width
762 assert(std::size_t(m0) % (GMX_SIMD_DOUBLE_WIDTH/2*sizeof(double)) == 0);
763 assert(std::size_t(m1) % (GMX_SIMD_DOUBLE_WIDTH/2*sizeof(double)) == 0);
765 for (std::size_t i = 0; i < a.simdInternal_.size()/2; i++)
767 m0[i] += a.simdInternal_[i];
769 for (std::size_t i = 0; i < a.simdInternal_.size()/2; i++)
771 m1[i] += a.simdInternal_[a.simdInternal_.size()/2 + i];
775 /*! \brief Add the two halves of a SIMD double, subtract the sum from
776 * half-SIMD-width consecutive doubles in memory.
778 * \param m half-width aligned memory, from which sum of the halves will be subtracted.
779 * \param a SIMD variable. Upper & lower halves will first be added.
781 * If the SIMD width is 8 and contains [a b c d e f g h], the
782 * memory will be modified to [m[0]-(a+e) m[1]-(b+f) m[2]-(c+g) m[3]-(d+h)].
784 * The memory must be aligned to half SIMD width.
786 * Available if \ref GMX_SIMD_HAVE_HSIMD_UTIL_DOUBLE is 1.
788 static inline void gmx_simdcall
789 decrHsimd(double * m,
790 SimdDouble a)
792 // Make sure the memory pointer is aligned to half double SIMD width
793 assert(std::size_t(m) % (GMX_SIMD_DOUBLE_WIDTH/2*sizeof(double)) == 0);
795 for (std::size_t i = 0; i < a.simdInternal_.size()/2; i++)
797 m[i] -= a.simdInternal_[i] + a.simdInternal_[a.simdInternal_.size()/2 + i];
802 /*! \brief Load 2 consecutive doubles from each of GMX_SIMD_DOUBLE_WIDTH/2 offsets,
803 * transpose into SIMD double (low half from base0, high from base1).
805 * \tparam align Alignment of the storage, i.e. the distance
806 * (measured in elements, not bytes) between index points.
807 * When this is identical to the number of output components
808 * the data is packed without padding. This must be a
809 * multiple of the alignment to keep all data aligned.
810 * \param base0 Pointer to base of first aligned memory
811 * \param base1 Pointer to base of second aligned memory
812 * \param offset Offset to the start of each pair
813 * \param[out] v0 1st element in each pair, base0 in low and base1 in high half.
814 * \param[out] v1 2nd element in each pair, base0 in low and base1 in high half.
816 * The offset array should be of half the SIMD width length, so it corresponds
817 * to the half-SIMD-register operations. This also means it must be aligned
818 * to half the integer SIMD width (i.e., GMX_SIMD_DINT32_WIDTH/2).
820 * The floating-point memory locations must be aligned, but only to the smaller
821 * of two elements and the floating-point SIMD width.
823 * This routine is primarily designed to load nonbonded parameters in the
824 * kernels. It is the equivalent of the full-width routine
825 * gatherLoadTranspose(), but just
826 * as the other hsimd routines it will pick half-SIMD-width data from base0
827 * and put in the lower half, while the upper half comes from base1.
829 * For an example, assume the SIMD width is 8, align is 2, that
830 * base0 is [A0 A1 B0 B1 C0 C1 D0 D1 ...], and base1 [E0 E1 F0 F1 G0 G1 H0 H1...].
832 * Then we will get v0 as [A0 B0 C0 D0 E0 F0 G0 H0] and v1 as [A1 B1 C1 D1 E1 F1 G1 H1].
834 * Available if \ref GMX_SIMD_HAVE_HSIMD_UTIL_DOUBLE is 1.
836 template <int align>
837 static inline void gmx_simdcall
838 gatherLoadTransposeHsimd(const double * base0,
839 const double * base1,
840 std::int32_t offset[],
841 SimdDouble * v0,
842 SimdDouble * v1)
844 // Offset list must be aligned for half SIMD DINT32 width
845 assert(std::size_t(offset) % (GMX_SIMD_DINT32_WIDTH/2*sizeof(std::int32_t)) == 0);
846 // base pointers must be aligned to the smaller of 2 elements and double SIMD width
847 assert(std::size_t(base0) % (std::min(GMX_SIMD_DOUBLE_WIDTH, 2)*sizeof(double)) == 0);
848 assert(std::size_t(base1) % (std::min(GMX_SIMD_DOUBLE_WIDTH, 2)*sizeof(double)) == 0);
849 // alignment parameter must be also be multiple of the above required alignment
850 assert(align % std::min(GMX_SIMD_DOUBLE_WIDTH, 2) == 0);
852 for (std::size_t i = 0; i < v0->simdInternal_.size()/2; i++)
854 v0->simdInternal_[i] = base0[align * offset[i]];
855 v1->simdInternal_[i] = base0[align * offset[i] + 1];
856 v0->simdInternal_[v0->simdInternal_.size()/2 + i] = base1[align * offset[i]];
857 v1->simdInternal_[v1->simdInternal_.size()/2 + i] = base1[align * offset[i] + 1];
862 /*! \brief Reduce the 4 half-SIMD-with doubles in 2 SIMD variables (sum halves),
863 * increment four consecutive doubles in memory, return sum.
865 * \param m Pointer to memory where the four values should be incremented
866 * \param v0 Variable whose half-SIMD sums should be added to m[0]/m[1], respectively.
867 * \param v1 Variable whose half-SIMD sums should be added to m[2]/m[3], respectively.
869 * \return Sum of all elements in the four SIMD variables.
871 * The pointer m must be aligned, but only to the smaller
872 * of four elements and the floating-point SIMD width.
874 * \note This is the half-SIMD-width version of
875 * reduceIncr4ReturnSum(). The only difference is that the
876 * four half-SIMD inputs needed are present in the low/high halves of the
877 * two SIMD arguments.
879 * Available if \ref GMX_SIMD_HAVE_HSIMD_UTIL_DOUBLE is 1.
881 static inline double gmx_simdcall
882 reduceIncr4ReturnSumHsimd(double * m,
883 SimdDouble v0,
884 SimdDouble v1)
886 // The 4 here corresponds to the 4 elements in memory, not any SIMD width
887 double sum[4] = { 0.0, 0.0, 0.0, 0.0 };
889 for (std::size_t i = 0; i < v0.simdInternal_.size()/2; i++)
891 sum[0] += v0.simdInternal_[i];
892 sum[1] += v0.simdInternal_[v0.simdInternal_.size()/2 + i];
893 sum[2] += v1.simdInternal_[i];
894 sum[3] += v1.simdInternal_[v1.simdInternal_.size()/2 + i];
897 // Make sure the memory pointer is aligned to the smaller of 4 elements and double SIMD width
898 assert(std::size_t(m) % (std::min(GMX_SIMD_DOUBLE_WIDTH, 4)*sizeof(double)) == 0);
900 m[0] += sum[0];
901 m[1] += sum[1];
902 m[2] += sum[2];
903 m[3] += sum[3];
905 return sum[0] + sum[1] + sum[2] + sum[3];
908 /*! \} */
910 /*! \} */
911 /*! \endcond */
913 } // namespace gmx
915 #endif // GMX_SIMD_IMPL_REFERENCE_UTIL_DOUBLE_H