src/gromacs/simd/impl_reference/impl_reference_util_double.h

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2014,2015, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35
  36 #ifndef GMX_SIMD_IMPL_REFERENCE_UTIL_DOUBLE_H
  37 #define GMX_SIMD_IMPL_REFERENCE_UTIL_DOUBLE_H
  38
  39 /*! \libinternal \file
  40  *
  41  * \brief Reference impl., higher-level double prec. SIMD utility functions
  42  *
  43  * \author Erik Lindahl <erik.lindahl@scilifelab.se>
  44  *
  45  * \ingroup module_simd
  46  */
  47
  48 /* Avoid adding dependencies on the rest of GROMACS here (e.g. gmxassert.h)
  49  * since we want to be able run the low-level SIMD implementations independently
  50  * in simulators for new hardware.
  51  */
  52
  53 #include "config.h"
  54
  55 #include <cassert>
  56 #include <cstddef>
  57 #include <cstdint>
  58
  59 #include <algorithm>
  60
  61 #include "impl_reference_definitions.h"
  62 #include "impl_reference_simd_double.h"
  63
  64 namespace gmx
  65 {
  66
  67 /*! \cond libapi */
  68 /*! \addtogroup module_simd */
  69 /*! \{ */
  70
  71 /* \name Higher-level SIMD utility functions, double precision.
  72  *
  73  * These include generic functions to work with triplets of data, typically
  74  * coordinates, and a few utility functions to load and update data in the
  75  * nonbonded kernels. These functions should be available on all implementations.
  76  *
  77  * \{
  78  */
  79
  80 /*! \brief Load 4 consecutive double from each of GMX_SIMD_DOUBLE_WIDTH offsets,
  81  *         and transpose into 4 SIMD double variables.
  82  *
  83  * \tparam     align  Alignment of the memory from which we read, i.e. distance
  84  *                    (measured in elements, not bytes) between index points.
  85  *                    When this is identical to the number of SIMD variables
  86  *                    (i.e., 4 for this routine) the input data is packed without
  87  *                    padding in memory. See the SIMD parameters for exactly
  88  *                    what memory positions are loaded.
  89  * \param      base   Pointer to the start of the memory area
  90  * \param      offset Array with offsets to the start of each data point.
  91  * \param[out] v0     1st component of data, base[align*offset[i]] for each i.
  92  * \param[out] v1     2nd component of data, base[align*offset[i] + 1] for each i.
  93  * \param[out] v2     3rd component of data, base[align*offset[i] + 2] for each i.
  94  * \param[out] v3     4th component of data, base[align*offset[i] + 3] for each i.
  95  *
  96  * The floating-point memory locations must be aligned, but only to the smaller
  97  * of four elements and the floating-point SIMD width.
  98  *
  99  * The offset memory must be aligned to GMX_SIMD_DINT32_WIDTH.
 100  *
 101  * \note You should NOT scale offsets before calling this routine; it is
 102  *       done internally by using the alignment template parameter instead.
 103  */
 104 template <int align>
 105 static inline void gmx_simdcall
 106 gatherLoadTranspose(const double  *        base,
 107                     const std::int32_t     offset[],
 108                     SimdDouble *           v0,
 109                     SimdDouble *           v1,
 110                     SimdDouble *           v2,
 111                     SimdDouble *           v3)
 112 {
 113     // Offset list must be aligned for SIMD DINT32
 114     assert(std::size_t(offset) % (GMX_SIMD_DINT32_WIDTH*sizeof(std::int32_t)) == 0);
 115     // Base pointer must be aligned to the smaller of 4 elements and double SIMD width
 116     assert(std::size_t(base) % (std::min(GMX_SIMD_DOUBLE_WIDTH, 4)*sizeof(double)) == 0);
 117     // align parameter must also be a multiple of the above alignment requirement
 118     assert(align % std::min(GMX_SIMD_DOUBLE_WIDTH, 4) == 0);
 119
 120     for (std::size_t i = 0; i < v0->simdInternal_.size(); i++)
 121     {
 122         v0->simdInternal_[i] = base[align * offset[i]];
 123         v1->simdInternal_[i] = base[align * offset[i] + 1];
 124         v2->simdInternal_[i] = base[align * offset[i] + 2];
 125         v3->simdInternal_[i] = base[align * offset[i] + 3];
 126     }
 127 }
 128
 129
 130 /*! \brief Load 2 consecutive double from each of GMX_SIMD_DOUBLE_WIDTH offsets,
 131  *         and transpose into 2 SIMD double variables.
 132  *
 133  * \tparam     align  Alignment of the memory from which we read, i.e. distance
 134  *                    (measured in elements, not bytes) between index points.
 135  *                    When this is identical to the number of SIMD variables
 136  *                    (i.e., 2 for this routine) the input data is packed without
 137  *                    padding in memory. See the SIMD parameters for exactly
 138  *                    what memory positions are loaded.
 139  * \param      base   Pointer to the start of the memory area
 140  * \param      offset Array with offsets to the start of each data point.
 141  * \param[out] v0     1st component of data, base[align*offset[i]] for each i.
 142  * \param[out] v1     2nd component of data, base[align*offset[i] + 1] for each i.
 143  *
 144  * The floating-point memory locations must be aligned, but only to the smaller
 145  * of two elements and the floating-point SIMD width.
 146  *
 147  * The offset memory must be aligned to GMX_SIMD_DINT32_WIDTH.
 148  *
 149  * \note You should NOT scale offsets before calling this routine; it is
 150  *       done internally by using the alignment template parameter instead.
 151  */
 152 template <int align>
 153 static inline void gmx_simdcall
 154 gatherLoadTranspose(const double  *        base,
 155                     const std::int32_t     offset[],
 156                     SimdDouble *           v0,
 157                     SimdDouble *           v1)
 158 {
 159     // Offset list must be aligned for SIMD DINT32
 160     assert(std::size_t(offset) % (GMX_SIMD_DINT32_WIDTH*sizeof(std::int32_t)) == 0);
 161     // Base pointer must be aligned to the smaller of 2 elements and double SIMD width
 162     assert(std::size_t(base) % (std::min(GMX_SIMD_DOUBLE_WIDTH, 2)*sizeof(double)) == 0);
 163     // align parameter must also be a multiple of the above alignment requirement
 164     assert(align % std::min(GMX_SIMD_DOUBLE_WIDTH, 2) == 0);
 165
 166     for (std::size_t i = 0; i < v0->simdInternal_.size(); i++)
 167     {
 168         v0->simdInternal_[i] = base[align * offset[i]];
 169         v1->simdInternal_[i] = base[align * offset[i] + 1];
 170     }
 171 }
 172
 173
 174 /*! \brief Best alignment to use for aligned pairs of double data.
 175  *
 176  * \copydetails c_simdBestPairAlignmentFloat
 177  */
 178 static const int c_simdBestPairAlignmentDouble = 2;
 179
 180
 181 /*! \brief Load 3 consecutive doubles from each of GMX_SIMD_DOUBLE_WIDTH offsets,
 182  *         and transpose into 3 SIMD double variables.
 183  *
 184  * \tparam     align  Alignment of the memory from which we read, i.e. distance
 185  *                    (measured in elements, not bytes) between index points.
 186  *                    When this is identical to the number of SIMD variables
 187  *                    (i.e., 3 for this routine) the input data is packed without
 188  *                    padding in memory. See the SIMD parameters for exactly
 189  *                    what memory positions are loaded.
 190  * \param      base   Pointer to the start of the memory area
 191  * \param      offset Array with offsets to the start of each data point.
 192  * \param[out] v0     1st component of data, base[align*offset[i]] for each i.
 193  * \param[out] v1     2nd component of data, base[align*offset[i] + 1] for each i.
 194  * \param[out] v2     3rd component of data, base[align*offset[i] + 2] for each i.
 195  *
 196  * This function can work with both aligned (better performance) and unaligned
 197  * memory. When the align parameter is not a power-of-two (align==3 would be normal
 198  * for packed atomic coordinates) the memory obviously cannot be aligned, and
 199  * we account for this.
 200  * However, in the case where align is a power-of-two, we assume the base pointer
 201  * also has the same alignment, which will enable many platforms to use faster
 202  * aligned memory load operations.
 203  * An easy way to think of this is that each triplet of data in memory must be
 204  * aligned to the align parameter you specify when it's a power-of-two.
 205  *
 206  * The offset memory must always be aligned to GMX_SIMD_FINT32_WIDTH, since this
 207  * enables us to use SIMD loads and gather operations on platforms that support it.
 208  *
 209  * \note You should NOT scale offsets before calling this routine; it is
 210  *       done internally by using the alignment template parameter instead.
 211  * \note This routine uses a normal array for the offsets, since we typically
 212  *       load this data from memory. On the architectures we have tested this
 213  *       is faster even when a SIMD integer datatype is present.
 214  * \note To improve performance, this function might use full-SIMD-width
 215  *       unaligned loads. This means you need to ensure the memory is padded
 216  *       at the end, so we always can load GMX_SIMD_REAL_WIDTH elements
 217  *       starting at the last offset. If you use the Gromacs aligned memory
 218  *       allocation routines this will always be the case.
 219  */
 220 template <int align>
 221 static inline void gmx_simdcall
 222 gatherLoadUTranspose(const double  *        base,
 223                      const std::int32_t     offset[],
 224                      SimdDouble *           v0,
 225                      SimdDouble *           v1,
 226                      SimdDouble *           v2)
 227 {
 228     // Offset list must be aligned for SIMD DINT32
 229     assert(std::size_t(offset) % (GMX_SIMD_DINT32_WIDTH*sizeof(std::int32_t)) == 0);
 230
 231     for (std::size_t i = 0; i < v0->simdInternal_.size(); i++)
 232     {
 233         v0->simdInternal_[i] = base[align * offset[i]];
 234         v1->simdInternal_[i] = base[align * offset[i] + 1];
 235         v2->simdInternal_[i] = base[align * offset[i] + 2];
 236     }
 237 }
 238
 239 /*! \brief Transpose and store 3 SIMD doubles to 3 consecutive addresses at
 240  *         GMX_SIMD_DOUBLE_WIDTH offsets.
 241  *
 242  * \tparam     align  Alignment of the memory to which we write, i.e. distance
 243  *                    (measured in elements, not bytes) between index points.
 244  *                    When this is identical to the number of SIMD variables
 245  *                    (i.e., 3 for this routine) the output data is packed without
 246  *                    padding in memory. See the SIMD parameters for exactly
 247  *                    what memory positions are written.
 248  * \param[out] base   Pointer to the start of the memory area
 249  * \param      offset Aligned array with offsets to the start of each triplet.
 250  * \param      v0     1st component of triplets, written to base[align*offset[i]].
 251  * \param      v1     2nd component of triplets, written to base[align*offset[i] + 1].
 252  * \param      v2     3rd component of triplets, written to base[align*offset[i] + 2].
 253  *
 254  * This function can work with both aligned (better performance) and unaligned
 255  * memory. When the align parameter is not a power-of-two (align==3 would be normal
 256  * for packed atomic coordinates) the memory obviously cannot be aligned, and
 257  * we account for this.
 258  * However, in the case where align is a power-of-two, we assume the base pointer
 259  * also has the same alignment, which will enable many platforms to use faster
 260  * aligned memory store operations.
 261  * An easy way to think of this is that each triplet of data in memory must be
 262  * aligned to the align parameter you specify when it's a power-of-two.
 263  *
 264  * The offset memory must always be aligned to GMX_SIMD_FINT32_WIDTH, since this
 265  * enables us to use SIMD loads and gather operations on platforms that support it.
 266  *
 267  * \note You should NOT scale offsets before calling this routine; it is
 268  *       done internally by using the alignment template parameter instead.
 269  * \note This routine uses a normal array for the offsets, since we typically
 270  *       load the data from memory. On the architectures we have tested this
 271  *       is faster even when a SIMD integer datatype is present.
 272  */
 273 template <int align>
 274 static inline void gmx_simdcall
 275 transposeScatterStoreU(double  *              base,
 276                        const std::int32_t     offset[],
 277                        SimdDouble             v0,
 278                        SimdDouble             v1,
 279                        SimdDouble             v2)
 280 {
 281     // Offset list must be aligned for SIMD DINT32
 282     assert(std::size_t(offset) % (GMX_SIMD_DINT32_WIDTH*sizeof(std::int32_t)) == 0);
 283
 284     for (std::size_t i = 0; i < v0.simdInternal_.size(); i++)
 285     {
 286         base[align * offset[i]]     = v0.simdInternal_[i];
 287         base[align * offset[i] + 1] = v1.simdInternal_[i];
 288         base[align * offset[i] + 2] = v2.simdInternal_[i];
 289     }
 290 }
 291
 292
 293 /*! \brief Transpose and add 3 SIMD doubles to 3 consecutive addresses at
 294  *         GMX_SIMD_DOUBLE_WIDTH offsets.
 295  *
 296  * \tparam     align  Alignment of the memory to which we write, i.e. distance
 297  *                    (measured in elements, not bytes) between index points.
 298  *                    When this is identical to the number of SIMD variables
 299  *                    (i.e., 3 for this routine) the output data is packed without
 300  *                    padding in memory. See the SIMD parameters for exactly
 301  *                    what memory positions are incremented.
 302  * \param[out] base   Pointer to the start of the memory area
 303  * \param      offset Aligned array with offsets to the start of each triplet.
 304  * \param      v0     1st component of triplets, added to base[align*offset[i]].
 305  * \param      v1     2nd component of triplets, added to base[align*offset[i] + 1].
 306  * \param      v2     3rd component of triplets, added to base[align*offset[i] + 2].
 307  *
 308  * This function can work with both aligned (better performance) and unaligned
 309  * memory. When the align parameter is not a power-of-two (align==3 would be normal
 310  * for packed atomic coordinates) the memory obviously cannot be aligned, and
 311  * we account for this.
 312  * However, in the case where align is a power-of-two, we assume the base pointer
 313  * also has the same alignment, which will enable many platforms to use faster
 314  * aligned memory load/store operations.
 315  * An easy way to think of this is that each triplet of data in memory must be
 316  * aligned to the align parameter you specify when it's a power-of-two.
 317  *
 318  * The offset memory must always be aligned to GMX_SIMD_FINT32_WIDTH, since this
 319  * enables us to use SIMD loads and gather operations on platforms that support it.
 320  *
 321  * \note You should NOT scale offsets before calling this routine; it is
 322  *       done internally by using the alignment template parameter instead.
 323  * \note This routine uses a normal array for the offsets, since we typically
 324  *       load the data from memory. On the architectures we have tested this
 325  *       is faster even when a SIMD integer datatype is present.
 326  * \note To improve performance, this function might use full-SIMD-width
 327  *       unaligned load/store, and add 0.0 to the extra elements.
 328  *       This means you need to ensure the memory is padded
 329  *       at the end, so we always can load GMX_SIMD_REAL_WIDTH elements
 330  *       starting at the last offset. If you use the Gromacs aligned memory
 331  *       allocation routines this will always be the case.
 332  */
 333 template <int align>
 334 static inline void gmx_simdcall
 335 transposeScatterIncrU(double  *              base,
 336                       const std::int32_t     offset[],
 337                       SimdDouble             v0,
 338                       SimdDouble             v1,
 339                       SimdDouble             v2)
 340 {
 341     // Offset list must be aligned for SIMD DINT32
 342     assert(std::size_t(offset) % (GMX_SIMD_DINT32_WIDTH*sizeof(std::int32_t)) == 0);
 343
 344     for (std::size_t i = 0; i < v0.simdInternal_.size(); i++)
 345     {
 346         base[align * offset[i]]     += v0.simdInternal_[i];
 347         base[align * offset[i] + 1] += v1.simdInternal_[i];
 348         base[align * offset[i] + 2] += v2.simdInternal_[i];
 349     }
 350 }
 351
 352 /*! \brief Transpose and subtract 3 SIMD doubles to 3 consecutive addresses at
 353  *         GMX_SIMD_DOUBLE_WIDTH offsets.
 354  *
 355  * \tparam     align  Alignment of the memory to which we write, i.e. distance
 356  *                    (measured in elements, not bytes) between index points.
 357  *                    When this is identical to the number of SIMD variables
 358  *                    (i.e., 3 for this routine) the output data is packed without
 359  *                    padding in memory. See the SIMD parameters for exactly
 360  *                    what memory positions are decremented.
 361  * \param[out] base    Pointer to start of memory.
 362  * \param      offset  Aligned array with offsets to the start of each triplet.
 363  * \param      v0      1st component, subtracted from base[align*offset[i]]
 364  * \param      v1      2nd component, subtracted from base[align*offset[i]+1]
 365  * \param      v2      3rd component, subtracted from base[align*offset[i]+2]
 366  *
 367  * This function can work with both aligned (better performance) and unaligned
 368  * memory. When the align parameter is not a power-of-two (align==3 would be normal
 369  * for packed atomic coordinates) the memory obviously cannot be aligned, and
 370  * we account for this.
 371  * However, in the case where align is a power-of-two, we assume the base pointer
 372  * also has the same alignment, which will enable many platforms to use faster
 373  * aligned memory load/store operations.
 374  * An easy way to think of this is that each triplet of data in memory must be
 375  * aligned to the align parameter you specify when it's a power-of-two.
 376  *
 377  * The offset memory must always be aligned to GMX_SIMD_FINT32_WIDTH, since this
 378  * enables us to use SIMD loads and gather operations on platforms that support it.
 379  *
 380  * \note You should NOT scale offsets before calling this routine; it is
 381  *       done internally by using the alignment template parameter instead.
 382  * \note This routine uses a normal array for the offsets, since we typically
 383  *       load the data from memory. On the architectures we have tested this
 384  *       is faster even when a SIMD integer datatype is present.
 385  * \note To improve performance, this function might use full-SIMD-width
 386  *       unaligned load/store, and subtract 0.0 from the extra elements.
 387  *       This means you need to ensure the memory is padded
 388  *       at the end, so we always can load GMX_SIMD_REAL_WIDTH elements
 389  *       starting at the last offset. If you use the Gromacs aligned memory
 390  *       allocation routines this will always be the case.
 391  */
 392 template <int align>
 393 static inline void gmx_simdcall
 394 transposeScatterDecrU(double  *              base,
 395                       const std::int32_t     offset[],
 396                       SimdDouble             v0,
 397                       SimdDouble             v1,
 398                       SimdDouble             v2)
 399 {
 400     // Offset list must be aligned for SIMD DINT32
 401     assert(std::size_t(offset) % (GMX_SIMD_DINT32_WIDTH*sizeof(std::int32_t)) == 0);
 402
 403     for (std::size_t i = 0; i < v0.simdInternal_.size(); i++)
 404     {
 405         base[align * offset[i]]     -= v0.simdInternal_[i];
 406         base[align * offset[i] + 1] -= v1.simdInternal_[i];
 407         base[align * offset[i] + 2] -= v2.simdInternal_[i];
 408     }
 409 }
 410
 411
 412 /*! \brief Expand each element of double SIMD variable into three identical
 413  *         consecutive elements in three SIMD outputs.
 414  *
 415  * \param      scalar    Floating-point input, e.g. [s0 s1 s2 s3] if width=4.
 416  * \param[out] triplets0 First output, e.g. [s0 s0 s0 s1] if width=4.
 417  * \param[out] triplets1 Second output, e.g. [s1 s1 s2 s2] if width=4.
 418  * \param[out] triplets2 Third output, e.g. [s2 s3 s3 s3] if width=4.
 419  *
 420  * This routine is meant to use for things like scalar-vector multiplication,
 421  * where the vectors are stored in a merged format like [x0 y0 z0 x1 y1 z1 ...],
 422  * while the scalars are stored as [s0 s1 s2...], and the data cannot easily
 423  * be changed to SIMD-friendly layout.
 424  *
 425  * In this case, load 3 full-width SIMD variables from the vector array (This
 426  * will always correspond to GMX_SIMD_DOUBLE_WIDTH triplets),
 427  * load a single full-width variable from the scalar array, and
 428  * call this routine to expand the data. You can then simply multiply the
 429  * first, second and third pair of SIMD variables, and store the three
 430  * results back into a suitable vector-format array.
 431  */
 432 static inline void gmx_simdcall
 433 expandScalarsToTriplets(SimdDouble    scalar,
 434                         SimdDouble *  triplets0,
 435                         SimdDouble *  triplets1,
 436                         SimdDouble *  triplets2)
 437 {
 438     for (std::size_t i = 0; i < scalar.simdInternal_.size(); i++)
 439     {
 440         triplets0->simdInternal_[i] = scalar.simdInternal_[i / 3];
 441         triplets1->simdInternal_[i] = scalar.simdInternal_[(i + scalar.simdInternal_.size()) / 3];
 442         triplets2->simdInternal_[i] = scalar.simdInternal_[(i + 2 * scalar.simdInternal_.size()) / 3];
 443     }
 444 }
 445
 446
 447 /*! \brief Load 4 consecutive doubles from each of GMX_SIMD_DOUBLE_WIDTH offsets
 448  *         specified by a SIMD integer, transpose into 4 SIMD double variables.
 449  *
 450  * \tparam     align  Alignment of the memory from which we read, i.e. distance
 451  *                    (measured in elements, not bytes) between index points.
 452  *                    When this is identical to the number of SIMD variables
 453  *                    (i.e., 4 for this routine) the input data is packed without
 454  *                    padding in memory. See the SIMD parameters for exactly
 455  *                    what memory positions are loaded.
 456  * \param      base   Aligned pointer to the start of the memory.
 457  * \param      offset SIMD integer type with offsets to the start of each triplet.
 458  * \param[out] v0     First component, base[align*offset[i]] for each i.
 459  * \param[out] v1     Second component, base[align*offset[i] + 1] for each i.
 460  * \param[out] v2     Third component, base[align*offset[i] + 2] for each i.
 461  * \param[out] v3     Fourth component, base[align*offset[i] + 3] for each i.
 462  *
 463  * The floating-point memory locations must be aligned, but only to the smaller
 464  * of four elements and the floating-point SIMD width.
 465  *
 466  * \note You should NOT scale offsets before calling this routine; it is
 467  *       done internally by using the alignment template parameter instead.
 468  * \note This is a special routine primarily intended for loading Gromacs
 469  *       table data as efficiently as possible - this is the reason for using
 470  *       a SIMD offset index, since the result of the  real-to-integer conversion
 471  *       is present in a SIMD register just before calling this routine.
 472  */
 473 template <int align>
 474 static inline void gmx_simdcall
 475 gatherLoadBySimdIntTranspose(const double *       base,
 476                              SimdDInt32           offset,
 477                              SimdDouble *         v0,
 478                              SimdDouble *         v1,
 479                              SimdDouble *         v2,
 480                              SimdDouble *         v3)
 481 {
 482     // Base pointer must be aligned to the smaller of 4 elements and double SIMD width
 483     assert(std::size_t(base) % (std::min(GMX_SIMD_DOUBLE_WIDTH, 4)*sizeof(double)) == 0);
 484     // align parameter must also be a multiple of the above alignment requirement
 485     assert(align % std::min(GMX_SIMD_DOUBLE_WIDTH, 4) == 0);
 486
 487     for (std::size_t i = 0; i < v0->simdInternal_.size(); i++)
 488     {
 489         v0->simdInternal_[i] = base[align * offset.simdInternal_[i]];
 490         v1->simdInternal_[i] = base[align * offset.simdInternal_[i] + 1];
 491         v2->simdInternal_[i] = base[align * offset.simdInternal_[i] + 2];
 492         v3->simdInternal_[i] = base[align * offset.simdInternal_[i] + 3];
 493     }
 494 }
 495
 496
 497 /*! \brief Load 2 consecutive doubles from each of GMX_SIMD_DOUBLE_WIDTH offsets
 498  *         (unaligned) specified by SIMD integer, transpose into 2 SIMD doubles.
 499  *
 500  * \tparam     align  Alignment of the memory from which we read, i.e. distance
 501  *                    (measured in elements, not bytes) between index points.
 502  *                    When this is identical to the number of SIMD variables
 503  *                    (i.e., 2 for this routine) the input data is packed without
 504  *                    padding in memory. See the SIMD parameters for exactly
 505  *                    what memory positions are loaded.
 506  * \param      base   Pointer to the start of the memory.
 507  * \param      offset SIMD integer type with offsets to the start of each triplet.
 508  * \param[out] v0     First component, base[align*offset[i]] for each i.
 509  * \param[out] v1     Second component, base[align*offset[i] + 1] for each i.
 510  *
 511  * Since some SIMD architectures cannot handle any unaligned loads, this routine
 512  * is only available if GMX_SIMD_HAVE_GATHER_LOADU_BYSIMDINT_TRANSPOSE is 1.
 513  *
 514  * \note You should NOT scale offsets before calling this routine; it is
 515  *       done internally by using the alignment template parameter instead.
 516  * \note This is a special routine primarily intended for loading Gromacs
 517  *       table data as efficiently as possible - this is the reason for using
 518  *       a SIMD offset index, since the result of the  real-to-integer conversion
 519  *       is present in a SIMD register just before calling this routine.
 520  */
 521 template <int align>
 522 static inline void gmx_simdcall
 523 gatherLoadUBySimdIntTranspose(const double *       base,
 524                               SimdDInt32           offset,
 525                               SimdDouble *         v0,
 526                               SimdDouble *         v1)
 527 {
 528     for (std::size_t i = 0; i < v0->simdInternal_.size(); i++)
 529     {
 530         v0->simdInternal_[i] = base[align * offset.simdInternal_[i]];
 531         v1->simdInternal_[i] = base[align * offset.simdInternal_[i] + 1];
 532     }
 533 }
 534
 535 /*! \brief Load 2 consecutive doubles from each of GMX_SIMD_DOUBLE_WIDTH offsets
 536  *         specified by a SIMD integer, transpose into 2 SIMD double variables.
 537  *
 538  * \tparam     align  Alignment of the memory from which we read, i.e. distance
 539  *                    (measured in elements, not bytes) between index points.
 540  *                    When this is identical to the number of SIMD variables
 541  *                    (i.e., 2 for this routine) the input data is packed without
 542  *                    padding in memory. See the SIMD parameters for exactly
 543  *                    what memory positions are loaded.
 544  * \param      base   Aligned pointer to the start of the memory.
 545  * \param      offset SIMD integer type with offsets to the start of each triplet.
 546  * \param[out] v0     First component, base[align*offset[i]] for each i.
 547  * \param[out] v1     Second component, base[align*offset[i] + 1] for each i.
 548  *
 549  * The floating-point memory locations must be aligned, but only to the smaller
 550  * of two elements and the floating-point SIMD width.
 551  *
 552  * \note You should NOT scale offsets before calling this routine; it is
 553  *       done internally by using the alignment template parameter instead.
 554  * \note This is a special routine primarily intended for loading Gromacs
 555  *       table data as efficiently as possible - this is the reason for using
 556  *       a SIMD offset index, since the result of the  real-to-integer conversion
 557  *       is present in a SIMD register just before calling this routine.
 558  */
 559 template <int align>
 560 static inline void gmx_simdcall
 561 gatherLoadBySimdIntTranspose(const double *       base,
 562                              SimdDInt32           offset,
 563                              SimdDouble *         v0,
 564                              SimdDouble *         v1)
 565 {
 566     // Base pointer must be aligned to the smaller of 2 elements and double SIMD width
 567     assert(std::size_t(base) % (std::min(GMX_SIMD_DOUBLE_WIDTH, 2)*sizeof(double)) == 0);
 568     // align parameter must also be a multiple of the above alignment requirement
 569     assert(align % std::min(GMX_SIMD_DOUBLE_WIDTH, 2) == 0);
 570
 571     for (std::size_t i = 0; i < v0->simdInternal_.size(); i++)
 572     {
 573         v0->simdInternal_[i] = base[align * offset.simdInternal_[i]];
 574         v1->simdInternal_[i] = base[align * offset.simdInternal_[i] + 1];
 575     }
 576 }
 577
 578
 579 /*! \brief Reduce each of four SIMD doubles, add those values to four consecutive
 580  *         doubles in memory, return sum.
 581  *
 582  * \param m   Pointer to memory where four doubles should be incremented
 583  * \param v0  SIMD variable whose sum should be added to m[0]
 584  * \param v1  SIMD variable whose sum should be added to m[1]
 585  * \param v2  SIMD variable whose sum should be added to m[2]
 586  * \param v3  SIMD variable whose sum should be added to m[3]
 587  *
 588  * \return Sum of all elements in the four SIMD variables.
 589  *
 590  * The pointer m must be aligned to the smaller of four elements and the
 591  * floating-point SIMD width.
 592  *
 593  * \note This is a special routine intended for the Gromacs nonbonded kernels.
 594  * It is used in the epilogue of the outer loop, where the variables will
 595  * contain unrolled forces for one outer-loop-particle each, corresponding to
 596  * a single coordinate (i.e, say, four x-coordinate force variables). These
 597  * should be summed and added to the force array in memory. Since we always work
 598  * with contiguous SIMD-layout , we can use efficient aligned loads/stores.
 599  * When calculating the virial, we also need the total sum of all forces for
 600  * each coordinate. This is provided as the return value. For routines that
 601  * do not need these, this extra code will be optimized away completely if you
 602  * just ignore the return value (Checked with gcc-4.9.1 and clang-3.6 for AVX).
 603  */
 604 static inline double gmx_simdcall
 605 reduceIncr4ReturnSum(double *           m,
 606                      SimdDouble         v0,
 607                      SimdDouble         v1,
 608                      SimdDouble         v2,
 609                      SimdDouble         v3)
 610 {
 611     double sum[4]; // Note that the 4 here corresponds to the 4 m-elements, not any SIMD width
 612
 613     // Make sure the memory pointer is aligned to the smaller of 4 elements and double SIMD width
 614     assert(std::size_t(m) % (std::min(GMX_SIMD_DOUBLE_WIDTH, 4)*sizeof(double)) == 0);
 615
 616     sum[0] = reduce(v0);
 617     sum[1] = reduce(v1);
 618     sum[2] = reduce(v2);
 619     sum[3] = reduce(v3);
 620
 621     m[0] += sum[0];
 622     m[1] += sum[1];
 623     m[2] += sum[2];
 624     m[3] += sum[3];
 625
 626     return sum[0] + sum[1] + sum[2] + sum[3];
 627 }
 628
 629
 630 /*! \}
 631  *
 632  * \name Higher-level SIMD utilities accessing partial (half-width) SIMD doubles.
 633  *
 634  * See the single-precision versions for documentation. Since double precision
 635  * is typically half the width of single, this double version is likely only
 636  * useful with 512-bit and larger implementations.
 637  *
 638  * \{
 639  */
 640
 641 /*! \brief Load low & high parts of SIMD double from different locations.
 642  *
 643  * \param m0 Pointer to memory aligned to half SIMD width.
 644  * \param m1 Pointer to memory aligned to half SIMD width.
 645  *
 646  * \return SIMD variable with low part loaded from m0, high from m1.
 647  *
 648  * Available if \ref GMX_SIMD_HAVE_HSIMD_UTIL_DOUBLE is 1.
 649  */
 650 static inline SimdDouble gmx_simdcall
 651 loadDualHsimd(const double *  m0,
 652               const double *  m1)
 653 {
 654     SimdDouble        a;
 655
 656     // Make sure the memory pointers are aligned to half double SIMD width
 657     assert(std::size_t(m0) % (GMX_SIMD_DOUBLE_WIDTH/2*sizeof(double)) == 0);
 658     assert(std::size_t(m1) % (GMX_SIMD_DOUBLE_WIDTH/2*sizeof(double)) == 0);
 659
 660     for (std::size_t i = 0; i < a.simdInternal_.size()/2; i++)
 661     {
 662         a.simdInternal_[i]                            = m0[i];
 663         a.simdInternal_[a.simdInternal_.size()/2 + i] = m1[i];
 664     }
 665     return a;
 666 }
 667
 668 /*! \brief Load half-SIMD-width double data, spread to both halves.
 669  *
 670  * \param m Pointer to memory aligned to half SIMD width.
 671  *
 672  * \return SIMD variable with both halves loaded from m..
 673  *
 674  * Available if \ref GMX_SIMD_HAVE_HSIMD_UTIL_DOUBLE is 1.
 675  */
 676 static inline SimdDouble gmx_simdcall
 677 loadDuplicateHsimd(const double *  m)
 678 {
 679     SimdDouble        a;
 680
 681     // Make sure the memory pointer is aligned
 682     assert(std::size_t(m) % (GMX_SIMD_DOUBLE_WIDTH/2*sizeof(double)) == 0);
 683
 684     for (std::size_t i = 0; i < a.simdInternal_.size()/2; i++)
 685     {
 686         a.simdInternal_[i]                            = m[i];
 687         a.simdInternal_[a.simdInternal_.size()/2 + i] = a.simdInternal_[i];
 688     }
 689     return a;
 690 }
 691
 692 /*! \brief Load two doubles, spread 1st in low half, 2nd in high half.
 693  *
 694  * \param m Pointer to two adjacent double values.
 695  *
 696  * \return SIMD variable where all elements in the low half have been set
 697  *         to m[0], and all elements in high half to m[1].
 698  *
 699  * \note This routine always loads two values and sets the halves separately.
 700  *       If you want to set all elements to the same value, simply use
 701  *       the standard (non-half-SIMD) operations.
 702  *
 703  * Available if \ref GMX_SIMD_HAVE_HSIMD_UTIL_DOUBLE is 1.
 704  */
 705 static inline SimdDouble gmx_simdcall
 706 load1DualHsimd(const double *  m)
 707 {
 708     SimdDouble        a;
 709
 710     for (std::size_t i = 0; i < a.simdInternal_.size()/2; i++)
 711     {
 712         a.simdInternal_[i]                            = m[0];
 713         a.simdInternal_[a.simdInternal_.size()/2 + i] = m[1];
 714     }
 715     return a;
 716 }
 717
 718
 719 /*! \brief Store low & high parts of SIMD double to different locations.
 720  *
 721  * \param m0 Pointer to memory aligned to half SIMD width.
 722  * \param m1 Pointer to memory aligned to half SIMD width.
 723  * \param a  SIMD variable. Low half should be stored to m0, high to m1.
 724  *
 725  * Available if \ref GMX_SIMD_HAVE_HSIMD_UTIL_DOUBLE is 1.
 726  */
 727 static inline void gmx_simdcall
 728 storeDualHsimd(double *           m0,
 729                double *           m1,
 730                SimdDouble         a)
 731 {
 732     // Make sure the memory pointers are aligned to half double SIMD width
 733     assert(std::size_t(m0) % (GMX_SIMD_DOUBLE_WIDTH/2*sizeof(double)) == 0);
 734     assert(std::size_t(m1) % (GMX_SIMD_DOUBLE_WIDTH/2*sizeof(double)) == 0);
 735
 736     for (std::size_t i = 0; i < a.simdInternal_.size()/2; i++)
 737     {
 738         m0[i] = a.simdInternal_[i];
 739         m1[i] = a.simdInternal_[a.simdInternal_.size()/2 + i];
 740     }
 741 }
 742
 743 /*! \brief Add each half of SIMD variable to separate memory adresses
 744  *
 745  * \param m0 Pointer to memory aligned to half SIMD width.
 746  * \param m1 Pointer to memory aligned to half SIMD width.
 747  * \param a  SIMD variable. Lower half will be added to m0, upper half to m1.
 748  *
 749  * The memory must be aligned to half SIMD width.
 750  *
 751  * \note The updated m0 value is written before m1 is read from memory, so
 752  *       the result will be correct even if the memory regions overlap.
 753  *
 754  * Available if \ref GMX_SIMD_HAVE_HSIMD_UTIL_DOUBLE is 1.
 755  */
 756 static inline void gmx_simdcall
 757 incrDualHsimd(double *           m0,
 758               double *           m1,
 759               SimdDouble         a)
 760 {
 761     // Make sure the memory pointer is aligned to half double SIMD width
 762     assert(std::size_t(m0) % (GMX_SIMD_DOUBLE_WIDTH/2*sizeof(double)) == 0);
 763     assert(std::size_t(m1) % (GMX_SIMD_DOUBLE_WIDTH/2*sizeof(double)) == 0);
 764
 765     for (std::size_t i = 0; i < a.simdInternal_.size()/2; i++)
 766     {
 767         m0[i] += a.simdInternal_[i];
 768     }
 769     for (std::size_t i = 0; i < a.simdInternal_.size()/2; i++)
 770     {
 771         m1[i] += a.simdInternal_[a.simdInternal_.size()/2 + i];
 772     }
 773 }
 774
 775 /*! \brief Add the two halves of a SIMD double, subtract the sum from
 776  *         half-SIMD-width consecutive doubles in memory.
 777  *
 778  * \param m  half-width aligned memory, from which sum of the halves will be subtracted.
 779  * \param a  SIMD variable. Upper & lower halves will first be added.
 780  *
 781  * If the SIMD width is 8 and contains [a b c d e f g h], the
 782  * memory will be modified to [m[0]-(a+e) m[1]-(b+f) m[2]-(c+g) m[3]-(d+h)].
 783  *
 784  * The memory must be aligned to half SIMD width.
 785  *
 786  * Available if \ref GMX_SIMD_HAVE_HSIMD_UTIL_DOUBLE is 1.
 787  */
 788 static inline void gmx_simdcall
 789 decrHsimd(double *           m,
 790           SimdDouble         a)
 791 {
 792     // Make sure the memory pointer is aligned to half double SIMD width
 793     assert(std::size_t(m) % (GMX_SIMD_DOUBLE_WIDTH/2*sizeof(double)) == 0);
 794
 795     for (std::size_t i = 0; i < a.simdInternal_.size()/2; i++)
 796     {
 797         m[i] -= a.simdInternal_[i] + a.simdInternal_[a.simdInternal_.size()/2 + i];
 798     }
 799 }
 800
 801
 802 /*! \brief Load 2 consecutive doubles from each of GMX_SIMD_DOUBLE_WIDTH/2 offsets,
 803  *         transpose into SIMD double (low half from base0, high from base1).
 804  *
 805  * \tparam     align  Alignment of the storage, i.e. the distance
 806  *                    (measured in elements, not bytes) between index points.
 807  *                    When this is identical to the number of output components
 808  *                    the data is packed without padding. This must be a
 809  *                    multiple of the alignment to keep all data aligned.
 810  * \param      base0  Pointer to base of first aligned memory
 811  * \param      base1  Pointer to base of second aligned memory
 812  * \param      offset Offset to the start of each pair
 813  * \param[out] v0     1st element in each pair, base0 in low and base1 in high half.
 814  * \param[out] v1     2nd element in each pair, base0 in low and base1 in high half.
 815  *
 816  * The offset array should be of half the SIMD width length, so it corresponds
 817  * to the half-SIMD-register operations. This also means it must be aligned
 818  * to half the integer SIMD width (i.e., GMX_SIMD_DINT32_WIDTH/2).
 819  *
 820  * The floating-point memory locations must be aligned, but only to the smaller
 821  * of two elements and the floating-point SIMD width.
 822  *
 823  * This routine is primarily designed to load nonbonded parameters in the
 824  * kernels. It is the equivalent of the full-width routine
 825  * gatherLoadTranspose(), but just
 826  * as the other hsimd routines it will pick half-SIMD-width data from base0
 827  * and put in the lower half, while the upper half comes from base1.
 828  *
 829  * For an example, assume the SIMD width is 8, align is 2, that
 830  * base0 is [A0 A1 B0 B1 C0 C1 D0 D1 ...], and base1 [E0 E1 F0 F1 G0 G1 H0 H1...].
 831  *
 832  * Then we will get v0 as [A0 B0 C0 D0 E0 F0 G0 H0] and v1 as [A1 B1 C1 D1 E1 F1 G1 H1].
 833  *
 834  * Available if \ref GMX_SIMD_HAVE_HSIMD_UTIL_DOUBLE is 1.
 835  */
 836 template <int align>
 837 static inline void gmx_simdcall
 838 gatherLoadTransposeHsimd(const double  *       base0,
 839                          const double  *       base1,
 840                          std::int32_t          offset[],
 841                          SimdDouble *          v0,
 842                          SimdDouble *          v1)
 843 {
 844     // Offset list must be aligned for half SIMD DINT32 width
 845     assert(std::size_t(offset) % (GMX_SIMD_DINT32_WIDTH/2*sizeof(std::int32_t)) == 0);
 846     // base pointers must be aligned to the smaller of 2 elements and double SIMD width
 847     assert(std::size_t(base0) % (std::min(GMX_SIMD_DOUBLE_WIDTH, 2)*sizeof(double)) == 0);
 848     assert(std::size_t(base1) % (std::min(GMX_SIMD_DOUBLE_WIDTH, 2)*sizeof(double)) == 0);
 849     // alignment parameter must be also be multiple of the above required alignment
 850     assert(align % std::min(GMX_SIMD_DOUBLE_WIDTH, 2) == 0);
 851
 852     for (std::size_t i = 0; i < v0->simdInternal_.size()/2; i++)
 853     {
 854         v0->simdInternal_[i] = base0[align * offset[i]];
 855         v1->simdInternal_[i] = base0[align * offset[i] + 1];
 856         v0->simdInternal_[v0->simdInternal_.size()/2 + i] = base1[align * offset[i]];
 857         v1->simdInternal_[v1->simdInternal_.size()/2 + i] = base1[align * offset[i] + 1];
 858     }
 859 }
 860
 861
 862 /*! \brief Reduce the 4 half-SIMD-with doubles in 2 SIMD variables (sum halves),
 863  *         increment four consecutive doubles in memory, return sum.
 864  *
 865  * \param m    Pointer to memory where the four values should be incremented
 866  * \param v0   Variable whose half-SIMD sums should be added to m[0]/m[1], respectively.
 867  * \param v1   Variable whose half-SIMD sums should be added to m[2]/m[3], respectively.
 868  *
 869  * \return Sum of all elements in the four SIMD variables.
 870  *
 871  * The pointer m must be aligned, but only to the smaller
 872  * of four elements and the floating-point SIMD width.
 873  *
 874  * \note This is the half-SIMD-width version of
 875  *      reduceIncr4ReturnSum(). The only difference is that the
 876  *      four half-SIMD inputs needed are present in the low/high halves of the
 877  *      two SIMD arguments.
 878  *
 879  * Available if \ref GMX_SIMD_HAVE_HSIMD_UTIL_DOUBLE is 1.
 880  */
 881 static inline double gmx_simdcall
 882 reduceIncr4ReturnSumHsimd(double *           m,
 883                           SimdDouble         v0,
 884                           SimdDouble         v1)
 885 {
 886     // The 4 here corresponds to the 4 elements in memory, not any SIMD width
 887     double sum[4] = { 0.0, 0.0, 0.0, 0.0 };
 888
 889     for (std::size_t i = 0; i < v0.simdInternal_.size()/2; i++)
 890     {
 891         sum[0] += v0.simdInternal_[i];
 892         sum[1] += v0.simdInternal_[v0.simdInternal_.size()/2 + i];
 893         sum[2] += v1.simdInternal_[i];
 894         sum[3] += v1.simdInternal_[v1.simdInternal_.size()/2 + i];
 895     }
 896
 897     // Make sure the memory pointer is aligned to the smaller of 4 elements and double SIMD width
 898     assert(std::size_t(m) % (std::min(GMX_SIMD_DOUBLE_WIDTH, 4)*sizeof(double)) == 0);
 899
 900     m[0] += sum[0];
 901     m[1] += sum[1];
 902     m[2] += sum[2];
 903     m[3] += sum[3];
 904
 905     return sum[0] + sum[1] + sum[2] + sum[3];
 906 }
 907
 908 /*! \} */
 909
 910 /*! \} */
 911 /*! \endcond */
 912
 913 }      // namespace gmx
 914
 915 #endif // GMX_SIMD_IMPL_REFERENCE_UTIL_DOUBLE_H