/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 * Copyright (c) 2001-2004, The GROMACS development team.
 * Copyright (c) 2013,2014,2015,2017,2018 by the GROMACS development team.
 * Copyright (c) 2019,2020, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
#include "gmxpre.h"

#include "pme_gather.h"

#include "gromacs/math/vec.h"
#include "gromacs/simd/simd.h"
#include "gromacs/utility/basedefinitions.h"
#include "gromacs/utility/gmxassert.h"
#include "gromacs/utility/smalloc.h"
#include "gromacs/utility/typetraits.h"

#include "pme_internal.h"
#include "pme_simd.h"
#include "pme_spline_work.h"

using namespace gmx; // TODO: Remove when this file is moved into gmx namespace
/* Spline function. Goals: 1) Force the compiler to instantiate the function
   separately for each compile-time value of order and once more for any
   possible (runtime) value. 2) Allow overloading for specific compile-time values.
   The Int template argument can be either int (runtime) or an object of type
   integral_constant<int, N> (compile-time). Common runtime values can be
   converted to compile-time values with a switch statement. For each
   compile-time value the compiler is required to instantiate the function
   separately. The function can be overloaded for specific compile-time values
   using integral_constant<int, N> where N is either a specific value or an
   enable_if constrained non-type template parameter. The most specific overload
   (specific value > int template parameter > general function) is called. Inside
   the function the order argument can be used as a regular int because
   integral_constant has a proper conversion.

   SIMD do_fspline() template functions will be used for PME order 4 and 5
   when the SIMD module has support for SIMD4 for the architecture used.
   For SIMD4 without unaligned load/store support:
       order 4 and 5 use the order 4+5 aligned SIMD template
   For SIMD4 with unaligned load/store support:
       order 4 uses the order 4 unaligned SIMD template
       order 5 uses the order 4+5 aligned SIMD template
 */
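
/* A minimal sketch of the dispatch pattern described above, kept out of the
 * build with #if 0. The names sumOrder() and dispatchSum() are illustrative
 * only, not part of GROMACS: a switch converts common runtime orders into
 * integral_constant values, so the compiler instantiates (and can fully
 * unroll) the loop separately for each order, with a generic runtime
 * fallback for uncommon values.
 */
#if 0
#    include <type_traits>

template<typename Int>
static double sumOrder(Int order, const double* v)
{
    /* order converts implicitly to int whether it is a plain int or an
     * integral_constant<int, N>; in the latter case the trip count is a
     * compile-time constant the optimizer can unroll. */
    double sum = 0;
    for (int i = 0; i < order; i++)
    {
        sum += v[i];
    }
    return sum;
}

static double dispatchSum(const int order, const double* v)
{
    switch (order)
    {
        case 4: return sumOrder(std::integral_constant<int, 4>(), v);
        case 5: return sumOrder(std::integral_constant<int, 5>(), v);
        default: return sumOrder(order, v); /* generic runtime instantiation */
    }
}
#endif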

struct do_fspline
{
    do_fspline(const gmx_pme_t*                 pme,
               const real* gmx_restrict         grid,
               const PmeAtomComm* gmx_restrict  atc,
               const splinedata_t* gmx_restrict spline,
               int                              nn) :
        pme(pme), grid(grid), atc(atc), spline(spline), nn(nn)
    {
    }

    template<typename Int>
    RVec operator()(Int order) const
    {
        static_assert(isIntegralConstant<Int, int>::value || std::is_same<Int, int>::value,
                      "'order' needs to be either of type integral_constant<int,N> or int.");

        const int norder = nn * order;
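
        /* The spline coefficient arrays hold order entries per spline index
         * and dimension, so norder = nn*order is the start of this atom's
         * coefficients in each of the six arrays below. */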
        /* Pointer arithmetic alert, next six statements */
        const real* const gmx_restrict thx  = spline->theta.coefficients[XX] + norder;
        const real* const gmx_restrict thy  = spline->theta.coefficients[YY] + norder;
        const real* const gmx_restrict thz  = spline->theta.coefficients[ZZ] + norder;
        const real* const gmx_restrict dthx = spline->dtheta.coefficients[XX] + norder;
        const real* const gmx_restrict dthy = spline->dtheta.coefficients[YY] + norder;
        const real* const gmx_restrict dthz = spline->dtheta.coefficients[ZZ] + norder;

        RVec f(0, 0, 0);

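        /* Accumulate the gradient of the interpolated potential in grid
         * coordinates: each component combines the derivative spline dtheta
         * along its own dimension with theta along the other two. The sign
         * flip and the conversion to Cartesian forces are done by the
         * caller, gather_f_bsplines(). */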
        for (int ithx = 0; (ithx < order); ithx++)
        {
            const int  index_x = (idxX + ithx) * gridNY * gridNZ;
            const real tx      = thx[ithx];
            const real dx      = dthx[ithx];

            for (int ithy = 0; (ithy < order); ithy++)
            {
                const int  index_xy = index_x + (idxY + ithy) * gridNZ;
                const real ty       = thy[ithy];
                const real dy       = dthy[ithy];
                real       fxy1 = 0, fz1 = 0;

                for (int ithz = 0; (ithz < order); ithz++)
                {
                    const real gval = grid[index_xy + (idxZ + ithz)];
                    fxy1 += thz[ithz] * gval;
                    fz1 += dthz[ithz] * gval;
                }

                f[XX] += dx * ty * fxy1;
                f[YY] += tx * dy * fxy1;
                f[ZZ] += tx * ty * fz1;
            }
        }

        return f;
    }

    // TODO: Consider always having at least a dummy implementation of Simd (enough for the
    // first phase of two-phase lookup) and then using enable_if instead of #ifdef.
#if PME_4NSIMD_GATHER
    /* Gather for one charge with pme_order=4 with unaligned SIMD4 load+store.
     * Uses 4N SIMD where N is SIMD_WIDTH/4 to operate on all of z and N of y.
     * This code does not assume any memory alignment for the grid.
     */
    RVec operator()(std::integral_constant<int, 4> /*unused*/) const
    {
        const int norder = nn * 4;
        /* Pointer arithmetic alert, next six statements */
        const real* const gmx_restrict thx  = spline->theta.coefficients[XX] + norder;
        const real* const gmx_restrict thy  = spline->theta.coefficients[YY] + norder;
        const real* const gmx_restrict thz  = spline->theta.coefficients[ZZ] + norder;
        const real* const gmx_restrict dthx = spline->dtheta.coefficients[XX] + norder;
        const real* const gmx_restrict dthy = spline->dtheta.coefficients[YY] + norder;
        const real* const gmx_restrict dthz = spline->dtheta.coefficients[ZZ] + norder;

        Simd4NReal fx_S = setZero();
        Simd4NReal fy_S = setZero();
        Simd4NReal fz_S = setZero();

        /* With order 4 the z-spline is actually aligned */
        const Simd4NReal tz_S = load4DuplicateN(thz);
        const Simd4NReal dz_S = load4DuplicateN(dthz);

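        /* tz_S and dz_S now hold the 4 z-coefficients duplicated N times,
         * so each 4N-wide operation below combines the full z-stencil for
         * N consecutive y grid lines at once. */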
        for (int ithx = 0; ithx < 4; ithx++)
        {
            const int        index_x = (idxX + ithx) * gridNY * gridNZ;
            const Simd4NReal tx_S    = Simd4NReal(thx[ithx]);
            const Simd4NReal dx_S    = Simd4NReal(dthx[ithx]);

            for (int ithy = 0; ithy < 4; ithy += GMX_SIMD4N_REAL_WIDTH / 4)
            {
                const int index_xy = index_x + (idxY + ithy) * gridNZ;

                const Simd4NReal ty_S = loadUNDuplicate4(thy + ithy);
                const Simd4NReal dy_S = loadUNDuplicate4(dthy + ithy);

                const Simd4NReal gval_S = loadU4NOffset(grid + index_xy + idxZ, gridNZ);

                const Simd4NReal fxy1_S = tz_S * gval_S;
                const Simd4NReal fz1_S  = dz_S * gval_S;

                fx_S = fma(dx_S * ty_S, fxy1_S, fx_S);
                fy_S = fma(tx_S * dy_S, fxy1_S, fy_S);
                fz_S = fma(tx_S * ty_S, fz1_S, fz_S);
            }
        }

        return { reduce(fx_S), reduce(fy_S), reduce(fz_S) };
    }
#endif

#ifdef PME_SIMD4_SPREAD_GATHER
    /* Load order elements from unaligned memory into two 4-wide SIMD registers */
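    /* With pme_order <= 5 and offset <= 3 the shifted data spans at most
     * 8 reals, so the two 4-wide registers always cover it. */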
    template<int order>
    static inline void loadOrderU(const real*                        data,
                                  std::integral_constant<int, order> /*unused*/,
                                  int                                offset,
                                  Simd4Real*                         S0,
                                  Simd4Real*                         S1)
    {
#    ifdef PME_SIMD4_UNALIGNED
        *S0 = load4U(data - offset);
        *S1 = load4U(data - offset + 4);
#    else
        alignas(GMX_SIMD_ALIGNMENT) real buf_aligned[GMX_SIMD4_WIDTH * 2];
        /* Copy data to an aligned buffer */
        for (int i = 0; i < order; i++)
        {
            buf_aligned[offset + i] = data[i];
        }
        *S0 = load4(buf_aligned);
        *S1 = load4(buf_aligned + 4);
#    endif
    }
#endif

#ifdef PME_SIMD4_SPREAD_GATHER
    /* This code assumes that the grid is allocated 4-real aligned
     * and that pme->pmegrid_nz is a multiple of 4.
     * This code supports pme_order <= 5.
     */
    template<int Order>
    std::enable_if_t<Order == 4 || Order == 5, RVec> operator()(std::integral_constant<int, Order> order) const
    {
        const int norder = nn * order;
        GMX_ASSERT(gridNZ % 4 == 0,
                   "For aligned SIMD4 operations the grid size has to be padded up to a multiple "
                   "of 4");
        /* Pointer arithmetic alert, next six statements */
        const real* const gmx_restrict thx  = spline->theta.coefficients[XX] + norder;
        const real* const gmx_restrict thy  = spline->theta.coefficients[YY] + norder;
        const real* const gmx_restrict thz  = spline->theta.coefficients[ZZ] + norder;
        const real* const gmx_restrict dthx = spline->dtheta.coefficients[XX] + norder;
        const real* const gmx_restrict dthy = spline->dtheta.coefficients[YY] + norder;
        const real* const gmx_restrict dthz = spline->dtheta.coefficients[ZZ] + norder;

        struct pme_spline_work* const work = pme->spline_work;

        const int offset = idxZ & 3;

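        /* offset is the misalignment of idxZ within an aligned 4-real chunk.
         * The z loads below start offset elements earlier, at an aligned
         * address, and the masks zero out the positions that fall outside
         * the order-wide z-stencil. */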
        Simd4Real fx_S = setZero();
        Simd4Real fy_S = setZero();
        Simd4Real fz_S = setZero();

        Simd4Real tz_S0, tz_S1, dz_S0, dz_S1;
        loadOrderU(thz, order, offset, &tz_S0, &tz_S1);
        loadOrderU(dthz, order, offset, &dz_S0, &dz_S1);

        tz_S0 = selectByMask(tz_S0, work->mask_S0[offset]);
        dz_S0 = selectByMask(dz_S0, work->mask_S0[offset]);
        tz_S1 = selectByMask(tz_S1, work->mask_S1[offset]);
        dz_S1 = selectByMask(dz_S1, work->mask_S1[offset]);

        for (int ithx = 0; (ithx < order); ithx++)
        {
            const int       index_x = (idxX + ithx) * gridNY * gridNZ;
            const Simd4Real tx_S    = Simd4Real(thx[ithx]);
            const Simd4Real dx_S    = Simd4Real(dthx[ithx]);

            for (int ithy = 0; (ithy < order); ithy++)
            {
                const int       index_xy = index_x + (idxY + ithy) * gridNZ;
                const Simd4Real ty_S     = Simd4Real(thy[ithy]);
                const Simd4Real dy_S     = Simd4Real(dthy[ithy]);

                const Simd4Real gval_S0 = load4(grid + index_xy + idxZ - offset);
                const Simd4Real gval_S1 = load4(grid + index_xy + idxZ - offset + 4);

                const Simd4Real fxy1_S0 = tz_S0 * gval_S0;
                const Simd4Real fz1_S0  = dz_S0 * gval_S0;
                const Simd4Real fxy1_S1 = tz_S1 * gval_S1;
                const Simd4Real fz1_S1  = dz_S1 * gval_S1;

                const Simd4Real fxy1_S = fxy1_S0 + fxy1_S1;
                const Simd4Real fz1_S  = fz1_S0 + fz1_S1;

                fx_S = fma(dx_S * ty_S, fxy1_S, fx_S);
                fy_S = fma(tx_S * dy_S, fxy1_S, fy_S);
                fz_S = fma(tx_S * ty_S, fz1_S, fz_S);
            }
        }

        return { reduce(fx_S), reduce(fy_S), reduce(fz_S) };
    }
#endif
private:
    const gmx_pme_t* const                 pme;
    const real* const gmx_restrict         grid;
    const PmeAtomComm* const gmx_restrict  atc;
    const splinedata_t* const gmx_restrict spline;
    const int                              nn;

    const int gridNY = pme->pmegrid_ny;
    const int gridNZ = pme->pmegrid_nz;

    const int* const idxptr = atc->idx[spline->ind[nn]];
    const int        idxX   = idxptr[XX];
    const int        idxY   = idxptr[YY];
    const int        idxZ   = idxptr[ZZ];
};

void gather_f_bsplines(const gmx_pme_t*    pme,
                       const real*         grid,
                       gmx_bool            bClearF,
                       const PmeAtomComm*  atc,
                       const splinedata_t* spline,
                       real                scale)
{
    /* sum forces for local particles */

    const int order = pme->pme_order;
    const int nx    = pme->nkx;
    const int ny    = pme->nky;
    const int nz    = pme->nkz;

    const real rxx = pme->recipbox[XX][XX];
    const real ryx = pme->recipbox[YY][XX];
    const real ryy = pme->recipbox[YY][YY];
    const real rzx = pme->recipbox[ZZ][XX];
    const real rzy = pme->recipbox[ZZ][YY];
    const real rzz = pme->recipbox[ZZ][ZZ];

    /* Extract the buffer for force output */
    rvec* gmx_restrict force = as_rvec_array(atc->f.data());

    /* Note that unrolling this loop by templating this function on order
     * deteriorates performance significantly with gcc5/6/7.
     */
    for (int nn = 0; nn < spline->n; nn++)
    {
        const int  n           = spline->ind[nn];
        const real coefficient = scale * atc->coefficient[n];

        if (bClearF)
        {
            force[n][XX] = 0;
            force[n][YY] = 0;
            force[n][ZZ] = 0;
        }
        if (coefficient != 0)
        {
            RVec       f;
            const auto spline_func = do_fspline(pme, grid, atc, spline, nn);

            switch (order)
            {
                case 4: f = spline_func(std::integral_constant<int, 4>()); break;
                case 5: f = spline_func(std::integral_constant<int, 5>()); break;
                default: f = spline_func(order); break;
            }

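            /* Chain rule: convert the gradient from grid coordinates to a
             * Cartesian force using the grid dimensions and the lower
             * triangular reciprocal box. */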
            force[n][XX] += -coefficient * (f[XX] * nx * rxx);
            force[n][YY] += -coefficient * (f[XX] * nx * ryx + f[YY] * ny * ryy);
            force[n][ZZ] += -coefficient * (f[XX] * nx * rzx + f[YY] * ny * rzy + f[ZZ] * nz * rzz);
        }
    }
    /* Since the energy and not the forces are interpolated,
     * the net force might not be exactly zero.
     * This can be solved by also interpolating F, but
     * that comes at a cost.
     * A better hack is to remove the net force every
     * step, but that must be done at a higher level,
     * since this routine doesn't see all atoms when running
     * in parallel. How important this is remains unclear. EL 990726
     */
}

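/* Interpolates the potential at each local atom from the grid using the
 * same B-spline weights as used for spreading, and returns the sum over
 * atoms of coefficient * potential. */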
real gather_energy_bsplines(gmx_pme_t* pme, const real* grid, PmeAtomComm* atc)
{
    splinedata_t* spline;
    int           ithx, ithy, ithz, i0, j0, k0;
    int           index_x, index_xy;
    int*          idxptr;
    real          energy, pot, tx, ty, coefficient, gval;
    real *        thx, *thy, *thz;
    int           norder;
    int           order;

    spline = &atc->spline[0];

    order = pme->pme_order;

    energy = 0;
    for (int n = 0; n < atc->numAtoms(); n++)
    {
        coefficient = atc->coefficient[n];

        if (coefficient != 0)
        {
            idxptr = atc->idx[n];
            norder = n * order;

            i0 = idxptr[XX];
            j0 = idxptr[YY];
            k0 = idxptr[ZZ];

            /* Pointer arithmetic alert, next three statements */
            thx = spline->theta.coefficients[XX] + norder;
            thy = spline->theta.coefficients[YY] + norder;
            thz = spline->theta.coefficients[ZZ] + norder;

            pot = 0;
            for (ithx = 0; (ithx < order); ithx++)
            {
                index_x = (i0 + ithx) * pme->pmegrid_ny * pme->pmegrid_nz;
                tx      = thx[ithx];

                for (ithy = 0; (ithy < order); ithy++)
                {
                    index_xy = index_x + (j0 + ithy) * pme->pmegrid_nz;
                    ty       = thy[ithy];

                    for (ithz = 0; (ithz < order); ithz++)
                    {
                        gval = grid[index_xy + (k0 + ithz)];
                        pot += tx * ty * thz[ithz] * gval;
                    }
                }
            }

            energy += pot * coefficient;
        }
    }

    return energy;
}