src/gromacs/ewald/pme_gather.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   5  * Copyright (c) 2001-2004, The GROMACS development team.
   6  * Copyright (c) 2013,2014,2015,2017,2018,2019, by the GROMACS development team, led by
   7  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   8  * and including many others, as listed in the AUTHORS file in the
   9  * top-level source directory and at http://www.gromacs.org.
  10  *
  11  * GROMACS is free software; you can redistribute it and/or
  12  * modify it under the terms of the GNU Lesser General Public License
  13  * as published by the Free Software Foundation; either version 2.1
  14  * of the License, or (at your option) any later version.
  15  *
  16  * GROMACS is distributed in the hope that it will be useful,
  17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  19  * Lesser General Public License for more details.
  20  *
  21  * You should have received a copy of the GNU Lesser General Public
  22  * License along with GROMACS; if not, see
  23  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  24  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  25  *
  26  * If you want to redistribute modifications to GROMACS, please
  27  * consider that scientific software is very special. Version
  28  * control is crucial - bugs must be traceable. We will be happy to
  29  * consider code for inclusion in the official distribution, but
  30  * derived work must not be called official GROMACS. Details are found
  31  * in the README & COPYING files - if they are missing, get the
  32  * official version at http://www.gromacs.org.
  33  *
  34  * To help us fund GROMACS development, we humbly ask that you cite
  35  * the research papers on the package. Check out http://www.gromacs.org.
  36  */
  37
  38 #include "gmxpre.h"
  39
  40 #include "pme_gather.h"
  41
  42 #include "gromacs/math/vec.h"
  43 #include "gromacs/simd/simd.h"
  44 #include "gromacs/utility/basedefinitions.h"
  45 #include "gromacs/utility/gmxassert.h"
  46 #include "gromacs/utility/smalloc.h"
  47 #include "gromacs/utility/typetraits.h"
  48
  49 #include "pme_internal.h"
  50 #include "pme_simd.h"
  51 #include "pme_spline_work.h"
  52
  53 using namespace gmx; // TODO: Remove when this file is moved into gmx namespace
  54
  55 /* Spline function. Goals: 1) Force compiler to instantiate function separately
  56    for each compile-time value of order and once more for any possible (runtime)
  57    value. 2) Allow overloading for specific compile-time values.
  58    The Int template argument can be either int (runtime) or an object of type
  59    integral_constant<int, N> (compile-time). Common runtime values can be
  60    converted to compile-time values with a switch statement. For the compile
  61    value the compiler is required to instantiate the function separately for
  62    each value. The function can be overloaded for specific compile-time values
  63    using integral_constant<int, N> where N is either a specific value or an
  64    enable_if constrained non-type template parameter. The most specific overload
  65    (specific value > int template parameter > general function) is called. Inside
  66    the function the order argument can be used as regular int because
  67    integral_constant has a proper conversion.
  68
  69    SIMD do_fspline() template funtions will be used for PME order 4 and 5
  70    when the SIMD module has support for SIMD4 for the architecture used.
  71    For SIMD4 without unaligned load/store support:
  72      order 4 and 5 use the order 4+5 aligned SIMD template
  73    For SIMD4 with unaligned load/store support:
  74      order 4 uses the order 4 unaligned SIMD template
  75      order 5 uses the order 4+5 aligned SIMD template
  76  */
  77 struct do_fspline
  78 {
  79     do_fspline (
  80             const gmx_pme_t *                   pme,
  81             const real * gmx_restrict           grid,
  82             const PmeAtomComm * gmx_restrict    atc,
  83             const splinedata_t * gmx_restrict   spline,
  84             int                                 nn)
  85         : pme(pme), grid(grid), atc(atc), spline(spline), nn(nn) {}
  86
  87     template <typename Int>
  88     RVec operator()(Int order) const
  89     {
  90         static_assert(isIntegralConstant<Int, int>::value || std::is_same<Int, int>::value,
  91                       "'order' needs to be either of type integral_constant<int,N> or int.");
  92
  93         const int  norder = nn*order;
  94
  95         /* Pointer arithmetic alert, next six statements */
  96         const real *const gmx_restrict thx  = spline->theta.coefficients[XX] + norder;
  97         const real *const gmx_restrict thy  = spline->theta.coefficients[YY] + norder;
  98         const real *const gmx_restrict thz  = spline->theta.coefficients[ZZ] + norder;
  99         const real *const gmx_restrict dthx = spline->dtheta.coefficients[XX] + norder;
 100         const real *const gmx_restrict dthy = spline->dtheta.coefficients[YY] + norder;
 101         const real *const gmx_restrict dthz = spline->dtheta.coefficients[ZZ] + norder;
 102
 103         RVec                           f(0, 0, 0);
 104
 105         for (int ithx = 0; (ithx < order); ithx++)
 106         {
 107             const int  index_x = (idxX + ithx)*gridNY*gridNZ;
 108             const real tx      = thx[ithx];
 109             const real dx      = dthx[ithx];
 110
 111             for (int ithy = 0; (ithy < order); ithy++)
 112             {
 113                 const int  index_xy = index_x + (idxY + ithy)*gridNZ;
 114                 const real ty       = thy[ithy];
 115                 const real dy       = dthy[ithy];
 116                 real       fxy1     = 0, fz1 = 0;
 117
 118                 for (int ithz = 0; (ithz < order); ithz++)
 119                 {
 120                     const real gval = grid[index_xy + (idxZ + ithz)];
 121                     fxy1 += thz[ithz]*gval;
 122                     fz1  += dthz[ithz]*gval;
 123                 }
 124                 f[XX] += dx*ty*fxy1;
 125                 f[YY] += tx*dy*fxy1;
 126                 f[ZZ] += tx*ty*fz1;
 127             }
 128         }
 129
 130         return f;
 131     }
 132
 133 //TODO: Consider always have at least a dummy implementation of Simd (enough for first phase of two-phase lookup) and then use enable_if instead of #ifdef
 134 #if PME_4NSIMD_GATHER
 135 /* Gather for one charge with pme_order=4 with unaligned SIMD4 load+store.
 136  * Uses 4N SIMD where N is SIMD_WIDTH/4 to operate on all of z and N of y.
 137  * This code does not assume any memory alignment for the grid.
 138  */
 139     RVec
 140     operator()(std::integral_constant<int, 4> /*unused*/) const
 141     {
 142         const int                      norder = nn*4;
 143         /* Pointer arithmetic alert, next six statements */
 144         const real *const gmx_restrict thx  = spline->theta.coefficients[XX] + norder;
 145         const real *const gmx_restrict thy  = spline->theta.coefficients[YY] + norder;
 146         const real *const gmx_restrict thz  = spline->theta.coefficients[ZZ] + norder;
 147         const real *const gmx_restrict dthx = spline->dtheta.coefficients[XX] + norder;
 148         const real *const gmx_restrict dthy = spline->dtheta.coefficients[YY] + norder;
 149         const real *const gmx_restrict dthz = spline->dtheta.coefficients[ZZ] + norder;
 150
 151         Simd4NReal                     fx_S = setZero();
 152         Simd4NReal                     fy_S = setZero();
 153         Simd4NReal                     fz_S = setZero();
 154
 155         /* With order 4 the z-spline is actually aligned */
 156         const Simd4NReal tz_S = load4DuplicateN(thz);
 157         const Simd4NReal dz_S = load4DuplicateN(dthz);
 158
 159         for (int ithx = 0; ithx < 4; ithx++)
 160         {
 161             const int        index_x = (idxX + ithx)*gridNY*gridNZ;
 162             const Simd4NReal tx_S    = Simd4NReal(thx[ithx]);
 163             const Simd4NReal dx_S    = Simd4NReal(dthx[ithx]);
 164
 165             for (int ithy = 0; ithy < 4; ithy += GMX_SIMD4N_REAL_WIDTH/4)
 166             {
 167                 const int        index_xy = index_x + (idxY+ithy)*gridNZ;
 168
 169                 const Simd4NReal ty_S = loadUNDuplicate4(thy +ithy);
 170                 const Simd4NReal dy_S = loadUNDuplicate4(dthy+ithy);
 171
 172                 const Simd4NReal gval_S = loadU4NOffset(grid+index_xy+idxZ, gridNZ);
 173
 174
 175                 const Simd4NReal fxy1_S = tz_S * gval_S;
 176                 const Simd4NReal fz1_S  = dz_S * gval_S;
 177
 178                 fx_S = fma(dx_S * ty_S, fxy1_S, fx_S);
 179                 fy_S = fma(tx_S * dy_S, fxy1_S, fy_S);
 180                 fz_S = fma(tx_S * ty_S, fz1_S, fz_S);
 181             }
 182         }
 183
 184         return {
 185                    reduce(fx_S), reduce(fy_S), reduce(fz_S)
 186         };
 187     }
 188 #endif
 189
 190 #ifdef PME_SIMD4_SPREAD_GATHER
 191 /* Load order elements from unaligned memory into two 4-wide SIMD */
 192     template<int order>
 193     static inline void loadOrderU(const real* data, std::integral_constant<int, order> /*unused*/,
 194                                   int offset, Simd4Real* S0, Simd4Real* S1)
 195     {
 196 #ifdef PME_SIMD4_UNALIGNED
 197         *S0 = load4U(data-offset);
 198         *S1 = load4U(data-offset+4);
 199 #else
 200         alignas(GMX_SIMD_ALIGNMENT) real  buf_aligned[GMX_SIMD4_WIDTH*2];
 201         /* Copy data to an aligned buffer */
 202         for (int i = 0; i < order; i++)
 203         {
 204             buf_aligned[offset+i]  = data[i];
 205         }
 206         *S0 = load4(buf_aligned);
 207         *S1 = load4(buf_aligned+4);
 208 #endif
 209     }
 210 #endif
 211
 212 #ifdef PME_SIMD4_SPREAD_GATHER
 213 /* This code assumes that the grid is allocated 4-real aligned
 214  * and that pme->pmegrid_nz is a multiple of 4.
 215  * This code supports pme_order <= 5.
 216  */
 217     template <int Order>
 218     std::enable_if_t<Order == 4 || Order == 5, RVec>
 219     operator()(std::integral_constant<int, Order>  order) const
 220     {
 221         const int                     norder = nn*order;
 222         GMX_ASSERT(gridNZ % 4 == 0, "For aligned SIMD4 operations the grid size has to be padded up to a multiple of 4");
 223         /* Pointer arithmetic alert, next six statements */
 224         const real *const gmx_restrict thx  = spline->theta.coefficients[XX] + norder;
 225         const real *const gmx_restrict thy  = spline->theta.coefficients[YY] + norder;
 226         const real *const gmx_restrict thz  = spline->theta.coefficients[ZZ] + norder;
 227         const real *const gmx_restrict dthx = spline->dtheta.coefficients[XX] + norder;
 228         const real *const gmx_restrict dthy = spline->dtheta.coefficients[YY] + norder;
 229         const real *const gmx_restrict dthz = spline->dtheta.coefficients[ZZ] + norder;
 230
 231         struct pme_spline_work *const  work = pme->spline_work;
 232
 233         const int                      offset = idxZ & 3;
 234
 235         Simd4Real                      fx_S = setZero();
 236         Simd4Real                      fy_S = setZero();
 237         Simd4Real                      fz_S = setZero();
 238
 239         Simd4Real                      tz_S0, tz_S1, dz_S0, dz_S1;
 240         loadOrderU(thz,  order, offset, &tz_S0, &tz_S1);
 241         loadOrderU(dthz, order, offset, &dz_S0, &dz_S1);
 242
 243         tz_S0 = selectByMask(tz_S0, work->mask_S0[offset]);
 244         dz_S0 = selectByMask(dz_S0, work->mask_S0[offset]);
 245         tz_S1 = selectByMask(tz_S1, work->mask_S1[offset]);
 246         dz_S1 = selectByMask(dz_S1, work->mask_S1[offset]);
 247
 248         for (int ithx = 0; (ithx < order); ithx++)
 249         {
 250             const int       index_x  = (idxX + ithx)*gridNY*gridNZ;
 251             const Simd4Real tx_S     = Simd4Real(thx[ithx]);
 252             const Simd4Real dx_S     = Simd4Real(dthx[ithx]);
 253
 254             for (int ithy = 0; (ithy < order); ithy++)
 255             {
 256                 const int       index_xy = index_x + (idxY + ithy)*gridNZ;
 257                 const Simd4Real ty_S     = Simd4Real(thy[ithy]);
 258                 const Simd4Real dy_S     = Simd4Real(dthy[ithy]);
 259
 260                 const Simd4Real gval_S0 = load4(grid + index_xy + idxZ - offset);
 261                 const Simd4Real gval_S1 = load4(grid + index_xy + idxZ - offset + 4);
 262
 263                 const Simd4Real fxy1_S0 = tz_S0 * gval_S0;
 264                 const Simd4Real fz1_S0  = dz_S0 * gval_S0;
 265                 const Simd4Real fxy1_S1 = tz_S1 * gval_S1;
 266                 const Simd4Real fz1_S1  = dz_S1 * gval_S1;
 267
 268                 const Simd4Real fxy1_S = fxy1_S0 + fxy1_S1;
 269                 const Simd4Real fz1_S  = fz1_S0 + fz1_S1;
 270
 271                 fx_S = fma(dx_S * ty_S, fxy1_S, fx_S);
 272                 fy_S = fma(tx_S * dy_S, fxy1_S, fy_S);
 273                 fz_S = fma(tx_S * ty_S, fz1_S, fz_S);
 274             }
 275         }
 276
 277         return {
 278                    reduce(fx_S), reduce(fy_S), reduce(fz_S)
 279         };
 280     }
 281 #endif
 282     private:
 283         const gmx_pme_t *const                   pme;
 284         const real *const gmx_restrict           grid;
 285         const PmeAtomComm *const gmx_restrict    atc;
 286         const splinedata_t *const gmx_restrict   spline;
 287         const int                                nn;
 288
 289         const int                                gridNY = pme->pmegrid_ny;
 290         const int                                gridNZ = pme->pmegrid_nz;
 291
 292         const int *const                         idxptr = atc->idx[spline->ind[nn]];
 293         const int                                idxX   = idxptr[XX];
 294         const int                                idxY   = idxptr[YY];
 295         const int                                idxZ   = idxptr[ZZ];
 296 };
 297
 298
 299 void gather_f_bsplines(const gmx_pme_t *pme, const real *grid,
 300                        gmx_bool bClearF, const PmeAtomComm *atc,
 301                        const splinedata_t *spline,
 302                        real scale)
 303 {
 304     /* sum forces for local particles */
 305
 306     const int  order = pme->pme_order;
 307     const int  nx    = pme->nkx;
 308     const int  ny    = pme->nky;
 309     const int  nz    = pme->nkz;
 310
 311     const real rxx   = pme->recipbox[XX][XX];
 312     const real ryx   = pme->recipbox[YY][XX];
 313     const real ryy   = pme->recipbox[YY][YY];
 314     const real rzx   = pme->recipbox[ZZ][XX];
 315     const real rzy   = pme->recipbox[ZZ][YY];
 316     const real rzz   = pme->recipbox[ZZ][ZZ];
 317
 318     /* Extract the buffer for force output */
 319     rvec * gmx_restrict force = as_rvec_array(atc->f.data());
 320
 321     /* Note that unrolling this loop by templating this function on order
 322      * deteriorates performance significantly with gcc5/6/7.
 323      */
 324     for (int nn = 0; nn < spline->n; nn++)
 325     {
 326         const int  n           = spline->ind[nn];
 327         const real coefficient = scale*atc->coefficient[n];
 328
 329         if (bClearF)
 330         {
 331             force[n][XX] = 0;
 332             force[n][YY] = 0;
 333             force[n][ZZ] = 0;
 334         }
 335         if (coefficient != 0)
 336         {
 337             RVec       f;
 338             const auto spline_func = do_fspline(pme, grid, atc, spline, nn);
 339
 340             switch (order)
 341             {
 342                 case 4:
 343                     f = spline_func(std::integral_constant<int, 4>());
 344                     break;
 345                 case 5:
 346                     f = spline_func(std::integral_constant<int, 5>());
 347                     break;
 348                 default:
 349                     f = spline_func(order);
 350                     break;
 351             }
 352
 353             force[n][XX] += -coefficient*( f[XX]*nx*rxx );
 354             force[n][YY] += -coefficient*( f[XX]*nx*ryx + f[YY]*ny*ryy );
 355             force[n][ZZ] += -coefficient*( f[XX]*nx*rzx + f[YY]*ny*rzy + f[ZZ]*nz*rzz );
 356         }
 357     }
 358     /* Since the energy and not forces are interpolated
 359      * the net force might not be exactly zero.
 360      * This can be solved by also interpolating F, but
 361      * that comes at a cost.
 362      * A better hack is to remove the net force every
 363      * step, but that must be done at a higher level
 364      * since this routine doesn't see all atoms if running
 365      * in parallel. Don't know how important it is?  EL 990726
 366      */
 367 }
 368
 369
 370 real gather_energy_bsplines(gmx_pme_t *pme, const real *grid,
 371                             PmeAtomComm *atc)
 372 {
 373     splinedata_t *spline;
 374     int           ithx, ithy, ithz, i0, j0, k0;
 375     int           index_x, index_xy;
 376     int          *idxptr;
 377     real          energy, pot, tx, ty, coefficient, gval;
 378     real         *thx, *thy, *thz;
 379     int           norder;
 380     int           order;
 381
 382     spline = &atc->spline[0];
 383
 384     order = pme->pme_order;
 385
 386     energy = 0;
 387     for (int n = 0; n < atc->numAtoms(); n++)
 388     {
 389         coefficient      = atc->coefficient[n];
 390
 391         if (coefficient != 0)
 392         {
 393             idxptr = atc->idx[n];
 394             norder = n*order;
 395
 396             i0   = idxptr[XX];
 397             j0   = idxptr[YY];
 398             k0   = idxptr[ZZ];
 399
 400             /* Pointer arithmetic alert, next three statements */
 401             thx  = spline->theta.coefficients[XX] + norder;
 402             thy  = spline->theta.coefficients[YY] + norder;
 403             thz  = spline->theta.coefficients[ZZ] + norder;
 404
 405             pot = 0;
 406             for (ithx = 0; (ithx < order); ithx++)
 407             {
 408                 index_x = (i0+ithx)*pme->pmegrid_ny*pme->pmegrid_nz;
 409                 tx      = thx[ithx];
 410
 411                 for (ithy = 0; (ithy < order); ithy++)
 412                 {
 413                     index_xy = index_x+(j0+ithy)*pme->pmegrid_nz;
 414                     ty       = thy[ithy];
 415
 416                     for (ithz = 0; (ithz < order); ithz++)
 417                     {
 418                         gval  = grid[index_xy+(k0+ithz)];
 419                         pot  += tx*ty*thz[ithz]*gval;
 420                     }
 421
 422                 }
 423             }
 424
 425             energy += pot*coefficient;
 426         }
 427     }
 428
 429     return energy;
 430 }