/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 * Copyright (c) 2001-2004, The GROMACS development team.
 * Copyright (c) 2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
#include "pme_solve.h"

#include <cmath>

#include "gromacs/fft/parallel_3dfft.h"
#include "gromacs/math/units.h"
#include "gromacs/math/utilities.h"
#include "gromacs/math/vec.h"
#include "gromacs/simd/simd.h"
#include "gromacs/simd/simd_math.h"
#include "gromacs/utility/arrayref.h"
#include "gromacs/utility/exceptions.h"
#include "gromacs/utility/smalloc.h"

#include "pme_internal.h"
#if GMX_SIMD_HAVE_REAL
/* Turn on arbitrary width SIMD intrinsics for PME solve */
#    define PME_SIMD_SOLVE
#endif

using namespace gmx; // TODO: Remove when this file is moved into gmx namespace
struct pme_solve_work_t
{
    /* work data for solve_pme */
    int    nalloc;
    real  *mhx, *mhy, *mhz, *m2, *denom, *tmp1, *tmp2, *eterm, *m2inv;
    real   energy_q, energy_lj;
    matrix vir_q, vir_lj;
};
#if defined PME_SIMD_SOLVE
constexpr int c_simdWidth = GMX_SIMD_REAL_WIDTH;
#else
/* We can use any alignment > 0, so we use 4 */
constexpr int c_simdWidth = 4;
#endif
/* Returns the smallest number >= \p number that is a multiple of \p factor; \p factor must be a power of 2 */
template<unsigned int factor>
static size_t roundUpToMultipleOfFactor(size_t number)
{
    static_assert(factor > 0 && (factor & (factor - 1)) == 0, "factor should be >0 and a power of 2");

    /* We need to add at most factor-1 and because factor is a power of 2,
     * we get the result by masking out the bits corresponding to factor-1.
     */
    return (number + factor - 1) & ~(factor - 1);
}
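/* Worked example (illustration, not from the original source): with
 * factor = 4 the mask ~(factor - 1) clears the two low bits, so
 *   roundUpToMultipleOfFactor<4>(5) == (5 + 3) & ~3 == 8
 *   roundUpToMultipleOfFactor<4>(8) == (8 + 3) & ~3 == 8
 * i.e. values already on a multiple of the SIMD width are unchanged.
 */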
/* Allocate an aligned pointer for SIMD operations, including extra elements
 * at the end for padding.
 */
/* TODO: Replace this SIMD reallocator with a general, C++ solution */
static void reallocSimdAlignedAndPadded(real** ptr, int unpaddedNumElements)
{
    sfree_aligned(*ptr);
    snew_aligned(*ptr, roundUpToMultipleOfFactor<c_simdWidth>(unpaddedNumElements),
                 c_simdWidth * sizeof(real));
}
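/* Note (illustration, not from the original source): the padding means that
 * a request for, say, 10 reals with c_simdWidth = 4 actually allocates 12,
 * so full-width aligned SIMD loads/stores that run past the last used
 * element still stay inside the allocation.
 */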
static void realloc_work(struct pme_solve_work_t* work, int nkx)
{
    if (nkx > work->nalloc)
    {
        work->nalloc = nkx;
        srenew(work->mhx, work->nalloc);
        srenew(work->mhy, work->nalloc);
        srenew(work->mhz, work->nalloc);
        srenew(work->m2, work->nalloc);
        reallocSimdAlignedAndPadded(&work->denom, work->nalloc);
        reallocSimdAlignedAndPadded(&work->tmp1, work->nalloc);
        reallocSimdAlignedAndPadded(&work->tmp2, work->nalloc);
        reallocSimdAlignedAndPadded(&work->eterm, work->nalloc);
        srenew(work->m2inv, work->nalloc);

        /* Init all allocated elements of denom to 1 to avoid 1/0 exceptions
         * of simd padded elements.
         */
        for (size_t i = 0; i < roundUpToMultipleOfFactor<c_simdWidth>(work->nalloc); i++)
        {
            work->denom[i] = 1;
        }
    }
}
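/* Why denom is seeded with 1 (illustration, not from the original source):
 * the solve loops divide by denom over whole SIMD vectors, so the padded
 * lanes beyond nkx are divided too. Initializing them to 1 instead of 0
 * keeps those dead lanes from raising divide-by-zero exceptions; their
 * results are never read back.
 */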
void pme_init_all_work(struct pme_solve_work_t** work, int nthread, int nkx)
{
    /* Use fft5d, order after FFT is y major, z, x minor */

    snew(*work, nthread);
    /* Allocate the work arrays thread local to optimize memory access */
#pragma omp parallel for num_threads(nthread) schedule(static)
    for (int thread = 0; thread < nthread; thread++)
    {
        try
        {
            realloc_work(&((*work)[thread]), nkx);
        }
        GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
    }
}
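/* Usage sketch (illustration, not from the original source): one work
 * struct is allocated per OpenMP thread, and each thread allocates its own
 * arrays so the memory it touches is placed locally, e.g.
 *   struct pme_solve_work_t* solveWork = nullptr;
 *   pme_init_all_work(&solveWork, nthread, pme->nkx);
 *   ... solve, then reduce with get_pme_ener_vir_q() ...
 *   pme_free_all_work(&solveWork, nthread);
 */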
static void free_work(struct pme_solve_work_t* work)
{
    if (work)
    {
        sfree(work->mhx);
        sfree(work->mhy);
        sfree(work->mhz);
        sfree(work->m2);
        sfree_aligned(work->denom);
        sfree_aligned(work->tmp1);
        sfree_aligned(work->tmp2);
        sfree_aligned(work->eterm);
        sfree(work->m2inv);
    }
}
void pme_free_all_work(struct pme_solve_work_t** work, int nthread)
{
    if (*work)
    {
        for (int thread = 0; thread < nthread; thread++)
        {
            free_work(&(*work)[thread]);
        }
    }
    sfree(*work);
    *work = nullptr;
}
void get_pme_ener_vir_q(pme_solve_work_t* work, int nthread, PmeOutput* output)
{
    GMX_ASSERT(output != nullptr, "Need valid output buffer");
    /* This function sums output over threads and should therefore
     * only be called after thread synchronization.
     */
    output->coulombEnergy_ = work[0].energy_q;
    copy_mat(work[0].vir_q, output->coulombVirial_);

    for (int thread = 1; thread < nthread; thread++)
    {
        output->coulombEnergy_ += work[thread].energy_q;
        m_add(output->coulombVirial_, work[thread].vir_q, output->coulombVirial_);
    }
}
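/* Reduction sketch (illustration, not from the original source): thread 0
 * seeds the output and the remaining threads accumulate, so
 *   output->coulombEnergy_ = sum over t of work[t].energy_q
 * and likewise for the virial via m_add().
 */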
void get_pme_ener_vir_lj(pme_solve_work_t* work, int nthread, PmeOutput* output)
{
    GMX_ASSERT(output != nullptr, "Need valid output buffer");
    /* This function sums output over threads and should therefore
     * only be called after thread synchronization.
     */
    output->lennardJonesEnergy_ = work[0].energy_lj;
    copy_mat(work[0].vir_lj, output->lennardJonesVirial_);

    for (int thread = 1; thread < nthread; thread++)
    {
        output->lennardJonesEnergy_ += work[thread].energy_lj;
        m_add(output->lennardJonesVirial_, work[thread].vir_lj, output->lennardJonesVirial_);
    }
}
#if defined PME_SIMD_SOLVE
/* Calculate exponentials through SIMD */
inline static void calc_exponentials_q(int /*unused*/,
                                       int /*unused*/,
                                       real                     f,
                                       ArrayRef<const SimdReal> d_aligned,
                                       ArrayRef<const SimdReal> r_aligned,
                                       ArrayRef<SimdReal>       e_aligned)
{
    SimdReal f_simd(f);
    SimdReal tmp_d1, tmp_r, tmp_e;

    /* We only need to calculate from start. But since start is 0 or 1
     * and we want to use aligned loads/stores, we always start from 0.
     */
    GMX_ASSERT(d_aligned.size() == r_aligned.size(), "d and r must have same size");
    GMX_ASSERT(d_aligned.size() == e_aligned.size(), "d and e must have same size");
    for (size_t kx = 0; kx != d_aligned.size(); ++kx)
    {
        tmp_d1        = d_aligned[kx];
        tmp_r         = r_aligned[kx];
        tmp_r         = gmx::exp(tmp_r);
        tmp_e         = f_simd / tmp_d1;
        tmp_e         = tmp_e * tmp_r;
        e_aligned[kx] = tmp_e;
    }
}
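/* Per element this computes (illustration, not from the original source)
 *   e[kx] = f * exp(r[kx]) / d[kx]
 * matching the scalar fallback below; the SIMD version simply handles
 * GMX_SIMD_REAL_WIDTH elements per iteration, padded tail included.
 */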
#else
inline static void
calc_exponentials_q(int start, int end, real f, ArrayRef<real> d, ArrayRef<real> r, ArrayRef<real> e)
{
    GMX_ASSERT(d.size() == r.size(), "d and r must have same size");
    GMX_ASSERT(d.size() == e.size(), "d and e must have same size");
    int kx;
    for (kx = start; kx < end; kx++)
    {
        d[kx] = 1.0 / d[kx];
    }

    for (kx = start; kx < end; kx++)
    {
        r[kx] = std::exp(r[kx]);
    }

    for (kx = start; kx < end; kx++)
    {
        e[kx] = f * r[kx] * d[kx];
    }
}
#endif
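/* Design note (illustration, not from the original source): the scalar
 * fallback splits the reciprocal, the exponential and the final multiply
 * into three simple loops, which keeps each loop easy for the compiler to
 * auto-vectorize even without the hand-written SIMD path above.
 */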
#if defined PME_SIMD_SOLVE
/* Calculate exponentials through SIMD */
inline static void calc_exponentials_lj(int /*unused*/,
                                        int /*unused*/,
                                        ArrayRef<SimdReal> r_aligned,
                                        ArrayRef<SimdReal> factor_aligned,
                                        ArrayRef<SimdReal> d_aligned)
{
    SimdReal       tmp_r, tmp_d, tmp_fac, d_inv, tmp_mk;
    const SimdReal sqr_PI = sqrt(SimdReal(M_PI));

    GMX_ASSERT(d_aligned.size() == r_aligned.size(), "d and r must have same size");
    GMX_ASSERT(d_aligned.size() == factor_aligned.size(), "d and factor must have same size");
    for (size_t kx = 0; kx != d_aligned.size(); ++kx)
    {
        /* We only need to calculate from start. But since start is 0 or 1
         * and we want to use aligned loads/stores, we always start from 0.
         */
        tmp_d              = d_aligned[kx];
        d_inv              = SimdReal(1.0) / tmp_d;
        d_aligned[kx]      = d_inv;
        tmp_r              = r_aligned[kx];
        tmp_r              = gmx::exp(tmp_r);
        r_aligned[kx]      = tmp_r;
        tmp_mk             = factor_aligned[kx];
        tmp_fac            = sqr_PI * tmp_mk * erfc(tmp_mk);
        factor_aligned[kx] = tmp_fac;
    }
}
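/* What these kernels compute, per element (illustration, not from the
 * original source):
 *   d <- 1/d,   r <- exp(r),   factor/tmp2 <- sqrt(pi) * mk * erfc(mk)
 * with mk = sqrt(factor * m2k) on input. These are the pieces of the
 * LJ-PME (r^-6 dispersion) reciprocal-space kernel
 *   g(b) = (1 - 2 b^2) exp(-b^2) + 2 b^3 sqrt(pi) erfc(b),
 * assembled later as eterm = -((1 - 2 m2k) * tmp1 + 2 m2k * tmp2).
 */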
#else
inline static void
calc_exponentials_lj(int start, int end, ArrayRef<real> r, ArrayRef<real> tmp2, ArrayRef<real> d)
{
    int  kx;
    real mk;
    GMX_ASSERT(d.size() == r.size(), "d and r must have same size");
    GMX_ASSERT(d.size() == tmp2.size(), "d and tmp2 must have same size");
    for (kx = start; kx < end; kx++)
    {
        d[kx] = 1.0 / d[kx];
    }

    for (kx = start; kx < end; kx++)
    {
        r[kx] = std::exp(r[kx]);
    }

    for (kx = start; kx < end; kx++)
    {
        mk       = tmp2[kx];
        tmp2[kx] = sqrt(M_PI) * mk * std::erfc(mk);
    }
}
#endif
#if defined PME_SIMD_SOLVE
using PME_T = SimdReal;
#else
using PME_T = real;
#endif
int solve_pme_yzx(const gmx_pme_t* pme, t_complex* grid, real vol,
                  gmx_bool bEnerVir, int nthread, int thread)
{
    /* do recip sum over local cells in grid */
    /* y major, z middle, x minor or continuous */
    t_complex* p0;
    int        kx, ky, kz, maxkx, maxky;
    int        nx, ny, nz, iyz0, iyz1, iyz, iy, iz, kxstart, kxend;
    real       mx, my, mz;
    real       ewaldcoeff = pme->ewaldcoeff_q;
    real       factor     = M_PI * M_PI / (ewaldcoeff * ewaldcoeff);
    real       ets2, struct2, vfactor, ets2vf;
    real       d1, d2, energy = 0;
    real       by, bz;
    real       virxx = 0, virxy = 0, virxz = 0, viryy = 0, viryz = 0, virzz = 0;
    real       rxx, ryx, ryy, rzx, rzy, rzz;
    struct pme_solve_work_t* work;
    real *     mhx, *mhy, *mhz, *m2, *denom, *tmp1, *eterm, *m2inv;
    real       mhxk, mhyk, mhzk, m2k;
    real       corner_fac;
    ivec       complex_order;
    ivec       local_ndata, local_offset, local_size;
    real       elfac;

    elfac = ONE_4PI_EPS0 / pme->epsilon_r;
    nx = pme->nkx;
    ny = pme->nky;
    nz = pme->nkz;

    /* Dimensions should be identical for A/B grid, so we just use A here */
    gmx_parallel_3dfft_complex_limits(pme->pfft_setup[PME_GRID_QA], complex_order, local_ndata,
                                      local_offset, local_size);
    rxx = pme->recipbox[XX][XX];
    ryx = pme->recipbox[YY][XX];
    ryy = pme->recipbox[YY][YY];
    rzx = pme->recipbox[ZZ][XX];
    rzy = pme->recipbox[ZZ][YY];
    rzz = pme->recipbox[ZZ][ZZ];

    GMX_ASSERT(rxx != 0.0, "Someone broke the reciprocal box again");
    maxkx = (nx + 1) / 2;
    maxky = (ny + 1) / 2;

    work = &pme->solve_work[thread];

    mhx   = work->mhx;
    mhy   = work->mhy;
    mhz   = work->mhz;
    m2    = work->m2;
    denom = work->denom;
    tmp1  = work->tmp1;
    eterm = work->eterm;
    m2inv = work->m2inv;
    iyz0 = local_ndata[YY] * local_ndata[ZZ] * thread / nthread;
    iyz1 = local_ndata[YY] * local_ndata[ZZ] * (thread + 1) / nthread;
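    /* Thread decomposition example (illustration, not from the original
     * source): with local_ndata[YY]*local_ndata[ZZ] = 100 y/z lines and
     * nthread = 4, thread 1 gets iyz in [25, 50). The integer arithmetic
     * tiles [0, N) without gaps or overlap for any nthread.
     */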
    for (iyz = iyz0; iyz < iyz1; iyz++)
    {
        iy = iyz / local_ndata[ZZ];
        iz = iyz - iy * local_ndata[ZZ];

        ky = iy + local_offset[YY];
        if (ky < maxky)
        {
            my = ky;
        }
        else
        {
            my = (ky - ny);
        }
        by = M_PI * vol * pme->bsp_mod[YY][ky];
        kz = iz + local_offset[ZZ];

        mz = kz;

        bz = pme->bsp_mod[ZZ][kz];
        /* 0.5 correction for corner points */
        corner_fac = 1;
        if (kz == 0 || kz == (nz + 1) / 2)
        {
            corner_fac = 0.5;
        }
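        /* Why 0.5 here (illustration, not from the original source): the
         * real-to-complex FFT stores only half of the z frequencies, and the
         * energy/virial sums below implicitly count each stored point twice
         * to cover the missing conjugate half. The kz == 0 and kz == nz/2
         * planes are their own conjugates, so they get half weight.
         */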
        p0 = grid + iy * local_size[ZZ] * local_size[XX] + iz * local_size[XX];
        /* We should skip the k-space point (0,0,0) */
        /* Note that since here x is the minor index, local_offset[XX]=0 */
        if (local_offset[XX] > 0 || ky > 0 || kz > 0)
        {
            kxstart = local_offset[XX];
        }
        else
        {
            kxstart = local_offset[XX] + 1;
            p0++;
        }
        kxend = local_offset[XX] + local_ndata[XX];
        if (bEnerVir)
        {
            /* More expensive inner loop, especially because of the storage
             * of the mh elements in arrays.
             * Because x is the minor grid index, all mh elements
             * depend on kx for triclinic unit cells.
             */

            /* Two explicit loops to avoid a conditional inside the loop */
            for (kx = kxstart; kx < maxkx; kx++)
            {
                mx = kx;

                mhxk = mx * rxx;
                mhyk      = mx * ryx + my * ryy;
                mhzk      = mx * rzx + my * rzy + mz * rzz;
                m2k       = mhxk * mhxk + mhyk * mhyk + mhzk * mhzk;
                mhx[kx]   = mhxk;
                mhy[kx]   = mhyk;
                mhz[kx]   = mhzk;
                m2[kx]    = m2k;
                denom[kx] = m2k * bz * by * pme->bsp_mod[XX][kx];
                tmp1[kx]  = -factor * m2k;
            }
            for (kx = maxkx; kx < kxend; kx++)
            {
                mx = (kx - nx);

                mhxk      = mx * rxx;
                mhyk      = mx * ryx + my * ryy;
                mhzk      = mx * rzx + my * rzy + mz * rzz;
                m2k       = mhxk * mhxk + mhyk * mhyk + mhzk * mhzk;
                mhx[kx]   = mhxk;
                mhy[kx]   = mhyk;
                mhz[kx]   = mhzk;
                m2[kx]    = m2k;
                denom[kx] = m2k * bz * by * pme->bsp_mod[XX][kx];
                tmp1[kx]  = -factor * m2k;
            }
            for (kx = kxstart; kx < kxend; kx++)
            {
                m2inv[kx] = 1.0 / m2[kx];
            }
            calc_exponentials_q(kxstart, kxend, elfac,
                                ArrayRef<PME_T>(denom, denom + roundUpToMultipleOfFactor<c_simdWidth>(kxend)),
                                ArrayRef<PME_T>(tmp1, tmp1 + roundUpToMultipleOfFactor<c_simdWidth>(kxend)),
                                ArrayRef<PME_T>(eterm, eterm + roundUpToMultipleOfFactor<c_simdWidth>(kxend)));
            for (kx = kxstart; kx < kxend; kx++, p0++)
            {
                d1 = p0->re;
                d2 = p0->im;

                p0->re = d1 * eterm[kx];
                p0->im = d2 * eterm[kx];

                struct2 = 2.0 * (d1 * d1 + d2 * d2);

                tmp1[kx] = eterm[kx] * struct2;
            }
            for (kx = kxstart; kx < kxend; kx++)
            {
                ets2    = corner_fac * tmp1[kx];
                vfactor = (factor * m2[kx] + 1.0) * 2.0 * m2inv[kx];
                energy += ets2;

                ets2vf = ets2 * vfactor;
                virxx += ets2vf * mhx[kx] * mhx[kx] - ets2;
                virxy += ets2vf * mhx[kx] * mhy[kx];
                virxz += ets2vf * mhx[kx] * mhz[kx];
                viryy += ets2vf * mhy[kx] * mhy[kx] - ets2;
                viryz += ets2vf * mhy[kx] * mhz[kx];
                virzz += ets2vf * mhz[kx] * mhz[kx] - ets2;
            }
        }
        else
        {
            /* We don't need to calculate the energy and the virial.
             * In this case the triclinic overhead is small.
             */

            /* Two explicit loops to avoid a conditional inside the loop */
            for (kx = kxstart; kx < maxkx; kx++)
            {
                mx = kx;

                mhxk      = mx * rxx;
                mhyk      = mx * ryx + my * ryy;
                mhzk      = mx * rzx + my * rzy + mz * rzz;
                m2k       = mhxk * mhxk + mhyk * mhyk + mhzk * mhzk;
                denom[kx] = m2k * bz * by * pme->bsp_mod[XX][kx];
                tmp1[kx]  = -factor * m2k;
            }
            for (kx = maxkx; kx < kxend; kx++)
            {
                mx = (kx - nx);

                mhxk      = mx * rxx;
                mhyk      = mx * ryx + my * ryy;
                mhzk      = mx * rzx + my * rzy + mz * rzz;
                m2k       = mhxk * mhxk + mhyk * mhyk + mhzk * mhzk;
                denom[kx] = m2k * bz * by * pme->bsp_mod[XX][kx];
                tmp1[kx]  = -factor * m2k;
            }
            calc_exponentials_q(kxstart, kxend, elfac,
                                ArrayRef<PME_T>(denom, denom + roundUpToMultipleOfFactor<c_simdWidth>(kxend)),
                                ArrayRef<PME_T>(tmp1, tmp1 + roundUpToMultipleOfFactor<c_simdWidth>(kxend)),
                                ArrayRef<PME_T>(eterm, eterm + roundUpToMultipleOfFactor<c_simdWidth>(kxend)));
            for (kx = kxstart; kx < kxend; kx++, p0++)
            {
                d1 = p0->re;
                d2 = p0->im;

                p0->re = d1 * eterm[kx];
                p0->im = d2 * eterm[kx];
            }
        }
    }
    if (bEnerVir)
    {
        /* Update virial with local values.
         * The virial is symmetric by definition.
         * this virial seems ok for isotropic scaling, but I'm
         * experiencing problems on semiisotropic membranes.
         * IS THAT COMMENT STILL VALID??? (DvdS, 2001/02/07).
         */
        work->vir_q[XX][XX] = 0.25 * virxx;
        work->vir_q[YY][YY] = 0.25 * viryy;
        work->vir_q[ZZ][ZZ] = 0.25 * virzz;
        work->vir_q[XX][YY] = work->vir_q[YY][XX] = 0.25 * virxy;
        work->vir_q[XX][ZZ] = work->vir_q[ZZ][XX] = 0.25 * virxz;
        work->vir_q[YY][ZZ] = work->vir_q[ZZ][YY] = 0.25 * viryz;
        /* This energy should be corrected for a charged system */
        work->energy_q = 0.5 * energy;
    }
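    /* Energy bookkeeping (illustration, not from the original source):
     * struct2 = 2 (d1^2 + d2^2) already doubles |S(k)|^2 to cover the
     * conjugate half of the r2c grid, and corner_fac halves the
     * self-conjugate planes, so the usual Ewald reciprocal-space prefactor
     * of 1/2 is applied exactly once here.
     */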
    /* Return the loop count */
    return local_ndata[YY] * local_ndata[XX];
}
int solve_pme_lj_yzx(const gmx_pme_t* pme, t_complex** grid, gmx_bool bLB, real vol,
                     gmx_bool bEnerVir, int nthread, int thread)
{
    /* do recip sum over local cells in grid */
    /* y major, z middle, x minor or continuous */
    t_complex *p0, *p1;
    int        ig, gcount;
    int        kx, ky, kz, maxkx, maxky;
    int        nx, ny, nz, iy, iyz0, iyz1, iyz, iz, kxstart, kxend;
    real       mx, my, mz;
    real       ewaldcoeff = pme->ewaldcoeff_lj;
    real       factor     = M_PI * M_PI / (ewaldcoeff * ewaldcoeff);
    real       ets2, ets2vf, corner_fac, scale, struct2;
    real       eterm, vterm, d1, d2, energy = 0;
    real       by, bz;
    real       virxx = 0, virxy = 0, virxz = 0, viryy = 0, viryz = 0, virzz = 0;
    real       rxx, ryx, ryy, rzx, rzy, rzz;
    real *     mhx, *mhy, *mhz, *m2, *denom, *tmp1, *tmp2;
    real       mhxk, mhyk, mhzk, m2k;
    struct pme_solve_work_t* work;
    ivec       complex_order;
    ivec       local_ndata, local_offset, local_size;
    nx = pme->nkx;
    ny = pme->nky;
    nz = pme->nkz;

    /* Dimensions should be identical for A/B grid, so we just use A here */
    gmx_parallel_3dfft_complex_limits(pme->pfft_setup[PME_GRID_C6A], complex_order, local_ndata,
                                      local_offset, local_size);
    rxx = pme->recipbox[XX][XX];
    ryx = pme->recipbox[YY][XX];
    ryy = pme->recipbox[YY][YY];
    rzx = pme->recipbox[ZZ][XX];
    rzy = pme->recipbox[ZZ][YY];
    rzz = pme->recipbox[ZZ][ZZ];
    maxkx = (nx + 1) / 2;
    maxky = (ny + 1) / 2;

    work = &pme->solve_work[thread];

    mhx   = work->mhx;
    mhy   = work->mhy;
    mhz   = work->mhz;
    m2    = work->m2;
    denom = work->denom;
    tmp1  = work->tmp1;
    tmp2  = work->tmp2;
    iyz0 = local_ndata[YY] * local_ndata[ZZ] * thread / nthread;
    iyz1 = local_ndata[YY] * local_ndata[ZZ] * (thread + 1) / nthread;
    for (iyz = iyz0; iyz < iyz1; iyz++)
    {
        iy = iyz / local_ndata[ZZ];
        iz = iyz - iy * local_ndata[ZZ];

        ky = iy + local_offset[YY];
        if (ky < maxky)
        {
            my = ky;
        }
        else
        {
            my = (ky - ny);
        }
        by = 3.0 * vol * pme->bsp_mod[YY][ky]
             / (M_PI * sqrt(M_PI) * ewaldcoeff * ewaldcoeff * ewaldcoeff);
        kz = iz + local_offset[ZZ];

        mz = kz;

        bz = pme->bsp_mod[ZZ][kz];
        /* 0.5 correction for corner points */
        corner_fac = 1;
        if (kz == 0 || kz == (nz + 1) / 2)
        {
            corner_fac = 0.5;
        }
        kxstart = local_offset[XX];
        kxend   = local_offset[XX] + local_ndata[XX];
        if (bEnerVir)
        {
            /* More expensive inner loop, especially because of the
             * storage of the mh elements in arrays. Because x is the
             * minor grid index, all mh elements depend on kx for
             * triclinic unit cells.
             */

            /* Two explicit loops to avoid a conditional inside the loop */
            for (kx = kxstart; kx < maxkx; kx++)
            {
                mx = kx;

                mhxk = mx * rxx;
                mhyk      = mx * ryx + my * ryy;
                mhzk      = mx * rzx + my * rzy + mz * rzz;
                m2k       = mhxk * mhxk + mhyk * mhyk + mhzk * mhzk;
                mhx[kx]   = mhxk;
                mhy[kx]   = mhyk;
                mhz[kx]   = mhzk;
                m2[kx]    = m2k;
                denom[kx] = bz * by * pme->bsp_mod[XX][kx];
                tmp1[kx]  = -factor * m2k;
                tmp2[kx]  = sqrt(factor * m2k);
            }
            for (kx = maxkx; kx < kxend; kx++)
            {
                mx = (kx - nx);

                mhxk      = mx * rxx;
                mhyk      = mx * ryx + my * ryy;
                mhzk      = mx * rzx + my * rzy + mz * rzz;
                m2k       = mhxk * mhxk + mhyk * mhyk + mhzk * mhzk;
                mhx[kx]   = mhxk;
                mhy[kx]   = mhyk;
                mhz[kx]   = mhzk;
                m2[kx]    = m2k;
                denom[kx] = bz * by * pme->bsp_mod[XX][kx];
                tmp1[kx]  = -factor * m2k;
                tmp2[kx]  = sqrt(factor * m2k);
            }
            /* Clear padding elements to avoid (harmless) fp exceptions */
            const int kxendSimd = roundUpToMultipleOfFactor<c_simdWidth>(kxend);
            for (; kx < kxendSimd; kx++)
            {
                tmp1[kx] = 0;
                tmp2[kx] = 0;
            }
            calc_exponentials_lj(kxstart, kxend,
                                 ArrayRef<PME_T>(tmp1, tmp1 + roundUpToMultipleOfFactor<c_simdWidth>(kxend)),
                                 ArrayRef<PME_T>(tmp2, tmp2 + roundUpToMultipleOfFactor<c_simdWidth>(kxend)),
                                 ArrayRef<PME_T>(denom, denom + roundUpToMultipleOfFactor<c_simdWidth>(kxend)));
            for (kx = kxstart; kx < kxend; kx++)
            {
                m2k      = factor * m2[kx];
                eterm    = -((1.0 - 2.0 * m2k) * tmp1[kx] + 2.0 * m2k * tmp2[kx]);
                vterm    = 3.0 * (-tmp1[kx] + tmp2[kx]);
                tmp1[kx] = eterm * denom[kx];
                tmp2[kx] = vterm * denom[kx];
            }
            if (!bLB)
            {
                p0 = grid[0] + iy * local_size[ZZ] * local_size[XX] + iz * local_size[XX];
                for (kx = kxstart; kx < kxend; kx++, p0++)
                {
                    d1 = p0->re;
                    d2 = p0->im;

                    eterm  = tmp1[kx];
                    vterm  = tmp2[kx];
                    p0->re = d1 * eterm;
                    p0->im = d2 * eterm;

                    struct2 = 2.0 * (d1 * d1 + d2 * d2);

                    tmp1[kx] = eterm * struct2;
                    tmp2[kx] = vterm * struct2;
                }
            }
            else
            {
                real* struct2 = denom;
                real  str2;

                for (kx = kxstart; kx < kxend; kx++)
                {
                    struct2[kx] = 0.0;
                }
                /* Due to symmetry we only need to calculate 4 of the 7 terms */
                for (ig = 0; ig <= 3; ++ig)
                {
                    p0    = grid[ig] + iy * local_size[ZZ] * local_size[XX] + iz * local_size[XX];
                    p1    = grid[6 - ig] + iy * local_size[ZZ] * local_size[XX] + iz * local_size[XX];
                    scale = 2.0 * lb_scale_factor_symm[ig];
                    for (kx = kxstart; kx < kxend; ++kx, ++p0, ++p1)
                    {
                        struct2[kx] += scale * (p0->re * p1->re + p0->im * p1->im);
                    }
                }
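                /* Illustration (not from the original source): with LB
                 * combination rules the dispersion coefficients are expanded
                 * over 7 grids, and |S(k)|^2 becomes a sum over products of
                 * grid ig with grid 6-ig. The (ig, 6-ig) and (6-ig, ig)
                 * contributions are equal, so only ig = 0..3 is computed,
                 * with scale = 2.0 * lb_scale_factor_symm[ig] supplying the
                 * pair weights.
                 */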
                for (ig = 0; ig <= 6; ++ig)
                {
                    p0 = grid[ig] + iy * local_size[ZZ] * local_size[XX] + iz * local_size[XX];
                    for (kx = kxstart; kx < kxend; kx++, p0++)
                    {
                        d1 = p0->re;
                        d2 = p0->im;

                        eterm  = tmp1[kx];
                        p0->re = d1 * eterm;
                        p0->im = d2 * eterm;
                    }
                }
                for (kx = kxstart; kx < kxend; kx++)
                {
                    eterm    = tmp1[kx];
                    vterm    = tmp2[kx];
                    str2     = struct2[kx];
                    tmp1[kx] = eterm * str2;
                    tmp2[kx] = vterm * str2;
                }
            }
            for (kx = kxstart; kx < kxend; kx++)
            {
                ets2  = corner_fac * tmp1[kx];
                vterm = 2.0 * factor * tmp2[kx];
                energy += ets2;

                ets2vf = corner_fac * vterm;
                virxx += ets2vf * mhx[kx] * mhx[kx] - ets2;
                virxy += ets2vf * mhx[kx] * mhy[kx];
                virxz += ets2vf * mhx[kx] * mhz[kx];
                viryy += ets2vf * mhy[kx] * mhy[kx] - ets2;
                viryz += ets2vf * mhy[kx] * mhz[kx];
                virzz += ets2vf * mhz[kx] * mhz[kx] - ets2;
            }
        }
        else
        {
            /* We don't need to calculate the energy and the virial.
             * In this case the triclinic overhead is small.
             */

            /* Two explicit loops to avoid a conditional inside the loop */
            for (kx = kxstart; kx < maxkx; kx++)
            {
                mx = kx;

                mhxk      = mx * rxx;
                mhyk      = mx * ryx + my * ryy;
                mhzk      = mx * rzx + my * rzy + mz * rzz;
                m2k       = mhxk * mhxk + mhyk * mhyk + mhzk * mhzk;
                m2[kx]    = m2k;
                denom[kx] = bz * by * pme->bsp_mod[XX][kx];
                tmp1[kx]  = -factor * m2k;
                tmp2[kx]  = sqrt(factor * m2k);
            }
            for (kx = maxkx; kx < kxend; kx++)
            {
                mx = (kx - nx);

                mhxk      = mx * rxx;
                mhyk      = mx * ryx + my * ryy;
                mhzk      = mx * rzx + my * rzy + mz * rzz;
                m2k       = mhxk * mhxk + mhyk * mhyk + mhzk * mhzk;
                m2[kx]    = m2k;
                denom[kx] = bz * by * pme->bsp_mod[XX][kx];
                tmp1[kx]  = -factor * m2k;
                tmp2[kx]  = sqrt(factor * m2k);
            }
            /* Clear padding elements to avoid (harmless) fp exceptions */
            const int kxendSimd = roundUpToMultipleOfFactor<c_simdWidth>(kxend);
            for (; kx < kxendSimd; kx++)
            {
                tmp1[kx] = 0;
                tmp2[kx] = 0;
            }
            calc_exponentials_lj(kxstart, kxend,
                                 ArrayRef<PME_T>(tmp1, tmp1 + roundUpToMultipleOfFactor<c_simdWidth>(kxend)),
                                 ArrayRef<PME_T>(tmp2, tmp2 + roundUpToMultipleOfFactor<c_simdWidth>(kxend)),
                                 ArrayRef<PME_T>(denom, denom + roundUpToMultipleOfFactor<c_simdWidth>(kxend)));
            for (kx = kxstart; kx < kxend; kx++)
            {
                m2k      = factor * m2[kx];
                eterm    = -((1.0 - 2.0 * m2k) * tmp1[kx] + 2.0 * m2k * tmp2[kx]);
                tmp1[kx] = eterm * denom[kx];
            }
            gcount = (bLB ? 7 : 1);
            for (ig = 0; ig < gcount; ++ig)
            {
                p0 = grid[ig] + iy * local_size[ZZ] * local_size[XX] + iz * local_size[XX];
                for (kx = kxstart; kx < kxend; kx++, p0++)
                {
                    d1 = p0->re;
                    d2 = p0->im;

                    p0->re = d1 * tmp1[kx];
                    p0->im = d2 * tmp1[kx];
                }
            }
        }
    }
    if (bEnerVir)
    {
        work->vir_lj[XX][XX] = 0.25 * virxx;
        work->vir_lj[YY][YY] = 0.25 * viryy;
        work->vir_lj[ZZ][ZZ] = 0.25 * virzz;
        work->vir_lj[XX][YY] = work->vir_lj[YY][XX] = 0.25 * virxy;
        work->vir_lj[XX][ZZ] = work->vir_lj[ZZ][XX] = 0.25 * virxz;
        work->vir_lj[YY][ZZ] = work->vir_lj[ZZ][YY] = 0.25 * viryz;
        /* This energy should be corrected for a charged system */
        work->energy_lj = 0.5 * energy;
    }
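    /* Note (illustration, not from the original source): the factor 0.5
     * plays the same role as in solve_pme_yzx(); the doubling for the
     * conjugate half of the grid is already carried in struct2, and the
     * charged-system remark is inherited from the Coulomb path.
     */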
    /* Return the loop count */
    return local_ndata[YY] * local_ndata[XX];
}