Fix PME order and grid size checks
[gromacs.git] / src / gromacs / ewald / pme.cpp
blob b2f33aec543f694892450c1ad9655b3f581df8a3
/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 * Copyright (c) 2001-2004, The GROMACS development team.
 * Copyright (c) 2013,2014,2015,2016,2017, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
/*! \internal \file
 *
 * \brief This file contains function definitions necessary for
 * computing energies and forces for the PME long-ranged part (Coulomb
 * and LJ).
 *
 * \author Erik Lindahl <erik@kth.se>
 * \author Berk Hess <hess@kth.se>
 * \ingroup module_ewald
 */
/* IMPORTANT FOR DEVELOPERS:
 *
 * Triclinic pme stuff isn't entirely trivial, and we've experienced
 * some bugs during development (many of them due to me). To avoid
 * this in the future, please check the following things if you make
 * changes in this file:
 *
 * 1. You should obtain identical (at least to the PME precision)
 *    energies, forces, and virial for
 *    a rectangular box and a triclinic one where the z (or y) axis is
 *    tilted a whole box side. For instance you could use these boxes:
 *
 *    rectangular       triclinic
 *     2  0  0           2  0  0
 *     0  2  0           0  2  0
 *     0  0  6           2  2  6
 *
 * 2. You should check the energy conservation in a triclinic box.
 *
 * It might seem an overkill, but better safe than sorry.
 * /Erik 001109
 */
#include "gmxpre.h"

#include "pme.h"

#include "config.h"

#include <assert.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <algorithm>
#include <cmath>

#include "gromacs/fft/parallel_3dfft.h"
#include "gromacs/fileio/pdbio.h"
#include "gromacs/gmxlib/network.h"
#include "gromacs/gmxlib/nrnb.h"
#include "gromacs/math/gmxcomplex.h"
#include "gromacs/math/invertmatrix.h"
#include "gromacs/math/units.h"
#include "gromacs/math/vec.h"
#include "gromacs/math/vectypes.h"
#include "gromacs/mdtypes/commrec.h"
#include "gromacs/mdtypes/forcerec.h"
#include "gromacs/mdtypes/inputrec.h"
#include "gromacs/mdtypes/md_enums.h"
#include "gromacs/pbcutil/pbc.h"
#include "gromacs/timing/cyclecounter.h"
#include "gromacs/timing/wallcycle.h"
#include "gromacs/timing/walltime_accounting.h"
#include "gromacs/utility/basedefinitions.h"
#include "gromacs/utility/exceptions.h"
#include "gromacs/utility/fatalerror.h"
#include "gromacs/utility/gmxmpi.h"
#include "gromacs/utility/gmxomp.h"
#include "gromacs/utility/real.h"
#include "gromacs/utility/smalloc.h"

#include "calculate-spline-moduli.h"
#include "pme-gather.h"
#include "pme-grid.h"
#include "pme-internal.h"
#include "pme-redistribute.h"
#include "pme-solve.h"
#include "pme-spline-work.h"
#include "pme-spread.h"
/*! \brief Number of bytes in a cache line.
 *
 * Must also be a multiple of the SIMD and SIMD4 register size, to
 * preserve alignment.
 */
const int gmxCacheLineSize = 64;
//! Set up coordinate communication
static void setup_coordinate_communication(pme_atomcomm_t *atc)
{
    int nslab, n, i;
    int fw, bw;

    nslab = atc->nslab;

    n = 0;
    for (i = 1; i <= nslab/2; i++)
    {
        fw = (atc->nodeid + i) % nslab;
        bw = (atc->nodeid - i + nslab) % nslab;
        if (n < nslab - 1)
        {
            atc->node_dest[n] = fw;
            atc->node_src[n]  = bw;
            n++;
        }
        if (n < nslab - 1)
        {
            atc->node_dest[n] = bw;
            atc->node_src[n]  = fw;
            n++;
        }
    }
}
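/* Editor's illustration (not part of the original sources): the loop above
 * orders the slab ring so that nearest neighbours come first. For example,
 * with nslab = 4 and nodeid = 0 it yields
 *     node_dest = {1, 3, 2} and node_src = {3, 1, 2},
 * i.e. the +1/-1 neighbours are exchanged first and the opposite slab last.
 */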
/*! \brief Round \p n up to the next multiple of \p f */
static int mult_up(int n, int f)
{
    return ((n + f - 1)/f)*f;
}
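/* For example, mult_up(25, 4) == 28 while mult_up(24, 4) == 24; the padding
 * it introduces is what estimate_pme_load_imbalance() below charges against
 * the FFT and solve cost.
 */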
/*! \brief Return estimate of the load imbalance from the PME grid not being a good match for the number of PME ranks */
static double estimate_pme_load_imbalance(struct gmx_pme_t *pme)
{
    int    nma, nmi;
    double n1, n2, n3;

    nma = pme->nnodes_major;
    nmi = pme->nnodes_minor;

    n1 = mult_up(pme->nkx, nma)*mult_up(pme->nky, nmi)*pme->nkz;
    n2 = mult_up(pme->nkx, nma)*mult_up(pme->nkz, nmi)*pme->nky;
    n3 = mult_up(pme->nky, nma)*mult_up(pme->nkz, nmi)*pme->nkx;

    /* pme_solve is roughly double the cost of an fft */

    return (n1 + n2 + 3*n3)/(double)(6*pme->nkx*pme->nky*pme->nkz);
}
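/* Editor's note: when the grid divides evenly over the ranks in all three
 * transposition states, n1 == n2 == n3 == nkx*nky*nkz and the estimate takes
 * its minimum value of 5/6; gmx_pme_init() below prints a note once the
 * estimate reaches 1.2.
 */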
/*! \brief Initialize atom communication data structure */
static void init_atomcomm(struct gmx_pme_t *pme, pme_atomcomm_t *atc,
                          int dimind, gmx_bool bSpread)
{
    int thread;

    atc->dimind    = dimind;
    atc->nslab     = 1;
    atc->nodeid    = 0;
    atc->pd_nalloc = 0;
#if GMX_MPI
    if (pme->nnodes > 1)
    {
        atc->mpi_comm = pme->mpi_comm_d[dimind];
        MPI_Comm_size(atc->mpi_comm, &atc->nslab);
        MPI_Comm_rank(atc->mpi_comm, &atc->nodeid);
    }
    if (debug)
    {
        fprintf(debug, "For PME atom communication in dimind %d: nslab %d rank %d\n", atc->dimind, atc->nslab, atc->nodeid);
    }
#endif

    atc->bSpread   = bSpread;
    atc->pme_order = pme->pme_order;

    if (atc->nslab > 1)
    {
        snew(atc->node_dest, atc->nslab);
        snew(atc->node_src, atc->nslab);
        setup_coordinate_communication(atc);

        snew(atc->count_thread, pme->nthread);
        for (thread = 0; thread < pme->nthread; thread++)
        {
            snew(atc->count_thread[thread], atc->nslab);
        }
        atc->count = atc->count_thread[0];
        snew(atc->rcount, atc->nslab);
        snew(atc->buf_index, atc->nslab);
    }

    atc->nthread = pme->nthread;
    if (atc->nthread > 1)
    {
        snew(atc->thread_plist, atc->nthread);
    }
    snew(atc->spline, atc->nthread);
    for (thread = 0; thread < atc->nthread; thread++)
    {
        if (atc->nthread > 1)
        {
            snew(atc->thread_plist[thread].n, atc->nthread+2*gmxCacheLineSize);
            atc->thread_plist[thread].n += gmxCacheLineSize;
        }
    }
}
/*! \brief Destroy an atom communication data structure and its child structs */
static void destroy_atomcomm(pme_atomcomm_t *atc)
{
    sfree(atc->pd);
    if (atc->nslab > 1)
    {
        sfree(atc->node_dest);
        sfree(atc->node_src);
        for (int i = 0; i < atc->nthread; i++)
        {
            sfree(atc->count_thread[i]);
        }
        sfree(atc->count_thread);
        sfree(atc->rcount);
        sfree(atc->buf_index);
    }
    sfree(atc->x);
    sfree(atc->coefficient);
    sfree(atc->f);

    sfree(atc->idx);
    sfree(atc->fractx);

    sfree(atc->thread_idx);
    for (int i = 0; i < atc->nthread; i++)
    {
        if (atc->nthread > 1)
        {
            int *n_ptr = atc->thread_plist[i].n - gmxCacheLineSize;
            sfree(n_ptr);
            sfree(atc->thread_plist[i].i);
        }
        sfree(atc->spline[i].ind);
    }
    if (atc->nthread > 1)
    {
        sfree(atc->thread_plist);
    }
    sfree(atc->spline);
}
/*! \brief Initialize data structure for communication */
static void
init_overlap_comm(pme_overlap_t *ol,
                  int            norder,
#if GMX_MPI
                  MPI_Comm       comm,
#endif
                  int            nnodes,
                  int            nodeid,
                  int            ndata,
                  int            commplainsize)
{
    int              b, i;
    pme_grid_comm_t *pgc;
    gmx_bool         bCont;
    int              fft_start, fft_end, send_index1, recv_index1;
#if GMX_MPI
    MPI_Status       stat;

    ol->mpi_comm = comm;
#endif

    ol->nnodes = nnodes;
    ol->nodeid = nodeid;

    /* Linear translation of the PME grid won't affect reciprocal space
     * calculations, so to optimize we only interpolate "upwards",
     * which also means we only have to consider overlap in one direction.
     * I.e., particles on this node might also be spread to grid indices
     * that belong to higher nodes (modulo nnodes)
     */

    snew(ol->s2g0, ol->nnodes+1);
    snew(ol->s2g1, ol->nnodes);
    if (debug)
    {
        fprintf(debug, "PME slab boundaries:");
    }
    for (i = 0; i < nnodes; i++)
    {
        /* s2g0 the local interpolation grid start.
         * s2g1 the local interpolation grid end.
         * Since in calc_pidx we divide particles, and not grid lines,
         * spatially uniform along dimension x or y, we need to round
         * s2g0 down and s2g1 up.
         */
        ol->s2g0[i] = ( i   *ndata + 0       )/nnodes;
        ol->s2g1[i] = ((i+1)*ndata + nnodes-1)/nnodes + norder - 1;

        if (debug)
        {
            fprintf(debug, " %3d %3d", ol->s2g0[i], ol->s2g1[i]);
        }
    }
    ol->s2g0[nnodes] = ndata;
    if (debug)
    {
        fprintf(debug, "\n");
    }
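    /* Worked example (editor's illustration): with nnodes = 3,
     * ndata = 10 grid lines and norder = 4 the loop above yields
     *     s2g0 = {0, 3, 6, 10} and s2g1 = {7, 10, 13},
     * so each slab spreads norder-1 lines past its last owned grid line,
     * wrapping modulo ndata onto the next slab(s).
     */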
    /* Determine with how many nodes we need to communicate the grid overlap */
    b = 0;
    do
    {
        b++;
        bCont = FALSE;
        for (i = 0; i < nnodes; i++)
        {
            if ((i+b <  nnodes && ol->s2g1[i] > ol->s2g0[i+b]) ||
                (i+b >= nnodes && ol->s2g1[i] > ol->s2g0[i+b-nnodes] + ndata))
            {
                bCont = TRUE;
            }
        }
    }
    while (bCont && b < nnodes);
    ol->noverlap_nodes = b - 1;

    snew(ol->send_id, ol->noverlap_nodes);
    snew(ol->recv_id, ol->noverlap_nodes);
    for (b = 0; b < ol->noverlap_nodes; b++)
    {
        ol->send_id[b] = (ol->nodeid + (b + 1)) % ol->nnodes;
        ol->recv_id[b] = (ol->nodeid - (b + 1) + ol->nnodes) % ol->nnodes;
    }
    snew(ol->comm_data, ol->noverlap_nodes);

    ol->send_size = 0;
    for (b = 0; b < ol->noverlap_nodes; b++)
    {
        pgc = &ol->comm_data[b];
        /* Send */
        fft_start = ol->s2g0[ol->send_id[b]];
        fft_end   = ol->s2g0[ol->send_id[b]+1];
        if (ol->send_id[b] < nodeid)
        {
            fft_start += ndata;
            fft_end   += ndata;
        }
        send_index1       = ol->s2g1[nodeid];
        send_index1       = std::min(send_index1, fft_end);
        pgc->send_index0  = fft_start;
        pgc->send_nindex  = std::max(0, send_index1 - pgc->send_index0);
        ol->send_size    += pgc->send_nindex;

        /* We always start receiving to the first index of our slab */
        fft_start   = ol->s2g0[ol->nodeid];
        fft_end     = ol->s2g0[ol->nodeid+1];
        recv_index1 = ol->s2g1[ol->recv_id[b]];
        if (ol->recv_id[b] > nodeid)
        {
            recv_index1 -= ndata;
        }
        recv_index1      = std::min(recv_index1, fft_end);
        pgc->recv_index0 = fft_start;
        pgc->recv_nindex = std::max(0, recv_index1 - pgc->recv_index0);
    }

#if GMX_MPI
    /* Communicate the buffer sizes to receive */
    for (b = 0; b < ol->noverlap_nodes; b++)
    {
        MPI_Sendrecv(&ol->send_size, 1, MPI_INT, ol->send_id[b], b,
                     &ol->comm_data[b].recv_size, 1, MPI_INT, ol->recv_id[b], b,
                     ol->mpi_comm, &stat);
    }
#endif

    /* For a non-divisible grid we need pme_order instead of pme_order-1 */
    snew(ol->sendbuf, norder*commplainsize);
    snew(ol->recvbuf, norder*commplainsize);
}
/*! \brief Destroy data structure for communication */
static void
destroy_overlap_comm(const pme_overlap_t *ol)
{
    sfree(ol->s2g0);
    sfree(ol->s2g1);
    sfree(ol->send_id);
    sfree(ol->recv_id);
    sfree(ol->comm_data);
    sfree(ol->sendbuf);
    sfree(ol->recvbuf);
}
int minimalPmeGridSize(int pmeOrder)
{
    /* The actual grid size limitations are:
     *   serial:        >= pme_order
     *   DD, no OpenMP: >= 2*(pme_order - 1)
     *   DD, OpenMP:    >= pme_order + 1
     * But we use the maximum for simplicity since in practice there is not
     * much performance difference between pme_order and 2*(pme_order - 1).
     */
    int minimalSize = 2*(pmeOrder - 1);

    GMX_RELEASE_ASSERT(pmeOrder >= 3, "pmeOrder has to be >= 3");
    GMX_RELEASE_ASSERT(minimalSize >= pmeOrder + 1, "The grid size should be >= pmeOrder + 1");

    return minimalSize;
}
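/* For example, the common default pme_order = 4 gives a minimal grid size of
 * 2*(4 - 1) = 6 lines in each dimension.
 */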
void gmx_pme_check_restrictions(int pme_order,
                                int nkx, int nky, int nkz,
                                int nnodes_major,
                                gmx_bool bUseThreads,
                                gmx_bool bFatal,
                                gmx_bool *bValidSettings)
{
    if (pme_order > PME_ORDER_MAX)
    {
        if (!bFatal)
        {
            *bValidSettings = FALSE;
            return;
        }
        gmx_fatal(FARGS, "pme_order (%d) is larger than the maximum allowed value (%d). Modify and recompile the code if you really need such a high order.",
                  pme_order, PME_ORDER_MAX);
    }

    const int minGridSize = minimalPmeGridSize(pme_order);
    if (nkx < minGridSize ||
        nky < minGridSize ||
        nkz < minGridSize)
    {
        if (!bFatal)
        {
            *bValidSettings = FALSE;
            return;
        }
        gmx_fatal(FARGS, "The PME grid sizes need to be >= 2*(pme_order-1) (%d)",
                  minGridSize);
    }

    /* Check for a limitation of the (current) sum_fftgrid_dd code.
     * We only allow multiple communication pulses in dim 1, not in dim 0.
     */
    if (bUseThreads && (nkx < nnodes_major*pme_order &&
                        nkx != nnodes_major*(pme_order - 1)))
    {
        if (!bFatal)
        {
            *bValidSettings = FALSE;
            return;
        }
        gmx_fatal(FARGS, "The number of PME grid lines per rank along x is %g. But when using OpenMP threads, the number of grid lines per rank along x should be >= pme_order (%d) or = pme_order-1. To resolve this issue, use fewer ranks along x (and possibly more along y and/or z) by specifying -dd manually.",
                  nkx/(double)nnodes_major, pme_order);
    }

    if (bValidSettings != nullptr)
    {
        *bValidSettings = TRUE;
    }

    return;
}
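/* Usage sketch (editor's illustration with made-up values): callers that want
 * a verdict instead of a fatal error pass bFatal = FALSE and read the result
 * back, e.g.
 *
 *     gmx_bool valid;
 *     gmx_pme_check_restrictions(4, 20, 20, 20, 1, FALSE, FALSE, &valid);
 *     // valid == TRUE here; a 4x4x4 grid would instead give FALSE,
 *     // since minimalPmeGridSize(4) == 6.
 */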
/*! \brief Return \p numerator divided by \p denominator, rounded up to the next integer */
static int div_round_up(int numerator, int denominator)
{
    return (numerator + denominator - 1)/denominator;
}
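/* For example, div_round_up(10, 4) == 3; it is used below to size the
 * per-rank overlap communication buffers.
 */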
int gmx_pme_init(struct gmx_pme_t **pmedata,
                 t_commrec *        cr,
                 int                nnodes_major,
                 int                nnodes_minor,
                 t_inputrec *       ir,
                 int                homenr,
                 gmx_bool           bFreeEnergy_q,
                 gmx_bool           bFreeEnergy_lj,
                 gmx_bool           bReproducible,
                 real               ewaldcoeff_q,
                 real               ewaldcoeff_lj,
                 int                nthread)
{
    struct gmx_pme_t *pme = nullptr;

    int               use_threads, sum_use_threads, i;
    ivec              ndata;

    if (debug)
    {
        fprintf(debug, "Creating PME data structures.\n");
    }
    snew(pme, 1);

    pme->sum_qgrid_tmp    = nullptr;
    pme->sum_qgrid_dd_tmp = nullptr;
    pme->buf_nalloc       = 0;

    pme->nnodes  = 1;
    pme->bPPnode = TRUE;

    pme->nnodes_major = nnodes_major;
    pme->nnodes_minor = nnodes_minor;

#if GMX_MPI
    if (nnodes_major*nnodes_minor > 1)
    {
        pme->mpi_comm = cr->mpi_comm_mygroup;

        MPI_Comm_rank(pme->mpi_comm, &pme->nodeid);
        MPI_Comm_size(pme->mpi_comm, &pme->nnodes);
        if (pme->nnodes != nnodes_major*nnodes_minor)
        {
            gmx_incons("PME rank count mismatch");
        }
    }
    else
    {
        pme->mpi_comm = MPI_COMM_NULL;
    }
#endif

    if (pme->nnodes == 1)
    {
#if GMX_MPI
        pme->mpi_comm_d[0] = pme->mpi_comm_d[1] = MPI_COMM_NULL;
#endif
        pme->ndecompdim   = 0;
        pme->nodeid_major = 0;
        pme->nodeid_minor = 0;
    }
    else
    {
        if (nnodes_minor == 1)
        {
#if GMX_MPI
            pme->mpi_comm_d[0] = pme->mpi_comm;
            pme->mpi_comm_d[1] = MPI_COMM_NULL;
#endif
            pme->ndecompdim   = 1;
            pme->nodeid_major = pme->nodeid;
            pme->nodeid_minor = 0;
        }
        else if (nnodes_major == 1)
        {
#if GMX_MPI
            pme->mpi_comm_d[0] = MPI_COMM_NULL;
            pme->mpi_comm_d[1] = pme->mpi_comm;
#endif
            pme->ndecompdim   = 1;
            pme->nodeid_major = 0;
            pme->nodeid_minor = pme->nodeid;
        }
        else
        {
            if (pme->nnodes % nnodes_major != 0)
            {
                gmx_incons("For 2D PME decomposition, #PME ranks must be divisible by the number of ranks in the major dimension");
            }
            pme->ndecompdim = 2;

#if GMX_MPI
            MPI_Comm_split(pme->mpi_comm, pme->nodeid % nnodes_minor,
                           pme->nodeid, &pme->mpi_comm_d[0]);  /* My communicator along major dimension */
            MPI_Comm_split(pme->mpi_comm, pme->nodeid/nnodes_minor,
                           pme->nodeid, &pme->mpi_comm_d[1]);  /* My communicator along minor dimension */

            MPI_Comm_rank(pme->mpi_comm_d[0], &pme->nodeid_major);
            MPI_Comm_size(pme->mpi_comm_d[0], &pme->nnodes_major);
            MPI_Comm_rank(pme->mpi_comm_d[1], &pme->nodeid_minor);
            MPI_Comm_size(pme->mpi_comm_d[1], &pme->nnodes_minor);
#endif
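            /* Editor's illustration: with 6 PME ranks split as
             * nnodes_major = 3 and nnodes_minor = 2, the color arguments
             * above group ranks {0,2,4} and {1,3,5} into the two
             * major-dimension communicators, and {0,1}, {2,3}, {4,5} into
             * the three minor-dimension communicators.
             */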
        }
        pme->bPPnode = (cr->duty & DUTY_PP);
    }

    pme->nthread = nthread;

    /* Check if any of the PME MPI ranks uses threads */
    use_threads = (pme->nthread > 1 ? 1 : 0);
#if GMX_MPI
    if (pme->nnodes > 1)
    {
        MPI_Allreduce(&use_threads, &sum_use_threads, 1, MPI_INT,
                      MPI_SUM, pme->mpi_comm);
    }
    else
#endif
    {
        sum_use_threads = use_threads;
    }
    pme->bUseThreads = (sum_use_threads > 0);

    if (ir->ePBC == epbcSCREW)
    {
        gmx_fatal(FARGS, "pme does not (yet) work with pbc = screw");
    }

    /* NOTE:
     * It is likely that the current gmx_pme_do() routine supports calculating
     * only Coulomb or LJ while gmx_pme_init() configures for both,
     * but that has never been tested.
     * It is likely that the current gmx_pme_do() routine supports skipping
     * the free-energy calculation for Coulomb and/or LJ even when
     * gmx_pme_init() configures with free-energy, but that has never been
     * tested.
     */
    pme->doCoulomb     = EEL_PME(ir->coulombtype);
    pme->doLJ          = EVDW_PME(ir->vdwtype);
    pme->bFEP_q        = ((ir->efep != efepNO) && bFreeEnergy_q);
    pme->bFEP_lj       = ((ir->efep != efepNO) && bFreeEnergy_lj);
    pme->bFEP          = (pme->bFEP_q || pme->bFEP_lj);
    pme->nkx           = ir->nkx;
    pme->nky           = ir->nky;
    pme->nkz           = ir->nkz;
    pme->bP3M          = (ir->coulombtype == eelP3M_AD || getenv("GMX_PME_P3M") != nullptr);
    pme->pme_order     = ir->pme_order;
    pme->ewaldcoeff_q  = ewaldcoeff_q;
    pme->ewaldcoeff_lj = ewaldcoeff_lj;

    /* Always constant electrostatics coefficients */
    pme->epsilon_r = ir->epsilon_r;

    /* Always constant LJ coefficients */
    pme->ljpme_combination_rule = ir->ljpme_combination_rule;

    /* If we violate restrictions, generate a fatal error here */
    gmx_pme_check_restrictions(pme->pme_order,
                               pme->nkx, pme->nky, pme->nkz,
                               pme->nnodes_major,
                               pme->bUseThreads,
                               TRUE,
                               nullptr);

    if (pme->nnodes > 1)
    {
        double imbal;

#if GMX_MPI
        MPI_Type_contiguous(DIM, GMX_MPI_REAL, &(pme->rvec_mpi));
        MPI_Type_commit(&(pme->rvec_mpi));
#endif

        /* Note that the coefficient spreading and force gathering, which usually
         * takes about the same amount of time as FFT+solve_pme,
         * is always fully load balanced
         * (unless the coefficient distribution is inhomogeneous).
         */

        imbal = estimate_pme_load_imbalance(pme);
        if (imbal >= 1.2 && pme->nodeid_major == 0 && pme->nodeid_minor == 0)
        {
            fprintf(stderr,
                    "\n"
                    "NOTE: The load imbalance in PME FFT and solve is %d%%.\n"
                    "      For optimal PME load balancing\n"
                    "      PME grid_x (%d) and grid_y (%d) should be divisible by #PME_ranks_x (%d)\n"
                    "      and PME grid_y (%d) and grid_z (%d) should be divisible by #PME_ranks_y (%d)\n"
                    "\n",
                    (int)((imbal-1)*100 + 0.5),
                    pme->nkx, pme->nky, pme->nnodes_major,
                    pme->nky, pme->nkz, pme->nnodes_minor);
        }
    }

    /* For a non-divisible grid we need pme_order instead of pme_order-1 */
    /* In sum_qgrid_dd x overlap is copied in place: take padding into account.
     * y is always copied through a buffer: we don't need padding in z,
     * but we do need the overlap in x because of the communication order.
     */
    init_overlap_comm(&pme->overlap[0], pme->pme_order,
#if GMX_MPI
                      pme->mpi_comm_d[0],
#endif
                      pme->nnodes_major, pme->nodeid_major,
                      pme->nkx,
                      (div_round_up(pme->nky, pme->nnodes_minor)+pme->pme_order)*(pme->nkz+pme->pme_order-1));

    /* Along overlap dim 1 we can send in multiple pulses in sum_fftgrid_dd.
     * We do this with an offset buffer of equal size, so we need to allocate
     * extra for the offset. That's what the (+1)*pme->nkz is for.
     */
    init_overlap_comm(&pme->overlap[1], pme->pme_order,
#if GMX_MPI
                      pme->mpi_comm_d[1],
#endif
                      pme->nnodes_minor, pme->nodeid_minor,
                      pme->nky,
                      (div_round_up(pme->nkx, pme->nnodes_major)+pme->pme_order+1)*pme->nkz);

    /* Double-check for a limitation of the (current) sum_fftgrid_dd code.
     * Note that gmx_pme_check_restrictions checked for this already.
     */
    if (pme->bUseThreads && pme->overlap[0].noverlap_nodes > 1)
    {
        gmx_incons("More than one communication pulse required for grid overlap communication along the major dimension while using threads");
    }

    snew(pme->bsp_mod[XX], pme->nkx);
    snew(pme->bsp_mod[YY], pme->nky);
    snew(pme->bsp_mod[ZZ], pme->nkz);

    /* The required size of the interpolation grid, including overlap.
     * The allocated size (pmegrid_n?) might be slightly larger.
     */
    pme->pmegrid_nx      = pme->overlap[0].s2g1[pme->nodeid_major] -
                           pme->overlap[0].s2g0[pme->nodeid_major];
    pme->pmegrid_ny      = pme->overlap[1].s2g1[pme->nodeid_minor] -
                           pme->overlap[1].s2g0[pme->nodeid_minor];
    pme->pmegrid_nz_base = pme->nkz;
    pme->pmegrid_nz      = pme->pmegrid_nz_base + pme->pme_order - 1;
    set_grid_alignment(&pme->pmegrid_nz, pme->pme_order);

    pme->pmegrid_start_ix = pme->overlap[0].s2g0[pme->nodeid_major];
    pme->pmegrid_start_iy = pme->overlap[1].s2g0[pme->nodeid_minor];
    pme->pmegrid_start_iz = 0;

    make_gridindex_to_localindex(pme->nkx,
                                 pme->pmegrid_start_ix,
                                 pme->pmegrid_nx - (pme->pme_order-1),
                                 &pme->nnx, &pme->fshx);
    make_gridindex_to_localindex(pme->nky,
                                 pme->pmegrid_start_iy,
                                 pme->pmegrid_ny - (pme->pme_order-1),
                                 &pme->nny, &pme->fshy);
    make_gridindex_to_localindex(pme->nkz,
                                 pme->pmegrid_start_iz,
                                 pme->pmegrid_nz_base,
                                 &pme->nnz, &pme->fshz);

    pme->spline_work = make_pme_spline_work(pme->pme_order);

    ndata[0] = pme->nkx;
    ndata[1] = pme->nky;
    ndata[2] = pme->nkz;
    /* It doesn't matter if we allocate too many grids here,
     * we only allocate and use the ones we need.
     */
    if (pme->doLJ)
    {
        pme->ngrids = ((ir->ljpme_combination_rule == eljpmeLB) ? DO_Q_AND_LJ_LB : DO_Q_AND_LJ);
    }
    else
    {
        pme->ngrids = DO_Q;
    }
    snew(pme->fftgrid, pme->ngrids);
    snew(pme->cfftgrid, pme->ngrids);
    snew(pme->pfft_setup, pme->ngrids);

    for (i = 0; i < pme->ngrids; ++i)
    {
        if ((i <  DO_Q && pme->doCoulomb && (i == 0 ||
                                             bFreeEnergy_q)) ||
            (i >= DO_Q && pme->doLJ && (i == 2 ||
                                        bFreeEnergy_lj ||
                                        ir->ljpme_combination_rule == eljpmeLB)))
        {
            pmegrids_init(&pme->pmegrid[i],
                          pme->pmegrid_nx, pme->pmegrid_ny, pme->pmegrid_nz,
                          pme->pmegrid_nz_base,
                          pme->pme_order,
                          pme->bUseThreads,
                          pme->nthread,
                          pme->overlap[0].s2g1[pme->nodeid_major]-pme->overlap[0].s2g0[pme->nodeid_major+1],
                          pme->overlap[1].s2g1[pme->nodeid_minor]-pme->overlap[1].s2g0[pme->nodeid_minor+1]);
            /* This routine will allocate the grid data to fit the FFTs */
            gmx_parallel_3dfft_init(&pme->pfft_setup[i], ndata,
                                    &pme->fftgrid[i], &pme->cfftgrid[i],
                                    pme->mpi_comm_d,
                                    bReproducible, pme->nthread);
        }
    }

    if (!pme->bP3M)
    {
        /* Use plain SPME B-spline interpolation */
        make_bspline_moduli(pme->bsp_mod, pme->nkx, pme->nky, pme->nkz, pme->pme_order);
    }
    else
    {
        /* Use the P3M grid-optimized influence function */
        make_p3m_bspline_moduli(pme->bsp_mod, pme->nkx, pme->nky, pme->nkz, pme->pme_order);
    }

    /* Use atc[0] for spreading */
    init_atomcomm(pme, &pme->atc[0], nnodes_major > 1 ? 0 : 1, TRUE);
    if (pme->ndecompdim >= 2)
    {
        init_atomcomm(pme, &pme->atc[1], 1, FALSE);
    }

    if (pme->nnodes == 1)
    {
        pme->atc[0].n = homenr;
        pme_realloc_atomcomm_things(&pme->atc[0]);
    }

    pme->lb_buf1       = nullptr;
    pme->lb_buf2       = nullptr;
    pme->lb_buf_nalloc = 0;

    pme_init_all_work(&pme->solve_work, pme->nthread, pme->nkx);

    *pmedata = pme;

    return 0;
}
int gmx_pme_reinit(struct gmx_pme_t **pmedata,
                   t_commrec *        cr,
                   struct gmx_pme_t * pme_src,
                   const t_inputrec * ir,
                   ivec               grid_size,
                   real               ewaldcoeff_q,
                   real               ewaldcoeff_lj)
{
    t_inputrec irc;
    int        homenr;
    int        ret;

    irc     = *ir;
    irc.nkx = grid_size[XX];
    irc.nky = grid_size[YY];
    irc.nkz = grid_size[ZZ];

    if (pme_src->nnodes == 1)
    {
        homenr = pme_src->atc[0].n;
    }
    else
    {
        homenr = -1;
    }

    ret = gmx_pme_init(pmedata, cr, pme_src->nnodes_major, pme_src->nnodes_minor,
                       &irc, homenr, pme_src->bFEP_q, pme_src->bFEP_lj, FALSE, ewaldcoeff_q, ewaldcoeff_lj, pme_src->nthread);

    if (ret == 0)
    {
        /* We can easily reuse the allocated pme grids in pme_src */
        reuse_pmegrids(&pme_src->pmegrid[PME_GRID_QA], &(*pmedata)->pmegrid[PME_GRID_QA]);
        /* We would like to reuse the fft grids, but that's harder */
    }

    return ret;
}
void gmx_pme_calc_energy(struct gmx_pme_t *pme, int n, rvec *x, real *q, real *V)
{
    pme_atomcomm_t *atc;
    pmegrids_t     *grid;

    if (pme->nnodes > 1)
    {
        gmx_incons("gmx_pme_calc_energy called in parallel");
    }
    if (pme->bFEP_q)
    {
        gmx_incons("gmx_pme_calc_energy with free energy");
    }

    atc          = &pme->atc_energy;
    atc->nthread = 1;
    if (atc->spline == nullptr)
    {
        snew(atc->spline, atc->nthread);
    }
    atc->nslab     = 1;
    atc->bSpread   = TRUE;
    atc->pme_order = pme->pme_order;
    atc->n         = n;
    pme_realloc_atomcomm_things(atc);
    atc->x           = x;
    atc->coefficient = q;

    /* We only use the A-charges grid */
    grid = &pme->pmegrid[PME_GRID_QA];

    /* Only calculate the spline coefficients, don't actually spread */
    spread_on_grid(pme, atc, nullptr, TRUE, FALSE, pme->fftgrid[PME_GRID_QA], FALSE, PME_GRID_QA);

    *V = gather_energy_bsplines(pme, grid->grid.grid, atc);
}
/*! \brief Calculate initial Lorentz-Berthelot coefficients for LJ-PME */
static void
calc_initial_lb_coeffs(struct gmx_pme_t *pme, real *local_c6, real *local_sigma)
{
    int i;
    for (i = 0; i < pme->atc[0].n; ++i)
    {
        real sigma4;
        sigma4                     = local_sigma[i];
        sigma4                     = sigma4*sigma4;
        sigma4                     = sigma4*sigma4;
        pme->atc[0].coefficient[i] = local_c6[i] / sigma4;
    }
}

/*! \brief Calculate next Lorentz-Berthelot coefficients for LJ-PME */
static void
calc_next_lb_coeffs(struct gmx_pme_t *pme, real *local_sigma)
{
    int i;

    for (i = 0; i < pme->atc[0].n; ++i)
    {
        pme->atc[0].coefficient[i] *= local_sigma[i];
    }
}
int gmx_pme_do(struct gmx_pme_t *pme,
               int start,       int homenr,
               rvec x[],        rvec f[],
               real chargeA[],  real chargeB[],
               real c6A[],      real c6B[],
               real sigmaA[],   real sigmaB[],
               matrix box,      t_commrec *cr,
               int maxshift_x,  int maxshift_y,
               t_nrnb *nrnb,    gmx_wallcycle_t wcycle,
               matrix vir_q,    matrix vir_lj,
               real *energy_q,  real *energy_lj,
               real lambda_q,   real lambda_lj,
               real *dvdlambda_q, real *dvdlambda_lj,
               int flags)
{
    int                  d, i, j, npme, grid_index, max_grid_index;
    int                  n_d;
    pme_atomcomm_t      *atc         = nullptr;
    pmegrids_t          *pmegrid     = nullptr;
    real                *grid        = nullptr;
    rvec                *f_d;
    real                *coefficient = nullptr;
    real                 energy_AB[4];
    matrix               vir_AB[4];
    real                 scale, lambda;
    gmx_bool             bClearF;
    gmx_parallel_3dfft_t pfft_setup;
    real                *fftgrid;
    t_complex           *cfftgrid;
    int                  thread;
    gmx_bool             bFirst, bDoSplines;
    int                  fep_state;
    int                  fep_states_lj = pme->bFEP_lj ? 2 : 1;
    const gmx_bool       bCalcEnerVir  = flags & GMX_PME_CALC_ENER_VIR;
    const gmx_bool       bBackFFT      = flags & (GMX_PME_CALC_F | GMX_PME_CALC_POT);
    const gmx_bool       bCalcF        = flags & GMX_PME_CALC_F;
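    /* Editor's note: flags is a bitmask of the GMX_PME_* constants decoded
     * above; a typical full force/energy step would pass something like
     * GMX_PME_SPREAD | GMX_PME_SOLVE | GMX_PME_CALC_F | GMX_PME_CALC_ENER_VIR.
     */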
    assert(pme->nnodes > 0);
    assert(pme->nnodes == 1 || pme->ndecompdim > 0);

    if (pme->nnodes > 1)
    {
        atc      = &pme->atc[0];
        atc->npd = homenr;
        if (atc->npd > atc->pd_nalloc)
        {
            atc->pd_nalloc = over_alloc_dd(atc->npd);
            srenew(atc->pd, atc->pd_nalloc);
        }
        for (d = pme->ndecompdim-1; d >= 0; d--)
        {
            atc           = &pme->atc[d];
            atc->maxshift = (atc->dimind == 0 ? maxshift_x : maxshift_y);
        }
    }
    else
    {
        atc = &pme->atc[0];
        /* This could be necessary for TPI */
        pme->atc[0].n = homenr;
        if (DOMAINDECOMP(cr))
        {
            pme_realloc_atomcomm_things(atc);
        }
        atc->x = x;
        atc->f = f;
    }

    gmx::invertBoxMatrix(box, pme->recipbox);
    bFirst = TRUE;

    /* For simplicity, we construct the splines for all particles if
     * more than one PME calculation is needed. Some optimization
     * could be done by keeping track of which atoms have splines
     * constructed, and construct new splines on each pass for atoms
     * that don't yet have them.
     */

    bDoSplines = pme->bFEP || (pme->doCoulomb && pme->doLJ);

    /* We need a maximum of four separate PME calculations:
     * grid_index=0: Coulomb PME with charges from state A
     * grid_index=1: Coulomb PME with charges from state B
     * grid_index=2: LJ PME with C6 from state A
     * grid_index=3: LJ PME with C6 from state B
     * For Lorentz-Berthelot combination rules, a separate loop is used to
     * calculate all the terms
     */

    /* If we are doing LJ-PME with LB, we only do Q here */
    max_grid_index = (pme->ljpme_combination_rule == eljpmeLB) ? DO_Q : DO_Q_AND_LJ;

    for (grid_index = 0; grid_index < max_grid_index; ++grid_index)
    {
        /* Check if we should do calculations at this grid_index
         * If grid_index is odd we should be doing FEP
         * If grid_index < 2 we should be doing electrostatic PME
         * If grid_index >= 2 we should be doing LJ-PME
         */
        if ((grid_index <  DO_Q && (!pme->doCoulomb ||
                                    (grid_index == 1 && !pme->bFEP_q))) ||
            (grid_index >= DO_Q && (!pme->doLJ ||
                                    (grid_index == 3 && !pme->bFEP_lj))))
        {
            continue;
        }
        /* Unpack structure */
        pmegrid    = &pme->pmegrid[grid_index];
        fftgrid    = pme->fftgrid[grid_index];
        cfftgrid   = pme->cfftgrid[grid_index];
        pfft_setup = pme->pfft_setup[grid_index];
        switch (grid_index)
        {
            case 0: coefficient = chargeA + start; break;
            case 1: coefficient = chargeB + start; break;
            case 2: coefficient = c6A + start; break;
            case 3: coefficient = c6B + start; break;
        }

        grid = pmegrid->grid.grid;

        if (debug)
        {
            fprintf(debug, "PME: number of ranks = %d, rank = %d\n",
                    cr->nnodes, cr->nodeid);
            fprintf(debug, "Grid = %p\n", (void*)grid);
            if (grid == nullptr)
            {
                gmx_fatal(FARGS, "No grid!");
            }
        }
        where();

        if (pme->nnodes == 1)
        {
            atc->coefficient = coefficient;
        }
        else
        {
            wallcycle_start(wcycle, ewcPME_REDISTXF);
            do_redist_pos_coeffs(pme, cr, start, homenr, bFirst, x, coefficient);
            where();

            wallcycle_stop(wcycle, ewcPME_REDISTXF);
        }

        if (debug)
        {
            fprintf(debug, "Rank= %6d, pme local particles=%6d\n",
                    cr->nodeid, atc->n);
        }

        if (flags & GMX_PME_SPREAD)
        {
            wallcycle_start(wcycle, ewcPME_SPREADGATHER);

            /* Spread the coefficients on a grid */
            spread_on_grid(pme, &pme->atc[0], pmegrid, bFirst, TRUE, fftgrid, bDoSplines, grid_index);

            if (bFirst)
            {
                inc_nrnb(nrnb, eNR_WEIGHTS, DIM*atc->n);
            }
            inc_nrnb(nrnb, eNR_SPREADBSP,
                     pme->pme_order*pme->pme_order*pme->pme_order*atc->n);

            if (!pme->bUseThreads)
            {
                wrap_periodic_pmegrid(pme, grid);

                /* sum contributions to local grid from other nodes */
#if GMX_MPI
                if (pme->nnodes > 1)
                {
                    gmx_sum_qgrid_dd(pme, grid, GMX_SUM_GRID_FORWARD);
                    where();
                }
#endif

                copy_pmegrid_to_fftgrid(pme, grid, fftgrid, grid_index);
            }

            wallcycle_stop(wcycle, ewcPME_SPREADGATHER);

            /* TODO If the OpenMP and single-threaded implementations
               converge, then spread_on_grid() and
               copy_pmegrid_to_fftgrid() will perhaps live in the same
               source file and the following debugging function can live
               there too. */
            /*
               dump_local_fftgrid(pme, fftgrid);
               exit(0);
             */
        }

        /* Here we start a large thread parallel region */
#pragma omp parallel num_threads(pme->nthread) private(thread)
        {
            try
            {
                thread = gmx_omp_get_thread_num();
                if (flags & GMX_PME_SOLVE)
                {
                    int loop_count;

                    /* do 3d-fft */
                    if (thread == 0)
                    {
                        wallcycle_start(wcycle, ewcPME_FFT);
                    }
                    gmx_parallel_3dfft_execute(pfft_setup, GMX_FFT_REAL_TO_COMPLEX,
                                               thread, wcycle);
                    if (thread == 0)
                    {
                        wallcycle_stop(wcycle, ewcPME_FFT);
                    }
                    where();

                    /* solve in k-space for our local cells */
                    if (thread == 0)
                    {
                        wallcycle_start(wcycle, (grid_index < DO_Q ? ewcPME_SOLVE : ewcLJPME));
                    }
                    if (grid_index < DO_Q)
                    {
                        loop_count =
                            solve_pme_yzx(pme, cfftgrid,
                                          box[XX][XX]*box[YY][YY]*box[ZZ][ZZ],
                                          bCalcEnerVir,
                                          pme->nthread, thread);
                    }
                    else
                    {
                        loop_count =
                            solve_pme_lj_yzx(pme, &cfftgrid, FALSE,
                                             box[XX][XX]*box[YY][YY]*box[ZZ][ZZ],
                                             bCalcEnerVir,
                                             pme->nthread, thread);
                    }

                    if (thread == 0)
                    {
                        wallcycle_stop(wcycle, (grid_index < DO_Q ? ewcPME_SOLVE : ewcLJPME));
                        where();
                        inc_nrnb(nrnb, eNR_SOLVEPME, loop_count);
                    }
                }

                if (bBackFFT)
                {
                    /* do 3d-invfft */
                    if (thread == 0)
                    {
                        where();
                        wallcycle_start(wcycle, ewcPME_FFT);
                    }
                    gmx_parallel_3dfft_execute(pfft_setup, GMX_FFT_COMPLEX_TO_REAL,
                                               thread, wcycle);
                    if (thread == 0)
                    {
                        wallcycle_stop(wcycle, ewcPME_FFT);

                        where();

                        if (pme->nodeid == 0)
                        {
                            real ntot = pme->nkx*pme->nky*pme->nkz;
                            npme      = static_cast<int>(ntot*std::log(ntot)/std::log(2.0));
                            inc_nrnb(nrnb, eNR_FFT, 2*npme);
                        }

                        /* Note: this wallcycle region is closed below
                           outside an OpenMP region, so take care if
                           refactoring code here. */
                        wallcycle_start(wcycle, ewcPME_SPREADGATHER);
                    }

                    copy_fftgrid_to_pmegrid(pme, fftgrid, grid, grid_index, pme->nthread, thread);
                }
            } GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
        }
        /* End of thread parallel section.
         * With MPI we have to synchronize here before gmx_sum_qgrid_dd.
         */

        if (bBackFFT)
        {
            /* distribute local grid to all nodes */
#if GMX_MPI
            if (pme->nnodes > 1)
            {
                gmx_sum_qgrid_dd(pme, grid, GMX_SUM_GRID_BACKWARD);
            }
#endif
            where();

            unwrap_periodic_pmegrid(pme, grid);
        }

        if (bCalcF)
        {
            /* interpolate forces for our local atoms */

            where();

            /* If we are running without parallelization,
             * atc->f is the actual force array, not a buffer,
             * therefore we should not clear it.
             */
            lambda  = grid_index < DO_Q ? lambda_q : lambda_lj;
            bClearF = (bFirst && PAR(cr));
#pragma omp parallel for num_threads(pme->nthread) schedule(static)
            for (thread = 0; thread < pme->nthread; thread++)
            {
                try
                {
                    gather_f_bsplines(pme, grid, bClearF, atc,
                                      &atc->spline[thread],
                                      pme->bFEP ? (grid_index % 2 == 0 ? 1.0-lambda : lambda) : 1.0);
                }
                GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
            }

            where();

            inc_nrnb(nrnb, eNR_GATHERFBSP,
                     pme->pme_order*pme->pme_order*pme->pme_order*pme->atc[0].n);
            /* Note: this wallcycle region is opened above inside an OpenMP
               region, so take care if refactoring code here. */
            wallcycle_stop(wcycle, ewcPME_SPREADGATHER);
        }

        if (bCalcEnerVir)
        {
            /* This should only be called on the master thread
             * and after the threads have synchronized.
             */
            if (grid_index < 2)
            {
                get_pme_ener_vir_q(pme->solve_work, pme->nthread, &energy_AB[grid_index], vir_AB[grid_index]);
            }
            else
            {
                get_pme_ener_vir_lj(pme->solve_work, pme->nthread, &energy_AB[grid_index], vir_AB[grid_index]);
            }
        }
        bFirst = FALSE;
    } /* of grid_index-loop */
    /* For Lorentz-Berthelot combination rules in LJ-PME, we need to calculate
     * seven terms. */
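    /* Editor's note on the seven terms: with Lorentz-Berthelot mixing the
     * dispersion coefficient involves (sigma_i + sigma_j)^6, which expands as
     *     sum_{k=0..6} C(6,k) * sigma_i^k * sigma_j^(6-k),
     * i.e. seven separate grids (grid_index 2..8). calc_next_lb_coeffs() adds
     * one power of sigma per pass and lb_scale_factor[] supplies the binomial
     * weights when the forces are gathered.
     */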
    if (pme->doLJ && pme->ljpme_combination_rule == eljpmeLB)
    {
        /* Loop over A- and B-state if we are doing FEP */
        for (fep_state = 0; fep_state < fep_states_lj; ++fep_state)
        {
            real *local_c6 = nullptr, *local_sigma = nullptr, *RedistC6 = nullptr, *RedistSigma = nullptr;
            if (pme->nnodes == 1)
            {
                if (pme->lb_buf1 == nullptr)
                {
                    pme->lb_buf_nalloc = pme->atc[0].n;
                    snew(pme->lb_buf1, pme->lb_buf_nalloc);
                }
                pme->atc[0].coefficient = pme->lb_buf1;
                switch (fep_state)
                {
                    case 0:
                        local_c6    = c6A;
                        local_sigma = sigmaA;
                        break;
                    case 1:
                        local_c6    = c6B;
                        local_sigma = sigmaB;
                        break;
                    default:
                        gmx_incons("Trying to access wrong FEP-state in LJ-PME routine");
                }
            }
            else
            {
                atc = &pme->atc[0];
                switch (fep_state)
                {
                    case 0:
                        RedistC6    = c6A;
                        RedistSigma = sigmaA;
                        break;
                    case 1:
                        RedistC6    = c6B;
                        RedistSigma = sigmaB;
                        break;
                    default:
                        gmx_incons("Trying to access wrong FEP-state in LJ-PME routine");
                }
                wallcycle_start(wcycle, ewcPME_REDISTXF);

                do_redist_pos_coeffs(pme, cr, start, homenr, bFirst, x, RedistC6);
                if (pme->lb_buf_nalloc < atc->n)
                {
                    pme->lb_buf_nalloc = atc->nalloc;
                    srenew(pme->lb_buf1, pme->lb_buf_nalloc);
                    srenew(pme->lb_buf2, pme->lb_buf_nalloc);
                }
                local_c6 = pme->lb_buf1;
                for (i = 0; i < atc->n; ++i)
                {
                    local_c6[i] = atc->coefficient[i];
                }
                where();

                do_redist_pos_coeffs(pme, cr, start, homenr, FALSE, x, RedistSigma);
                local_sigma = pme->lb_buf2;
                for (i = 0; i < atc->n; ++i)
                {
                    local_sigma[i] = atc->coefficient[i];
                }
                where();

                wallcycle_stop(wcycle, ewcPME_REDISTXF);
            }
            calc_initial_lb_coeffs(pme, local_c6, local_sigma);

            /* Seven terms in LJ-PME with LB, grid_index < 2 reserved for electrostatics */
            for (grid_index = 2; grid_index < 9; ++grid_index)
            {
                /* Unpack structure */
                pmegrid    = &pme->pmegrid[grid_index];
                fftgrid    = pme->fftgrid[grid_index];
                pfft_setup = pme->pfft_setup[grid_index];
                calc_next_lb_coeffs(pme, local_sigma);
                grid = pmegrid->grid.grid;
                where();

                if (flags & GMX_PME_SPREAD)
                {
                    wallcycle_start(wcycle, ewcPME_SPREADGATHER);
                    /* Spread the c6 on a grid */
                    spread_on_grid(pme, &pme->atc[0], pmegrid, bFirst, TRUE, fftgrid, bDoSplines, grid_index);

                    if (bFirst)
                    {
                        inc_nrnb(nrnb, eNR_WEIGHTS, DIM*atc->n);
                    }

                    inc_nrnb(nrnb, eNR_SPREADBSP,
                             pme->pme_order*pme->pme_order*pme->pme_order*atc->n);
                    if (pme->nthread == 1)
                    {
                        wrap_periodic_pmegrid(pme, grid);
                        /* sum contributions to local grid from other nodes */
#if GMX_MPI
                        if (pme->nnodes > 1)
                        {
                            gmx_sum_qgrid_dd(pme, grid, GMX_SUM_GRID_FORWARD);
                            where();
                        }
#endif
                        copy_pmegrid_to_fftgrid(pme, grid, fftgrid, grid_index);
                    }
                    wallcycle_stop(wcycle, ewcPME_SPREADGATHER);
                }
                /* Here we start a large thread parallel region */
#pragma omp parallel num_threads(pme->nthread) private(thread)
                {
                    try
                    {
                        thread = gmx_omp_get_thread_num();
                        if (flags & GMX_PME_SOLVE)
                        {
                            /* do 3d-fft */
                            if (thread == 0)
                            {
                                wallcycle_start(wcycle, ewcPME_FFT);
                            }

                            gmx_parallel_3dfft_execute(pfft_setup, GMX_FFT_REAL_TO_COMPLEX,
                                                       thread, wcycle);
                            if (thread == 0)
                            {
                                wallcycle_stop(wcycle, ewcPME_FFT);
                            }
                            where();
                        }
                    }
                    GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
                }
                bFirst = FALSE;
            }
            if (flags & GMX_PME_SOLVE)
            {
                /* solve in k-space for our local cells */
#pragma omp parallel num_threads(pme->nthread) private(thread)
                {
                    try
                    {
                        int loop_count;
                        thread = gmx_omp_get_thread_num();
                        if (thread == 0)
                        {
                            wallcycle_start(wcycle, ewcLJPME);
                        }

                        loop_count =
                            solve_pme_lj_yzx(pme, &pme->cfftgrid[2], TRUE,
                                             box[XX][XX]*box[YY][YY]*box[ZZ][ZZ],
                                             bCalcEnerVir,
                                             pme->nthread, thread);
                        if (thread == 0)
                        {
                            wallcycle_stop(wcycle, ewcLJPME);
                            where();
                            inc_nrnb(nrnb, eNR_SOLVEPME, loop_count);
                        }
                    }
                    GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
                }
            }

            if (bCalcEnerVir)
            {
                /* This should only be called on the master thread and
                 * after the threads have synchronized.
                 */
                get_pme_ener_vir_lj(pme->solve_work, pme->nthread, &energy_AB[2+fep_state], vir_AB[2+fep_state]);
            }

            if (bBackFFT)
            {
                bFirst = !pme->doCoulomb;
                calc_initial_lb_coeffs(pme, local_c6, local_sigma);
                for (grid_index = 8; grid_index >= 2; --grid_index)
                {
                    /* Unpack structure */
                    pmegrid    = &pme->pmegrid[grid_index];
                    fftgrid    = pme->fftgrid[grid_index];
                    pfft_setup = pme->pfft_setup[grid_index];
                    grid       = pmegrid->grid.grid;
                    calc_next_lb_coeffs(pme, local_sigma);
                    where();
#pragma omp parallel num_threads(pme->nthread) private(thread)
                    {
                        try
                        {
                            thread = gmx_omp_get_thread_num();
                            /* do 3d-invfft */
                            if (thread == 0)
                            {
                                where();
                                wallcycle_start(wcycle, ewcPME_FFT);
                            }

                            gmx_parallel_3dfft_execute(pfft_setup, GMX_FFT_COMPLEX_TO_REAL,
                                                       thread, wcycle);
                            if (thread == 0)
                            {
                                wallcycle_stop(wcycle, ewcPME_FFT);

                                where();

                                if (pme->nodeid == 0)
                                {
                                    real ntot = pme->nkx*pme->nky*pme->nkz;
                                    npme      = static_cast<int>(ntot*std::log(ntot)/std::log(2.0));
                                    inc_nrnb(nrnb, eNR_FFT, 2*npme);
                                }
                                wallcycle_start(wcycle, ewcPME_SPREADGATHER);
                            }

                            copy_fftgrid_to_pmegrid(pme, fftgrid, grid, grid_index, pme->nthread, thread);
                        }
                        GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
                    } /*#pragma omp parallel*/

                    /* distribute local grid to all nodes */
#if GMX_MPI
                    if (pme->nnodes > 1)
                    {
                        gmx_sum_qgrid_dd(pme, grid, GMX_SUM_GRID_BACKWARD);
                    }
#endif
                    where();

                    unwrap_periodic_pmegrid(pme, grid);

                    if (bCalcF)
                    {
                        /* interpolate forces for our local atoms */
                        where();
                        bClearF = (bFirst && PAR(cr));
                        scale   = pme->bFEP ? (fep_state < 1 ? 1.0-lambda_lj : lambda_lj) : 1.0;
                        scale  *= lb_scale_factor[grid_index-2];

#pragma omp parallel for num_threads(pme->nthread) schedule(static)
                        for (thread = 0; thread < pme->nthread; thread++)
                        {
                            try
                            {
                                gather_f_bsplines(pme, grid, bClearF, &pme->atc[0],
                                                  &pme->atc[0].spline[thread],
                                                  scale);
                            }
                            GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
                        }

                        where();

                        inc_nrnb(nrnb, eNR_GATHERFBSP,
                                 pme->pme_order*pme->pme_order*pme->pme_order*pme->atc[0].n);
                    }
                    wallcycle_stop(wcycle, ewcPME_SPREADGATHER);

                    bFirst = FALSE;
                } /* for (grid_index = 8; grid_index >= 2; --grid_index) */
            }     /* if (bBackFFT) */
        }         /* for (fep_state = 0; fep_state < fep_states_lj; ++fep_state) */
    }             /* if (pme->doLJ && pme->ljpme_combination_rule == eljpmeLB) */
    if (bCalcF && pme->nnodes > 1)
    {
        wallcycle_start(wcycle, ewcPME_REDISTXF);
        for (d = 0; d < pme->ndecompdim; d++)
        {
            atc = &pme->atc[d];
            if (d == pme->ndecompdim - 1)
            {
                n_d = homenr;
                f_d = f + start;
            }
            else
            {
                n_d = pme->atc[d+1].n;
                f_d = pme->atc[d+1].f;
            }
            if (DOMAINDECOMP(cr))
            {
                dd_pmeredist_f(pme, atc, n_d, f_d,
                               d == pme->ndecompdim-1 && pme->bPPnode);
            }
        }

        wallcycle_stop(wcycle, ewcPME_REDISTXF);
    }
    where();

    if (bCalcEnerVir)
    {
        if (pme->doCoulomb)
        {
            if (!pme->bFEP_q)
            {
                *energy_q = energy_AB[0];
                m_add(vir_q, vir_AB[0], vir_q);
            }
            else
            {
                *energy_q     = (1.0-lambda_q)*energy_AB[0] + lambda_q*energy_AB[1];
                *dvdlambda_q += energy_AB[1] - energy_AB[0];
                for (i = 0; i < DIM; i++)
                {
                    for (j = 0; j < DIM; j++)
                    {
                        vir_q[i][j] += (1.0-lambda_q)*vir_AB[0][i][j] +
                            lambda_q*vir_AB[1][i][j];
                    }
                }
            }
            if (debug)
            {
                fprintf(debug, "Electrostatic PME mesh energy: %g\n", *energy_q);
            }
        }
        else
        {
            *energy_q = 0;
        }

        if (pme->doLJ)
        {
            if (!pme->bFEP_lj)
            {
                *energy_lj = energy_AB[2];
                m_add(vir_lj, vir_AB[2], vir_lj);
            }
            else
            {
                *energy_lj     = (1.0-lambda_lj)*energy_AB[2] + lambda_lj*energy_AB[3];
                *dvdlambda_lj += energy_AB[3] - energy_AB[2];
                for (i = 0; i < DIM; i++)
                {
                    for (j = 0; j < DIM; j++)
                    {
                        vir_lj[i][j] += (1.0-lambda_lj)*vir_AB[2][i][j] + lambda_lj*vir_AB[3][i][j];
                    }
                }
            }
            if (debug)
            {
                fprintf(debug, "Lennard-Jones PME mesh energy: %g\n", *energy_lj);
            }
        }
        else
        {
            *energy_lj = 0;
        }
    }
    return 0;
}
int gmx_pme_destroy(struct gmx_pme_t **pmedata)
{
    struct gmx_pme_t *pme = *pmedata;

    sfree(pme->nnx);
    sfree(pme->nny);
    sfree(pme->nnz);
    sfree(pme->fshx);
    sfree(pme->fshy);
    sfree(pme->fshz);

    for (int i = 0; i < pme->ngrids; ++i)
    {
        pmegrids_destroy(&pme->pmegrid[i]);
        gmx_parallel_3dfft_destroy(pme->pfft_setup[i]);
    }

    for (int i = 0; i < pme->ndecompdim; i++)
    {
        destroy_atomcomm(&pme->atc[i]);
    }

    destroy_overlap_comm(&pme->overlap[0]);
    destroy_overlap_comm(&pme->overlap[1]);

    sfree(pme->lb_buf1);
    sfree(pme->lb_buf2);

    sfree(pme->bufv);
    sfree(pme->bufr);

    pme_free_all_work(&pme->solve_work, pme->nthread);

    sfree(pme->sum_qgrid_tmp);
    sfree(pme->sum_qgrid_dd_tmp);

    sfree(*pmedata);
    *pmedata = nullptr;

    return 0;
}