Fix trivial PME memory leaks
[gromacs.git] / src / gromacs / ewald / pme.cpp
1 /*
2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5 * Copyright (c) 2001-2004, The GROMACS development team.
6 * Copyright (c) 2013,2014,2015,2016, by the GROMACS development team, led by
7 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
8 * and including many others, as listed in the AUTHORS file in the
9 * top-level source directory and at http://www.gromacs.org.
11 * GROMACS is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public License
13 * as published by the Free Software Foundation; either version 2.1
14 * of the License, or (at your option) any later version.
16 * GROMACS is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * Lesser General Public License for more details.
21 * You should have received a copy of the GNU Lesser General Public
22 * License along with GROMACS; if not, see
23 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
24 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
26 * If you want to redistribute modifications to GROMACS, please
27 * consider that scientific software is very special. Version
28 * control is crucial - bugs must be traceable. We will be happy to
29 * consider code for inclusion in the official distribution, but
30 * derived work must not be called official GROMACS. Details are found
31 * in the README & COPYING files - if they are missing, get the
32 * official version at http://www.gromacs.org.
34 * To help us fund GROMACS development, we humbly ask that you cite
35 * the research papers on the package. Check out http://www.gromacs.org.
37 /*! \internal \file
39 * \brief This file contains function definitions necessary for
40 * computing energies and forces for the PME long-ranged part (Coulomb
41 * and LJ).
43 * \author Erik Lindahl <erik@kth.se>
44 * \author Berk Hess <hess@kth.se>
45 * \ingroup module_ewald
47 /* IMPORTANT FOR DEVELOPERS:
49 * Triclinic pme stuff isn't entirely trivial, and we've experienced
50 * some bugs during development (many of them due to me). To avoid
51 * this in the future, please check the following things if you make
52 * changes in this file:
54 * 1. You should obtain identical (at least to the PME precision)
55 * energies, forces, and virial for
56 * a rectangular box and a triclinic one where the z (or y) axis is
57 * tilted a whole box side. For instance you could use these boxes:
59 * rectangular triclinic
60 * 2 0 0 2 0 0
61 * 0 2 0 0 2 0
62 * 0 0 6 2 2 6
64 * 2. You should check the energy conservation in a triclinic box.
66 * It might seem like overkill, but better safe than sorry.
67 * /Erik 001109
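/* Note: the two example boxes above describe the same periodic lattice,
 * because the tilted c vector (2,2,6) differs from the rectangular (0,0,6)
 * by exactly a+b. That is why energies, forces and the virial are expected
 * to agree to PME precision between the two runs.
 */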
70 #include "gmxpre.h"
72 #include "pme.h"
74 #include "config.h"
76 #include <assert.h>
77 #include <math.h>
78 #include <stdio.h>
79 #include <stdlib.h>
80 #include <string.h>
82 #include <algorithm>
84 #include "gromacs/fft/parallel_3dfft.h"
85 #include "gromacs/fileio/pdbio.h"
86 #include "gromacs/gmxlib/network.h"
87 #include "gromacs/gmxlib/nrnb.h"
88 #include "gromacs/math/gmxcomplex.h"
89 #include "gromacs/math/invertmatrix.h"
90 #include "gromacs/math/units.h"
91 #include "gromacs/math/vec.h"
92 #include "gromacs/math/vectypes.h"
93 #include "gromacs/mdtypes/commrec.h"
94 #include "gromacs/mdtypes/forcerec.h"
95 #include "gromacs/mdtypes/inputrec.h"
96 #include "gromacs/mdtypes/md_enums.h"
97 #include "gromacs/pbcutil/pbc.h"
98 #include "gromacs/timing/cyclecounter.h"
99 #include "gromacs/timing/wallcycle.h"
100 #include "gromacs/timing/walltime_accounting.h"
101 #include "gromacs/utility/basedefinitions.h"
102 #include "gromacs/utility/exceptions.h"
103 #include "gromacs/utility/fatalerror.h"
104 #include "gromacs/utility/gmxmpi.h"
105 #include "gromacs/utility/gmxomp.h"
106 #include "gromacs/utility/real.h"
107 #include "gromacs/utility/smalloc.h"
109 #include "calculate-spline-moduli.h"
110 #include "pme-gather.h"
111 #include "pme-grid.h"
112 #include "pme-internal.h"
113 #include "pme-redistribute.h"
114 #include "pme-solve.h"
115 #include "pme-spline-work.h"
116 #include "pme-spread.h"
118 /*! \brief Number of bytes in a cache line.
120 * Must also be a multiple of the SIMD and SIMD4 register size, to
121 * preserve alignment.
123 const int gmxCacheLineSize = 64;
125 //! Set up coordinate communication
126 static void setup_coordinate_communication(pme_atomcomm_t *atc)
128 int nslab, n, i;
129 int fw, bw;
131 nslab = atc->nslab;
133 n = 0;
134 for (i = 1; i <= nslab/2; i++)
136 fw = (atc->nodeid + i) % nslab;
137 bw = (atc->nodeid - i + nslab) % nslab;
138 if (n < nslab - 1)
140 atc->node_dest[n] = fw;
141 atc->node_src[n] = bw;
142 n++;
144 if (n < nslab - 1)
146 atc->node_dest[n] = bw;
147 atc->node_src[n] = fw;
148 n++;
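/* Worked example (illustrative numbers only): for nslab = 4 and nodeid = 0
 * the loop above produces node_dest = {1, 3, 2} and node_src = {3, 1, 2},
 * i.e. forward/backward neighbours at increasing distance, with the
 * half-way slab appearing only once.
 */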
153 /*! \brief Round \p n up to the next multiple of \p f */
154 static int mult_up(int n, int f)
156 return ((n + f - 1)/f)*f;
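/* Example: mult_up(10, 4) returns 12, mult_up(12, 4) returns 12. */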
159 /*! \brief Return estimate of the load imbalance from the PME grid not being a good match for the number of PME ranks */
160 static double estimate_pme_load_imbalance(struct gmx_pme_t *pme)
162 int nma, nmi;
163 double n1, n2, n3;
165 nma = pme->nnodes_major;
166 nmi = pme->nnodes_minor;
168 n1 = mult_up(pme->nkx, nma)*mult_up(pme->nky, nmi)*pme->nkz;
169 n2 = mult_up(pme->nkx, nma)*mult_up(pme->nkz, nmi)*pme->nky;
170 n3 = mult_up(pme->nky, nma)*mult_up(pme->nkz, nmi)*pme->nkx;
172 /* pme_solve is roughly double the cost of an fft */
174 return (n1 + n2 + 3*n3)/(double)(6*pme->nkx*pme->nky*pme->nkz);
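/* Worked example (illustrative numbers only): for a 3x3x3 grid on
 * nnodes_major = 2, nnodes_minor = 1 ranks, n1 = n2 = n3 = 4*3*3 = 36,
 * so the estimate is (36 + 36 + 3*36)/(6*27) = 180/162, i.e. roughly an
 * 11% overhead from padding nkx = 3 up to 4.
 */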
177 /*! \brief Initialize atom communication data structure */
178 static void init_atomcomm(struct gmx_pme_t *pme, pme_atomcomm_t *atc,
179 int dimind, gmx_bool bSpread)
181 int thread;
183 atc->dimind = dimind;
184 atc->nslab = 1;
185 atc->nodeid = 0;
186 atc->pd_nalloc = 0;
187 #if GMX_MPI
188 if (pme->nnodes > 1)
190 atc->mpi_comm = pme->mpi_comm_d[dimind];
191 MPI_Comm_size(atc->mpi_comm, &atc->nslab);
192 MPI_Comm_rank(atc->mpi_comm, &atc->nodeid);
194 if (debug)
196 fprintf(debug, "For PME atom communication in dimind %d: nslab %d rank %d\n", atc->dimind, atc->nslab, atc->nodeid);
198 #endif
200 atc->bSpread = bSpread;
201 atc->pme_order = pme->pme_order;
203 if (atc->nslab > 1)
205 snew(atc->node_dest, atc->nslab);
206 snew(atc->node_src, atc->nslab);
207 setup_coordinate_communication(atc);
209 snew(atc->count_thread, pme->nthread);
210 for (thread = 0; thread < pme->nthread; thread++)
212 snew(atc->count_thread[thread], atc->nslab);
214 atc->count = atc->count_thread[0];
215 snew(atc->rcount, atc->nslab);
216 snew(atc->buf_index, atc->nslab);
219 atc->nthread = pme->nthread;
220 if (atc->nthread > 1)
222 snew(atc->thread_plist, atc->nthread);
224 snew(atc->spline, atc->nthread);
225 for (thread = 0; thread < atc->nthread; thread++)
227 if (atc->nthread > 1)
229 snew(atc->thread_plist[thread].n, atc->nthread+2*gmxCacheLineSize);
230 atc->thread_plist[thread].n += gmxCacheLineSize;
232 snew(atc->spline[thread].thread_one, pme->nthread);
233 atc->spline[thread].thread_one[thread] = 1;
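/* Note on the thread_plist[thread].n allocation above: the array is
 * over-allocated by 2*gmxCacheLineSize ints and the stored pointer is
 * advanced by gmxCacheLineSize, presumably so that each thread's counters
 * sit on their own cache lines. destroy_atomcomm() therefore subtracts the
 * same offset again before calling sfree(), so the pointer passed to
 * sfree() matches the one returned by snew().
 */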
237 /*! \brief Destroy an atom communication data structure and its child structs */
238 static void destroy_atomcomm(pme_atomcomm_t *atc)
240 sfree(atc->pd);
241 if (atc->nslab > 1)
243 sfree(atc->node_dest);
244 sfree(atc->node_src);
245 for (int i = 0; i < atc->nthread; i++)
247 sfree(atc->count_thread[i]);
249 sfree(atc->count_thread);
250 sfree(atc->rcount);
251 sfree(atc->buf_index);
253 sfree(atc->x);
254 sfree(atc->coefficient);
255 sfree(atc->f);
257 sfree(atc->idx);
258 sfree(atc->fractx);
260 sfree(atc->thread_idx);
261 for (int i = 0; i < atc->nthread; i++)
263 if (atc->nthread > 1)
265 int *n_ptr = atc->thread_plist[i].n - gmxCacheLineSize;
266 sfree(n_ptr);
267 sfree(atc->thread_plist[i].i);
269 sfree(atc->spline[i].thread_one);
270 sfree(atc->spline[i].ind);
272 if (atc->nthread > 1)
274 sfree(atc->thread_plist);
276 sfree(atc->spline);
279 /*! \brief Initialize data structure for communication */
280 static void
281 init_overlap_comm(pme_overlap_t * ol,
282 int norder,
283 #if GMX_MPI
284 MPI_Comm comm,
285 #endif
286 int nnodes,
287 int nodeid,
288 int ndata,
289 int commplainsize)
291 int b, i;
292 pme_grid_comm_t *pgc;
293 gmx_bool bCont;
294 int fft_start, fft_end, send_index1, recv_index1;
295 #if GMX_MPI
296 MPI_Status stat;
298 ol->mpi_comm = comm;
299 #endif
301 ol->nnodes = nnodes;
302 ol->nodeid = nodeid;
304 /* Linear translation of the PME grid won't affect reciprocal space
305 * calculations, so to optimize we only interpolate "upwards",
306 * which also means we only have to consider overlap in one direction.
307 * I.e., particles on this node might also be spread to grid indices
308 * that belong to higher nodes (modulo nnodes)
311 snew(ol->s2g0, ol->nnodes+1);
312 snew(ol->s2g1, ol->nnodes);
313 if (debug)
315 fprintf(debug, "PME slab boundaries:");
317 for (i = 0; i < nnodes; i++)
319 /* s2g0 the local interpolation grid start.
320 * s2g1 the local interpolation grid end.
321 * Since in calc_pidx we divide the particles, not the grid lines,
322 * spatially uniformly along dimension x or y, we need to round
323 * s2g0 down and s2g1 up.
325 ol->s2g0[i] = ( i *ndata + 0 )/nnodes;
326 ol->s2g1[i] = ((i+1)*ndata + nnodes-1)/nnodes + norder - 1;
328 if (debug)
330 fprintf(debug, " %3d %3d", ol->s2g0[i], ol->s2g1[i]);
333 ol->s2g0[nnodes] = ndata;
334 if (debug)
336 fprintf(debug, "\n");
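/* Worked example (illustrative numbers only): for ndata = 15, nnodes = 4
 * and norder = 4 the loop above gives s2g0 = {0, 3, 7, 11, 15} and
 * s2g1 = {7, 11, 15, 18}: each slab's interpolation region extends
 * norder-1 = 3 grid lines past its rounded-up end, overlapping the next
 * slab(s).
 */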
339 /* Determine with how many nodes we need to communicate the grid overlap */
340 b = 0;
343 b++;
344 bCont = FALSE;
345 for (i = 0; i < nnodes; i++)
347 if ((i+b < nnodes && ol->s2g1[i] > ol->s2g0[i+b]) ||
348 (i+b >= nnodes && ol->s2g1[i] > ol->s2g0[i+b-nnodes] + ndata))
350 bCont = TRUE;
354 while (bCont && b < nnodes);
355 ol->noverlap_nodes = b - 1;
357 snew(ol->send_id, ol->noverlap_nodes);
358 snew(ol->recv_id, ol->noverlap_nodes);
359 for (b = 0; b < ol->noverlap_nodes; b++)
361 ol->send_id[b] = (ol->nodeid + (b + 1)) % ol->nnodes;
362 ol->recv_id[b] = (ol->nodeid - (b + 1) + ol->nnodes) % ol->nnodes;
364 snew(ol->comm_data, ol->noverlap_nodes);
366 ol->send_size = 0;
367 for (b = 0; b < ol->noverlap_nodes; b++)
369 pgc = &ol->comm_data[b];
370 /* Send */
371 fft_start = ol->s2g0[ol->send_id[b]];
372 fft_end = ol->s2g0[ol->send_id[b]+1];
373 if (ol->send_id[b] < nodeid)
375 fft_start += ndata;
376 fft_end += ndata;
378 send_index1 = ol->s2g1[nodeid];
379 send_index1 = std::min(send_index1, fft_end);
380 pgc->send_index0 = fft_start;
381 pgc->send_nindex = std::max(0, send_index1 - pgc->send_index0);
382 ol->send_size += pgc->send_nindex;
384 /* We always start receiving to the first index of our slab */
385 fft_start = ol->s2g0[ol->nodeid];
386 fft_end = ol->s2g0[ol->nodeid+1];
387 recv_index1 = ol->s2g1[ol->recv_id[b]];
388 if (ol->recv_id[b] > nodeid)
390 recv_index1 -= ndata;
392 recv_index1 = std::min(recv_index1, fft_end);
393 pgc->recv_index0 = fft_start;
394 pgc->recv_nindex = std::max(0, recv_index1 - pgc->recv_index0);
397 #if GMX_MPI
398 /* Communicate the buffer sizes to receive */
399 for (b = 0; b < ol->noverlap_nodes; b++)
401 MPI_Sendrecv(&ol->send_size, 1, MPI_INT, ol->send_id[b], b,
402 &ol->comm_data[b].recv_size, 1, MPI_INT, ol->recv_id[b], b,
403 ol->mpi_comm, &stat);
405 #endif
407 /* For a non-divisible grid we need pme_order instead of pme_order-1 */
408 snew(ol->sendbuf, norder*commplainsize);
409 snew(ol->recvbuf, norder*commplainsize);
412 /*! \brief Destroy data structure for communication */
413 static void
414 destroy_overlap_comm(const pme_overlap_t *ol)
416 sfree(ol->s2g0);
417 sfree(ol->s2g1);
418 sfree(ol->send_id);
419 sfree(ol->recv_id);
420 sfree(ol->comm_data);
421 sfree(ol->sendbuf);
422 sfree(ol->recvbuf);
425 void gmx_pme_check_restrictions(int pme_order,
426 int nkx, int nky, int nkz,
427 int nnodes_major,
428 int nnodes_minor,
429 gmx_bool bUseThreads,
430 gmx_bool bFatal,
431 gmx_bool *bValidSettings)
433 if (pme_order > PME_ORDER_MAX)
435 if (!bFatal)
437 *bValidSettings = FALSE;
438 return;
440 gmx_fatal(FARGS, "pme_order (%d) is larger than the maximum allowed value (%d). Modify and recompile the code if you really need such a high order.",
441 pme_order, PME_ORDER_MAX);
444 if (nkx <= pme_order*(nnodes_major > 1 ? 2 : 1) ||
445 nky <= pme_order*(nnodes_minor > 1 ? 2 : 1) ||
446 nkz <= pme_order)
448 if (!bFatal)
450 *bValidSettings = FALSE;
451 return;
453 gmx_fatal(FARGS, "The PME grid sizes need to be larger than pme_order (%d) and, for dimensions with domain decomposition, larger than 2*pme_order",
454 pme_order);
457 /* Check for a limitation of the (current) sum_fftgrid_dd code.
458 * We only allow multiple communication pulses in dim 1, not in dim 0.
460 if (bUseThreads && (nkx < nnodes_major*pme_order &&
461 nkx != nnodes_major*(pme_order - 1)))
463 if (!bFatal)
465 *bValidSettings = FALSE;
466 return;
468 gmx_fatal(FARGS, "The number of PME grid lines per rank along x is %g. But when using OpenMP threads, the number of grid lines per rank along x should be >= pme_order (%d) or = pme_order-1. To resolve this issue, use fewer ranks along x (and possibly more along y and/or z) by specifying -dd manually.",
469 nkx/(double)nnodes_major, pme_order);
472 if (bValidSettings != NULL)
474 *bValidSettings = TRUE;
477 return;
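/* Usage sketch (illustrative values, not taken from a caller): to probe a
 * tentative setup without aborting, pass bFatal = FALSE and a result pointer:
 *
 *     gmx_bool settingsOk;
 *     gmx_pme_check_restrictions(4, 52, 52, 48, 4, 2, TRUE, FALSE, &settingsOk);
 *
 * With bFatal = TRUE, as gmx_pme_init() uses below, bValidSettings may be
 * NULL and any violation triggers gmx_fatal() instead.
 */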
480 /*! \brief Return \p enumerator divided by \p denominator, rounded up */
481 static int div_round_up(int enumerator, int denominator)
483 return (enumerator + denominator - 1)/denominator;
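/* Example: div_round_up(10, 4) returns 3, div_round_up(8, 4) returns 2. */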
486 int gmx_pme_init(struct gmx_pme_t **pmedata,
487 t_commrec * cr,
488 int nnodes_major,
489 int nnodes_minor,
490 t_inputrec * ir,
491 int homenr,
492 gmx_bool bFreeEnergy_q,
493 gmx_bool bFreeEnergy_lj,
494 gmx_bool bReproducible,
495 real ewaldcoeff_q,
496 real ewaldcoeff_lj,
497 int nthread)
499 struct gmx_pme_t *pme = NULL;
501 int use_threads, sum_use_threads, i;
502 ivec ndata;
504 if (debug)
506 fprintf(debug, "Creating PME data structures.\n");
508 snew(pme, 1);
510 pme->sum_qgrid_tmp = NULL;
511 pme->sum_qgrid_dd_tmp = NULL;
512 pme->buf_nalloc = 0;
514 pme->nnodes = 1;
515 pme->bPPnode = TRUE;
517 pme->nnodes_major = nnodes_major;
518 pme->nnodes_minor = nnodes_minor;
520 #if GMX_MPI
521 if (nnodes_major*nnodes_minor > 1)
523 pme->mpi_comm = cr->mpi_comm_mygroup;
525 MPI_Comm_rank(pme->mpi_comm, &pme->nodeid);
526 MPI_Comm_size(pme->mpi_comm, &pme->nnodes);
527 if (pme->nnodes != nnodes_major*nnodes_minor)
529 gmx_incons("PME rank count mismatch");
532 else
534 pme->mpi_comm = MPI_COMM_NULL;
536 #endif
538 if (pme->nnodes == 1)
540 #if GMX_MPI
541 pme->mpi_comm_d[0] = MPI_COMM_NULL;
542 pme->mpi_comm_d[1] = MPI_COMM_NULL;
543 #endif
544 pme->ndecompdim = 0;
545 pme->nodeid_major = 0;
546 pme->nodeid_minor = 0;
547 #if GMX_MPI
548 pme->mpi_comm_d[0] = pme->mpi_comm_d[1] = MPI_COMM_NULL;
549 #endif
551 else
553 if (nnodes_minor == 1)
555 #if GMX_MPI
556 pme->mpi_comm_d[0] = pme->mpi_comm;
557 pme->mpi_comm_d[1] = MPI_COMM_NULL;
558 #endif
559 pme->ndecompdim = 1;
560 pme->nodeid_major = pme->nodeid;
561 pme->nodeid_minor = 0;
564 else if (nnodes_major == 1)
566 #if GMX_MPI
567 pme->mpi_comm_d[0] = MPI_COMM_NULL;
568 pme->mpi_comm_d[1] = pme->mpi_comm;
569 #endif
570 pme->ndecompdim = 1;
571 pme->nodeid_major = 0;
572 pme->nodeid_minor = pme->nodeid;
574 else
576 if (pme->nnodes % nnodes_major != 0)
578 gmx_incons("For 2D PME decomposition, #PME ranks must be divisible by the number of ranks in the major dimension");
580 pme->ndecompdim = 2;
582 #if GMX_MPI
583 MPI_Comm_split(pme->mpi_comm, pme->nodeid % nnodes_minor,
584 pme->nodeid, &pme->mpi_comm_d[0]); /* My communicator along major dimension */
585 MPI_Comm_split(pme->mpi_comm, pme->nodeid/nnodes_minor,
586 pme->nodeid, &pme->mpi_comm_d[1]); /* My communicator along minor dimension */
588 MPI_Comm_rank(pme->mpi_comm_d[0], &pme->nodeid_major);
589 MPI_Comm_size(pme->mpi_comm_d[0], &pme->nnodes_major);
590 MPI_Comm_rank(pme->mpi_comm_d[1], &pme->nodeid_minor);
591 MPI_Comm_size(pme->mpi_comm_d[1], &pme->nnodes_minor);
592 #endif
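/* Worked example (illustrative numbers only): with 6 PME ranks and
 * nnodes_major = 3, nnodes_minor = 2, the first split (color nodeid %
 * nnodes_minor) groups ranks {0,2,4} and {1,3,5} into the two communicators
 * along the major dimension, and the second split (color nodeid /
 * nnodes_minor) groups {0,1}, {2,3} and {4,5} along the minor dimension.
 */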
594 pme->bPPnode = (cr->duty & DUTY_PP);
597 pme->nthread = nthread;
599 /* Check if any of the PME MPI ranks uses threads */
600 use_threads = (pme->nthread > 1 ? 1 : 0);
601 #if GMX_MPI
602 if (pme->nnodes > 1)
604 MPI_Allreduce(&use_threads, &sum_use_threads, 1, MPI_INT,
605 MPI_SUM, pme->mpi_comm);
607 else
608 #endif
610 sum_use_threads = use_threads;
612 pme->bUseThreads = (sum_use_threads > 0);
614 if (ir->ePBC == epbcSCREW)
616 gmx_fatal(FARGS, "pme does not (yet) work with pbc = screw");
619 /* NOTE:
620 * It is likely that the current gmx_pme_do() routine supports calculating
621 * only Coulomb or LJ while gmx_pme_init() configures for both,
622 * but that has never been tested.
623 * It is likely that the current gmx_pme_do() routine supports calculating
624 * without free-energy for Coulomb and/or LJ while gmx_pme_init()
625 * configures with free-energy, but that has never been tested.
627 pme->doCoulomb = EEL_PME(ir->coulombtype);
628 pme->doLJ = EVDW_PME(ir->vdwtype);
629 pme->bFEP_q = ((ir->efep != efepNO) && bFreeEnergy_q);
630 pme->bFEP_lj = ((ir->efep != efepNO) && bFreeEnergy_lj);
631 pme->bFEP = (pme->bFEP_q || pme->bFEP_lj);
632 pme->nkx = ir->nkx;
633 pme->nky = ir->nky;
634 pme->nkz = ir->nkz;
635 pme->bP3M = (ir->coulombtype == eelP3M_AD || getenv("GMX_PME_P3M") != NULL);
636 pme->pme_order = ir->pme_order;
637 pme->ewaldcoeff_q = ewaldcoeff_q;
638 pme->ewaldcoeff_lj = ewaldcoeff_lj;
640 /* Always constant electrostatics coefficients */
641 pme->epsilon_r = ir->epsilon_r;
643 /* Always constant LJ coefficients */
644 pme->ljpme_combination_rule = ir->ljpme_combination_rule;
646 /* If we violate restrictions, generate a fatal error here */
647 gmx_pme_check_restrictions(pme->pme_order,
648 pme->nkx, pme->nky, pme->nkz,
649 pme->nnodes_major,
650 pme->nnodes_minor,
651 pme->bUseThreads,
652 TRUE,
653 NULL);
655 if (pme->nnodes > 1)
657 double imbal;
659 #if GMX_MPI
660 MPI_Type_contiguous(DIM, GMX_MPI_REAL, &(pme->rvec_mpi));
661 MPI_Type_commit(&(pme->rvec_mpi));
662 #endif
664 /* Note that the coefficient spreading and force gathering, which usually
665 * take about the same amount of time as FFT+solve_pme,
666 * is always fully load balanced
667 * (unless the coefficient distribution is inhomogeneous).
670 imbal = estimate_pme_load_imbalance(pme);
671 if (imbal >= 1.2 && pme->nodeid_major == 0 && pme->nodeid_minor == 0)
673 fprintf(stderr,
674 "\n"
675 "NOTE: The load imbalance in PME FFT and solve is %d%%.\n"
676 " For optimal PME load balancing\n"
677 " PME grid_x (%d) and grid_y (%d) should be divisible by #PME_ranks_x (%d)\n"
678 " and PME grid_y (%d) and grid_z (%d) should be divisible by #PME_ranks_y (%d)\n"
679 "\n",
680 (int)((imbal-1)*100 + 0.5),
681 pme->nkx, pme->nky, pme->nnodes_major,
682 pme->nky, pme->nkz, pme->nnodes_minor);
686 /* For a non-divisible grid we need pme_order instead of pme_order-1 */
687 /* In sum_qgrid_dd x overlap is copied in place: take padding into account.
688 * y is always copied through a buffer: we don't need padding in z,
689 * but we do need the overlap in x because of the communication order.
691 init_overlap_comm(&pme->overlap[0], pme->pme_order,
692 #if GMX_MPI
693 pme->mpi_comm_d[0],
694 #endif
695 pme->nnodes_major, pme->nodeid_major,
696 pme->nkx,
697 (div_round_up(pme->nky, pme->nnodes_minor)+pme->pme_order)*(pme->nkz+pme->pme_order-1));
699 /* Along overlap dim 1 we can send in multiple pulses in sum_fftgrid_dd.
700 * We do this with an offset buffer of equal size, so we need to allocate
701 * extra for the offset. That's what the (+1)*pme->nkz is for.
703 init_overlap_comm(&pme->overlap[1], pme->pme_order,
704 #if GMX_MPI
705 pme->mpi_comm_d[1],
706 #endif
707 pme->nnodes_minor, pme->nodeid_minor,
708 pme->nky,
709 (div_round_up(pme->nkx, pme->nnodes_major)+pme->pme_order+1)*pme->nkz);
711 /* Double-check for a limitation of the (current) sum_fftgrid_dd code.
712 * Note that gmx_pme_check_restrictions checked for this already.
714 if (pme->bUseThreads && pme->overlap[0].noverlap_nodes > 1)
716 gmx_incons("More than one communication pulse required for grid overlap communication along the major dimension while using threads");
719 snew(pme->bsp_mod[XX], pme->nkx);
720 snew(pme->bsp_mod[YY], pme->nky);
721 snew(pme->bsp_mod[ZZ], pme->nkz);
723 /* The required size of the interpolation grid, including overlap.
724 * The allocated size (pmegrid_n?) might be slightly larger.
726 pme->pmegrid_nx = pme->overlap[0].s2g1[pme->nodeid_major] -
727 pme->overlap[0].s2g0[pme->nodeid_major];
728 pme->pmegrid_ny = pme->overlap[1].s2g1[pme->nodeid_minor] -
729 pme->overlap[1].s2g0[pme->nodeid_minor];
730 pme->pmegrid_nz_base = pme->nkz;
731 pme->pmegrid_nz = pme->pmegrid_nz_base + pme->pme_order - 1;
732 set_grid_alignment(&pme->pmegrid_nz, pme->pme_order);
734 pme->pmegrid_start_ix = pme->overlap[0].s2g0[pme->nodeid_major];
735 pme->pmegrid_start_iy = pme->overlap[1].s2g0[pme->nodeid_minor];
736 pme->pmegrid_start_iz = 0;
738 make_gridindex5_to_localindex(pme->nkx,
739 pme->pmegrid_start_ix,
740 pme->pmegrid_nx - (pme->pme_order-1),
741 &pme->nnx, &pme->fshx);
742 make_gridindex5_to_localindex(pme->nky,
743 pme->pmegrid_start_iy,
744 pme->pmegrid_ny - (pme->pme_order-1),
745 &pme->nny, &pme->fshy);
746 make_gridindex5_to_localindex(pme->nkz,
747 pme->pmegrid_start_iz,
748 pme->pmegrid_nz_base,
749 &pme->nnz, &pme->fshz);
751 pme->spline_work = make_pme_spline_work(pme->pme_order);
753 ndata[0] = pme->nkx;
754 ndata[1] = pme->nky;
755 ndata[2] = pme->nkz;
756 /* It doesn't matter if pme->ngrids is larger than strictly needed here,
757 * since we only allocate and use the grids we actually need.
759 if (pme->doLJ)
761 pme->ngrids = ((ir->ljpme_combination_rule == eljpmeLB) ? DO_Q_AND_LJ_LB : DO_Q_AND_LJ);
763 else
765 pme->ngrids = DO_Q;
767 snew(pme->fftgrid, pme->ngrids);
768 snew(pme->cfftgrid, pme->ngrids);
769 snew(pme->pfft_setup, pme->ngrids);
771 for (i = 0; i < pme->ngrids; ++i)
773 if ((i < DO_Q && pme->doCoulomb && (i == 0 ||
774 bFreeEnergy_q)) ||
775 (i >= DO_Q && pme->doLJ && (i == 2 ||
776 bFreeEnergy_lj ||
777 ir->ljpme_combination_rule == eljpmeLB)))
779 pmegrids_init(&pme->pmegrid[i],
780 pme->pmegrid_nx, pme->pmegrid_ny, pme->pmegrid_nz,
781 pme->pmegrid_nz_base,
782 pme->pme_order,
783 pme->bUseThreads,
784 pme->nthread,
785 pme->overlap[0].s2g1[pme->nodeid_major]-pme->overlap[0].s2g0[pme->nodeid_major+1],
786 pme->overlap[1].s2g1[pme->nodeid_minor]-pme->overlap[1].s2g0[pme->nodeid_minor+1]);
787 /* This routine will allocate the grid data to fit the FFTs */
788 gmx_parallel_3dfft_init(&pme->pfft_setup[i], ndata,
789 &pme->fftgrid[i], &pme->cfftgrid[i],
790 pme->mpi_comm_d,
791 bReproducible, pme->nthread);
796 if (!pme->bP3M)
798 /* Use plain SPME B-spline interpolation */
799 make_bspline_moduli(pme->bsp_mod, pme->nkx, pme->nky, pme->nkz, pme->pme_order);
801 else
803 /* Use the P3M grid-optimized influence function */
804 make_p3m_bspline_moduli(pme->bsp_mod, pme->nkx, pme->nky, pme->nkz, pme->pme_order);
807 /* Use atc[0] for spreading */
808 init_atomcomm(pme, &pme->atc[0], nnodes_major > 1 ? 0 : 1, TRUE);
809 if (pme->ndecompdim >= 2)
811 init_atomcomm(pme, &pme->atc[1], 1, FALSE);
814 if (pme->nnodes == 1)
816 pme->atc[0].n = homenr;
817 pme_realloc_atomcomm_things(&pme->atc[0]);
820 pme->lb_buf1 = NULL;
821 pme->lb_buf2 = NULL;
822 pme->lb_buf_nalloc = 0;
824 pme_init_all_work(&pme->solve_work, pme->nthread, pme->nkx);
826 *pmedata = pme;
828 return 0;
831 int gmx_pme_reinit(struct gmx_pme_t **pmedata,
832 t_commrec * cr,
833 struct gmx_pme_t * pme_src,
834 const t_inputrec * ir,
835 ivec grid_size,
836 real ewaldcoeff_q,
837 real ewaldcoeff_lj)
839 t_inputrec irc;
840 int homenr;
841 int ret;
843 irc = *ir;
844 irc.nkx = grid_size[XX];
845 irc.nky = grid_size[YY];
846 irc.nkz = grid_size[ZZ];
848 if (pme_src->nnodes == 1)
850 homenr = pme_src->atc[0].n;
852 else
854 homenr = -1;
857 ret = gmx_pme_init(pmedata, cr, pme_src->nnodes_major, pme_src->nnodes_minor,
858 &irc, homenr, pme_src->bFEP_q, pme_src->bFEP_lj, FALSE, ewaldcoeff_q, ewaldcoeff_lj, pme_src->nthread);
860 if (ret == 0)
862 /* We can easily reuse the allocated pme grids in pme_src */
863 reuse_pmegrids(&pme_src->pmegrid[PME_GRID_QA], &(*pmedata)->pmegrid[PME_GRID_QA]);
864 /* We would like to reuse the fft grids, but that's harder */
867 return ret;
870 void gmx_pme_calc_energy(struct gmx_pme_t *pme, int n, rvec *x, real *q, real *V)
872 pme_atomcomm_t *atc;
873 pmegrids_t *grid;
875 if (pme->nnodes > 1)
877 gmx_incons("gmx_pme_calc_energy called in parallel");
879 if (pme->bFEP_q)
881 gmx_incons("gmx_pme_calc_energy with free energy");
884 atc = &pme->atc_energy;
885 atc->nthread = 1;
886 if (atc->spline == NULL)
888 snew(atc->spline, atc->nthread);
890 atc->nslab = 1;
891 atc->bSpread = TRUE;
892 atc->pme_order = pme->pme_order;
893 atc->n = n;
894 pme_realloc_atomcomm_things(atc);
895 atc->x = x;
896 atc->coefficient = q;
898 /* We only use the A-charges grid */
899 grid = &pme->pmegrid[PME_GRID_QA];
901 /* Only calculate the spline coefficients, don't actually spread */
902 spread_on_grid(pme, atc, NULL, TRUE, FALSE, pme->fftgrid[PME_GRID_QA], FALSE, PME_GRID_QA);
904 *V = gather_energy_bsplines(pme, grid->grid.grid, atc);
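/* Usage sketch (illustration only): in a serial setup (pme->nnodes == 1) a
 * caller with n positions x[] and charges q[] can obtain the mesh energy of
 * those charges on the A-state Coulomb grid via
 *
 *     real V;
 *     gmx_pme_calc_energy(pme, n, x, q, &V);
 */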
907 /*! \brief Calculate initial Lorentz-Berthelot coefficients for LJ-PME */
908 static void
909 calc_initial_lb_coeffs(struct gmx_pme_t *pme, real *local_c6, real *local_sigma)
911 int i;
912 for (i = 0; i < pme->atc[0].n; ++i)
914 real sigma4;
915 sigma4 = local_sigma[i];
916 sigma4 = sigma4*sigma4;
917 sigma4 = sigma4*sigma4;
918 pme->atc[0].coefficient[i] = local_c6[i] / sigma4;
922 /*! \brief Calculate next Lorentz-Berthelot coefficients for LJ-PME */
923 static void
924 calc_next_lb_coeffs(struct gmx_pme_t *pme, real *local_sigma)
926 int i;
928 for (i = 0; i < pme->atc[0].n; ++i)
930 pme->atc[0].coefficient[i] *= local_sigma[i];
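/* Taken together (arithmetic of the two helpers above): after
 * calc_initial_lb_coeffs() the coefficient of atom i is
 * local_c6[i]/local_sigma[i]^4, and every subsequent calc_next_lb_coeffs()
 * call multiplies it by local_sigma[i], so the seven LJ-PME grids
 * (grid_index 2..8) are spread with local_c6[i]*local_sigma[i]^(grid_index-5),
 * i.e. c6/sigma^3 up through c6*sigma^3.
 */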
934 int gmx_pme_do(struct gmx_pme_t *pme,
935 int start, int homenr,
936 rvec x[], rvec f[],
937 real chargeA[], real chargeB[],
938 real c6A[], real c6B[],
939 real sigmaA[], real sigmaB[],
940 matrix box, t_commrec *cr,
941 int maxshift_x, int maxshift_y,
942 t_nrnb *nrnb, gmx_wallcycle_t wcycle,
943 matrix vir_q, matrix vir_lj,
944 real *energy_q, real *energy_lj,
945 real lambda_q, real lambda_lj,
946 real *dvdlambda_q, real *dvdlambda_lj,
947 int flags)
949 int d, i, j, npme, grid_index, max_grid_index;
950 int n_d;
951 pme_atomcomm_t *atc = NULL;
952 pmegrids_t *pmegrid = NULL;
953 real *grid = NULL;
954 rvec *f_d;
955 real *coefficient = NULL;
956 real energy_AB[4];
957 matrix vir_AB[4];
958 real scale, lambda;
959 gmx_bool bClearF;
960 gmx_parallel_3dfft_t pfft_setup;
961 real * fftgrid;
962 t_complex * cfftgrid;
963 int thread;
964 gmx_bool bFirst, bDoSplines;
965 int fep_state;
966 int fep_states_lj = pme->bFEP_lj ? 2 : 1;
967 const gmx_bool bCalcEnerVir = flags & GMX_PME_CALC_ENER_VIR;
968 const gmx_bool bBackFFT = flags & (GMX_PME_CALC_F | GMX_PME_CALC_POT);
969 const gmx_bool bCalcF = flags & GMX_PME_CALC_F;
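/* Typical flag combination (an assumption, not taken from a caller in this
 * file): a rank that wants forces, energies and the virial would pass
 *
 *     flags = GMX_PME_SPREAD | GMX_PME_SOLVE | GMX_PME_CALC_F | GMX_PME_CALC_ENER_VIR;
 *
 * bBackFFT above is then TRUE, since GMX_PME_CALC_F requires the backward FFT.
 */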
971 assert(pme->nnodes > 0);
972 assert(pme->nnodes == 1 || pme->ndecompdim > 0);
974 if (pme->nnodes > 1)
976 atc = &pme->atc[0];
977 atc->npd = homenr;
978 if (atc->npd > atc->pd_nalloc)
980 atc->pd_nalloc = over_alloc_dd(atc->npd);
981 srenew(atc->pd, atc->pd_nalloc);
983 for (d = pme->ndecompdim-1; d >= 0; d--)
985 atc = &pme->atc[d];
986 atc->maxshift = (atc->dimind == 0 ? maxshift_x : maxshift_y);
989 else
991 atc = &pme->atc[0];
992 /* This could be necessary for TPI */
993 pme->atc[0].n = homenr;
994 if (DOMAINDECOMP(cr))
996 pme_realloc_atomcomm_things(atc);
998 atc->x = x;
999 atc->f = f;
1002 gmx::invertBoxMatrix(box, pme->recipbox);
1003 bFirst = TRUE;
1005 /* For simplicity, we construct the splines for all particles if
1006 * more than one PME calculation is needed. Some optimization
1007 * could be done by keeping track of which atoms have splines
1008 * constructed, and construct new splines on each pass for atoms
1009 * that don't yet have them.
1012 bDoSplines = pme->bFEP || (pme->doCoulomb && pme->doLJ);
1014 /* We need a maximum of four separate PME calculations:
1015 * grid_index=0: Coulomb PME with charges from state A
1016 * grid_index=1: Coulomb PME with charges from state B
1017 * grid_index=2: LJ PME with C6 from state A
1018 * grid_index=3: LJ PME with C6 from state B
1019 * For Lorentz-Berthelot combination rules, a separate loop is used to
1020 * calculate all the terms
1023 /* If we are doing LJ-PME with LB, we only do Q here */
1024 max_grid_index = (pme->ljpme_combination_rule == eljpmeLB) ? DO_Q : DO_Q_AND_LJ;
1026 for (grid_index = 0; grid_index < max_grid_index; ++grid_index)
1028 /* Check if we should do calculations at this grid_index
1029 * If grid_index is odd we should be doing FEP
1030 * If grid_index < 2 we should be doing electrostatic PME
1031 * If grid_index >= 2 we should be doing LJ-PME
1033 if ((grid_index < DO_Q && (!pme->doCoulomb ||
1034 (grid_index == 1 && !pme->bFEP_q))) ||
1035 (grid_index >= DO_Q && (!pme->doLJ ||
1036 (grid_index == 3 && !pme->bFEP_lj))))
1038 continue;
1040 /* Unpack structure */
1041 pmegrid = &pme->pmegrid[grid_index];
1042 fftgrid = pme->fftgrid[grid_index];
1043 cfftgrid = pme->cfftgrid[grid_index];
1044 pfft_setup = pme->pfft_setup[grid_index];
1045 switch (grid_index)
1047 case 0: coefficient = chargeA + start; break;
1048 case 1: coefficient = chargeB + start; break;
1049 case 2: coefficient = c6A + start; break;
1050 case 3: coefficient = c6B + start; break;
1053 grid = pmegrid->grid.grid;
1055 if (debug)
1057 fprintf(debug, "PME: number of ranks = %d, rank = %d\n",
1058 cr->nnodes, cr->nodeid);
1059 fprintf(debug, "Grid = %p\n", (void*)grid);
1060 if (grid == NULL)
1062 gmx_fatal(FARGS, "No grid!");
1065 where();
1067 if (pme->nnodes == 1)
1069 atc->coefficient = coefficient;
1071 else
1073 wallcycle_start(wcycle, ewcPME_REDISTXF);
1074 do_redist_pos_coeffs(pme, cr, start, homenr, bFirst, x, coefficient);
1075 where();
1077 wallcycle_stop(wcycle, ewcPME_REDISTXF);
1080 if (debug)
1082 fprintf(debug, "Rank= %6d, pme local particles=%6d\n",
1083 cr->nodeid, atc->n);
1086 if (flags & GMX_PME_SPREAD)
1088 wallcycle_start(wcycle, ewcPME_SPREADGATHER);
1090 /* Spread the coefficients on a grid */
1091 spread_on_grid(pme, &pme->atc[0], pmegrid, bFirst, TRUE, fftgrid, bDoSplines, grid_index);
1093 if (bFirst)
1095 inc_nrnb(nrnb, eNR_WEIGHTS, DIM*atc->n);
1097 inc_nrnb(nrnb, eNR_SPREADBSP,
1098 pme->pme_order*pme->pme_order*pme->pme_order*atc->n);
1100 if (!pme->bUseThreads)
1102 wrap_periodic_pmegrid(pme, grid);
1104 /* sum contributions to local grid from other nodes */
1105 #if GMX_MPI
1106 if (pme->nnodes > 1)
1108 gmx_sum_qgrid_dd(pme, grid, GMX_SUM_GRID_FORWARD);
1109 where();
1111 #endif
1113 copy_pmegrid_to_fftgrid(pme, grid, fftgrid, grid_index);
1116 wallcycle_stop(wcycle, ewcPME_SPREADGATHER);
1118 /* TODO If the OpenMP and single-threaded implementations
1119 converge, then spread_on_grid() and
1120 copy_pmegrid_to_fftgrid() will perhaps live in the same
1121 source file and the following debugging function can live
1122 there too. */
1123 #ifdef DEBUG_PME
1124 dump_local_fftgrid(pme, fftgrid);
1125 exit(0);
1126 #endif
1129 /* Here we start a large thread parallel region */
1130 #pragma omp parallel num_threads(pme->nthread) private(thread)
1134 thread = gmx_omp_get_thread_num();
1135 if (flags & GMX_PME_SOLVE)
1137 int loop_count;
1139 /* do 3d-fft */
1140 if (thread == 0)
1142 wallcycle_start(wcycle, ewcPME_FFT);
1144 gmx_parallel_3dfft_execute(pfft_setup, GMX_FFT_REAL_TO_COMPLEX,
1145 thread, wcycle);
1146 if (thread == 0)
1148 wallcycle_stop(wcycle, ewcPME_FFT);
1150 where();
1152 /* solve in k-space for our local cells */
1153 if (thread == 0)
1155 wallcycle_start(wcycle, (grid_index < DO_Q ? ewcPME_SOLVE : ewcLJPME));
1157 if (grid_index < DO_Q)
1159 loop_count =
1160 solve_pme_yzx(pme, cfftgrid,
1161 box[XX][XX]*box[YY][YY]*box[ZZ][ZZ],
1162 bCalcEnerVir,
1163 pme->nthread, thread);
1165 else
1167 loop_count =
1168 solve_pme_lj_yzx(pme, &cfftgrid, FALSE,
1169 box[XX][XX]*box[YY][YY]*box[ZZ][ZZ],
1170 bCalcEnerVir,
1171 pme->nthread, thread);
1174 if (thread == 0)
1176 wallcycle_stop(wcycle, (grid_index < DO_Q ? ewcPME_SOLVE : ewcLJPME));
1177 where();
1178 inc_nrnb(nrnb, eNR_SOLVEPME, loop_count);
1182 if (bBackFFT)
1184 /* do 3d-invfft */
1185 if (thread == 0)
1187 where();
1188 wallcycle_start(wcycle, ewcPME_FFT);
1190 gmx_parallel_3dfft_execute(pfft_setup, GMX_FFT_COMPLEX_TO_REAL,
1191 thread, wcycle);
1192 if (thread == 0)
1194 wallcycle_stop(wcycle, ewcPME_FFT);
1196 where();
1198 if (pme->nodeid == 0)
1200 real ntot = pme->nkx*pme->nky*pme->nkz;
1201 npme = static_cast<int>(ntot*std::log(ntot)/std::log(2.0));
1202 inc_nrnb(nrnb, eNR_FFT, 2*npme);
1205 /* Note: this wallcycle region is closed below
1206 outside an OpenMP region, so take care if
1207 refactoring code here. */
1208 wallcycle_start(wcycle, ewcPME_SPREADGATHER);
1211 copy_fftgrid_to_pmegrid(pme, fftgrid, grid, grid_index, pme->nthread, thread);
1213 } GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
1215 /* End of thread parallel section.
1216 * With MPI we have to synchronize here before gmx_sum_qgrid_dd.
1219 if (bBackFFT)
1221 /* distribute local grid to all nodes */
1222 #if GMX_MPI
1223 if (pme->nnodes > 1)
1225 gmx_sum_qgrid_dd(pme, grid, GMX_SUM_GRID_BACKWARD);
1227 #endif
1228 where();
1230 unwrap_periodic_pmegrid(pme, grid);
1233 if (bCalcF)
1235 /* interpolate forces for our local atoms */
1237 where();
1239 /* If we are running without parallelization,
1240 * atc->f is the actual force array, not a buffer,
1241 * therefore we should not clear it.
1243 lambda = grid_index < DO_Q ? lambda_q : lambda_lj;
1244 bClearF = (bFirst && PAR(cr));
1245 #pragma omp parallel for num_threads(pme->nthread) schedule(static)
1246 for (thread = 0; thread < pme->nthread; thread++)
1250 gather_f_bsplines(pme, grid, bClearF, atc,
1251 &atc->spline[thread],
1252 pme->bFEP ? (grid_index % 2 == 0 ? 1.0-lambda : lambda) : 1.0);
1254 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
1257 where();
1259 inc_nrnb(nrnb, eNR_GATHERFBSP,
1260 pme->pme_order*pme->pme_order*pme->pme_order*pme->atc[0].n);
1261 /* Note: this wallcycle region is opened above inside an OpenMP
1262 region, so take care if refactoring code here. */
1263 wallcycle_stop(wcycle, ewcPME_SPREADGATHER);
1266 if (bCalcEnerVir)
1268 /* This should only be called on the master thread
1269 * and after the threads have synchronized.
1271 if (grid_index < 2)
1273 get_pme_ener_vir_q(pme->solve_work, pme->nthread, &energy_AB[grid_index], vir_AB[grid_index]);
1275 else
1277 get_pme_ener_vir_lj(pme->solve_work, pme->nthread, &energy_AB[grid_index], vir_AB[grid_index]);
1280 bFirst = FALSE;
1281 } /* of grid_index-loop */
1283 /* For Lorentz-Berthelot combination rules in LJ-PME, we need to calculate
1284 * seven terms. */
1286 if (pme->doLJ && pme->ljpme_combination_rule == eljpmeLB)
1288 /* Loop over A- and B-state if we are doing FEP */
1289 for (fep_state = 0; fep_state < fep_states_lj; ++fep_state)
1291 real *local_c6 = NULL, *local_sigma = NULL, *RedistC6 = NULL, *RedistSigma = NULL;
1292 if (pme->nnodes == 1)
1294 if (pme->lb_buf1 == NULL)
1296 pme->lb_buf_nalloc = pme->atc[0].n;
1297 snew(pme->lb_buf1, pme->lb_buf_nalloc);
1299 pme->atc[0].coefficient = pme->lb_buf1;
1300 switch (fep_state)
1302 case 0:
1303 local_c6 = c6A;
1304 local_sigma = sigmaA;
1305 break;
1306 case 1:
1307 local_c6 = c6B;
1308 local_sigma = sigmaB;
1309 break;
1310 default:
1311 gmx_incons("Trying to access wrong FEP-state in LJ-PME routine");
1314 else
1316 atc = &pme->atc[0];
1317 switch (fep_state)
1319 case 0:
1320 RedistC6 = c6A;
1321 RedistSigma = sigmaA;
1322 break;
1323 case 1:
1324 RedistC6 = c6B;
1325 RedistSigma = sigmaB;
1326 break;
1327 default:
1328 gmx_incons("Trying to access wrong FEP-state in LJ-PME routine");
1330 wallcycle_start(wcycle, ewcPME_REDISTXF);
1332 do_redist_pos_coeffs(pme, cr, start, homenr, bFirst, x, RedistC6);
1333 if (pme->lb_buf_nalloc < atc->n)
1335 pme->lb_buf_nalloc = atc->nalloc;
1336 srenew(pme->lb_buf1, pme->lb_buf_nalloc);
1337 srenew(pme->lb_buf2, pme->lb_buf_nalloc);
1339 local_c6 = pme->lb_buf1;
1340 for (i = 0; i < atc->n; ++i)
1342 local_c6[i] = atc->coefficient[i];
1344 where();
1346 do_redist_pos_coeffs(pme, cr, start, homenr, FALSE, x, RedistSigma);
1347 local_sigma = pme->lb_buf2;
1348 for (i = 0; i < atc->n; ++i)
1350 local_sigma[i] = atc->coefficient[i];
1352 where();
1354 wallcycle_stop(wcycle, ewcPME_REDISTXF);
1356 calc_initial_lb_coeffs(pme, local_c6, local_sigma);
1358 /* Seven terms in LJ-PME with LB, grid_index < 2 reserved for electrostatics */
1359 for (grid_index = 2; grid_index < 9; ++grid_index)
1361 /* Unpack structure */
1362 pmegrid = &pme->pmegrid[grid_index];
1363 fftgrid = pme->fftgrid[grid_index];
1364 pfft_setup = pme->pfft_setup[grid_index];
1365 calc_next_lb_coeffs(pme, local_sigma);
1366 grid = pmegrid->grid.grid;
1367 where();
1369 if (flags & GMX_PME_SPREAD)
1371 wallcycle_start(wcycle, ewcPME_SPREADGATHER);
1372 /* Spread the c6 on a grid */
1373 spread_on_grid(pme, &pme->atc[0], pmegrid, bFirst, TRUE, fftgrid, bDoSplines, grid_index);
1375 if (bFirst)
1377 inc_nrnb(nrnb, eNR_WEIGHTS, DIM*atc->n);
1380 inc_nrnb(nrnb, eNR_SPREADBSP,
1381 pme->pme_order*pme->pme_order*pme->pme_order*atc->n);
1382 if (pme->nthread == 1)
1384 wrap_periodic_pmegrid(pme, grid);
1385 /* sum contributions to local grid from other nodes */
1386 #if GMX_MPI
1387 if (pme->nnodes > 1)
1389 gmx_sum_qgrid_dd(pme, grid, GMX_SUM_GRID_FORWARD);
1390 where();
1392 #endif
1393 copy_pmegrid_to_fftgrid(pme, grid, fftgrid, grid_index);
1395 wallcycle_stop(wcycle, ewcPME_SPREADGATHER);
1397 /* Here we start a large thread parallel region */
1398 #pragma omp parallel num_threads(pme->nthread) private(thread)
1402 thread = gmx_omp_get_thread_num();
1403 if (flags & GMX_PME_SOLVE)
1405 /* do 3d-fft */
1406 if (thread == 0)
1408 wallcycle_start(wcycle, ewcPME_FFT);
1411 gmx_parallel_3dfft_execute(pfft_setup, GMX_FFT_REAL_TO_COMPLEX,
1412 thread, wcycle);
1413 if (thread == 0)
1415 wallcycle_stop(wcycle, ewcPME_FFT);
1417 where();
1420 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
1422 bFirst = FALSE;
1424 if (flags & GMX_PME_SOLVE)
1426 /* solve in k-space for our local cells */
1427 #pragma omp parallel num_threads(pme->nthread) private(thread)
1431 int loop_count;
1432 thread = gmx_omp_get_thread_num();
1433 if (thread == 0)
1435 wallcycle_start(wcycle, ewcLJPME);
1438 loop_count =
1439 solve_pme_lj_yzx(pme, &pme->cfftgrid[2], TRUE,
1440 box[XX][XX]*box[YY][YY]*box[ZZ][ZZ],
1441 bCalcEnerVir,
1442 pme->nthread, thread);
1443 if (thread == 0)
1445 wallcycle_stop(wcycle, ewcLJPME);
1446 where();
1447 inc_nrnb(nrnb, eNR_SOLVEPME, loop_count);
1450 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
1454 if (bCalcEnerVir)
1456 /* This should only be called on the master thread and
1457 * after the threads have synchronized.
1459 get_pme_ener_vir_lj(pme->solve_work, pme->nthread, &energy_AB[2+fep_state], vir_AB[2+fep_state]);
1462 if (bBackFFT)
1464 bFirst = !pme->doCoulomb;
1465 calc_initial_lb_coeffs(pme, local_c6, local_sigma);
1466 for (grid_index = 8; grid_index >= 2; --grid_index)
1468 /* Unpack structure */
1469 pmegrid = &pme->pmegrid[grid_index];
1470 fftgrid = pme->fftgrid[grid_index];
1471 pfft_setup = pme->pfft_setup[grid_index];
1472 grid = pmegrid->grid.grid;
1473 calc_next_lb_coeffs(pme, local_sigma);
1474 where();
1475 #pragma omp parallel num_threads(pme->nthread) private(thread)
1479 thread = gmx_omp_get_thread_num();
1480 /* do 3d-invfft */
1481 if (thread == 0)
1483 where();
1484 wallcycle_start(wcycle, ewcPME_FFT);
1487 gmx_parallel_3dfft_execute(pfft_setup, GMX_FFT_COMPLEX_TO_REAL,
1488 thread, wcycle);
1489 if (thread == 0)
1491 wallcycle_stop(wcycle, ewcPME_FFT);
1493 where();
1495 if (pme->nodeid == 0)
1497 real ntot = pme->nkx*pme->nky*pme->nkz;
1498 npme = static_cast<int>(ntot*std::log(ntot)/std::log(2.0));
1499 inc_nrnb(nrnb, eNR_FFT, 2*npme);
1501 wallcycle_start(wcycle, ewcPME_SPREADGATHER);
1504 copy_fftgrid_to_pmegrid(pme, fftgrid, grid, grid_index, pme->nthread, thread);
1506 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
1507 } /*#pragma omp parallel*/
1509 /* distribute local grid to all nodes */
1510 #if GMX_MPI
1511 if (pme->nnodes > 1)
1513 gmx_sum_qgrid_dd(pme, grid, GMX_SUM_GRID_BACKWARD);
1515 #endif
1516 where();
1518 unwrap_periodic_pmegrid(pme, grid);
1520 if (bCalcF)
1522 /* interpolate forces for our local atoms */
1523 where();
1524 bClearF = (bFirst && PAR(cr));
1525 scale = pme->bFEP ? (fep_state < 1 ? 1.0-lambda_lj : lambda_lj) : 1.0;
1526 scale *= lb_scale_factor[grid_index-2];
1528 #pragma omp parallel for num_threads(pme->nthread) schedule(static)
1529 for (thread = 0; thread < pme->nthread; thread++)
1533 gather_f_bsplines(pme, grid, bClearF, &pme->atc[0],
1534 &pme->atc[0].spline[thread],
1535 scale);
1537 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
1540 where();
1542 inc_nrnb(nrnb, eNR_GATHERFBSP,
1543 pme->pme_order*pme->pme_order*pme->pme_order*pme->atc[0].n);
1545 wallcycle_stop(wcycle, ewcPME_SPREADGATHER);
1547 bFirst = FALSE;
1548 } /* for (grid_index = 8; grid_index >= 2; --grid_index) */
1549 } /* if (bCalcF) */
1550 } /* for (fep_state = 0; fep_state < fep_states_lj; ++fep_state) */
1551 } /* if (pme->doLJ && pme->ljpme_combination_rule == eljpmeLB) */
1553 if (bCalcF && pme->nnodes > 1)
1555 wallcycle_start(wcycle, ewcPME_REDISTXF);
1556 for (d = 0; d < pme->ndecompdim; d++)
1558 atc = &pme->atc[d];
1559 if (d == pme->ndecompdim - 1)
1561 n_d = homenr;
1562 f_d = f + start;
1564 else
1566 n_d = pme->atc[d+1].n;
1567 f_d = pme->atc[d+1].f;
1569 if (DOMAINDECOMP(cr))
1571 dd_pmeredist_f(pme, atc, n_d, f_d,
1572 d == pme->ndecompdim-1 && pme->bPPnode);
1576 wallcycle_stop(wcycle, ewcPME_REDISTXF);
1578 where();
1580 if (bCalcEnerVir)
1582 if (pme->doCoulomb)
1584 if (!pme->bFEP_q)
1586 *energy_q = energy_AB[0];
1587 m_add(vir_q, vir_AB[0], vir_q);
1589 else
1591 *energy_q = (1.0-lambda_q)*energy_AB[0] + lambda_q*energy_AB[1];
1592 *dvdlambda_q += energy_AB[1] - energy_AB[0];
1593 for (i = 0; i < DIM; i++)
1595 for (j = 0; j < DIM; j++)
1597 vir_q[i][j] += (1.0-lambda_q)*vir_AB[0][i][j] +
1598 lambda_q*vir_AB[1][i][j];
1602 if (debug)
1604 fprintf(debug, "Electrostatic PME mesh energy: %g\n", *energy_q);
1607 else
1609 *energy_q = 0;
1612 if (pme->doLJ)
1614 if (!pme->bFEP_lj)
1616 *energy_lj = energy_AB[2];
1617 m_add(vir_lj, vir_AB[2], vir_lj);
1619 else
1621 *energy_lj = (1.0-lambda_lj)*energy_AB[2] + lambda_lj*energy_AB[3];
1622 *dvdlambda_lj += energy_AB[3] - energy_AB[2];
1623 for (i = 0; i < DIM; i++)
1625 for (j = 0; j < DIM; j++)
1627 vir_lj[i][j] += (1.0-lambda_lj)*vir_AB[2][i][j] + lambda_lj*vir_AB[3][i][j];
1631 if (debug)
1633 fprintf(debug, "Lennard-Jones PME mesh energy: %g\n", *energy_lj);
1636 else
1638 *energy_lj = 0;
1641 return 0;
1644 int gmx_pme_destroy(struct gmx_pme_t **pmedata)
1646 struct gmx_pme_t *pme = *pmedata;
1648 sfree(pme->nnx);
1649 sfree(pme->nny);
1650 sfree(pme->nnz);
1651 sfree(pme->fshx);
1652 sfree(pme->fshy);
1653 sfree(pme->fshz);
1655 for (int i = 0; i < pme->ngrids; ++i)
1657 pmegrids_destroy(&pme->pmegrid[i]);
1658 gmx_parallel_3dfft_destroy(pme->pfft_setup[i]);
1661 for (int i = 0; i < pme->ndecompdim; i++)
1663 destroy_atomcomm(&pme->atc[i]);
1666 destroy_overlap_comm(&pme->overlap[0]);
1667 destroy_overlap_comm(&pme->overlap[1]);
1669 sfree(pme->lb_buf1);
1670 sfree(pme->lb_buf2);
1672 sfree(pme->bufv);
1673 sfree(pme->bufr);
1675 pme_free_all_work(&pme->solve_work, pme->nthread);
1677 sfree(pme->sum_qgrid_tmp);
1678 sfree(pme->sum_qgrid_dd_tmp);
1680 sfree(*pmedata);
1681 *pmedata = NULL;
1683 return 0;