Fix trivial PME memory leaks
[gromacs.git] / src / gromacs / ewald / pme.cpp
1 /*
2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5 * Copyright (c) 2001-2004, The GROMACS development team.
6 * Copyright (c) 2013,2014,2015,2016, by the GROMACS development team, led by
7 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
8 * and including many others, as listed in the AUTHORS file in the
9 * top-level source directory and at http://www.gromacs.org.
11 * GROMACS is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public License
13 * as published by the Free Software Foundation; either version 2.1
14 * of the License, or (at your option) any later version.
16 * GROMACS is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * Lesser General Public License for more details.
21 * You should have received a copy of the GNU Lesser General Public
22 * License along with GROMACS; if not, see
23 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
24 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
26 * If you want to redistribute modifications to GROMACS, please
27 * consider that scientific software is very special. Version
28 * control is crucial - bugs must be traceable. We will be happy to
29 * consider code for inclusion in the official distribution, but
30 * derived work must not be called official GROMACS. Details are found
31 * in the README & COPYING files - if they are missing, get the
32 * official version at http://www.gromacs.org.
34 * To help us fund GROMACS development, we humbly ask that you cite
35 * the research papers on the package. Check out http://www.gromacs.org.
37 /*! \internal \file
39 * \brief This file contains function definitions necessary for
40 * computing energies and forces for the PME long-ranged part (Coulomb
41 * and LJ).
43 * \author Erik Lindahl <erik@kth.se>
44 * \author Berk Hess <hess@kth.se>
45 * \ingroup module_ewald
47 /* IMPORTANT FOR DEVELOPERS:
49 * Triclinic pme stuff isn't entirely trivial, and we've experienced
50 * some bugs during development (many of them due to me). To avoid
51 * this in the future, please check the following things if you make
52 * changes in this file:
54 * 1. You should obtain identical (at least to the PME precision)
55 * energies, forces, and virial for
56 * a rectangular box and a triclinic one where the z (or y) axis is
57 * tilted a whole box side. For instance you could use these boxes:
59 * rectangular triclinic
60 * 2 0 0 2 0 0
61 * 0 2 0 0 2 0
62 * 0 0 6 2 2 6
64 * 2. You should check the energy conservation in a triclinic box.
66 * It might seem like overkill, but better safe than sorry.
67 * /Erik 001109
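/* Note: the two example boxes above describe the same periodic lattice,
 * because the tilted c vector (2,2,6) differs from the rectangular (0,0,6)
 * by exactly a+b. That is why energies, forces and the virial are expected
 * to agree to PME precision between the two runs.
 */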
70 #include "gmxpre.h"
72 #include "pme.h"
74 #include "config.h"
76 #include <assert.h>
77 #include <math.h>
78 #include <stdio.h>
79 #include <stdlib.h>
80 #include <string.h>
82 #include <algorithm>
84 #include "gromacs/fft/parallel_3dfft.h"
85 #include "gromacs/fileio/pdbio.h"
86 #include "gromacs/gmxlib/network.h"
87 #include "gromacs/gmxlib/nrnb.h"
88 #include "gromacs/math/gmxcomplex.h"
89 #include "gromacs/math/invertmatrix.h"
90 #include "gromacs/math/units.h"
91 #include "gromacs/math/vec.h"
92 #include "gromacs/math/vectypes.h"
93 #include "gromacs/mdtypes/commrec.h"
94 #include "gromacs/mdtypes/forcerec.h"
95 #include "gromacs/mdtypes/inputrec.h"
96 #include "gromacs/mdtypes/md_enums.h"
97 #include "gromacs/pbcutil/pbc.h"
98 #include "gromacs/timing/cyclecounter.h"
99 #include "gromacs/timing/wallcycle.h"
100 #include "gromacs/timing/walltime_accounting.h"
101 #include "gromacs/utility/basedefinitions.h"
102 #include "gromacs/utility/exceptions.h"
103 #include "gromacs/utility/fatalerror.h"
104 #include "gromacs/utility/gmxmpi.h"
105 #include "gromacs/utility/gmxomp.h"
106 #include "gromacs/utility/real.h"
107 #include "gromacs/utility/smalloc.h"
109 #include "calculate-spline-moduli.h"
110 #include "pme-gather.h"
111 #include "pme-grid.h"
112 #include "pme-internal.h"
113 #include "pme-redistribute.h"
114 #include "pme-solve.h"
115 #include "pme-spline-work.h"
116 #include "pme-spread.h"
118 /*! \brief Number of bytes in a cache line.
120 * Must also be a multiple of the SIMD and SIMD4 register size, to
121 * preserve alignment.
123 const int gmxCacheLineSize = 64;
125 //! Set up coordinate communication
126 static void setup_coordinate_communication(pme_atomcomm_t *atc)
128 int nslab, n, i;
129 int fw, bw;
131 nslab = atc->nslab;
133 n = 0;
134 for (i = 1; i <= nslab/2; i++)
136 fw = (atc->nodeid + i) % nslab;
137 bw = (atc->nodeid - i + nslab) % nslab;
138 if (n < nslab - 1)
140 atc->node_dest[n] = fw;
141 atc->node_src[n] = bw;
142 n++;
144 if (n < nslab - 1)
146 atc->node_dest[n] = bw;
147 atc->node_src[n] = fw;
148 n++;
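/* Worked example (illustrative numbers only): for nslab = 4 and nodeid = 0
 * the loop above produces node_dest = {1, 3, 2} and node_src = {3, 1, 2},
 * i.e. forward/backward neighbours at increasing distance, with the
 * half-way slab appearing only once.
 */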
153 /*! \brief Round \p n up to the next multiple of \p f */
154 static int mult_up(int n, int f)
156 return ((n + f - 1)/f)*f;
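/* Example: mult_up(10, 4) returns 12, mult_up(12, 4) returns 12. */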
159 /*! \brief Return estimate of the load imbalance from the PME grid not being a good match for the number of PME ranks */
160 static double estimate_pme_load_imbalance(struct gmx_pme_t *pme)
162 int nma, nmi;
163 double n1, n2, n3;
165 nma = pme->nnodes_major;
166 nmi = pme->nnodes_minor;
168 n1 = mult_up(pme->nkx, nma)*mult_up(pme->nky, nmi)*pme->nkz;
169 n2 = mult_up(pme->nkx, nma)*mult_up(pme->nkz, nmi)*pme->nky;
170 n3 = mult_up(pme->nky, nma)*mult_up(pme->nkz, nmi)*pme->nkx;
172 /* pme_solve is roughly double the cost of an fft */
174 return (n1 + n2 + 3*n3)/(double)(6*pme->nkx*pme->nky*pme->nkz);
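/* Worked example (illustrative numbers only): for a 3x3x3 grid on
 * nnodes_major = 2, nnodes_minor = 1 ranks, n1 = n2 = n3 = 4*3*3 = 36,
 * so the estimate is (36 + 36 + 3*36)/(6*27) = 180/162, i.e. roughly an
 * 11% overhead from padding nkx = 3 up to 4.
 */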
177 /*! \brief Initialize atom communication data structure */
178 static void init_atomcomm(struct gmx_pme_t *pme, pme_atomcomm_t *atc,
179 int dimind, gmx_bool bSpread)
181 int thread;
183 atc->dimind = dimind;
184 atc->nslab = 1;
185 atc->nodeid = 0;
186 atc->pd_nalloc = 0;
187 #if GMX_MPI
188 if (pme->nnodes > 1)
190 atc->mpi_comm = pme->mpi_comm_d[dimind];
191 MPI_Comm_size(atc->mpi_comm, &atc->nslab);
192 MPI_Comm_rank(atc->mpi_comm, &atc->nodeid);
194 if (debug)
196 fprintf(debug, "For PME atom communication in dimind %d: nslab %d rank %d\n", atc->dimind, atc->nslab, atc->nodeid);
198 #endif
200 atc->bSpread = bSpread;
201 atc->pme_order = pme->pme_order;
203 if (atc->nslab > 1)
205 snew(atc->node_dest, atc->nslab);
206 snew(atc->node_src, atc->nslab);
207 setup_coordinate_communication(atc);
209 snew(atc->count_thread, pme->nthread);
210 for (thread = 0; thread < pme->nthread; thread++)
212 snew(atc->count_thread[thread], atc->nslab);
214 atc->count = atc->count_thread[0];
215 snew(atc->rcount, atc->nslab);
216 snew(atc->buf_index, atc->nslab);
219 atc->nthread = pme->nthread;
220 if (atc->nthread > 1)
222 snew(atc->thread_plist, atc->nthread);
224 snew(atc->spline, atc->nthread);
225 for (thread = 0; thread < atc->nthread; thread++)
227 if (atc->nthread > 1)
229 snew(atc->thread_plist[thread].n, atc->nthread+2*gmxCacheLineSize);
230 atc->thread_plist[thread].n += gmxCacheLineSize;
232 snew(atc->spline[thread].thread_one, pme->nthread);
233 atc->spline[thread].thread_one[thread] = 1;
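/* Note on the thread_plist[thread].n allocation above: the array is
 * over-allocated by 2*gmxCacheLineSize ints and the stored pointer is
 * advanced by gmxCacheLineSize, presumably so that each thread's counters
 * sit on their own cache lines. destroy_atomcomm() therefore subtracts the
 * same offset again before calling sfree(), so the pointer passed to
 * sfree() matches the one returned by snew().
 */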
237 /*! \brief Destroy an atom communication data structure and its child structs */
238 static void destroy_atomcomm(pme_atomcomm_t *atc)
240 sfree(atc->pd);
241 if (atc->nslab > 1)
243 sfree(atc->node_dest);
244 sfree(atc->node_src);
245 for (int i = 0; i < atc->nthread; i++)
247 sfree(atc->count_thread[i]);
249 sfree(atc->count_thread);
250 sfree(atc->rcount);
251 sfree(atc->buf_index);
253 sfree(atc->x);
254 sfree(atc->coefficient);
255 sfree(atc->f);
257 sfree(atc->idx);
258 sfree(atc->fractx);
260 sfree(atc->thread_idx);
261 for (int i = 0; i < atc->nthread; i++)
263 if (atc->nthread > 1)
265 int *n_ptr = atc->thread_plist[i].n - gmxCacheLineSize;
266 sfree(n_ptr);
267 sfree(atc->thread_plist[i].i);
269 sfree(atc->spline[i].thread_one);
270 sfree(atc->spline[i].ind);
272 if (atc->nthread > 1)
274 sfree(atc->thread_plist);
276 sfree(atc->spline);
279 /*! \brief Initialize data structure for communication */
280 static void
281 init_overlap_comm(pme_overlap_t * ol,
282 int norder,
283 #if GMX_MPI
284 MPI_Comm comm,
285 #endif
286 int nnodes,
287 int nodeid,
288 int ndata,
289 int commplainsize)
291 int b, i;
292 pme_grid_comm_t *pgc;
293 gmx_bool bCont;
294 int fft_start, fft_end, send_index1, recv_index1;
295 #if GMX_MPI
296 MPI_Status stat;
298 ol->mpi_comm = comm;
299 #endif
301 ol->nnodes = nnodes;
302 ol->nodeid = nodeid;
304 /* Linear translation of the PME grid won't affect reciprocal space
305 * calculations, so to optimize we only interpolate "upwards",
306 * which also means we only have to consider overlap in one direction.
307 * I.e., particles on this node might also be spread to grid indices
308 * that belong to higher nodes (modulo nnodes)
311 snew(ol->s2g0, ol->nnodes+1);
312 snew(ol->s2g1, ol->nnodes);
313 if (debug)
315 fprintf(debug, "PME slab boundaries:");
317 for (i = 0; i < nnodes; i++)
319 /* s2g0 the local interpolation grid start.
320 * s2g1 the local interpolation grid end.
321 * Since in calc_pidx we divide the particles, not the grid lines,
322 * spatially uniformly along dimension x or y, we need to round
323 * s2g0 down and s2g1 up.
325 ol->s2g0[i] = ( i *ndata + 0 )/nnodes;
326 ol->s2g1[i] = ((i+1)*ndata + nnodes-1)/nnodes + norder - 1;
328 if (debug)
330 fprintf(debug, " %3d %3d", ol->s2g0[i], ol->s2g1[i]);
333 ol->s2g0[nnodes] = ndata;
334 if (debug)
336 fprintf(debug, "\n");
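/* Worked example (illustrative numbers only): for ndata = 15, nnodes = 4
 * and norder = 4 the loop above gives s2g0 = {0, 3, 7, 11, 15} and
 * s2g1 = {7, 11, 15, 18}: each slab's interpolation region extends
 * norder-1 = 3 grid lines past its rounded-up end, overlapping the next
 * slab(s).
 */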
339 /* Determine with how many nodes we need to communicate the grid overlap */
340 b = 0;
343 b++;
344 bCont = FALSE;
345 for (i = 0; i < nnodes; i++)
347 if ((i+b < nnodes && ol->s2g1[i] > ol->s2g0[i+b]) ||
348 (i+b >= nnodes && ol->s2g1[i] > ol->s2g0[i+b-nnodes] + ndata))
350 bCont = TRUE;
354 while (bCont && b < nnodes);
355 ol->noverlap_nodes = b - 1;
357 snew(ol->send_id, ol->noverlap_nodes);
358 snew(ol->recv_id, ol->noverlap_nodes);
359 for (b = 0; b < ol->noverlap_nodes; b++)
361 ol->send_id[b] = (ol->nodeid + (b + 1)) % ol->nnodes;
362 ol->recv_id[b] = (ol->nodeid - (b + 1) + ol->nnodes) % ol->nnodes;
364 snew(ol->comm_data, ol->noverlap_nodes);
366 ol->send_size = 0;
367 for (b = 0; b < ol->noverlap_nodes; b++)
369 pgc = &ol->comm_data[b];
370 /* Send */
371 fft_start = ol->s2g0[ol->send_id[b]];
372 fft_end = ol->s2g0[ol->send_id[b]+1];
373 if (ol->send_id[b] < nodeid)
375 fft_start += ndata;
376 fft_end += ndata;
378 send_index1 = ol->s2g1[nodeid];
379 send_index1 = std::min(send_index1, fft_end);
380 pgc->send_index0 = fft_start;
381 pgc->send_nindex = std::max(0, send_index1 - pgc->send_index0);
382 ol->send_size += pgc->send_nindex;
384 /* We always start receiving to the first index of our slab */
385 fft_start = ol->s2g0[ol->nodeid];
386 fft_end = ol->s2g0[ol->nodeid+1];
387 recv_index1 = ol->s2g1[ol->recv_id[b]];
388 if (ol->recv_id[b] > nodeid)
390 recv_index1 -= ndata;
392 recv_index1 = std::min(recv_index1, fft_end);
393 pgc->recv_index0 = fft_start;
394 pgc->recv_nindex = std::max(0, recv_index1 - pgc->recv_index0);
397 #if GMX_MPI
398 /* Communicate the buffer sizes to receive */
399 for (b = 0; b < ol->noverlap_nodes; b++)
401 MPI_Sendrecv(&ol->send_size, 1, MPI_INT, ol->send_id[b], b,
402 &ol->comm_data[b].recv_size, 1, MPI_INT, ol->recv_id[b], b,
403 ol->mpi_comm, &stat);
405 #endif
407 /* For a non-divisible grid we need pme_order instead of pme_order-1 */
408 snew(ol->sendbuf, norder*commplainsize);
409 snew(ol->recvbuf, norder*commplainsize);
412 /*! \brief Destroy data structure for communication */
413 static void
414 destroy_overlap_comm(const pme_overlap_t *ol)
416 sfree(ol->s2g0);
417 sfree(ol->s2g1);
418 sfree(ol->send_id);
419 sfree(ol->recv_id);
420 sfree(ol->comm_data);
421 sfree(ol->sendbuf);
422 sfree(ol->recvbuf);
425 void gmx_pme_check_restrictions(int pme_order,
426 int nkx, int nky, int nkz,
427 int nnodes_major,
428 int nnodes_minor,
429 gmx_bool bUseThreads,
430 gmx_bool bFatal,
431 gmx_bool *bValidSettings)
433 if (pme_order > PME_ORDER_MAX)
435 if (!bFatal)
437 *bValidSettings = FALSE;
438 return;
440 gmx_fatal(FARGS, "pme_order (%d) is larger than the maximum allowed value (%d). Modify and recompile the code if you really need such a high order.",
441 pme_order, PME_ORDER_MAX);
444 if (nkx <= pme_order*(nnodes_major > 1 ? 2 : 1) ||
445 nky <= pme_order*(nnodes_minor > 1 ? 2 : 1) ||
446 nkz <= pme_order)
448 if (!bFatal)
450 *bValidSettings = FALSE;
451 return;
453 gmx_fatal(FARGS, "The PME grid sizes need to be larger than pme_order (%d) and, for dimensions with domain decomposition, larger than 2*pme_order",
454 pme_order);
457 /* Check for a limitation of the (current) sum_fftgrid_dd code.
458 * We only allow multiple communication pulses in dim 1, not in dim 0.
460 if (bUseThreads && (nkx < nnodes_major*pme_order &&
461 nkx != nnodes_major*(pme_order - 1)))
463 if (!bFatal)
465 *bValidSettings = FALSE;
466 return;
468 gmx_fatal(FARGS, "The number of PME grid lines per rank along x is %g. But when using OpenMP threads, the number of grid lines per rank along x should be >= pme_order (%d) or = pme_order-1. To resolve this issue, use fewer ranks along x (and possibly more along y and/or z) by specifying -dd manually.",
469 nkx/(double)nnodes_major, pme_order);
472 if (bValidSettings != NULL)
474 *bValidSettings = TRUE;
477 return;
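/* Usage sketch (illustrative values, not taken from a caller): to probe a
 * tentative setup without aborting, pass bFatal = FALSE and a result pointer:
 *
 *     gmx_bool settingsOk;
 *     gmx_pme_check_restrictions(4, 52, 52, 48, 4, 2, TRUE, FALSE, &settingsOk);
 *
 * With bFatal = TRUE, as gmx_pme_init() uses below, bValidSettings may be
 * NULL and any violation triggers gmx_fatal() instead.
 */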
480 /*! \brief Return \p enumerator divided by \p denominator, rounded up */
481 static int div_round_up(int enumerator, int denominator)
483 return (enumerator + denominator - 1)/denominator;
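/* Example: div_round_up(10, 4) returns 3, div_round_up(8, 4) returns 2. */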
486 int gmx_pme_init(struct gmx_pme_t **pmedata,
487 t_commrec * cr,
488 int nnodes_major,
489 int nnodes_minor,
490 t_inputrec * ir,
491 int homenr,
492 gmx_bool bFreeEnergy_q,
493 gmx_bool bFreeEnergy_lj,
494 gmx_bool bReproducible,
495 real ewaldcoeff_q,
496 real ewaldcoeff_lj,
497 int nthread)
499 struct gmx_pme_t *pme = NULL;
501 int use_threads, sum_use_threads, i;
502 ivec ndata;
504 if (debug)
506 fprintf(debug, "Creating PME data structures.\n");
508 snew(pme, 1);
510 pme->sum_qgrid_tmp = NULL;
511 pme->sum_qgrid_dd_tmp = NULL;
512 pme->buf_nalloc = 0;
514 pme->nnodes = 1;
515 pme->bPPnode = TRUE;
517 pme->nnodes_major = nnodes_major;
518 pme->nnodes_minor = nnodes_minor;
520 #if GMX_MPI
521 if (nnodes_major*nnodes_minor > 1)
523 pme->mpi_comm = cr->mpi_comm_mygroup;
525 MPI_Comm_rank(pme->mpi_comm, &pme->nodeid);
526 MPI_Comm_size(pme->mpi_comm, &pme->nnodes);
527 if (pme->nnodes != nnodes_major*nnodes_minor)
529 gmx_incons("PME rank count mismatch");
532 else
534 pme->mpi_comm = MPI_COMM_NULL;
536 #endif
538 if (pme->nnodes == 1)
540 #if GMX_MPI
541 pme->mpi_comm_d[0] = MPI_COMM_NULL;
542 pme->mpi_comm_d[1] = MPI_COMM_NULL;
543 #endif
544 pme->ndecompdim = 0;
545 pme->nodeid_major = 0;
546 pme->nodeid_minor = 0;
547 #if GMX_MPI
548 pme->mpi_comm_d[0] = pme->mpi_comm_d[1] = MPI_COMM_NULL;
549 #endif
551 else
553 if (nnodes_minor == 1)
555 #if GMX_MPI
556 pme->mpi_comm_d[0] = pme->mpi_comm;
557 pme->mpi_comm_d[1] = MPI_COMM_NULL;
558 #endif
559 pme->ndecompdim = 1;
560 pme->nodeid_major = pme->nodeid;
561 pme->nodeid_minor = 0;
564 else if (nnodes_major == 1)
566 #if GMX_MPI
567 pme->mpi_comm_d[0] = MPI_COMM_NULL;
568 pme->mpi_comm_d[1] = pme->mpi_comm;
569 #endif
570 pme->ndecompdim = 1;
571 pme->nodeid_major = 0;
572 pme->nodeid_minor = pme->nodeid;
574 else
576 if (pme->nnodes % nnodes_major != 0)
578 gmx_incons("For 2D PME decomposition, #PME ranks must be divisible by the number of ranks in the major dimension");
580 pme->ndecompdim = 2;
582 #if GMX_MPI
583 MPI_Comm_split(pme->mpi_comm, pme->nodeid % nnodes_minor,
584 pme->nodeid, &pme->mpi_comm_d[0]); /* My communicator along major dimension */
585 MPI_Comm_split(pme->mpi_comm, pme->nodeid/nnodes_minor,
586 pme->nodeid, &pme->mpi_comm_d[1]); /* My communicator along minor dimension */
588 MPI_Comm_rank(pme->mpi_comm_d[0], &pme->nodeid_major);
589 MPI_Comm_size(pme->mpi_comm_d[0], &pme->nnodes_major);
590 MPI_Comm_rank(pme->mpi_comm_d[1], &pme->nodeid_minor);
591 MPI_Comm_size(pme->mpi_comm_d[1], &pme->nnodes_minor);
592 #endif
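/* Worked example (illustrative numbers only): with 6 PME ranks and
 * nnodes_major = 3, nnodes_minor = 2, the first split (color nodeid %
 * nnodes_minor) groups ranks {0,2,4} and {1,3,5} into the two communicators
 * along the major dimension, and the second split (color nodeid /
 * nnodes_minor) groups {0,1}, {2,3} and {4,5} along the minor dimension.
 */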
594 pme->bPPnode = (cr->duty & DUTY_PP);
597 pme->nthread = nthread;
599 /* Check if any of the PME MPI ranks uses threads */
600 use_threads = (pme->nthread > 1 ? 1 : 0);
601 #if GMX_MPI
602 if (pme->nnodes > 1)
604 MPI_Allreduce(&use_threads, &sum_use_threads, 1, MPI_INT,
605 MPI_SUM, pme->mpi_comm);
607 else
608 #endif
610 sum_use_threads = use_threads;
612 pme->bUseThreads = (sum_use_threads > 0);
614 if (ir->ePBC == epbcSCREW)
616 gmx_fatal(FARGS, "pme does not (yet) work with pbc = screw");
619 /* NOTE:
620 * It is likely that the current gmx_pme_do() routine supports calculating
621 * only Coulomb or LJ while gmx_pme_init() configures for both,
622 * but that has never been tested.
623 * It is likely that the current gmx_pme_do() routine supports calculating
624 * without free-energy for Coulomb and/or LJ while gmx_pme_init()
625 * configures with free-energy, but that has never been tested.
627 pme->doCoulomb = EEL_PME(ir->coulombtype);
628 pme->doLJ = EVDW_PME(ir->vdwtype);
629 pme->bFEP_q = ((ir->efep != efepNO) && bFreeEnergy_q);
630 pme->bFEP_lj = ((ir->efep != efepNO) && bFreeEnergy_lj);
631 pme->bFEP = (pme->bFEP_q || pme->bFEP_lj);
632 pme->nkx = ir->nkx;
633 pme->nky = ir->nky;
634 pme->nkz = ir->nkz;
635 pme->bP3M = (ir->coulombtype == eelP3M_AD || getenv("GMX_PME_P3M") != NULL);
636 pme->pme_order = ir->pme_order;
637 pme->ewaldcoeff_q = ewaldcoeff_q;
638 pme->ewaldcoeff_lj = ewaldcoeff_lj;
640 /* Always constant electrostatics coefficients */
641 pme->epsilon_r = ir->epsilon_r;
643 /* Always constant LJ coefficients */
644 pme->ljpme_combination_rule = ir->ljpme_combination_rule;
646 /* If we violate restrictions, generate a fatal error here */
647 gmx_pme_check_restrictions(pme->pme_order,
648 pme->nkx, pme->nky, pme->nkz,
649 pme->nnodes_major,
650 pme->nnodes_minor,
651 pme->bUseThreads,
652 TRUE,
653 NULL);
655 if (pme->nnodes > 1)
657 double imbal;
659 #if GMX_MPI
660 MPI_Type_contiguous(DIM, GMX_MPI_REAL, &(pme->rvec_mpi));
661 MPI_Type_commit(&(pme->rvec_mpi));
662 #endif
664 /* Note that the coefficient spreading and force gathering, which usually
665 * take about the same amount of time as FFT+solve_pme,
666 * is always fully load balanced
667 * (unless the coefficient distribution is inhomogeneous).
670 imbal = estimate_pme_load_imbalance(pme);
671 if (imbal >= 1.2 && pme->nodeid_major == 0 && pme->nodeid_minor == 0)
673 fprintf(stderr,
674 "\n"
675 "NOTE: The load imbalance in PME FFT and solve is %d%%.\n"
676 " For optimal PME load balancing\n"
677 " PME grid_x (%d) and grid_y (%d) should be divisible by #PME_ranks_x (%d)\n"
678 " and PME grid_y (%d) and grid_z (%d) should be divisible by #PME_ranks_y (%d)\n"
679 "\n",
680 (int)((imbal-1)*100 + 0.5),
681 pme->nkx, pme->nky, pme->nnodes_major,
682 pme->nky, pme->nkz, pme->nnodes_minor);
686 /* For a non-divisible grid we need pme_order instead of pme_order-1 */
687 /* In sum_qgrid_dd x overlap is copied in place: take padding into account.
688 * y is always copied through a buffer: we don't need padding in z,
689 * but we do need the overlap in x because of the communication order.
691 init_overlap_comm(&pme->overlap[0], pme->pme_order,
692 #if GMX_MPI
693 pme->mpi_comm_d[0],
694 #endif
695 pme->nnodes_major, pme->nodeid_major,
696 pme->nkx,
697 (div_round_up(pme->nky, pme->nnodes_minor)+pme->pme_order)*(pme->nkz+pme->pme_order-1));
699 /* Along overlap dim 1 we can send in multiple pulses in sum_fftgrid_dd.
700 * We do this with an offset buffer of equal size, so we need to allocate
701 * extra for the offset. That's what the (+1)*pme->nkz is for.
703 init_overlap_comm(&pme->overlap[1], pme->pme_order,
704 #if GMX_MPI
705 pme->mpi_comm_d[1],
706 #endif
707 pme->nnodes_minor, pme->nodeid_minor,
708 pme->nky,
709 (div_round_up(pme->nkx, pme->nnodes_major)+pme->pme_order+1)*pme->nkz);
711 /* Double-check for a limitation of the (current) sum_fftgrid_dd code.
712 * Note that gmx_pme_check_restrictions checked for this already.
714 if (pme->bUseThreads && pme->overlap[0].noverlap_nodes > 1)
716 gmx_incons("More than one communication pulse required for grid overlap communication along the major dimension while using threads");
719 snew(pme->bsp_mod[XX], pme->nkx);
720 snew(pme->bsp_mod[YY], pme->nky);
721 snew(pme->bsp_mod[ZZ], pme->nkz);
723 /* The required size of the interpolation grid, including overlap.
724 * The allocated size (pmegrid_n?) might be slightly larger.
726 pme->pmegrid_nx = pme->overlap[0].s2g1[pme->nodeid_major] -
727 pme->overlap[0].s2g0[pme->nodeid_major];
728 pme->pmegrid_ny = pme->overlap[1].s2g1[pme->nodeid_minor] -
729 pme->overlap[1].s2g0[pme->nodeid_minor];
730 pme->pmegrid_nz_base = pme->nkz;
731 pme->pmegrid_nz = pme->pmegrid_nz_base + pme->pme_order - 1;
732 set_grid_alignment(&pme->pmegrid_nz, pme->pme_order);
734 pme->pmegrid_start_ix = pme->overlap[0].s2g0[pme->nodeid_major];
735 pme->pmegrid_start_iy = pme->overlap[1].s2g0[pme->nodeid_minor];
736 pme->pmegrid_start_iz = 0;
738 make_gridindex5_to_localindex(pme->nkx,
739 pme->pmegrid_start_ix,
740 pme->pmegrid_nx - (pme->pme_order-1),
741 &pme->nnx, &pme->fshx);
742 make_gridindex5_to_localindex(pme->nky,
743 pme->pmegrid_start_iy,
744 pme->pmegrid_ny - (pme->pme_order-1),
745 &pme->nny, &pme->fshy);
746 make_gridindex5_to_localindex(pme->nkz,
747 pme->pmegrid_start_iz,
748 pme->pmegrid_nz_base,
749 &pme->nnz, &pme->fshz);
751 pme->spline_work = make_pme_spline_work(pme->pme_order);
753 ndata[0] = pme->nkx;
754 ndata[1] = pme->nky;
755 ndata[2] = pme->nkz;
756 /* It doesn't matter if pme->ngrids is larger than strictly needed here,
757 * since we only allocate and use the grids we actually need.
759 if (pme->doLJ)
761 pme->ngrids = ((ir->ljpme_combination_rule == eljpmeLB) ? DO_Q_AND_LJ_LB : DO_Q_AND_LJ);
763 else
765 pme->ngrids = DO_Q;
767 snew(pme->fftgrid, pme->ngrids);
768 snew(pme->cfftgrid, pme->ngrids);
769 snew(pme->pfft_setup, pme->ngrids);
771 for (i = 0; i < pme->ngrids; ++i)
773 if ((i < DO_Q && pme->doCoulomb && (i == 0 ||
774 bFreeEnergy_q)) ||
775 (i >= DO_Q && pme->doLJ && (i == 2 ||
776 bFreeEnergy_lj ||
777 ir->ljpme_combination_rule == eljpmeLB)))
779 pmegrids_init(&pme->pmegrid[i],
780 pme->pmegrid_nx, pme->pmegrid_ny, pme->pmegrid_nz,
781 pme->pmegrid_nz_base,
782 pme->pme_order,
783 pme->bUseThreads,
784 pme->nthread,
785 pme->overlap[0].s2g1[pme->nodeid_major]-pme->overlap[0].s2g0[pme->nodeid_major+1],
786 pme->overlap[1].s2g1[pme->nodeid_minor]-pme->overlap[1].s2g0[pme->nodeid_minor+1]);
787 /* This routine will allocate the grid data to fit the FFTs */
788 gmx_parallel_3dfft_init(&pme->pfft_setup[i], ndata,
789 &pme->fftgrid[i], &pme->cfftgrid[i],
790 pme->mpi_comm_d,
791 bReproducible, pme->nthread);
796 if (!pme->bP3M)
798 /* Use plain SPME B-spline interpolation */
799 make_bspline_moduli(pme->bsp_mod, pme->nkx, pme->nky, pme->nkz, pme->pme_order);
801 else
803 /* Use the P3M grid-optimized influence function */
804 make_p3m_bspline_moduli(pme->bsp_mod, pme->nkx, pme->nky, pme->nkz, pme->pme_order);
807 /* Use atc[0] for spreading */
808 init_atomcomm(pme, &pme->atc[0], nnodes_major > 1 ? 0 : 1, TRUE);
809 if (pme->ndecompdim >= 2)
811 init_atomcomm(pme, &pme->atc[1], 1, FALSE);
814 if (pme->nnodes == 1)
816 pme->atc[0].n = homenr;
817 pme_realloc_atomcomm_things(&pme->atc[0]);
820 pme->lb_buf1 = NULL;
821 pme->lb_buf2 = NULL;
822 pme->lb_buf_nalloc = 0;
824 pme_init_all_work(&pme->solve_work, pme->nthread, pme->nkx);
826 *pmedata = pme;
828 return 0;
831 int gmx_pme_reinit(struct gmx_pme_t **pmedata,
832 t_commrec * cr,
833 struct gmx_pme_t * pme_src,
834 const t_inputrec * ir,
835 ivec grid_size,
836 real ewaldcoeff_q,
837 real ewaldcoeff_lj)
839 t_inputrec irc;
840 int homenr;
841 int ret;
843 irc = *ir;
844 irc.nkx = grid_size[XX];
845 irc.nky = grid_size[YY];
846 irc.nkz = grid_size[ZZ];
848 if (pme_src->nnodes == 1)
850 homenr = pme_src->atc[0].n;
852 else
854 homenr = -1;
857 ret = gmx_pme_init(pmedata, cr, pme_src->nnodes_major, pme_src->nnodes_minor,
858 &irc, homenr, pme_src->bFEP_q, pme_src->bFEP_lj, FALSE, ewaldcoeff_q, ewaldcoeff_lj, pme_src->nthread);
860 if (ret == 0)
862 /* We can easily reuse the allocated pme grids in pme_src */
863 reuse_pmegrids(&pme_src->pmegrid[PME_GRID_QA], &(*pmedata)->pmegrid[PME_GRID_QA]);
864 /* We would like to reuse the fft grids, but that's harder */
867 return ret;
870 void gmx_pme_calc_energy(struct gmx_pme_t *pme, int n, rvec *x, real *q, real *V)
872 pme_atomcomm_t *atc;
873 pmegrids_t *grid;
875 if (pme->nnodes > 1)
877 gmx_incons("gmx_pme_calc_energy called in parallel");
879 if (pme->bFEP_q)
881 gmx_incons("gmx_pme_calc_energy with free energy");
884 atc = &pme->atc_energy;
885 atc->nthread = 1;
886 if (atc->spline == NULL)
888 snew(atc->spline, atc->nthread);
890 atc->nslab = 1;
891 atc->bSpread = TRUE;
892 atc->pme_order = pme->pme_order;
893 atc->n = n;
894 pme_realloc_atomcomm_things(atc);
895 atc->x = x;
896 atc->coefficient = q;
898 /* We only use the A-charges grid */
899 grid = &pme->pmegrid[PME_GRID_QA];
901 /* Only calculate the spline coefficients, don't actually spread */
902 spread_on_grid(pme, atc, NULL, TRUE, FALSE, pme->fftgrid[PME_GRID_QA], FALSE, PME_GRID_QA);
904 *V = gather_energy_bsplines(pme, grid->grid.grid, atc);
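/* Usage sketch (illustration only): in a serial setup (pme->nnodes == 1) a
 * caller with n positions x[] and charges q[] can obtain the mesh energy of
 * those charges on the A-state Coulomb grid via
 *
 *     real V;
 *     gmx_pme_calc_energy(pme, n, x, q, &V);
 */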
907 /*! \brief Calculate initial Lorentz-Berthelot coefficients for LJ-PME */
908 static void
909 calc_initial_lb_coeffs(struct gmx_pme_t *pme, real *local_c6, real *local_sigma)
911 int i;
912 for (i = 0; i < pme->atc[0].n; ++i)
914 real sigma4;
915 sigma4 = local_sigma[i];
916 sigma4 = sigma4*sigma4;
917 sigma4 = sigma4*sigma4;
918 pme->atc[0].coefficient[i] = local_c6[i] / sigma4;
922 /*! \brief Calculate next Lorentz-Berthelot coefficients for LJ-PME */
923 static void
924 calc_next_lb_coeffs(struct gmx_pme_t *pme, real *local_sigma)
926 int i;
928 for (i = 0; i < pme->atc[0].n; ++i)
930 pme->atc[0].coefficient[i] *= local_sigma[i];
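/* Taken together (arithmetic of the two helpers above): after
 * calc_initial_lb_coeffs() the coefficient of atom i is
 * local_c6[i]/local_sigma[i]^4, and every subsequent calc_next_lb_coeffs()
 * call multiplies it by local_sigma[i], so the seven LJ-PME grids
 * (grid_index 2..8) are spread with local_c6[i]*local_sigma[i]^(grid_index-5),
 * i.e. c6/sigma^3 up through c6*sigma^3.
 */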
934 int gmx_pme_do(struct gmx_pme_t *pme,
935 int start, int homenr,
936 rvec x[], rvec f[],
937 real chargeA[], real chargeB[],
938 real c6A[], real c6B[],
939 real sigmaA[], real sigmaB[],
940 matrix box, t_commrec *cr,
941 int maxshift_x, int maxshift_y,
942 t_nrnb *nrnb, gmx_wallcycle_t wcycle,
943 matrix vir_q, matrix vir_lj,
944 real *energy_q, real *energy_lj,
945 real lambda_q, real lambda_lj,
946 real *dvdlambda_q, real *dvdlambda_lj,
947 int flags)
949 int d, i, j, npme, grid_index, max_grid_index;
950 int n_d;
951 pme_atomcomm_t *atc = NULL;
952 pmegrids_t *pmegrid = NULL;
953 real *grid = NULL;
954 rvec *f_d;
955 real *coefficient = NULL;
956 real energy_AB[4];
957 matrix vir_AB[4];
958 real scale, lambda;
959 gmx_bool bClearF;
960 gmx_parallel_3dfft_t pfft_setup;
961 real * fftgrid;
962 t_complex * cfftgrid;
963 int thread;
964 gmx_bool bFirst, bDoSplines;
965 int fep_state;
966 int fep_states_lj = pme->bFEP_lj ? 2 : 1;
967 const gmx_bool bCalcEnerVir = flags & GMX_PME_CALC_ENER_VIR;
968 const gmx_bool bBackFFT = flags & (GMX_PME_CALC_F | GMX_PME_CALC_POT);
969 const gmx_bool bCalcF = flags & GMX_PME_CALC_F;
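/* Typical flag combination (an assumption, not taken from a caller in this
 * file): a rank that wants forces, energies and the virial would pass
 *
 *     flags = GMX_PME_SPREAD | GMX_PME_SOLVE | GMX_PME_CALC_F | GMX_PME_CALC_ENER_VIR;
 *
 * bBackFFT above is then TRUE, since GMX_PME_CALC_F requires the backward FFT.
 */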
971 assert(pme->nnodes > 0);
972 assert(pme->nnodes == 1 || pme->ndecompdim > 0);
974 if (pme->nnodes > 1)
976 atc = &pme->atc[0];
977 atc->npd = homenr;
978 if (atc->npd > atc->pd_nalloc)
980 atc->pd_nalloc = over_alloc_dd(atc->npd);
981 srenew(atc->pd, atc->pd_nalloc);
983 for (d = pme->ndecompdim-1; d >= 0; d--)
985 atc = &pme->atc[d];
986 atc->maxshift = (atc->dimind == 0 ? maxshift_x : maxshift_y);
989 else
991 atc = &pme->atc[0];
992 /* This could be necessary for TPI */
993 pme->atc[0].n = homenr;
994 if (DOMAINDECOMP(cr))
996 pme_realloc_atomcomm_things(atc);
998 atc->x = x;
999 atc->f = f;
1002 gmx::invertBoxMatrix(box, pme->recipbox);
1003 bFirst = TRUE;
1005 /* For simplicity, we construct the splines for all particles if
1006 * more than one PME calculation is needed. Some optimization
1007 * could be done by keeping track of which atoms have splines
1008 * constructed, and construct new splines on each pass for atoms
1009 * that don't yet have them.
1012 bDoSplines = pme->bFEP || (pme->doCoulomb && pme->doLJ);
1014 /* We need a maximum of four separate PME calculations:
1015 * grid_index=0: Coulomb PME with charges from state A
1016 * grid_index=1: Coulomb PME with charges from state B
1017 * grid_index=2: LJ PME with C6 from state A
1018 * grid_index=3: LJ PME with C6 from state B
1019 * For Lorentz-Berthelot combination rules, a separate loop is used to
1020 * calculate all the terms
1023 /* If we are doing LJ-PME with LB, we only do Q here */
1024 max_grid_index = (pme->ljpme_combination_rule == eljpmeLB) ? DO_Q : DO_Q_AND_LJ;
1026 for (grid_index = 0; grid_index < max_grid_index; ++grid_index)
1028 /* Check if we should do calculations at this grid_index
1029 * If grid_index is odd we should be doing FEP
1030 * If grid_index < 2 we should be doing electrostatic PME
1031 * If grid_index >= 2 we should be doing LJ-PME
1033 if ((grid_index < DO_Q && (!pme->doCoulomb ||
1034 (grid_index == 1 && !pme->bFEP_q))) ||
1035 (grid_index >= DO_Q && (!pme->doLJ ||
1036 (grid_index == 3 && !pme->bFEP_lj))))
1038 continue;
1040 /* Unpack structure */
1041 pmegrid = &pme->pmegrid[grid_index];
1042 fftgrid = pme->fftgrid[grid_index];
1043 cfftgrid = pme->cfftgrid[grid_index];
1044 pfft_setup = pme->pfft_setup[grid_index];
1045 switch (grid_index)
1047 case 0: coefficient = chargeA + start; break;
1048 case 1: coefficient = chargeB + start; break;
1049 case 2: coefficient = c6A + start; break;
1050 case 3: coefficient = c6B + start; break;
1053 grid = pmegrid->grid.grid;
1055 if (debug)
1057 fprintf(debug, "PME: number of ranks = %d, rank = %d\n",
1058 cr->nnodes, cr->nodeid);
1059 fprintf(debug, "Grid = %p\n", (void*)grid);
1060 if (grid == NULL)
1062 gmx_fatal(FARGS, "No grid!");
1065 where();
1067 if (pme->nnodes == 1)
1069 atc->coefficient = coefficient;
1071 else
1073 wallcycle_start(wcycle, ewcPME_REDISTXF);
1074 do_redist_pos_coeffs(pme, cr, start, homenr, bFirst, x, coefficient);
1075 where();
1077 wallcycle_stop(wcycle, ewcPME_REDISTXF);
1080 if (debug)
1082 fprintf(debug, "Rank= %6d, pme local particles=%6d\n",
1083 cr->nodeid, atc->n);
1086 if (flags & GMX_PME_SPREAD)
1088 wallcycle_start(wcycle, ewcPME_SPREADGATHER);
1090 /* Spread the coefficients on a grid */
1091 spread_on_grid(pme, &pme->atc[0], pmegrid, bFirst, TRUE, fftgrid, bDoSplines, grid_index);
1093 if (bFirst)
1095 inc_nrnb(nrnb, eNR_WEIGHTS, DIM*atc->n);
1097 inc_nrnb(nrnb, eNR_SPREADBSP,
1098 pme->pme_order*pme->pme_order*pme->pme_order*atc->n);
1100 if (!pme->bUseThreads)
1102 wrap_periodic_pmegrid(pme, grid);
1104 /* sum contributions to local grid from other nodes */
1105 #if GMX_MPI
1106 if (pme->nnodes > 1)
1108 gmx_sum_qgrid_dd(pme, grid, GMX_SUM_GRID_FORWARD);
1109 where();
1111 #endif
1113 copy_pmegrid_to_fftgrid(pme, grid, fftgrid, grid_index);
1116 wallcycle_stop(wcycle, ewcPME_SPREADGATHER);
1118 /* TODO If the OpenMP and single-threaded implementations
1119 converge, then spread_on_grid() and
1120 copy_pmegrid_to_fftgrid() will perhaps live in the same
1121 source file and the following debugging function can live
1122 there too. */
1123 #ifdef DEBUG_PME
1124 dump_local_fftgrid(pme, fftgrid);
1125 exit(0);
1126 #endif
1129 /* Here we start a large thread parallel region */
1130 #pragma omp parallel num_threads(pme->nthread) private(thread)
1134 thread = gmx_omp_get_thread_num();
1135 if (flags & GMX_PME_SOLVE)
1137 int loop_count;
1139 /* do 3d-fft */
1140 if (thread == 0)
1142 wallcycle_start(wcycle, ewcPME_FFT);
1144 gmx_parallel_3dfft_execute(pfft_setup, GMX_FFT_REAL_TO_COMPLEX,
1145 thread, wcycle);
1146 if (thread == 0)
1148 wallcycle_stop(wcycle, ewcPME_FFT);
1150 where();
1152 /* solve in k-space for our local cells */
1153 if (thread == 0)
1155 wallcycle_start(wcycle, (grid_index < DO_Q ? ewcPME_SOLVE : ewcLJPME));
1157 if (grid_index < DO_Q)
1159 loop_count =
1160 solve_pme_yzx(pme, cfftgrid,
1161 box[XX][XX]*box[YY][YY]*box[ZZ][ZZ],
1162 bCalcEnerVir,
1163 pme->nthread, thread);
1165 else
1167 loop_count =
1168 solve_pme_lj_yzx(pme, &cfftgrid, FALSE,
1169 box[XX][XX]*box[YY][YY]*box[ZZ][ZZ],
1170 bCalcEnerVir,
1171 pme->nthread, thread);
1174 if (thread == 0)
1176 wallcycle_stop(wcycle, (grid_index < DO_Q ? ewcPME_SOLVE : ewcLJPME));
1177 where();
1178 inc_nrnb(nrnb, eNR_SOLVEPME, loop_count);
1182 if (bBackFFT)
1184 /* do 3d-invfft */
1185 if (thread == 0)
1187 where();
1188 wallcycle_start(wcycle, ewcPME_FFT);
1190 gmx_parallel_3dfft_execute(pfft_setup, GMX_FFT_COMPLEX_TO_REAL,
1191 thread, wcycle);
1192 if (thread == 0)
1194 wallcycle_stop(wcycle, ewcPME_FFT);
1196 where();
1198 if (pme->nodeid == 0)
1200 real ntot = pme->nkx*pme->nky*pme->nkz;
1201 npme = static_cast<int>(ntot*std::log(ntot)/std::log(2.0));
1202 inc_nrnb(nrnb, eNR_FFT, 2*npme);
1205 /* Note: this wallcycle region is closed below
1206 outside an OpenMP region, so take care if
1207 refactoring code here. */
1208 wallcycle_start(wcycle, ewcPME_SPREADGATHER);
1211 copy_fftgrid_to_pmegrid(pme, fftgrid, grid, grid_index, pme->nthread, thread);
1213 } GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
1215 /* End of thread parallel section.
1216 * With MPI we have to synchronize here before gmx_sum_qgrid_dd.
1219 if (bBackFFT)
1221 /* distribute local grid to all nodes */
1222 #if GMX_MPI
1223 if (pme->nnodes > 1)
1225 gmx_sum_qgrid_dd(pme, grid, GMX_SUM_GRID_BACKWARD);
1227 #endif
1228 where();
1230 unwrap_periodic_pmegrid(pme, grid);
1233 if (bCalcF)
1235 /* interpolate forces for our local atoms */
1237 where();
1239 /* If we are running without parallelization,
1240 * atc->f is the actual force array, not a buffer,
1241 * therefore we should not clear it.
1243 lambda = grid_index < DO_Q ? lambda_q : lambda_lj;
1244 bClearF = (bFirst && PAR(cr));
1245 #pragma omp parallel for num_threads(pme->nthread) schedule(static)
1246 for (thread = 0; thread < pme->nthread; thread++)
1250 gather_f_bsplines(pme, grid, bClearF, atc,
1251 &atc->spline[thread],
1252 pme->bFEP ? (grid_index % 2 == 0 ? 1.0-lambda : lambda) : 1.0);
1254 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
1257 where();
1259 inc_nrnb(nrnb, eNR_GATHERFBSP,
1260 pme->pme_order*pme->pme_order*pme->pme_order*pme->atc[0].n);
1261 /* Note: this wallcycle region is opened above inside an OpenMP
1262 region, so take care if refactoring code here. */
1263 wallcycle_stop(wcycle, ewcPME_SPREADGATHER);
1266 if (bCalcEnerVir)
1268 /* This should only be called on the master thread
1269 * and after the threads have synchronized.
1271 if (grid_index < 2)
1273 get_pme_ener_vir_q(pme->solve_work, pme->nthread, &energy_AB[grid_index], vir_AB[grid_index]);
1275 else
1277 get_pme_ener_vir_lj(pme->solve_work, pme->nthread, &energy_AB[grid_index], vir_AB[grid_index]);
1280 bFirst = FALSE;
1281 } /* of grid_index-loop */
1283 /* For Lorentz-Berthelot combination rules in LJ-PME, we need to calculate
1284 * seven terms. */
1286 if (pme->doLJ && pme->ljpme_combination_rule == eljpmeLB)
1288 /* Loop over A- and B-state if we are doing FEP */
1289 for (fep_state = 0; fep_state < fep_states_lj; ++fep_state)
1291 real *local_c6 = NULL, *local_sigma = NULL, *RedistC6 = NULL, *RedistSigma = NULL;
1292 if (pme->nnodes == 1)
1294 if (pme->lb_buf1 == NULL)
1296 pme->lb_buf_nalloc = pme->atc[0].n;
1297 snew(pme->lb_buf1, pme->lb_buf_nalloc);
1299 pme->atc[0].coefficient = pme->lb_buf1;
1300 switch (fep_state)
1302 case 0:
1303 local_c6 = c6A;
1304 local_sigma = sigmaA;
1305 break;
1306 case 1:
1307 local_c6 = c6B;
1308 local_sigma = sigmaB;
1309 break;
1310 default:
1311 gmx_incons("Trying to access wrong FEP-state in LJ-PME routine");
1314 else
1316 atc = &pme->atc[0];
1317 switch (fep_state)
1319 case 0:
1320 RedistC6 = c6A;
1321 RedistSigma = sigmaA;
1322 break;
1323 case 1:
1324 RedistC6 = c6B;
1325 RedistSigma = sigmaB;
1326 break;
1327 default:
1328 gmx_incons("Trying to access wrong FEP-state in LJ-PME routine");
1330 wallcycle_start(wcycle, ewcPME_REDISTXF);
1332 do_redist_pos_coeffs(pme, cr, start, homenr, bFirst, x, RedistC6);
1333 if (pme->lb_buf_nalloc < atc->n)
1335 pme->lb_buf_nalloc = atc->nalloc;
1336 srenew(pme->lb_buf1, pme->lb_buf_nalloc);
1337 srenew(pme->lb_buf2, pme->lb_buf_nalloc);
1339 local_c6 = pme->lb_buf1;
1340 for (i = 0; i < atc->n; ++i)
1342 local_c6[i] = atc->coefficient[i];
1344 where();
1346 do_redist_pos_coeffs(pme, cr, start, homenr, FALSE, x, RedistSigma);
1347 local_sigma = pme->lb_buf2;
1348 for (i = 0; i < atc->n; ++i)
1350 local_sigma[i] = atc->coefficient[i];
1352 where();
1354 wallcycle_stop(wcycle, ewcPME_REDISTXF);
1356 calc_initial_lb_coeffs(pme, local_c6, local_sigma);
1358 /* Seven terms in LJ-PME with LB, grid_index < 2 reserved for electrostatics */
1359 for (grid_index = 2; grid_index < 9; ++grid_index)
1361 /* Unpack structure */
1362 pmegrid = &pme->pmegrid[grid_index];
1363 fftgrid = pme->fftgrid[grid_index];
1364 pfft_setup = pme->pfft_setup[grid_index];
1365 calc_next_lb_coeffs(pme, local_sigma);
1366 grid = pmegrid->grid.grid;
1367 where();
1369 if (flags & GMX_PME_SPREAD)
1371 wallcycle_start(wcycle, ewcPME_SPREADGATHER);
1372 /* Spread the c6 on a grid */
1373 spread_on_grid(pme, &pme->atc[0], pmegrid, bFirst, TRUE, fftgrid, bDoSplines, grid_index);
1375 if (bFirst)
1377 inc_nrnb(nrnb, eNR_WEIGHTS, DIM*atc->n);
1380 inc_nrnb(nrnb, eNR_SPREADBSP,
1381 pme->pme_order*pme->pme_order*pme->pme_order*atc->n);
1382 if (pme->nthread == 1)
1384 wrap_periodic_pmegrid(pme, grid);
1385 /* sum contributions to local grid from other nodes */
1386 #if GMX_MPI
1387 if (pme->nnodes > 1)
1389 gmx_sum_qgrid_dd(pme, grid, GMX_SUM_GRID_FORWARD);
1390 where();
1392 #endif
1393 copy_pmegrid_to_fftgrid(pme, grid, fftgrid, grid_index);
1395 wallcycle_stop(wcycle, ewcPME_SPREADGATHER);
1397 /* Here we start a large thread parallel region */
1398 #pragma omp parallel num_threads(pme->nthread) private(thread)
1402 thread = gmx_omp_get_thread_num();
1403 if (flags & GMX_PME_SOLVE)
1405 /* do 3d-fft */
1406 if (thread == 0)
1408 wallcycle_start(wcycle, ewcPME_FFT);
1411 gmx_parallel_3dfft_execute(pfft_setup, GMX_FFT_REAL_TO_COMPLEX,
1412 thread, wcycle);
1413 if (thread == 0)
1415 wallcycle_stop(wcycle, ewcPME_FFT);
1417 where();
1420 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
1422 bFirst = FALSE;
1424 if (flags & GMX_PME_SOLVE)
1426 /* solve in k-space for our local cells */
1427 #pragma omp parallel num_threads(pme->nthread) private(thread)
1431 int loop_count;
1432 thread = gmx_omp_get_thread_num();
1433 if (thread == 0)
1435 wallcycle_start(wcycle, ewcLJPME);
1438 loop_count =
1439 solve_pme_lj_yzx(pme, &pme->cfftgrid[2], TRUE,
1440 box[XX][XX]*box[YY][YY]*box[ZZ][ZZ],
1441 bCalcEnerVir,
1442 pme->nthread, thread);
1443 if (thread == 0)
1445 wallcycle_stop(wcycle, ewcLJPME);
1446 where();
1447 inc_nrnb(nrnb, eNR_SOLVEPME, loop_count);
1450 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
1454 if (bCalcEnerVir)
1456 /* This should only be called on the master thread and
1457 * after the threads have synchronized.
1459 get_pme_ener_vir_lj(pme->solve_work, pme->nthread, &energy_AB[2+fep_state], vir_AB[2+fep_state]);
1462 if (bBackFFT)
1464 bFirst = !pme->doCoulomb;
1465 calc_initial_lb_coeffs(pme, local_c6, local_sigma);
1466 for (grid_index = 8; grid_index >= 2; --grid_index)
1468 /* Unpack structure */
1469 pmegrid = &pme->pmegrid[grid_index];
1470 fftgrid = pme->fftgrid[grid_index];
1471 pfft_setup = pme->pfft_setup[grid_index];
1472 grid = pmegrid->grid.grid;
1473 calc_next_lb_coeffs(pme, local_sigma);
1474 where();
1475 #pragma omp parallel num_threads(pme->nthread) private(thread)
1479 thread = gmx_omp_get_thread_num();
1480 /* do 3d-invfft */
1481 if (thread == 0)
1483 where();
1484 wallcycle_start(wcycle, ewcPME_FFT);
1487 gmx_parallel_3dfft_execute(pfft_setup, GMX_FFT_COMPLEX_TO_REAL,
1488 thread, wcycle);
1489 if (thread == 0)
1491 wallcycle_stop(wcycle, ewcPME_FFT);
1493 where();
1495 if (pme->nodeid == 0)
1497 real ntot = pme->nkx*pme->nky*pme->nkz;
1498 npme = static_cast<int>(ntot*std::log(ntot)/std::log(2.0));
1499 inc_nrnb(nrnb, eNR_FFT, 2*npme);
1501 wallcycle_start(wcycle, ewcPME_SPREADGATHER);
1504 copy_fftgrid_to_pmegrid(pme, fftgrid, grid, grid_index, pme->nthread, thread);
1506 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
1507 } /*#pragma omp parallel*/
1509 /* distribute local grid to all nodes */
1510 #if GMX_MPI
1511 if (pme->nnodes > 1)
1513 gmx_sum_qgrid_dd(pme, grid, GMX_SUM_GRID_BACKWARD);
1515 #endif
1516 where();
1518 unwrap_periodic_pmegrid(pme, grid);
1520 if (bCalcF)
1522 /* interpolate forces for our local atoms */
1523 where();
1524 bClearF = (bFirst && PAR(cr));
1525 scale = pme->bFEP ? (fep_state < 1 ? 1.0-lambda_lj : lambda_lj) : 1.0;
1526 scale *= lb_scale_factor[grid_index-2];
1528 #pragma omp parallel for num_threads(pme->nthread) schedule(static)
1529 for (thread = 0; thread < pme->nthread; thread++)
1533 gather_f_bsplines(pme, grid, bClearF, &pme->atc[0],
1534 &pme->atc[0].spline[thread],
1535 scale);
1537 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
1540 where();
1542 inc_nrnb(nrnb, eNR_GATHERFBSP,
1543 pme->pme_order*pme->pme_order*pme->pme_order*pme->atc[0].n);
1545 wallcycle_stop(wcycle, ewcPME_SPREADGATHER);
1547 bFirst = FALSE;
1548 } /* for (grid_index = 8; grid_index >= 2; --grid_index) */
1549 } /* if (bCalcF) */
1550 } /* for (fep_state = 0; fep_state < fep_states_lj; ++fep_state) */
1551 } /* if (pme->doLJ && pme->ljpme_combination_rule == eljpmeLB) */
1553 if (bCalcF && pme->nnodes > 1)
1555 wallcycle_start(wcycle, ewcPME_REDISTXF);
1556 for (d = 0; d < pme->ndecompdim; d++)
1558 atc = &pme->atc[d];
1559 if (d == pme->ndecompdim - 1)
1561 n_d = homenr;
1562 f_d = f + start;
1564 else
1566 n_d = pme->atc[d+1].n;
1567 f_d = pme->atc[d+1].f;
1569 if (DOMAINDECOMP(cr))
1571 dd_pmeredist_f(pme, atc, n_d, f_d,
1572 d == pme->ndecompdim-1 && pme->bPPnode);
1576 wallcycle_stop(wcycle, ewcPME_REDISTXF);
1578 where();
1580 if (bCalcEnerVir)
1582 if (pme->doCoulomb)
1584 if (!pme->bFEP_q)
1586 *energy_q = energy_AB[0];
1587 m_add(vir_q, vir_AB[0], vir_q);
1589 else
1591 *energy_q = (1.0-lambda_q)*energy_AB[0] + lambda_q*energy_AB[1];
1592 *dvdlambda_q += energy_AB[1] - energy_AB[0];
1593 for (i = 0; i < DIM; i++)
1595 for (j = 0; j < DIM; j++)
1597 vir_q[i][j] += (1.0-lambda_q)*vir_AB[0][i][j] +
1598 lambda_q*vir_AB[1][i][j];
1602 if (debug)
1604 fprintf(debug, "Electrostatic PME mesh energy: %g\n", *energy_q);
1607 else
1609 *energy_q = 0;
1612 if (pme->doLJ)
1614 if (!pme->bFEP_lj)
1616 *energy_lj = energy_AB[2];
1617 m_add(vir_lj, vir_AB[2], vir_lj);
1619 else
1621 *energy_lj = (1.0-lambda_lj)*energy_AB[2] + lambda_lj*energy_AB[3];
1622 *dvdlambda_lj += energy_AB[3] - energy_AB[2];
1623 for (i = 0; i < DIM; i++)
1625 for (j = 0; j < DIM; j++)
1627 vir_lj[i][j] += (1.0-lambda_lj)*vir_AB[2][i][j] + lambda_lj*vir_AB[3][i][j];
1631 if (debug)
1633 fprintf(debug, "Lennard-Jones PME mesh energy: %g\n", *energy_lj);
1636 else
1638 *energy_lj = 0;
1641 return 0;
1644 int gmx_pme_destroy(struct gmx_pme_t **pmedata)
1646 struct gmx_pme_t *pme = *pmedata;
1648 sfree(pme->nnx);
1649 sfree(pme->nny);
1650 sfree(pme->nnz);
1651 sfree(pme->fshx);
1652 sfree(pme->fshy);
1653 sfree(pme->fshz);
1655 for (int i = 0; i < pme->ngrids; ++i)
1657 pmegrids_destroy(&pme->pmegrid[i]);
1658 gmx_parallel_3dfft_destroy(pme->pfft_setup[i]);
1661 for (int i = 0; i < pme->ndecompdim; i++)
1663 destroy_atomcomm(&pme->atc[i]);
1666 destroy_overlap_comm(&pme->overlap[0]);
1667 destroy_overlap_comm(&pme->overlap[1]);
1669 sfree(pme->lb_buf1);
1670 sfree(pme->lb_buf2);
1672 sfree(pme->bufv);
1673 sfree(pme->bufr);
1675 pme_free_all_work(&pme->solve_work, pme->nthread);
1677 sfree(pme->sum_qgrid_tmp);
1678 sfree(pme->sum_qgrid_dd_tmp);
1680 sfree(*pmedata);
1681 *pmedata = NULL;
1683 return 0;