Convert t_vcm to C++, plugging a memory leak
[gromacs.git] / src / gromacs / mdlib / sim_util.cpp
blob 22b13e8825e22fd287f953bdb614ac901b9a8879
1 /*
2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5 * Copyright (c) 2001-2004, The GROMACS development team.
6 * Copyright (c) 2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by
7 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
8 * and including many others, as listed in the AUTHORS file in the
9 * top-level source directory and at http://www.gromacs.org.
11 * GROMACS is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public License
13 * as published by the Free Software Foundation; either version 2.1
14 * of the License, or (at your option) any later version.
16 * GROMACS is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * Lesser General Public License for more details.
21 * You should have received a copy of the GNU Lesser General Public
22 * License along with GROMACS; if not, see
23 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
24 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
26 * If you want to redistribute modifications to GROMACS, please
27 * consider that scientific software is very special. Version
28 * control is crucial - bugs must be traceable. We will be happy to
29 * consider code for inclusion in the official distribution, but
30 * derived work must not be called official GROMACS. Details are found
31 * in the README & COPYING files - if they are missing, get the
32 * official version at http://www.gromacs.org.
34 * To help us fund GROMACS development, we humbly ask that you cite
35 * the research papers on the package. Check out http://www.gromacs.org.
37 #include "gmxpre.h"
39 #include "sim_util.h"
41 #include "config.h"
43 #include <cmath>
44 #include <cstdint>
45 #include <cstdio>
46 #include <cstring>
48 #include <array>
50 #include "gromacs/awh/awh.h"
51 #include "gromacs/domdec/dlbtiming.h"
52 #include "gromacs/domdec/domdec_struct.h"
53 #include "gromacs/domdec/partition.h"
54 #include "gromacs/essentialdynamics/edsam.h"
55 #include "gromacs/ewald/pme.h"
56 #include "gromacs/gmxlib/chargegroup.h"
57 #include "gromacs/gmxlib/network.h"
58 #include "gromacs/gmxlib/nrnb.h"
59 #include "gromacs/gmxlib/nonbonded/nb_free_energy.h"
60 #include "gromacs/gmxlib/nonbonded/nb_kernel.h"
61 #include "gromacs/gmxlib/nonbonded/nonbonded.h"
62 #include "gromacs/gpu_utils/gpu_utils.h"
63 #include "gromacs/imd/imd.h"
64 #include "gromacs/listed-forces/bonded.h"
65 #include "gromacs/listed-forces/disre.h"
66 #include "gromacs/listed-forces/gpubonded.h"
67 #include "gromacs/listed-forces/manage-threading.h"
68 #include "gromacs/listed-forces/orires.h"
69 #include "gromacs/math/arrayrefwithpadding.h"
70 #include "gromacs/math/functions.h"
71 #include "gromacs/math/units.h"
72 #include "gromacs/math/vec.h"
73 #include "gromacs/math/vecdump.h"
74 #include "gromacs/mdlib/calcmu.h"
75 #include "gromacs/mdlib/calcvir.h"
76 #include "gromacs/mdlib/constr.h"
77 #include "gromacs/mdlib/force.h"
78 #include "gromacs/mdlib/forcerec.h"
79 #include "gromacs/mdlib/gmx_omp_nthreads.h"
80 #include "gromacs/mdlib/mdrun.h"
81 #include "gromacs/mdlib/nb_verlet.h"
82 #include "gromacs/mdlib/nbnxn_atomdata.h"
83 #include "gromacs/mdlib/nbnxn_gpu_data_mgmt.h"
84 #include "gromacs/mdlib/nbnxn_grid.h"
85 #include "gromacs/mdlib/nbnxn_search.h"
86 #include "gromacs/mdlib/ppforceworkload.h"
87 #include "gromacs/mdlib/qmmm.h"
88 #include "gromacs/mdlib/update.h"
89 #include "gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_gpu_ref.h"
90 #include "gromacs/mdtypes/commrec.h"
91 #include "gromacs/mdtypes/forceoutput.h"
92 #include "gromacs/mdtypes/iforceprovider.h"
93 #include "gromacs/mdtypes/inputrec.h"
94 #include "gromacs/mdtypes/md_enums.h"
95 #include "gromacs/mdtypes/state.h"
96 #include "gromacs/pbcutil/ishift.h"
97 #include "gromacs/pbcutil/mshift.h"
98 #include "gromacs/pbcutil/pbc.h"
99 #include "gromacs/pulling/pull.h"
100 #include "gromacs/pulling/pull_rotation.h"
101 #include "gromacs/timing/cyclecounter.h"
102 #include "gromacs/timing/gpu_timing.h"
103 #include "gromacs/timing/wallcycle.h"
104 #include "gromacs/timing/wallcyclereporting.h"
105 #include "gromacs/timing/walltime_accounting.h"
106 #include "gromacs/topology/topology.h"
107 #include "gromacs/utility/arrayref.h"
108 #include "gromacs/utility/basedefinitions.h"
109 #include "gromacs/utility/cstringutil.h"
110 #include "gromacs/utility/exceptions.h"
111 #include "gromacs/utility/fatalerror.h"
112 #include "gromacs/utility/gmxassert.h"
113 #include "gromacs/utility/gmxmpi.h"
114 #include "gromacs/utility/logger.h"
115 #include "gromacs/utility/pleasecite.h"
116 #include "gromacs/utility/smalloc.h"
117 #include "gromacs/utility/strconvert.h"
118 #include "gromacs/utility/sysinfo.h"
120 #include "nbnxn_gpu.h"
121 #include "nbnxn_kernels/nbnxn_kernel_cpu.h"
122 #include "nbnxn_kernels/nbnxn_kernel_prune.h"
124 // TODO: this environment variable allows us to verify before release
125 // that on less common architectures the total cost of polling is not larger than
126 // a blocking wait (so polling does not introduce overhead when the static
127 // PME-first ordering would suffice).
128 static const bool c_disableAlternatingWait = (getenv("GMX_DISABLE_ALTERNATING_GPU_WAIT") != nullptr);
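// Note: only the presence of the variable is tested, so exporting it with any value,
// e.g. GMX_DISABLE_ALTERNATING_GPU_WAIT=1, disables the alternating wait and falls
// back to the static PME-first wait ordering used on the non-alternating path.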
131 void print_time(FILE *out,
132 gmx_walltime_accounting_t walltime_accounting,
133 int64_t step,
134 const t_inputrec *ir,
135 const t_commrec *cr)
137 time_t finish;
138 double dt, elapsed_seconds, time_per_step;
140 #if !GMX_THREAD_MPI
141 if (!PAR(cr))
142 #endif
144 fprintf(out, "\r");
146 fputs("step ", out);
147 fputs(gmx::int64ToString(step).c_str(), out);
148 fflush(out);
150 if ((step >= ir->nstlist))
152 double seconds_since_epoch = gmx_gettime();
153 elapsed_seconds = seconds_since_epoch - walltime_accounting_get_start_time_stamp(walltime_accounting);
154 time_per_step = elapsed_seconds/(step - ir->init_step + 1);
155 dt = (ir->nsteps + ir->init_step - step) * time_per_step;
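// The estimate extrapolates linearly: time_per_step is the measured wall-clock seconds
// per step so far, and dt multiplies that rate by the number of steps remaining. The
// ns/day figure printed below follows from delta_t being in ps:
//   ns/day = (delta_t/1000 ns per step) * (86400 s/day) / (time_per_step s per step).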
157 if (ir->nsteps >= 0)
159 if (dt >= 300)
161 finish = static_cast<time_t>(seconds_since_epoch + dt);
162 auto timebuf = gmx_ctime_r(&finish);
163 timebuf.erase(timebuf.find_first_of('\n'));
164 fputs(", will finish ", out);
165 fputs(timebuf.c_str(), out);
167 else
169 fprintf(out, ", remaining wall clock time: %5d s ", static_cast<int>(dt));
172 else
174 fprintf(out, " performance: %.1f ns/day ",
175 ir->delta_t/1000*24*60*60/time_per_step);
178 #if !GMX_THREAD_MPI
179 if (PAR(cr))
181 fprintf(out, "\n");
183 #else
184 GMX_UNUSED_VALUE(cr);
185 #endif
187 fflush(out);
190 void print_date_and_time(FILE *fplog, int nodeid, const char *title,
191 double the_time)
193 if (!fplog)
195 return;
198 time_t temp_time = static_cast<time_t>(the_time);
200 auto timebuf = gmx_ctime_r(&temp_time);
202 fprintf(fplog, "%s on rank %d %s\n", title, nodeid, timebuf.c_str());
205 void print_start(FILE *fplog, const t_commrec *cr,
206 gmx_walltime_accounting_t walltime_accounting,
207 const char *name)
209 char buf[STRLEN];
211 sprintf(buf, "Started %s", name);
212 print_date_and_time(fplog, cr->nodeid, buf,
213 walltime_accounting_get_start_time_stamp(walltime_accounting));
216 static void sum_forces(rvec f[], gmx::ArrayRef<const gmx::RVec> forceToAdd)
218 const int end = forceToAdd.size();
220 int gmx_unused nt = gmx_omp_nthreads_get(emntDefault);
221 #pragma omp parallel for num_threads(nt) schedule(static)
222 for (int i = 0; i < end; i++)
224 rvec_inc(f[i], forceToAdd[i]);
228 static void calc_virial(int start, int homenr, const rvec x[], const rvec f[],
229 tensor vir_part, const t_graph *graph, const matrix box,
230 t_nrnb *nrnb, const t_forcerec *fr, int ePBC)
232 /* The short-range virial from surrounding boxes */
233 calc_vir(SHIFTS, fr->shift_vec, fr->fshift, vir_part, ePBC == epbcSCREW, box);
234 inc_nrnb(nrnb, eNR_VIRIAL, SHIFTS);
236 /* Calculate partial virial, for local atoms only, based on short range.
237 * Total virial is computed in global_stat, called from do_md
239 f_calc_vir(start, start+homenr, x, f, vir_part, graph, box);
240 inc_nrnb(nrnb, eNR_VIRIAL, homenr);
242 if (debug)
244 pr_rvecs(debug, 0, "vir_part", vir_part, DIM);
248 static void pull_potential_wrapper(const t_commrec *cr,
249 const t_inputrec *ir,
250 const matrix box, gmx::ArrayRef<const gmx::RVec> x,
251 gmx::ForceWithVirial *force,
252 const t_mdatoms *mdatoms,
253 gmx_enerdata_t *enerd,
254 const real *lambda,
255 double t,
256 gmx_wallcycle_t wcycle)
258 t_pbc pbc;
259 real dvdl;
261 /* Calculate the center of mass forces, this requires communication,
262 * which is why pull_potential is called close to other communication.
264 wallcycle_start(wcycle, ewcPULLPOT);
265 set_pbc(&pbc, ir->ePBC, box);
266 dvdl = 0;
267 enerd->term[F_COM_PULL] +=
268 pull_potential(ir->pull_work, mdatoms, &pbc,
269 cr, t, lambda[efptRESTRAINT], as_rvec_array(x.data()), force, &dvdl);
270 enerd->dvdl_lin[efptRESTRAINT] += dvdl;
271 wallcycle_stop(wcycle, ewcPULLPOT);
274 static void pme_receive_force_ener(const t_commrec *cr,
275 gmx::ForceWithVirial *forceWithVirial,
276 gmx_enerdata_t *enerd,
277 gmx_wallcycle_t wcycle)
279 real e_q, e_lj, dvdl_q, dvdl_lj;
280 float cycles_ppdpme, cycles_seppme;
282 cycles_ppdpme = wallcycle_stop(wcycle, ewcPPDURINGPME);
283 dd_cycles_add(cr->dd, cycles_ppdpme, ddCyclPPduringPME);
285 /* In case of node-splitting, the PP nodes receive the long-range
286 * forces, virial and energy from the PME nodes here.
288 wallcycle_start(wcycle, ewcPP_PMEWAITRECVF);
289 dvdl_q = 0;
290 dvdl_lj = 0;
291 gmx_pme_receive_f(cr, forceWithVirial, &e_q, &e_lj, &dvdl_q, &dvdl_lj,
292 &cycles_seppme);
293 enerd->term[F_COUL_RECIP] += e_q;
294 enerd->term[F_LJ_RECIP] += e_lj;
295 enerd->dvdl_lin[efptCOUL] += dvdl_q;
296 enerd->dvdl_lin[efptVDW] += dvdl_lj;
298 if (wcycle)
300 dd_cycles_add(cr->dd, cycles_seppme, ddCyclPME);
302 wallcycle_stop(wcycle, ewcPP_PMEWAITRECVF);
305 static void print_large_forces(FILE *fp,
306 const t_mdatoms *md,
307 const t_commrec *cr,
308 int64_t step,
309 real forceTolerance,
310 const rvec *x,
311 const rvec *f)
313 real force2Tolerance = gmx::square(forceTolerance);
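// Comparing squared norms against the squared tolerance avoids taking a square root
// for every atom; the sqrt is only evaluated for the few atoms that are printed.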
314 gmx::index numNonFinite = 0;
315 for (int i = 0; i < md->homenr; i++)
317 real force2 = norm2(f[i]);
318 bool nonFinite = !std::isfinite(force2);
319 if (force2 >= force2Tolerance || nonFinite)
321 fprintf(fp, "step %" PRId64 " atom %6d x %8.3f %8.3f %8.3f force %12.5e\n",
322 step,
323 ddglatnr(cr->dd, i), x[i][XX], x[i][YY], x[i][ZZ], std::sqrt(force2));
325 if (nonFinite)
327 numNonFinite++;
330 if (numNonFinite > 0)
332 /* Note that with MPI this fatal call on one rank might interrupt
333 * the printing on other ranks. But we can only avoid that with
334 * an expensive MPI barrier that we would need at each step.
336 gmx_fatal(FARGS, "At step %" PRId64 " detected non-finite forces on %td atoms", step, numNonFinite);
340 static void post_process_forces(const t_commrec *cr,
341 int64_t step,
342 t_nrnb *nrnb,
343 gmx_wallcycle_t wcycle,
344 const gmx_localtop_t *top,
345 const matrix box,
346 const rvec x[],
347 rvec f[],
348 gmx::ForceWithVirial *forceWithVirial,
349 tensor vir_force,
350 const t_mdatoms *mdatoms,
351 const t_graph *graph,
352 const t_forcerec *fr,
353 const gmx_vsite_t *vsite,
354 int flags)
356 if (fr->haveDirectVirialContributions)
358 rvec *fDirectVir = as_rvec_array(forceWithVirial->force_.data());
360 if (vsite)
362 /* Spread the mesh force on virtual sites to the other particles...
363 * This is parallelized. MPI communication is performed
364 * if the constructing atoms aren't local.
366 matrix virial = { { 0 } };
367 spread_vsite_f(vsite, x, fDirectVir, nullptr,
368 (flags & GMX_FORCE_VIRIAL) != 0, virial,
369 nrnb,
370 &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr, wcycle);
371 forceWithVirial->addVirialContribution(virial);
374 if (flags & GMX_FORCE_VIRIAL)
376 /* Now add the forces, this is local */
377 sum_forces(f, forceWithVirial->force_);
379 /* Add the direct virial contributions */
380 GMX_ASSERT(forceWithVirial->computeVirial_, "forceWithVirial should request virial computation when we request the virial");
381 m_add(vir_force, forceWithVirial->getVirial(), vir_force);
383 if (debug)
385 pr_rvecs(debug, 0, "vir_force", vir_force, DIM);
390 if (fr->print_force >= 0)
392 print_large_forces(stderr, mdatoms, cr, step, fr->print_force, x, f);
396 static void do_nb_verlet(const t_forcerec *fr,
397 const interaction_const_t *ic,
398 gmx_enerdata_t *enerd,
399 int flags, int ilocality,
400 int clearF,
401 int64_t step,
402 t_nrnb *nrnb,
403 gmx_wallcycle_t wcycle)
405 if (!(flags & GMX_FORCE_NONBONDED))
407 /* skip non-bonded calculation */
408 return;
411 nonbonded_verlet_t *nbv = fr->nbv;
412 nonbonded_verlet_group_t *nbvg = &nbv->grp[ilocality];
414 /* GPU kernel launch overhead is already timed separately */
415 if (fr->cutoff_scheme != ecutsVERLET)
417 gmx_incons("Invalid cut-off scheme passed!");
420 bool bUsingGpuKernels = (nbvg->kernel_type == nbnxnk8x8x8_GPU);
422 if (!bUsingGpuKernels)
424 /* When dynamic pair-list pruning is requested, we need to prune
425 * at nstlistPrune steps.
427 if (nbv->listParams->useDynamicPruning &&
428 (step - nbvg->nbl_lists.outerListCreationStep) % nbv->listParams->nstlistPrune == 0)
430 /* Prune the pair-list beyond fr->ic->rlistPrune using
431 * the current coordinates of the atoms.
433 wallcycle_sub_start(wcycle, ewcsNONBONDED_PRUNING);
434 nbnxn_kernel_cpu_prune(nbvg, nbv->nbat, fr->shift_vec, nbv->listParams->rlistInner);
435 wallcycle_sub_stop(wcycle, ewcsNONBONDED_PRUNING);
438 wallcycle_sub_start(wcycle, ewcsNONBONDED);
441 switch (nbvg->kernel_type)
443 case nbnxnk4x4_PlainC:
444 case nbnxnk4xN_SIMD_4xN:
445 case nbnxnk4xN_SIMD_2xNN:
446 nbnxn_kernel_cpu(nbvg,
447 nbv->nbat,
449 fr->shift_vec,
450 flags,
451 clearF,
452 fr->fshift[0],
453 enerd->grpp.ener[egCOULSR],
454 fr->bBHAM ?
455 enerd->grpp.ener[egBHAMSR] :
456 enerd->grpp.ener[egLJSR]);
457 break;
459 case nbnxnk8x8x8_GPU:
460 nbnxn_gpu_launch_kernel(nbv->gpu_nbv, flags, ilocality);
461 break;
463 case nbnxnk8x8x8_PlainC:
464 nbnxn_kernel_gpu_ref(nbvg->nbl_lists.nblGpu[0],
465 nbv->nbat, ic,
466 fr->shift_vec,
467 flags,
468 clearF,
469 nbv->nbat->out[0].f,
470 fr->fshift[0],
471 enerd->grpp.ener[egCOULSR],
472 fr->bBHAM ?
473 enerd->grpp.ener[egBHAMSR] :
474 enerd->grpp.ener[egLJSR]);
475 break;
477 default:
478 GMX_RELEASE_ASSERT(false, "Invalid nonbonded kernel type passed!");
481 if (!bUsingGpuKernels)
483 wallcycle_sub_stop(wcycle, ewcsNONBONDED);
486 int enr_nbnxn_kernel_ljc, enr_nbnxn_kernel_lj;
487 if (EEL_RF(ic->eeltype) || ic->eeltype == eelCUT)
489 enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_RF;
491 else if ((!bUsingGpuKernels && nbvg->ewald_excl == ewaldexclAnalytical) ||
492 (bUsingGpuKernels && nbnxn_gpu_is_kernel_ewald_analytical(nbv->gpu_nbv)))
494 enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_EWALD;
496 else
498 enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_TAB;
500 enr_nbnxn_kernel_lj = eNR_NBNXN_LJ;
501 if (flags & GMX_FORCE_ENERGY)
503 /* In eNR_??? the nbnxn F+E kernels are always the F kernel + 1 */
504 enr_nbnxn_kernel_ljc += 1;
505 enr_nbnxn_kernel_lj += 1;
508 inc_nrnb(nrnb, enr_nbnxn_kernel_ljc,
509 nbvg->nbl_lists.natpair_ljq);
510 inc_nrnb(nrnb, enr_nbnxn_kernel_lj,
511 nbvg->nbl_lists.natpair_lj);
512 /* The Coulomb-only kernels are offset -eNR_NBNXN_LJ_RF+eNR_NBNXN_RF */
513 inc_nrnb(nrnb, enr_nbnxn_kernel_ljc-eNR_NBNXN_LJ_RF+eNR_NBNXN_RF,
514 nbvg->nbl_lists.natpair_q);
516 if (ic->vdw_modifier == eintmodFORCESWITCH)
518 /* We add up the switch cost separately */
519 inc_nrnb(nrnb, eNR_NBNXN_ADD_LJ_FSW+((flags & GMX_FORCE_ENERGY) ? 1 : 0),
520 nbvg->nbl_lists.natpair_ljq + nbvg->nbl_lists.natpair_lj);
522 if (ic->vdw_modifier == eintmodPOTSWITCH)
524 /* We add up the switch cost separately */
525 inc_nrnb(nrnb, eNR_NBNXN_ADD_LJ_PSW+((flags & GMX_FORCE_ENERGY) ? 1 : 0),
526 nbvg->nbl_lists.natpair_ljq + nbvg->nbl_lists.natpair_lj);
528 if (ic->vdwtype == evdwPME)
530 /* We add up the LJ Ewald cost separately */
531 inc_nrnb(nrnb, eNR_NBNXN_ADD_LJ_EWALD+((flags & GMX_FORCE_ENERGY) ? 1 : 0),
532 nbvg->nbl_lists.natpair_ljq + nbvg->nbl_lists.natpair_lj);
536 static void do_nb_verlet_fep(nbnxn_pairlist_set_t *nbl_lists,
537 t_forcerec *fr,
538 rvec x[],
539 rvec f[],
540 const t_mdatoms *mdatoms,
541 t_lambda *fepvals,
542 real *lambda,
543 gmx_enerdata_t *enerd,
544 int flags,
545 t_nrnb *nrnb,
546 gmx_wallcycle_t wcycle)
548 int donb_flags;
549 nb_kernel_data_t kernel_data;
550 real lam_i[efptNR];
551 real dvdl_nb[efptNR];
552 int th;
553 int i, j;
555 donb_flags = 0;
556 /* Add short-range interactions */
557 donb_flags |= GMX_NONBONDED_DO_SR;
559 /* Currently all group scheme kernels always calculate (shift-)forces */
560 if (flags & GMX_FORCE_FORCES)
562 donb_flags |= GMX_NONBONDED_DO_FORCE;
564 if (flags & GMX_FORCE_VIRIAL)
566 donb_flags |= GMX_NONBONDED_DO_SHIFTFORCE;
568 if (flags & GMX_FORCE_ENERGY)
570 donb_flags |= GMX_NONBONDED_DO_POTENTIAL;
573 kernel_data.flags = donb_flags;
574 kernel_data.lambda = lambda;
575 kernel_data.dvdl = dvdl_nb;
577 kernel_data.energygrp_elec = enerd->grpp.ener[egCOULSR];
578 kernel_data.energygrp_vdw = enerd->grpp.ener[egLJSR];
580 /* reset free energy components */
581 for (i = 0; i < efptNR; i++)
583 dvdl_nb[i] = 0;
586 GMX_ASSERT(gmx_omp_nthreads_get(emntNonbonded) == nbl_lists->nnbl, "Number of lists should be same as number of NB threads");
588 wallcycle_sub_start(wcycle, ewcsNONBONDED);
589 #pragma omp parallel for schedule(static) num_threads(nbl_lists->nnbl)
590 for (th = 0; th < nbl_lists->nnbl; th++)
594 gmx_nb_free_energy_kernel(nbl_lists->nbl_fep[th],
595 x, f, fr, mdatoms, &kernel_data, nrnb);
597 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
600 if (fepvals->sc_alpha != 0)
602 enerd->dvdl_nonlin[efptVDW] += dvdl_nb[efptVDW];
603 enerd->dvdl_nonlin[efptCOUL] += dvdl_nb[efptCOUL];
605 else
607 enerd->dvdl_lin[efptVDW] += dvdl_nb[efptVDW];
608 enerd->dvdl_lin[efptCOUL] += dvdl_nb[efptCOUL];
611 /* If we do foreign lambda and we have soft-core interactions
612 * we have to recalculate the (non-linear) energy contributions.
614 if (fepvals->n_lambda > 0 && (flags & GMX_FORCE_DHDL) && fepvals->sc_alpha != 0)
616 kernel_data.flags = (donb_flags & ~(GMX_NONBONDED_DO_FORCE | GMX_NONBONDED_DO_SHIFTFORCE)) | GMX_NONBONDED_DO_FOREIGNLAMBDA;
617 kernel_data.lambda = lam_i;
618 kernel_data.energygrp_elec = enerd->foreign_grpp.ener[egCOULSR];
619 kernel_data.energygrp_vdw = enerd->foreign_grpp.ener[egLJSR];
620 /* Note that we add to kernel_data.dvdl, but ignore the result */
622 for (i = 0; i < enerd->n_lambda; i++)
624 for (j = 0; j < efptNR; j++)
626 lam_i[j] = (i == 0 ? lambda[j] : fepvals->all_lambda[j][i-1]);
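// i == 0 re-evaluates at the current lambda values; i >= 1 walks the foreign-lambda
// table all_lambda, whose index is therefore offset by one.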
628 reset_foreign_enerdata(enerd);
629 #pragma omp parallel for schedule(static) num_threads(nbl_lists->nnbl)
630 for (th = 0; th < nbl_lists->nnbl; th++)
634 gmx_nb_free_energy_kernel(nbl_lists->nbl_fep[th],
635 x, f, fr, mdatoms, &kernel_data, nrnb);
637 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
640 sum_epot(&(enerd->foreign_grpp), enerd->foreign_term);
641 enerd->enerpart_lambda[i] += enerd->foreign_term[F_EPOT];
645 wallcycle_sub_stop(wcycle, ewcsNONBONDED);
648 gmx_bool use_GPU(const nonbonded_verlet_t *nbv)
650 return nbv != nullptr && nbv->bUseGPU;
653 static inline void clear_rvecs_omp(int n, rvec v[])
655 int nth = gmx_omp_nthreads_get_simple_rvec_task(emntDefault, n);
657 /* Note that we would like to avoid this conditional by putting it
658 * into the omp pragma instead, but then we still take the full
659 * omp parallel for overhead (at least with gcc5).
661 if (nth == 1)
663 for (int i = 0; i < n; i++)
665 clear_rvec(v[i]);
668 else
670 #pragma omp parallel for num_threads(nth) schedule(static)
671 for (int i = 0; i < n; i++)
673 clear_rvec(v[i]);
678 /*! \brief Return an estimate of the average kinetic energy or 0 when unreliable
680 * \param groupOptions Group options, containing T-coupling options
682 static real averageKineticEnergyEstimate(const t_grpopts &groupOptions)
684 real nrdfCoupled = 0;
685 real nrdfUncoupled = 0;
686 real kineticEnergy = 0;
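// By equipartition, each temperature-coupled degree of freedom carries k_B*T_ref/2 on
// average, which is what the 0.5*nrdf*ref_t*BOLTZ term below accumulates.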
687 for (int g = 0; g < groupOptions.ngtc; g++)
689 if (groupOptions.tau_t[g] >= 0)
691 nrdfCoupled += groupOptions.nrdf[g];
692 kineticEnergy += groupOptions.nrdf[g]*0.5*groupOptions.ref_t[g]*BOLTZ;
694 else
696 nrdfUncoupled += groupOptions.nrdf[g];
700 /* This conditional with > also catches nrdf=0 */
701 if (nrdfCoupled > nrdfUncoupled)
703 return kineticEnergy*(nrdfCoupled + nrdfUncoupled)/nrdfCoupled;
705 else
707 return 0;
711 /*! \brief This routine checks that the potential energy is finite.
713 * Always checks that the potential energy is finite. If step equals
714 * inputrec.init_step also checks that the magnitude of the potential energy
715 * is reasonable. Terminates with a fatal error when a check fails.
716 * Note that passing this check does not guarantee finite forces,
717 * since those use slightly different arithmetic. But in most cases
718 * there is just a narrow coordinate range where forces are not finite
719 * and energies are finite.
721 * \param[in] step The step number, used for checking and printing
722 * \param[in] enerd The energy data; the non-bonded group energies need to be added to enerd.term[F_EPOT] before calling this routine
723 * \param[in] inputrec The input record
725 static void checkPotentialEnergyValidity(int64_t step,
726 const gmx_enerdata_t &enerd,
727 const t_inputrec &inputrec)
729 /* Threshold valid for comparing absolute potential energy against
730 * the kinetic energy. Normally one should not consider absolute
731 * potential energy values, but with a factor of one million
732 * we should never get false positives.
734 constexpr real c_thresholdFactor = 1e6;
736 bool energyIsNotFinite = !std::isfinite(enerd.term[F_EPOT]);
737 real averageKineticEnergy = 0;
738 /* We only check for large potential energy at the initial step,
739 * because that is by far the most likely step for this to occur
740 * and because computing the average kinetic energy is not free.
741 * Note: nstcalcenergy >> 1 often does not allow catching large energies
742 * before they become NaN.
744 if (step == inputrec.init_step && EI_DYNAMICS(inputrec.eI))
746 averageKineticEnergy = averageKineticEnergyEstimate(inputrec.opts);
749 if (energyIsNotFinite || (averageKineticEnergy > 0 &&
750 enerd.term[F_EPOT] > c_thresholdFactor*averageKineticEnergy))
752 gmx_fatal(FARGS, "Step %" PRId64 ": The total potential energy is %g, which is %s. The LJ and electrostatic contributions to the energy are %g and %g, respectively. A %s potential energy can be caused by overlapping interactions in bonded interactions or very large%s coordinate values. Usually this is caused by a badly- or non-equilibrated initial configuration, incorrect interactions or parameters in the topology.",
753 step,
754 enerd.term[F_EPOT],
755 energyIsNotFinite ? "not finite" : "extremely high",
756 enerd.term[F_LJ],
757 enerd.term[F_COUL_SR],
758 energyIsNotFinite ? "non-finite" : "very high",
759 energyIsNotFinite ? " or NaN" : "");
763 /*! \brief Compute forces and/or energies for special algorithms
765 * The intention is to collect all calls to algorithms that compute
766 * forces on local atoms only and that do not contribute to the local
767 * virial sum (but add their virial contribution separately).
768 * Eventually these should likely all become ForceProviders.
769 * Within this function the intention is to have algorithms that do
770 * global communication at the end, so global barriers within the MD loop
771 * are as close together as possible.
773 * \param[in] fplog The log file
774 * \param[in] cr The communication record
775 * \param[in] inputrec The input record
776 * \param[in] awh The Awh module (nullptr if none in use).
777 * \param[in] enforcedRotation Enforced rotation module.
778 * \param[in] step The current MD step
779 * \param[in] t The current time
780 * \param[in,out] wcycle Wallcycle accounting struct
781 * \param[in,out] forceProviders Pointer to a list of force providers
782 * \param[in] box The unit cell
783 * \param[in] x The coordinates
784 * \param[in] mdatoms Per atom properties
785 * \param[in] lambda Array of free-energy lambda values
786 * \param[in] forceFlags Flags that tell whether we should compute forces/energies/virial
787 * \param[in,out] forceWithVirial Force and virial buffers
788 * \param[in,out] enerd Energy buffer
789 * \param[in,out] ed Essential dynamics pointer
790 * \param[in] bNS Tells if we did neighbor searching this step, used for ED sampling
792 * \todo Remove bNS, which is used incorrectly.
793 * \todo Convert all other algorithms called here to ForceProviders.
795 static void
796 computeSpecialForces(FILE *fplog,
797 const t_commrec *cr,
798 const t_inputrec *inputrec,
799 gmx::Awh *awh,
800 gmx_enfrot *enforcedRotation,
801 int64_t step,
802 double t,
803 gmx_wallcycle_t wcycle,
804 ForceProviders *forceProviders,
805 matrix box,
806 gmx::ArrayRef<const gmx::RVec> x,
807 const t_mdatoms *mdatoms,
808 real *lambda,
809 int forceFlags,
810 gmx::ForceWithVirial *forceWithVirial,
811 gmx_enerdata_t *enerd,
812 gmx_edsam *ed,
813 gmx_bool bNS)
815 const bool computeForces = (forceFlags & GMX_FORCE_FORCES) != 0;
817 /* NOTE: Currently all ForceProviders only provide forces.
818 * When they also provide energies, remove this conditional.
820 if (computeForces)
822 gmx::ForceProviderInput forceProviderInput(x, *mdatoms, t, box, *cr);
823 gmx::ForceProviderOutput forceProviderOutput(forceWithVirial, enerd);
825 /* Collect forces from modules */
826 forceProviders->calculateForces(forceProviderInput, &forceProviderOutput);
829 if (inputrec->bPull && pull_have_potential(inputrec->pull_work))
831 pull_potential_wrapper(cr, inputrec, box, x,
832 forceWithVirial,
833 mdatoms, enerd, lambda, t,
834 wcycle);
836 if (awh)
838 enerd->term[F_COM_PULL] +=
839 awh->applyBiasForcesAndUpdateBias(inputrec->ePBC, *mdatoms, box,
840 forceWithVirial,
841 t, step, wcycle, fplog);
845 rvec *f = as_rvec_array(forceWithVirial->force_.data());
847 /* Add the forces from enforced rotation potentials (if any) */
848 if (inputrec->bRot)
850 wallcycle_start(wcycle, ewcROTadd);
851 enerd->term[F_COM_PULL] += add_rot_forces(enforcedRotation, f, cr, step, t);
852 wallcycle_stop(wcycle, ewcROTadd);
855 if (ed)
857 /* Note that since init_edsam() is called after the initialization
858 * of forcerec, edsam doesn't request the noVirSum force buffer.
859 * Thus if no other algorithm (e.g. PME) requires it, the forces
860 * here will contribute to the virial.
862 do_flood(cr, inputrec, as_rvec_array(x.data()), f, ed, box, step, bNS);
865 /* Add forces from interactive molecular dynamics (IMD), if bIMD == TRUE. */
866 if (inputrec->bIMD && computeForces)
868 IMD_apply_forces(inputrec->bIMD, inputrec->imd, cr, f, wcycle);
872 /*! \brief Launch the prepare_step and spread stages of PME GPU.
874 * \param[in] pmedata The PME structure
875 * \param[in] box The box matrix
876 * \param[in] x Coordinate array
877 * \param[in] flags Force flags
878 * \param[in] pmeFlags PME flags
879 * \param[in] wcycle The wallcycle structure
881 static inline void launchPmeGpuSpread(gmx_pme_t *pmedata,
882 matrix box,
883 rvec x[],
884 int flags,
885 int pmeFlags,
886 gmx_wallcycle_t wcycle)
888 pme_gpu_prepare_computation(pmedata, (flags & GMX_FORCE_DYNAMICBOX) != 0, box, wcycle, pmeFlags);
889 pme_gpu_launch_spread(pmedata, x, wcycle);
892 /*! \brief Launch the FFT and gather stages of PME GPU
894 * This function only implements setting the output forces (no accumulation).
896 * \param[in] pmedata The PME structure
897 * \param[in] wcycle The wallcycle structure
899 static void launchPmeGpuFftAndGather(gmx_pme_t *pmedata,
900 gmx_wallcycle_t wcycle)
902 pme_gpu_launch_complex_transforms(pmedata, wcycle);
903 pme_gpu_launch_gather(pmedata, wcycle, PmeForceOutputHandling::Set);
906 /*! \brief
907 * Polling wait for either of the PME or nonbonded GPU tasks.
909 * Instead of a static order in waiting for GPU tasks, this function
910 * polls checking which of the two tasks completes first, and does the
911 * associated force buffer reduction overlapped with the other task.
912 * By doing that, unlike static scheduling order, it can always overlap
913 * one of the reductions, regardless of the GPU task completion order.
915 * \param[in] nbv Nonbonded verlet structure
916 * \param[in,out] pmedata PME module data
917 * \param[in,out] force Force array to reduce task outputs into.
918 * \param[in,out] forceWithVirial Force and virial buffers
919 * \param[in,out] fshift Shift force output vector results are reduced into
920 * \param[in,out] enerd Energy data structure results are reduced into
921 * \param[in] flags Force flags
922 * \param[in] pmeFlags PME flags
923 * \param[in] haveOtherWork Tells whether there is other work than non-bonded in the stream(s)
924 * \param[in] wcycle The wallcycle structure
926 static void alternatePmeNbGpuWaitReduce(nonbonded_verlet_t *nbv,
927 gmx_pme_t *pmedata,
928 gmx::ArrayRefWithPadding<gmx::RVec> *force,
929 gmx::ForceWithVirial *forceWithVirial,
930 rvec fshift[],
931 gmx_enerdata_t *enerd,
932 int flags,
933 int pmeFlags,
934 bool haveOtherWork,
935 gmx_wallcycle_t wcycle)
937 bool isPmeGpuDone = false;
938 bool isNbGpuDone = false;
941 gmx::ArrayRef<const gmx::RVec> pmeGpuForces;
943 while (!isPmeGpuDone || !isNbGpuDone)
945 if (!isPmeGpuDone)
947 GpuTaskCompletion completionType = (isNbGpuDone) ? GpuTaskCompletion::Wait : GpuTaskCompletion::Check;
948 isPmeGpuDone = pme_gpu_try_finish_task(pmedata, pmeFlags, wcycle, forceWithVirial, enerd, completionType);
951 if (!isNbGpuDone)
953 GpuTaskCompletion completionType = (isPmeGpuDone) ? GpuTaskCompletion::Wait : GpuTaskCompletion::Check;
954 wallcycle_start_nocount(wcycle, ewcWAIT_GPU_NB_L);
955 isNbGpuDone = nbnxn_gpu_try_finish_task(nbv->gpu_nbv,
956 flags, eatLocal,
957 haveOtherWork,
958 enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
959 fshift, completionType);
960 wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
961 // To get the call count right, when the task finished we
962 // issue a start/stop.
963 // TODO: move the ewcWAIT_GPU_NB_L cycle counting into nbnxn_gpu_try_finish_task()
964 // and ewcNB_XF_BUF_OPS counting into nbnxn_atomdata_add_nbat_f_to_f().
965 if (isNbGpuDone)
967 wallcycle_start(wcycle, ewcWAIT_GPU_NB_L);
968 wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
970 nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), eatLocal,
971 nbv->nbat, as_rvec_array(force->unpaddedArrayRef().data()), wcycle);
977 /*! \brief
978 * Launch the dynamic rolling pruning GPU task.
980 * We currently alternate local/non-local list pruning in odd-even steps
981 * (only pruning every second step without DD).
983 * \param[in] cr The communication record
984 * \param[in] nbv Nonbonded verlet structure
985 * \param[in] inputrec The input record
986 * \param[in] step The current MD step
988 static inline void launchGpuRollingPruning(const t_commrec *cr,
989 const nonbonded_verlet_t *nbv,
990 const t_inputrec *inputrec,
991 const int64_t step)
993 /* We should not launch the rolling pruning kernel at a search
994 * step or just before search steps, since that's useless.
995 * Without domain decomposition we prune at even steps.
996 * With domain decomposition we alternate local and non-local
997 * pruning at even and odd steps.
999 int numRollingParts = nbv->listParams->numRollingParts;
1000 GMX_ASSERT(numRollingParts == nbv->listParams->nstlistPrune/2, "Since we alternate local/non-local at even/odd steps, we need numRollingParts<=nstlistPrune/2 for correctness and == for efficiency");
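// Illustration (not from the source): with nstlistPrune = 10 there are
// numRollingParts = 5 chunks; one chunk is launched per eligible step, so the whole
// list is refreshed roughly once per pruning interval.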
1001 int stepWithCurrentList = step - nbv->grp[eintLocal].nbl_lists.outerListCreationStep;
1002 bool stepIsEven = ((stepWithCurrentList & 1) == 0);
1003 if (stepWithCurrentList > 0 &&
1004 stepWithCurrentList < inputrec->nstlist - 1 &&
1005 (stepIsEven || DOMAINDECOMP(cr)))
1007 nbnxn_gpu_launch_kernel_pruneonly(nbv->gpu_nbv,
1008 stepIsEven ? eintLocal : eintNonlocal,
1009 numRollingParts);
1013 static void do_force_cutsVERLET(FILE *fplog,
1014 const t_commrec *cr,
1015 const gmx_multisim_t *ms,
1016 const t_inputrec *inputrec,
1017 gmx::Awh *awh,
1018 gmx_enfrot *enforcedRotation,
1019 int64_t step,
1020 t_nrnb *nrnb,
1021 gmx_wallcycle_t wcycle,
1022 const gmx_localtop_t *top,
1023 const gmx_groups_t * /* groups */,
1024 matrix box, gmx::ArrayRefWithPadding<gmx::RVec> x,
1025 history_t *hist,
1026 gmx::ArrayRefWithPadding<gmx::RVec> force,
1027 tensor vir_force,
1028 const t_mdatoms *mdatoms,
1029 gmx_enerdata_t *enerd, t_fcdata *fcd,
1030 real *lambda,
1031 t_graph *graph,
1032 t_forcerec *fr,
1033 gmx::PpForceWorkload *ppForceWorkload,
1034 interaction_const_t *ic,
1035 const gmx_vsite_t *vsite,
1036 rvec mu_tot,
1037 double t,
1038 gmx_edsam *ed,
1039 const int flags,
1040 DdOpenBalanceRegionBeforeForceComputation ddOpenBalanceRegion,
1041 DdCloseBalanceRegionAfterForceComputation ddCloseBalanceRegion)
1043 int cg1, i, j;
1044 double mu[2*DIM];
1045 gmx_bool bStateChanged, bNS, bFillGrid, bCalcCGCM;
1046 gmx_bool bDoForces, bUseGPU, bUseOrEmulGPU;
1047 rvec vzero, box_diag;
1048 float cycles_pme, cycles_wait_gpu;
1049 nonbonded_verlet_t *nbv = fr->nbv;
1051 bStateChanged = ((flags & GMX_FORCE_STATECHANGED) != 0);
1052 bNS = ((flags & GMX_FORCE_NS) != 0) && (!fr->bAllvsAll);
1053 bFillGrid = (bNS && bStateChanged);
1054 bCalcCGCM = (bFillGrid && !DOMAINDECOMP(cr));
1055 bDoForces = ((flags & GMX_FORCE_FORCES) != 0);
1056 bUseGPU = fr->nbv->bUseGPU;
1057 bUseOrEmulGPU = bUseGPU || (fr->nbv->emulateGpu == EmulateGpuNonbonded::Yes);
1059 const auto pmeRunMode = fr->pmedata ? pme_run_mode(fr->pmedata) : PmeRunMode::CPU;
1060 // TODO slim this conditional down - inputrec and duty checks should mean the same in proper code!
1061 const bool useGpuPme = EEL_PME(fr->ic->eeltype) && thisRankHasDuty(cr, DUTY_PME) &&
1062 ((pmeRunMode == PmeRunMode::GPU) || (pmeRunMode == PmeRunMode::Mixed));
1063 const int pmeFlags = GMX_PME_SPREAD | GMX_PME_SOLVE |
1064 ((flags & GMX_FORCE_VIRIAL) ? GMX_PME_CALC_ENER_VIR : 0) |
1065 ((flags & GMX_FORCE_ENERGY) ? GMX_PME_CALC_ENER_VIR : 0) |
1066 ((flags & GMX_FORCE_FORCES) ? GMX_PME_CALC_F : 0);
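// Spread and solve are always requested; the energy/virial term is added when either
// the virial or the energy is needed this step, and forces only when forces are needed.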
1068 /* At a search step we need to start the first balancing region
1069 * somewhere early inside the step after communication during domain
1070 * decomposition (and not during the previous step as usual).
1072 if (bNS &&
1073 ddOpenBalanceRegion == DdOpenBalanceRegionBeforeForceComputation::yes)
1075 ddOpenBalanceRegionCpu(cr->dd, DdAllowBalanceRegionReopen::yes);
1078 cycles_wait_gpu = 0;
1080 const int start = 0;
1081 const int homenr = mdatoms->homenr;
1083 clear_mat(vir_force);
1085 if (DOMAINDECOMP(cr))
1087 cg1 = cr->dd->globalAtomGroupIndices.size();
1089 else
1091 cg1 = top->cgs.nr;
1093 if (fr->n_tpi > 0)
1095 cg1--;
1098 if (bStateChanged)
1100 update_forcerec(fr, box);
1102 if (inputrecNeedMutot(inputrec))
1104 /* Calculate total (local) dipole moment in a temporary common array.
1105 * This makes it possible to sum them over nodes faster.
1107 calc_mu(start, homenr,
1108 x.unpaddedArrayRef(), mdatoms->chargeA, mdatoms->chargeB, mdatoms->nChargePerturbed,
1109 mu, mu+DIM);
1113 if (fr->ePBC != epbcNONE)
1115 /* Compute shift vectors every step,
1116 * because of pressure coupling or box deformation!
1118 if ((flags & GMX_FORCE_DYNAMICBOX) && bStateChanged)
1120 calc_shifts(box, fr->shift_vec);
1123 if (bCalcCGCM)
1125 put_atoms_in_box_omp(fr->ePBC, box, x.unpaddedArrayRef().subArray(0, homenr));
1126 inc_nrnb(nrnb, eNR_SHIFTX, homenr);
1128 else if (EI_ENERGY_MINIMIZATION(inputrec->eI) && graph)
1130 unshift_self(graph, box, as_rvec_array(x.unpaddedArrayRef().data()));
1134 nbnxn_atomdata_copy_shiftvec((flags & GMX_FORCE_DYNAMICBOX) != 0,
1135 fr->shift_vec, nbv->nbat);
1137 #if GMX_MPI
1138 if (!thisRankHasDuty(cr, DUTY_PME))
1140 /* Send particle coordinates to the pme nodes.
1141 * Since this is only implemented for domain decomposition
1142 * and domain decomposition does not use the graph,
1143 * we do not need to worry about shifting.
1145 gmx_pme_send_coordinates(cr, box, as_rvec_array(x.unpaddedArrayRef().data()),
1146 lambda[efptCOUL], lambda[efptVDW],
1147 (flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY)) != 0,
1148 step, wcycle);
1150 #endif /* GMX_MPI */
1152 if (useGpuPme)
1154 launchPmeGpuSpread(fr->pmedata, box, as_rvec_array(x.unpaddedArrayRef().data()), flags, pmeFlags, wcycle);
1157 /* do gridding for pair search */
1158 if (bNS)
1160 if (graph && bStateChanged)
1162 /* Calculate intramolecular shift vectors to make molecules whole */
1163 mk_mshift(fplog, graph, fr->ePBC, box, as_rvec_array(x.unpaddedArrayRef().data()));
1166 clear_rvec(vzero);
1167 box_diag[XX] = box[XX][XX];
1168 box_diag[YY] = box[YY][YY];
1169 box_diag[ZZ] = box[ZZ][ZZ];
1171 wallcycle_start(wcycle, ewcNS);
1172 if (!DOMAINDECOMP(cr))
1174 wallcycle_sub_start(wcycle, ewcsNBS_GRID_LOCAL);
1175 nbnxn_put_on_grid(nbv->nbs.get(), fr->ePBC, box,
1176 0, vzero, box_diag,
1177 nullptr, 0, mdatoms->homenr, -1,
1178 fr->cginfo, x.unpaddedArrayRef(),
1179 0, nullptr,
1180 nbv->grp[eintLocal].kernel_type,
1181 nbv->nbat);
1182 wallcycle_sub_stop(wcycle, ewcsNBS_GRID_LOCAL);
1184 else
1186 wallcycle_sub_start(wcycle, ewcsNBS_GRID_NONLOCAL);
1187 nbnxn_put_on_grid_nonlocal(nbv->nbs.get(), domdec_zones(cr->dd),
1188 fr->cginfo, x.unpaddedArrayRef(),
1189 nbv->grp[eintNonlocal].kernel_type,
1190 nbv->nbat);
1191 wallcycle_sub_stop(wcycle, ewcsNBS_GRID_NONLOCAL);
1194 nbnxn_atomdata_set(nbv->nbat, nbv->nbs.get(), mdatoms, fr->cginfo);
1196 wallcycle_stop(wcycle, ewcNS);
1199 /* initialize the GPU atom data and copy shift vector */
1200 if (bUseGPU)
1202 wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
1203 wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1205 if (bNS)
1207 nbnxn_gpu_init_atomdata(nbv->gpu_nbv, nbv->nbat);
1210 nbnxn_gpu_upload_shiftvec(nbv->gpu_nbv, nbv->nbat);
1212 wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1214 if (bNS && fr->gpuBonded)
1216 /* Now we put all atoms on the grid, we can assign bonded
1217 * interactions to the GPU, where the grid order is
1218 * needed. Also the xq, f and fshift device buffers have
1219 * been reallocated if needed, so the bonded code can
1220 * learn about them. */
1221 // TODO the xq, f, and fshift buffers are now shared
1222 // resources, so they should be maintained by a
1223 // higher-level object than the nb module.
1224 fr->gpuBonded->updateInteractionListsAndDeviceBuffers(nbnxn_get_gridindices(fr->nbv->nbs.get()),
1225 top->idef,
1226 nbnxn_gpu_get_xq(nbv->gpu_nbv),
1227 nbnxn_gpu_get_f(nbv->gpu_nbv),
1228 nbnxn_gpu_get_fshift(nbv->gpu_nbv));
1229 ppForceWorkload->haveGpuBondedWork = fr->gpuBonded->haveInteractions();
1232 wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1235 /* do local pair search */
1236 if (bNS)
1238 wallcycle_start_nocount(wcycle, ewcNS);
1239 wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_LOCAL);
1240 nbnxn_make_pairlist(nbv->nbs.get(), nbv->nbat,
1241 &top->excls,
1242 nbv->listParams->rlistOuter,
1243 nbv->min_ci_balanced,
1244 &nbv->grp[eintLocal].nbl_lists,
1245 eintLocal,
1246 nbv->grp[eintLocal].kernel_type,
1247 nrnb);
1248 nbv->grp[eintLocal].nbl_lists.outerListCreationStep = step;
1249 if (nbv->listParams->useDynamicPruning && !bUseGPU)
1251 nbnxnPrepareListForDynamicPruning(&nbv->grp[eintLocal].nbl_lists);
1253 wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_LOCAL);
1255 if (bUseGPU)
1257 /* initialize local pair-list on the GPU */
1258 nbnxn_gpu_init_pairlist(nbv->gpu_nbv,
1259 nbv->grp[eintLocal].nbl_lists.nblGpu[0],
1260 eintLocal);
1262 wallcycle_stop(wcycle, ewcNS);
1264 else
1266 nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs.get(), eatLocal, FALSE, as_rvec_array(x.unpaddedArrayRef().data()),
1267 nbv->nbat, wcycle);
1270 if (bUseGPU)
1272 if (DOMAINDECOMP(cr))
1274 ddOpenBalanceRegionGpu(cr->dd);
1277 wallcycle_start(wcycle, ewcLAUNCH_GPU);
1279 wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1280 nbnxn_gpu_copy_xq_to_gpu(nbv->gpu_nbv, nbv->nbat, eatLocal, ppForceWorkload->haveGpuBondedWork);
1281 wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1283 // bonded work not split into separate local and non-local, so with DD
1284 // we can only launch the kernel after non-local coordinates have been received.
1285 if (ppForceWorkload->haveGpuBondedWork && !DOMAINDECOMP(cr))
1287 wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_BONDED);
1288 fr->gpuBonded->launchKernels(fr, flags, box);
1289 wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_BONDED);
1292 /* launch local nonbonded work on GPU */
1293 wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1294 do_nb_verlet(fr, ic, enerd, flags, eintLocal, enbvClearFNo,
1295 step, nrnb, wcycle);
1296 wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1297 wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1300 if (useGpuPme)
1302 // In PME GPU and mixed mode we launch FFT / gather after the
1303 // X copy/transform to allow overlap as well as after the GPU NB
1304 // launch to avoid FFT launch overhead hijacking the CPU and delaying
1305 // the nonbonded kernel.
1306 launchPmeGpuFftAndGather(fr->pmedata, wcycle);
1309 /* Communicate coordinates and sum dipole if necessary +
1310 do non-local pair search */
1311 if (DOMAINDECOMP(cr))
1313 if (bNS)
1315 wallcycle_start_nocount(wcycle, ewcNS);
1316 wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_NONLOCAL);
1318 nbnxn_make_pairlist(nbv->nbs.get(), nbv->nbat,
1319 &top->excls,
1320 nbv->listParams->rlistOuter,
1321 nbv->min_ci_balanced,
1322 &nbv->grp[eintNonlocal].nbl_lists,
1323 eintNonlocal,
1324 nbv->grp[eintNonlocal].kernel_type,
1325 nrnb);
1326 nbv->grp[eintNonlocal].nbl_lists.outerListCreationStep = step;
1327 if (nbv->listParams->useDynamicPruning && !bUseGPU)
1329 nbnxnPrepareListForDynamicPruning(&nbv->grp[eintNonlocal].nbl_lists);
1331 wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_NONLOCAL);
1333 if (nbv->grp[eintNonlocal].kernel_type == nbnxnk8x8x8_GPU)
1335 /* initialize non-local pair-list on the GPU */
1336 nbnxn_gpu_init_pairlist(nbv->gpu_nbv,
1337 nbv->grp[eintNonlocal].nbl_lists.nblGpu[0],
1338 eintNonlocal);
1340 wallcycle_stop(wcycle, ewcNS);
1342 else
1344 dd_move_x(cr->dd, box, x.unpaddedArrayRef(), wcycle);
1346 nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs.get(), eatNonlocal, FALSE, as_rvec_array(x.unpaddedArrayRef().data()),
1347 nbv->nbat, wcycle);
1350 if (bUseGPU)
1352 wallcycle_start(wcycle, ewcLAUNCH_GPU);
1354 /* launch non-local nonbonded tasks on GPU */
1355 wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1356 nbnxn_gpu_copy_xq_to_gpu(nbv->gpu_nbv, nbv->nbat, eatNonlocal, ppForceWorkload->haveGpuBondedWork);
1357 wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1359 if (ppForceWorkload->haveGpuBondedWork)
1361 wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_BONDED);
1362 fr->gpuBonded->launchKernels(fr, flags, box);
1363 wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_BONDED);
1366 wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1367 do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFNo,
1368 step, nrnb, wcycle);
1369 wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1371 wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1375 if (bUseGPU)
1377 /* launch D2H copy-back F */
1378 wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
1379 wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1380 if (DOMAINDECOMP(cr))
1381 {
1382 nbnxn_gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat,
1383 flags, eatNonlocal, ppForceWorkload->haveGpuBondedWork);
1384 }
1385 nbnxn_gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat,
1386 flags, eatLocal, ppForceWorkload->haveGpuBondedWork);
1387 wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1389 if (ppForceWorkload->haveGpuBondedWork && (flags & GMX_FORCE_ENERGY))
1391 wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_BONDED);
1392 fr->gpuBonded->launchEnergyTransfer();
1393 wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_BONDED);
1395 wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1398 if (bStateChanged && inputrecNeedMutot(inputrec))
1400 if (PAR(cr))
1402 gmx_sumd(2*DIM, mu, cr);
1403 ddReopenBalanceRegionCpu(cr->dd);
1406 for (i = 0; i < 2; i++)
1408 for (j = 0; j < DIM; j++)
1410 fr->mu_tot[i][j] = mu[i*DIM + j];
1414 if (fr->efep == efepNO)
1416 copy_rvec(fr->mu_tot[0], mu_tot);
1418 else
1420 for (j = 0; j < DIM; j++)
1422 mu_tot[j] =
1423 (1.0 - lambda[efptCOUL])*fr->mu_tot[0][j] +
1424 lambda[efptCOUL]*fr->mu_tot[1][j];
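// With free-energy perturbation the total dipole is interpolated linearly in
// lambda[efptCOUL] between the A state (fr->mu_tot[0]) and the B state (fr->mu_tot[1]).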
1428 /* Reset energies */
1429 reset_enerdata(enerd);
1430 clear_rvecs(SHIFTS, fr->fshift);
1432 if (DOMAINDECOMP(cr) && !thisRankHasDuty(cr, DUTY_PME))
1434 wallcycle_start(wcycle, ewcPPDURINGPME);
1435 dd_force_flop_start(cr->dd, nrnb);
1438 if (inputrec->bRot)
1440 wallcycle_start(wcycle, ewcROT);
1441 do_rotation(cr, enforcedRotation, box, as_rvec_array(x.unpaddedArrayRef().data()), t, step, bNS);
1442 wallcycle_stop(wcycle, ewcROT);
1445 /* Temporary solution until all routines take PaddedRVecVector */
1446 rvec *const f = as_rvec_array(force.unpaddedArrayRef().data());
1448 /* Start the force cycle counter.
1449 * Note that a different counter is used for dynamic load balancing.
1451 wallcycle_start(wcycle, ewcFORCE);
1453 gmx::ArrayRef<gmx::RVec> forceRef = force.unpaddedArrayRef();
1454 if (bDoForces)
1456 /* If we need to compute the virial, we might need a separate
1457 * force buffer for algorithms for which the virial is calculated
1458 * directly, such as PME.
1460 if ((flags & GMX_FORCE_VIRIAL) && fr->haveDirectVirialContributions)
1462 forceRef = *fr->forceBufferForDirectVirialContributions;
1464 /* TODO: update comment
1465 * We only compute forces on local atoms. Note that vsites can
1466 * spread to non-local atoms, but that part of the buffer is
1467 * cleared separately in the vsite spreading code.
1469 clear_rvecs_omp(forceRef.size(), as_rvec_array(forceRef.data()));
1471 /* Clear the short- and long-range forces */
1472 clear_rvecs_omp(fr->natoms_force_constr, f);
1475 /* forceWithVirial uses the local atom range only */
1476 gmx::ForceWithVirial forceWithVirial(forceRef, (flags & GMX_FORCE_VIRIAL) != 0);
1478 if (inputrec->bPull && pull_have_constraint(inputrec->pull_work))
1480 clear_pull_forces(inputrec->pull_work);
1483 /* We calculate the non-bonded forces, when done on the CPU, here.
1484 * We do this before calling do_force_lowlevel, because in that
1485 * function, the listed forces are calculated before PME, which
1486 * does communication. With this order, non-bonded and listed
1487 * force calculation imbalance can be balanced out by the domain
1488 * decomposition load balancing.
1491 if (!bUseOrEmulGPU)
1493 do_nb_verlet(fr, ic, enerd, flags, eintLocal, enbvClearFYes,
1494 step, nrnb, wcycle);
1497 if (fr->efep != efepNO)
1499 /* Calculate the local and non-local free energy interactions here.
1500 * Happens here on the CPU both with and without GPU.
1502 if (fr->nbv->grp[eintLocal].nbl_lists.nbl_fep[0]->nrj > 0)
1504 do_nb_verlet_fep(&fr->nbv->grp[eintLocal].nbl_lists,
1505 fr, as_rvec_array(x.unpaddedArrayRef().data()), f, mdatoms,
1506 inputrec->fepvals, lambda,
1507 enerd, flags, nrnb, wcycle);
1510 if (DOMAINDECOMP(cr) &&
1511 fr->nbv->grp[eintNonlocal].nbl_lists.nbl_fep[0]->nrj > 0)
1513 do_nb_verlet_fep(&fr->nbv->grp[eintNonlocal].nbl_lists,
1514 fr, as_rvec_array(x.unpaddedArrayRef().data()), f, mdatoms,
1515 inputrec->fepvals, lambda,
1516 enerd, flags, nrnb, wcycle);
1520 if (!bUseOrEmulGPU)
1522 int aloc;
1524 if (DOMAINDECOMP(cr))
1526 do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFNo,
1527 step, nrnb, wcycle);
1530 if (!bUseOrEmulGPU)
1532 aloc = eintLocal;
1534 else
1536 aloc = eintNonlocal;
1539 /* Add all the non-bonded forces to the normal force array.
1540 * This can be split into a local and a non-local part when overlapping
1541 * communication with calculation under domain decomposition.
1543 wallcycle_stop(wcycle, ewcFORCE);
1545 nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), eatAll, nbv->nbat, f, wcycle);
1547 wallcycle_start_nocount(wcycle, ewcFORCE);
1549 /* if there are multiple fshift output buffers reduce them */
1550 if ((flags & GMX_FORCE_VIRIAL) &&
1551 nbv->grp[aloc].nbl_lists.nnbl > 1)
1553 /* This is not in a subcounter because it takes a
1554 negligible and constant-sized amount of time */
1555 nbnxn_atomdata_add_nbat_fshift_to_fshift(nbv->nbat,
1556 fr->fshift);
1560 /* update QMMMrec, if necessary */
1561 if (fr->bQMMM)
1563 update_QMMMrec(cr, fr, as_rvec_array(x.unpaddedArrayRef().data()), mdatoms, box);
1566 /* Compute the bonded and non-bonded energies and optionally forces */
1567 do_force_lowlevel(fr, inputrec, &(top->idef),
1568 cr, ms, nrnb, wcycle, mdatoms,
1569 as_rvec_array(x.unpaddedArrayRef().data()), hist, f, &forceWithVirial, enerd, fcd,
1570 box, inputrec->fepvals, lambda, graph, &(top->excls), fr->mu_tot,
1571 flags, &cycles_pme);
1573 wallcycle_stop(wcycle, ewcFORCE);
1575 computeSpecialForces(fplog, cr, inputrec, awh, enforcedRotation,
1576 step, t, wcycle,
1577 fr->forceProviders, box, x.unpaddedArrayRef(), mdatoms, lambda,
1578 flags, &forceWithVirial, enerd,
1579 ed, bNS);
1581 if (bUseOrEmulGPU)
1583 /* wait for non-local forces (or calculate in emulation mode) */
1584 if (DOMAINDECOMP(cr))
1586 if (bUseGPU)
1588 wallcycle_start(wcycle, ewcWAIT_GPU_NB_NL);
1589 nbnxn_gpu_wait_finish_task(nbv->gpu_nbv,
1590 flags, eatNonlocal,
1591 ppForceWorkload->haveGpuBondedWork,
1592 enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
1593 fr->fshift);
1594 cycles_wait_gpu += wallcycle_stop(wcycle, ewcWAIT_GPU_NB_NL);
1596 else
1598 wallcycle_start_nocount(wcycle, ewcFORCE);
1599 do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFYes,
1600 step, nrnb, wcycle);
1601 wallcycle_stop(wcycle, ewcFORCE);
1604 /* skip the reduction if there was no non-local work to do */
1605 if (nbv->grp[eintNonlocal].nbl_lists.nblGpu[0]->nsci > 0)
1607 nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), eatNonlocal,
1608 nbv->nbat, f, wcycle);
1613 if (DOMAINDECOMP(cr))
1615 /* We are done with the CPU compute.
1616 * We will now communicate the non-local forces.
1617 * If we use a GPU this will overlap with GPU work, so in that case
1618 * we do not close the DD force balancing region here.
1620 if (ddCloseBalanceRegion == DdCloseBalanceRegionAfterForceComputation::yes)
1622 ddCloseBalanceRegionCpu(cr->dd);
1624 if (bDoForces)
1626 dd_move_f(cr->dd, force.unpaddedArrayRef(), fr->fshift, wcycle);
1630 // With both nonbonded and PME offloaded a GPU on the same rank, we use
1631 // an alternating wait/reduction scheme.
1632 bool alternateGpuWait = (!c_disableAlternatingWait && useGpuPme && bUseGPU && !DOMAINDECOMP(cr));
1633 if (alternateGpuWait)
1635 alternatePmeNbGpuWaitReduce(fr->nbv, fr->pmedata, &force, &forceWithVirial, fr->fshift, enerd, flags, pmeFlags, ppForceWorkload->haveGpuBondedWork, wcycle);
1638 if (!alternateGpuWait && useGpuPme)
1640 pme_gpu_wait_and_reduce(fr->pmedata, pmeFlags, wcycle, &forceWithVirial, enerd);
1643 /* Wait for local GPU NB outputs on the non-alternating wait path */
1644 if (!alternateGpuWait && bUseGPU)
1646 /* Measured overhead on CUDA and OpenCL with(out) GPU sharing
1647 * is between 0.5 and 1.5 Mcycles. So 2 MCycles is an overestimate,
1648 * but even with a step of 0.1 ms the difference is less than 1%
1649 * of the step time.
1651 const float gpuWaitApiOverheadMargin = 2e6f; /* cycles */
1653 wallcycle_start(wcycle, ewcWAIT_GPU_NB_L);
1654 nbnxn_gpu_wait_finish_task(nbv->gpu_nbv,
1655 flags, eatLocal, ppForceWorkload->haveGpuBondedWork,
1656 enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
1657 fr->fshift);
1658 float cycles_tmp = wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
1660 if (ddCloseBalanceRegion == DdCloseBalanceRegionAfterForceComputation::yes)
1662 DdBalanceRegionWaitedForGpu waitedForGpu = DdBalanceRegionWaitedForGpu::yes;
1663 if (bDoForces && cycles_tmp <= gpuWaitApiOverheadMargin)
1665 /* We measured few cycles, it could be that the kernel
1666 * and transfer finished earlier and there was no actual
1667 * wait time, only API call overhead.
1668 * Then the actual time could be anywhere between 0 and
1669 * cycles_wait_est. We will use half of cycles_wait_est.
1671 waitedForGpu = DdBalanceRegionWaitedForGpu::no;
1673 ddCloseBalanceRegionGpu(cr->dd, cycles_wait_gpu, waitedForGpu);
1677 if (fr->nbv->emulateGpu == EmulateGpuNonbonded::Yes)
1679 // NOTE: emulation kernel is not included in the balancing region,
1680 // but emulation mode does not target performance anyway
1681 wallcycle_start_nocount(wcycle, ewcFORCE);
1682 do_nb_verlet(fr, ic, enerd, flags, eintLocal,
1683 DOMAINDECOMP(cr) ? enbvClearFNo : enbvClearFYes,
1684 step, nrnb, wcycle);
1685 wallcycle_stop(wcycle, ewcFORCE);
1688 if (useGpuPme)
1690 pme_gpu_reinit_computation(fr->pmedata, wcycle);
1693 if (bUseGPU)
1695 /* now clear the GPU outputs while we finish the step on the CPU */
1696 wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
1697 wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1698 nbnxn_gpu_clear_outputs(nbv->gpu_nbv, flags);
1700 /* Is dynamic pair-list pruning activated? */
1701 if (nbv->listParams->useDynamicPruning)
1703 launchGpuRollingPruning(cr, nbv, inputrec, step);
1705 wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1706 wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1709 if (ppForceWorkload->haveGpuBondedWork && (flags & GMX_FORCE_ENERGY))
1711 wallcycle_start(wcycle, ewcWAIT_GPU_BONDED);
1712 // in principle this should be included in the DD balancing region,
1713 // but generally it is infrequent so we'll omit it for the sake of
1714 // simpler code
1715 fr->gpuBonded->accumulateEnergyTerms(enerd);
1716 wallcycle_stop(wcycle, ewcWAIT_GPU_BONDED);
1718 wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
1719 wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_BONDED);
1720 fr->gpuBonded->clearEnergies();
1721 wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_BONDED);
1722 wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1725 /* Do the nonbonded GPU (or emulation) force buffer reduction
1726 * on the non-alternating path. */
1727 if (bUseOrEmulGPU && !alternateGpuWait)
1729 nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), eatLocal,
1730 nbv->nbat, f, wcycle);
1732 if (DOMAINDECOMP(cr))
1734 dd_force_flop_stop(cr->dd, nrnb);
1737 if (bDoForces)
1739 /* If we have NoVirSum forces, but we do not calculate the virial,
1740 * we sum fr->f_novirsum=f later.
1742 if (vsite && !(fr->haveDirectVirialContributions && !(flags & GMX_FORCE_VIRIAL)))
1744 spread_vsite_f(vsite, as_rvec_array(x.unpaddedArrayRef().data()), f, fr->fshift, FALSE, nullptr, nrnb,
1745 &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr, wcycle);
1748 if (flags & GMX_FORCE_VIRIAL)
1750 /* Calculation of the virial must be done after vsites! */
1751 calc_virial(0, mdatoms->homenr, as_rvec_array(x.unpaddedArrayRef().data()), f,
1752 vir_force, graph, box, nrnb, fr, inputrec->ePBC);
1756 if (PAR(cr) && !thisRankHasDuty(cr, DUTY_PME))
1758 /* In case of node-splitting, the PP nodes receive the long-range
1759 * forces, virial and energy from the PME nodes here.
1761 pme_receive_force_ener(cr, &forceWithVirial, enerd, wcycle);
1764 if (bDoForces)
1766 post_process_forces(cr, step, nrnb, wcycle,
1767 top, box, as_rvec_array(x.unpaddedArrayRef().data()), f, &forceWithVirial,
1768 vir_force, mdatoms, graph, fr, vsite,
1769 flags);
1772 if (flags & GMX_FORCE_ENERGY)
1774 /* Sum the potential energy terms from group contributions */
1775 sum_epot(&(enerd->grpp), enerd->term);
1777 if (!EI_TPI(inputrec->eI))
1779 checkPotentialEnergyValidity(step, *enerd, *inputrec);
1784 static void do_force_cutsGROUP(FILE *fplog,
1785 const t_commrec *cr,
1786 const gmx_multisim_t *ms,
1787 const t_inputrec *inputrec,
1788 gmx::Awh *awh,
1789 gmx_enfrot *enforcedRotation,
1790 int64_t step,
1791 t_nrnb *nrnb,
1792 gmx_wallcycle_t wcycle,
1793 gmx_localtop_t *top,
1794 const gmx_groups_t *groups,
1795 matrix box, gmx::ArrayRefWithPadding<gmx::RVec> x,
1796 history_t *hist,
1797 gmx::ArrayRefWithPadding<gmx::RVec> force,
1798 tensor vir_force,
1799 const t_mdatoms *mdatoms,
1800 gmx_enerdata_t *enerd,
1801 t_fcdata *fcd,
1802 real *lambda,
1803 t_graph *graph,
1804 t_forcerec *fr,
1805 const gmx_vsite_t *vsite,
1806 rvec mu_tot,
1807 double t,
1808 gmx_edsam *ed,
1809 int flags,
1810 DdOpenBalanceRegionBeforeForceComputation ddOpenBalanceRegion,
1811 DdCloseBalanceRegionAfterForceComputation ddCloseBalanceRegion)
1813 int cg0, cg1, i, j;
1814 double mu[2*DIM];
1815 gmx_bool bStateChanged, bNS, bFillGrid, bCalcCGCM;
1816 gmx_bool bDoForces;
1817 float cycles_pme;
1819 const int start = 0;
1820 const int homenr = mdatoms->homenr;
1822 clear_mat(vir_force);
1824 cg0 = 0;
1825 if (DOMAINDECOMP(cr))
1827 cg1 = cr->dd->globalAtomGroupIndices.size();
1829 else
1831 cg1 = top->cgs.nr;
1833 if (fr->n_tpi > 0)
1835 cg1--;
1838 bStateChanged = ((flags & GMX_FORCE_STATECHANGED) != 0);
1839 bNS = ((flags & GMX_FORCE_NS) != 0) && (!fr->bAllvsAll);
1840 /* Should we perform the long-range nonbonded evaluation inside the neighborsearching? */
1841 bFillGrid = (bNS && bStateChanged);
1842 bCalcCGCM = (bFillGrid && !DOMAINDECOMP(cr));
1843 bDoForces = ((flags & GMX_FORCE_FORCES) != 0);
1845 if (bStateChanged)
1847 update_forcerec(fr, box);
1849 if (inputrecNeedMutot(inputrec))
1851 /* Calculate total (local) dipole moment in a temporary common array.
1852 * This makes it possible to sum them over nodes faster.
1854 calc_mu(start, homenr,
1855 x.unpaddedArrayRef(), mdatoms->chargeA, mdatoms->chargeB, mdatoms->nChargePerturbed,
1856 mu, mu+DIM);
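/* Note: mu[0..DIM-1] receives the state-A dipole and mu[DIM..2*DIM-1] the
 * state-B dipole, so the single gmx_sumd(2*DIM, mu, cr) call further down
 * reduces both over the ranks in one reduction.
 */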
1860 if (fr->ePBC != epbcNONE)
1862 /* Compute shift vectors every step,
1863 * because of pressure coupling or box deformation!
1865 if ((flags & GMX_FORCE_DYNAMICBOX) && bStateChanged)
1867 calc_shifts(box, fr->shift_vec);
1870 if (bCalcCGCM)
1872 put_charge_groups_in_box(fplog, cg0, cg1, fr->ePBC, box,
1873 &(top->cgs), as_rvec_array(x.unpaddedArrayRef().data()), fr->cg_cm);
1874 inc_nrnb(nrnb, eNR_CGCM, homenr);
1875 inc_nrnb(nrnb, eNR_RESETX, cg1-cg0);
1877 else if (EI_ENERGY_MINIMIZATION(inputrec->eI) && graph)
1879 unshift_self(graph, box, as_rvec_array(x.unpaddedArrayRef().data()));
1882 else if (bCalcCGCM)
1884 calc_cgcm(fplog, cg0, cg1, &(top->cgs), as_rvec_array(x.unpaddedArrayRef().data()), fr->cg_cm);
1885 inc_nrnb(nrnb, eNR_CGCM, homenr);
1888 if (bCalcCGCM && gmx_debug_at)
1890 pr_rvecs(debug, 0, "cgcm", fr->cg_cm, top->cgs.nr);
1893 #if GMX_MPI
1894 if (!thisRankHasDuty(cr, DUTY_PME))
1896 /* Send particle coordinates to the pme nodes.
1897 * Since this is only implemented for domain decomposition
1898 * and domain decomposition does not use the graph,
1899 * we do not need to worry about shifting.
1901 gmx_pme_send_coordinates(cr, box, as_rvec_array(x.unpaddedArrayRef().data()),
1902 lambda[efptCOUL], lambda[efptVDW],
1903 (flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY)) != 0,
1904 step, wcycle);
1906 #endif /* GMX_MPI */
1908 /* Communicate coordinates and sum dipole if necessary */
1909 if (DOMAINDECOMP(cr))
1911 dd_move_x(cr->dd, box, x.unpaddedArrayRef(), wcycle);
1913 /* No GPU support, no move_x overlap, so reopen the balance region here */
1914 if (ddOpenBalanceRegion == DdOpenBalanceRegionBeforeForceComputation::yes)
1916 ddReopenBalanceRegionCpu(cr->dd);
1920 if (inputrecNeedMutot(inputrec))
1922 if (bStateChanged)
1924 if (PAR(cr))
1926 gmx_sumd(2*DIM, mu, cr);
1927 ddReopenBalanceRegionCpu(cr->dd);
1929 for (i = 0; i < 2; i++)
1931 for (j = 0; j < DIM; j++)
1933 fr->mu_tot[i][j] = mu[i*DIM + j];
1937 if (fr->efep == efepNO)
1939 copy_rvec(fr->mu_tot[0], mu_tot);
1941 else
1943 for (j = 0; j < DIM; j++)
1945 mu_tot[j] =
1946 (1.0 - lambda[efptCOUL])*fr->mu_tot[0][j] + lambda[efptCOUL]*fr->mu_tot[1][j];
1951 /* Reset energies */
1952 reset_enerdata(enerd);
1953 clear_rvecs(SHIFTS, fr->fshift);
1955 if (bNS)
1957 wallcycle_start(wcycle, ewcNS);
1959 if (graph && bStateChanged)
1961 /* Calculate intramolecular shift vectors to make molecules whole */
1962 mk_mshift(fplog, graph, fr->ePBC, box, as_rvec_array(x.unpaddedArrayRef().data()));
1965 /* Do the actual neighbour searching */
1966 ns(fplog, fr, box,
1967 groups, top, mdatoms,
1968 cr, nrnb, bFillGrid);
1970 wallcycle_stop(wcycle, ewcNS);
1973 if (DOMAINDECOMP(cr) && !thisRankHasDuty(cr, DUTY_PME))
1975 wallcycle_start(wcycle, ewcPPDURINGPME);
1976 dd_force_flop_start(cr->dd, nrnb);
1979 if (inputrec->bRot)
1981 wallcycle_start(wcycle, ewcROT);
1982 do_rotation(cr, enforcedRotation, box, as_rvec_array(x.unpaddedArrayRef().data()), t, step, bNS);
1983 wallcycle_stop(wcycle, ewcROT);
1986 /* Temporary solution until all routines take PaddedRVecVector */
1987 rvec *f = as_rvec_array(force.unpaddedArrayRef().data());
1989 /* Start the force cycle counter.
1990 * Note that a different counter is used for dynamic load balancing.
1992 wallcycle_start(wcycle, ewcFORCE);
1994 gmx::ArrayRef<gmx::RVec> forceRef = force.paddedArrayRef();
1995 if (bDoForces)
1997 /* If we need to compute the virial, we might need a separate
1998 * force buffer for algorithms for which the virial is calculated
1999 * separately, such as PME.
2001 if ((flags & GMX_FORCE_VIRIAL) && fr->haveDirectVirialContributions)
2003 forceRef = *fr->forceBufferForDirectVirialContributions;
2005 clear_rvecs_omp(forceRef.size(), as_rvec_array(forceRef.data()));
2008 /* Clear the short- and long-range forces */
2009 clear_rvecs(fr->natoms_force_constr, f);
2012 /* forceWithVirial might need the full force atom range */
2013 gmx::ForceWithVirial forceWithVirial(forceRef, (flags & GMX_FORCE_VIRIAL) != 0);
2015 if (inputrec->bPull && pull_have_constraint(inputrec->pull_work))
2017 clear_pull_forces(inputrec->pull_work);
2020 /* update QMMMrec, if necessary */
2021 if (fr->bQMMM)
2023 update_QMMMrec(cr, fr, as_rvec_array(x.unpaddedArrayRef().data()), mdatoms, box);
2026 /* Compute the bonded and non-bonded energies and optionally forces */
2027 do_force_lowlevel(fr, inputrec, &(top->idef),
2028 cr, ms, nrnb, wcycle, mdatoms,
2029 as_rvec_array(x.unpaddedArrayRef().data()), hist, f, &forceWithVirial, enerd, fcd,
2030 box, inputrec->fepvals, lambda,
2031 graph, &(top->excls), fr->mu_tot,
2032 flags,
2033 &cycles_pme);
2035 wallcycle_stop(wcycle, ewcFORCE);
2037 if (DOMAINDECOMP(cr))
2039 dd_force_flop_stop(cr->dd, nrnb);
2041 if (ddCloseBalanceRegion == DdCloseBalanceRegionAfterForceComputation::yes)
2043 ddCloseBalanceRegionCpu(cr->dd);
2047 computeSpecialForces(fplog, cr, inputrec, awh, enforcedRotation,
2048 step, t, wcycle,
2049 fr->forceProviders, box, x.unpaddedArrayRef(), mdatoms, lambda,
2050 flags, &forceWithVirial, enerd,
2051 ed, bNS);
2053 if (bDoForces)
2055 /* Communicate the forces */
2056 if (DOMAINDECOMP(cr))
2058 dd_move_f(cr->dd, force.unpaddedArrayRef(), fr->fshift, wcycle);
2059 /* Do we need to communicate the separate force array
2060 * for terms that do not contribute to the single sum virial?
2061 * Position restraints and electric fields do not introduce
2062 * inter-cg forces, only full electrostatics methods do.
2063 * When we do not calculate the virial, fr->f_novirsum = f,
2064 * so we have already communicated these forces.
2066 if (EEL_FULL(fr->ic->eeltype) && cr->dd->n_intercg_excl &&
2067 (flags & GMX_FORCE_VIRIAL))
2069 dd_move_f(cr->dd, forceWithVirial.force_, nullptr, wcycle);
2073 /* If we have NoVirSum forces, but we do not calculate the virial,
2074 * we sum fr->f_novirsum=f later.
2076 if (vsite && !(fr->haveDirectVirialContributions && !(flags & GMX_FORCE_VIRIAL)))
2078 spread_vsite_f(vsite, as_rvec_array(x.unpaddedArrayRef().data()), f, fr->fshift, FALSE, nullptr, nrnb,
2079 &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr, wcycle);
2082 if (flags & GMX_FORCE_VIRIAL)
2084 /* Calculation of the virial must be done after vsites! */
2085 calc_virial(0, mdatoms->homenr, as_rvec_array(x.unpaddedArrayRef().data()), f,
2086 vir_force, graph, box, nrnb, fr, inputrec->ePBC);
2090 if (PAR(cr) && !thisRankHasDuty(cr, DUTY_PME))
2092 /* In case of node-splitting, the PP nodes receive the long-range
2093 * forces, virial and energy from the PME nodes here.
2095 pme_receive_force_ener(cr, &forceWithVirial, enerd, wcycle);
2098 if (bDoForces)
2100 post_process_forces(cr, step, nrnb, wcycle,
2101 top, box, as_rvec_array(x.unpaddedArrayRef().data()), f, &forceWithVirial,
2102 vir_force, mdatoms, graph, fr, vsite,
2103 flags);
2106 if (flags & GMX_FORCE_ENERGY)
2108 /* Sum the potential energy terms from group contributions */
2109 sum_epot(&(enerd->grpp), enerd->term);
2111 if (!EI_TPI(inputrec->eI))
2113 checkPotentialEnergyValidity(step, *enerd, *inputrec);
2119 void do_force(FILE *fplog,
2120 const t_commrec *cr,
2121 const gmx_multisim_t *ms,
2122 const t_inputrec *inputrec,
2123 gmx::Awh *awh,
2124 gmx_enfrot *enforcedRotation,
2125 int64_t step,
2126 t_nrnb *nrnb,
2127 gmx_wallcycle_t wcycle,
2128 gmx_localtop_t *top,
2129 const gmx_groups_t *groups,
2130 matrix box,
2131 gmx::ArrayRefWithPadding<gmx::RVec> x, //NOLINT(performance-unnecessary-value-param)
2132 history_t *hist,
2133 gmx::ArrayRefWithPadding<gmx::RVec> force, //NOLINT(performance-unnecessary-value-param)
2134 tensor vir_force,
2135 const t_mdatoms *mdatoms,
2136 gmx_enerdata_t *enerd,
2137 t_fcdata *fcd,
2138 gmx::ArrayRef<real> lambda,
2139 t_graph *graph,
2140 t_forcerec *fr,
2141 gmx::PpForceWorkload *ppForceWorkload,
2142 const gmx_vsite_t *vsite,
2143 rvec mu_tot,
2144 double t,
2145 gmx_edsam *ed,
2146 int flags,
2147 DdOpenBalanceRegionBeforeForceComputation ddOpenBalanceRegion,
2148 DdCloseBalanceRegionAfterForceComputation ddCloseBalanceRegion)
2150 /* modify force flag if not doing nonbonded */
2151 if (!fr->bNonbonded)
2153 flags &= ~GMX_FORCE_NONBONDED;
2156 switch (inputrec->cutoff_scheme)
2158 case ecutsVERLET:
2159 do_force_cutsVERLET(fplog, cr, ms, inputrec,
2160 awh, enforcedRotation, step, nrnb, wcycle,
2161 top,
2162 groups,
2163 box, x, hist,
2164 force, vir_force,
2165 mdatoms,
2166 enerd, fcd,
2167 lambda.data(), graph,
2169 ppForceWorkload,
2170 fr->ic,
2171 vsite, mu_tot,
2172 t, ed,
2173 flags,
2174 ddOpenBalanceRegion,
2175 ddCloseBalanceRegion);
2176 break;
2177 case ecutsGROUP:
2178 do_force_cutsGROUP(fplog, cr, ms, inputrec,
2179 awh, enforcedRotation, step, nrnb, wcycle,
2180 top,
2181 groups,
2182 box, x, hist,
2183 force, vir_force,
2184 mdatoms,
2185 enerd, fcd,
2186 lambda.data(), graph,
2187 fr, vsite, mu_tot,
2188 t, ed,
2189 flags,
2190 ddOpenBalanceRegion,
2191 ddCloseBalanceRegion);
2192 break;
2193 default:
2194 gmx_incons("Invalid cut-off scheme passed!");
2197 /* In case we don't have constraints and are using GPUs, the next balancing
2198 * region starts here.
2199 * Some "special" work at the end of do_force_cuts?, such as vsite spread,
2200 * virial calculation and COM pulling, is thus not included in
2201 * the balance timing, which is ok as most tasks do communication.
2203 if (ddOpenBalanceRegion == DdOpenBalanceRegionBeforeForceComputation::yes)
2205 ddOpenBalanceRegionCpu(cr->dd, DdAllowBalanceRegionReopen::no);
2210 void do_constrain_first(FILE *fplog, gmx::Constraints *constr,
2211 const t_inputrec *ir, const t_mdatoms *md,
2212 t_state *state)
2214 int i, m, start, end;
2215 int64_t step;
2216 real dt = ir->delta_t;
2217 real dvdl_dum;
2218 rvec *savex;
2220 /* We need to allocate one element extra, since we might use
2221 * (unaligned) 4-wide SIMD loads to access rvec entries.
2223 snew(savex, state->natoms + 1);
2225 start = 0;
2226 end = md->homenr;
2228 if (debug)
2230 fprintf(debug, "vcm: start=%d, homenr=%d, end=%d\n",
2231 start, md->homenr, end);
2233 /* Do a first constraint pass to reset the particles... */
2234 step = ir->init_step;
2235 if (fplog)
2237 char buf[STEPSTRSIZE];
2238 fprintf(fplog, "\nConstraining the starting coordinates (step %s)\n",
2239 gmx_step_str(step, buf));
2241 dvdl_dum = 0;
2243 /* constrain the current position */
2244 constr->apply(TRUE, FALSE,
2245 step, 0, 1.0,
2246 state->x.rvec_array(), state->x.rvec_array(), nullptr,
2247 state->box,
2248 state->lambda[efptBONDED], &dvdl_dum,
2249 nullptr, nullptr, gmx::ConstraintVariable::Positions);
2250 if (EI_VV(ir->eI))
2252 /* constrain the initial velocity, and save it */
2253 /* also may be useful if we need the ekin from the half step for velocity Verlet */
2254 constr->apply(TRUE, FALSE,
2255 step, 0, 1.0,
2256 state->x.rvec_array(), state->v.rvec_array(), state->v.rvec_array(),
2257 state->box,
2258 state->lambda[efptBONDED], &dvdl_dum,
2259 nullptr, nullptr, gmx::ConstraintVariable::Velocities);
2261 /* constrain the initial velocities at t-dt/2 */
2262 if (EI_STATE_VELOCITY(ir->eI) && ir->eI != eiVV)
2264 auto x = makeArrayRef(state->x).subArray(start, end);
2265 auto v = makeArrayRef(state->v).subArray(start, end);
2266 for (i = start; (i < end); i++)
2268 for (m = 0; (m < DIM); m++)
2270 /* Reverse the velocity */
2271 v[i][m] = -v[i][m];
2272 /* Store the position at t-dt in buf */
2273 savex[i][m] = x[i][m] + dt*v[i][m];
2276 /* Shake the positions at t=-dt with the positions at t=0
2277 * as reference coordinates.
2279 if (fplog)
2281 char buf[STEPSTRSIZE];
2282 fprintf(fplog, "\nConstraining the coordinates at t0-dt (step %s)\n",
2283 gmx_step_str(step, buf));
2285 dvdl_dum = 0;
2286 constr->apply(TRUE, FALSE,
2287 step, -1, 1.0,
2288 state->x.rvec_array(), savex, nullptr,
2289 state->box,
2290 state->lambda[efptBONDED], &dvdl_dum,
2291 state->v.rvec_array(), nullptr, gmx::ConstraintVariable::Positions);
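/* Note: with the velocities reversed above, savex = x(0) - dt*v(0) is the
 * backward-extrapolated position at t = -dt. Constraining it against the
 * already-constrained x(0), with state->v passed along so the corresponding
 * velocity correction can be applied, leaves starting velocities that are
 * consistent with the constraints for the leap-frog half step.
 */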
2293 for (i = start; i < end; i++)
2295 for (m = 0; m < DIM; m++)
2297 /* Re-reverse the velocities */
2298 v[i][m] = -v[i][m];
2302 sfree(savex);
2306 static void
2307 integrate_table(const real vdwtab[], real scale, int offstart, int rstart, int rend,
2308 double *enerout, double *virout)
2310 double enersum, virsum;
2311 double invscale, invscale2, invscale3;
2312 double r, ea, eb, ec, pa, pb, pc, pd;
2313 double y0, f, g, h;
2314 int ri, offset;
2315 double tabfactor;
2317 invscale = 1.0/scale;
2318 invscale2 = invscale*invscale;
2319 invscale3 = invscale*invscale2;
2321 /* Following summation derived from cubic spline definition,
2322 * Numerical Recipes in C, second edition, pp. 113-116. Exact for
2323 * the cubic spline. We first calculate the negative of the
2324 * energy from rvdw to rvdw_switch, assuming that g(r)=1, and then
2325 * add the more standard, abrupt cutoff correction to that result,
2326 * yielding the long-range correction for a switched function. We
2327 * perform both the pressure and energy loops at the same time for
2328 * simplicity, as the computational cost is low. */
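/* Concretely, assuming the usual Y/F/G/H cubic-spline table layout produced in
 * tables.c: within bin ri the potential is V(eps) = y0 + f*eps + g*eps^2 + h*eps^3
 * with eps in [0,1] and radius r' = (ri + eps)/scale. Substituting into the shell
 * integral gives
 *   int r'^2 V dr' = int_0^1 (ec + eb*eps + ea*eps^2) V(eps) deps
 * with ea = 1/scale^3, eb = 2*r/scale^2, ec = r^2/scale and r = ri/scale, which
 * yields the 1/3, 1/2, ... weight factors in the loop below; the virial sum uses
 * r'^3 = pd + pc*eps + pb*eps^2 + pa*eps^3 against dV/deps in the same way.
 */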
2330 if (offstart == 0)
2332 /* Since the dispersion table has been scaled down by a factor of
2333 * 6.0 and the repulsion by a factor of 12.0 to compensate for the
2334 * c6/c12 parameters inside nbfp[] being scaled up (to save
2335 * flops in kernels), we need to correct for this.
2337 tabfactor = 6.0;
2339 else
2341 tabfactor = 12.0;
2344 enersum = 0.0;
2345 virsum = 0.0;
2346 for (ri = rstart; ri < rend; ++ri)
2348 r = ri*invscale;
2349 ea = invscale3;
2350 eb = 2.0*invscale2*r;
2351 ec = invscale*r*r;
2353 pa = invscale3;
2354 pb = 3.0*invscale2*r;
2355 pc = 3.0*invscale*r*r;
2356 pd = r*r*r;
2358 /* this "8" is from the packing in the vdwtab array - perhaps
2359 should be defined? */
2361 offset = 8*ri + offstart;
2362 y0 = vdwtab[offset];
2363 f = vdwtab[offset+1];
2364 g = vdwtab[offset+2];
2365 h = vdwtab[offset+3];
2367 enersum += y0*(ea/3 + eb/2 + ec) + f*(ea/4 + eb/3 + ec/2) + g*(ea/5 + eb/4 + ec/3) + h*(ea/6 + eb/5 + ec/4);
2368 virsum += f*(pa/4 + pb/3 + pc/2 + pd) + 2*g*(pa/5 + pb/4 + pc/3 + pd/2) + 3*h*(pa/6 + pb/5 + pc/4 + pd/3);
2370 *enerout = 4.0*M_PI*enersum*tabfactor;
2371 *virout = 4.0*M_PI*virsum*tabfactor;
2374 void calc_enervirdiff(FILE *fplog, int eDispCorr, t_forcerec *fr)
2376 double eners[2], virs[2], enersum, virsum;
2377 double r0, rc3, rc9;
2378 int ri0, ri1, i;
2379 real scale, *vdwtab;
2381 fr->enershiftsix = 0;
2382 fr->enershifttwelve = 0;
2383 fr->enerdiffsix = 0;
2384 fr->enerdifftwelve = 0;
2385 fr->virdiffsix = 0;
2386 fr->virdifftwelve = 0;
2388 const interaction_const_t *ic = fr->ic;
2390 if (eDispCorr != edispcNO)
2392 for (i = 0; i < 2; i++)
2394 eners[i] = 0;
2395 virs[i] = 0;
2397 if ((ic->vdw_modifier == eintmodPOTSHIFT) ||
2398 (ic->vdw_modifier == eintmodPOTSWITCH) ||
2399 (ic->vdw_modifier == eintmodFORCESWITCH) ||
2400 (ic->vdwtype == evdwSHIFT) ||
2401 (ic->vdwtype == evdwSWITCH))
2403 if (((ic->vdw_modifier == eintmodPOTSWITCH) ||
2404 (ic->vdw_modifier == eintmodFORCESWITCH) ||
2405 (ic->vdwtype == evdwSWITCH)) && ic->rvdw_switch == 0)
2407 gmx_fatal(FARGS,
2408 "With dispersion correction rvdw-switch can not be zero "
2409 "for vdw-type = %s", evdw_names[ic->vdwtype]);
2412 /* TODO This code depends on the logic in tables.c that
2413 constructs the table layout, which should be made
2414 explicit in future cleanup. */
2415 GMX_ASSERT(fr->dispersionCorrectionTable->interaction == GMX_TABLE_INTERACTION_VDWREP_VDWDISP,
2416 "Dispersion-correction code needs a table with both repulsion and dispersion terms");
2417 scale = fr->dispersionCorrectionTable->scale;
2418 vdwtab = fr->dispersionCorrectionTable->data;
2420 /* Round the cut-offs to exact table values for precision */
2421 ri0 = static_cast<int>(std::floor(ic->rvdw_switch*scale));
2422 ri1 = static_cast<int>(std::ceil(ic->rvdw*scale));
2424 /* The code below has some support for handling force-switching, i.e.
2425 * when the force (instead of potential) is switched over a limited
2426 * region. This leads to a constant shift in the potential inside the
2427 * switching region, which we can handle by adding a constant energy
2428 * term in the force-switch case just like when we do potential-shift.
2430 * For now this is not enabled, but to keep the functionality in the
2431 * code we check separately for switch and shift. When we do force-switch
2432 * the shifting point is rvdw_switch, while it is the cutoff when we
2433 * have a classical potential-shift.
2435 * For a pure potential-shift the potential has a constant shift
2436 * all the way out to the cutoff, and that is it. For other forms
2437 * we need to calculate the constant shift up to the point where we
2438 * start modifying the potential.
2440 ri0 = (ic->vdw_modifier == eintmodPOTSHIFT) ? ri1 : ri0;
2442 r0 = ri0/scale;
2443 rc3 = r0*r0*r0;
2444 rc9 = rc3*rc3*rc3;
2446 if ((ic->vdw_modifier == eintmodFORCESWITCH) ||
2447 (ic->vdwtype == evdwSHIFT))
2449 /* Determine the constant energy shift below rvdw_switch.
2450 * Table has a scale factor since we have scaled it down to compensate
2451 * for scaling-up c6/c12 with the derivative factors to save flops in analytical kernels.
2453 fr->enershiftsix = static_cast<real>(-1.0/(rc3*rc3)) - 6.0*vdwtab[8*ri0];
2454 fr->enershifttwelve = static_cast<real>( 1.0/(rc9*rc3)) - 12.0*vdwtab[8*ri0 + 4];
2456 else if (ic->vdw_modifier == eintmodPOTSHIFT)
2458 fr->enershiftsix = static_cast<real>(-1.0/(rc3*rc3));
2459 fr->enershifttwelve = static_cast<real>( 1.0/(rc9*rc3));
2462 /* Add the constant part from 0 to rvdw_switch.
2463 * This integration from 0 to rvdw_switch overcounts the number
2464 * of interactions by 1, as it also counts the self interaction.
2465 * We will correct for this later.
2467 eners[0] += 4.0*M_PI*fr->enershiftsix*rc3/3.0;
2468 eners[1] += 4.0*M_PI*fr->enershifttwelve*rc3/3.0;
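/* I.e. each pair within r0 picks up the constant shift, so the contribution is
 *   4*pi * int_0^r0 r^2 dr * V_shift = (4*pi/3) * r0^3 * V_shift
 * for the dispersion and repulsion shifts added in the two lines above.
 */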
2470 /* Calculate the contribution in the range [r0,r1] where we
2471 * modify the potential. For a pure potential-shift modifier we will
2472 * have ri0==ri1, and there will not be any contribution here.
2474 for (i = 0; i < 2; i++)
2476 enersum = 0;
2477 virsum = 0;
2478 integrate_table(vdwtab, scale, (i == 0 ? 0 : 4), ri0, ri1, &enersum, &virsum);
2479 eners[i] -= enersum;
2480 virs[i] -= virsum;
2483 /* Alright: Above we compensated by REMOVING the parts outside r0
2484 * corresponding to the ideal VdW 1/r6 and 1/r12 potentials.
2486 * Regardless of whether r0 is the point where we start switching,
2487 * or the cutoff where we calculated the constant shift, we include
2488 * all the parts we are missing out to infinity from r0 by
2489 * calculating the analytical dispersion correction.
2491 eners[0] += -4.0*M_PI/(3.0*rc3);
2492 eners[1] += 4.0*M_PI/(9.0*rc9);
2493 virs[0] += 8.0*M_PI/rc3;
2494 virs[1] += -16.0*M_PI/(3.0*rc9);
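/* These four terms are the analytic tail integrals (per pair, 4*pi included):
 *   energy:  int_r0^inf 4*pi*r^2*(-r^-6)  dr = -4*pi/(3*r0^3)
 *            int_r0^inf 4*pi*r^2*( r^-12) dr =  4*pi/(9*r0^9)
 *   virial (from r*dV/dr):
 *            int_r0^inf 4*pi*r^2*(  6*r^-6 ) dr =   8*pi/r0^3
 *            int_r0^inf 4*pi*r^2*(-12*r^-12) dr = -16*pi/(3*r0^9)
 */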
2496 else if (ic->vdwtype == evdwCUT ||
2497 EVDW_PME(ic->vdwtype) ||
2498 ic->vdwtype == evdwUSER)
2500 if (ic->vdwtype == evdwUSER && fplog)
2502 fprintf(fplog,
2503 "WARNING: using dispersion correction with user tables\n");
2506 /* Note that with LJ-PME, the dispersion correction is multiplied
2507 * by the difference between the actual C6 and the value of C6
2508 * that would produce the combination rule.
2509 * This means the normal energy and virial difference formulas
2510 * can be used here.
2513 rc3 = ic->rvdw*ic->rvdw*ic->rvdw;
2514 rc9 = rc3*rc3*rc3;
2515 /* Contribution beyond the cut-off */
2516 eners[0] += -4.0*M_PI/(3.0*rc3);
2517 eners[1] += 4.0*M_PI/(9.0*rc9);
2518 if (ic->vdw_modifier == eintmodPOTSHIFT)
2520 /* Contribution within the cut-off */
2521 eners[0] += -4.0*M_PI/(3.0*rc3);
2522 eners[1] += 4.0*M_PI/(3.0*rc9);
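/* With potential-shift the simulated potential inside rc is V(r) - V(rc), so the
 * correction picks up an extra (4*pi/3)*rc^3*V(rc) per pair: -4*pi/(3*rc3) for the
 * dispersion term and +4*pi/(3*rc9) for the repulsion term, as added above.
 */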
2524 /* Contribution beyond the cut-off */
2525 virs[0] += 8.0*M_PI/rc3;
2526 virs[1] += -16.0*M_PI/(3.0*rc9);
2528 else
2530 gmx_fatal(FARGS,
2531 "Dispersion correction is not implemented for vdw-type = %s",
2532 evdw_names[ic->vdwtype]);
2535 /* When we deprecate the group kernels the code below can go too */
2536 if (ic->vdwtype == evdwPME && fr->cutoff_scheme == ecutsGROUP)
2538 /* Calculate self-interaction coefficient (assuming that
2539 * the reciprocal-space contribution is constant in the
2540 * region that contributes to the self-interaction).
2542 fr->enershiftsix = gmx::power6(ic->ewaldcoeff_lj) / 6.0;
2544 eners[0] += -gmx::power3(std::sqrt(M_PI)*ic->ewaldcoeff_lj)/3.0;
2545 virs[0] += gmx::power3(std::sqrt(M_PI)*ic->ewaldcoeff_lj);
2548 fr->enerdiffsix = eners[0];
2549 fr->enerdifftwelve = eners[1];
2550 /* The 0.5 is due to the Gromacs definition of the virial */
2551 fr->virdiffsix = 0.5*virs[0];
2552 fr->virdifftwelve = 0.5*virs[1];
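/* (GROMACS defines the virial as Xi = -1/2 sum_ij r_ij (x) F_ij and the pressure
 * as P = 2/V*(Ekin - Xi), hence the 0.5 here and the factor 2 in calc_dispcorr below.)
 */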
2556 void calc_dispcorr(const t_inputrec *ir, const t_forcerec *fr,
2557 const matrix box, real lambda, tensor pres, tensor virial,
2558 real *prescorr, real *enercorr, real *dvdlcorr)
2560 gmx_bool bCorrAll, bCorrPres;
2561 real dvdlambda, invvol, dens, ninter, avcsix, avctwelve, enerdiff, svir = 0, spres = 0;
2562 int m;
2564 *prescorr = 0;
2565 *enercorr = 0;
2566 *dvdlcorr = 0;
2568 clear_mat(virial);
2569 clear_mat(pres);
2571 if (ir->eDispCorr != edispcNO)
2573 bCorrAll = (ir->eDispCorr == edispcAllEner ||
2574 ir->eDispCorr == edispcAllEnerPres);
2575 bCorrPres = (ir->eDispCorr == edispcEnerPres ||
2576 ir->eDispCorr == edispcAllEnerPres);
2578 invvol = 1/det(box);
2579 if (fr->n_tpi)
2581 /* Only correct for the interactions with the inserted molecule */
2582 dens = (fr->numAtomsForDispersionCorrection - fr->n_tpi)*invvol;
2583 ninter = fr->n_tpi;
2585 else
2587 dens = fr->numAtomsForDispersionCorrection*invvol;
2588 ninter = 0.5*fr->numAtomsForDispersionCorrection;
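/* In both cases ninter*dens approximates the pair density: N*(N-1)/(2*V) ~ N^2/(2*V)
 * for a normal run, and n_tpi*(N - n_tpi)/V for test-particle insertion, where only
 * the inserted molecule's interactions are corrected.
 */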
2591 if (ir->efep == efepNO)
2593 avcsix = fr->avcsix[0];
2594 avctwelve = fr->avctwelve[0];
2596 else
2598 avcsix = (1 - lambda)*fr->avcsix[0] + lambda*fr->avcsix[1];
2599 avctwelve = (1 - lambda)*fr->avctwelve[0] + lambda*fr->avctwelve[1];
2602 enerdiff = ninter*(dens*fr->enerdiffsix - fr->enershiftsix);
2603 *enercorr += avcsix*enerdiff;
2604 dvdlambda = 0.0;
2605 if (ir->efep != efepNO)
2607 dvdlambda += (fr->avcsix[1] - fr->avcsix[0])*enerdiff;
2609 if (bCorrAll)
2611 enerdiff = ninter*(dens*fr->enerdifftwelve - fr->enershifttwelve);
2612 *enercorr += avctwelve*enerdiff;
2613 if (fr->efep != efepNO)
2615 dvdlambda += (fr->avctwelve[1] - fr->avctwelve[0])*enerdiff;
2619 if (bCorrPres)
2621 svir = ninter*dens*avcsix*fr->virdiffsix/3.0;
2622 if (ir->eDispCorr == edispcAllEnerPres)
2624 svir += ninter*dens*avctwelve*fr->virdifftwelve/3.0;
2626 /* The factor 2 is because of the Gromacs virial definition */
2627 spres = -2.0*invvol*svir*PRESFAC;
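/* With P = 2/V*(Ekin - Xi), a virial correction svir gives a pressure correction
 * of -2*svir/V; PRESFAC converts from kJ mol^-1 nm^-3 to bar.
 */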
2629 for (m = 0; m < DIM; m++)
2631 virial[m][m] += svir;
2632 pres[m][m] += spres;
2634 *prescorr += spres;
2637 /* Can't currently control when it prints; for now, just print when debugging */
2638 if (debug)
2640 if (bCorrAll)
2642 fprintf(debug, "Long Range LJ corr.: <C6> %10.4e, <C12> %10.4e\n",
2643 avcsix, avctwelve);
2645 if (bCorrPres)
2647 fprintf(debug,
2648 "Long Range LJ corr.: Epot %10g, Pres: %10g, Vir: %10g\n",
2649 *enercorr, spres, svir);
2651 else
2653 fprintf(debug, "Long Range LJ corr.: Epot %10g\n", *enercorr);
2657 if (fr->efep != efepNO)
2659 *dvdlcorr += dvdlambda;
2664 static void low_do_pbc_mtop(FILE *fplog, int ePBC, const matrix box,
2665 const gmx_mtop_t *mtop, rvec x[],
2666 gmx_bool bFirst)
2668 t_graph *graph;
2669 int as, mol;
2671 if (bFirst && fplog)
2673 fprintf(fplog, "Removing pbc first time\n");
2676 snew(graph, 1);
2677 as = 0;
2678 for (const gmx_molblock_t &molb : mtop->molblock)
2680 const gmx_moltype_t &moltype = mtop->moltype[molb.type];
2681 if (moltype.atoms.nr == 1 ||
2682 (!bFirst && moltype.cgs.nr == 1))
2684 /* Just one atom or charge group in the molecule, no PBC required */
2685 as += molb.nmol*moltype.atoms.nr;
2687 else
2689 mk_graph_moltype(moltype, graph);
2691 for (mol = 0; mol < molb.nmol; mol++)
2693 mk_mshift(fplog, graph, ePBC, box, x+as);
2695 shift_self(graph, box, x+as);
2696 /* The molecule is whole now.
2697 * We don't need the second mk_mshift call as in do_pbc_first,
2698 * since we no longer need this graph.
2701 as += moltype.atoms.nr;
2703 done_graph(graph);
2706 sfree(graph);
2709 void do_pbc_first_mtop(FILE *fplog, int ePBC, const matrix box,
2710 const gmx_mtop_t *mtop, rvec x[])
2712 low_do_pbc_mtop(fplog, ePBC, box, mtop, x, TRUE);
2715 void do_pbc_mtop(FILE *fplog, int ePBC, const matrix box,
2716 const gmx_mtop_t *mtop, rvec x[])
2718 low_do_pbc_mtop(fplog, ePBC, box, mtop, x, FALSE);
2721 void put_atoms_in_box_omp(int ePBC, const matrix box, gmx::ArrayRef<gmx::RVec> x)
2723 int t, nth;
2724 nth = gmx_omp_nthreads_get(emntDefault);
2726 #pragma omp parallel for num_threads(nth) schedule(static)
2727 for (t = 0; t < nth; t++)
2731 size_t natoms = x.size();
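/* Static split of [0, natoms): computing both bounds as (natoms*t)/nth makes the
 * per-thread chunks tile the range exactly, with sizes differing by at most one atom.
 */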
2732 size_t offset = (natoms*t )/nth;
2733 size_t len = (natoms*(t + 1))/nth - offset;
2734 put_atoms_in_box(ePBC, box, x.subArray(offset, len));
2736 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
2740 // TODO This can be cleaned up a lot, and moved back to runner.cpp
2741 void finish_run(FILE *fplog, const gmx::MDLogger &mdlog, const t_commrec *cr,
2742 const t_inputrec *inputrec,
2743 t_nrnb nrnb[], gmx_wallcycle_t wcycle,
2744 gmx_walltime_accounting_t walltime_accounting,
2745 nonbonded_verlet_t *nbv,
2746 const gmx_pme_t *pme,
2747 gmx_bool bWriteStat)
2749 t_nrnb *nrnb_tot = nullptr;
2750 double delta_t = 0;
2751 double nbfs = 0, mflop = 0;
2752 double elapsed_time,
2753 elapsed_time_over_all_ranks,
2754 elapsed_time_over_all_threads,
2755 elapsed_time_over_all_threads_over_all_ranks;
2756 /* Control whether it is valid to print a report. Only the
2757 simulation master may print, but it should not do so if the run
2758 terminated e.g. before a scheduled reset step. This is
2759 complicated by the fact that PME ranks are unaware of the
2760 reason why they were sent a pmerecvqxFINISH. To avoid
2761 communication deadlocks, we always do the communication for the
2762 report, even if we've decided not to write the report, because
2763 how long it takes to finish the run is not important when we've
2764 decided not to report on the simulation performance.
2766 Further, we only report performance for dynamical integrators,
2767 because those are the only ones for which we plan to
2768 consider doing any optimizations. */
2769 bool printReport = EI_DYNAMICS(inputrec->eI) && SIMMASTER(cr);
2771 if (printReport && !walltime_accounting_get_valid_finish(walltime_accounting))
2773 GMX_LOG(mdlog.warning).asParagraph().appendText("Simulation ended prematurely, no performance report will be written.");
2774 printReport = false;
2777 if (cr->nnodes > 1)
2779 snew(nrnb_tot, 1);
2780 #if GMX_MPI
2781 MPI_Allreduce(nrnb->n, nrnb_tot->n, eNRNB, MPI_DOUBLE, MPI_SUM,
2782 cr->mpi_comm_mysim);
2783 #endif
2785 else
2787 nrnb_tot = nrnb;
2790 elapsed_time = walltime_accounting_get_time_since_reset(walltime_accounting);
2791 elapsed_time_over_all_threads = walltime_accounting_get_time_since_reset_over_all_threads(walltime_accounting);
2792 if (cr->nnodes > 1)
2794 #if GMX_MPI
2795 /* reduce elapsed_time over all MPI ranks in the current simulation */
2796 MPI_Allreduce(&elapsed_time,
2797 &elapsed_time_over_all_ranks,
2798 1, MPI_DOUBLE, MPI_SUM,
2799 cr->mpi_comm_mysim);
2800 elapsed_time_over_all_ranks /= cr->nnodes;
2801 /* Reduce elapsed_time_over_all_threads over all MPI ranks in the
2802 * current simulation. */
2803 MPI_Allreduce(&elapsed_time_over_all_threads,
2804 &elapsed_time_over_all_threads_over_all_ranks,
2805 1, MPI_DOUBLE, MPI_SUM,
2806 cr->mpi_comm_mysim);
2807 #endif
2809 else
2811 elapsed_time_over_all_ranks = elapsed_time;
2812 elapsed_time_over_all_threads_over_all_ranks = elapsed_time_over_all_threads;
2815 if (printReport)
2817 print_flop(fplog, nrnb_tot, &nbfs, &mflop);
2819 if (cr->nnodes > 1)
2821 sfree(nrnb_tot);
2824 if (thisRankHasDuty(cr, DUTY_PP) && DOMAINDECOMP(cr))
2826 print_dd_statistics(cr, inputrec, fplog);
2829 /* TODO Move the responsibility for any scaling by thread counts
2830 * to the code that handled the thread region, so that there's a
2831 * mechanism to keep cycle counting working during the transition
2832 * to task parallelism. */
2833 int nthreads_pp = gmx_omp_nthreads_get(emntNonbonded);
2834 int nthreads_pme = gmx_omp_nthreads_get(emntPME);
2835 wallcycle_scale_by_num_threads(wcycle, thisRankHasDuty(cr, DUTY_PME) && !thisRankHasDuty(cr, DUTY_PP), nthreads_pp, nthreads_pme);
2836 auto cycle_sum(wallcycle_sum(cr, wcycle));
2838 if (printReport)
2840 auto nbnxn_gpu_timings = use_GPU(nbv) ? nbnxn_gpu_get_timings(nbv->gpu_nbv) : nullptr;
2841 gmx_wallclock_gpu_pme_t pme_gpu_timings = {};
2842 if (pme_gpu_task_enabled(pme))
2844 pme_gpu_get_timings(pme, &pme_gpu_timings);
2846 wallcycle_print(fplog, mdlog, cr->nnodes, cr->npmenodes, nthreads_pp, nthreads_pme,
2847 elapsed_time_over_all_ranks,
2848 wcycle, cycle_sum,
2849 nbnxn_gpu_timings,
2850 &pme_gpu_timings);
2852 if (EI_DYNAMICS(inputrec->eI))
2854 delta_t = inputrec->delta_t;
2857 if (fplog)
2859 print_perf(fplog, elapsed_time_over_all_threads_over_all_ranks,
2860 elapsed_time_over_all_ranks,
2861 walltime_accounting_get_nsteps_done_since_reset(walltime_accounting),
2862 delta_t, nbfs, mflop);
2864 if (bWriteStat)
2866 print_perf(stderr, elapsed_time_over_all_threads_over_all_ranks,
2867 elapsed_time_over_all_ranks,
2868 walltime_accounting_get_nsteps_done_since_reset(walltime_accounting),
2869 delta_t, nbfs, mflop);
2874 extern void initialize_lambdas(FILE *fplog, t_inputrec *ir, int *fep_state, gmx::ArrayRef<real> lambda, double *lam0)
2876 /* this function works, but could probably use a logic rewrite to keep all the different
2877 types of efep straight. */
2879 if ((ir->efep == efepNO) && (!ir->bSimTemp))
2881 return;
2884 t_lambda *fep = ir->fepvals;
2885 *fep_state = fep->init_fep_state; /* this might overwrite the value from the checkpoint
2886 if a checkpoint is read -- a kludge is in place for now
2887 to prevent this. */
2889 for (int i = 0; i < efptNR; i++)
2891 /* overwrite lambda state with init_lambda for now for backwards compatibility */
2892 if (fep->init_lambda >= 0) /* if it's -1, it was never initialized */
2894 lambda[i] = fep->init_lambda;
2895 if (lam0)
2897 lam0[i] = lambda[i];
2900 else
2902 lambda[i] = fep->all_lambda[i][*fep_state];
2903 if (lam0)
2905 lam0[i] = lambda[i];
2909 if (ir->bSimTemp)
2911 /* need to rescale control temperatures to match current state */
2912 for (int i = 0; i < ir->opts.ngtc; i++)
2914 if (ir->opts.ref_t[i] > 0)
2916 ir->opts.ref_t[i] = ir->simtempvals->temperatures[*fep_state];
2921 /* Send to the log the information on the current lambdas */
2922 if (fplog != nullptr)
2924 fprintf(fplog, "Initial vector of lambda components:[ ");
2925 for (const auto &l : lambda)
2927 fprintf(fplog, "%10.4f ", l);
2929 fprintf(fplog, "]\n");
2934 void init_md(FILE *fplog,
2935 const t_commrec *cr, gmx::IMDOutputProvider *outputProvider,
2936 t_inputrec *ir, const gmx_output_env_t *oenv,
2937 const MdrunOptions &mdrunOptions,
2938 double *t, double *t0,
2939 t_state *globalState, double *lam0,
2940 t_nrnb *nrnb, gmx_mtop_t *mtop,
2941 gmx_update_t **upd,
2942 gmx::BoxDeformation *deform,
2943 int nfile, const t_filenm fnm[],
2944 gmx_mdoutf_t *outf, t_mdebin **mdebin,
2945 tensor force_vir, tensor shake_vir,
2946 tensor total_vir, tensor pres, rvec mu_tot,
2947 gmx_bool *bSimAnn,
2948 gmx_wallcycle_t wcycle)
2950 int i;
2952 /* Initial values */
2953 *t = *t0 = ir->init_t;
2955 *bSimAnn = FALSE;
2956 for (i = 0; i < ir->opts.ngtc; i++)
2958 /* set bSimAnn if any group is being annealed */
2959 if (ir->opts.annealing[i] != eannNO)
2961 *bSimAnn = TRUE;
2965 /* Initialize lambda variables */
2966 /* TODO: Clean up initialization of fep_state and lambda in t_state.
2967 * We currently need to call initialize_lambdas on non-master ranks
2968 * to initialize lam0.
2970 if (MASTER(cr))
2972 initialize_lambdas(fplog, ir, &globalState->fep_state, globalState->lambda, lam0);
2974 else
2976 int tmpFepState;
2977 std::array<real, efptNR> tmpLambda;
2978 initialize_lambdas(fplog, ir, &tmpFepState, tmpLambda, lam0);
2981 // TODO upd is never NULL in practice, but the analysers don't know that
2982 if (upd)
2984 *upd = init_update(ir, deform);
2986 if (*bSimAnn)
2988 update_annealing_target_temp(ir, ir->init_t, upd ? *upd : nullptr);
2991 if (EI_DYNAMICS(ir->eI) && !mdrunOptions.continuationOptions.appendFiles)
2993 if (ir->etc == etcBERENDSEN)
2995 please_cite(fplog, "Berendsen84a");
2997 if (ir->etc == etcVRESCALE)
2999 please_cite(fplog, "Bussi2007a");
3001 if (ir->eI == eiSD1)
3003 please_cite(fplog, "Goga2012");
3006 init_nrnb(nrnb);
3008 if (nfile != -1)
3010 *outf = init_mdoutf(fplog, nfile, fnm, mdrunOptions, cr, outputProvider, ir, mtop, oenv, wcycle);
3012 *mdebin = init_mdebin(mdrunOptions.continuationOptions.appendFiles ? nullptr : mdoutf_get_fp_ene(*outf),
3013 mtop, ir, mdoutf_get_fp_dhdl(*outf));
3016 /* Initiate variables */
3017 clear_mat(force_vir);
3018 clear_mat(shake_vir);
3019 clear_rvec(mu_tot);
3020 clear_mat(total_vir);
3021 clear_mat(pres);
3024 void init_rerun(FILE *fplog,
3025 const t_commrec *cr, gmx::IMDOutputProvider *outputProvider,
3026 t_inputrec *ir, const gmx_output_env_t *oenv,
3027 const MdrunOptions &mdrunOptions,
3028 t_state *globalState, double *lam0,
3029 t_nrnb *nrnb, gmx_mtop_t *mtop,
3030 int nfile, const t_filenm fnm[],
3031 gmx_mdoutf_t *outf, t_mdebin **mdebin,
3032 gmx_wallcycle_t wcycle)
3034 /* Initialize lambda variables */
3035 /* TODO: Clean up initialization of fep_state and lambda in t_state.
3036 * We currently need to call initialize_lambdas on non-master ranks
3037 * to initialize lam0.
3039 if (MASTER(cr))
3041 initialize_lambdas(fplog, ir, &globalState->fep_state, globalState->lambda, lam0);
3043 else
3045 int tmpFepState;
3046 std::array<real, efptNR> tmpLambda;
3047 initialize_lambdas(fplog, ir, &tmpFepState, tmpLambda, lam0);
3050 init_nrnb(nrnb);
3052 if (nfile != -1)
3054 *outf = init_mdoutf(fplog, nfile, fnm, mdrunOptions, cr, outputProvider, ir, mtop, oenv, wcycle);
3055 *mdebin = init_mdebin(mdrunOptions.continuationOptions.appendFiles ? nullptr : mdoutf_get_fp_ene(*outf),
3056 mtop, ir, mdoutf_get_fp_dhdl(*outf), true);