Hide internals of nbnxm parlist
[gromacs.git] / src / gromacs / mdlib / sim_util.cpp
blob f8f7c388aeb2503fd46a93443046828c1fa56214
1 /*
2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5 * Copyright (c) 2001-2004, The GROMACS development team.
6 * Copyright (c) 2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by
7 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
8 * and including many others, as listed in the AUTHORS file in the
9 * top-level source directory and at http://www.gromacs.org.
11 * GROMACS is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public License
13 * as published by the Free Software Foundation; either version 2.1
14 * of the License, or (at your option) any later version.
16 * GROMACS is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * Lesser General Public License for more details.
21 * You should have received a copy of the GNU Lesser General Public
22 * License along with GROMACS; if not, see
23 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
24 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
26 * If you want to redistribute modifications to GROMACS, please
27 * consider that scientific software is very special. Version
28 * control is crucial - bugs must be traceable. We will be happy to
29 * consider code for inclusion in the official distribution, but
30 * derived work must not be called official GROMACS. Details are found
31 * in the README & COPYING files - if they are missing, get the
32 * official version at http://www.gromacs.org.
34 * To help us fund GROMACS development, we humbly ask that you cite
35 * the research papers on the package. Check out http://www.gromacs.org.
37 #include "gmxpre.h"
39 #include "sim_util.h"
41 #include "config.h"
43 #include <cmath>
44 #include <cstdint>
45 #include <cstdio>
46 #include <cstring>
48 #include <array>
50 #include "gromacs/awh/awh.h"
51 #include "gromacs/domdec/dlbtiming.h"
52 #include "gromacs/domdec/domdec.h"
53 #include "gromacs/domdec/domdec_struct.h"
54 #include "gromacs/domdec/partition.h"
55 #include "gromacs/essentialdynamics/edsam.h"
56 #include "gromacs/ewald/pme.h"
57 #include "gromacs/gmxlib/chargegroup.h"
58 #include "gromacs/gmxlib/network.h"
59 #include "gromacs/gmxlib/nrnb.h"
60 #include "gromacs/gmxlib/nonbonded/nb_free_energy.h"
61 #include "gromacs/gmxlib/nonbonded/nb_kernel.h"
62 #include "gromacs/gmxlib/nonbonded/nonbonded.h"
63 #include "gromacs/gpu_utils/gpu_utils.h"
64 #include "gromacs/imd/imd.h"
65 #include "gromacs/listed_forces/bonded.h"
66 #include "gromacs/listed_forces/disre.h"
67 #include "gromacs/listed_forces/gpubonded.h"
68 #include "gromacs/listed_forces/manage_threading.h"
69 #include "gromacs/listed_forces/orires.h"
70 #include "gromacs/math/arrayrefwithpadding.h"
71 #include "gromacs/math/functions.h"
72 #include "gromacs/math/units.h"
73 #include "gromacs/math/vec.h"
74 #include "gromacs/math/vecdump.h"
75 #include "gromacs/mdlib/calcmu.h"
76 #include "gromacs/mdlib/calcvir.h"
77 #include "gromacs/mdlib/constr.h"
78 #include "gromacs/mdlib/force.h"
79 #include "gromacs/mdlib/forcerec.h"
80 #include "gromacs/mdlib/gmx_omp_nthreads.h"
81 #include "gromacs/mdlib/mdrun.h"
82 #include "gromacs/mdlib/ppforceworkload.h"
83 #include "gromacs/mdlib/qmmm.h"
84 #include "gromacs/mdlib/update.h"
85 #include "gromacs/mdtypes/commrec.h"
86 #include "gromacs/mdtypes/enerdata.h"
87 #include "gromacs/mdtypes/forceoutput.h"
88 #include "gromacs/mdtypes/iforceprovider.h"
89 #include "gromacs/mdtypes/inputrec.h"
90 #include "gromacs/mdtypes/md_enums.h"
91 #include "gromacs/mdtypes/state.h"
92 #include "gromacs/nbnxm/atomdata.h"
93 #include "gromacs/nbnxm/gpu_data_mgmt.h"
94 #include "gromacs/nbnxm/nbnxm.h"
95 #include "gromacs/pbcutil/ishift.h"
96 #include "gromacs/pbcutil/mshift.h"
97 #include "gromacs/pbcutil/pbc.h"
98 #include "gromacs/pulling/pull.h"
99 #include "gromacs/pulling/pull_rotation.h"
100 #include "gromacs/timing/cyclecounter.h"
101 #include "gromacs/timing/gpu_timing.h"
102 #include "gromacs/timing/wallcycle.h"
103 #include "gromacs/timing/wallcyclereporting.h"
104 #include "gromacs/timing/walltime_accounting.h"
105 #include "gromacs/topology/topology.h"
106 #include "gromacs/utility/arrayref.h"
107 #include "gromacs/utility/basedefinitions.h"
108 #include "gromacs/utility/cstringutil.h"
109 #include "gromacs/utility/exceptions.h"
110 #include "gromacs/utility/fatalerror.h"
111 #include "gromacs/utility/gmxassert.h"
112 #include "gromacs/utility/gmxmpi.h"
113 #include "gromacs/utility/logger.h"
114 #include "gromacs/utility/smalloc.h"
115 #include "gromacs/utility/strconvert.h"
116 #include "gromacs/utility/sysinfo.h"
118 // TODO: this environment variable allows us to verify before release
119 // that on less common architectures the total cost of polling is not larger than
120 // a blocking wait (so polling does not introduce overhead when the static
121 // PME-first ordering would suffice).
122 static const bool c_disableAlternatingWait = (getenv("GMX_DISABLE_ALTERNATING_GPU_WAIT") != nullptr);
125 void print_time(FILE *out,
126 gmx_walltime_accounting_t walltime_accounting,
127 int64_t step,
128 const t_inputrec *ir,
129 const t_commrec *cr)
131 time_t finish;
132 double dt, elapsed_seconds, time_per_step;
134 #if !GMX_THREAD_MPI
135 if (!PAR(cr))
136 #endif
138 fprintf(out, "\r");
140 fputs("step ", out);
141 fputs(gmx::int64ToString(step).c_str(), out);
142 fflush(out);
144 if ((step >= ir->nstlist))
146 double seconds_since_epoch = gmx_gettime();
147 elapsed_seconds = seconds_since_epoch - walltime_accounting_get_start_time_stamp(walltime_accounting);
148 time_per_step = elapsed_seconds/(step - ir->init_step + 1);
149 dt = (ir->nsteps + ir->init_step - step) * time_per_step;
151 if (ir->nsteps >= 0)
153 if (dt >= 300)
155 finish = static_cast<time_t>(seconds_since_epoch + dt);
156 auto timebuf = gmx_ctime_r(&finish);
157 timebuf.erase(timebuf.find_first_of('\n'));
158 fputs(", will finish ", out);
159 fputs(timebuf.c_str(), out);
161 else
163 fprintf(out, ", remaining wall clock time: %5d s ", static_cast<int>(dt));
166 else
168 fprintf(out, " performance: %.1f ns/day ",
169 ir->delta_t/1000*24*60*60/time_per_step);
172 #if !GMX_THREAD_MPI
173 if (PAR(cr))
175 fprintf(out, "\n");
177 #else
178 GMX_UNUSED_VALUE(cr);
179 #endif
181 fflush(out);
184 void print_date_and_time(FILE *fplog, int nodeid, const char *title,
185 double the_time)
187 if (!fplog)
189 return;
192 time_t temp_time = static_cast<time_t>(the_time);
194 auto timebuf = gmx_ctime_r(&temp_time);
196 fprintf(fplog, "%s on rank %d %s\n", title, nodeid, timebuf.c_str());
199 void print_start(FILE *fplog, const t_commrec *cr,
200 gmx_walltime_accounting_t walltime_accounting,
201 const char *name)
203 char buf[STRLEN];
205 sprintf(buf, "Started %s", name);
206 print_date_and_time(fplog, cr->nodeid, buf,
207 walltime_accounting_get_start_time_stamp(walltime_accounting));
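/*! \brief Add \p forceToAdd to \p f for all entries of \p forceToAdd, parallelized over OpenMP threads */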
210 static void sum_forces(rvec f[], gmx::ArrayRef<const gmx::RVec> forceToAdd)
212 const int end = forceToAdd.size();
214 int gmx_unused nt = gmx_omp_nthreads_get(emntDefault);
215 #pragma omp parallel for num_threads(nt) schedule(static)
216 for (int i = 0; i < end; i++)
218 rvec_inc(f[i], forceToAdd[i]);
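/*! \brief Compute the short-range virial: the shift-force contribution from surrounding boxes plus the per-atom contribution for the local atoms */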
222 static void calc_virial(int start, int homenr, const rvec x[], const rvec f[],
223 tensor vir_part, const t_graph *graph, const matrix box,
224 t_nrnb *nrnb, const t_forcerec *fr, int ePBC)
226 /* The short-range virial from surrounding boxes */
227 calc_vir(SHIFTS, fr->shift_vec, fr->fshift, vir_part, ePBC == epbcSCREW, box);
228 inc_nrnb(nrnb, eNR_VIRIAL, SHIFTS);
230 /* Calculate partial virial, for local atoms only, based on short range.
231 * Total virial is computed in global_stat, called from do_md
233 f_calc_vir(start, start+homenr, x, f, vir_part, graph, box);
234 inc_nrnb(nrnb, eNR_VIRIAL, homenr);
236 if (debug)
238 pr_rvecs(debug, 0, "vir_part", vir_part, DIM);
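/*! \brief Compute the COM pull potential, adding its energy, forces and dV/dlambda to \p enerd and \p force */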
242 static void pull_potential_wrapper(const t_commrec *cr,
243 const t_inputrec *ir,
244 const matrix box, gmx::ArrayRef<const gmx::RVec> x,
245 gmx::ForceWithVirial *force,
246 const t_mdatoms *mdatoms,
247 gmx_enerdata_t *enerd,
248 const real *lambda,
249 double t,
250 gmx_wallcycle_t wcycle)
252 t_pbc pbc;
253 real dvdl;
255 /* Calculate the center of mass forces; this requires communication,
256 * which is why pull_potential is called close to other communication.
258 wallcycle_start(wcycle, ewcPULLPOT);
259 set_pbc(&pbc, ir->ePBC, box);
260 dvdl = 0;
261 enerd->term[F_COM_PULL] +=
262 pull_potential(ir->pull_work, mdatoms, &pbc,
263 cr, t, lambda[efptRESTRAINT], as_rvec_array(x.data()), force, &dvdl);
264 enerd->dvdl_lin[efptRESTRAINT] += dvdl;
265 wallcycle_stop(wcycle, ewcPULLPOT);
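/*! \brief Receive forces, virial, energies and dV/dlambda from a separate PME rank and accumulate them into \p forceWithVirial and \p enerd */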
268 static void pme_receive_force_ener(const t_commrec *cr,
269 gmx::ForceWithVirial *forceWithVirial,
270 gmx_enerdata_t *enerd,
271 gmx_wallcycle_t wcycle)
273 real e_q, e_lj, dvdl_q, dvdl_lj;
274 float cycles_ppdpme, cycles_seppme;
276 cycles_ppdpme = wallcycle_stop(wcycle, ewcPPDURINGPME);
277 dd_cycles_add(cr->dd, cycles_ppdpme, ddCyclPPduringPME);
279 /* In case of node-splitting, the PP nodes receive the long-range
280 * forces, virial and energy from the PME nodes here.
282 wallcycle_start(wcycle, ewcPP_PMEWAITRECVF);
283 dvdl_q = 0;
284 dvdl_lj = 0;
285 gmx_pme_receive_f(cr, forceWithVirial, &e_q, &e_lj, &dvdl_q, &dvdl_lj,
286 &cycles_seppme);
287 enerd->term[F_COUL_RECIP] += e_q;
288 enerd->term[F_LJ_RECIP] += e_lj;
289 enerd->dvdl_lin[efptCOUL] += dvdl_q;
290 enerd->dvdl_lin[efptVDW] += dvdl_lj;
292 if (wcycle)
294 dd_cycles_add(cr->dd, cycles_seppme, ddCyclPME);
296 wallcycle_stop(wcycle, ewcPP_PMEWAITRECVF);
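/*! \brief Print the atoms with a force magnitude above \p forceTolerance and exit with a fatal error when non-finite forces are detected */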
299 static void print_large_forces(FILE *fp,
300 const t_mdatoms *md,
301 const t_commrec *cr,
302 int64_t step,
303 real forceTolerance,
304 const rvec *x,
305 const rvec *f)
307 real force2Tolerance = gmx::square(forceTolerance);
308 gmx::index numNonFinite = 0;
309 for (int i = 0; i < md->homenr; i++)
311 real force2 = norm2(f[i]);
312 bool nonFinite = !std::isfinite(force2);
313 if (force2 >= force2Tolerance || nonFinite)
315 fprintf(fp, "step %" PRId64 " atom %6d x %8.3f %8.3f %8.3f force %12.5e\n",
316 step,
317 ddglatnr(cr->dd, i), x[i][XX], x[i][YY], x[i][ZZ], std::sqrt(force2));
319 if (nonFinite)
321 numNonFinite++;
324 if (numNonFinite > 0)
326 /* Note that with MPI this fatal call on one rank might interrupt
327 * the printing on other ranks. But we can only avoid that with
328 * an expensive MPI barrier that we would need at each step.
330 gmx_fatal(FARGS, "At step %" PRId64 " detected non-finite forces on %td atoms", step, numNonFinite);
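/*! \brief Post-process forces: spread direct-virial (e.g. PME mesh) forces from virtual sites, add them to \p f, accumulate their virial, and optionally print large forces */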
334 static void post_process_forces(const t_commrec *cr,
335 int64_t step,
336 t_nrnb *nrnb,
337 gmx_wallcycle_t wcycle,
338 const gmx_localtop_t *top,
339 const matrix box,
340 const rvec x[],
341 rvec f[],
342 gmx::ForceWithVirial *forceWithVirial,
343 tensor vir_force,
344 const t_mdatoms *mdatoms,
345 const t_graph *graph,
346 const t_forcerec *fr,
347 const gmx_vsite_t *vsite,
348 int flags)
350 if (fr->haveDirectVirialContributions)
352 rvec *fDirectVir = as_rvec_array(forceWithVirial->force_.data());
354 if (vsite)
356 /* Spread the mesh force on virtual sites to the other particles...
357 * This is parallelized. MPI communication is performed
358 * if the constructing atoms aren't local.
360 matrix virial = { { 0 } };
361 spread_vsite_f(vsite, x, fDirectVir, nullptr,
362 (flags & GMX_FORCE_VIRIAL) != 0, virial,
363 nrnb,
364 &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr, wcycle);
365 forceWithVirial->addVirialContribution(virial);
368 if (flags & GMX_FORCE_VIRIAL)
370 /* Now add the forces, this is local */
371 sum_forces(f, forceWithVirial->force_);
373 /* Add the direct virial contributions */
374 GMX_ASSERT(forceWithVirial->computeVirial_, "forceWithVirial should request virial computation when we request the virial");
375 m_add(vir_force, forceWithVirial->getVirial(), vir_force);
377 if (debug)
379 pr_rvecs(debug, 0, "vir_force", vir_force, DIM);
384 if (fr->print_force >= 0)
386 print_large_forces(stderr, mdatoms, cr, step, fr->print_force, x, f);
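/*! \brief Dispatch the nonbonded kernel for the given interaction locality; on CPU runs this also prunes the pair list at dynamic pruning steps */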
390 static void do_nb_verlet(t_forcerec *fr,
391 const interaction_const_t *ic,
392 gmx_enerdata_t *enerd,
393 const int flags,
394 const Nbnxm::InteractionLocality ilocality,
395 const int clearF,
396 const int64_t step,
397 t_nrnb *nrnb,
398 gmx_wallcycle_t wcycle)
400 if (!(flags & GMX_FORCE_NONBONDED))
402 /* skip non-bonded calculation */
403 return;
406 nonbonded_verlet_t *nbv = fr->nbv;
408 /* GPU kernel launch overhead is already timed separately */
409 if (fr->cutoff_scheme != ecutsVERLET)
411 gmx_incons("Invalid cut-off scheme passed!");
414 if (!nbv->useGpu())
416 /* When dynamic pair-list pruning is requested, we need to prune
417 * at nstlistPrune steps.
419 if (nbv->pairlistSets().isDynamicPairlistPruningStep(step))
421 /* Prune the pair-list beyond fr->ic->rlistPrune using
422 * the current coordinates of the atoms.
424 wallcycle_sub_start(wcycle, ewcsNONBONDED_PRUNING);
425 nbv->dispatchPruneKernel(ilocality, fr->shift_vec);
426 wallcycle_sub_stop(wcycle, ewcsNONBONDED_PRUNING);
429 wallcycle_sub_start(wcycle, ewcsNONBONDED);
432 nbv->dispatchNonbondedKernel(ilocality, *ic, flags, clearF, fr, enerd, nrnb);
434 if (!nbv->useGpu())
436 wallcycle_sub_stop(wcycle, ewcsNONBONDED);
440 gmx_bool use_GPU(const nonbonded_verlet_t *nbv)
442 return nbv != nullptr && nbv->useGpu();
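/*! \brief Zero the first \p n elements of \p v, using OpenMP threads when the task is large enough */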
445 static inline void clear_rvecs_omp(int n, rvec v[])
447 int nth = gmx_omp_nthreads_get_simple_rvec_task(emntDefault, n);
449 /* Note that we would like to avoid this conditional by putting it
450 * into the omp pragma instead, but then we still take the full
451 * omp parallel for overhead (at least with gcc5).
453 if (nth == 1)
455 for (int i = 0; i < n; i++)
457 clear_rvec(v[i]);
460 else
462 #pragma omp parallel for num_threads(nth) schedule(static)
463 for (int i = 0; i < n; i++)
465 clear_rvec(v[i]);
470 /*! \brief Return an estimate of the average kinetic energy or 0 when unreliable
472 * \param groupOptions Group options, containing T-coupling options
474 static real averageKineticEnergyEstimate(const t_grpopts &groupOptions)
476 real nrdfCoupled = 0;
477 real nrdfUncoupled = 0;
478 real kineticEnergy = 0;
479 for (int g = 0; g < groupOptions.ngtc; g++)
481 if (groupOptions.tau_t[g] >= 0)
483 nrdfCoupled += groupOptions.nrdf[g];
484 kineticEnergy += groupOptions.nrdf[g]*0.5*groupOptions.ref_t[g]*BOLTZ;
486 else
488 nrdfUncoupled += groupOptions.nrdf[g];
492 /* This conditional with > also catches nrdf=0 */
493 if (nrdfCoupled > nrdfUncoupled)
495 return kineticEnergy*(nrdfCoupled + nrdfUncoupled)/nrdfCoupled;
497 else
499 return 0;
503 /*! \brief This routine checks that the potential energy is finite.
505 * Always checks that the potential energy is finite. If step equals
506 * inputrec.init_step, it also checks that the magnitude of the potential energy
507 * is reasonable. Terminates with a fatal error when a check fails.
508 * Note that passing this check does not guarantee finite forces,
509 * since those use slightly different arithmetic. But in most cases
510 * there is just a narrow coordinate range where forces are not finite
511 * and energies are finite.
513 * \param[in] step The step number, used for checking and printing
514 * \param[in] enerd The energy data; the non-bonded group energies need to be added to enerd.term[F_EPOT] before calling this routine
515 * \param[in] inputrec The input record
517 static void checkPotentialEnergyValidity(int64_t step,
518 const gmx_enerdata_t &enerd,
519 const t_inputrec &inputrec)
521 /* Threshold valid for comparing absolute potential energy against
522 * the kinetic energy. Normally one should not consider absolute
523 * potential energy values, but with a factor of one million
524 * we should never get false positives.
526 constexpr real c_thresholdFactor = 1e6;
528 bool energyIsNotFinite = !std::isfinite(enerd.term[F_EPOT]);
529 real averageKineticEnergy = 0;
530 /* We only check for large potential energy at the initial step,
531 * because that is by far the most likely step for this to occur
532 * and because computing the average kinetic energy is not free.
533 * Note: nstcalcenergy >> 1 often does not allow catching large energies
534 * before they become NaN.
536 if (step == inputrec.init_step && EI_DYNAMICS(inputrec.eI))
538 averageKineticEnergy = averageKineticEnergyEstimate(inputrec.opts);
541 if (energyIsNotFinite || (averageKineticEnergy > 0 &&
542 enerd.term[F_EPOT] > c_thresholdFactor*averageKineticEnergy))
544 gmx_fatal(FARGS, "Step %" PRId64 ": The total potential energy is %g, which is %s. The LJ and electrostatic contributions to the energy are %g and %g, respectively. A %s potential energy can be caused by overlapping interactions in bonded interactions or very large%s coordinate values. Usually this is caused by a badly- or non-equilibrated initial configuration, incorrect interactions or parameters in the topology.",
545 step,
546 enerd.term[F_EPOT],
547 energyIsNotFinite ? "not finite" : "extremely high",
548 enerd.term[F_LJ],
549 enerd.term[F_COUL_SR],
550 energyIsNotFinite ? "non-finite" : "very high",
551 energyIsNotFinite ? " or NaN" : "");
555 /*! \brief Compute forces and/or energies for special algorithms
557 * The intention is to collect all calls to algorithms that compute
558 * forces on local atoms only and that do not contribute to the local
559 * virial sum (but add their virial contribution separately).
560 * Eventually these should likely all become ForceProviders.
561 * Within this function the intention is to have algorithms that do
562 * global communication at the end, so global barriers within the MD loop
563 * are as close together as possible.
565 * \param[in] fplog The log file
566 * \param[in] cr The communication record
567 * \param[in] inputrec The input record
568 * \param[in] awh The Awh module (nullptr if none in use).
569 * \param[in] enforcedRotation Enforced rotation module.
570 * \param[in] step The current MD step
571 * \param[in] t The current time
572 * \param[in,out] wcycle Wallcycle accounting struct
573 * \param[in,out] forceProviders Pointer to a list of force providers
574 * \param[in] box The unit cell
575 * \param[in] x The coordinates
576 * \param[in] mdatoms Per atom properties
577 * \param[in] lambda Array of free-energy lambda values
578 * \param[in] forceFlags Flags that tell whether we should compute forces/energies/virial
579 * \param[in,out] forceWithVirial Force and virial buffers
580 * \param[in,out] enerd Energy buffer
581 * \param[in,out] ed Essential dynamics pointer
582 * \param[in] bNS Tells if we did neighbor searching this step, used for ED sampling
584 * \todo Remove bNS, which is used incorrectly.
585 * \todo Convert all other algorithms called here to ForceProviders.
587 static void
588 computeSpecialForces(FILE *fplog,
589 const t_commrec *cr,
590 const t_inputrec *inputrec,
591 gmx::Awh *awh,
592 gmx_enfrot *enforcedRotation,
593 int64_t step,
594 double t,
595 gmx_wallcycle_t wcycle,
596 ForceProviders *forceProviders,
597 matrix box,
598 gmx::ArrayRef<const gmx::RVec> x,
599 const t_mdatoms *mdatoms,
600 real *lambda,
601 int forceFlags,
602 gmx::ForceWithVirial *forceWithVirial,
603 gmx_enerdata_t *enerd,
604 gmx_edsam *ed,
605 gmx_bool bNS)
607 const bool computeForces = (forceFlags & GMX_FORCE_FORCES) != 0;
609 /* NOTE: Currently all ForceProviders only provide forces.
610 * When they also provide energies, remove this conditional.
612 if (computeForces)
614 gmx::ForceProviderInput forceProviderInput(x, *mdatoms, t, box, *cr);
615 gmx::ForceProviderOutput forceProviderOutput(forceWithVirial, enerd);
617 /* Collect forces from modules */
618 forceProviders->calculateForces(forceProviderInput, &forceProviderOutput);
621 if (inputrec->bPull && pull_have_potential(inputrec->pull_work))
623 pull_potential_wrapper(cr, inputrec, box, x,
624 forceWithVirial,
625 mdatoms, enerd, lambda, t,
626 wcycle);
628 if (awh)
630 enerd->term[F_COM_PULL] +=
631 awh->applyBiasForcesAndUpdateBias(inputrec->ePBC, *mdatoms, box,
632 forceWithVirial,
633 t, step, wcycle, fplog);
637 rvec *f = as_rvec_array(forceWithVirial->force_.data());
639 /* Add the forces from enforced rotation potentials (if any) */
640 if (inputrec->bRot)
642 wallcycle_start(wcycle, ewcROTadd);
643 enerd->term[F_COM_PULL] += add_rot_forces(enforcedRotation, f, cr, step, t);
644 wallcycle_stop(wcycle, ewcROTadd);
647 if (ed)
649 /* Note that since init_edsam() is called after the initialization
650 * of forcerec, edsam doesn't request the noVirSum force buffer.
651 * Thus if no other algorithm (e.g. PME) requires it, the forces
652 * here will contribute to the virial.
654 do_flood(cr, inputrec, as_rvec_array(x.data()), f, ed, box, step, bNS);
657 /* Add forces from interactive molecular dynamics (IMD), if bIMD == TRUE. */
658 if (inputrec->bIMD && computeForces)
660 IMD_apply_forces(inputrec->bIMD, inputrec->imd, cr, f, wcycle);
664 /*! \brief Launch the prepare_step and spread stages of PME GPU.
666 * \param[in] pmedata The PME structure
667 * \param[in] box The box matrix
668 * \param[in] x Coordinate array
669 * \param[in] flags Force flags
670 * \param[in] pmeFlags PME flags
671 * \param[in] wcycle The wallcycle structure
673 static inline void launchPmeGpuSpread(gmx_pme_t *pmedata,
674 matrix box,
675 rvec x[],
676 int flags,
677 int pmeFlags,
678 gmx_wallcycle_t wcycle)
680 pme_gpu_prepare_computation(pmedata, (flags & GMX_FORCE_DYNAMICBOX) != 0, box, wcycle, pmeFlags);
681 pme_gpu_launch_spread(pmedata, x, wcycle);
684 /*! \brief Launch the FFT and gather stages of PME GPU
686 * This function only implements setting the output forces (no accumulation).
688 * \param[in] pmedata The PME structure
689 * \param[in] wcycle The wallcycle structure
691 static void launchPmeGpuFftAndGather(gmx_pme_t *pmedata,
692 gmx_wallcycle_t wcycle)
694 pme_gpu_launch_complex_transforms(pmedata, wcycle);
695 pme_gpu_launch_gather(pmedata, wcycle, PmeForceOutputHandling::Set);
698 /*! \brief
699 * Polling wait for either of the PME or nonbonded GPU tasks.
701 * Instead of a static order in waiting for GPU tasks, this function
702 * polls checking which of the two tasks completes first, and does the
703 * associated force buffer reduction overlapped with the other task.
704 * By doing that, unlike static scheduling order, it can always overlap
705 * one of the reductions, regardless of the GPU task completion order.
707 * \param[in] nbv Nonbonded verlet structure
708 * \param[in,out] pmedata PME module data
709 * \param[in,out] force Force array to reduce task outputs into.
710 * \param[in,out] forceWithVirial Force and virial buffers
711 * \param[in,out] fshift Shift force output vector results are reduced into
712 * \param[in,out] enerd Energy data structure results are reduced into
713 * \param[in] flags Force flags
714 * \param[in] pmeFlags PME flags
715 * \param[in] haveOtherWork Tells whether there is other work than non-bonded in the stream(s)
716 * \param[in] wcycle The wallcycle structure
718 static void alternatePmeNbGpuWaitReduce(nonbonded_verlet_t *nbv,
719 gmx_pme_t *pmedata,
720 gmx::ArrayRefWithPadding<gmx::RVec> *force,
721 gmx::ForceWithVirial *forceWithVirial,
722 rvec fshift[],
723 gmx_enerdata_t *enerd,
724 int flags,
725 int pmeFlags,
726 bool haveOtherWork,
727 gmx_wallcycle_t wcycle)
729 bool isPmeGpuDone = false;
730 bool isNbGpuDone = false;
733 gmx::ArrayRef<const gmx::RVec> pmeGpuForces;
735 while (!isPmeGpuDone || !isNbGpuDone)
737 if (!isPmeGpuDone)
739 GpuTaskCompletion completionType = (isNbGpuDone) ? GpuTaskCompletion::Wait : GpuTaskCompletion::Check;
740 isPmeGpuDone = pme_gpu_try_finish_task(pmedata, pmeFlags, wcycle, forceWithVirial, enerd, completionType);
743 if (!isNbGpuDone)
745 GpuTaskCompletion completionType = (isPmeGpuDone) ? GpuTaskCompletion::Wait : GpuTaskCompletion::Check;
746 wallcycle_start_nocount(wcycle, ewcWAIT_GPU_NB_L);
747 isNbGpuDone = Nbnxm::gpu_try_finish_task(nbv->gpu_nbv,
748 flags,
749 Nbnxm::AtomLocality::Local,
750 haveOtherWork,
751 enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
752 fshift, completionType);
753 wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
754 // To get the call count right, when the task finished we
755 // issue a start/stop.
756 // TODO: move the ewcWAIT_GPU_NB_L cycle counting into nbnxn_gpu_try_finish_task()
757 // and ewcNB_XF_BUF_OPS counting into nbnxn_atomdata_add_nbat_f_to_f().
758 if (isNbGpuDone)
760 wallcycle_start(wcycle, ewcWAIT_GPU_NB_L);
761 wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
763 nbv->atomdata_add_nbat_f_to_f(Nbnxm::AtomLocality::Local,
764 as_rvec_array(force->unpaddedArrayRef().data()), wcycle);
770 /*! \brief
771 * Launch the dynamic rolling pruning GPU task.
773 * We currently alternate local/non-local list pruning in odd-even steps
774 * (only pruning every second step without DD).
776 * \param[in] cr The communication record
777 * \param[in] nbv Nonbonded verlet structure
778 * \param[in] inputrec The input record
779 * \param[in] step The current MD step
781 static inline void launchGpuRollingPruning(const t_commrec *cr,
782 const nonbonded_verlet_t *nbv,
783 const t_inputrec *inputrec,
784 const int64_t step)
786 /* We should not launch the rolling pruning kernel at a search
787 * step or just before search steps, since that's useless.
788 * Without domain decomposition we prune at even steps.
789 * With domain decomposition we alternate local and non-local
790 * pruning at even and odd steps.
792 int numRollingParts = nbv->pairlistSets().params().numRollingParts;
793 GMX_ASSERT(numRollingParts == nbv->pairlistSets().params().nstlistPrune/2,
794 "Since we alternate local/non-local at even/odd steps, "
795 "we need numRollingParts<=nstlistPrune/2 for correctness and == for efficiency");
796 int stepWithCurrentList = nbv->pairlistSets().numStepsWithPairlist(step);
797 bool stepIsEven = ((stepWithCurrentList & 1) == 0);
798 if (stepWithCurrentList > 0 &&
799 stepWithCurrentList < inputrec->nstlist - 1 &&
800 (stepIsEven || havePPDomainDecomposition(cr)))
802 Nbnxm::gpu_launch_kernel_pruneonly(nbv->gpu_nbv,
803 stepIsEven ? Nbnxm::InteractionLocality::Local : Nbnxm::InteractionLocality::NonLocal,
804 numRollingParts);
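/*! \brief Compute forces and energies for one step with the Verlet cut-off scheme, launching GPU nonbonded, bonded and PME tasks where enabled, running the CPU work and reducing all force contributions */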
808 static void do_force_cutsVERLET(FILE *fplog,
809 const t_commrec *cr,
810 const gmx_multisim_t *ms,
811 const t_inputrec *inputrec,
812 gmx::Awh *awh,
813 gmx_enfrot *enforcedRotation,
814 int64_t step,
815 t_nrnb *nrnb,
816 gmx_wallcycle_t wcycle,
817 const gmx_localtop_t *top,
818 const gmx_groups_t * /* groups */,
819 matrix box, gmx::ArrayRefWithPadding<gmx::RVec> x,
820 history_t *hist,
821 gmx::ArrayRefWithPadding<gmx::RVec> force,
822 tensor vir_force,
823 const t_mdatoms *mdatoms,
824 gmx_enerdata_t *enerd, t_fcdata *fcd,
825 real *lambda,
826 t_graph *graph,
827 t_forcerec *fr,
828 gmx::PpForceWorkload *ppForceWorkload,
829 interaction_const_t *ic,
830 const gmx_vsite_t *vsite,
831 rvec mu_tot,
832 double t,
833 gmx_edsam *ed,
834 const int flags,
835 const DDBalanceRegionHandler &ddBalanceRegionHandler)
837 int cg1, i, j;
838 double mu[2*DIM];
839 gmx_bool bStateChanged, bNS, bFillGrid, bCalcCGCM;
840 gmx_bool bDoForces, bUseGPU, bUseOrEmulGPU;
841 rvec vzero, box_diag;
842 float cycles_pme, cycles_wait_gpu;
843 nonbonded_verlet_t *nbv = fr->nbv;
845 bStateChanged = ((flags & GMX_FORCE_STATECHANGED) != 0);
846 bNS = ((flags & GMX_FORCE_NS) != 0);
847 bFillGrid = (bNS && bStateChanged);
848 bCalcCGCM = (bFillGrid && !DOMAINDECOMP(cr));
849 bDoForces = ((flags & GMX_FORCE_FORCES) != 0);
850 bUseGPU = fr->nbv->useGpu();
851 bUseOrEmulGPU = bUseGPU || fr->nbv->emulateGpu();
853 const auto pmeRunMode = fr->pmedata ? pme_run_mode(fr->pmedata) : PmeRunMode::CPU;
854 // TODO slim this conditional down - inputrec and duty checks should mean the same in proper code!
855 const bool useGpuPme = EEL_PME(fr->ic->eeltype) && thisRankHasDuty(cr, DUTY_PME) &&
856 ((pmeRunMode == PmeRunMode::GPU) || (pmeRunMode == PmeRunMode::Mixed));
857 const int pmeFlags = GMX_PME_SPREAD | GMX_PME_SOLVE |
858 ((flags & GMX_FORCE_VIRIAL) ? GMX_PME_CALC_ENER_VIR : 0) |
859 ((flags & GMX_FORCE_ENERGY) ? GMX_PME_CALC_ENER_VIR : 0) |
860 ((flags & GMX_FORCE_FORCES) ? GMX_PME_CALC_F : 0);
862 /* At a search step we need to start the first balancing region
863 * somewhere early inside the step after communication during domain
864 * decomposition (and not during the previous step as usual).
866 if (bNS)
868 ddBalanceRegionHandler.openBeforeForceComputationCpu(DdAllowBalanceRegionReopen::yes);
871 cycles_wait_gpu = 0;
873 const int start = 0;
874 const int homenr = mdatoms->homenr;
876 clear_mat(vir_force);
878 if (DOMAINDECOMP(cr))
880 cg1 = cr->dd->globalAtomGroupIndices.size();
882 else
884 cg1 = top->cgs.nr;
886 if (fr->n_tpi > 0)
888 cg1--;
891 if (bStateChanged)
893 update_forcerec(fr, box);
895 if (inputrecNeedMutot(inputrec))
897 /* Calculate total (local) dipole moment in a temporary common array.
898 * This makes it possible to sum them over nodes faster.
900 calc_mu(start, homenr,
901 x.unpaddedArrayRef(), mdatoms->chargeA, mdatoms->chargeB, mdatoms->nChargePerturbed,
902 mu, mu+DIM);
906 if (fr->ePBC != epbcNONE)
908 /* Compute shift vectors every step,
909 * because of pressure coupling or box deformation!
911 if ((flags & GMX_FORCE_DYNAMICBOX) && bStateChanged)
913 calc_shifts(box, fr->shift_vec);
916 if (bCalcCGCM)
918 put_atoms_in_box_omp(fr->ePBC, box, x.unpaddedArrayRef().subArray(0, homenr));
919 inc_nrnb(nrnb, eNR_SHIFTX, homenr);
921 else if (EI_ENERGY_MINIMIZATION(inputrec->eI) && graph)
923 unshift_self(graph, box, as_rvec_array(x.unpaddedArrayRef().data()));
927 nbnxn_atomdata_copy_shiftvec((flags & GMX_FORCE_DYNAMICBOX) != 0,
928 fr->shift_vec, nbv->nbat);
930 #if GMX_MPI
931 if (!thisRankHasDuty(cr, DUTY_PME))
933 /* Send particle coordinates to the pme nodes.
934 * Since this is only implemented for domain decomposition
935 * and domain decomposition does not use the graph,
936 * we do not need to worry about shifting.
938 gmx_pme_send_coordinates(cr, box, as_rvec_array(x.unpaddedArrayRef().data()),
939 lambda[efptCOUL], lambda[efptVDW],
940 (flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY)) != 0,
941 step, wcycle);
943 #endif /* GMX_MPI */
945 if (useGpuPme)
947 launchPmeGpuSpread(fr->pmedata, box, as_rvec_array(x.unpaddedArrayRef().data()), flags, pmeFlags, wcycle);
950 /* do gridding for pair search */
951 if (bNS)
953 if (graph && bStateChanged)
955 /* Calculate intramolecular shift vectors to make molecules whole */
956 mk_mshift(fplog, graph, fr->ePBC, box, as_rvec_array(x.unpaddedArrayRef().data()));
959 clear_rvec(vzero);
960 box_diag[XX] = box[XX][XX];
961 box_diag[YY] = box[YY][YY];
962 box_diag[ZZ] = box[ZZ][ZZ];
964 wallcycle_start(wcycle, ewcNS);
965 if (!DOMAINDECOMP(cr))
967 wallcycle_sub_start(wcycle, ewcsNBS_GRID_LOCAL);
968 nbnxn_put_on_grid(nbv, box,
969 0, vzero, box_diag,
970 nullptr, 0, mdatoms->homenr, -1,
971 fr->cginfo, x.unpaddedArrayRef(),
972 0, nullptr);
973 wallcycle_sub_stop(wcycle, ewcsNBS_GRID_LOCAL);
975 else
977 wallcycle_sub_start(wcycle, ewcsNBS_GRID_NONLOCAL);
978 nbnxn_put_on_grid_nonlocal(nbv, domdec_zones(cr->dd),
979 fr->cginfo, x.unpaddedArrayRef());
980 wallcycle_sub_stop(wcycle, ewcsNBS_GRID_NONLOCAL);
983 nbnxn_atomdata_set(nbv->nbat, nbv->nbs.get(), mdatoms, fr->cginfo);
985 wallcycle_stop(wcycle, ewcNS);
988 /* initialize the GPU atom data and copy shift vector */
989 if (bUseGPU)
991 wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
992 wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
994 if (bNS)
996 Nbnxm::gpu_init_atomdata(nbv->gpu_nbv, nbv->nbat);
999 Nbnxm::gpu_upload_shiftvec(nbv->gpu_nbv, nbv->nbat);
1001 wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1003 if (bNS && fr->gpuBonded)
1005 /* Now we put all atoms on the grid, we can assign bonded
1006 * interactions to the GPU, where the grid order is
1007 * needed. Also the xq, f and fshift device buffers have
1008 * been reallocated if needed, so the bonded code can
1009 * learn about them. */
1010 // TODO the xq, f, and fshift buffers are now shared
1011 // resources, so they should be maintained by a
1012 // higher-level object than the nb module.
1013 fr->gpuBonded->updateInteractionListsAndDeviceBuffers(nbnxn_get_gridindices(fr->nbv->nbs.get()),
1014 top->idef,
1015 Nbnxm::gpu_get_xq(nbv->gpu_nbv),
1016 Nbnxm::gpu_get_f(nbv->gpu_nbv),
1017 Nbnxm::gpu_get_fshift(nbv->gpu_nbv));
1018 ppForceWorkload->haveGpuBondedWork = fr->gpuBonded->haveInteractions();
1021 wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1024 /* do local pair search */
1025 if (bNS)
1027 wallcycle_start_nocount(wcycle, ewcNS);
1028 wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_LOCAL);
1029 /* Note that with a GPU the launch overhead of the list transfer is not timed separately */
1030 nbv->constructPairlist(Nbnxm::InteractionLocality::Local,
1031 &top->excls, step, nrnb);
1032 wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_LOCAL);
1033 wallcycle_stop(wcycle, ewcNS);
1035 else
1037 nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs.get(), Nbnxm::AtomLocality::Local,
1038 FALSE, as_rvec_array(x.unpaddedArrayRef().data()),
1039 nbv->nbat, wcycle);
1042 if (bUseGPU)
1044 ddBalanceRegionHandler.openBeforeForceComputationGpu();
1046 wallcycle_start(wcycle, ewcLAUNCH_GPU);
1048 wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1049 Nbnxm::gpu_copy_xq_to_gpu(nbv->gpu_nbv, nbv->nbat, Nbnxm::AtomLocality::Local, ppForceWorkload->haveGpuBondedWork);
1050 wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1052 // bonded work not split into separate local and non-local, so with DD
1053 // we can only launch the kernel after non-local coordinates have been received.
1054 if (ppForceWorkload->haveGpuBondedWork && !havePPDomainDecomposition(cr))
1056 wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_BONDED);
1057 fr->gpuBonded->launchKernels(fr, flags, box);
1058 wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_BONDED);
1061 /* launch local nonbonded work on GPU */
1062 wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1063 do_nb_verlet(fr, ic, enerd, flags, Nbnxm::InteractionLocality::Local, enbvClearFNo,
1064 step, nrnb, wcycle);
1065 wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1066 wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1069 if (useGpuPme)
1071 // In PME GPU and mixed mode we launch FFT / gather after the
1072 // X copy/transform to allow overlap as well as after the GPU NB
1073 // launch to avoid FFT launch overhead hijacking the CPU and delaying
1074 // the nonbonded kernel.
1075 launchPmeGpuFftAndGather(fr->pmedata, wcycle);
1078 /* Communicate coordinates and sum dipole if necessary +
1079 do non-local pair search */
1080 if (havePPDomainDecomposition(cr))
1082 if (bNS)
1084 wallcycle_start_nocount(wcycle, ewcNS);
1085 wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_NONLOCAL);
1086 /* Note that with a GPU the launch overhead of the list transfer is not timed separately */
1087 nbv->constructPairlist(Nbnxm::InteractionLocality::NonLocal,
1088 &top->excls, step, nrnb);
1089 wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_NONLOCAL);
1090 wallcycle_stop(wcycle, ewcNS);
1092 else
1094 dd_move_x(cr->dd, box, x.unpaddedArrayRef(), wcycle);
1096 nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs.get(), Nbnxm::AtomLocality::NonLocal,
1097 FALSE, as_rvec_array(x.unpaddedArrayRef().data()),
1098 nbv->nbat, wcycle);
1101 if (bUseGPU)
1103 wallcycle_start(wcycle, ewcLAUNCH_GPU);
1105 /* launch non-local nonbonded tasks on GPU */
1106 wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1107 Nbnxm::gpu_copy_xq_to_gpu(nbv->gpu_nbv, nbv->nbat, Nbnxm::AtomLocality::NonLocal, ppForceWorkload->haveGpuBondedWork);
1108 wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1110 if (ppForceWorkload->haveGpuBondedWork)
1112 wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_BONDED);
1113 fr->gpuBonded->launchKernels(fr, flags, box);
1114 wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_BONDED);
1117 wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1118 do_nb_verlet(fr, ic, enerd, flags, Nbnxm::InteractionLocality::NonLocal, enbvClearFNo,
1119 step, nrnb, wcycle);
1120 wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1122 wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1126 if (bUseGPU)
1128 /* launch D2H copy-back F */
1129 wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
1130 wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1131 if (havePPDomainDecomposition(cr))
1133 Nbnxm::gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat,
1134 flags, Nbnxm::AtomLocality::NonLocal, ppForceWorkload->haveGpuBondedWork);
1136 Nbnxm::gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat,
1137 flags, Nbnxm::AtomLocality::Local, ppForceWorkload->haveGpuBondedWork);
1138 wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1140 if (ppForceWorkload->haveGpuBondedWork && (flags & GMX_FORCE_ENERGY))
1142 wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_BONDED);
1143 fr->gpuBonded->launchEnergyTransfer();
1144 wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_BONDED);
1146 wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1149 if (bStateChanged && inputrecNeedMutot(inputrec))
1151 if (PAR(cr))
1153 gmx_sumd(2*DIM, mu, cr);
1155 ddBalanceRegionHandler.reopenRegionCpu();
1158 for (i = 0; i < 2; i++)
1160 for (j = 0; j < DIM; j++)
1162 fr->mu_tot[i][j] = mu[i*DIM + j];
1166 if (fr->efep == efepNO)
1168 copy_rvec(fr->mu_tot[0], mu_tot);
1170 else
1172 for (j = 0; j < DIM; j++)
1174 mu_tot[j] =
1175 (1.0 - lambda[efptCOUL])*fr->mu_tot[0][j] +
1176 lambda[efptCOUL]*fr->mu_tot[1][j];
1180 /* Reset energies */
1181 reset_enerdata(enerd);
1182 clear_rvecs(SHIFTS, fr->fshift);
1184 if (DOMAINDECOMP(cr) && !thisRankHasDuty(cr, DUTY_PME))
1186 wallcycle_start(wcycle, ewcPPDURINGPME);
1187 dd_force_flop_start(cr->dd, nrnb);
1190 if (inputrec->bRot)
1192 wallcycle_start(wcycle, ewcROT);
1193 do_rotation(cr, enforcedRotation, box, as_rvec_array(x.unpaddedArrayRef().data()), t, step, bNS);
1194 wallcycle_stop(wcycle, ewcROT);
1197 /* Temporary solution until all routines take PaddedRVecVector */
1198 rvec *const f = as_rvec_array(force.unpaddedArrayRef().data());
1200 /* Start the force cycle counter.
1201 * Note that a different counter is used for dynamic load balancing.
1203 wallcycle_start(wcycle, ewcFORCE);
1205 gmx::ArrayRef<gmx::RVec> forceRef = force.unpaddedArrayRef();
1206 if (bDoForces)
1208 /* If we need to compute the virial, we might need a separate
1209 * force buffer for algorithms for which the virial is calculated
1210 * directly, such as PME.
1212 if ((flags & GMX_FORCE_VIRIAL) && fr->haveDirectVirialContributions)
1214 forceRef = *fr->forceBufferForDirectVirialContributions;
1216 /* TODO: update comment
1217 * We only compute forces on local atoms. Note that vsites can
1218 * spread to non-local atoms, but that part of the buffer is
1219 * cleared separately in the vsite spreading code.
1221 clear_rvecs_omp(forceRef.size(), as_rvec_array(forceRef.data()));
1223 /* Clear the short- and long-range forces */
1224 clear_rvecs_omp(fr->natoms_force_constr, f);
1227 /* forceWithVirial uses the local atom range only */
1228 gmx::ForceWithVirial forceWithVirial(forceRef, (flags & GMX_FORCE_VIRIAL) != 0);
1230 if (inputrec->bPull && pull_have_constraint(inputrec->pull_work))
1232 clear_pull_forces(inputrec->pull_work);
1235 /* We calculate the non-bonded forces, when done on the CPU, here.
1236 * We do this before calling do_force_lowlevel, because in that
1237 * function, the listed forces are calculated before PME, which
1238 * does communication. With this order, non-bonded and listed
1239 * force calculation imbalance can be balanced out by the domain
1240 * decomposition load balancing.
1243 if (!bUseOrEmulGPU)
1245 do_nb_verlet(fr, ic, enerd, flags, Nbnxm::InteractionLocality::Local, enbvClearFYes,
1246 step, nrnb, wcycle);
1249 if (fr->efep != efepNO)
1251 /* Calculate the local and non-local free energy interactions here.
1252 * Happens here on the CPU both with and without GPU.
1254 wallcycle_sub_start(wcycle, ewcsNONBONDED);
1255 nbv->dispatchFreeEnergyKernel(Nbnxm::InteractionLocality::Local,
1256 fr, as_rvec_array(x.unpaddedArrayRef().data()), f, *mdatoms,
1257 inputrec->fepvals, lambda,
1258 enerd, flags, nrnb);
1260 if (havePPDomainDecomposition(cr))
1262 nbv->dispatchFreeEnergyKernel(Nbnxm::InteractionLocality::NonLocal,
1263 fr, as_rvec_array(x.unpaddedArrayRef().data()), f, *mdatoms,
1264 inputrec->fepvals, lambda,
1265 enerd, flags, nrnb);
1267 wallcycle_sub_stop(wcycle, ewcsNONBONDED);
1270 if (!bUseOrEmulGPU)
1272 if (havePPDomainDecomposition(cr))
1274 do_nb_verlet(fr, ic, enerd, flags, Nbnxm::InteractionLocality::NonLocal, enbvClearFNo,
1275 step, nrnb, wcycle);
1278 /* Add all the non-bonded force to the normal force array.
1279 * This can be split into a local and a non-local part when overlapping
1280 * communication with calculation under domain decomposition.
1282 wallcycle_stop(wcycle, ewcFORCE);
1284 nbv->atomdata_add_nbat_f_to_f(Nbnxm::AtomLocality::All, f, wcycle);
1286 wallcycle_start_nocount(wcycle, ewcFORCE);
1288 /* If there are multiple fshift output buffers we need to reduce them */
1289 if (flags & GMX_FORCE_VIRIAL)
1291 /* This is not in a subcounter because it takes a
1292 negligible and constant-sized amount of time */
1293 nbnxn_atomdata_add_nbat_fshift_to_fshift(nbv->nbat,
1294 fr->fshift);
1298 /* update QMMMrec, if necessary */
1299 if (fr->bQMMM)
1301 update_QMMMrec(cr, fr, as_rvec_array(x.unpaddedArrayRef().data()), mdatoms, box);
1304 /* Compute the bonded and non-bonded energies and optionally forces */
1305 do_force_lowlevel(fr, inputrec, &(top->idef),
1306 cr, ms, nrnb, wcycle, mdatoms,
1307 as_rvec_array(x.unpaddedArrayRef().data()), hist, f, &forceWithVirial, enerd, fcd,
1308 box, inputrec->fepvals, lambda, graph, &(top->excls), fr->mu_tot,
1309 flags,
1310 &cycles_pme, ddBalanceRegionHandler);
1312 wallcycle_stop(wcycle, ewcFORCE);
1314 computeSpecialForces(fplog, cr, inputrec, awh, enforcedRotation,
1315 step, t, wcycle,
1316 fr->forceProviders, box, x.unpaddedArrayRef(), mdatoms, lambda,
1317 flags, &forceWithVirial, enerd,
1318 ed, bNS);
1320 if (bUseOrEmulGPU)
1322 /* wait for non-local forces (or calculate in emulation mode) */
1323 if (havePPDomainDecomposition(cr))
1325 if (bUseGPU)
1327 wallcycle_start(wcycle, ewcWAIT_GPU_NB_NL);
1328 Nbnxm::gpu_wait_finish_task(nbv->gpu_nbv,
1329 flags, Nbnxm::AtomLocality::NonLocal,
1330 ppForceWorkload->haveGpuBondedWork,
1331 enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
1332 fr->fshift);
1333 cycles_wait_gpu += wallcycle_stop(wcycle, ewcWAIT_GPU_NB_NL);
1335 else
1337 wallcycle_start_nocount(wcycle, ewcFORCE);
1338 do_nb_verlet(fr, ic, enerd, flags, Nbnxm::InteractionLocality::NonLocal, enbvClearFYes,
1339 step, nrnb, wcycle);
1340 wallcycle_stop(wcycle, ewcFORCE);
1343 nbv->atomdata_add_nbat_f_to_f(Nbnxm::AtomLocality::NonLocal,
1344 f, wcycle);
1348 if (havePPDomainDecomposition(cr))
1350 /* We are done with the CPU compute.
1351 * We will now communicate the non-local forces.
1352 * If we use a GPU this will overlap with GPU work, so in that case
1353 * we do not close the DD force balancing region here.
1355 ddBalanceRegionHandler.closeAfterForceComputationCpu();
1357 if (bDoForces)
1359 dd_move_f(cr->dd, force.unpaddedArrayRef(), fr->fshift, wcycle);
1363 // With both nonbonded and PME offloaded a GPU on the same rank, we use
1364 // an alternating wait/reduction scheme.
1365 bool alternateGpuWait = (!c_disableAlternatingWait && useGpuPme && bUseGPU && !DOMAINDECOMP(cr));
1366 if (alternateGpuWait)
1368 alternatePmeNbGpuWaitReduce(fr->nbv, fr->pmedata, &force, &forceWithVirial, fr->fshift, enerd, flags, pmeFlags, ppForceWorkload->haveGpuBondedWork, wcycle);
1371 if (!alternateGpuWait && useGpuPme)
1373 pme_gpu_wait_and_reduce(fr->pmedata, pmeFlags, wcycle, &forceWithVirial, enerd);
1376 /* Wait for local GPU NB outputs on the non-alternating wait path */
1377 if (!alternateGpuWait && bUseGPU)
1379 /* Measured overhead on CUDA and OpenCL with(out) GPU sharing
1380 * is between 0.5 and 1.5 Mcycles. So 2 MCycles is an overestimate,
1381 * but even with a step of 0.1 ms the difference is less than 1%
1382 * of the step time.
1384 const float gpuWaitApiOverheadMargin = 2e6f; /* cycles */
1386 wallcycle_start(wcycle, ewcWAIT_GPU_NB_L);
1387 Nbnxm::gpu_wait_finish_task(nbv->gpu_nbv,
1388 flags, Nbnxm::AtomLocality::Local, ppForceWorkload->haveGpuBondedWork,
1389 enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
1390 fr->fshift);
1391 float cycles_tmp = wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
1393 if (ddBalanceRegionHandler.useBalancingRegion())
1395 DdBalanceRegionWaitedForGpu waitedForGpu = DdBalanceRegionWaitedForGpu::yes;
1396 if (bDoForces && cycles_tmp <= gpuWaitApiOverheadMargin)
1398 /* We measured few cycles, it could be that the kernel
1399 * and transfer finished earlier and there was no actual
1400 * wait time, only API call overhead.
1401 * Then the actual time could be anywhere between 0 and
1402 * cycles_wait_est. We will use half of cycles_wait_est.
1404 waitedForGpu = DdBalanceRegionWaitedForGpu::no;
1406 ddBalanceRegionHandler.closeAfterForceComputationGpu(cycles_wait_gpu, waitedForGpu);
1410 if (fr->nbv->emulateGpu())
1412 // NOTE: emulation kernel is not included in the balancing region,
1413 // but emulation mode does not target performance anyway
1414 wallcycle_start_nocount(wcycle, ewcFORCE);
1415 do_nb_verlet(fr, ic, enerd, flags, Nbnxm::InteractionLocality::Local,
1416 DOMAINDECOMP(cr) ? enbvClearFNo : enbvClearFYes,
1417 step, nrnb, wcycle);
1418 wallcycle_stop(wcycle, ewcFORCE);
1421 if (useGpuPme)
1423 pme_gpu_reinit_computation(fr->pmedata, wcycle);
1426 if (bUseGPU)
1428 /* now clear the GPU outputs while we finish the step on the CPU */
1429 wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
1430 wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1431 Nbnxm::gpu_clear_outputs(nbv->gpu_nbv, flags);
1433 /* Is dynamic pair-list pruning activated? */
1434 if (nbv->pairlistSets().params().useDynamicPruning)
1436 launchGpuRollingPruning(cr, nbv, inputrec, step);
1438 wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1439 wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1442 if (ppForceWorkload->haveGpuBondedWork && (flags & GMX_FORCE_ENERGY))
1444 wallcycle_start(wcycle, ewcWAIT_GPU_BONDED);
1445 // in principle this should be included in the DD balancing region,
1446 // but generally it is infrequent so we'll omit it for the sake of
1447 // simpler code
1448 fr->gpuBonded->accumulateEnergyTerms(enerd);
1449 wallcycle_stop(wcycle, ewcWAIT_GPU_BONDED);
1451 wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
1452 wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_BONDED);
1453 fr->gpuBonded->clearEnergies();
1454 wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_BONDED);
1455 wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1458 /* Do the nonbonded GPU (or emulation) force buffer reduction
1459 * on the non-alternating path. */
1460 if (bUseOrEmulGPU && !alternateGpuWait)
1462 nbv->atomdata_add_nbat_f_to_f(Nbnxm::AtomLocality::Local,
1463 f, wcycle);
1465 if (DOMAINDECOMP(cr))
1467 dd_force_flop_stop(cr->dd, nrnb);
1470 if (bDoForces)
1472 /* If we have NoVirSum forces, but we do not calculate the virial,
1473 * we sum fr->f_novirsum=f later.
1475 if (vsite && !(fr->haveDirectVirialContributions && !(flags & GMX_FORCE_VIRIAL)))
1477 spread_vsite_f(vsite, as_rvec_array(x.unpaddedArrayRef().data()), f, fr->fshift, FALSE, nullptr, nrnb,
1478 &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr, wcycle);
1481 if (flags & GMX_FORCE_VIRIAL)
1483 /* Calculation of the virial must be done after vsites! */
1484 calc_virial(0, mdatoms->homenr, as_rvec_array(x.unpaddedArrayRef().data()), f,
1485 vir_force, graph, box, nrnb, fr, inputrec->ePBC);
1489 if (PAR(cr) && !thisRankHasDuty(cr, DUTY_PME))
1491 /* In case of node-splitting, the PP nodes receive the long-range
1492 * forces, virial and energy from the PME nodes here.
1494 pme_receive_force_ener(cr, &forceWithVirial, enerd, wcycle);
1497 if (bDoForces)
1499 post_process_forces(cr, step, nrnb, wcycle,
1500 top, box, as_rvec_array(x.unpaddedArrayRef().data()), f, &forceWithVirial,
1501 vir_force, mdatoms, graph, fr, vsite,
1502 flags);
1505 if (flags & GMX_FORCE_ENERGY)
1507 /* Sum the potential energy terms from group contributions */
1508 sum_epot(&(enerd->grpp), enerd->term);
1510 if (!EI_TPI(inputrec->eI))
1512 checkPotentialEnergyValidity(step, *enerd, *inputrec);
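/*! \brief Compute forces and energies for one step with the group cut-off scheme; all work runs on the CPU */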
1517 static void do_force_cutsGROUP(FILE *fplog,
1518 const t_commrec *cr,
1519 const gmx_multisim_t *ms,
1520 const t_inputrec *inputrec,
1521 gmx::Awh *awh,
1522 gmx_enfrot *enforcedRotation,
1523 int64_t step,
1524 t_nrnb *nrnb,
1525 gmx_wallcycle_t wcycle,
1526 gmx_localtop_t *top,
1527 const gmx_groups_t *groups,
1528 matrix box, gmx::ArrayRefWithPadding<gmx::RVec> x,
1529 history_t *hist,
1530 gmx::ArrayRefWithPadding<gmx::RVec> force,
1531 tensor vir_force,
1532 const t_mdatoms *mdatoms,
1533 gmx_enerdata_t *enerd,
1534 t_fcdata *fcd,
1535 real *lambda,
1536 t_graph *graph,
1537 t_forcerec *fr,
1538 const gmx_vsite_t *vsite,
1539 rvec mu_tot,
1540 double t,
1541 gmx_edsam *ed,
1542 int flags,
1543 const DDBalanceRegionHandler &ddBalanceRegionHandler)
1545 int cg0, cg1, i, j;
1546 double mu[2*DIM];
1547 gmx_bool bStateChanged, bNS, bFillGrid, bCalcCGCM;
1548 gmx_bool bDoForces;
1549 float cycles_pme;
1551 const int start = 0;
1552 const int homenr = mdatoms->homenr;
1554 clear_mat(vir_force);
1556 cg0 = 0;
1557 if (DOMAINDECOMP(cr))
1559 cg1 = cr->dd->globalAtomGroupIndices.size();
1561 else
1563 cg1 = top->cgs.nr;
1565 if (fr->n_tpi > 0)
1567 cg1--;
1570 bStateChanged = ((flags & GMX_FORCE_STATECHANGED) != 0);
1571 bNS = ((flags & GMX_FORCE_NS) != 0);
1572 /* Should we perform the long-range nonbonded evaluation inside the neighbor searching? */
1573 bFillGrid = (bNS && bStateChanged);
1574 bCalcCGCM = (bFillGrid && !DOMAINDECOMP(cr));
1575 bDoForces = ((flags & GMX_FORCE_FORCES) != 0);
1577 if (bStateChanged)
1579 update_forcerec(fr, box);
1581 if (inputrecNeedMutot(inputrec))
1583 /* Calculate total (local) dipole moment in a temporary common array.
1584 * This makes it possible to sum them over nodes faster.
1586 calc_mu(start, homenr,
1587 x.unpaddedArrayRef(), mdatoms->chargeA, mdatoms->chargeB, mdatoms->nChargePerturbed,
1588 mu, mu+DIM);
1592 if (fr->ePBC != epbcNONE)
1594 /* Compute shift vectors every step,
1595 * because of pressure coupling or box deformation!
1597 if ((flags & GMX_FORCE_DYNAMICBOX) && bStateChanged)
1599 calc_shifts(box, fr->shift_vec);
1602 if (bCalcCGCM)
1604 put_charge_groups_in_box(fplog, cg0, cg1, fr->ePBC, box,
1605 &(top->cgs), as_rvec_array(x.unpaddedArrayRef().data()), fr->cg_cm);
1606 inc_nrnb(nrnb, eNR_CGCM, homenr);
1607 inc_nrnb(nrnb, eNR_RESETX, cg1-cg0);
1609 else if (EI_ENERGY_MINIMIZATION(inputrec->eI) && graph)
1611 unshift_self(graph, box, as_rvec_array(x.unpaddedArrayRef().data()));
1614 else if (bCalcCGCM)
1616 calc_cgcm(fplog, cg0, cg1, &(top->cgs), as_rvec_array(x.unpaddedArrayRef().data()), fr->cg_cm);
1617 inc_nrnb(nrnb, eNR_CGCM, homenr);
1620 if (bCalcCGCM && gmx_debug_at)
1622 pr_rvecs(debug, 0, "cgcm", fr->cg_cm, top->cgs.nr);
1625 #if GMX_MPI
1626 if (!thisRankHasDuty(cr, DUTY_PME))
1628 /* Send particle coordinates to the pme nodes.
1629 * Since this is only implemented for domain decomposition
1630 * and domain decomposition does not use the graph,
1631 * we do not need to worry about shifting.
1633 gmx_pme_send_coordinates(cr, box, as_rvec_array(x.unpaddedArrayRef().data()),
1634 lambda[efptCOUL], lambda[efptVDW],
1635 (flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY)) != 0,
1636 step, wcycle);
1638 #endif /* GMX_MPI */
1640 /* Communicate coordinates and sum dipole if necessary */
1641 if (DOMAINDECOMP(cr))
1643 dd_move_x(cr->dd, box, x.unpaddedArrayRef(), wcycle);
1645 /* No GPU support, no move_x overlap, so reopen the balance region here */
1646 ddBalanceRegionHandler.reopenRegionCpu();
1649 if (inputrecNeedMutot(inputrec))
1651 if (bStateChanged)
1653 if (PAR(cr))
1655 gmx_sumd(2*DIM, mu, cr);
1657 ddBalanceRegionHandler.reopenRegionCpu();
1659 for (i = 0; i < 2; i++)
1661 for (j = 0; j < DIM; j++)
1663 fr->mu_tot[i][j] = mu[i*DIM + j];
1667 if (fr->efep == efepNO)
1669 copy_rvec(fr->mu_tot[0], mu_tot);
1671 else
1673 for (j = 0; j < DIM; j++)
1675 mu_tot[j] =
1676 (1.0 - lambda[efptCOUL])*fr->mu_tot[0][j] + lambda[efptCOUL]*fr->mu_tot[1][j];
1681 /* Reset energies */
1682 reset_enerdata(enerd);
1683 clear_rvecs(SHIFTS, fr->fshift);
1685 if (bNS)
1687 wallcycle_start(wcycle, ewcNS);
1689 if (graph && bStateChanged)
1691 /* Calculate intramolecular shift vectors to make molecules whole */
1692 mk_mshift(fplog, graph, fr->ePBC, box, as_rvec_array(x.unpaddedArrayRef().data()));
1695 /* Do the actual neighbour searching */
1696 ns(fplog, fr, box,
1697 groups, top, mdatoms,
1698 cr, nrnb, bFillGrid);
1700 wallcycle_stop(wcycle, ewcNS);
1703 if (DOMAINDECOMP(cr) && !thisRankHasDuty(cr, DUTY_PME))
1705 wallcycle_start(wcycle, ewcPPDURINGPME);
1706 dd_force_flop_start(cr->dd, nrnb);
1709 if (inputrec->bRot)
1711 wallcycle_start(wcycle, ewcROT);
1712 do_rotation(cr, enforcedRotation, box, as_rvec_array(x.unpaddedArrayRef().data()), t, step, bNS);
1713 wallcycle_stop(wcycle, ewcROT);
1716 /* Temporary solution until all routines take PaddedRVecVector */
1717 rvec *f = as_rvec_array(force.unpaddedArrayRef().data());
1719 /* Start the force cycle counter.
1720 * Note that a different counter is used for dynamic load balancing.
1722 wallcycle_start(wcycle, ewcFORCE);
1724 gmx::ArrayRef<gmx::RVec> forceRef = force.paddedArrayRef();
1725 if (bDoForces)
1727 /* If we need to compute the virial, we might need a separate
1728 * force buffer for algorithms for which the virial is calculated
1729 * separately, such as PME.
1731 if ((flags & GMX_FORCE_VIRIAL) && fr->haveDirectVirialContributions)
1733 forceRef = *fr->forceBufferForDirectVirialContributions;
1735 clear_rvecs_omp(forceRef.size(), as_rvec_array(forceRef.data()));
1738 /* Clear the short- and long-range forces */
1739 clear_rvecs(fr->natoms_force_constr, f);
1742 /* forceWithVirial might need the full force atom range */
1743 gmx::ForceWithVirial forceWithVirial(forceRef, (flags & GMX_FORCE_VIRIAL) != 0);
1745 if (inputrec->bPull && pull_have_constraint(inputrec->pull_work))
1747 clear_pull_forces(inputrec->pull_work);
1750 /* update QMMMrec, if necessary */
1751 if (fr->bQMMM)
1753 update_QMMMrec(cr, fr, as_rvec_array(x.unpaddedArrayRef().data()), mdatoms, box);
1756 /* Compute the bonded and non-bonded energies and optionally forces */
1757 do_force_lowlevel(fr, inputrec, &(top->idef),
1758 cr, ms, nrnb, wcycle, mdatoms,
1759 as_rvec_array(x.unpaddedArrayRef().data()), hist, f, &forceWithVirial, enerd, fcd,
1760 box, inputrec->fepvals, lambda,
1761 graph, &(top->excls), fr->mu_tot,
1762 flags,
1763 &cycles_pme, ddBalanceRegionHandler);
1765 wallcycle_stop(wcycle, ewcFORCE);
1767 if (DOMAINDECOMP(cr))
1769 dd_force_flop_stop(cr->dd, nrnb);
1771 ddBalanceRegionHandler.closeAfterForceComputationCpu();
1774 computeSpecialForces(fplog, cr, inputrec, awh, enforcedRotation,
1775 step, t, wcycle,
1776 fr->forceProviders, box, x.unpaddedArrayRef(), mdatoms, lambda,
1777 flags, &forceWithVirial, enerd,
1778 ed, bNS);
1780 if (bDoForces)
1782 /* Communicate the forces */
1783 if (DOMAINDECOMP(cr))
1785 dd_move_f(cr->dd, force.unpaddedArrayRef(), fr->fshift, wcycle);
1786 /* Do we need to communicate the separate force array
1787 * for terms that do not contribute to the single sum virial?
1788 * Position restraints and electric fields do not introduce
1789 * inter-cg forces, only full electrostatics methods do.
1790 * When we do not calculate the virial, fr->f_novirsum = f,
1791 * so we have already communicated these forces.
1793 if (EEL_FULL(fr->ic->eeltype) && cr->dd->n_intercg_excl &&
1794 (flags & GMX_FORCE_VIRIAL))
1796 dd_move_f(cr->dd, forceWithVirial.force_, nullptr, wcycle);
1800 /* If we have NoVirSum forces, but we do not calculate the virial,
1801 * we sum fr->f_novirsum=f later.
1803 if (vsite && !(fr->haveDirectVirialContributions && !(flags & GMX_FORCE_VIRIAL)))
1805 spread_vsite_f(vsite, as_rvec_array(x.unpaddedArrayRef().data()), f, fr->fshift, FALSE, nullptr, nrnb,
1806 &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr, wcycle);
1809 if (flags & GMX_FORCE_VIRIAL)
1811 /* Calculation of the virial must be done after vsites! */
1812 calc_virial(0, mdatoms->homenr, as_rvec_array(x.unpaddedArrayRef().data()), f,
1813 vir_force, graph, box, nrnb, fr, inputrec->ePBC);
1817 if (PAR(cr) && !thisRankHasDuty(cr, DUTY_PME))
1819 /* In case of node-splitting, the PP nodes receive the long-range
1820 * forces, virial and energy from the PME nodes here.
1822 pme_receive_force_ener(cr, &forceWithVirial, enerd, wcycle);
1825 if (bDoForces)
1827 post_process_forces(cr, step, nrnb, wcycle,
1828 top, box, as_rvec_array(x.unpaddedArrayRef().data()), f, &forceWithVirial,
1829 vir_force, mdatoms, graph, fr, vsite,
1830 flags);
1833 if (flags & GMX_FORCE_ENERGY)
1835 /* Sum the potential energy terms from group contributions */
1836 sum_epot(&(enerd->grpp), enerd->term);
1838 if (!EI_TPI(inputrec->eI))
1840 checkPotentialEnergyValidity(step, *enerd, *inputrec);
1846 void do_force(FILE *fplog,
1847 const t_commrec *cr,
1848 const gmx_multisim_t *ms,
1849 const t_inputrec *inputrec,
1850 gmx::Awh *awh,
1851 gmx_enfrot *enforcedRotation,
1852 int64_t step,
1853 t_nrnb *nrnb,
1854 gmx_wallcycle_t wcycle,
1855 gmx_localtop_t *top,
1856 const gmx_groups_t *groups,
1857 matrix box,
1858 gmx::ArrayRefWithPadding<gmx::RVec> x, //NOLINT(performance-unnecessary-value-param)
1859 history_t *hist,
1860 gmx::ArrayRefWithPadding<gmx::RVec> force, //NOLINT(performance-unnecessary-value-param)
1861 tensor vir_force,
1862 const t_mdatoms *mdatoms,
1863 gmx_enerdata_t *enerd,
1864 t_fcdata *fcd,
1865 gmx::ArrayRef<real> lambda,
1866 t_graph *graph,
1867 t_forcerec *fr,
1868 gmx::PpForceWorkload *ppForceWorkload,
1869 const gmx_vsite_t *vsite,
1870 rvec mu_tot,
1871 double t,
1872 gmx_edsam *ed,
1873 int flags,
1874 const DDBalanceRegionHandler &ddBalanceRegionHandler)
1876 /* modify force flag if not doing nonbonded */
1877 if (!fr->bNonbonded)
1879 flags &= ~GMX_FORCE_NONBONDED;
1882 switch (inputrec->cutoff_scheme)
1884 case ecutsVERLET:
1885 do_force_cutsVERLET(fplog, cr, ms, inputrec,
1886 awh, enforcedRotation, step, nrnb, wcycle,
1887 top,
1888 groups,
1889 box, x, hist,
1890 force, vir_force,
1891 mdatoms,
1892 enerd, fcd,
1893 lambda.data(), graph,
1895 ppForceWorkload,
1896 fr->ic,
1897 vsite, mu_tot,
1898 t, ed,
1899 flags,
1900 ddBalanceRegionHandler);
1901 break;
1902 case ecutsGROUP:
1903 do_force_cutsGROUP(fplog, cr, ms, inputrec,
1904 awh, enforcedRotation, step, nrnb, wcycle,
1905 top,
1906 groups,
1907 box, x, hist,
1908 force, vir_force,
1909 mdatoms,
1910 enerd, fcd,
1911 lambda.data(), graph,
1912 fr, vsite, mu_tot,
1913 t, ed,
1914 flags,
1915 ddBalanceRegionHandler);
1916 break;
1917 default:
1918 gmx_incons("Invalid cut-off scheme passed!");
1921 /* In case we don't have constraints and are using GPUs, the next balancing
1922 * region starts here.
1923 * Some "special" work at the end of do_force_cuts?, such as vsite spread,
1924 * virial calculation and COM pulling, is thus not included in
1925 * the balance timing, which is ok as most tasks do communication.
1927 ddBalanceRegionHandler.openBeforeForceComputationCpu(DdAllowBalanceRegionReopen::no);
1931 void do_constrain_first(FILE *fplog, gmx::Constraints *constr,
1932 const t_inputrec *ir, const t_mdatoms *md,
1933 t_state *state)
1935 int i, m, start, end;
1936 int64_t step;
1937 real dt = ir->delta_t;
1938 real dvdl_dum;
1939 rvec *savex;
1941 /* We need to allocate one element extra, since we might use
1942 * (unaligned) 4-wide SIMD loads to access rvec entries.
1944 snew(savex, state->natoms + 1);
1946 start = 0;
1947 end = md->homenr;
1949 if (debug)
1951 fprintf(debug, "vcm: start=%d, homenr=%d, end=%d\n",
1952 start, md->homenr, end);
1954 /* Do a first constraint pass to reset the particles... */
1955 step = ir->init_step;
1956 if (fplog)
1958 char buf[STEPSTRSIZE];
1959 fprintf(fplog, "\nConstraining the starting coordinates (step %s)\n",
1960 gmx_step_str(step, buf));
1962 dvdl_dum = 0;
1964 /* constrain the current position */
1965 constr->apply(TRUE, FALSE,
1966 step, 0, 1.0,
1967 state->x.rvec_array(), state->x.rvec_array(), nullptr,
1968 state->box,
1969 state->lambda[efptBONDED], &dvdl_dum,
1970 nullptr, nullptr, gmx::ConstraintVariable::Positions);
1971 if (EI_VV(ir->eI))
1973 /* constrain the initial velocity, and save it */
1974 /* this may also be useful if we need the ekin from the half step for velocity Verlet */
1975 constr->apply(TRUE, FALSE,
1976 step, 0, 1.0,
1977 state->x.rvec_array(), state->v.rvec_array(), state->v.rvec_array(),
1978 state->box,
1979 state->lambda[efptBONDED], &dvdl_dum,
1980 nullptr, nullptr, gmx::ConstraintVariable::Velocities);
1982 /* constrain the initial velocities at t-dt/2 */
1983 if (EI_STATE_VELOCITY(ir->eI) && ir->eI != eiVV)
1985 auto x = makeArrayRef(state->x).subArray(start, end);
1986 auto v = makeArrayRef(state->v).subArray(start, end);
1987 for (i = start; (i < end); i++)
1989 for (m = 0; (m < DIM); m++)
1991 /* Reverse the velocity */
1992 v[i][m] = -v[i][m];
1993 /* Store the position at t-dt in buf */
1994 savex[i][m] = x[i][m] + dt*v[i][m];
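/* Since v has just been negated, savex = x(0) - dt*v(0), i.e. an estimate of
 * the position one time step back. Constraining savex against x(0) below,
 * with the reversed velocities passed along, then yields constrained
 * coordinates at t = -dt and velocities consistent with the constraints
 * at t = -dt/2.
 */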
1997 /* Shake the positions at t=-dt with the positions at t=0
1998 * as reference coordinates.
2000 if (fplog)
2002 char buf[STEPSTRSIZE];
2003 fprintf(fplog, "\nConstraining the coordinates at t0-dt (step %s)\n",
2004 gmx_step_str(step, buf));
2006 dvdl_dum = 0;
2007 constr->apply(TRUE, FALSE,
2008 step, -1, 1.0,
2009 state->x.rvec_array(), savex, nullptr,
2010 state->box,
2011 state->lambda[efptBONDED], &dvdl_dum,
2012 state->v.rvec_array(), nullptr, gmx::ConstraintVariable::Positions);
2014 for (i = start; i < end; i++)
2016 for (m = 0; m < DIM; m++)
2018 /* Re-reverse the velocities */
2019 v[i][m] = -v[i][m];
2023 sfree(savex);
2027 static void
2028 integrate_table(const real vdwtab[], real scale, int offstart, int rstart, int rend,
2029 double *enerout, double *virout)
2031 double enersum, virsum;
2032 double invscale, invscale2, invscale3;
2033 double r, ea, eb, ec, pa, pb, pc, pd;
2034 double y0, f, g, h;
2035 int ri, offset;
2036 double tabfactor;
2038 invscale = 1.0/scale;
2039 invscale2 = invscale*invscale;
2040 invscale3 = invscale*invscale2;
2042 /* Following summation derived from cubic spline definition,
2043 * Numerical Recipes in C, second edition, pp. 113-116. Exact for
2044 * the cubic spline. We first calculate the negative of the
2045 * energy from rvdw to rvdw_switch, assuming that g(r)=1, and then
2046 * add the more standard, abrupt cutoff correction to that result,
2047 * yielding the long-range correction for a switched function. We
2048 * perform both the pressure and energy loops at the same time for
2049 * simplicity, as the computational cost is low. */
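/* Within table bin ri the potential is the cubic spline
 *   V(t) = y0 + f*t + g*t^2 + h*t^3,   t in [0,1],   r(t) = (ri + t)/scale.
 * With r = ri/scale the radial measure expands as
 *   r(t)^2 dr = (ea*t^2 + eb*t + ec) dt,  ea = 1/scale^3, eb = 2*r/scale^2, ec = r^2/scale,
 *   r(t)^3    =  pa*t^3 + pb*t^2 + pc*t + pd,
 * so integrating V(t)*r(t)^2 dr (energy) and dV/dr*r(t)^3 dr (virial) over each
 * bin term by term gives exactly the enersum and virsum expressions in the loop
 * below. The factor 4*pi applied at the end is the angular part of the volume
 * integral, with g(r) taken as 1.
 */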
2051 if (offstart == 0)
2053 /* Since the dispersion table has been scaled down a factor
2054 * 6.0 and the repulsion a factor 12.0 to compensate for the
2055 * c6/c12 parameters inside nbfp[] being scaled up (to save
2056 * flops in kernels), we need to correct for this.
2058 tabfactor = 6.0;
2060 else
2062 tabfactor = 12.0;
2065 enersum = 0.0;
2066 virsum = 0.0;
2067 for (ri = rstart; ri < rend; ++ri)
2069 r = ri*invscale;
2070 ea = invscale3;
2071 eb = 2.0*invscale2*r;
2072 ec = invscale*r*r;
2074 pa = invscale3;
2075 pb = 3.0*invscale2*r;
2076 pc = 3.0*invscale*r*r;
2077 pd = r*r*r;
2079 /* this "8" comes from the packing of the vdwtab array - perhaps
2080 it should be defined as a named constant? */
2082 offset = 8*ri + offstart;
2083 y0 = vdwtab[offset];
2084 f = vdwtab[offset+1];
2085 g = vdwtab[offset+2];
2086 h = vdwtab[offset+3];
2088 enersum += y0*(ea/3 + eb/2 + ec) + f*(ea/4 + eb/3 + ec/2) + g*(ea/5 + eb/4 + ec/3) + h*(ea/6 + eb/5 + ec/4);
2089 virsum += f*(pa/4 + pb/3 + pc/2 + pd) + 2*g*(pa/5 + pb/4 + pc/3 + pd/2) + 3*h*(pa/6 + pb/5 + pc/4 + pd/3);
2091 *enerout = 4.0*M_PI*enersum*tabfactor;
2092 *virout = 4.0*M_PI*virsum*tabfactor;
2095 void calc_enervirdiff(FILE *fplog, int eDispCorr, t_forcerec *fr)
2097 double eners[2], virs[2], enersum, virsum;
2098 double r0, rc3, rc9;
2099 int ri0, ri1, i;
2100 real scale, *vdwtab;
2102 fr->enershiftsix = 0;
2103 fr->enershifttwelve = 0;
2104 fr->enerdiffsix = 0;
2105 fr->enerdifftwelve = 0;
2106 fr->virdiffsix = 0;
2107 fr->virdifftwelve = 0;
2109 const interaction_const_t *ic = fr->ic;
2111 if (eDispCorr != edispcNO)
2113 for (i = 0; i < 2; i++)
2115 eners[i] = 0;
2116 virs[i] = 0;
2118 if ((ic->vdw_modifier == eintmodPOTSHIFT) ||
2119 (ic->vdw_modifier == eintmodPOTSWITCH) ||
2120 (ic->vdw_modifier == eintmodFORCESWITCH) ||
2121 (ic->vdwtype == evdwSHIFT) ||
2122 (ic->vdwtype == evdwSWITCH))
2124 if (((ic->vdw_modifier == eintmodPOTSWITCH) ||
2125 (ic->vdw_modifier == eintmodFORCESWITCH) ||
2126 (ic->vdwtype == evdwSWITCH)) && ic->rvdw_switch == 0)
2128 gmx_fatal(FARGS,
2129 "With dispersion correction rvdw-switch cannot be zero "
2130 "for vdw-type = %s", evdw_names[ic->vdwtype]);
2133 /* TODO This code depends on the logic in tables.c that
2134 constructs the table layout, which should be made
2135 explicit in future cleanup. */
2136 GMX_ASSERT(fr->dispersionCorrectionTable->interaction == GMX_TABLE_INTERACTION_VDWREP_VDWDISP,
2137 "Dispersion-correction code needs a table with both repulsion and dispersion terms");
2138 scale = fr->dispersionCorrectionTable->scale;
2139 vdwtab = fr->dispersionCorrectionTable->data;
2141 /* Round the cut-offs to exact table values for precision */
2142 ri0 = static_cast<int>(std::floor(ic->rvdw_switch*scale));
2143 ri1 = static_cast<int>(std::ceil(ic->rvdw*scale));
2145 /* The code below has some support for handling force-switching, i.e.
2146 * when the force (instead of potential) is switched over a limited
2147 * region. This leads to a constant shift in the potential inside the
2148 * switching region, which we can handle by adding a constant energy
2149 * term in the force-switch case just like when we do potential-shift.
2151 * For now this is not enabled, but to keep the functionality in the
2152 * code we check separately for switch and shift. When we do force-switch
2153 * the shifting point is rvdw_switch, while it is the cutoff when we
2154 * have a classical potential-shift.
2156 * For a pure potential-shift the potential has a constant shift
2157 * all the way out to the cutoff, and that is it. For other forms
2158 * we need to calculate the constant shift up to the point where we
2159 * start modifying the potential.
2161 ri0 = (ic->vdw_modifier == eintmodPOTSHIFT) ? ri1 : ri0;
2163 r0 = ri0/scale;
2164 rc3 = r0*r0*r0;
2165 rc9 = rc3*rc3*rc3;
2167 if ((ic->vdw_modifier == eintmodFORCESWITCH) ||
2168 (ic->vdwtype == evdwSHIFT))
2170 /* Determine the constant energy shift below rvdw_switch.
2171 * The table has a scale factor, since we have scaled it down to compensate
2172 * for scaling up c6/c12 by the derivative factors, which saves flops in the analytical kernels.
2174 fr->enershiftsix = static_cast<real>(-1.0/(rc3*rc3)) - 6.0*vdwtab[8*ri0];
2175 fr->enershifttwelve = static_cast<real>( 1.0/(rc9*rc3)) - 12.0*vdwtab[8*ri0 + 4];
2177 else if (ic->vdw_modifier == eintmodPOTSHIFT)
2179 fr->enershiftsix = static_cast<real>(-1.0/(rc3*rc3));
2180 fr->enershifttwelve = static_cast<real>( 1.0/(rc9*rc3));
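/* In both cases enershiftsix/enershifttwelve are the constant difference
 * between the plain 1/r^6 (1/r^12) potential and the modified potential that
 * is actually used inside r0: for force-switch and evdwSHIFT this is the plain
 * potential at r0 minus the table value there (times 6 or 12 to undo the table
 * scaling), while for potential-shift it is simply the plain potential at the
 * cut-off.
 */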
2183 /* Add the constant part from 0 to rvdw_switch.
2184 * This integration from 0 to rvdw_switch overcounts the number
2185 * of interactions by 1, as it also counts the self interaction.
2186 * We will correct for this later.
2188 eners[0] += 4.0*M_PI*fr->enershiftsix*rc3/3.0;
2189 eners[1] += 4.0*M_PI*fr->enershifttwelve*rc3/3.0;
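/* The two terms above follow from integrating a constant offset dV over the
 * sphere r < r0:  int_0^r0 4*pi*r^2*dV dr = (4*pi/3)*r0^3*dV.
 */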
2191 /* Calculate the contribution in the range [r0,r1] where we
2192 * modify the potential. For a pure potential-shift modifier we will
2193 * have ri0==ri1, and there will not be any contribution here.
2195 for (i = 0; i < 2; i++)
2197 enersum = 0;
2198 virsum = 0;
2199 integrate_table(vdwtab, scale, (i == 0 ? 0 : 4), ri0, ri1, &enersum, &virsum);
2200 eners[i] -= enersum;
2201 virs[i] -= virsum;
2204 /* Alright: Above we compensated by REMOVING the parts outside r0
2205 * corresponding to the ideal VdW 1/r6 and 1/r12 potentials.
2207 * Regardless of whether r0 is the point where we start switching,
2208 * or the cutoff where we calculated the constant shift, we include
2209 * all the parts we are missing out to infinity from r0 by
2210 * calculating the analytical dispersion correction.
2212 eners[0] += -4.0*M_PI/(3.0*rc3);
2213 eners[1] += 4.0*M_PI/(9.0*rc9);
2214 virs[0] += 8.0*M_PI/rc3;
2215 virs[1] += -16.0*M_PI/(3.0*rc9);
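/* These are the standard analytic tail integrals for g(r) = 1 and unit
 * dispersion/repulsion constants:
 *   energy:  int_r0^inf 4*pi*r^2*(-1/r^6)       dr = -4*pi/(3*r0^3)
 *            int_r0^inf 4*pi*r^2*( 1/r^12)      dr =  4*pi/(9*r0^9)
 *   virial:  int_r0^inf 4*pi*r^3*d(-1/r^6)/dr   dr =  8*pi/r0^3
 *            int_r0^inf 4*pi*r^3*d( 1/r^12)/dr  dr = -16*pi/(3*r0^9)
 */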
2217 else if (ic->vdwtype == evdwCUT ||
2218 EVDW_PME(ic->vdwtype) ||
2219 ic->vdwtype == evdwUSER)
2221 if (ic->vdwtype == evdwUSER && fplog)
2223 fprintf(fplog,
2224 "WARNING: using dispersion correction with user tables\n");
2227 /* Note that with LJ-PME, the dispersion correction is multiplied
2228 * by the difference between the actual C6 and the C6 value
2229 * that the combination rule would produce.
2230 * This means the normal energy and virial difference formulas
2231 * can be used here.
2234 rc3 = ic->rvdw*ic->rvdw*ic->rvdw;
2235 rc9 = rc3*rc3*rc3;
2236 /* Contribution beyond the cut-off */
2237 eners[0] += -4.0*M_PI/(3.0*rc3);
2238 eners[1] += 4.0*M_PI/(9.0*rc9);
2239 if (ic->vdw_modifier == eintmodPOTSHIFT)
2241 /* Contribution within the cut-off */
2242 eners[0] += -4.0*M_PI/(3.0*rc3);
2243 eners[1] += 4.0*M_PI/(3.0*rc9);
2245 /* Contribution beyond the cut-off */
2246 virs[0] += 8.0*M_PI/rc3;
2247 virs[1] += -16.0*M_PI/(3.0*rc9);
2249 else
2251 gmx_fatal(FARGS,
2252 "Dispersion correction is not implemented for vdw-type = %s",
2253 evdw_names[ic->vdwtype]);
2256 /* When we deprecate the group kernels the code below can go too */
2257 if (ic->vdwtype == evdwPME && fr->cutoff_scheme == ecutsGROUP)
2259 /* Calculate self-interaction coefficient (assuming that
2260 * the reciprocal-space contribution is constant in the
2261 * region that contributes to the self-interaction).
2263 fr->enershiftsix = gmx::power6(ic->ewaldcoeff_lj) / 6.0;
2265 eners[0] += -gmx::power3(std::sqrt(M_PI)*ic->ewaldcoeff_lj)/3.0;
2266 virs[0] += gmx::power3(std::sqrt(M_PI)*ic->ewaldcoeff_lj);
2269 fr->enerdiffsix = eners[0];
2270 fr->enerdifftwelve = eners[1];
2271 /* The 0.5 is due to the GROMACS definition of the virial */
2272 fr->virdiffsix = 0.5*virs[0];
2273 fr->virdifftwelve = 0.5*virs[1];
2277 void calc_dispcorr(const t_inputrec *ir, const t_forcerec *fr,
2278 const matrix box, real lambda, tensor pres, tensor virial,
2279 real *prescorr, real *enercorr, real *dvdlcorr)
2281 gmx_bool bCorrAll, bCorrPres;
2282 real dvdlambda, invvol, dens, ninter, avcsix, avctwelve, enerdiff, svir = 0, spres = 0;
2283 int m;
2285 *prescorr = 0;
2286 *enercorr = 0;
2287 *dvdlcorr = 0;
2289 clear_mat(virial);
2290 clear_mat(pres);
2292 if (ir->eDispCorr != edispcNO)
2294 bCorrAll = (ir->eDispCorr == edispcAllEner ||
2295 ir->eDispCorr == edispcAllEnerPres);
2296 bCorrPres = (ir->eDispCorr == edispcEnerPres ||
2297 ir->eDispCorr == edispcAllEnerPres);
2299 invvol = 1/det(box);
2300 if (fr->n_tpi)
2302 /* Only correct for the interactions with the inserted molecule */
2303 dens = (fr->numAtomsForDispersionCorrection - fr->n_tpi)*invvol;
2304 ninter = fr->n_tpi;
2306 else
2308 dens = fr->numAtomsForDispersionCorrection*invvol;
2309 ninter = 0.5*fr->numAtomsForDispersionCorrection;
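/* dens is the average particle density and ninter the number of "half pairs"
 * to correct for (N/2, or n_tpi when only the inserted molecule is corrected),
 * so the correction assembled below,
 *   enercorr = <C6>*ninter*(dens*enerdiffsix - enershiftsix),
 * has the usual (N^2/2V)*<C6>*int 4*pi*r^2*(V_full - V_used) dr form; the
 * -enershiftsix term removes the self-interaction that the integral from r = 0
 * counts once per particle.
 */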
2312 if (ir->efep == efepNO)
2314 avcsix = fr->avcsix[0];
2315 avctwelve = fr->avctwelve[0];
2317 else
2319 avcsix = (1 - lambda)*fr->avcsix[0] + lambda*fr->avcsix[1];
2320 avctwelve = (1 - lambda)*fr->avctwelve[0] + lambda*fr->avctwelve[1];
2323 enerdiff = ninter*(dens*fr->enerdiffsix - fr->enershiftsix);
2324 *enercorr += avcsix*enerdiff;
2325 dvdlambda = 0.0;
2326 if (ir->efep != efepNO)
2328 dvdlambda += (fr->avcsix[1] - fr->avcsix[0])*enerdiff;
2330 if (bCorrAll)
2332 enerdiff = ninter*(dens*fr->enerdifftwelve - fr->enershifttwelve);
2333 *enercorr += avctwelve*enerdiff;
2334 if (fr->efep != efepNO)
2336 dvdlambda += (fr->avctwelve[1] - fr->avctwelve[0])*enerdiff;
2340 if (bCorrPres)
2342 svir = ninter*dens*avcsix*fr->virdiffsix/3.0;
2343 if (ir->eDispCorr == edispcAllEnerPres)
2345 svir += ninter*dens*avctwelve*fr->virdifftwelve/3.0;
2347 /* The factor 2 is because of the GROMACS virial definition */
2348 spres = -2.0*invvol*svir*PRESFAC;
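/* GROMACS defines the virial as Xi = -1/2 sum_i r_i (x) f_i and the pressure
 * tensor as P = (2/V)*(Ekin - Xi), so the diagonal virial correction svir
 * (the 1/3 above spreads the scalar over the three diagonal elements) shifts
 * each diagonal pressure element by -2*svir/V; PRESFAC converts from
 * kJ mol^-1 nm^-3 to bar.
 */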
2350 for (m = 0; m < DIM; m++)
2352 virial[m][m] += svir;
2353 pres[m][m] += spres;
2355 *prescorr += spres;
2358 /* We can't currently control when this prints; for now, just print when debugging */
2359 if (debug)
2361 if (bCorrAll)
2363 fprintf(debug, "Long Range LJ corr.: <C6> %10.4e, <C12> %10.4e\n",
2364 avcsix, avctwelve);
2366 if (bCorrPres)
2368 fprintf(debug,
2369 "Long Range LJ corr.: Epot %10g, Pres: %10g, Vir: %10g\n",
2370 *enercorr, spres, svir);
2372 else
2374 fprintf(debug, "Long Range LJ corr.: Epot %10g\n", *enercorr);
2378 if (fr->efep != efepNO)
2380 *dvdlcorr += dvdlambda;
2385 static void low_do_pbc_mtop(FILE *fplog, int ePBC, const matrix box,
2386 const gmx_mtop_t *mtop, rvec x[],
2387 gmx_bool bFirst)
2389 t_graph *graph;
2390 int as, mol;
2392 if (bFirst && fplog)
2394 fprintf(fplog, "Removing pbc first time\n");
2397 snew(graph, 1);
2398 as = 0;
2399 for (const gmx_molblock_t &molb : mtop->molblock)
2401 const gmx_moltype_t &moltype = mtop->moltype[molb.type];
2402 if (moltype.atoms.nr == 1 ||
2403 (!bFirst && moltype.cgs.nr == 1))
2405 /* Just one atom or charge group in the molecule, no PBC required */
2406 as += molb.nmol*moltype.atoms.nr;
2408 else
2410 mk_graph_moltype(moltype, graph);
2412 for (mol = 0; mol < molb.nmol; mol++)
2414 mk_mshift(fplog, graph, ePBC, box, x+as);
2416 shift_self(graph, box, x+as);
2417 /* The molecule is whole now.
2418 * We don't need the second mk_mshift call as in do_pbc_first,
2419 * since we no longer need this graph.
2422 as += moltype.atoms.nr;
2424 done_graph(graph);
2427 sfree(graph);
2430 void do_pbc_first_mtop(FILE *fplog, int ePBC, const matrix box,
2431 const gmx_mtop_t *mtop, rvec x[])
2433 low_do_pbc_mtop(fplog, ePBC, box, mtop, x, TRUE);
2436 void do_pbc_mtop(FILE *fplog, int ePBC, const matrix box,
2437 const gmx_mtop_t *mtop, rvec x[])
2439 low_do_pbc_mtop(fplog, ePBC, box, mtop, x, FALSE);
2442 void put_atoms_in_box_omp(int ePBC, const matrix box, gmx::ArrayRef<gmx::RVec> x)
2444 int t, nth;
2445 nth = gmx_omp_nthreads_get(emntDefault);
2447 #pragma omp parallel for num_threads(nth) schedule(static)
2448 for (t = 0; t < nth; t++)
2452 size_t natoms = x.size();
2453 size_t offset = (natoms*t )/nth;
2454 size_t len = (natoms*(t + 1))/nth - offset;
2455 put_atoms_in_box(ePBC, box, x.subArray(offset, len));
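/* Each thread handles one contiguous block of atoms; with
 *   offset = natoms*t/nth  and  len = natoms*(t + 1)/nth - offset
 * the blocks cover the whole range without gaps or overlap and differ in size
 * by at most one atom. For example, natoms = 10 and nth = 4 gives offsets
 * {0, 2, 5, 7} and lengths {2, 3, 2, 3}.
 */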
2457 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
2461 // TODO This can be cleaned up a lot, and move back to runner.cpp
2462 void finish_run(FILE *fplog, const gmx::MDLogger &mdlog, const t_commrec *cr,
2463 const t_inputrec *inputrec,
2464 t_nrnb nrnb[], gmx_wallcycle_t wcycle,
2465 gmx_walltime_accounting_t walltime_accounting,
2466 nonbonded_verlet_t *nbv,
2467 const gmx_pme_t *pme,
2468 gmx_bool bWriteStat)
2470 t_nrnb *nrnb_tot = nullptr;
2471 double delta_t = 0;
2472 double nbfs = 0, mflop = 0;
2473 double elapsed_time,
2474 elapsed_time_over_all_ranks,
2475 elapsed_time_over_all_threads,
2476 elapsed_time_over_all_threads_over_all_ranks;
2477 /* Control whether it is valid to print a report. Only the
2478 simulation master may print, but it should not do so if the run
2479 terminated e.g. before a scheduled reset step. This is
2480 complicated by the fact that PME ranks are unaware of the
2481 reason why they were sent a pmerecvqxFINISH. To avoid
2482 communication deadlocks, we always do the communication for the
2483 report, even if we've decided not to write the report, because
2484 how long it takes to finish the run is not important when we've
2485 decided not to report on the simulation performance.
2487 Further, we only report performance for dynamical integrators,
2488 because those are the only ones for which we plan to
2489 consider doing any optimizations. */
2490 bool printReport = EI_DYNAMICS(inputrec->eI) && SIMMASTER(cr);
2492 if (printReport && !walltime_accounting_get_valid_finish(walltime_accounting))
2494 GMX_LOG(mdlog.warning).asParagraph().appendText("Simulation ended prematurely, no performance report will be written.");
2495 printReport = false;
2498 if (cr->nnodes > 1)
2500 snew(nrnb_tot, 1);
2501 #if GMX_MPI
2502 MPI_Allreduce(nrnb->n, nrnb_tot->n, eNRNB, MPI_DOUBLE, MPI_SUM,
2503 cr->mpi_comm_mysim);
2504 #endif
2506 else
2508 nrnb_tot = nrnb;
2511 elapsed_time = walltime_accounting_get_time_since_reset(walltime_accounting);
2512 elapsed_time_over_all_threads = walltime_accounting_get_time_since_reset_over_all_threads(walltime_accounting);
2513 if (cr->nnodes > 1)
2515 #if GMX_MPI
2516 /* reduce elapsed_time over all MPI ranks in the current simulation */
2517 MPI_Allreduce(&elapsed_time,
2518 &elapsed_time_over_all_ranks,
2519 1, MPI_DOUBLE, MPI_SUM,
2520 cr->mpi_comm_mysim);
2521 elapsed_time_over_all_ranks /= cr->nnodes;
2522 /* Reduce elapsed_time_over_all_threads over all MPI ranks in the
2523 * current simulation. */
2524 MPI_Allreduce(&elapsed_time_over_all_threads,
2525 &elapsed_time_over_all_threads_over_all_ranks,
2526 1, MPI_DOUBLE, MPI_SUM,
2527 cr->mpi_comm_mysim);
2528 #endif
2530 else
2532 elapsed_time_over_all_ranks = elapsed_time;
2533 elapsed_time_over_all_threads_over_all_ranks = elapsed_time_over_all_threads;
2536 if (printReport)
2538 print_flop(fplog, nrnb_tot, &nbfs, &mflop);
2540 if (cr->nnodes > 1)
2542 sfree(nrnb_tot);
2545 if (thisRankHasDuty(cr, DUTY_PP) && DOMAINDECOMP(cr))
2547 print_dd_statistics(cr, inputrec, fplog);
2550 /* TODO Move the responsibility for any scaling by thread counts
2551 * to the code that handled the thread region, so that there's a
2552 * mechanism to keep cycle counting working during the transition
2553 * to task parallelism. */
2554 int nthreads_pp = gmx_omp_nthreads_get(emntNonbonded);
2555 int nthreads_pme = gmx_omp_nthreads_get(emntPME);
2556 wallcycle_scale_by_num_threads(wcycle, thisRankHasDuty(cr, DUTY_PME) && !thisRankHasDuty(cr, DUTY_PP), nthreads_pp, nthreads_pme);
2557 auto cycle_sum(wallcycle_sum(cr, wcycle));
2559 if (printReport)
2561 auto nbnxn_gpu_timings = use_GPU(nbv) ? Nbnxm::gpu_get_timings(nbv->gpu_nbv) : nullptr;
2562 gmx_wallclock_gpu_pme_t pme_gpu_timings = {};
2563 if (pme_gpu_task_enabled(pme))
2565 pme_gpu_get_timings(pme, &pme_gpu_timings);
2567 wallcycle_print(fplog, mdlog, cr->nnodes, cr->npmenodes, nthreads_pp, nthreads_pme,
2568 elapsed_time_over_all_ranks,
2569 wcycle, cycle_sum,
2570 nbnxn_gpu_timings,
2571 &pme_gpu_timings);
2573 if (EI_DYNAMICS(inputrec->eI))
2575 delta_t = inputrec->delta_t;
2578 if (fplog)
2580 print_perf(fplog, elapsed_time_over_all_threads_over_all_ranks,
2581 elapsed_time_over_all_ranks,
2582 walltime_accounting_get_nsteps_done_since_reset(walltime_accounting),
2583 delta_t, nbfs, mflop);
2585 if (bWriteStat)
2587 print_perf(stderr, elapsed_time_over_all_threads_over_all_ranks,
2588 elapsed_time_over_all_ranks,
2589 walltime_accounting_get_nsteps_done_since_reset(walltime_accounting),
2590 delta_t, nbfs, mflop);
2595 void initialize_lambdas(FILE *fplog,
2596 const t_inputrec &ir,
2597 bool isMaster,
2598 int *fep_state,
2599 gmx::ArrayRef<real> lambda,
2600 double *lam0)
2602 /* TODO: Clean up initialization of fep_state and lambda in
2603 t_state. This function works, but could probably use a logic
2604 rewrite to keep all the different types of efep straight. */
2606 if ((ir.efep == efepNO) && (!ir.bSimTemp))
2608 return;
2611 const t_lambda *fep = ir.fepvals;
2612 if (isMaster)
2614 *fep_state = fep->init_fep_state; /* this might overwrite the checkpoint
2615 value if a checkpoint is set -- a kludge is in place
2616 for now to prevent this. */
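/* Each of the efptNR lambda components gets either the scalar init_lambda
 * (when it is >= 0) or its entry in the lambda vector at init_fep_state; the
 * value is stored in lambda[] on the master rank and, when lam0 is provided,
 * in lam0[] on all ranks.
 */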
2619 for (int i = 0; i < efptNR; i++)
2621 double thisLambda;
2622 /* overwrite lambda state with init_lambda for now for backwards compatibility */
2623 if (fep->init_lambda >= 0) /* if it's -1, it was never initialized */
2625 thisLambda = fep->init_lambda;
2627 else
2629 thisLambda = fep->all_lambda[i][fep->init_fep_state];
2631 if (isMaster)
2633 lambda[i] = thisLambda;
2635 if (lam0 != nullptr)
2637 lam0[i] = thisLambda;
2640 if (ir.bSimTemp)
2642 /* need to rescale control temperatures to match current state */
2643 for (int i = 0; i < ir.opts.ngtc; i++)
2645 if (ir.opts.ref_t[i] > 0)
2647 ir.opts.ref_t[i] = ir.simtempvals->temperatures[fep->init_fep_state];
2652 /* Report the initial lambda components to the log */
2653 if (fplog != nullptr)
2655 fprintf(fplog, "Initial vector of lambda components:[ ");
2656 for (const auto &l : lambda)
2658 fprintf(fplog, "%10.4f ", l);
2660 fprintf(fplog, "]\n");