Implement assigning bondeds to the GPU
[gromacs.git] / src / gromacs / mdlib / sim_util.cpp
blob e0c6a813c8f30eb5f7f4c712b5ad6dfd749c1143
1 /*
2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5 * Copyright (c) 2001-2004, The GROMACS development team.
6 * Copyright (c) 2013,2014,2015,2016,2017,2018, by the GROMACS development team, led by
7 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
8 * and including many others, as listed in the AUTHORS file in the
9 * top-level source directory and at http://www.gromacs.org.
11 * GROMACS is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public License
13 * as published by the Free Software Foundation; either version 2.1
14 * of the License, or (at your option) any later version.
16 * GROMACS is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * Lesser General Public License for more details.
21 * You should have received a copy of the GNU Lesser General Public
22 * License along with GROMACS; if not, see
23 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
24 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
26 * If you want to redistribute modifications to GROMACS, please
27 * consider that scientific software is very special. Version
28 * control is crucial - bugs must be traceable. We will be happy to
29 * consider code for inclusion in the official distribution, but
30 * derived work must not be called official GROMACS. Details are found
31 * in the README & COPYING files - if they are missing, get the
32 * official version at http://www.gromacs.org.
34 * To help us fund GROMACS development, we humbly ask that you cite
35 * the research papers on the package. Check out http://www.gromacs.org.
37 #include "gmxpre.h"
39 #include "sim_util.h"
41 #include "config.h"
43 #include <cmath>
44 #include <cstdint>
45 #include <cstdio>
46 #include <cstring>
48 #include <array>
50 #include "gromacs/awh/awh.h"
51 #include "gromacs/domdec/dlbtiming.h"
52 #include "gromacs/domdec/domdec_struct.h"
53 #include "gromacs/domdec/partition.h"
54 #include "gromacs/essentialdynamics/edsam.h"
55 #include "gromacs/ewald/pme.h"
56 #include "gromacs/gmxlib/chargegroup.h"
57 #include "gromacs/gmxlib/network.h"
58 #include "gromacs/gmxlib/nrnb.h"
59 #include "gromacs/gmxlib/nonbonded/nb_free_energy.h"
60 #include "gromacs/gmxlib/nonbonded/nb_kernel.h"
61 #include "gromacs/gmxlib/nonbonded/nonbonded.h"
62 #include "gromacs/gpu_utils/gpu_utils.h"
63 #include "gromacs/imd/imd.h"
64 #include "gromacs/listed-forces/bonded.h"
65 #include "gromacs/listed-forces/disre.h"
66 #include "gromacs/listed-forces/manage-threading.h"
67 #include "gromacs/listed-forces/orires.h"
68 #include "gromacs/math/functions.h"
69 #include "gromacs/math/units.h"
70 #include "gromacs/math/vec.h"
71 #include "gromacs/math/vecdump.h"
72 #include "gromacs/mdlib/calcmu.h"
73 #include "gromacs/mdlib/calcvir.h"
74 #include "gromacs/mdlib/constr.h"
75 #include "gromacs/mdlib/force.h"
76 #include "gromacs/mdlib/forcerec.h"
77 #include "gromacs/mdlib/gmx_omp_nthreads.h"
78 #include "gromacs/mdlib/mdrun.h"
79 #include "gromacs/mdlib/nb_verlet.h"
80 #include "gromacs/mdlib/nbnxn_atomdata.h"
81 #include "gromacs/mdlib/nbnxn_gpu_data_mgmt.h"
82 #include "gromacs/mdlib/nbnxn_grid.h"
83 #include "gromacs/mdlib/nbnxn_search.h"
84 #include "gromacs/mdlib/qmmm.h"
85 #include "gromacs/mdlib/update.h"
86 #include "gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_gpu_ref.h"
87 #include "gromacs/mdtypes/commrec.h"
88 #include "gromacs/mdtypes/forceoutput.h"
89 #include "gromacs/mdtypes/iforceprovider.h"
90 #include "gromacs/mdtypes/inputrec.h"
91 #include "gromacs/mdtypes/md_enums.h"
92 #include "gromacs/mdtypes/state.h"
93 #include "gromacs/pbcutil/ishift.h"
94 #include "gromacs/pbcutil/mshift.h"
95 #include "gromacs/pbcutil/pbc.h"
96 #include "gromacs/pulling/pull.h"
97 #include "gromacs/pulling/pull_rotation.h"
98 #include "gromacs/timing/cyclecounter.h"
99 #include "gromacs/timing/gpu_timing.h"
100 #include "gromacs/timing/wallcycle.h"
101 #include "gromacs/timing/wallcyclereporting.h"
102 #include "gromacs/timing/walltime_accounting.h"
103 #include "gromacs/topology/topology.h"
104 #include "gromacs/utility/arrayref.h"
105 #include "gromacs/utility/basedefinitions.h"
106 #include "gromacs/utility/cstringutil.h"
107 #include "gromacs/utility/exceptions.h"
108 #include "gromacs/utility/fatalerror.h"
109 #include "gromacs/utility/gmxassert.h"
110 #include "gromacs/utility/gmxmpi.h"
111 #include "gromacs/utility/logger.h"
112 #include "gromacs/utility/pleasecite.h"
113 #include "gromacs/utility/smalloc.h"
114 #include "gromacs/utility/strconvert.h"
115 #include "gromacs/utility/sysinfo.h"
117 #include "nbnxn_gpu.h"
118 #include "nbnxn_kernels/nbnxn_kernel_cpu.h"
119 #include "nbnxn_kernels/nbnxn_kernel_prune.h"
121 // TODO: this environment variable allows us to verify before release
122 // that on less common architectures the total cost of polling is not larger than
123 // a blocking wait (so polling does not introduce overhead when the static
124 // PME-first ordering would suffice).
125 static const bool c_disableAlternatingWait = (getenv("GMX_DISABLE_ALTERNATING_GPU_WAIT") != nullptr);
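// As a usage note: only the presence of this variable is checked, so e.g. running
// "GMX_DISABLE_ALTERNATING_GPU_WAIT=1 gmx mdrun" (any value works) falls back to
// the static PME-first wait ordering mentioned above.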
128 void print_time(FILE *out,
129 gmx_walltime_accounting_t walltime_accounting,
130 int64_t step,
131 const t_inputrec *ir,
132 const t_commrec *cr)
134 time_t finish;
135 double dt, elapsed_seconds, time_per_step;
137 #if !GMX_THREAD_MPI
138 if (!PAR(cr))
139 #endif
141 fprintf(out, "\r");
143 fputs("step ", out);
144 fputs(gmx::int64ToString(step).c_str(), out);
145 fflush(out);
147 if ((step >= ir->nstlist))
149 double seconds_since_epoch = gmx_gettime();
150 elapsed_seconds = seconds_since_epoch - walltime_accounting_get_start_time_stamp(walltime_accounting);
151 time_per_step = elapsed_seconds/(step - ir->init_step + 1);
152 dt = (ir->nsteps + ir->init_step - step) * time_per_step;
154 if (ir->nsteps >= 0)
156 if (dt >= 300)
158 finish = static_cast<time_t>(seconds_since_epoch + dt);
159 auto timebuf = gmx_ctime_r(&finish);
160 fputs(", will finish ", out);
161 fputs(timebuf.c_str(), out);
163 else
165 fprintf(out, ", remaining wall clock time: %5d s ", static_cast<int>(dt));
168 else
170 fprintf(out, " performance: %.1f ns/day ",
171 ir->delta_t/1000*24*60*60/time_per_step);
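/* The expression above yields ns/day: ir->delta_t is the step size in ps, so
 * delta_t/1000 is the ns advanced per step, and 24*60*60/time_per_step is the
 * number of steps completed per day of wall time.
 */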
174 #if !GMX_THREAD_MPI
175 if (PAR(cr))
177 fprintf(out, "\n");
179 #else
180 GMX_UNUSED_VALUE(cr);
181 #endif
183 fflush(out);
186 void print_date_and_time(FILE *fplog, int nodeid, const char *title,
187 double the_time)
189 if (!fplog)
191 return;
194 time_t temp_time = static_cast<time_t>(the_time);
196 auto timebuf = gmx_ctime_r(&temp_time);
197 // Retain only the characters before the first space
198 timebuf.erase(std::find(timebuf.begin(), timebuf.end(), ' '), timebuf.end());
200 fprintf(fplog, "%s on rank %d %s\n", title, nodeid, timebuf.c_str());
203 void print_start(FILE *fplog, const t_commrec *cr,
204 gmx_walltime_accounting_t walltime_accounting,
205 const char *name)
207 char buf[STRLEN];
209 sprintf(buf, "Started %s", name);
210 print_date_and_time(fplog, cr->nodeid, buf,
211 walltime_accounting_get_start_time_stamp(walltime_accounting));
214 static void sum_forces(rvec f[], gmx::ArrayRef<const gmx::RVec> forceToAdd)
216 const int end = forceToAdd.size();
218 int gmx_unused nt = gmx_omp_nthreads_get(emntDefault);
219 #pragma omp parallel for num_threads(nt) schedule(static)
220 for (int i = 0; i < end; i++)
222 rvec_inc(f[i], forceToAdd[i]);
226 static void pme_gpu_reduce_outputs(gmx_wallcycle_t wcycle,
227 gmx::ForceWithVirial *forceWithVirial,
228 gmx::ArrayRef<const gmx::RVec> pmeForces,
229 gmx_enerdata_t *enerd,
230 const tensor vir_Q,
231 real Vlr_q)
233 wallcycle_start(wcycle, ewcPME_GPU_F_REDUCTION);
234 GMX_ASSERT(forceWithVirial, "Invalid force pointer");
235 forceWithVirial->addVirialContribution(vir_Q);
236 enerd->term[F_COUL_RECIP] += Vlr_q;
237 sum_forces(as_rvec_array(forceWithVirial->force_.data()), pmeForces);
238 wallcycle_stop(wcycle, ewcPME_GPU_F_REDUCTION);
241 static void calc_virial(int start, int homenr, const rvec x[], const rvec f[],
242 tensor vir_part, const t_graph *graph, const matrix box,
243 t_nrnb *nrnb, const t_forcerec *fr, int ePBC)
245 /* The short-range virial from surrounding boxes */
246 calc_vir(SHIFTS, fr->shift_vec, fr->fshift, vir_part, ePBC == epbcSCREW, box);
247 inc_nrnb(nrnb, eNR_VIRIAL, SHIFTS);
249 /* Calculate partial virial, for local atoms only, based on short range.
250 * Total virial is computed in global_stat, called from do_md
252 f_calc_vir(start, start+homenr, x, f, vir_part, graph, box);
253 inc_nrnb(nrnb, eNR_VIRIAL, homenr);
255 if (debug)
257 pr_rvecs(debug, 0, "vir_part", vir_part, DIM);
261 static void pull_potential_wrapper(const t_commrec *cr,
262 const t_inputrec *ir,
263 const matrix box, gmx::ArrayRef<const gmx::RVec> x,
264 gmx::ForceWithVirial *force,
265 const t_mdatoms *mdatoms,
266 gmx_enerdata_t *enerd,
267 const real *lambda,
268 double t,
269 gmx_wallcycle_t wcycle)
271 t_pbc pbc;
272 real dvdl;
274 /* Calculate the center of mass forces; this requires communication,
275 * which is why pull_potential is called close to other communication.
277 wallcycle_start(wcycle, ewcPULLPOT);
278 set_pbc(&pbc, ir->ePBC, box);
279 dvdl = 0;
280 enerd->term[F_COM_PULL] +=
281 pull_potential(ir->pull_work, mdatoms, &pbc,
282 cr, t, lambda[efptRESTRAINT], as_rvec_array(x.data()), force, &dvdl);
283 enerd->dvdl_lin[efptRESTRAINT] += dvdl;
284 wallcycle_stop(wcycle, ewcPULLPOT);
287 static void pme_receive_force_ener(const t_commrec *cr,
288 gmx::ForceWithVirial *forceWithVirial,
289 gmx_enerdata_t *enerd,
290 gmx_wallcycle_t wcycle)
292 real e_q, e_lj, dvdl_q, dvdl_lj;
293 float cycles_ppdpme, cycles_seppme;
295 cycles_ppdpme = wallcycle_stop(wcycle, ewcPPDURINGPME);
296 dd_cycles_add(cr->dd, cycles_ppdpme, ddCyclPPduringPME);
298 /* In case of node-splitting, the PP nodes receive the long-range
299 * forces, virial and energy from the PME nodes here.
301 wallcycle_start(wcycle, ewcPP_PMEWAITRECVF);
302 dvdl_q = 0;
303 dvdl_lj = 0;
304 gmx_pme_receive_f(cr, forceWithVirial, &e_q, &e_lj, &dvdl_q, &dvdl_lj,
305 &cycles_seppme);
306 enerd->term[F_COUL_RECIP] += e_q;
307 enerd->term[F_LJ_RECIP] += e_lj;
308 enerd->dvdl_lin[efptCOUL] += dvdl_q;
309 enerd->dvdl_lin[efptVDW] += dvdl_lj;
311 if (wcycle)
313 dd_cycles_add(cr->dd, cycles_seppme, ddCyclPME);
315 wallcycle_stop(wcycle, ewcPP_PMEWAITRECVF);
318 static void print_large_forces(FILE *fp,
319 const t_mdatoms *md,
320 const t_commrec *cr,
321 int64_t step,
322 real forceTolerance,
323 const rvec *x,
324 const rvec *f)
326 real force2Tolerance = gmx::square(forceTolerance);
327 gmx::index numNonFinite = 0;
328 for (int i = 0; i < md->homenr; i++)
330 real force2 = norm2(f[i]);
331 bool nonFinite = !std::isfinite(force2);
332 if (force2 >= force2Tolerance || nonFinite)
334 fprintf(fp, "step %" PRId64 " atom %6d x %8.3f %8.3f %8.3f force %12.5e\n",
335 step,
336 ddglatnr(cr->dd, i), x[i][XX], x[i][YY], x[i][ZZ], std::sqrt(force2));
338 if (nonFinite)
340 numNonFinite++;
343 if (numNonFinite > 0)
345 /* Note that with MPI this fatal call on one rank might interrupt
346 * the printing on other ranks. But we can only avoid that with
347 * an expensive MPI barrier that we would need at each step.
349 gmx_fatal(FARGS, "At step %" PRId64 " detected non-finite forces on %td atoms", step, numNonFinite);
353 static void post_process_forces(const t_commrec *cr,
354 int64_t step,
355 t_nrnb *nrnb,
356 gmx_wallcycle_t wcycle,
357 const gmx_localtop_t *top,
358 const matrix box,
359 const rvec x[],
360 rvec f[],
361 gmx::ForceWithVirial *forceWithVirial,
362 tensor vir_force,
363 const t_mdatoms *mdatoms,
364 const t_graph *graph,
365 const t_forcerec *fr,
366 const gmx_vsite_t *vsite,
367 int flags)
369 if (fr->haveDirectVirialContributions)
371 rvec *fDirectVir = as_rvec_array(forceWithVirial->force_.data());
373 if (vsite)
375 /* Spread the mesh force on virtual sites to the other particles...
376 * This is parallelized. MPI communication is performed
377 * if the constructing atoms aren't local.
379 matrix virial = { { 0 } };
380 spread_vsite_f(vsite, x, fDirectVir, nullptr,
381 (flags & GMX_FORCE_VIRIAL) != 0, virial,
382 nrnb,
383 &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr, wcycle);
384 forceWithVirial->addVirialContribution(virial);
387 if (flags & GMX_FORCE_VIRIAL)
389 /* Now add the forces, this is local */
390 sum_forces(f, forceWithVirial->force_);
392 /* Add the direct virial contributions */
393 GMX_ASSERT(forceWithVirial->computeVirial_, "forceWithVirial should request virial computation when we request the virial");
394 m_add(vir_force, forceWithVirial->getVirial(), vir_force);
396 if (debug)
398 pr_rvecs(debug, 0, "vir_force", vir_force, DIM);
403 if (fr->print_force >= 0)
405 print_large_forces(stderr, mdatoms, cr, step, fr->print_force, x, f);
409 static void do_nb_verlet(const t_forcerec *fr,
410 const interaction_const_t *ic,
411 gmx_enerdata_t *enerd,
412 int flags, int ilocality,
413 int clearF,
414 int64_t step,
415 t_nrnb *nrnb,
416 gmx_wallcycle_t wcycle)
418 if (!(flags & GMX_FORCE_NONBONDED))
420 /* skip non-bonded calculation */
421 return;
424 nonbonded_verlet_t *nbv = fr->nbv;
425 nonbonded_verlet_group_t *nbvg = &nbv->grp[ilocality];
427 /* GPU kernel launch overhead is already timed separately */
428 if (fr->cutoff_scheme != ecutsVERLET)
430 gmx_incons("Invalid cut-off scheme passed!");
433 bool bUsingGpuKernels = (nbvg->kernel_type == nbnxnk8x8x8_GPU);
435 if (!bUsingGpuKernels)
437 /* When dynamic pair-list pruning is requested, we need to prune
438 * at nstlistPrune steps.
440 if (nbv->listParams->useDynamicPruning &&
441 (step - nbvg->nbl_lists.outerListCreationStep) % nbv->listParams->nstlistPrune == 0)
443 /* Prune the pair-list beyond fr->ic->rlistPrune using
444 * the current coordinates of the atoms.
446 wallcycle_sub_start(wcycle, ewcsNONBONDED_PRUNING);
447 nbnxn_kernel_cpu_prune(nbvg, nbv->nbat, fr->shift_vec, nbv->listParams->rlistInner);
448 wallcycle_sub_stop(wcycle, ewcsNONBONDED_PRUNING);
451 wallcycle_sub_start(wcycle, ewcsNONBONDED);
454 switch (nbvg->kernel_type)
456 case nbnxnk4x4_PlainC:
457 case nbnxnk4xN_SIMD_4xN:
458 case nbnxnk4xN_SIMD_2xNN:
459 nbnxn_kernel_cpu(nbvg,
460 nbv->nbat,
462 fr->shift_vec,
463 flags,
464 clearF,
465 fr->fshift[0],
466 enerd->grpp.ener[egCOULSR],
467 fr->bBHAM ?
468 enerd->grpp.ener[egBHAMSR] :
469 enerd->grpp.ener[egLJSR]);
470 break;
472 case nbnxnk8x8x8_GPU:
473 nbnxn_gpu_launch_kernel(nbv->gpu_nbv, nbv->nbat, flags, ilocality);
474 break;
476 case nbnxnk8x8x8_PlainC:
477 nbnxn_kernel_gpu_ref(nbvg->nbl_lists.nbl[0],
478 nbv->nbat, ic,
479 fr->shift_vec,
480 flags,
481 clearF,
482 nbv->nbat->out[0].f,
483 fr->fshift[0],
484 enerd->grpp.ener[egCOULSR],
485 fr->bBHAM ?
486 enerd->grpp.ener[egBHAMSR] :
487 enerd->grpp.ener[egLJSR]);
488 break;
490 default:
491 GMX_RELEASE_ASSERT(false, "Invalid nonbonded kernel type passed!");
494 if (!bUsingGpuKernels)
496 wallcycle_sub_stop(wcycle, ewcsNONBONDED);
499 int enr_nbnxn_kernel_ljc, enr_nbnxn_kernel_lj;
500 if (EEL_RF(ic->eeltype) || ic->eeltype == eelCUT)
502 enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_RF;
504 else if ((!bUsingGpuKernels && nbvg->ewald_excl == ewaldexclAnalytical) ||
505 (bUsingGpuKernels && nbnxn_gpu_is_kernel_ewald_analytical(nbv->gpu_nbv)))
507 enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_EWALD;
509 else
511 enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_TAB;
513 enr_nbnxn_kernel_lj = eNR_NBNXN_LJ;
514 if (flags & GMX_FORCE_ENERGY)
516 /* In eNR_??? the nbnxn F+E kernels are always the F kernel + 1 */
517 enr_nbnxn_kernel_ljc += 1;
518 enr_nbnxn_kernel_lj += 1;
521 inc_nrnb(nrnb, enr_nbnxn_kernel_ljc,
522 nbvg->nbl_lists.natpair_ljq);
523 inc_nrnb(nrnb, enr_nbnxn_kernel_lj,
524 nbvg->nbl_lists.natpair_lj);
525 /* The Coulomb-only kernels are offset -eNR_NBNXN_LJ_RF+eNR_NBNXN_RF */
526 inc_nrnb(nrnb, enr_nbnxn_kernel_ljc-eNR_NBNXN_LJ_RF+eNR_NBNXN_RF,
527 nbvg->nbl_lists.natpair_q);
529 if (ic->vdw_modifier == eintmodFORCESWITCH)
531 /* We add up the switch cost separately */
532 inc_nrnb(nrnb, eNR_NBNXN_ADD_LJ_FSW+((flags & GMX_FORCE_ENERGY) ? 1 : 0),
533 nbvg->nbl_lists.natpair_ljq + nbvg->nbl_lists.natpair_lj);
535 if (ic->vdw_modifier == eintmodPOTSWITCH)
537 /* We add up the switch cost separately */
538 inc_nrnb(nrnb, eNR_NBNXN_ADD_LJ_PSW+((flags & GMX_FORCE_ENERGY) ? 1 : 0),
539 nbvg->nbl_lists.natpair_ljq + nbvg->nbl_lists.natpair_lj);
541 if (ic->vdwtype == evdwPME)
543 /* We add up the LJ Ewald cost separately */
544 inc_nrnb(nrnb, eNR_NBNXN_ADD_LJ_EWALD+((flags & GMX_FORCE_ENERGY) ? 1 : 0),
545 nbvg->nbl_lists.natpair_ljq + nbvg->nbl_lists.natpair_lj);
549 static void do_nb_verlet_fep(nbnxn_pairlist_set_t *nbl_lists,
550 t_forcerec *fr,
551 rvec x[],
552 rvec f[],
553 const t_mdatoms *mdatoms,
554 t_lambda *fepvals,
555 real *lambda,
556 gmx_enerdata_t *enerd,
557 int flags,
558 t_nrnb *nrnb,
559 gmx_wallcycle_t wcycle)
561 int donb_flags;
562 nb_kernel_data_t kernel_data;
563 real lam_i[efptNR];
564 real dvdl_nb[efptNR];
565 int th;
566 int i, j;
568 donb_flags = 0;
569 /* Add short-range interactions */
570 donb_flags |= GMX_NONBONDED_DO_SR;
572 /* Currently all group scheme kernels always calculate (shift-)forces */
573 if (flags & GMX_FORCE_FORCES)
575 donb_flags |= GMX_NONBONDED_DO_FORCE;
577 if (flags & GMX_FORCE_VIRIAL)
579 donb_flags |= GMX_NONBONDED_DO_SHIFTFORCE;
581 if (flags & GMX_FORCE_ENERGY)
583 donb_flags |= GMX_NONBONDED_DO_POTENTIAL;
586 kernel_data.flags = donb_flags;
587 kernel_data.lambda = lambda;
588 kernel_data.dvdl = dvdl_nb;
590 kernel_data.energygrp_elec = enerd->grpp.ener[egCOULSR];
591 kernel_data.energygrp_vdw = enerd->grpp.ener[egLJSR];
593 /* reset free energy components */
594 for (i = 0; i < efptNR; i++)
596 dvdl_nb[i] = 0;
599 GMX_ASSERT(gmx_omp_nthreads_get(emntNonbonded) == nbl_lists->nnbl, "Number of lists should be same as number of NB threads");
601 wallcycle_sub_start(wcycle, ewcsNONBONDED);
602 #pragma omp parallel for schedule(static) num_threads(nbl_lists->nnbl)
603 for (th = 0; th < nbl_lists->nnbl; th++)
607 gmx_nb_free_energy_kernel(nbl_lists->nbl_fep[th],
608 x, f, fr, mdatoms, &kernel_data, nrnb);
610 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
613 if (fepvals->sc_alpha != 0)
615 enerd->dvdl_nonlin[efptVDW] += dvdl_nb[efptVDW];
616 enerd->dvdl_nonlin[efptCOUL] += dvdl_nb[efptCOUL];
618 else
620 enerd->dvdl_lin[efptVDW] += dvdl_nb[efptVDW];
621 enerd->dvdl_lin[efptCOUL] += dvdl_nb[efptCOUL];
624 /* If we do foreign lambda and we have soft-core interactions
625 * we have to recalculate the (non-linear) energy contributions.
627 if (fepvals->n_lambda > 0 && (flags & GMX_FORCE_DHDL) && fepvals->sc_alpha != 0)
629 kernel_data.flags = (donb_flags & ~(GMX_NONBONDED_DO_FORCE | GMX_NONBONDED_DO_SHIFTFORCE)) | GMX_NONBONDED_DO_FOREIGNLAMBDA;
630 kernel_data.lambda = lam_i;
631 kernel_data.energygrp_elec = enerd->foreign_grpp.ener[egCOULSR];
632 kernel_data.energygrp_vdw = enerd->foreign_grpp.ener[egLJSR];
633 /* Note that we add to kernel_data.dvdl, but ignore the result */
635 for (i = 0; i < enerd->n_lambda; i++)
637 for (j = 0; j < efptNR; j++)
639 lam_i[j] = (i == 0 ? lambda[j] : fepvals->all_lambda[j][i-1]);
641 reset_foreign_enerdata(enerd);
642 #pragma omp parallel for schedule(static) num_threads(nbl_lists->nnbl)
643 for (th = 0; th < nbl_lists->nnbl; th++)
647 gmx_nb_free_energy_kernel(nbl_lists->nbl_fep[th],
648 x, f, fr, mdatoms, &kernel_data, nrnb);
650 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
653 sum_epot(&(enerd->foreign_grpp), enerd->foreign_term);
654 enerd->enerpart_lambda[i] += enerd->foreign_term[F_EPOT];
658 wallcycle_sub_stop(wcycle, ewcsNONBONDED);
661 gmx_bool use_GPU(const nonbonded_verlet_t *nbv)
663 return nbv != nullptr && nbv->bUseGPU;
666 static inline void clear_rvecs_omp(int n, rvec v[])
668 int nth = gmx_omp_nthreads_get_simple_rvec_task(emntDefault, n);
670 /* Note that we would like to avoid this conditional by putting it
671 * into the omp pragma instead, but then we still take the full
672 * omp parallel for overhead (at least with gcc5).
674 if (nth == 1)
676 for (int i = 0; i < n; i++)
678 clear_rvec(v[i]);
681 else
683 #pragma omp parallel for num_threads(nth) schedule(static)
684 for (int i = 0; i < n; i++)
686 clear_rvec(v[i]);
691 /*! \brief Return an estimate of the average kinetic energy or 0 when unreliable
693 * \param groupOptions Group options, containing T-coupling options
695 static real averageKineticEnergyEstimate(const t_grpopts &groupOptions)
697 real nrdfCoupled = 0;
698 real nrdfUncoupled = 0;
699 real kineticEnergy = 0;
700 for (int g = 0; g < groupOptions.ngtc; g++)
702 if (groupOptions.tau_t[g] >= 0)
704 nrdfCoupled += groupOptions.nrdf[g];
705 kineticEnergy += groupOptions.nrdf[g]*0.5*groupOptions.ref_t[g]*BOLTZ;
707 else
709 nrdfUncoupled += groupOptions.nrdf[g];
713 /* This conditional with > also catches nrdf=0 */
714 if (nrdfCoupled > nrdfUncoupled)
716 return kineticEnergy*(nrdfCoupled + nrdfUncoupled)/nrdfCoupled;
718 else
720 return 0;
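/* As a sketch of the estimate above: each coupled group g contributes the
 * equipartition value 0.5*nrdf_g*kB*Tref_g (BOLTZ is kB in GROMACS units), and
 * the sum is scaled by (nrdfCoupled + nrdfUncoupled)/nrdfCoupled on the
 * assumption that uncoupled degrees of freedom carry a similar amount of
 * kinetic energy per degree of freedom.
 */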
724 /*! \brief This routine checks that the potential energy is finite.
726 * Always checks that the potential energy is finite. If step equals
727 * inputrec.init_step also checks that the magnitude of the potential energy
728 * is reasonable. Terminates with a fatal error when a check fails.
729 * Note that passing this check does not guarantee finite forces,
730 * since those use slightly different arithmetic. But in most cases
731 * there is just a narrow coordinate range where forces are not finite
732 * and energies are finite.
734 * \param[in] step The step number, used for checking and printing
735 * \param[in] enerd The energy data; the non-bonded group energies need to be added to enerd.term[F_EPOT] before calling this routine
736 * \param[in] inputrec The input record
738 static void checkPotentialEnergyValidity(int64_t step,
739 const gmx_enerdata_t &enerd,
740 const t_inputrec &inputrec)
742 /* Threshold valid for comparing absolute potential energy against
743 * the kinetic energy. Normally one should not consider absolute
744 * potential energy values, but with a factor of one million
745 * we should never get false positives.
747 constexpr real c_thresholdFactor = 1e6;
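/* To give a rough sense of scale (assuming nrdf ~ 3N and ignoring constraints):
 * a 100000-atom system coupled at 300 K has an average kinetic energy of about
 * 0.5*300000*0.0083145*300 ~ 3.7e5 kJ/mol, so the magnitude check below only
 * triggers for potential energies beyond roughly 3.7e11 kJ/mol.
 */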
749 bool energyIsNotFinite = !std::isfinite(enerd.term[F_EPOT]);
750 real averageKineticEnergy = 0;
751 /* We only check for large potential energy at the initial step,
752 * because that is by far the most likely step for this to occur
753 * and because computing the average kinetic energy is not free.
754 * Note: nstcalcenergy >> 1 often does not allow catching large energies
755 * before they become NaN.
757 if (step == inputrec.init_step && EI_DYNAMICS(inputrec.eI))
759 averageKineticEnergy = averageKineticEnergyEstimate(inputrec.opts);
762 if (energyIsNotFinite || (averageKineticEnergy > 0 &&
763 enerd.term[F_EPOT] > c_thresholdFactor*averageKineticEnergy))
765 gmx_fatal(FARGS, "Step %" PRId64 ": The total potential energy is %g, which is %s. The LJ and electrostatic contributions to the energy are %g and %g, respectively. A %s potential energy can be caused by overlapping interactions in bonded interactions or very large%s coordinate values. Usually this is caused by a badly- or non-equilibrated initial configuration, incorrect interactions or parameters in the topology.",
766 step,
767 enerd.term[F_EPOT],
768 energyIsNotFinite ? "not finite" : "extremely high",
769 enerd.term[F_LJ],
770 enerd.term[F_COUL_SR],
771 energyIsNotFinite ? "non-finite" : "very high",
772 energyIsNotFinite ? " or NaN" : "");
776 /*! \brief Compute forces and/or energies for special algorithms
778 * The intention is to collect all calls to algorithms that compute
779 * forces on local atoms only and that do not contribute to the local
780 * virial sum (but add their virial contribution separately).
781 * Eventually these should likely all become ForceProviders.
782 * Within this function the intention is to have algorithms that do
783 * global communication at the end, so global barriers within the MD loop
784 * are as close together as possible.
786 * \param[in] fplog The log file
787 * \param[in] cr The communication record
788 * \param[in] inputrec The input record
789 * \param[in] awh The Awh module (nullptr if none in use).
790 * \param[in] enforcedRotation Enforced rotation module.
791 * \param[in] step The current MD step
792 * \param[in] t The current time
793 * \param[in,out] wcycle Wallcycle accounting struct
794 * \param[in,out] forceProviders Pointer to a list of force providers
795 * \param[in] box The unit cell
796 * \param[in] x The coordinates
797 * \param[in] mdatoms Per atom properties
798 * \param[in] lambda Array of free-energy lambda values
799 * \param[in] forceFlags Flags that tell whether we should compute forces/energies/virial
800 * \param[in,out] forceWithVirial Force and virial buffers
801 * \param[in,out] enerd Energy buffer
802 * \param[in,out] ed Essential dynamics pointer
803 * \param[in] bNS Tells if we did neighbor searching this step, used for ED sampling
805 * \todo Remove bNS, which is used incorrectly.
806 * \todo Convert all other algorithms called here to ForceProviders.
808 static void
809 computeSpecialForces(FILE *fplog,
810 const t_commrec *cr,
811 const t_inputrec *inputrec,
812 gmx::Awh *awh,
813 gmx_enfrot *enforcedRotation,
814 int64_t step,
815 double t,
816 gmx_wallcycle_t wcycle,
817 ForceProviders *forceProviders,
818 matrix box,
819 gmx::ArrayRef<const gmx::RVec> x,
820 const t_mdatoms *mdatoms,
821 real *lambda,
822 int forceFlags,
823 gmx::ForceWithVirial *forceWithVirial,
824 gmx_enerdata_t *enerd,
825 gmx_edsam *ed,
826 gmx_bool bNS)
828 const bool computeForces = (forceFlags & GMX_FORCE_FORCES) != 0;
830 /* NOTE: Currently all ForceProviders only provide forces.
831 * When they also provide energies, remove this conditional.
833 if (computeForces)
835 gmx::ForceProviderInput forceProviderInput(x, *mdatoms, t, box, *cr);
836 gmx::ForceProviderOutput forceProviderOutput(forceWithVirial, enerd);
838 /* Collect forces from modules */
839 forceProviders->calculateForces(forceProviderInput, &forceProviderOutput);
842 if (inputrec->bPull && pull_have_potential(inputrec->pull_work))
844 pull_potential_wrapper(cr, inputrec, box, x,
845 forceWithVirial,
846 mdatoms, enerd, lambda, t,
847 wcycle);
849 if (awh)
851 enerd->term[F_COM_PULL] +=
852 awh->applyBiasForcesAndUpdateBias(inputrec->ePBC, *mdatoms, box,
853 forceWithVirial,
854 t, step, wcycle, fplog);
858 rvec *f = as_rvec_array(forceWithVirial->force_.data());
860 /* Add the forces from enforced rotation potentials (if any) */
861 if (inputrec->bRot)
863 wallcycle_start(wcycle, ewcROTadd);
864 enerd->term[F_COM_PULL] += add_rot_forces(enforcedRotation, f, cr, step, t);
865 wallcycle_stop(wcycle, ewcROTadd);
868 if (ed)
870 /* Note that since init_edsam() is called after the initialization
871 * of forcerec, edsam doesn't request the noVirSum force buffer.
872 * Thus if no other algorithm (e.g. PME) requires it, the forces
873 * here will contribute to the virial.
875 do_flood(cr, inputrec, as_rvec_array(x.data()), f, ed, box, step, bNS);
878 /* Add forces from interactive molecular dynamics (IMD), if bIMD == TRUE. */
879 if (inputrec->bIMD && computeForces)
881 IMD_apply_forces(inputrec->bIMD, inputrec->imd, cr, f, wcycle);
885 /*! \brief Launch the prepare_step and spread stages of PME GPU.
887 * \param[in] pmedata The PME structure
888 * \param[in] box The box matrix
889 * \param[in] x Coordinate array
890 * \param[in] flags Force flags
891 * \param[in] wcycle The wallcycle structure
893 static inline void launchPmeGpuSpread(gmx_pme_t *pmedata,
894 matrix box,
895 rvec x[],
896 int flags,
897 gmx_wallcycle_t wcycle)
899 int pmeFlags = GMX_PME_SPREAD | GMX_PME_SOLVE;
900 pmeFlags |= (flags & GMX_FORCE_FORCES) ? GMX_PME_CALC_F : 0;
901 pmeFlags |= (flags & GMX_FORCE_VIRIAL) ? GMX_PME_CALC_ENER_VIR : 0;
903 pme_gpu_prepare_computation(pmedata, (flags & GMX_FORCE_DYNAMICBOX) != 0, box, wcycle, pmeFlags);
904 pme_gpu_launch_spread(pmedata, x, wcycle);
907 /*! \brief Launch the FFT and gather stages of PME GPU
909 * This function only implements setting the output forces (no accumulation).
911 * \param[in] pmedata The PME structure
912 * \param[in] wcycle The wallcycle structure
914 static void launchPmeGpuFftAndGather(gmx_pme_t *pmedata,
915 gmx_wallcycle_t wcycle)
917 pme_gpu_launch_complex_transforms(pmedata, wcycle);
918 pme_gpu_launch_gather(pmedata, wcycle, PmeForceOutputHandling::Set);
921 /*! \brief
922 * Polling wait for either of the PME or nonbonded GPU tasks.
924 * Instead of a static order in waiting for GPU tasks, this function
925 * polls checking which of the two tasks completes first, and does the
926 * associated force buffer reduction overlapped with the other task.
927 * By doing that, unlike static scheduling order, it can always overlap
928 * one of the reductions, regardless of the GPU task completion order.
930 * \param[in] nbv Nonbonded verlet structure
931 * \param[in] pmedata PME module data
932 * \param[in,out] force Force array to reduce task outputs into.
933 * \param[in,out] forceWithVirial Force and virial buffers
934 * \param[in,out] fshift Shift force output vector results are reduced into
935 * \param[in,out] enerd Energy data structure results are reduced into
936 * \param[in] flags Force flags
937 * \param[in] wcycle The wallcycle structure
939 static void alternatePmeNbGpuWaitReduce(nonbonded_verlet_t *nbv,
940 const gmx_pme_t *pmedata,
941 gmx::PaddedArrayRef<gmx::RVec> *force,
942 gmx::ForceWithVirial *forceWithVirial,
943 rvec fshift[],
944 gmx_enerdata_t *enerd,
945 int flags,
946 gmx_wallcycle_t wcycle)
948 bool isPmeGpuDone = false;
949 bool isNbGpuDone = false;
952 gmx::ArrayRef<const gmx::RVec> pmeGpuForces;
954 while (!isPmeGpuDone || !isNbGpuDone)
956 if (!isPmeGpuDone)
958 matrix vir_Q;
959 real Vlr_q;
961 GpuTaskCompletion completionType = (isNbGpuDone) ? GpuTaskCompletion::Wait : GpuTaskCompletion::Check;
962 isPmeGpuDone = pme_gpu_try_finish_task(pmedata, wcycle, &pmeGpuForces,
963 vir_Q, &Vlr_q, completionType);
965 if (isPmeGpuDone)
967 pme_gpu_reduce_outputs(wcycle, forceWithVirial, pmeGpuForces,
968 enerd, vir_Q, Vlr_q);
972 if (!isNbGpuDone)
974 GpuTaskCompletion completionType = (isPmeGpuDone) ? GpuTaskCompletion::Wait : GpuTaskCompletion::Check;
975 wallcycle_start_nocount(wcycle, ewcWAIT_GPU_NB_L);
976 isNbGpuDone = nbnxn_gpu_try_finish_task(nbv->gpu_nbv,
977 flags, eatLocal,
978 enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
979 fshift, completionType);
980 wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
981 // To get the call count right, when the task finished we
982 // issue a start/stop.
983 // TODO: move the ewcWAIT_GPU_NB_L cycle counting into nbnxn_gpu_try_finish_task()
984 // and ewcNB_XF_BUF_OPS counting into nbnxn_atomdata_add_nbat_f_to_f().
985 if (isNbGpuDone)
987 wallcycle_start(wcycle, ewcWAIT_GPU_NB_L);
988 wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
990 nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), eatLocal,
991 nbv->nbat, as_rvec_array(force->data()), wcycle);
997 /*! \brief
998 * Launch the dynamic rolling pruning GPU task.
1000 * We currently alternate local/non-local list pruning in odd-even steps
1001 * (only pruning every second step without DD).
1003 * \param[in] cr The communication record
1004 * \param[in] nbv Nonbonded verlet structure
1005 * \param[in] inputrec The input record
1006 * \param[in] step The current MD step
1008 static inline void launchGpuRollingPruning(const t_commrec *cr,
1009 const nonbonded_verlet_t *nbv,
1010 const t_inputrec *inputrec,
1011 const int64_t step)
1013 /* We should not launch the rolling pruning kernel at a search
1014 * step or just before search steps, since that's useless.
1015 * Without domain decomposition we prune at even steps.
1016 * With domain decomposition we alternate local and non-local
1017 * pruning at even and odd steps.
1019 int numRollingParts = nbv->listParams->numRollingParts;
1020 GMX_ASSERT(numRollingParts == nbv->listParams->nstlistPrune/2, "Since we alternate local/non-local at even/odd steps, we need numRollingParts<=nstlistPrune/2 for correctness and == for efficiency");
1021 int stepWithCurrentList = step - nbv->grp[eintLocal].nbl_lists.outerListCreationStep;
1022 bool stepIsEven = ((stepWithCurrentList & 1) == 0);
1023 if (stepWithCurrentList > 0 &&
1024 stepWithCurrentList < inputrec->nstlist - 1 &&
1025 (stepIsEven || DOMAINDECOMP(cr)))
1027 nbnxn_gpu_launch_kernel_pruneonly(nbv->gpu_nbv,
1028 stepIsEven ? eintLocal : eintNonlocal,
1029 numRollingParts);
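/* As an illustration of the schedule above: with nstlist = 10 and no domain
 * decomposition, pruning kernels are launched at stepWithCurrentList = 2, 4, 6
 * and 8 (local list only); with domain decomposition they are launched at every
 * stepWithCurrentList in 1..8, local at even and non-local at odd values.
 */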
1033 static void do_force_cutsVERLET(FILE *fplog,
1034 const t_commrec *cr,
1035 const gmx_multisim_t *ms,
1036 const t_inputrec *inputrec,
1037 gmx::Awh *awh,
1038 gmx_enfrot *enforcedRotation,
1039 int64_t step,
1040 t_nrnb *nrnb,
1041 gmx_wallcycle_t wcycle,
1042 const gmx_localtop_t *top,
1043 const gmx_groups_t * /* groups */,
1044 matrix box, gmx::PaddedArrayRef<gmx::RVec> x,
1045 history_t *hist,
1046 gmx::PaddedArrayRef<gmx::RVec> force,
1047 tensor vir_force,
1048 const t_mdatoms *mdatoms,
1049 gmx_enerdata_t *enerd, t_fcdata *fcd,
1050 real *lambda,
1051 t_graph *graph,
1052 t_forcerec *fr,
1053 interaction_const_t *ic,
1054 const gmx_vsite_t *vsite,
1055 rvec mu_tot,
1056 double t,
1057 gmx_edsam *ed,
1058 int flags,
1059 DdOpenBalanceRegionBeforeForceComputation ddOpenBalanceRegion,
1060 DdCloseBalanceRegionAfterForceComputation ddCloseBalanceRegion)
1062 int cg1, i, j;
1063 double mu[2*DIM];
1064 gmx_bool bStateChanged, bNS, bFillGrid, bCalcCGCM;
1065 gmx_bool bDoForces, bUseGPU, bUseOrEmulGPU;
1066 rvec vzero, box_diag;
1067 float cycles_pme, cycles_wait_gpu;
1068 nonbonded_verlet_t *nbv = fr->nbv;
1070 bStateChanged = ((flags & GMX_FORCE_STATECHANGED) != 0);
1071 bNS = ((flags & GMX_FORCE_NS) != 0) && (!fr->bAllvsAll);
1072 bFillGrid = (bNS && bStateChanged);
1073 bCalcCGCM = (bFillGrid && !DOMAINDECOMP(cr));
1074 bDoForces = ((flags & GMX_FORCE_FORCES) != 0);
1075 bUseGPU = fr->nbv->bUseGPU;
1076 bUseOrEmulGPU = bUseGPU || (fr->nbv->emulateGpu == EmulateGpuNonbonded::Yes);
1078 const auto pmeRunMode = fr->pmedata ? pme_run_mode(fr->pmedata) : PmeRunMode::CPU;
1079 // TODO slim this conditional down - inputrec and duty checks should mean the same in proper code!
1080 const bool useGpuPme = EEL_PME(fr->ic->eeltype) && thisRankHasDuty(cr, DUTY_PME) &&
1081 ((pmeRunMode == PmeRunMode::GPU) || (pmeRunMode == PmeRunMode::Mixed));
1083 /* At a search step we need to start the first balancing region
1084 * somewhere early inside the step after communication during domain
1085 * decomposition (and not during the previous step as usual).
1087 if (bNS &&
1088 ddOpenBalanceRegion == DdOpenBalanceRegionBeforeForceComputation::yes)
1090 ddOpenBalanceRegionCpu(cr->dd, DdAllowBalanceRegionReopen::yes);
1093 cycles_wait_gpu = 0;
1095 const int start = 0;
1096 const int homenr = mdatoms->homenr;
1098 clear_mat(vir_force);
1100 if (DOMAINDECOMP(cr))
1102 cg1 = cr->dd->globalAtomGroupIndices.size();
1104 else
1106 cg1 = top->cgs.nr;
1108 if (fr->n_tpi > 0)
1110 cg1--;
1113 if (bStateChanged)
1115 update_forcerec(fr, box);
1117 if (inputrecNeedMutot(inputrec))
1119 /* Calculate total (local) dipole moment in a temporary common array.
1120 * This makes it possible to sum them over nodes faster.
1122 calc_mu(start, homenr,
1123 x, mdatoms->chargeA, mdatoms->chargeB, mdatoms->nChargePerturbed,
1124 mu, mu+DIM);
1128 if (fr->ePBC != epbcNONE)
1130 /* Compute shift vectors every step,
1131 * because of pressure coupling or box deformation!
1133 if ((flags & GMX_FORCE_DYNAMICBOX) && bStateChanged)
1135 calc_shifts(box, fr->shift_vec);
1138 if (bCalcCGCM)
1140 put_atoms_in_box_omp(fr->ePBC, box, x.subArray(0, homenr));
1141 inc_nrnb(nrnb, eNR_SHIFTX, homenr);
1143 else if (EI_ENERGY_MINIMIZATION(inputrec->eI) && graph)
1145 unshift_self(graph, box, as_rvec_array(x.data()));
1149 nbnxn_atomdata_copy_shiftvec((flags & GMX_FORCE_DYNAMICBOX) != 0,
1150 fr->shift_vec, nbv->nbat);
1152 #if GMX_MPI
1153 if (!thisRankHasDuty(cr, DUTY_PME))
1155 /* Send particle coordinates to the pme nodes.
1156 * Since this is only implemented for domain decomposition
1157 * and domain decomposition does not use the graph,
1158 * we do not need to worry about shifting.
1160 gmx_pme_send_coordinates(cr, box, as_rvec_array(x.data()),
1161 lambda[efptCOUL], lambda[efptVDW],
1162 (flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY)) != 0,
1163 step, wcycle);
1165 #endif /* GMX_MPI */
1167 if (useGpuPme)
1169 launchPmeGpuSpread(fr->pmedata, box, as_rvec_array(x.data()), flags, wcycle);
1172 /* do gridding for pair search */
1173 if (bNS)
1175 if (graph && bStateChanged)
1177 /* Calculate intramolecular shift vectors to make molecules whole */
1178 mk_mshift(fplog, graph, fr->ePBC, box, as_rvec_array(x.data()));
1181 clear_rvec(vzero);
1182 box_diag[XX] = box[XX][XX];
1183 box_diag[YY] = box[YY][YY];
1184 box_diag[ZZ] = box[ZZ][ZZ];
1186 wallcycle_start(wcycle, ewcNS);
1187 if (!DOMAINDECOMP(cr))
1189 wallcycle_sub_start(wcycle, ewcsNBS_GRID_LOCAL);
1190 nbnxn_put_on_grid(nbv->nbs.get(), fr->ePBC, box,
1191 0, vzero, box_diag,
1192 nullptr, 0, mdatoms->homenr, -1,
1193 fr->cginfo, x,
1194 0, nullptr,
1195 nbv->grp[eintLocal].kernel_type,
1196 nbv->nbat);
1197 wallcycle_sub_stop(wcycle, ewcsNBS_GRID_LOCAL);
1199 else
1201 wallcycle_sub_start(wcycle, ewcsNBS_GRID_NONLOCAL);
1202 nbnxn_put_on_grid_nonlocal(nbv->nbs.get(), domdec_zones(cr->dd),
1203 fr->cginfo, x,
1204 nbv->grp[eintNonlocal].kernel_type,
1205 nbv->nbat);
1206 wallcycle_sub_stop(wcycle, ewcsNBS_GRID_NONLOCAL);
1209 nbnxn_atomdata_set(nbv->nbat, nbv->nbs.get(), mdatoms, fr->cginfo);
1211 /* Now we put all atoms on the grid, we can assign bonded interactions
1212 * to the GPU, where the grid order is needed.
1214 if (fr->gpuBondedLists)
1216 assign_bondeds_to_gpu(fr->gpuBondedLists,
1217 nbnxn_get_gridindices(fr->nbv->nbs.get()),
1218 top->idef);
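/* Note: nbnxn_get_gridindices() provides the grid-ordered atom indices from the
 * pair-search structure, so the bonded assignment can follow the same atom
 * layout that the GPU nonbonded kernels use (the "grid order" referred to in
 * the comment above).
 */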
1221 wallcycle_stop(wcycle, ewcNS);
1224 /* initialize the GPU atom data and copy shift vector */
1225 if (bUseGPU)
1227 wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
1228 wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1230 if (bNS)
1232 nbnxn_gpu_init_atomdata(nbv->gpu_nbv, nbv->nbat);
1235 nbnxn_gpu_upload_shiftvec(nbv->gpu_nbv, nbv->nbat);
1237 wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1238 wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1241 /* do local pair search */
1242 if (bNS)
1244 wallcycle_start_nocount(wcycle, ewcNS);
1245 wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_LOCAL);
1246 nbnxn_make_pairlist(nbv->nbs.get(), nbv->nbat,
1247 &top->excls,
1248 nbv->listParams->rlistOuter,
1249 nbv->min_ci_balanced,
1250 &nbv->grp[eintLocal].nbl_lists,
1251 eintLocal,
1252 nbv->grp[eintLocal].kernel_type,
1253 nrnb);
1254 nbv->grp[eintLocal].nbl_lists.outerListCreationStep = step;
1255 if (nbv->listParams->useDynamicPruning && !bUseGPU)
1257 nbnxnPrepareListForDynamicPruning(&nbv->grp[eintLocal].nbl_lists);
1259 wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_LOCAL);
1261 if (bUseGPU)
1263 /* initialize local pair-list on the GPU */
1264 nbnxn_gpu_init_pairlist(nbv->gpu_nbv,
1265 nbv->grp[eintLocal].nbl_lists.nbl[0],
1266 eintLocal);
1268 wallcycle_stop(wcycle, ewcNS);
1270 else
1272 nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs.get(), eatLocal, FALSE, as_rvec_array(x.data()),
1273 nbv->nbat, wcycle);
1276 if (bUseGPU)
1278 if (DOMAINDECOMP(cr))
1280 ddOpenBalanceRegionGpu(cr->dd);
1283 wallcycle_start(wcycle, ewcLAUNCH_GPU);
1284 wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1285 /* launch local nonbonded work on GPU */
1286 do_nb_verlet(fr, ic, enerd, flags, eintLocal, enbvClearFNo,
1287 step, nrnb, wcycle);
1288 wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1289 wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1292 if (useGpuPme)
1294 // In PME GPU and mixed mode we launch FFT / gather after the
1295 // X copy/transform to allow overlap as well as after the GPU NB
1296 // launch to avoid FFT launch overhead hijacking the CPU and delaying
1297 // the nonbonded kernel.
1298 launchPmeGpuFftAndGather(fr->pmedata, wcycle);
1301 /* Communicate coordinates and sum dipole if necessary +
1302 do non-local pair search */
1303 if (DOMAINDECOMP(cr))
1305 if (bNS)
1307 wallcycle_start_nocount(wcycle, ewcNS);
1308 wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_NONLOCAL);
1310 nbnxn_make_pairlist(nbv->nbs.get(), nbv->nbat,
1311 &top->excls,
1312 nbv->listParams->rlistOuter,
1313 nbv->min_ci_balanced,
1314 &nbv->grp[eintNonlocal].nbl_lists,
1315 eintNonlocal,
1316 nbv->grp[eintNonlocal].kernel_type,
1317 nrnb);
1318 nbv->grp[eintNonlocal].nbl_lists.outerListCreationStep = step;
1319 if (nbv->listParams->useDynamicPruning && !bUseGPU)
1321 nbnxnPrepareListForDynamicPruning(&nbv->grp[eintNonlocal].nbl_lists);
1323 wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_NONLOCAL);
1325 if (nbv->grp[eintNonlocal].kernel_type == nbnxnk8x8x8_GPU)
1327 /* initialize non-local pair-list on the GPU */
1328 nbnxn_gpu_init_pairlist(nbv->gpu_nbv,
1329 nbv->grp[eintNonlocal].nbl_lists.nbl[0],
1330 eintNonlocal);
1332 wallcycle_stop(wcycle, ewcNS);
1334 else
1336 dd_move_x(cr->dd, box, x, wcycle);
1338 nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs.get(), eatNonlocal, FALSE, as_rvec_array(x.data()),
1339 nbv->nbat, wcycle);
1342 if (bUseGPU)
1344 wallcycle_start(wcycle, ewcLAUNCH_GPU);
1345 wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1346 /* launch non-local nonbonded tasks on GPU */
1347 do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFNo,
1348 step, nrnb, wcycle);
1349 wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1350 wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1354 if (bUseGPU)
1356 /* launch D2H copy-back F */
1357 wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
1358 wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1359 if (DOMAINDECOMP(cr))
1361 nbnxn_gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat,
1362 flags, eatNonlocal);
1364 nbnxn_gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat,
1365 flags, eatLocal);
1366 wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1367 wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1370 if (bStateChanged && inputrecNeedMutot(inputrec))
1372 if (PAR(cr))
1374 gmx_sumd(2*DIM, mu, cr);
1375 ddReopenBalanceRegionCpu(cr->dd);
1378 for (i = 0; i < 2; i++)
1380 for (j = 0; j < DIM; j++)
1382 fr->mu_tot[i][j] = mu[i*DIM + j];
1386 if (fr->efep == efepNO)
1388 copy_rvec(fr->mu_tot[0], mu_tot);
1390 else
1392 for (j = 0; j < DIM; j++)
1394 mu_tot[j] =
1395 (1.0 - lambda[efptCOUL])*fr->mu_tot[0][j] +
1396 lambda[efptCOUL]*fr->mu_tot[1][j];
1400 /* Reset energies */
1401 reset_enerdata(enerd);
1402 clear_rvecs(SHIFTS, fr->fshift);
1404 if (DOMAINDECOMP(cr) && !thisRankHasDuty(cr, DUTY_PME))
1406 wallcycle_start(wcycle, ewcPPDURINGPME);
1407 dd_force_flop_start(cr->dd, nrnb);
1410 if (inputrec->bRot)
1412 wallcycle_start(wcycle, ewcROT);
1413 do_rotation(cr, enforcedRotation, box, as_rvec_array(x.data()), t, step, bNS);
1414 wallcycle_stop(wcycle, ewcROT);
1417 /* Temporary solution until all routines take PaddedRVecVector */
1418 rvec *const f = as_rvec_array(force.data());
1420 /* Start the force cycle counter.
1421 * Note that a different counter is used for dynamic load balancing.
1423 wallcycle_start(wcycle, ewcFORCE);
1425 gmx::ArrayRef<gmx::RVec> forceRef = force;
1426 if (bDoForces)
1428 /* If we need to compute the virial, we might need a separate
1429 * force buffer for algorithms for which the virial is calculated
1430 * directly, such as PME.
1432 if ((flags & GMX_FORCE_VIRIAL) && fr->haveDirectVirialContributions)
1434 forceRef = *fr->forceBufferForDirectVirialContributions;
1436 /* TODO: update comment
1437 * We only compute forces on local atoms. Note that vsites can
1438 * spread to non-local atoms, but that part of the buffer is
1439 * cleared separately in the vsite spreading code.
1441 clear_rvecs_omp(forceRef.size(), as_rvec_array(forceRef.data()));
1443 /* Clear the short- and long-range forces */
1444 clear_rvecs_omp(fr->natoms_force_constr, f);
1447 /* forceWithVirial uses the local atom range only */
1448 gmx::ForceWithVirial forceWithVirial(forceRef, (flags & GMX_FORCE_VIRIAL) != 0);
1450 if (inputrec->bPull && pull_have_constraint(inputrec->pull_work))
1452 clear_pull_forces(inputrec->pull_work);
1455 /* We calculate the non-bonded forces, when done on the CPU, here.
1456 * We do this before calling do_force_lowlevel, because in that
1457 * function, the listed forces are calculated before PME, which
1458 * does communication. With this order, non-bonded and listed
1459 * force calculation imbalance can be balanced out by the domain
1460 * decomposition load balancing.
1463 if (!bUseOrEmulGPU)
1465 do_nb_verlet(fr, ic, enerd, flags, eintLocal, enbvClearFYes,
1466 step, nrnb, wcycle);
1469 if (fr->efep != efepNO)
1471 /* Calculate the local and non-local free energy interactions here.
1472 * Happens here on the CPU both with and without GPU.
1474 if (fr->nbv->grp[eintLocal].nbl_lists.nbl_fep[0]->nrj > 0)
1476 do_nb_verlet_fep(&fr->nbv->grp[eintLocal].nbl_lists,
1477 fr, as_rvec_array(x.data()), f, mdatoms,
1478 inputrec->fepvals, lambda,
1479 enerd, flags, nrnb, wcycle);
1482 if (DOMAINDECOMP(cr) &&
1483 fr->nbv->grp[eintNonlocal].nbl_lists.nbl_fep[0]->nrj > 0)
1485 do_nb_verlet_fep(&fr->nbv->grp[eintNonlocal].nbl_lists,
1486 fr, as_rvec_array(x.data()), f, mdatoms,
1487 inputrec->fepvals, lambda,
1488 enerd, flags, nrnb, wcycle);
1492 if (!bUseOrEmulGPU)
1494 int aloc;
1496 if (DOMAINDECOMP(cr))
1498 do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFNo,
1499 step, nrnb, wcycle);
1502 if (!bUseOrEmulGPU)
1504 aloc = eintLocal;
1506 else
1508 aloc = eintNonlocal;
1511 /* Add all the non-bonded force to the normal force array.
1512 * This can be split into a local and a non-local part when overlapping
1513 * communication with calculation with domain decomposition.
1515 wallcycle_stop(wcycle, ewcFORCE);
1517 nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), eatAll, nbv->nbat, f, wcycle);
1519 wallcycle_start_nocount(wcycle, ewcFORCE);
1521 /* if there are multiple fshift output buffers reduce them */
1522 if ((flags & GMX_FORCE_VIRIAL) &&
1523 nbv->grp[aloc].nbl_lists.nnbl > 1)
1525 /* This is not in a subcounter because it takes a
1526 negligible and constant-sized amount of time */
1527 nbnxn_atomdata_add_nbat_fshift_to_fshift(nbv->nbat,
1528 fr->fshift);
1532 /* update QMMMrec, if necessary */
1533 if (fr->bQMMM)
1535 update_QMMMrec(cr, fr, as_rvec_array(x.data()), mdatoms, box);
1538 /* Compute the bonded and non-bonded energies and optionally forces */
1539 do_force_lowlevel(fr, inputrec, &(top->idef),
1540 cr, ms, nrnb, wcycle, mdatoms,
1541 as_rvec_array(x.data()), hist, f, &forceWithVirial, enerd, fcd,
1542 box, inputrec->fepvals, lambda, graph, &(top->excls), fr->mu_tot,
1543 flags, &cycles_pme);
1545 wallcycle_stop(wcycle, ewcFORCE);
1547 computeSpecialForces(fplog, cr, inputrec, awh, enforcedRotation,
1548 step, t, wcycle,
1549 fr->forceProviders, box, x, mdatoms, lambda,
1550 flags, &forceWithVirial, enerd,
1551 ed, bNS);
1553 if (bUseOrEmulGPU)
1555 /* wait for non-local forces (or calculate in emulation mode) */
1556 if (DOMAINDECOMP(cr))
1558 if (bUseGPU)
1560 wallcycle_start(wcycle, ewcWAIT_GPU_NB_NL);
1561 nbnxn_gpu_wait_finish_task(nbv->gpu_nbv,
1562 flags, eatNonlocal,
1563 enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
1564 fr->fshift);
1565 cycles_wait_gpu += wallcycle_stop(wcycle, ewcWAIT_GPU_NB_NL);
1567 else
1569 wallcycle_start_nocount(wcycle, ewcFORCE);
1570 do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFYes,
1571 step, nrnb, wcycle);
1572 wallcycle_stop(wcycle, ewcFORCE);
1575 /* skip the reduction if there was no non-local work to do */
1576 if (nbv->grp[eintNonlocal].nbl_lists.nbl[0]->nsci > 0)
1578 nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), eatNonlocal,
1579 nbv->nbat, f, wcycle);
1584 if (DOMAINDECOMP(cr))
1586 /* We are done with the CPU compute.
1587 * We will now communicate the non-local forces.
1588 * If we use a GPU this will overlap with GPU work, so in that case
1589 * we do not close the DD force balancing region here.
1591 if (ddCloseBalanceRegion == DdCloseBalanceRegionAfterForceComputation::yes)
1593 ddCloseBalanceRegionCpu(cr->dd);
1595 if (bDoForces)
1597 dd_move_f(cr->dd, force, fr->fshift, wcycle);
1601 // With both nonbonded and PME offloaded to a GPU on the same rank, we use
1602 // an alternating wait/reduction scheme.
1603 bool alternateGpuWait = (!c_disableAlternatingWait && useGpuPme && bUseGPU && !DOMAINDECOMP(cr));
1604 if (alternateGpuWait)
1606 alternatePmeNbGpuWaitReduce(fr->nbv, fr->pmedata, &force, &forceWithVirial, fr->fshift, enerd, flags, wcycle);
1609 if (!alternateGpuWait && useGpuPme)
1611 gmx::ArrayRef<const gmx::RVec> pmeGpuForces;
1612 matrix vir_Q;
1613 real Vlr_q = 0.0;
1614 pme_gpu_wait_finish_task(fr->pmedata, wcycle, &pmeGpuForces, vir_Q, &Vlr_q);
1615 pme_gpu_reduce_outputs(wcycle, &forceWithVirial, pmeGpuForces, enerd, vir_Q, Vlr_q);
1618 /* Wait for local GPU NB outputs on the non-alternating wait path */
1619 if (!alternateGpuWait && bUseGPU)
1621 /* Measured overhead on CUDA and OpenCL with(out) GPU sharing
1622 * is between 0.5 and 1.5 Mcycles. So 2 MCycles is an overestimate,
1623 * but even with a step of 0.1 ms the difference is less than 1%
1624 * of the step time.
1626 const float gpuWaitApiOverheadMargin = 2e6f; /* cycles */
1628 wallcycle_start(wcycle, ewcWAIT_GPU_NB_L);
1629 nbnxn_gpu_wait_finish_task(nbv->gpu_nbv,
1630 flags, eatLocal,
1631 enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
1632 fr->fshift);
1633 float cycles_tmp = wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
1635 if (ddCloseBalanceRegion == DdCloseBalanceRegionAfterForceComputation::yes)
1637 DdBalanceRegionWaitedForGpu waitedForGpu = DdBalanceRegionWaitedForGpu::yes;
1638 if (bDoForces && cycles_tmp <= gpuWaitApiOverheadMargin)
1640 /* We measured few cycles; it could be that the kernel
1641 * and transfer finished earlier and there was no actual
1642 * wait time, only API call overhead.
1643 * Then the actual time could be anywhere between 0 and
1644 * cycles_wait_est. We will use half of cycles_wait_est.
1646 waitedForGpu = DdBalanceRegionWaitedForGpu::no;
1648 ddCloseBalanceRegionGpu(cr->dd, cycles_wait_gpu, waitedForGpu);
1652 if (fr->nbv->emulateGpu == EmulateGpuNonbonded::Yes)
1654 // NOTE: emulation kernel is not included in the balancing region,
1655 // but emulation mode does not target performance anyway
1656 wallcycle_start_nocount(wcycle, ewcFORCE);
1657 do_nb_verlet(fr, ic, enerd, flags, eintLocal,
1658 DOMAINDECOMP(cr) ? enbvClearFNo : enbvClearFYes,
1659 step, nrnb, wcycle);
1660 wallcycle_stop(wcycle, ewcFORCE);
1663 if (useGpuPme)
1665 pme_gpu_reinit_computation(fr->pmedata, wcycle);
1668 if (bUseGPU)
1670 /* now clear the GPU outputs while we finish the step on the CPU */
1671 wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
1672 wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1673 nbnxn_gpu_clear_outputs(nbv->gpu_nbv, flags);
1675 /* Is dynamic pair-list pruning activated? */
1676 if (nbv->listParams->useDynamicPruning)
1678 launchGpuRollingPruning(cr, nbv, inputrec, step);
1680 wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1681 wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1684 /* Do the nonbonded GPU (or emulation) force buffer reduction
1685 * on the non-alternating path. */
1686 if (bUseOrEmulGPU && !alternateGpuWait)
1688 nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), eatLocal,
1689 nbv->nbat, f, wcycle);
1692 if (DOMAINDECOMP(cr))
1694 dd_force_flop_stop(cr->dd, nrnb);
1697 if (bDoForces)
1699 /* If we have NoVirSum forces, but we do not calculate the virial,
1700 * we sum fr->f_novirsum=f later.
1702 if (vsite && !(fr->haveDirectVirialContributions && !(flags & GMX_FORCE_VIRIAL)))
1704 spread_vsite_f(vsite, as_rvec_array(x.data()), f, fr->fshift, FALSE, nullptr, nrnb,
1705 &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr, wcycle);
1708 if (flags & GMX_FORCE_VIRIAL)
1710 /* Calculation of the virial must be done after vsites! */
1711 calc_virial(0, mdatoms->homenr, as_rvec_array(x.data()), f,
1712 vir_force, graph, box, nrnb, fr, inputrec->ePBC);
1716 if (PAR(cr) && !thisRankHasDuty(cr, DUTY_PME))
1718 /* In case of node-splitting, the PP nodes receive the long-range
1719 * forces, virial and energy from the PME nodes here.
1721 pme_receive_force_ener(cr, &forceWithVirial, enerd, wcycle);
1724 if (bDoForces)
1726 post_process_forces(cr, step, nrnb, wcycle,
1727 top, box, as_rvec_array(x.data()), f, &forceWithVirial,
1728 vir_force, mdatoms, graph, fr, vsite,
1729 flags);
1732 if (flags & GMX_FORCE_ENERGY)
1734 /* Sum the potential energy terms from group contributions */
1735 sum_epot(&(enerd->grpp), enerd->term);
1737 if (!EI_TPI(inputrec->eI))
1739 checkPotentialEnergyValidity(step, *enerd, *inputrec);
1744 static void do_force_cutsGROUP(FILE *fplog,
1745 const t_commrec *cr,
1746 const gmx_multisim_t *ms,
1747 const t_inputrec *inputrec,
1748 gmx::Awh *awh,
1749 gmx_enfrot *enforcedRotation,
1750 int64_t step,
1751 t_nrnb *nrnb,
1752 gmx_wallcycle_t wcycle,
1753 gmx_localtop_t *top,
1754 const gmx_groups_t *groups,
1755 matrix box, gmx::PaddedArrayRef<gmx::RVec> x,
1756 history_t *hist,
1757 gmx::PaddedArrayRef<gmx::RVec> force,
1758 tensor vir_force,
1759 const t_mdatoms *mdatoms,
1760 gmx_enerdata_t *enerd,
1761 t_fcdata *fcd,
1762 real *lambda,
1763 t_graph *graph,
1764 t_forcerec *fr,
1765 const gmx_vsite_t *vsite,
1766 rvec mu_tot,
1767 double t,
1768 gmx_edsam *ed,
1769 int flags,
1770 DdOpenBalanceRegionBeforeForceComputation ddOpenBalanceRegion,
1771 DdCloseBalanceRegionAfterForceComputation ddCloseBalanceRegion)
1773 int cg0, cg1, i, j;
1774 double mu[2*DIM];
1775 gmx_bool bStateChanged, bNS, bFillGrid, bCalcCGCM;
1776 gmx_bool bDoForces;
1777 float cycles_pme;
1779 const int start = 0;
1780 const int homenr = mdatoms->homenr;
1782 clear_mat(vir_force);
1784 cg0 = 0;
1785 if (DOMAINDECOMP(cr))
1787 cg1 = cr->dd->globalAtomGroupIndices.size();
1789 else
1791 cg1 = top->cgs.nr;
1793 if (fr->n_tpi > 0)
1795 cg1--;
1798 bStateChanged = ((flags & GMX_FORCE_STATECHANGED) != 0);
1799 bNS = ((flags & GMX_FORCE_NS) != 0) && (!fr->bAllvsAll);
1800 /* Should we perform the long-range nonbonded evaluation inside the neighborsearching? */
1801 bFillGrid = (bNS && bStateChanged);
1802 bCalcCGCM = (bFillGrid && !DOMAINDECOMP(cr));
1803 bDoForces = ((flags & GMX_FORCE_FORCES) != 0);
1805 if (bStateChanged)
1807 update_forcerec(fr, box);
1809 if (inputrecNeedMutot(inputrec))
1811 /* Calculate total (local) dipole moment in a temporary common array.
1812 * This makes it possible to sum them over nodes faster.
1814 calc_mu(start, homenr,
1815 x, mdatoms->chargeA, mdatoms->chargeB, mdatoms->nChargePerturbed,
1816 mu, mu+DIM);
1820 if (fr->ePBC != epbcNONE)
1822 /* Compute shift vectors every step,
1823 * because of pressure coupling or box deformation!
1825 if ((flags & GMX_FORCE_DYNAMICBOX) && bStateChanged)
1827 calc_shifts(box, fr->shift_vec);
1830 if (bCalcCGCM)
1832 put_charge_groups_in_box(fplog, cg0, cg1, fr->ePBC, box,
1833 &(top->cgs), as_rvec_array(x.data()), fr->cg_cm);
1834 inc_nrnb(nrnb, eNR_CGCM, homenr);
1835 inc_nrnb(nrnb, eNR_RESETX, cg1-cg0);
1837 else if (EI_ENERGY_MINIMIZATION(inputrec->eI) && graph)
1839 unshift_self(graph, box, as_rvec_array(x.data()));
1842 else if (bCalcCGCM)
1844 calc_cgcm(fplog, cg0, cg1, &(top->cgs), as_rvec_array(x.data()), fr->cg_cm);
1845 inc_nrnb(nrnb, eNR_CGCM, homenr);
1848 if (bCalcCGCM && gmx_debug_at)
1850 pr_rvecs(debug, 0, "cgcm", fr->cg_cm, top->cgs.nr);
1853 #if GMX_MPI
1854 if (!thisRankHasDuty(cr, DUTY_PME))
1856 /* Send particle coordinates to the pme nodes.
1857 * Since this is only implemented for domain decomposition
1858 * and domain decomposition does not use the graph,
1859 * we do not need to worry about shifting.
1861 gmx_pme_send_coordinates(cr, box, as_rvec_array(x.data()),
1862 lambda[efptCOUL], lambda[efptVDW],
1863 (flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY)) != 0,
1864 step, wcycle);
1866 #endif /* GMX_MPI */
1868 /* Communicate coordinates and sum dipole if necessary */
1869 if (DOMAINDECOMP(cr))
1871 dd_move_x(cr->dd, box, x, wcycle);
1873 /* No GPU support, no move_x overlap, so reopen the balance region here */
1874 if (ddOpenBalanceRegion == DdOpenBalanceRegionBeforeForceComputation::yes)
1876 ddReopenBalanceRegionCpu(cr->dd);
1880 if (inputrecNeedMutot(inputrec))
1882 if (bStateChanged)
1884 if (PAR(cr))
1886 gmx_sumd(2*DIM, mu, cr);
1887 ddReopenBalanceRegionCpu(cr->dd);
1889 for (i = 0; i < 2; i++)
1891 for (j = 0; j < DIM; j++)
1893 fr->mu_tot[i][j] = mu[i*DIM + j];
1897 if (fr->efep == efepNO)
1899 copy_rvec(fr->mu_tot[0], mu_tot);
1901 else
1903 for (j = 0; j < DIM; j++)
1905 mu_tot[j] =
1906 (1.0 - lambda[efptCOUL])*fr->mu_tot[0][j] + lambda[efptCOUL]*fr->mu_tot[1][j];
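/* Only the Coulomb lambda enters the interpolation: the dipole depends only on
 * the (perturbed) charges, so mu_tot is a linear mix of the A- and B-state
 * dipoles weighted by lambda[efptCOUL].
 */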
1911 /* Reset energies */
1912 reset_enerdata(enerd);
1913 clear_rvecs(SHIFTS, fr->fshift);
1915 if (bNS)
1917 wallcycle_start(wcycle, ewcNS);
1919 if (graph && bStateChanged)
1921 /* Calculate intramolecular shift vectors to make molecules whole */
1922 mk_mshift(fplog, graph, fr->ePBC, box, as_rvec_array(x.data()));
1925 /* Do the actual neighbour searching */
1926 ns(fplog, fr, box,
1927 groups, top, mdatoms,
1928 cr, nrnb, bFillGrid);
1930 wallcycle_stop(wcycle, ewcNS);
1933 if (DOMAINDECOMP(cr) && !thisRankHasDuty(cr, DUTY_PME))
1935 wallcycle_start(wcycle, ewcPPDURINGPME);
1936 dd_force_flop_start(cr->dd, nrnb);
1939 if (inputrec->bRot)
1941 wallcycle_start(wcycle, ewcROT);
1942 do_rotation(cr, enforcedRotation, box, as_rvec_array(x.data()), t, step, bNS);
1943 wallcycle_stop(wcycle, ewcROT);
1946 /* Temporary solution until all routines take PaddedRVecVector */
1947 rvec *f = as_rvec_array(force.data());
1949 /* Start the force cycle counter.
1950 * Note that a different counter is used for dynamic load balancing.
1952 wallcycle_start(wcycle, ewcFORCE);
1954 gmx::ArrayRef<gmx::RVec> forceRef = force;
1955 if (bDoForces)
1957 /* If we need to compute the virial, we might need a separate
1958 * force buffer for algorithms for which the virial is calculated
1959 * separately, such as PME.
1961 if ((flags & GMX_FORCE_VIRIAL) && fr->haveDirectVirialContributions)
1963 forceRef = *fr->forceBufferForDirectVirialContributions;
1965 clear_rvecs_omp(forceRef.size(), as_rvec_array(forceRef.data()));
1968 /* Clear the short- and long-range forces */
1969 clear_rvecs(fr->natoms_force_constr, f);
1972 /* forceWithVirial might need the full force atom range */
1973 gmx::ForceWithVirial forceWithVirial(forceRef, (flags & GMX_FORCE_VIRIAL) != 0);
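/* forceWithVirial wraps whichever buffer forceRef currently refers to: the main
 * force array, or the separate direct-virial-contribution buffer selected above
 * when GMX_FORCE_VIRIAL is requested. The second constructor argument records
 * whether these contributions should enter the virial.
 */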
1975 if (inputrec->bPull && pull_have_constraint(inputrec->pull_work))
1977 clear_pull_forces(inputrec->pull_work);
1980 /* update QMMMrec, if necessary */
1981 if (fr->bQMMM)
1983 update_QMMMrec(cr, fr, as_rvec_array(x.data()), mdatoms, box);
1986 /* Compute the bonded and non-bonded energies and optionally forces */
1987 do_force_lowlevel(fr, inputrec, &(top->idef),
1988 cr, ms, nrnb, wcycle, mdatoms,
1989 as_rvec_array(x.data()), hist, f, &forceWithVirial, enerd, fcd,
1990 box, inputrec->fepvals, lambda,
1991 graph, &(top->excls), fr->mu_tot,
1992 flags,
1993 &cycles_pme);
1995 wallcycle_stop(wcycle, ewcFORCE);
1997 if (DOMAINDECOMP(cr))
1999 dd_force_flop_stop(cr->dd, nrnb);
2001 if (ddCloseBalanceRegion == DdCloseBalanceRegionAfterForceComputation::yes)
2003 ddCloseBalanceRegionCpu(cr->dd);
2007 computeSpecialForces(fplog, cr, inputrec, awh, enforcedRotation,
2008 step, t, wcycle,
2009 fr->forceProviders, box, x, mdatoms, lambda,
2010 flags, &forceWithVirial, enerd,
2011 ed, bNS);
2013 if (bDoForces)
2015 /* Communicate the forces */
2016 if (DOMAINDECOMP(cr))
2018 dd_move_f(cr->dd, force, fr->fshift, wcycle);
2019 /* Do we need to communicate the separate force array
2020 * for terms that do not contribute to the single sum virial?
2021 * Position restraints and electric fields do not introduce
2022 * inter-cg forces, only full electrostatics methods do.
2023 * When we do not calculate the virial, fr->f_novirsum = f,
2024 * so we have already communicated these forces.
2026 if (EEL_FULL(fr->ic->eeltype) && cr->dd->n_intercg_excl &&
2027 (flags & GMX_FORCE_VIRIAL))
2029 dd_move_f(cr->dd, forceWithVirial.force_, nullptr, wcycle);
2033 /* If we have NoVirSum forces, but we do not calculate the virial,
2034 * we sum fr->f_novirsum=f later.
2036 if (vsite && !(fr->haveDirectVirialContributions && !(flags & GMX_FORCE_VIRIAL)))
2038 spread_vsite_f(vsite, as_rvec_array(x.data()), f, fr->fshift, FALSE, nullptr, nrnb,
2039 &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr, wcycle);
2042 if (flags & GMX_FORCE_VIRIAL)
2044 /* Calculation of the virial must be done after vsites! */
2045 calc_virial(0, mdatoms->homenr, as_rvec_array(x.data()), f,
2046 vir_force, graph, box, nrnb, fr, inputrec->ePBC);
2050 if (PAR(cr) && !thisRankHasDuty(cr, DUTY_PME))
2052 /* In case of node-splitting, the PP nodes receive the long-range
2053 * forces, virial and energy from the PME nodes here.
2055 pme_receive_force_ener(cr, &forceWithVirial, enerd, wcycle);
2058 if (bDoForces)
2060 post_process_forces(cr, step, nrnb, wcycle,
2061 top, box, as_rvec_array(x.data()), f, &forceWithVirial,
2062 vir_force, mdatoms, graph, fr, vsite,
2063 flags);
2066 if (flags & GMX_FORCE_ENERGY)
2068 /* Sum the potential energy terms from group contributions */
2069 sum_epot(&(enerd->grpp), enerd->term);
2071 if (!EI_TPI(inputrec->eI))
2073 checkPotentialEnergyValidity(step, *enerd, *inputrec);
2079 void do_force(FILE *fplog,
2080 const t_commrec *cr,
2081 const gmx_multisim_t *ms,
2082 const t_inputrec *inputrec,
2083 gmx::Awh *awh,
2084 gmx_enfrot *enforcedRotation,
2085 int64_t step,
2086 t_nrnb *nrnb,
2087 gmx_wallcycle_t wcycle,
2088 gmx_localtop_t *top,
2089 const gmx_groups_t *groups,
2090 matrix box,
2091 gmx::PaddedArrayRef<gmx::RVec> x,
2092 history_t *hist,
2093 gmx::PaddedArrayRef<gmx::RVec> force,
2094 tensor vir_force,
2095 const t_mdatoms *mdatoms,
2096 gmx_enerdata_t *enerd,
2097 t_fcdata *fcd,
2098 gmx::ArrayRef<real> lambda,
2099 t_graph *graph,
2100 t_forcerec *fr,
2101 const gmx_vsite_t *vsite,
2102 rvec mu_tot,
2103 double t,
2104 gmx_edsam *ed,
2105 int flags,
2106 DdOpenBalanceRegionBeforeForceComputation ddOpenBalanceRegion,
2107 DdCloseBalanceRegionAfterForceComputation ddCloseBalanceRegion)
2109 /* modify force flag if not doing nonbonded */
2110 if (!fr->bNonbonded)
2112 flags &= ~GMX_FORCE_NONBONDED;
2115 switch (inputrec->cutoff_scheme)
2117 case ecutsVERLET:
2118 do_force_cutsVERLET(fplog, cr, ms, inputrec,
2119 awh, enforcedRotation, step, nrnb, wcycle,
2120 top,
2121 groups,
2122 box, x, hist,
2123 force, vir_force,
2124 mdatoms,
2125 enerd, fcd,
2126 lambda.data(), graph,
2127 fr, fr->ic,
2128 vsite, mu_tot,
2129 t, ed,
2130 flags,
2131 ddOpenBalanceRegion,
2132 ddCloseBalanceRegion);
2133 break;
2134 case ecutsGROUP:
2135 do_force_cutsGROUP(fplog, cr, ms, inputrec,
2136 awh, enforcedRotation, step, nrnb, wcycle,
2137 top,
2138 groups,
2139 box, x, hist,
2140 force, vir_force,
2141 mdatoms,
2142 enerd, fcd,
2143 lambda.data(), graph,
2144 fr, vsite, mu_tot,
2145 t, ed,
2146 flags,
2147 ddOpenBalanceRegion,
2148 ddCloseBalanceRegion);
2149 break;
2150 default:
2151 gmx_incons("Invalid cut-off scheme passed!");
2154 /* In case we don't have constraints and are using GPUs, the next balancing
2155 * region starts here.
2156 * Some "special" work at the end of do_force_cuts?, such as vsite spread,
2157 * virial calculation and COM pulling, is thus not included in
2158 * the balance timing, which is ok as most tasks do communication.
2160 if (ddOpenBalanceRegion == DdOpenBalanceRegionBeforeForceComputation::yes)
2162 ddOpenBalanceRegionCpu(cr->dd, DdAllowBalanceRegionReopen::no);
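/* For orientation only, a minimal sketch of how a caller in the MD loop might
 * assemble the flags bitmask before calling do_force; bNS, bCalcVir and bCalcEner
 * stand for whatever step-type booleans the caller keeps, and the exact
 * combinations used by the integrators live in the callers (e.g. md.cpp), not here:
 *
 *   int flags = GMX_FORCE_STATECHANGED | GMX_FORCE_FORCES
 *               | (bNS       ? GMX_FORCE_NS     : 0)
 *               | (bCalcVir  ? GMX_FORCE_VIRIAL : 0)
 *               | (bCalcEner ? GMX_FORCE_ENERGY : 0);
 *   do_force(fplog, cr, ms, inputrec, awh, enforcedRotation, step, nrnb, wcycle,
 *            top, groups, box, x, hist, force, vir_force, mdatoms, enerd, fcd,
 *            lambda, graph, fr, vsite, mu_tot, t, ed, flags,
 *            ddOpenBalanceRegion, ddCloseBalanceRegion);
 */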
2167 void do_constrain_first(FILE *fplog, gmx::Constraints *constr,
2168 const t_inputrec *ir, const t_mdatoms *md,
2169 t_state *state)
2171 int i, m, start, end;
2172 int64_t step;
2173 real dt = ir->delta_t;
2174 real dvdl_dum;
2175 rvec *savex;
2177 /* We need to allocate one element extra, since we might use
2178 * (unaligned) 4-wide SIMD loads to access rvec entries.
2180 snew(savex, state->natoms + 1);
2182 start = 0;
2183 end = md->homenr;
2185 if (debug)
2187 fprintf(debug, "vcm: start=%d, homenr=%d, end=%d\n",
2188 start, md->homenr, end);
2190 /* Do a first constraint pass to reset the particles... */
2191 step = ir->init_step;
2192 if (fplog)
2194 char buf[STEPSTRSIZE];
2195 fprintf(fplog, "\nConstraining the starting coordinates (step %s)\n",
2196 gmx_step_str(step, buf));
2198 dvdl_dum = 0;
2200 /* constrain the current position */
2201 constr->apply(TRUE, FALSE,
2202 step, 0, 1.0,
2203 state->x.rvec_array(), state->x.rvec_array(), nullptr,
2204 state->box,
2205 state->lambda[efptBONDED], &dvdl_dum,
2206 nullptr, nullptr, gmx::ConstraintVariable::Positions);
2207 if (EI_VV(ir->eI))
2209 /* constrain the initial velocity, and save it */
2210 /* also may be useful if we need the ekin from the halfstep for velocity verlet */
2211 constr->apply(TRUE, FALSE,
2212 step, 0, 1.0,
2213 state->x.rvec_array(), state->v.rvec_array(), state->v.rvec_array(),
2214 state->box,
2215 state->lambda[efptBONDED], &dvdl_dum,
2216 nullptr, nullptr, gmx::ConstraintVariable::Velocities);
2218 /* constrain the initial velocities at t-dt/2 */
2219 if (EI_STATE_VELOCITY(ir->eI) && ir->eI != eiVV)
2221 auto x = makeArrayRef(state->x).subArray(start, end);
2222 auto v = makeArrayRef(state->v).subArray(start, end);
2223 for (i = start; (i < end); i++)
2225 for (m = 0; (m < DIM); m++)
2227 /* Reverse the velocity */
2228 v[i][m] = -v[i][m];
2229 /* Store the position at t-dt in buf */
2230 savex[i][m] = x[i][m] + dt*v[i][m];
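/* Because v was negated just above, x + dt*v evaluates to x(0) - dt*v(0),
 * i.e. the unconstrained position one time step back.
 */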
2233 /* Shake the positions at t=-dt with the positions at t=0
2234 * as reference coordinates.
2236 if (fplog)
2238 char buf[STEPSTRSIZE];
2239 fprintf(fplog, "\nConstraining the coordinates at t0-dt (step %s)\n",
2240 gmx_step_str(step, buf));
2242 dvdl_dum = 0;
2243 constr->apply(TRUE, FALSE,
2244 step, -1, 1.0,
2245 state->x.rvec_array(), savex, nullptr,
2246 state->box,
2247 state->lambda[efptBONDED], &dvdl_dum,
2248 state->v.rvec_array(), nullptr, gmx::ConstraintVariable::Positions);
2250 for (i = start; i < end; i++)
2252 for (m = 0; m < DIM; m++)
2254 /* Re-reverse the velocities */
2255 v[i][m] = -v[i][m];
2259 sfree(savex);
2263 static void
2264 integrate_table(const real vdwtab[], real scale, int offstart, int rstart, int rend,
2265 double *enerout, double *virout)
2267 double enersum, virsum;
2268 double invscale, invscale2, invscale3;
2269 double r, ea, eb, ec, pa, pb, pc, pd;
2270 double y0, f, g, h;
2271 int ri, offset;
2272 double tabfactor;
2274 invscale = 1.0/scale;
2275 invscale2 = invscale*invscale;
2276 invscale3 = invscale*invscale2;
2278 /* Following summation derived from cubic spline definition,
2279 * Numerical Recipes in C, second edition, p. 113-116. Exact for
2280 * the cubic spline. We first calculate the negative of the
2281 * energy from rvdw to rvdw_switch, assuming that g(r)=1, and then
2282 * add the more standard, abrupt cutoff correction to that result,
2283 * yielding the long-range correction for a switched function. We
2284 * perform both the pressure and energy loops at the same time for
2285 * simplicity, as the computational cost is low. */
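/* Within table bin ri the spline is y(eps) = y0 + f*eps + g*eps^2 + h*eps^3 with
 * eps in [0,1] and r = (ri + eps)/scale. Expanding the integration measures gives
 *   r^2 dr = (ec + eb*eps + ea*eps^2) deps          (energy integral)
 *   r^3 dV = (pd + pc*eps + pb*eps^2 + pa*eps^3) dy (virial integral)
 * and integrating each monomial product over the bin yields the 1/3, 1/4, ...
 * fractions that appear in the loop below.
 */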
2287 if (offstart == 0)
2289 /* Since the dispersion table has been scaled down a factor
2290 * 6.0 and the repulsion a factor 12.0 to compensate for the
2291 * c6/c12 parameters inside nbfp[] being scaled up (to save
2292 * flops in kernels), we need to correct for this.
2294 tabfactor = 6.0;
2296 else
2298 tabfactor = 12.0;
2301 enersum = 0.0;
2302 virsum = 0.0;
2303 for (ri = rstart; ri < rend; ++ri)
2305 r = ri*invscale;
2306 ea = invscale3;
2307 eb = 2.0*invscale2*r;
2308 ec = invscale*r*r;
2310 pa = invscale3;
2311 pb = 3.0*invscale2*r;
2312 pc = 3.0*invscale*r*r;
2313 pd = r*r*r;
2315 /* this "8" is from the packing in the vdwtab array - perhaps
2316 should be defined? */
2318 offset = 8*ri + offstart;
2319 y0 = vdwtab[offset];
2320 f = vdwtab[offset+1];
2321 g = vdwtab[offset+2];
2322 h = vdwtab[offset+3];
2324 enersum += y0*(ea/3 + eb/2 + ec) + f*(ea/4 + eb/3 + ec/2) + g*(ea/5 + eb/4 + ec/3) + h*(ea/6 + eb/5 + ec/4);
2325 virsum += f*(pa/4 + pb/3 + pc/2 + pd) + 2*g*(pa/5 + pb/4 + pc/3 + pd/2) + 3*h*(pa/6 + pb/5 + pc/4 + pd/3);
2327 *enerout = 4.0*M_PI*enersum*tabfactor;
2328 *virout = 4.0*M_PI*virsum*tabfactor;
2331 void calc_enervirdiff(FILE *fplog, int eDispCorr, t_forcerec *fr)
2333 double eners[2], virs[2], enersum, virsum;
2334 double r0, rc3, rc9;
2335 int ri0, ri1, i;
2336 real scale, *vdwtab;
2338 fr->enershiftsix = 0;
2339 fr->enershifttwelve = 0;
2340 fr->enerdiffsix = 0;
2341 fr->enerdifftwelve = 0;
2342 fr->virdiffsix = 0;
2343 fr->virdifftwelve = 0;
2345 const interaction_const_t *ic = fr->ic;
2347 if (eDispCorr != edispcNO)
2349 for (i = 0; i < 2; i++)
2351 eners[i] = 0;
2352 virs[i] = 0;
2354 if ((ic->vdw_modifier == eintmodPOTSHIFT) ||
2355 (ic->vdw_modifier == eintmodPOTSWITCH) ||
2356 (ic->vdw_modifier == eintmodFORCESWITCH) ||
2357 (ic->vdwtype == evdwSHIFT) ||
2358 (ic->vdwtype == evdwSWITCH))
2360 if (((ic->vdw_modifier == eintmodPOTSWITCH) ||
2361 (ic->vdw_modifier == eintmodFORCESWITCH) ||
2362 (ic->vdwtype == evdwSWITCH)) && ic->rvdw_switch == 0)
2364 gmx_fatal(FARGS,
2365 "With dispersion correction rvdw-switch can not be zero "
2366 "for vdw-type = %s", evdw_names[ic->vdwtype]);
2369 /* TODO This code depends on the logic in tables.c that
2370 constructs the table layout, which should be made
2371 explicit in future cleanup. */
2372 GMX_ASSERT(fr->dispersionCorrectionTable->interaction == GMX_TABLE_INTERACTION_VDWREP_VDWDISP,
2373 "Dispersion-correction code needs a table with both repulsion and dispersion terms");
2374 scale = fr->dispersionCorrectionTable->scale;
2375 vdwtab = fr->dispersionCorrectionTable->data;
2377 /* Round the cut-offs to exact table values for precision */
2378 ri0 = static_cast<int>(std::floor(ic->rvdw_switch*scale));
2379 ri1 = static_cast<int>(std::ceil(ic->rvdw*scale));
2381 /* The code below has some support for handling force-switching, i.e.
2382 * when the force (instead of potential) is switched over a limited
2383 * region. This leads to a constant shift in the potential inside the
2384 * switching region, which we can handle by adding a constant energy
2385 * term in the force-switch case just like when we do potential-shift.
2387 * For now this is not enabled, but to keep the functionality in the
2388 * code we check separately for switch and shift. When we do force-switch
2389 * the shifting point is rvdw_switch, while it is the cutoff when we
2390 * have a classical potential-shift.
2392 * For a pure potential-shift the potential has a constant shift
2393 * all the way out to the cutoff, and that is it. For other forms
2394 * we need to calculate the constant shift up to the point where we
2395 * start modifying the potential.
2397 ri0 = (ic->vdw_modifier == eintmodPOTSHIFT) ? ri1 : ri0;
2399 r0 = ri0/scale;
2400 rc3 = r0*r0*r0;
2401 rc9 = rc3*rc3*rc3;
2403 if ((ic->vdw_modifier == eintmodFORCESWITCH) ||
2404 (ic->vdwtype == evdwSHIFT))
2406 /* Determine the constant energy shift below rvdw_switch.
2407 * Table has a scale factor since we have scaled it down to compensate
2408 * for scaling-up c6/c12 with the derivative factors to save flops in analytical kernels.
2410 fr->enershiftsix = static_cast<real>(-1.0/(rc3*rc3)) - 6.0*vdwtab[8*ri0];
2411 fr->enershifttwelve = static_cast<real>( 1.0/(rc9*rc3)) - 12.0*vdwtab[8*ri0 + 4];
2413 else if (ic->vdw_modifier == eintmodPOTSHIFT)
2415 fr->enershiftsix = static_cast<real>(-1.0/(rc3*rc3));
2416 fr->enershifttwelve = static_cast<real>( 1.0/(rc9*rc3));
2419 /* Add the constant part from 0 to rvdw_switch.
2420 * This integration from 0 to rvdw_switch overcounts the number
2421 * of interactions by 1, as it also counts the self interaction.
2422 * We will correct for this later.
2424 eners[0] += 4.0*M_PI*fr->enershiftsix*rc3/3.0;
2425 eners[1] += 4.0*M_PI*fr->enershifttwelve*rc3/3.0;
2427 /* Calculate the contribution in the range [r0,r1] where we
2428 * modify the potential. For a pure potential-shift modifier we will
2429 * have ri0==ri1, and there will not be any contribution here.
2431 for (i = 0; i < 2; i++)
2433 enersum = 0;
2434 virsum = 0;
2435 integrate_table(vdwtab, scale, (i == 0 ? 0 : 4), ri0, ri1, &enersum, &virsum);
2436 eners[i] -= enersum;
2437 virs[i] -= virsum;
2440 /* Alright: Above we compensated by REMOVING the parts outside r0
2441 * corresponding to the ideal VdW 1/r6 and 1/r12 potentials.
2443 * Regardless of whether r0 is the point where we start switching,
2444 * or the cutoff where we calculated the constant shift, we include
2445 * all the parts we are missing out to infinity from r0 by
2446 * calculating the analytical dispersion correction.
2448 eners[0] += -4.0*M_PI/(3.0*rc3);
2449 eners[1] += 4.0*M_PI/(9.0*rc9);
2450 virs[0] += 8.0*M_PI/rc3;
2451 virs[1] += -16.0*M_PI/(3.0*rc9);
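/* The four terms just added are the analytic Lennard-Jones tail integrals beyond r0:
 *   energy: int_{r0}^{inf} 4*pi*r^2*(-1/r^6)  dr = -4*pi/(3*r0^3)
 *           int_{r0}^{inf} 4*pi*r^2*( 1/r^12) dr =  4*pi/(9*r0^9)
 *   virial: int_{r0}^{inf} 4*pi*r^2*(r*dV/dr) dr =  8*pi/r0^3 for dispersion
 *                                                  -16*pi/(3*r0^9) for repulsion
 */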
2453 else if (ic->vdwtype == evdwCUT ||
2454 EVDW_PME(ic->vdwtype) ||
2455 ic->vdwtype == evdwUSER)
2457 if (ic->vdwtype == evdwUSER && fplog)
2459 fprintf(fplog,
2460 "WARNING: using dispersion correction with user tables\n");
2463 /* Note that with LJ-PME, the dispersion correction is multiplied
2464 * by the difference between the actual C6 and the value of C6
2465 * that would produce the combination rule.
2466 * This means the normal energy and virial difference formulas
2467 * can be used here.
2470 rc3 = ic->rvdw*ic->rvdw*ic->rvdw;
2471 rc9 = rc3*rc3*rc3;
2472 /* Contribution beyond the cut-off */
2473 eners[0] += -4.0*M_PI/(3.0*rc3);
2474 eners[1] += 4.0*M_PI/(9.0*rc9);
2475 if (ic->vdw_modifier == eintmodPOTSHIFT)
2477 /* Contribution within the cut-off */
2478 eners[0] += -4.0*M_PI/(3.0*rc3);
2479 eners[1] += 4.0*M_PI/(3.0*rc9);
2481 /* Contribution beyond the cut-off */
2482 virs[0] += 8.0*M_PI/rc3;
2483 virs[1] += -16.0*M_PI/(3.0*rc9);
2485 else
2487 gmx_fatal(FARGS,
2488 "Dispersion correction is not implemented for vdw-type = %s",
2489 evdw_names[ic->vdwtype]);
2492 /* When we deprecate the group kernels the code below can go too */
2493 if (ic->vdwtype == evdwPME && fr->cutoff_scheme == ecutsGROUP)
2495 /* Calculate self-interaction coefficient (assuming that
2496 * the reciprocal-space contribution is constant in the
2497 * region that contributes to the self-interaction).
2499 fr->enershiftsix = gmx::power6(ic->ewaldcoeff_lj) / 6.0;
2501 eners[0] += -gmx::power3(std::sqrt(M_PI)*ic->ewaldcoeff_lj)/3.0;
2502 virs[0] += gmx::power3(std::sqrt(M_PI)*ic->ewaldcoeff_lj);
2505 fr->enerdiffsix = eners[0];
2506 fr->enerdifftwelve = eners[1];
2507 /* The 0.5 is due to the Gromacs definition of the virial */
2508 fr->virdiffsix = 0.5*virs[0];
2509 fr->virdifftwelve = 0.5*virs[1];
2513 void calc_dispcorr(const t_inputrec *ir, const t_forcerec *fr,
2514 const matrix box, real lambda, tensor pres, tensor virial,
2515 real *prescorr, real *enercorr, real *dvdlcorr)
2517 gmx_bool bCorrAll, bCorrPres;
2518 real dvdlambda, invvol, dens, ninter, avcsix, avctwelve, enerdiff, svir = 0, spres = 0;
2519 int m;
2521 *prescorr = 0;
2522 *enercorr = 0;
2523 *dvdlcorr = 0;
2525 clear_mat(virial);
2526 clear_mat(pres);
2528 if (ir->eDispCorr != edispcNO)
2530 bCorrAll = (ir->eDispCorr == edispcAllEner ||
2531 ir->eDispCorr == edispcAllEnerPres);
2532 bCorrPres = (ir->eDispCorr == edispcEnerPres ||
2533 ir->eDispCorr == edispcAllEnerPres);
2535 invvol = 1/det(box);
2536 if (fr->n_tpi)
2538 /* Only correct for the interactions with the inserted molecule */
2539 dens = (fr->numAtomsForDispersionCorrection - fr->n_tpi)*invvol;
2540 ninter = fr->n_tpi;
2542 else
2544 dens = fr->numAtomsForDispersionCorrection*invvol;
2545 ninter = 0.5*fr->numAtomsForDispersionCorrection;
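/* With test-particle insertion only the n_tpi inserted atoms interact with the
 * rest of the system, so ninter = n_tpi and the inserted atoms are left out of
 * the density; otherwise every pair is counted once, giving ninter = N/2.
 */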
2548 if (ir->efep == efepNO)
2550 avcsix = fr->avcsix[0];
2551 avctwelve = fr->avctwelve[0];
2553 else
2555 avcsix = (1 - lambda)*fr->avcsix[0] + lambda*fr->avcsix[1];
2556 avctwelve = (1 - lambda)*fr->avctwelve[0] + lambda*fr->avctwelve[1];
2559 enerdiff = ninter*(dens*fr->enerdiffsix - fr->enershiftsix);
2560 *enercorr += avcsix*enerdiff;
2561 dvdlambda = 0.0;
2562 if (ir->efep != efepNO)
2564 dvdlambda += (fr->avcsix[1] - fr->avcsix[0])*enerdiff;
2566 if (bCorrAll)
2568 enerdiff = ninter*(dens*fr->enerdifftwelve - fr->enershifttwelve);
2569 *enercorr += avctwelve*enerdiff;
2570 if (fr->efep != efepNO)
2572 dvdlambda += (fr->avctwelve[1] - fr->avctwelve[0])*enerdiff;
2576 if (bCorrPres)
2578 svir = ninter*dens*avcsix*fr->virdiffsix/3.0;
2579 if (ir->eDispCorr == edispcAllEnerPres)
2581 svir += ninter*dens*avctwelve*fr->virdifftwelve/3.0;
2583 /* The factor 2 is because of the Gromacs virial definition */
2584 spres = -2.0*invvol*svir*PRESFAC;
2586 for (m = 0; m < DIM; m++)
2588 virial[m][m] += svir;
2589 pres[m][m] += spres;
2591 *prescorr += spres;
2594 /* Can't currently control when it prints; for now, just print when debugging */
2595 if (debug)
2597 if (bCorrAll)
2599 fprintf(debug, "Long Range LJ corr.: <C6> %10.4e, <C12> %10.4e\n",
2600 avcsix, avctwelve);
2602 if (bCorrPres)
2604 fprintf(debug,
2605 "Long Range LJ corr.: Epot %10g, Pres: %10g, Vir: %10g\n",
2606 *enercorr, spres, svir);
2608 else
2610 fprintf(debug, "Long Range LJ corr.: Epot %10g\n", *enercorr);
2614 if (fr->efep != efepNO)
2616 *dvdlcorr += dvdlambda;
2621 static void low_do_pbc_mtop(FILE *fplog, int ePBC, const matrix box,
2622 const gmx_mtop_t *mtop, rvec x[],
2623 gmx_bool bFirst)
2625 t_graph *graph;
2626 int as, mol;
2628 if (bFirst && fplog)
2630 fprintf(fplog, "Removing pbc first time\n");
2633 snew(graph, 1);
2634 as = 0;
2635 for (const gmx_molblock_t &molb : mtop->molblock)
2637 const gmx_moltype_t &moltype = mtop->moltype[molb.type];
2638 if (moltype.atoms.nr == 1 ||
2639 (!bFirst && moltype.cgs.nr == 1))
2641 /* Just one atom or charge group in the molecule, no PBC required */
2642 as += molb.nmol*moltype.atoms.nr;
2644 else
2646 mk_graph_moltype(moltype, graph);
2648 for (mol = 0; mol < molb.nmol; mol++)
2650 mk_mshift(fplog, graph, ePBC, box, x+as);
2652 shift_self(graph, box, x+as);
2653 /* The molecule is whole now.
2654 * We don't need the second mk_mshift call as in do_pbc_first,
2655 * since we no longer need this graph.
2658 as += moltype.atoms.nr;
2660 done_graph(graph);
2663 sfree(graph);
2666 void do_pbc_first_mtop(FILE *fplog, int ePBC, const matrix box,
2667 const gmx_mtop_t *mtop, rvec x[])
2669 low_do_pbc_mtop(fplog, ePBC, box, mtop, x, TRUE);
2672 void do_pbc_mtop(FILE *fplog, int ePBC, const matrix box,
2673 const gmx_mtop_t *mtop, rvec x[])
2675 low_do_pbc_mtop(fplog, ePBC, box, mtop, x, FALSE);
2678 void put_atoms_in_box_omp(int ePBC, const matrix box, gmx::ArrayRef<gmx::RVec> x)
2680 int t, nth;
2681 nth = gmx_omp_nthreads_get(emntDefault);
2683 #pragma omp parallel for num_threads(nth) schedule(static)
2684 for (t = 0; t < nth; t++)
2688 size_t natoms = x.size();
2689 size_t offset = (natoms*t )/nth;
2690 size_t len = (natoms*(t + 1))/nth - offset;
2691 put_atoms_in_box(ePBC, box, x.subArray(offset, len));
2693 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
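/* The static partition above gives each thread a contiguous chunk of roughly
 * natoms/nth atoms; e.g. natoms = 10 with nth = 4 yields offsets 0, 2, 5, 7 and
 * lengths 2, 3, 2, 3, so every atom is wrapped exactly once.
 */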
2697 // TODO This can be cleaned up a lot, and move back to runner.cpp
2698 void finish_run(FILE *fplog, const gmx::MDLogger &mdlog, const t_commrec *cr,
2699 const t_inputrec *inputrec,
2700 t_nrnb nrnb[], gmx_wallcycle_t wcycle,
2701 gmx_walltime_accounting_t walltime_accounting,
2702 nonbonded_verlet_t *nbv,
2703 const gmx_pme_t *pme,
2704 gmx_bool bWriteStat)
2706 t_nrnb *nrnb_tot = nullptr;
2707 double delta_t = 0;
2708 double nbfs = 0, mflop = 0;
2709 double elapsed_time,
2710 elapsed_time_over_all_ranks,
2711 elapsed_time_over_all_threads,
2712 elapsed_time_over_all_threads_over_all_ranks;
2713 /* Control whether it is valid to print a report. Only the
2714 simulation master may print, but it should not do so if the run
2715 terminated e.g. before a scheduled reset step. This is
2716 complicated by the fact that PME ranks are unaware of the
2717 reason why they were sent a pmerecvqxFINISH. To avoid
2718 communication deadlocks, we always do the communication for the
2719 report, even if we've decided not to write the report, because
2720 how long it takes to finish the run is not important when we've
2721 decided not to report on the simulation performance.
2723 Further, we only report performance for dynamical integrators,
2724 because those are the only ones for which we plan to
2725 consider doing any optimizations. */
2726 bool printReport = EI_DYNAMICS(inputrec->eI) && SIMMASTER(cr);
2728 if (printReport && !walltime_accounting_get_valid_finish(walltime_accounting))
2730 GMX_LOG(mdlog.warning).asParagraph().appendText("Simulation ended prematurely, no performance report will be written.");
2731 printReport = false;
2734 if (cr->nnodes > 1)
2736 snew(nrnb_tot, 1);
2737 #if GMX_MPI
2738 MPI_Allreduce(nrnb->n, nrnb_tot->n, eNRNB, MPI_DOUBLE, MPI_SUM,
2739 cr->mpi_comm_mysim);
2740 #endif
2742 else
2744 nrnb_tot = nrnb;
2747 elapsed_time = walltime_accounting_get_time_since_reset(walltime_accounting);
2748 elapsed_time_over_all_threads = walltime_accounting_get_time_since_reset_over_all_threads(walltime_accounting);
2749 if (cr->nnodes > 1)
2751 #if GMX_MPI
2752 /* reduce elapsed_time over all MPI ranks in the current simulation */
2753 MPI_Allreduce(&elapsed_time,
2754 &elapsed_time_over_all_ranks,
2755 1, MPI_DOUBLE, MPI_SUM,
2756 cr->mpi_comm_mysim);
2757 elapsed_time_over_all_ranks /= cr->nnodes;
2758 /* Reduce elapsed_time_over_all_threads over all MPI ranks in the
2759 * current simulation. */
2760 MPI_Allreduce(&elapsed_time_over_all_threads,
2761 &elapsed_time_over_all_threads_over_all_ranks,
2762 1, MPI_DOUBLE, MPI_SUM,
2763 cr->mpi_comm_mysim);
2764 #endif
2766 else
2768 elapsed_time_over_all_ranks = elapsed_time;
2769 elapsed_time_over_all_threads_over_all_ranks = elapsed_time_over_all_threads;
2772 if (printReport)
2774 print_flop(fplog, nrnb_tot, &nbfs, &mflop);
2776 if (cr->nnodes > 1)
2778 sfree(nrnb_tot);
2781 if (thisRankHasDuty(cr, DUTY_PP) && DOMAINDECOMP(cr))
2783 print_dd_statistics(cr, inputrec, fplog);
2786 /* TODO Move the responsibility for any scaling by thread counts
2787 * to the code that handled the thread region, so that there's a
2788 * mechanism to keep cycle counting working during the transition
2789 * to task parallelism. */
2790 int nthreads_pp = gmx_omp_nthreads_get(emntNonbonded);
2791 int nthreads_pme = gmx_omp_nthreads_get(emntPME);
2792 wallcycle_scale_by_num_threads(wcycle, thisRankHasDuty(cr, DUTY_PME) && !thisRankHasDuty(cr, DUTY_PP), nthreads_pp, nthreads_pme);
2793 auto cycle_sum(wallcycle_sum(cr, wcycle));
2795 if (printReport)
2797 auto nbnxn_gpu_timings = use_GPU(nbv) ? nbnxn_gpu_get_timings(nbv->gpu_nbv) : nullptr;
2798 gmx_wallclock_gpu_pme_t pme_gpu_timings = {};
2799 if (pme_gpu_task_enabled(pme))
2801 pme_gpu_get_timings(pme, &pme_gpu_timings);
2803 wallcycle_print(fplog, mdlog, cr->nnodes, cr->npmenodes, nthreads_pp, nthreads_pme,
2804 elapsed_time_over_all_ranks,
2805 wcycle, cycle_sum,
2806 nbnxn_gpu_timings,
2807 &pme_gpu_timings);
2809 if (EI_DYNAMICS(inputrec->eI))
2811 delta_t = inputrec->delta_t;
2814 if (fplog)
2816 print_perf(fplog, elapsed_time_over_all_threads_over_all_ranks,
2817 elapsed_time_over_all_ranks,
2818 walltime_accounting_get_nsteps_done_since_reset(walltime_accounting),
2819 delta_t, nbfs, mflop);
2821 if (bWriteStat)
2823 print_perf(stderr, elapsed_time_over_all_threads_over_all_ranks,
2824 elapsed_time_over_all_ranks,
2825 walltime_accounting_get_nsteps_done_since_reset(walltime_accounting),
2826 delta_t, nbfs, mflop);
2831 extern void initialize_lambdas(FILE *fplog, t_inputrec *ir, int *fep_state, gmx::ArrayRef<real> lambda, double *lam0)
2833 /* this function works, but could probably use a logic rewrite to keep all the different
2834 types of efep straight. */
2836 if ((ir->efep == efepNO) && (!ir->bSimTemp))
2838 return;
2841 t_lambda *fep = ir->fepvals;
2842 *fep_state = fep->init_fep_state; /* this might overwrite the checkpoint
2843 if checkpoint is set -- a kludge is in for now
2844 to prevent this.*/
2846 for (int i = 0; i < efptNR; i++)
2848 /* overwrite lambda state with init_lambda for now for backwards compatibility */
2849 if (fep->init_lambda >= 0) /* if it's -1, it was never initialized */
2851 lambda[i] = fep->init_lambda;
2852 if (lam0)
2854 lam0[i] = lambda[i];
2857 else
2859 lambda[i] = fep->all_lambda[i][*fep_state];
2860 if (lam0)
2862 lam0[i] = lambda[i];
2866 if (ir->bSimTemp)
2868 /* need to rescale control temperatures to match current state */
2869 for (int i = 0; i < ir->opts.ngtc; i++)
2871 if (ir->opts.ref_t[i] > 0)
2873 ir->opts.ref_t[i] = ir->simtempvals->temperatures[*fep_state];
2878 /* Send to the log the information on the current lambdas */
2879 if (fplog != nullptr)
2881 fprintf(fplog, "Initial vector of lambda components:[ ");
2882 for (const auto &l : lambda)
2884 fprintf(fplog, "%10.4f ", l);
2886 fprintf(fplog, "]\n");
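/* Note on initialize_lambdas: the lambda vector has one entry per perturbation
 * component (the efpt* indices such as efptCOUL, efptVDW and efptBONDED used
 * elsewhere in this file). With init_lambda >= 0 every component is set to that
 * single value; otherwise each component is taken from all_lambda at the chosen
 * fep_state.
 */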
2891 void init_md(FILE *fplog,
2892 const t_commrec *cr, gmx::IMDOutputProvider *outputProvider,
2893 t_inputrec *ir, const gmx_output_env_t *oenv,
2894 const MdrunOptions &mdrunOptions,
2895 double *t, double *t0,
2896 t_state *globalState, double *lam0,
2897 t_nrnb *nrnb, gmx_mtop_t *mtop,
2898 gmx_update_t **upd,
2899 gmx::BoxDeformation *deform,
2900 int nfile, const t_filenm fnm[],
2901 gmx_mdoutf_t *outf, t_mdebin **mdebin,
2902 tensor force_vir, tensor shake_vir,
2903 tensor total_vir, tensor pres, rvec mu_tot,
2904 gmx_bool *bSimAnn, t_vcm **vcm,
2905 gmx_wallcycle_t wcycle)
2907 int i;
2909 /* Initial values */
2910 *t = *t0 = ir->init_t;
2912 *bSimAnn = FALSE;
2913 for (i = 0; i < ir->opts.ngtc; i++)
2915 /* set bSimAnn if any group is being annealed */
2916 if (ir->opts.annealing[i] != eannNO)
2918 *bSimAnn = TRUE;
2922 /* Initialize lambda variables */
2923 /* TODO: Clean up initialization of fep_state and lambda in t_state.
2924 * We currently need to call initialize_lambdas on non-master ranks
2925 * to initialize lam0.
2927 if (MASTER(cr))
2929 initialize_lambdas(fplog, ir, &globalState->fep_state, globalState->lambda, lam0);
2931 else
2933 int tmpFepState;
2934 std::array<real, efptNR> tmpLambda;
2935 initialize_lambdas(fplog, ir, &tmpFepState, tmpLambda, lam0);
2938 // TODO upd is never NULL in practice, but the analysers don't know that
2939 if (upd)
2941 *upd = init_update(ir, deform);
2943 if (*bSimAnn)
2945 update_annealing_target_temp(ir, ir->init_t, upd ? *upd : nullptr);
2948 if (vcm != nullptr)
2950 *vcm = init_vcm(fplog, &mtop->groups, ir);
2953 if (EI_DYNAMICS(ir->eI) && !mdrunOptions.continuationOptions.appendFiles)
2955 if (ir->etc == etcBERENDSEN)
2957 please_cite(fplog, "Berendsen84a");
2959 if (ir->etc == etcVRESCALE)
2961 please_cite(fplog, "Bussi2007a");
2963 if (ir->eI == eiSD1)
2965 please_cite(fplog, "Goga2012");
2968 init_nrnb(nrnb);
2970 if (nfile != -1)
2972 *outf = init_mdoutf(fplog, nfile, fnm, mdrunOptions, cr, outputProvider, ir, mtop, oenv, wcycle);
2974 *mdebin = init_mdebin(mdrunOptions.continuationOptions.appendFiles ? nullptr : mdoutf_get_fp_ene(*outf),
2975 mtop, ir, mdoutf_get_fp_dhdl(*outf));
2978 /* Initiate variables */
2979 clear_mat(force_vir);
2980 clear_mat(shake_vir);
2981 clear_rvec(mu_tot);
2982 clear_mat(total_vir);
2983 clear_mat(pres);
2986 void init_rerun(FILE *fplog,
2987 const t_commrec *cr, gmx::IMDOutputProvider *outputProvider,
2988 t_inputrec *ir, const gmx_output_env_t *oenv,
2989 const MdrunOptions &mdrunOptions,
2990 t_state *globalState, double *lam0,
2991 t_nrnb *nrnb, gmx_mtop_t *mtop,
2992 int nfile, const t_filenm fnm[],
2993 gmx_mdoutf_t *outf, t_mdebin **mdebin,
2994 gmx_wallcycle_t wcycle)
2996 /* Initialize lambda variables */
2997 /* TODO: Clean up initialization of fep_state and lambda in t_state.
2998 * We currently need to call initialize_lambdas on non-master ranks
2999 * to initialize lam0.
3001 if (MASTER(cr))
3003 initialize_lambdas(fplog, ir, &globalState->fep_state, globalState->lambda, lam0);
3005 else
3007 int tmpFepState;
3008 std::array<real, efptNR> tmpLambda;
3009 initialize_lambdas(fplog, ir, &tmpFepState, tmpLambda, lam0);
3012 init_nrnb(nrnb);
3014 if (nfile != -1)
3016 *outf = init_mdoutf(fplog, nfile, fnm, mdrunOptions, cr, outputProvider, ir, mtop, oenv, wcycle);
3017 *mdebin = init_mdebin(mdrunOptions.continuationOptions.appendFiles ? nullptr : mdoutf_get_fp_ene(*outf),
3018 mtop, ir, mdoutf_get_fp_dhdl(*outf), true);