Introduce PpForceWorkload
[gromacs.git] / src / gromacs / mdlib / sim_util.cpp
blob 9b77e8a5aac5f6d8b8f7aec458ecedd559f2659d
1 /*
2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5 * Copyright (c) 2001-2004, The GROMACS development team.
6 * Copyright (c) 2013,2014,2015,2016,2017,2018, by the GROMACS development team, led by
7 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
8 * and including many others, as listed in the AUTHORS file in the
9 * top-level source directory and at http://www.gromacs.org.
11 * GROMACS is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public License
13 * as published by the Free Software Foundation; either version 2.1
14 * of the License, or (at your option) any later version.
16 * GROMACS is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * Lesser General Public License for more details.
21 * You should have received a copy of the GNU Lesser General Public
22 * License along with GROMACS; if not, see
23 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
24 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
26 * If you want to redistribute modifications to GROMACS, please
27 * consider that scientific software is very special. Version
28 * control is crucial - bugs must be traceable. We will be happy to
29 * consider code for inclusion in the official distribution, but
30 * derived work must not be called official GROMACS. Details are found
31 * in the README & COPYING files - if they are missing, get the
32 * official version at http://www.gromacs.org.
34 * To help us fund GROMACS development, we humbly ask that you cite
35 * the research papers on the package. Check out http://www.gromacs.org.
37 #include "gmxpre.h"
39 #include "sim_util.h"
41 #include "config.h"
43 #include <cmath>
44 #include <cstdint>
45 #include <cstdio>
46 #include <cstring>
48 #include <array>
50 #include "gromacs/awh/awh.h"
51 #include "gromacs/domdec/dlbtiming.h"
52 #include "gromacs/domdec/domdec_struct.h"
53 #include "gromacs/domdec/partition.h"
54 #include "gromacs/essentialdynamics/edsam.h"
55 #include "gromacs/ewald/pme.h"
56 #include "gromacs/gmxlib/chargegroup.h"
57 #include "gromacs/gmxlib/network.h"
58 #include "gromacs/gmxlib/nrnb.h"
59 #include "gromacs/gmxlib/nonbonded/nb_free_energy.h"
60 #include "gromacs/gmxlib/nonbonded/nb_kernel.h"
61 #include "gromacs/gmxlib/nonbonded/nonbonded.h"
62 #include "gromacs/gpu_utils/gpu_utils.h"
63 #include "gromacs/imd/imd.h"
64 #include "gromacs/listed-forces/bonded.h"
65 #include "gromacs/listed-forces/disre.h"
66 #include "gromacs/listed-forces/listed-forces.h"
67 #include "gromacs/listed-forces/manage-threading.h"
68 #include "gromacs/listed-forces/orires.h"
69 #include "gromacs/math/arrayrefwithpadding.h"
70 #include "gromacs/math/functions.h"
71 #include "gromacs/math/units.h"
72 #include "gromacs/math/vec.h"
73 #include "gromacs/math/vecdump.h"
74 #include "gromacs/mdlib/calcmu.h"
75 #include "gromacs/mdlib/calcvir.h"
76 #include "gromacs/mdlib/constr.h"
77 #include "gromacs/mdlib/force.h"
78 #include "gromacs/mdlib/forcerec.h"
79 #include "gromacs/mdlib/gmx_omp_nthreads.h"
80 #include "gromacs/mdlib/mdrun.h"
81 #include "gromacs/mdlib/nb_verlet.h"
82 #include "gromacs/mdlib/nbnxn_atomdata.h"
83 #include "gromacs/mdlib/nbnxn_gpu_data_mgmt.h"
84 #include "gromacs/mdlib/nbnxn_grid.h"
85 #include "gromacs/mdlib/nbnxn_search.h"
86 #include "gromacs/mdlib/ppforceworkload.h"
87 #include "gromacs/mdlib/qmmm.h"
88 #include "gromacs/mdlib/update.h"
89 #include "gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_gpu_ref.h"
90 #include "gromacs/mdtypes/commrec.h"
91 #include "gromacs/mdtypes/forceoutput.h"
92 #include "gromacs/mdtypes/iforceprovider.h"
93 #include "gromacs/mdtypes/inputrec.h"
94 #include "gromacs/mdtypes/md_enums.h"
95 #include "gromacs/mdtypes/state.h"
96 #include "gromacs/pbcutil/ishift.h"
97 #include "gromacs/pbcutil/mshift.h"
98 #include "gromacs/pbcutil/pbc.h"
99 #include "gromacs/pulling/pull.h"
100 #include "gromacs/pulling/pull_rotation.h"
101 #include "gromacs/timing/cyclecounter.h"
102 #include "gromacs/timing/gpu_timing.h"
103 #include "gromacs/timing/wallcycle.h"
104 #include "gromacs/timing/wallcyclereporting.h"
105 #include "gromacs/timing/walltime_accounting.h"
106 #include "gromacs/topology/topology.h"
107 #include "gromacs/utility/arrayref.h"
108 #include "gromacs/utility/basedefinitions.h"
109 #include "gromacs/utility/cstringutil.h"
110 #include "gromacs/utility/exceptions.h"
111 #include "gromacs/utility/fatalerror.h"
112 #include "gromacs/utility/gmxassert.h"
113 #include "gromacs/utility/gmxmpi.h"
114 #include "gromacs/utility/logger.h"
115 #include "gromacs/utility/pleasecite.h"
116 #include "gromacs/utility/smalloc.h"
117 #include "gromacs/utility/strconvert.h"
118 #include "gromacs/utility/sysinfo.h"
120 #include "nbnxn_gpu.h"
121 #include "nbnxn_kernels/nbnxn_kernel_cpu.h"
122 #include "nbnxn_kernels/nbnxn_kernel_prune.h"
124 // TODO: this environment variable allows us to verify before release
125 // that on less common architectures the total cost of polling is not larger than
126 // a blocking wait (so polling does not introduce overhead when the static
127 // PME-first ordering would suffice).
128 static const bool c_disableAlternatingWait = (getenv("GMX_DISABLE_ALTERNATING_GPU_WAIT") != nullptr);
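// A minimal usage sketch (hypothetical file names): setting this variable in the launch
// environment disables the polling-based alternating wait below and falls back to the
// static PME-first wait order, e.g.
//   GMX_DISABLE_ALTERNATING_GPU_WAIT=1 gmx mdrun -deffnm topol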
131 void print_time(FILE *out,
132 gmx_walltime_accounting_t walltime_accounting,
133 int64_t step,
134 const t_inputrec *ir,
135 const t_commrec *cr)
137 time_t finish;
138 double dt, elapsed_seconds, time_per_step;
140 #if !GMX_THREAD_MPI
141 if (!PAR(cr))
142 #endif
144 fprintf(out, "\r");
146 fputs("step ", out);
147 fputs(gmx::int64ToString(step).c_str(), out);
148 fflush(out);
150 if ((step >= ir->nstlist))
152 double seconds_since_epoch = gmx_gettime();
153 elapsed_seconds = seconds_since_epoch - walltime_accounting_get_start_time_stamp(walltime_accounting);
154 time_per_step = elapsed_seconds/(step - ir->init_step + 1);
155 dt = (ir->nsteps + ir->init_step - step) * time_per_step;
157 if (ir->nsteps >= 0)
159 if (dt >= 300)
161 finish = static_cast<time_t>(seconds_since_epoch + dt);
162 auto timebuf = gmx_ctime_r(&finish);
163 fputs(", will finish ", out);
164 fputs(timebuf.c_str(), out);
166 else
168 fprintf(out, ", remaining wall clock time: %5d s ", static_cast<int>(dt));
171 else
173 fprintf(out, " performance: %.1f ns/day ",
174 ir->delta_t/1000*24*60*60/time_per_step);
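/* ir->delta_t is in ps, so delta_t/1000 is ns simulated per step; multiplying by the
 * seconds in a day (24*60*60) and dividing by the measured wall-clock seconds per step
 * gives the performance in ns/day. */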
177 #if !GMX_THREAD_MPI
178 if (PAR(cr))
180 fprintf(out, "\n");
182 #else
183 GMX_UNUSED_VALUE(cr);
184 #endif
186 fflush(out);
189 void print_date_and_time(FILE *fplog, int nodeid, const char *title,
190 double the_time)
192 if (!fplog)
194 return;
197 time_t temp_time = static_cast<time_t>(the_time);
199 auto timebuf = gmx_ctime_r(&temp_time);
201 fprintf(fplog, "%s on rank %d %s\n", title, nodeid, timebuf.c_str());
204 void print_start(FILE *fplog, const t_commrec *cr,
205 gmx_walltime_accounting_t walltime_accounting,
206 const char *name)
208 char buf[STRLEN];
210 sprintf(buf, "Started %s", name);
211 print_date_and_time(fplog, cr->nodeid, buf,
212 walltime_accounting_get_start_time_stamp(walltime_accounting));
215 static void sum_forces(rvec f[], gmx::ArrayRef<const gmx::RVec> forceToAdd)
217 const int end = forceToAdd.size();
219 int gmx_unused nt = gmx_omp_nthreads_get(emntDefault);
220 #pragma omp parallel for num_threads(nt) schedule(static)
221 for (int i = 0; i < end; i++)
223 rvec_inc(f[i], forceToAdd[i]);
227 static void pme_gpu_reduce_outputs(gmx_wallcycle_t wcycle,
228 gmx::ForceWithVirial *forceWithVirial,
229 gmx::ArrayRef<const gmx::RVec> pmeForces,
230 gmx_enerdata_t *enerd,
231 const tensor vir_Q,
232 real Vlr_q)
234 wallcycle_start(wcycle, ewcPME_GPU_F_REDUCTION);
235 GMX_ASSERT(forceWithVirial, "Invalid force pointer");
236 forceWithVirial->addVirialContribution(vir_Q);
237 enerd->term[F_COUL_RECIP] += Vlr_q;
238 sum_forces(as_rvec_array(forceWithVirial->force_.data()), pmeForces);
239 wallcycle_stop(wcycle, ewcPME_GPU_F_REDUCTION);
242 static void calc_virial(int start, int homenr, const rvec x[], const rvec f[],
243 tensor vir_part, const t_graph *graph, const matrix box,
244 t_nrnb *nrnb, const t_forcerec *fr, int ePBC)
246 /* The short-range virial from surrounding boxes */
247 calc_vir(SHIFTS, fr->shift_vec, fr->fshift, vir_part, ePBC == epbcSCREW, box);
248 inc_nrnb(nrnb, eNR_VIRIAL, SHIFTS);
250 /* Calculate partial virial, for local atoms only, based on short range.
251 * Total virial is computed in global_stat, called from do_md
253 f_calc_vir(start, start+homenr, x, f, vir_part, graph, box);
254 inc_nrnb(nrnb, eNR_VIRIAL, homenr);
256 if (debug)
258 pr_rvecs(debug, 0, "vir_part", vir_part, DIM);
262 static void pull_potential_wrapper(const t_commrec *cr,
263 const t_inputrec *ir,
264 const matrix box, gmx::ArrayRef<const gmx::RVec> x,
265 gmx::ForceWithVirial *force,
266 const t_mdatoms *mdatoms,
267 gmx_enerdata_t *enerd,
268 const real *lambda,
269 double t,
270 gmx_wallcycle_t wcycle)
272 t_pbc pbc;
273 real dvdl;
275 /* Calculate the center of mass forces; this requires communication,
276 * which is why pull_potential is called close to other communication.
278 wallcycle_start(wcycle, ewcPULLPOT);
279 set_pbc(&pbc, ir->ePBC, box);
280 dvdl = 0;
281 enerd->term[F_COM_PULL] +=
282 pull_potential(ir->pull_work, mdatoms, &pbc,
283 cr, t, lambda[efptRESTRAINT], as_rvec_array(x.data()), force, &dvdl);
284 enerd->dvdl_lin[efptRESTRAINT] += dvdl;
285 wallcycle_stop(wcycle, ewcPULLPOT);
288 static void pme_receive_force_ener(const t_commrec *cr,
289 gmx::ForceWithVirial *forceWithVirial,
290 gmx_enerdata_t *enerd,
291 gmx_wallcycle_t wcycle)
293 real e_q, e_lj, dvdl_q, dvdl_lj;
294 float cycles_ppdpme, cycles_seppme;
296 cycles_ppdpme = wallcycle_stop(wcycle, ewcPPDURINGPME);
297 dd_cycles_add(cr->dd, cycles_ppdpme, ddCyclPPduringPME);
299 /* In case of node-splitting, the PP nodes receive the long-range
300 * forces, virial and energy from the PME nodes here.
302 wallcycle_start(wcycle, ewcPP_PMEWAITRECVF);
303 dvdl_q = 0;
304 dvdl_lj = 0;
305 gmx_pme_receive_f(cr, forceWithVirial, &e_q, &e_lj, &dvdl_q, &dvdl_lj,
306 &cycles_seppme);
307 enerd->term[F_COUL_RECIP] += e_q;
308 enerd->term[F_LJ_RECIP] += e_lj;
309 enerd->dvdl_lin[efptCOUL] += dvdl_q;
310 enerd->dvdl_lin[efptVDW] += dvdl_lj;
312 if (wcycle)
314 dd_cycles_add(cr->dd, cycles_seppme, ddCyclPME);
316 wallcycle_stop(wcycle, ewcPP_PMEWAITRECVF);
319 static void print_large_forces(FILE *fp,
320 const t_mdatoms *md,
321 const t_commrec *cr,
322 int64_t step,
323 real forceTolerance,
324 const rvec *x,
325 const rvec *f)
327 real force2Tolerance = gmx::square(forceTolerance);
328 gmx::index numNonFinite = 0;
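/* Note: the check below compares squared norms, so no per-atom sqrt is needed; the root
 * is only taken for the few atoms that are actually printed. */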
329 for (int i = 0; i < md->homenr; i++)
331 real force2 = norm2(f[i]);
332 bool nonFinite = !std::isfinite(force2);
333 if (force2 >= force2Tolerance || nonFinite)
335 fprintf(fp, "step %" PRId64 " atom %6d x %8.3f %8.3f %8.3f force %12.5e\n",
336 step,
337 ddglatnr(cr->dd, i), x[i][XX], x[i][YY], x[i][ZZ], std::sqrt(force2));
339 if (nonFinite)
341 numNonFinite++;
344 if (numNonFinite > 0)
346 /* Note that with MPI this fatal call on one rank might interrupt
347 * the printing on other ranks. But we can only avoid that with
348 * an expensive MPI barrier that we would need at each step.
350 gmx_fatal(FARGS, "At step %" PRId64 " detected non-finite forces on %td atoms", step, numNonFinite);
354 static void post_process_forces(const t_commrec *cr,
355 int64_t step,
356 t_nrnb *nrnb,
357 gmx_wallcycle_t wcycle,
358 const gmx_localtop_t *top,
359 const matrix box,
360 const rvec x[],
361 rvec f[],
362 gmx::ForceWithVirial *forceWithVirial,
363 tensor vir_force,
364 const t_mdatoms *mdatoms,
365 const t_graph *graph,
366 const t_forcerec *fr,
367 const gmx_vsite_t *vsite,
368 int flags)
370 if (fr->haveDirectVirialContributions)
372 rvec *fDirectVir = as_rvec_array(forceWithVirial->force_.data());
374 if (vsite)
376 /* Spread the mesh force on virtual sites to the other particles...
377 * This is parallelized. MPI communication is performed
378 * if the constructing atoms aren't local.
380 matrix virial = { { 0 } };
381 spread_vsite_f(vsite, x, fDirectVir, nullptr,
382 (flags & GMX_FORCE_VIRIAL) != 0, virial,
383 nrnb,
384 &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr, wcycle);
385 forceWithVirial->addVirialContribution(virial);
388 if (flags & GMX_FORCE_VIRIAL)
390 /* Now add the forces, this is local */
391 sum_forces(f, forceWithVirial->force_);
393 /* Add the direct virial contributions */
394 GMX_ASSERT(forceWithVirial->computeVirial_, "forceWithVirial should request virial computation when we request the virial");
395 m_add(vir_force, forceWithVirial->getVirial(), vir_force);
397 if (debug)
399 pr_rvecs(debug, 0, "vir_force", vir_force, DIM);
404 if (fr->print_force >= 0)
406 print_large_forces(stderr, mdatoms, cr, step, fr->print_force, x, f);
410 static void do_nb_verlet(const t_forcerec *fr,
411 const interaction_const_t *ic,
412 gmx_enerdata_t *enerd,
413 int flags, int ilocality,
414 int clearF,
415 int64_t step,
416 t_nrnb *nrnb,
417 gmx_wallcycle_t wcycle)
419 if (!(flags & GMX_FORCE_NONBONDED))
421 /* skip non-bonded calculation */
422 return;
425 nonbonded_verlet_t *nbv = fr->nbv;
426 nonbonded_verlet_group_t *nbvg = &nbv->grp[ilocality];
428 /* GPU kernel launch overhead is already timed separately */
429 if (fr->cutoff_scheme != ecutsVERLET)
431 gmx_incons("Invalid cut-off scheme passed!");
434 bool bUsingGpuKernels = (nbvg->kernel_type == nbnxnk8x8x8_GPU);
436 if (!bUsingGpuKernels)
438 /* When dynamic pair-list pruning is requested, we need to prune
439 * at nstlistPrune steps.
441 if (nbv->listParams->useDynamicPruning &&
442 (step - nbvg->nbl_lists.outerListCreationStep) % nbv->listParams->nstlistPrune == 0)
444 /* Prune the pair-list beyond fr->ic->rlistPrune using
445 * the current coordinates of the atoms.
447 wallcycle_sub_start(wcycle, ewcsNONBONDED_PRUNING);
448 nbnxn_kernel_cpu_prune(nbvg, nbv->nbat, fr->shift_vec, nbv->listParams->rlistInner);
449 wallcycle_sub_stop(wcycle, ewcsNONBONDED_PRUNING);
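/* Illustrative cadence (example values, not taken from this file): with nstlist = 100
 * and nstlistPrune = 10, this CPU prune kernel runs 10, 20, 30, ... steps after the
 * outer list was created, pruning pairs beyond rlistInner using the current coordinates. */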
452 wallcycle_sub_start(wcycle, ewcsNONBONDED);
455 switch (nbvg->kernel_type)
457 case nbnxnk4x4_PlainC:
458 case nbnxnk4xN_SIMD_4xN:
459 case nbnxnk4xN_SIMD_2xNN:
460 nbnxn_kernel_cpu(nbvg,
461 nbv->nbat,
463 fr->shift_vec,
464 flags,
465 clearF,
466 fr->fshift[0],
467 enerd->grpp.ener[egCOULSR],
468 fr->bBHAM ?
469 enerd->grpp.ener[egBHAMSR] :
470 enerd->grpp.ener[egLJSR]);
471 break;
473 case nbnxnk8x8x8_GPU:
474 nbnxn_gpu_launch_kernel(nbv->gpu_nbv, nbv->nbat, flags, ilocality);
475 break;
477 case nbnxnk8x8x8_PlainC:
478 nbnxn_kernel_gpu_ref(nbvg->nbl_lists.nbl[0],
479 nbv->nbat, ic,
480 fr->shift_vec,
481 flags,
482 clearF,
483 nbv->nbat->out[0].f,
484 fr->fshift[0],
485 enerd->grpp.ener[egCOULSR],
486 fr->bBHAM ?
487 enerd->grpp.ener[egBHAMSR] :
488 enerd->grpp.ener[egLJSR]);
489 break;
491 default:
492 GMX_RELEASE_ASSERT(false, "Invalid nonbonded kernel type passed!");
495 if (!bUsingGpuKernels)
497 wallcycle_sub_stop(wcycle, ewcsNONBONDED);
500 int enr_nbnxn_kernel_ljc, enr_nbnxn_kernel_lj;
501 if (EEL_RF(ic->eeltype) || ic->eeltype == eelCUT)
503 enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_RF;
505 else if ((!bUsingGpuKernels && nbvg->ewald_excl == ewaldexclAnalytical) ||
506 (bUsingGpuKernels && nbnxn_gpu_is_kernel_ewald_analytical(nbv->gpu_nbv)))
508 enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_EWALD;
510 else
512 enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_TAB;
514 enr_nbnxn_kernel_lj = eNR_NBNXN_LJ;
515 if (flags & GMX_FORCE_ENERGY)
517 /* In eNR_??? the nbnxn F+E kernels are always the F kernel + 1 */
518 enr_nbnxn_kernel_ljc += 1;
519 enr_nbnxn_kernel_lj += 1;
522 inc_nrnb(nrnb, enr_nbnxn_kernel_ljc,
523 nbvg->nbl_lists.natpair_ljq);
524 inc_nrnb(nrnb, enr_nbnxn_kernel_lj,
525 nbvg->nbl_lists.natpair_lj);
526 /* The Coulomb-only kernels are offset -eNR_NBNXN_LJ_RF+eNR_NBNXN_RF */
527 inc_nrnb(nrnb, enr_nbnxn_kernel_ljc-eNR_NBNXN_LJ_RF+eNR_NBNXN_RF,
528 nbvg->nbl_lists.natpair_q);
530 if (ic->vdw_modifier == eintmodFORCESWITCH)
532 /* We add up the switch cost separately */
533 inc_nrnb(nrnb, eNR_NBNXN_ADD_LJ_FSW+((flags & GMX_FORCE_ENERGY) ? 1 : 0),
534 nbvg->nbl_lists.natpair_ljq + nbvg->nbl_lists.natpair_lj);
536 if (ic->vdw_modifier == eintmodPOTSWITCH)
538 /* We add up the switch cost separately */
539 inc_nrnb(nrnb, eNR_NBNXN_ADD_LJ_PSW+((flags & GMX_FORCE_ENERGY) ? 1 : 0),
540 nbvg->nbl_lists.natpair_ljq + nbvg->nbl_lists.natpair_lj);
542 if (ic->vdwtype == evdwPME)
544 /* We add up the LJ Ewald cost separately */
545 inc_nrnb(nrnb, eNR_NBNXN_ADD_LJ_EWALD+((flags & GMX_FORCE_ENERGY) ? 1 : 0),
546 nbvg->nbl_lists.natpair_ljq + nbvg->nbl_lists.natpair_lj);
550 static void do_nb_verlet_fep(nbnxn_pairlist_set_t *nbl_lists,
551 t_forcerec *fr,
552 rvec x[],
553 rvec f[],
554 const t_mdatoms *mdatoms,
555 t_lambda *fepvals,
556 real *lambda,
557 gmx_enerdata_t *enerd,
558 int flags,
559 t_nrnb *nrnb,
560 gmx_wallcycle_t wcycle)
562 int donb_flags;
563 nb_kernel_data_t kernel_data;
564 real lam_i[efptNR];
565 real dvdl_nb[efptNR];
566 int th;
567 int i, j;
569 donb_flags = 0;
570 /* Add short-range interactions */
571 donb_flags |= GMX_NONBONDED_DO_SR;
573 /* Currently all group scheme kernels always calculate (shift-)forces */
574 if (flags & GMX_FORCE_FORCES)
576 donb_flags |= GMX_NONBONDED_DO_FORCE;
578 if (flags & GMX_FORCE_VIRIAL)
580 donb_flags |= GMX_NONBONDED_DO_SHIFTFORCE;
582 if (flags & GMX_FORCE_ENERGY)
584 donb_flags |= GMX_NONBONDED_DO_POTENTIAL;
587 kernel_data.flags = donb_flags;
588 kernel_data.lambda = lambda;
589 kernel_data.dvdl = dvdl_nb;
591 kernel_data.energygrp_elec = enerd->grpp.ener[egCOULSR];
592 kernel_data.energygrp_vdw = enerd->grpp.ener[egLJSR];
594 /* reset free energy components */
595 for (i = 0; i < efptNR; i++)
597 dvdl_nb[i] = 0;
600 GMX_ASSERT(gmx_omp_nthreads_get(emntNonbonded) == nbl_lists->nnbl, "Number of lists should be same as number of NB threads");
602 wallcycle_sub_start(wcycle, ewcsNONBONDED);
603 #pragma omp parallel for schedule(static) num_threads(nbl_lists->nnbl)
604 for (th = 0; th < nbl_lists->nnbl; th++)
608 gmx_nb_free_energy_kernel(nbl_lists->nbl_fep[th],
609 x, f, fr, mdatoms, &kernel_data, nrnb);
611 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
614 if (fepvals->sc_alpha != 0)
616 enerd->dvdl_nonlin[efptVDW] += dvdl_nb[efptVDW];
617 enerd->dvdl_nonlin[efptCOUL] += dvdl_nb[efptCOUL];
619 else
621 enerd->dvdl_lin[efptVDW] += dvdl_nb[efptVDW];
622 enerd->dvdl_lin[efptCOUL] += dvdl_nb[efptCOUL];
625 /* If we do foreign lambda and we have soft-core interactions
626 * we have to recalculate the (non-linear) energy contributions.
628 if (fepvals->n_lambda > 0 && (flags & GMX_FORCE_DHDL) && fepvals->sc_alpha != 0)
630 kernel_data.flags = (donb_flags & ~(GMX_NONBONDED_DO_FORCE | GMX_NONBONDED_DO_SHIFTFORCE)) | GMX_NONBONDED_DO_FOREIGNLAMBDA;
631 kernel_data.lambda = lam_i;
632 kernel_data.energygrp_elec = enerd->foreign_grpp.ener[egCOULSR];
633 kernel_data.energygrp_vdw = enerd->foreign_grpp.ener[egLJSR];
634 /* Note that we add to kernel_data.dvdl, but ignore the result */
636 for (i = 0; i < enerd->n_lambda; i++)
638 for (j = 0; j < efptNR; j++)
640 lam_i[j] = (i == 0 ? lambda[j] : fepvals->all_lambda[j][i-1]);
642 reset_foreign_enerdata(enerd);
643 #pragma omp parallel for schedule(static) num_threads(nbl_lists->nnbl)
644 for (th = 0; th < nbl_lists->nnbl; th++)
648 gmx_nb_free_energy_kernel(nbl_lists->nbl_fep[th],
649 x, f, fr, mdatoms, &kernel_data, nrnb);
651 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
654 sum_epot(&(enerd->foreign_grpp), enerd->foreign_term);
655 enerd->enerpart_lambda[i] += enerd->foreign_term[F_EPOT];
659 wallcycle_sub_stop(wcycle, ewcsNONBONDED);
662 gmx_bool use_GPU(const nonbonded_verlet_t *nbv)
664 return nbv != nullptr && nbv->bUseGPU;
667 static inline void clear_rvecs_omp(int n, rvec v[])
669 int nth = gmx_omp_nthreads_get_simple_rvec_task(emntDefault, n);
671 /* Note that we would like to avoid this conditional by putting it
672 * into the omp pragma instead, but then we still take the full
673 * omp parallel for overhead (at least with gcc5).
675 if (nth == 1)
677 for (int i = 0; i < n; i++)
679 clear_rvec(v[i]);
682 else
684 #pragma omp parallel for num_threads(nth) schedule(static)
685 for (int i = 0; i < n; i++)
687 clear_rvec(v[i]);
692 /*! \brief Return an estimate of the average kinetic energy or 0 when unreliable
694 * \param groupOptions Group options, containing T-coupling options
696 static real averageKineticEnergyEstimate(const t_grpopts &groupOptions)
698 real nrdfCoupled = 0;
699 real nrdfUncoupled = 0;
700 real kineticEnergy = 0;
701 for (int g = 0; g < groupOptions.ngtc; g++)
703 if (groupOptions.tau_t[g] >= 0)
705 nrdfCoupled += groupOptions.nrdf[g];
706 kineticEnergy += groupOptions.nrdf[g]*0.5*groupOptions.ref_t[g]*BOLTZ;
708 else
710 nrdfUncoupled += groupOptions.nrdf[g];
714 /* This conditional with > also catches nrdf=0 */
715 if (nrdfCoupled > nrdfUncoupled)
717 return kineticEnergy*(nrdfCoupled + nrdfUncoupled)/nrdfCoupled;
719 else
721 return 0;
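/* Rationale (standard equipartition estimate): each temperature-coupled degree of
 * freedom contributes k_B*T_ref/2, which is the 0.5*nrdf*BOLTZ*ref_t term summed above.
 * The coupled-group total is scaled by (nrdfCoupled + nrdfUncoupled)/nrdfCoupled to
 * approximate the whole system; when uncoupled degrees of freedom dominate, 0 is
 * returned to signal that the estimate is unreliable. */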
725 /*! \brief This routine checks that the potential energy is finite.
727 * Always checks that the potential energy is finite. If step equals
728 * inputrec.init_step also checks that the magnitude of the potential energy
729 * is reasonable. Terminates with a fatal error when a check fails.
730 * Note that passing this check does not guarantee finite forces,
731 * since those use slightly different arithmetic. But in most cases
732 * there is just a narrow coordinate range where forces are not finite
733 * and energies are finite.
735 * \param[in] step The step number, used for checking and printing
736 * \param[in] enerd The energy data; the non-bonded group energies need to be added to enerd.term[F_EPOT] before calling this routine
737 * \param[in] inputrec The input record
739 static void checkPotentialEnergyValidity(int64_t step,
740 const gmx_enerdata_t &enerd,
741 const t_inputrec &inputrec)
743 /* Threshold valid for comparing absolute potential energy against
744 * the kinetic energy. Normally one should not consider absolute
745 * potential energy values, but with a factor of one million
746 * we should never get false positives.
748 constexpr real c_thresholdFactor = 1e6;
750 bool energyIsNotFinite = !std::isfinite(enerd.term[F_EPOT]);
751 real averageKineticEnergy = 0;
752 /* We only check for large potential energy at the initial step,
753 * because that is by far the most likely step for this to occur
754 * and because computing the average kinetic energy is not free.
755 * Note: nstcalcenergy >> 1 often does not allow catching large energies
756 * before they become NaN.
758 if (step == inputrec.init_step && EI_DYNAMICS(inputrec.eI))
760 averageKineticEnergy = averageKineticEnergyEstimate(inputrec.opts);
763 if (energyIsNotFinite || (averageKineticEnergy > 0 &&
764 enerd.term[F_EPOT] > c_thresholdFactor*averageKineticEnergy))
766 gmx_fatal(FARGS, "Step %" PRId64 ": The total potential energy is %g, which is %s. The LJ and electrostatic contributions to the energy are %g and %g, respectively. A %s potential energy can be caused by overlapping interactions in bonded interactions or very large%s coordinate values. Usually this is caused by a badly- or non-equilibrated initial configuration, incorrect interactions or parameters in the topology.",
767 step,
768 enerd.term[F_EPOT],
769 energyIsNotFinite ? "not finite" : "extremely high",
770 enerd.term[F_LJ],
771 enerd.term[F_COUL_SR],
772 energyIsNotFinite ? "non-finite" : "very high",
773 energyIsNotFinite ? " or NaN" : "");
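/* Worked example (assumed numbers): if the coupled degrees of freedom give an average
 * kinetic energy of roughly 1e3 kJ/mol, then with c_thresholdFactor = 1e6 a potential
 * energy above ~1e9 kJ/mol at the initial step triggers this fatal error. */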
777 /*! \brief Compute forces and/or energies for special algorithms
779 * The intention is to collect all calls to algorithms that compute
780 * forces on local atoms only and that do not contribute to the local
781 * virial sum (but add their virial contribution separately).
782 * Eventually these should likely all become ForceProviders.
783 * Within this function the intention is to have algorithms that do
784 * global communication at the end, so global barriers within the MD loop
785 * are as close together as possible.
787 * \param[in] fplog The log file
788 * \param[in] cr The communication record
789 * \param[in] inputrec The input record
790 * \param[in] awh The Awh module (nullptr if none in use).
791 * \param[in] enforcedRotation Enforced rotation module.
792 * \param[in] step The current MD step
793 * \param[in] t The current time
794 * \param[in,out] wcycle Wallcycle accounting struct
795 * \param[in,out] forceProviders Pointer to a list of force providers
796 * \param[in] box The unit cell
797 * \param[in] x The coordinates
798 * \param[in] mdatoms Per atom properties
799 * \param[in] lambda Array of free-energy lambda values
800 * \param[in] forceFlags Flags that tell whether we should compute forces/energies/virial
801 * \param[in,out] forceWithVirial Force and virial buffers
802 * \param[in,out] enerd Energy buffer
803 * \param[in,out] ed Essential dynamics pointer
804 * \param[in] bNS Tells if we did neighbor searching this step, used for ED sampling
806 * \todo Remove bNS, which is used incorrectly.
807 * \todo Convert all other algorithms called here to ForceProviders.
809 static void
810 computeSpecialForces(FILE *fplog,
811 const t_commrec *cr,
812 const t_inputrec *inputrec,
813 gmx::Awh *awh,
814 gmx_enfrot *enforcedRotation,
815 int64_t step,
816 double t,
817 gmx_wallcycle_t wcycle,
818 ForceProviders *forceProviders,
819 matrix box,
820 gmx::ArrayRef<const gmx::RVec> x,
821 const t_mdatoms *mdatoms,
822 real *lambda,
823 int forceFlags,
824 gmx::ForceWithVirial *forceWithVirial,
825 gmx_enerdata_t *enerd,
826 gmx_edsam *ed,
827 gmx_bool bNS)
829 const bool computeForces = (forceFlags & GMX_FORCE_FORCES) != 0;
831 /* NOTE: Currently all ForceProviders only provide forces.
832 * When they also provide energies, remove this conditional.
834 if (computeForces)
836 gmx::ForceProviderInput forceProviderInput(x, *mdatoms, t, box, *cr);
837 gmx::ForceProviderOutput forceProviderOutput(forceWithVirial, enerd);
839 /* Collect forces from modules */
840 forceProviders->calculateForces(forceProviderInput, &forceProviderOutput);
843 if (inputrec->bPull && pull_have_potential(inputrec->pull_work))
845 pull_potential_wrapper(cr, inputrec, box, x,
846 forceWithVirial,
847 mdatoms, enerd, lambda, t,
848 wcycle);
850 if (awh)
852 enerd->term[F_COM_PULL] +=
853 awh->applyBiasForcesAndUpdateBias(inputrec->ePBC, *mdatoms, box,
854 forceWithVirial,
855 t, step, wcycle, fplog);
859 rvec *f = as_rvec_array(forceWithVirial->force_.data());
861 /* Add the forces from enforced rotation potentials (if any) */
862 if (inputrec->bRot)
864 wallcycle_start(wcycle, ewcROTadd);
865 enerd->term[F_COM_PULL] += add_rot_forces(enforcedRotation, f, cr, step, t);
866 wallcycle_stop(wcycle, ewcROTadd);
869 if (ed)
871 /* Note that since init_edsam() is called after the initialization
872 * of forcerec, edsam doesn't request the noVirSum force buffer.
873 * Thus if no other algorithm (e.g. PME) requires it, the forces
874 * here will contribute to the virial.
876 do_flood(cr, inputrec, as_rvec_array(x.data()), f, ed, box, step, bNS);
879 /* Add forces from interactive molecular dynamics (IMD), if bIMD == TRUE. */
880 if (inputrec->bIMD && computeForces)
882 IMD_apply_forces(inputrec->bIMD, inputrec->imd, cr, f, wcycle);
886 /*! \brief Launch the prepare_step and spread stages of PME GPU.
888 * \param[in] pmedata The PME structure
889 * \param[in] box The box matrix
890 * \param[in] x Coordinate array
891 * \param[in] flags Force flags
892 * \param[in] wcycle The wallcycle structure
894 static inline void launchPmeGpuSpread(gmx_pme_t *pmedata,
895 matrix box,
896 rvec x[],
897 int flags,
898 gmx_wallcycle_t wcycle)
900 int pmeFlags = GMX_PME_SPREAD | GMX_PME_SOLVE;
901 pmeFlags |= (flags & GMX_FORCE_FORCES) ? GMX_PME_CALC_F : 0;
902 pmeFlags |= (flags & GMX_FORCE_VIRIAL) ? GMX_PME_CALC_ENER_VIR : 0;
904 pme_gpu_prepare_computation(pmedata, (flags & GMX_FORCE_DYNAMICBOX) != 0, box, wcycle, pmeFlags);
905 pme_gpu_launch_spread(pmedata, x, wcycle);
908 /*! \brief Launch the FFT and gather stages of PME GPU
910 * This function only implements setting the output forces (no accumulation).
912 * \param[in] pmedata The PME structure
913 * \param[in] wcycle The wallcycle structure
915 static void launchPmeGpuFftAndGather(gmx_pme_t *pmedata,
916 gmx_wallcycle_t wcycle)
918 pme_gpu_launch_complex_transforms(pmedata, wcycle);
919 pme_gpu_launch_gather(pmedata, wcycle, PmeForceOutputHandling::Set);
922 /*! \brief
923 * Polling wait for either of the PME or nonbonded GPU tasks.
925 * Instead of a static order in waiting for GPU tasks, this function
926 * polls checking which of the two tasks completes first, and does the
927 * associated force buffer reduction overlapped with the other task.
928 * By doing that, unlike static scheduling order, it can always overlap
929 * one of the reductions, regardless of the GPU task completion order.
931 * \param[in] nbv Nonbonded verlet structure
932 * \param[in] pmedata PME module data
933 * \param[in,out] force Force array to reduce task outputs into.
934 * \param[in,out] forceWithVirial Force and virial buffers
935 * \param[in,out] fshift Shift force output vector results are reduced into
936 * \param[in,out] enerd Energy data structure results are reduced into
937 * \param[in] flags Force flags
938 * \param[in] haveOtherWork Tells whether there is other work than non-bonded in the stream(s)
939 * \param[in] wcycle The wallcycle structure
941 static void alternatePmeNbGpuWaitReduce(nonbonded_verlet_t *nbv,
942 const gmx_pme_t *pmedata,
943 gmx::ArrayRefWithPadding<gmx::RVec> *force,
944 gmx::ForceWithVirial *forceWithVirial,
945 rvec fshift[],
946 gmx_enerdata_t *enerd,
947 int flags,
948 bool haveOtherWork,
949 gmx_wallcycle_t wcycle)
951 bool isPmeGpuDone = false;
952 bool isNbGpuDone = false;
955 gmx::ArrayRef<const gmx::RVec> pmeGpuForces;
957 while (!isPmeGpuDone || !isNbGpuDone)
959 if (!isPmeGpuDone)
961 matrix vir_Q;
962 real Vlr_q;
964 GpuTaskCompletion completionType = (isNbGpuDone) ? GpuTaskCompletion::Wait : GpuTaskCompletion::Check;
965 isPmeGpuDone = pme_gpu_try_finish_task(pmedata, wcycle, &pmeGpuForces,
966 vir_Q, &Vlr_q, completionType);
968 if (isPmeGpuDone)
970 pme_gpu_reduce_outputs(wcycle, forceWithVirial, pmeGpuForces,
971 enerd, vir_Q, Vlr_q);
975 if (!isNbGpuDone)
977 GpuTaskCompletion completionType = (isPmeGpuDone) ? GpuTaskCompletion::Wait : GpuTaskCompletion::Check;
978 wallcycle_start_nocount(wcycle, ewcWAIT_GPU_NB_L);
979 isNbGpuDone = nbnxn_gpu_try_finish_task(nbv->gpu_nbv,
980 flags, eatLocal,
981 haveOtherWork,
982 enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
983 fshift, completionType);
984 wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
985 // To get the call count right, when the task finished we
986 // issue a start/stop.
987 // TODO: move the ewcWAIT_GPU_NB_L cycle counting into nbnxn_gpu_try_finish_task()
988 // and ewcNB_XF_BUF_OPS counting into nbnxn_atomdata_add_nbat_f_to_f().
989 if (isNbGpuDone)
991 wallcycle_start(wcycle, ewcWAIT_GPU_NB_L);
992 wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
994 nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), eatLocal,
995 nbv->nbat, as_rvec_array(force->unpaddedArrayRef().data()), wcycle);
1001 /*! \brief
1002 * Launch the dynamic rolling pruning GPU task.
1004 * We currently alternate local/non-local list pruning in odd-even steps
1005 * (only pruning every second step without DD).
1007 * \param[in] cr The communication record
1008 * \param[in] nbv Nonbonded verlet structure
1009 * \param[in] inputrec The input record
1010 * \param[in] step The current MD step
1012 static inline void launchGpuRollingPruning(const t_commrec *cr,
1013 const nonbonded_verlet_t *nbv,
1014 const t_inputrec *inputrec,
1015 const int64_t step)
1017 /* We should not launch the rolling pruning kernel at a search
1018 * step or just before search steps, since that's useless.
1019 * Without domain decomposition we prune at even steps.
1020 * With domain decomposition we alternate local and non-local
1021 * pruning at even and odd steps.
1023 int numRollingParts = nbv->listParams->numRollingParts;
1024 GMX_ASSERT(numRollingParts == nbv->listParams->nstlistPrune/2, "Since we alternate local/non-local at even/odd steps, we need numRollingParts<=nstlistPrune/2 for correctness and == for efficiency");
1025 int stepWithCurrentList = step - nbv->grp[eintLocal].nbl_lists.outerListCreationStep;
1026 bool stepIsEven = ((stepWithCurrentList & 1) == 0);
1027 if (stepWithCurrentList > 0 &&
1028 stepWithCurrentList < inputrec->nstlist - 1 &&
1029 (stepIsEven || DOMAINDECOMP(cr)))
1031 nbnxn_gpu_launch_kernel_pruneonly(nbv->gpu_nbv,
1032 stepIsEven ? eintLocal : eintNonlocal,
1033 numRollingParts);
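/* Example schedule (assumed nstlist = 20, values not taken from this file): rolling-prune
 * launches are possible at steps 1..18 after list creation; without domain decomposition
 * only the even steps prune (the local list), while with domain decomposition even steps
 * prune the local list and odd steps the non-local list. */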
1037 static void do_force_cutsVERLET(FILE *fplog,
1038 const t_commrec *cr,
1039 const gmx_multisim_t *ms,
1040 const t_inputrec *inputrec,
1041 gmx::Awh *awh,
1042 gmx_enfrot *enforcedRotation,
1043 int64_t step,
1044 t_nrnb *nrnb,
1045 gmx_wallcycle_t wcycle,
1046 const gmx_localtop_t *top,
1047 const gmx_groups_t * /* groups */,
1048 matrix box, gmx::ArrayRefWithPadding<gmx::RVec> x,
1049 history_t *hist,
1050 gmx::ArrayRefWithPadding<gmx::RVec> force,
1051 tensor vir_force,
1052 const t_mdatoms *mdatoms,
1053 gmx_enerdata_t *enerd, t_fcdata *fcd,
1054 real *lambda,
1055 t_graph *graph,
1056 t_forcerec *fr,
1057 gmx::PpForceWorkload *ppForceWorkload,
1058 interaction_const_t *ic,
1059 const gmx_vsite_t *vsite,
1060 rvec mu_tot,
1061 double t,
1062 gmx_edsam *ed,
1063 int flags,
1064 DdOpenBalanceRegionBeforeForceComputation ddOpenBalanceRegion,
1065 DdCloseBalanceRegionAfterForceComputation ddCloseBalanceRegion)
1067 int cg1, i, j;
1068 double mu[2*DIM];
1069 gmx_bool bStateChanged, bNS, bFillGrid, bCalcCGCM;
1070 gmx_bool bDoForces, bUseGPU, bUseOrEmulGPU;
1071 rvec vzero, box_diag;
1072 float cycles_pme, cycles_wait_gpu;
1073 nonbonded_verlet_t *nbv = fr->nbv;
1075 bStateChanged = ((flags & GMX_FORCE_STATECHANGED) != 0);
1076 bNS = ((flags & GMX_FORCE_NS) != 0) && (!fr->bAllvsAll);
1077 bFillGrid = (bNS && bStateChanged);
1078 bCalcCGCM = (bFillGrid && !DOMAINDECOMP(cr));
1079 bDoForces = ((flags & GMX_FORCE_FORCES) != 0);
1080 bUseGPU = fr->nbv->bUseGPU;
1081 bUseOrEmulGPU = bUseGPU || (fr->nbv->emulateGpu == EmulateGpuNonbonded::Yes);
1083 const auto pmeRunMode = fr->pmedata ? pme_run_mode(fr->pmedata) : PmeRunMode::CPU;
1084 // TODO slim this conditional down - inputrec and duty checks should mean the same in proper code!
1085 const bool useGpuPme = EEL_PME(fr->ic->eeltype) && thisRankHasDuty(cr, DUTY_PME) &&
1086 ((pmeRunMode == PmeRunMode::GPU) || (pmeRunMode == PmeRunMode::Mixed));
1088 /* At a search step we need to start the first balancing region
1089 * somewhere early inside the step after communication during domain
1090 * decomposition (and not during the previous step as usual).
1092 if (bNS &&
1093 ddOpenBalanceRegion == DdOpenBalanceRegionBeforeForceComputation::yes)
1095 ddOpenBalanceRegionCpu(cr->dd, DdAllowBalanceRegionReopen::yes);
1098 cycles_wait_gpu = 0;
1100 const int start = 0;
1101 const int homenr = mdatoms->homenr;
1103 clear_mat(vir_force);
1105 if (DOMAINDECOMP(cr))
1107 cg1 = cr->dd->globalAtomGroupIndices.size();
1109 else
1111 cg1 = top->cgs.nr;
1113 if (fr->n_tpi > 0)
1115 cg1--;
1118 if (bStateChanged)
1120 update_forcerec(fr, box);
1122 if (inputrecNeedMutot(inputrec))
1124 /* Calculate total (local) dipole moment in a temporary common array.
1125 * This makes it possible to sum them over nodes faster.
1127 calc_mu(start, homenr,
1128 x.unpaddedArrayRef(), mdatoms->chargeA, mdatoms->chargeB, mdatoms->nChargePerturbed,
1129 mu, mu+DIM);
1133 if (fr->ePBC != epbcNONE)
1135 /* Compute shift vectors every step,
1136 * because of pressure coupling or box deformation!
1138 if ((flags & GMX_FORCE_DYNAMICBOX) && bStateChanged)
1140 calc_shifts(box, fr->shift_vec);
1143 if (bCalcCGCM)
1145 put_atoms_in_box_omp(fr->ePBC, box, x.unpaddedArrayRef().subArray(0, homenr));
1146 inc_nrnb(nrnb, eNR_SHIFTX, homenr);
1148 else if (EI_ENERGY_MINIMIZATION(inputrec->eI) && graph)
1150 unshift_self(graph, box, as_rvec_array(x.unpaddedArrayRef().data()));
1154 nbnxn_atomdata_copy_shiftvec((flags & GMX_FORCE_DYNAMICBOX) != 0,
1155 fr->shift_vec, nbv->nbat);
1157 #if GMX_MPI
1158 if (!thisRankHasDuty(cr, DUTY_PME))
1160 /* Send particle coordinates to the pme nodes.
1161 * Since this is only implemented for domain decomposition
1162 * and domain decomposition does not use the graph,
1163 * we do not need to worry about shifting.
1165 gmx_pme_send_coordinates(cr, box, as_rvec_array(x.unpaddedArrayRef().data()),
1166 lambda[efptCOUL], lambda[efptVDW],
1167 (flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY)) != 0,
1168 step, wcycle);
1170 #endif /* GMX_MPI */
1172 if (useGpuPme)
1174 launchPmeGpuSpread(fr->pmedata, box, as_rvec_array(x.unpaddedArrayRef().data()), flags, wcycle);
1177 /* do gridding for pair search */
1178 if (bNS)
1180 if (graph && bStateChanged)
1182 /* Calculate intramolecular shift vectors to make molecules whole */
1183 mk_mshift(fplog, graph, fr->ePBC, box, as_rvec_array(x.unpaddedArrayRef().data()));
1186 clear_rvec(vzero);
1187 box_diag[XX] = box[XX][XX];
1188 box_diag[YY] = box[YY][YY];
1189 box_diag[ZZ] = box[ZZ][ZZ];
1191 wallcycle_start(wcycle, ewcNS);
1192 if (!DOMAINDECOMP(cr))
1194 wallcycle_sub_start(wcycle, ewcsNBS_GRID_LOCAL);
1195 nbnxn_put_on_grid(nbv->nbs.get(), fr->ePBC, box,
1196 0, vzero, box_diag,
1197 nullptr, 0, mdatoms->homenr, -1,
1198 fr->cginfo, x.unpaddedArrayRef(),
1199 0, nullptr,
1200 nbv->grp[eintLocal].kernel_type,
1201 nbv->nbat);
1202 wallcycle_sub_stop(wcycle, ewcsNBS_GRID_LOCAL);
1204 else
1206 wallcycle_sub_start(wcycle, ewcsNBS_GRID_NONLOCAL);
1207 nbnxn_put_on_grid_nonlocal(nbv->nbs.get(), domdec_zones(cr->dd),
1208 fr->cginfo, x.unpaddedArrayRef(),
1209 nbv->grp[eintNonlocal].kernel_type,
1210 nbv->nbat);
1211 wallcycle_sub_stop(wcycle, ewcsNBS_GRID_NONLOCAL);
1214 nbnxn_atomdata_set(nbv->nbat, nbv->nbs.get(), mdatoms, fr->cginfo);
1216 /* Now that we have put all atoms on the grid, we can assign bonded interactions
1217 * to the GPU, where the grid order is needed.
1219 if (fr->gpuBondedLists)
1221 assign_bondeds_to_gpu(fr->gpuBondedLists,
1222 nbnxn_get_gridindices(fr->nbv->nbs.get()),
1223 top->idef);
1225 update_gpu_bonded(fr->gpuBondedLists);
1226 ppForceWorkload->haveGpuBondedWork = bonded_gpu_have_interactions(fr->gpuBondedLists);
1229 wallcycle_stop(wcycle, ewcNS);
1232 /* initialize the GPU atom data and copy shift vector */
1233 if (bUseGPU)
1235 wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
1236 wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1238 if (bNS)
1240 nbnxn_gpu_init_atomdata(nbv->gpu_nbv, nbv->nbat);
1243 nbnxn_gpu_upload_shiftvec(nbv->gpu_nbv, nbv->nbat);
1245 wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1246 wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1249 /* do local pair search */
1250 if (bNS)
1252 wallcycle_start_nocount(wcycle, ewcNS);
1253 wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_LOCAL);
1254 nbnxn_make_pairlist(nbv->nbs.get(), nbv->nbat,
1255 &top->excls,
1256 nbv->listParams->rlistOuter,
1257 nbv->min_ci_balanced,
1258 &nbv->grp[eintLocal].nbl_lists,
1259 eintLocal,
1260 nbv->grp[eintLocal].kernel_type,
1261 nrnb);
1262 nbv->grp[eintLocal].nbl_lists.outerListCreationStep = step;
1263 if (nbv->listParams->useDynamicPruning && !bUseGPU)
1265 nbnxnPrepareListForDynamicPruning(&nbv->grp[eintLocal].nbl_lists);
1267 wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_LOCAL);
1269 if (bUseGPU)
1271 /* initialize local pair-list on the GPU */
1272 nbnxn_gpu_init_pairlist(nbv->gpu_nbv,
1273 nbv->grp[eintLocal].nbl_lists.nbl[0],
1274 eintLocal);
1276 wallcycle_stop(wcycle, ewcNS);
1278 else
1280 nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs.get(), eatLocal, FALSE, as_rvec_array(x.unpaddedArrayRef().data()),
1281 nbv->nbat, wcycle);
1284 if (bUseGPU)
1286 if (DOMAINDECOMP(cr))
1288 ddOpenBalanceRegionGpu(cr->dd);
1291 wallcycle_start(wcycle, ewcLAUNCH_GPU);
1292 wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1293 /* launch local nonbonded work on GPU */
1294 do_nb_verlet(fr, ic, enerd, flags, eintLocal, enbvClearFNo,
1295 step, nrnb, wcycle);
1296 wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1298 if (ppForceWorkload->haveGpuBondedWork && !DOMAINDECOMP(cr))
1300 do_bonded_gpu(fr, flags,
1301 nbnxn_gpu_get_xq(nbv->gpu_nbv), box,
1302 nbnxn_gpu_get_f(nbv->gpu_nbv),
1303 nbnxn_gpu_get_fshift(nbv->gpu_nbv));
1306 wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1309 if (useGpuPme)
1311 // In PME GPU and mixed mode we launch FFT / gather after the
1312 // X copy/transform to allow overlap as well as after the GPU NB
1313 // launch to avoid FFT launch overhead hijacking the CPU and delaying
1314 // the nonbonded kernel.
1315 launchPmeGpuFftAndGather(fr->pmedata, wcycle);
1318 /* Communicate coordinates and sum dipole if necessary +
1319 do non-local pair search */
1320 if (DOMAINDECOMP(cr))
1322 if (bNS)
1324 wallcycle_start_nocount(wcycle, ewcNS);
1325 wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_NONLOCAL);
1327 nbnxn_make_pairlist(nbv->nbs.get(), nbv->nbat,
1328 &top->excls,
1329 nbv->listParams->rlistOuter,
1330 nbv->min_ci_balanced,
1331 &nbv->grp[eintNonlocal].nbl_lists,
1332 eintNonlocal,
1333 nbv->grp[eintNonlocal].kernel_type,
1334 nrnb);
1335 nbv->grp[eintNonlocal].nbl_lists.outerListCreationStep = step;
1336 if (nbv->listParams->useDynamicPruning && !bUseGPU)
1338 nbnxnPrepareListForDynamicPruning(&nbv->grp[eintNonlocal].nbl_lists);
1340 wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_NONLOCAL);
1342 if (nbv->grp[eintNonlocal].kernel_type == nbnxnk8x8x8_GPU)
1344 /* initialize non-local pair-list on the GPU */
1345 nbnxn_gpu_init_pairlist(nbv->gpu_nbv,
1346 nbv->grp[eintNonlocal].nbl_lists.nbl[0],
1347 eintNonlocal);
1349 wallcycle_stop(wcycle, ewcNS);
1351 else
1353 dd_move_x(cr->dd, box, x.unpaddedArrayRef(), wcycle);
1355 nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs.get(), eatNonlocal, FALSE, as_rvec_array(x.unpaddedArrayRef().data()),
1356 nbv->nbat, wcycle);
1359 if (bUseGPU)
1361 wallcycle_start(wcycle, ewcLAUNCH_GPU);
1362 wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1363 /* launch non-local nonbonded tasks on GPU */
1364 do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFNo,
1365 step, nrnb, wcycle);
1366 wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1368 if (ppForceWorkload->haveGpuBondedWork)
1370 do_bonded_gpu(fr, flags,
1371 nbnxn_gpu_get_xq(nbv->gpu_nbv), box,
1372 nbnxn_gpu_get_f(nbv->gpu_nbv),
1373 nbnxn_gpu_get_fshift(nbv->gpu_nbv));
1376 wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1380 if (bUseGPU)
1382 /* launch D2H copy-back F */
1383 wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
1384 wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1385 if (DOMAINDECOMP(cr))
1387 nbnxn_gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat,
1388 flags, eatNonlocal, ppForceWorkload->haveGpuBondedWork);
1390 nbnxn_gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat,
1391 flags, eatLocal, ppForceWorkload->haveGpuBondedWork);
1392 wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1393 wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1396 if (bStateChanged && inputrecNeedMutot(inputrec))
1398 if (PAR(cr))
1400 gmx_sumd(2*DIM, mu, cr);
1401 ddReopenBalanceRegionCpu(cr->dd);
1404 for (i = 0; i < 2; i++)
1406 for (j = 0; j < DIM; j++)
1408 fr->mu_tot[i][j] = mu[i*DIM + j];
1412 if (fr->efep == efepNO)
1414 copy_rvec(fr->mu_tot[0], mu_tot);
1416 else
1418 for (j = 0; j < DIM; j++)
1420 mu_tot[j] =
1421 (1.0 - lambda[efptCOUL])*fr->mu_tot[0][j] +
1422 lambda[efptCOUL]*fr->mu_tot[1][j];
1426 /* Reset energies */
1427 reset_enerdata(enerd);
1428 clear_rvecs(SHIFTS, fr->fshift);
1430 if (DOMAINDECOMP(cr) && !thisRankHasDuty(cr, DUTY_PME))
1432 wallcycle_start(wcycle, ewcPPDURINGPME);
1433 dd_force_flop_start(cr->dd, nrnb);
1436 if (inputrec->bRot)
1438 wallcycle_start(wcycle, ewcROT);
1439 do_rotation(cr, enforcedRotation, box, as_rvec_array(x.unpaddedArrayRef().data()), t, step, bNS);
1440 wallcycle_stop(wcycle, ewcROT);
1443 /* Temporary solution until all routines take PaddedRVecVector */
1444 rvec *const f = as_rvec_array(force.unpaddedArrayRef().data());
1446 /* Start the force cycle counter.
1447 * Note that a different counter is used for dynamic load balancing.
1449 wallcycle_start(wcycle, ewcFORCE);
1451 gmx::ArrayRef<gmx::RVec> forceRef = force.unpaddedArrayRef();
1452 if (bDoForces)
1454 /* If we need to compute the virial, we might need a separate
1455 * force buffer for algorithms for which the virial is calculated
1456 * directly, such as PME.
1458 if ((flags & GMX_FORCE_VIRIAL) && fr->haveDirectVirialContributions)
1460 forceRef = *fr->forceBufferForDirectVirialContributions;
1462 /* TODO: update comment
1463 * We only compute forces on local atoms. Note that vsites can
1464 * spread to non-local atoms, but that part of the buffer is
1465 * cleared separately in the vsite spreading code.
1467 clear_rvecs_omp(forceRef.size(), as_rvec_array(forceRef.data()));
1469 /* Clear the short- and long-range forces */
1470 clear_rvecs_omp(fr->natoms_force_constr, f);
1473 /* forceWithVirial uses the local atom range only */
1474 gmx::ForceWithVirial forceWithVirial(forceRef, (flags & GMX_FORCE_VIRIAL) != 0);
1476 if (inputrec->bPull && pull_have_constraint(inputrec->pull_work))
1478 clear_pull_forces(inputrec->pull_work);
1481 /* We calculate the non-bonded forces, when done on the CPU, here.
1482 * We do this before calling do_force_lowlevel, because in that
1483 * function, the listed forces are calculated before PME, which
1484 * does communication. With this order, non-bonded and listed
1485 * force calculation imbalance can be balanced out by the domain
1486 * decomposition load balancing.
1489 if (!bUseOrEmulGPU)
1491 do_nb_verlet(fr, ic, enerd, flags, eintLocal, enbvClearFYes,
1492 step, nrnb, wcycle);
1495 if (fr->efep != efepNO)
1497 /* Calculate the local and non-local free energy interactions here.
1498 * Happens here on the CPU both with and without GPU.
1500 if (fr->nbv->grp[eintLocal].nbl_lists.nbl_fep[0]->nrj > 0)
1502 do_nb_verlet_fep(&fr->nbv->grp[eintLocal].nbl_lists,
1503 fr, as_rvec_array(x.unpaddedArrayRef().data()), f, mdatoms,
1504 inputrec->fepvals, lambda,
1505 enerd, flags, nrnb, wcycle);
1508 if (DOMAINDECOMP(cr) &&
1509 fr->nbv->grp[eintNonlocal].nbl_lists.nbl_fep[0]->nrj > 0)
1511 do_nb_verlet_fep(&fr->nbv->grp[eintNonlocal].nbl_lists,
1512 fr, as_rvec_array(x.unpaddedArrayRef().data()), f, mdatoms,
1513 inputrec->fepvals, lambda,
1514 enerd, flags, nrnb, wcycle);
1518 if (!bUseOrEmulGPU)
1520 int aloc;
1522 if (DOMAINDECOMP(cr))
1524 do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFNo,
1525 step, nrnb, wcycle);
1528 if (!bUseOrEmulGPU)
1530 aloc = eintLocal;
1532 else
1534 aloc = eintNonlocal;
1537 /* Add all the non-bonded forces to the normal force array.
1538 * This can be split into a local and a non-local part when overlapping
1539 * communication with calculation with domain decomposition.
1541 wallcycle_stop(wcycle, ewcFORCE);
1543 nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), eatAll, nbv->nbat, f, wcycle);
1545 wallcycle_start_nocount(wcycle, ewcFORCE);
1547 /* if there are multiple fshift output buffers reduce them */
1548 if ((flags & GMX_FORCE_VIRIAL) &&
1549 nbv->grp[aloc].nbl_lists.nnbl > 1)
1551 /* This is not in a subcounter because it takes a
1552 negligible and constant-sized amount of time */
1553 nbnxn_atomdata_add_nbat_fshift_to_fshift(nbv->nbat,
1554 fr->fshift);
1558 /* update QMMMrec, if necessary */
1559 if (fr->bQMMM)
1561 update_QMMMrec(cr, fr, as_rvec_array(x.unpaddedArrayRef().data()), mdatoms, box);
1564 /* Compute the bonded and non-bonded energies and optionally forces */
1565 do_force_lowlevel(fr, inputrec, &(top->idef),
1566 cr, ms, nrnb, wcycle, mdatoms,
1567 as_rvec_array(x.unpaddedArrayRef().data()), hist, f, &forceWithVirial, enerd, fcd,
1568 box, inputrec->fepvals, lambda, graph, &(top->excls), fr->mu_tot,
1569 flags, &cycles_pme);
1571 wallcycle_stop(wcycle, ewcFORCE);
1573 computeSpecialForces(fplog, cr, inputrec, awh, enforcedRotation,
1574 step, t, wcycle,
1575 fr->forceProviders, box, x.unpaddedArrayRef(), mdatoms, lambda,
1576 flags, &forceWithVirial, enerd,
1577 ed, bNS);
1579 if (bUseOrEmulGPU)
1581 /* wait for non-local forces (or calculate in emulation mode) */
1582 if (DOMAINDECOMP(cr))
1584 if (bUseGPU)
1586 wallcycle_start(wcycle, ewcWAIT_GPU_NB_NL);
1587 nbnxn_gpu_wait_finish_task(nbv->gpu_nbv,
1588 flags, eatNonlocal,
1589 ppForceWorkload->haveGpuBondedWork,
1590 enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
1591 fr->fshift);
1592 cycles_wait_gpu += wallcycle_stop(wcycle, ewcWAIT_GPU_NB_NL);
1594 else
1596 wallcycle_start_nocount(wcycle, ewcFORCE);
1597 do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFYes,
1598 step, nrnb, wcycle);
1599 wallcycle_stop(wcycle, ewcFORCE);
1602 /* skip the reduction if there was no non-local work to do */
1603 if (nbv->grp[eintNonlocal].nbl_lists.nbl[0]->nsci > 0)
1605 nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), eatNonlocal,
1606 nbv->nbat, f, wcycle);
1611 if (DOMAINDECOMP(cr))
1613 /* We are done with the CPU compute.
1614 * We will now communicate the non-local forces.
1615 * If we use a GPU this will overlap with GPU work, so in that case
1616 * we do not close the DD force balancing region here.
1618 if (ddCloseBalanceRegion == DdCloseBalanceRegionAfterForceComputation::yes)
1620 ddCloseBalanceRegionCpu(cr->dd);
1622 if (bDoForces)
1624 dd_move_f(cr->dd, force.unpaddedArrayRef(), fr->fshift, wcycle);
1628 // With both nonbonded and PME offloaded to a GPU on the same rank, we use
1629 // an alternating wait/reduction scheme.
1630 bool alternateGpuWait = (!c_disableAlternatingWait && useGpuPme && bUseGPU && !DOMAINDECOMP(cr));
1631 if (alternateGpuWait)
1633 alternatePmeNbGpuWaitReduce(fr->nbv, fr->pmedata, &force, &forceWithVirial, fr->fshift, enerd, flags, ppForceWorkload->haveGpuBondedWork, wcycle);
1636 if (!alternateGpuWait && useGpuPme)
1638 gmx::ArrayRef<const gmx::RVec> pmeGpuForces;
1639 matrix vir_Q;
1640 real Vlr_q = 0.0;
1641 pme_gpu_wait_finish_task(fr->pmedata, wcycle, &pmeGpuForces, vir_Q, &Vlr_q);
1642 pme_gpu_reduce_outputs(wcycle, &forceWithVirial, pmeGpuForces, enerd, vir_Q, Vlr_q);
1645 /* Wait for local GPU NB outputs on the non-alternating wait path */
1646 if (!alternateGpuWait && bUseGPU)
1648 /* Measured overhead on CUDA and OpenCL with(out) GPU sharing
1649 * is between 0.5 and 1.5 Mcycles. So 2 Mcycles is an overestimate,
1650 * but even with a step of 0.1 ms the difference is less than 1%
1651 * of the step time.
1653 const float gpuWaitApiOverheadMargin = 2e6f; /* cycles */
1655 wallcycle_start(wcycle, ewcWAIT_GPU_NB_L);
1656 nbnxn_gpu_wait_finish_task(nbv->gpu_nbv,
1657 flags, eatLocal, ppForceWorkload->haveGpuBondedWork,
1658 enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
1659 fr->fshift);
1660 float cycles_tmp = wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
1662 if (ddCloseBalanceRegion == DdCloseBalanceRegionAfterForceComputation::yes)
1664 DdBalanceRegionWaitedForGpu waitedForGpu = DdBalanceRegionWaitedForGpu::yes;
1665 if (bDoForces && cycles_tmp <= gpuWaitApiOverheadMargin)
1667 /* We measured few cycles, it could be that the kernel
1668 * and transfer finished earlier and there was no actual
1669 * wait time, only API call overhead.
1670 * Then the actual time could be anywhere between 0 and
1671 * cycles_wait_est. We will use half of cycles_wait_est.
1673 waitedForGpu = DdBalanceRegionWaitedForGpu::no;
1675 ddCloseBalanceRegionGpu(cr->dd, cycles_wait_gpu, waitedForGpu);
1679 if (fr->nbv->emulateGpu == EmulateGpuNonbonded::Yes)
1681 // NOTE: emulation kernel is not included in the balancing region,
1682 // but emulation mode does not target performance anyway
1683 wallcycle_start_nocount(wcycle, ewcFORCE);
1684 do_nb_verlet(fr, ic, enerd, flags, eintLocal,
1685 DOMAINDECOMP(cr) ? enbvClearFNo : enbvClearFYes,
1686 step, nrnb, wcycle);
1687 wallcycle_stop(wcycle, ewcFORCE);
1690 if (useGpuPme)
1692 pme_gpu_reinit_computation(fr->pmedata, wcycle);
1695 if (bUseGPU)
1697 /* now clear the GPU outputs while we finish the step on the CPU */
1698 wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
1699 wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1700 nbnxn_gpu_clear_outputs(nbv->gpu_nbv, flags);
1702 /* Is dynamic pair-list pruning activated? */
1703 if (nbv->listParams->useDynamicPruning)
1705 launchGpuRollingPruning(cr, nbv, inputrec, step);
1707 wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1708 wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1711 /* Do the nonbonded GPU (or emulation) force buffer reduction
1712 * on the non-alternating path. */
1713 if (bUseOrEmulGPU && !alternateGpuWait)
1715 nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), eatLocal,
1716 nbv->nbat, f, wcycle);
1719 if (ppForceWorkload->haveGpuBondedWork && (flags & GMX_FORCE_ENERGY))
1721 bonded_gpu_get_energies(fr, enerd);
1723 bonded_gpu_clear_energies(fr->gpuBondedLists);
1726 if (DOMAINDECOMP(cr))
1728 dd_force_flop_stop(cr->dd, nrnb);
1731 if (bDoForces)
1733 /* If we have NoVirSum forces, but we do not calculate the virial,
1734 * we sum fr->f_novirsum=f later.
1736 if (vsite && !(fr->haveDirectVirialContributions && !(flags & GMX_FORCE_VIRIAL)))
1738 spread_vsite_f(vsite, as_rvec_array(x.unpaddedArrayRef().data()), f, fr->fshift, FALSE, nullptr, nrnb,
1739 &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr, wcycle);
1742 if (flags & GMX_FORCE_VIRIAL)
1744 /* Calculation of the virial must be done after vsites! */
1745 calc_virial(0, mdatoms->homenr, as_rvec_array(x.unpaddedArrayRef().data()), f,
1746 vir_force, graph, box, nrnb, fr, inputrec->ePBC);
1750 if (PAR(cr) && !thisRankHasDuty(cr, DUTY_PME))
1752 /* In case of node-splitting, the PP nodes receive the long-range
1753 * forces, virial and energy from the PME nodes here.
1755 pme_receive_force_ener(cr, &forceWithVirial, enerd, wcycle);
1758 if (bDoForces)
1760 post_process_forces(cr, step, nrnb, wcycle,
1761 top, box, as_rvec_array(x.unpaddedArrayRef().data()), f, &forceWithVirial,
1762 vir_force, mdatoms, graph, fr, vsite,
1763 flags);
1766 if (flags & GMX_FORCE_ENERGY)
1768 /* Sum the potential energy terms from group contributions */
1769 sum_epot(&(enerd->grpp), enerd->term);
1771 if (!EI_TPI(inputrec->eI))
1773 checkPotentialEnergyValidity(step, *enerd, *inputrec);
1778 static void do_force_cutsGROUP(FILE *fplog,
1779 const t_commrec *cr,
1780 const gmx_multisim_t *ms,
1781 const t_inputrec *inputrec,
1782 gmx::Awh *awh,
1783 gmx_enfrot *enforcedRotation,
1784 int64_t step,
1785 t_nrnb *nrnb,
1786 gmx_wallcycle_t wcycle,
1787 gmx_localtop_t *top,
1788 const gmx_groups_t *groups,
1789 matrix box, gmx::ArrayRefWithPadding<gmx::RVec> x,
1790 history_t *hist,
1791 gmx::ArrayRefWithPadding<gmx::RVec> force,
1792 tensor vir_force,
1793 const t_mdatoms *mdatoms,
1794 gmx_enerdata_t *enerd,
1795 t_fcdata *fcd,
1796 real *lambda,
1797 t_graph *graph,
1798 t_forcerec *fr,
1799 const gmx_vsite_t *vsite,
1800 rvec mu_tot,
1801 double t,
1802 gmx_edsam *ed,
1803 int flags,
1804 DdOpenBalanceRegionBeforeForceComputation ddOpenBalanceRegion,
1805 DdCloseBalanceRegionAfterForceComputation ddCloseBalanceRegion)
1807 int cg0, cg1, i, j;
1808 double mu[2*DIM];
1809 gmx_bool bStateChanged, bNS, bFillGrid, bCalcCGCM;
1810 gmx_bool bDoForces;
1811 float cycles_pme;
1813 const int start = 0;
1814 const int homenr = mdatoms->homenr;
1816 clear_mat(vir_force);
1818 cg0 = 0;
1819 if (DOMAINDECOMP(cr))
1821 cg1 = cr->dd->globalAtomGroupIndices.size();
1823 else
1825 cg1 = top->cgs.nr;
1827 if (fr->n_tpi > 0)
1829 cg1--;
1832 bStateChanged = ((flags & GMX_FORCE_STATECHANGED) != 0);
1833 bNS = ((flags & GMX_FORCE_NS) != 0) && (!fr->bAllvsAll);
1834 /* Should we perform the long-range nonbonded evaluation inside the neighborsearching? */
1835 bFillGrid = (bNS && bStateChanged);
1836 bCalcCGCM = (bFillGrid && !DOMAINDECOMP(cr));
1837 bDoForces = ((flags & GMX_FORCE_FORCES) != 0);
1839 if (bStateChanged)
1841 update_forcerec(fr, box);
1843 if (inputrecNeedMutot(inputrec))
1845 /* Calculate total (local) dipole moment in a temporary common array.
1846 * This makes it possible to sum them over nodes faster.
1848 calc_mu(start, homenr,
1849 x.unpaddedArrayRef(), mdatoms->chargeA, mdatoms->chargeB, mdatoms->nChargePerturbed,
1850 mu, mu+DIM);
1854 if (fr->ePBC != epbcNONE)
1856 /* Compute shift vectors every step,
1857 * because of pressure coupling or box deformation!
1859 if ((flags & GMX_FORCE_DYNAMICBOX) && bStateChanged)
1861 calc_shifts(box, fr->shift_vec);
1864 if (bCalcCGCM)
1866 put_charge_groups_in_box(fplog, cg0, cg1, fr->ePBC, box,
1867 &(top->cgs), as_rvec_array(x.unpaddedArrayRef().data()), fr->cg_cm);
1868 inc_nrnb(nrnb, eNR_CGCM, homenr);
1869 inc_nrnb(nrnb, eNR_RESETX, cg1-cg0);
1871 else if (EI_ENERGY_MINIMIZATION(inputrec->eI) && graph)
1873 unshift_self(graph, box, as_rvec_array(x.unpaddedArrayRef().data()));
1876 else if (bCalcCGCM)
1878 calc_cgcm(fplog, cg0, cg1, &(top->cgs), as_rvec_array(x.unpaddedArrayRef().data()), fr->cg_cm);
1879 inc_nrnb(nrnb, eNR_CGCM, homenr);
1882 if (bCalcCGCM && gmx_debug_at)
1884 pr_rvecs(debug, 0, "cgcm", fr->cg_cm, top->cgs.nr);
1887 #if GMX_MPI
1888 if (!thisRankHasDuty(cr, DUTY_PME))
1890 /* Send particle coordinates to the pme nodes.
1891 * Since this is only implemented for domain decomposition
1892 * and domain decomposition does not use the graph,
1893 * we do not need to worry about shifting.
1895 gmx_pme_send_coordinates(cr, box, as_rvec_array(x.unpaddedArrayRef().data()),
1896 lambda[efptCOUL], lambda[efptVDW],
1897 (flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY)) != 0,
1898 step, wcycle);
1900 #endif /* GMX_MPI */
1902 /* Communicate coordinates and sum dipole if necessary */
1903 if (DOMAINDECOMP(cr))
1905 dd_move_x(cr->dd, box, x.unpaddedArrayRef(), wcycle);
1907 /* No GPU support, no move_x overlap, so reopen the balance region here */
1908 if (ddOpenBalanceRegion == DdOpenBalanceRegionBeforeForceComputation::yes)
1910 ddReopenBalanceRegionCpu(cr->dd);
1914 if (inputrecNeedMutot(inputrec))
1916 if (bStateChanged)
1918 if (PAR(cr))
1920 gmx_sumd(2*DIM, mu, cr);
1921 ddReopenBalanceRegionCpu(cr->dd);
1923 for (i = 0; i < 2; i++)
1925 for (j = 0; j < DIM; j++)
1927 fr->mu_tot[i][j] = mu[i*DIM + j];
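/* Without free-energy perturbation the A-state dipole is used as is;
 * otherwise the total dipole is interpolated linearly in the coulomb
 * lambda between the A- and B-state dipoles below.
 */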
1931 if (fr->efep == efepNO)
1933 copy_rvec(fr->mu_tot[0], mu_tot);
1935 else
1937 for (j = 0; j < DIM; j++)
1939 mu_tot[j] =
1940 (1.0 - lambda[efptCOUL])*fr->mu_tot[0][j] + lambda[efptCOUL]*fr->mu_tot[1][j];
1945 /* Reset energies */
1946 reset_enerdata(enerd);
1947 clear_rvecs(SHIFTS, fr->fshift);
1949 if (bNS)
1951 wallcycle_start(wcycle, ewcNS);
1953 if (graph && bStateChanged)
1955 /* Calculate intramolecular shift vectors to make molecules whole */
1956 mk_mshift(fplog, graph, fr->ePBC, box, as_rvec_array(x.unpaddedArrayRef().data()));
1959 /* Do the actual neighbour searching */
1960 ns(fplog, fr, box,
1961 groups, top, mdatoms,
1962 cr, nrnb, bFillGrid);
1964 wallcycle_stop(wcycle, ewcNS);
1967 if (DOMAINDECOMP(cr) && !thisRankHasDuty(cr, DUTY_PME))
1969 wallcycle_start(wcycle, ewcPPDURINGPME);
1970 dd_force_flop_start(cr->dd, nrnb);
1973 if (inputrec->bRot)
1975 wallcycle_start(wcycle, ewcROT);
1976 do_rotation(cr, enforcedRotation, box, as_rvec_array(x.unpaddedArrayRef().data()), t, step, bNS);
1977 wallcycle_stop(wcycle, ewcROT);
1980 /* Temporary solution until all routines take PaddedRVecVector */
1981 rvec *f = as_rvec_array(force.unpaddedArrayRef().data());
1983 /* Start the force cycle counter.
1984 * Note that a different counter is used for dynamic load balancing.
1986 wallcycle_start(wcycle, ewcFORCE);
1988 gmx::ArrayRef<gmx::RVec> forceRef = force.paddedArrayRef();
1989 if (bDoForces)
1991 /* If we need to compute the virial, we might need a separate
1992 * force buffer for algorithms for which the virial is calculated
1993 * separately, such as PME.
1995 if ((flags & GMX_FORCE_VIRIAL) && fr->haveDirectVirialContributions)
1997 forceRef = *fr->forceBufferForDirectVirialContributions;
1999 clear_rvecs_omp(forceRef.size(), as_rvec_array(forceRef.data()));
2002 /* Clear the short- and long-range forces */
2003 clear_rvecs(fr->natoms_force_constr, f);
2006 /* forceWithVirial might need the full force atom range */
2007 gmx::ForceWithVirial forceWithVirial(forceRef, (flags & GMX_FORCE_VIRIAL) != 0);
2009 if (inputrec->bPull && pull_have_constraint(inputrec->pull_work))
2011 clear_pull_forces(inputrec->pull_work);
2014 /* update QMMMrec, if necessary */
2015 if (fr->bQMMM)
2017 update_QMMMrec(cr, fr, as_rvec_array(x.unpaddedArrayRef().data()), mdatoms, box);
2020 /* Compute the bonded and non-bonded energies and optionally forces */
2021 do_force_lowlevel(fr, inputrec, &(top->idef),
2022 cr, ms, nrnb, wcycle, mdatoms,
2023 as_rvec_array(x.unpaddedArrayRef().data()), hist, f, &forceWithVirial, enerd, fcd,
2024 box, inputrec->fepvals, lambda,
2025 graph, &(top->excls), fr->mu_tot,
2026 flags,
2027 &cycles_pme);
2029 wallcycle_stop(wcycle, ewcFORCE);
2031 if (DOMAINDECOMP(cr))
2033 dd_force_flop_stop(cr->dd, nrnb);
2035 if (ddCloseBalanceRegion == DdCloseBalanceRegionAfterForceComputation::yes)
2037 ddCloseBalanceRegionCpu(cr->dd);
2041 computeSpecialForces(fplog, cr, inputrec, awh, enforcedRotation,
2042 step, t, wcycle,
2043 fr->forceProviders, box, x.unpaddedArrayRef(), mdatoms, lambda,
2044 flags, &forceWithVirial, enerd,
2045 ed, bNS);
2047 if (bDoForces)
2049 /* Communicate the forces */
2050 if (DOMAINDECOMP(cr))
2052 dd_move_f(cr->dd, force.unpaddedArrayRef(), fr->fshift, wcycle);
2053 /* Do we need to communicate the separate force array
2054 * for terms that do not contribute to the single sum virial?
2055 * Position restraints and electric fields do not introduce
2056 * inter-cg forces, only full electrostatics methods do.
2057 * When we do not calculate the virial, fr->f_novirsum = f,
2058 * so we have already communicated these forces.
2060 if (EEL_FULL(fr->ic->eeltype) && cr->dd->n_intercg_excl &&
2061 (flags & GMX_FORCE_VIRIAL))
2063 dd_move_f(cr->dd, forceWithVirial.force_, nullptr, wcycle);
2067 /* If we have NoVirSum forces, but we do not calculate the virial,
2068 * we sum fr->f_novirsum=f later.
2070 if (vsite && !(fr->haveDirectVirialContributions && !(flags & GMX_FORCE_VIRIAL)))
2072 spread_vsite_f(vsite, as_rvec_array(x.unpaddedArrayRef().data()), f, fr->fshift, FALSE, nullptr, nrnb,
2073 &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr, wcycle);
2076 if (flags & GMX_FORCE_VIRIAL)
2078 /* Calculation of the virial must be done after vsites! */
2079 calc_virial(0, mdatoms->homenr, as_rvec_array(x.unpaddedArrayRef().data()), f,
2080 vir_force, graph, box, nrnb, fr, inputrec->ePBC);
2084 if (PAR(cr) && !thisRankHasDuty(cr, DUTY_PME))
2086 /* In case of node-splitting, the PP nodes receive the long-range
2087 * forces, virial and energy from the PME nodes here.
2089 pme_receive_force_ener(cr, &forceWithVirial, enerd, wcycle);
2092 if (bDoForces)
2094 post_process_forces(cr, step, nrnb, wcycle,
2095 top, box, as_rvec_array(x.unpaddedArrayRef().data()), f, &forceWithVirial,
2096 vir_force, mdatoms, graph, fr, vsite,
2097 flags);
2100 if (flags & GMX_FORCE_ENERGY)
2102 /* Sum the potential energy terms from group contributions */
2103 sum_epot(&(enerd->grpp), enerd->term);
2105 if (!EI_TPI(inputrec->eI))
2107 checkPotentialEnergyValidity(step, *enerd, *inputrec);
2113 void do_force(FILE *fplog,
2114 const t_commrec *cr,
2115 const gmx_multisim_t *ms,
2116 const t_inputrec *inputrec,
2117 gmx::Awh *awh,
2118 gmx_enfrot *enforcedRotation,
2119 int64_t step,
2120 t_nrnb *nrnb,
2121 gmx_wallcycle_t wcycle,
2122 gmx_localtop_t *top,
2123 const gmx_groups_t *groups,
2124 matrix box,
2125 gmx::ArrayRefWithPadding<gmx::RVec> x, //NOLINT(performance-unnecessary-value-param)
2126 history_t *hist,
2127 gmx::ArrayRefWithPadding<gmx::RVec> force, //NOLINT(performance-unnecessary-value-param)
2128 tensor vir_force,
2129 const t_mdatoms *mdatoms,
2130 gmx_enerdata_t *enerd,
2131 t_fcdata *fcd,
2132 gmx::ArrayRef<real> lambda,
2133 t_graph *graph,
2134 t_forcerec *fr,
2135 gmx::PpForceWorkload *ppForceWorkload,
2136 const gmx_vsite_t *vsite,
2137 rvec mu_tot,
2138 double t,
2139 gmx_edsam *ed,
2140 int flags,
2141 DdOpenBalanceRegionBeforeForceComputation ddOpenBalanceRegion,
2142 DdCloseBalanceRegionAfterForceComputation ddCloseBalanceRegion)
2144 /* modify force flag if not doing nonbonded */
2145 if (!fr->bNonbonded)
2147 flags &= ~GMX_FORCE_NONBONDED;
2150 switch (inputrec->cutoff_scheme)
2152 case ecutsVERLET:
2153 do_force_cutsVERLET(fplog, cr, ms, inputrec,
2154 awh, enforcedRotation, step, nrnb, wcycle,
2155 top,
2156 groups,
2157 box, x, hist,
2158 force, vir_force,
2159 mdatoms,
2160 enerd, fcd,
2161 lambda.data(), graph,
2163 ppForceWorkload,
2164 fr->ic,
2165 vsite, mu_tot,
2166 t, ed,
2167 flags,
2168 ddOpenBalanceRegion,
2169 ddCloseBalanceRegion);
2170 break;
2171 case ecutsGROUP:
2172 do_force_cutsGROUP(fplog, cr, ms, inputrec,
2173 awh, enforcedRotation, step, nrnb, wcycle,
2174 top,
2175 groups,
2176 box, x, hist,
2177 force, vir_force,
2178 mdatoms,
2179 enerd, fcd,
2180 lambda.data(), graph,
2181 fr, vsite, mu_tot,
2182 t, ed,
2183 flags,
2184 ddOpenBalanceRegion,
2185 ddCloseBalanceRegion);
2186 break;
2187 default:
2188 gmx_incons("Invalid cut-off scheme passed!");
2191 /* In case we don't have constraints and are using GPUs, the next balancing
2192 * region starts here.
2193 * Some "special" work at the end of do_force_cuts?, such as vsite spread,
2194 virial calculation and COM pulling, is thus not included in
2195 * the balance timing, which is ok as most tasks do communication.
2197 if (ddOpenBalanceRegion == DdOpenBalanceRegionBeforeForceComputation::yes)
2199 ddOpenBalanceRegionCpu(cr->dd, DdAllowBalanceRegionReopen::no);
2204 void do_constrain_first(FILE *fplog, gmx::Constraints *constr,
2205 const t_inputrec *ir, const t_mdatoms *md,
2206 t_state *state)
2208 int i, m, start, end;
2209 int64_t step;
2210 real dt = ir->delta_t;
2211 real dvdl_dum;
2212 rvec *savex;
2214 /* We need to allocate one element extra, since we might use
2215 * (unaligned) 4-wide SIMD loads to access rvec entries.
2217 snew(savex, state->natoms + 1);
2219 start = 0;
2220 end = md->homenr;
2222 if (debug)
2224 fprintf(debug, "vcm: start=%d, homenr=%d, end=%d\n",
2225 start, md->homenr, end);
2227 /* Do a first constrain to reset particles... */
2228 step = ir->init_step;
2229 if (fplog)
2231 char buf[STEPSTRSIZE];
2232 fprintf(fplog, "\nConstraining the starting coordinates (step %s)\n",
2233 gmx_step_str(step, buf));
2235 dvdl_dum = 0;
2237 /* constrain the current position */
2238 constr->apply(TRUE, FALSE,
2239 step, 0, 1.0,
2240 state->x.rvec_array(), state->x.rvec_array(), nullptr,
2241 state->box,
2242 state->lambda[efptBONDED], &dvdl_dum,
2243 nullptr, nullptr, gmx::ConstraintVariable::Positions);
2244 if (EI_VV(ir->eI))
2246 /* constrain the initial velocity, and save it */
2247 /* also may be useful if we need the ekin from the halfstep for velocity verlet */
2248 constr->apply(TRUE, FALSE,
2249 step, 0, 1.0,
2250 state->x.rvec_array(), state->v.rvec_array(), state->v.rvec_array(),
2251 state->box,
2252 state->lambda[efptBONDED], &dvdl_dum,
2253 nullptr, nullptr, gmx::ConstraintVariable::Velocities);
2255 /* constrain the initial velocities at t-dt/2 */
2256 if (EI_STATE_VELOCITY(ir->eI) && ir->eI != eiVV)
2258 auto x = makeArrayRef(state->x).subArray(start, end);
2259 auto v = makeArrayRef(state->v).subArray(start, end);
2260 for (i = start; (i < end); i++)
2262 for (m = 0; (m < DIM); m++)
2264 /* Reverse the velocity */
2265 v[i][m] = -v[i][m];
2266 /* Store the position at t-dt in buf */
2267 savex[i][m] = x[i][m] + dt*v[i][m];
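/* With the velocity just reversed, savex now holds x(t0) - dt*v(t0),
 * an estimate of the position one step back in time, which is
 * constrained below using x(t0) as the reference.
 */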
2270 /* Shake the positions at t=-dt with the positions at t=0
2271 * as reference coordinates.
2273 if (fplog)
2275 char buf[STEPSTRSIZE];
2276 fprintf(fplog, "\nConstraining the coordinates at t0-dt (step %s)\n",
2277 gmx_step_str(step, buf));
2279 dvdl_dum = 0;
2280 constr->apply(TRUE, FALSE,
2281 step, -1, 1.0,
2282 state->x.rvec_array(), savex, nullptr,
2283 state->box,
2284 state->lambda[efptBONDED], &dvdl_dum,
2285 state->v.rvec_array(), nullptr, gmx::ConstraintVariable::Positions);
2287 for (i = start; i < end; i++)
2289 for (m = 0; m < DIM; m++)
2291 /* Re-reverse the velocities */
2292 v[i][m] = -v[i][m];
2296 sfree(savex);
2300 static void
2301 integrate_table(const real vdwtab[], real scale, int offstart, int rstart, int rend,
2302 double *enerout, double *virout)
2304 double enersum, virsum;
2305 double invscale, invscale2, invscale3;
2306 double r, ea, eb, ec, pa, pb, pc, pd;
2307 double y0, f, g, h;
2308 int ri, offset;
2309 double tabfactor;
2311 invscale = 1.0/scale;
2312 invscale2 = invscale*invscale;
2313 invscale3 = invscale*invscale2;
2315 /* Following summation derived from cubic spline definition,
2316 * Numerical Recipes in C, second edition, p. 113-116. Exact for
2317 * the cubic spline. We first calculate the negative of the
2318 * energy from rvdw to rvdw_switch, assuming that g(r)=1, and then
2319 * add the more standard, abrupt cutoff correction to that result,
2320 * yielding the long-range correction for a switched function. We
2321 * perform both the pressure and energy loops at the same time for
2322 * simplicity, as the computational cost is low. */
2324 if (offstart == 0)
2326 /* Since the dispersion table has been scaled down a factor
2327 * 6.0 and the repulsion a factor 12.0 to compensate for the
2328 * c6/c12 parameters inside nbfp[] being scaled up (to save
2329 * flops in kernels), we need to correct for this.
2331 tabfactor = 6.0;
2333 else
2335 tabfactor = 12.0;
2338 enersum = 0.0;
2339 virsum = 0.0;
2340 for (ri = rstart; ri < rend; ++ri)
2342 r = ri*invscale;
2343 ea = invscale3;
2344 eb = 2.0*invscale2*r;
2345 ec = invscale*r*r;
2347 pa = invscale3;
2348 pb = 3.0*invscale2*r;
2349 pc = 3.0*invscale*r*r;
2350 pd = r*r*r;
2352 /* this "8" is from the packing in the vdwtab array - perhaps
2353 should be defined? */
2355 offset = 8*ri + offstart;
2356 y0 = vdwtab[offset];
2357 f = vdwtab[offset+1];
2358 g = vdwtab[offset+2];
2359 h = vdwtab[offset+3];
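/* Each table point packs 8 reals: the cubic-spline coefficients
 * (y0, f, g, h) of the dispersion sub-table at offsets 0-3 and of the
 * repulsion sub-table at offsets 4-7; offstart selects which quadruplet
 * is integrated here, matching the tabfactor chosen above.
 */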
2361 enersum += y0*(ea/3 + eb/2 + ec) + f*(ea/4 + eb/3 + ec/2) + g*(ea/5 + eb/4 + ec/3) + h*(ea/6 + eb/5 + ec/4);
2362 virsum += f*(pa/4 + pb/3 + pc/2 + pd) + 2*g*(pa/5 + pb/4 + pc/3 + pd/2) + 3*h*(pa/6 + pb/5 + pc/4 + pd/3);
2364 *enerout = 4.0*M_PI*enersum*tabfactor;
2365 *virout = 4.0*M_PI*virsum*tabfactor;
2368 void calc_enervirdiff(FILE *fplog, int eDispCorr, t_forcerec *fr)
2370 double eners[2], virs[2], enersum, virsum;
2371 double r0, rc3, rc9;
2372 int ri0, ri1, i;
2373 real scale, *vdwtab;
2375 fr->enershiftsix = 0;
2376 fr->enershifttwelve = 0;
2377 fr->enerdiffsix = 0;
2378 fr->enerdifftwelve = 0;
2379 fr->virdiffsix = 0;
2380 fr->virdifftwelve = 0;
2382 const interaction_const_t *ic = fr->ic;
2384 if (eDispCorr != edispcNO)
2386 for (i = 0; i < 2; i++)
2388 eners[i] = 0;
2389 virs[i] = 0;
2391 if ((ic->vdw_modifier == eintmodPOTSHIFT) ||
2392 (ic->vdw_modifier == eintmodPOTSWITCH) ||
2393 (ic->vdw_modifier == eintmodFORCESWITCH) ||
2394 (ic->vdwtype == evdwSHIFT) ||
2395 (ic->vdwtype == evdwSWITCH))
2397 if (((ic->vdw_modifier == eintmodPOTSWITCH) ||
2398 (ic->vdw_modifier == eintmodFORCESWITCH) ||
2399 (ic->vdwtype == evdwSWITCH)) && ic->rvdw_switch == 0)
2401 gmx_fatal(FARGS,
2402 "With dispersion correction rvdw-switch can not be zero "
2403 "for vdw-type = %s", evdw_names[ic->vdwtype]);
2406 /* TODO This code depends on the logic in tables.c that
2407 constructs the table layout, which should be made
2408 explicit in future cleanup. */
2409 GMX_ASSERT(fr->dispersionCorrectionTable->interaction == GMX_TABLE_INTERACTION_VDWREP_VDWDISP,
2410 "Dispersion-correction code needs a table with both repulsion and dispersion terms");
2411 scale = fr->dispersionCorrectionTable->scale;
2412 vdwtab = fr->dispersionCorrectionTable->data;
2414 /* Round the cut-offs to exact table values for precision */
2415 ri0 = static_cast<int>(std::floor(ic->rvdw_switch*scale));
2416 ri1 = static_cast<int>(std::ceil(ic->rvdw*scale));
2418 /* The code below has some support for handling force-switching, i.e.
2419 * when the force (instead of potential) is switched over a limited
2420 * region. This leads to a constant shift in the potential inside the
2421 * switching region, which we can handle by adding a constant energy
2422 * term in the force-switch case just like when we do potential-shift.
2424 * For now this is not enabled, but to keep the functionality in the
2425 * code we check separately for switch and shift. When we do force-switch
2426 * the shifting point is rvdw_switch, while it is the cutoff when we
2427 * have a classical potential-shift.
2429 * For a pure potential-shift the potential has a constant shift
2430 * all the way out to the cutoff, and that is it. For other forms
2431 * we need to calculate the constant shift up to the point where we
2432 * start modifying the potential.
2434 ri0 = (ic->vdw_modifier == eintmodPOTSHIFT) ? ri1 : ri0;
2436 r0 = ri0/scale;
2437 rc3 = r0*r0*r0;
2438 rc9 = rc3*rc3*rc3;
2440 if ((ic->vdw_modifier == eintmodFORCESWITCH) ||
2441 (ic->vdwtype == evdwSHIFT))
2443 /* Determine the constant energy shift below rvdw_switch.
2444 * Table has a scale factor since we have scaled it down to compensate
2445 * for scaling-up c6/c12 with the derivative factors to save flops in analytical kernels.
2447 fr->enershiftsix = static_cast<real>(-1.0/(rc3*rc3)) - 6.0*vdwtab[8*ri0];
2448 fr->enershifttwelve = static_cast<real>( 1.0/(rc9*rc3)) - 12.0*vdwtab[8*ri0 + 4];
2450 else if (ic->vdw_modifier == eintmodPOTSHIFT)
2452 fr->enershiftsix = static_cast<real>(-1.0/(rc3*rc3));
2453 fr->enershifttwelve = static_cast<real>( 1.0/(rc9*rc3));
2456 /* Add the constant part from 0 to rvdw_switch.
2457 * This integration from 0 to rvdw_switch overcounts the number
2458 * of interactions by 1, as it also counts the self interaction.
2459 * We will correct for this later.
2461 eners[0] += 4.0*M_PI*fr->enershiftsix*rc3/3.0;
2462 eners[1] += 4.0*M_PI*fr->enershifttwelve*rc3/3.0;
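/* The constant shift integrated over a sphere of radius r0 contributes
 * shift * (4 pi/3) * r0^3, which is exactly what is added above for the
 * sixth- and twelfth-power terms.
 */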
2464 /* Calculate the contribution in the range [r0,r1] where we
2465 * modify the potential. For a pure potential-shift modifier we will
2466 * have ri0==ri1, and there will not be any contribution here.
2468 for (i = 0; i < 2; i++)
2470 enersum = 0;
2471 virsum = 0;
2472 integrate_table(vdwtab, scale, (i == 0 ? 0 : 4), ri0, ri1, &enersum, &virsum);
2473 eners[i] -= enersum;
2474 virs[i] -= virsum;
2477 /* Alright: Above we compensated by REMOVING the parts outside r0
2478 * corresponding to the ideal VdW 1/r6 and 1/r12 potentials.
2480 * Regardless of whether r0 is the point where we start switching,
2481 * or the cutoff where we calculated the constant shift, we include
2482 * all the parts we are missing out to infinity from r0 by
2483 * calculating the analytical dispersion correction.
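/* For g(r) = 1 these tails are the analytic integrals
 * int_{r0}^{inf} 4 pi r^2 (-1/r^6)  dr = -4 pi/(3 r0^3)  and
 * int_{r0}^{inf} 4 pi r^2 ( 1/r^12) dr =  4 pi/(9 r0^9),
 * which give the constants used just below.
 */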
2485 eners[0] += -4.0*M_PI/(3.0*rc3);
2486 eners[1] += 4.0*M_PI/(9.0*rc9);
2487 virs[0] += 8.0*M_PI/rc3;
2488 virs[1] += -16.0*M_PI/(3.0*rc9);
2490 else if (ic->vdwtype == evdwCUT ||
2491 EVDW_PME(ic->vdwtype) ||
2492 ic->vdwtype == evdwUSER)
2494 if (ic->vdwtype == evdwUSER && fplog)
2496 fprintf(fplog,
2497 "WARNING: using dispersion correction with user tables\n");
2500 /* Note that with LJ-PME, the dispersion correction is multiplied
2501 * by the difference between the actual C6 and the value of C6
2502 * that would produce the combination rule.
2503 * This means the normal energy and virial difference formulas
2504 * can be used here.
2507 rc3 = ic->rvdw*ic->rvdw*ic->rvdw;
2508 rc9 = rc3*rc3*rc3;
2509 /* Contribution beyond the cut-off */
2510 eners[0] += -4.0*M_PI/(3.0*rc3);
2511 eners[1] += 4.0*M_PI/(9.0*rc9);
2512 if (ic->vdw_modifier == eintmodPOTSHIFT)
2514 /* Contribution within the cut-off */
2515 eners[0] += -4.0*M_PI/(3.0*rc3);
2516 eners[1] += 4.0*M_PI/(3.0*rc9);
2518 /* Contribution beyond the cut-off */
2519 virs[0] += 8.0*M_PI/rc3;
2520 virs[1] += -16.0*M_PI/(3.0*rc9);
2522 else
2524 gmx_fatal(FARGS,
2525 "Dispersion correction is not implemented for vdw-type = %s",
2526 evdw_names[ic->vdwtype]);
2529 /* When we deprecate the group kernels the code below can go too */
2530 if (ic->vdwtype == evdwPME && fr->cutoff_scheme == ecutsGROUP)
2532 /* Calculate self-interaction coefficient (assuming that
2533 * the reciprocal-space contribution is constant in the
2534 * region that contributes to the self-interaction).
2536 fr->enershiftsix = gmx::power6(ic->ewaldcoeff_lj) / 6.0;
2538 eners[0] += -gmx::power3(std::sqrt(M_PI)*ic->ewaldcoeff_lj)/3.0;
2539 virs[0] += gmx::power3(std::sqrt(M_PI)*ic->ewaldcoeff_lj);
2542 fr->enerdiffsix = eners[0];
2543 fr->enerdifftwelve = eners[1];
2544 /* The 0.5 is due to the Gromacs definition of the virial */
2545 fr->virdiffsix = 0.5*virs[0];
2546 fr->virdifftwelve = 0.5*virs[1];
2550 void calc_dispcorr(const t_inputrec *ir, const t_forcerec *fr,
2551 const matrix box, real lambda, tensor pres, tensor virial,
2552 real *prescorr, real *enercorr, real *dvdlcorr)
2554 gmx_bool bCorrAll, bCorrPres;
2555 real dvdlambda, invvol, dens, ninter, avcsix, avctwelve, enerdiff, svir = 0, spres = 0;
2556 int m;
2558 *prescorr = 0;
2559 *enercorr = 0;
2560 *dvdlcorr = 0;
2562 clear_mat(virial);
2563 clear_mat(pres);
2565 if (ir->eDispCorr != edispcNO)
2567 bCorrAll = (ir->eDispCorr == edispcAllEner ||
2568 ir->eDispCorr == edispcAllEnerPres);
2569 bCorrPres = (ir->eDispCorr == edispcEnerPres ||
2570 ir->eDispCorr == edispcAllEnerPres);
2572 invvol = 1/det(box);
2573 if (fr->n_tpi)
2575 /* Only correct for the interactions with the inserted molecule */
2576 dens = (fr->numAtomsForDispersionCorrection - fr->n_tpi)*invvol;
2577 ninter = fr->n_tpi;
2579 else
2581 dens = fr->numAtomsForDispersionCorrection*invvol;
2582 ninter = 0.5*fr->numAtomsForDispersionCorrection;
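/* ninter counts each interacting pair once, hence the factor 0.5; with
 * test-particle insertion only the interactions of the inserted molecule
 * with the rest of the system are corrected for.
 */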
2585 if (ir->efep == efepNO)
2587 avcsix = fr->avcsix[0];
2588 avctwelve = fr->avctwelve[0];
2590 else
2592 avcsix = (1 - lambda)*fr->avcsix[0] + lambda*fr->avcsix[1];
2593 avctwelve = (1 - lambda)*fr->avctwelve[0] + lambda*fr->avctwelve[1];
2596 enerdiff = ninter*(dens*fr->enerdiffsix - fr->enershiftsix);
2597 *enercorr += avcsix*enerdiff;
2598 dvdlambda = 0.0;
2599 if (ir->efep != efepNO)
2601 dvdlambda += (fr->avcsix[1] - fr->avcsix[0])*enerdiff;
2603 if (bCorrAll)
2605 enerdiff = ninter*(dens*fr->enerdifftwelve - fr->enershifttwelve);
2606 *enercorr += avctwelve*enerdiff;
2607 if (fr->efep != efepNO)
2609 dvdlambda += (fr->avctwelve[1] - fr->avctwelve[0])*enerdiff;
2613 if (bCorrPres)
2615 svir = ninter*dens*avcsix*fr->virdiffsix/3.0;
2616 if (ir->eDispCorr == edispcAllEnerPres)
2618 svir += ninter*dens*avctwelve*fr->virdifftwelve/3.0;
2620 /* The factor 2 is because of the Gromacs virial definition */
2621 spres = -2.0*invvol*svir*PRESFAC;
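/* The virial correction is converted to a pressure correction here;
 * PRESFAC converts from internal units to bar.
 */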
2623 for (m = 0; m < DIM; m++)
2625 virial[m][m] += svir;
2626 pres[m][m] += spres;
2628 *prescorr += spres;
2631 /* Can't currently control when it prints; for now, just print when debugging */
2632 if (debug)
2634 if (bCorrAll)
2636 fprintf(debug, "Long Range LJ corr.: <C6> %10.4e, <C12> %10.4e\n",
2637 avcsix, avctwelve);
2639 if (bCorrPres)
2641 fprintf(debug,
2642 "Long Range LJ corr.: Epot %10g, Pres: %10g, Vir: %10g\n",
2643 *enercorr, spres, svir);
2645 else
2647 fprintf(debug, "Long Range LJ corr.: Epot %10g\n", *enercorr);
2651 if (fr->efep != efepNO)
2653 *dvdlcorr += dvdlambda;
2658 static void low_do_pbc_mtop(FILE *fplog, int ePBC, const matrix box,
2659 const gmx_mtop_t *mtop, rvec x[],
2660 gmx_bool bFirst)
2662 t_graph *graph;
2663 int as, mol;
2665 if (bFirst && fplog)
2667 fprintf(fplog, "Removing pbc first time\n");
2670 snew(graph, 1);
2671 as = 0;
2672 for (const gmx_molblock_t &molb : mtop->molblock)
2674 const gmx_moltype_t &moltype = mtop->moltype[molb.type];
2675 if (moltype.atoms.nr == 1 ||
2676 (!bFirst && moltype.cgs.nr == 1))
2678 /* Just one atom or charge group in the molecule, no PBC required */
2679 as += molb.nmol*moltype.atoms.nr;
2681 else
2683 mk_graph_moltype(moltype, graph);
2685 for (mol = 0; mol < molb.nmol; mol++)
2687 mk_mshift(fplog, graph, ePBC, box, x+as);
2689 shift_self(graph, box, x+as);
2690 /* The molecule is whole now.
2691 * We don't need the second mk_mshift call as in do_pbc_first,
2692 * since we no longer need this graph.
2695 as += moltype.atoms.nr;
2697 done_graph(graph);
2700 sfree(graph);
2703 void do_pbc_first_mtop(FILE *fplog, int ePBC, const matrix box,
2704 const gmx_mtop_t *mtop, rvec x[])
2706 low_do_pbc_mtop(fplog, ePBC, box, mtop, x, TRUE);
2709 void do_pbc_mtop(FILE *fplog, int ePBC, const matrix box,
2710 const gmx_mtop_t *mtop, rvec x[])
2712 low_do_pbc_mtop(fplog, ePBC, box, mtop, x, FALSE);
2715 void put_atoms_in_box_omp(int ePBC, const matrix box, gmx::ArrayRef<gmx::RVec> x)
2717 int t, nth;
2718 nth = gmx_omp_nthreads_get(emntDefault);
2720 #pragma omp parallel for num_threads(nth) schedule(static)
2721 for (t = 0; t < nth; t++)
2725 size_t natoms = x.size();
2726 size_t offset = (natoms*t )/nth;
2727 size_t len = (natoms*(t + 1))/nth - offset;
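/* Static partition of the atom range into nth contiguous chunks whose
 * sizes differ by at most one atom, one chunk per OpenMP thread.
 */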
2728 put_atoms_in_box(ePBC, box, x.subArray(offset, len));
2730 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
2734 // TODO This can be cleaned up a lot, and move back to runner.cpp
2735 void finish_run(FILE *fplog, const gmx::MDLogger &mdlog, const t_commrec *cr,
2736 const t_inputrec *inputrec,
2737 t_nrnb nrnb[], gmx_wallcycle_t wcycle,
2738 gmx_walltime_accounting_t walltime_accounting,
2739 nonbonded_verlet_t *nbv,
2740 const gmx_pme_t *pme,
2741 gmx_bool bWriteStat)
2743 t_nrnb *nrnb_tot = nullptr;
2744 double delta_t = 0;
2745 double nbfs = 0, mflop = 0;
2746 double elapsed_time,
2747 elapsed_time_over_all_ranks,
2748 elapsed_time_over_all_threads,
2749 elapsed_time_over_all_threads_over_all_ranks;
2750 /* Control whether it is valid to print a report. Only the
2751 simulation master may print, but it should not do so if the run
2752 terminated e.g. before a scheduled reset step. This is
2753 complicated by the fact that PME ranks are unaware of the
2754 reason why they were sent a pmerecvqxFINISH. To avoid
2755 communication deadlocks, we always do the communication for the
2756 report, even if we've decided not to write the report, because
2757 how long it takes to finish the run is not important when we've
2758 decided not to report on the simulation performance.
2760 Further, we only report performance for dynamical integrators,
2761 because those are the only ones for which we plan to
2762 consider doing any optimizations. */
2763 bool printReport = EI_DYNAMICS(inputrec->eI) && SIMMASTER(cr);
2765 if (printReport && !walltime_accounting_get_valid_finish(walltime_accounting))
2767 GMX_LOG(mdlog.warning).asParagraph().appendText("Simulation ended prematurely, no performance report will be written.");
2768 printReport = false;
2771 if (cr->nnodes > 1)
2773 snew(nrnb_tot, 1);
2774 #if GMX_MPI
2775 MPI_Allreduce(nrnb->n, nrnb_tot->n, eNRNB, MPI_DOUBLE, MPI_SUM,
2776 cr->mpi_comm_mysim);
2777 #endif
2779 else
2781 nrnb_tot = nrnb;
2784 elapsed_time = walltime_accounting_get_time_since_reset(walltime_accounting);
2785 elapsed_time_over_all_threads = walltime_accounting_get_time_since_reset_over_all_threads(walltime_accounting);
2786 if (cr->nnodes > 1)
2788 #if GMX_MPI
2789 /* reduce elapsed_time over all MPI ranks in the current simulation */
2790 MPI_Allreduce(&elapsed_time,
2791 &elapsed_time_over_all_ranks,
2792 1, MPI_DOUBLE, MPI_SUM,
2793 cr->mpi_comm_mysim);
2794 elapsed_time_over_all_ranks /= cr->nnodes;
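/* The MPI_Allreduce above sums over all ranks, so dividing by the number
 * of ranks yields the average elapsed time per rank.
 */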
2795 /* Reduce elapsed_time_over_all_threads over all MPI ranks in the
2796 * current simulation. */
2797 MPI_Allreduce(&elapsed_time_over_all_threads,
2798 &elapsed_time_over_all_threads_over_all_ranks,
2799 1, MPI_DOUBLE, MPI_SUM,
2800 cr->mpi_comm_mysim);
2801 #endif
2803 else
2805 elapsed_time_over_all_ranks = elapsed_time;
2806 elapsed_time_over_all_threads_over_all_ranks = elapsed_time_over_all_threads;
2809 if (printReport)
2811 print_flop(fplog, nrnb_tot, &nbfs, &mflop);
2813 if (cr->nnodes > 1)
2815 sfree(nrnb_tot);
2818 if (thisRankHasDuty(cr, DUTY_PP) && DOMAINDECOMP(cr))
2820 print_dd_statistics(cr, inputrec, fplog);
2823 /* TODO Move the responsibility for any scaling by thread counts
2824 * to the code that handled the thread region, so that there's a
2825 * mechanism to keep cycle counting working during the transition
2826 * to task parallelism. */
2827 int nthreads_pp = gmx_omp_nthreads_get(emntNonbonded);
2828 int nthreads_pme = gmx_omp_nthreads_get(emntPME);
2829 wallcycle_scale_by_num_threads(wcycle, thisRankHasDuty(cr, DUTY_PME) && !thisRankHasDuty(cr, DUTY_PP), nthreads_pp, nthreads_pme);
2830 auto cycle_sum(wallcycle_sum(cr, wcycle));
2832 if (printReport)
2834 auto nbnxn_gpu_timings = use_GPU(nbv) ? nbnxn_gpu_get_timings(nbv->gpu_nbv) : nullptr;
2835 gmx_wallclock_gpu_pme_t pme_gpu_timings = {};
2836 if (pme_gpu_task_enabled(pme))
2838 pme_gpu_get_timings(pme, &pme_gpu_timings);
2840 wallcycle_print(fplog, mdlog, cr->nnodes, cr->npmenodes, nthreads_pp, nthreads_pme,
2841 elapsed_time_over_all_ranks,
2842 wcycle, cycle_sum,
2843 nbnxn_gpu_timings,
2844 &pme_gpu_timings);
2846 if (EI_DYNAMICS(inputrec->eI))
2848 delta_t = inputrec->delta_t;
2851 if (fplog)
2853 print_perf(fplog, elapsed_time_over_all_threads_over_all_ranks,
2854 elapsed_time_over_all_ranks,
2855 walltime_accounting_get_nsteps_done_since_reset(walltime_accounting),
2856 delta_t, nbfs, mflop);
2858 if (bWriteStat)
2860 print_perf(stderr, elapsed_time_over_all_threads_over_all_ranks,
2861 elapsed_time_over_all_ranks,
2862 walltime_accounting_get_nsteps_done_since_reset(walltime_accounting),
2863 delta_t, nbfs, mflop);
2868 extern void initialize_lambdas(FILE *fplog, t_inputrec *ir, int *fep_state, gmx::ArrayRef<real> lambda, double *lam0)
2870 /* this function works, but could probably use a logic rewrite to keep all the different
2871 types of efep straight. */
2873 if ((ir->efep == efepNO) && (!ir->bSimTemp))
2875 return;
2878 t_lambda *fep = ir->fepvals;
2879 *fep_state = fep->init_fep_state; /* this might overwrite the checkpoint
2880 if checkpoint is set -- a kludge is in for now
2881 to prevent this.*/
2883 for (int i = 0; i < efptNR; i++)
2885 /* overwrite lambda state with init_lambda for now for backwards compatibility */
2886 if (fep->init_lambda >= 0) /* if it's -1, it was never initialized */
2888 lambda[i] = fep->init_lambda;
2889 if (lam0)
2891 lam0[i] = lambda[i];
2894 else
2896 lambda[i] = fep->all_lambda[i][*fep_state];
2897 if (lam0)
2899 lam0[i] = lambda[i];
2903 if (ir->bSimTemp)
2905 /* need to rescale control temperatures to match current state */
2906 for (int i = 0; i < ir->opts.ngtc; i++)
2908 if (ir->opts.ref_t[i] > 0)
2910 ir->opts.ref_t[i] = ir->simtempvals->temperatures[*fep_state];
2915 /* Send to the log the information on the current lambdas */
2916 if (fplog != nullptr)
2918 fprintf(fplog, "Initial vector of lambda components:[ ");
2919 for (const auto &l : lambda)
2921 fprintf(fplog, "%10.4f ", l);
2923 fprintf(fplog, "]\n");
2928 void init_md(FILE *fplog,
2929 const t_commrec *cr, gmx::IMDOutputProvider *outputProvider,
2930 t_inputrec *ir, const gmx_output_env_t *oenv,
2931 const MdrunOptions &mdrunOptions,
2932 double *t, double *t0,
2933 t_state *globalState, double *lam0,
2934 t_nrnb *nrnb, gmx_mtop_t *mtop,
2935 gmx_update_t **upd,
2936 gmx::BoxDeformation *deform,
2937 int nfile, const t_filenm fnm[],
2938 gmx_mdoutf_t *outf, t_mdebin **mdebin,
2939 tensor force_vir, tensor shake_vir,
2940 tensor total_vir, tensor pres, rvec mu_tot,
2941 gmx_bool *bSimAnn, t_vcm **vcm,
2942 gmx_wallcycle_t wcycle)
2944 int i;
2946 /* Initial values */
2947 *t = *t0 = ir->init_t;
2949 *bSimAnn = FALSE;
2950 for (i = 0; i < ir->opts.ngtc; i++)
2952 /* set bSimAnn if any group is being annealed */
2953 if (ir->opts.annealing[i] != eannNO)
2955 *bSimAnn = TRUE;
2959 /* Initialize lambda variables */
2960 /* TODO: Clean up initialization of fep_state and lambda in t_state.
2961 * We currently need to call initialize_lambdas on non-master ranks
2962 * to initialize lam0.
2964 if (MASTER(cr))
2966 initialize_lambdas(fplog, ir, &globalState->fep_state, globalState->lambda, lam0);
2968 else
2970 int tmpFepState;
2971 std::array<real, efptNR> tmpLambda;
2972 initialize_lambdas(fplog, ir, &tmpFepState, tmpLambda, lam0);
2975 // TODO upd is never NULL in practice, but the analysers don't know that
2976 if (upd)
2978 *upd = init_update(ir, deform);
2980 if (*bSimAnn)
2982 update_annealing_target_temp(ir, ir->init_t, upd ? *upd : nullptr);
2985 if (vcm != nullptr)
2987 *vcm = init_vcm(fplog, &mtop->groups, ir);
2990 if (EI_DYNAMICS(ir->eI) && !mdrunOptions.continuationOptions.appendFiles)
2992 if (ir->etc == etcBERENDSEN)
2994 please_cite(fplog, "Berendsen84a");
2996 if (ir->etc == etcVRESCALE)
2998 please_cite(fplog, "Bussi2007a");
3000 if (ir->eI == eiSD1)
3002 please_cite(fplog, "Goga2012");
3005 init_nrnb(nrnb);
3007 if (nfile != -1)
3009 *outf = init_mdoutf(fplog, nfile, fnm, mdrunOptions, cr, outputProvider, ir, mtop, oenv, wcycle);
3011 *mdebin = init_mdebin(mdrunOptions.continuationOptions.appendFiles ? nullptr : mdoutf_get_fp_ene(*outf),
3012 mtop, ir, mdoutf_get_fp_dhdl(*outf));
3015 /* Initiate variables */
3016 clear_mat(force_vir);
3017 clear_mat(shake_vir);
3018 clear_rvec(mu_tot);
3019 clear_mat(total_vir);
3020 clear_mat(pres);
3023 void init_rerun(FILE *fplog,
3024 const t_commrec *cr, gmx::IMDOutputProvider *outputProvider,
3025 t_inputrec *ir, const gmx_output_env_t *oenv,
3026 const MdrunOptions &mdrunOptions,
3027 t_state *globalState, double *lam0,
3028 t_nrnb *nrnb, gmx_mtop_t *mtop,
3029 int nfile, const t_filenm fnm[],
3030 gmx_mdoutf_t *outf, t_mdebin **mdebin,
3031 gmx_wallcycle_t wcycle)
3033 /* Initialize lambda variables */
3034 /* TODO: Clean up initialization of fep_state and lambda in t_state.
3035 * We currently need to call initialize_lambdas on non-master ranks
3036 * to initialize lam0.
3038 if (MASTER(cr))
3040 initialize_lambdas(fplog, ir, &globalState->fep_state, globalState->lambda, lam0);
3042 else
3044 int tmpFepState;
3045 std::array<real, efptNR> tmpLambda;
3046 initialize_lambdas(fplog, ir, &tmpFepState, tmpLambda, lam0);
3049 init_nrnb(nrnb);
3051 if (nfile != -1)
3053 *outf = init_mdoutf(fplog, nfile, fnm, mdrunOptions, cr, outputProvider, ir, mtop, oenv, wcycle);
3054 *mdebin = init_mdebin(mdrunOptions.continuationOptions.appendFiles ? nullptr : mdoutf_get_fp_ene(*outf),
3055 mtop, ir, mdoutf_get_fp_dhdl(*outf), true);