Update Awh initialization and lifetime management
[gromacs.git] / src / gromacs / mdlib / sim_util.cpp
blob 025d5339eca4b178d699953e61c1cf89da901017
1 /*
2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5 * Copyright (c) 2001-2004, The GROMACS development team.
6 * Copyright (c) 2013,2014,2015,2016,2017,2018, by the GROMACS development team, led by
7 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
8 * and including many others, as listed in the AUTHORS file in the
9 * top-level source directory and at http://www.gromacs.org.
11 * GROMACS is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public License
13 * as published by the Free Software Foundation; either version 2.1
14 * of the License, or (at your option) any later version.
16 * GROMACS is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * Lesser General Public License for more details.
21 * You should have received a copy of the GNU Lesser General Public
22 * License along with GROMACS; if not, see
23 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
24 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
26 * If you want to redistribute modifications to GROMACS, please
27 * consider that scientific software is very special. Version
28 * control is crucial - bugs must be traceable. We will be happy to
29 * consider code for inclusion in the official distribution, but
30 * derived work must not be called official GROMACS. Details are found
31 * in the README & COPYING files - if they are missing, get the
32 * official version at http://www.gromacs.org.
34 * To help us fund GROMACS development, we humbly ask that you cite
35 * the research papers on the package. Check out http://www.gromacs.org.
37 #include "gmxpre.h"
39 #include "sim_util.h"
41 #include "config.h"
43 #include <stdio.h>
44 #include <string.h>
46 #include <cmath>
47 #include <cstdint>
49 #include <array>
51 #include "gromacs/awh/awh.h"
52 #include "gromacs/domdec/dlbtiming.h"
53 #include "gromacs/domdec/domdec.h"
54 #include "gromacs/domdec/domdec_struct.h"
55 #include "gromacs/essentialdynamics/edsam.h"
56 #include "gromacs/ewald/pme.h"
57 #include "gromacs/gmxlib/chargegroup.h"
58 #include "gromacs/gmxlib/network.h"
59 #include "gromacs/gmxlib/nrnb.h"
60 #include "gromacs/gmxlib/nonbonded/nb_free_energy.h"
61 #include "gromacs/gmxlib/nonbonded/nb_kernel.h"
62 #include "gromacs/gmxlib/nonbonded/nonbonded.h"
63 #include "gromacs/gpu_utils/gpu_utils.h"
64 #include "gromacs/imd/imd.h"
65 #include "gromacs/listed-forces/bonded.h"
66 #include "gromacs/listed-forces/disre.h"
67 #include "gromacs/listed-forces/orires.h"
68 #include "gromacs/math/functions.h"
69 #include "gromacs/math/units.h"
70 #include "gromacs/math/vec.h"
71 #include "gromacs/math/vecdump.h"
72 #include "gromacs/mdlib/calcmu.h"
73 #include "gromacs/mdlib/calcvir.h"
74 #include "gromacs/mdlib/constr.h"
75 #include "gromacs/mdlib/force.h"
76 #include "gromacs/mdlib/forcerec.h"
77 #include "gromacs/mdlib/gmx_omp_nthreads.h"
78 #include "gromacs/mdlib/mdrun.h"
79 #include "gromacs/mdlib/nb_verlet.h"
80 #include "gromacs/mdlib/nbnxn_atomdata.h"
81 #include "gromacs/mdlib/nbnxn_gpu_data_mgmt.h"
82 #include "gromacs/mdlib/nbnxn_grid.h"
83 #include "gromacs/mdlib/nbnxn_search.h"
84 #include "gromacs/mdlib/qmmm.h"
85 #include "gromacs/mdlib/update.h"
86 #include "gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_gpu_ref.h"
87 #include "gromacs/mdtypes/commrec.h"
88 #include "gromacs/mdtypes/forceoutput.h"
89 #include "gromacs/mdtypes/iforceprovider.h"
90 #include "gromacs/mdtypes/inputrec.h"
91 #include "gromacs/mdtypes/md_enums.h"
92 #include "gromacs/mdtypes/state.h"
93 #include "gromacs/pbcutil/ishift.h"
94 #include "gromacs/pbcutil/mshift.h"
95 #include "gromacs/pbcutil/pbc.h"
96 #include "gromacs/pulling/pull.h"
97 #include "gromacs/pulling/pull_rotation.h"
98 #include "gromacs/timing/cyclecounter.h"
99 #include "gromacs/timing/gpu_timing.h"
100 #include "gromacs/timing/wallcycle.h"
101 #include "gromacs/timing/wallcyclereporting.h"
102 #include "gromacs/timing/walltime_accounting.h"
103 #include "gromacs/topology/topology.h"
104 #include "gromacs/utility/arrayref.h"
105 #include "gromacs/utility/basedefinitions.h"
106 #include "gromacs/utility/cstringutil.h"
107 #include "gromacs/utility/exceptions.h"
108 #include "gromacs/utility/fatalerror.h"
109 #include "gromacs/utility/gmxassert.h"
110 #include "gromacs/utility/gmxmpi.h"
111 #include "gromacs/utility/logger.h"
112 #include "gromacs/utility/pleasecite.h"
113 #include "gromacs/utility/smalloc.h"
114 #include "gromacs/utility/sysinfo.h"
116 #include "nbnxn_gpu.h"
117 #include "nbnxn_kernels/nbnxn_kernel_cpu.h"
118 #include "nbnxn_kernels/nbnxn_kernel_prune.h"
120 // TODO: this environment variable allows us to verify before release
121 // that on less common architectures the total cost of polling is not larger than
122 // a blocking wait (so polling does not introduce overhead when the static
123 // PME-first ordering would suffice).
124 static const bool c_disableAlternatingWait = (getenv("GMX_DISABLE_ALTERNATING_GPU_WAIT") != nullptr);
127 void print_time(FILE *out,
128 gmx_walltime_accounting_t walltime_accounting,
129 gmx_int64_t step,
130 t_inputrec *ir,
131 const t_commrec *cr)
133 time_t finish;
134 char timebuf[STRLEN];
135 double dt, elapsed_seconds, time_per_step;
136 char buf[48];
138 #if !GMX_THREAD_MPI
139 if (!PAR(cr))
140 #endif
142 fprintf(out, "\r");
144 fprintf(out, "step %s", gmx_step_str(step, buf));
145 fflush(out);
147 if ((step >= ir->nstlist))
149 double seconds_since_epoch = gmx_gettime();
150 elapsed_seconds = seconds_since_epoch - walltime_accounting_get_start_time_stamp(walltime_accounting);
151 time_per_step = elapsed_seconds/(step - ir->init_step + 1);
152 dt = (ir->nsteps + ir->init_step - step) * time_per_step;
154 if (ir->nsteps >= 0)
156 if (dt >= 300)
158 finish = (time_t) (seconds_since_epoch + dt);
159 gmx_ctime_r(&finish, timebuf, STRLEN);
160 sprintf(buf, "%s", timebuf);
161 buf[strlen(buf)-1] = '\0';
162 fprintf(out, ", will finish %s", buf);
164 else
166 fprintf(out, ", remaining wall clock time: %5d s ", (int)dt);
169 else
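/* delta_t is in ps: delta_t/1000 is ns per step, and multiplying by 24*60*60 s/day and dividing by the wall-clock seconds per step gives ns/day */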
171 fprintf(out, " performance: %.1f ns/day ",
172 ir->delta_t/1000*24*60*60/time_per_step);
175 #if !GMX_THREAD_MPI
176 if (PAR(cr))
178 fprintf(out, "\n");
180 #else
181 GMX_UNUSED_VALUE(cr);
182 #endif
184 fflush(out);
187 void print_date_and_time(FILE *fplog, int nodeid, const char *title,
188 double the_time)
190 char time_string[STRLEN];
192 if (!fplog)
194 return;
198 int i;
199 char timebuf[STRLEN];
200 time_t temp_time = (time_t) the_time;
202 gmx_ctime_r(&temp_time, timebuf, STRLEN);
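/* Copy only the printable part; gmx_ctime_r output ends with a newline, which is dropped here */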
203 for (i = 0; timebuf[i] >= ' '; i++)
205 time_string[i] = timebuf[i];
207 time_string[i] = '\0';
210 fprintf(fplog, "%s on rank %d %s\n", title, nodeid, time_string);
213 void print_start(FILE *fplog, const t_commrec *cr,
214 gmx_walltime_accounting_t walltime_accounting,
215 const char *name)
217 char buf[STRLEN];
219 sprintf(buf, "Started %s", name);
220 print_date_and_time(fplog, cr->nodeid, buf,
221 walltime_accounting_get_start_time_stamp(walltime_accounting));
224 static void sum_forces(rvec f[], gmx::ArrayRef<const gmx::RVec> forceToAdd)
226 const int end = forceToAdd.size();
228 // cppcheck-suppress unreadVariable
229 int gmx_unused nt = gmx_omp_nthreads_get(emntDefault);
230 #pragma omp parallel for num_threads(nt) schedule(static)
231 for (int i = 0; i < end; i++)
233 rvec_inc(f[i], forceToAdd[i]);
237 static void pme_gpu_reduce_outputs(gmx_wallcycle_t wcycle,
238 ForceWithVirial *forceWithVirial,
239 ArrayRef<const gmx::RVec> pmeForces,
240 gmx_enerdata_t *enerd,
241 const tensor vir_Q,
242 real Vlr_q)
244 wallcycle_start(wcycle, ewcPME_GPU_F_REDUCTION);
245 GMX_ASSERT(forceWithVirial, "Invalid force pointer");
246 forceWithVirial->addVirialContribution(vir_Q);
247 enerd->term[F_COUL_RECIP] += Vlr_q;
248 sum_forces(as_rvec_array(forceWithVirial->force_.data()), pmeForces);
249 wallcycle_stop(wcycle, ewcPME_GPU_F_REDUCTION);
252 static void calc_virial(int start, int homenr, rvec x[], rvec f[],
253 tensor vir_part, t_graph *graph, matrix box,
254 t_nrnb *nrnb, const t_forcerec *fr, int ePBC)
256 int i;
258 /* The short-range virial from surrounding boxes */
259 calc_vir(SHIFTS, fr->shift_vec, fr->fshift, vir_part, ePBC == epbcSCREW, box);
260 inc_nrnb(nrnb, eNR_VIRIAL, SHIFTS);
262 /* Calculate partial virial, for local atoms only, based on short range.
263 * Total virial is computed in global_stat, called from do_md
265 f_calc_vir(start, start+homenr, x, f, vir_part, graph, box);
266 inc_nrnb(nrnb, eNR_VIRIAL, homenr);
268 /* Add wall contribution */
269 for (i = 0; i < DIM; i++)
271 vir_part[i][ZZ] += fr->vir_wall_z[i];
274 if (debug)
276 pr_rvecs(debug, 0, "vir_part", vir_part, DIM);
280 static void pull_potential_wrapper(const t_commrec *cr,
281 const t_inputrec *ir,
282 matrix box, ArrayRef<const RVec> x,
283 ForceWithVirial *force,
284 const t_mdatoms *mdatoms,
285 gmx_enerdata_t *enerd,
286 real *lambda,
287 double t,
288 gmx_wallcycle_t wcycle)
290 t_pbc pbc;
291 real dvdl;
293 /* Calculate the center of mass forces, this requires communication,
294 * which is why pull_potential is called close to other communication.
296 wallcycle_start(wcycle, ewcPULLPOT);
297 set_pbc(&pbc, ir->ePBC, box);
298 dvdl = 0;
299 enerd->term[F_COM_PULL] +=
300 pull_potential(ir->pull_work, mdatoms, &pbc,
301 cr, t, lambda[efptRESTRAINT], as_rvec_array(x.data()), force, &dvdl);
302 enerd->dvdl_lin[efptRESTRAINT] += dvdl;
303 wallcycle_stop(wcycle, ewcPULLPOT);
306 static void pme_receive_force_ener(const t_commrec *cr,
307 ForceWithVirial *forceWithVirial,
308 gmx_enerdata_t *enerd,
309 gmx_wallcycle_t wcycle)
311 real e_q, e_lj, dvdl_q, dvdl_lj;
312 float cycles_ppdpme, cycles_seppme;
314 cycles_ppdpme = wallcycle_stop(wcycle, ewcPPDURINGPME);
315 dd_cycles_add(cr->dd, cycles_ppdpme, ddCyclPPduringPME);
317 /* In case of node-splitting, the PP nodes receive the long-range
318 * forces, virial and energy from the PME nodes here.
320 wallcycle_start(wcycle, ewcPP_PMEWAITRECVF);
321 dvdl_q = 0;
322 dvdl_lj = 0;
323 gmx_pme_receive_f(cr, forceWithVirial, &e_q, &e_lj, &dvdl_q, &dvdl_lj,
324 &cycles_seppme);
325 enerd->term[F_COUL_RECIP] += e_q;
326 enerd->term[F_LJ_RECIP] += e_lj;
327 enerd->dvdl_lin[efptCOUL] += dvdl_q;
328 enerd->dvdl_lin[efptVDW] += dvdl_lj;
330 if (wcycle)
332 dd_cycles_add(cr->dd, cycles_seppme, ddCyclPME);
334 wallcycle_stop(wcycle, ewcPP_PMEWAITRECVF);
337 static void print_large_forces(FILE *fp,
338 const t_mdatoms *md,
339 const t_commrec *cr,
340 gmx_int64_t step,
341 real forceTolerance,
342 const rvec *x,
343 const rvec *f)
345 real force2Tolerance = gmx::square(forceTolerance);
346 std::uintmax_t numNonFinite = 0;
347 for (int i = 0; i < md->homenr; i++)
349 real force2 = norm2(f[i]);
350 bool nonFinite = !std::isfinite(force2);
351 if (force2 >= force2Tolerance || nonFinite)
353 fprintf(fp, "step %" GMX_PRId64 " atom %6d x %8.3f %8.3f %8.3f force %12.5e\n",
354 step,
355 ddglatnr(cr->dd, i), x[i][XX], x[i][YY], x[i][ZZ], std::sqrt(force2));
357 if (nonFinite)
359 numNonFinite++;
362 if (numNonFinite > 0)
364 /* Note that with MPI this fatal call on one rank might interrupt
365 * the printing on other ranks. But we can only avoid that with
366 * an expensive MPI barrier that we would need at each step.
368 gmx_fatal(FARGS, "At step %" GMX_PRId64 " detected non-finite forces on %ju atoms", step, numNonFinite);
372 static void post_process_forces(const t_commrec *cr,
373 gmx_int64_t step,
374 t_nrnb *nrnb,
375 gmx_wallcycle_t wcycle,
376 const gmx_localtop_t *top,
377 matrix box,
378 rvec x[],
379 rvec f[],
380 ForceWithVirial *forceWithVirial,
381 tensor vir_force,
382 const t_mdatoms *mdatoms,
383 t_graph *graph,
384 t_forcerec *fr,
385 const gmx_vsite_t *vsite,
386 int flags)
388 if (fr->haveDirectVirialContributions)
390 rvec *fDirectVir = as_rvec_array(forceWithVirial->force_.data());
392 if (vsite)
394 /* Spread the mesh force on virtual sites to the other particles...
395 * This is parallelized. MPI communication is performed
396 * if the constructing atoms aren't local.
398 matrix virial = { { 0 } };
399 spread_vsite_f(vsite, x, fDirectVir, nullptr,
400 (flags & GMX_FORCE_VIRIAL), virial,
401 nrnb,
402 &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr, wcycle);
403 forceWithVirial->addVirialContribution(virial);
406 if (flags & GMX_FORCE_VIRIAL)
408 /* Now add the forces, this is local */
409 sum_forces(f, forceWithVirial->force_);
411 /* Add the direct virial contributions */
412 GMX_ASSERT(forceWithVirial->computeVirial_, "forceWithVirial should request virial computation when we request the virial");
413 m_add(vir_force, forceWithVirial->getVirial(), vir_force);
415 if (debug)
417 pr_rvecs(debug, 0, "vir_force", vir_force, DIM);
422 if (fr->print_force >= 0)
424 print_large_forces(stderr, mdatoms, cr, step, fr->print_force, x, f);
428 static void do_nb_verlet(t_forcerec *fr,
429 interaction_const_t *ic,
430 gmx_enerdata_t *enerd,
431 int flags, int ilocality,
432 int clearF,
433 gmx_int64_t step,
434 t_nrnb *nrnb,
435 gmx_wallcycle_t wcycle)
437 if (!(flags & GMX_FORCE_NONBONDED))
439 /* skip non-bonded calculation */
440 return;
443 nonbonded_verlet_t *nbv = fr->nbv;
444 nonbonded_verlet_group_t *nbvg = &nbv->grp[ilocality];
446 /* GPU kernel launch overhead is already timed separately */
447 if (fr->cutoff_scheme != ecutsVERLET)
449 gmx_incons("Invalid cut-off scheme passed!");
452 bool bUsingGpuKernels = (nbvg->kernel_type == nbnxnk8x8x8_GPU);
454 if (!bUsingGpuKernels)
456 /* When dynamic pair-list pruning is requested, we need to prune
457 * at nstlistPrune steps.
459 if (nbv->listParams->useDynamicPruning &&
460 (step - nbvg->nbl_lists.outerListCreationStep) % nbv->listParams->nstlistPrune == 0)
462 /* Prune the pair-list beyond fr->ic->rlistPrune using
463 * the current coordinates of the atoms.
465 wallcycle_sub_start(wcycle, ewcsNONBONDED_PRUNING);
466 nbnxn_kernel_cpu_prune(nbvg, nbv->nbat, fr->shift_vec, nbv->listParams->rlistInner);
467 wallcycle_sub_stop(wcycle, ewcsNONBONDED_PRUNING);
470 wallcycle_sub_start(wcycle, ewcsNONBONDED);
473 switch (nbvg->kernel_type)
475 case nbnxnk4x4_PlainC:
476 case nbnxnk4xN_SIMD_4xN:
477 case nbnxnk4xN_SIMD_2xNN:
478 nbnxn_kernel_cpu(nbvg,
479 nbv->nbat,
481 fr->shift_vec,
482 flags,
483 clearF,
484 fr->fshift[0],
485 enerd->grpp.ener[egCOULSR],
486 fr->bBHAM ?
487 enerd->grpp.ener[egBHAMSR] :
488 enerd->grpp.ener[egLJSR]);
489 break;
491 case nbnxnk8x8x8_GPU:
492 nbnxn_gpu_launch_kernel(nbv->gpu_nbv, nbv->nbat, flags, ilocality);
493 break;
495 case nbnxnk8x8x8_PlainC:
496 nbnxn_kernel_gpu_ref(nbvg->nbl_lists.nbl[0],
497 nbv->nbat, ic,
498 fr->shift_vec,
499 flags,
500 clearF,
501 nbv->nbat->out[0].f,
502 fr->fshift[0],
503 enerd->grpp.ener[egCOULSR],
504 fr->bBHAM ?
505 enerd->grpp.ener[egBHAMSR] :
506 enerd->grpp.ener[egLJSR]);
507 break;
509 default:
510 GMX_RELEASE_ASSERT(false, "Invalid nonbonded kernel type passed!");
513 if (!bUsingGpuKernels)
515 wallcycle_sub_stop(wcycle, ewcsNONBONDED);
518 int enr_nbnxn_kernel_ljc, enr_nbnxn_kernel_lj;
519 if (EEL_RF(ic->eeltype) || ic->eeltype == eelCUT)
521 enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_RF;
523 else if ((!bUsingGpuKernels && nbvg->ewald_excl == ewaldexclAnalytical) ||
524 (bUsingGpuKernels && nbnxn_gpu_is_kernel_ewald_analytical(nbv->gpu_nbv)))
526 enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_EWALD;
528 else
530 enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_TAB;
532 enr_nbnxn_kernel_lj = eNR_NBNXN_LJ;
533 if (flags & GMX_FORCE_ENERGY)
535 /* In eNR_??? the nbnxn F+E kernels are always the F kernel + 1 */
536 enr_nbnxn_kernel_ljc += 1;
537 enr_nbnxn_kernel_lj += 1;
540 inc_nrnb(nrnb, enr_nbnxn_kernel_ljc,
541 nbvg->nbl_lists.natpair_ljq);
542 inc_nrnb(nrnb, enr_nbnxn_kernel_lj,
543 nbvg->nbl_lists.natpair_lj);
544 /* The Coulomb-only kernels are offset -eNR_NBNXN_LJ_RF+eNR_NBNXN_RF */
545 inc_nrnb(nrnb, enr_nbnxn_kernel_ljc-eNR_NBNXN_LJ_RF+eNR_NBNXN_RF,
546 nbvg->nbl_lists.natpair_q);
548 if (ic->vdw_modifier == eintmodFORCESWITCH)
550 /* We add up the switch cost separately */
551 inc_nrnb(nrnb, eNR_NBNXN_ADD_LJ_FSW+((flags & GMX_FORCE_ENERGY) ? 1 : 0),
552 nbvg->nbl_lists.natpair_ljq + nbvg->nbl_lists.natpair_lj);
554 if (ic->vdw_modifier == eintmodPOTSWITCH)
556 /* We add up the switch cost separately */
557 inc_nrnb(nrnb, eNR_NBNXN_ADD_LJ_PSW+((flags & GMX_FORCE_ENERGY) ? 1 : 0),
558 nbvg->nbl_lists.natpair_ljq + nbvg->nbl_lists.natpair_lj);
560 if (ic->vdwtype == evdwPME)
562 /* We add up the LJ Ewald cost separately */
563 inc_nrnb(nrnb, eNR_NBNXN_ADD_LJ_EWALD+((flags & GMX_FORCE_ENERGY) ? 1 : 0),
564 nbvg->nbl_lists.natpair_ljq + nbvg->nbl_lists.natpair_lj);
568 static void do_nb_verlet_fep(nbnxn_pairlist_set_t *nbl_lists,
569 t_forcerec *fr,
570 rvec x[],
571 rvec f[],
572 const t_mdatoms *mdatoms,
573 t_lambda *fepvals,
574 real *lambda,
575 gmx_enerdata_t *enerd,
576 int flags,
577 t_nrnb *nrnb,
578 gmx_wallcycle_t wcycle)
580 int donb_flags;
581 nb_kernel_data_t kernel_data;
582 real lam_i[efptNR];
583 real dvdl_nb[efptNR];
584 int th;
585 int i, j;
587 donb_flags = 0;
588 /* Add short-range interactions */
589 donb_flags |= GMX_NONBONDED_DO_SR;
591 /* Currently all group scheme kernels always calculate (shift-)forces */
592 if (flags & GMX_FORCE_FORCES)
594 donb_flags |= GMX_NONBONDED_DO_FORCE;
596 if (flags & GMX_FORCE_VIRIAL)
598 donb_flags |= GMX_NONBONDED_DO_SHIFTFORCE;
600 if (flags & GMX_FORCE_ENERGY)
602 donb_flags |= GMX_NONBONDED_DO_POTENTIAL;
605 kernel_data.flags = donb_flags;
606 kernel_data.lambda = lambda;
607 kernel_data.dvdl = dvdl_nb;
609 kernel_data.energygrp_elec = enerd->grpp.ener[egCOULSR];
610 kernel_data.energygrp_vdw = enerd->grpp.ener[egLJSR];
612 /* reset free energy components */
613 for (i = 0; i < efptNR; i++)
615 dvdl_nb[i] = 0;
618 GMX_ASSERT(gmx_omp_nthreads_get(emntNonbonded) == nbl_lists->nnbl, "Number of lists should be same as number of NB threads");
620 wallcycle_sub_start(wcycle, ewcsNONBONDED);
621 #pragma omp parallel for schedule(static) num_threads(nbl_lists->nnbl)
622 for (th = 0; th < nbl_lists->nnbl; th++)
626 gmx_nb_free_energy_kernel(nbl_lists->nbl_fep[th],
627 x, f, fr, mdatoms, &kernel_data, nrnb);
629 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
632 if (fepvals->sc_alpha != 0)
634 enerd->dvdl_nonlin[efptVDW] += dvdl_nb[efptVDW];
635 enerd->dvdl_nonlin[efptCOUL] += dvdl_nb[efptCOUL];
637 else
639 enerd->dvdl_lin[efptVDW] += dvdl_nb[efptVDW];
640 enerd->dvdl_lin[efptCOUL] += dvdl_nb[efptCOUL];
643 /* If we do foreign lambda and we have soft-core interactions
644 * we have to recalculate the (non-linear) energy contributions.
646 if (fepvals->n_lambda > 0 && (flags & GMX_FORCE_DHDL) && fepvals->sc_alpha != 0)
648 kernel_data.flags = (donb_flags & ~(GMX_NONBONDED_DO_FORCE | GMX_NONBONDED_DO_SHIFTFORCE)) | GMX_NONBONDED_DO_FOREIGNLAMBDA;
649 kernel_data.lambda = lam_i;
650 kernel_data.energygrp_elec = enerd->foreign_grpp.ener[egCOULSR];
651 kernel_data.energygrp_vdw = enerd->foreign_grpp.ener[egLJSR];
652 /* Note that we add to kernel_data.dvdl, but ignore the result */
654 for (i = 0; i < enerd->n_lambda; i++)
656 for (j = 0; j < efptNR; j++)
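/* i == 0 is the current lambda state; i > 0 indexes the foreign lambda values */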
658 lam_i[j] = (i == 0 ? lambda[j] : fepvals->all_lambda[j][i-1]);
660 reset_foreign_enerdata(enerd);
661 #pragma omp parallel for schedule(static) num_threads(nbl_lists->nnbl)
662 for (th = 0; th < nbl_lists->nnbl; th++)
666 gmx_nb_free_energy_kernel(nbl_lists->nbl_fep[th],
667 x, f, fr, mdatoms, &kernel_data, nrnb);
669 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
672 sum_epot(&(enerd->foreign_grpp), enerd->foreign_term);
673 enerd->enerpart_lambda[i] += enerd->foreign_term[F_EPOT];
677 wallcycle_sub_stop(wcycle, ewcsNONBONDED);
680 gmx_bool use_GPU(const nonbonded_verlet_t *nbv)
682 return nbv != nullptr && nbv->bUseGPU;
685 static inline void clear_rvecs_omp(int n, rvec v[])
687 int nth = gmx_omp_nthreads_get_simple_rvec_task(emntDefault, n);
689 /* Note that we would like to avoid this conditional by putting it
690 * into the omp pragma instead, but then we still take the full
691 * omp parallel for overhead (at least with gcc5).
693 if (nth == 1)
695 for (int i = 0; i < n; i++)
697 clear_rvec(v[i]);
700 else
702 #pragma omp parallel for num_threads(nth) schedule(static)
703 for (int i = 0; i < n; i++)
705 clear_rvec(v[i]);
710 /*! \brief Return an estimate of the average kinetic energy or 0 when unreliable
712 * \param groupOptions Group options, containing T-coupling options
714 static real averageKineticEnergyEstimate(const t_grpopts &groupOptions)
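/* Equipartition gives kB*T_ref/2 per temperature-coupled degree of freedom; the coupled-group sum is scaled up below to also account for the uncoupled degrees of freedom */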
716 real nrdfCoupled = 0;
717 real nrdfUncoupled = 0;
718 real kineticEnergy = 0;
719 for (int g = 0; g < groupOptions.ngtc; g++)
721 if (groupOptions.tau_t[g] >= 0)
723 nrdfCoupled += groupOptions.nrdf[g];
724 kineticEnergy += groupOptions.nrdf[g]*0.5*groupOptions.ref_t[g]*BOLTZ;
726 else
728 nrdfUncoupled += groupOptions.nrdf[g];
732 /* This conditional with > also catches nrdf=0 */
733 if (nrdfCoupled > nrdfUncoupled)
735 return kineticEnergy*(nrdfCoupled + nrdfUncoupled)/nrdfCoupled;
737 else
739 return 0;
743 /*! \brief This routine checks that the potential energy is finite.
745 * Always checks that the potential energy is finite. If step equals
746 * inputrec.init_step also checks that the magnitude of the potential energy
747 * is reasonable. Terminates with a fatal error when a check fails.
748 * Note that passing this check does not guarantee finite forces,
749 * since those use slightly different arithmetic. But in most cases
750 * there is just a narrow coordinate range where forces are not finite
751 * and energies are finite.
753 * \param[in] step The step number, used for checking and printing
754 * \param[in] enerd The energy data; the non-bonded group energies need to be added to enerd.term[F_EPOT] before calling this routine
755 * \param[in] inputrec The input record
757 static void checkPotentialEnergyValidity(gmx_int64_t step,
758 const gmx_enerdata_t &enerd,
759 const t_inputrec &inputrec)
761 /* Threshold valid for comparing absolute potential energy against
762 * the kinetic energy. Normally one should not consider absolute
763 * potential energy values, but with a factor of one million
764 * we should never get false positives.
766 constexpr real c_thresholdFactor = 1e6;
768 bool energyIsNotFinite = !std::isfinite(enerd.term[F_EPOT]);
769 real averageKineticEnergy = 0;
770 /* We only check for large potential energy at the initial step,
771 * because that is by far the most likely step for this to occur
772 * and because computing the average kinetic energy is not free.
773 * Note: nstcalcenergy >> 1 often does not allow catching large energies
774 * before they become NaN.
776 if (step == inputrec.init_step && EI_DYNAMICS(inputrec.eI))
778 averageKineticEnergy = averageKineticEnergyEstimate(inputrec.opts);
781 if (energyIsNotFinite || (averageKineticEnergy > 0 &&
782 enerd.term[F_EPOT] > c_thresholdFactor*averageKineticEnergy))
784 gmx_fatal(FARGS, "Step %" GMX_PRId64 ": The total potential energy is %g, which is %s. The LJ and electrostatic contributions to the energy are %g and %g, respectively. A %s potential energy can be caused by overlapping interactions in bonded interactions or very large%s coordinate values. Usually this is caused by a badly- or non-equilibrated initial configuration, incorrect interactions or parameters in the topology.",
785 step,
786 enerd.term[F_EPOT],
787 energyIsNotFinite ? "not finite" : "extremely high",
788 enerd.term[F_LJ],
789 enerd.term[F_COUL_SR],
790 energyIsNotFinite ? "non-finite" : "very high",
791 energyIsNotFinite ? " or NaN" : "");
795 /*! \brief Compute forces and/or energies for special algorithms
797 * The intention is to collect all calls to algorithms that compute
798 * forces on local atoms only and that do not contribute to the local
799 * virial sum (but add their virial contribution separately).
800 * Eventually these should likely all become ForceProviders.
801 * Within this function the intention is to have algorithms that do
802 * global communication at the end, so global barriers within the MD loop
803 * are as close together as possible.
805 * \param[in] fplog The log file
806 * \param[in] cr The communication record
807 * \param[in] inputrec The input record
808 * \param[in] awh The Awh module (nullptr if none in use).
809 * \param[in] step The current MD step
810 * \param[in] t The current time
811 * \param[in,out] wcycle Wallcycle accounting struct
812 * \param[in,out] forceProviders Pointer to a list of force providers
813 * \param[in] box The unit cell
814 * \param[in] x The coordinates
815 * \param[in] mdatoms Per atom properties
816 * \param[in] lambda Array of free-energy lambda values
817 * \param[in] forceFlags Flags that tell whether we should compute forces/energies/virial
818 * \param[in,out] forceWithVirial Force and virial buffers
819 * \param[in,out] enerd Energy buffer
820 * \param[in,out] ed Essential dynamics pointer
821 * \param[in] bNS Tells if we did neighbor searching this step, used for ED sampling
823 * \todo Remove bNS, which is used incorrectly.
824 * \todo Convert all other algorithms called here to ForceProviders.
826 static void
827 computeSpecialForces(FILE *fplog,
828 const t_commrec *cr,
829 const t_inputrec *inputrec,
830 gmx::Awh *awh,
831 gmx_int64_t step,
832 double t,
833 gmx_wallcycle_t wcycle,
834 ForceProviders *forceProviders,
835 matrix box,
836 ArrayRef<const RVec> x,
837 const t_mdatoms *mdatoms,
838 real *lambda,
839 int forceFlags,
840 ForceWithVirial *forceWithVirial,
841 gmx_enerdata_t *enerd,
842 const gmx_edsam *ed,
843 gmx_bool bNS)
845 const bool computeForces = (forceFlags & GMX_FORCE_FORCES);
847 /* NOTE: Currently all ForceProviders only provide forces.
848 * When they also provide energies, remove this conditional.
850 if (computeForces)
852 ForceProviderInput forceProviderInput(x, *mdatoms, t, box, *cr);
853 ForceProviderOutput forceProviderOutput(forceWithVirial, enerd);
855 /* Collect forces from modules */
856 forceProviders->calculateForces(forceProviderInput, &forceProviderOutput);
859 if (inputrec->bPull && pull_have_potential(inputrec->pull_work))
861 pull_potential_wrapper(cr, inputrec, box, x,
862 forceWithVirial,
863 mdatoms, enerd, lambda, t,
864 wcycle);
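/* The AWH bias potential is accumulated into the COM pull energy term; this call also updates the bias */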
866 if (awh)
868 enerd->term[F_COM_PULL] +=
869 awh->applyBiasForcesAndUpdateBias(inputrec->ePBC, *mdatoms, box,
870 forceWithVirial,
871 t, step, wcycle, fplog);
875 rvec *f = as_rvec_array(forceWithVirial->force_.data());
877 /* Add the forces from enforced rotation potentials (if any) */
878 if (inputrec->bRot)
880 wallcycle_start(wcycle, ewcROTadd);
881 enerd->term[F_COM_PULL] += add_rot_forces(inputrec->rot, f, cr, step, t);
882 wallcycle_stop(wcycle, ewcROTadd);
885 if (ed)
887 /* Note that since init_edsam() is called after the initialization
888 * of forcerec, edsam doesn't request the noVirSum force buffer.
889 * Thus if no other algorithm (e.g. PME) requires it, the forces
890 * here will contribute to the virial.
892 do_flood(cr, inputrec, as_rvec_array(x.data()), f, ed, box, step, bNS);
895 /* Add forces from interactive molecular dynamics (IMD), if bIMD == TRUE. */
896 if (inputrec->bIMD && computeForces)
898 IMD_apply_forces(inputrec->bIMD, inputrec->imd, cr, f, wcycle);
902 /*! \brief Launch the prepare_step and spread stages of PME GPU.
904 * \param[in] pmedata The PME structure
905 * \param[in] box The box matrix
906 * \param[in] x Coordinate array
907 * \param[in] flags Force flags
908 * \param[in] wcycle The wallcycle structure
910 static inline void launchPmeGpuSpread(gmx_pme_t *pmedata,
911 matrix box,
912 rvec x[],
913 int flags,
914 gmx_wallcycle_t wcycle)
916 int pmeFlags = GMX_PME_SPREAD | GMX_PME_SOLVE;
917 pmeFlags |= (flags & GMX_FORCE_FORCES) ? GMX_PME_CALC_F : 0;
918 pmeFlags |= (flags & GMX_FORCE_VIRIAL) ? GMX_PME_CALC_ENER_VIR : 0;
920 pme_gpu_prepare_computation(pmedata, flags & GMX_FORCE_DYNAMICBOX, box, wcycle, pmeFlags);
921 pme_gpu_launch_spread(pmedata, x, wcycle);
924 /*! \brief Launch the FFT and gather stages of PME GPU
926 * This function only implements setting the output forces (no accumulation).
928 * \param[in] pmedata The PME structure
929 * \param[in] wcycle The wallcycle structure
931 static void launchPmeGpuFftAndGather(gmx_pme_t *pmedata,
932 gmx_wallcycle_t wcycle)
934 pme_gpu_launch_complex_transforms(pmedata, wcycle);
935 pme_gpu_launch_gather(pmedata, wcycle, PmeForceOutputHandling::Set);
938 /*! \brief
939 * Polling wait for either of the PME or nonbonded GPU tasks.
941 * Instead of a static order in waiting for GPU tasks, this function
942 * polls checking which of the two tasks completes first, and does the
943 * associated force buffer reduction overlapped with the other task.
944 * By doing that, unlike static scheduling order, it can always overlap
945 * one of the reductions, regardless of the GPU task completion order.
947 * \param[in] nbv Nonbonded verlet structure
948 * \param[in] pmedata PME module data
949 * \param[in,out] force Force array to reduce task outputs into.
950 * \param[in,out] forceWithVirial Force and virial buffers
951 * \param[in,out] fshift Shift force output vector results are reduced into
952 * \param[in,out] enerd Energy data structure results are reduced into
953 * \param[in] flags Force flags
954 * \param[in] wcycle The wallcycle structure
956 static void alternatePmeNbGpuWaitReduce(nonbonded_verlet_t *nbv,
957 const gmx_pme_t *pmedata,
958 gmx::PaddedArrayRef<gmx::RVec> *force,
959 ForceWithVirial *forceWithVirial,
960 rvec fshift[],
961 gmx_enerdata_t *enerd,
962 int flags,
963 gmx_wallcycle_t wcycle)
965 bool isPmeGpuDone = false;
966 bool isNbGpuDone = false;
969 gmx::ArrayRef<const gmx::RVec> pmeGpuForces;
971 while (!isPmeGpuDone || !isNbGpuDone)
973 if (!isPmeGpuDone)
975 matrix vir_Q;
976 real Vlr_q;
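/* Poll (Check) while the other GPU task is still outstanding; once it has completed there is nothing left to overlap, so block (Wait) */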
978 GpuTaskCompletion completionType = (isNbGpuDone) ? GpuTaskCompletion::Wait : GpuTaskCompletion::Check;
979 isPmeGpuDone = pme_gpu_try_finish_task(pmedata, wcycle, &pmeGpuForces,
980 vir_Q, &Vlr_q, completionType);
982 if (isPmeGpuDone)
984 pme_gpu_reduce_outputs(wcycle, forceWithVirial, pmeGpuForces,
985 enerd, vir_Q, Vlr_q);
989 if (!isNbGpuDone)
991 GpuTaskCompletion completionType = (isPmeGpuDone) ? GpuTaskCompletion::Wait : GpuTaskCompletion::Check;
992 wallcycle_start_nocount(wcycle, ewcWAIT_GPU_NB_L);
993 isNbGpuDone = nbnxn_gpu_try_finish_task(nbv->gpu_nbv,
994 flags, eatLocal,
995 enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
996 fshift, completionType);
997 wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
998 // To get the call count right, when the task finished we
999 // issue a start/stop.
1000 // TODO: move the ewcWAIT_GPU_NB_L cycle counting into nbnxn_gpu_try_finish_task()
1001 // and ewcNB_XF_BUF_OPS counting into nbnxn_atomdata_add_nbat_f_to_f().
1002 if (isNbGpuDone)
1004 wallcycle_start(wcycle, ewcWAIT_GPU_NB_L);
1005 wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
1007 nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), eatLocal,
1008 nbv->nbat, as_rvec_array(force->data()), wcycle);
1014 /*! \brief
1015 * Launch the dynamic rolling pruning GPU task.
1017 * We currently alternate local/non-local list pruning in odd-even steps
1018 * (only pruning every second step without DD).
1020 * \param[in] cr The communication record
1021 * \param[in] nbv Nonbonded verlet structure
1022 * \param[in] inputrec The input record
1023 * \param[in] step The current MD step
1025 static inline void launchGpuRollingPruning(const t_commrec *cr,
1026 const nonbonded_verlet_t *nbv,
1027 const t_inputrec *inputrec,
1028 const gmx_int64_t step)
1030 /* We should not launch the rolling pruning kernel at a search
1031 * step or just before search steps, since that's useless.
1032 * Without domain decomposition we prune at even steps.
1033 * With domain decomposition we alternate local and non-local
1034 * pruning at even and odd steps.
1036 int numRollingParts = nbv->listParams->numRollingParts;
1037 GMX_ASSERT(numRollingParts == nbv->listParams->nstlistPrune/2, "Since we alternate local/non-local at even/odd steps, we need numRollingParts<=nstlistPrune/2 for correctness and == for efficiency");
1038 int stepWithCurrentList = step - nbv->grp[eintLocal].nbl_lists.outerListCreationStep;
1039 bool stepIsEven = ((stepWithCurrentList & 1) == 0);
1040 if (stepWithCurrentList > 0 &&
1041 stepWithCurrentList < inputrec->nstlist - 1 &&
1042 (stepIsEven || DOMAINDECOMP(cr)))
1044 nbnxn_gpu_launch_kernel_pruneonly(nbv->gpu_nbv,
1045 stepIsEven ? eintLocal : eintNonlocal,
1046 numRollingParts);
1050 static void do_force_cutsVERLET(FILE *fplog,
1051 const t_commrec *cr,
1052 const gmx_multisim_t *ms,
1053 const t_inputrec *inputrec,
1054 gmx::Awh *awh,
1055 gmx_int64_t step,
1056 t_nrnb *nrnb,
1057 gmx_wallcycle_t wcycle,
1058 const gmx_localtop_t *top,
1059 const gmx_groups_t * /* groups */,
1060 matrix box, gmx::PaddedArrayRef<gmx::RVec> x,
1061 history_t *hist,
1062 gmx::PaddedArrayRef<gmx::RVec> force,
1063 tensor vir_force,
1064 const t_mdatoms *mdatoms,
1065 gmx_enerdata_t *enerd, t_fcdata *fcd,
1066 real *lambda,
1067 t_graph *graph,
1068 t_forcerec *fr,
1069 interaction_const_t *ic,
1070 const gmx_vsite_t *vsite,
1071 rvec mu_tot,
1072 double t,
1073 const gmx_edsam *ed,
1074 int flags,
1075 DdOpenBalanceRegionBeforeForceComputation ddOpenBalanceRegion,
1076 DdCloseBalanceRegionAfterForceComputation ddCloseBalanceRegion)
1078 int cg1, i, j;
1079 double mu[2*DIM];
1080 gmx_bool bStateChanged, bNS, bFillGrid, bCalcCGCM;
1081 gmx_bool bDoForces, bUseGPU, bUseOrEmulGPU;
1082 rvec vzero, box_diag;
1083 float cycles_pme, cycles_wait_gpu;
1084 nonbonded_verlet_t *nbv = fr->nbv;
1086 bStateChanged = (flags & GMX_FORCE_STATECHANGED);
1087 bNS = (flags & GMX_FORCE_NS) && (fr->bAllvsAll == FALSE);
1088 bFillGrid = (bNS && bStateChanged);
1089 bCalcCGCM = (bFillGrid && !DOMAINDECOMP(cr));
1090 bDoForces = (flags & GMX_FORCE_FORCES);
1091 bUseGPU = fr->nbv->bUseGPU;
1092 bUseOrEmulGPU = bUseGPU || (fr->nbv->emulateGpu == EmulateGpuNonbonded::Yes);
1094 const auto pmeRunMode = fr->pmedata ? pme_run_mode(fr->pmedata) : PmeRunMode::CPU;
1095 // TODO slim this conditional down - inputrec and duty checks should mean the same in proper code!
1096 const bool useGpuPme = EEL_PME(fr->ic->eeltype) && thisRankHasDuty(cr, DUTY_PME) &&
1097 ((pmeRunMode == PmeRunMode::GPU) || (pmeRunMode == PmeRunMode::Mixed));
1099 /* At a search step we need to start the first balancing region
1100 * somewhere early inside the step after communication during domain
1101 * decomposition (and not during the previous step as usual).
1103 if (bNS &&
1104 ddOpenBalanceRegion == DdOpenBalanceRegionBeforeForceComputation::yes)
1106 ddOpenBalanceRegionCpu(cr->dd, DdAllowBalanceRegionReopen::yes);
1109 cycles_wait_gpu = 0;
1111 const int start = 0;
1112 const int homenr = mdatoms->homenr;
1114 clear_mat(vir_force);
1116 if (DOMAINDECOMP(cr))
1118 cg1 = cr->dd->ncg_tot;
1120 else
1122 cg1 = top->cgs.nr;
1124 if (fr->n_tpi > 0)
1126 cg1--;
1129 if (bStateChanged)
1131 update_forcerec(fr, box);
1133 if (inputrecNeedMutot(inputrec))
1135 /* Calculate total (local) dipole moment in a temporary common array.
1136 * This makes it possible to sum them over nodes faster.
1138 calc_mu(start, homenr,
1139 x, mdatoms->chargeA, mdatoms->chargeB, mdatoms->nChargePerturbed,
1140 mu, mu+DIM);
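/* mu holds the state A dipole and mu+DIM the state B dipole; they are summed over ranks and copied into fr->mu_tot further down */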
1144 if (fr->ePBC != epbcNONE)
1146 /* Compute shift vectors every step,
1147 * because of pressure coupling or box deformation!
1149 if ((flags & GMX_FORCE_DYNAMICBOX) && bStateChanged)
1151 calc_shifts(box, fr->shift_vec);
1154 if (bCalcCGCM)
1156 put_atoms_in_box_omp(fr->ePBC, box, x.subArray(0, homenr));
1157 inc_nrnb(nrnb, eNR_SHIFTX, homenr);
1159 else if (EI_ENERGY_MINIMIZATION(inputrec->eI) && graph)
1161 unshift_self(graph, box, as_rvec_array(x.data()));
1165 nbnxn_atomdata_copy_shiftvec(flags & GMX_FORCE_DYNAMICBOX,
1166 fr->shift_vec, nbv->nbat);
1168 #if GMX_MPI
1169 if (!thisRankHasDuty(cr, DUTY_PME))
1171 /* Send particle coordinates to the pme nodes.
1172 * Since this is only implemented for domain decomposition
1173 * and domain decomposition does not use the graph,
1174 * we do not need to worry about shifting.
1176 gmx_pme_send_coordinates(cr, box, as_rvec_array(x.data()),
1177 lambda[efptCOUL], lambda[efptVDW],
1178 (flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY)),
1179 step, wcycle);
1181 #endif /* GMX_MPI */
1183 if (useGpuPme)
1185 launchPmeGpuSpread(fr->pmedata, box, as_rvec_array(x.data()), flags, wcycle);
1188 /* do gridding for pair search */
1189 if (bNS)
1191 if (graph && bStateChanged)
1193 /* Calculate intramolecular shift vectors to make molecules whole */
1194 mk_mshift(fplog, graph, fr->ePBC, box, as_rvec_array(x.data()));
1197 clear_rvec(vzero);
1198 box_diag[XX] = box[XX][XX];
1199 box_diag[YY] = box[YY][YY];
1200 box_diag[ZZ] = box[ZZ][ZZ];
1202 wallcycle_start(wcycle, ewcNS);
1203 if (!DOMAINDECOMP(cr))
1205 wallcycle_sub_start(wcycle, ewcsNBS_GRID_LOCAL);
1206 nbnxn_put_on_grid(nbv->nbs.get(), fr->ePBC, box,
1207 0, vzero, box_diag,
1208 0, mdatoms->homenr, -1, fr->cginfo, as_rvec_array(x.data()),
1209 0, nullptr,
1210 nbv->grp[eintLocal].kernel_type,
1211 nbv->nbat);
1212 wallcycle_sub_stop(wcycle, ewcsNBS_GRID_LOCAL);
1214 else
1216 wallcycle_sub_start(wcycle, ewcsNBS_GRID_NONLOCAL);
1217 nbnxn_put_on_grid_nonlocal(nbv->nbs.get(), domdec_zones(cr->dd),
1218 fr->cginfo, as_rvec_array(x.data()),
1219 nbv->grp[eintNonlocal].kernel_type,
1220 nbv->nbat);
1221 wallcycle_sub_stop(wcycle, ewcsNBS_GRID_NONLOCAL);
1224 nbnxn_atomdata_set(nbv->nbat, nbv->nbs.get(), mdatoms, fr->cginfo);
1225 wallcycle_stop(wcycle, ewcNS);
1228 /* initialize the GPU atom data and copy shift vector */
1229 if (bUseGPU)
1231 wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
1232 wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1234 if (bNS)
1236 nbnxn_gpu_init_atomdata(nbv->gpu_nbv, nbv->nbat);
1239 nbnxn_gpu_upload_shiftvec(nbv->gpu_nbv, nbv->nbat);
1241 wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1242 wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1245 /* do local pair search */
1246 if (bNS)
1248 wallcycle_start_nocount(wcycle, ewcNS);
1249 wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_LOCAL);
1250 nbnxn_make_pairlist(nbv->nbs.get(), nbv->nbat,
1251 &top->excls,
1252 nbv->listParams->rlistOuter,
1253 nbv->min_ci_balanced,
1254 &nbv->grp[eintLocal].nbl_lists,
1255 eintLocal,
1256 nbv->grp[eintLocal].kernel_type,
1257 nrnb);
1258 nbv->grp[eintLocal].nbl_lists.outerListCreationStep = step;
1259 if (nbv->listParams->useDynamicPruning && !bUseGPU)
1261 nbnxnPrepareListForDynamicPruning(&nbv->grp[eintLocal].nbl_lists);
1263 wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_LOCAL);
1265 if (bUseGPU)
1267 /* initialize local pair-list on the GPU */
1268 nbnxn_gpu_init_pairlist(nbv->gpu_nbv,
1269 nbv->grp[eintLocal].nbl_lists.nbl[0],
1270 eintLocal);
1272 wallcycle_stop(wcycle, ewcNS);
1274 else
1276 nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs.get(), eatLocal, FALSE, as_rvec_array(x.data()),
1277 nbv->nbat, wcycle);
1280 if (bUseGPU)
1282 if (DOMAINDECOMP(cr))
1284 ddOpenBalanceRegionGpu(cr->dd);
1287 wallcycle_start(wcycle, ewcLAUNCH_GPU);
1288 wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1289 /* launch local nonbonded work on GPU */
1290 do_nb_verlet(fr, ic, enerd, flags, eintLocal, enbvClearFNo,
1291 step, nrnb, wcycle);
1292 wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1293 wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1296 if (useGpuPme)
1298 // In PME GPU and mixed mode we launch FFT / gather after the
1299 // X copy/transform to allow overlap as well as after the GPU NB
1300 // launch to avoid FFT launch overhead hijacking the CPU and delaying
1301 // the nonbonded kernel.
1302 launchPmeGpuFftAndGather(fr->pmedata, wcycle);
1305 /* Communicate coordinates and sum dipole if necessary +
1306 do non-local pair search */
1307 if (DOMAINDECOMP(cr))
1309 if (bNS)
1311 wallcycle_start_nocount(wcycle, ewcNS);
1312 wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_NONLOCAL);
1314 nbnxn_make_pairlist(nbv->nbs.get(), nbv->nbat,
1315 &top->excls,
1316 nbv->listParams->rlistOuter,
1317 nbv->min_ci_balanced,
1318 &nbv->grp[eintNonlocal].nbl_lists,
1319 eintNonlocal,
1320 nbv->grp[eintNonlocal].kernel_type,
1321 nrnb);
1322 nbv->grp[eintNonlocal].nbl_lists.outerListCreationStep = step;
1323 if (nbv->listParams->useDynamicPruning && !bUseGPU)
1325 nbnxnPrepareListForDynamicPruning(&nbv->grp[eintNonlocal].nbl_lists);
1327 wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_NONLOCAL);
1329 if (nbv->grp[eintNonlocal].kernel_type == nbnxnk8x8x8_GPU)
1331 /* initialize non-local pair-list on the GPU */
1332 nbnxn_gpu_init_pairlist(nbv->gpu_nbv,
1333 nbv->grp[eintNonlocal].nbl_lists.nbl[0],
1334 eintNonlocal);
1336 wallcycle_stop(wcycle, ewcNS);
1338 else
1340 dd_move_x(cr->dd, box, as_rvec_array(x.data()), wcycle);
1342 nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs.get(), eatNonlocal, FALSE, as_rvec_array(x.data()),
1343 nbv->nbat, wcycle);
1346 if (bUseGPU)
1348 wallcycle_start(wcycle, ewcLAUNCH_GPU);
1349 wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1350 /* launch non-local nonbonded tasks on GPU */
1351 do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFNo,
1352 step, nrnb, wcycle);
1353 wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1354 wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1358 if (bUseGPU)
1360 /* launch D2H copy-back F */
1361 wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
1362 wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1363 if (DOMAINDECOMP(cr))
1365 nbnxn_gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat,
1366 flags, eatNonlocal);
1368 nbnxn_gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat,
1369 flags, eatLocal);
1370 wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1371 wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1374 if (bStateChanged && inputrecNeedMutot(inputrec))
1376 if (PAR(cr))
1378 gmx_sumd(2*DIM, mu, cr);
1379 ddReopenBalanceRegionCpu(cr->dd);
1382 for (i = 0; i < 2; i++)
1384 for (j = 0; j < DIM; j++)
1386 fr->mu_tot[i][j] = mu[i*DIM + j];
1390 if (fr->efep == efepNO)
1392 copy_rvec(fr->mu_tot[0], mu_tot);
1394 else
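/* With perturbed charges the total dipole is interpolated linearly in the coulomb lambda between the A- and B-state dipoles */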
1396 for (j = 0; j < DIM; j++)
1398 mu_tot[j] =
1399 (1.0 - lambda[efptCOUL])*fr->mu_tot[0][j] +
1400 lambda[efptCOUL]*fr->mu_tot[1][j];
1404 /* Reset energies */
1405 reset_enerdata(enerd);
1406 clear_rvecs(SHIFTS, fr->fshift);
1408 if (DOMAINDECOMP(cr) && !thisRankHasDuty(cr, DUTY_PME))
1410 wallcycle_start(wcycle, ewcPPDURINGPME);
1411 dd_force_flop_start(cr->dd, nrnb);
1414 if (inputrec->bRot)
1416 wallcycle_start(wcycle, ewcROT);
1417 do_rotation(cr, inputrec, box, as_rvec_array(x.data()), t, step, bNS);
1418 wallcycle_stop(wcycle, ewcROT);
1421 /* Temporary solution until all routines take PaddedRVecVector */
1422 rvec *f = as_rvec_array(force.data());
1424 /* Start the force cycle counter.
1425 * Note that a different counter is used for dynamic load balancing.
1427 wallcycle_start(wcycle, ewcFORCE);
1429 gmx::ArrayRef<gmx::RVec> forceRef = force;
1430 if (bDoForces)
1432 /* If we need to compute the virial, we might need a separate
1433 * force buffer for algorithms for which the virial is calculated
1434 * directly, such as PME.
1436 if ((flags & GMX_FORCE_VIRIAL) && fr->haveDirectVirialContributions)
1438 forceRef = *fr->forceBufferForDirectVirialContributions;
1440 /* We only compute forces on local atoms. Note that vsites can
1441 * spread to non-local atoms, but that part of the buffer is
1442 * cleared separately in the vsite spreading code.
1444 clear_rvecs_omp(mdatoms->homenr, as_rvec_array(forceRef.data()));
1446 /* Clear the short- and long-range forces */
1447 clear_rvecs_omp(fr->natoms_force_constr, f);
1450 /* forceWithVirial uses the local atom range only */
1451 ForceWithVirial forceWithVirial(forceRef, flags & GMX_FORCE_VIRIAL);
1453 if (inputrec->bPull && pull_have_constraint(inputrec->pull_work))
1455 clear_pull_forces(inputrec->pull_work);
1458 /* We calculate the non-bonded forces, when done on the CPU, here.
1459 * We do this before calling do_force_lowlevel, because in that
1460 * function, the listed forces are calculated before PME, which
1461 * does communication. With this order, non-bonded and listed
1462 * force calculation imbalance can be balanced out by the domain
1463 * decomposition load balancing.
1466 if (!bUseOrEmulGPU)
1468 do_nb_verlet(fr, ic, enerd, flags, eintLocal, enbvClearFYes,
1469 step, nrnb, wcycle);
1472 if (fr->efep != efepNO)
1474 /* Calculate the local and non-local free energy interactions here.
1475 * Happens here on the CPU both with and without GPU.
1477 if (fr->nbv->grp[eintLocal].nbl_lists.nbl_fep[0]->nrj > 0)
1479 do_nb_verlet_fep(&fr->nbv->grp[eintLocal].nbl_lists,
1480 fr, as_rvec_array(x.data()), f, mdatoms,
1481 inputrec->fepvals, lambda,
1482 enerd, flags, nrnb, wcycle);
1485 if (DOMAINDECOMP(cr) &&
1486 fr->nbv->grp[eintNonlocal].nbl_lists.nbl_fep[0]->nrj > 0)
1488 do_nb_verlet_fep(&fr->nbv->grp[eintNonlocal].nbl_lists,
1489 fr, as_rvec_array(x.data()), f, mdatoms,
1490 inputrec->fepvals, lambda,
1491 enerd, flags, nrnb, wcycle);
1495 if (!bUseOrEmulGPU)
1497 int aloc;
1499 if (DOMAINDECOMP(cr))
1501 do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFNo,
1502 step, nrnb, wcycle);
1505 if (!bUseOrEmulGPU)
1507 aloc = eintLocal;
1509 else
1511 aloc = eintNonlocal;
1514 /* Add all the non-bonded force to the normal force array.
1515 * This can be split into a local and a non-local part when overlapping
1516 * communication with calculation under domain decomposition.
1518 wallcycle_stop(wcycle, ewcFORCE);
1520 nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), eatAll, nbv->nbat, f, wcycle);
1522 wallcycle_start_nocount(wcycle, ewcFORCE);
1524 /* if there are multiple fshift output buffers reduce them */
1525 if ((flags & GMX_FORCE_VIRIAL) &&
1526 nbv->grp[aloc].nbl_lists.nnbl > 1)
1528 /* This is not in a subcounter because it takes a
1529 negligible and constant-sized amount of time */
1530 nbnxn_atomdata_add_nbat_fshift_to_fshift(nbv->nbat,
1531 fr->fshift);
1535 /* update QMMMrec, if necessary */
1536 if (fr->bQMMM)
1538 update_QMMMrec(cr, fr, as_rvec_array(x.data()), mdatoms, box);
1541 /* Compute the bonded and non-bonded energies and optionally forces */
1542 do_force_lowlevel(fr, inputrec, &(top->idef),
1543 cr, ms, nrnb, wcycle, mdatoms,
1544 as_rvec_array(x.data()), hist, f, &forceWithVirial, enerd, fcd,
1545 box, inputrec->fepvals, lambda, graph, &(top->excls), fr->mu_tot,
1546 flags, &cycles_pme);
1548 wallcycle_stop(wcycle, ewcFORCE);
1550 computeSpecialForces(fplog, cr, inputrec, awh, step, t, wcycle,
1551 fr->forceProviders, box, x, mdatoms, lambda,
1552 flags, &forceWithVirial, enerd,
1553 ed, bNS);
1555 if (bUseOrEmulGPU)
1557 /* wait for non-local forces (or calculate in emulation mode) */
1558 if (DOMAINDECOMP(cr))
1560 if (bUseGPU)
1562 wallcycle_start(wcycle, ewcWAIT_GPU_NB_NL);
1563 nbnxn_gpu_wait_finish_task(nbv->gpu_nbv,
1564 flags, eatNonlocal,
1565 enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
1566 fr->fshift);
1567 cycles_wait_gpu += wallcycle_stop(wcycle, ewcWAIT_GPU_NB_NL);
1569 else
1571 wallcycle_start_nocount(wcycle, ewcFORCE);
1572 do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFYes,
1573 step, nrnb, wcycle);
1574 wallcycle_stop(wcycle, ewcFORCE);
1577 /* skip the reduction if there was no non-local work to do */
1578 if (nbv->grp[eintNonlocal].nbl_lists.nbl[0]->nsci > 0)
1580 nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), eatNonlocal,
1581 nbv->nbat, f, wcycle);
1586 if (DOMAINDECOMP(cr))
1588 /* We are done with the CPU compute.
1589 * We will now communicate the non-local forces.
1590 * If we use a GPU this will overlap with GPU work, so in that case
1591 * we do not close the DD force balancing region here.
1593 if (ddCloseBalanceRegion == DdCloseBalanceRegionAfterForceComputation::yes)
1595 ddCloseBalanceRegionCpu(cr->dd);
1597 if (bDoForces)
1599 dd_move_f(cr->dd, f, fr->fshift, wcycle);
1603 // With both nonbonded and PME offloaded a GPU on the same rank, we use
1604 // an alternating wait/reduction scheme.
1605 bool alternateGpuWait = (!c_disableAlternatingWait && useGpuPme && bUseGPU && !DOMAINDECOMP(cr));
1606 if (alternateGpuWait)
1608 alternatePmeNbGpuWaitReduce(fr->nbv, fr->pmedata, &force, &forceWithVirial, fr->fshift, enerd, flags, wcycle);
1611 if (!alternateGpuWait && useGpuPme)
1613 gmx::ArrayRef<const gmx::RVec> pmeGpuForces;
1614 matrix vir_Q;
1615 real Vlr_q = 0.0;
1616 pme_gpu_wait_finish_task(fr->pmedata, wcycle, &pmeGpuForces, vir_Q, &Vlr_q);
1617 pme_gpu_reduce_outputs(wcycle, &forceWithVirial, pmeGpuForces, enerd, vir_Q, Vlr_q);
1620 /* Wait for local GPU NB outputs on the non-alternating wait path */
1621 if (!alternateGpuWait && bUseGPU)
1623 /* Measured overhead on CUDA and OpenCL with(out) GPU sharing
1624 * is between 0.5 and 1.5 Mcycles. So 2 MCycles is an overestimate,
1625 * but even with a step of 0.1 ms the difference is less than 1%
1626 * of the step time.
1628 const float gpuWaitApiOverheadMargin = 2e6f; /* cycles */
1630 wallcycle_start(wcycle, ewcWAIT_GPU_NB_L);
1631 nbnxn_gpu_wait_finish_task(nbv->gpu_nbv,
1632 flags, eatLocal,
1633 enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
1634 fr->fshift);
1635 float cycles_tmp = wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
1637 if (ddCloseBalanceRegion == DdCloseBalanceRegionAfterForceComputation::yes)
1639 DdBalanceRegionWaitedForGpu waitedForGpu = DdBalanceRegionWaitedForGpu::yes;
1640 if (bDoForces && cycles_tmp <= gpuWaitApiOverheadMargin)
1642 /* We measured few cycles, it could be that the kernel
1643 * and transfer finished earlier and there was no actual
1644 * wait time, only API call overhead.
1645 * Then the actual time could be anywhere between 0 and
1646 * cycles_wait_est. We will use half of cycles_wait_est.
1648 waitedForGpu = DdBalanceRegionWaitedForGpu::no;
1650 ddCloseBalanceRegionGpu(cr->dd, cycles_wait_gpu, waitedForGpu);
1654 if (fr->nbv->emulateGpu == EmulateGpuNonbonded::Yes)
1656 // NOTE: emulation kernel is not included in the balancing region,
1657 // but emulation mode does not target performance anyway
1658 wallcycle_start_nocount(wcycle, ewcFORCE);
1659 do_nb_verlet(fr, ic, enerd, flags, eintLocal,
1660 DOMAINDECOMP(cr) ? enbvClearFNo : enbvClearFYes,
1661 step, nrnb, wcycle);
1662 wallcycle_stop(wcycle, ewcFORCE);
1665 if (useGpuPme)
1667 pme_gpu_reinit_computation(fr->pmedata, wcycle);
1670 if (bUseGPU)
1672 /* now clear the GPU outputs while we finish the step on the CPU */
1673 wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
1674 wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1675 nbnxn_gpu_clear_outputs(nbv->gpu_nbv, flags);
1677 /* Is dynamic pair-list pruning activated? */
1678 if (nbv->listParams->useDynamicPruning)
1680 launchGpuRollingPruning(cr, nbv, inputrec, step);
1682 wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1683 wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1686 /* Do the nonbonded GPU (or emulation) force buffer reduction
1687 * on the non-alternating path. */
1688 if (bUseOrEmulGPU && !alternateGpuWait)
1690 nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), eatLocal,
1691 nbv->nbat, f, wcycle);
1694 if (DOMAINDECOMP(cr))
1696 dd_force_flop_stop(cr->dd, nrnb);
1699 if (bDoForces)
1701 /* If we have NoVirSum forces, but we do not calculate the virial,
1702 * we sum fr->f_novirsum=f later.
1704 if (vsite && !(fr->haveDirectVirialContributions && !(flags & GMX_FORCE_VIRIAL)))
1706 spread_vsite_f(vsite, as_rvec_array(x.data()), f, fr->fshift, FALSE, nullptr, nrnb,
1707 &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr, wcycle);
1710 if (flags & GMX_FORCE_VIRIAL)
1712 /* Calculation of the virial must be done after vsites! */
1713 calc_virial(0, mdatoms->homenr, as_rvec_array(x.data()), f,
1714 vir_force, graph, box, nrnb, fr, inputrec->ePBC);
1718 if (PAR(cr) && !thisRankHasDuty(cr, DUTY_PME))
1720 /* In case of node-splitting, the PP nodes receive the long-range
1721 * forces, virial and energy from the PME nodes here.
1723 pme_receive_force_ener(cr, &forceWithVirial, enerd, wcycle);
1726 if (bDoForces)
1728 post_process_forces(cr, step, nrnb, wcycle,
1729 top, box, as_rvec_array(x.data()), f, &forceWithVirial,
1730 vir_force, mdatoms, graph, fr, vsite,
1731 flags);
1734 if (flags & GMX_FORCE_ENERGY)
1736 /* Sum the potential energy terms from group contributions */
1737 sum_epot(&(enerd->grpp), enerd->term);
1739 if (!EI_TPI(inputrec->eI))
1741 checkPotentialEnergyValidity(step, *enerd, *inputrec);
1746 static void do_force_cutsGROUP(FILE *fplog,
1747 const t_commrec *cr,
1748 const gmx_multisim_t *ms,
1749 const t_inputrec *inputrec,
1750 gmx::Awh *awh,
1751 gmx_int64_t step,
1752 t_nrnb *nrnb,
1753 gmx_wallcycle_t wcycle,
1754 gmx_localtop_t *top,
1755 const gmx_groups_t *groups,
1756 matrix box, gmx::PaddedArrayRef<gmx::RVec> x,
1757 history_t *hist,
1758 gmx::PaddedArrayRef<gmx::RVec> force,
1759 tensor vir_force,
1760 const t_mdatoms *mdatoms,
1761 gmx_enerdata_t *enerd,
1762 t_fcdata *fcd,
1763 real *lambda,
1764 t_graph *graph,
1765 t_forcerec *fr,
1766 const gmx_vsite_t *vsite,
1767 rvec mu_tot,
1768 double t,
1769 const gmx_edsam *ed,
1770 int flags,
1771 DdOpenBalanceRegionBeforeForceComputation ddOpenBalanceRegion,
1772 DdCloseBalanceRegionAfterForceComputation ddCloseBalanceRegion)
1774 int cg0, cg1, i, j;
1775 double mu[2*DIM];
1776 gmx_bool bStateChanged, bNS, bFillGrid, bCalcCGCM;
1777 gmx_bool bDoForces;
1778 float cycles_pme;
1780 const int start = 0;
1781 const int homenr = mdatoms->homenr;
1783 clear_mat(vir_force);
1785 cg0 = 0;
1786 if (DOMAINDECOMP(cr))
1788 cg1 = cr->dd->ncg_tot;
1790 else
1792 cg1 = top->cgs.nr;
1794 if (fr->n_tpi > 0)
1796 cg1--;
1799 bStateChanged = (flags & GMX_FORCE_STATECHANGED);
1800 bNS = (flags & GMX_FORCE_NS) && (fr->bAllvsAll == FALSE);
1801 /* Should we perform the long-range nonbonded evaluation inside the neighborsearching? */
1802 bFillGrid = (bNS && bStateChanged);
1803 bCalcCGCM = (bFillGrid && !DOMAINDECOMP(cr));
1804 bDoForces = (flags & GMX_FORCE_FORCES);
1806 if (bStateChanged)
1808 update_forcerec(fr, box);
1810 if (inputrecNeedMutot(inputrec))
1812 /* Calculate total (local) dipole moment in a temporary common array.
1813 * This makes it possible to sum them over nodes faster.
1815 calc_mu(start, homenr,
1816 x, mdatoms->chargeA, mdatoms->chargeB, mdatoms->nChargePerturbed,
1817 mu, mu+DIM);
1821 if (fr->ePBC != epbcNONE)
1823 /* Compute shift vectors every step,
1824 * because of pressure coupling or box deformation!
1826 if ((flags & GMX_FORCE_DYNAMICBOX) && bStateChanged)
1828 calc_shifts(box, fr->shift_vec);
1831 if (bCalcCGCM)
1833 put_charge_groups_in_box(fplog, cg0, cg1, fr->ePBC, box,
1834 &(top->cgs), as_rvec_array(x.data()), fr->cg_cm);
1835 inc_nrnb(nrnb, eNR_CGCM, homenr);
1836 inc_nrnb(nrnb, eNR_RESETX, cg1-cg0);
1838 else if (EI_ENERGY_MINIMIZATION(inputrec->eI) && graph)
1840 unshift_self(graph, box, as_rvec_array(x.data()));
1843 else if (bCalcCGCM)
1845 calc_cgcm(fplog, cg0, cg1, &(top->cgs), as_rvec_array(x.data()), fr->cg_cm);
1846 inc_nrnb(nrnb, eNR_CGCM, homenr);
1849 if (bCalcCGCM && gmx_debug_at)
1851 pr_rvecs(debug, 0, "cgcm", fr->cg_cm, top->cgs.nr);
1854 #if GMX_MPI
1855 if (!thisRankHasDuty(cr, DUTY_PME))
1857 /* Send particle coordinates to the pme nodes.
1858 * Since this is only implemented for domain decomposition
1859 * and domain decomposition does not use the graph,
1860 * we do not need to worry about shifting.
1862 gmx_pme_send_coordinates(cr, box, as_rvec_array(x.data()),
1863 lambda[efptCOUL], lambda[efptVDW],
1864 (flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY)),
1865 step, wcycle);
1867 #endif /* GMX_MPI */
1869 /* Communicate coordinates and sum dipole if necessary */
1870 if (DOMAINDECOMP(cr))
1872 dd_move_x(cr->dd, box, as_rvec_array(x.data()), wcycle);
1874 /* No GPU support, no move_x overlap, so reopen the balance region here */
1875 if (ddOpenBalanceRegion == DdOpenBalanceRegionBeforeForceComputation::yes)
1877 ddReopenBalanceRegionCpu(cr->dd);
1881 if (inputrecNeedMutot(inputrec))
1883 if (bStateChanged)
1885 if (PAR(cr))
1887 gmx_sumd(2*DIM, mu, cr);
1888 ddReopenBalanceRegionCpu(cr->dd);
1890 for (i = 0; i < 2; i++)
1892 for (j = 0; j < DIM; j++)
1894 fr->mu_tot[i][j] = mu[i*DIM + j];
1898 if (fr->efep == efepNO)
1900 copy_rvec(fr->mu_tot[0], mu_tot);
1902 else
1904 for (j = 0; j < DIM; j++)
1906 mu_tot[j] =
1907 (1.0 - lambda[efptCOUL])*fr->mu_tot[0][j] + lambda[efptCOUL]*fr->mu_tot[1][j];
1912 /* Reset energies */
1913 reset_enerdata(enerd);
1914 clear_rvecs(SHIFTS, fr->fshift);
1916 if (bNS)
1918 wallcycle_start(wcycle, ewcNS);
1920 if (graph && bStateChanged)
1922 /* Calculate intramolecular shift vectors to make molecules whole */
1923 mk_mshift(fplog, graph, fr->ePBC, box, as_rvec_array(x.data()));
1926 /* Do the actual neighbour searching */
1927 ns(fplog, fr, box,
1928 groups, top, mdatoms,
1929 cr, nrnb, bFillGrid);
1931 wallcycle_stop(wcycle, ewcNS);
1934 if (DOMAINDECOMP(cr) && !thisRankHasDuty(cr, DUTY_PME))
1936 wallcycle_start(wcycle, ewcPPDURINGPME);
1937 dd_force_flop_start(cr->dd, nrnb);
1940 if (inputrec->bRot)
1942 wallcycle_start(wcycle, ewcROT);
1943 do_rotation(cr, inputrec, box, as_rvec_array(x.data()), t, step, bNS);
1944 wallcycle_stop(wcycle, ewcROT);
1947 /* Temporary solution until all routines take PaddedRVecVector */
1948 rvec *f = as_rvec_array(force.data());
1950 /* Start the force cycle counter.
1951 * Note that a different counter is used for dynamic load balancing.
1953 wallcycle_start(wcycle, ewcFORCE);
1955 gmx::ArrayRef<gmx::RVec> forceRef = force;
1956 if (bDoForces)
1958 /* If we need to compute the virial, we might need a separate
1959 * force buffer for algorithms for which the virial is calculated
1960 * separately, such as PME.
1962 if ((flags & GMX_FORCE_VIRIAL) && fr->haveDirectVirialContributions)
1964 forceRef = *fr->forceBufferForDirectVirialContributions;
1966 clear_rvecs_omp(forceRef.size(), as_rvec_array(forceRef.data()));
1969 /* Clear the short- and long-range forces */
1970 clear_rvecs(fr->natoms_force_constr, f);
1973 /* forceWithVirial might need the full force atom range */
1974 ForceWithVirial forceWithVirial(forceRef, flags & GMX_FORCE_VIRIAL);
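    /* forceWithVirial wraps either the normal force buffer or, when the virial
     * is requested and direct virial contributions are present (e.g. from PME),
     * the separate buffer cleared above, so that those contributions can have
     * their virial accounted for directly.
     */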
1976 if (inputrec->bPull && pull_have_constraint(inputrec->pull_work))
1978 clear_pull_forces(inputrec->pull_work);
1981 /* update QMMMrec, if necessary */
1982 if (fr->bQMMM)
1984 update_QMMMrec(cr, fr, as_rvec_array(x.data()), mdatoms, box);
1987 /* Compute the bonded and non-bonded energies and optionally forces */
1988 do_force_lowlevel(fr, inputrec, &(top->idef),
1989 cr, ms, nrnb, wcycle, mdatoms,
1990 as_rvec_array(x.data()), hist, f, &forceWithVirial, enerd, fcd,
1991 box, inputrec->fepvals, lambda,
1992 graph, &(top->excls), fr->mu_tot,
1993 flags,
1994 &cycles_pme);
1996 wallcycle_stop(wcycle, ewcFORCE);
1998 if (DOMAINDECOMP(cr))
2000 dd_force_flop_stop(cr->dd, nrnb);
2002 if (ddCloseBalanceRegion == DdCloseBalanceRegionAfterForceComputation::yes)
2004 ddCloseBalanceRegionCpu(cr->dd);
2008 computeSpecialForces(fplog, cr, inputrec, awh, step, t, wcycle,
2009 fr->forceProviders, box, x, mdatoms, lambda,
2010 flags, &forceWithVirial, enerd,
2011 ed, bNS);
2013 if (bDoForces)
2015 /* Communicate the forces */
2016 if (DOMAINDECOMP(cr))
2018 dd_move_f(cr->dd, f, fr->fshift, wcycle);
2019 /* Do we need to communicate the separate force array
2020 * for terms that do not contribute to the single sum virial?
2021 * Position restraints and electric fields do not introduce
2022 * inter-cg forces, only full electrostatics methods do.
2023 * When we do not calculate the virial, fr->f_novirsum = f,
2024 * so we have already communicated these forces.
2026 if (EEL_FULL(fr->ic->eeltype) && cr->dd->n_intercg_excl &&
2027 (flags & GMX_FORCE_VIRIAL))
2029 dd_move_f(cr->dd, as_rvec_array(forceWithVirial.force_.data()), nullptr, wcycle);
2033 /* If we have NoVirSum forces, but we do not calculate the virial,
2034 * we sum fr->f_novirsum=f later.
2036 if (vsite && !(fr->haveDirectVirialContributions && !(flags & GMX_FORCE_VIRIAL)))
2038 spread_vsite_f(vsite, as_rvec_array(x.data()), f, fr->fshift, FALSE, nullptr, nrnb,
2039 &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr, wcycle);
2042 if (flags & GMX_FORCE_VIRIAL)
2044 /* Calculation of the virial must be done after vsites! */
2045 calc_virial(0, mdatoms->homenr, as_rvec_array(x.data()), f,
2046 vir_force, graph, box, nrnb, fr, inputrec->ePBC);
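        /* Note that spread_vsite_f() above has already moved the forces from
         * the massless virtual sites onto their constructing atoms; computing
         * the r x F sum on the pre-spread forces would give a wrong virial.
         */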
2050 if (PAR(cr) && !thisRankHasDuty(cr, DUTY_PME))
2052 /* In case of node-splitting, the PP nodes receive the long-range
2053 * forces, virial and energy from the PME nodes here.
2055 pme_receive_force_ener(cr, &forceWithVirial, enerd, wcycle);
2058 if (bDoForces)
2060 post_process_forces(cr, step, nrnb, wcycle,
2061 top, box, as_rvec_array(x.data()), f, &forceWithVirial,
2062 vir_force, mdatoms, graph, fr, vsite,
2063 flags);
2066 if (flags & GMX_FORCE_ENERGY)
2068 /* Sum the potential energy terms from group contributions */
2069 sum_epot(&(enerd->grpp), enerd->term);
2071 if (!EI_TPI(inputrec->eI))
2073 checkPotentialEnergyValidity(step, *enerd, *inputrec);
2079 void do_force(FILE *fplog,
2080 const t_commrec *cr,
2081 const gmx_multisim_t *ms,
2082 const t_inputrec *inputrec,
2083 gmx::Awh *awh,
2084 gmx_int64_t step,
2085 t_nrnb *nrnb,
2086 gmx_wallcycle_t wcycle,
2087 gmx_localtop_t *top,
2088 const gmx_groups_t *groups,
2089 matrix box,
2090 gmx::PaddedArrayRef<gmx::RVec> x,
2091 history_t *hist,
2092 gmx::PaddedArrayRef<gmx::RVec> force,
2093 tensor vir_force,
2094 const t_mdatoms *mdatoms,
2095 gmx_enerdata_t *enerd,
2096 t_fcdata *fcd,
2097 gmx::ArrayRef<real> lambda,
2098 t_graph *graph,
2099 t_forcerec *fr,
2100 const gmx_vsite_t *vsite,
2101 rvec mu_tot,
2102 double t,
2103 const gmx_edsam *ed,
2104 int flags,
2105 DdOpenBalanceRegionBeforeForceComputation ddOpenBalanceRegion,
2106 DdCloseBalanceRegionAfterForceComputation ddCloseBalanceRegion)
2108 /* modify force flag if not doing nonbonded */
2109 if (!fr->bNonbonded)
2111 flags &= ~GMX_FORCE_NONBONDED;
2114 GMX_ASSERT(x.size() >= gmx::paddedRVecVectorSize(fr->natoms_force), "coordinates should be padded");
2115 GMX_ASSERT(force.size() >= gmx::paddedRVecVectorSize(fr->natoms_force), "force should be padded");
2117 switch (inputrec->cutoff_scheme)
2119 case ecutsVERLET:
2120 do_force_cutsVERLET(fplog, cr, ms, inputrec,
2121 awh, step, nrnb, wcycle,
2122 top,
2123 groups,
2124 box, x, hist,
2125 force, vir_force,
2126 mdatoms,
2127 enerd, fcd,
2128 lambda.data(), graph,
2129 fr, fr->ic,
2130 vsite, mu_tot,
2131 t, ed,
2132 flags,
2133 ddOpenBalanceRegion,
2134 ddCloseBalanceRegion);
2135 break;
2136 case ecutsGROUP:
2137 do_force_cutsGROUP(fplog, cr, ms, inputrec,
2138 awh, step, nrnb, wcycle,
2139 top,
2140 groups,
2141 box, x, hist,
2142 force, vir_force,
2143 mdatoms,
2144 enerd, fcd,
2145 lambda.data(), graph,
2146 fr, vsite, mu_tot,
2147 t, ed,
2148 flags,
2149 ddOpenBalanceRegion,
2150 ddCloseBalanceRegion);
2151 break;
2152 default:
2153 gmx_incons("Invalid cut-off scheme passed!");
2156 /* In case we don't have constraints and are using GPUs, the next balancing
2157 * region starts here.
2158 * Some "special" work at the end of do_force_cuts?, such as vsite spread,
2160 * virial calculation and COM pulling, is thus not included in
2160 * the balance timing, which is OK as most of these tasks involve communication.
2162 if (ddOpenBalanceRegion == DdOpenBalanceRegionBeforeForceComputation::yes)
2164 ddOpenBalanceRegionCpu(cr->dd, DdAllowBalanceRegionReopen::no);
2169 void do_constrain_first(FILE *fplog, Constraints *constr,
2170 t_inputrec *ir, t_mdatoms *md,
2171 t_state *state, const t_commrec *cr,
2172 const gmx_multisim_t *ms,
2173 t_nrnb *nrnb,
2174 t_forcerec *fr, gmx_localtop_t *top)
2176 int i, m, start, end;
2177 gmx_int64_t step;
2178 real dt = ir->delta_t;
2179 real dvdl_dum;
2180 rvec *savex;
2182 /* We need to allocate one element extra, since we might use
2183 * (unaligned) 4-wide SIMD loads to access rvec entries.
2185 snew(savex, state->natoms + 1);
2187 start = 0;
2188 end = md->homenr;
2190 if (debug)
2192 fprintf(debug, "vcm: start=%d, homenr=%d, end=%d\n",
2193 start, md->homenr, end);
2195 /* Do a first constraint pass to reset the particles... */
2196 step = ir->init_step;
2197 if (fplog)
2199 char buf[STEPSTRSIZE];
2200 fprintf(fplog, "\nConstraining the starting coordinates (step %s)\n",
2201 gmx_step_str(step, buf));
2203 dvdl_dum = 0;
2205 /* constrain the current position */
2206 constrain(nullptr, TRUE, FALSE, constr, &(top->idef),
2207 ir, cr, ms, step, 0, 1.0, md,
2208 as_rvec_array(state->x.data()), as_rvec_array(state->x.data()), nullptr,
2209 fr->bMolPBC, state->box,
2210 state->lambda[efptBONDED], &dvdl_dum,
2211 nullptr, nullptr, nrnb, econqCoord);
2212 if (EI_VV(ir->eI))
2215 /* constrain the initial velocity, and save it */
2215 /* also may be useful if we need the ekin from the halfstep for velocity verlet */
2216 constrain(nullptr, TRUE, FALSE, constr, &(top->idef),
2217 ir, cr, ms, step, 0, 1.0, md,
2218 as_rvec_array(state->x.data()), as_rvec_array(state->v.data()), as_rvec_array(state->v.data()),
2219 fr->bMolPBC, state->box,
2220 state->lambda[efptBONDED], &dvdl_dum,
2221 nullptr, nullptr, nrnb, econqVeloc);
2223 /* constrain the initial velocities at t-dt/2 */
2224 if (EI_STATE_VELOCITY(ir->eI) && ir->eI != eiVV)
2226 for (i = start; (i < end); i++)
2228 for (m = 0; (m < DIM); m++)
2230 /* Reverse the velocity */
2231 state->v[i][m] = -state->v[i][m];
2232 /* Store the position at t-dt in savex */
2233 savex[i][m] = state->x[i][m] + dt*state->v[i][m];
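                /* With v already reversed, x + dt*v equals x - dt*v_orig, so
                 * savex holds the unconstrained positions at t-dt, which are
                 * constrained below using the t=0 positions as reference.
                 */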
2236 /* Shake the positions at t=-dt with the positions at t=0
2237 * as reference coordinates.
2239 if (fplog)
2241 char buf[STEPSTRSIZE];
2242 fprintf(fplog, "\nConstraining the coordinates at t0-dt (step %s)\n",
2243 gmx_step_str(step, buf));
2245 dvdl_dum = 0;
2246 constrain(nullptr, TRUE, FALSE, constr, &(top->idef),
2247 ir, cr, ms, step, -1, 1.0, md,
2248 as_rvec_array(state->x.data()), savex, nullptr,
2249 fr->bMolPBC, state->box,
2250 state->lambda[efptBONDED], &dvdl_dum,
2251 as_rvec_array(state->v.data()), nullptr, nrnb, econqCoord);
2253 for (i = start; i < end; i++)
2255 for (m = 0; m < DIM; m++)
2257 /* Re-reverse the velocities */
2258 state->v[i][m] = -state->v[i][m];
2262 sfree(savex);
2266 static void
2267 integrate_table(real vdwtab[], real scale, int offstart, int rstart, int rend,
2268 double *enerout, double *virout)
2270 double enersum, virsum;
2271 double invscale, invscale2, invscale3;
2272 double r, ea, eb, ec, pa, pb, pc, pd;
2273 double y0, f, g, h;
2274 int ri, offset;
2275 double tabfactor;
2277 invscale = 1.0/scale;
2278 invscale2 = invscale*invscale;
2279 invscale3 = invscale*invscale2;
2281 /* Following summation derived from cubic spline definition,
2282 * Numerical Recipes in C, second edition, p. 113-116. Exact for
2283 * the cubic spline. We first calculate the negative of the
2284 * energy from rvdw to rvdw_switch, assuming that g(r)=1, and then
2285 * add the more standard, abrupt cutoff correction to that result,
2286 * yielding the long-range correction for a switched function. We
2287 * perform both the pressure and energy loops at the same time for
2288 * simplicity, as the computational cost is low. */
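    /* In terms of the radial coordinate this accumulates, per table bin,
     *   enersum += int_bin V(r) r^2 dr
     *   virsum  += int_bin r^3 dV/dr dr
     * with V given on each bin by the cubic spline y0 + f*t + g*t^2 + h*t^3
     * in the bin-local coordinate t in [0,1]; expanding the powers of
     * r = (ri + t)/scale yields the ea/eb/ec and pa/pb/pc/pd terms below.
     * The 4*pi prefactor applied at the end turns these into the full
     * three-dimensional integrals.
     */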
2290 if (offstart == 0)
2292 /* Since the dispersion table has been scaled down a factor
2293 * 6.0 and the repulsion a factor 12.0 to compensate for the
2294 * c6/c12 parameters inside nbfp[] being scaled up (to save
2295 * flops in kernels), we need to correct for this.
2297 tabfactor = 6.0;
2299 else
2301 tabfactor = 12.0;
2304 enersum = 0.0;
2305 virsum = 0.0;
2306 for (ri = rstart; ri < rend; ++ri)
2308 r = ri*invscale;
2309 ea = invscale3;
2310 eb = 2.0*invscale2*r;
2311 ec = invscale*r*r;
2313 pa = invscale3;
2314 pb = 3.0*invscale2*r;
2315 pc = 3.0*invscale*r*r;
2316 pd = r*r*r;
2318 /* this "8" is from the packing in the vdwtab array - perhaps
2319 should be defined? */
2321 offset = 8*ri + offstart;
2322 y0 = vdwtab[offset];
2323 f = vdwtab[offset+1];
2324 g = vdwtab[offset+2];
2325 h = vdwtab[offset+3];
2327 enersum += y0*(ea/3 + eb/2 + ec) + f*(ea/4 + eb/3 + ec/2) + g*(ea/5 + eb/4 + ec/3) + h*(ea/6 + eb/5 + ec/4);
2328 virsum += f*(pa/4 + pb/3 + pc/2 + pd) + 2*g*(pa/5 + pb/4 + pc/3 + pd/2) + 3*h*(pa/6 + pb/5 + pc/4 + pd/3);
2330 *enerout = 4.0*M_PI*enersum*tabfactor;
2331 *virout = 4.0*M_PI*virsum*tabfactor;
2334 void calc_enervirdiff(FILE *fplog, int eDispCorr, t_forcerec *fr)
2336 double eners[2], virs[2], enersum, virsum;
2337 double r0, rc3, rc9;
2338 int ri0, ri1, i;
2339 real scale, *vdwtab;
2341 fr->enershiftsix = 0;
2342 fr->enershifttwelve = 0;
2343 fr->enerdiffsix = 0;
2344 fr->enerdifftwelve = 0;
2345 fr->virdiffsix = 0;
2346 fr->virdifftwelve = 0;
2348 const interaction_const_t *ic = fr->ic;
2350 if (eDispCorr != edispcNO)
2352 for (i = 0; i < 2; i++)
2354 eners[i] = 0;
2355 virs[i] = 0;
2357 if ((ic->vdw_modifier == eintmodPOTSHIFT) ||
2358 (ic->vdw_modifier == eintmodPOTSWITCH) ||
2359 (ic->vdw_modifier == eintmodFORCESWITCH) ||
2360 (ic->vdwtype == evdwSHIFT) ||
2361 (ic->vdwtype == evdwSWITCH))
2363 if (((ic->vdw_modifier == eintmodPOTSWITCH) ||
2364 (ic->vdw_modifier == eintmodFORCESWITCH) ||
2365 (ic->vdwtype == evdwSWITCH)) && ic->rvdw_switch == 0)
2367 gmx_fatal(FARGS,
2368 "With dispersion correction rvdw-switch can not be zero "
2369 "for vdw-type = %s", evdw_names[ic->vdwtype]);
2372 /* TODO This code depends on the logic in tables.c that
2373 constructs the table layout, which should be made
2374 explicit in future cleanup. */
2375 GMX_ASSERT(fr->dispersionCorrectionTable->interaction == GMX_TABLE_INTERACTION_VDWREP_VDWDISP,
2376 "Dispersion-correction code needs a table with both repulsion and dispersion terms");
2377 scale = fr->dispersionCorrectionTable->scale;
2378 vdwtab = fr->dispersionCorrectionTable->data;
2380 /* Round the cut-offs to exact table values for precision */
2381 ri0 = static_cast<int>(floor(ic->rvdw_switch*scale));
2382 ri1 = static_cast<int>(ceil(ic->rvdw*scale));
2384 /* The code below has some support for handling force-switching, i.e.
2385 * when the force (instead of potential) is switched over a limited
2386 * region. This leads to a constant shift in the potential inside the
2387 * switching region, which we can handle by adding a constant energy
2388 * term in the force-switch case just like when we do potential-shift.
2390 * For now this is not enabled, but to keep the functionality in the
2391 * code we check separately for switch and shift. When we do force-switch
2392 * the shifting point is rvdw_switch, while it is the cutoff when we
2393 * have a classical potential-shift.
2395 * For a pure potential-shift the potential has a constant shift
2396 * all the way out to the cutoff, and that is it. For other forms
2397 * we need to calculate the constant shift up to the point where we
2398 * start modifying the potential.
2400 ri0 = (ic->vdw_modifier == eintmodPOTSHIFT) ? ri1 : ri0;
2402 r0 = ri0/scale;
2403 rc3 = r0*r0*r0;
2404 rc9 = rc3*rc3*rc3;
2406 if ((ic->vdw_modifier == eintmodFORCESWITCH) ||
2407 (ic->vdwtype == evdwSHIFT))
2409 /* Determine the constant energy shift below rvdw_switch.
2410 * Table has a scale factor since we have scaled it down to compensate
2411 * for scaling-up c6/c12 with the derivative factors to save flops in analytical kernels.
2413 fr->enershiftsix = (real)(-1.0/(rc3*rc3)) - 6.0*vdwtab[8*ri0];
2414 fr->enershifttwelve = (real)( 1.0/(rc9*rc3)) - 12.0*vdwtab[8*ri0 + 4];
2416 else if (ic->vdw_modifier == eintmodPOTSHIFT)
2418 fr->enershiftsix = (real)(-1.0/(rc3*rc3));
2419 fr->enershifttwelve = (real)( 1.0/(rc9*rc3));
2422 /* Add the constant part from 0 to rvdw_switch.
2423 * This integration from 0 to rvdw_switch overcounts the number
2424 * of interactions by 1, as it also counts the self interaction.
2425 * We will correct for this later.
2427 eners[0] += 4.0*M_PI*fr->enershiftsix*rc3/3.0;
2428 eners[1] += 4.0*M_PI*fr->enershifttwelve*rc3/3.0;
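            /* These terms are int_0^r0 4*pi*r^2*enershift dr = (4*pi/3)*enershift*r0^3,
             * i.e. inside r0 the modified potential differs from the plain
             * 1/r^6 and 1/r^12 forms only by the constant shifts computed above
             * (with g(r) = 1 assumed, as noted in integrate_table()).
             */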
2430 /* Calculate the contribution in the range [r0,r1] where we
2431 * modify the potential. For a pure potential-shift modifier we will
2432 * have ri0==ri1, and there will not be any contribution here.
2434 for (i = 0; i < 2; i++)
2436 enersum = 0;
2437 virsum = 0;
2438 integrate_table(vdwtab, scale, (i == 0 ? 0 : 4), ri0, ri1, &enersum, &virsum);
2439 eners[i] -= enersum;
2440 virs[i] -= virsum;
2443 /* Alright: Above we compensated by REMOVING the parts outside r0
2444 * corresponding to the ideal VdW 1/r6 and 1/r12 potentials.
2446 * Regardless of whether r0 is the point where we start switching,
2447 * or the cutoff where we calculated the constant shift, we include
2448 * all the parts we are missing out to infinity from r0 by
2449 * calculating the analytical dispersion correction.
2451 eners[0] += -4.0*M_PI/(3.0*rc3);
2452 eners[1] += 4.0*M_PI/(9.0*rc9);
2453 virs[0] += 8.0*M_PI/rc3;
2454 virs[1] += -16.0*M_PI/(3.0*rc9);
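            /* Analytic tail integrals for g(r) = 1:
             *   int_r0^inf 4*pi*r^2 * (-1/r^6)  dr = -4*pi/(3*r0^3)
             *   int_r0^inf 4*pi*r^2 * ( 1/r^12) dr =  4*pi/(9*r0^9)
             * and for the virial integrand r^3 * dV/dr:
             *   dispersion: int_r0^inf 4*pi*r^3 * ( 6/r^7)   dr =  8*pi/r0^3
             *   repulsion:  int_r0^inf 4*pi*r^3 * (-12/r^13) dr = -16*pi/(3*r0^9)
             */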
2456 else if (ic->vdwtype == evdwCUT ||
2457 EVDW_PME(ic->vdwtype) ||
2458 ic->vdwtype == evdwUSER)
2460 if (ic->vdwtype == evdwUSER && fplog)
2462 fprintf(fplog,
2463 "WARNING: using dispersion correction with user tables\n");
2466 /* Note that with LJ-PME, the dispersion correction is multiplied
2467 * by the difference between the actual C6 and the value of C6
2468 * that would produce the combination rule.
2469 * This means the normal energy and virial difference formulas
2470 * can be used here.
2473 rc3 = ic->rvdw*ic->rvdw*ic->rvdw;
2474 rc9 = rc3*rc3*rc3;
2475 /* Contribution beyond the cut-off */
2476 eners[0] += -4.0*M_PI/(3.0*rc3);
2477 eners[1] += 4.0*M_PI/(9.0*rc9);
2478 if (ic->vdw_modifier == eintmodPOTSHIFT)
2480 /* Contribution within the cut-off */
2481 eners[0] += -4.0*M_PI/(3.0*rc3);
2482 eners[1] += 4.0*M_PI/(3.0*rc9);
2484 /* Contribution beyond the cut-off */
2485 virs[0] += 8.0*M_PI/rc3;
2486 virs[1] += -16.0*M_PI/(3.0*rc9);
2488 else
2490 gmx_fatal(FARGS,
2491 "Dispersion correction is not implemented for vdw-type = %s",
2492 evdw_names[ic->vdwtype]);
2495 /* When we deprecate the group kernels the code below can go too */
2496 if (ic->vdwtype == evdwPME && fr->cutoff_scheme == ecutsGROUP)
2498 /* Calculate self-interaction coefficient (assuming that
2499 * the reciprocal-space contribution is constant in the
2500 * region that contributes to the self-interaction).
2502 fr->enershiftsix = gmx::power6(ic->ewaldcoeff_lj) / 6.0;
2504 eners[0] += -gmx::power3(std::sqrt(M_PI)*ic->ewaldcoeff_lj)/3.0;
2505 virs[0] += gmx::power3(std::sqrt(M_PI)*ic->ewaldcoeff_lj);
2508 fr->enerdiffsix = eners[0];
2509 fr->enerdifftwelve = eners[1];
2510 /* The 0.5 is due to the Gromacs definition of the virial */
2511 fr->virdiffsix = 0.5*virs[0];
2512 fr->virdifftwelve = 0.5*virs[1];
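        /* GROMACS defines the virial as Xi = -1/2 sum_i r_i (x) F_i, hence the
         * 0.5 here and the compensating factor 2 in the pressure correction in
         * calc_dispcorr() below.
         */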
2516 void calc_dispcorr(t_inputrec *ir, t_forcerec *fr,
2517 matrix box, real lambda, tensor pres, tensor virial,
2518 real *prescorr, real *enercorr, real *dvdlcorr)
2520 gmx_bool bCorrAll, bCorrPres;
2521 real dvdlambda, invvol, dens, ninter, avcsix, avctwelve, enerdiff, svir = 0, spres = 0;
2522 int m;
2524 *prescorr = 0;
2525 *enercorr = 0;
2526 *dvdlcorr = 0;
2528 clear_mat(virial);
2529 clear_mat(pres);
2531 if (ir->eDispCorr != edispcNO)
2533 bCorrAll = (ir->eDispCorr == edispcAllEner ||
2534 ir->eDispCorr == edispcAllEnerPres);
2535 bCorrPres = (ir->eDispCorr == edispcEnerPres ||
2536 ir->eDispCorr == edispcAllEnerPres);
2538 invvol = 1/det(box);
2539 if (fr->n_tpi)
2541 /* Only correct for the interactions with the inserted molecule */
2542 dens = (fr->numAtomsForDispersionCorrection - fr->n_tpi)*invvol;
2543 ninter = fr->n_tpi;
2545 else
2547 dens = fr->numAtomsForDispersionCorrection*invvol;
2548 ninter = 0.5*fr->numAtomsForDispersionCorrection;
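            /* Each atom interacts with the homogeneous density of all atoms;
             * the factor 0.5 avoids counting every pair twice. With test-particle
             * insertion (above) only the inserted atoms contribute interactions.
             */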
2551 if (ir->efep == efepNO)
2553 avcsix = fr->avcsix[0];
2554 avctwelve = fr->avctwelve[0];
2556 else
2558 avcsix = (1 - lambda)*fr->avcsix[0] + lambda*fr->avcsix[1];
2559 avctwelve = (1 - lambda)*fr->avctwelve[0] + lambda*fr->avctwelve[1];
2562 enerdiff = ninter*(dens*fr->enerdiffsix - fr->enershiftsix);
2563 *enercorr += avcsix*enerdiff;
2564 dvdlambda = 0.0;
2565 if (ir->efep != efepNO)
2567 dvdlambda += (fr->avcsix[1] - fr->avcsix[0])*enerdiff;
2569 if (bCorrAll)
2571 enerdiff = ninter*(dens*fr->enerdifftwelve - fr->enershifttwelve);
2572 *enercorr += avctwelve*enerdiff;
2573 if (fr->efep != efepNO)
2575 dvdlambda += (fr->avctwelve[1] - fr->avctwelve[0])*enerdiff;
2579 if (bCorrPres)
2581 svir = ninter*dens*avcsix*fr->virdiffsix/3.0;
2582 if (ir->eDispCorr == edispcAllEnerPres)
2584 svir += ninter*dens*avctwelve*fr->virdifftwelve/3.0;
2586 /* The factor 2 is because of the Gromacs virial definition */
2587 spres = -2.0*invvol*svir*PRESFAC;
2589 for (m = 0; m < DIM; m++)
2591 virial[m][m] += svir;
2592 pres[m][m] += spres;
2594 *prescorr += spres;
2597 /* Can't currently control when this prints; for now, just print when debugging */
2598 if (debug)
2600 if (bCorrAll)
2602 fprintf(debug, "Long Range LJ corr.: <C6> %10.4e, <C12> %10.4e\n",
2603 avcsix, avctwelve);
2605 if (bCorrPres)
2607 fprintf(debug,
2608 "Long Range LJ corr.: Epot %10g, Pres: %10g, Vir: %10g\n",
2609 *enercorr, spres, svir);
2611 else
2613 fprintf(debug, "Long Range LJ corr.: Epot %10g\n", *enercorr);
2617 if (fr->efep != efepNO)
2619 *dvdlcorr += dvdlambda;
2624 static void low_do_pbc_mtop(FILE *fplog, int ePBC, const matrix box,
2625 const gmx_mtop_t *mtop, rvec x[],
2626 gmx_bool bFirst)
2628 t_graph *graph;
2629 int as, mol;
2631 if (bFirst && fplog)
2633 fprintf(fplog, "Removing pbc first time\n");
2636 snew(graph, 1);
2637 as = 0;
2638 for (const gmx_molblock_t &molb : mtop->molblock)
2640 const gmx_moltype_t &moltype = mtop->moltype[molb.type];
2641 if (moltype.atoms.nr == 1 ||
2642 (!bFirst && moltype.cgs.nr == 1))
2644 /* Just one atom or charge group in the molecule, no PBC required */
2645 as += molb.nmol*moltype.atoms.nr;
2647 else
2649 /* Pass nullptr instead of fplog to avoid graph prints for each molecule type */
2650 mk_graph_ilist(nullptr, moltype.ilist,
2651 0, moltype.atoms.nr, FALSE, FALSE, graph);
2653 for (mol = 0; mol < molb.nmol; mol++)
2655 mk_mshift(fplog, graph, ePBC, box, x+as);
2657 shift_self(graph, box, x+as);
2658 /* The molecule is whole now.
2659 * We don't need the second mk_mshift call as in do_pbc_first,
2660 * since we no longer need this graph.
2663 as += moltype.atoms.nr;
2665 done_graph(graph);
2668 sfree(graph);
2671 void do_pbc_first_mtop(FILE *fplog, int ePBC, const matrix box,
2672 const gmx_mtop_t *mtop, rvec x[])
2674 low_do_pbc_mtop(fplog, ePBC, box, mtop, x, TRUE);
2677 void do_pbc_mtop(FILE *fplog, int ePBC, const matrix box,
2678 const gmx_mtop_t *mtop, rvec x[])
2680 low_do_pbc_mtop(fplog, ePBC, box, mtop, x, FALSE);
2683 void put_atoms_in_box_omp(int ePBC, const matrix box, gmx::ArrayRef<gmx::RVec> x)
2685 int t, nth;
2686 nth = gmx_omp_nthreads_get(emntDefault);
2688 #pragma omp parallel for num_threads(nth) schedule(static)
2689 for (t = 0; t < nth; t++)
2693 size_t natoms = x.size();
2694 size_t offset = (natoms*t )/nth;
2695 size_t len = (natoms*(t + 1))/nth - offset;
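            /* This integer arithmetic splits [0, natoms) into nth contiguous,
             * non-overlapping ranges that together cover all atoms, so each
             * thread handles a disjoint slice. */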
2696 put_atoms_in_box(ePBC, box, x.subArray(offset, len));
2698 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
2702 // TODO This can be cleaned up a lot and moved back to runner.cpp
2703 void finish_run(FILE *fplog, const gmx::MDLogger &mdlog, const t_commrec *cr,
2704 const t_inputrec *inputrec,
2705 t_nrnb nrnb[], gmx_wallcycle_t wcycle,
2706 gmx_walltime_accounting_t walltime_accounting,
2707 nonbonded_verlet_t *nbv,
2708 gmx_pme_t *pme,
2709 gmx_bool bWriteStat)
2711 t_nrnb *nrnb_tot = nullptr;
2712 double delta_t = 0;
2713 double nbfs = 0, mflop = 0;
2714 double elapsed_time,
2715 elapsed_time_over_all_ranks,
2716 elapsed_time_over_all_threads,
2717 elapsed_time_over_all_threads_over_all_ranks;
2718 /* Control whether it is valid to print a report. Only the
2719 simulation master may print, but it should not do so if the run
2720 terminated e.g. before a scheduled reset step. This is
2721 complicated by the fact that PME ranks are unaware of the
2722 reason why they were sent a pmerecvqxFINISH. To avoid
2723 communication deadlocks, we always do the communication for the
2724 report, even if we've decided not to write the report, because
2725 how long it takes to finish the run is not important when we've
2726 decided not to report on the simulation performance.
2728 Further, we only report performance for dynamical integrators,
2729 because those are the only ones for which we plan to
2730 consider doing any optimizations. */
2731 bool printReport = EI_DYNAMICS(inputrec->eI) && SIMMASTER(cr);
2733 if (printReport && !walltime_accounting_get_valid_finish(walltime_accounting))
2735 GMX_LOG(mdlog.warning).asParagraph().appendText("Simulation ended prematurely, no performance report will be written.");
2736 printReport = false;
2739 if (cr->nnodes > 1)
2741 snew(nrnb_tot, 1);
2742 #if GMX_MPI
2743 MPI_Allreduce(nrnb->n, nrnb_tot->n, eNRNB, MPI_DOUBLE, MPI_SUM,
2744 cr->mpi_comm_mysim);
2745 #endif
2747 else
2749 nrnb_tot = nrnb;
2752 elapsed_time = walltime_accounting_get_elapsed_time(walltime_accounting);
2753 elapsed_time_over_all_ranks = elapsed_time;
2754 elapsed_time_over_all_threads = walltime_accounting_get_elapsed_time_over_all_threads(walltime_accounting);
2755 elapsed_time_over_all_threads_over_all_ranks = elapsed_time_over_all_threads;
2756 #if GMX_MPI
2757 if (cr->nnodes > 1)
2759 /* reduce elapsed_time over all MPI ranks in the current simulation */
2760 MPI_Allreduce(&elapsed_time,
2761 &elapsed_time_over_all_ranks,
2762 1, MPI_DOUBLE, MPI_SUM,
2763 cr->mpi_comm_mysim);
2764 elapsed_time_over_all_ranks /= cr->nnodes;
2765 /* Reduce elapsed_time_over_all_threads over all MPI ranks in the
2766 * current simulation. */
2767 MPI_Allreduce(&elapsed_time_over_all_threads,
2768 &elapsed_time_over_all_threads_over_all_ranks,
2769 1, MPI_DOUBLE, MPI_SUM,
2770 cr->mpi_comm_mysim);
2772 #endif
2774 if (printReport)
2776 print_flop(fplog, nrnb_tot, &nbfs, &mflop);
2778 if (cr->nnodes > 1)
2780 sfree(nrnb_tot);
2783 if (thisRankHasDuty(cr, DUTY_PP) && DOMAINDECOMP(cr))
2785 print_dd_statistics(cr, inputrec, fplog);
2788 /* TODO Move the responsibility for any scaling by thread counts
2789 * to the code that handled the thread region, so that there's a
2790 * mechanism to keep cycle counting working during the transition
2791 * to task parallelism. */
2792 int nthreads_pp = gmx_omp_nthreads_get(emntNonbonded);
2793 int nthreads_pme = gmx_omp_nthreads_get(emntPME);
2794 wallcycle_scale_by_num_threads(wcycle, thisRankHasDuty(cr, DUTY_PME) && !thisRankHasDuty(cr, DUTY_PP), nthreads_pp, nthreads_pme);
2795 auto cycle_sum(wallcycle_sum(cr, wcycle));
2797 if (printReport)
2799 auto nbnxn_gpu_timings = use_GPU(nbv) ? nbnxn_gpu_get_timings(nbv->gpu_nbv) : nullptr;
2800 gmx_wallclock_gpu_pme_t pme_gpu_timings = {};
2801 if (pme_gpu_task_enabled(pme))
2803 pme_gpu_get_timings(pme, &pme_gpu_timings);
2805 wallcycle_print(fplog, mdlog, cr->nnodes, cr->npmenodes, nthreads_pp, nthreads_pme,
2806 elapsed_time_over_all_ranks,
2807 wcycle, cycle_sum,
2808 nbnxn_gpu_timings,
2809 &pme_gpu_timings);
2811 if (EI_DYNAMICS(inputrec->eI))
2813 delta_t = inputrec->delta_t;
2816 if (fplog)
2818 print_perf(fplog, elapsed_time_over_all_threads_over_all_ranks,
2819 elapsed_time_over_all_ranks,
2820 walltime_accounting_get_nsteps_done(walltime_accounting),
2821 delta_t, nbfs, mflop);
2823 if (bWriteStat)
2825 print_perf(stderr, elapsed_time_over_all_threads_over_all_ranks,
2826 elapsed_time_over_all_ranks,
2827 walltime_accounting_get_nsteps_done(walltime_accounting),
2828 delta_t, nbfs, mflop);
2833 extern void initialize_lambdas(FILE *fplog, t_inputrec *ir, int *fep_state, gmx::ArrayRef<real> lambda, double *lam0)
2835 /* this function works, but could probably use a logic rewrite to keep all the different
2836 types of efep straight. */
2838 if ((ir->efep == efepNO) && (ir->bSimTemp == FALSE))
2840 return;
2843 t_lambda *fep = ir->fepvals;
2844 *fep_state = fep->init_fep_state; /* this might overwrite the checkpoint
2845 if checkpoint is set -- a kludge is in for now
2846 to prevent this.*/
2848 for (int i = 0; i < efptNR; i++)
2850 /* overwrite lambda state with init_lambda for now for backwards compatibility */
2851 if (fep->init_lambda >= 0) /* if it's -1, it was never initialized */
2853 lambda[i] = fep->init_lambda;
2854 if (lam0)
2856 lam0[i] = lambda[i];
2859 else
2861 lambda[i] = fep->all_lambda[i][*fep_state];
2862 if (lam0)
2864 lam0[i] = lambda[i];
2868 if (ir->bSimTemp)
2870 /* need to rescale control temperatures to match current state */
2871 for (int i = 0; i < ir->opts.ngtc; i++)
2873 if (ir->opts.ref_t[i] > 0)
2875 ir->opts.ref_t[i] = ir->simtempvals->temperatures[*fep_state];
2880 /* Send to the log the information on the current lambdas */
2881 if (fplog != nullptr)
2883 fprintf(fplog, "Initial vector of lambda components:[ ");
2884 for (const auto &l : lambda)
2886 fprintf(fplog, "%10.4f ", l);
2888 fprintf(fplog, "]\n");
2890 return;
2894 void init_md(FILE *fplog,
2895 const t_commrec *cr, gmx::IMDOutputProvider *outputProvider,
2896 t_inputrec *ir, const gmx_output_env_t *oenv,
2897 const MdrunOptions &mdrunOptions,
2898 double *t, double *t0,
2899 t_state *globalState, double *lam0,
2900 t_nrnb *nrnb, gmx_mtop_t *mtop,
2901 gmx_update_t **upd,
2902 int nfile, const t_filenm fnm[],
2903 gmx_mdoutf_t *outf, t_mdebin **mdebin,
2904 tensor force_vir, tensor shake_vir, rvec mu_tot,
2905 gmx_bool *bSimAnn, t_vcm **vcm,
2906 gmx_wallcycle_t wcycle)
2908 int i;
2910 /* Initial values */
2911 *t = *t0 = ir->init_t;
2913 *bSimAnn = FALSE;
2914 for (i = 0; i < ir->opts.ngtc; i++)
2916 /* set bSimAnn if any group is being annealed */
2917 if (ir->opts.annealing[i] != eannNO)
2919 *bSimAnn = TRUE;
2923 /* Initialize lambda variables */
2924 /* TODO: Clean up initialization of fep_state and lambda in t_state.
2925 * We currently need to call initialize_lambdas on non-master ranks
2926 * to initialize lam0.
2928 if (MASTER(cr))
2930 initialize_lambdas(fplog, ir, &globalState->fep_state, globalState->lambda, lam0);
2932 else
2934 int tmpFepState;
2935 std::array<real, efptNR> tmpLambda;
2936 initialize_lambdas(fplog, ir, &tmpFepState, tmpLambda, lam0);
2939 // TODO upd is never NULL in practice, but the analysers don't know that
2940 if (upd)
2942 *upd = init_update(ir);
2944 if (*bSimAnn)
2946 update_annealing_target_temp(ir, ir->init_t, upd ? *upd : nullptr);
2949 if (vcm != nullptr)
2951 *vcm = init_vcm(fplog, &mtop->groups, ir);
2954 if (EI_DYNAMICS(ir->eI) && !mdrunOptions.continuationOptions.appendFiles)
2956 if (ir->etc == etcBERENDSEN)
2958 please_cite(fplog, "Berendsen84a");
2960 if (ir->etc == etcVRESCALE)
2962 please_cite(fplog, "Bussi2007a");
2964 if (ir->eI == eiSD1)
2966 please_cite(fplog, "Goga2012");
2969 init_nrnb(nrnb);
2971 if (nfile != -1)
2973 *outf = init_mdoutf(fplog, nfile, fnm, mdrunOptions, cr, outputProvider, ir, mtop, oenv, wcycle);
2975 *mdebin = init_mdebin(mdrunOptions.continuationOptions.appendFiles ? nullptr : mdoutf_get_fp_ene(*outf),
2976 mtop, ir, mdoutf_get_fp_dhdl(*outf));
2979 /* Initialize variables */
2980 clear_mat(force_vir);
2981 clear_mat(shake_vir);
2982 clear_rvec(mu_tot);