[gromacs.git] / src / gromacs / mdlib / sim_util.cpp
1 /*
2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5 * Copyright (c) 2001-2004, The GROMACS development team.
6 * Copyright (c) 2013,2014,2015,2016,2017,2018, by the GROMACS development team, led by
7 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
8 * and including many others, as listed in the AUTHORS file in the
9 * top-level source directory and at http://www.gromacs.org.
11 * GROMACS is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public License
13 * as published by the Free Software Foundation; either version 2.1
14 * of the License, or (at your option) any later version.
16 * GROMACS is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * Lesser General Public License for more details.
21 * You should have received a copy of the GNU Lesser General Public
22 * License along with GROMACS; if not, see
23 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
24 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
26 * If you want to redistribute modifications to GROMACS, please
27 * consider that scientific software is very special. Version
28 * control is crucial - bugs must be traceable. We will be happy to
29 * consider code for inclusion in the official distribution, but
30 * derived work must not be called official GROMACS. Details are found
31 * in the README & COPYING files - if they are missing, get the
32 * official version at http://www.gromacs.org.
34 * To help us fund GROMACS development, we humbly ask that you cite
35 * the research papers on the package. Check out http://www.gromacs.org.
37 #include "gmxpre.h"
39 #include "sim_util.h"
41 #include "config.h"
43 #include <stdio.h>
44 #include <string.h>
46 #include <cmath>
47 #include <cstdint>
49 #include <array>
51 #include "gromacs/awh/awh.h"
52 #include "gromacs/domdec/dlbtiming.h"
53 #include "gromacs/domdec/domdec.h"
54 #include "gromacs/domdec/domdec_struct.h"
55 #include "gromacs/essentialdynamics/edsam.h"
56 #include "gromacs/ewald/pme.h"
57 #include "gromacs/gmxlib/chargegroup.h"
58 #include "gromacs/gmxlib/network.h"
59 #include "gromacs/gmxlib/nrnb.h"
60 #include "gromacs/gmxlib/nonbonded/nb_free_energy.h"
61 #include "gromacs/gmxlib/nonbonded/nb_kernel.h"
62 #include "gromacs/gmxlib/nonbonded/nonbonded.h"
63 #include "gromacs/gpu_utils/gpu_utils.h"
64 #include "gromacs/imd/imd.h"
65 #include "gromacs/listed-forces/bonded.h"
66 #include "gromacs/listed-forces/disre.h"
67 #include "gromacs/listed-forces/orires.h"
68 #include "gromacs/math/functions.h"
69 #include "gromacs/math/units.h"
70 #include "gromacs/math/vec.h"
71 #include "gromacs/math/vecdump.h"
72 #include "gromacs/mdlib/calcmu.h"
73 #include "gromacs/mdlib/constr.h"
74 #include "gromacs/mdlib/force.h"
75 #include "gromacs/mdlib/forcerec.h"
76 #include "gromacs/mdlib/gmx_omp_nthreads.h"
77 #include "gromacs/mdlib/mdrun.h"
78 #include "gromacs/mdlib/nb_verlet.h"
79 #include "gromacs/mdlib/nbnxn_atomdata.h"
80 #include "gromacs/mdlib/nbnxn_gpu_data_mgmt.h"
81 #include "gromacs/mdlib/nbnxn_grid.h"
82 #include "gromacs/mdlib/nbnxn_search.h"
83 #include "gromacs/mdlib/qmmm.h"
84 #include "gromacs/mdlib/update.h"
85 #include "gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_gpu_ref.h"
86 #include "gromacs/mdtypes/commrec.h"
87 #include "gromacs/mdtypes/forceoutput.h"
88 #include "gromacs/mdtypes/iforceprovider.h"
89 #include "gromacs/mdtypes/inputrec.h"
90 #include "gromacs/mdtypes/md_enums.h"
91 #include "gromacs/mdtypes/state.h"
92 #include "gromacs/pbcutil/ishift.h"
93 #include "gromacs/pbcutil/mshift.h"
94 #include "gromacs/pbcutil/pbc.h"
95 #include "gromacs/pulling/pull.h"
96 #include "gromacs/pulling/pull_rotation.h"
97 #include "gromacs/timing/cyclecounter.h"
98 #include "gromacs/timing/gpu_timing.h"
99 #include "gromacs/timing/wallcycle.h"
100 #include "gromacs/timing/wallcyclereporting.h"
101 #include "gromacs/timing/walltime_accounting.h"
102 #include "gromacs/topology/topology.h"
103 #include "gromacs/utility/arrayref.h"
104 #include "gromacs/utility/basedefinitions.h"
105 #include "gromacs/utility/cstringutil.h"
106 #include "gromacs/utility/exceptions.h"
107 #include "gromacs/utility/fatalerror.h"
108 #include "gromacs/utility/gmxassert.h"
109 #include "gromacs/utility/gmxmpi.h"
110 #include "gromacs/utility/logger.h"
111 #include "gromacs/utility/pleasecite.h"
112 #include "gromacs/utility/smalloc.h"
113 #include "gromacs/utility/sysinfo.h"
115 #include "nbnxn_gpu.h"
116 #include "nbnxn_kernels/nbnxn_kernel_cpu.h"
117 #include "nbnxn_kernels/nbnxn_kernel_prune.h"
119 // TODO: this environment variable allows us to verify before release
120 // that on less common architectures the total cost of polling is not larger than
121 // a blocking wait (so polling does not introduce overhead when the static
122 // PME-first ordering would suffice).
123 static const bool c_disableAlternatingWait = (getenv("GMX_DISABLE_ALTERNATING_GPU_WAIT") != nullptr);
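// For example, launching mdrun with GMX_DISABLE_ALTERNATING_GPU_WAIT=1 set in the
// environment makes this true and forces the static wait order used further below
// (wait on the PME task first, then on the local nonbonded task).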
126 void print_time(FILE *out,
127 gmx_walltime_accounting_t walltime_accounting,
128 gmx_int64_t step,
129 t_inputrec *ir,
130 const t_commrec *cr)
132 time_t finish;
133 char timebuf[STRLEN];
134 double dt, elapsed_seconds, time_per_step;
135 char buf[48];
137 #if !GMX_THREAD_MPI
138 if (!PAR(cr))
139 #endif
141 fprintf(out, "\r");
143 fprintf(out, "step %s", gmx_step_str(step, buf));
144 fflush(out);
146 if ((step >= ir->nstlist))
148 double seconds_since_epoch = gmx_gettime();
149 elapsed_seconds = seconds_since_epoch - walltime_accounting_get_start_time_stamp(walltime_accounting);
150 time_per_step = elapsed_seconds/(step - ir->init_step + 1);
151 dt = (ir->nsteps + ir->init_step - step) * time_per_step;
153 if (ir->nsteps >= 0)
155 if (dt >= 300)
157 finish = (time_t) (seconds_since_epoch + dt);
158 gmx_ctime_r(&finish, timebuf, STRLEN);
159 sprintf(buf, "%s", timebuf);
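            /* The ctime-style string ends with a newline; chop it off so the
             * estimated finish time prints on a single line. */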
160 buf[strlen(buf)-1] = '\0';
161 fprintf(out, ", will finish %s", buf);
163 else
165 fprintf(out, ", remaining wall clock time: %5d s ", (int)dt);
168 else
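        /* ir->delta_t is in ps per step, so delta_t/1000 is ns per step; the
         * 24*60*60 seconds in a day divided by the measured seconds per step
         * gives steps per day, so the product below is ns/day. */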
170 fprintf(out, " performance: %.1f ns/day ",
171 ir->delta_t/1000*24*60*60/time_per_step);
174 #if !GMX_THREAD_MPI
175 if (PAR(cr))
177 fprintf(out, "\n");
179 #else
180 GMX_UNUSED_VALUE(cr);
181 #endif
183 fflush(out);
186 void print_date_and_time(FILE *fplog, int nodeid, const char *title,
187 double the_time)
189 char time_string[STRLEN];
191 if (!fplog)
193 return;
197 int i;
198 char timebuf[STRLEN];
199 time_t temp_time = (time_t) the_time;
201 gmx_ctime_r(&temp_time, timebuf, STRLEN);
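    /* Copy only the printable part of the ctime-style string, which drops
     * the trailing newline before the date is printed below. */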
202 for (i = 0; timebuf[i] >= ' '; i++)
204 time_string[i] = timebuf[i];
206 time_string[i] = '\0';
209 fprintf(fplog, "%s on rank %d %s\n", title, nodeid, time_string);
212 void print_start(FILE *fplog, const t_commrec *cr,
213 gmx_walltime_accounting_t walltime_accounting,
214 const char *name)
216 char buf[STRLEN];
218 sprintf(buf, "Started %s", name);
219 print_date_and_time(fplog, cr->nodeid, buf,
220 walltime_accounting_get_start_time_stamp(walltime_accounting));
223 static void sum_forces(rvec f[], gmx::ArrayRef<const gmx::RVec> forceToAdd)
225 const int end = forceToAdd.size();
227 // cppcheck-suppress unreadVariable
228 int gmx_unused nt = gmx_omp_nthreads_get(emntDefault);
229 #pragma omp parallel for num_threads(nt) schedule(static)
230 for (int i = 0; i < end; i++)
232 rvec_inc(f[i], forceToAdd[i]);
236 static void pme_gpu_reduce_outputs(gmx_wallcycle_t wcycle,
237 ForceWithVirial *forceWithVirial,
238 ArrayRef<const gmx::RVec> pmeForces,
239 gmx_enerdata_t *enerd,
240 const tensor vir_Q,
241 real Vlr_q)
243 wallcycle_start(wcycle, ewcPME_GPU_F_REDUCTION);
244 GMX_ASSERT(forceWithVirial, "Invalid force pointer");
245 forceWithVirial->addVirialContribution(vir_Q);
246 enerd->term[F_COUL_RECIP] += Vlr_q;
247 sum_forces(as_rvec_array(forceWithVirial->force_.data()), pmeForces);
248 wallcycle_stop(wcycle, ewcPME_GPU_F_REDUCTION);
251 static void calc_virial(int start, int homenr, rvec x[], rvec f[],
252 tensor vir_part, t_graph *graph, matrix box,
253 t_nrnb *nrnb, const t_forcerec *fr, int ePBC)
255 int i;
257 /* The short-range virial from surrounding boxes */
258 calc_vir(SHIFTS, fr->shift_vec, fr->fshift, vir_part, ePBC == epbcSCREW, box);
259 inc_nrnb(nrnb, eNR_VIRIAL, SHIFTS);
261 /* Calculate partial virial, for local atoms only, based on short range.
262 * Total virial is computed in global_stat, called from do_md
264 f_calc_vir(start, start+homenr, x, f, vir_part, graph, box);
265 inc_nrnb(nrnb, eNR_VIRIAL, homenr);
267 /* Add wall contribution */
268 for (i = 0; i < DIM; i++)
270 vir_part[i][ZZ] += fr->vir_wall_z[i];
273 if (debug)
275 pr_rvecs(debug, 0, "vir_part", vir_part, DIM);
279 static void pull_potential_wrapper(const t_commrec *cr,
280 t_inputrec *ir,
281 matrix box, rvec x[],
282 ForceWithVirial *force,
283 t_mdatoms *mdatoms,
284 gmx_enerdata_t *enerd,
285 real *lambda,
286 double t,
287 gmx_wallcycle_t wcycle)
289 t_pbc pbc;
290 real dvdl;
292     /* Calculate the center of mass forces; this requires communication,
293 * which is why pull_potential is called close to other communication.
295 wallcycle_start(wcycle, ewcPULLPOT);
296 set_pbc(&pbc, ir->ePBC, box);
297 dvdl = 0;
298 enerd->term[F_COM_PULL] +=
299 pull_potential(ir->pull_work, mdatoms, &pbc,
300 cr, t, lambda[efptRESTRAINT], x, force, &dvdl);
301 enerd->dvdl_lin[efptRESTRAINT] += dvdl;
302 wallcycle_stop(wcycle, ewcPULLPOT);
305 static void pme_receive_force_ener(const t_commrec *cr,
306 ForceWithVirial *forceWithVirial,
307 gmx_enerdata_t *enerd,
308 gmx_wallcycle_t wcycle)
310 real e_q, e_lj, dvdl_q, dvdl_lj;
311 float cycles_ppdpme, cycles_seppme;
313 cycles_ppdpme = wallcycle_stop(wcycle, ewcPPDURINGPME);
314 dd_cycles_add(cr->dd, cycles_ppdpme, ddCyclPPduringPME);
316 /* In case of node-splitting, the PP nodes receive the long-range
317 * forces, virial and energy from the PME nodes here.
319 wallcycle_start(wcycle, ewcPP_PMEWAITRECVF);
320 dvdl_q = 0;
321 dvdl_lj = 0;
322 gmx_pme_receive_f(cr, forceWithVirial, &e_q, &e_lj, &dvdl_q, &dvdl_lj,
323 &cycles_seppme);
324 enerd->term[F_COUL_RECIP] += e_q;
325 enerd->term[F_LJ_RECIP] += e_lj;
326 enerd->dvdl_lin[efptCOUL] += dvdl_q;
327 enerd->dvdl_lin[efptVDW] += dvdl_lj;
329 if (wcycle)
331 dd_cycles_add(cr->dd, cycles_seppme, ddCyclPME);
333 wallcycle_stop(wcycle, ewcPP_PMEWAITRECVF);
336 static void print_large_forces(FILE *fp, t_mdatoms *md, const t_commrec *cr,
337 gmx_int64_t step, real forceTolerance,
338 const rvec *x, const rvec *f)
340 real force2Tolerance = gmx::square(forceTolerance);
341 std::uintmax_t numNonFinite = 0;
342 for (int i = 0; i < md->homenr; i++)
344 real force2 = norm2(f[i]);
345 bool nonFinite = !std::isfinite(force2);
346 if (force2 >= force2Tolerance || nonFinite)
348 fprintf(fp, "step %" GMX_PRId64 " atom %6d x %8.3f %8.3f %8.3f force %12.5e\n",
349 step,
350 ddglatnr(cr->dd, i), x[i][XX], x[i][YY], x[i][ZZ], std::sqrt(force2));
352 if (nonFinite)
354 numNonFinite++;
357 if (numNonFinite > 0)
359 /* Note that with MPI this fatal call on one rank might interrupt
360 * the printing on other ranks. But we can only avoid that with
361 * an expensive MPI barrier that we would need at each step.
363 gmx_fatal(FARGS, "At step %" GMX_PRId64 " detected non-finite forces on %ju atoms", step, numNonFinite);
367 static void post_process_forces(const t_commrec *cr,
368 gmx_int64_t step,
369 t_nrnb *nrnb, gmx_wallcycle_t wcycle,
370 gmx_localtop_t *top,
371 matrix box, rvec x[],
372 rvec f[],
373 ForceWithVirial *forceWithVirial,
374 tensor vir_force,
375 t_mdatoms *mdatoms,
376 t_graph *graph,
377 t_forcerec *fr, gmx_vsite_t *vsite,
378 int flags)
380 if (fr->haveDirectVirialContributions)
382 rvec *fDirectVir = as_rvec_array(forceWithVirial->force_.data());
384 if (vsite)
386 /* Spread the mesh force on virtual sites to the other particles...
387          * This is parallelized. MPI communication is performed
388 * if the constructing atoms aren't local.
390 matrix virial = { { 0 } };
391 spread_vsite_f(vsite, x, fDirectVir, nullptr,
392 (flags & GMX_FORCE_VIRIAL), virial,
393 nrnb,
394 &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr, wcycle);
395 forceWithVirial->addVirialContribution(virial);
398 if (flags & GMX_FORCE_VIRIAL)
400 /* Now add the forces, this is local */
401 sum_forces(f, forceWithVirial->force_);
403 /* Add the direct virial contributions */
404 GMX_ASSERT(forceWithVirial->computeVirial_, "forceWithVirial should request virial computation when we request the virial");
405 m_add(vir_force, forceWithVirial->getVirial(), vir_force);
407 if (debug)
409 pr_rvecs(debug, 0, "vir_force", vir_force, DIM);
414 if (fr->print_force >= 0)
416 print_large_forces(stderr, mdatoms, cr, step, fr->print_force, x, f);
420 static void do_nb_verlet(t_forcerec *fr,
421 interaction_const_t *ic,
422 gmx_enerdata_t *enerd,
423 int flags, int ilocality,
424 int clearF,
425 gmx_int64_t step,
426 t_nrnb *nrnb,
427 gmx_wallcycle_t wcycle)
429 if (!(flags & GMX_FORCE_NONBONDED))
431 /* skip non-bonded calculation */
432 return;
435 nonbonded_verlet_t *nbv = fr->nbv;
436 nonbonded_verlet_group_t *nbvg = &nbv->grp[ilocality];
438 /* GPU kernel launch overhead is already timed separately */
439 if (fr->cutoff_scheme != ecutsVERLET)
441 gmx_incons("Invalid cut-off scheme passed!");
444 bool bUsingGpuKernels = (nbvg->kernel_type == nbnxnk8x8x8_GPU);
446 if (!bUsingGpuKernels)
448 /* When dynamic pair-list pruning is requested, we need to prune
449 * at nstlistPrune steps.
451 if (nbv->listParams->useDynamicPruning &&
452 (step - nbvg->nbl_lists.outerListCreationStep) % nbv->listParams->nstlistPrune == 0)
454 /* Prune the pair-list beyond fr->ic->rlistPrune using
455 * the current coordinates of the atoms.
457 wallcycle_sub_start(wcycle, ewcsNONBONDED_PRUNING);
458 nbnxn_kernel_cpu_prune(nbvg, nbv->nbat, fr->shift_vec, nbv->listParams->rlistInner);
459 wallcycle_sub_stop(wcycle, ewcsNONBONDED_PRUNING);
462 wallcycle_sub_start(wcycle, ewcsNONBONDED);
465 switch (nbvg->kernel_type)
467 case nbnxnk4x4_PlainC:
468 case nbnxnk4xN_SIMD_4xN:
469 case nbnxnk4xN_SIMD_2xNN:
470 nbnxn_kernel_cpu(nbvg,
471 nbv->nbat,
473 fr->shift_vec,
474 flags,
475 clearF,
476 fr->fshift[0],
477 enerd->grpp.ener[egCOULSR],
478 fr->bBHAM ?
479 enerd->grpp.ener[egBHAMSR] :
480 enerd->grpp.ener[egLJSR]);
481 break;
483 case nbnxnk8x8x8_GPU:
484 nbnxn_gpu_launch_kernel(nbv->gpu_nbv, nbv->nbat, flags, ilocality);
485 break;
487 case nbnxnk8x8x8_PlainC:
488 nbnxn_kernel_gpu_ref(nbvg->nbl_lists.nbl[0],
489 nbv->nbat, ic,
490 fr->shift_vec,
491 flags,
492 clearF,
493 nbv->nbat->out[0].f,
494 fr->fshift[0],
495 enerd->grpp.ener[egCOULSR],
496 fr->bBHAM ?
497 enerd->grpp.ener[egBHAMSR] :
498 enerd->grpp.ener[egLJSR]);
499 break;
501 default:
502 GMX_RELEASE_ASSERT(false, "Invalid nonbonded kernel type passed!");
505 if (!bUsingGpuKernels)
507 wallcycle_sub_stop(wcycle, ewcsNONBONDED);
510 int enr_nbnxn_kernel_ljc, enr_nbnxn_kernel_lj;
511 if (EEL_RF(ic->eeltype) || ic->eeltype == eelCUT)
513 enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_RF;
515 else if ((!bUsingGpuKernels && nbvg->ewald_excl == ewaldexclAnalytical) ||
516 (bUsingGpuKernels && nbnxn_gpu_is_kernel_ewald_analytical(nbv->gpu_nbv)))
518 enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_EWALD;
520 else
522 enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_TAB;
524 enr_nbnxn_kernel_lj = eNR_NBNXN_LJ;
525 if (flags & GMX_FORCE_ENERGY)
527 /* In eNR_??? the nbnxn F+E kernels are always the F kernel + 1 */
528 enr_nbnxn_kernel_ljc += 1;
529 enr_nbnxn_kernel_lj += 1;
532 inc_nrnb(nrnb, enr_nbnxn_kernel_ljc,
533 nbvg->nbl_lists.natpair_ljq);
534 inc_nrnb(nrnb, enr_nbnxn_kernel_lj,
535 nbvg->nbl_lists.natpair_lj);
536 /* The Coulomb-only kernels are offset -eNR_NBNXN_LJ_RF+eNR_NBNXN_RF */
537 inc_nrnb(nrnb, enr_nbnxn_kernel_ljc-eNR_NBNXN_LJ_RF+eNR_NBNXN_RF,
538 nbvg->nbl_lists.natpair_q);
540 if (ic->vdw_modifier == eintmodFORCESWITCH)
542 /* We add up the switch cost separately */
543 inc_nrnb(nrnb, eNR_NBNXN_ADD_LJ_FSW+((flags & GMX_FORCE_ENERGY) ? 1 : 0),
544 nbvg->nbl_lists.natpair_ljq + nbvg->nbl_lists.natpair_lj);
546 if (ic->vdw_modifier == eintmodPOTSWITCH)
548 /* We add up the switch cost separately */
549 inc_nrnb(nrnb, eNR_NBNXN_ADD_LJ_PSW+((flags & GMX_FORCE_ENERGY) ? 1 : 0),
550 nbvg->nbl_lists.natpair_ljq + nbvg->nbl_lists.natpair_lj);
552 if (ic->vdwtype == evdwPME)
554 /* We add up the LJ Ewald cost separately */
555 inc_nrnb(nrnb, eNR_NBNXN_ADD_LJ_EWALD+((flags & GMX_FORCE_ENERGY) ? 1 : 0),
556 nbvg->nbl_lists.natpair_ljq + nbvg->nbl_lists.natpair_lj);
560 static void do_nb_verlet_fep(nbnxn_pairlist_set_t *nbl_lists,
561 t_forcerec *fr,
562 rvec x[],
563 rvec f[],
564 t_mdatoms *mdatoms,
565 t_lambda *fepvals,
566 real *lambda,
567 gmx_enerdata_t *enerd,
568 int flags,
569 t_nrnb *nrnb,
570 gmx_wallcycle_t wcycle)
572 int donb_flags;
573 nb_kernel_data_t kernel_data;
574 real lam_i[efptNR];
575 real dvdl_nb[efptNR];
576 int th;
577 int i, j;
579 donb_flags = 0;
580 /* Add short-range interactions */
581 donb_flags |= GMX_NONBONDED_DO_SR;
583 /* Currently all group scheme kernels always calculate (shift-)forces */
584 if (flags & GMX_FORCE_FORCES)
586 donb_flags |= GMX_NONBONDED_DO_FORCE;
588 if (flags & GMX_FORCE_VIRIAL)
590 donb_flags |= GMX_NONBONDED_DO_SHIFTFORCE;
592 if (flags & GMX_FORCE_ENERGY)
594 donb_flags |= GMX_NONBONDED_DO_POTENTIAL;
597 kernel_data.flags = donb_flags;
598 kernel_data.lambda = lambda;
599 kernel_data.dvdl = dvdl_nb;
601 kernel_data.energygrp_elec = enerd->grpp.ener[egCOULSR];
602 kernel_data.energygrp_vdw = enerd->grpp.ener[egLJSR];
604 /* reset free energy components */
605 for (i = 0; i < efptNR; i++)
607 dvdl_nb[i] = 0;
610 GMX_ASSERT(gmx_omp_nthreads_get(emntNonbonded) == nbl_lists->nnbl, "Number of lists should be same as number of NB threads");
612 wallcycle_sub_start(wcycle, ewcsNONBONDED);
613 #pragma omp parallel for schedule(static) num_threads(nbl_lists->nnbl)
614 for (th = 0; th < nbl_lists->nnbl; th++)
618 gmx_nb_free_energy_kernel(nbl_lists->nbl_fep[th],
619 x, f, fr, mdatoms, &kernel_data, nrnb);
621 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
624 if (fepvals->sc_alpha != 0)
626 enerd->dvdl_nonlin[efptVDW] += dvdl_nb[efptVDW];
627 enerd->dvdl_nonlin[efptCOUL] += dvdl_nb[efptCOUL];
629 else
631 enerd->dvdl_lin[efptVDW] += dvdl_nb[efptVDW];
632 enerd->dvdl_lin[efptCOUL] += dvdl_nb[efptCOUL];
635 /* If we do foreign lambda and we have soft-core interactions
636      * we have to recalculate the (non-linear) energy contributions.
638 if (fepvals->n_lambda > 0 && (flags & GMX_FORCE_DHDL) && fepvals->sc_alpha != 0)
640 kernel_data.flags = (donb_flags & ~(GMX_NONBONDED_DO_FORCE | GMX_NONBONDED_DO_SHIFTFORCE)) | GMX_NONBONDED_DO_FOREIGNLAMBDA;
641 kernel_data.lambda = lam_i;
642 kernel_data.energygrp_elec = enerd->foreign_grpp.ener[egCOULSR];
643 kernel_data.energygrp_vdw = enerd->foreign_grpp.ener[egLJSR];
644 /* Note that we add to kernel_data.dvdl, but ignore the result */
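        /* Foreign-lambda point 0 corresponds to the current lambda state; points
         * i > 0 map to fepvals->all_lambda[...][i-1], as set up in the loop below. */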
646 for (i = 0; i < enerd->n_lambda; i++)
648 for (j = 0; j < efptNR; j++)
650 lam_i[j] = (i == 0 ? lambda[j] : fepvals->all_lambda[j][i-1]);
652 reset_foreign_enerdata(enerd);
653 #pragma omp parallel for schedule(static) num_threads(nbl_lists->nnbl)
654 for (th = 0; th < nbl_lists->nnbl; th++)
658 gmx_nb_free_energy_kernel(nbl_lists->nbl_fep[th],
659 x, f, fr, mdatoms, &kernel_data, nrnb);
661 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
664 sum_epot(&(enerd->foreign_grpp), enerd->foreign_term);
665 enerd->enerpart_lambda[i] += enerd->foreign_term[F_EPOT];
669 wallcycle_sub_stop(wcycle, ewcsNONBONDED);
672 gmx_bool use_GPU(const nonbonded_verlet_t *nbv)
674 return nbv != nullptr && nbv->bUseGPU;
677 static inline void clear_rvecs_omp(int n, rvec v[])
679 int nth = gmx_omp_nthreads_get_simple_rvec_task(emntDefault, n);
681 /* Note that we would like to avoid this conditional by putting it
682 * into the omp pragma instead, but then we still take the full
683 * omp parallel for overhead (at least with gcc5).
685 if (nth == 1)
687 for (int i = 0; i < n; i++)
689 clear_rvec(v[i]);
692 else
694 #pragma omp parallel for num_threads(nth) schedule(static)
695 for (int i = 0; i < n; i++)
697 clear_rvec(v[i]);
702 /*! \brief Return an estimate of the average kinetic energy or 0 when unreliable
704 * \param groupOptions Group options, containing T-coupling options
706 static real averageKineticEnergyEstimate(const t_grpopts &groupOptions)
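    /* For each temperature-coupled group the equipartition value
     * 0.5*ndf*kB*T_ref is accumulated; the sum is then scaled by
     * (ndf_coupled + ndf_uncoupled)/ndf_coupled so that the result
     * approximates the kinetic energy of all degrees of freedom. */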
708 real nrdfCoupled = 0;
709 real nrdfUncoupled = 0;
710 real kineticEnergy = 0;
711 for (int g = 0; g < groupOptions.ngtc; g++)
713 if (groupOptions.tau_t[g] >= 0)
715 nrdfCoupled += groupOptions.nrdf[g];
716 kineticEnergy += groupOptions.nrdf[g]*0.5*groupOptions.ref_t[g]*BOLTZ;
718 else
720 nrdfUncoupled += groupOptions.nrdf[g];
724 /* This conditional with > also catches nrdf=0 */
725 if (nrdfCoupled > nrdfUncoupled)
727 return kineticEnergy*(nrdfCoupled + nrdfUncoupled)/nrdfCoupled;
729 else
731 return 0;
735 /*! \brief This routine checks that the potential energy is finite.
737 * Always checks that the potential energy is finite. If step equals
738 * inputrec.init_step also checks that the magnitude of the potential energy
739 * is reasonable. Terminates with a fatal error when a check fails.
740 * Note that passing this check does not guarantee finite forces,
741  * since those use slightly different arithmetic. But in most cases
742 * there is just a narrow coordinate range where forces are not finite
743 * and energies are finite.
745 * \param[in] step The step number, used for checking and printing
746 * \param[in] enerd The energy data; the non-bonded group energies need to be added to enerd.term[F_EPOT] before calling this routine
747 * \param[in] inputrec The input record
749 static void checkPotentialEnergyValidity(gmx_int64_t step,
750 const gmx_enerdata_t &enerd,
751 const t_inputrec &inputrec)
753 /* Threshold valid for comparing absolute potential energy against
754 * the kinetic energy. Normally one should not consider absolute
755 * potential energy values, but with a factor of one million
756 * we should never get false positives.
758 constexpr real c_thresholdFactor = 1e6;
760 bool energyIsNotFinite = !std::isfinite(enerd.term[F_EPOT]);
761 real averageKineticEnergy = 0;
762 /* We only check for large potential energy at the initial step,
763      * because that is by far the most likely step for this to occur
764 * and because computing the average kinetic energy is not free.
765      * Note: nstcalcenergy >> 1 often does not allow catching large energies
766 * before they become NaN.
768 if (step == inputrec.init_step && EI_DYNAMICS(inputrec.eI))
770 averageKineticEnergy = averageKineticEnergyEstimate(inputrec.opts);
773 if (energyIsNotFinite || (averageKineticEnergy > 0 &&
774 enerd.term[F_EPOT] > c_thresholdFactor*averageKineticEnergy))
776 gmx_fatal(FARGS, "Step %" GMX_PRId64 ": The total potential energy is %g, which is %s. The LJ and electrostatic contributions to the energy are %g and %g, respectively. A %s potential energy can be caused by overlapping interactions in bonded interactions or very large%s coordinate values. Usually this is caused by a badly- or non-equilibrated initial configuration, incorrect interactions or parameters in the topology.",
777 step,
778 enerd.term[F_EPOT],
779 energyIsNotFinite ? "not finite" : "extremely high",
780 enerd.term[F_LJ],
781 enerd.term[F_COUL_SR],
782 energyIsNotFinite ? "non-finite" : "very high",
783                   energyIsNotFinite ? " or NaN" : "");
787 /*! \brief Compute forces and/or energies for special algorithms
789 * The intention is to collect all calls to algorithms that compute
790 * forces on local atoms only and that do not contribute to the local
791 * virial sum (but add their virial contribution separately).
792 * Eventually these should likely all become ForceProviders.
793 * Within this function the intention is to have algorithms that do
794 * global communication at the end, so global barriers within the MD loop
795 * are as close together as possible.
797 * \param[in] fplog The log file
798 * \param[in] cr The communication record
799 * \param[in] inputrec The input record
800 * \param[in] step The current MD step
801 * \param[in] t The current time
802 * \param[in,out] wcycle Wallcycle accounting struct
803 * \param[in,out] forceProviders Pointer to a list of force providers
804 * \param[in] box The unit cell
805 * \param[in] x The coordinates
806 * \param[in] mdatoms Per atom properties
807 * \param[in] lambda Array of free-energy lambda values
808 * \param[in] forceFlags Flags that tell whether we should compute forces/energies/virial
809 * \param[in,out] forceWithVirial Force and virial buffers
810 * \param[in,out] enerd Energy buffer
811 * \param[in,out] ed Essential dynamics pointer
812 * \param[in] bNS Tells if we did neighbor searching this step, used for ED sampling
814 * \todo Remove bNS, which is used incorrectly.
815 * \todo Convert all other algorithms called here to ForceProviders.
817 static void
818 computeSpecialForces(FILE *fplog,
819 const t_commrec *cr,
820 t_inputrec *inputrec,
821 gmx_int64_t step,
822 double t,
823 gmx_wallcycle_t wcycle,
824 ForceProviders *forceProviders,
825 matrix box,
826 rvec *x,
827 t_mdatoms *mdatoms,
828 real *lambda,
829 int forceFlags,
830 ForceWithVirial *forceWithVirial,
831 gmx_enerdata_t *enerd,
832 gmx_edsam_t ed,
833 gmx_bool bNS)
835 const bool computeForces = (forceFlags & GMX_FORCE_FORCES);
837 /* NOTE: Currently all ForceProviders only provide forces.
838 * When they also provide energies, remove this conditional.
840 if (computeForces)
842 /* Collect forces from modules */
843 forceProviders->calculateForces(cr, mdatoms, box, t, x, forceWithVirial);
846 if (inputrec->bPull && pull_have_potential(inputrec->pull_work))
848 pull_potential_wrapper(cr, inputrec, box, x,
849 forceWithVirial,
850 mdatoms, enerd, lambda, t,
851 wcycle);
853 if (inputrec->bDoAwh)
855 Awh &awh = *inputrec->awh;
856 enerd->term[F_COM_PULL] +=
857 awh.applyBiasForcesAndUpdateBias(inputrec->ePBC, *mdatoms, box,
858 forceWithVirial,
859 t, step, wcycle, fplog);
863 rvec *f = as_rvec_array(forceWithVirial->force_.data());
865 /* Add the forces from enforced rotation potentials (if any) */
866 if (inputrec->bRot)
868 wallcycle_start(wcycle, ewcROTadd);
869 enerd->term[F_COM_PULL] += add_rot_forces(inputrec->rot, f, cr, step, t);
870 wallcycle_stop(wcycle, ewcROTadd);
873 if (ed)
875 /* Note that since init_edsam() is called after the initialization
876 * of forcerec, edsam doesn't request the noVirSum force buffer.
877 * Thus if no other algorithm (e.g. PME) requires it, the forces
878 * here will contribute to the virial.
880 do_flood(cr, inputrec, x, f, ed, box, step, bNS);
883 /* Add forces from interactive molecular dynamics (IMD), if bIMD == TRUE. */
884 if (inputrec->bIMD && computeForces)
886 IMD_apply_forces(inputrec->bIMD, inputrec->imd, cr, f, wcycle);
890 /*! \brief Launch the prepare_step and spread stages of PME GPU.
892 * \param[in] pmedata The PME structure
893 * \param[in] box The box matrix
894 * \param[in] x Coordinate array
895 * \param[in] flags Force flags
896 * \param[in] wcycle The wallcycle structure
898 static inline void launchPmeGpuSpread(gmx_pme_t *pmedata,
899 matrix box,
900 rvec x[],
901 int flags,
902 gmx_wallcycle_t wcycle)
904 int pmeFlags = GMX_PME_SPREAD | GMX_PME_SOLVE;
905 pmeFlags |= (flags & GMX_FORCE_FORCES) ? GMX_PME_CALC_F : 0;
906 pmeFlags |= (flags & GMX_FORCE_VIRIAL) ? GMX_PME_CALC_ENER_VIR : 0;
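    /* Spreading and solving are always part of pmeFlags; force gathering and
     * the energy/virial reduction are only requested when the corresponding
     * force flags are set above. */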
908 pme_gpu_prepare_computation(pmedata, flags & GMX_FORCE_DYNAMICBOX, box, wcycle, pmeFlags);
909 pme_gpu_launch_spread(pmedata, x, wcycle);
912 /*! \brief Launch the FFT and gather stages of PME GPU
914 * This function only implements setting the output forces (no accumulation).
916 * \param[in] pmedata The PME structure
917 * \param[in] wcycle The wallcycle structure
919 static void launchPmeGpuFftAndGather(gmx_pme_t *pmedata,
920 gmx_wallcycle_t wcycle)
922 pme_gpu_launch_complex_transforms(pmedata, wcycle);
923 pme_gpu_launch_gather(pmedata, wcycle, PmeForceOutputHandling::Set);
926 /*! \brief
927 * Polling wait for either of the PME or nonbonded GPU tasks.
929 * Instead of a static order in waiting for GPU tasks, this function
930 * polls checking which of the two tasks completes first, and does the
931 * associated force buffer reduction overlapped with the other task.
932 * By doing that, unlike static scheduling order, it can always overlap
933 * one of the reductions, regardless of the GPU task completion order.
935 * \param[in] nbv Nonbonded verlet structure
936 * \param[in] pmedata PME module data
937 * \param[in,out] force Force array to reduce task outputs into.
938 * \param[in,out] forceWithVirial Force and virial buffers
939 * \param[in,out] fshift Shift force output vector results are reduced into
940 * \param[in,out] enerd Energy data structure results are reduced into
941 * \param[in] flags Force flags
942 * \param[in] wcycle The wallcycle structure
944 static void alternatePmeNbGpuWaitReduce(nonbonded_verlet_t *nbv,
945 const gmx_pme_t *pmedata,
946 gmx::PaddedArrayRef<gmx::RVec> *force,
947 ForceWithVirial *forceWithVirial,
948 rvec fshift[],
949 gmx_enerdata_t *enerd,
950 int flags,
951 gmx_wallcycle_t wcycle)
953 bool isPmeGpuDone = false;
954 bool isNbGpuDone = false;
957 gmx::ArrayRef<const gmx::RVec> pmeGpuForces;
959 while (!isPmeGpuDone || !isNbGpuDone)
961 if (!isPmeGpuDone)
963 matrix vir_Q;
964 real Vlr_q;
966 GpuTaskCompletion completionType = (isNbGpuDone) ? GpuTaskCompletion::Wait : GpuTaskCompletion::Check;
967 isPmeGpuDone = pme_gpu_try_finish_task(pmedata, wcycle, &pmeGpuForces,
968 vir_Q, &Vlr_q, completionType);
970 if (isPmeGpuDone)
972 pme_gpu_reduce_outputs(wcycle, forceWithVirial, pmeGpuForces,
973 enerd, vir_Q, Vlr_q);
977 if (!isNbGpuDone)
979 GpuTaskCompletion completionType = (isPmeGpuDone) ? GpuTaskCompletion::Wait : GpuTaskCompletion::Check;
980 wallcycle_start_nocount(wcycle, ewcWAIT_GPU_NB_L);
981 isNbGpuDone = nbnxn_gpu_try_finish_task(nbv->gpu_nbv,
982 flags, eatLocal,
983 enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
984 fshift, completionType);
985 wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
986 // To get the call count right, when the task finished we
987 // issue a start/stop.
988 // TODO: move the ewcWAIT_GPU_NB_L cycle counting into nbnxn_gpu_try_finish_task()
989 // and ewcNB_XF_BUF_OPS counting into nbnxn_atomdata_add_nbat_f_to_f().
990 if (isNbGpuDone)
992 wallcycle_start(wcycle, ewcWAIT_GPU_NB_L);
993 wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
995 nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs, eatLocal,
996 nbv->nbat, as_rvec_array(force->data()), wcycle);
1002 /*! \brief
1003 * Launch the dynamic rolling pruning GPU task.
1005 * We currently alternate local/non-local list pruning in odd-even steps
1006 * (only pruning every second step without DD).
1008 * \param[in] cr The communication record
1009 * \param[in] nbv Nonbonded verlet structure
1010 * \param[in] inputrec The input record
1011 * \param[in] step The current MD step
1013 static inline void launchGpuRollingPruning(const t_commrec *cr,
1014 const nonbonded_verlet_t *nbv,
1015 const t_inputrec *inputrec,
1016 const gmx_int64_t step)
1018 /* We should not launch the rolling pruning kernel at a search
1019 * step or just before search steps, since that's useless.
1020 * Without domain decomposition we prune at even steps.
1021 * With domain decomposition we alternate local and non-local
1022 * pruning at even and odd steps.
1024 int numRollingParts = nbv->listParams->numRollingParts;
1025 GMX_ASSERT(numRollingParts == nbv->listParams->nstlistPrune/2, "Since we alternate local/non-local at even/odd steps, we need numRollingParts<=nstlistPrune/2 for correctness and == for efficiency");
1026 int stepWithCurrentList = step - nbv->grp[eintLocal].nbl_lists.outerListCreationStep;
1027 bool stepIsEven = ((stepWithCurrentList & 1) == 0);
1028 if (stepWithCurrentList > 0 &&
1029 stepWithCurrentList < inputrec->nstlist - 1 &&
1030 (stepIsEven || DOMAINDECOMP(cr)))
1032 nbnxn_gpu_launch_kernel_pruneonly(nbv->gpu_nbv,
1033 stepIsEven ? eintLocal : eintNonlocal,
1034 numRollingParts);
1038 static void do_force_cutsVERLET(FILE *fplog, const t_commrec *cr,
1039 const gmx_multisim_t *ms,
1040 t_inputrec *inputrec,
1041 gmx_int64_t step, t_nrnb *nrnb, gmx_wallcycle_t wcycle,
1042 gmx_localtop_t *top,
1043 gmx_groups_t gmx_unused *groups,
1044 matrix box, gmx::PaddedArrayRef<gmx::RVec> x, history_t *hist,
1045 gmx::PaddedArrayRef<gmx::RVec> force,
1046 tensor vir_force,
1047 t_mdatoms *mdatoms,
1048 gmx_enerdata_t *enerd, t_fcdata *fcd,
1049 real *lambda, t_graph *graph,
1050 t_forcerec *fr, interaction_const_t *ic,
1051 gmx_vsite_t *vsite, rvec mu_tot,
1052 double t, gmx_edsam_t ed,
1053 int flags,
1054 DdOpenBalanceRegionBeforeForceComputation ddOpenBalanceRegion,
1055 DdCloseBalanceRegionAfterForceComputation ddCloseBalanceRegion)
1057 int cg1, i, j;
1058 double mu[2*DIM];
1059 gmx_bool bStateChanged, bNS, bFillGrid, bCalcCGCM;
1060 gmx_bool bDoForces, bUseGPU, bUseOrEmulGPU;
1061 rvec vzero, box_diag;
1062 float cycles_pme, cycles_wait_gpu;
1063 nonbonded_verlet_t *nbv = fr->nbv;
1065 bStateChanged = (flags & GMX_FORCE_STATECHANGED);
1066 bNS = (flags & GMX_FORCE_NS) && (fr->bAllvsAll == FALSE);
1067 bFillGrid = (bNS && bStateChanged);
1068 bCalcCGCM = (bFillGrid && !DOMAINDECOMP(cr));
1069 bDoForces = (flags & GMX_FORCE_FORCES);
1070 bUseGPU = fr->nbv->bUseGPU;
1071 bUseOrEmulGPU = bUseGPU || (fr->nbv->emulateGpu == EmulateGpuNonbonded::Yes);
1073 const auto pmeRunMode = fr->pmedata ? pme_run_mode(fr->pmedata) : PmeRunMode::CPU;
1074 // TODO slim this conditional down - inputrec and duty checks should mean the same in proper code!
1075 const bool useGpuPme = EEL_PME(fr->ic->eeltype) && thisRankHasDuty(cr, DUTY_PME) &&
1076 ((pmeRunMode == PmeRunMode::GPU) || (pmeRunMode == PmeRunMode::Mixed));
1078 /* At a search step we need to start the first balancing region
1079 * somewhere early inside the step after communication during domain
1080 * decomposition (and not during the previous step as usual).
1082 if (bNS &&
1083 ddOpenBalanceRegion == DdOpenBalanceRegionBeforeForceComputation::yes)
1085 ddOpenBalanceRegionCpu(cr->dd, DdAllowBalanceRegionReopen::yes);
1088 cycles_wait_gpu = 0;
1090 const int start = 0;
1091 const int homenr = mdatoms->homenr;
1093 clear_mat(vir_force);
1095 if (DOMAINDECOMP(cr))
1097 cg1 = cr->dd->ncg_tot;
1099 else
1101 cg1 = top->cgs.nr;
1103 if (fr->n_tpi > 0)
1105 cg1--;
1108 if (bStateChanged)
1110 update_forcerec(fr, box);
1112 if (inputrecNeedMutot(inputrec))
1114 /* Calculate total (local) dipole moment in a temporary common array.
1115 * This makes it possible to sum them over nodes faster.
1117 calc_mu(start, homenr,
1118 x, mdatoms->chargeA, mdatoms->chargeB, mdatoms->nChargePerturbed,
1119 mu, mu+DIM);
1123 if (fr->ePBC != epbcNONE)
1125 /* Compute shift vectors every step,
1126 * because of pressure coupling or box deformation!
1128 if ((flags & GMX_FORCE_DYNAMICBOX) && bStateChanged)
1130 calc_shifts(box, fr->shift_vec);
1133 if (bCalcCGCM)
1135 put_atoms_in_box_omp(fr->ePBC, box, x.subArray(0, homenr));
1136 inc_nrnb(nrnb, eNR_SHIFTX, homenr);
1138 else if (EI_ENERGY_MINIMIZATION(inputrec->eI) && graph)
1140 unshift_self(graph, box, as_rvec_array(x.data()));
1144 nbnxn_atomdata_copy_shiftvec(flags & GMX_FORCE_DYNAMICBOX,
1145 fr->shift_vec, nbv->nbat);
1147 #if GMX_MPI
1148 if (!thisRankHasDuty(cr, DUTY_PME))
1150 /* Send particle coordinates to the pme nodes.
1151 * Since this is only implemented for domain decomposition
1152 * and domain decomposition does not use the graph,
1153 * we do not need to worry about shifting.
1155 gmx_pme_send_coordinates(cr, box, as_rvec_array(x.data()),
1156 lambda[efptCOUL], lambda[efptVDW],
1157 (flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY)),
1158 step, wcycle);
1160 #endif /* GMX_MPI */
1162 if (useGpuPme)
1164 launchPmeGpuSpread(fr->pmedata, box, as_rvec_array(x.data()), flags, wcycle);
1167 /* do gridding for pair search */
1168 if (bNS)
1170 if (graph && bStateChanged)
1172 /* Calculate intramolecular shift vectors to make molecules whole */
1173 mk_mshift(fplog, graph, fr->ePBC, box, as_rvec_array(x.data()));
1176 clear_rvec(vzero);
1177 box_diag[XX] = box[XX][XX];
1178 box_diag[YY] = box[YY][YY];
1179 box_diag[ZZ] = box[ZZ][ZZ];
1181 wallcycle_start(wcycle, ewcNS);
1182 if (!fr->bDomDec)
1184 wallcycle_sub_start(wcycle, ewcsNBS_GRID_LOCAL);
1185 nbnxn_put_on_grid(nbv->nbs, fr->ePBC, box,
1186 0, vzero, box_diag,
1187 0, mdatoms->homenr, -1, fr->cginfo, as_rvec_array(x.data()),
1188 0, nullptr,
1189 nbv->grp[eintLocal].kernel_type,
1190 nbv->nbat);
1191 wallcycle_sub_stop(wcycle, ewcsNBS_GRID_LOCAL);
1193 else
1195 wallcycle_sub_start(wcycle, ewcsNBS_GRID_NONLOCAL);
1196 nbnxn_put_on_grid_nonlocal(nbv->nbs, domdec_zones(cr->dd),
1197 fr->cginfo, as_rvec_array(x.data()),
1198 nbv->grp[eintNonlocal].kernel_type,
1199 nbv->nbat);
1200 wallcycle_sub_stop(wcycle, ewcsNBS_GRID_NONLOCAL);
1203 nbnxn_atomdata_set(nbv->nbat, nbv->nbs, mdatoms, fr->cginfo);
1204 wallcycle_stop(wcycle, ewcNS);
1207 /* initialize the GPU atom data and copy shift vector */
1208 if (bUseGPU)
1210 wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
1211 wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1213 if (bNS)
1215 nbnxn_gpu_init_atomdata(nbv->gpu_nbv, nbv->nbat);
1218 nbnxn_gpu_upload_shiftvec(nbv->gpu_nbv, nbv->nbat);
1220 wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1221 wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1224 /* do local pair search */
1225 if (bNS)
1227 wallcycle_start_nocount(wcycle, ewcNS);
1228 wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_LOCAL);
1229 nbnxn_make_pairlist(nbv->nbs, nbv->nbat,
1230 &top->excls,
1231 nbv->listParams->rlistOuter,
1232 nbv->min_ci_balanced,
1233 &nbv->grp[eintLocal].nbl_lists,
1234 eintLocal,
1235 nbv->grp[eintLocal].kernel_type,
1236 nrnb);
1237 nbv->grp[eintLocal].nbl_lists.outerListCreationStep = step;
1238 if (nbv->listParams->useDynamicPruning && !bUseGPU)
1240 nbnxnPrepareListForDynamicPruning(&nbv->grp[eintLocal].nbl_lists);
1242 wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_LOCAL);
1244 if (bUseGPU)
1246 /* initialize local pair-list on the GPU */
1247 nbnxn_gpu_init_pairlist(nbv->gpu_nbv,
1248 nbv->grp[eintLocal].nbl_lists.nbl[0],
1249 eintLocal);
1251 wallcycle_stop(wcycle, ewcNS);
1253 else
1255 nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs, eatLocal, FALSE, as_rvec_array(x.data()),
1256 nbv->nbat, wcycle);
1259 if (bUseGPU)
1261 if (DOMAINDECOMP(cr))
1263 ddOpenBalanceRegionGpu(cr->dd);
1266 wallcycle_start(wcycle, ewcLAUNCH_GPU);
1267 wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1268 /* launch local nonbonded work on GPU */
1269 do_nb_verlet(fr, ic, enerd, flags, eintLocal, enbvClearFNo,
1270 step, nrnb, wcycle);
1271 wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1272 wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1275 if (useGpuPme)
1277 // In PME GPU and mixed mode we launch FFT / gather after the
1278 // X copy/transform to allow overlap as well as after the GPU NB
1279 // launch to avoid FFT launch overhead hijacking the CPU and delaying
1280 // the nonbonded kernel.
1281 launchPmeGpuFftAndGather(fr->pmedata, wcycle);
1284 /* Communicate coordinates and sum dipole if necessary +
1285 do non-local pair search */
1286 if (DOMAINDECOMP(cr))
1288 if (bNS)
1290 wallcycle_start_nocount(wcycle, ewcNS);
1291 wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_NONLOCAL);
1293 nbnxn_make_pairlist(nbv->nbs, nbv->nbat,
1294 &top->excls,
1295 nbv->listParams->rlistOuter,
1296 nbv->min_ci_balanced,
1297 &nbv->grp[eintNonlocal].nbl_lists,
1298 eintNonlocal,
1299 nbv->grp[eintNonlocal].kernel_type,
1300 nrnb);
1301 nbv->grp[eintNonlocal].nbl_lists.outerListCreationStep = step;
1302 if (nbv->listParams->useDynamicPruning && !bUseGPU)
1304 nbnxnPrepareListForDynamicPruning(&nbv->grp[eintNonlocal].nbl_lists);
1306 wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_NONLOCAL);
1308 if (nbv->grp[eintNonlocal].kernel_type == nbnxnk8x8x8_GPU)
1310 /* initialize non-local pair-list on the GPU */
1311 nbnxn_gpu_init_pairlist(nbv->gpu_nbv,
1312 nbv->grp[eintNonlocal].nbl_lists.nbl[0],
1313 eintNonlocal);
1315 wallcycle_stop(wcycle, ewcNS);
1317 else
1319 dd_move_x(cr->dd, box, as_rvec_array(x.data()), wcycle);
1321 nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs, eatNonlocal, FALSE, as_rvec_array(x.data()),
1322 nbv->nbat, wcycle);
1325 if (bUseGPU)
1327 wallcycle_start(wcycle, ewcLAUNCH_GPU);
1328 wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1329 /* launch non-local nonbonded tasks on GPU */
1330 do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFNo,
1331 step, nrnb, wcycle);
1332 wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1333 wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1337 if (bUseGPU)
1339 /* launch D2H copy-back F */
1340 wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
1341 wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1342 if (DOMAINDECOMP(cr))
1344 nbnxn_gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat,
1345 flags, eatNonlocal);
1347 nbnxn_gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat,
1348 flags, eatLocal);
1349 wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1350 wallcycle_stop(wcycle, ewcLAUNCH_GPU);
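    /* Sum the per-rank dipole contributions and form the total dipole; with
     * free-energy perturbation mu_tot is interpolated linearly in
     * lambda[efptCOUL] between the A and B states. */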
1353 if (bStateChanged && inputrecNeedMutot(inputrec))
1355 if (PAR(cr))
1357 gmx_sumd(2*DIM, mu, cr);
1358 ddReopenBalanceRegionCpu(cr->dd);
1361 for (i = 0; i < 2; i++)
1363 for (j = 0; j < DIM; j++)
1365 fr->mu_tot[i][j] = mu[i*DIM + j];
1369 if (fr->efep == efepNO)
1371 copy_rvec(fr->mu_tot[0], mu_tot);
1373 else
1375 for (j = 0; j < DIM; j++)
1377 mu_tot[j] =
1378 (1.0 - lambda[efptCOUL])*fr->mu_tot[0][j] +
1379 lambda[efptCOUL]*fr->mu_tot[1][j];
1383 /* Reset energies */
1384 reset_enerdata(enerd);
1385 clear_rvecs(SHIFTS, fr->fshift);
1387 if (DOMAINDECOMP(cr) && !thisRankHasDuty(cr, DUTY_PME))
1389 wallcycle_start(wcycle, ewcPPDURINGPME);
1390 dd_force_flop_start(cr->dd, nrnb);
1393 if (inputrec->bRot)
1395 wallcycle_start(wcycle, ewcROT);
1396 do_rotation(cr, inputrec, box, as_rvec_array(x.data()), t, step, bNS);
1397 wallcycle_stop(wcycle, ewcROT);
1400 /* Temporary solution until all routines take PaddedRVecVector */
1401 rvec *f = as_rvec_array(force.data());
1403 /* Start the force cycle counter.
1404 * Note that a different counter is used for dynamic load balancing.
1406 wallcycle_start(wcycle, ewcFORCE);
1408 gmx::ArrayRef<gmx::RVec> forceRef = force;
1409 if (bDoForces)
1411 /* If we need to compute the virial, we might need a separate
1412 * force buffer for algorithms for which the virial is calculated
1413 * directly, such as PME.
1415 if ((flags & GMX_FORCE_VIRIAL) && fr->haveDirectVirialContributions)
1417 forceRef = *fr->forceBufferForDirectVirialContributions;
1419 /* We only compute forces on local atoms. Note that vsites can
1420 * spread to non-local atoms, but that part of the buffer is
1421 * cleared separately in the vsite spreading code.
1423 clear_rvecs_omp(mdatoms->homenr, as_rvec_array(forceRef.data()));
1425 /* Clear the short- and long-range forces */
1426 clear_rvecs_omp(fr->natoms_force_constr, f);
1429 /* forceWithVirial uses the local atom range only */
1430 ForceWithVirial forceWithVirial(forceRef, flags & GMX_FORCE_VIRIAL);
1432 if (inputrec->bPull && pull_have_constraint(inputrec->pull_work))
1434 clear_pull_forces(inputrec->pull_work);
1437     /* When the non-bonded forces are computed on the CPU, we calculate them here.
1438 * We do this before calling do_force_lowlevel, because in that
1439 * function, the listed forces are calculated before PME, which
1440 * does communication. With this order, non-bonded and listed
1441 * force calculation imbalance can be balanced out by the domain
1442 * decomposition load balancing.
1445 if (!bUseOrEmulGPU)
1447 do_nb_verlet(fr, ic, enerd, flags, eintLocal, enbvClearFYes,
1448 step, nrnb, wcycle);
1451 if (fr->efep != efepNO)
1453 /* Calculate the local and non-local free energy interactions here.
1454 * Happens here on the CPU both with and without GPU.
1456 if (fr->nbv->grp[eintLocal].nbl_lists.nbl_fep[0]->nrj > 0)
1458 do_nb_verlet_fep(&fr->nbv->grp[eintLocal].nbl_lists,
1459 fr, as_rvec_array(x.data()), f, mdatoms,
1460 inputrec->fepvals, lambda,
1461 enerd, flags, nrnb, wcycle);
1464 if (DOMAINDECOMP(cr) &&
1465 fr->nbv->grp[eintNonlocal].nbl_lists.nbl_fep[0]->nrj > 0)
1467 do_nb_verlet_fep(&fr->nbv->grp[eintNonlocal].nbl_lists,
1468 fr, as_rvec_array(x.data()), f, mdatoms,
1469 inputrec->fepvals, lambda,
1470 enerd, flags, nrnb, wcycle);
1474 if (!bUseOrEmulGPU)
1476 int aloc;
1478 if (DOMAINDECOMP(cr))
1480 do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFNo,
1481 step, nrnb, wcycle);
1484 if (!bUseOrEmulGPU)
1486 aloc = eintLocal;
1488 else
1490 aloc = eintNonlocal;
1493         /* Add all the non-bonded forces to the normal force array.
1494 * This can be split into a local and a non-local part when overlapping
1495 * communication with calculation with domain decomposition.
1497 wallcycle_stop(wcycle, ewcFORCE);
1499 nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs, eatAll, nbv->nbat, f, wcycle);
1501 wallcycle_start_nocount(wcycle, ewcFORCE);
1503 /* if there are multiple fshift output buffers reduce them */
1504 if ((flags & GMX_FORCE_VIRIAL) &&
1505 nbv->grp[aloc].nbl_lists.nnbl > 1)
1507 /* This is not in a subcounter because it takes a
1508 negligible and constant-sized amount of time */
1509 nbnxn_atomdata_add_nbat_fshift_to_fshift(nbv->nbat,
1510 fr->fshift);
1514 /* update QMMMrec, if necessary */
1515 if (fr->bQMMM)
1517 update_QMMMrec(cr, fr, as_rvec_array(x.data()), mdatoms, box);
1520 /* Compute the bonded and non-bonded energies and optionally forces */
1521 do_force_lowlevel(fr, inputrec, &(top->idef),
1522 cr, ms, nrnb, wcycle, mdatoms,
1523 as_rvec_array(x.data()), hist, f, &forceWithVirial, enerd, fcd,
1524 box, inputrec->fepvals, lambda, graph, &(top->excls), fr->mu_tot,
1525 flags, &cycles_pme);
1527 wallcycle_stop(wcycle, ewcFORCE);
1529 computeSpecialForces(fplog, cr, inputrec, step, t, wcycle,
1530 fr->forceProviders, box, as_rvec_array(x.data()), mdatoms, lambda,
1531 flags, &forceWithVirial, enerd,
1532 ed, bNS);
1534 if (bUseOrEmulGPU)
1536 /* wait for non-local forces (or calculate in emulation mode) */
1537 if (DOMAINDECOMP(cr))
1539 if (bUseGPU)
1541 wallcycle_start(wcycle, ewcWAIT_GPU_NB_NL);
1542 nbnxn_gpu_wait_finish_task(nbv->gpu_nbv,
1543 flags, eatNonlocal,
1544 enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
1545 fr->fshift);
1546 cycles_wait_gpu += wallcycle_stop(wcycle, ewcWAIT_GPU_NB_NL);
1548 else
1550 wallcycle_start_nocount(wcycle, ewcFORCE);
1551 do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFYes,
1552 step, nrnb, wcycle);
1553 wallcycle_stop(wcycle, ewcFORCE);
1556 /* skip the reduction if there was no non-local work to do */
1557 if (nbv->grp[eintNonlocal].nbl_lists.nbl[0]->nsci > 0)
1559 nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs, eatNonlocal,
1560 nbv->nbat, f, wcycle);
1565 if (DOMAINDECOMP(cr))
1567 /* We are done with the CPU compute.
1568 * We will now communicate the non-local forces.
1569 * If we use a GPU this will overlap with GPU work, so in that case
1570 * we do not close the DD force balancing region here.
1572 if (ddCloseBalanceRegion == DdCloseBalanceRegionAfterForceComputation::yes)
1574 ddCloseBalanceRegionCpu(cr->dd);
1576 if (bDoForces)
1578 dd_move_f(cr->dd, f, fr->fshift, wcycle);
1582     // With both nonbonded and PME offloaded to a GPU on the same rank, we use
1583 // an alternating wait/reduction scheme.
1584 bool alternateGpuWait = (!c_disableAlternatingWait && useGpuPme && bUseGPU && !DOMAINDECOMP(cr));
1585 if (alternateGpuWait)
1587 alternatePmeNbGpuWaitReduce(fr->nbv, fr->pmedata, &force, &forceWithVirial, fr->fshift, enerd, flags, wcycle);
1590 if (!alternateGpuWait && useGpuPme)
1592 gmx::ArrayRef<const gmx::RVec> pmeGpuForces;
1593 matrix vir_Q;
1594 real Vlr_q = 0.0;
1595 pme_gpu_wait_finish_task(fr->pmedata, wcycle, &pmeGpuForces, vir_Q, &Vlr_q);
1596 pme_gpu_reduce_outputs(wcycle, &forceWithVirial, pmeGpuForces, enerd, vir_Q, Vlr_q);
1599 /* Wait for local GPU NB outputs on the non-alternating wait path */
1600 if (!alternateGpuWait && bUseGPU)
1602 /* Measured overhead on CUDA and OpenCL with(out) GPU sharing
1603 * is between 0.5 and 1.5 Mcycles. So 2 MCycles is an overestimate,
1604 * but even with a step of 0.1 ms the difference is less than 1%
1605 * of the step time.
1607 const float gpuWaitApiOverheadMargin = 2e6f; /* cycles */
1609 wallcycle_start(wcycle, ewcWAIT_GPU_NB_L);
1610 nbnxn_gpu_wait_finish_task(nbv->gpu_nbv,
1611 flags, eatLocal,
1612 enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
1613 fr->fshift);
1614 float cycles_tmp = wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
1616 if (ddCloseBalanceRegion == DdCloseBalanceRegionAfterForceComputation::yes)
1618 DdBalanceRegionWaitedForGpu waitedForGpu = DdBalanceRegionWaitedForGpu::yes;
1619 if (bDoForces && cycles_tmp <= gpuWaitApiOverheadMargin)
1621                 /* We measured only a few cycles; it could be that the kernel
1622 * and transfer finished earlier and there was no actual
1623 * wait time, only API call overhead.
1624 * Then the actual time could be anywhere between 0 and
1625 * cycles_wait_est. We will use half of cycles_wait_est.
1627 waitedForGpu = DdBalanceRegionWaitedForGpu::no;
1629 ddCloseBalanceRegionGpu(cr->dd, cycles_wait_gpu, waitedForGpu);
1633 if (fr->nbv->emulateGpu == EmulateGpuNonbonded::Yes)
1635 // NOTE: emulation kernel is not included in the balancing region,
1636 // but emulation mode does not target performance anyway
1637 wallcycle_start_nocount(wcycle, ewcFORCE);
1638 do_nb_verlet(fr, ic, enerd, flags, eintLocal,
1639 DOMAINDECOMP(cr) ? enbvClearFNo : enbvClearFYes,
1640 step, nrnb, wcycle);
1641 wallcycle_stop(wcycle, ewcFORCE);
1644 if (bUseGPU)
1646 /* now clear the GPU outputs while we finish the step on the CPU */
1647 wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
1648 wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1649 nbnxn_gpu_clear_outputs(nbv->gpu_nbv, flags);
1651 /* Is dynamic pair-list pruning activated? */
1652 if (nbv->listParams->useDynamicPruning)
1654 launchGpuRollingPruning(cr, nbv, inputrec, step);
1656 wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1657 wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1659 // TODO: move here the PME buffer clearing call pme_gpu_reinit_computation()
1662 /* Do the nonbonded GPU (or emulation) force buffer reduction
1663 * on the non-alternating path. */
1664 if (bUseOrEmulGPU && !alternateGpuWait)
1666 nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs, eatLocal,
1667 nbv->nbat, f, wcycle);
1670 if (DOMAINDECOMP(cr))
1672 dd_force_flop_stop(cr->dd, nrnb);
1675 if (bDoForces)
1677 /* If we have NoVirSum forces, but we do not calculate the virial,
1678 * we sum fr->f_novirsum=f later.
1680 if (vsite && !(fr->haveDirectVirialContributions && !(flags & GMX_FORCE_VIRIAL)))
1682 spread_vsite_f(vsite, as_rvec_array(x.data()), f, fr->fshift, FALSE, nullptr, nrnb,
1683 &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr, wcycle);
1686 if (flags & GMX_FORCE_VIRIAL)
1688 /* Calculation of the virial must be done after vsites! */
1689 calc_virial(0, mdatoms->homenr, as_rvec_array(x.data()), f,
1690 vir_force, graph, box, nrnb, fr, inputrec->ePBC);
1694 if (PAR(cr) && !thisRankHasDuty(cr, DUTY_PME))
1696 /* In case of node-splitting, the PP nodes receive the long-range
1697 * forces, virial and energy from the PME nodes here.
1699 pme_receive_force_ener(cr, &forceWithVirial, enerd, wcycle);
1702 if (bDoForces)
1704 post_process_forces(cr, step, nrnb, wcycle,
1705 top, box, as_rvec_array(x.data()), f, &forceWithVirial,
1706 vir_force, mdatoms, graph, fr, vsite,
1707 flags);
1710 if (flags & GMX_FORCE_ENERGY)
1712 /* Sum the potential energy terms from group contributions */
1713 sum_epot(&(enerd->grpp), enerd->term);
1715 if (!EI_TPI(inputrec->eI))
1717 checkPotentialEnergyValidity(step, *enerd, *inputrec);
1722 static void do_force_cutsGROUP(FILE *fplog, const t_commrec *cr,
1723 const gmx_multisim_t *ms,
1724 t_inputrec *inputrec,
1725 gmx_int64_t step, t_nrnb *nrnb, gmx_wallcycle_t wcycle,
1726 gmx_localtop_t *top,
1727 gmx_groups_t *groups,
1728 matrix box, gmx::PaddedArrayRef<gmx::RVec> x, history_t *hist,
1729 gmx::PaddedArrayRef<gmx::RVec> force,
1730 tensor vir_force,
1731 t_mdatoms *mdatoms,
1732 gmx_enerdata_t *enerd, t_fcdata *fcd,
1733 real *lambda, t_graph *graph,
1734 t_forcerec *fr, gmx_vsite_t *vsite, rvec mu_tot,
1735 double t, gmx_edsam_t ed,
1736 int flags,
1737 DdOpenBalanceRegionBeforeForceComputation ddOpenBalanceRegion,
1738 DdCloseBalanceRegionAfterForceComputation ddCloseBalanceRegion)
1740 int cg0, cg1, i, j;
1741 double mu[2*DIM];
1742 gmx_bool bStateChanged, bNS, bFillGrid, bCalcCGCM;
1743 gmx_bool bDoForces;
1744 float cycles_pme;
1746 const int start = 0;
1747 const int homenr = mdatoms->homenr;
1749 clear_mat(vir_force);
1751 cg0 = 0;
1752 if (DOMAINDECOMP(cr))
1754 cg1 = cr->dd->ncg_tot;
1756 else
1758 cg1 = top->cgs.nr;
1760 if (fr->n_tpi > 0)
1762 cg1--;
1765 bStateChanged = (flags & GMX_FORCE_STATECHANGED);
1766 bNS = (flags & GMX_FORCE_NS) && (fr->bAllvsAll == FALSE);
1767     /* Should we perform the long-range nonbonded evaluation inside the neighbor searching? */
1768 bFillGrid = (bNS && bStateChanged);
1769 bCalcCGCM = (bFillGrid && !DOMAINDECOMP(cr));
1770 bDoForces = (flags & GMX_FORCE_FORCES);
1772 if (bStateChanged)
1774 update_forcerec(fr, box);
1776 if (inputrecNeedMutot(inputrec))
1778 /* Calculate total (local) dipole moment in a temporary common array.
1779 * This makes it possible to sum them over nodes faster.
1781 calc_mu(start, homenr,
1782 x, mdatoms->chargeA, mdatoms->chargeB, mdatoms->nChargePerturbed,
1783 mu, mu+DIM);
1787 if (fr->ePBC != epbcNONE)
1789 /* Compute shift vectors every step,
1790 * because of pressure coupling or box deformation!
1792 if ((flags & GMX_FORCE_DYNAMICBOX) && bStateChanged)
1794 calc_shifts(box, fr->shift_vec);
1797 if (bCalcCGCM)
1799 put_charge_groups_in_box(fplog, cg0, cg1, fr->ePBC, box,
1800 &(top->cgs), as_rvec_array(x.data()), fr->cg_cm);
1801 inc_nrnb(nrnb, eNR_CGCM, homenr);
1802 inc_nrnb(nrnb, eNR_RESETX, cg1-cg0);
1804 else if (EI_ENERGY_MINIMIZATION(inputrec->eI) && graph)
1806 unshift_self(graph, box, as_rvec_array(x.data()));
1809 else if (bCalcCGCM)
1811 calc_cgcm(fplog, cg0, cg1, &(top->cgs), as_rvec_array(x.data()), fr->cg_cm);
1812 inc_nrnb(nrnb, eNR_CGCM, homenr);
1815 if (bCalcCGCM && gmx_debug_at)
1817 pr_rvecs(debug, 0, "cgcm", fr->cg_cm, top->cgs.nr);
1820 #if GMX_MPI
1821 if (!thisRankHasDuty(cr, DUTY_PME))
1823 /* Send particle coordinates to the pme nodes.
1824 * Since this is only implemented for domain decomposition
1825 * and domain decomposition does not use the graph,
1826 * we do not need to worry about shifting.
1828 gmx_pme_send_coordinates(cr, box, as_rvec_array(x.data()),
1829 lambda[efptCOUL], lambda[efptVDW],
1830 (flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY)),
1831 step, wcycle);
1833 #endif /* GMX_MPI */
1835 /* Communicate coordinates and sum dipole if necessary */
1836 if (DOMAINDECOMP(cr))
1838 dd_move_x(cr->dd, box, as_rvec_array(x.data()), wcycle);
1840 /* No GPU support, no move_x overlap, so reopen the balance region here */
1841 if (ddOpenBalanceRegion == DdOpenBalanceRegionBeforeForceComputation::yes)
1843 ddReopenBalanceRegionCpu(cr->dd);
1847 if (inputrecNeedMutot(inputrec))
1849 if (bStateChanged)
1851 if (PAR(cr))
1853 gmx_sumd(2*DIM, mu, cr);
1854 ddReopenBalanceRegionCpu(cr->dd);
1856 for (i = 0; i < 2; i++)
1858 for (j = 0; j < DIM; j++)
1860 fr->mu_tot[i][j] = mu[i*DIM + j];
1864 if (fr->efep == efepNO)
1866 copy_rvec(fr->mu_tot[0], mu_tot);
1868 else
1870 for (j = 0; j < DIM; j++)
1872 mu_tot[j] =
1873 (1.0 - lambda[efptCOUL])*fr->mu_tot[0][j] + lambda[efptCOUL]*fr->mu_tot[1][j];
1878 /* Reset energies */
1879 reset_enerdata(enerd);
1880 clear_rvecs(SHIFTS, fr->fshift);
1882 if (bNS)
1884 wallcycle_start(wcycle, ewcNS);
1886 if (graph && bStateChanged)
1888 /* Calculate intramolecular shift vectors to make molecules whole */
1889 mk_mshift(fplog, graph, fr->ePBC, box, as_rvec_array(x.data()));
1892 /* Do the actual neighbour searching */
1893 ns(fplog, fr, box,
1894 groups, top, mdatoms,
1895 cr, nrnb, bFillGrid);
1897 wallcycle_stop(wcycle, ewcNS);
1900 if (DOMAINDECOMP(cr) && !thisRankHasDuty(cr, DUTY_PME))
1902 wallcycle_start(wcycle, ewcPPDURINGPME);
1903 dd_force_flop_start(cr->dd, nrnb);
1906 if (inputrec->bRot)
1908 wallcycle_start(wcycle, ewcROT);
1909 do_rotation(cr, inputrec, box, as_rvec_array(x.data()), t, step, bNS);
1910 wallcycle_stop(wcycle, ewcROT);
1913 /* Temporary solution until all routines take PaddedRVecVector */
1914 rvec *f = as_rvec_array(force.data());
1916 /* Start the force cycle counter.
1917 * Note that a different counter is used for dynamic load balancing.
1919 wallcycle_start(wcycle, ewcFORCE);
1921 gmx::ArrayRef<gmx::RVec> forceRef = force;
1922 if (bDoForces)
1924 /* If we need to compute the virial, we might need a separate
1925 * force buffer for algorithms for which the virial is calculated
1926 * separately, such as PME.
1928 if ((flags & GMX_FORCE_VIRIAL) && fr->haveDirectVirialContributions)
1930 forceRef = *fr->forceBufferForDirectVirialContributions;
1932 clear_rvecs_omp(forceRef.size(), as_rvec_array(forceRef.data()));
1935 /* Clear the short- and long-range forces */
1936 clear_rvecs(fr->natoms_force_constr, f);
1939 /* forceWithVirial might need the full force atom range */
1940 ForceWithVirial forceWithVirial(forceRef, flags & GMX_FORCE_VIRIAL);
1942 if (inputrec->bPull && pull_have_constraint(inputrec->pull_work))
1944 clear_pull_forces(inputrec->pull_work);
1947 /* update QMMMrec, if necessary */
1948 if (fr->bQMMM)
1950 update_QMMMrec(cr, fr, as_rvec_array(x.data()), mdatoms, box);
1953 /* Compute the bonded and non-bonded energies and optionally forces */
1954 do_force_lowlevel(fr, inputrec, &(top->idef),
1955 cr, ms, nrnb, wcycle, mdatoms,
1956 as_rvec_array(x.data()), hist, f, &forceWithVirial, enerd, fcd,
1957 box, inputrec->fepvals, lambda,
1958 graph, &(top->excls), fr->mu_tot,
1959 flags,
1960 &cycles_pme);
1962 wallcycle_stop(wcycle, ewcFORCE);
1964 if (DOMAINDECOMP(cr))
1966 dd_force_flop_stop(cr->dd, nrnb);
1968 if (ddCloseBalanceRegion == DdCloseBalanceRegionAfterForceComputation::yes)
1970 ddCloseBalanceRegionCpu(cr->dd);
1974 computeSpecialForces(fplog, cr, inputrec, step, t, wcycle,
1975 fr->forceProviders, box, as_rvec_array(x.data()), mdatoms, lambda,
1976 flags, &forceWithVirial, enerd,
1977 ed, bNS);
1979 if (bDoForces)
1981 /* Communicate the forces */
1982 if (DOMAINDECOMP(cr))
1984 dd_move_f(cr->dd, f, fr->fshift, wcycle);
1985 /* Do we need to communicate the separate force array
1986 * for terms that do not contribute to the single sum virial?
1987 * Position restraints and electric fields do not introduce
1988 * inter-cg forces, only full electrostatics methods do.
1989 * When we do not calculate the virial, fr->f_novirsum = f,
1990 * so we have already communicated these forces.
1992 if (EEL_FULL(fr->ic->eeltype) && cr->dd->n_intercg_excl &&
1993 (flags & GMX_FORCE_VIRIAL))
1995 dd_move_f(cr->dd, as_rvec_array(forceWithVirial.force_.data()), nullptr, wcycle);
1999 /* If we have NoVirSum forces, but we do not calculate the virial,
2000 * we sum fr->f_novirsum=f later.
2002 if (vsite && !(fr->haveDirectVirialContributions && !(flags & GMX_FORCE_VIRIAL)))
2004 spread_vsite_f(vsite, as_rvec_array(x.data()), f, fr->fshift, FALSE, nullptr, nrnb,
2005 &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr, wcycle);
2008 if (flags & GMX_FORCE_VIRIAL)
2010 /* Calculation of the virial must be done after vsites! */
2011 calc_virial(0, mdatoms->homenr, as_rvec_array(x.data()), f,
2012 vir_force, graph, box, nrnb, fr, inputrec->ePBC);
2016 if (PAR(cr) && !thisRankHasDuty(cr, DUTY_PME))
2018 /* In case of node-splitting, the PP nodes receive the long-range
2019 * forces, virial and energy from the PME nodes here.
2021 pme_receive_force_ener(cr, &forceWithVirial, enerd, wcycle);
2024 if (bDoForces)
2026 post_process_forces(cr, step, nrnb, wcycle,
2027 top, box, as_rvec_array(x.data()), f, &forceWithVirial,
2028 vir_force, mdatoms, graph, fr, vsite,
2029 flags);
2032 if (flags & GMX_FORCE_ENERGY)
2034 /* Sum the potential energy terms from group contributions */
2035 sum_epot(&(enerd->grpp), enerd->term);
2037 if (!EI_TPI(inputrec->eI))
2039 checkPotentialEnergyValidity(step, *enerd, *inputrec);
2045 void do_force(FILE *fplog, const t_commrec *cr,
2046 const gmx_multisim_t *ms,
2047 t_inputrec *inputrec,
2048 gmx_int64_t step, t_nrnb *nrnb, gmx_wallcycle_t wcycle,
2049 gmx_localtop_t *top,
2050 gmx_groups_t *groups,
2051 matrix box, gmx::PaddedArrayRef<gmx::RVec> x, history_t *hist,
2052 gmx::PaddedArrayRef<gmx::RVec> force,
2053 tensor vir_force,
2054 t_mdatoms *mdatoms,
2055 gmx_enerdata_t *enerd, t_fcdata *fcd,
2056 gmx::ArrayRef<real> lambda, t_graph *graph,
2057 t_forcerec *fr,
2058 gmx_vsite_t *vsite, rvec mu_tot,
2059 double t, gmx_edsam_t ed,
2060 int flags,
2061 DdOpenBalanceRegionBeforeForceComputation ddOpenBalanceRegion,
2062 DdCloseBalanceRegionAfterForceComputation ddCloseBalanceRegion)
2064 /* modify force flag if not doing nonbonded */
2065 if (!fr->bNonbonded)
2067 flags &= ~GMX_FORCE_NONBONDED;
2070 GMX_ASSERT(x.size() >= gmx::paddedRVecVectorSize(fr->natoms_force), "coordinates should be padded");
2071 GMX_ASSERT(force.size() >= gmx::paddedRVecVectorSize(fr->natoms_force), "force should be padded");
2073 switch (inputrec->cutoff_scheme)
2075 case ecutsVERLET:
2076 do_force_cutsVERLET(fplog, cr, ms, inputrec,
2077 step, nrnb, wcycle,
2078 top,
2079 groups,
2080 box, x, hist,
2081 force, vir_force,
2082 mdatoms,
2083 enerd, fcd,
2084 lambda.data(), graph,
2085 fr, fr->ic,
2086 vsite, mu_tot,
2087 t, ed,
2088 flags,
2089 ddOpenBalanceRegion,
2090 ddCloseBalanceRegion);
2091 break;
2092 case ecutsGROUP:
2093 do_force_cutsGROUP(fplog, cr, ms, inputrec,
2094 step, nrnb, wcycle,
2095 top,
2096 groups,
2097 box, x, hist,
2098 force, vir_force,
2099 mdatoms,
2100 enerd, fcd,
2101 lambda.data(), graph,
2102 fr, vsite, mu_tot,
2103 t, ed,
2104 flags,
2105 ddOpenBalanceRegion,
2106 ddCloseBalanceRegion);
2107 break;
2108 default:
2109 gmx_incons("Invalid cut-off scheme passed!");
2112 /* In case we don't have constraints and are using GPUs, the next balancing
2113 * region starts here.
2114 * Some "special" work at the end of do_force_cuts?, such as vsite spread,
2115 * virial calculation and COM pulling, is thus not included in
2116 * the balance timing, which is ok as most tasks do communication.
2118 if (ddOpenBalanceRegion == DdOpenBalanceRegionBeforeForceComputation::yes)
2120 ddOpenBalanceRegionCpu(cr->dd, DdAllowBalanceRegionReopen::no);
2125 void do_constrain_first(FILE *fplog, Constraints *constr,
2126 t_inputrec *ir, t_mdatoms *md,
2127 t_state *state, const t_commrec *cr,
2128 const gmx_multisim_t *ms,
2129 t_nrnb *nrnb,
2130 t_forcerec *fr, gmx_localtop_t *top)
2132 int i, m, start, end;
2133 gmx_int64_t step;
2134 real dt = ir->delta_t;
2135 real dvdl_dum;
2136 rvec *savex;
2138 /* We need to allocate one element extra, since we might use
2139 * (unaligned) 4-wide SIMD loads to access rvec entries.
2141 snew(savex, state->natoms + 1);
2143 start = 0;
2144 end = md->homenr;
2146 if (debug)
2148 fprintf(debug, "vcm: start=%d, homenr=%d, end=%d\n",
2149 start, md->homenr, end);
2151 /* Do a first constraint pass to reset the particles... */
2152 step = ir->init_step;
2153 if (fplog)
2155 char buf[STEPSTRSIZE];
2156 fprintf(fplog, "\nConstraining the starting coordinates (step %s)\n",
2157 gmx_step_str(step, buf));
2159 dvdl_dum = 0;
2161 /* constrain the current position */
2162 constrain(nullptr, TRUE, FALSE, constr, &(top->idef),
2163 ir, cr, ms, step, 0, 1.0, md,
2164 as_rvec_array(state->x.data()), as_rvec_array(state->x.data()), nullptr,
2165 fr->bMolPBC, state->box,
2166 state->lambda[efptBONDED], &dvdl_dum,
2167 nullptr, nullptr, nrnb, econqCoord);
2168 if (EI_VV(ir->eI))
2170 /* constrain the initial velocity, and save it */
2171 /* it may also be useful if we need the ekin from the half step for velocity Verlet */
2172 constrain(nullptr, TRUE, FALSE, constr, &(top->idef),
2173 ir, cr, ms, step, 0, 1.0, md,
2174 as_rvec_array(state->x.data()), as_rvec_array(state->v.data()), as_rvec_array(state->v.data()),
2175 fr->bMolPBC, state->box,
2176 state->lambda[efptBONDED], &dvdl_dum,
2177 nullptr, nullptr, nrnb, econqVeloc);
2179 /* constrain the initial velocities at t-dt/2 */
2180 if (EI_STATE_VELOCITY(ir->eI) && ir->eI != eiVV)
2182 for (i = start; (i < end); i++)
2184 for (m = 0; (m < DIM); m++)
2186 /* Reverse the velocity */
2187 state->v[i][m] = -state->v[i][m];
2188 /* Store the position at t-dt in savex */
2189 savex[i][m] = state->x[i][m] + dt*state->v[i][m];
2192 /* Shake the positions at t=-dt with the positions at t=0
2193 * as reference coordinates.
2195 if (fplog)
2197 char buf[STEPSTRSIZE];
2198 fprintf(fplog, "\nConstraining the coordinates at t0-dt (step %s)\n",
2199 gmx_step_str(step, buf));
2201 dvdl_dum = 0;
2202 constrain(nullptr, TRUE, FALSE, constr, &(top->idef),
2203 ir, cr, ms, step, -1, 1.0, md,
2204 as_rvec_array(state->x.data()), savex, nullptr,
2205 fr->bMolPBC, state->box,
2206 state->lambda[efptBONDED], &dvdl_dum,
2207 as_rvec_array(state->v.data()), nullptr, nrnb, econqCoord);
2209 for (i = start; i < end; i++)
2211 for (m = 0; m < DIM; m++)
2213 /* Re-reverse the velocities */
2214 state->v[i][m] = -state->v[i][m];
2218 sfree(savex);
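/* The loop above reverses the velocities and sets savex = x + dt*(-v), i.e. it
 * back-extrapolates to x(t0 - dt) = x(t0) - dt*v(t0); constraining savex against
 * x(t0) then yields constrained velocities at the t0 - dt/2 half step.
 * A minimal standalone sketch of that arithmetic (guarded out of the build;
 * names and values below are illustrative only, not GROMACS API):
 */
#if 0
#include <cstdio>

static void backExtrapolate(double x0, double v0, double dt, double *xPrev, double *vReversed)
{
    *vReversed = -v0;                   /* reverse the velocity            */
    *xPrev     = x0 + dt*(*vReversed);  /* x(t0 - dt) = x(t0) - dt*v(t0)   */
}

int main()
{
    double xPrev, vRev;
    backExtrapolate(1.0, 0.25, 0.002, &xPrev, &vRev);
    std::printf("x(t0-dt) = %g, reversed v = %g\n", xPrev, vRev);  /* 0.9995, -0.25 */
    return 0;
}
#endif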
2222 static void
2223 integrate_table(real vdwtab[], real scale, int offstart, int rstart, int rend,
2224 double *enerout, double *virout)
2226 double enersum, virsum;
2227 double invscale, invscale2, invscale3;
2228 double r, ea, eb, ec, pa, pb, pc, pd;
2229 double y0, f, g, h;
2230 int ri, offset;
2231 double tabfactor;
2233 invscale = 1.0/scale;
2234 invscale2 = invscale*invscale;
2235 invscale3 = invscale*invscale2;
2237 /* Following summation derived from cubic spline definition,
2238 * Numerical Recipes in C, second edition, p. 113-116. Exact for
2239 * the cubic spline. We first calculate the negative of the
2240 * energy from rvdw to rvdw_switch, assuming that g(r)=1, and then
2241 * add the more standard, abrupt cutoff correction to that result,
2242 * yielding the long-range correction for a switched function. We
2243 * perform both the pressure and energy loops at the same time for
2244 * simplicity, as the computational cost is low. */
2246 if (offstart == 0)
2248 /* Since the dispersion table has been scaled down a factor
2249 * 6.0 and the repulsion a factor 12.0 to compensate for the
2250 * c6/c12 parameters inside nbfp[] being scaled up (to save
2251 * flops in kernels), we need to correct for this.
2253 tabfactor = 6.0;
2255 else
2257 tabfactor = 12.0;
2260 enersum = 0.0;
2261 virsum = 0.0;
2262 for (ri = rstart; ri < rend; ++ri)
2264 r = ri*invscale;
2265 ea = invscale3;
2266 eb = 2.0*invscale2*r;
2267 ec = invscale*r*r;
2269 pa = invscale3;
2270 pb = 3.0*invscale2*r;
2271 pc = 3.0*invscale*r*r;
2272 pd = r*r*r;
2274 /* this "8" is the vdwtab stride: 4 spline coefficients for dispersion
2275 followed by 4 for repulsion per table point - perhaps it should be a named constant? */
2277 offset = 8*ri + offstart;
2278 y0 = vdwtab[offset];
2279 f = vdwtab[offset+1];
2280 g = vdwtab[offset+2];
2281 h = vdwtab[offset+3];
2283 enersum += y0*(ea/3 + eb/2 + ec) + f*(ea/4 + eb/3 + ec/2) + g*(ea/5 + eb/4 + ec/3) + h*(ea/6 + eb/5 + ec/4);
2284 virsum += f*(pa/4 + pb/3 + pc/2 + pd) + 2*g*(pa/5 + pb/4 + pc/3 + pd/2) + 3*h*(pa/6 + pb/5 + pc/4 + pd/3);
2286 *enerout = 4.0*M_PI*enersum*tabfactor;
2287 *virout = 4.0*M_PI*virsum*tabfactor;
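/* The bin weights above, y0*(ea/3 + eb/2 + ec) + f*(ea/4 + eb/3 + ec/2) + ...,
 * are the exact integral over one table bin of r'^2 times the cubic interpolant:
 * with r' = r + eps/scale and eps in [0,1],
 *   (r')^2 d(r') = (ec + eb*eps + ea*eps^2) d(eps),
 * so integrating eps^n against that factor gives ea/(n+3) + eb/(n+2) + ec/(n+1).
 * A standalone sketch checking the closed form against brute-force quadrature
 * (guarded out of the build; the spline coefficients below are made up):
 */
#if 0
#include <cstdio>

int main()
{
    const double scale = 500.0, r = 1.2;
    const double y0 = 0.3, f = -0.1, g = 0.05, h = 0.01;   /* made-up spline data */
    const double invscale = 1.0/scale, invscale2 = invscale*invscale, invscale3 = invscale*invscale2;
    const double ea = invscale3, eb = 2.0*invscale2*r, ec = invscale*r*r;
    /* Closed form, as used in integrate_table() */
    const double closedForm = y0*(ea/3 + eb/2 + ec) + f*(ea/4 + eb/3 + ec/2)
                            + g*(ea/5 + eb/4 + ec/3) + h*(ea/6 + eb/5 + ec/4);
    /* Midpoint quadrature of (r + eps/scale)^2 * (y0 + f*eps + g*eps^2 + h*eps^3) / scale */
    double    numeric = 0;
    const int n       = 100000;
    for (int i = 0; i < n; i++)
    {
        double eps = (i + 0.5)/n;
        double rr  = r + eps*invscale;
        numeric   += rr*rr*(y0 + eps*(f + eps*(g + eps*h)))*invscale/n;
    }
    std::printf("closed form %.12e  quadrature %.12e\n", closedForm, numeric);
    return 0;
}
#endif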
2290 void calc_enervirdiff(FILE *fplog, int eDispCorr, t_forcerec *fr)
2292 double eners[2], virs[2], enersum, virsum;
2293 double r0, rc3, rc9;
2294 int ri0, ri1, i;
2295 real scale, *vdwtab;
2297 fr->enershiftsix = 0;
2298 fr->enershifttwelve = 0;
2299 fr->enerdiffsix = 0;
2300 fr->enerdifftwelve = 0;
2301 fr->virdiffsix = 0;
2302 fr->virdifftwelve = 0;
2304 const interaction_const_t *ic = fr->ic;
2306 if (eDispCorr != edispcNO)
2308 for (i = 0; i < 2; i++)
2310 eners[i] = 0;
2311 virs[i] = 0;
2313 if ((ic->vdw_modifier == eintmodPOTSHIFT) ||
2314 (ic->vdw_modifier == eintmodPOTSWITCH) ||
2315 (ic->vdw_modifier == eintmodFORCESWITCH) ||
2316 (ic->vdwtype == evdwSHIFT) ||
2317 (ic->vdwtype == evdwSWITCH))
2319 if (((ic->vdw_modifier == eintmodPOTSWITCH) ||
2320 (ic->vdw_modifier == eintmodFORCESWITCH) ||
2321 (ic->vdwtype == evdwSWITCH)) && ic->rvdw_switch == 0)
2323 gmx_fatal(FARGS,
2324 "With dispersion correction rvdw-switch can not be zero "
2325 "for vdw-type = %s", evdw_names[ic->vdwtype]);
2328 /* TODO This code depends on the logic in tables.c that
2329 constructs the table layout, which should be made
2330 explicit in future cleanup. */
2331 GMX_ASSERT(fr->dispersionCorrectionTable->interaction == GMX_TABLE_INTERACTION_VDWREP_VDWDISP,
2332 "Dispersion-correction code needs a table with both repulsion and dispersion terms");
2333 scale = fr->dispersionCorrectionTable->scale;
2334 vdwtab = fr->dispersionCorrectionTable->data;
2336 /* Round the cut-offs to exact table values for precision */
2337 ri0 = static_cast<int>(floor(ic->rvdw_switch*scale));
2338 ri1 = static_cast<int>(ceil(ic->rvdw*scale));
2340 /* The code below has some support for handling force-switching, i.e.
2341 * when the force (instead of potential) is switched over a limited
2342 * region. This leads to a constant shift in the potential inside the
2343 * switching region, which we can handle by adding a constant energy
2344 * term in the force-switch case just like when we do potential-shift.
2346 * For now this is not enabled, but to keep the functionality in the
2347 * code we check separately for switch and shift. When we do force-switch
2348 * the shifting point is rvdw_switch, while it is the cutoff when we
2349 * have a classical potential-shift.
2351 * For a pure potential-shift the potential has a constant shift
2352 * all the way out to the cutoff, and that is it. For other forms
2353 * we need to calculate the constant shift up to the point where we
2354 * start modifying the potential.
2356 ri0 = (ic->vdw_modifier == eintmodPOTSHIFT) ? ri1 : ri0;
2358 r0 = ri0/scale;
2359 rc3 = r0*r0*r0;
2360 rc9 = rc3*rc3*rc3;
2362 if ((ic->vdw_modifier == eintmodFORCESWITCH) ||
2363 (ic->vdwtype == evdwSHIFT))
2365 /* Determine the constant energy shift below rvdw_switch.
2366 * Table has a scale factor since we have scaled it down to compensate
2367 * for scaling-up c6/c12 with the derivative factors to save flops in analytical kernels.
2369 fr->enershiftsix = (real)(-1.0/(rc3*rc3)) - 6.0*vdwtab[8*ri0];
2370 fr->enershifttwelve = (real)( 1.0/(rc9*rc3)) - 12.0*vdwtab[8*ri0 + 4];
2372 else if (ic->vdw_modifier == eintmodPOTSHIFT)
2374 fr->enershiftsix = (real)(-1.0/(rc3*rc3));
2375 fr->enershifttwelve = (real)( 1.0/(rc9*rc3));
2378 /* Add the constant part from 0 to rvdw_switch.
2379 * This integration from 0 to rvdw_switch overcounts the number
2380 * of interactions by 1, as it also counts the self interaction.
2381 * We will correct for this later.
2383 eners[0] += 4.0*M_PI*fr->enershiftsix*rc3/3.0;
2384 eners[1] += 4.0*M_PI*fr->enershifttwelve*rc3/3.0;
2386 /* Calculate the contribution in the range [r0,r1] where we
2387 * modify the potential. For a pure potential-shift modifier we will
2388 * have ri0==ri1, and there will not be any contribution here.
2390 for (i = 0; i < 2; i++)
2392 enersum = 0;
2393 virsum = 0;
2394 integrate_table(vdwtab, scale, (i == 0 ? 0 : 4), ri0, ri1, &enersum, &virsum);
2395 eners[i] -= enersum;
2396 virs[i] -= virsum;
2399 /* Alright: Above we compensated by REMOVING the parts outside r0
2400 * corresponding to the ideal VdW 1/r6 and 1/r12 potentials.
2402 * Regardless of whether r0 is the point where we start switching,
2403 * or the cutoff where we calculated the constant shift, we include
2404 * all the parts we are missing out to infinity from r0 by
2405 * calculating the analytical dispersion correction.
2407 eners[0] += -4.0*M_PI/(3.0*rc3);
2408 eners[1] += 4.0*M_PI/(9.0*rc9);
2409 virs[0] += 8.0*M_PI/rc3;
2410 virs[1] += -16.0*M_PI/(3.0*rc9);
2412 else if (ic->vdwtype == evdwCUT ||
2413 EVDW_PME(ic->vdwtype) ||
2414 ic->vdwtype == evdwUSER)
2416 if (ic->vdwtype == evdwUSER && fplog)
2418 fprintf(fplog,
2419 "WARNING: using dispersion correction with user tables\n");
2422 /* Note that with LJ-PME, the dispersion correction is multiplied
2423 * by the difference between the actual C6 and the value of C6
2424 * that would produce the combination rule.
2425 * This means the normal energy and virial difference formulas
2426 * can be used here.
2429 rc3 = ic->rvdw*ic->rvdw*ic->rvdw;
2430 rc9 = rc3*rc3*rc3;
2431 /* Contribution beyond the cut-off */
2432 eners[0] += -4.0*M_PI/(3.0*rc3);
2433 eners[1] += 4.0*M_PI/(9.0*rc9);
2434 if (ic->vdw_modifier == eintmodPOTSHIFT)
2436 /* Contribution within the cut-off */
2437 eners[0] += -4.0*M_PI/(3.0*rc3);
2438 eners[1] += 4.0*M_PI/(3.0*rc9);
2440 /* Contribution beyond the cut-off */
2441 virs[0] += 8.0*M_PI/rc3;
2442 virs[1] += -16.0*M_PI/(3.0*rc9);
2444 else
2446 gmx_fatal(FARGS,
2447 "Dispersion correction is not implemented for vdw-type = %s",
2448 evdw_names[ic->vdwtype]);
2451 /* When we deprecate the group kernels the code below can go too */
2452 if (ic->vdwtype == evdwPME && fr->cutoff_scheme == ecutsGROUP)
2454 /* Calculate self-interaction coefficient (assuming that
2455 * the reciprocal-space contribution is constant in the
2456 * region that contributes to the self-interaction).
2458 fr->enershiftsix = gmx::power6(ic->ewaldcoeff_lj) / 6.0;
2460 eners[0] += -gmx::power3(std::sqrt(M_PI)*ic->ewaldcoeff_lj)/3.0;
2461 virs[0] += gmx::power3(std::sqrt(M_PI)*ic->ewaldcoeff_lj);
2464 fr->enerdiffsix = eners[0];
2465 fr->enerdifftwelve = eners[1];
2466 /* The 0.5 is due to the Gromacs definition of the virial */
2467 fr->virdiffsix = 0.5*virs[0];
2468 fr->virdifftwelve = 0.5*virs[1];
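/* The "contribution beyond the cut-off" constants used above are the analytic
 * tail integrals of the Lennard-Jones terms,
 *   integral_rc^inf 4*pi*r^2 * r^-6  dr = 4*pi/(3*rc^3)   and
 *   integral_rc^inf 4*pi*r^2 * r^-12 dr = 4*pi/(9*rc^9),
 * with signs following the -C6/r^6 and +C12/r^12 convention. A standalone
 * sketch checking the magnitudes by quadrature (guarded out of the build;
 * the cut-off value is arbitrary):
 */
#if 0
#include <cmath>
#include <cstdio>

int main()
{
    const double pi  = 3.14159265358979323846;
    const double rc  = 1.0;                     /* arbitrary cut-off          */
    const double rc3 = rc*rc*rc, rc9 = rc3*rc3*rc3;
    const double tail6  = 4.0*pi/(3.0*rc3);     /* dispersion tail magnitude  */
    const double tail12 = 4.0*pi/(9.0*rc9);     /* repulsion  tail magnitude  */
    double       q6 = 0, q12 = 0;
    const double rmax = 1000.0, h = 1e-4;
    for (double r = rc + 0.5*h; r < rmax; r += h)   /* midpoint rule          */
    {
        q6  += 4.0*pi*r*r*std::pow(r, -6.0)*h;
        q12 += 4.0*pi*r*r*std::pow(r, -12.0)*h;
    }
    std::printf("r^-6 tail %.6f vs %.6f,  r^-12 tail %.6f vs %.6f\n", tail6, q6, tail12, q12);
    return 0;
}
#endif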
2472 void calc_dispcorr(t_inputrec *ir, t_forcerec *fr,
2473 matrix box, real lambda, tensor pres, tensor virial,
2474 real *prescorr, real *enercorr, real *dvdlcorr)
2476 gmx_bool bCorrAll, bCorrPres;
2477 real dvdlambda, invvol, dens, ninter, avcsix, avctwelve, enerdiff, svir = 0, spres = 0;
2478 int m;
2480 *prescorr = 0;
2481 *enercorr = 0;
2482 *dvdlcorr = 0;
2484 clear_mat(virial);
2485 clear_mat(pres);
2487 if (ir->eDispCorr != edispcNO)
2489 bCorrAll = (ir->eDispCorr == edispcAllEner ||
2490 ir->eDispCorr == edispcAllEnerPres);
2491 bCorrPres = (ir->eDispCorr == edispcEnerPres ||
2492 ir->eDispCorr == edispcAllEnerPres);
2494 invvol = 1/det(box);
2495 if (fr->n_tpi)
2497 /* Only correct for the interactions with the inserted molecule */
2498 dens = (fr->numAtomsForDispersionCorrection - fr->n_tpi)*invvol;
2499 ninter = fr->n_tpi;
2501 else
2503 dens = fr->numAtomsForDispersionCorrection*invvol;
2504 ninter = 0.5*fr->numAtomsForDispersionCorrection;
2507 if (ir->efep == efepNO)
2509 avcsix = fr->avcsix[0];
2510 avctwelve = fr->avctwelve[0];
2512 else
2514 avcsix = (1 - lambda)*fr->avcsix[0] + lambda*fr->avcsix[1];
2515 avctwelve = (1 - lambda)*fr->avctwelve[0] + lambda*fr->avctwelve[1];
2518 enerdiff = ninter*(dens*fr->enerdiffsix - fr->enershiftsix);
2519 *enercorr += avcsix*enerdiff;
2520 dvdlambda = 0.0;
2521 if (ir->efep != efepNO)
2523 dvdlambda += (fr->avcsix[1] - fr->avcsix[0])*enerdiff;
2525 if (bCorrAll)
2527 enerdiff = ninter*(dens*fr->enerdifftwelve - fr->enershifttwelve);
2528 *enercorr += avctwelve*enerdiff;
2529 if (fr->efep != efepNO)
2531 dvdlambda += (fr->avctwelve[1] - fr->avctwelve[0])*enerdiff;
2535 if (bCorrPres)
2537 svir = ninter*dens*avcsix*fr->virdiffsix/3.0;
2538 if (ir->eDispCorr == edispcAllEnerPres)
2540 svir += ninter*dens*avctwelve*fr->virdifftwelve/3.0;
2542 /* The factor 2 is because of the Gromacs virial definition */
2543 spres = -2.0*invvol*svir*PRESFAC;
2545 for (m = 0; m < DIM; m++)
2547 virial[m][m] += svir;
2548 pres[m][m] += spres;
2550 *prescorr += spres;
2553 /* Can't currently control when it prints; for now, just print when debugging */
2554 if (debug)
2556 if (bCorrAll)
2558 fprintf(debug, "Long Range LJ corr.: <C6> %10.4e, <C12> %10.4e\n",
2559 avcsix, avctwelve);
2561 if (bCorrPres)
2563 fprintf(debug,
2564 "Long Range LJ corr.: Epot %10g, Pres: %10g, Vir: %10g\n",
2565 *enercorr, spres, svir);
2567 else
2569 fprintf(debug, "Long Range LJ corr.: Epot %10g\n", *enercorr);
2573 if (fr->efep != efepNO)
2575 *dvdlcorr += dvdlambda;
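/* The free-energy dependence above mixes the per-state averages linearly,
 * e.g. avcsix = (1 - lambda)*<C6>_A + lambda*<C6>_B, so the derivative of the
 * corresponding correction term avcsix*enerdiff with respect to lambda is
 * (<C6>_B - <C6>_A)*enerdiff, which is what gets accumulated into dvdlambda.
 * A trivial standalone sketch (guarded out of the build; all values made up):
 */
#if 0
#include <cstdio>

int main()
{
    const double c6A = 1.0e-3, c6B = 2.0e-3;      /* <C6> of states A and B                     */
    const double enerdiff = -5.0;                 /* stands in for ninter*(dens*enerdiffsix - enershiftsix) */
    const double lambda   = 0.3;
    const double avcsix   = (1.0 - lambda)*c6A + lambda*c6B;
    const double ecorr    = avcsix*enerdiff;      /* energy correction term                     */
    const double dvdl     = (c6B - c6A)*enerdiff; /* d(ecorr)/d(lambda)                         */
    std::printf("E_corr = %g, dE_corr/dlambda = %g\n", ecorr, dvdl);
    return 0;
}
#endif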
2580 static void low_do_pbc_mtop(FILE *fplog, int ePBC, const matrix box,
2581 const gmx_mtop_t *mtop, rvec x[],
2582 gmx_bool bFirst)
2584 t_graph *graph;
2585 int as, mol;
2587 if (bFirst && fplog)
2589 fprintf(fplog, "Removing pbc first time\n");
2592 snew(graph, 1);
2593 as = 0;
2594 for (const gmx_molblock_t &molb : mtop->molblock)
2596 const gmx_moltype_t &moltype = mtop->moltype[molb.type];
2597 if (moltype.atoms.nr == 1 ||
2598 (!bFirst && moltype.cgs.nr == 1))
2600 /* Just one atom or charge group in the molecule, no PBC required */
2601 as += molb.nmol*moltype.atoms.nr;
2603 else
2605 /* Pass NULL instead of fplog to avoid graph prints for each molecule type */
2606 mk_graph_ilist(nullptr, moltype.ilist,
2607 0, moltype.atoms.nr, FALSE, FALSE, graph);
2609 for (mol = 0; mol < molb.nmol; mol++)
2611 mk_mshift(fplog, graph, ePBC, box, x+as);
2613 shift_self(graph, box, x+as);
2614 /* The molecule is whole now.
2615 * We don't need the second mk_mshift call as in do_pbc_first,
2616 * since we no longer need this graph.
2619 as += moltype.atoms.nr;
2621 done_graph(graph);
2624 sfree(graph);
2627 void do_pbc_first_mtop(FILE *fplog, int ePBC, const matrix box,
2628 const gmx_mtop_t *mtop, rvec x[])
2630 low_do_pbc_mtop(fplog, ePBC, box, mtop, x, TRUE);
2633 void do_pbc_mtop(FILE *fplog, int ePBC, const matrix box,
2634 const gmx_mtop_t *mtop, rvec x[])
2636 low_do_pbc_mtop(fplog, ePBC, box, mtop, x, FALSE);
2639 void put_atoms_in_box_omp(int ePBC, const matrix box, gmx::ArrayRef<gmx::RVec> x)
2641 int t, nth;
2642 nth = gmx_omp_nthreads_get(emntDefault);
2644 #pragma omp parallel for num_threads(nth) schedule(static)
2645 for (t = 0; t < nth; t++)
2649 size_t natoms = x.size();
2650 size_t offset = (natoms*t )/nth;
2651 size_t len = (natoms*(t + 1))/nth - offset;
2652 put_atoms_in_box(ePBC, box, x.subArray(offset, len));
2654 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
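/* The static partitioning above, offset = (natoms*t)/nth and
 * len = (natoms*(t+1))/nth - offset, splits [0, natoms) into nth contiguous,
 * non-overlapping, nearly equal chunks. A standalone sketch printing the
 * resulting ranges (guarded out of the build; the sizes are illustrative only):
 */
#if 0
#include <cstddef>
#include <cstdio>

int main()
{
    const size_t natoms = 10;
    const int    nth    = 3;
    for (int t = 0; t < nth; t++)
    {
        size_t offset = (natoms*t)/nth;
        size_t len    = (natoms*(t + 1))/nth - offset;
        std::printf("thread %d: [%zu, %zu)\n", t, offset, offset + len); /* [0,3) [3,6) [6,10) */
    }
    return 0;
}
#endif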
2658 // TODO This can be cleaned up a lot, and move back to runner.cpp
2659 void finish_run(FILE *fplog, const gmx::MDLogger &mdlog, const t_commrec *cr,
2660 const t_inputrec *inputrec,
2661 t_nrnb nrnb[], gmx_wallcycle_t wcycle,
2662 gmx_walltime_accounting_t walltime_accounting,
2663 nonbonded_verlet_t *nbv,
2664 gmx_pme_t *pme,
2665 gmx_bool bWriteStat)
2667 t_nrnb *nrnb_tot = nullptr;
2668 double delta_t = 0;
2669 double nbfs = 0, mflop = 0;
2670 double elapsed_time,
2671 elapsed_time_over_all_ranks,
2672 elapsed_time_over_all_threads,
2673 elapsed_time_over_all_threads_over_all_ranks;
2674 /* Control whether it is valid to print a report. Only the
2675 simulation master may print, but it should not do so if the run
2676 terminated e.g. before a scheduled reset step. This is
2677 complicated by the fact that PME ranks are unaware of the
2678 reason why they were sent a pmerecvqxFINISH. To avoid
2679 communication deadlocks, we always do the communication for the
2680 report, even if we've decided not to write the report, because
2681 how long it takes to finish the run is not important when we've
2682 decided not to report on the simulation performance.
2684 Further, we only report performance for dynamical integrators,
2685 because those are the only ones for which we plan to
2686 consider doing any optimizations. */
2687 bool printReport = EI_DYNAMICS(inputrec->eI) && SIMMASTER(cr);
2689 if (printReport && !walltime_accounting_get_valid_finish(walltime_accounting))
2691 GMX_LOG(mdlog.warning).asParagraph().appendText("Simulation ended prematurely, no performance report will be written.");
2692 printReport = false;
2695 if (cr->nnodes > 1)
2697 snew(nrnb_tot, 1);
2698 #if GMX_MPI
2699 MPI_Allreduce(nrnb->n, nrnb_tot->n, eNRNB, MPI_DOUBLE, MPI_SUM,
2700 cr->mpi_comm_mysim);
2701 #endif
2703 else
2705 nrnb_tot = nrnb;
2708 elapsed_time = walltime_accounting_get_elapsed_time(walltime_accounting);
2709 elapsed_time_over_all_ranks = elapsed_time;
2710 elapsed_time_over_all_threads = walltime_accounting_get_elapsed_time_over_all_threads(walltime_accounting);
2711 elapsed_time_over_all_threads_over_all_ranks = elapsed_time_over_all_threads;
2712 #if GMX_MPI
2713 if (cr->nnodes > 1)
2715 /* reduce elapsed_time over all MPI ranks in the current simulation */
2716 MPI_Allreduce(&elapsed_time,
2717 &elapsed_time_over_all_ranks,
2718 1, MPI_DOUBLE, MPI_SUM,
2719 cr->mpi_comm_mysim);
2720 elapsed_time_over_all_ranks /= cr->nnodes;
2721 /* Reduce elapsed_time_over_all_threads over all MPI ranks in the
2722 * current simulation. */
2723 MPI_Allreduce(&elapsed_time_over_all_threads,
2724 &elapsed_time_over_all_threads_over_all_ranks,
2725 1, MPI_DOUBLE, MPI_SUM,
2726 cr->mpi_comm_mysim);
2728 #endif
2730 if (printReport)
2732 print_flop(fplog, nrnb_tot, &nbfs, &mflop);
2734 if (cr->nnodes > 1)
2736 sfree(nrnb_tot);
2739 if (thisRankHasDuty(cr, DUTY_PP) && DOMAINDECOMP(cr))
2741 print_dd_statistics(cr, inputrec, fplog);
2744 /* TODO Move the responsibility for any scaling by thread counts
2745 * to the code that handled the thread region, so that there's a
2746 * mechanism to keep cycle counting working during the transition
2747 * to task parallelism. */
2748 int nthreads_pp = gmx_omp_nthreads_get(emntNonbonded);
2749 int nthreads_pme = gmx_omp_nthreads_get(emntPME);
2750 wallcycle_scale_by_num_threads(wcycle, thisRankHasDuty(cr, DUTY_PME) && !thisRankHasDuty(cr, DUTY_PP), nthreads_pp, nthreads_pme);
2751 auto cycle_sum(wallcycle_sum(cr, wcycle));
2753 if (printReport)
2755 auto nbnxn_gpu_timings = use_GPU(nbv) ? nbnxn_gpu_get_timings(nbv->gpu_nbv) : nullptr;
2756 gmx_wallclock_gpu_pme_t pme_gpu_timings = {};
2757 if (pme_gpu_task_enabled(pme))
2759 pme_gpu_get_timings(pme, &pme_gpu_timings);
2761 wallcycle_print(fplog, mdlog, cr->nnodes, cr->npmenodes, nthreads_pp, nthreads_pme,
2762 elapsed_time_over_all_ranks,
2763 wcycle, cycle_sum,
2764 nbnxn_gpu_timings,
2765 &pme_gpu_timings);
2767 if (EI_DYNAMICS(inputrec->eI))
2769 delta_t = inputrec->delta_t;
2772 if (fplog)
2774 print_perf(fplog, elapsed_time_over_all_threads_over_all_ranks,
2775 elapsed_time_over_all_ranks,
2776 walltime_accounting_get_nsteps_done(walltime_accounting),
2777 delta_t, nbfs, mflop);
2779 if (bWriteStat)
2781 print_perf(stderr, elapsed_time_over_all_threads_over_all_ranks,
2782 elapsed_time_over_all_ranks,
2783 walltime_accounting_get_nsteps_done(walltime_accounting),
2784 delta_t, nbfs, mflop);
2789 extern void initialize_lambdas(FILE *fplog, t_inputrec *ir, int *fep_state, gmx::ArrayRef<real> lambda, double *lam0)
2791 /* this function works, but could probably use a logic rewrite to keep all the different
2792 types of efep straight. */
2794 if ((ir->efep == efepNO) && (ir->bSimTemp == FALSE))
2796 return;
2799 t_lambda *fep = ir->fepvals;
2800 *fep_state = fep->init_fep_state; /* this might overwrite the value read from
2801 the checkpoint if one is set -- a kludge is in place for now
2802 to prevent this. */
2804 for (int i = 0; i < efptNR; i++)
2806 /* overwrite lambda state with init_lambda for now for backwards compatibility */
2807 if (fep->init_lambda >= 0) /* if it's -1, it was never initialized */
2809 lambda[i] = fep->init_lambda;
2810 if (lam0)
2812 lam0[i] = lambda[i];
2815 else
2817 lambda[i] = fep->all_lambda[i][*fep_state];
2818 if (lam0)
2820 lam0[i] = lambda[i];
2824 if (ir->bSimTemp)
2826 /* need to rescale control temperatures to match current state */
2827 for (int i = 0; i < ir->opts.ngtc; i++)
2829 if (ir->opts.ref_t[i] > 0)
2831 ir->opts.ref_t[i] = ir->simtempvals->temperatures[*fep_state];
2836 /* Send to the log the information on the current lambdas */
2837 if (fplog != nullptr)
2839 fprintf(fplog, "Initial vector of lambda components:[ ");
2840 for (const auto &l : lambda)
2842 fprintf(fplog, "%10.4f ", l);
2844 fprintf(fplog, "]\n");
2846 return;
2850 void init_md(FILE *fplog,
2851 const t_commrec *cr, gmx::IMDOutputProvider *outputProvider,
2852 t_inputrec *ir, const gmx_output_env_t *oenv,
2853 const MdrunOptions &mdrunOptions,
2854 double *t, double *t0,
2855 t_state *globalState, double *lam0,
2856 t_nrnb *nrnb, gmx_mtop_t *mtop,
2857 gmx_update_t **upd,
2858 int nfile, const t_filenm fnm[],
2859 gmx_mdoutf_t *outf, t_mdebin **mdebin,
2860 tensor force_vir, tensor shake_vir, rvec mu_tot,
2861 gmx_bool *bSimAnn, t_vcm **vcm,
2862 gmx_wallcycle_t wcycle)
2864 int i;
2866 /* Initial values */
2867 *t = *t0 = ir->init_t;
2869 *bSimAnn = FALSE;
2870 for (i = 0; i < ir->opts.ngtc; i++)
2872 /* set bSimAnn if any group is being annealed */
2873 if (ir->opts.annealing[i] != eannNO)
2875 *bSimAnn = TRUE;
2879 /* Initialize lambda variables */
2880 /* TODO: Clean up initialization of fep_state and lambda in t_state.
2881 * We currently need to call initialize_lambdas on non-master ranks
2882 * to initialize lam0.
2884 if (MASTER(cr))
2886 initialize_lambdas(fplog, ir, &globalState->fep_state, globalState->lambda, lam0);
2888 else
2890 int tmpFepState;
2891 std::array<real, efptNR> tmpLambda;
2892 initialize_lambdas(fplog, ir, &tmpFepState, tmpLambda, lam0);
2895 // TODO upd is never NULL in practice, but the analysers don't know that
2896 if (upd)
2898 *upd = init_update(ir);
2900 if (*bSimAnn)
2902 update_annealing_target_temp(ir, ir->init_t, upd ? *upd : nullptr);
2905 if (vcm != nullptr)
2907 *vcm = init_vcm(fplog, &mtop->groups, ir);
2910 if (EI_DYNAMICS(ir->eI) && !mdrunOptions.continuationOptions.appendFiles)
2912 if (ir->etc == etcBERENDSEN)
2914 please_cite(fplog, "Berendsen84a");
2916 if (ir->etc == etcVRESCALE)
2918 please_cite(fplog, "Bussi2007a");
2920 if (ir->eI == eiSD1)
2922 please_cite(fplog, "Goga2012");
2925 init_nrnb(nrnb);
2927 if (nfile != -1)
2929 *outf = init_mdoutf(fplog, nfile, fnm, mdrunOptions, cr, outputProvider, ir, mtop, oenv, wcycle);
2931 *mdebin = init_mdebin(mdrunOptions.continuationOptions.appendFiles ? nullptr : mdoutf_get_fp_ene(*outf),
2932 mtop, ir, mdoutf_get_fp_dhdl(*outf));
2935 /* Initialize variables */
2936 clear_mat(force_vir);
2937 clear_mat(shake_vir);
2938 clear_rvec(mu_tot);