More const correctness
[gromacs.git] / src / gromacs / mdlib / sim_util.cpp
blob 625b57427e13010f87747ddcb13b8242b8fa72cb
1 /*
2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5 * Copyright (c) 2001-2004, The GROMACS development team.
6 * Copyright (c) 2013,2014,2015,2016,2017,2018, by the GROMACS development team, led by
7 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
8 * and including many others, as listed in the AUTHORS file in the
9 * top-level source directory and at http://www.gromacs.org.
11 * GROMACS is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public License
13 * as published by the Free Software Foundation; either version 2.1
14 * of the License, or (at your option) any later version.
16 * GROMACS is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * Lesser General Public License for more details.
21 * You should have received a copy of the GNU Lesser General Public
22 * License along with GROMACS; if not, see
23 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
24 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
26 * If you want to redistribute modifications to GROMACS, please
27 * consider that scientific software is very special. Version
28 * control is crucial - bugs must be traceable. We will be happy to
29 * consider code for inclusion in the official distribution, but
30 * derived work must not be called official GROMACS. Details are found
31 * in the README & COPYING files - if they are missing, get the
32 * official version at http://www.gromacs.org.
34 * To help us fund GROMACS development, we humbly ask that you cite
35 * the research papers on the package. Check out http://www.gromacs.org.
37 #include "gmxpre.h"
39 #include "sim_util.h"
41 #include "config.h"
43 #include <stdio.h>
44 #include <string.h>
46 #include <cmath>
47 #include <cstdint>
49 #include <array>
51 #include "gromacs/awh/awh.h"
52 #include "gromacs/domdec/dlbtiming.h"
53 #include "gromacs/domdec/domdec.h"
54 #include "gromacs/domdec/domdec_struct.h"
55 #include "gromacs/essentialdynamics/edsam.h"
56 #include "gromacs/ewald/pme.h"
57 #include "gromacs/gmxlib/chargegroup.h"
58 #include "gromacs/gmxlib/network.h"
59 #include "gromacs/gmxlib/nrnb.h"
60 #include "gromacs/gmxlib/nonbonded/nb_free_energy.h"
61 #include "gromacs/gmxlib/nonbonded/nb_kernel.h"
62 #include "gromacs/gmxlib/nonbonded/nonbonded.h"
63 #include "gromacs/gpu_utils/gpu_utils.h"
64 #include "gromacs/imd/imd.h"
65 #include "gromacs/listed-forces/bonded.h"
66 #include "gromacs/listed-forces/disre.h"
67 #include "gromacs/listed-forces/orires.h"
68 #include "gromacs/math/functions.h"
69 #include "gromacs/math/units.h"
70 #include "gromacs/math/vec.h"
71 #include "gromacs/math/vecdump.h"
72 #include "gromacs/mdlib/calcmu.h"
73 #include "gromacs/mdlib/constr.h"
74 #include "gromacs/mdlib/force.h"
75 #include "gromacs/mdlib/forcerec.h"
76 #include "gromacs/mdlib/gmx_omp_nthreads.h"
77 #include "gromacs/mdlib/mdrun.h"
78 #include "gromacs/mdlib/nb_verlet.h"
79 #include "gromacs/mdlib/nbnxn_atomdata.h"
80 #include "gromacs/mdlib/nbnxn_gpu_data_mgmt.h"
81 #include "gromacs/mdlib/nbnxn_grid.h"
82 #include "gromacs/mdlib/nbnxn_search.h"
83 #include "gromacs/mdlib/qmmm.h"
84 #include "gromacs/mdlib/update.h"
85 #include "gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_gpu_ref.h"
86 #include "gromacs/mdtypes/commrec.h"
87 #include "gromacs/mdtypes/forceoutput.h"
88 #include "gromacs/mdtypes/iforceprovider.h"
89 #include "gromacs/mdtypes/inputrec.h"
90 #include "gromacs/mdtypes/md_enums.h"
91 #include "gromacs/mdtypes/state.h"
92 #include "gromacs/pbcutil/ishift.h"
93 #include "gromacs/pbcutil/mshift.h"
94 #include "gromacs/pbcutil/pbc.h"
95 #include "gromacs/pulling/pull.h"
96 #include "gromacs/pulling/pull_rotation.h"
97 #include "gromacs/timing/cyclecounter.h"
98 #include "gromacs/timing/gpu_timing.h"
99 #include "gromacs/timing/wallcycle.h"
100 #include "gromacs/timing/wallcyclereporting.h"
101 #include "gromacs/timing/walltime_accounting.h"
102 #include "gromacs/topology/topology.h"
103 #include "gromacs/utility/arrayref.h"
104 #include "gromacs/utility/basedefinitions.h"
105 #include "gromacs/utility/cstringutil.h"
106 #include "gromacs/utility/exceptions.h"
107 #include "gromacs/utility/fatalerror.h"
108 #include "gromacs/utility/gmxassert.h"
109 #include "gromacs/utility/gmxmpi.h"
110 #include "gromacs/utility/logger.h"
111 #include "gromacs/utility/pleasecite.h"
112 #include "gromacs/utility/smalloc.h"
113 #include "gromacs/utility/sysinfo.h"
115 #include "nbnxn_gpu.h"
116 #include "nbnxn_kernels/nbnxn_kernel_cpu.h"
117 #include "nbnxn_kernels/nbnxn_kernel_prune.h"
119 // TODO: this environment variable allows us to verify before release
120 // that on less common architectures the total cost of polling is not larger than
121 // a blocking wait (so polling does not introduce overhead when the static
122 // PME-first ordering would suffice).
123 static const bool c_disableAlternatingWait = (getenv("GMX_DISABLE_ALTERNATING_GPU_WAIT") != nullptr);
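// For example, running "GMX_DISABLE_ALTERNATING_GPU_WAIT=1 gmx mdrun ..." selects the
// static wait order; the value itself is ignored, since only the presence of the
// variable (getenv() != nullptr) is checked.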
126 void print_time(FILE *out,
127 gmx_walltime_accounting_t walltime_accounting,
128 gmx_int64_t step,
129 t_inputrec *ir,
130 const t_commrec *cr)
132 time_t finish;
133 char timebuf[STRLEN];
134 double dt, elapsed_seconds, time_per_step;
135 char buf[48];
137 #if !GMX_THREAD_MPI
138 if (!PAR(cr))
139 #endif
141 fprintf(out, "\r");
143 fprintf(out, "step %s", gmx_step_str(step, buf));
144 fflush(out);
146 if ((step >= ir->nstlist))
148 double seconds_since_epoch = gmx_gettime();
149 elapsed_seconds = seconds_since_epoch - walltime_accounting_get_start_time_stamp(walltime_accounting);
150 time_per_step = elapsed_seconds/(step - ir->init_step + 1);
151 dt = (ir->nsteps + ir->init_step - step) * time_per_step;
153 if (ir->nsteps >= 0)
155 if (dt >= 300)
157 finish = (time_t) (seconds_since_epoch + dt);
158 gmx_ctime_r(&finish, timebuf, STRLEN);
159 sprintf(buf, "%s", timebuf);
160 buf[strlen(buf)-1] = '\0';
161 fprintf(out, ", will finish %s", buf);
163 else
165 fprintf(out, ", remaining wall clock time: %5d s ", (int)dt);
168 else
170 fprintf(out, " performance: %.1f ns/day ",
171 ir->delta_t/1000*24*60*60/time_per_step);
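/* The expression above converts ir->delta_t from ps to ns (/1000), multiplies by the
 * seconds per day (24*60*60) and divides by the measured seconds per step, yielding ns/day.
 */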
174 #if !GMX_THREAD_MPI
175 if (PAR(cr))
177 fprintf(out, "\n");
179 #else
180 GMX_UNUSED_VALUE(cr);
181 #endif
183 fflush(out);
186 void print_date_and_time(FILE *fplog, int nodeid, const char *title,
187 double the_time)
189 char time_string[STRLEN];
191 if (!fplog)
193 return;
197 int i;
198 char timebuf[STRLEN];
199 time_t temp_time = (time_t) the_time;
201 gmx_ctime_r(&temp_time, timebuf, STRLEN);
202 for (i = 0; timebuf[i] >= ' '; i++)
204 time_string[i] = timebuf[i];
206 time_string[i] = '\0';
209 fprintf(fplog, "%s on rank %d %s\n", title, nodeid, time_string);
212 void print_start(FILE *fplog, const t_commrec *cr,
213 gmx_walltime_accounting_t walltime_accounting,
214 const char *name)
216 char buf[STRLEN];
218 sprintf(buf, "Started %s", name);
219 print_date_and_time(fplog, cr->nodeid, buf,
220 walltime_accounting_get_start_time_stamp(walltime_accounting));
223 static void sum_forces(rvec f[], gmx::ArrayRef<const gmx::RVec> forceToAdd)
225 const int end = forceToAdd.size();
227 // cppcheck-suppress unreadVariable
228 int gmx_unused nt = gmx_omp_nthreads_get(emntDefault);
229 #pragma omp parallel for num_threads(nt) schedule(static)
230 for (int i = 0; i < end; i++)
232 rvec_inc(f[i], forceToAdd[i]);
236 static void pme_gpu_reduce_outputs(gmx_wallcycle_t wcycle,
237 ForceWithVirial *forceWithVirial,
238 ArrayRef<const gmx::RVec> pmeForces,
239 gmx_enerdata_t *enerd,
240 const tensor vir_Q,
241 real Vlr_q)
243 wallcycle_start(wcycle, ewcPME_GPU_F_REDUCTION);
244 GMX_ASSERT(forceWithVirial, "Invalid force pointer");
245 forceWithVirial->addVirialContribution(vir_Q);
246 enerd->term[F_COUL_RECIP] += Vlr_q;
247 sum_forces(as_rvec_array(forceWithVirial->force_.data()), pmeForces);
248 wallcycle_stop(wcycle, ewcPME_GPU_F_REDUCTION);
251 static void calc_virial(int start, int homenr, rvec x[], rvec f[],
252 tensor vir_part, t_graph *graph, matrix box,
253 t_nrnb *nrnb, const t_forcerec *fr, int ePBC)
255 int i;
257 /* The short-range virial from surrounding boxes */
258 calc_vir(SHIFTS, fr->shift_vec, fr->fshift, vir_part, ePBC == epbcSCREW, box);
259 inc_nrnb(nrnb, eNR_VIRIAL, SHIFTS);
261 /* Calculate partial virial, for local atoms only, based on short range.
262 * Total virial is computed in global_stat, called from do_md
264 f_calc_vir(start, start+homenr, x, f, vir_part, graph, box);
265 inc_nrnb(nrnb, eNR_VIRIAL, homenr);
267 /* Add wall contribution */
268 for (i = 0; i < DIM; i++)
270 vir_part[i][ZZ] += fr->vir_wall_z[i];
273 if (debug)
275 pr_rvecs(debug, 0, "vir_part", vir_part, DIM);
279 static void pull_potential_wrapper(const t_commrec *cr,
280 const t_inputrec *ir,
281 matrix box, ArrayRef<const RVec> x,
282 ForceWithVirial *force,
283 const t_mdatoms *mdatoms,
284 gmx_enerdata_t *enerd,
285 real *lambda,
286 double t,
287 gmx_wallcycle_t wcycle)
289 t_pbc pbc;
290 real dvdl;
292 /* Calculate the center of mass forces, this requires communication,
293 * which is why pull_potential is called close to other communication.
295 wallcycle_start(wcycle, ewcPULLPOT);
296 set_pbc(&pbc, ir->ePBC, box);
297 dvdl = 0;
298 enerd->term[F_COM_PULL] +=
299 pull_potential(ir->pull_work, mdatoms, &pbc,
300 cr, t, lambda[efptRESTRAINT], as_rvec_array(x.data()), force, &dvdl);
301 enerd->dvdl_lin[efptRESTRAINT] += dvdl;
302 wallcycle_stop(wcycle, ewcPULLPOT);
305 static void pme_receive_force_ener(const t_commrec *cr,
306 ForceWithVirial *forceWithVirial,
307 gmx_enerdata_t *enerd,
308 gmx_wallcycle_t wcycle)
310 real e_q, e_lj, dvdl_q, dvdl_lj;
311 float cycles_ppdpme, cycles_seppme;
313 cycles_ppdpme = wallcycle_stop(wcycle, ewcPPDURINGPME);
314 dd_cycles_add(cr->dd, cycles_ppdpme, ddCyclPPduringPME);
316 /* In case of node-splitting, the PP nodes receive the long-range
317 * forces, virial and energy from the PME nodes here.
319 wallcycle_start(wcycle, ewcPP_PMEWAITRECVF);
320 dvdl_q = 0;
321 dvdl_lj = 0;
322 gmx_pme_receive_f(cr, forceWithVirial, &e_q, &e_lj, &dvdl_q, &dvdl_lj,
323 &cycles_seppme);
324 enerd->term[F_COUL_RECIP] += e_q;
325 enerd->term[F_LJ_RECIP] += e_lj;
326 enerd->dvdl_lin[efptCOUL] += dvdl_q;
327 enerd->dvdl_lin[efptVDW] += dvdl_lj;
329 if (wcycle)
331 dd_cycles_add(cr->dd, cycles_seppme, ddCyclPME);
333 wallcycle_stop(wcycle, ewcPP_PMEWAITRECVF);
336 static void print_large_forces(FILE *fp,
337 const t_mdatoms *md,
338 const t_commrec *cr,
339 gmx_int64_t step,
340 real forceTolerance,
341 const rvec *x,
342 const rvec *f)
344 real force2Tolerance = gmx::square(forceTolerance);
345 std::uintmax_t numNonFinite = 0;
346 for (int i = 0; i < md->homenr; i++)
348 real force2 = norm2(f[i]);
349 bool nonFinite = !std::isfinite(force2);
350 if (force2 >= force2Tolerance || nonFinite)
352 fprintf(fp, "step %" GMX_PRId64 " atom %6d x %8.3f %8.3f %8.3f force %12.5e\n",
353 step,
354 ddglatnr(cr->dd, i), x[i][XX], x[i][YY], x[i][ZZ], std::sqrt(force2));
356 if (nonFinite)
358 numNonFinite++;
361 if (numNonFinite > 0)
363 /* Note that with MPI this fatal call on one rank might interrupt
364 * the printing on other ranks. But we can only avoid that with
365 * an expensive MPI barrier that we would need at each step.
367 gmx_fatal(FARGS, "At step %" GMX_PRId64 " detected non-finite forces on %ju atoms", step, numNonFinite);
371 static void post_process_forces(const t_commrec *cr,
372 gmx_int64_t step,
373 t_nrnb *nrnb,
374 gmx_wallcycle_t wcycle,
375 const gmx_localtop_t *top,
376 matrix box,
377 rvec x[],
378 rvec f[],
379 ForceWithVirial *forceWithVirial,
380 tensor vir_force,
381 const t_mdatoms *mdatoms,
382 t_graph *graph,
383 t_forcerec *fr,
384 const gmx_vsite_t *vsite,
385 int flags)
387 if (fr->haveDirectVirialContributions)
389 rvec *fDirectVir = as_rvec_array(forceWithVirial->force_.data());
391 if (vsite)
393 /* Spread the mesh force on virtual sites to the other particles...
394 * This is parallelized. MPI communication is performed
395 * if the constructing atoms aren't local.
397 matrix virial = { { 0 } };
398 spread_vsite_f(vsite, x, fDirectVir, nullptr,
399 (flags & GMX_FORCE_VIRIAL), virial,
400 nrnb,
401 &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr, wcycle);
402 forceWithVirial->addVirialContribution(virial);
405 if (flags & GMX_FORCE_VIRIAL)
407 /* Now add the forces, this is local */
408 sum_forces(f, forceWithVirial->force_);
410 /* Add the direct virial contributions */
411 GMX_ASSERT(forceWithVirial->computeVirial_, "forceWithVirial should request virial computation when we request the virial");
412 m_add(vir_force, forceWithVirial->getVirial(), vir_force);
414 if (debug)
416 pr_rvecs(debug, 0, "vir_force", vir_force, DIM);
421 if (fr->print_force >= 0)
423 print_large_forces(stderr, mdatoms, cr, step, fr->print_force, x, f);
427 static void do_nb_verlet(t_forcerec *fr,
428 interaction_const_t *ic,
429 gmx_enerdata_t *enerd,
430 int flags, int ilocality,
431 int clearF,
432 gmx_int64_t step,
433 t_nrnb *nrnb,
434 gmx_wallcycle_t wcycle)
436 if (!(flags & GMX_FORCE_NONBONDED))
438 /* skip non-bonded calculation */
439 return;
442 nonbonded_verlet_t *nbv = fr->nbv;
443 nonbonded_verlet_group_t *nbvg = &nbv->grp[ilocality];
445 /* GPU kernel launch overhead is already timed separately */
446 if (fr->cutoff_scheme != ecutsVERLET)
448 gmx_incons("Invalid cut-off scheme passed!");
451 bool bUsingGpuKernels = (nbvg->kernel_type == nbnxnk8x8x8_GPU);
453 if (!bUsingGpuKernels)
455 /* When dynamic pair-list pruning is requested, we need to prune
456 * every nstlistPrune steps.
458 if (nbv->listParams->useDynamicPruning &&
459 (step - nbvg->nbl_lists.outerListCreationStep) % nbv->listParams->nstlistPrune == 0)
461 /* Prune the pair-list beyond fr->ic->rlistPrune using
462 * the current coordinates of the atoms.
464 wallcycle_sub_start(wcycle, ewcsNONBONDED_PRUNING);
465 nbnxn_kernel_cpu_prune(nbvg, nbv->nbat, fr->shift_vec, nbv->listParams->rlistInner);
466 wallcycle_sub_stop(wcycle, ewcsNONBONDED_PRUNING);
469 wallcycle_sub_start(wcycle, ewcsNONBONDED);
472 switch (nbvg->kernel_type)
474 case nbnxnk4x4_PlainC:
475 case nbnxnk4xN_SIMD_4xN:
476 case nbnxnk4xN_SIMD_2xNN:
477 nbnxn_kernel_cpu(nbvg,
478 nbv->nbat,
480 fr->shift_vec,
481 flags,
482 clearF,
483 fr->fshift[0],
484 enerd->grpp.ener[egCOULSR],
485 fr->bBHAM ?
486 enerd->grpp.ener[egBHAMSR] :
487 enerd->grpp.ener[egLJSR]);
488 break;
490 case nbnxnk8x8x8_GPU:
491 nbnxn_gpu_launch_kernel(nbv->gpu_nbv, nbv->nbat, flags, ilocality);
492 break;
494 case nbnxnk8x8x8_PlainC:
495 nbnxn_kernel_gpu_ref(nbvg->nbl_lists.nbl[0],
496 nbv->nbat, ic,
497 fr->shift_vec,
498 flags,
499 clearF,
500 nbv->nbat->out[0].f,
501 fr->fshift[0],
502 enerd->grpp.ener[egCOULSR],
503 fr->bBHAM ?
504 enerd->grpp.ener[egBHAMSR] :
505 enerd->grpp.ener[egLJSR]);
506 break;
508 default:
509 GMX_RELEASE_ASSERT(false, "Invalid nonbonded kernel type passed!");
512 if (!bUsingGpuKernels)
514 wallcycle_sub_stop(wcycle, ewcsNONBONDED);
517 int enr_nbnxn_kernel_ljc, enr_nbnxn_kernel_lj;
518 if (EEL_RF(ic->eeltype) || ic->eeltype == eelCUT)
520 enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_RF;
522 else if ((!bUsingGpuKernels && nbvg->ewald_excl == ewaldexclAnalytical) ||
523 (bUsingGpuKernels && nbnxn_gpu_is_kernel_ewald_analytical(nbv->gpu_nbv)))
525 enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_EWALD;
527 else
529 enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_TAB;
531 enr_nbnxn_kernel_lj = eNR_NBNXN_LJ;
532 if (flags & GMX_FORCE_ENERGY)
534 /* In eNR_??? the nbnxn F+E kernels are always the F kernel + 1 */
535 enr_nbnxn_kernel_ljc += 1;
536 enr_nbnxn_kernel_lj += 1;
539 inc_nrnb(nrnb, enr_nbnxn_kernel_ljc,
540 nbvg->nbl_lists.natpair_ljq);
541 inc_nrnb(nrnb, enr_nbnxn_kernel_lj,
542 nbvg->nbl_lists.natpair_lj);
543 /* The Coulomb-only kernels are offset -eNR_NBNXN_LJ_RF+eNR_NBNXN_RF */
544 inc_nrnb(nrnb, enr_nbnxn_kernel_ljc-eNR_NBNXN_LJ_RF+eNR_NBNXN_RF,
545 nbvg->nbl_lists.natpair_q);
547 if (ic->vdw_modifier == eintmodFORCESWITCH)
549 /* We add up the switch cost separately */
550 inc_nrnb(nrnb, eNR_NBNXN_ADD_LJ_FSW+((flags & GMX_FORCE_ENERGY) ? 1 : 0),
551 nbvg->nbl_lists.natpair_ljq + nbvg->nbl_lists.natpair_lj);
553 if (ic->vdw_modifier == eintmodPOTSWITCH)
555 /* We add up the switch cost separately */
556 inc_nrnb(nrnb, eNR_NBNXN_ADD_LJ_PSW+((flags & GMX_FORCE_ENERGY) ? 1 : 0),
557 nbvg->nbl_lists.natpair_ljq + nbvg->nbl_lists.natpair_lj);
559 if (ic->vdwtype == evdwPME)
561 /* We add up the LJ Ewald cost separately */
562 inc_nrnb(nrnb, eNR_NBNXN_ADD_LJ_EWALD+((flags & GMX_FORCE_ENERGY) ? 1 : 0),
563 nbvg->nbl_lists.natpair_ljq + nbvg->nbl_lists.natpair_lj);
567 static void do_nb_verlet_fep(nbnxn_pairlist_set_t *nbl_lists,
568 t_forcerec *fr,
569 rvec x[],
570 rvec f[],
571 const t_mdatoms *mdatoms,
572 t_lambda *fepvals,
573 real *lambda,
574 gmx_enerdata_t *enerd,
575 int flags,
576 t_nrnb *nrnb,
577 gmx_wallcycle_t wcycle)
579 int donb_flags;
580 nb_kernel_data_t kernel_data;
581 real lam_i[efptNR];
582 real dvdl_nb[efptNR];
583 int th;
584 int i, j;
586 donb_flags = 0;
587 /* Add short-range interactions */
588 donb_flags |= GMX_NONBONDED_DO_SR;
590 /* Currently all group scheme kernels always calculate (shift-)forces */
591 if (flags & GMX_FORCE_FORCES)
593 donb_flags |= GMX_NONBONDED_DO_FORCE;
595 if (flags & GMX_FORCE_VIRIAL)
597 donb_flags |= GMX_NONBONDED_DO_SHIFTFORCE;
599 if (flags & GMX_FORCE_ENERGY)
601 donb_flags |= GMX_NONBONDED_DO_POTENTIAL;
604 kernel_data.flags = donb_flags;
605 kernel_data.lambda = lambda;
606 kernel_data.dvdl = dvdl_nb;
608 kernel_data.energygrp_elec = enerd->grpp.ener[egCOULSR];
609 kernel_data.energygrp_vdw = enerd->grpp.ener[egLJSR];
611 /* reset free energy components */
612 for (i = 0; i < efptNR; i++)
614 dvdl_nb[i] = 0;
617 GMX_ASSERT(gmx_omp_nthreads_get(emntNonbonded) == nbl_lists->nnbl, "Number of lists should be same as number of NB threads");
619 wallcycle_sub_start(wcycle, ewcsNONBONDED);
620 #pragma omp parallel for schedule(static) num_threads(nbl_lists->nnbl)
621 for (th = 0; th < nbl_lists->nnbl; th++)
625 gmx_nb_free_energy_kernel(nbl_lists->nbl_fep[th],
626 x, f, fr, mdatoms, &kernel_data, nrnb);
628 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
631 if (fepvals->sc_alpha != 0)
633 enerd->dvdl_nonlin[efptVDW] += dvdl_nb[efptVDW];
634 enerd->dvdl_nonlin[efptCOUL] += dvdl_nb[efptCOUL];
636 else
638 enerd->dvdl_lin[efptVDW] += dvdl_nb[efptVDW];
639 enerd->dvdl_lin[efptCOUL] += dvdl_nb[efptCOUL];
642 /* If we do foreign lambda and we have soft-core interactions
643 * we have to recalculate the (non-linear) energy contributions.
645 if (fepvals->n_lambda > 0 && (flags & GMX_FORCE_DHDL) && fepvals->sc_alpha != 0)
647 kernel_data.flags = (donb_flags & ~(GMX_NONBONDED_DO_FORCE | GMX_NONBONDED_DO_SHIFTFORCE)) | GMX_NONBONDED_DO_FOREIGNLAMBDA;
648 kernel_data.lambda = lam_i;
649 kernel_data.energygrp_elec = enerd->foreign_grpp.ener[egCOULSR];
650 kernel_data.energygrp_vdw = enerd->foreign_grpp.ener[egLJSR];
651 /* Note that we add to kernel_data.dvdl, but ignore the result */
653 for (i = 0; i < enerd->n_lambda; i++)
655 for (j = 0; j < efptNR; j++)
657 lam_i[j] = (i == 0 ? lambda[j] : fepvals->all_lambda[j][i-1]);
659 reset_foreign_enerdata(enerd);
660 #pragma omp parallel for schedule(static) num_threads(nbl_lists->nnbl)
661 for (th = 0; th < nbl_lists->nnbl; th++)
665 gmx_nb_free_energy_kernel(nbl_lists->nbl_fep[th],
666 x, f, fr, mdatoms, &kernel_data, nrnb);
668 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
671 sum_epot(&(enerd->foreign_grpp), enerd->foreign_term);
672 enerd->enerpart_lambda[i] += enerd->foreign_term[F_EPOT];
676 wallcycle_sub_stop(wcycle, ewcsNONBONDED);
679 gmx_bool use_GPU(const nonbonded_verlet_t *nbv)
681 return nbv != nullptr && nbv->bUseGPU;
684 static inline void clear_rvecs_omp(int n, rvec v[])
686 int nth = gmx_omp_nthreads_get_simple_rvec_task(emntDefault, n);
688 /* Note that we would like to avoid this conditional by putting it
689 * into the omp pragma instead, but then we still take the full
690 * omp parallel for overhead (at least with gcc5).
692 if (nth == 1)
694 for (int i = 0; i < n; i++)
696 clear_rvec(v[i]);
699 else
701 #pragma omp parallel for num_threads(nth) schedule(static)
702 for (int i = 0; i < n; i++)
704 clear_rvec(v[i]);
709 /*! \brief Return an estimate of the average kinetic energy or 0 when unreliable
711 * \param groupOptions Group options, containing T-coupling options
713 static real averageKineticEnergyEstimate(const t_grpopts &groupOptions)
715 real nrdfCoupled = 0;
716 real nrdfUncoupled = 0;
717 real kineticEnergy = 0;
718 for (int g = 0; g < groupOptions.ngtc; g++)
720 if (groupOptions.tau_t[g] >= 0)
722 nrdfCoupled += groupOptions.nrdf[g];
723 kineticEnergy += groupOptions.nrdf[g]*0.5*groupOptions.ref_t[g]*BOLTZ;
725 else
727 nrdfUncoupled += groupOptions.nrdf[g];
731 /* This conditional with > also catches nrdf=0 */
732 if (nrdfCoupled > nrdfUncoupled)
734 return kineticEnergy*(nrdfCoupled + nrdfUncoupled)/nrdfCoupled;
736 else
738 return 0;
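/* In other words, the value returned above is approximately
 *   (nrdfCoupled + nrdfUncoupled) * 0.5 * BOLTZ * <T_ref>,
 * where <T_ref> is the nrdf-weighted average reference temperature of the coupled
 * groups; groups with tau_t < 0 contribute degrees of freedom but no reference
 * temperature, and 0 is returned when the coupled degrees of freedom do not
 * outnumber the uncoupled ones (this also catches nrdf = 0).
 */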
742 /*! \brief This routine checks that the potential energy is finite.
744 * Always checks that the potential energy is finite. If step equals
745 * inputrec.init_step also checks that the magnitude of the potential energy
746 * is reasonable. Terminates with a fatal error when a check fails.
747 * Note that passing this check does not guarantee finite forces,
748 * since those use slightly different arithmetic. But in most cases
749 * there is just a narrow coordinate range where forces are not finite
750 * and energies are finite.
752 * \param[in] step The step number, used for checking and printing
753 * \param[in] enerd The energy data; the non-bonded group energies need to be added to enerd.term[F_EPOT] before calling this routine
754 * \param[in] inputrec The input record
756 static void checkPotentialEnergyValidity(gmx_int64_t step,
757 const gmx_enerdata_t &enerd,
758 const t_inputrec &inputrec)
760 /* Threshold valid for comparing absolute potential energy against
761 * the kinetic energy. Normally one should not consider absolute
762 * potential energy values, but with a factor of one million
763 * we should never get false positives.
765 constexpr real c_thresholdFactor = 1e6;
767 bool energyIsNotFinite = !std::isfinite(enerd.term[F_EPOT]);
768 real averageKineticEnergy = 0;
769 /* We only check for large potential energy at the initial step,
770 * because that is by far the most likely step for this to occur
771 * and because computing the average kinetic energy is not free.
772 * Note: nstcalcenergy >> 1 often does not allow catching large energies
773 * before they become NaN.
775 if (step == inputrec.init_step && EI_DYNAMICS(inputrec.eI))
777 averageKineticEnergy = averageKineticEnergyEstimate(inputrec.opts);
780 if (energyIsNotFinite || (averageKineticEnergy > 0 &&
781 enerd.term[F_EPOT] > c_thresholdFactor*averageKineticEnergy))
783 gmx_fatal(FARGS, "Step %" GMX_PRId64 ": The total potential energy is %g, which is %s. The LJ and electrostatic contributions to the energy are %g and %g, respectively. A %s potential energy can be caused by overlapping interactions in bonded interactions or very large%s coordinate values. Usually this is caused by a badly- or non-equilibrated initial configuration, incorrect interactions or parameters in the topology.",
784 step,
785 enerd.term[F_EPOT],
786 energyIsNotFinite ? "not finite" : "extremely high",
787 enerd.term[F_LJ],
788 enerd.term[F_COUL_SR],
789 energyIsNotFinite ? "non-finite" : "very high",
790 energyIsNotFinite ? " or NaN" : "");
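/* Rough illustration of the threshold (numbers assumed, not from the code): for a
 * system with ~30000 degrees of freedom at 300 K the average kinetic energy estimate
 * is of order 4e4 kJ/mol, so at the first step only a potential energy beyond roughly
 * 4e10 kJ/mol (or a non-finite value) triggers the fatal error above.
 */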
794 /*! \brief Compute forces and/or energies for special algorithms
796 * The intention is to collect all calls to algorithms that compute
797 * forces on local atoms only and that do not contribute to the local
798 * virial sum (but add their virial contribution separately).
799 * Eventually these should likely all become ForceProviders.
800 * Within this function the intention is to have algorithms that do
801 * global communication at the end, so global barriers within the MD loop
802 * are as close together as possible.
804 * \param[in] fplog The log file
805 * \param[in] cr The communication record
806 * \param[in] inputrec The input record
807 * \param[in] step The current MD step
808 * \param[in] t The current time
809 * \param[in,out] wcycle Wallcycle accounting struct
810 * \param[in,out] forceProviders Pointer to a list of force providers
811 * \param[in] box The unit cell
812 * \param[in] x The coordinates
813 * \param[in] mdatoms Per atom properties
814 * \param[in] lambda Array of free-energy lambda values
815 * \param[in] forceFlags Flags that tell whether we should compute forces/energies/virial
816 * \param[in,out] forceWithVirial Force and virial buffers
817 * \param[in,out] enerd Energy buffer
818 * \param[in,out] ed Essential dynamics pointer
819 * \param[in] bNS Tells if we did neighbor searching this step, used for ED sampling
821 * \todo Remove bNS, which is used incorrectly.
822 * \todo Convert all other algorithms called here to ForceProviders.
824 static void
825 computeSpecialForces(FILE *fplog,
826 const t_commrec *cr,
827 const t_inputrec *inputrec,
828 gmx_int64_t step,
829 double t,
830 gmx_wallcycle_t wcycle,
831 ForceProviders *forceProviders,
832 matrix box,
833 ArrayRef<const RVec> x,
834 const t_mdatoms *mdatoms,
835 real *lambda,
836 int forceFlags,
837 ForceWithVirial *forceWithVirial,
838 gmx_enerdata_t *enerd,
839 const gmx_edsam *ed,
840 gmx_bool bNS)
842 const bool computeForces = (forceFlags & GMX_FORCE_FORCES);
844 /* NOTE: Currently all ForceProviders only provide forces.
845 * When they also provide energies, remove this conditional.
847 if (computeForces)
849 /* Collect forces from modules */
850 forceProviders->calculateForces(cr, mdatoms, box, t, x, forceWithVirial);
853 if (inputrec->bPull && pull_have_potential(inputrec->pull_work))
855 pull_potential_wrapper(cr, inputrec, box, x,
856 forceWithVirial,
857 mdatoms, enerd, lambda, t,
858 wcycle);
860 if (inputrec->bDoAwh)
862 Awh &awh = *inputrec->awh;
863 enerd->term[F_COM_PULL] +=
864 awh.applyBiasForcesAndUpdateBias(inputrec->ePBC, *mdatoms, box,
865 forceWithVirial,
866 t, step, wcycle, fplog);
870 rvec *f = as_rvec_array(forceWithVirial->force_.data());
872 /* Add the forces from enforced rotation potentials (if any) */
873 if (inputrec->bRot)
875 wallcycle_start(wcycle, ewcROTadd);
876 enerd->term[F_COM_PULL] += add_rot_forces(inputrec->rot, f, cr, step, t);
877 wallcycle_stop(wcycle, ewcROTadd);
880 if (ed)
882 /* Note that since init_edsam() is called after the initialization
883 * of forcerec, edsam doesn't request the noVirSum force buffer.
884 * Thus if no other algorithm (e.g. PME) requires it, the forces
885 * here will contribute to the virial.
887 do_flood(cr, inputrec, as_rvec_array(x.data()), f, ed, box, step, bNS);
890 /* Add forces from interactive molecular dynamics (IMD), if bIMD == TRUE. */
891 if (inputrec->bIMD && computeForces)
893 IMD_apply_forces(inputrec->bIMD, inputrec->imd, cr, f, wcycle);
897 /*! \brief Launch the prepare_step and spread stages of PME GPU.
899 * \param[in] pmedata The PME structure
900 * \param[in] box The box matrix
901 * \param[in] x Coordinate array
902 * \param[in] flags Force flags
903 * \param[in] wcycle The wallcycle structure
905 static inline void launchPmeGpuSpread(gmx_pme_t *pmedata,
906 matrix box,
907 rvec x[],
908 int flags,
909 gmx_wallcycle_t wcycle)
911 int pmeFlags = GMX_PME_SPREAD | GMX_PME_SOLVE;
912 pmeFlags |= (flags & GMX_FORCE_FORCES) ? GMX_PME_CALC_F : 0;
913 pmeFlags |= (flags & GMX_FORCE_VIRIAL) ? GMX_PME_CALC_ENER_VIR : 0;
915 pme_gpu_prepare_computation(pmedata, flags & GMX_FORCE_DYNAMICBOX, box, wcycle, pmeFlags);
916 pme_gpu_launch_spread(pmedata, x, wcycle);
919 /*! \brief Launch the FFT and gather stages of PME GPU
921 * This function only implements setting the output forces (no accumulation).
923 * \param[in] pmedata The PME structure
924 * \param[in] wcycle The wallcycle structure
926 static void launchPmeGpuFftAndGather(gmx_pme_t *pmedata,
927 gmx_wallcycle_t wcycle)
929 pme_gpu_launch_complex_transforms(pmedata, wcycle);
930 pme_gpu_launch_gather(pmedata, wcycle, PmeForceOutputHandling::Set);
933 /*! \brief
934 * Polling wait for either of the PME or nonbonded GPU tasks.
936 * Instead of a static order in waiting for GPU tasks, this function
937 * polls checking which of the two tasks completes first, and does the
938 * associated force buffer reduction overlapped with the other task.
939 * By doing that, unlike static scheduling order, it can always overlap
940 * one of the reductions, regardless of the GPU task completion order.
942 * \param[in] nbv Nonbonded verlet structure
943 * \param[in] pmedata PME module data
944 * \param[in,out] force Force array to reduce task outputs into.
945 * \param[in,out] forceWithVirial Force and virial buffers
946 * \param[in,out] fshift Shift force output vector results are reduced into
947 * \param[in,out] enerd Energy data structure results are reduced into
948 * \param[in] flags Force flags
949 * \param[in] wcycle The wallcycle structure
951 static void alternatePmeNbGpuWaitReduce(nonbonded_verlet_t *nbv,
952 const gmx_pme_t *pmedata,
953 gmx::PaddedArrayRef<gmx::RVec> *force,
954 ForceWithVirial *forceWithVirial,
955 rvec fshift[],
956 gmx_enerdata_t *enerd,
957 int flags,
958 gmx_wallcycle_t wcycle)
960 bool isPmeGpuDone = false;
961 bool isNbGpuDone = false;
964 gmx::ArrayRef<const gmx::RVec> pmeGpuForces;
966 while (!isPmeGpuDone || !isNbGpuDone)
968 if (!isPmeGpuDone)
970 matrix vir_Q;
971 real Vlr_q;
973 GpuTaskCompletion completionType = (isNbGpuDone) ? GpuTaskCompletion::Wait : GpuTaskCompletion::Check;
974 isPmeGpuDone = pme_gpu_try_finish_task(pmedata, wcycle, &pmeGpuForces,
975 vir_Q, &Vlr_q, completionType);
977 if (isPmeGpuDone)
979 pme_gpu_reduce_outputs(wcycle, forceWithVirial, pmeGpuForces,
980 enerd, vir_Q, Vlr_q);
984 if (!isNbGpuDone)
986 GpuTaskCompletion completionType = (isPmeGpuDone) ? GpuTaskCompletion::Wait : GpuTaskCompletion::Check;
987 wallcycle_start_nocount(wcycle, ewcWAIT_GPU_NB_L);
988 isNbGpuDone = nbnxn_gpu_try_finish_task(nbv->gpu_nbv,
989 flags, eatLocal,
990 enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
991 fshift, completionType);
992 wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
993 // To get the call count right, when the task finished we
994 // issue a start/stop.
995 // TODO: move the ewcWAIT_GPU_NB_L cycle counting into nbnxn_gpu_try_finish_task()
996 // and ewcNB_XF_BUF_OPS counting into nbnxn_atomdata_add_nbat_f_to_f().
997 if (isNbGpuDone)
999 wallcycle_start(wcycle, ewcWAIT_GPU_NB_L);
1000 wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
1002 nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs, eatLocal,
1003 nbv->nbat, as_rvec_array(force->data()), wcycle);
1009 /*! \brief
1010 * Launch the dynamic rolling pruning GPU task.
1012 * We currently alternate local/non-local list pruning in odd-even steps
1013 * (only pruning every second step without DD).
1015 * \param[in] cr The communication record
1016 * \param[in] nbv Nonbonded verlet structure
1017 * \param[in] inputrec The input record
1018 * \param[in] step The current MD step
1020 static inline void launchGpuRollingPruning(const t_commrec *cr,
1021 const nonbonded_verlet_t *nbv,
1022 const t_inputrec *inputrec,
1023 const gmx_int64_t step)
1025 /* We should not launch the rolling pruning kernel at a search
1026 * step or just before search steps, since that's useless.
1027 * Without domain decomposition we prune at even steps.
1028 * With domain decomposition we alternate local and non-local
1029 * pruning at even and odd steps.
1031 int numRollingParts = nbv->listParams->numRollingParts;
1032 GMX_ASSERT(numRollingParts == nbv->listParams->nstlistPrune/2, "Since we alternate local/non-local at even/odd steps, we need numRollingParts<=nstlistPrune/2 for correctness and == for efficiency");
1033 int stepWithCurrentList = step - nbv->grp[eintLocal].nbl_lists.outerListCreationStep;
1034 bool stepIsEven = ((stepWithCurrentList & 1) == 0);
1035 if (stepWithCurrentList > 0 &&
1036 stepWithCurrentList < inputrec->nstlist - 1 &&
1037 (stepIsEven || DOMAINDECOMP(cr)))
1039 nbnxn_gpu_launch_kernel_pruneonly(nbv->gpu_nbv,
1040 stepIsEven ? eintLocal : eintNonlocal,
1041 numRollingParts);
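/* Illustration (parameter values assumed): with nstlist = 20 and nstlistPrune = 10,
 * numRollingParts is 5; counting from the last search step, the local list is pruned
 * in parts on steps 2, 4, ..., 18, and with domain decomposition the non-local list
 * is pruned on the odd steps in between.
 */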
1045 static void do_force_cutsVERLET(FILE *fplog,
1046 const t_commrec *cr,
1047 const gmx_multisim_t *ms,
1048 const t_inputrec *inputrec,
1049 gmx_int64_t step,
1050 t_nrnb *nrnb,
1051 gmx_wallcycle_t wcycle,
1052 const gmx_localtop_t *top,
1053 const gmx_groups_t * /* groups */,
1054 matrix box, gmx::PaddedArrayRef<gmx::RVec> x,
1055 history_t *hist,
1056 gmx::PaddedArrayRef<gmx::RVec> force,
1057 tensor vir_force,
1058 const t_mdatoms *mdatoms,
1059 gmx_enerdata_t *enerd, t_fcdata *fcd,
1060 real *lambda,
1061 t_graph *graph,
1062 t_forcerec *fr,
1063 interaction_const_t *ic,
1064 const gmx_vsite_t *vsite,
1065 rvec mu_tot,
1066 double t,
1067 const gmx_edsam *ed,
1068 int flags,
1069 DdOpenBalanceRegionBeforeForceComputation ddOpenBalanceRegion,
1070 DdCloseBalanceRegionAfterForceComputation ddCloseBalanceRegion)
1072 int cg1, i, j;
1073 double mu[2*DIM];
1074 gmx_bool bStateChanged, bNS, bFillGrid, bCalcCGCM;
1075 gmx_bool bDoForces, bUseGPU, bUseOrEmulGPU;
1076 rvec vzero, box_diag;
1077 float cycles_pme, cycles_wait_gpu;
1078 nonbonded_verlet_t *nbv = fr->nbv;
1080 bStateChanged = (flags & GMX_FORCE_STATECHANGED);
1081 bNS = (flags & GMX_FORCE_NS) && (fr->bAllvsAll == FALSE);
1082 bFillGrid = (bNS && bStateChanged);
1083 bCalcCGCM = (bFillGrid && !DOMAINDECOMP(cr));
1084 bDoForces = (flags & GMX_FORCE_FORCES);
1085 bUseGPU = fr->nbv->bUseGPU;
1086 bUseOrEmulGPU = bUseGPU || (fr->nbv->emulateGpu == EmulateGpuNonbonded::Yes);
1088 const auto pmeRunMode = fr->pmedata ? pme_run_mode(fr->pmedata) : PmeRunMode::CPU;
1089 // TODO slim this conditional down - inputrec and duty checks should mean the same in proper code!
1090 const bool useGpuPme = EEL_PME(fr->ic->eeltype) && thisRankHasDuty(cr, DUTY_PME) &&
1091 ((pmeRunMode == PmeRunMode::GPU) || (pmeRunMode == PmeRunMode::Mixed));
1093 /* At a search step we need to start the first balancing region
1094 * somewhere early inside the step after communication during domain
1095 * decomposition (and not during the previous step as usual).
1097 if (bNS &&
1098 ddOpenBalanceRegion == DdOpenBalanceRegionBeforeForceComputation::yes)
1100 ddOpenBalanceRegionCpu(cr->dd, DdAllowBalanceRegionReopen::yes);
1103 cycles_wait_gpu = 0;
1105 const int start = 0;
1106 const int homenr = mdatoms->homenr;
1108 clear_mat(vir_force);
1110 if (DOMAINDECOMP(cr))
1112 cg1 = cr->dd->ncg_tot;
1114 else
1116 cg1 = top->cgs.nr;
1118 if (fr->n_tpi > 0)
1120 cg1--;
1123 if (bStateChanged)
1125 update_forcerec(fr, box);
1127 if (inputrecNeedMutot(inputrec))
1129 /* Calculate total (local) dipole moment in a temporary common array.
1130 * This makes it possible to sum them over nodes faster.
1132 calc_mu(start, homenr,
1133 x, mdatoms->chargeA, mdatoms->chargeB, mdatoms->nChargePerturbed,
1134 mu, mu+DIM);
1138 if (fr->ePBC != epbcNONE)
1140 /* Compute shift vectors every step,
1141 * because of pressure coupling or box deformation!
1143 if ((flags & GMX_FORCE_DYNAMICBOX) && bStateChanged)
1145 calc_shifts(box, fr->shift_vec);
1148 if (bCalcCGCM)
1150 put_atoms_in_box_omp(fr->ePBC, box, x.subArray(0, homenr));
1151 inc_nrnb(nrnb, eNR_SHIFTX, homenr);
1153 else if (EI_ENERGY_MINIMIZATION(inputrec->eI) && graph)
1155 unshift_self(graph, box, as_rvec_array(x.data()));
1159 nbnxn_atomdata_copy_shiftvec(flags & GMX_FORCE_DYNAMICBOX,
1160 fr->shift_vec, nbv->nbat);
1162 #if GMX_MPI
1163 if (!thisRankHasDuty(cr, DUTY_PME))
1165 /* Send particle coordinates to the pme nodes.
1166 * Since this is only implemented for domain decomposition
1167 * and domain decomposition does not use the graph,
1168 * we do not need to worry about shifting.
1170 gmx_pme_send_coordinates(cr, box, as_rvec_array(x.data()),
1171 lambda[efptCOUL], lambda[efptVDW],
1172 (flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY)),
1173 step, wcycle);
1175 #endif /* GMX_MPI */
1177 if (useGpuPme)
1179 launchPmeGpuSpread(fr->pmedata, box, as_rvec_array(x.data()), flags, wcycle);
1182 /* do gridding for pair search */
1183 if (bNS)
1185 if (graph && bStateChanged)
1187 /* Calculate intramolecular shift vectors to make molecules whole */
1188 mk_mshift(fplog, graph, fr->ePBC, box, as_rvec_array(x.data()));
1191 clear_rvec(vzero);
1192 box_diag[XX] = box[XX][XX];
1193 box_diag[YY] = box[YY][YY];
1194 box_diag[ZZ] = box[ZZ][ZZ];
1196 wallcycle_start(wcycle, ewcNS);
1197 if (!fr->bDomDec)
1199 wallcycle_sub_start(wcycle, ewcsNBS_GRID_LOCAL);
1200 nbnxn_put_on_grid(nbv->nbs, fr->ePBC, box,
1201 0, vzero, box_diag,
1202 0, mdatoms->homenr, -1, fr->cginfo, as_rvec_array(x.data()),
1203 0, nullptr,
1204 nbv->grp[eintLocal].kernel_type,
1205 nbv->nbat);
1206 wallcycle_sub_stop(wcycle, ewcsNBS_GRID_LOCAL);
1208 else
1210 wallcycle_sub_start(wcycle, ewcsNBS_GRID_NONLOCAL);
1211 nbnxn_put_on_grid_nonlocal(nbv->nbs, domdec_zones(cr->dd),
1212 fr->cginfo, as_rvec_array(x.data()),
1213 nbv->grp[eintNonlocal].kernel_type,
1214 nbv->nbat);
1215 wallcycle_sub_stop(wcycle, ewcsNBS_GRID_NONLOCAL);
1218 nbnxn_atomdata_set(nbv->nbat, nbv->nbs, mdatoms, fr->cginfo);
1219 wallcycle_stop(wcycle, ewcNS);
1222 /* initialize the GPU atom data and copy shift vector */
1223 if (bUseGPU)
1225 wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
1226 wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1228 if (bNS)
1230 nbnxn_gpu_init_atomdata(nbv->gpu_nbv, nbv->nbat);
1233 nbnxn_gpu_upload_shiftvec(nbv->gpu_nbv, nbv->nbat);
1235 wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1236 wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1239 /* do local pair search */
1240 if (bNS)
1242 wallcycle_start_nocount(wcycle, ewcNS);
1243 wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_LOCAL);
1244 nbnxn_make_pairlist(nbv->nbs, nbv->nbat,
1245 &top->excls,
1246 nbv->listParams->rlistOuter,
1247 nbv->min_ci_balanced,
1248 &nbv->grp[eintLocal].nbl_lists,
1249 eintLocal,
1250 nbv->grp[eintLocal].kernel_type,
1251 nrnb);
1252 nbv->grp[eintLocal].nbl_lists.outerListCreationStep = step;
1253 if (nbv->listParams->useDynamicPruning && !bUseGPU)
1255 nbnxnPrepareListForDynamicPruning(&nbv->grp[eintLocal].nbl_lists);
1257 wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_LOCAL);
1259 if (bUseGPU)
1261 /* initialize local pair-list on the GPU */
1262 nbnxn_gpu_init_pairlist(nbv->gpu_nbv,
1263 nbv->grp[eintLocal].nbl_lists.nbl[0],
1264 eintLocal);
1266 wallcycle_stop(wcycle, ewcNS);
1268 else
1270 nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs, eatLocal, FALSE, as_rvec_array(x.data()),
1271 nbv->nbat, wcycle);
1274 if (bUseGPU)
1276 if (DOMAINDECOMP(cr))
1278 ddOpenBalanceRegionGpu(cr->dd);
1281 wallcycle_start(wcycle, ewcLAUNCH_GPU);
1282 wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1283 /* launch local nonbonded work on GPU */
1284 do_nb_verlet(fr, ic, enerd, flags, eintLocal, enbvClearFNo,
1285 step, nrnb, wcycle);
1286 wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1287 wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1290 if (useGpuPme)
1292 // In PME GPU and mixed mode we launch FFT / gather after the
1293 // X copy/transform to allow overlap as well as after the GPU NB
1294 // launch to avoid FFT launch overhead hijacking the CPU and delaying
1295 // the nonbonded kernel.
1296 launchPmeGpuFftAndGather(fr->pmedata, wcycle);
1299 /* Communicate coordinates and sum dipole if necessary +
1300 do non-local pair search */
1301 if (DOMAINDECOMP(cr))
1303 if (bNS)
1305 wallcycle_start_nocount(wcycle, ewcNS);
1306 wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_NONLOCAL);
1308 nbnxn_make_pairlist(nbv->nbs, nbv->nbat,
1309 &top->excls,
1310 nbv->listParams->rlistOuter,
1311 nbv->min_ci_balanced,
1312 &nbv->grp[eintNonlocal].nbl_lists,
1313 eintNonlocal,
1314 nbv->grp[eintNonlocal].kernel_type,
1315 nrnb);
1316 nbv->grp[eintNonlocal].nbl_lists.outerListCreationStep = step;
1317 if (nbv->listParams->useDynamicPruning && !bUseGPU)
1319 nbnxnPrepareListForDynamicPruning(&nbv->grp[eintNonlocal].nbl_lists);
1321 wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_NONLOCAL);
1323 if (nbv->grp[eintNonlocal].kernel_type == nbnxnk8x8x8_GPU)
1325 /* initialize non-local pair-list on the GPU */
1326 nbnxn_gpu_init_pairlist(nbv->gpu_nbv,
1327 nbv->grp[eintNonlocal].nbl_lists.nbl[0],
1328 eintNonlocal);
1330 wallcycle_stop(wcycle, ewcNS);
1332 else
1334 dd_move_x(cr->dd, box, as_rvec_array(x.data()), wcycle);
1336 nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs, eatNonlocal, FALSE, as_rvec_array(x.data()),
1337 nbv->nbat, wcycle);
1340 if (bUseGPU)
1342 wallcycle_start(wcycle, ewcLAUNCH_GPU);
1343 wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1344 /* launch non-local nonbonded tasks on GPU */
1345 do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFNo,
1346 step, nrnb, wcycle);
1347 wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1348 wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1352 if (bUseGPU)
1354 /* launch D2H copy-back F */
1355 wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
1356 wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1357 if (DOMAINDECOMP(cr))
1359 nbnxn_gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat,
1360 flags, eatNonlocal);
1362 nbnxn_gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat,
1363 flags, eatLocal);
1364 wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1365 wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1368 if (bStateChanged && inputrecNeedMutot(inputrec))
1370 if (PAR(cr))
1372 gmx_sumd(2*DIM, mu, cr);
1373 ddReopenBalanceRegionCpu(cr->dd);
1376 for (i = 0; i < 2; i++)
1378 for (j = 0; j < DIM; j++)
1380 fr->mu_tot[i][j] = mu[i*DIM + j];
1384 if (fr->efep == efepNO)
1386 copy_rvec(fr->mu_tot[0], mu_tot);
1388 else
1390 for (j = 0; j < DIM; j++)
1392 mu_tot[j] =
1393 (1.0 - lambda[efptCOUL])*fr->mu_tot[0][j] +
1394 lambda[efptCOUL]*fr->mu_tot[1][j];
1398 /* Reset energies */
1399 reset_enerdata(enerd);
1400 clear_rvecs(SHIFTS, fr->fshift);
1402 if (DOMAINDECOMP(cr) && !thisRankHasDuty(cr, DUTY_PME))
1404 wallcycle_start(wcycle, ewcPPDURINGPME);
1405 dd_force_flop_start(cr->dd, nrnb);
1408 if (inputrec->bRot)
1410 wallcycle_start(wcycle, ewcROT);
1411 do_rotation(cr, inputrec, box, as_rvec_array(x.data()), t, step, bNS);
1412 wallcycle_stop(wcycle, ewcROT);
1415 /* Temporary solution until all routines take PaddedRVecVector */
1416 rvec *f = as_rvec_array(force.data());
1418 /* Start the force cycle counter.
1419 * Note that a different counter is used for dynamic load balancing.
1421 wallcycle_start(wcycle, ewcFORCE);
1423 gmx::ArrayRef<gmx::RVec> forceRef = force;
1424 if (bDoForces)
1426 /* If we need to compute the virial, we might need a separate
1427 * force buffer for algorithms for which the virial is calculated
1428 * directly, such as PME.
1430 if ((flags & GMX_FORCE_VIRIAL) && fr->haveDirectVirialContributions)
1432 forceRef = *fr->forceBufferForDirectVirialContributions;
1434 /* We only compute forces on local atoms. Note that vsites can
1435 * spread to non-local atoms, but that part of the buffer is
1436 * cleared separately in the vsite spreading code.
1438 clear_rvecs_omp(mdatoms->homenr, as_rvec_array(forceRef.data()));
1440 /* Clear the short- and long-range forces */
1441 clear_rvecs_omp(fr->natoms_force_constr, f);
1444 /* forceWithVirial uses the local atom range only */
1445 ForceWithVirial forceWithVirial(forceRef, flags & GMX_FORCE_VIRIAL);
1447 if (inputrec->bPull && pull_have_constraint(inputrec->pull_work))
1449 clear_pull_forces(inputrec->pull_work);
1452 /* We calculate the non-bonded forces, when done on the CPU, here.
1453 * We do this before calling do_force_lowlevel, because in that
1454 * function, the listed forces are calculated before PME, which
1455 * does communication. With this order, non-bonded and listed
1456 * force calculation imbalance can be balanced out by the domain
1457 * decomposition load balancing.
1460 if (!bUseOrEmulGPU)
1462 do_nb_verlet(fr, ic, enerd, flags, eintLocal, enbvClearFYes,
1463 step, nrnb, wcycle);
1466 if (fr->efep != efepNO)
1468 /* Calculate the local and non-local free energy interactions here.
1469 * Happens here on the CPU both with and without GPU.
1471 if (fr->nbv->grp[eintLocal].nbl_lists.nbl_fep[0]->nrj > 0)
1473 do_nb_verlet_fep(&fr->nbv->grp[eintLocal].nbl_lists,
1474 fr, as_rvec_array(x.data()), f, mdatoms,
1475 inputrec->fepvals, lambda,
1476 enerd, flags, nrnb, wcycle);
1479 if (DOMAINDECOMP(cr) &&
1480 fr->nbv->grp[eintNonlocal].nbl_lists.nbl_fep[0]->nrj > 0)
1482 do_nb_verlet_fep(&fr->nbv->grp[eintNonlocal].nbl_lists,
1483 fr, as_rvec_array(x.data()), f, mdatoms,
1484 inputrec->fepvals, lambda,
1485 enerd, flags, nrnb, wcycle);
1489 if (!bUseOrEmulGPU)
1491 int aloc;
1493 if (DOMAINDECOMP(cr))
1495 do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFNo,
1496 step, nrnb, wcycle);
1499 if (!bUseOrEmulGPU)
1501 aloc = eintLocal;
1503 else
1505 aloc = eintNonlocal;
1508 /* Add all the non-bonded force to the normal force array.
1509 * This can be split into a local and a non-local part when overlapping
1510 * communication with calculation with domain decomposition.
1512 wallcycle_stop(wcycle, ewcFORCE);
1514 nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs, eatAll, nbv->nbat, f, wcycle);
1516 wallcycle_start_nocount(wcycle, ewcFORCE);
1518 /* if there are multiple fshift output buffers reduce them */
1519 if ((flags & GMX_FORCE_VIRIAL) &&
1520 nbv->grp[aloc].nbl_lists.nnbl > 1)
1522 /* This is not in a subcounter because it takes a
1523 negligible and constant-sized amount of time */
1524 nbnxn_atomdata_add_nbat_fshift_to_fshift(nbv->nbat,
1525 fr->fshift);
1529 /* update QMMMrec, if necessary */
1530 if (fr->bQMMM)
1532 update_QMMMrec(cr, fr, as_rvec_array(x.data()), mdatoms, box);
1535 /* Compute the bonded and non-bonded energies and optionally forces */
1536 do_force_lowlevel(fr, inputrec, &(top->idef),
1537 cr, ms, nrnb, wcycle, mdatoms,
1538 as_rvec_array(x.data()), hist, f, &forceWithVirial, enerd, fcd,
1539 box, inputrec->fepvals, lambda, graph, &(top->excls), fr->mu_tot,
1540 flags, &cycles_pme);
1542 wallcycle_stop(wcycle, ewcFORCE);
1544 computeSpecialForces(fplog, cr, inputrec, step, t, wcycle,
1545 fr->forceProviders, box, x, mdatoms, lambda,
1546 flags, &forceWithVirial, enerd,
1547 ed, bNS);
1549 if (bUseOrEmulGPU)
1551 /* wait for non-local forces (or calculate in emulation mode) */
1552 if (DOMAINDECOMP(cr))
1554 if (bUseGPU)
1556 wallcycle_start(wcycle, ewcWAIT_GPU_NB_NL);
1557 nbnxn_gpu_wait_finish_task(nbv->gpu_nbv,
1558 flags, eatNonlocal,
1559 enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
1560 fr->fshift);
1561 cycles_wait_gpu += wallcycle_stop(wcycle, ewcWAIT_GPU_NB_NL);
1563 else
1565 wallcycle_start_nocount(wcycle, ewcFORCE);
1566 do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFYes,
1567 step, nrnb, wcycle);
1568 wallcycle_stop(wcycle, ewcFORCE);
1571 /* skip the reduction if there was no non-local work to do */
1572 if (nbv->grp[eintNonlocal].nbl_lists.nbl[0]->nsci > 0)
1574 nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs, eatNonlocal,
1575 nbv->nbat, f, wcycle);
1580 if (DOMAINDECOMP(cr))
1582 /* We are done with the CPU compute.
1583 * We will now communicate the non-local forces.
1584 * If we use a GPU this will overlap with GPU work, so in that case
1585 * we do not close the DD force balancing region here.
1587 if (ddCloseBalanceRegion == DdCloseBalanceRegionAfterForceComputation::yes)
1589 ddCloseBalanceRegionCpu(cr->dd);
1591 if (bDoForces)
1593 dd_move_f(cr->dd, f, fr->fshift, wcycle);
1597 // With both nonbonded and PME offloaded a GPU on the same rank, we use
1598 // an alternating wait/reduction scheme.
1599 bool alternateGpuWait = (!c_disableAlternatingWait && useGpuPme && bUseGPU && !DOMAINDECOMP(cr));
1600 if (alternateGpuWait)
1602 alternatePmeNbGpuWaitReduce(fr->nbv, fr->pmedata, &force, &forceWithVirial, fr->fshift, enerd, flags, wcycle);
1605 if (!alternateGpuWait && useGpuPme)
1607 gmx::ArrayRef<const gmx::RVec> pmeGpuForces;
1608 matrix vir_Q;
1609 real Vlr_q = 0.0;
1610 pme_gpu_wait_finish_task(fr->pmedata, wcycle, &pmeGpuForces, vir_Q, &Vlr_q);
1611 pme_gpu_reduce_outputs(wcycle, &forceWithVirial, pmeGpuForces, enerd, vir_Q, Vlr_q);
1614 /* Wait for local GPU NB outputs on the non-alternating wait path */
1615 if (!alternateGpuWait && bUseGPU)
1617 /* Measured overhead on CUDA and OpenCL with(out) GPU sharing
1618 * is between 0.5 and 1.5 Mcycles. So 2 MCycles is an overestimate,
1619 * but even with a step of 0.1 ms the difference is less than 1%
1620 * of the step time.
1622 const float gpuWaitApiOverheadMargin = 2e6f; /* cycles */
1624 wallcycle_start(wcycle, ewcWAIT_GPU_NB_L);
1625 nbnxn_gpu_wait_finish_task(nbv->gpu_nbv,
1626 flags, eatLocal,
1627 enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
1628 fr->fshift);
1629 float cycles_tmp = wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
1631 if (ddCloseBalanceRegion == DdCloseBalanceRegionAfterForceComputation::yes)
1633 DdBalanceRegionWaitedForGpu waitedForGpu = DdBalanceRegionWaitedForGpu::yes;
1634 if (bDoForces && cycles_tmp <= gpuWaitApiOverheadMargin)
1636 /* We measured few cycles, it could be that the kernel
1637 * and transfer finished earlier and there was no actual
1638 * wait time, only API call overhead.
1639 * Then the actual time could be anywhere between 0 and
1640 * cycles_wait_est. We will use half of cycles_wait_est.
1642 waitedForGpu = DdBalanceRegionWaitedForGpu::no;
1644 ddCloseBalanceRegionGpu(cr->dd, cycles_wait_gpu, waitedForGpu);
1648 if (fr->nbv->emulateGpu == EmulateGpuNonbonded::Yes)
1650 // NOTE: emulation kernel is not included in the balancing region,
1651 // but emulation mode does not target performance anyway
1652 wallcycle_start_nocount(wcycle, ewcFORCE);
1653 do_nb_verlet(fr, ic, enerd, flags, eintLocal,
1654 DOMAINDECOMP(cr) ? enbvClearFNo : enbvClearFYes,
1655 step, nrnb, wcycle);
1656 wallcycle_stop(wcycle, ewcFORCE);
1659 if (useGpuPme)
1661 pme_gpu_reinit_computation(fr->pmedata, wcycle);
1664 if (bUseGPU)
1666 /* now clear the GPU outputs while we finish the step on the CPU */
1667 wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
1668 wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1669 nbnxn_gpu_clear_outputs(nbv->gpu_nbv, flags);
1671 /* Is dynamic pair-list pruning activated? */
1672 if (nbv->listParams->useDynamicPruning)
1674 launchGpuRollingPruning(cr, nbv, inputrec, step);
1676 wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1677 wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1680 /* Do the nonbonded GPU (or emulation) force buffer reduction
1681 * on the non-alternating path. */
1682 if (bUseOrEmulGPU && !alternateGpuWait)
1684 nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs, eatLocal,
1685 nbv->nbat, f, wcycle);
1688 if (DOMAINDECOMP(cr))
1690 dd_force_flop_stop(cr->dd, nrnb);
1693 if (bDoForces)
1695 /* If we have NoVirSum forces, but we do not calculate the virial,
1696 * we sum fr->f_novirsum=f later.
1698 if (vsite && !(fr->haveDirectVirialContributions && !(flags & GMX_FORCE_VIRIAL)))
1700 spread_vsite_f(vsite, as_rvec_array(x.data()), f, fr->fshift, FALSE, nullptr, nrnb,
1701 &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr, wcycle);
1704 if (flags & GMX_FORCE_VIRIAL)
1706 /* Calculation of the virial must be done after vsites! */
1707 calc_virial(0, mdatoms->homenr, as_rvec_array(x.data()), f,
1708 vir_force, graph, box, nrnb, fr, inputrec->ePBC);
1712 if (PAR(cr) && !thisRankHasDuty(cr, DUTY_PME))
1714 /* In case of node-splitting, the PP nodes receive the long-range
1715 * forces, virial and energy from the PME nodes here.
1717 pme_receive_force_ener(cr, &forceWithVirial, enerd, wcycle);
1720 if (bDoForces)
1722 post_process_forces(cr, step, nrnb, wcycle,
1723 top, box, as_rvec_array(x.data()), f, &forceWithVirial,
1724 vir_force, mdatoms, graph, fr, vsite,
1725 flags);
1728 if (flags & GMX_FORCE_ENERGY)
1730 /* Sum the potential energy terms from group contributions */
1731 sum_epot(&(enerd->grpp), enerd->term);
1733 if (!EI_TPI(inputrec->eI))
1735 checkPotentialEnergyValidity(step, *enerd, *inputrec);
1740 static void do_force_cutsGROUP(FILE *fplog,
1741 const t_commrec *cr,
1742 const gmx_multisim_t *ms,
1743 const t_inputrec *inputrec,
1744 gmx_int64_t step,
1745 t_nrnb *nrnb,
1746 gmx_wallcycle_t wcycle,
1747 gmx_localtop_t *top,
1748 const gmx_groups_t *groups,
1749 matrix box, gmx::PaddedArrayRef<gmx::RVec> x,
1750 history_t *hist,
1751 gmx::PaddedArrayRef<gmx::RVec> force,
1752 tensor vir_force,
1753 const t_mdatoms *mdatoms,
1754 gmx_enerdata_t *enerd,
1755 t_fcdata *fcd,
1756 real *lambda,
1757 t_graph *graph,
1758 t_forcerec *fr,
1759 const gmx_vsite_t *vsite,
1760 rvec mu_tot,
1761 double t,
1762 const gmx_edsam *ed,
1763 int flags,
1764 DdOpenBalanceRegionBeforeForceComputation ddOpenBalanceRegion,
1765 DdCloseBalanceRegionAfterForceComputation ddCloseBalanceRegion)
1767 int cg0, cg1, i, j;
1768 double mu[2*DIM];
1769 gmx_bool bStateChanged, bNS, bFillGrid, bCalcCGCM;
1770 gmx_bool bDoForces;
1771 float cycles_pme;
1773 const int start = 0;
1774 const int homenr = mdatoms->homenr;
1776 clear_mat(vir_force);
1778 cg0 = 0;
1779 if (DOMAINDECOMP(cr))
1781 cg1 = cr->dd->ncg_tot;
1783 else
1785 cg1 = top->cgs.nr;
1787 if (fr->n_tpi > 0)
1789 cg1--;
1792 bStateChanged = (flags & GMX_FORCE_STATECHANGED);
1793 bNS = (flags & GMX_FORCE_NS) && (fr->bAllvsAll == FALSE);
1794 /* Should we perform the long-range nonbonded evaluation inside the neighbor searching?
1795 bFillGrid = (bNS && bStateChanged);
1796 bCalcCGCM = (bFillGrid && !DOMAINDECOMP(cr));
1797 bDoForces = (flags & GMX_FORCE_FORCES);
1799 if (bStateChanged)
1801 update_forcerec(fr, box);
1803 if (inputrecNeedMutot(inputrec))
1805 /* Calculate total (local) dipole moment in a temporary common array.
1806 * This makes it possible to sum them over nodes faster.
1808 calc_mu(start, homenr,
1809 x, mdatoms->chargeA, mdatoms->chargeB, mdatoms->nChargePerturbed,
1810 mu, mu+DIM);
1814 if (fr->ePBC != epbcNONE)
1816 /* Compute shift vectors every step,
1817 * because of pressure coupling or box deformation!
1819 if ((flags & GMX_FORCE_DYNAMICBOX) && bStateChanged)
1821 calc_shifts(box, fr->shift_vec);
1824 if (bCalcCGCM)
1826 put_charge_groups_in_box(fplog, cg0, cg1, fr->ePBC, box,
1827 &(top->cgs), as_rvec_array(x.data()), fr->cg_cm);
1828 inc_nrnb(nrnb, eNR_CGCM, homenr);
1829 inc_nrnb(nrnb, eNR_RESETX, cg1-cg0);
1831 else if (EI_ENERGY_MINIMIZATION(inputrec->eI) && graph)
1833 unshift_self(graph, box, as_rvec_array(x.data()));
1836 else if (bCalcCGCM)
1838 calc_cgcm(fplog, cg0, cg1, &(top->cgs), as_rvec_array(x.data()), fr->cg_cm);
1839 inc_nrnb(nrnb, eNR_CGCM, homenr);
1842 if (bCalcCGCM && gmx_debug_at)
1844 pr_rvecs(debug, 0, "cgcm", fr->cg_cm, top->cgs.nr);
1847 #if GMX_MPI
1848 if (!thisRankHasDuty(cr, DUTY_PME))
1850 /* Send particle coordinates to the pme nodes.
1851 * Since this is only implemented for domain decomposition
1852 * and domain decomposition does not use the graph,
1853 * we do not need to worry about shifting.
1855 gmx_pme_send_coordinates(cr, box, as_rvec_array(x.data()),
1856 lambda[efptCOUL], lambda[efptVDW],
1857 (flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY)),
1858 step, wcycle);
1860 #endif /* GMX_MPI */
1862 /* Communicate coordinates and sum dipole if necessary */
1863 if (DOMAINDECOMP(cr))
1865 dd_move_x(cr->dd, box, as_rvec_array(x.data()), wcycle);
1867 /* No GPU support, no move_x overlap, so reopen the balance region here */
1868 if (ddOpenBalanceRegion == DdOpenBalanceRegionBeforeForceComputation::yes)
1870 ddReopenBalanceRegionCpu(cr->dd);
1874 if (inputrecNeedMutot(inputrec))
1876 if (bStateChanged)
1878 if (PAR(cr))
1880 gmx_sumd(2*DIM, mu, cr);
1881 ddReopenBalanceRegionCpu(cr->dd);
1883 for (i = 0; i < 2; i++)
1885 for (j = 0; j < DIM; j++)
1887 fr->mu_tot[i][j] = mu[i*DIM + j];
1891 if (fr->efep == efepNO)
1893 copy_rvec(fr->mu_tot[0], mu_tot);
1895 else
1897 for (j = 0; j < DIM; j++)
1899 mu_tot[j] =
1900 (1.0 - lambda[efptCOUL])*fr->mu_tot[0][j] + lambda[efptCOUL]*fr->mu_tot[1][j];
1905 /* Reset energies */
1906 reset_enerdata(enerd);
1907 clear_rvecs(SHIFTS, fr->fshift);
1909 if (bNS)
1911 wallcycle_start(wcycle, ewcNS);
1913 if (graph && bStateChanged)
1915 /* Calculate intramolecular shift vectors to make molecules whole */
1916 mk_mshift(fplog, graph, fr->ePBC, box, as_rvec_array(x.data()));
1919 /* Do the actual neighbour searching */
1920 ns(fplog, fr, box,
1921 groups, top, mdatoms,
1922 cr, nrnb, bFillGrid);
1924 wallcycle_stop(wcycle, ewcNS);
1927 if (DOMAINDECOMP(cr) && !thisRankHasDuty(cr, DUTY_PME))
1929 wallcycle_start(wcycle, ewcPPDURINGPME);
1930 dd_force_flop_start(cr->dd, nrnb);
1933 if (inputrec->bRot)
1935 wallcycle_start(wcycle, ewcROT);
1936 do_rotation(cr, inputrec, box, as_rvec_array(x.data()), t, step, bNS);
1937 wallcycle_stop(wcycle, ewcROT);
1940 /* Temporary solution until all routines take PaddedRVecVector */
1941 rvec *f = as_rvec_array(force.data());
1943 /* Start the force cycle counter.
1944 * Note that a different counter is used for dynamic load balancing.
1946 wallcycle_start(wcycle, ewcFORCE);
1948 gmx::ArrayRef<gmx::RVec> forceRef = force;
1949 if (bDoForces)
1951 /* If we need to compute the virial, we might need a separate
1952 * force buffer for algorithms for which the virial is calculated
1953 * separately, such as PME.
1955 if ((flags & GMX_FORCE_VIRIAL) && fr->haveDirectVirialContributions)
1957 forceRef = *fr->forceBufferForDirectVirialContributions;
1959 clear_rvecs_omp(forceRef.size(), as_rvec_array(forceRef.data()));
1962 /* Clear the short- and long-range forces */
1963 clear_rvecs(fr->natoms_force_constr, f);
1966 /* forceWithVirial might need the full force atom range */
1967 ForceWithVirial forceWithVirial(forceRef, flags & GMX_FORCE_VIRIAL);
1969 if (inputrec->bPull && pull_have_constraint(inputrec->pull_work))
1971 clear_pull_forces(inputrec->pull_work);
1974 /* update QMMMrec, if necessary */
1975 if (fr->bQMMM)
1977 update_QMMMrec(cr, fr, as_rvec_array(x.data()), mdatoms, box);
1980 /* Compute the bonded and non-bonded energies and optionally forces */
1981 do_force_lowlevel(fr, inputrec, &(top->idef),
1982 cr, ms, nrnb, wcycle, mdatoms,
1983 as_rvec_array(x.data()), hist, f, &forceWithVirial, enerd, fcd,
1984 box, inputrec->fepvals, lambda,
1985 graph, &(top->excls), fr->mu_tot,
1986 flags,
1987 &cycles_pme);
1989 wallcycle_stop(wcycle, ewcFORCE);
1991 if (DOMAINDECOMP(cr))
1993 dd_force_flop_stop(cr->dd, nrnb);
1995 if (ddCloseBalanceRegion == DdCloseBalanceRegionAfterForceComputation::yes)
1997 ddCloseBalanceRegionCpu(cr->dd);
2001 computeSpecialForces(fplog, cr, inputrec, step, t, wcycle,
2002 fr->forceProviders, box, x, mdatoms, lambda,
2003 flags, &forceWithVirial, enerd,
2004 ed, bNS);
2006 if (bDoForces)
2008 /* Communicate the forces */
2009 if (DOMAINDECOMP(cr))
2011 dd_move_f(cr->dd, f, fr->fshift, wcycle);
2012 /* Do we need to communicate the separate force array
2013 * for terms that do not contribute to the single sum virial?
2014 * Position restraints and electric fields do not introduce
2015 * inter-cg forces, only full electrostatics methods do.
2016 * When we do not calculate the virial, fr->f_novirsum = f,
2017 * so we have already communicated these forces.
2019 if (EEL_FULL(fr->ic->eeltype) && cr->dd->n_intercg_excl &&
2020 (flags & GMX_FORCE_VIRIAL))
2022 dd_move_f(cr->dd, as_rvec_array(forceWithVirial.force_.data()), nullptr, wcycle);
2026 /* If we have NoVirSum forces, but we do not calculate the virial,
2027 * we sum fr->f_novirsum=f later.
2029 if (vsite && !(fr->haveDirectVirialContributions && !(flags & GMX_FORCE_VIRIAL)))
2031 spread_vsite_f(vsite, as_rvec_array(x.data()), f, fr->fshift, FALSE, nullptr, nrnb,
2032 &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr, wcycle);
2035 if (flags & GMX_FORCE_VIRIAL)
2037 /* Calculation of the virial must be done after vsites! */
2038 calc_virial(0, mdatoms->homenr, as_rvec_array(x.data()), f,
2039 vir_force, graph, box, nrnb, fr, inputrec->ePBC);
2043 if (PAR(cr) && !thisRankHasDuty(cr, DUTY_PME))
2045 /* In case of node-splitting, the PP nodes receive the long-range
2046 * forces, virial and energy from the PME nodes here.
2048 pme_receive_force_ener(cr, &forceWithVirial, enerd, wcycle);
2051 if (bDoForces)
2053 post_process_forces(cr, step, nrnb, wcycle,
2054 top, box, as_rvec_array(x.data()), f, &forceWithVirial,
2055 vir_force, mdatoms, graph, fr, vsite,
2056 flags);
2059 if (flags & GMX_FORCE_ENERGY)
2061 /* Sum the potential energy terms from group contributions */
2062 sum_epot(&(enerd->grpp), enerd->term);
2064 if (!EI_TPI(inputrec->eI))
2066 checkPotentialEnergyValidity(step, *enerd, *inputrec);
2072 void do_force(FILE *fplog,
2073 const t_commrec *cr,
2074 const gmx_multisim_t *ms,
2075 const t_inputrec *inputrec,
2076 gmx_int64_t step,
2077 t_nrnb *nrnb,
2078 gmx_wallcycle_t wcycle,
2079 gmx_localtop_t *top,
2080 const gmx_groups_t *groups,
2081 matrix box,
2082 gmx::PaddedArrayRef<gmx::RVec> x,
2083 history_t *hist,
2084 gmx::PaddedArrayRef<gmx::RVec> force,
2085 tensor vir_force,
2086 const t_mdatoms *mdatoms,
2087 gmx_enerdata_t *enerd,
2088 t_fcdata *fcd,
2089 gmx::ArrayRef<real> lambda,
2090 t_graph *graph,
2091 t_forcerec *fr,
2092 const gmx_vsite_t *vsite,
2093 rvec mu_tot,
2094 double t,
2095 const gmx_edsam *ed,
2096 int flags,
2097 DdOpenBalanceRegionBeforeForceComputation ddOpenBalanceRegion,
2098 DdCloseBalanceRegionAfterForceComputation ddCloseBalanceRegion)
2100 /* modify force flag if not doing nonbonded */
2101 if (!fr->bNonbonded)
2103 flags &= ~GMX_FORCE_NONBONDED;
2106 GMX_ASSERT(x.size() >= gmx::paddedRVecVectorSize(fr->natoms_force), "coordinates should be padded");
2107 GMX_ASSERT(force.size() >= gmx::paddedRVecVectorSize(fr->natoms_force), "force should be padded");
2109 switch (inputrec->cutoff_scheme)
2111 case ecutsVERLET:
2112 do_force_cutsVERLET(fplog, cr, ms, inputrec,
2113 step, nrnb, wcycle,
2114 top,
2115 groups,
2116 box, x, hist,
2117 force, vir_force,
2118 mdatoms,
2119 enerd, fcd,
2120 lambda.data(), graph,
2121 fr, fr->ic,
2122 vsite, mu_tot,
2123 t, ed,
2124 flags,
2125 ddOpenBalanceRegion,
2126 ddCloseBalanceRegion);
2127 break;
2128 case ecutsGROUP:
2129 do_force_cutsGROUP(fplog, cr, ms, inputrec,
2130 step, nrnb, wcycle,
2131 top,
2132 groups,
2133 box, x, hist,
2134 force, vir_force,
2135 mdatoms,
2136 enerd, fcd,
2137 lambda.data(), graph,
2138 fr, vsite, mu_tot,
2139 t, ed,
2140 flags,
2141 ddOpenBalanceRegion,
2142 ddCloseBalanceRegion);
2143 break;
2144 default:
2145 gmx_incons("Invalid cut-off scheme passed!");
2148 /* In case we don't have constraints and are using GPUs, the next balancing
2149 * region starts here.
2150 * Some "special" work at the end of do_force_cuts?, such as vsite spread,
2151  * virial calculation and COM pulling, is thus not included in
2152 * the balance timing, which is ok as most tasks do communication.
2154 if (ddOpenBalanceRegion == DdOpenBalanceRegionBeforeForceComputation::yes)
2156 ddOpenBalanceRegionCpu(cr->dd, DdAllowBalanceRegionReopen::no);
2161 void do_constrain_first(FILE *fplog, Constraints *constr,
2162 t_inputrec *ir, t_mdatoms *md,
2163 t_state *state, const t_commrec *cr,
2164 const gmx_multisim_t *ms,
2165 t_nrnb *nrnb,
2166 t_forcerec *fr, gmx_localtop_t *top)
2168 int i, m, start, end;
2169 gmx_int64_t step;
2170 real dt = ir->delta_t;
2171 real dvdl_dum;
2172 rvec *savex;
2174 /* We need to allocate one element extra, since we might use
2175 * (unaligned) 4-wide SIMD loads to access rvec entries.
2177 snew(savex, state->natoms + 1);
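    /* For example, an unaligned 4-wide load that starts at the x component of
     * the last rvec reads one real past the natoms*3 reals, which the extra
     * element absorbs.
     */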
2179 start = 0;
2180 end = md->homenr;
2182 if (debug)
2184 fprintf(debug, "vcm: start=%d, homenr=%d, end=%d\n",
2185 start, md->homenr, end);
2187 /* Do a first constrain to reset particles... */
2188 step = ir->init_step;
2189 if (fplog)
2191 char buf[STEPSTRSIZE];
2192 fprintf(fplog, "\nConstraining the starting coordinates (step %s)\n",
2193 gmx_step_str(step, buf));
2195 dvdl_dum = 0;
2197 /* constrain the current position */
2198 constrain(nullptr, TRUE, FALSE, constr, &(top->idef),
2199 ir, cr, ms, step, 0, 1.0, md,
2200 as_rvec_array(state->x.data()), as_rvec_array(state->x.data()), nullptr,
2201 fr->bMolPBC, state->box,
2202 state->lambda[efptBONDED], &dvdl_dum,
2203 nullptr, nullptr, nrnb, econqCoord);
2204 if (EI_VV(ir->eI))
2206         /* constrain the initial velocity, and save it */
2207 /* also may be useful if we need the ekin from the halfstep for velocity verlet */
2208 constrain(nullptr, TRUE, FALSE, constr, &(top->idef),
2209 ir, cr, ms, step, 0, 1.0, md,
2210 as_rvec_array(state->x.data()), as_rvec_array(state->v.data()), as_rvec_array(state->v.data()),
2211 fr->bMolPBC, state->box,
2212 state->lambda[efptBONDED], &dvdl_dum,
2213 nullptr, nullptr, nrnb, econqVeloc);
2215     /* constrain the initial velocities at t-dt/2 */
2216 if (EI_STATE_VELOCITY(ir->eI) && ir->eI != eiVV)
2218 for (i = start; (i < end); i++)
2220 for (m = 0; (m < DIM); m++)
2222 /* Reverse the velocity */
2223 state->v[i][m] = -state->v[i][m];
2224                 /* Store the position at t-dt in savex (v was just reversed, so x + dt*v is x - dt*v_original) */
2225 savex[i][m] = state->x[i][m] + dt*state->v[i][m];
2228 /* Shake the positions at t=-dt with the positions at t=0
2229 * as reference coordinates.
2231 if (fplog)
2233 char buf[STEPSTRSIZE];
2234 fprintf(fplog, "\nConstraining the coordinates at t0-dt (step %s)\n",
2235 gmx_step_str(step, buf));
2237 dvdl_dum = 0;
2238 constrain(nullptr, TRUE, FALSE, constr, &(top->idef),
2239 ir, cr, ms, step, -1, 1.0, md,
2240 as_rvec_array(state->x.data()), savex, nullptr,
2241 fr->bMolPBC, state->box,
2242 state->lambda[efptBONDED], &dvdl_dum,
2243 as_rvec_array(state->v.data()), nullptr, nrnb, econqCoord);
2245 for (i = start; i < end; i++)
2247 for (m = 0; m < DIM; m++)
2249 /* Re-reverse the velocities */
2250 state->v[i][m] = -state->v[i][m];
2254 sfree(savex);
2258 static void
2259 integrate_table(real vdwtab[], real scale, int offstart, int rstart, int rend,
2260 double *enerout, double *virout)
2262 double enersum, virsum;
2263 double invscale, invscale2, invscale3;
2264 double r, ea, eb, ec, pa, pb, pc, pd;
2265 double y0, f, g, h;
2266 int ri, offset;
2267 double tabfactor;
2269 invscale = 1.0/scale;
2270 invscale2 = invscale*invscale;
2271 invscale3 = invscale*invscale2;
2273     /* The following summation is derived from the cubic spline definition,
2274      * Numerical Recipes in C, second edition, p. 113-116. Exact for
2275 * the cubic spline. We first calculate the negative of the
2276 * energy from rvdw to rvdw_switch, assuming that g(r)=1, and then
2277 * add the more standard, abrupt cutoff correction to that result,
2278 * yielding the long-range correction for a switched function. We
2279 * perform both the pressure and energy loops at the same time for
2280 * simplicity, as the computational cost is low. */
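    /* In other words: writing the spline within each table bin as
     * V(eps) = y0 + f*eps + g*eps^2 + h*eps^3 (eps in [0,1], bin width 1/scale),
     * the energy loop below accumulates Int r'^2 V(r') dr' and the pressure loop
     * Int r'^3 V'(r') dr' over each bin [r, r + 1/scale]; the ea/eb/ec and
     * pa/pb/pc/pd terms are simply the expansions of (r + eps/scale)^2 and
     * (r + eps/scale)^3. The 4*pi factor applied at the end turns these sums
     * into the radial volume integrals needed for the long-range correction.
     */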
2282 if (offstart == 0)
2284         /* Since the dispersion table has been scaled down by a factor of
2285          * 6.0 and the repulsion by a factor of 12.0 to compensate for the
2286 * c6/c12 parameters inside nbfp[] being scaled up (to save
2287 * flops in kernels), we need to correct for this.
2289 tabfactor = 6.0;
2291 else
2293 tabfactor = 12.0;
2296 enersum = 0.0;
2297 virsum = 0.0;
2298 for (ri = rstart; ri < rend; ++ri)
2300 r = ri*invscale;
2301 ea = invscale3;
2302 eb = 2.0*invscale2*r;
2303 ec = invscale*r*r;
2305 pa = invscale3;
2306 pb = 3.0*invscale2*r;
2307 pc = 3.0*invscale*r*r;
2308 pd = r*r*r;
2310         /* this "8" is the stride of the vdwtab array: dispersion and repulsion
2311            each store 4 cubic-spline values (y0, f, g, h) per point - perhaps it should be a named constant? */
2313 offset = 8*ri + offstart;
2314 y0 = vdwtab[offset];
2315 f = vdwtab[offset+1];
2316 g = vdwtab[offset+2];
2317 h = vdwtab[offset+3];
2319 enersum += y0*(ea/3 + eb/2 + ec) + f*(ea/4 + eb/3 + ec/2) + g*(ea/5 + eb/4 + ec/3) + h*(ea/6 + eb/5 + ec/4);
2320 virsum += f*(pa/4 + pb/3 + pc/2 + pd) + 2*g*(pa/5 + pb/4 + pc/3 + pd/2) + 3*h*(pa/6 + pb/5 + pc/4 + pd/3);
2322 *enerout = 4.0*M_PI*enersum*tabfactor;
2323 *virout = 4.0*M_PI*virsum*tabfactor;
2326 void calc_enervirdiff(FILE *fplog, int eDispCorr, t_forcerec *fr)
2328 double eners[2], virs[2], enersum, virsum;
2329 double r0, rc3, rc9;
2330 int ri0, ri1, i;
2331 real scale, *vdwtab;
2333 fr->enershiftsix = 0;
2334 fr->enershifttwelve = 0;
2335 fr->enerdiffsix = 0;
2336 fr->enerdifftwelve = 0;
2337 fr->virdiffsix = 0;
2338 fr->virdifftwelve = 0;
2340 const interaction_const_t *ic = fr->ic;
2342 if (eDispCorr != edispcNO)
2344 for (i = 0; i < 2; i++)
2346 eners[i] = 0;
2347 virs[i] = 0;
2349 if ((ic->vdw_modifier == eintmodPOTSHIFT) ||
2350 (ic->vdw_modifier == eintmodPOTSWITCH) ||
2351 (ic->vdw_modifier == eintmodFORCESWITCH) ||
2352 (ic->vdwtype == evdwSHIFT) ||
2353 (ic->vdwtype == evdwSWITCH))
2355 if (((ic->vdw_modifier == eintmodPOTSWITCH) ||
2356 (ic->vdw_modifier == eintmodFORCESWITCH) ||
2357 (ic->vdwtype == evdwSWITCH)) && ic->rvdw_switch == 0)
2359 gmx_fatal(FARGS,
2360 "With dispersion correction rvdw-switch can not be zero "
2361 "for vdw-type = %s", evdw_names[ic->vdwtype]);
2364 /* TODO This code depends on the logic in tables.c that
2365 constructs the table layout, which should be made
2366 explicit in future cleanup. */
2367 GMX_ASSERT(fr->dispersionCorrectionTable->interaction == GMX_TABLE_INTERACTION_VDWREP_VDWDISP,
2368 "Dispersion-correction code needs a table with both repulsion and dispersion terms");
2369 scale = fr->dispersionCorrectionTable->scale;
2370 vdwtab = fr->dispersionCorrectionTable->data;
2372 /* Round the cut-offs to exact table values for precision */
2373 ri0 = static_cast<int>(floor(ic->rvdw_switch*scale));
2374 ri1 = static_cast<int>(ceil(ic->rvdw*scale));
2376 /* The code below has some support for handling force-switching, i.e.
2377 * when the force (instead of potential) is switched over a limited
2378 * region. This leads to a constant shift in the potential inside the
2379 * switching region, which we can handle by adding a constant energy
2380 * term in the force-switch case just like when we do potential-shift.
2382 * For now this is not enabled, but to keep the functionality in the
2383 * code we check separately for switch and shift. When we do force-switch
2384 * the shifting point is rvdw_switch, while it is the cutoff when we
2385 * have a classical potential-shift.
2387 * For a pure potential-shift the potential has a constant shift
2388 * all the way out to the cutoff, and that is it. For other forms
2389 * we need to calculate the constant shift up to the point where we
2390 * start modifying the potential.
2392 ri0 = (ic->vdw_modifier == eintmodPOTSHIFT) ? ri1 : ri0;
2394 r0 = ri0/scale;
2395 rc3 = r0*r0*r0;
2396 rc9 = rc3*rc3*rc3;
2398 if ((ic->vdw_modifier == eintmodFORCESWITCH) ||
2399 (ic->vdwtype == evdwSHIFT))
2401 /* Determine the constant energy shift below rvdw_switch.
2402 * Table has a scale factor since we have scaled it down to compensate
2403 * for scaling-up c6/c12 with the derivative factors to save flops in analytical kernels.
2405 fr->enershiftsix = (real)(-1.0/(rc3*rc3)) - 6.0*vdwtab[8*ri0];
2406 fr->enershifttwelve = (real)( 1.0/(rc9*rc3)) - 12.0*vdwtab[8*ri0 + 4];
2408 else if (ic->vdw_modifier == eintmodPOTSHIFT)
2410 fr->enershiftsix = (real)(-1.0/(rc3*rc3));
2411 fr->enershifttwelve = (real)( 1.0/(rc9*rc3));
2414 /* Add the constant part from 0 to rvdw_switch.
2415 * This integration from 0 to rvdw_switch overcounts the number
2416 * of interactions by 1, as it also counts the self interaction.
2417 * We will correct for this later.
2419 eners[0] += 4.0*M_PI*fr->enershiftsix*rc3/3.0;
2420 eners[1] += 4.0*M_PI*fr->enershifttwelve*rc3/3.0;
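            /* i.e. the constant shift integrated over a sphere of radius r0:
             * Int_0^r0 4*pi*r^2 dr = (4*pi/3)*r0^3, multiplied by the shift.
             */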
2422 /* Calculate the contribution in the range [r0,r1] where we
2423 * modify the potential. For a pure potential-shift modifier we will
2424 * have ri0==ri1, and there will not be any contribution here.
2426 for (i = 0; i < 2; i++)
2428 enersum = 0;
2429 virsum = 0;
2430 integrate_table(vdwtab, scale, (i == 0 ? 0 : 4), ri0, ri1, &enersum, &virsum);
2431 eners[i] -= enersum;
2432 virs[i] -= virsum;
2435 /* Alright: Above we compensated by REMOVING the parts outside r0
2436              * corresponding to the ideal VdW 1/r6 and 1/r12 potentials.
2438 * Regardless of whether r0 is the point where we start switching,
2439 * or the cutoff where we calculated the constant shift, we include
2440 * all the parts we are missing out to infinity from r0 by
2441 * calculating the analytical dispersion correction.
2443 eners[0] += -4.0*M_PI/(3.0*rc3);
2444 eners[1] += 4.0*M_PI/(9.0*rc9);
2445 virs[0] += 8.0*M_PI/rc3;
2446 virs[1] += -16.0*M_PI/(3.0*rc9);
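            /* These are the analytic tail integrals for V = -1/r^6 and V = 1/r^12:
             * energy: Int_r0^inf 4*pi*r^2 V(r) dr  = -4*pi/(3*r0^3) and 4*pi/(9*r0^9),
             * virial: Int_r0^inf 4*pi*r^3 V'(r) dr =  8*pi/r0^3 and -16*pi/(3*r0^9);
             * the C6/C12 prefactors are applied later in calc_dispcorr().
             */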
2448 else if (ic->vdwtype == evdwCUT ||
2449 EVDW_PME(ic->vdwtype) ||
2450 ic->vdwtype == evdwUSER)
2452 if (ic->vdwtype == evdwUSER && fplog)
2454 fprintf(fplog,
2455 "WARNING: using dispersion correction with user tables\n");
2458 /* Note that with LJ-PME, the dispersion correction is multiplied
2459 * by the difference between the actual C6 and the value of C6
2460 * that would produce the combination rule.
2461 * This means the normal energy and virial difference formulas
2462 * can be used here.
2465 rc3 = ic->rvdw*ic->rvdw*ic->rvdw;
2466 rc9 = rc3*rc3*rc3;
2467 /* Contribution beyond the cut-off */
2468 eners[0] += -4.0*M_PI/(3.0*rc3);
2469 eners[1] += 4.0*M_PI/(9.0*rc9);
2470 if (ic->vdw_modifier == eintmodPOTSHIFT)
2472 /* Contribution within the cut-off */
2473 eners[0] += -4.0*M_PI/(3.0*rc3);
2474 eners[1] += 4.0*M_PI/(3.0*rc9);
2476 /* Contribution beyond the cut-off */
2477 virs[0] += 8.0*M_PI/rc3;
2478 virs[1] += -16.0*M_PI/(3.0*rc9);
2480 else
2482 gmx_fatal(FARGS,
2483 "Dispersion correction is not implemented for vdw-type = %s",
2484 evdw_names[ic->vdwtype]);
2487 /* When we deprecate the group kernels the code below can go too */
2488 if (ic->vdwtype == evdwPME && fr->cutoff_scheme == ecutsGROUP)
2490 /* Calculate self-interaction coefficient (assuming that
2491 * the reciprocal-space contribution is constant in the
2492 * region that contributes to the self-interaction).
2494 fr->enershiftsix = gmx::power6(ic->ewaldcoeff_lj) / 6.0;
2496 eners[0] += -gmx::power3(std::sqrt(M_PI)*ic->ewaldcoeff_lj)/3.0;
2497 virs[0] += gmx::power3(std::sqrt(M_PI)*ic->ewaldcoeff_lj);
2500 fr->enerdiffsix = eners[0];
2501 fr->enerdifftwelve = eners[1];
2502 /* The 0.5 is due to the Gromacs definition of the virial */
2503 fr->virdiffsix = 0.5*virs[0];
2504 fr->virdifftwelve = 0.5*virs[1];
2508 void calc_dispcorr(t_inputrec *ir, t_forcerec *fr,
2509 matrix box, real lambda, tensor pres, tensor virial,
2510 real *prescorr, real *enercorr, real *dvdlcorr)
2512 gmx_bool bCorrAll, bCorrPres;
2513 real dvdlambda, invvol, dens, ninter, avcsix, avctwelve, enerdiff, svir = 0, spres = 0;
2514 int m;
2516 *prescorr = 0;
2517 *enercorr = 0;
2518 *dvdlcorr = 0;
2520 clear_mat(virial);
2521 clear_mat(pres);
2523 if (ir->eDispCorr != edispcNO)
2525 bCorrAll = (ir->eDispCorr == edispcAllEner ||
2526 ir->eDispCorr == edispcAllEnerPres);
2527 bCorrPres = (ir->eDispCorr == edispcEnerPres ||
2528 ir->eDispCorr == edispcAllEnerPres);
2530 invvol = 1/det(box);
2531 if (fr->n_tpi)
2533 /* Only correct for the interactions with the inserted molecule */
2534 dens = (fr->numAtomsForDispersionCorrection - fr->n_tpi)*invvol;
2535 ninter = fr->n_tpi;
2537 else
2539 dens = fr->numAtomsForDispersionCorrection*invvol;
2540 ninter = 0.5*fr->numAtomsForDispersionCorrection;
2543 if (ir->efep == efepNO)
2545 avcsix = fr->avcsix[0];
2546 avctwelve = fr->avctwelve[0];
2548 else
2550 avcsix = (1 - lambda)*fr->avcsix[0] + lambda*fr->avcsix[1];
2551 avctwelve = (1 - lambda)*fr->avctwelve[0] + lambda*fr->avctwelve[1];
2554 enerdiff = ninter*(dens*fr->enerdiffsix - fr->enershiftsix);
2555 *enercorr += avcsix*enerdiff;
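        /* For a plain cut-off without potential modifiers this reduces to the
         * textbook tail correction E_corr = -(2*pi/3)*N*rho*<C6>/rc^3,
         * with ninter = N/2, dens = rho = N/V and N the number of atoms
         * included in the correction.
         */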
2556 dvdlambda = 0.0;
2557 if (ir->efep != efepNO)
2559 dvdlambda += (fr->avcsix[1] - fr->avcsix[0])*enerdiff;
2561 if (bCorrAll)
2563 enerdiff = ninter*(dens*fr->enerdifftwelve - fr->enershifttwelve);
2564 *enercorr += avctwelve*enerdiff;
2565 if (fr->efep != efepNO)
2567 dvdlambda += (fr->avctwelve[1] - fr->avctwelve[0])*enerdiff;
2571 if (bCorrPres)
2573 svir = ninter*dens*avcsix*fr->virdiffsix/3.0;
2574 if (ir->eDispCorr == edispcAllEnerPres)
2576 svir += ninter*dens*avctwelve*fr->virdifftwelve/3.0;
2578 /* The factor 2 is because of the Gromacs virial definition */
2579 spres = -2.0*invvol*svir*PRESFAC;
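            /* GROMACS computes the pressure tensor as P = (2/V)*(Ekin - Xi) with
             * the virial Xi = -1/2 sum_i r_i (x) F_i; the correction has no kinetic
             * part, hence the -2/V here, and PRESFAC converts kJ mol^-1 nm^-3 to bar.
             */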
2581 for (m = 0; m < DIM; m++)
2583 virial[m][m] += svir;
2584 pres[m][m] += spres;
2586 *prescorr += spres;
2589     /* Can't currently control when it prints; for now, just print when debugging */
2590 if (debug)
2592 if (bCorrAll)
2594 fprintf(debug, "Long Range LJ corr.: <C6> %10.4e, <C12> %10.4e\n",
2595 avcsix, avctwelve);
2597 if (bCorrPres)
2599 fprintf(debug,
2600 "Long Range LJ corr.: Epot %10g, Pres: %10g, Vir: %10g\n",
2601 *enercorr, spres, svir);
2603 else
2605 fprintf(debug, "Long Range LJ corr.: Epot %10g\n", *enercorr);
2609 if (fr->efep != efepNO)
2611 *dvdlcorr += dvdlambda;
2616 static void low_do_pbc_mtop(FILE *fplog, int ePBC, const matrix box,
2617 const gmx_mtop_t *mtop, rvec x[],
2618 gmx_bool bFirst)
2620 t_graph *graph;
2621 int as, mol;
2623 if (bFirst && fplog)
2625 fprintf(fplog, "Removing pbc first time\n");
2628 snew(graph, 1);
2629 as = 0;
2630 for (const gmx_molblock_t &molb : mtop->molblock)
2632 const gmx_moltype_t &moltype = mtop->moltype[molb.type];
2633 if (moltype.atoms.nr == 1 ||
2634 (!bFirst && moltype.cgs.nr == 1))
2636 /* Just one atom or charge group in the molecule, no PBC required */
2637 as += molb.nmol*moltype.atoms.nr;
2639 else
2641             /* Pass NULL instead of fplog to avoid graph prints for each molecule type */
2642 mk_graph_ilist(nullptr, moltype.ilist,
2643 0, moltype.atoms.nr, FALSE, FALSE, graph);
2645 for (mol = 0; mol < molb.nmol; mol++)
2647 mk_mshift(fplog, graph, ePBC, box, x+as);
2649 shift_self(graph, box, x+as);
2650 /* The molecule is whole now.
2651 * We don't need the second mk_mshift call as in do_pbc_first,
2652 * since we no longer need this graph.
2655 as += moltype.atoms.nr;
2657 done_graph(graph);
2660 sfree(graph);
2663 void do_pbc_first_mtop(FILE *fplog, int ePBC, const matrix box,
2664 const gmx_mtop_t *mtop, rvec x[])
2666 low_do_pbc_mtop(fplog, ePBC, box, mtop, x, TRUE);
2669 void do_pbc_mtop(FILE *fplog, int ePBC, const matrix box,
2670 const gmx_mtop_t *mtop, rvec x[])
2672 low_do_pbc_mtop(fplog, ePBC, box, mtop, x, FALSE);
2675 void put_atoms_in_box_omp(int ePBC, const matrix box, gmx::ArrayRef<gmx::RVec> x)
2677 int t, nth;
2678 nth = gmx_omp_nthreads_get(emntDefault);
2680 #pragma omp parallel for num_threads(nth) schedule(static)
2681 for (t = 0; t < nth; t++)
2685 size_t natoms = x.size();
2686 size_t offset = (natoms*t )/nth;
2687 size_t len = (natoms*(t + 1))/nth - offset;
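            /* e.g. natoms = 10 and nth = 4 give offsets 0,2,5,7 and lengths 2,3,2,3:
             * contiguous ranges that cover all atoms and differ in size by at most one.
             */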
2688 put_atoms_in_box(ePBC, box, x.subArray(offset, len));
2690 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
2694 // TODO This can be cleaned up a lot, and move back to runner.cpp
2695 void finish_run(FILE *fplog, const gmx::MDLogger &mdlog, const t_commrec *cr,
2696 const t_inputrec *inputrec,
2697 t_nrnb nrnb[], gmx_wallcycle_t wcycle,
2698 gmx_walltime_accounting_t walltime_accounting,
2699 nonbonded_verlet_t *nbv,
2700 gmx_pme_t *pme,
2701 gmx_bool bWriteStat)
2703 t_nrnb *nrnb_tot = nullptr;
2704 double delta_t = 0;
2705 double nbfs = 0, mflop = 0;
2706 double elapsed_time,
2707 elapsed_time_over_all_ranks,
2708 elapsed_time_over_all_threads,
2709 elapsed_time_over_all_threads_over_all_ranks;
2710 /* Control whether it is valid to print a report. Only the
2711 simulation master may print, but it should not do so if the run
2712 terminated e.g. before a scheduled reset step. This is
2713 complicated by the fact that PME ranks are unaware of the
2714 reason why they were sent a pmerecvqxFINISH. To avoid
2715 communication deadlocks, we always do the communication for the
2716 report, even if we've decided not to write the report, because
2717 how long it takes to finish the run is not important when we've
2718 decided not to report on the simulation performance.
2720 Further, we only report performance for dynamical integrators,
2721 because those are the only ones for which we plan to
2722 consider doing any optimizations. */
2723 bool printReport = EI_DYNAMICS(inputrec->eI) && SIMMASTER(cr);
2725 if (printReport && !walltime_accounting_get_valid_finish(walltime_accounting))
2727 GMX_LOG(mdlog.warning).asParagraph().appendText("Simulation ended prematurely, no performance report will be written.");
2728 printReport = false;
2731 if (cr->nnodes > 1)
2733 snew(nrnb_tot, 1);
2734 #if GMX_MPI
2735 MPI_Allreduce(nrnb->n, nrnb_tot->n, eNRNB, MPI_DOUBLE, MPI_SUM,
2736 cr->mpi_comm_mysim);
2737 #endif
2739 else
2741 nrnb_tot = nrnb;
2744 elapsed_time = walltime_accounting_get_elapsed_time(walltime_accounting);
2745 elapsed_time_over_all_ranks = elapsed_time;
2746 elapsed_time_over_all_threads = walltime_accounting_get_elapsed_time_over_all_threads(walltime_accounting);
2747 elapsed_time_over_all_threads_over_all_ranks = elapsed_time_over_all_threads;
2748 #if GMX_MPI
2749 if (cr->nnodes > 1)
2751 /* reduce elapsed_time over all MPI ranks in the current simulation */
2752 MPI_Allreduce(&elapsed_time,
2753 &elapsed_time_over_all_ranks,
2754 1, MPI_DOUBLE, MPI_SUM,
2755 cr->mpi_comm_mysim);
2756 elapsed_time_over_all_ranks /= cr->nnodes;
2757 /* Reduce elapsed_time_over_all_threads over all MPI ranks in the
2758 * current simulation. */
2759 MPI_Allreduce(&elapsed_time_over_all_threads,
2760 &elapsed_time_over_all_threads_over_all_ranks,
2761 1, MPI_DOUBLE, MPI_SUM,
2762 cr->mpi_comm_mysim);
2764 #endif
2766 if (printReport)
2768 print_flop(fplog, nrnb_tot, &nbfs, &mflop);
2770 if (cr->nnodes > 1)
2772 sfree(nrnb_tot);
2775 if (thisRankHasDuty(cr, DUTY_PP) && DOMAINDECOMP(cr))
2777 print_dd_statistics(cr, inputrec, fplog);
2780 /* TODO Move the responsibility for any scaling by thread counts
2781 * to the code that handled the thread region, so that there's a
2782 * mechanism to keep cycle counting working during the transition
2783 * to task parallelism. */
2784 int nthreads_pp = gmx_omp_nthreads_get(emntNonbonded);
2785 int nthreads_pme = gmx_omp_nthreads_get(emntPME);
2786 wallcycle_scale_by_num_threads(wcycle, thisRankHasDuty(cr, DUTY_PME) && !thisRankHasDuty(cr, DUTY_PP), nthreads_pp, nthreads_pme);
2787 auto cycle_sum(wallcycle_sum(cr, wcycle));
2789 if (printReport)
2791 auto nbnxn_gpu_timings = use_GPU(nbv) ? nbnxn_gpu_get_timings(nbv->gpu_nbv) : nullptr;
2792 gmx_wallclock_gpu_pme_t pme_gpu_timings = {};
2793 if (pme_gpu_task_enabled(pme))
2795 pme_gpu_get_timings(pme, &pme_gpu_timings);
2797 wallcycle_print(fplog, mdlog, cr->nnodes, cr->npmenodes, nthreads_pp, nthreads_pme,
2798 elapsed_time_over_all_ranks,
2799 wcycle, cycle_sum,
2800 nbnxn_gpu_timings,
2801 &pme_gpu_timings);
2803 if (EI_DYNAMICS(inputrec->eI))
2805 delta_t = inputrec->delta_t;
2808 if (fplog)
2810 print_perf(fplog, elapsed_time_over_all_threads_over_all_ranks,
2811 elapsed_time_over_all_ranks,
2812 walltime_accounting_get_nsteps_done(walltime_accounting),
2813 delta_t, nbfs, mflop);
2815 if (bWriteStat)
2817 print_perf(stderr, elapsed_time_over_all_threads_over_all_ranks,
2818 elapsed_time_over_all_ranks,
2819 walltime_accounting_get_nsteps_done(walltime_accounting),
2820 delta_t, nbfs, mflop);
2825 extern void initialize_lambdas(FILE *fplog, t_inputrec *ir, int *fep_state, gmx::ArrayRef<real> lambda, double *lam0)
2827 /* this function works, but could probably use a logic rewrite to keep all the different
2828 types of efep straight. */
2830 if ((ir->efep == efepNO) && (ir->bSimTemp == FALSE))
2832 return;
2835 t_lambda *fep = ir->fepvals;
2836     *fep_state = fep->init_fep_state; /* this might overwrite the value read from the
2837                                          checkpoint if a checkpoint is used -- a kludge is
2838                                          in place for now to prevent this. */
2840 for (int i = 0; i < efptNR; i++)
2842 /* overwrite lambda state with init_lambda for now for backwards compatibility */
2843         if (fep->init_lambda >= 0) /* if it's -1, it was never initialized */
2845 lambda[i] = fep->init_lambda;
2846 if (lam0)
2848 lam0[i] = lambda[i];
2851 else
2853 lambda[i] = fep->all_lambda[i][*fep_state];
2854 if (lam0)
2856 lam0[i] = lambda[i];
2860 if (ir->bSimTemp)
2862 /* need to rescale control temperatures to match current state */
2863 for (int i = 0; i < ir->opts.ngtc; i++)
2865 if (ir->opts.ref_t[i] > 0)
2867 ir->opts.ref_t[i] = ir->simtempvals->temperatures[*fep_state];
2872 /* Send to the log the information on the current lambdas */
2873 if (fplog != nullptr)
2875 fprintf(fplog, "Initial vector of lambda components:[ ");
2876 for (const auto &l : lambda)
2878 fprintf(fplog, "%10.4f ", l);
2880 fprintf(fplog, "]\n");
2882 return;
2886 void init_md(FILE *fplog,
2887 const t_commrec *cr, gmx::IMDOutputProvider *outputProvider,
2888 t_inputrec *ir, const gmx_output_env_t *oenv,
2889 const MdrunOptions &mdrunOptions,
2890 double *t, double *t0,
2891 t_state *globalState, double *lam0,
2892 t_nrnb *nrnb, gmx_mtop_t *mtop,
2893 gmx_update_t **upd,
2894 int nfile, const t_filenm fnm[],
2895 gmx_mdoutf_t *outf, t_mdebin **mdebin,
2896 tensor force_vir, tensor shake_vir, rvec mu_tot,
2897 gmx_bool *bSimAnn, t_vcm **vcm,
2898 gmx_wallcycle_t wcycle)
2900 int i;
2902 /* Initial values */
2903 *t = *t0 = ir->init_t;
2905 *bSimAnn = FALSE;
2906 for (i = 0; i < ir->opts.ngtc; i++)
2908 /* set bSimAnn if any group is being annealed */
2909 if (ir->opts.annealing[i] != eannNO)
2911 *bSimAnn = TRUE;
2915 /* Initialize lambda variables */
2916 /* TODO: Clean up initialization of fep_state and lambda in t_state.
2917 * We currently need to call initialize_lambdas on non-master ranks
2918 * to initialize lam0.
2920 if (MASTER(cr))
2922 initialize_lambdas(fplog, ir, &globalState->fep_state, globalState->lambda, lam0);
2924 else
2926 int tmpFepState;
2927 std::array<real, efptNR> tmpLambda;
2928 initialize_lambdas(fplog, ir, &tmpFepState, tmpLambda, lam0);
2931 // TODO upd is never NULL in practice, but the analysers don't know that
2932 if (upd)
2934 *upd = init_update(ir);
2936 if (*bSimAnn)
2938 update_annealing_target_temp(ir, ir->init_t, upd ? *upd : nullptr);
2941 if (vcm != nullptr)
2943 *vcm = init_vcm(fplog, &mtop->groups, ir);
2946 if (EI_DYNAMICS(ir->eI) && !mdrunOptions.continuationOptions.appendFiles)
2948 if (ir->etc == etcBERENDSEN)
2950 please_cite(fplog, "Berendsen84a");
2952 if (ir->etc == etcVRESCALE)
2954 please_cite(fplog, "Bussi2007a");
2956 if (ir->eI == eiSD1)
2958 please_cite(fplog, "Goga2012");
2961 init_nrnb(nrnb);
2963 if (nfile != -1)
2965 *outf = init_mdoutf(fplog, nfile, fnm, mdrunOptions, cr, outputProvider, ir, mtop, oenv, wcycle);
2967 *mdebin = init_mdebin(mdrunOptions.continuationOptions.appendFiles ? nullptr : mdoutf_get_fp_ene(*outf),
2968 mtop, ir, mdoutf_get_fp_dhdl(*outf));
2971 /* Initiate variables */
2972 clear_mat(force_vir);
2973 clear_mat(shake_vir);
2974 clear_rvec(mu_tot);