Merge branch release-2016 into release-2018
[gromacs.git] / src / gromacs / mdlib / sim_util.cpp
1 /*
2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5 * Copyright (c) 2001-2004, The GROMACS development team.
6 * Copyright (c) 2013,2014,2015,2016,2017, by the GROMACS development team, led by
7 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
8 * and including many others, as listed in the AUTHORS file in the
9 * top-level source directory and at http://www.gromacs.org.
11 * GROMACS is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public License
13 * as published by the Free Software Foundation; either version 2.1
14 * of the License, or (at your option) any later version.
16 * GROMACS is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * Lesser General Public License for more details.
21 * You should have received a copy of the GNU Lesser General Public
22 * License along with GROMACS; if not, see
23 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
24 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
26 * If you want to redistribute modifications to GROMACS, please
27 * consider that scientific software is very special. Version
28 * control is crucial - bugs must be traceable. We will be happy to
29 * consider code for inclusion in the official distribution, but
30 * derived work must not be called official GROMACS. Details are found
31 * in the README & COPYING files - if they are missing, get the
32 * official version at http://www.gromacs.org.
34 * To help us fund GROMACS development, we humbly ask that you cite
35 * the research papers on the package. Check out http://www.gromacs.org.
37 #include "gmxpre.h"
39 #include "sim_util.h"
41 #include "config.h"
43 #include <stdio.h>
44 #include <string.h>
46 #include <cmath>
47 #include <cstdint>
49 #include <array>
51 #include "gromacs/awh/awh.h"
52 #include "gromacs/domdec/dlbtiming.h"
53 #include "gromacs/domdec/domdec.h"
54 #include "gromacs/domdec/domdec_struct.h"
55 #include "gromacs/essentialdynamics/edsam.h"
56 #include "gromacs/ewald/pme.h"
57 #include "gromacs/gmxlib/chargegroup.h"
58 #include "gromacs/gmxlib/network.h"
59 #include "gromacs/gmxlib/nrnb.h"
60 #include "gromacs/gmxlib/nonbonded/nb_free_energy.h"
61 #include "gromacs/gmxlib/nonbonded/nb_kernel.h"
62 #include "gromacs/gmxlib/nonbonded/nonbonded.h"
63 #include "gromacs/gpu_utils/gpu_utils.h"
64 #include "gromacs/imd/imd.h"
65 #include "gromacs/listed-forces/bonded.h"
66 #include "gromacs/listed-forces/disre.h"
67 #include "gromacs/listed-forces/orires.h"
68 #include "gromacs/math/functions.h"
69 #include "gromacs/math/units.h"
70 #include "gromacs/math/vec.h"
71 #include "gromacs/math/vecdump.h"
72 #include "gromacs/mdlib/calcmu.h"
73 #include "gromacs/mdlib/constr.h"
74 #include "gromacs/mdlib/force.h"
75 #include "gromacs/mdlib/forcerec.h"
76 #include "gromacs/mdlib/genborn.h"
77 #include "gromacs/mdlib/gmx_omp_nthreads.h"
78 #include "gromacs/mdlib/mdrun.h"
79 #include "gromacs/mdlib/nb_verlet.h"
80 #include "gromacs/mdlib/nbnxn_atomdata.h"
81 #include "gromacs/mdlib/nbnxn_gpu_data_mgmt.h"
82 #include "gromacs/mdlib/nbnxn_grid.h"
83 #include "gromacs/mdlib/nbnxn_search.h"
84 #include "gromacs/mdlib/qmmm.h"
85 #include "gromacs/mdlib/update.h"
86 #include "gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_gpu_ref.h"
87 #include "gromacs/mdtypes/commrec.h"
88 #include "gromacs/mdtypes/forceoutput.h"
89 #include "gromacs/mdtypes/iforceprovider.h"
90 #include "gromacs/mdtypes/inputrec.h"
91 #include "gromacs/mdtypes/md_enums.h"
92 #include "gromacs/mdtypes/state.h"
93 #include "gromacs/pbcutil/ishift.h"
94 #include "gromacs/pbcutil/mshift.h"
95 #include "gromacs/pbcutil/pbc.h"
96 #include "gromacs/pulling/pull.h"
97 #include "gromacs/pulling/pull_rotation.h"
98 #include "gromacs/timing/cyclecounter.h"
99 #include "gromacs/timing/gpu_timing.h"
100 #include "gromacs/timing/wallcycle.h"
101 #include "gromacs/timing/wallcyclereporting.h"
102 #include "gromacs/timing/walltime_accounting.h"
103 #include "gromacs/utility/arrayref.h"
104 #include "gromacs/utility/basedefinitions.h"
105 #include "gromacs/utility/cstringutil.h"
106 #include "gromacs/utility/exceptions.h"
107 #include "gromacs/utility/fatalerror.h"
108 #include "gromacs/utility/gmxassert.h"
109 #include "gromacs/utility/gmxmpi.h"
110 #include "gromacs/utility/logger.h"
111 #include "gromacs/utility/pleasecite.h"
112 #include "gromacs/utility/smalloc.h"
113 #include "gromacs/utility/sysinfo.h"
115 #include "nbnxn_gpu.h"
116 #include "nbnxn_kernels/nbnxn_kernel_cpu.h"
117 #include "nbnxn_kernels/nbnxn_kernel_prune.h"
119 // TODO: this environment variable allows us to verify before release
120 // that on less common architectures the total cost of polling is not larger than
121 // a blocking wait (so polling does not introduce overhead when the static
122 // PME-first ordering would suffice).
123 static const bool c_disableAlternatingWait = (getenv("GMX_DISABLE_ALTERNATING_GPU_WAIT") != nullptr);
126 void print_time(FILE *out,
127 gmx_walltime_accounting_t walltime_accounting,
128 gmx_int64_t step,
129 t_inputrec *ir,
130 t_commrec gmx_unused *cr)
132 time_t finish;
133 char timebuf[STRLEN];
134 double dt, elapsed_seconds, time_per_step;
135 char buf[48];
137 #if !GMX_THREAD_MPI
138 if (!PAR(cr))
139 #endif
141 fprintf(out, "\r");
143 fprintf(out, "step %s", gmx_step_str(step, buf));
144 fflush(out);
146 if ((step >= ir->nstlist))
148 double seconds_since_epoch = gmx_gettime();
149 elapsed_seconds = seconds_since_epoch - walltime_accounting_get_start_time_stamp(walltime_accounting);
150 time_per_step = elapsed_seconds/(step - ir->init_step + 1);
151 dt = (ir->nsteps + ir->init_step - step) * time_per_step;
153 if (ir->nsteps >= 0)
155 if (dt >= 300)
157 finish = (time_t) (seconds_since_epoch + dt);
158 gmx_ctime_r(&finish, timebuf, STRLEN);
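/* gmx_ctime_r produces a ctime-style string ending in a newline; copy it
 * and chop that trailing newline off before printing. */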
159 sprintf(buf, "%s", timebuf);
160 buf[strlen(buf)-1] = '\0';
161 fprintf(out, ", will finish %s", buf);
163 else
165 fprintf(out, ", remaining wall clock time: %5d s ", (int)dt);
168 else
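/* delta_t is in ps, so delta_t/1000 is ns per step; dividing by the wall
 * time per step and scaling by 86400 (seconds per day) yields ns/day. */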
170 fprintf(out, " performance: %.1f ns/day ",
171 ir->delta_t/1000*24*60*60/time_per_step);
174 #if !GMX_THREAD_MPI
175 if (PAR(cr))
177 fprintf(out, "\n");
179 #endif
181 fflush(out);
184 void print_date_and_time(FILE *fplog, int nodeid, const char *title,
185 double the_time)
187 char time_string[STRLEN];
189 if (!fplog)
191 return;
195 int i;
196 char timebuf[STRLEN];
197 time_t temp_time = (time_t) the_time;
199 gmx_ctime_r(&temp_time, timebuf, STRLEN);
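/* Copy characters up to the first control character, which drops the
 * trailing newline of the ctime-style string. */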
200 for (i = 0; timebuf[i] >= ' '; i++)
202 time_string[i] = timebuf[i];
204 time_string[i] = '\0';
207 fprintf(fplog, "%s on rank %d %s\n", title, nodeid, time_string);
210 void print_start(FILE *fplog, t_commrec *cr,
211 gmx_walltime_accounting_t walltime_accounting,
212 const char *name)
214 char buf[STRLEN];
216 sprintf(buf, "Started %s", name);
217 print_date_and_time(fplog, cr->nodeid, buf,
218 walltime_accounting_get_start_time_stamp(walltime_accounting));
221 static void sum_forces(rvec f[], gmx::ArrayRef<const gmx::RVec> forceToAdd)
223 const int end = forceToAdd.size();
225 // cppcheck-suppress unreadVariable
226 int gmx_unused nt = gmx_omp_nthreads_get(emntDefault);
227 #pragma omp parallel for num_threads(nt) schedule(static)
228 for (int i = 0; i < end; i++)
230 rvec_inc(f[i], forceToAdd[i]);
234 static void pme_gpu_reduce_outputs(gmx_wallcycle_t wcycle,
235 ForceWithVirial *forceWithVirial,
236 ArrayRef<const gmx::RVec> pmeForces,
237 gmx_enerdata_t *enerd,
238 const tensor vir_Q,
239 real Vlr_q)
241 wallcycle_start(wcycle, ewcPME_GPU_F_REDUCTION);
242 GMX_ASSERT(forceWithVirial, "Invalid force pointer");
243 forceWithVirial->addVirialContribution(vir_Q);
244 enerd->term[F_COUL_RECIP] += Vlr_q;
245 sum_forces(as_rvec_array(forceWithVirial->force_.data()), pmeForces);
246 wallcycle_stop(wcycle, ewcPME_GPU_F_REDUCTION);
249 static void calc_virial(int start, int homenr, rvec x[], rvec f[],
250 tensor vir_part, t_graph *graph, matrix box,
251 t_nrnb *nrnb, const t_forcerec *fr, int ePBC)
253 int i;
255 /* The short-range virial from surrounding boxes */
256 calc_vir(SHIFTS, fr->shift_vec, fr->fshift, vir_part, ePBC == epbcSCREW, box);
257 inc_nrnb(nrnb, eNR_VIRIAL, SHIFTS);
259 /* Calculate partial virial, for local atoms only, based on short range.
260 * Total virial is computed in global_stat, called from do_md
262 f_calc_vir(start, start+homenr, x, f, vir_part, graph, box);
263 inc_nrnb(nrnb, eNR_VIRIAL, homenr);
265 /* Add wall contribution */
266 for (i = 0; i < DIM; i++)
268 vir_part[i][ZZ] += fr->vir_wall_z[i];
271 if (debug)
273 pr_rvecs(debug, 0, "vir_part", vir_part, DIM);
277 static void pull_potential_wrapper(t_commrec *cr,
278 t_inputrec *ir,
279 matrix box, rvec x[],
280 ForceWithVirial *force,
281 t_mdatoms *mdatoms,
282 gmx_enerdata_t *enerd,
283 real *lambda,
284 double t,
285 gmx_wallcycle_t wcycle)
287 t_pbc pbc;
288 real dvdl;
290 /* Calculate the center of mass forces, this requires communication,
291 * which is why pull_potential is called close to other communication.
293 wallcycle_start(wcycle, ewcPULLPOT);
294 set_pbc(&pbc, ir->ePBC, box);
295 dvdl = 0;
296 enerd->term[F_COM_PULL] +=
297 pull_potential(ir->pull_work, mdatoms, &pbc,
298 cr, t, lambda[efptRESTRAINT], x, force, &dvdl);
299 enerd->dvdl_lin[efptRESTRAINT] += dvdl;
300 wallcycle_stop(wcycle, ewcPULLPOT);
303 static void pme_receive_force_ener(t_commrec *cr,
304 ForceWithVirial *forceWithVirial,
305 gmx_enerdata_t *enerd,
306 gmx_wallcycle_t wcycle)
308 real e_q, e_lj, dvdl_q, dvdl_lj;
309 float cycles_ppdpme, cycles_seppme;
311 cycles_ppdpme = wallcycle_stop(wcycle, ewcPPDURINGPME);
312 dd_cycles_add(cr->dd, cycles_ppdpme, ddCyclPPduringPME);
314 /* In case of node-splitting, the PP nodes receive the long-range
315 * forces, virial and energy from the PME nodes here.
317 wallcycle_start(wcycle, ewcPP_PMEWAITRECVF);
318 dvdl_q = 0;
319 dvdl_lj = 0;
320 gmx_pme_receive_f(cr, forceWithVirial, &e_q, &e_lj, &dvdl_q, &dvdl_lj,
321 &cycles_seppme);
322 enerd->term[F_COUL_RECIP] += e_q;
323 enerd->term[F_LJ_RECIP] += e_lj;
324 enerd->dvdl_lin[efptCOUL] += dvdl_q;
325 enerd->dvdl_lin[efptVDW] += dvdl_lj;
327 if (wcycle)
329 dd_cycles_add(cr->dd, cycles_seppme, ddCyclPME);
331 wallcycle_stop(wcycle, ewcPP_PMEWAITRECVF);
334 static void print_large_forces(FILE *fp, t_mdatoms *md, t_commrec *cr,
335 gmx_int64_t step, real forceTolerance,
336 const rvec *x, const rvec *f)
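/* Compare squared norms so that a square root is only taken for atoms
 * that are actually reported. */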
338 real force2Tolerance = gmx::square(forceTolerance);
339 std::uintmax_t numNonFinite = 0;
340 for (int i = 0; i < md->homenr; i++)
342 real force2 = norm2(f[i]);
343 bool nonFinite = !std::isfinite(force2);
344 if (force2 >= force2Tolerance || nonFinite)
346 fprintf(fp, "step %" GMX_PRId64 " atom %6d x %8.3f %8.3f %8.3f force %12.5e\n",
347 step,
348 ddglatnr(cr->dd, i), x[i][XX], x[i][YY], x[i][ZZ], std::sqrt(force2));
350 if (nonFinite)
352 numNonFinite++;
355 if (numNonFinite > 0)
357 /* Note that with MPI this fatal call on one rank might interrupt
358 * the printing on other ranks. But we can only avoid that with
359 * an expensive MPI barrier that we would need at each step.
361 gmx_fatal(FARGS, "At step %" GMX_PRId64 " detected non-finite forces on %ju atoms", step, numNonFinite);
365 static void post_process_forces(t_commrec *cr,
366 gmx_int64_t step,
367 t_nrnb *nrnb, gmx_wallcycle_t wcycle,
368 gmx_localtop_t *top,
369 matrix box, rvec x[],
370 rvec f[],
371 ForceWithVirial *forceWithVirial,
372 tensor vir_force,
373 t_mdatoms *mdatoms,
374 t_graph *graph,
375 t_forcerec *fr, gmx_vsite_t *vsite,
376 int flags)
378 if (fr->haveDirectVirialContributions)
380 rvec *fDirectVir = as_rvec_array(forceWithVirial->force_.data());
382 if (vsite)
384 /* Spread the mesh force on virtual sites to the other particles...
385 * This is parallelized. MPI communication is performed
386 * if the constructing atoms aren't local.
388 wallcycle_start(wcycle, ewcVSITESPREAD);
389 matrix virial = { { 0 } };
390 spread_vsite_f(vsite, x, fDirectVir, nullptr,
391 (flags & GMX_FORCE_VIRIAL), virial,
392 nrnb,
393 &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr);
394 forceWithVirial->addVirialContribution(virial);
395 wallcycle_stop(wcycle, ewcVSITESPREAD);
398 if (flags & GMX_FORCE_VIRIAL)
400 /* Now add the forces, this is local */
401 sum_forces(f, forceWithVirial->force_);
403 /* Add the direct virial contributions */
404 GMX_ASSERT(forceWithVirial->computeVirial_, "forceWithVirial should request virial computation when we request the virial");
405 m_add(vir_force, forceWithVirial->getVirial(), vir_force);
407 if (debug)
409 pr_rvecs(debug, 0, "vir_force", vir_force, DIM);
414 if (fr->print_force >= 0)
416 print_large_forces(stderr, mdatoms, cr, step, fr->print_force, x, f);
420 static void do_nb_verlet(t_forcerec *fr,
421 interaction_const_t *ic,
422 gmx_enerdata_t *enerd,
423 int flags, int ilocality,
424 int clearF,
425 gmx_int64_t step,
426 t_nrnb *nrnb,
427 gmx_wallcycle_t wcycle)
429 if (!(flags & GMX_FORCE_NONBONDED))
431 /* skip non-bonded calculation */
432 return;
435 nonbonded_verlet_t *nbv = fr->nbv;
436 nonbonded_verlet_group_t *nbvg = &nbv->grp[ilocality];
438 /* GPU kernel launch overhead is already timed separately */
439 if (fr->cutoff_scheme != ecutsVERLET)
441 gmx_incons("Invalid cut-off scheme passed!");
444 bool bUsingGpuKernels = (nbvg->kernel_type == nbnxnk8x8x8_GPU);
446 if (!bUsingGpuKernels)
448 /* When dynamic pair-list pruning is requested, we need to prune
449 * at nstlistPrune steps.
451 if (nbv->listParams->useDynamicPruning &&
452 (step - nbvg->nbl_lists.outerListCreationStep) % nbv->listParams->nstlistPrune == 0)
454 /* Prune the pair-list beyond fr->ic->rlistPrune using
455 * the current coordinates of the atoms.
457 wallcycle_sub_start(wcycle, ewcsNONBONDED_PRUNING);
458 nbnxn_kernel_cpu_prune(nbvg, nbv->nbat, fr->shift_vec, nbv->listParams->rlistInner);
459 wallcycle_sub_stop(wcycle, ewcsNONBONDED_PRUNING);
462 wallcycle_sub_start(wcycle, ewcsNONBONDED);
465 switch (nbvg->kernel_type)
467 case nbnxnk4x4_PlainC:
468 case nbnxnk4xN_SIMD_4xN:
469 case nbnxnk4xN_SIMD_2xNN:
470 nbnxn_kernel_cpu(nbvg,
471 nbv->nbat,
473 fr->shift_vec,
474 flags,
475 clearF,
476 fr->fshift[0],
477 enerd->grpp.ener[egCOULSR],
478 fr->bBHAM ?
479 enerd->grpp.ener[egBHAMSR] :
480 enerd->grpp.ener[egLJSR]);
481 break;
483 case nbnxnk8x8x8_GPU:
484 nbnxn_gpu_launch_kernel(nbv->gpu_nbv, nbv->nbat, flags, ilocality);
485 break;
487 case nbnxnk8x8x8_PlainC:
488 nbnxn_kernel_gpu_ref(nbvg->nbl_lists.nbl[0],
489 nbv->nbat, ic,
490 fr->shift_vec,
491 flags,
492 clearF,
493 nbv->nbat->out[0].f,
494 fr->fshift[0],
495 enerd->grpp.ener[egCOULSR],
496 fr->bBHAM ?
497 enerd->grpp.ener[egBHAMSR] :
498 enerd->grpp.ener[egLJSR]);
499 break;
501 default:
502 GMX_RELEASE_ASSERT(false, "Invalid nonbonded kernel type passed!");
505 if (!bUsingGpuKernels)
507 wallcycle_sub_stop(wcycle, ewcsNONBONDED);
510 int enr_nbnxn_kernel_ljc, enr_nbnxn_kernel_lj;
511 if (EEL_RF(ic->eeltype) || ic->eeltype == eelCUT)
513 enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_RF;
515 else if ((!bUsingGpuKernels && nbvg->ewald_excl == ewaldexclAnalytical) ||
516 (bUsingGpuKernels && nbnxn_gpu_is_kernel_ewald_analytical(nbv->gpu_nbv)))
518 enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_EWALD;
520 else
522 enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_TAB;
524 enr_nbnxn_kernel_lj = eNR_NBNXN_LJ;
525 if (flags & GMX_FORCE_ENERGY)
527 /* In eNR_??? the nbnxn F+E kernels are always the F kernel + 1 */
528 enr_nbnxn_kernel_ljc += 1;
529 enr_nbnxn_kernel_lj += 1;
532 inc_nrnb(nrnb, enr_nbnxn_kernel_ljc,
533 nbvg->nbl_lists.natpair_ljq);
534 inc_nrnb(nrnb, enr_nbnxn_kernel_lj,
535 nbvg->nbl_lists.natpair_lj);
536 /* The Coulomb-only kernels are offset -eNR_NBNXN_LJ_RF+eNR_NBNXN_RF */
537 inc_nrnb(nrnb, enr_nbnxn_kernel_ljc-eNR_NBNXN_LJ_RF+eNR_NBNXN_RF,
538 nbvg->nbl_lists.natpair_q);
540 if (ic->vdw_modifier == eintmodFORCESWITCH)
542 /* We add up the switch cost separately */
543 inc_nrnb(nrnb, eNR_NBNXN_ADD_LJ_FSW+((flags & GMX_FORCE_ENERGY) ? 1 : 0),
544 nbvg->nbl_lists.natpair_ljq + nbvg->nbl_lists.natpair_lj);
546 if (ic->vdw_modifier == eintmodPOTSWITCH)
548 /* We add up the switch cost separately */
549 inc_nrnb(nrnb, eNR_NBNXN_ADD_LJ_PSW+((flags & GMX_FORCE_ENERGY) ? 1 : 0),
550 nbvg->nbl_lists.natpair_ljq + nbvg->nbl_lists.natpair_lj);
552 if (ic->vdwtype == evdwPME)
554 /* We add up the LJ Ewald cost separately */
555 inc_nrnb(nrnb, eNR_NBNXN_ADD_LJ_EWALD+((flags & GMX_FORCE_ENERGY) ? 1 : 0),
556 nbvg->nbl_lists.natpair_ljq + nbvg->nbl_lists.natpair_lj);
560 static void do_nb_verlet_fep(nbnxn_pairlist_set_t *nbl_lists,
561 t_forcerec *fr,
562 rvec x[],
563 rvec f[],
564 t_mdatoms *mdatoms,
565 t_lambda *fepvals,
566 real *lambda,
567 gmx_enerdata_t *enerd,
568 int flags,
569 t_nrnb *nrnb,
570 gmx_wallcycle_t wcycle)
572 int donb_flags;
573 nb_kernel_data_t kernel_data;
574 real lam_i[efptNR];
575 real dvdl_nb[efptNR];
576 int th;
577 int i, j;
579 donb_flags = 0;
580 /* Add short-range interactions */
581 donb_flags |= GMX_NONBONDED_DO_SR;
583 /* Currently all group scheme kernels always calculate (shift-)forces */
584 if (flags & GMX_FORCE_FORCES)
586 donb_flags |= GMX_NONBONDED_DO_FORCE;
588 if (flags & GMX_FORCE_VIRIAL)
590 donb_flags |= GMX_NONBONDED_DO_SHIFTFORCE;
592 if (flags & GMX_FORCE_ENERGY)
594 donb_flags |= GMX_NONBONDED_DO_POTENTIAL;
597 kernel_data.flags = donb_flags;
598 kernel_data.lambda = lambda;
599 kernel_data.dvdl = dvdl_nb;
601 kernel_data.energygrp_elec = enerd->grpp.ener[egCOULSR];
602 kernel_data.energygrp_vdw = enerd->grpp.ener[egLJSR];
604 /* reset free energy components */
605 for (i = 0; i < efptNR; i++)
607 dvdl_nb[i] = 0;
610 GMX_ASSERT(gmx_omp_nthreads_get(emntNonbonded) == nbl_lists->nnbl, "Number of lists should be same as number of NB threads");
612 wallcycle_sub_start(wcycle, ewcsNONBONDED);
613 #pragma omp parallel for schedule(static) num_threads(nbl_lists->nnbl)
614 for (th = 0; th < nbl_lists->nnbl; th++)
618 gmx_nb_free_energy_kernel(nbl_lists->nbl_fep[th],
619 x, f, fr, mdatoms, &kernel_data, nrnb);
621 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
624 if (fepvals->sc_alpha != 0)
626 enerd->dvdl_nonlin[efptVDW] += dvdl_nb[efptVDW];
627 enerd->dvdl_nonlin[efptCOUL] += dvdl_nb[efptCOUL];
629 else
631 enerd->dvdl_lin[efptVDW] += dvdl_nb[efptVDW];
632 enerd->dvdl_lin[efptCOUL] += dvdl_nb[efptCOUL];
635 /* If we do foreign lambda and we have soft-core interactions
636 * we have to recalculate the (non-linear) energy contributions.
638 if (fepvals->n_lambda > 0 && (flags & GMX_FORCE_DHDL) && fepvals->sc_alpha != 0)
640 kernel_data.flags = (donb_flags & ~(GMX_NONBONDED_DO_FORCE | GMX_NONBONDED_DO_SHIFTFORCE)) | GMX_NONBONDED_DO_FOREIGNLAMBDA;
641 kernel_data.lambda = lam_i;
642 kernel_data.energygrp_elec = enerd->foreign_grpp.ener[egCOULSR];
643 kernel_data.energygrp_vdw = enerd->foreign_grpp.ener[egLJSR];
644 /* Note that we add to kernel_data.dvdl, but ignore the result */
646 for (i = 0; i < enerd->n_lambda; i++)
648 for (j = 0; j < efptNR; j++)
650 lam_i[j] = (i == 0 ? lambda[j] : fepvals->all_lambda[j][i-1]);
652 reset_foreign_enerdata(enerd);
653 #pragma omp parallel for schedule(static) num_threads(nbl_lists->nnbl)
654 for (th = 0; th < nbl_lists->nnbl; th++)
658 gmx_nb_free_energy_kernel(nbl_lists->nbl_fep[th],
659 x, f, fr, mdatoms, &kernel_data, nrnb);
661 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
664 sum_epot(&(enerd->foreign_grpp), enerd->foreign_term);
665 enerd->enerpart_lambda[i] += enerd->foreign_term[F_EPOT];
669 wallcycle_sub_stop(wcycle, ewcsNONBONDED);
672 gmx_bool use_GPU(const nonbonded_verlet_t *nbv)
674 return nbv != nullptr && nbv->bUseGPU;
677 static gmx_inline void clear_rvecs_omp(int n, rvec v[])
679 int nth = gmx_omp_nthreads_get_simple_rvec_task(emntDefault, n);
681 /* Note that we would like to avoid this conditional by putting it
682 * into the omp pragma instead, but then we still take the full
683 * omp parallel for overhead (at least with gcc5).
685 if (nth == 1)
687 for (int i = 0; i < n; i++)
689 clear_rvec(v[i]);
692 else
694 #pragma omp parallel for num_threads(nth) schedule(static)
695 for (int i = 0; i < n; i++)
697 clear_rvec(v[i]);
702 /*! \brief Return an estimate of the average kinetic energy or 0 when unreliable
704 * \param groupOptions Group options, containing T-coupling options
706 static real averageKineticEnergyEstimate(const t_grpopts &groupOptions)
708 real nrdfCoupled = 0;
709 real nrdfUncoupled = 0;
710 real kineticEnergy = 0;
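/* By equipartition, each temperature-coupled degree of freedom contributes
 * 0.5*kB*T_ref to the average kinetic energy. */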
711 for (int g = 0; g < groupOptions.ngtc; g++)
713 if (groupOptions.tau_t[g] >= 0)
715 nrdfCoupled += groupOptions.nrdf[g];
716 kineticEnergy += groupOptions.nrdf[g]*0.5*groupOptions.ref_t[g]*BOLTZ;
718 else
720 nrdfUncoupled += groupOptions.nrdf[g];
724 /* This conditional with > also catches nrdf=0 */
725 if (nrdfCoupled > nrdfUncoupled)
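/* Scale up to all degrees of freedom, assuming the uncoupled ones are at
 * roughly the same temperature as the coupled ones. */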
727 return kineticEnergy*(nrdfCoupled + nrdfUncoupled)/nrdfCoupled;
729 else
731 return 0;
735 /*! \brief This routine checks that the potential energy is finite.
737 * Always checks that the potential energy is finite. If step equals
738 * inputrec.init_step also checks that the magnitude of the potential energy
739 * is reasonable. Terminates with a fatal error when a check fails.
740 * Note that passing this check does not guarantee finite forces,
741 * since those use slightly different arithmetic. But in most cases
742 * there is just a narrow coordinate range where forces are not finite
743 * and energies are finite.
745 * \param[in] step The step number, used for checking and printing
746 * \param[in] enerd The energy data; the non-bonded group energies need to be added to enerd.term[F_EPOT] before calling this routine
747 * \param[in] inputrec The input record
749 static void checkPotentialEnergyValidity(gmx_int64_t step,
750 const gmx_enerdata_t &enerd,
751 const t_inputrec &inputrec)
753 /* Threshold valid for comparing absolute potential energy against
754 * the kinetic energy. Normally one should not consider absolute
755 * potential energy values, but with a factor of one million
756 * we should never get false positives.
758 constexpr real c_thresholdFactor = 1e6;
760 bool energyIsNotFinite = !std::isfinite(enerd.term[F_EPOT]);
761 real averageKineticEnergy = 0;
762 /* We only check for large potential energy at the initial step,
763 * because that is by far the most likely step for this to occur
764 * and because computing the average kinetic energy is not free.
765 * Note: nstcalcenergy >> 1 often does not allow catching large energies
766 * before they become NaN.
768 if (step == inputrec.init_step && EI_DYNAMICS(inputrec.eI))
770 averageKineticEnergy = averageKineticEnergyEstimate(inputrec.opts);
773 if (energyIsNotFinite || (averageKineticEnergy > 0 &&
774 enerd.term[F_EPOT] > c_thresholdFactor*averageKineticEnergy))
776 gmx_fatal(FARGS, "Step %" GMX_PRId64 ": The total potential energy is %g, which is %s. The LJ and electrostatic contributions to the energy are %g and %g, respectively. A %s potential energy can be caused by overlapping atoms in bonded interactions or very large%s coordinate values. Usually this is caused by a badly- or non-equilibrated initial configuration, incorrect interactions or parameters in the topology.",
777 step,
778 enerd.term[F_EPOT],
779 energyIsNotFinite ? "not finite" : "extremely high",
780 enerd.term[F_LJ],
781 enerd.term[F_COUL_SR],
782 energyIsNotFinite ? "non-finite" : "very high",
783 energyIsNotFinite ? " or NaN" : "");
787 /*! \brief Compute forces and/or energies for special algorithms
789 * The intention is to collect all calls to algorithms that compute
790 * forces on local atoms only and that do not contribute to the local
791 * virial sum (but add their virial contribution separately).
792 * Eventually these should likely all become ForceProviders.
793 * Within this function the intention is to have algorithms that do
794 * global communication at the end, so global barriers within the MD loop
795 * are as close together as possible.
797 * \param[in] fplog The log file
798 * \param[in] cr The communication record
799 * \param[in] inputrec The input record
800 * \param[in] step The current MD step
801 * \param[in] t The current time
802 * \param[in,out] wcycle Wallcycle accounting struct
803 * \param[in,out] forceProviders Pointer to a list of force providers
804 * \param[in] box The unit cell
805 * \param[in] x The coordinates
806 * \param[in] mdatoms Per atom properties
807 * \param[in] lambda Array of free-energy lambda values
808 * \param[in] forceFlags Flags that tell whether we should compute forces/energies/virial
809 * \param[in,out] forceWithVirial Force and virial buffers
810 * \param[in,out] enerd Energy buffer
811 * \param[in,out] ed Essential dynamics pointer
812 * \param[in] bNS Tells if we did neighbor searching this step, used for ED sampling
814 * \todo Remove bNS, which is used incorrectly.
815 * \todo Convert all other algorithms called here to ForceProviders.
817 static void
818 computeSpecialForces(FILE *fplog,
819 t_commrec *cr,
820 t_inputrec *inputrec,
821 gmx_int64_t step,
822 double t,
823 gmx_wallcycle_t wcycle,
824 ForceProviders *forceProviders,
825 matrix box,
826 rvec *x,
827 t_mdatoms *mdatoms,
828 real *lambda,
829 int forceFlags,
830 ForceWithVirial *forceWithVirial,
831 gmx_enerdata_t *enerd,
832 gmx_edsam_t ed,
833 gmx_bool bNS)
835 const bool computeForces = (forceFlags & GMX_FORCE_FORCES);
837 /* NOTE: Currently all ForceProviders only provide forces.
838 * When they also provide energies, remove this conditional.
840 if (computeForces)
842 /* Collect forces from modules */
843 forceProviders->calculateForces(cr, mdatoms, box, t, x, forceWithVirial);
846 if (inputrec->bPull && pull_have_potential(inputrec->pull_work))
848 pull_potential_wrapper(cr, inputrec, box, x,
849 forceWithVirial,
850 mdatoms, enerd, lambda, t,
851 wcycle);
853 if (inputrec->bDoAwh)
855 Awh &awh = *inputrec->awh;
856 enerd->term[F_COM_PULL] +=
857 awh.applyBiasForcesAndUpdateBias(inputrec->ePBC, *mdatoms, box,
858 forceWithVirial,
859 t, step, wcycle, fplog);
863 rvec *f = as_rvec_array(forceWithVirial->force_.data());
865 /* Add the forces from enforced rotation potentials (if any) */
866 if (inputrec->bRot)
868 wallcycle_start(wcycle, ewcROTadd);
869 enerd->term[F_COM_PULL] += add_rot_forces(inputrec->rot, f, cr, step, t);
870 wallcycle_stop(wcycle, ewcROTadd);
873 if (ed)
875 /* Note that since init_edsam() is called after the initialization
876 * of forcerec, edsam doesn't request the noVirSum force buffer.
877 * Thus if no other algorithm (e.g. PME) requires it, the forces
878 * here will contribute to the virial.
880 do_flood(cr, inputrec, x, f, ed, box, step, bNS);
883 /* Add forces from interactive molecular dynamics (IMD), if bIMD == TRUE. */
884 if (inputrec->bIMD && computeForces)
886 IMD_apply_forces(inputrec->bIMD, inputrec->imd, cr, f, wcycle);
890 /*! \brief Launch the prepare_step and spread stages of PME GPU.
892 * \param[in] pmedata The PME structure
893 * \param[in] box The box matrix
894 * \param[in] x Coordinate array
895 * \param[in] flags Force flags
896 * \param[in] wcycle The wallcycle structure
898 static inline void launchPmeGpuSpread(gmx_pme_t *pmedata,
899 matrix box,
900 rvec x[],
901 int flags,
902 gmx_wallcycle_t wcycle)
904 int pmeFlags = GMX_PME_SPREAD | GMX_PME_SOLVE;
905 pmeFlags |= (flags & GMX_FORCE_FORCES) ? GMX_PME_CALC_F : 0;
906 pmeFlags |= (flags & GMX_FORCE_VIRIAL) ? GMX_PME_CALC_ENER_VIR : 0;
908 pme_gpu_prepare_computation(pmedata, flags & GMX_FORCE_DYNAMICBOX, box, wcycle, pmeFlags);
909 pme_gpu_launch_spread(pmedata, x, wcycle);
912 /*! \brief Launch the FFT and gather stages of PME GPU
914 * This function only implements setting the output forces (no accumulation).
916 * \param[in] pmedata The PME structure
917 * \param[in] wcycle The wallcycle structure
919 static void launchPmeGpuFftAndGather(gmx_pme_t *pmedata,
920 gmx_wallcycle_t wcycle)
922 pme_gpu_launch_complex_transforms(pmedata, wcycle);
923 pme_gpu_launch_gather(pmedata, wcycle, PmeForceOutputHandling::Set);
926 /*! \brief
927 * Polling wait for either of the PME or nonbonded GPU tasks.
929 * Instead of a static order in waiting for GPU tasks, this function
930 * polls to check which of the two tasks completes first, and does the
931 * associated force buffer reduction overlapped with the other task.
932 * By doing that, unlike static scheduling order, it can always overlap
933 * one of the reductions, regardless of the GPU task completion order.
935 * \param[in] nbv Nonbonded verlet structure
936 * \param[in] pmedata PME module data
937 * \param[in,out] force Force array to reduce task outputs into.
938 * \param[in,out] forceWithVirial Force and virial buffers
939 * \param[in,out] fshift Shift force output vector results are reduced into
940 * \param[in,out] enerd Energy data structure results are reduced into
941 * \param[in] flags Force flags
942 * \param[in] wcycle The wallcycle structure
944 static void alternatePmeNbGpuWaitReduce(nonbonded_verlet_t *nbv,
945 const gmx_pme_t *pmedata,
946 gmx::PaddedArrayRef<gmx::RVec> *force,
947 ForceWithVirial *forceWithVirial,
948 rvec fshift[],
949 gmx_enerdata_t *enerd,
950 int flags,
951 gmx_wallcycle_t wcycle)
953 bool isPmeGpuDone = false;
954 bool isNbGpuDone = false;
957 gmx::ArrayRef<const gmx::RVec> pmeGpuForces;
959 while (!isPmeGpuDone || !isNbGpuDone)
961 if (!isPmeGpuDone)
963 matrix vir_Q;
964 real Vlr_q;
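/* If the nonbonded task has already completed, there is nothing left to
 * overlap with, so a blocking wait is used instead of polling. */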
966 GpuTaskCompletion completionType = (isNbGpuDone) ? GpuTaskCompletion::Wait : GpuTaskCompletion::Check;
967 isPmeGpuDone = pme_gpu_try_finish_task(pmedata, wcycle, &pmeGpuForces,
968 vir_Q, &Vlr_q, completionType);
970 if (isPmeGpuDone)
972 pme_gpu_reduce_outputs(wcycle, forceWithVirial, pmeGpuForces,
973 enerd, vir_Q, Vlr_q);
977 if (!isNbGpuDone)
979 GpuTaskCompletion completionType = (isPmeGpuDone) ? GpuTaskCompletion::Wait : GpuTaskCompletion::Check;
980 wallcycle_start_nocount(wcycle, ewcWAIT_GPU_NB_L);
981 isNbGpuDone = nbnxn_gpu_try_finish_task(nbv->gpu_nbv,
982 flags, eatLocal,
983 enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
984 fshift, completionType);
985 wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
986 // To get the call count right, when the task finished we
987 // issue a start/stop.
988 // TODO: move the ewcWAIT_GPU_NB_L cycle counting into nbnxn_gpu_try_finish_task()
989 // and ewcNB_XF_BUF_OPS counting into nbnxn_atomdata_add_nbat_f_to_f().
990 if (isNbGpuDone)
992 wallcycle_start(wcycle, ewcWAIT_GPU_NB_L);
993 wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
995 wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
996 wallcycle_sub_start(wcycle, ewcsNB_F_BUF_OPS);
997 nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs, eatLocal,
998 nbv->nbat, as_rvec_array(force->data()));
999 wallcycle_sub_stop(wcycle, ewcsNB_F_BUF_OPS);
1000 wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
1006 /*! \brief
1007 * Launch the dynamic rolling pruning GPU task.
1009 * We currently alternate local/non-local list pruning in odd-even steps
1010 * (only pruning every second step without DD).
1012 * \param[in] cr The communication record
1013 * \param[in] nbv Nonbonded verlet structure
1014 * \param[in] inputrec The input record
1015 * \param[in] step The current MD step
1017 static inline void launchGpuRollingPruning(const t_commrec *cr,
1018 const nonbonded_verlet_t *nbv,
1019 const t_inputrec *inputrec,
1020 const gmx_int64_t step)
1022 /* We should not launch the rolling pruning kernel at a search
1023 * step or just before search steps, since that's useless.
1024 * Without domain decomposition we prune at even steps.
1025 * With domain decomposition we alternate local and non-local
1026 * pruning at even and odd steps.
1028 int numRollingParts = nbv->listParams->numRollingParts;
1029 GMX_ASSERT(numRollingParts == nbv->listParams->nstlistPrune/2, "Since we alternate local/non-local at even/odd steps, we need numRollingParts<=nstlistPrune/2 for correctness and == for efficiency");
1030 int stepWithCurrentList = step - nbv->grp[eintLocal].nbl_lists.outerListCreationStep;
1031 bool stepIsEven = ((stepWithCurrentList & 1) == 0);
1032 if (stepWithCurrentList > 0 &&
1033 stepWithCurrentList < inputrec->nstlist - 1 &&
1034 (stepIsEven || DOMAINDECOMP(cr)))
1036 nbnxn_gpu_launch_kernel_pruneonly(nbv->gpu_nbv,
1037 stepIsEven ? eintLocal : eintNonlocal,
1038 numRollingParts);
1042 static void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
1043 t_inputrec *inputrec,
1044 gmx_int64_t step, t_nrnb *nrnb, gmx_wallcycle_t wcycle,
1045 gmx_localtop_t *top,
1046 gmx_groups_t gmx_unused *groups,
1047 matrix box, gmx::PaddedArrayRef<gmx::RVec> x, history_t *hist,
1048 gmx::PaddedArrayRef<gmx::RVec> force,
1049 tensor vir_force,
1050 t_mdatoms *mdatoms,
1051 gmx_enerdata_t *enerd, t_fcdata *fcd,
1052 real *lambda, t_graph *graph,
1053 t_forcerec *fr, interaction_const_t *ic,
1054 gmx_vsite_t *vsite, rvec mu_tot,
1055 double t, gmx_edsam_t ed,
1056 gmx_bool bBornRadii,
1057 int flags,
1058 DdOpenBalanceRegionBeforeForceComputation ddOpenBalanceRegion,
1059 DdCloseBalanceRegionAfterForceComputation ddCloseBalanceRegion)
1061 int cg1, i, j;
1062 double mu[2*DIM];
1063 gmx_bool bStateChanged, bNS, bFillGrid, bCalcCGCM;
1064 gmx_bool bDoForces, bUseGPU, bUseOrEmulGPU;
1065 rvec vzero, box_diag;
1066 float cycles_pme, cycles_wait_gpu;
1067 nonbonded_verlet_t *nbv = fr->nbv;
1069 bStateChanged = (flags & GMX_FORCE_STATECHANGED);
1070 bNS = (flags & GMX_FORCE_NS) && (fr->bAllvsAll == FALSE);
1071 bFillGrid = (bNS && bStateChanged);
1072 bCalcCGCM = (bFillGrid && !DOMAINDECOMP(cr));
1073 bDoForces = (flags & GMX_FORCE_FORCES);
1074 bUseGPU = fr->nbv->bUseGPU;
1075 bUseOrEmulGPU = bUseGPU || (fr->nbv->emulateGpu == EmulateGpuNonbonded::Yes);
1077 const auto pmeRunMode = fr->pmedata ? pme_run_mode(fr->pmedata) : PmeRunMode::CPU;
1078 // TODO slim this conditional down - inputrec and duty checks should mean the same in proper code!
1079 const bool useGpuPme = EEL_PME(fr->ic->eeltype) && thisRankHasDuty(cr, DUTY_PME) &&
1080 ((pmeRunMode == PmeRunMode::GPU) || (pmeRunMode == PmeRunMode::Mixed));
1082 /* At a search step we need to start the first balancing region
1083 * somewhere early inside the step after communication during domain
1084 * decomposition (and not during the previous step as usual).
1086 if (bNS &&
1087 ddOpenBalanceRegion == DdOpenBalanceRegionBeforeForceComputation::yes)
1089 ddOpenBalanceRegionCpu(cr->dd, DdAllowBalanceRegionReopen::yes);
1092 cycles_wait_gpu = 0;
1094 const int start = 0;
1095 const int homenr = mdatoms->homenr;
1097 clear_mat(vir_force);
1099 if (DOMAINDECOMP(cr))
1101 cg1 = cr->dd->ncg_tot;
1103 else
1105 cg1 = top->cgs.nr;
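/* With test-particle insertion (n_tpi > 0) the last charge group is,
 * presumably, the molecule being inserted, which is handled separately. */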
1107 if (fr->n_tpi > 0)
1109 cg1--;
1112 if (bStateChanged)
1114 update_forcerec(fr, box);
1116 if (inputrecNeedMutot(inputrec))
1118 /* Calculate total (local) dipole moment in a temporary common array.
1119 * This makes it possible to sum them over nodes faster.
1121 calc_mu(start, homenr,
1122 x, mdatoms->chargeA, mdatoms->chargeB, mdatoms->nChargePerturbed,
1123 mu, mu+DIM);
1127 if (fr->ePBC != epbcNONE)
1129 /* Compute shift vectors every step,
1130 * because of pressure coupling or box deformation!
1132 if ((flags & GMX_FORCE_DYNAMICBOX) && bStateChanged)
1134 calc_shifts(box, fr->shift_vec);
1137 if (bCalcCGCM)
1139 put_atoms_in_box_omp(fr->ePBC, box, x.subArray(0, homenr));
1140 inc_nrnb(nrnb, eNR_SHIFTX, homenr);
1142 else if (EI_ENERGY_MINIMIZATION(inputrec->eI) && graph)
1144 unshift_self(graph, box, as_rvec_array(x.data()));
1148 nbnxn_atomdata_copy_shiftvec(flags & GMX_FORCE_DYNAMICBOX,
1149 fr->shift_vec, nbv->nbat);
1151 #if GMX_MPI
1152 if (!thisRankHasDuty(cr, DUTY_PME))
1154 /* Send particle coordinates to the pme nodes.
1155 * Since this is only implemented for domain decomposition
1156 * and domain decomposition does not use the graph,
1157 * we do not need to worry about shifting.
1159 wallcycle_start(wcycle, ewcPP_PMESENDX);
1160 gmx_pme_send_coordinates(cr, box, as_rvec_array(x.data()),
1161 lambda[efptCOUL], lambda[efptVDW],
1162 (flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY)),
1163 step);
1164 wallcycle_stop(wcycle, ewcPP_PMESENDX);
1166 #endif /* GMX_MPI */
1168 if (useGpuPme)
1170 launchPmeGpuSpread(fr->pmedata, box, as_rvec_array(x.data()), flags, wcycle);
1173 /* do gridding for pair search */
1174 if (bNS)
1176 if (graph && bStateChanged)
1178 /* Calculate intramolecular shift vectors to make molecules whole */
1179 mk_mshift(fplog, graph, fr->ePBC, box, as_rvec_array(x.data()));
1182 clear_rvec(vzero);
1183 box_diag[XX] = box[XX][XX];
1184 box_diag[YY] = box[YY][YY];
1185 box_diag[ZZ] = box[ZZ][ZZ];
1187 wallcycle_start(wcycle, ewcNS);
1188 if (!fr->bDomDec)
1190 wallcycle_sub_start(wcycle, ewcsNBS_GRID_LOCAL);
1191 nbnxn_put_on_grid(nbv->nbs, fr->ePBC, box,
1192 0, vzero, box_diag,
1193 0, mdatoms->homenr, -1, fr->cginfo, as_rvec_array(x.data()),
1194 0, nullptr,
1195 nbv->grp[eintLocal].kernel_type,
1196 nbv->nbat);
1197 wallcycle_sub_stop(wcycle, ewcsNBS_GRID_LOCAL);
1199 else
1201 wallcycle_sub_start(wcycle, ewcsNBS_GRID_NONLOCAL);
1202 nbnxn_put_on_grid_nonlocal(nbv->nbs, domdec_zones(cr->dd),
1203 fr->cginfo, as_rvec_array(x.data()),
1204 nbv->grp[eintNonlocal].kernel_type,
1205 nbv->nbat);
1206 wallcycle_sub_stop(wcycle, ewcsNBS_GRID_NONLOCAL);
1209 nbnxn_atomdata_set(nbv->nbat, nbv->nbs, mdatoms, fr->cginfo);
1210 wallcycle_stop(wcycle, ewcNS);
1213 /* initialize the GPU atom data and copy shift vector */
1214 if (bUseGPU)
1216 wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
1217 wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1219 if (bNS)
1221 nbnxn_gpu_init_atomdata(nbv->gpu_nbv, nbv->nbat);
1224 nbnxn_gpu_upload_shiftvec(nbv->gpu_nbv, nbv->nbat);
1226 wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1227 wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1230 /* do local pair search */
1231 if (bNS)
1233 wallcycle_start_nocount(wcycle, ewcNS);
1234 wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_LOCAL);
1235 nbnxn_make_pairlist(nbv->nbs, nbv->nbat,
1236 &top->excls,
1237 nbv->listParams->rlistOuter,
1238 nbv->min_ci_balanced,
1239 &nbv->grp[eintLocal].nbl_lists,
1240 eintLocal,
1241 nbv->grp[eintLocal].kernel_type,
1242 nrnb);
1243 nbv->grp[eintLocal].nbl_lists.outerListCreationStep = step;
1244 if (nbv->listParams->useDynamicPruning && !bUseGPU)
1246 nbnxnPrepareListForDynamicPruning(&nbv->grp[eintLocal].nbl_lists);
1248 wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_LOCAL);
1250 if (bUseGPU)
1252 /* initialize local pair-list on the GPU */
1253 nbnxn_gpu_init_pairlist(nbv->gpu_nbv,
1254 nbv->grp[eintLocal].nbl_lists.nbl[0],
1255 eintLocal);
1257 wallcycle_stop(wcycle, ewcNS);
1259 else
1261 wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
1262 wallcycle_sub_start(wcycle, ewcsNB_X_BUF_OPS);
1263 nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs, eatLocal, FALSE, as_rvec_array(x.data()),
1264 nbv->nbat);
1265 wallcycle_sub_stop(wcycle, ewcsNB_X_BUF_OPS);
1266 wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
1269 if (bUseGPU)
1271 if (DOMAINDECOMP(cr))
1273 ddOpenBalanceRegionGpu(cr->dd);
1276 wallcycle_start(wcycle, ewcLAUNCH_GPU);
1277 wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1278 /* launch local nonbonded work on GPU */
1279 do_nb_verlet(fr, ic, enerd, flags, eintLocal, enbvClearFNo,
1280 step, nrnb, wcycle);
1281 wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1282 wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1285 if (useGpuPme)
1287 // In PME GPU and mixed mode we launch FFT / gather after the
1288 // X copy/transform to allow overlap as well as after the GPU NB
1289 // launch to avoid FFT launch overhead hijacking the CPU and delaying
1290 // the nonbonded kernel.
1291 launchPmeGpuFftAndGather(fr->pmedata, wcycle);
1294 /* Communicate coordinates and sum dipole if necessary +
1295 do non-local pair search */
1296 if (DOMAINDECOMP(cr))
1298 if (bNS)
1300 wallcycle_start_nocount(wcycle, ewcNS);
1301 wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_NONLOCAL);
1303 nbnxn_make_pairlist(nbv->nbs, nbv->nbat,
1304 &top->excls,
1305 nbv->listParams->rlistOuter,
1306 nbv->min_ci_balanced,
1307 &nbv->grp[eintNonlocal].nbl_lists,
1308 eintNonlocal,
1309 nbv->grp[eintNonlocal].kernel_type,
1310 nrnb);
1311 nbv->grp[eintNonlocal].nbl_lists.outerListCreationStep = step;
1312 if (nbv->listParams->useDynamicPruning && !bUseGPU)
1314 nbnxnPrepareListForDynamicPruning(&nbv->grp[eintNonlocal].nbl_lists);
1316 wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_NONLOCAL);
1318 if (nbv->grp[eintNonlocal].kernel_type == nbnxnk8x8x8_GPU)
1320 /* initialize non-local pair-list on the GPU */
1321 nbnxn_gpu_init_pairlist(nbv->gpu_nbv,
1322 nbv->grp[eintNonlocal].nbl_lists.nbl[0],
1323 eintNonlocal);
1325 wallcycle_stop(wcycle, ewcNS);
1327 else
1329 wallcycle_start(wcycle, ewcMOVEX);
1330 dd_move_x(cr->dd, box, as_rvec_array(x.data()));
1331 wallcycle_stop(wcycle, ewcMOVEX);
1333 wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
1334 wallcycle_sub_start(wcycle, ewcsNB_X_BUF_OPS);
1335 nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs, eatNonlocal, FALSE, as_rvec_array(x.data()),
1336 nbv->nbat);
1337 wallcycle_sub_stop(wcycle, ewcsNB_X_BUF_OPS);
1338 wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
1341 if (bUseGPU)
1343 wallcycle_start(wcycle, ewcLAUNCH_GPU);
1344 wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1345 /* launch non-local nonbonded tasks on GPU */
1346 do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFNo,
1347 step, nrnb, wcycle);
1348 wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1349 wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1353 if (bUseGPU)
1355 /* launch D2H copy-back F */
1356 wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
1357 wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1358 if (DOMAINDECOMP(cr))
1360 nbnxn_gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat,
1361 flags, eatNonlocal);
1363 nbnxn_gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat,
1364 flags, eatLocal);
1365 wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1366 wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1369 if (bStateChanged && inputrecNeedMutot(inputrec))
1371 if (PAR(cr))
1373 gmx_sumd(2*DIM, mu, cr);
1374 ddReopenBalanceRegionCpu(cr->dd);
1377 for (i = 0; i < 2; i++)
1379 for (j = 0; j < DIM; j++)
1381 fr->mu_tot[i][j] = mu[i*DIM + j];
1385 if (fr->efep == efepNO)
1387 copy_rvec(fr->mu_tot[0], mu_tot);
1389 else
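/* With perturbed charges, interpolate the total dipole linearly between
 * the A and B states using the Coulomb lambda. */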
1391 for (j = 0; j < DIM; j++)
1393 mu_tot[j] =
1394 (1.0 - lambda[efptCOUL])*fr->mu_tot[0][j] +
1395 lambda[efptCOUL]*fr->mu_tot[1][j];
1399 /* Reset energies */
1400 reset_enerdata(enerd);
1401 clear_rvecs(SHIFTS, fr->fshift);
1403 if (DOMAINDECOMP(cr) && !thisRankHasDuty(cr, DUTY_PME))
1405 wallcycle_start(wcycle, ewcPPDURINGPME);
1406 dd_force_flop_start(cr->dd, nrnb);
1409 if (inputrec->bRot)
1411 wallcycle_start(wcycle, ewcROT);
1412 do_rotation(cr, inputrec, box, as_rvec_array(x.data()), t, step, bNS);
1413 wallcycle_stop(wcycle, ewcROT);
1416 /* Temporary solution until all routines take PaddedRVecVector */
1417 rvec *f = as_rvec_array(force.data());
1419 /* Start the force cycle counter.
1420 * Note that a different counter is used for dynamic load balancing.
1422 wallcycle_start(wcycle, ewcFORCE);
1424 gmx::ArrayRef<gmx::RVec> forceRef = force;
1425 if (bDoForces)
1427 /* If we need to compute the virial, we might need a separate
1428 * force buffer for algorithms for which the virial is calculated
1429 * directly, such as PME.
1431 if ((flags & GMX_FORCE_VIRIAL) && fr->haveDirectVirialContributions)
1433 forceRef = *fr->forceBufferForDirectVirialContributions;
1435 /* We only compute forces on local atoms. Note that vsites can
1436 * spread to non-local atoms, but that part of the buffer is
1437 * cleared separately in the vsite spreading code.
1439 clear_rvecs_omp(mdatoms->homenr, as_rvec_array(forceRef.data()));
1441 /* Clear the short- and long-range forces */
1442 clear_rvecs_omp(fr->natoms_force_constr, f);
1445 /* forceWithVirial uses the local atom range only */
1446 ForceWithVirial forceWithVirial(forceRef, flags & GMX_FORCE_VIRIAL);
1448 if (inputrec->bPull && pull_have_constraint(inputrec->pull_work))
1450 clear_pull_forces(inputrec->pull_work);
1453 /* Non-bonded forces that are computed on the CPU are calculated here.
1454 * We do this before calling do_force_lowlevel, because in that
1455 * function, the listed forces are calculated before PME, which
1456 * does communication. With this order, non-bonded and listed
1457 * force calculation imbalance can be balanced out by the domain
1458 * decomposition load balancing.
1461 if (!bUseOrEmulGPU)
1463 do_nb_verlet(fr, ic, enerd, flags, eintLocal, enbvClearFYes,
1464 step, nrnb, wcycle);
1467 if (fr->efep != efepNO)
1469 /* Calculate the local and non-local free energy interactions here.
1470 * Happens here on the CPU both with and without GPU.
1472 if (fr->nbv->grp[eintLocal].nbl_lists.nbl_fep[0]->nrj > 0)
1474 do_nb_verlet_fep(&fr->nbv->grp[eintLocal].nbl_lists,
1475 fr, as_rvec_array(x.data()), f, mdatoms,
1476 inputrec->fepvals, lambda,
1477 enerd, flags, nrnb, wcycle);
1480 if (DOMAINDECOMP(cr) &&
1481 fr->nbv->grp[eintNonlocal].nbl_lists.nbl_fep[0]->nrj > 0)
1483 do_nb_verlet_fep(&fr->nbv->grp[eintNonlocal].nbl_lists,
1484 fr, as_rvec_array(x.data()), f, mdatoms,
1485 inputrec->fepvals, lambda,
1486 enerd, flags, nrnb, wcycle);
1490 if (!bUseOrEmulGPU)
1492 int aloc;
1494 if (DOMAINDECOMP(cr))
1496 do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFNo,
1497 step, nrnb, wcycle);
1500 if (!bUseOrEmulGPU)
1502 aloc = eintLocal;
1504 else
1506 aloc = eintNonlocal;
1509 /* Add all the non-bonded force to the normal force array.
1510 * This can be split into a local and a non-local part when overlapping
1511 * communication with calculation with domain decomposition.
1513 wallcycle_stop(wcycle, ewcFORCE);
1514 wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
1515 wallcycle_sub_start(wcycle, ewcsNB_F_BUF_OPS);
1516 nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs, eatAll, nbv->nbat, f);
1517 wallcycle_sub_stop(wcycle, ewcsNB_F_BUF_OPS);
1518 wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
1519 wallcycle_start_nocount(wcycle, ewcFORCE);
1521 /* if there are multiple fshift output buffers reduce them */
1522 if ((flags & GMX_FORCE_VIRIAL) &&
1523 nbv->grp[aloc].nbl_lists.nnbl > 1)
1525 /* This is not in a subcounter because it takes a
1526 negligible and constant-sized amount of time */
1527 nbnxn_atomdata_add_nbat_fshift_to_fshift(nbv->nbat,
1528 fr->fshift);
1532 /* update QMMMrec, if necessary */
1533 if (fr->bQMMM)
1535 update_QMMMrec(cr, fr, as_rvec_array(x.data()), mdatoms, box);
1538 /* Compute the bonded and non-bonded energies and optionally forces */
1539 do_force_lowlevel(fr, inputrec, &(top->idef),
1540 cr, nrnb, wcycle, mdatoms,
1541 as_rvec_array(x.data()), hist, f, &forceWithVirial, enerd, fcd, top, fr->born,
1542 bBornRadii, box,
1543 inputrec->fepvals, lambda, graph, &(top->excls), fr->mu_tot,
1544 flags, &cycles_pme);
1546 wallcycle_stop(wcycle, ewcFORCE);
1548 computeSpecialForces(fplog, cr, inputrec, step, t, wcycle,
1549 fr->forceProviders, box, as_rvec_array(x.data()), mdatoms, lambda,
1550 flags, &forceWithVirial, enerd,
1551 ed, bNS);
1553 if (bUseOrEmulGPU)
1555 /* wait for non-local forces (or calculate in emulation mode) */
1556 if (DOMAINDECOMP(cr))
1558 if (bUseGPU)
1560 wallcycle_start(wcycle, ewcWAIT_GPU_NB_NL);
1561 nbnxn_gpu_wait_finish_task(nbv->gpu_nbv,
1562 flags, eatNonlocal,
1563 enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
1564 fr->fshift);
1565 cycles_wait_gpu += wallcycle_stop(wcycle, ewcWAIT_GPU_NB_NL);
1567 else
1569 wallcycle_start_nocount(wcycle, ewcFORCE);
1570 do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFYes,
1571 step, nrnb, wcycle);
1572 wallcycle_stop(wcycle, ewcFORCE);
1574 wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
1575 wallcycle_sub_start(wcycle, ewcsNB_F_BUF_OPS);
1576 /* skip the reduction if there was no non-local work to do */
1577 if (nbv->grp[eintNonlocal].nbl_lists.nbl[0]->nsci > 0)
1579 nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs, eatNonlocal,
1580 nbv->nbat, f);
1582 wallcycle_sub_stop(wcycle, ewcsNB_F_BUF_OPS);
1583 wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
1587 if (DOMAINDECOMP(cr))
1589 /* We are done with the CPU compute.
1590 * We will now communicate the non-local forces.
1591 * If we use a GPU this will overlap with GPU work, so in that case
1592 * we do not close the DD force balancing region here.
1594 if (ddCloseBalanceRegion == DdCloseBalanceRegionAfterForceComputation::yes)
1596 ddCloseBalanceRegionCpu(cr->dd);
1598 if (bDoForces)
1600 wallcycle_start(wcycle, ewcMOVEF);
1601 dd_move_f(cr->dd, f, fr->fshift);
1602 wallcycle_stop(wcycle, ewcMOVEF);
1606 // With both nonbonded and PME offloaded to a GPU on the same rank, we use
1607 // an alternating wait/reduction scheme.
1608 bool alternateGpuWait = (!c_disableAlternatingWait && useGpuPme && bUseGPU && !DOMAINDECOMP(cr));
1609 if (alternateGpuWait)
1611 alternatePmeNbGpuWaitReduce(fr->nbv, fr->pmedata, &force, &forceWithVirial, fr->fshift, enerd, flags, wcycle);
1614 if (!alternateGpuWait && useGpuPme)
1616 gmx::ArrayRef<const gmx::RVec> pmeGpuForces;
1617 matrix vir_Q;
1618 real Vlr_q;
1619 pme_gpu_wait_finish_task(fr->pmedata, wcycle, &pmeGpuForces, vir_Q, &Vlr_q);
1620 pme_gpu_reduce_outputs(wcycle, &forceWithVirial, pmeGpuForces, enerd, vir_Q, Vlr_q);
1623 /* Wait for local GPU NB outputs on the non-alternating wait path */
1624 if (!alternateGpuWait && bUseGPU)
1626 /* Measured overhead on CUDA and OpenCL with(out) GPU sharing
1627 * is between 0.5 and 1.5 Mcycles. So 2 MCycles is an overestimate,
1628 * but even with a step of 0.1 ms the difference is less than 1%
1629 * of the step time.
1631 const float gpuWaitApiOverheadMargin = 2e6f; /* cycles */
1633 wallcycle_start(wcycle, ewcWAIT_GPU_NB_L);
1634 nbnxn_gpu_wait_finish_task(nbv->gpu_nbv,
1635 flags, eatLocal,
1636 enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
1637 fr->fshift);
1638 float cycles_tmp = wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
1640 if (ddCloseBalanceRegion == DdCloseBalanceRegionAfterForceComputation::yes)
1642 DdBalanceRegionWaitedForGpu waitedForGpu = DdBalanceRegionWaitedForGpu::yes;
1643 if (bDoForces && cycles_tmp <= gpuWaitApiOverheadMargin)
1645 /* We measured only a few cycles, so it could be that the kernel
1646 * and transfer finished earlier and there was no actual
1647 * wait time, only API call overhead.
1648 * Then the actual time could be anywhere between 0 and
1649 * cycles_wait_est. We will use half of cycles_wait_est.
1651 waitedForGpu = DdBalanceRegionWaitedForGpu::no;
1653 ddCloseBalanceRegionGpu(cr->dd, cycles_wait_gpu, waitedForGpu);
1657 if (fr->nbv->emulateGpu == EmulateGpuNonbonded::Yes)
1659 // NOTE: emulation kernel is not included in the balancing region,
1660 // but emulation mode does not target performance anyway
1661 wallcycle_start_nocount(wcycle, ewcFORCE);
1662 do_nb_verlet(fr, ic, enerd, flags, eintLocal,
1663 DOMAINDECOMP(cr) ? enbvClearFNo : enbvClearFYes,
1664 step, nrnb, wcycle);
1665 wallcycle_stop(wcycle, ewcFORCE);
1668 if (bUseGPU)
1670 /* now clear the GPU outputs while we finish the step on the CPU */
1671 wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
1672 wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1673 nbnxn_gpu_clear_outputs(nbv->gpu_nbv, flags);
1675 /* Is dynamic pair-list pruning activated? */
1676 if (nbv->listParams->useDynamicPruning)
1678 launchGpuRollingPruning(cr, nbv, inputrec, step);
1680 wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1681 wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1683 // TODO: move here the PME buffer clearing call pme_gpu_reinit_computation()
1686 /* Do the nonbonded GPU (or emulation) force buffer reduction
1687 * on the non-alternating path. */
1688 if (bUseOrEmulGPU && !alternateGpuWait)
1690 wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
1691 wallcycle_sub_start(wcycle, ewcsNB_F_BUF_OPS);
1692 nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs, eatLocal,
1693 nbv->nbat, f);
1694 wallcycle_sub_stop(wcycle, ewcsNB_F_BUF_OPS);
1695 wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
1698 if (DOMAINDECOMP(cr))
1700 dd_force_flop_stop(cr->dd, nrnb);
1703 if (bDoForces)
1705 /* If we have NoVirSum forces, but we do not calculate the virial,
1706 * we sum fr->f_novirsum=f later.
1708 if (vsite && !(fr->haveDirectVirialContributions && !(flags & GMX_FORCE_VIRIAL)))
1710 wallcycle_start(wcycle, ewcVSITESPREAD);
1711 spread_vsite_f(vsite, as_rvec_array(x.data()), f, fr->fshift, FALSE, nullptr, nrnb,
1712 &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr);
1713 wallcycle_stop(wcycle, ewcVSITESPREAD);
1716 if (flags & GMX_FORCE_VIRIAL)
1718 /* Calculation of the virial must be done after vsites! */
1719 calc_virial(0, mdatoms->homenr, as_rvec_array(x.data()), f,
1720 vir_force, graph, box, nrnb, fr, inputrec->ePBC);
1724 if (PAR(cr) && !thisRankHasDuty(cr, DUTY_PME))
1726 /* In case of node-splitting, the PP nodes receive the long-range
1727 * forces, virial and energy from the PME nodes here.
1729 pme_receive_force_ener(cr, &forceWithVirial, enerd, wcycle);
1732 if (bDoForces)
1734 post_process_forces(cr, step, nrnb, wcycle,
1735 top, box, as_rvec_array(x.data()), f, &forceWithVirial,
1736 vir_force, mdatoms, graph, fr, vsite,
1737 flags);
1740 if (flags & GMX_FORCE_ENERGY)
1742 /* Sum the potential energy terms from group contributions */
1743 sum_epot(&(enerd->grpp), enerd->term);
1745 if (!EI_TPI(inputrec->eI))
1747 checkPotentialEnergyValidity(step, *enerd, *inputrec);
1752 static void do_force_cutsGROUP(FILE *fplog, t_commrec *cr,
1753 t_inputrec *inputrec,
1754 gmx_int64_t step, t_nrnb *nrnb, gmx_wallcycle_t wcycle,
1755 gmx_localtop_t *top,
1756 gmx_groups_t *groups,
1757 matrix box, gmx::PaddedArrayRef<gmx::RVec> x, history_t *hist,
1758 gmx::PaddedArrayRef<gmx::RVec> force,
1759 tensor vir_force,
1760 t_mdatoms *mdatoms,
1761 gmx_enerdata_t *enerd, t_fcdata *fcd,
1762 real *lambda, t_graph *graph,
1763 t_forcerec *fr, gmx_vsite_t *vsite, rvec mu_tot,
1764 double t, gmx_edsam_t ed,
1765 gmx_bool bBornRadii,
1766 int flags,
1767 DdOpenBalanceRegionBeforeForceComputation ddOpenBalanceRegion,
1768 DdCloseBalanceRegionAfterForceComputation ddCloseBalanceRegion)
1770 int cg0, cg1, i, j;
1771 double mu[2*DIM];
1772 gmx_bool bStateChanged, bNS, bFillGrid, bCalcCGCM;
1773 gmx_bool bDoForces;
1774 float cycles_pme;
1776 const int start = 0;
1777 const int homenr = mdatoms->homenr;
1779 clear_mat(vir_force);
1781 cg0 = 0;
1782 if (DOMAINDECOMP(cr))
1784 cg1 = cr->dd->ncg_tot;
1786 else
1788 cg1 = top->cgs.nr;
1790 if (fr->n_tpi > 0)
1792 cg1--;
1795 bStateChanged = (flags & GMX_FORCE_STATECHANGED);
1796 bNS = (flags & GMX_FORCE_NS) && (fr->bAllvsAll == FALSE);
1797 /* Should we perform the long-range nonbonded evaluation inside the neighborsearching? */
1798 bFillGrid = (bNS && bStateChanged);
1799 bCalcCGCM = (bFillGrid && !DOMAINDECOMP(cr));
1800 bDoForces = (flags & GMX_FORCE_FORCES);
1802 if (bStateChanged)
1804 update_forcerec(fr, box);
1806 if (inputrecNeedMutot(inputrec))
1808 /* Calculate total (local) dipole moment in a temporary common array.
1809 * This makes it possible to sum them over nodes faster.
1811 calc_mu(start, homenr,
1812 x, mdatoms->chargeA, mdatoms->chargeB, mdatoms->nChargePerturbed,
1813 mu, mu+DIM);
1817 if (fr->ePBC != epbcNONE)
1819 /* Compute shift vectors every step,
1820 * because of pressure coupling or box deformation!
1822 if ((flags & GMX_FORCE_DYNAMICBOX) && bStateChanged)
1824 calc_shifts(box, fr->shift_vec);
1827 if (bCalcCGCM)
1829 put_charge_groups_in_box(fplog, cg0, cg1, fr->ePBC, box,
1830 &(top->cgs), as_rvec_array(x.data()), fr->cg_cm);
1831 inc_nrnb(nrnb, eNR_CGCM, homenr);
1832 inc_nrnb(nrnb, eNR_RESETX, cg1-cg0);
1834 else if (EI_ENERGY_MINIMIZATION(inputrec->eI) && graph)
1836 unshift_self(graph, box, as_rvec_array(x.data()));
1839 else if (bCalcCGCM)
1841 calc_cgcm(fplog, cg0, cg1, &(top->cgs), as_rvec_array(x.data()), fr->cg_cm);
1842 inc_nrnb(nrnb, eNR_CGCM, homenr);
1845 if (bCalcCGCM && gmx_debug_at)
1847 pr_rvecs(debug, 0, "cgcm", fr->cg_cm, top->cgs.nr);
1850 #if GMX_MPI
1851 if (!thisRankHasDuty(cr, DUTY_PME))
1853 /* Send particle coordinates to the pme nodes.
1854 * Since this is only implemented for domain decomposition
1855 * and domain decomposition does not use the graph,
1856 * we do not need to worry about shifting.
1858 wallcycle_start(wcycle, ewcPP_PMESENDX);
1859 gmx_pme_send_coordinates(cr, box, as_rvec_array(x.data()),
1860 lambda[efptCOUL], lambda[efptVDW],
1861 (flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY)),
1862 step);
1863 wallcycle_stop(wcycle, ewcPP_PMESENDX);
1865 #endif /* GMX_MPI */
1867 /* Communicate coordinates and sum dipole if necessary */
1868 if (DOMAINDECOMP(cr))
1870 wallcycle_start(wcycle, ewcMOVEX);
1871 dd_move_x(cr->dd, box, as_rvec_array(x.data()));
1872 wallcycle_stop(wcycle, ewcMOVEX);
1873 /* No GPU support, no move_x overlap, so reopen the balance region here */
1874 if (ddOpenBalanceRegion == DdOpenBalanceRegionBeforeForceComputation::yes)
1876 ddReopenBalanceRegionCpu(cr->dd);
1880 if (inputrecNeedMutot(inputrec))
1882 if (bStateChanged)
1884 if (PAR(cr))
1886 gmx_sumd(2*DIM, mu, cr);
1887 ddReopenBalanceRegionCpu(cr->dd);
1889 for (i = 0; i < 2; i++)
1891 for (j = 0; j < DIM; j++)
1893 fr->mu_tot[i][j] = mu[i*DIM + j];
1897 if (fr->efep == efepNO)
1899 copy_rvec(fr->mu_tot[0], mu_tot);
1901 else
1903 for (j = 0; j < DIM; j++)
1905 mu_tot[j] =
1906 (1.0 - lambda[efptCOUL])*fr->mu_tot[0][j] + lambda[efptCOUL]*fr->mu_tot[1][j];
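/* e.g. with lambda[efptCOUL] = 0.25 the reported dipole is
 * 0.75*mu_tot(state A) + 0.25*mu_tot(state B). */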
1911 /* Reset energies */
1912 reset_enerdata(enerd);
1913 clear_rvecs(SHIFTS, fr->fshift);
1915 if (bNS)
1917 wallcycle_start(wcycle, ewcNS);
1919 if (graph && bStateChanged)
1921 /* Calculate intramolecular shift vectors to make molecules whole */
1922 mk_mshift(fplog, graph, fr->ePBC, box, as_rvec_array(x.data()));
1925 /* Do the actual neighbour searching */
1926 ns(fplog, fr, box,
1927 groups, top, mdatoms,
1928 cr, nrnb, bFillGrid);
1930 wallcycle_stop(wcycle, ewcNS);
1933 if (inputrec->implicit_solvent && bNS)
1935 make_gb_nblist(cr, inputrec->gb_algorithm,
1936 as_rvec_array(x.data()), box, fr, &top->idef, graph, fr->born);
1939 if (DOMAINDECOMP(cr) && !thisRankHasDuty(cr, DUTY_PME))
1941 wallcycle_start(wcycle, ewcPPDURINGPME);
1942 dd_force_flop_start(cr->dd, nrnb);
1945 if (inputrec->bRot)
1947 wallcycle_start(wcycle, ewcROT);
1948 do_rotation(cr, inputrec, box, as_rvec_array(x.data()), t, step, bNS);
1949 wallcycle_stop(wcycle, ewcROT);
1952 /* Temporary solution until all routines take PaddedRVecVector */
1953 rvec *f = as_rvec_array(force.data());
1955 /* Start the force cycle counter.
1956 * Note that a different counter is used for dynamic load balancing.
1958 wallcycle_start(wcycle, ewcFORCE);
1960 gmx::ArrayRef<gmx::RVec> forceRef = force;
1961 if (bDoForces)
1963 /* If we need to compute the virial, we might need a separate
1964 * force buffer for algorithms for which the virial is calculated
1965 * separately, such as PME.
1967 if ((flags & GMX_FORCE_VIRIAL) && fr->haveDirectVirialContributions)
1969 forceRef = *fr->forceBufferForDirectVirialContributions;
1971 clear_rvecs_omp(forceRef.size(), as_rvec_array(forceRef.data()));
1974 /* Clear the short- and long-range forces */
1975 clear_rvecs(fr->natoms_force_constr, f);
1978 /* forceWithVirial might need the full force atom range */
1979 ForceWithVirial forceWithVirial(forceRef, flags & GMX_FORCE_VIRIAL);
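/* Note (hedged): when a separate buffer was selected above (virial requested
 * and direct virial contributions present, e.g. PME), forceWithVirial wraps
 * that buffer rather than f; the two are presumably recombined later in
 * post_process_forces(). */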
1981 if (inputrec->bPull && pull_have_constraint(inputrec->pull_work))
1983 clear_pull_forces(inputrec->pull_work);
1986 /* update QMMMrec, if necessary */
1987 if (fr->bQMMM)
1989 update_QMMMrec(cr, fr, as_rvec_array(x.data()), mdatoms, box);
1992 /* Compute the bonded and non-bonded energies and optionally forces */
1993 do_force_lowlevel(fr, inputrec, &(top->idef),
1994 cr, nrnb, wcycle, mdatoms,
1995 as_rvec_array(x.data()), hist, f, &forceWithVirial, enerd, fcd, top, fr->born,
1996 bBornRadii, box,
1997 inputrec->fepvals, lambda,
1998 graph, &(top->excls), fr->mu_tot,
1999 flags,
2000 &cycles_pme);
2002 wallcycle_stop(wcycle, ewcFORCE);
2004 if (DOMAINDECOMP(cr))
2006 dd_force_flop_stop(cr->dd, nrnb);
2008 if (ddCloseBalanceRegion == DdCloseBalanceRegionAfterForceComputation::yes)
2010 ddCloseBalanceRegionCpu(cr->dd);
2014 computeSpecialForces(fplog, cr, inputrec, step, t, wcycle,
2015 fr->forceProviders, box, as_rvec_array(x.data()), mdatoms, lambda,
2016 flags, &forceWithVirial, enerd,
2017 ed, bNS);
2019 if (bDoForces)
2021 /* Communicate the forces */
2022 if (DOMAINDECOMP(cr))
2024 wallcycle_start(wcycle, ewcMOVEF);
2025 dd_move_f(cr->dd, f, fr->fshift);
2026 /* Do we need to communicate the separate force array
2027 * for terms that do not contribute to the single sum virial?
2028 * Position restraints and electric fields do not introduce
2029 * inter-cg forces, only full electrostatics methods do.
2030 * When we do not calculate the virial, fr->f_novirsum = f,
2031 * so we have already communicated these forces.
2033 if (EEL_FULL(fr->ic->eeltype) && cr->dd->n_intercg_excl &&
2034 (flags & GMX_FORCE_VIRIAL))
2036 dd_move_f(cr->dd, as_rvec_array(forceWithVirial.force_.data()), nullptr);
2038 wallcycle_stop(wcycle, ewcMOVEF);
2041 /* If we have NoVirSum forces, but we do not calculate the virial,
2042 * we sum fr->f_novirsum=f later.
2044 if (vsite && !(fr->haveDirectVirialContributions && !(flags & GMX_FORCE_VIRIAL)))
2046 wallcycle_start(wcycle, ewcVSITESPREAD);
2047 spread_vsite_f(vsite, as_rvec_array(x.data()), f, fr->fshift, FALSE, nullptr, nrnb,
2048 &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr);
2049 wallcycle_stop(wcycle, ewcVSITESPREAD);
2052 if (flags & GMX_FORCE_VIRIAL)
2054 /* Calculation of the virial must be done after vsites! */
2055 calc_virial(0, mdatoms->homenr, as_rvec_array(x.data()), f,
2056 vir_force, graph, box, nrnb, fr, inputrec->ePBC);
2060 if (PAR(cr) && !thisRankHasDuty(cr, DUTY_PME))
2062 /* In case of node-splitting, the PP nodes receive the long-range
2063 * forces, virial and energy from the PME nodes here.
2065 pme_receive_force_ener(cr, &forceWithVirial, enerd, wcycle);
2068 if (bDoForces)
2070 post_process_forces(cr, step, nrnb, wcycle,
2071 top, box, as_rvec_array(x.data()), f, &forceWithVirial,
2072 vir_force, mdatoms, graph, fr, vsite,
2073 flags);
2076 if (flags & GMX_FORCE_ENERGY)
2078 /* Sum the potential energy terms from group contributions */
2079 sum_epot(&(enerd->grpp), enerd->term);
2081 if (!EI_TPI(inputrec->eI))
2083 checkPotentialEnergyValidity(step, *enerd, *inputrec);
2089 void do_force(FILE *fplog, t_commrec *cr,
2090 t_inputrec *inputrec,
2091 gmx_int64_t step, t_nrnb *nrnb, gmx_wallcycle_t wcycle,
2092 gmx_localtop_t *top,
2093 gmx_groups_t *groups,
2094 matrix box, gmx::PaddedArrayRef<gmx::RVec> x, history_t *hist,
2095 gmx::PaddedArrayRef<gmx::RVec> force,
2096 tensor vir_force,
2097 t_mdatoms *mdatoms,
2098 gmx_enerdata_t *enerd, t_fcdata *fcd,
2099 gmx::ArrayRef<real> lambda, t_graph *graph,
2100 t_forcerec *fr,
2101 gmx_vsite_t *vsite, rvec mu_tot,
2102 double t, gmx_edsam_t ed,
2103 gmx_bool bBornRadii,
2104 int flags,
2105 DdOpenBalanceRegionBeforeForceComputation ddOpenBalanceRegion,
2106 DdCloseBalanceRegionAfterForceComputation ddCloseBalanceRegion)
2108 /* modify force flag if not doing nonbonded */
2109 if (!fr->bNonbonded)
2111 flags &= ~GMX_FORCE_NONBONDED;
2114 GMX_ASSERT(x.size() >= gmx::paddedRVecVectorSize(fr->natoms_force), "coordinates should be padded");
2115 GMX_ASSERT(force.size() >= gmx::paddedRVecVectorSize(fr->natoms_force), "force should be padded");
2117 switch (inputrec->cutoff_scheme)
2119 case ecutsVERLET:
2120 do_force_cutsVERLET(fplog, cr, inputrec,
2121 step, nrnb, wcycle,
2122 top,
2123 groups,
2124 box, x, hist,
2125 force, vir_force,
2126 mdatoms,
2127 enerd, fcd,
2128 lambda.data(), graph,
2129 fr, fr->ic,
2130 vsite, mu_tot,
2131 t, ed,
2132 bBornRadii,
2133 flags,
2134 ddOpenBalanceRegion,
2135 ddCloseBalanceRegion);
2136 break;
2137 case ecutsGROUP:
2138 do_force_cutsGROUP(fplog, cr, inputrec,
2139 step, nrnb, wcycle,
2140 top,
2141 groups,
2142 box, x, hist,
2143 force, vir_force,
2144 mdatoms,
2145 enerd, fcd,
2146 lambda.data(), graph,
2147 fr, vsite, mu_tot,
2148 t, ed,
2149 bBornRadii,
2150 flags,
2151 ddOpenBalanceRegion,
2152 ddCloseBalanceRegion);
2153 break;
2154 default:
2155 gmx_incons("Invalid cut-off scheme passed!");
2158 /* In case we don't have constraints and are using GPUs, the next balancing
2159 * region starts here.
2160 * Some "special" work at the end of do_force_cuts?, such as vsite spread,
2161 * virial calculation and COM pulling, is thus not included in
2162 * the balance timing, which is ok as most tasks do communication.
2164 if (ddOpenBalanceRegion == DdOpenBalanceRegionBeforeForceComputation::yes)
2166 ddOpenBalanceRegionCpu(cr->dd, DdAllowBalanceRegionReopen::no);
2171 void do_constrain_first(FILE *fplog, gmx_constr_t constr,
2172 t_inputrec *ir, t_mdatoms *md,
2173 t_state *state, t_commrec *cr, t_nrnb *nrnb,
2174 t_forcerec *fr, gmx_localtop_t *top)
2176 int i, m, start, end;
2177 gmx_int64_t step;
2178 real dt = ir->delta_t;
2179 real dvdl_dum;
2180 rvec *savex;
2182 /* We need to allocate one element extra, since we might use
2183 * (unaligned) 4-wide SIMD loads to access rvec entries.
2185 snew(savex, state->natoms + 1);
2187 start = 0;
2188 end = md->homenr;
2190 if (debug)
2192 fprintf(debug, "vcm: start=%d, homenr=%d, end=%d\n",
2193 start, md->homenr, end);
2195 /* Do a first constraining pass to reset the particles... */
2196 step = ir->init_step;
2197 if (fplog)
2199 char buf[STEPSTRSIZE];
2200 fprintf(fplog, "\nConstraining the starting coordinates (step %s)\n",
2201 gmx_step_str(step, buf));
2203 dvdl_dum = 0;
2205 /* constrain the current position */
2206 constrain(nullptr, TRUE, FALSE, constr, &(top->idef),
2207 ir, cr, step, 0, 1.0, md,
2208 as_rvec_array(state->x.data()), as_rvec_array(state->x.data()), nullptr,
2209 fr->bMolPBC, state->box,
2210 state->lambda[efptBONDED], &dvdl_dum,
2211 nullptr, nullptr, nrnb, econqCoord);
2212 if (EI_VV(ir->eI))
2214 /* constrain the initial velocity, and save it */
2215 /* this may also be useful if we need the ekin from the half step for velocity Verlet */
2216 constrain(nullptr, TRUE, FALSE, constr, &(top->idef),
2217 ir, cr, step, 0, 1.0, md,
2218 as_rvec_array(state->x.data()), as_rvec_array(state->v.data()), as_rvec_array(state->v.data()),
2219 fr->bMolPBC, state->box,
2220 state->lambda[efptBONDED], &dvdl_dum,
2221 nullptr, nullptr, nrnb, econqVeloc);
2223 /* constrain the initial velocities at t-dt/2 */
2224 if (EI_STATE_VELOCITY(ir->eI) && ir->eI != eiVV)
2226 for (i = start; (i < end); i++)
2228 for (m = 0; (m < DIM); m++)
2230 /* Reverse the velocity */
2231 state->v[i][m] = -state->v[i][m];
2232 /* Store the position at t-dt in savex */
2233 savex[i][m] = state->x[i][m] + dt*state->v[i][m];
2236 /* Shake the positions at t=-dt with the positions at t=0
2237 * as reference coordinates.
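/* Since v was negated above, savex holds x(t0) - dt*v(t0), i.e. the
 * unconstrained positions one step back in time; after the constraint call
 * below the velocities are re-reversed. */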
2239 if (fplog)
2241 char buf[STEPSTRSIZE];
2242 fprintf(fplog, "\nConstraining the coordinates at t0-dt (step %s)\n",
2243 gmx_step_str(step, buf));
2245 dvdl_dum = 0;
2246 constrain(nullptr, TRUE, FALSE, constr, &(top->idef),
2247 ir, cr, step, -1, 1.0, md,
2248 as_rvec_array(state->x.data()), savex, nullptr,
2249 fr->bMolPBC, state->box,
2250 state->lambda[efptBONDED], &dvdl_dum,
2251 as_rvec_array(state->v.data()), nullptr, nrnb, econqCoord);
2253 for (i = start; i < end; i++)
2255 for (m = 0; m < DIM; m++)
2257 /* Re-reverse the velocities */
2258 state->v[i][m] = -state->v[i][m];
2262 sfree(savex);
2266 static void
2267 integrate_table(real vdwtab[], real scale, int offstart, int rstart, int rend,
2268 double *enerout, double *virout)
2270 double enersum, virsum;
2271 double invscale, invscale2, invscale3;
2272 double r, ea, eb, ec, pa, pb, pc, pd;
2273 double y0, f, g, h;
2274 int ri, offset;
2275 double tabfactor;
2277 invscale = 1.0/scale;
2278 invscale2 = invscale*invscale;
2279 invscale3 = invscale*invscale2;
2281 /* Following summation derived from cubic spline definition,
2282 * Numerical Recipes in C, second edition, p. 113-116. Exact for
2283 * the cubic spline. We first calculate the negative of the
2284 * energy from rvdw to rvdw_switch, assuming that g(r)=1, and then
2285 * add the more standard, abrupt cutoff correction to that result,
2286 * yielding the long-range correction for a switched function. We
2287 * perform both the pressure and energy loops at the same time for
2288 * simplicity, as the computational cost is low. */
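/* Sketch of what the loop below accumulates (assuming the four-entries-per-point
 * cubic-spline layout y0, F, G, H that is actually read from vdwtab): within
 * bin ri the potential is y0 + F*t + G*t^2 + H*t^3 for t in [0,1] and
 * r' = r + t/scale, and ea/eb/ec (pa..pd) are the moments of r'^2 (r'^3)
 * including the dr' Jacobian. enersum is thus ~ sum over bins of
 * integral r'^2 V(r') dr' and virsum ~ the matching integral of r'^3 dV/dr';
 * the 4*pi and the 6/12 table-scaling factor are applied at the end. */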
2290 if (offstart == 0)
2292 /* Since the dispersion table has been scaled down a factor
2293 * 6.0 and the repulsion a factor 12.0 to compensate for the
2294 * c6/c12 parameters inside nbfp[] being scaled up (to save
2295 * flops in kernels), we need to correct for this.
2297 tabfactor = 6.0;
2299 else
2301 tabfactor = 12.0;
2304 enersum = 0.0;
2305 virsum = 0.0;
2306 for (ri = rstart; ri < rend; ++ri)
2308 r = ri*invscale;
2309 ea = invscale3;
2310 eb = 2.0*invscale2*r;
2311 ec = invscale*r*r;
2313 pa = invscale3;
2314 pb = 3.0*invscale2*r;
2315 pc = 3.0*invscale*r*r;
2316 pd = r*r*r;
2318 /* this "8" is from the packing in the vdwtab array - perhaps
2319 should be defined? */
2321 offset = 8*ri + offstart;
2322 y0 = vdwtab[offset];
2323 f = vdwtab[offset+1];
2324 g = vdwtab[offset+2];
2325 h = vdwtab[offset+3];
2327 enersum += y0*(ea/3 + eb/2 + ec) + f*(ea/4 + eb/3 + ec/2) + g*(ea/5 + eb/4 + ec/3) + h*(ea/6 + eb/5 + ec/4);
2328 virsum += f*(pa/4 + pb/3 + pc/2 + pd) + 2*g*(pa/5 + pb/4 + pc/3 + pd/2) + 3*h*(pa/6 + pb/5 + pc/4 + pd/3);
2330 *enerout = 4.0*M_PI*enersum*tabfactor;
2331 *virout = 4.0*M_PI*virsum*tabfactor;
2334 void calc_enervirdiff(FILE *fplog, int eDispCorr, t_forcerec *fr)
2336 double eners[2], virs[2], enersum, virsum;
2337 double r0, rc3, rc9;
2338 int ri0, ri1, i;
2339 real scale, *vdwtab;
2341 fr->enershiftsix = 0;
2342 fr->enershifttwelve = 0;
2343 fr->enerdiffsix = 0;
2344 fr->enerdifftwelve = 0;
2345 fr->virdiffsix = 0;
2346 fr->virdifftwelve = 0;
2348 const interaction_const_t *ic = fr->ic;
2350 if (eDispCorr != edispcNO)
2352 for (i = 0; i < 2; i++)
2354 eners[i] = 0;
2355 virs[i] = 0;
2357 if ((ic->vdw_modifier == eintmodPOTSHIFT) ||
2358 (ic->vdw_modifier == eintmodPOTSWITCH) ||
2359 (ic->vdw_modifier == eintmodFORCESWITCH) ||
2360 (ic->vdwtype == evdwSHIFT) ||
2361 (ic->vdwtype == evdwSWITCH))
2363 if (((ic->vdw_modifier == eintmodPOTSWITCH) ||
2364 (ic->vdw_modifier == eintmodFORCESWITCH) ||
2365 (ic->vdwtype == evdwSWITCH)) && ic->rvdw_switch == 0)
2367 gmx_fatal(FARGS,
2368 "With dispersion correction rvdw-switch can not be zero "
2369 "for vdw-type = %s", evdw_names[ic->vdwtype]);
2372 /* TODO This code depends on the logic in tables.c that
2373 constructs the table layout, which should be made
2374 explicit in future cleanup. */
2375 GMX_ASSERT(fr->dispersionCorrectionTable->interaction == GMX_TABLE_INTERACTION_VDWREP_VDWDISP,
2376 "Dispersion-correction code needs a table with both repulsion and dispersion terms");
2377 scale = fr->dispersionCorrectionTable->scale;
2378 vdwtab = fr->dispersionCorrectionTable->data;
2380 /* Round the cut-offs to exact table values for precision */
2381 ri0 = static_cast<int>(floor(ic->rvdw_switch*scale));
2382 ri1 = static_cast<int>(ceil(ic->rvdw*scale));
2384 /* The code below has some support for handling force-switching, i.e.
2385 * when the force (instead of potential) is switched over a limited
2386 * region. This leads to a constant shift in the potential inside the
2387 * switching region, which we can handle by adding a constant energy
2388 * term in the force-switch case just like when we do potential-shift.
2390 * For now this is not enabled, but to keep the functionality in the
2391 * code we check separately for switch and shift. When we do force-switch
2392 * the shifting point is rvdw_switch, while it is the cutoff when we
2393 * have a classical potential-shift.
2395 * For a pure potential-shift the potential has a constant shift
2396 * all the way out to the cutoff, and that is it. For other forms
2397 * we need to calculate the constant shift up to the point where we
2398 * start modifying the potential.
2400 ri0 = (ic->vdw_modifier == eintmodPOTSHIFT) ? ri1 : ri0;
2402 r0 = ri0/scale;
2403 rc3 = r0*r0*r0;
2404 rc9 = rc3*rc3*rc3;
2406 if ((ic->vdw_modifier == eintmodFORCESWITCH) ||
2407 (ic->vdwtype == evdwSHIFT))
2409 /* Determine the constant energy shift below rvdw_switch.
2410 * Table has a scale factor since we have scaled it down to compensate
2411 * for scaling-up c6/c12 with the derivative factors to save flops in analytical kernels.
2413 fr->enershiftsix = (real)(-1.0/(rc3*rc3)) - 6.0*vdwtab[8*ri0];
2414 fr->enershifttwelve = (real)( 1.0/(rc9*rc3)) - 12.0*vdwtab[8*ri0 + 4];
2416 else if (ic->vdw_modifier == eintmodPOTSHIFT)
2418 fr->enershiftsix = (real)(-1.0/(rc3*rc3));
2419 fr->enershifttwelve = (real)( 1.0/(rc9*rc3));
2422 /* Add the constant part from 0 to rvdw_switch.
2423 * This integration from 0 to rvdw_switch overcounts the number
2424 * of interactions by 1, as it also counts the self interaction.
2425 * We will correct for this later.
2427 eners[0] += 4.0*M_PI*fr->enershiftsix*rc3/3.0;
2428 eners[1] += 4.0*M_PI*fr->enershifttwelve*rc3/3.0;
2430 /* Calculate the contribution in the range [r0,r1] where we
2431 * modify the potential. For a pure potential-shift modifier we will
2432 * have ri0==ri1, and there will not be any contribution here.
2434 for (i = 0; i < 2; i++)
2436 enersum = 0;
2437 virsum = 0;
2438 integrate_table(vdwtab, scale, (i == 0 ? 0 : 4), ri0, ri1, &enersum, &virsum);
2439 eners[i] -= enersum;
2440 virs[i] -= virsum;
2443 /* Alright: Above we compensated by REMOVING the parts outside r0
2444 * corresponding to the ideal VdW 1/r6 and 1/r12 potentials.
2446 * Regardless of whether r0 is the point where we start switching,
2447 * or the cutoff where we calculated the constant shift, we include
2448 * all the parts we are missing out to infinity from r0 by
2449 * calculating the analytical dispersion correction.
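/* Consistency check on the constants below (tails of the plain r^-6 and r^-12
 * terms): integral_rc^inf 4*pi*r^2*(-r^-6) dr = -4*pi/(3*rc^3) and
 * integral_rc^inf 4*pi*r^2*(r^-12) dr = 4*pi/(9*rc^9); the matching virial
 * integrals over 4*pi*r^3*dV/dr give 8*pi/rc^3 and -16*pi/(3*rc^9)
 * (GROMACS' extra factor 0.5 is applied further down). */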
2451 eners[0] += -4.0*M_PI/(3.0*rc3);
2452 eners[1] += 4.0*M_PI/(9.0*rc9);
2453 virs[0] += 8.0*M_PI/rc3;
2454 virs[1] += -16.0*M_PI/(3.0*rc9);
2456 else if (ic->vdwtype == evdwCUT ||
2457 EVDW_PME(ic->vdwtype) ||
2458 ic->vdwtype == evdwUSER)
2460 if (ic->vdwtype == evdwUSER && fplog)
2462 fprintf(fplog,
2463 "WARNING: using dispersion correction with user tables\n");
2466 /* Note that with LJ-PME, the dispersion correction is multiplied
2467 * by the difference between the actual C6 and the value of C6
2468 * that would produce the combination rule.
2469 * This means the normal energy and virial difference formulas
2470 * can be used here.
2473 rc3 = ic->rvdw*ic->rvdw*ic->rvdw;
2474 rc9 = rc3*rc3*rc3;
2475 /* Contribution beyond the cut-off */
2476 eners[0] += -4.0*M_PI/(3.0*rc3);
2477 eners[1] += 4.0*M_PI/(9.0*rc9);
2478 if (ic->vdw_modifier == eintmodPOTSHIFT)
2480 /* Contribution within the cut-off */
2481 eners[0] += -4.0*M_PI/(3.0*rc3);
2482 eners[1] += 4.0*M_PI/(3.0*rc9);
2484 /* Contribution beyond the cut-off */
2485 virs[0] += 8.0*M_PI/rc3;
2486 virs[1] += -16.0*M_PI/(3.0*rc9);
2488 else
2490 gmx_fatal(FARGS,
2491 "Dispersion correction is not implemented for vdw-type = %s",
2492 evdw_names[ic->vdwtype]);
2495 /* When we deprecate the group kernels the code below can go too */
2496 if (ic->vdwtype == evdwPME && fr->cutoff_scheme == ecutsGROUP)
2498 /* Calculate self-interaction coefficient (assuming that
2499 * the reciprocal-space contribution is constant in the
2500 * region that contributes to the self-interaction).
2502 fr->enershiftsix = gmx::power6(ic->ewaldcoeff_lj) / 6.0;
2504 eners[0] += -gmx::power3(std::sqrt(M_PI)*ic->ewaldcoeff_lj)/3.0;
2505 virs[0] += gmx::power3(std::sqrt(M_PI)*ic->ewaldcoeff_lj);
2508 fr->enerdiffsix = eners[0];
2509 fr->enerdifftwelve = eners[1];
2510 /* The 0.5 is due to the Gromacs definition of the virial */
2511 fr->virdiffsix = 0.5*virs[0];
2512 fr->virdifftwelve = 0.5*virs[1];
2516 void calc_dispcorr(t_inputrec *ir, t_forcerec *fr,
2517 matrix box, real lambda, tensor pres, tensor virial,
2518 real *prescorr, real *enercorr, real *dvdlcorr)
2520 gmx_bool bCorrAll, bCorrPres;
2521 real dvdlambda, invvol, dens, ninter, avcsix, avctwelve, enerdiff, svir = 0, spres = 0;
2522 int m;
2524 *prescorr = 0;
2525 *enercorr = 0;
2526 *dvdlcorr = 0;
2528 clear_mat(virial);
2529 clear_mat(pres);
2531 if (ir->eDispCorr != edispcNO)
2533 bCorrAll = (ir->eDispCorr == edispcAllEner ||
2534 ir->eDispCorr == edispcAllEnerPres);
2535 bCorrPres = (ir->eDispCorr == edispcEnerPres ||
2536 ir->eDispCorr == edispcAllEnerPres);
2538 invvol = 1/det(box);
2539 if (fr->n_tpi)
2541 /* Only correct for the interactions with the inserted molecule */
2542 dens = (fr->numAtomsForDispersionCorrection - fr->n_tpi)*invvol;
2543 ninter = fr->n_tpi;
2545 else
2547 dens = fr->numAtomsForDispersionCorrection*invvol;
2548 ninter = 0.5*fr->numAtomsForDispersionCorrection;
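/* Sanity sketch for this (non-TPI) branch: ninter*dens = N*N/(2*V), so with
 * the plain-cutoff value enerdiffsix = -4*pi/(3*rc^3) from calc_enervirdiff,
 * the leading term avcsix*ninter*(dens*enerdiffsix) used below reproduces the
 * textbook tail correction -(2/3)*pi*(N^2/V)*<C6>/rc^3. */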
2551 if (ir->efep == efepNO)
2553 avcsix = fr->avcsix[0];
2554 avctwelve = fr->avctwelve[0];
2556 else
2558 avcsix = (1 - lambda)*fr->avcsix[0] + lambda*fr->avcsix[1];
2559 avctwelve = (1 - lambda)*fr->avctwelve[0] + lambda*fr->avctwelve[1];
2562 enerdiff = ninter*(dens*fr->enerdiffsix - fr->enershiftsix);
2563 *enercorr += avcsix*enerdiff;
2564 dvdlambda = 0.0;
2565 if (ir->efep != efepNO)
2567 dvdlambda += (fr->avcsix[1] - fr->avcsix[0])*enerdiff;
2569 if (bCorrAll)
2571 enerdiff = ninter*(dens*fr->enerdifftwelve - fr->enershifttwelve);
2572 *enercorr += avctwelve*enerdiff;
2573 if (fr->efep != efepNO)
2575 dvdlambda += (fr->avctwelve[1] - fr->avctwelve[0])*enerdiff;
2579 if (bCorrPres)
2581 svir = ninter*dens*avcsix*fr->virdiffsix/3.0;
2582 if (ir->eDispCorr == edispcAllEnerPres)
2584 svir += ninter*dens*avctwelve*fr->virdifftwelve/3.0;
2586 /* The factor 2 is because of the Gromacs virial definition */
2587 spres = -2.0*invvol*svir*PRESFAC;
2589 for (m = 0; m < DIM; m++)
2591 virial[m][m] += svir;
2592 pres[m][m] += spres;
2594 *prescorr += spres;
2597 /* Can't currently control when it prints; for now, just print when debugging */
2598 if (debug)
2600 if (bCorrAll)
2602 fprintf(debug, "Long Range LJ corr.: <C6> %10.4e, <C12> %10.4e\n",
2603 avcsix, avctwelve);
2605 if (bCorrPres)
2607 fprintf(debug,
2608 "Long Range LJ corr.: Epot %10g, Pres: %10g, Vir: %10g\n",
2609 *enercorr, spres, svir);
2611 else
2613 fprintf(debug, "Long Range LJ corr.: Epot %10g\n", *enercorr);
2617 if (fr->efep != efepNO)
2619 *dvdlcorr += dvdlambda;
2624 static void low_do_pbc_mtop(FILE *fplog, int ePBC, const matrix box,
2625 const gmx_mtop_t *mtop, rvec x[],
2626 gmx_bool bFirst)
2628 t_graph *graph;
2629 int mb, as, mol;
2630 gmx_molblock_t *molb;
2632 if (bFirst && fplog)
2634 fprintf(fplog, "Removing pbc first time\n");
2637 snew(graph, 1);
2638 as = 0;
2639 for (mb = 0; mb < mtop->nmolblock; mb++)
2641 molb = &mtop->molblock[mb];
2642 if (molb->natoms_mol == 1 ||
2643 (!bFirst && mtop->moltype[molb->type].cgs.nr == 1))
2645 /* Just one atom or charge group in the molecule, no PBC required */
2646 as += molb->nmol*molb->natoms_mol;
2648 else
2650 /* Pass NULL instead of fplog to avoid graph prints for each molecule type */
2651 mk_graph_ilist(nullptr, mtop->moltype[molb->type].ilist,
2652 0, molb->natoms_mol, FALSE, FALSE, graph);
2654 for (mol = 0; mol < molb->nmol; mol++)
2656 mk_mshift(fplog, graph, ePBC, box, x+as);
2658 shift_self(graph, box, x+as);
2659 /* The molecule is whole now.
2660 * We don't need the second mk_mshift call as in do_pbc_first,
2661 * since we no longer need this graph.
2664 as += molb->natoms_mol;
2666 done_graph(graph);
2669 sfree(graph);
2672 void do_pbc_first_mtop(FILE *fplog, int ePBC, const matrix box,
2673 const gmx_mtop_t *mtop, rvec x[])
2675 low_do_pbc_mtop(fplog, ePBC, box, mtop, x, TRUE);
2678 void do_pbc_mtop(FILE *fplog, int ePBC, const matrix box,
2679 const gmx_mtop_t *mtop, rvec x[])
2681 low_do_pbc_mtop(fplog, ePBC, box, mtop, x, FALSE);
2684 void put_atoms_in_box_omp(int ePBC, const matrix box, gmx::ArrayRef<gmx::RVec> x)
2686 int t, nth;
2687 nth = gmx_omp_nthreads_get(emntDefault);
2689 #pragma omp parallel for num_threads(nth) schedule(static)
2690 for (t = 0; t < nth; t++)
2694 size_t natoms = x.size();
2695 size_t offset = (natoms*t )/nth;
2696 size_t len = (natoms*(t + 1))/nth - offset;
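/* Example of the contiguous split: natoms = 10, nth = 4 gives chunks of
 * 2, 3, 2, 3 atoms at offsets 0, 2, 5, 7; the integer arithmetic above
 * tiles [0, natoms) exactly with no overlap. */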
2697 put_atoms_in_box(ePBC, box, x.subArray(offset, len));
2699 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
2703 // TODO This can be cleaned up a lot, and moved back to runner.cpp
2704 void finish_run(FILE *fplog, const gmx::MDLogger &mdlog, t_commrec *cr,
2705 const t_inputrec *inputrec,
2706 t_nrnb nrnb[], gmx_wallcycle_t wcycle,
2707 gmx_walltime_accounting_t walltime_accounting,
2708 nonbonded_verlet_t *nbv,
2709 gmx_pme_t *pme,
2710 gmx_bool bWriteStat)
2712 t_nrnb *nrnb_tot = nullptr;
2713 double delta_t = 0;
2714 double nbfs = 0, mflop = 0;
2715 double elapsed_time,
2716 elapsed_time_over_all_ranks,
2717 elapsed_time_over_all_threads,
2718 elapsed_time_over_all_threads_over_all_ranks;
2719 /* Control whether it is valid to print a report. Only the
2720 simulation master may print, but it should not do so if the run
2721 terminated e.g. before a scheduled reset step. This is
2722 complicated by the fact that PME ranks are unaware of the
2723 reason why they were sent a pmerecvqxFINISH. To avoid
2724 communication deadlocks, we always do the communication for the
2725 report, even if we've decided not to write the report, because
2726 how long it takes to finish the run is not important when we've
2727 decided not to report on the simulation performance.
2729 Further, we only report performance for dynamical integrators,
2730 because those are the only ones for which we plan to
2731 consider doing any optimizations. */
2732 bool printReport = EI_DYNAMICS(inputrec->eI) && SIMMASTER(cr);
2734 if (printReport && !walltime_accounting_get_valid_finish(walltime_accounting))
2736 GMX_LOG(mdlog.warning).asParagraph().appendText("Simulation ended prematurely, no performance report will be written.");
2737 printReport = false;
2740 if (cr->nnodes > 1)
2742 snew(nrnb_tot, 1);
2743 #if GMX_MPI
2744 MPI_Allreduce(nrnb->n, nrnb_tot->n, eNRNB, MPI_DOUBLE, MPI_SUM,
2745 cr->mpi_comm_mysim);
2746 #endif
2748 else
2750 nrnb_tot = nrnb;
2753 elapsed_time = walltime_accounting_get_elapsed_time(walltime_accounting);
2754 elapsed_time_over_all_ranks = elapsed_time;
2755 elapsed_time_over_all_threads = walltime_accounting_get_elapsed_time_over_all_threads(walltime_accounting);
2756 elapsed_time_over_all_threads_over_all_ranks = elapsed_time_over_all_threads;
2757 #if GMX_MPI
2758 if (cr->nnodes > 1)
2760 /* reduce elapsed_time over all MPI ranks in the current simulation */
2761 MPI_Allreduce(&elapsed_time,
2762 &elapsed_time_over_all_ranks,
2763 1, MPI_DOUBLE, MPI_SUM,
2764 cr->mpi_comm_mysim);
2765 elapsed_time_over_all_ranks /= cr->nnodes;
2766 /* Reduce elapsed_time_over_all_threads over all MPI ranks in the
2767 * current simulation. */
2768 MPI_Allreduce(&elapsed_time_over_all_threads,
2769 &elapsed_time_over_all_threads_over_all_ranks,
2770 1, MPI_DOUBLE, MPI_SUM,
2771 cr->mpi_comm_mysim);
2773 #endif
2775 if (printReport)
2777 print_flop(fplog, nrnb_tot, &nbfs, &mflop);
2779 if (cr->nnodes > 1)
2781 sfree(nrnb_tot);
2784 if (thisRankHasDuty(cr, DUTY_PP) && DOMAINDECOMP(cr))
2786 print_dd_statistics(cr, inputrec, fplog);
2789 /* TODO Move the responsibility for any scaling by thread counts
2790 * to the code that handled the thread region, so that there's a
2791 * mechanism to keep cycle counting working during the transition
2792 * to task parallelism. */
2793 int nthreads_pp = gmx_omp_nthreads_get(emntNonbonded);
2794 int nthreads_pme = gmx_omp_nthreads_get(emntPME);
2795 wallcycle_scale_by_num_threads(wcycle, thisRankHasDuty(cr, DUTY_PME) && !thisRankHasDuty(cr, DUTY_PP), nthreads_pp, nthreads_pme);
2796 auto cycle_sum(wallcycle_sum(cr, wcycle));
2798 if (printReport)
2800 auto nbnxn_gpu_timings = use_GPU(nbv) ? nbnxn_gpu_get_timings(nbv->gpu_nbv) : nullptr;
2801 gmx_wallclock_gpu_pme_t pme_gpu_timings = {};
2802 if (pme_gpu_task_enabled(pme))
2804 pme_gpu_get_timings(pme, &pme_gpu_timings);
2806 wallcycle_print(fplog, mdlog, cr->nnodes, cr->npmenodes, nthreads_pp, nthreads_pme,
2807 elapsed_time_over_all_ranks,
2808 wcycle, cycle_sum,
2809 nbnxn_gpu_timings,
2810 &pme_gpu_timings);
2812 if (EI_DYNAMICS(inputrec->eI))
2814 delta_t = inputrec->delta_t;
2817 if (fplog)
2819 print_perf(fplog, elapsed_time_over_all_threads_over_all_ranks,
2820 elapsed_time_over_all_ranks,
2821 walltime_accounting_get_nsteps_done(walltime_accounting),
2822 delta_t, nbfs, mflop);
2824 if (bWriteStat)
2826 print_perf(stderr, elapsed_time_over_all_threads_over_all_ranks,
2827 elapsed_time_over_all_ranks,
2828 walltime_accounting_get_nsteps_done(walltime_accounting),
2829 delta_t, nbfs, mflop);
2834 extern void initialize_lambdas(FILE *fplog, t_inputrec *ir, int *fep_state, gmx::ArrayRef<real> lambda, double *lam0)
2836 /* this function works, but could probably use a logic rewrite to keep all the different
2837 types of efep straight. */
2839 if ((ir->efep == efepNO) && (ir->bSimTemp == FALSE))
2841 return;
2844 t_lambda *fep = ir->fepvals;
2845 *fep_state = fep->init_fep_state; /* this might overwrite the checkpoint
2846 if checkpoint is set -- a kludge is in for now
2847 to prevent this.*/
2849 for (int i = 0; i < efptNR; i++)
2851 /* overwrite lambda state with init_lambda for now for backwards compatibility */
2852 if (fep->init_lambda >= 0) /* if it's -1, it was never initialized */
2854 lambda[i] = fep->init_lambda;
2855 if (lam0)
2857 lam0[i] = lambda[i];
2860 else
2862 lambda[i] = fep->all_lambda[i][*fep_state];
2863 if (lam0)
2865 lam0[i] = lambda[i];
2869 if (ir->bSimTemp)
2871 /* need to rescale control temperatures to match current state */
2872 for (int i = 0; i < ir->opts.ngtc; i++)
2874 if (ir->opts.ref_t[i] > 0)
2876 ir->opts.ref_t[i] = ir->simtempvals->temperatures[*fep_state];
2881 /* Send to the log the information on the current lambdas */
2882 if (fplog != nullptr)
2884 fprintf(fplog, "Initial vector of lambda components:[ ");
2885 for (const auto &l : lambda)
2887 fprintf(fplog, "%10.4f ", l);
2889 fprintf(fplog, "]\n");
2891 return;
2895 void init_md(FILE *fplog,
2896 t_commrec *cr, gmx::IMDOutputProvider *outputProvider,
2897 t_inputrec *ir, const gmx_output_env_t *oenv,
2898 const MdrunOptions &mdrunOptions,
2899 double *t, double *t0,
2900 t_state *globalState, double *lam0,
2901 t_nrnb *nrnb, gmx_mtop_t *mtop,
2902 gmx_update_t **upd,
2903 int nfile, const t_filenm fnm[],
2904 gmx_mdoutf_t *outf, t_mdebin **mdebin,
2905 tensor force_vir, tensor shake_vir, rvec mu_tot,
2906 gmx_bool *bSimAnn, t_vcm **vcm,
2907 gmx_wallcycle_t wcycle)
2909 int i;
2911 /* Initial values */
2912 *t = *t0 = ir->init_t;
2914 *bSimAnn = FALSE;
2915 for (i = 0; i < ir->opts.ngtc; i++)
2917 /* set bSimAnn if any group is being annealed */
2918 if (ir->opts.annealing[i] != eannNO)
2920 *bSimAnn = TRUE;
2924 /* Initialize lambda variables */
2925 /* TODO: Clean up initialization of fep_state and lambda in t_state.
2926 * We currently need to call initialize_lambdas on non-master ranks
2927 * to initialize lam0.
2929 if (MASTER(cr))
2931 initialize_lambdas(fplog, ir, &globalState->fep_state, globalState->lambda, lam0);
2933 else
2935 int tmpFepState;
2936 std::array<real, efptNR> tmpLambda;
2937 initialize_lambdas(fplog, ir, &tmpFepState, tmpLambda, lam0);
2940 // TODO upd is never NULL in practice, but the analysers don't know that
2941 if (upd)
2943 *upd = init_update(ir);
2945 if (*bSimAnn)
2947 update_annealing_target_temp(ir, ir->init_t, upd ? *upd : nullptr);
2950 if (vcm != nullptr)
2952 *vcm = init_vcm(fplog, &mtop->groups, ir);
2955 if (EI_DYNAMICS(ir->eI) && !mdrunOptions.continuationOptions.appendFiles)
2957 if (ir->etc == etcBERENDSEN)
2959 please_cite(fplog, "Berendsen84a");
2961 if (ir->etc == etcVRESCALE)
2963 please_cite(fplog, "Bussi2007a");
2965 if (ir->eI == eiSD1)
2967 please_cite(fplog, "Goga2012");
2970 init_nrnb(nrnb);
2972 if (nfile != -1)
2974 *outf = init_mdoutf(fplog, nfile, fnm, mdrunOptions, cr, outputProvider, ir, mtop, oenv, wcycle);
2976 *mdebin = init_mdebin(mdrunOptions.continuationOptions.appendFiles ? nullptr : mdoutf_get_fp_ene(*outf),
2977 mtop, ir, mdoutf_get_fp_dhdl(*outf));
2980 /* Initiate variables */
2981 clear_mat(force_vir);
2982 clear_mat(shake_vir);
2983 clear_rvec(mu_tot);