Remove gmx custom fixed int (e.g. gmx_int64_t) types
[gromacs.git] / src / gromacs / mdlib / sim_util.cpp
blob 9cfc324715f4728442454c5445af36227e96453b
1 /*
2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5 * Copyright (c) 2001-2004, The GROMACS development team.
6 * Copyright (c) 2013,2014,2015,2016,2017,2018, by the GROMACS development team, led by
7 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
8 * and including many others, as listed in the AUTHORS file in the
9 * top-level source directory and at http://www.gromacs.org.
11 * GROMACS is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public License
13 * as published by the Free Software Foundation; either version 2.1
14 * of the License, or (at your option) any later version.
16 * GROMACS is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * Lesser General Public License for more details.
21 * You should have received a copy of the GNU Lesser General Public
22 * License along with GROMACS; if not, see
23 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
24 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
26 * If you want to redistribute modifications to GROMACS, please
27 * consider that scientific software is very special. Version
28 * control is crucial - bugs must be traceable. We will be happy to
29 * consider code for inclusion in the official distribution, but
30 * derived work must not be called official GROMACS. Details are found
31 * in the README & COPYING files - if they are missing, get the
32 * official version at http://www.gromacs.org.
34 * To help us fund GROMACS development, we humbly ask that you cite
35 * the research papers on the package. Check out http://www.gromacs.org.
37 #include "gmxpre.h"
39 #include "sim_util.h"
41 #include "config.h"
43 #include <stdio.h>
44 #include <string.h>
46 #include <cmath>
47 #include <cstdint>
49 #include <array>
51 #include "gromacs/awh/awh.h"
52 #include "gromacs/domdec/dlbtiming.h"
53 #include "gromacs/domdec/domdec.h"
54 #include "gromacs/domdec/domdec_struct.h"
55 #include "gromacs/essentialdynamics/edsam.h"
56 #include "gromacs/ewald/pme.h"
57 #include "gromacs/gmxlib/chargegroup.h"
58 #include "gromacs/gmxlib/network.h"
59 #include "gromacs/gmxlib/nrnb.h"
60 #include "gromacs/gmxlib/nonbonded/nb_free_energy.h"
61 #include "gromacs/gmxlib/nonbonded/nb_kernel.h"
62 #include "gromacs/gmxlib/nonbonded/nonbonded.h"
63 #include "gromacs/gpu_utils/gpu_utils.h"
64 #include "gromacs/imd/imd.h"
65 #include "gromacs/listed-forces/bonded.h"
66 #include "gromacs/listed-forces/disre.h"
67 #include "gromacs/listed-forces/orires.h"
68 #include "gromacs/math/functions.h"
69 #include "gromacs/math/units.h"
70 #include "gromacs/math/vec.h"
71 #include "gromacs/math/vecdump.h"
72 #include "gromacs/mdlib/calcmu.h"
73 #include "gromacs/mdlib/calcvir.h"
74 #include "gromacs/mdlib/constr.h"
75 #include "gromacs/mdlib/force.h"
76 #include "gromacs/mdlib/forcerec.h"
77 #include "gromacs/mdlib/gmx_omp_nthreads.h"
78 #include "gromacs/mdlib/mdrun.h"
79 #include "gromacs/mdlib/nb_verlet.h"
80 #include "gromacs/mdlib/nbnxn_atomdata.h"
81 #include "gromacs/mdlib/nbnxn_gpu_data_mgmt.h"
82 #include "gromacs/mdlib/nbnxn_grid.h"
83 #include "gromacs/mdlib/nbnxn_search.h"
84 #include "gromacs/mdlib/qmmm.h"
85 #include "gromacs/mdlib/update.h"
86 #include "gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_gpu_ref.h"
87 #include "gromacs/mdtypes/commrec.h"
88 #include "gromacs/mdtypes/forceoutput.h"
89 #include "gromacs/mdtypes/iforceprovider.h"
90 #include "gromacs/mdtypes/inputrec.h"
91 #include "gromacs/mdtypes/md_enums.h"
92 #include "gromacs/mdtypes/state.h"
93 #include "gromacs/pbcutil/ishift.h"
94 #include "gromacs/pbcutil/mshift.h"
95 #include "gromacs/pbcutil/pbc.h"
96 #include "gromacs/pulling/pull.h"
97 #include "gromacs/pulling/pull_rotation.h"
98 #include "gromacs/timing/cyclecounter.h"
99 #include "gromacs/timing/gpu_timing.h"
100 #include "gromacs/timing/wallcycle.h"
101 #include "gromacs/timing/wallcyclereporting.h"
102 #include "gromacs/timing/walltime_accounting.h"
103 #include "gromacs/topology/topology.h"
104 #include "gromacs/utility/arrayref.h"
105 #include "gromacs/utility/basedefinitions.h"
106 #include "gromacs/utility/cstringutil.h"
107 #include "gromacs/utility/exceptions.h"
108 #include "gromacs/utility/fatalerror.h"
109 #include "gromacs/utility/gmxassert.h"
110 #include "gromacs/utility/gmxmpi.h"
111 #include "gromacs/utility/logger.h"
112 #include "gromacs/utility/pleasecite.h"
113 #include "gromacs/utility/smalloc.h"
114 #include "gromacs/utility/sysinfo.h"
116 #include "nbnxn_gpu.h"
117 #include "nbnxn_kernels/nbnxn_kernel_cpu.h"
118 #include "nbnxn_kernels/nbnxn_kernel_prune.h"
120 // TODO: this environment variable allows us to verify before release
121 // that on less common architectures the total cost of polling is not larger than
122 // a blocking wait (so polling does not introduce overhead when the static
123 // PME-first ordering would suffice).
124 static const bool c_disableAlternatingWait = (getenv("GMX_DISABLE_ALTERNATING_GPU_WAIT") != nullptr);
127 void print_time(FILE *out,
128 gmx_walltime_accounting_t walltime_accounting,
129 int64_t step,
130 t_inputrec *ir,
131 const t_commrec *cr)
133 time_t finish;
134 char timebuf[STRLEN];
135 double dt, elapsed_seconds, time_per_step;
136 char buf[48];
138 #if !GMX_THREAD_MPI
139 if (!PAR(cr))
140 #endif
142 fprintf(out, "\r");
144 fprintf(out, "step %s", gmx_step_str(step, buf));
145 fflush(out);
147 if ((step >= ir->nstlist))
149 double seconds_since_epoch = gmx_gettime();
150 elapsed_seconds = seconds_since_epoch - walltime_accounting_get_start_time_stamp(walltime_accounting);
151 time_per_step = elapsed_seconds/(step - ir->init_step + 1);
152 dt = (ir->nsteps + ir->init_step - step) * time_per_step;
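/* time_per_step is the average wall-clock time per MD step so far;
 * dt extrapolates that average over the steps that remain
 * (init_step + nsteps - step). */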
154 if (ir->nsteps >= 0)
156 if (dt >= 300)
158 finish = (time_t) (seconds_since_epoch + dt);
159 gmx_ctime_r(&finish, timebuf, STRLEN);
160 sprintf(buf, "%s", timebuf);
161 buf[strlen(buf)-1] = '\0';
162 fprintf(out, ", will finish %s", buf);
164 else
166 fprintf(out, ", remaining wall clock time: %5d s ", (int)dt);
169 else
171 fprintf(out, " performance: %.1f ns/day ",
172 ir->delta_t/1000*24*60*60/time_per_step);
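/* delta_t is in ps, so delta_t/1000 is ns per step; multiplied by
 * 86400 s/day and divided by the wall-clock seconds per step this
 * gives the ns/day figure printed above. */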
175 #if !GMX_THREAD_MPI
176 if (PAR(cr))
178 fprintf(out, "\n");
180 #else
181 GMX_UNUSED_VALUE(cr);
182 #endif
184 fflush(out);
187 void print_date_and_time(FILE *fplog, int nodeid, const char *title,
188 double the_time)
190 char time_string[STRLEN];
192 if (!fplog)
194 return;
198 int i;
199 char timebuf[STRLEN];
200 time_t temp_time = (time_t) the_time;
202 gmx_ctime_r(&temp_time, timebuf, STRLEN);
203 for (i = 0; timebuf[i] >= ' '; i++)
205 time_string[i] = timebuf[i];
207 time_string[i] = '\0';
210 fprintf(fplog, "%s on rank %d %s\n", title, nodeid, time_string);
213 void print_start(FILE *fplog, const t_commrec *cr,
214 gmx_walltime_accounting_t walltime_accounting,
215 const char *name)
217 char buf[STRLEN];
219 sprintf(buf, "Started %s", name);
220 print_date_and_time(fplog, cr->nodeid, buf,
221 walltime_accounting_get_start_time_stamp(walltime_accounting));
224 static void sum_forces(rvec f[], gmx::ArrayRef<const gmx::RVec> forceToAdd)
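/* Element-wise accumulation f[i] += forceToAdd[i] over the first
 * forceToAdd.size() entries, parallelized over OpenMP threads. */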
226 const int end = forceToAdd.size();
228 // cppcheck-suppress unreadVariable
229 int gmx_unused nt = gmx_omp_nthreads_get(emntDefault);
230 #pragma omp parallel for num_threads(nt) schedule(static)
231 for (int i = 0; i < end; i++)
233 rvec_inc(f[i], forceToAdd[i]);
237 static void pme_gpu_reduce_outputs(gmx_wallcycle_t wcycle,
238 gmx::ForceWithVirial *forceWithVirial,
239 gmx::ArrayRef<const gmx::RVec> pmeForces,
240 gmx_enerdata_t *enerd,
241 const tensor vir_Q,
242 real Vlr_q)
244 wallcycle_start(wcycle, ewcPME_GPU_F_REDUCTION);
245 GMX_ASSERT(forceWithVirial, "Invalid force pointer");
246 forceWithVirial->addVirialContribution(vir_Q);
247 enerd->term[F_COUL_RECIP] += Vlr_q;
248 sum_forces(as_rvec_array(forceWithVirial->force_.data()), pmeForces);
249 wallcycle_stop(wcycle, ewcPME_GPU_F_REDUCTION);
252 static void calc_virial(int start, int homenr, rvec x[], rvec f[],
253 tensor vir_part, t_graph *graph, matrix box,
254 t_nrnb *nrnb, const t_forcerec *fr, int ePBC)
256 int i;
258 /* The short-range virial from surrounding boxes */
259 calc_vir(SHIFTS, fr->shift_vec, fr->fshift, vir_part, ePBC == epbcSCREW, box);
260 inc_nrnb(nrnb, eNR_VIRIAL, SHIFTS);
262 /* Calculate partial virial, for local atoms only, based on short range.
263 * Total virial is computed in global_stat, called from do_md
265 f_calc_vir(start, start+homenr, x, f, vir_part, graph, box);
266 inc_nrnb(nrnb, eNR_VIRIAL, homenr);
268 /* Add wall contribution */
269 for (i = 0; i < DIM; i++)
271 vir_part[i][ZZ] += fr->vir_wall_z[i];
274 if (debug)
276 pr_rvecs(debug, 0, "vir_part", vir_part, DIM);
280 static void pull_potential_wrapper(const t_commrec *cr,
281 const t_inputrec *ir,
282 matrix box, gmx::ArrayRef<const gmx::RVec> x,
283 gmx::ForceWithVirial *force,
284 const t_mdatoms *mdatoms,
285 gmx_enerdata_t *enerd,
286 real *lambda,
287 double t,
288 gmx_wallcycle_t wcycle)
290 t_pbc pbc;
291 real dvdl;
293 /* Calculate the center of mass forces, this requires communication,
294 * which is why pull_potential is called close to other communication.
296 wallcycle_start(wcycle, ewcPULLPOT);
297 set_pbc(&pbc, ir->ePBC, box);
298 dvdl = 0;
299 enerd->term[F_COM_PULL] +=
300 pull_potential(ir->pull_work, mdatoms, &pbc,
301 cr, t, lambda[efptRESTRAINT], as_rvec_array(x.data()), force, &dvdl);
302 enerd->dvdl_lin[efptRESTRAINT] += dvdl;
303 wallcycle_stop(wcycle, ewcPULLPOT);
306 static void pme_receive_force_ener(const t_commrec *cr,
307 gmx::ForceWithVirial *forceWithVirial,
308 gmx_enerdata_t *enerd,
309 gmx_wallcycle_t wcycle)
311 real e_q, e_lj, dvdl_q, dvdl_lj;
312 float cycles_ppdpme, cycles_seppme;
314 cycles_ppdpme = wallcycle_stop(wcycle, ewcPPDURINGPME);
315 dd_cycles_add(cr->dd, cycles_ppdpme, ddCyclPPduringPME);
317 /* In case of node-splitting, the PP nodes receive the long-range
318 * forces, virial and energy from the PME nodes here.
320 wallcycle_start(wcycle, ewcPP_PMEWAITRECVF);
321 dvdl_q = 0;
322 dvdl_lj = 0;
323 gmx_pme_receive_f(cr, forceWithVirial, &e_q, &e_lj, &dvdl_q, &dvdl_lj,
324 &cycles_seppme);
325 enerd->term[F_COUL_RECIP] += e_q;
326 enerd->term[F_LJ_RECIP] += e_lj;
327 enerd->dvdl_lin[efptCOUL] += dvdl_q;
328 enerd->dvdl_lin[efptVDW] += dvdl_lj;
330 if (wcycle)
332 dd_cycles_add(cr->dd, cycles_seppme, ddCyclPME);
334 wallcycle_stop(wcycle, ewcPP_PMEWAITRECVF);
337 static void print_large_forces(FILE *fp,
338 const t_mdatoms *md,
339 const t_commrec *cr,
340 int64_t step,
341 real forceTolerance,
342 const rvec *x,
343 const rvec *f)
345 real force2Tolerance = gmx::square(forceTolerance);
346 std::uintmax_t numNonFinite = 0;
347 for (int i = 0; i < md->homenr; i++)
349 real force2 = norm2(f[i]);
350 bool nonFinite = !std::isfinite(force2);
351 if (force2 >= force2Tolerance || nonFinite)
353 fprintf(fp, "step %" PRId64 " atom %6d x %8.3f %8.3f %8.3f force %12.5e\n",
354 step,
355 ddglatnr(cr->dd, i), x[i][XX], x[i][YY], x[i][ZZ], std::sqrt(force2));
357 if (nonFinite)
359 numNonFinite++;
362 if (numNonFinite > 0)
364 /* Note that with MPI this fatal call on one rank might interrupt
365 * the printing on other ranks. But we can only avoid that with
366 * an expensive MPI barrier that we would need at each step.
368 gmx_fatal(FARGS, "At step %" PRId64 " detected non-finite forces on %ju atoms", step, numNonFinite);
372 static void post_process_forces(const t_commrec *cr,
373 int64_t step,
374 t_nrnb *nrnb,
375 gmx_wallcycle_t wcycle,
376 const gmx_localtop_t *top,
377 matrix box,
378 rvec x[],
379 rvec f[],
380 gmx::ForceWithVirial *forceWithVirial,
381 tensor vir_force,
382 const t_mdatoms *mdatoms,
383 t_graph *graph,
384 t_forcerec *fr,
385 const gmx_vsite_t *vsite,
386 int flags)
388 if (fr->haveDirectVirialContributions)
390 rvec *fDirectVir = as_rvec_array(forceWithVirial->force_.data());
392 if (vsite)
394 /* Spread the mesh force on virtual sites to the other particles...
395 * This is parallelized. MPI communication is performed
396 * if the constructing atoms aren't local.
398 matrix virial = { { 0 } };
399 spread_vsite_f(vsite, x, fDirectVir, nullptr,
400 (flags & GMX_FORCE_VIRIAL), virial,
401 nrnb,
402 &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr, wcycle);
403 forceWithVirial->addVirialContribution(virial);
406 if (flags & GMX_FORCE_VIRIAL)
408 /* Now add the forces, this is local */
409 sum_forces(f, forceWithVirial->force_);
411 /* Add the direct virial contributions */
412 GMX_ASSERT(forceWithVirial->computeVirial_, "forceWithVirial should request virial computation when we request the virial");
413 m_add(vir_force, forceWithVirial->getVirial(), vir_force);
415 if (debug)
417 pr_rvecs(debug, 0, "vir_force", vir_force, DIM);
422 if (fr->print_force >= 0)
424 print_large_forces(stderr, mdatoms, cr, step, fr->print_force, x, f);
428 static void do_nb_verlet(t_forcerec *fr,
429 interaction_const_t *ic,
430 gmx_enerdata_t *enerd,
431 int flags, int ilocality,
432 int clearF,
433 int64_t step,
434 t_nrnb *nrnb,
435 gmx_wallcycle_t wcycle)
437 if (!(flags & GMX_FORCE_NONBONDED))
439 /* skip non-bonded calculation */
440 return;
443 nonbonded_verlet_t *nbv = fr->nbv;
444 nonbonded_verlet_group_t *nbvg = &nbv->grp[ilocality];
446 /* GPU kernel launch overhead is already timed separately */
447 if (fr->cutoff_scheme != ecutsVERLET)
449 gmx_incons("Invalid cut-off scheme passed!");
452 bool bUsingGpuKernels = (nbvg->kernel_type == nbnxnk8x8x8_GPU);
454 if (!bUsingGpuKernels)
456 /* When dynamic pair-list pruning is requested, we need to prune
457 * at nstlistPrune steps.
459 if (nbv->listParams->useDynamicPruning &&
460 (step - nbvg->nbl_lists.outerListCreationStep) % nbv->listParams->nstlistPrune == 0)
462 /* Prune the pair-list beyond fr->ic->rlistPrune using
463 * the current coordinates of the atoms.
465 wallcycle_sub_start(wcycle, ewcsNONBONDED_PRUNING);
466 nbnxn_kernel_cpu_prune(nbvg, nbv->nbat, fr->shift_vec, nbv->listParams->rlistInner);
467 wallcycle_sub_stop(wcycle, ewcsNONBONDED_PRUNING);
470 wallcycle_sub_start(wcycle, ewcsNONBONDED);
473 switch (nbvg->kernel_type)
475 case nbnxnk4x4_PlainC:
476 case nbnxnk4xN_SIMD_4xN:
477 case nbnxnk4xN_SIMD_2xNN:
478 nbnxn_kernel_cpu(nbvg,
479 nbv->nbat,
481 fr->shift_vec,
482 flags,
483 clearF,
484 fr->fshift[0],
485 enerd->grpp.ener[egCOULSR],
486 fr->bBHAM ?
487 enerd->grpp.ener[egBHAMSR] :
488 enerd->grpp.ener[egLJSR]);
489 break;
491 case nbnxnk8x8x8_GPU:
492 nbnxn_gpu_launch_kernel(nbv->gpu_nbv, nbv->nbat, flags, ilocality);
493 break;
495 case nbnxnk8x8x8_PlainC:
496 nbnxn_kernel_gpu_ref(nbvg->nbl_lists.nbl[0],
497 nbv->nbat, ic,
498 fr->shift_vec,
499 flags,
500 clearF,
501 nbv->nbat->out[0].f,
502 fr->fshift[0],
503 enerd->grpp.ener[egCOULSR],
504 fr->bBHAM ?
505 enerd->grpp.ener[egBHAMSR] :
506 enerd->grpp.ener[egLJSR]);
507 break;
509 default:
510 GMX_RELEASE_ASSERT(false, "Invalid nonbonded kernel type passed!");
513 if (!bUsingGpuKernels)
515 wallcycle_sub_stop(wcycle, ewcsNONBONDED);
518 int enr_nbnxn_kernel_ljc, enr_nbnxn_kernel_lj;
519 if (EEL_RF(ic->eeltype) || ic->eeltype == eelCUT)
521 enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_RF;
523 else if ((!bUsingGpuKernels && nbvg->ewald_excl == ewaldexclAnalytical) ||
524 (bUsingGpuKernels && nbnxn_gpu_is_kernel_ewald_analytical(nbv->gpu_nbv)))
526 enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_EWALD;
528 else
530 enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_TAB;
532 enr_nbnxn_kernel_lj = eNR_NBNXN_LJ;
533 if (flags & GMX_FORCE_ENERGY)
535 /* In eNR_??? the nbnxn F+E kernels are always the F kernel + 1 */
536 enr_nbnxn_kernel_ljc += 1;
537 enr_nbnxn_kernel_lj += 1;
540 inc_nrnb(nrnb, enr_nbnxn_kernel_ljc,
541 nbvg->nbl_lists.natpair_ljq);
542 inc_nrnb(nrnb, enr_nbnxn_kernel_lj,
543 nbvg->nbl_lists.natpair_lj);
544 /* The Coulomb-only kernels are offset -eNR_NBNXN_LJ_RF+eNR_NBNXN_RF */
545 inc_nrnb(nrnb, enr_nbnxn_kernel_ljc-eNR_NBNXN_LJ_RF+eNR_NBNXN_RF,
546 nbvg->nbl_lists.natpair_q);
548 if (ic->vdw_modifier == eintmodFORCESWITCH)
550 /* We add up the switch cost separately */
551 inc_nrnb(nrnb, eNR_NBNXN_ADD_LJ_FSW+((flags & GMX_FORCE_ENERGY) ? 1 : 0),
552 nbvg->nbl_lists.natpair_ljq + nbvg->nbl_lists.natpair_lj);
554 if (ic->vdw_modifier == eintmodPOTSWITCH)
556 /* We add up the switch cost separately */
557 inc_nrnb(nrnb, eNR_NBNXN_ADD_LJ_PSW+((flags & GMX_FORCE_ENERGY) ? 1 : 0),
558 nbvg->nbl_lists.natpair_ljq + nbvg->nbl_lists.natpair_lj);
560 if (ic->vdwtype == evdwPME)
562 /* We add up the LJ Ewald cost separately */
563 inc_nrnb(nrnb, eNR_NBNXN_ADD_LJ_EWALD+((flags & GMX_FORCE_ENERGY) ? 1 : 0),
564 nbvg->nbl_lists.natpair_ljq + nbvg->nbl_lists.natpair_lj);
568 static void do_nb_verlet_fep(nbnxn_pairlist_set_t *nbl_lists,
569 t_forcerec *fr,
570 rvec x[],
571 rvec f[],
572 const t_mdatoms *mdatoms,
573 t_lambda *fepvals,
574 real *lambda,
575 gmx_enerdata_t *enerd,
576 int flags,
577 t_nrnb *nrnb,
578 gmx_wallcycle_t wcycle)
580 int donb_flags;
581 nb_kernel_data_t kernel_data;
582 real lam_i[efptNR];
583 real dvdl_nb[efptNR];
584 int th;
585 int i, j;
587 donb_flags = 0;
588 /* Add short-range interactions */
589 donb_flags |= GMX_NONBONDED_DO_SR;
591 /* Currently all group scheme kernels always calculate (shift-)forces */
592 if (flags & GMX_FORCE_FORCES)
594 donb_flags |= GMX_NONBONDED_DO_FORCE;
596 if (flags & GMX_FORCE_VIRIAL)
598 donb_flags |= GMX_NONBONDED_DO_SHIFTFORCE;
600 if (flags & GMX_FORCE_ENERGY)
602 donb_flags |= GMX_NONBONDED_DO_POTENTIAL;
605 kernel_data.flags = donb_flags;
606 kernel_data.lambda = lambda;
607 kernel_data.dvdl = dvdl_nb;
609 kernel_data.energygrp_elec = enerd->grpp.ener[egCOULSR];
610 kernel_data.energygrp_vdw = enerd->grpp.ener[egLJSR];
612 /* reset free energy components */
613 for (i = 0; i < efptNR; i++)
615 dvdl_nb[i] = 0;
618 GMX_ASSERT(gmx_omp_nthreads_get(emntNonbonded) == nbl_lists->nnbl, "Number of lists should be the same as the number of NB threads");
620 wallcycle_sub_start(wcycle, ewcsNONBONDED);
621 #pragma omp parallel for schedule(static) num_threads(nbl_lists->nnbl)
622 for (th = 0; th < nbl_lists->nnbl; th++)
626 gmx_nb_free_energy_kernel(nbl_lists->nbl_fep[th],
627 x, f, fr, mdatoms, &kernel_data, nrnb);
629 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
632 if (fepvals->sc_alpha != 0)
634 enerd->dvdl_nonlin[efptVDW] += dvdl_nb[efptVDW];
635 enerd->dvdl_nonlin[efptCOUL] += dvdl_nb[efptCOUL];
637 else
639 enerd->dvdl_lin[efptVDW] += dvdl_nb[efptVDW];
640 enerd->dvdl_lin[efptCOUL] += dvdl_nb[efptCOUL];
643 /* If we do foreign lambda and we have soft-core interactions
644 * we have to recalculate the (non-linear) energy contributions.
646 if (fepvals->n_lambda > 0 && (flags & GMX_FORCE_DHDL) && fepvals->sc_alpha != 0)
648 kernel_data.flags = (donb_flags & ~(GMX_NONBONDED_DO_FORCE | GMX_NONBONDED_DO_SHIFTFORCE)) | GMX_NONBONDED_DO_FOREIGNLAMBDA;
649 kernel_data.lambda = lam_i;
650 kernel_data.energygrp_elec = enerd->foreign_grpp.ener[egCOULSR];
651 kernel_data.energygrp_vdw = enerd->foreign_grpp.ener[egLJSR];
652 /* Note that we add to kernel_data.dvdl, but ignore the result */
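/* The loop below runs the kernel once per lambda point: i == 0 uses
 * the current lambda state, i > 0 the foreign lambdas from
 * fepvals->all_lambda, and each total is accumulated into
 * enerd->enerpart_lambda[i]. */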
654 for (i = 0; i < enerd->n_lambda; i++)
656 for (j = 0; j < efptNR; j++)
658 lam_i[j] = (i == 0 ? lambda[j] : fepvals->all_lambda[j][i-1]);
660 reset_foreign_enerdata(enerd);
661 #pragma omp parallel for schedule(static) num_threads(nbl_lists->nnbl)
662 for (th = 0; th < nbl_lists->nnbl; th++)
666 gmx_nb_free_energy_kernel(nbl_lists->nbl_fep[th],
667 x, f, fr, mdatoms, &kernel_data, nrnb);
669 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
672 sum_epot(&(enerd->foreign_grpp), enerd->foreign_term);
673 enerd->enerpart_lambda[i] += enerd->foreign_term[F_EPOT];
677 wallcycle_sub_stop(wcycle, ewcsNONBONDED);
680 gmx_bool use_GPU(const nonbonded_verlet_t *nbv)
682 return nbv != nullptr && nbv->bUseGPU;
685 static inline void clear_rvecs_omp(int n, rvec v[])
687 int nth = gmx_omp_nthreads_get_simple_rvec_task(emntDefault, n);
689 /* Note that we would like to avoid this conditional by putting it
690 * into the omp pragma instead, but then we still take the full
691 * omp parallel for overhead (at least with gcc5).
693 if (nth == 1)
695 for (int i = 0; i < n; i++)
697 clear_rvec(v[i]);
700 else
702 #pragma omp parallel for num_threads(nth) schedule(static)
703 for (int i = 0; i < n; i++)
705 clear_rvec(v[i]);
710 /*! \brief Return an estimate of the average kinetic energy or 0 when unreliable
712 * \param groupOptions Group options, containing T-coupling options
714 static real averageKineticEnergyEstimate(const t_grpopts &groupOptions)
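/* Equipartition estimate: every temperature-coupled degree of freedom
 * (tau_t[g] >= 0) is expected to carry 0.5*BOLTZ*ref_t[g] of kinetic
 * energy. The sum over the coupled groups is scaled up to also cover
 * the uncoupled degrees of freedom, but only when the coupled ones
 * are in the majority; otherwise 0 is returned as unreliable. */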
716 real nrdfCoupled = 0;
717 real nrdfUncoupled = 0;
718 real kineticEnergy = 0;
719 for (int g = 0; g < groupOptions.ngtc; g++)
721 if (groupOptions.tau_t[g] >= 0)
723 nrdfCoupled += groupOptions.nrdf[g];
724 kineticEnergy += groupOptions.nrdf[g]*0.5*groupOptions.ref_t[g]*BOLTZ;
726 else
728 nrdfUncoupled += groupOptions.nrdf[g];
732 /* This conditional with > also catches nrdf=0 */
733 if (nrdfCoupled > nrdfUncoupled)
735 return kineticEnergy*(nrdfCoupled + nrdfUncoupled)/nrdfCoupled;
737 else
739 return 0;
743 /*! \brief This routine checks that the potential energy is finite.
745 * Always checks that the potential energy is finite. If step equals
746 * inputrec.init_step also checks that the magnitude of the potential energy
747 * is reasonable. Terminates with a fatal error when a check fails.
748 * Note that passing this check does not guarantee finite forces,
749 * since those use slightly different arithmetic. But in most cases
750 * there is just a narrow coordinate range where forces are not finite
751 * and energies are finite.
753 * \param[in] step The step number, used for checking and printing
754 * \param[in] enerd The energy data; the non-bonded group energies need to be added to enerd.term[F_EPOT] before calling this routine
755 * \param[in] inputrec The input record
757 static void checkPotentialEnergyValidity(int64_t step,
758 const gmx_enerdata_t &enerd,
759 const t_inputrec &inputrec)
761 /* Threshold valid for comparing absolute potential energy against
762 * the kinetic energy. Normally one should not consider absolute
763 * potential energy values, but with a factor of one million
764 * we should never get false positives.
766 constexpr real c_thresholdFactor = 1e6;
768 bool energyIsNotFinite = !std::isfinite(enerd.term[F_EPOT]);
769 real averageKineticEnergy = 0;
770 /* We only check for large potential energy at the initial step,
771 * because that is by far the most likely step for this to occur
772 * and because computing the average kinetic energy is not free.
773 * Note: nstcalcenergy >> 1 often does not allow catching large energies
774 * before they become NaN.
776 if (step == inputrec.init_step && EI_DYNAMICS(inputrec.eI))
778 averageKineticEnergy = averageKineticEnergyEstimate(inputrec.opts);
781 if (energyIsNotFinite || (averageKineticEnergy > 0 &&
782 enerd.term[F_EPOT] > c_thresholdFactor*averageKineticEnergy))
784 gmx_fatal(FARGS, "Step %" PRId64 ": The total potential energy is %g, which is %s. The LJ and electrostatic contributions to the energy are %g and %g, respectively. A %s potential energy can be caused by overlapping interactions in bonded interactions or very large%s coordinate values. Usually this is caused by a badly- or non-equilibrated initial configuration, incorrect interactions or parameters in the topology.",
785 step,
786 enerd.term[F_EPOT],
787 energyIsNotFinite ? "not finite" : "extremely high",
788 enerd.term[F_LJ],
789 enerd.term[F_COUL_SR],
790 energyIsNotFinite ? "non-finite" : "very high",
791 energyIsNotFinite ? " or NaN" : "");
795 /*! \brief Compute forces and/or energies for special algorithms
797 * The intention is to collect all calls to algorithms that compute
798 * forces on local atoms only and that do not contribute to the local
799 * virial sum (but add their virial contribution separately).
800 * Eventually these should likely all become ForceProviders.
801 * Within this function the intention is to have algorithms that do
802 * global communication at the end, so global barriers within the MD loop
803 * are as close together as possible.
805 * \param[in] fplog The log file
806 * \param[in] cr The communication record
807 * \param[in] inputrec The input record
808 * \param[in] awh The Awh module (nullptr if none in use).
809 * \param[in] enforcedRotation Enforced rotation module.
810 * \param[in] step The current MD step
811 * \param[in] t The current time
812 * \param[in,out] wcycle Wallcycle accounting struct
813 * \param[in,out] forceProviders Pointer to a list of force providers
814 * \param[in] box The unit cell
815 * \param[in] x The coordinates
816 * \param[in] mdatoms Per atom properties
817 * \param[in] lambda Array of free-energy lambda values
818 * \param[in] forceFlags Flags that tell whether we should compute forces/energies/virial
819 * \param[in,out] forceWithVirial Force and virial buffers
820 * \param[in,out] enerd Energy buffer
821 * \param[in,out] ed Essential dynamics pointer
822 * \param[in] bNS Tells if we did neighbor searching this step, used for ED sampling
824 * \todo Remove bNS, which is used incorrectly.
825 * \todo Convert all other algorithms called here to ForceProviders.
827 static void
828 computeSpecialForces(FILE *fplog,
829 const t_commrec *cr,
830 const t_inputrec *inputrec,
831 gmx::Awh *awh,
832 gmx_enfrot *enforcedRotation,
833 int64_t step,
834 double t,
835 gmx_wallcycle_t wcycle,
836 ForceProviders *forceProviders,
837 matrix box,
838 gmx::ArrayRef<const gmx::RVec> x,
839 const t_mdatoms *mdatoms,
840 real *lambda,
841 int forceFlags,
842 gmx::ForceWithVirial *forceWithVirial,
843 gmx_enerdata_t *enerd,
844 const gmx_edsam *ed,
845 gmx_bool bNS)
847 const bool computeForces = (forceFlags & GMX_FORCE_FORCES);
849 /* NOTE: Currently all ForceProviders only provide forces.
850 * When they also provide energies, remove this conditional.
852 if (computeForces)
854 gmx::ForceProviderInput forceProviderInput(x, *mdatoms, t, box, *cr);
855 gmx::ForceProviderOutput forceProviderOutput(forceWithVirial, enerd);
857 /* Collect forces from modules */
858 forceProviders->calculateForces(forceProviderInput, &forceProviderOutput);
861 if (inputrec->bPull && pull_have_potential(inputrec->pull_work))
863 pull_potential_wrapper(cr, inputrec, box, x,
864 forceWithVirial,
865 mdatoms, enerd, lambda, t,
866 wcycle);
868 if (awh)
870 enerd->term[F_COM_PULL] +=
871 awh->applyBiasForcesAndUpdateBias(inputrec->ePBC, *mdatoms, box,
872 forceWithVirial,
873 t, step, wcycle, fplog);
877 rvec *f = as_rvec_array(forceWithVirial->force_.data());
879 /* Add the forces from enforced rotation potentials (if any) */
880 if (inputrec->bRot)
882 wallcycle_start(wcycle, ewcROTadd);
883 enerd->term[F_COM_PULL] += add_rot_forces(enforcedRotation, f, cr, step, t);
884 wallcycle_stop(wcycle, ewcROTadd);
887 if (ed)
889 /* Note that since init_edsam() is called after the initialization
890 * of forcerec, edsam doesn't request the noVirSum force buffer.
891 * Thus if no other algorithm (e.g. PME) requires it, the forces
892 * here will contribute to the virial.
894 do_flood(cr, inputrec, as_rvec_array(x.data()), f, ed, box, step, bNS);
897 /* Add forces from interactive molecular dynamics (IMD), if bIMD == TRUE. */
898 if (inputrec->bIMD && computeForces)
900 IMD_apply_forces(inputrec->bIMD, inputrec->imd, cr, f, wcycle);
904 /*! \brief Launch the prepare_step and spread stages of PME GPU.
906 * \param[in] pmedata The PME structure
907 * \param[in] box The box matrix
908 * \param[in] x Coordinate array
909 * \param[in] flags Force flags
910 * \param[in] wcycle The wallcycle structure
912 static inline void launchPmeGpuSpread(gmx_pme_t *pmedata,
913 matrix box,
914 rvec x[],
915 int flags,
916 gmx_wallcycle_t wcycle)
918 int pmeFlags = GMX_PME_SPREAD | GMX_PME_SOLVE;
919 pmeFlags |= (flags & GMX_FORCE_FORCES) ? GMX_PME_CALC_F : 0;
920 pmeFlags |= (flags & GMX_FORCE_VIRIAL) ? GMX_PME_CALC_ENER_VIR : 0;
922 pme_gpu_prepare_computation(pmedata, flags & GMX_FORCE_DYNAMICBOX, box, wcycle, pmeFlags);
923 pme_gpu_launch_spread(pmedata, x, wcycle);
926 /*! \brief Launch the FFT and gather stages of PME GPU
928 * This function only implements setting the output forces (no accumulation).
930 * \param[in] pmedata The PME structure
931 * \param[in] wcycle The wallcycle structure
933 static void launchPmeGpuFftAndGather(gmx_pme_t *pmedata,
934 gmx_wallcycle_t wcycle)
936 pme_gpu_launch_complex_transforms(pmedata, wcycle);
937 pme_gpu_launch_gather(pmedata, wcycle, PmeForceOutputHandling::Set);
940 /*! \brief
941 * Polling wait for either of the PME or nonbonded GPU tasks.
943 * Instead of a static order in waiting for GPU tasks, this function
944 * polls checking which of the two tasks completes first, and does the
945 * associated force buffer reduction overlapped with the other task.
946 * By doing that, unlike static scheduling order, it can always overlap
947 * one of the reductions, regardless of the GPU task completion order.
949 * \param[in] nbv Nonbonded verlet structure
950 * \param[in] pmedata PME module data
951 * \param[in,out] force Force array to reduce task outputs into.
952 * \param[in,out] forceWithVirial Force and virial buffers
953 * \param[in,out] fshift Shift force output vector results are reduced into
954 * \param[in,out] enerd Energy data structure results are reduced into
955 * \param[in] flags Force flags
956 * \param[in] wcycle The wallcycle structure
958 static void alternatePmeNbGpuWaitReduce(nonbonded_verlet_t *nbv,
959 const gmx_pme_t *pmedata,
960 gmx::PaddedArrayRef<gmx::RVec> *force,
961 gmx::ForceWithVirial *forceWithVirial,
962 rvec fshift[],
963 gmx_enerdata_t *enerd,
964 int flags,
965 gmx_wallcycle_t wcycle)
967 bool isPmeGpuDone = false;
968 bool isNbGpuDone = false;
971 gmx::ArrayRef<const gmx::RVec> pmeGpuForces;
973 while (!isPmeGpuDone || !isNbGpuDone)
975 if (!isPmeGpuDone)
977 matrix vir_Q;
978 real Vlr_q;
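/* If the nonbonded results have already been reduced there is
 * nothing left to overlap, so block until PME finishes; otherwise
 * only poll. */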
980 GpuTaskCompletion completionType = (isNbGpuDone) ? GpuTaskCompletion::Wait : GpuTaskCompletion::Check;
981 isPmeGpuDone = pme_gpu_try_finish_task(pmedata, wcycle, &pmeGpuForces,
982 vir_Q, &Vlr_q, completionType);
984 if (isPmeGpuDone)
986 pme_gpu_reduce_outputs(wcycle, forceWithVirial, pmeGpuForces,
987 enerd, vir_Q, Vlr_q);
991 if (!isNbGpuDone)
993 GpuTaskCompletion completionType = (isPmeGpuDone) ? GpuTaskCompletion::Wait : GpuTaskCompletion::Check;
994 wallcycle_start_nocount(wcycle, ewcWAIT_GPU_NB_L);
995 isNbGpuDone = nbnxn_gpu_try_finish_task(nbv->gpu_nbv,
996 flags, eatLocal,
997 enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
998 fshift, completionType);
999 wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
1000 // To get the call count right, when the task finished we
1001 // issue a start/stop.
1002 // TODO: move the ewcWAIT_GPU_NB_L cycle counting into nbnxn_gpu_try_finish_task()
1003 // and ewcNB_XF_BUF_OPS counting into nbnxn_atomdata_add_nbat_f_to_f().
1004 if (isNbGpuDone)
1006 wallcycle_start(wcycle, ewcWAIT_GPU_NB_L);
1007 wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
1009 nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), eatLocal,
1010 nbv->nbat, as_rvec_array(force->data()), wcycle);
1016 /*! \brief
1017 * Launch the dynamic rolling pruning GPU task.
1019 * We currently alternate local/non-local list pruning in odd-even steps
1020 * (only pruning every second step without DD).
1022 * \param[in] cr The communication record
1023 * \param[in] nbv Nonbonded verlet structure
1024 * \param[in] inputrec The input record
1025 * \param[in] step The current MD step
1027 static inline void launchGpuRollingPruning(const t_commrec *cr,
1028 const nonbonded_verlet_t *nbv,
1029 const t_inputrec *inputrec,
1030 const int64_t step)
1032 /* We should not launch the rolling pruning kernel at a search
1033 * step or just before search steps, since that's useless.
1034 * Without domain decomposition we prune at even steps.
1035 * With domain decomposition we alternate local and non-local
1036 * pruning at even and odd steps.
1038 int numRollingParts = nbv->listParams->numRollingParts;
1039 GMX_ASSERT(numRollingParts == nbv->listParams->nstlistPrune/2, "Since we alternate local/non-local at even/odd steps, we need numRollingParts<=nstlistPrune/2 for correctness and == for efficiency");
1040 int stepWithCurrentList = step - nbv->grp[eintLocal].nbl_lists.outerListCreationStep;
1041 bool stepIsEven = ((stepWithCurrentList & 1) == 0);
1042 if (stepWithCurrentList > 0 &&
1043 stepWithCurrentList < inputrec->nstlist - 1 &&
1044 (stepIsEven || DOMAINDECOMP(cr)))
1046 nbnxn_gpu_launch_kernel_pruneonly(nbv->gpu_nbv,
1047 stepIsEven ? eintLocal : eintNonlocal,
1048 numRollingParts);
1052 static void do_force_cutsVERLET(FILE *fplog,
1053 const t_commrec *cr,
1054 const gmx_multisim_t *ms,
1055 const t_inputrec *inputrec,
1056 gmx::Awh *awh,
1057 gmx_enfrot *enforcedRotation,
1058 int64_t step,
1059 t_nrnb *nrnb,
1060 gmx_wallcycle_t wcycle,
1061 const gmx_localtop_t *top,
1062 const gmx_groups_t * /* groups */,
1063 matrix box, gmx::PaddedArrayRef<gmx::RVec> x,
1064 history_t *hist,
1065 gmx::PaddedArrayRef<gmx::RVec> force,
1066 tensor vir_force,
1067 const t_mdatoms *mdatoms,
1068 gmx_enerdata_t *enerd, t_fcdata *fcd,
1069 real *lambda,
1070 t_graph *graph,
1071 t_forcerec *fr,
1072 interaction_const_t *ic,
1073 const gmx_vsite_t *vsite,
1074 rvec mu_tot,
1075 double t,
1076 const gmx_edsam *ed,
1077 int flags,
1078 DdOpenBalanceRegionBeforeForceComputation ddOpenBalanceRegion,
1079 DdCloseBalanceRegionAfterForceComputation ddCloseBalanceRegion)
1081 int cg1, i, j;
1082 double mu[2*DIM];
1083 gmx_bool bStateChanged, bNS, bFillGrid, bCalcCGCM;
1084 gmx_bool bDoForces, bUseGPU, bUseOrEmulGPU;
1085 rvec vzero, box_diag;
1086 float cycles_pme, cycles_wait_gpu;
1087 nonbonded_verlet_t *nbv = fr->nbv;
1089 bStateChanged = (flags & GMX_FORCE_STATECHANGED);
1090 bNS = (flags & GMX_FORCE_NS) && (fr->bAllvsAll == FALSE);
1091 bFillGrid = (bNS && bStateChanged);
1092 bCalcCGCM = (bFillGrid && !DOMAINDECOMP(cr));
1093 bDoForces = (flags & GMX_FORCE_FORCES);
1094 bUseGPU = fr->nbv->bUseGPU;
1095 bUseOrEmulGPU = bUseGPU || (fr->nbv->emulateGpu == EmulateGpuNonbonded::Yes);
1097 const auto pmeRunMode = fr->pmedata ? pme_run_mode(fr->pmedata) : PmeRunMode::CPU;
1098 // TODO slim this conditional down - inputrec and duty checks should mean the same in proper code!
1099 const bool useGpuPme = EEL_PME(fr->ic->eeltype) && thisRankHasDuty(cr, DUTY_PME) &&
1100 ((pmeRunMode == PmeRunMode::GPU) || (pmeRunMode == PmeRunMode::Mixed));
1102 /* At a search step we need to start the first balancing region
1103 * somewhere early inside the step after communication during domain
1104 * decomposition (and not during the previous step as usual).
1106 if (bNS &&
1107 ddOpenBalanceRegion == DdOpenBalanceRegionBeforeForceComputation::yes)
1109 ddOpenBalanceRegionCpu(cr->dd, DdAllowBalanceRegionReopen::yes);
1112 cycles_wait_gpu = 0;
1114 const int start = 0;
1115 const int homenr = mdatoms->homenr;
1117 clear_mat(vir_force);
1119 if (DOMAINDECOMP(cr))
1121 cg1 = cr->dd->globalAtomGroupIndices.size();
1123 else
1125 cg1 = top->cgs.nr;
1127 if (fr->n_tpi > 0)
1129 cg1--;
1132 if (bStateChanged)
1134 update_forcerec(fr, box);
1136 if (inputrecNeedMutot(inputrec))
1138 /* Calculate total (local) dipole moment in a temporary common array.
1139 * This makes it possible to sum them over nodes faster.
1141 calc_mu(start, homenr,
1142 x, mdatoms->chargeA, mdatoms->chargeB, mdatoms->nChargePerturbed,
1143 mu, mu+DIM);
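/* mu holds the state A dipole in its first DIM entries and the
 * state B dipole in the next DIM, so a single gmx_sumd(2*DIM, ...)
 * below reduces both at once. */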
1147 if (fr->ePBC != epbcNONE)
1149 /* Compute shift vectors every step,
1150 * because of pressure coupling or box deformation!
1152 if ((flags & GMX_FORCE_DYNAMICBOX) && bStateChanged)
1154 calc_shifts(box, fr->shift_vec);
1157 if (bCalcCGCM)
1159 put_atoms_in_box_omp(fr->ePBC, box, x.subArray(0, homenr));
1160 inc_nrnb(nrnb, eNR_SHIFTX, homenr);
1162 else if (EI_ENERGY_MINIMIZATION(inputrec->eI) && graph)
1164 unshift_self(graph, box, as_rvec_array(x.data()));
1168 nbnxn_atomdata_copy_shiftvec(flags & GMX_FORCE_DYNAMICBOX,
1169 fr->shift_vec, nbv->nbat);
1171 #if GMX_MPI
1172 if (!thisRankHasDuty(cr, DUTY_PME))
1174 /* Send particle coordinates to the pme nodes.
1175 * Since this is only implemented for domain decomposition
1176 * and domain decomposition does not use the graph,
1177 * we do not need to worry about shifting.
1179 gmx_pme_send_coordinates(cr, box, as_rvec_array(x.data()),
1180 lambda[efptCOUL], lambda[efptVDW],
1181 (flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY)),
1182 step, wcycle);
1184 #endif /* GMX_MPI */
1186 if (useGpuPme)
1188 launchPmeGpuSpread(fr->pmedata, box, as_rvec_array(x.data()), flags, wcycle);
1191 /* do gridding for pair search */
1192 if (bNS)
1194 if (graph && bStateChanged)
1196 /* Calculate intramolecular shift vectors to make molecules whole */
1197 mk_mshift(fplog, graph, fr->ePBC, box, as_rvec_array(x.data()));
1200 clear_rvec(vzero);
1201 box_diag[XX] = box[XX][XX];
1202 box_diag[YY] = box[YY][YY];
1203 box_diag[ZZ] = box[ZZ][ZZ];
1205 wallcycle_start(wcycle, ewcNS);
1206 if (!DOMAINDECOMP(cr))
1208 wallcycle_sub_start(wcycle, ewcsNBS_GRID_LOCAL);
1209 nbnxn_put_on_grid(nbv->nbs.get(), fr->ePBC, box,
1210 0, vzero, box_diag,
1211 0, mdatoms->homenr, -1, fr->cginfo, as_rvec_array(x.data()),
1212 0, nullptr,
1213 nbv->grp[eintLocal].kernel_type,
1214 nbv->nbat);
1215 wallcycle_sub_stop(wcycle, ewcsNBS_GRID_LOCAL);
1217 else
1219 wallcycle_sub_start(wcycle, ewcsNBS_GRID_NONLOCAL);
1220 nbnxn_put_on_grid_nonlocal(nbv->nbs.get(), domdec_zones(cr->dd),
1221 fr->cginfo, as_rvec_array(x.data()),
1222 nbv->grp[eintNonlocal].kernel_type,
1223 nbv->nbat);
1224 wallcycle_sub_stop(wcycle, ewcsNBS_GRID_NONLOCAL);
1227 nbnxn_atomdata_set(nbv->nbat, nbv->nbs.get(), mdatoms, fr->cginfo);
1228 wallcycle_stop(wcycle, ewcNS);
1231 /* initialize the GPU atom data and copy shift vector */
1232 if (bUseGPU)
1234 wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
1235 wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1237 if (bNS)
1239 nbnxn_gpu_init_atomdata(nbv->gpu_nbv, nbv->nbat);
1242 nbnxn_gpu_upload_shiftvec(nbv->gpu_nbv, nbv->nbat);
1244 wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1245 wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1248 /* do local pair search */
1249 if (bNS)
1251 wallcycle_start_nocount(wcycle, ewcNS);
1252 wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_LOCAL);
1253 nbnxn_make_pairlist(nbv->nbs.get(), nbv->nbat,
1254 &top->excls,
1255 nbv->listParams->rlistOuter,
1256 nbv->min_ci_balanced,
1257 &nbv->grp[eintLocal].nbl_lists,
1258 eintLocal,
1259 nbv->grp[eintLocal].kernel_type,
1260 nrnb);
1261 nbv->grp[eintLocal].nbl_lists.outerListCreationStep = step;
1262 if (nbv->listParams->useDynamicPruning && !bUseGPU)
1264 nbnxnPrepareListForDynamicPruning(&nbv->grp[eintLocal].nbl_lists);
1266 wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_LOCAL);
1268 if (bUseGPU)
1270 /* initialize local pair-list on the GPU */
1271 nbnxn_gpu_init_pairlist(nbv->gpu_nbv,
1272 nbv->grp[eintLocal].nbl_lists.nbl[0],
1273 eintLocal);
1275 wallcycle_stop(wcycle, ewcNS);
1277 else
1279 nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs.get(), eatLocal, FALSE, as_rvec_array(x.data()),
1280 nbv->nbat, wcycle);
1283 if (bUseGPU)
1285 if (DOMAINDECOMP(cr))
1287 ddOpenBalanceRegionGpu(cr->dd);
1290 wallcycle_start(wcycle, ewcLAUNCH_GPU);
1291 wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1292 /* launch local nonbonded work on GPU */
1293 do_nb_verlet(fr, ic, enerd, flags, eintLocal, enbvClearFNo,
1294 step, nrnb, wcycle);
1295 wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1296 wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1299 if (useGpuPme)
1301 // In PME GPU and mixed mode we launch FFT / gather after the
1302 // X copy/transform to allow overlap as well as after the GPU NB
1303 // launch to avoid FFT launch overhead hijacking the CPU and delaying
1304 // the nonbonded kernel.
1305 launchPmeGpuFftAndGather(fr->pmedata, wcycle);
1308 /* Communicate coordinates and sum dipole if necessary +
1309 do non-local pair search */
1310 if (DOMAINDECOMP(cr))
1312 if (bNS)
1314 wallcycle_start_nocount(wcycle, ewcNS);
1315 wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_NONLOCAL);
1317 nbnxn_make_pairlist(nbv->nbs.get(), nbv->nbat,
1318 &top->excls,
1319 nbv->listParams->rlistOuter,
1320 nbv->min_ci_balanced,
1321 &nbv->grp[eintNonlocal].nbl_lists,
1322 eintNonlocal,
1323 nbv->grp[eintNonlocal].kernel_type,
1324 nrnb);
1325 nbv->grp[eintNonlocal].nbl_lists.outerListCreationStep = step;
1326 if (nbv->listParams->useDynamicPruning && !bUseGPU)
1328 nbnxnPrepareListForDynamicPruning(&nbv->grp[eintNonlocal].nbl_lists);
1330 wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_NONLOCAL);
1332 if (nbv->grp[eintNonlocal].kernel_type == nbnxnk8x8x8_GPU)
1334 /* initialize non-local pair-list on the GPU */
1335 nbnxn_gpu_init_pairlist(nbv->gpu_nbv,
1336 nbv->grp[eintNonlocal].nbl_lists.nbl[0],
1337 eintNonlocal);
1339 wallcycle_stop(wcycle, ewcNS);
1341 else
1343 dd_move_x(cr->dd, box, x, wcycle);
1345 nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs.get(), eatNonlocal, FALSE, as_rvec_array(x.data()),
1346 nbv->nbat, wcycle);
1349 if (bUseGPU)
1351 wallcycle_start(wcycle, ewcLAUNCH_GPU);
1352 wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1353 /* launch non-local nonbonded tasks on GPU */
1354 do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFNo,
1355 step, nrnb, wcycle);
1356 wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1357 wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1361 if (bUseGPU)
1363 /* launch D2H copy-back F */
1364 wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
1365 wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1366 if (DOMAINDECOMP(cr))
1368 nbnxn_gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat,
1369 flags, eatNonlocal);
1371 nbnxn_gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat,
1372 flags, eatLocal);
1373 wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1374 wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1377 if (bStateChanged && inputrecNeedMutot(inputrec))
1379 if (PAR(cr))
1381 gmx_sumd(2*DIM, mu, cr);
1382 ddReopenBalanceRegionCpu(cr->dd);
1385 for (i = 0; i < 2; i++)
1387 for (j = 0; j < DIM; j++)
1389 fr->mu_tot[i][j] = mu[i*DIM + j];
1393 if (fr->efep == efepNO)
1395 copy_rvec(fr->mu_tot[0], mu_tot);
1397 else
1399 for (j = 0; j < DIM; j++)
1401 mu_tot[j] =
1402 (1.0 - lambda[efptCOUL])*fr->mu_tot[0][j] +
1403 lambda[efptCOUL]*fr->mu_tot[1][j];
1407 /* Reset energies */
1408 reset_enerdata(enerd);
1409 clear_rvecs(SHIFTS, fr->fshift);
1411 if (DOMAINDECOMP(cr) && !thisRankHasDuty(cr, DUTY_PME))
1413 wallcycle_start(wcycle, ewcPPDURINGPME);
1414 dd_force_flop_start(cr->dd, nrnb);
1417 if (inputrec->bRot)
1419 wallcycle_start(wcycle, ewcROT);
1420 do_rotation(cr, enforcedRotation, box, as_rvec_array(x.data()), t, step, bNS);
1421 wallcycle_stop(wcycle, ewcROT);
1424 /* Temporary solution until all routines take PaddedRVecVector */
1425 rvec *const f = as_rvec_array(force.data());
1427 /* Start the force cycle counter.
1428 * Note that a different counter is used for dynamic load balancing.
1430 wallcycle_start(wcycle, ewcFORCE);
1432 gmx::ArrayRef<gmx::RVec> forceRef = force;
1433 if (bDoForces)
1435 /* If we need to compute the virial, we might need a separate
1436 * force buffer for algorithms for which the virial is calculated
1437 * directly, such as PME.
1439 if ((flags & GMX_FORCE_VIRIAL) && fr->haveDirectVirialContributions)
1441 forceRef = *fr->forceBufferForDirectVirialContributions;
1443 /* TODO: update comment
1444 * We only compute forces on local atoms. Note that vsites can
1445 * spread to non-local atoms, but that part of the buffer is
1446 * cleared separately in the vsite spreading code.
1448 clear_rvecs_omp(forceRef.size(), as_rvec_array(forceRef.data()));
1450 /* Clear the short- and long-range forces */
1451 clear_rvecs_omp(fr->natoms_force_constr, f);
1454 /* forceWithVirial uses the local atom range only */
1455 gmx::ForceWithVirial forceWithVirial(forceRef, flags & GMX_FORCE_VIRIAL);
1457 if (inputrec->bPull && pull_have_constraint(inputrec->pull_work))
1459 clear_pull_forces(inputrec->pull_work);
1462 /* We calculate the non-bonded forces, when done on the CPU, here.
1463 * We do this before calling do_force_lowlevel, because in that
1464 * function, the listed forces are calculated before PME, which
1465 * does communication. With this order, non-bonded and listed
1466 * force calculation imbalance can be balanced out by the domain
1467 * decomposition load balancing.
1470 if (!bUseOrEmulGPU)
1472 do_nb_verlet(fr, ic, enerd, flags, eintLocal, enbvClearFYes,
1473 step, nrnb, wcycle);
1476 if (fr->efep != efepNO)
1478 /* Calculate the local and non-local free energy interactions here.
1479 * Happens here on the CPU both with and without GPU.
1481 if (fr->nbv->grp[eintLocal].nbl_lists.nbl_fep[0]->nrj > 0)
1483 do_nb_verlet_fep(&fr->nbv->grp[eintLocal].nbl_lists,
1484 fr, as_rvec_array(x.data()), f, mdatoms,
1485 inputrec->fepvals, lambda,
1486 enerd, flags, nrnb, wcycle);
1489 if (DOMAINDECOMP(cr) &&
1490 fr->nbv->grp[eintNonlocal].nbl_lists.nbl_fep[0]->nrj > 0)
1492 do_nb_verlet_fep(&fr->nbv->grp[eintNonlocal].nbl_lists,
1493 fr, as_rvec_array(x.data()), f, mdatoms,
1494 inputrec->fepvals, lambda,
1495 enerd, flags, nrnb, wcycle);
1499 if (!bUseOrEmulGPU)
1501 int aloc;
1503 if (DOMAINDECOMP(cr))
1505 do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFNo,
1506 step, nrnb, wcycle);
1509 if (!bUseOrEmulGPU)
1511 aloc = eintLocal;
1513 else
1515 aloc = eintNonlocal;
1518 /* Add all the non-bonded force to the normal force array.
1519 * This can be split into a local and a non-local part when overlapping
1520 * communication with calculation with domain decomposition.
1522 wallcycle_stop(wcycle, ewcFORCE);
1524 nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), eatAll, nbv->nbat, f, wcycle);
1526 wallcycle_start_nocount(wcycle, ewcFORCE);
1528 /* if there are multiple fshift output buffers reduce them */
1529 if ((flags & GMX_FORCE_VIRIAL) &&
1530 nbv->grp[aloc].nbl_lists.nnbl > 1)
1532 /* This is not in a subcounter because it takes a
1533 negligible and constant-sized amount of time */
1534 nbnxn_atomdata_add_nbat_fshift_to_fshift(nbv->nbat,
1535 fr->fshift);
1539 /* update QMMMrec, if necessary */
1540 if (fr->bQMMM)
1542 update_QMMMrec(cr, fr, as_rvec_array(x.data()), mdatoms, box);
1545 /* Compute the bonded and non-bonded energies and optionally forces */
1546 do_force_lowlevel(fr, inputrec, &(top->idef),
1547 cr, ms, nrnb, wcycle, mdatoms,
1548 as_rvec_array(x.data()), hist, f, &forceWithVirial, enerd, fcd,
1549 box, inputrec->fepvals, lambda, graph, &(top->excls), fr->mu_tot,
1550 flags, &cycles_pme);
1552 wallcycle_stop(wcycle, ewcFORCE);
1554 computeSpecialForces(fplog, cr, inputrec, awh, enforcedRotation,
1555 step, t, wcycle,
1556 fr->forceProviders, box, x, mdatoms, lambda,
1557 flags, &forceWithVirial, enerd,
1558 ed, bNS);
1560 if (bUseOrEmulGPU)
1562 /* wait for non-local forces (or calculate in emulation mode) */
1563 if (DOMAINDECOMP(cr))
1565 if (bUseGPU)
1567 wallcycle_start(wcycle, ewcWAIT_GPU_NB_NL);
1568 nbnxn_gpu_wait_finish_task(nbv->gpu_nbv,
1569 flags, eatNonlocal,
1570 enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
1571 fr->fshift);
1572 cycles_wait_gpu += wallcycle_stop(wcycle, ewcWAIT_GPU_NB_NL);
1574 else
1576 wallcycle_start_nocount(wcycle, ewcFORCE);
1577 do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFYes,
1578 step, nrnb, wcycle);
1579 wallcycle_stop(wcycle, ewcFORCE);
1582 /* skip the reduction if there was no non-local work to do */
1583 if (nbv->grp[eintNonlocal].nbl_lists.nbl[0]->nsci > 0)
1585 nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), eatNonlocal,
1586 nbv->nbat, f, wcycle);
1591 if (DOMAINDECOMP(cr))
1593 /* We are done with the CPU compute.
1594 * We will now communicate the non-local forces.
1595 * If we use a GPU this will overlap with GPU work, so in that case
1596 * we do not close the DD force balancing region here.
1598 if (ddCloseBalanceRegion == DdCloseBalanceRegionAfterForceComputation::yes)
1600 ddCloseBalanceRegionCpu(cr->dd);
1602 if (bDoForces)
1604 dd_move_f(cr->dd, force, fr->fshift, wcycle);
1608 // With both nonbonded and PME offloaded a GPU on the same rank, we use
1609 // an alternating wait/reduction scheme.
1610 bool alternateGpuWait = (!c_disableAlternatingWait && useGpuPme && bUseGPU && !DOMAINDECOMP(cr));
1611 if (alternateGpuWait)
1613 alternatePmeNbGpuWaitReduce(fr->nbv, fr->pmedata, &force, &forceWithVirial, fr->fshift, enerd, flags, wcycle);
1616 if (!alternateGpuWait && useGpuPme)
1618 gmx::ArrayRef<const gmx::RVec> pmeGpuForces;
1619 matrix vir_Q;
1620 real Vlr_q = 0.0;
1621 pme_gpu_wait_finish_task(fr->pmedata, wcycle, &pmeGpuForces, vir_Q, &Vlr_q);
1622 pme_gpu_reduce_outputs(wcycle, &forceWithVirial, pmeGpuForces, enerd, vir_Q, Vlr_q);
1625 /* Wait for local GPU NB outputs on the non-alternating wait path */
1626 if (!alternateGpuWait && bUseGPU)
1628 /* Measured overhead on CUDA and OpenCL with(out) GPU sharing
1629 * is between 0.5 and 1.5 Mcycles. So 2 MCycles is an overestimate,
1630 * but even with a step of 0.1 ms the difference is less than 1%
1631 * of the step time.
1633 const float gpuWaitApiOverheadMargin = 2e6f; /* cycles */
1635 wallcycle_start(wcycle, ewcWAIT_GPU_NB_L);
1636 nbnxn_gpu_wait_finish_task(nbv->gpu_nbv,
1637 flags, eatLocal,
1638 enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
1639 fr->fshift);
1640 float cycles_tmp = wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
1642 if (ddCloseBalanceRegion == DdCloseBalanceRegionAfterForceComputation::yes)
1644 DdBalanceRegionWaitedForGpu waitedForGpu = DdBalanceRegionWaitedForGpu::yes;
1645 if (bDoForces && cycles_tmp <= gpuWaitApiOverheadMargin)
1647 /* We measured only a few cycles; it could be that the kernel
1648 * and transfer finished earlier and there was no actual
1649 * wait time, only API call overhead.
1650 * Then the actual time could be anywhere between 0 and
1651 * cycles_wait_est. We will use half of cycles_wait_est.
1653 waitedForGpu = DdBalanceRegionWaitedForGpu::no;
1655 ddCloseBalanceRegionGpu(cr->dd, cycles_wait_gpu, waitedForGpu);
1659 if (fr->nbv->emulateGpu == EmulateGpuNonbonded::Yes)
1661 // NOTE: emulation kernel is not included in the balancing region,
1662 // but emulation mode does not target performance anyway
1663 wallcycle_start_nocount(wcycle, ewcFORCE);
1664 do_nb_verlet(fr, ic, enerd, flags, eintLocal,
1665 DOMAINDECOMP(cr) ? enbvClearFNo : enbvClearFYes,
1666 step, nrnb, wcycle);
1667 wallcycle_stop(wcycle, ewcFORCE);
1670 if (useGpuPme)
1672 pme_gpu_reinit_computation(fr->pmedata, wcycle);
1675 if (bUseGPU)
1677 /* now clear the GPU outputs while we finish the step on the CPU */
1678 wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
1679 wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1680 nbnxn_gpu_clear_outputs(nbv->gpu_nbv, flags);
1682 /* Is dynamic pair-list pruning activated? */
1683 if (nbv->listParams->useDynamicPruning)
1685 launchGpuRollingPruning(cr, nbv, inputrec, step);
1687 wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1688 wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1691 /* Do the nonbonded GPU (or emulation) force buffer reduction
1692 * on the non-alternating path. */
1693 if (bUseOrEmulGPU && !alternateGpuWait)
1695 nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), eatLocal,
1696 nbv->nbat, f, wcycle);
1699 if (DOMAINDECOMP(cr))
1701 dd_force_flop_stop(cr->dd, nrnb);
1704 if (bDoForces)
1706 /* If we have NoVirSum forces, but we do not calculate the virial,
1707 * we sum fr->f_novirsum=f later.
1709 if (vsite && !(fr->haveDirectVirialContributions && !(flags & GMX_FORCE_VIRIAL)))
1711 spread_vsite_f(vsite, as_rvec_array(x.data()), f, fr->fshift, FALSE, nullptr, nrnb,
1712 &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr, wcycle);
1715 if (flags & GMX_FORCE_VIRIAL)
1717 /* Calculation of the virial must be done after vsites! */
1718 calc_virial(0, mdatoms->homenr, as_rvec_array(x.data()), f,
1719 vir_force, graph, box, nrnb, fr, inputrec->ePBC);
1723 if (PAR(cr) && !thisRankHasDuty(cr, DUTY_PME))
1725 /* In case of node-splitting, the PP nodes receive the long-range
1726 * forces, virial and energy from the PME nodes here.
1728 pme_receive_force_ener(cr, &forceWithVirial, enerd, wcycle);
1731 if (bDoForces)
1733 post_process_forces(cr, step, nrnb, wcycle,
1734 top, box, as_rvec_array(x.data()), f, &forceWithVirial,
1735 vir_force, mdatoms, graph, fr, vsite,
1736 flags);
1739 if (flags & GMX_FORCE_ENERGY)
1741 /* Sum the potential energy terms from group contributions */
1742 sum_epot(&(enerd->grpp), enerd->term);
1744 if (!EI_TPI(inputrec->eI))
1746 checkPotentialEnergyValidity(step, *enerd, *inputrec);
1751 static void do_force_cutsGROUP(FILE *fplog,
1752 const t_commrec *cr,
1753 const gmx_multisim_t *ms,
1754 const t_inputrec *inputrec,
1755 gmx::Awh *awh,
1756 gmx_enfrot *enforcedRotation,
1757 int64_t step,
1758 t_nrnb *nrnb,
1759 gmx_wallcycle_t wcycle,
1760 gmx_localtop_t *top,
1761 const gmx_groups_t *groups,
1762 matrix box, gmx::PaddedArrayRef<gmx::RVec> x,
1763 history_t *hist,
1764 gmx::PaddedArrayRef<gmx::RVec> force,
1765 tensor vir_force,
1766 const t_mdatoms *mdatoms,
1767 gmx_enerdata_t *enerd,
1768 t_fcdata *fcd,
1769 real *lambda,
1770 t_graph *graph,
1771 t_forcerec *fr,
1772 const gmx_vsite_t *vsite,
1773 rvec mu_tot,
1774 double t,
1775 const gmx_edsam *ed,
1776 int flags,
1777 DdOpenBalanceRegionBeforeForceComputation ddOpenBalanceRegion,
1778 DdCloseBalanceRegionAfterForceComputation ddCloseBalanceRegion)
1780 int cg0, cg1, i, j;
1781 double mu[2*DIM];
1782 gmx_bool bStateChanged, bNS, bFillGrid, bCalcCGCM;
1783 gmx_bool bDoForces;
1784 float cycles_pme;
1786 const int start = 0;
1787 const int homenr = mdatoms->homenr;
1789 clear_mat(vir_force);
1791 cg0 = 0;
1792 if (DOMAINDECOMP(cr))
1794 cg1 = cr->dd->globalAtomGroupIndices.size();
1796 else
1798 cg1 = top->cgs.nr;
1800 if (fr->n_tpi > 0)
1802 cg1--;
1805 bStateChanged = (flags & GMX_FORCE_STATECHANGED);
1806 bNS = (flags & GMX_FORCE_NS) && (fr->bAllvsAll == FALSE);
1807 /* Should we perform the long-range nonbonded evaluation inside the neighbor searching?
1808 bFillGrid = (bNS && bStateChanged);
1809 bCalcCGCM = (bFillGrid && !DOMAINDECOMP(cr));
1810 bDoForces = (flags & GMX_FORCE_FORCES);
1812 if (bStateChanged)
1814 update_forcerec(fr, box);
1816 if (inputrecNeedMutot(inputrec))
1818 /* Calculate total (local) dipole moment in a temporary common array.
1819 * This makes it possible to sum them over nodes faster.
1821 calc_mu(start, homenr,
1822 x, mdatoms->chargeA, mdatoms->chargeB, mdatoms->nChargePerturbed,
1823 mu, mu+DIM);
1827 if (fr->ePBC != epbcNONE)
1829 /* Compute shift vectors every step,
1830 * because of pressure coupling or box deformation!
1832 if ((flags & GMX_FORCE_DYNAMICBOX) && bStateChanged)
1834 calc_shifts(box, fr->shift_vec);
1837 if (bCalcCGCM)
1839 put_charge_groups_in_box(fplog, cg0, cg1, fr->ePBC, box,
1840 &(top->cgs), as_rvec_array(x.data()), fr->cg_cm);
1841 inc_nrnb(nrnb, eNR_CGCM, homenr);
1842 inc_nrnb(nrnb, eNR_RESETX, cg1-cg0);
1844 else if (EI_ENERGY_MINIMIZATION(inputrec->eI) && graph)
1846 unshift_self(graph, box, as_rvec_array(x.data()));
1849 else if (bCalcCGCM)
1851 calc_cgcm(fplog, cg0, cg1, &(top->cgs), as_rvec_array(x.data()), fr->cg_cm);
1852 inc_nrnb(nrnb, eNR_CGCM, homenr);
1855 if (bCalcCGCM && gmx_debug_at)
1857 pr_rvecs(debug, 0, "cgcm", fr->cg_cm, top->cgs.nr);
1860 #if GMX_MPI
1861 if (!thisRankHasDuty(cr, DUTY_PME))
1863 /* Send particle coordinates to the pme nodes.
1864 * Since this is only implemented for domain decomposition
1865 * and domain decomposition does not use the graph,
1866 * we do not need to worry about shifting.
1868 gmx_pme_send_coordinates(cr, box, as_rvec_array(x.data()),
1869 lambda[efptCOUL], lambda[efptVDW],
1870 (flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY)),
1871 step, wcycle);
1873 #endif /* GMX_MPI */
1875 /* Communicate coordinates and sum dipole if necessary */
1876 if (DOMAINDECOMP(cr))
1878 dd_move_x(cr->dd, box, x, wcycle);
1880 /* No GPU support, no move_x overlap, so reopen the balance region here */
1881 if (ddOpenBalanceRegion == DdOpenBalanceRegionBeforeForceComputation::yes)
1883 ddReopenBalanceRegionCpu(cr->dd);
1887 if (inputrecNeedMutot(inputrec))
1889 if (bStateChanged)
1891 if (PAR(cr))
1893 gmx_sumd(2*DIM, mu, cr);
1894 ddReopenBalanceRegionCpu(cr->dd);
1896 for (i = 0; i < 2; i++)
1898 for (j = 0; j < DIM; j++)
1900 fr->mu_tot[i][j] = mu[i*DIM + j];
1904 if (fr->efep == efepNO)
1906 copy_rvec(fr->mu_tot[0], mu_tot);
1908 else
1910 for (j = 0; j < DIM; j++)
1912 mu_tot[j] =
1913 (1.0 - lambda[efptCOUL])*fr->mu_tot[0][j] + lambda[efptCOUL]*fr->mu_tot[1][j];
1918 /* Reset energies */
1919 reset_enerdata(enerd);
1920 clear_rvecs(SHIFTS, fr->fshift);
1922 if (bNS)
1924 wallcycle_start(wcycle, ewcNS);
1926 if (graph && bStateChanged)
1928 /* Calculate intramolecular shift vectors to make molecules whole */
1929 mk_mshift(fplog, graph, fr->ePBC, box, as_rvec_array(x.data()));
1932 /* Do the actual neighbour searching */
1933 ns(fplog, fr, box,
1934 groups, top, mdatoms,
1935 cr, nrnb, bFillGrid);
1937 wallcycle_stop(wcycle, ewcNS);
1940 if (DOMAINDECOMP(cr) && !thisRankHasDuty(cr, DUTY_PME))
1942 wallcycle_start(wcycle, ewcPPDURINGPME);
1943 dd_force_flop_start(cr->dd, nrnb);
1946 if (inputrec->bRot)
1948 wallcycle_start(wcycle, ewcROT);
1949 do_rotation(cr, enforcedRotation, box, as_rvec_array(x.data()), t, step, bNS);
1950 wallcycle_stop(wcycle, ewcROT);
1953 /* Temporary solution until all routines take PaddedRVecVector */
1954 rvec *f = as_rvec_array(force.data());
1956 /* Start the force cycle counter.
1957 * Note that a different counter is used for dynamic load balancing.
1959 wallcycle_start(wcycle, ewcFORCE);
1961 gmx::ArrayRef<gmx::RVec> forceRef = force;
1962 if (bDoForces)
1964 /* If we need to compute the virial, we might need a separate
1965 * force buffer for algorithms for which the virial is calculated
1966 * separately, such as PME.
1968 if ((flags & GMX_FORCE_VIRIAL) && fr->haveDirectVirialContributions)
1970 forceRef = *fr->forceBufferForDirectVirialContributions;
1972 clear_rvecs_omp(forceRef.size(), as_rvec_array(forceRef.data()));
1975 /* Clear the short- and long-range forces */
1976 clear_rvecs(fr->natoms_force_constr, f);
1979 /* forceWithVirial might need the full force atom range */
1980 gmx::ForceWithVirial forceWithVirial(forceRef, flags & GMX_FORCE_VIRIAL);
1982 if (inputrec->bPull && pull_have_constraint(inputrec->pull_work))
1984 clear_pull_forces(inputrec->pull_work);
1987 /* update QMMMrec, if necessary */
1988 if (fr->bQMMM)
1990 update_QMMMrec(cr, fr, as_rvec_array(x.data()), mdatoms, box);
1993 /* Compute the bonded and non-bonded energies and optionally forces */
1994 do_force_lowlevel(fr, inputrec, &(top->idef),
1995 cr, ms, nrnb, wcycle, mdatoms,
1996 as_rvec_array(x.data()), hist, f, &forceWithVirial, enerd, fcd,
1997 box, inputrec->fepvals, lambda,
1998 graph, &(top->excls), fr->mu_tot,
1999 flags,
2000 &cycles_pme);
2002 wallcycle_stop(wcycle, ewcFORCE);
2004 if (DOMAINDECOMP(cr))
2006 dd_force_flop_stop(cr->dd, nrnb);
2008 if (ddCloseBalanceRegion == DdCloseBalanceRegionAfterForceComputation::yes)
2010 ddCloseBalanceRegionCpu(cr->dd);
2014 computeSpecialForces(fplog, cr, inputrec, awh, enforcedRotation,
2015 step, t, wcycle,
2016 fr->forceProviders, box, x, mdatoms, lambda,
2017 flags, &forceWithVirial, enerd,
2018 ed, bNS);
2020 if (bDoForces)
2022 /* Communicate the forces */
2023 if (DOMAINDECOMP(cr))
2025 dd_move_f(cr->dd, force, fr->fshift, wcycle);
2026 /* Do we need to communicate the separate force array
2027 * for terms that do not contribute to the single sum virial?
2028 * Position restraints and electric fields do not introduce
2029 * inter-cg forces, only full electrostatics methods do.
2030 * When we do not calculate the virial, fr->f_novirsum = f,
2031 * so we have already communicated these forces.
2033 if (EEL_FULL(fr->ic->eeltype) && cr->dd->n_intercg_excl &&
2034 (flags & GMX_FORCE_VIRIAL))
2036 dd_move_f(cr->dd, forceWithVirial.force_, nullptr, wcycle);
2040 /* If we have NoVirSum forces, but we do not calculate the virial,
2041 * we sum fr->f_novirsum=f later.
2043 if (vsite && !(fr->haveDirectVirialContributions && !(flags & GMX_FORCE_VIRIAL)))
2045 spread_vsite_f(vsite, as_rvec_array(x.data()), f, fr->fshift, FALSE, nullptr, nrnb,
2046 &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr, wcycle);
2049 if (flags & GMX_FORCE_VIRIAL)
2051 /* Calculation of the virial must be done after vsites! */
2052 calc_virial(0, mdatoms->homenr, as_rvec_array(x.data()), f,
2053 vir_force, graph, box, nrnb, fr, inputrec->ePBC);
2057 if (PAR(cr) && !thisRankHasDuty(cr, DUTY_PME))
2059 /* In case of node-splitting, the PP nodes receive the long-range
2060 * forces, virial and energy from the PME nodes here.
2062 pme_receive_force_ener(cr, &forceWithVirial, enerd, wcycle);
2065 if (bDoForces)
2067 post_process_forces(cr, step, nrnb, wcycle,
2068 top, box, as_rvec_array(x.data()), f, &forceWithVirial,
2069 vir_force, mdatoms, graph, fr, vsite,
2070 flags);
2073 if (flags & GMX_FORCE_ENERGY)
2075 /* Sum the potential energy terms from group contributions */
2076 sum_epot(&(enerd->grpp), enerd->term);
2078 if (!EI_TPI(inputrec->eI))
2080 checkPotentialEnergyValidity(step, *enerd, *inputrec);
2086 void do_force(FILE *fplog,
2087 const t_commrec *cr,
2088 const gmx_multisim_t *ms,
2089 const t_inputrec *inputrec,
2090 gmx::Awh *awh,
2091 gmx_enfrot *enforcedRotation,
2092 int64_t step,
2093 t_nrnb *nrnb,
2094 gmx_wallcycle_t wcycle,
2095 gmx_localtop_t *top,
2096 const gmx_groups_t *groups,
2097 matrix box,
2098 gmx::PaddedArrayRef<gmx::RVec> x,
2099 history_t *hist,
2100 gmx::PaddedArrayRef<gmx::RVec> force,
2101 tensor vir_force,
2102 const t_mdatoms *mdatoms,
2103 gmx_enerdata_t *enerd,
2104 t_fcdata *fcd,
2105 gmx::ArrayRef<real> lambda,
2106 t_graph *graph,
2107 t_forcerec *fr,
2108 const gmx_vsite_t *vsite,
2109 rvec mu_tot,
2110 double t,
2111 const gmx_edsam *ed,
2112 int flags,
2113 DdOpenBalanceRegionBeforeForceComputation ddOpenBalanceRegion,
2114 DdCloseBalanceRegionAfterForceComputation ddCloseBalanceRegion)
2116 /* modify force flag if not doing nonbonded */
2117 if (!fr->bNonbonded)
2119 flags &= ~GMX_FORCE_NONBONDED;
2122 GMX_ASSERT(x.size() >= gmx::paddedRVecVectorSize(fr->natoms_force), "coordinates should be padded");
2123 GMX_ASSERT(force.size() >= gmx::paddedRVecVectorSize(fr->natoms_force), "force should be padded");
2125 switch (inputrec->cutoff_scheme)
2127 case ecutsVERLET:
2128 do_force_cutsVERLET(fplog, cr, ms, inputrec,
2129 awh, enforcedRotation, step, nrnb, wcycle,
2130 top,
2131 groups,
2132 box, x, hist,
2133 force, vir_force,
2134 mdatoms,
2135 enerd, fcd,
2136 lambda.data(), graph,
2137 fr, fr->ic,
2138 vsite, mu_tot,
2139 t, ed,
2140 flags,
2141 ddOpenBalanceRegion,
2142 ddCloseBalanceRegion);
2143 break;
2144 case ecutsGROUP:
2145 do_force_cutsGROUP(fplog, cr, ms, inputrec,
2146 awh, enforcedRotation, step, nrnb, wcycle,
2147 top,
2148 groups,
2149 box, x, hist,
2150 force, vir_force,
2151 mdatoms,
2152 enerd, fcd,
2153 lambda.data(), graph,
2154 fr, vsite, mu_tot,
2155 t, ed,
2156 flags,
2157 ddOpenBalanceRegion,
2158 ddCloseBalanceRegion);
2159 break;
2160 default:
2161 gmx_incons("Invalid cut-off scheme passed!");
2164 /* In case we don't have constraints and are using GPUs, the next balancing
2165 * region starts here.
2166 * Some "special" work at the end of do_force_cuts?, such as vsite spread,
2167 * virial calculation and COM pulling, is thus not included in
2168 * the balance timing, which is ok as most tasks do communication.
2170 if (ddOpenBalanceRegion == DdOpenBalanceRegionBeforeForceComputation::yes)
2172 ddOpenBalanceRegionCpu(cr->dd, DdAllowBalanceRegionReopen::no);
2177 void do_constrain_first(FILE *fplog, gmx::Constraints *constr,
2178 t_inputrec *ir, t_mdatoms *md,
2179 t_state *state)
2181 int i, m, start, end;
2182 int64_t step;
2183 real dt = ir->delta_t;
2184 real dvdl_dum;
2185 rvec *savex;
2187 /* We need to allocate one element extra, since we might use
2188 * (unaligned) 4-wide SIMD loads to access rvec entries.
2190 snew(savex, state->natoms + 1);
2192 start = 0;
2193 end = md->homenr;
2195 if (debug)
2197 fprintf(debug, "vcm: start=%d, homenr=%d, end=%d\n",
2198 start, md->homenr, end);
2200 /* Do a first constraint pass to reset particles... */
2201 step = ir->init_step;
2202 if (fplog)
2204 char buf[STEPSTRSIZE];
2205 fprintf(fplog, "\nConstraining the starting coordinates (step %s)\n",
2206 gmx_step_str(step, buf));
2208 dvdl_dum = 0;
2210 /* constrain the current position */
2211 constr->apply(TRUE, FALSE,
2212 step, 0, 1.0,
2213 as_rvec_array(state->x.data()), as_rvec_array(state->x.data()), nullptr,
2214 state->box,
2215 state->lambda[efptBONDED], &dvdl_dum,
2216 nullptr, nullptr, gmx::ConstraintVariable::Positions);
2217 if (EI_VV(ir->eI))
2219 /* constrain the initial velocity, and save it */
2220 /* this may also be useful if we need the ekin from the half step for velocity verlet */
2221 constr->apply(TRUE, FALSE,
2222 step, 0, 1.0,
2223 as_rvec_array(state->x.data()), as_rvec_array(state->v.data()), as_rvec_array(state->v.data()),
2224 state->box,
2225 state->lambda[efptBONDED], &dvdl_dum,
2226 nullptr, nullptr, gmx::ConstraintVariable::Velocities);
2228 /* constrain the initial velocities at t-dt/2 */
2229 if (EI_STATE_VELOCITY(ir->eI) && ir->eI != eiVV)
2231 for (i = start; (i < end); i++)
2233 for (m = 0; (m < DIM); m++)
2235 /* Reverse the velocity */
2236 state->v[i][m] = -state->v[i][m];
2237 /* Store the position at t-dt in buf */
2238 savex[i][m] = state->x[i][m] + dt*state->v[i][m];
2241 /* Shake the positions at t=-dt with the positions at t=0
2242 * as reference coordinates.
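/* Informal note on the trick used here: savex was built above with the
 * sign-reversed velocities, so savex ~ x(t0) - dt*v(t0), an estimate of
 * the coordinates one step back in time. Constraining savex against the
 * t0 coordinates, while also passing v so that the constraint correction
 * is applied to the velocities, and then re-reversing v below, is intended
 * to give starting velocities at t0-dt/2 that are consistent with the
 * constraints (see the comment above about constraining the initial
 * velocities at t-dt/2).
 */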
2244 if (fplog)
2246 char buf[STEPSTRSIZE];
2247 fprintf(fplog, "\nConstraining the coordinates at t0-dt (step %s)\n",
2248 gmx_step_str(step, buf));
2250 dvdl_dum = 0;
2251 constr->apply(TRUE, FALSE,
2252 step, -1, 1.0,
2253 as_rvec_array(state->x.data()), savex, nullptr,
2254 state->box,
2255 state->lambda[efptBONDED], &dvdl_dum,
2256 as_rvec_array(state->v.data()), nullptr, gmx::ConstraintVariable::Positions);
2258 for (i = start; i < end; i++)
2260 for (m = 0; m < DIM; m++)
2262 /* Re-reverse the velocities */
2263 state->v[i][m] = -state->v[i][m];
2267 sfree(savex);
2271 static void
2272 integrate_table(const real vdwtab[], real scale, int offstart, int rstart, int rend,
2273 double *enerout, double *virout)
2275 double enersum, virsum;
2276 double invscale, invscale2, invscale3;
2277 double r, ea, eb, ec, pa, pb, pc, pd;
2278 double y0, f, g, h;
2279 int ri, offset;
2280 double tabfactor;
2282 invscale = 1.0/scale;
2283 invscale2 = invscale*invscale;
2284 invscale3 = invscale*invscale2;
2286 /* Following summation derived from cubic spline definition,
2287 * Numerical Recipes in C, second edition, pp. 113-116. Exact for
2288 * the cubic spline. We first calculate the negative of the
2289 * energy from rvdw to rvdw_switch, assuming that g(r)=1, and then
2290 * add the more standard, abrupt cutoff correction to that result,
2291 * yielding the long-range correction for a switched function. We
2292 * perform both the pressure and energy loops at the same time for
2293 * simplicity, as the computational cost is low. */
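/* Sketch of what the loop below evaluates (informal, per table bin ri):
 * with t in [0,1] the spline value is y(t) = y0 + f*t + g*t^2 + h*t^3 and
 * r(t) = (ri + t)/scale, so per bin
 *   energy part: integral of r^2*y dr      -> y0*(ea/3 + eb/2 + ec) + f*(ea/4 + eb/3 + ec/2) + ...
 *   virial part: integral of r^3*(dy/dr) dr -> f*(pa/4 + pb/3 + pc/2 + pd) + 2*g*(...) + 3*h*(...)
 * where ea..ec are the coefficients of r^2 dr and pa..pd those of r^3,
 * both expanded in powers of t, as set at the top of the loop body.
 */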
2295 if (offstart == 0)
2297 /* Since the dispersion table has been scaled down a factor
2298 * 6.0 and the repulsion a factor 12.0 to compensate for the
2299 * c6/c12 parameters inside nbfp[] being scaled up (to save
2300 * flops in kernels), we need to correct for this.
2302 tabfactor = 6.0;
2304 else
2306 tabfactor = 12.0;
2309 enersum = 0.0;
2310 virsum = 0.0;
2311 for (ri = rstart; ri < rend; ++ri)
2313 r = ri*invscale;
2314 ea = invscale3;
2315 eb = 2.0*invscale2*r;
2316 ec = invscale*r*r;
2318 pa = invscale3;
2319 pb = 3.0*invscale2*r;
2320 pc = 3.0*invscale*r*r;
2321 pd = r*r*r;
2323 /* this "8" comes from the packing of the vdwtab array - perhaps
2324 it should be a named constant? */
2326 offset = 8*ri + offstart;
2327 y0 = vdwtab[offset];
2328 f = vdwtab[offset+1];
2329 g = vdwtab[offset+2];
2330 h = vdwtab[offset+3];
2332 enersum += y0*(ea/3 + eb/2 + ec) + f*(ea/4 + eb/3 + ec/2) + g*(ea/5 + eb/4 + ec/3) + h*(ea/6 + eb/5 + ec/4);
2333 virsum += f*(pa/4 + pb/3 + pc/2 + pd) + 2*g*(pa/5 + pb/4 + pc/3 + pd/2) + 3*h*(pa/6 + pb/5 + pc/4 + pd/3);
2335 *enerout = 4.0*M_PI*enersum*tabfactor;
2336 *virout = 4.0*M_PI*virsum*tabfactor;
2339 void calc_enervirdiff(FILE *fplog, int eDispCorr, t_forcerec *fr)
2341 double eners[2], virs[2], enersum, virsum;
2342 double r0, rc3, rc9;
2343 int ri0, ri1, i;
2344 real scale, *vdwtab;
2346 fr->enershiftsix = 0;
2347 fr->enershifttwelve = 0;
2348 fr->enerdiffsix = 0;
2349 fr->enerdifftwelve = 0;
2350 fr->virdiffsix = 0;
2351 fr->virdifftwelve = 0;
2353 const interaction_const_t *ic = fr->ic;
2355 if (eDispCorr != edispcNO)
2357 for (i = 0; i < 2; i++)
2359 eners[i] = 0;
2360 virs[i] = 0;
2362 if ((ic->vdw_modifier == eintmodPOTSHIFT) ||
2363 (ic->vdw_modifier == eintmodPOTSWITCH) ||
2364 (ic->vdw_modifier == eintmodFORCESWITCH) ||
2365 (ic->vdwtype == evdwSHIFT) ||
2366 (ic->vdwtype == evdwSWITCH))
2368 if (((ic->vdw_modifier == eintmodPOTSWITCH) ||
2369 (ic->vdw_modifier == eintmodFORCESWITCH) ||
2370 (ic->vdwtype == evdwSWITCH)) && ic->rvdw_switch == 0)
2372 gmx_fatal(FARGS,
2373 "With dispersion correction rvdw-switch can not be zero "
2374 "for vdw-type = %s", evdw_names[ic->vdwtype]);
2377 /* TODO This code depends on the logic in tables.c that
2378 constructs the table layout, which should be made
2379 explicit in future cleanup. */
2380 GMX_ASSERT(fr->dispersionCorrectionTable->interaction == GMX_TABLE_INTERACTION_VDWREP_VDWDISP,
2381 "Dispersion-correction code needs a table with both repulsion and dispersion terms");
2382 scale = fr->dispersionCorrectionTable->scale;
2383 vdwtab = fr->dispersionCorrectionTable->data;
2385 /* Round the cut-offs to exact table values for precision */
2386 ri0 = static_cast<int>(std::floor(ic->rvdw_switch*scale));
2387 ri1 = static_cast<int>(std::ceil(ic->rvdw*scale));
2389 /* The code below has some support for handling force-switching, i.e.
2390 * when the force (instead of potential) is switched over a limited
2391 * region. This leads to a constant shift in the potential inside the
2392 * switching region, which we can handle by adding a constant energy
2393 * term in the force-switch case just like when we do potential-shift.
2395 * For now this is not enabled, but to keep the functionality in the
2396 * code we check separately for switch and shift. When we do force-switch
2397 * the shifting point is rvdw_switch, while it is the cutoff when we
2398 * have a classical potential-shift.
2400 * For a pure potential-shift the potential has a constant shift
2401 * all the way out to the cutoff, and that is it. For other forms
2402 * we need to calculate the constant shift up to the point where we
2403 * start modifying the potential.
2405 ri0 = (ic->vdw_modifier == eintmodPOTSHIFT) ? ri1 : ri0;
2407 r0 = ri0/scale;
2408 rc3 = r0*r0*r0;
2409 rc9 = rc3*rc3*rc3;
2411 if ((ic->vdw_modifier == eintmodFORCESWITCH) ||
2412 (ic->vdwtype == evdwSHIFT))
2414 /* Determine the constant energy shift below rvdw_switch.
2415 * Table has a scale factor since we have scaled it down to compensate
2416 * for scaling-up c6/c12 with the derivative factors to save flops in analytical kernels.
2418 fr->enershiftsix = (real)(-1.0/(rc3*rc3)) - 6.0*vdwtab[8*ri0];
2419 fr->enershifttwelve = (real)( 1.0/(rc9*rc3)) - 12.0*vdwtab[8*ri0 + 4];
2421 else if (ic->vdw_modifier == eintmodPOTSHIFT)
2423 fr->enershiftsix = (real)(-1.0/(rc3*rc3));
2424 fr->enershifttwelve = (real)( 1.0/(rc9*rc3));
2427 /* Add the constant part from 0 to rvdw_switch.
2428 * This integration from 0 to rvdw_switch overcounts the number
2429 * of interactions by 1, as it also counts the self interaction.
2430 * We will correct for this later.
2432 eners[0] += 4.0*M_PI*fr->enershiftsix*rc3/3.0;
2433 eners[1] += 4.0*M_PI*fr->enershifttwelve*rc3/3.0;
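/* These two terms are simply the constant shift integrated over the sphere
 * of radius r0: integral from 0 to r0 of 4*pi*r^2*shift dr = (4*pi/3)*shift*r0^3.
 */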
2435 /* Calculate the contribution in the range [r0,r1] where we
2436 * modify the potential. For a pure potential-shift modifier we will
2437 * have ri0==ri1, and there will not be any contribution here.
2439 for (i = 0; i < 2; i++)
2441 enersum = 0;
2442 virsum = 0;
2443 integrate_table(vdwtab, scale, (i == 0 ? 0 : 4), ri0, ri1, &enersum, &virsum);
2444 eners[i] -= enersum;
2445 virs[i] -= virsum;
2448 /* Alright: Above we compensated by REMOVING the parts outside r0
2449 * corresponding to the ideal VdW 1/r6 and 1/r12 potentials.
2451 * Regardless of whether r0 is the point where we start switching,
2452 * or the cutoff where we calculated the constant shift, we include
2453 * all the parts we are missing out to infinity from r0 by
2454 * calculating the analytical dispersion correction.
2456 eners[0] += -4.0*M_PI/(3.0*rc3);
2457 eners[1] += 4.0*M_PI/(9.0*rc9);
2458 virs[0] += 8.0*M_PI/rc3;
2459 virs[1] += -16.0*M_PI/(3.0*rc9);
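/* For reference (informal note): these four constants match the analytical
 * tail integrals from r0 to infinity for plain 1/r^6 and 1/r^12 potentials:
 *   energy:  -integral 4*pi*r^2 * r^-6  dr = -4*pi/(3*r0^3)
 *            +integral 4*pi*r^2 * r^-12 dr = +4*pi/(9*r0^9)
 *   virial:   integral 4*pi*r^2 * r*d(-r^-6)/dr  dr = +8*pi/r0^3
 *             integral 4*pi*r^2 * r*d(r^-12)/dr  dr = -16*pi/(3*r0^9)
 */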
2461 else if (ic->vdwtype == evdwCUT ||
2462 EVDW_PME(ic->vdwtype) ||
2463 ic->vdwtype == evdwUSER)
2465 if (ic->vdwtype == evdwUSER && fplog)
2467 fprintf(fplog,
2468 "WARNING: using dispersion correction with user tables\n");
2471 /* Note that with LJ-PME, the dispersion correction is multiplied
2472 * by the difference between the actual C6 and the value of C6
2473 * that would produce the combination rule.
2474 * This means the normal energy and virial difference formulas
2475 * can be used here.
2478 rc3 = ic->rvdw*ic->rvdw*ic->rvdw;
2479 rc9 = rc3*rc3*rc3;
2480 /* Contribution beyond the cut-off */
2481 eners[0] += -4.0*M_PI/(3.0*rc3);
2482 eners[1] += 4.0*M_PI/(9.0*rc9);
2483 if (ic->vdw_modifier == eintmodPOTSHIFT)
2485 /* Contribution within the cut-off */
2486 eners[0] += -4.0*M_PI/(3.0*rc3);
2487 eners[1] += 4.0*M_PI/(3.0*rc9);
2489 /* Contribution beyond the cut-off */
2490 virs[0] += 8.0*M_PI/rc3;
2491 virs[1] += -16.0*M_PI/(3.0*rc9);
2493 else
2495 gmx_fatal(FARGS,
2496 "Dispersion correction is not implemented for vdw-type = %s",
2497 evdw_names[ic->vdwtype]);
2500 /* When we deprecate the group kernels the code below can go too */
2501 if (ic->vdwtype == evdwPME && fr->cutoff_scheme == ecutsGROUP)
2503 /* Calculate self-interaction coefficient (assuming that
2504 * the reciprocal-space contribution is constant in the
2505 * region that contributes to the self-interaction).
2507 fr->enershiftsix = gmx::power6(ic->ewaldcoeff_lj) / 6.0;
2509 eners[0] += -gmx::power3(std::sqrt(M_PI)*ic->ewaldcoeff_lj)/3.0;
2510 virs[0] += gmx::power3(std::sqrt(M_PI)*ic->ewaldcoeff_lj);
2513 fr->enerdiffsix = eners[0];
2514 fr->enerdifftwelve = eners[1];
2515 /* The 0.5 is due to the Gromacs definition of the virial */
2516 fr->virdiffsix = 0.5*virs[0];
2517 fr->virdifftwelve = 0.5*virs[1];
2521 void calc_dispcorr(t_inputrec *ir, t_forcerec *fr,
2522 matrix box, real lambda, tensor pres, tensor virial,
2523 real *prescorr, real *enercorr, real *dvdlcorr)
2525 gmx_bool bCorrAll, bCorrPres;
2526 real dvdlambda, invvol, dens, ninter, avcsix, avctwelve, enerdiff, svir = 0, spres = 0;
2527 int m;
2529 *prescorr = 0;
2530 *enercorr = 0;
2531 *dvdlcorr = 0;
2533 clear_mat(virial);
2534 clear_mat(pres);
2536 if (ir->eDispCorr != edispcNO)
2538 bCorrAll = (ir->eDispCorr == edispcAllEner ||
2539 ir->eDispCorr == edispcAllEnerPres);
2540 bCorrPres = (ir->eDispCorr == edispcEnerPres ||
2541 ir->eDispCorr == edispcAllEnerPres);
2543 invvol = 1/det(box);
2544 if (fr->n_tpi)
2546 /* Only correct for the interactions with the inserted molecule */
2547 dens = (fr->numAtomsForDispersionCorrection - fr->n_tpi)*invvol;
2548 ninter = fr->n_tpi;
2550 else
2552 dens = fr->numAtomsForDispersionCorrection*invvol;
2553 ninter = 0.5*fr->numAtomsForDispersionCorrection;
2556 if (ir->efep == efepNO)
2558 avcsix = fr->avcsix[0];
2559 avctwelve = fr->avctwelve[0];
2561 else
2563 avcsix = (1 - lambda)*fr->avcsix[0] + lambda*fr->avcsix[1];
2564 avctwelve = (1 - lambda)*fr->avctwelve[0] + lambda*fr->avctwelve[1];
2567 enerdiff = ninter*(dens*fr->enerdiffsix - fr->enershiftsix);
2568 *enercorr += avcsix*enerdiff;
2569 dvdlambda = 0.0;
2570 if (ir->efep != efepNO)
2572 dvdlambda += (fr->avcsix[1] - fr->avcsix[0])*enerdiff;
2574 if (bCorrAll)
2576 enerdiff = ninter*(dens*fr->enerdifftwelve - fr->enershifttwelve);
2577 *enercorr += avctwelve*enerdiff;
2578 if (fr->efep != efepNO)
2580 dvdlambda += (fr->avctwelve[1] - fr->avctwelve[0])*enerdiff;
2584 if (bCorrPres)
2586 svir = ninter*dens*avcsix*fr->virdiffsix/3.0;
2587 if (ir->eDispCorr == edispcAllEnerPres)
2589 svir += ninter*dens*avctwelve*fr->virdifftwelve/3.0;
2591 /* The factor 2 is because of the Gromacs virial definition */
2592 spres = -2.0*invvol*svir*PRESFAC;
2594 for (m = 0; m < DIM; m++)
2596 virial[m][m] += svir;
2597 pres[m][m] += spres;
2599 *prescorr += spres;
2602 /* Can't currently control when this prints; for now, just print when debugging */
2603 if (debug)
2605 if (bCorrAll)
2607 fprintf(debug, "Long Range LJ corr.: <C6> %10.4e, <C12> %10.4e\n",
2608 avcsix, avctwelve);
2610 if (bCorrPres)
2612 fprintf(debug,
2613 "Long Range LJ corr.: Epot %10g, Pres: %10g, Vir: %10g\n",
2614 *enercorr, spres, svir);
2616 else
2618 fprintf(debug, "Long Range LJ corr.: Epot %10g\n", *enercorr);
2622 if (fr->efep != efepNO)
2624 *dvdlcorr += dvdlambda;
2629 static void low_do_pbc_mtop(FILE *fplog, int ePBC, const matrix box,
2630 const gmx_mtop_t *mtop, rvec x[],
2631 gmx_bool bFirst)
2633 t_graph *graph;
2634 int as, mol;
2636 if (bFirst && fplog)
2638 fprintf(fplog, "Removing pbc first time\n");
2641 snew(graph, 1);
2642 as = 0;
2643 for (const gmx_molblock_t &molb : mtop->molblock)
2645 const gmx_moltype_t &moltype = mtop->moltype[molb.type];
2646 if (moltype.atoms.nr == 1 ||
2647 (!bFirst && moltype.cgs.nr == 1))
2649 /* Just one atom or charge group in the molecule, no PBC required */
2650 as += molb.nmol*moltype.atoms.nr;
2652 else
2654 /* Pass nullptr instead of fplog to avoid graph prints for each molecule type */
2655 mk_graph_ilist(nullptr, moltype.ilist,
2656 0, moltype.atoms.nr, FALSE, FALSE, graph);
2658 for (mol = 0; mol < molb.nmol; mol++)
2660 mk_mshift(fplog, graph, ePBC, box, x+as);
2662 shift_self(graph, box, x+as);
2663 /* The molecule is whole now.
2664 * We don't need the second mk_mshift call as in do_pbc_first,
2665 * since we no longer need this graph.
2668 as += moltype.atoms.nr;
2670 done_graph(graph);
2673 sfree(graph);
2676 void do_pbc_first_mtop(FILE *fplog, int ePBC, const matrix box,
2677 const gmx_mtop_t *mtop, rvec x[])
2679 low_do_pbc_mtop(fplog, ePBC, box, mtop, x, TRUE);
2682 void do_pbc_mtop(FILE *fplog, int ePBC, const matrix box,
2683 const gmx_mtop_t *mtop, rvec x[])
2685 low_do_pbc_mtop(fplog, ePBC, box, mtop, x, FALSE);
2688 void put_atoms_in_box_omp(int ePBC, const matrix box, gmx::ArrayRef<gmx::RVec> x)
2690 int t, nth;
2691 nth = gmx_omp_nthreads_get(emntDefault);
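/* Each thread t handles the contiguous slice [natoms*t/nth, natoms*(t+1)/nth);
 * with integer division these slices cover [0, natoms) exactly, with no gaps
 * or overlap, and any remainder is spread over the threads.
 */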
2693 #pragma omp parallel for num_threads(nth) schedule(static)
2694 for (t = 0; t < nth; t++)
2698 size_t natoms = x.size();
2699 size_t offset = (natoms*t )/nth;
2700 size_t len = (natoms*(t + 1))/nth - offset;
2701 put_atoms_in_box(ePBC, box, x.subArray(offset, len));
2703 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
2707 // TODO This can be cleaned up a lot and moved back to runner.cpp
2708 void finish_run(FILE *fplog, const gmx::MDLogger &mdlog, const t_commrec *cr,
2709 const t_inputrec *inputrec,
2710 t_nrnb nrnb[], gmx_wallcycle_t wcycle,
2711 gmx_walltime_accounting_t walltime_accounting,
2712 nonbonded_verlet_t *nbv,
2713 gmx_pme_t *pme,
2714 gmx_bool bWriteStat)
2716 t_nrnb *nrnb_tot = nullptr;
2717 double delta_t = 0;
2718 double nbfs = 0, mflop = 0;
2719 double elapsed_time,
2720 elapsed_time_over_all_ranks,
2721 elapsed_time_over_all_threads,
2722 elapsed_time_over_all_threads_over_all_ranks;
2723 /* Control whether it is valid to print a report. Only the
2724 simulation master may print, but it should not do so if the run
2725 terminated e.g. before a scheduled reset step. This is
2726 complicated by the fact that PME ranks are unaware of the
2727 reason why they were sent a pmerecvqxFINISH. To avoid
2728 communication deadlocks, we always do the communication for the
2729 report, even if we've decided not to write the report, because
2730 how long it takes to finish the run is not important when we've
2731 decided not to report on the simulation performance.
2733 Further, we only report performance for dynamical integrators,
2734 because those are the only ones for which we plan to
2735 consider doing any optimizations. */
2736 bool printReport = EI_DYNAMICS(inputrec->eI) && SIMMASTER(cr);
2738 if (printReport && !walltime_accounting_get_valid_finish(walltime_accounting))
2740 GMX_LOG(mdlog.warning).asParagraph().appendText("Simulation ended prematurely, no performance report will be written.");
2741 printReport = false;
2744 if (cr->nnodes > 1)
2746 snew(nrnb_tot, 1);
2747 #if GMX_MPI
2748 MPI_Allreduce(nrnb->n, nrnb_tot->n, eNRNB, MPI_DOUBLE, MPI_SUM,
2749 cr->mpi_comm_mysim);
2750 #endif
2752 else
2754 nrnb_tot = nrnb;
2757 elapsed_time = walltime_accounting_get_elapsed_time(walltime_accounting);
2758 elapsed_time_over_all_ranks = elapsed_time;
2759 elapsed_time_over_all_threads = walltime_accounting_get_elapsed_time_over_all_threads(walltime_accounting);
2760 elapsed_time_over_all_threads_over_all_ranks = elapsed_time_over_all_threads;
2761 #if GMX_MPI
2762 if (cr->nnodes > 1)
2764 /* reduce elapsed_time over all MPI ranks in the current simulation */
2765 MPI_Allreduce(&elapsed_time,
2766 &elapsed_time_over_all_ranks,
2767 1, MPI_DOUBLE, MPI_SUM,
2768 cr->mpi_comm_mysim);
2769 elapsed_time_over_all_ranks /= cr->nnodes;
2770 /* Reduce elapsed_time_over_all_threads over all MPI ranks in the
2771 * current simulation. */
2772 MPI_Allreduce(&elapsed_time_over_all_threads,
2773 &elapsed_time_over_all_threads_over_all_ranks,
2774 1, MPI_DOUBLE, MPI_SUM,
2775 cr->mpi_comm_mysim);
2777 #endif
2779 if (printReport)
2781 print_flop(fplog, nrnb_tot, &nbfs, &mflop);
2783 if (cr->nnodes > 1)
2785 sfree(nrnb_tot);
2788 if (thisRankHasDuty(cr, DUTY_PP) && DOMAINDECOMP(cr))
2790 print_dd_statistics(cr, inputrec, fplog);
2793 /* TODO Move the responsibility for any scaling by thread counts
2794 * to the code that handled the thread region, so that there's a
2795 * mechanism to keep cycle counting working during the transition
2796 * to task parallelism. */
2797 int nthreads_pp = gmx_omp_nthreads_get(emntNonbonded);
2798 int nthreads_pme = gmx_omp_nthreads_get(emntPME);
2799 wallcycle_scale_by_num_threads(wcycle, thisRankHasDuty(cr, DUTY_PME) && !thisRankHasDuty(cr, DUTY_PP), nthreads_pp, nthreads_pme);
2800 auto cycle_sum(wallcycle_sum(cr, wcycle));
2802 if (printReport)
2804 auto nbnxn_gpu_timings = use_GPU(nbv) ? nbnxn_gpu_get_timings(nbv->gpu_nbv) : nullptr;
2805 gmx_wallclock_gpu_pme_t pme_gpu_timings = {};
2806 if (pme_gpu_task_enabled(pme))
2808 pme_gpu_get_timings(pme, &pme_gpu_timings);
2810 wallcycle_print(fplog, mdlog, cr->nnodes, cr->npmenodes, nthreads_pp, nthreads_pme,
2811 elapsed_time_over_all_ranks,
2812 wcycle, cycle_sum,
2813 nbnxn_gpu_timings,
2814 &pme_gpu_timings);
2816 if (EI_DYNAMICS(inputrec->eI))
2818 delta_t = inputrec->delta_t;
2821 if (fplog)
2823 print_perf(fplog, elapsed_time_over_all_threads_over_all_ranks,
2824 elapsed_time_over_all_ranks,
2825 walltime_accounting_get_nsteps_done(walltime_accounting),
2826 delta_t, nbfs, mflop);
2828 if (bWriteStat)
2830 print_perf(stderr, elapsed_time_over_all_threads_over_all_ranks,
2831 elapsed_time_over_all_ranks,
2832 walltime_accounting_get_nsteps_done(walltime_accounting),
2833 delta_t, nbfs, mflop);
2838 extern void initialize_lambdas(FILE *fplog, t_inputrec *ir, int *fep_state, gmx::ArrayRef<real> lambda, double *lam0)
2840 /* this function works, but could probably use a logic rewrite to keep all the different
2841 types of efep straight. */
2843 if ((ir->efep == efepNO) && (ir->bSimTemp == FALSE))
2845 return;
2848 t_lambda *fep = ir->fepvals;
2849 *fep_state = fep->init_fep_state; /* this might overwrite the value from the
2850 checkpoint if a checkpoint is set -- a kludge
2851 is in place for now to prevent this. */
2853 for (int i = 0; i < efptNR; i++)
2855 /* overwrite lambda state with init_lambda for now for backwards compatibility */
2856 if (fep->init_lambda >= 0) /* if it's -1, it was never initialized */
2858 lambda[i] = fep->init_lambda;
2859 if (lam0)
2861 lam0[i] = lambda[i];
2864 else
2866 lambda[i] = fep->all_lambda[i][*fep_state];
2867 if (lam0)
2869 lam0[i] = lambda[i];
2873 if (ir->bSimTemp)
2875 /* need to rescale control temperatures to match current state */
2876 for (int i = 0; i < ir->opts.ngtc; i++)
2878 if (ir->opts.ref_t[i] > 0)
2880 ir->opts.ref_t[i] = ir->simtempvals->temperatures[*fep_state];
2885 /* Send to the log the information on the current lambdas */
2886 if (fplog != nullptr)
2888 fprintf(fplog, "Initial vector of lambda components:[ ");
2889 for (const auto &l : lambda)
2891 fprintf(fplog, "%10.4f ", l);
2893 fprintf(fplog, "]\n");
2898 void init_md(FILE *fplog,
2899 const t_commrec *cr, gmx::IMDOutputProvider *outputProvider,
2900 t_inputrec *ir, const gmx_output_env_t *oenv,
2901 const MdrunOptions &mdrunOptions,
2902 double *t, double *t0,
2903 t_state *globalState, double *lam0,
2904 t_nrnb *nrnb, gmx_mtop_t *mtop,
2905 gmx_update_t **upd,
2906 gmx::BoxDeformation *deform,
2907 int nfile, const t_filenm fnm[],
2908 gmx_mdoutf_t *outf, t_mdebin **mdebin,
2909 tensor force_vir, tensor shake_vir, rvec mu_tot,
2910 gmx_bool *bSimAnn, t_vcm **vcm,
2911 gmx_wallcycle_t wcycle)
2913 int i;
2915 /* Initial values */
2916 *t = *t0 = ir->init_t;
2918 *bSimAnn = FALSE;
2919 for (i = 0; i < ir->opts.ngtc; i++)
2921 /* set bSimAnn if any group is being annealed */
2922 if (ir->opts.annealing[i] != eannNO)
2924 *bSimAnn = TRUE;
2928 /* Initialize lambda variables */
2929 /* TODO: Clean up initialization of fep_state and lambda in t_state.
2930 * We currently need to call initialize_lambdas on non-master ranks
2931 * to initialize lam0.
2933 if (MASTER(cr))
2935 initialize_lambdas(fplog, ir, &globalState->fep_state, globalState->lambda, lam0);
2937 else
2939 int tmpFepState;
2940 std::array<real, efptNR> tmpLambda;
2941 initialize_lambdas(fplog, ir, &tmpFepState, tmpLambda, lam0);
2944 // TODO upd is never NULL in practice, but the analysers don't know that
2945 if (upd)
2947 *upd = init_update(ir, deform);
2949 if (*bSimAnn)
2951 update_annealing_target_temp(ir, ir->init_t, upd ? *upd : nullptr);
2954 if (vcm != nullptr)
2956 *vcm = init_vcm(fplog, &mtop->groups, ir);
2959 if (EI_DYNAMICS(ir->eI) && !mdrunOptions.continuationOptions.appendFiles)
2961 if (ir->etc == etcBERENDSEN)
2963 please_cite(fplog, "Berendsen84a");
2965 if (ir->etc == etcVRESCALE)
2967 please_cite(fplog, "Bussi2007a");
2969 if (ir->eI == eiSD1)
2971 please_cite(fplog, "Goga2012");
2974 init_nrnb(nrnb);
2976 if (nfile != -1)
2978 *outf = init_mdoutf(fplog, nfile, fnm, mdrunOptions, cr, outputProvider, ir, mtop, oenv, wcycle);
2980 *mdebin = init_mdebin(mdrunOptions.continuationOptions.appendFiles ? nullptr : mdoutf_get_fp_ene(*outf),
2981 mtop, ir, mdoutf_get_fp_dhdl(*outf));
2984 /* Initialize variables */
2985 clear_mat(force_vir);
2986 clear_mat(shake_vir);
2987 clear_rvec(mu_tot);