2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5 * Copyright (c) 2001-2004, The GROMACS development team.
6 * Copyright (c) 2013,2014,2015,2016,2017,2018, by the GROMACS development team, led by
7 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
8 * and including many others, as listed in the AUTHORS file in the
9 * top-level source directory and at http://www.gromacs.org.
11 * GROMACS is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public License
13 * as published by the Free Software Foundation; either version 2.1
14 * of the License, or (at your option) any later version.
16 * GROMACS is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * Lesser General Public License for more details.
21 * You should have received a copy of the GNU Lesser General Public
22 * License along with GROMACS; if not, see
23 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
24 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
26 * If you want to redistribute modifications to GROMACS, please
27 * consider that scientific software is very special. Version
28 * control is crucial - bugs must be traceable. We will be happy to
29 * consider code for inclusion in the official distribution, but
30 * derived work must not be called official GROMACS. Details are found
31 * in the README & COPYING files - if they are missing, get the
32 * official version at http://www.gromacs.org.
34 * To help us fund GROMACS development, we humbly ask that you cite
35 * the research papers on the package. Check out http://www.gromacs.org.
50 #include "gromacs/awh/awh.h"
51 #include "gromacs/domdec/dlbtiming.h"
52 #include "gromacs/domdec/domdec_struct.h"
53 #include "gromacs/domdec/partition.h"
54 #include "gromacs/essentialdynamics/edsam.h"
55 #include "gromacs/ewald/pme.h"
56 #include "gromacs/gmxlib/chargegroup.h"
57 #include "gromacs/gmxlib/network.h"
58 #include "gromacs/gmxlib/nrnb.h"
59 #include "gromacs/gmxlib/nonbonded/nb_free_energy.h"
60 #include "gromacs/gmxlib/nonbonded/nb_kernel.h"
61 #include "gromacs/gmxlib/nonbonded/nonbonded.h"
62 #include "gromacs/gpu_utils/gpu_utils.h"
63 #include "gromacs/imd/imd.h"
64 #include "gromacs/listed-forces/bonded.h"
65 #include "gromacs/listed-forces/disre.h"
66 #include "gromacs/listed-forces/manage-threading.h"
67 #include "gromacs/listed-forces/orires.h"
68 #include "gromacs/math/functions.h"
69 #include "gromacs/math/units.h"
70 #include "gromacs/math/vec.h"
71 #include "gromacs/math/vecdump.h"
72 #include "gromacs/mdlib/calcmu.h"
73 #include "gromacs/mdlib/calcvir.h"
74 #include "gromacs/mdlib/constr.h"
75 #include "gromacs/mdlib/force.h"
76 #include "gromacs/mdlib/forcerec.h"
77 #include "gromacs/mdlib/gmx_omp_nthreads.h"
78 #include "gromacs/mdlib/mdrun.h"
79 #include "gromacs/mdlib/nb_verlet.h"
80 #include "gromacs/mdlib/nbnxn_atomdata.h"
81 #include "gromacs/mdlib/nbnxn_gpu_data_mgmt.h"
82 #include "gromacs/mdlib/nbnxn_grid.h"
83 #include "gromacs/mdlib/nbnxn_search.h"
84 #include "gromacs/mdlib/qmmm.h"
85 #include "gromacs/mdlib/update.h"
86 #include "gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_gpu_ref.h"
87 #include "gromacs/mdtypes/commrec.h"
88 #include "gromacs/mdtypes/forceoutput.h"
89 #include "gromacs/mdtypes/iforceprovider.h"
90 #include "gromacs/mdtypes/inputrec.h"
91 #include "gromacs/mdtypes/md_enums.h"
92 #include "gromacs/mdtypes/state.h"
93 #include "gromacs/pbcutil/ishift.h"
94 #include "gromacs/pbcutil/mshift.h"
95 #include "gromacs/pbcutil/pbc.h"
96 #include "gromacs/pulling/pull.h"
97 #include "gromacs/pulling/pull_rotation.h"
98 #include "gromacs/timing/cyclecounter.h"
99 #include "gromacs/timing/gpu_timing.h"
100 #include "gromacs/timing/wallcycle.h"
101 #include "gromacs/timing/wallcyclereporting.h"
102 #include "gromacs/timing/walltime_accounting.h"
103 #include "gromacs/topology/topology.h"
104 #include "gromacs/utility/arrayref.h"
105 #include "gromacs/utility/basedefinitions.h"
106 #include "gromacs/utility/cstringutil.h"
107 #include "gromacs/utility/exceptions.h"
108 #include "gromacs/utility/fatalerror.h"
109 #include "gromacs/utility/gmxassert.h"
110 #include "gromacs/utility/gmxmpi.h"
111 #include "gromacs/utility/logger.h"
112 #include "gromacs/utility/pleasecite.h"
113 #include "gromacs/utility/smalloc.h"
114 #include "gromacs/utility/strconvert.h"
115 #include "gromacs/utility/sysinfo.h"
117 #include "nbnxn_gpu.h"
118 #include "nbnxn_kernels/nbnxn_kernel_cpu.h"
119 #include "nbnxn_kernels/nbnxn_kernel_prune.h"
121 // TODO: this environment variable allows us to verify before release
122 // that on less common architectures the total cost of polling is not larger than
123 // a blocking wait (so polling does not introduce overhead when the static
124 // PME-first ordering would suffice).
125 static const bool c_disableAlternatingWait
= (getenv("GMX_DISABLE_ALTERNATING_GPU_WAIT") != nullptr);
128 void print_time(FILE *out
,
129 gmx_walltime_accounting_t walltime_accounting
,
131 const t_inputrec
*ir
,
135 double dt
, elapsed_seconds
, time_per_step
;
144 fputs(gmx::int64ToString(step
).c_str(), out
);
147 if ((step
>= ir
->nstlist
))
149 double seconds_since_epoch
= gmx_gettime();
150 elapsed_seconds
= seconds_since_epoch
- walltime_accounting_get_start_time_stamp(walltime_accounting
);
151 time_per_step
= elapsed_seconds
/(step
- ir
->init_step
+ 1);
152 dt
= (ir
->nsteps
+ ir
->init_step
- step
) * time_per_step
;
158 finish
= static_cast<time_t>(seconds_since_epoch
+ dt
);
159 auto timebuf
= gmx_ctime_r(&finish
);
160 fputs(", will finish ", out
);
161 fputs(timebuf
.c_str(), out
);
165 fprintf(out
, ", remaining wall clock time: %5d s ", static_cast<int>(dt
));
170 fprintf(out
, " performance: %.1f ns/day ",
171 ir
->delta_t
/1000*24*60*60/time_per_step
);
180 GMX_UNUSED_VALUE(cr
);
186 void print_date_and_time(FILE *fplog
, int nodeid
, const char *title
,
194 time_t temp_time
= static_cast<time_t>(the_time
);
196 auto timebuf
= gmx_ctime_r(&temp_time
);
197 // Retain only the characters before the first space
198 timebuf
.erase(std::find(timebuf
.begin(), timebuf
.end(), ' '), timebuf
.end());
200 fprintf(fplog
, "%s on rank %d %s\n", title
, nodeid
, timebuf
.c_str());
203 void print_start(FILE *fplog
, const t_commrec
*cr
,
204 gmx_walltime_accounting_t walltime_accounting
,
209 sprintf(buf
, "Started %s", name
);
210 print_date_and_time(fplog
, cr
->nodeid
, buf
,
211 walltime_accounting_get_start_time_stamp(walltime_accounting
));
214 static void sum_forces(rvec f
[], gmx::ArrayRef
<const gmx::RVec
> forceToAdd
)
216 const int end
= forceToAdd
.size();
218 int gmx_unused nt
= gmx_omp_nthreads_get(emntDefault
);
219 #pragma omp parallel for num_threads(nt) schedule(static)
220 for (int i
= 0; i
< end
; i
++)
222 rvec_inc(f
[i
], forceToAdd
[i
]);
226 static void pme_gpu_reduce_outputs(gmx_wallcycle_t wcycle
,
227 gmx::ForceWithVirial
*forceWithVirial
,
228 gmx::ArrayRef
<const gmx::RVec
> pmeForces
,
229 gmx_enerdata_t
*enerd
,
233 wallcycle_start(wcycle
, ewcPME_GPU_F_REDUCTION
);
234 GMX_ASSERT(forceWithVirial
, "Invalid force pointer");
235 forceWithVirial
->addVirialContribution(vir_Q
);
236 enerd
->term
[F_COUL_RECIP
] += Vlr_q
;
237 sum_forces(as_rvec_array(forceWithVirial
->force_
.data()), pmeForces
);
238 wallcycle_stop(wcycle
, ewcPME_GPU_F_REDUCTION
);
241 static void calc_virial(int start
, int homenr
, const rvec x
[], const rvec f
[],
242 tensor vir_part
, const t_graph
*graph
, const matrix box
,
243 t_nrnb
*nrnb
, const t_forcerec
*fr
, int ePBC
)
245 /* The short-range virial from surrounding boxes */
246 calc_vir(SHIFTS
, fr
->shift_vec
, fr
->fshift
, vir_part
, ePBC
== epbcSCREW
, box
);
247 inc_nrnb(nrnb
, eNR_VIRIAL
, SHIFTS
);
249 /* Calculate partial virial, for local atoms only, based on short range.
250 * Total virial is computed in global_stat, called from do_md
252 f_calc_vir(start
, start
+homenr
, x
, f
, vir_part
, graph
, box
);
253 inc_nrnb(nrnb
, eNR_VIRIAL
, homenr
);
257 pr_rvecs(debug
, 0, "vir_part", vir_part
, DIM
);
261 static void pull_potential_wrapper(const t_commrec
*cr
,
262 const t_inputrec
*ir
,
263 const matrix box
, gmx::ArrayRef
<const gmx::RVec
> x
,
264 gmx::ForceWithVirial
*force
,
265 const t_mdatoms
*mdatoms
,
266 gmx_enerdata_t
*enerd
,
269 gmx_wallcycle_t wcycle
)
274 /* Calculate the center of mass forces, this requires communication,
275 * which is why pull_potential is called close to other communication.
277 wallcycle_start(wcycle
, ewcPULLPOT
);
278 set_pbc(&pbc
, ir
->ePBC
, box
);
280 enerd
->term
[F_COM_PULL
] +=
281 pull_potential(ir
->pull_work
, mdatoms
, &pbc
,
282 cr
, t
, lambda
[efptRESTRAINT
], as_rvec_array(x
.data()), force
, &dvdl
);
283 enerd
->dvdl_lin
[efptRESTRAINT
] += dvdl
;
284 wallcycle_stop(wcycle
, ewcPULLPOT
);
287 static void pme_receive_force_ener(const t_commrec
*cr
,
288 gmx::ForceWithVirial
*forceWithVirial
,
289 gmx_enerdata_t
*enerd
,
290 gmx_wallcycle_t wcycle
)
292 real e_q
, e_lj
, dvdl_q
, dvdl_lj
;
293 float cycles_ppdpme
, cycles_seppme
;
295 cycles_ppdpme
= wallcycle_stop(wcycle
, ewcPPDURINGPME
);
296 dd_cycles_add(cr
->dd
, cycles_ppdpme
, ddCyclPPduringPME
);
298 /* In case of node-splitting, the PP nodes receive the long-range
299 * forces, virial and energy from the PME nodes here.
301 wallcycle_start(wcycle
, ewcPP_PMEWAITRECVF
);
304 gmx_pme_receive_f(cr
, forceWithVirial
, &e_q
, &e_lj
, &dvdl_q
, &dvdl_lj
,
306 enerd
->term
[F_COUL_RECIP
] += e_q
;
307 enerd
->term
[F_LJ_RECIP
] += e_lj
;
308 enerd
->dvdl_lin
[efptCOUL
] += dvdl_q
;
309 enerd
->dvdl_lin
[efptVDW
] += dvdl_lj
;
313 dd_cycles_add(cr
->dd
, cycles_seppme
, ddCyclPME
);
315 wallcycle_stop(wcycle
, ewcPP_PMEWAITRECVF
);
318 static void print_large_forces(FILE *fp
,
326 real force2Tolerance
= gmx::square(forceTolerance
);
327 gmx::index numNonFinite
= 0;
328 for (int i
= 0; i
< md
->homenr
; i
++)
330 real force2
= norm2(f
[i
]);
331 bool nonFinite
= !std::isfinite(force2
);
332 if (force2
>= force2Tolerance
|| nonFinite
)
334 fprintf(fp
, "step %" PRId64
" atom %6d x %8.3f %8.3f %8.3f force %12.5e\n",
336 ddglatnr(cr
->dd
, i
), x
[i
][XX
], x
[i
][YY
], x
[i
][ZZ
], std::sqrt(force2
));
343 if (numNonFinite
> 0)
345 /* Note that with MPI this fatal call on one rank might interrupt
346 * the printing on other ranks. But we can only avoid that with
347 * an expensive MPI barrier that we would need at each step.
349 gmx_fatal(FARGS
, "At step %" PRId64
" detected non-finite forces on %td atoms", step
, numNonFinite
);
353 static void post_process_forces(const t_commrec
*cr
,
356 gmx_wallcycle_t wcycle
,
357 const gmx_localtop_t
*top
,
361 gmx::ForceWithVirial
*forceWithVirial
,
363 const t_mdatoms
*mdatoms
,
364 const t_graph
*graph
,
365 const t_forcerec
*fr
,
366 const gmx_vsite_t
*vsite
,
369 if (fr
->haveDirectVirialContributions
)
371 rvec
*fDirectVir
= as_rvec_array(forceWithVirial
->force_
.data());
375 /* Spread the mesh force on virtual sites to the other particles...
376 * This is parallellized. MPI communication is performed
377 * if the constructing atoms aren't local.
379 matrix virial
= { { 0 } };
380 spread_vsite_f(vsite
, x
, fDirectVir
, nullptr,
381 (flags
& GMX_FORCE_VIRIAL
) != 0, virial
,
383 &top
->idef
, fr
->ePBC
, fr
->bMolPBC
, graph
, box
, cr
, wcycle
);
384 forceWithVirial
->addVirialContribution(virial
);
387 if (flags
& GMX_FORCE_VIRIAL
)
389 /* Now add the forces, this is local */
390 sum_forces(f
, forceWithVirial
->force_
);
392 /* Add the direct virial contributions */
393 GMX_ASSERT(forceWithVirial
->computeVirial_
, "forceWithVirial should request virial computation when we request the virial");
394 m_add(vir_force
, forceWithVirial
->getVirial(), vir_force
);
398 pr_rvecs(debug
, 0, "vir_force", vir_force
, DIM
);
403 if (fr
->print_force
>= 0)
405 print_large_forces(stderr
, mdatoms
, cr
, step
, fr
->print_force
, x
, f
);
409 static void do_nb_verlet(const t_forcerec
*fr
,
410 const interaction_const_t
*ic
,
411 gmx_enerdata_t
*enerd
,
412 int flags
, int ilocality
,
416 gmx_wallcycle_t wcycle
)
418 if (!(flags
& GMX_FORCE_NONBONDED
))
420 /* skip non-bonded calculation */
424 nonbonded_verlet_t
*nbv
= fr
->nbv
;
425 nonbonded_verlet_group_t
*nbvg
= &nbv
->grp
[ilocality
];
427 /* GPU kernel launch overhead is already timed separately */
428 if (fr
->cutoff_scheme
!= ecutsVERLET
)
430 gmx_incons("Invalid cut-off scheme passed!");
433 bool bUsingGpuKernels
= (nbvg
->kernel_type
== nbnxnk8x8x8_GPU
);
435 if (!bUsingGpuKernels
)
437 /* When dynamic pair-list pruning is requested, we need to prune
438 * at nstlistPrune steps.
440 if (nbv
->listParams
->useDynamicPruning
&&
441 (step
- nbvg
->nbl_lists
.outerListCreationStep
) % nbv
->listParams
->nstlistPrune
== 0)
443 /* Prune the pair-list beyond fr->ic->rlistPrune using
444 * the current coordinates of the atoms.
446 wallcycle_sub_start(wcycle
, ewcsNONBONDED_PRUNING
);
447 nbnxn_kernel_cpu_prune(nbvg
, nbv
->nbat
, fr
->shift_vec
, nbv
->listParams
->rlistInner
);
448 wallcycle_sub_stop(wcycle
, ewcsNONBONDED_PRUNING
);
451 wallcycle_sub_start(wcycle
, ewcsNONBONDED
);
454 switch (nbvg
->kernel_type
)
456 case nbnxnk4x4_PlainC
:
457 case nbnxnk4xN_SIMD_4xN
:
458 case nbnxnk4xN_SIMD_2xNN
:
459 nbnxn_kernel_cpu(nbvg
,
466 enerd
->grpp
.ener
[egCOULSR
],
468 enerd
->grpp
.ener
[egBHAMSR
] :
469 enerd
->grpp
.ener
[egLJSR
]);
472 case nbnxnk8x8x8_GPU
:
473 nbnxn_gpu_launch_kernel(nbv
->gpu_nbv
, nbv
->nbat
, flags
, ilocality
);
476 case nbnxnk8x8x8_PlainC
:
477 nbnxn_kernel_gpu_ref(nbvg
->nbl_lists
.nbl
[0],
484 enerd
->grpp
.ener
[egCOULSR
],
486 enerd
->grpp
.ener
[egBHAMSR
] :
487 enerd
->grpp
.ener
[egLJSR
]);
491 GMX_RELEASE_ASSERT(false, "Invalid nonbonded kernel type passed!");
494 if (!bUsingGpuKernels
)
496 wallcycle_sub_stop(wcycle
, ewcsNONBONDED
);
499 int enr_nbnxn_kernel_ljc
, enr_nbnxn_kernel_lj
;
500 if (EEL_RF(ic
->eeltype
) || ic
->eeltype
== eelCUT
)
502 enr_nbnxn_kernel_ljc
= eNR_NBNXN_LJ_RF
;
504 else if ((!bUsingGpuKernels
&& nbvg
->ewald_excl
== ewaldexclAnalytical
) ||
505 (bUsingGpuKernels
&& nbnxn_gpu_is_kernel_ewald_analytical(nbv
->gpu_nbv
)))
507 enr_nbnxn_kernel_ljc
= eNR_NBNXN_LJ_EWALD
;
511 enr_nbnxn_kernel_ljc
= eNR_NBNXN_LJ_TAB
;
513 enr_nbnxn_kernel_lj
= eNR_NBNXN_LJ
;
514 if (flags
& GMX_FORCE_ENERGY
)
516 /* In eNR_??? the nbnxn F+E kernels are always the F kernel + 1 */
517 enr_nbnxn_kernel_ljc
+= 1;
518 enr_nbnxn_kernel_lj
+= 1;
521 inc_nrnb(nrnb
, enr_nbnxn_kernel_ljc
,
522 nbvg
->nbl_lists
.natpair_ljq
);
523 inc_nrnb(nrnb
, enr_nbnxn_kernel_lj
,
524 nbvg
->nbl_lists
.natpair_lj
);
525 /* The Coulomb-only kernels are offset -eNR_NBNXN_LJ_RF+eNR_NBNXN_RF */
526 inc_nrnb(nrnb
, enr_nbnxn_kernel_ljc
-eNR_NBNXN_LJ_RF
+eNR_NBNXN_RF
,
527 nbvg
->nbl_lists
.natpair_q
);
529 if (ic
->vdw_modifier
== eintmodFORCESWITCH
)
531 /* We add up the switch cost separately */
532 inc_nrnb(nrnb
, eNR_NBNXN_ADD_LJ_FSW
+((flags
& GMX_FORCE_ENERGY
) ? 1 : 0),
533 nbvg
->nbl_lists
.natpair_ljq
+ nbvg
->nbl_lists
.natpair_lj
);
535 if (ic
->vdw_modifier
== eintmodPOTSWITCH
)
537 /* We add up the switch cost separately */
538 inc_nrnb(nrnb
, eNR_NBNXN_ADD_LJ_PSW
+((flags
& GMX_FORCE_ENERGY
) ? 1 : 0),
539 nbvg
->nbl_lists
.natpair_ljq
+ nbvg
->nbl_lists
.natpair_lj
);
541 if (ic
->vdwtype
== evdwPME
)
543 /* We add up the LJ Ewald cost separately */
544 inc_nrnb(nrnb
, eNR_NBNXN_ADD_LJ_EWALD
+((flags
& GMX_FORCE_ENERGY
) ? 1 : 0),
545 nbvg
->nbl_lists
.natpair_ljq
+ nbvg
->nbl_lists
.natpair_lj
);
549 static void do_nb_verlet_fep(nbnxn_pairlist_set_t
*nbl_lists
,
553 const t_mdatoms
*mdatoms
,
556 gmx_enerdata_t
*enerd
,
559 gmx_wallcycle_t wcycle
)
562 nb_kernel_data_t kernel_data
;
564 real dvdl_nb
[efptNR
];
569 /* Add short-range interactions */
570 donb_flags
|= GMX_NONBONDED_DO_SR
;
572 /* Currently all group scheme kernels always calculate (shift-)forces */
573 if (flags
& GMX_FORCE_FORCES
)
575 donb_flags
|= GMX_NONBONDED_DO_FORCE
;
577 if (flags
& GMX_FORCE_VIRIAL
)
579 donb_flags
|= GMX_NONBONDED_DO_SHIFTFORCE
;
581 if (flags
& GMX_FORCE_ENERGY
)
583 donb_flags
|= GMX_NONBONDED_DO_POTENTIAL
;
586 kernel_data
.flags
= donb_flags
;
587 kernel_data
.lambda
= lambda
;
588 kernel_data
.dvdl
= dvdl_nb
;
590 kernel_data
.energygrp_elec
= enerd
->grpp
.ener
[egCOULSR
];
591 kernel_data
.energygrp_vdw
= enerd
->grpp
.ener
[egLJSR
];
593 /* reset free energy components */
594 for (i
= 0; i
< efptNR
; i
++)
599 GMX_ASSERT(gmx_omp_nthreads_get(emntNonbonded
) == nbl_lists
->nnbl
, "Number of lists should be same as number of NB threads");
601 wallcycle_sub_start(wcycle
, ewcsNONBONDED
);
602 #pragma omp parallel for schedule(static) num_threads(nbl_lists->nnbl)
603 for (th
= 0; th
< nbl_lists
->nnbl
; th
++)
607 gmx_nb_free_energy_kernel(nbl_lists
->nbl_fep
[th
],
608 x
, f
, fr
, mdatoms
, &kernel_data
, nrnb
);
610 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
;
613 if (fepvals
->sc_alpha
!= 0)
615 enerd
->dvdl_nonlin
[efptVDW
] += dvdl_nb
[efptVDW
];
616 enerd
->dvdl_nonlin
[efptCOUL
] += dvdl_nb
[efptCOUL
];
620 enerd
->dvdl_lin
[efptVDW
] += dvdl_nb
[efptVDW
];
621 enerd
->dvdl_lin
[efptCOUL
] += dvdl_nb
[efptCOUL
];
624 /* If we do foreign lambda and we have soft-core interactions
625 * we have to recalculate the (non-linear) energies contributions.
627 if (fepvals
->n_lambda
> 0 && (flags
& GMX_FORCE_DHDL
) && fepvals
->sc_alpha
!= 0)
629 kernel_data
.flags
= (donb_flags
& ~(GMX_NONBONDED_DO_FORCE
| GMX_NONBONDED_DO_SHIFTFORCE
)) | GMX_NONBONDED_DO_FOREIGNLAMBDA
;
630 kernel_data
.lambda
= lam_i
;
631 kernel_data
.energygrp_elec
= enerd
->foreign_grpp
.ener
[egCOULSR
];
632 kernel_data
.energygrp_vdw
= enerd
->foreign_grpp
.ener
[egLJSR
];
633 /* Note that we add to kernel_data.dvdl, but ignore the result */
635 for (i
= 0; i
< enerd
->n_lambda
; i
++)
637 for (j
= 0; j
< efptNR
; j
++)
639 lam_i
[j
] = (i
== 0 ? lambda
[j
] : fepvals
->all_lambda
[j
][i
-1]);
641 reset_foreign_enerdata(enerd
);
642 #pragma omp parallel for schedule(static) num_threads(nbl_lists->nnbl)
643 for (th
= 0; th
< nbl_lists
->nnbl
; th
++)
647 gmx_nb_free_energy_kernel(nbl_lists
->nbl_fep
[th
],
648 x
, f
, fr
, mdatoms
, &kernel_data
, nrnb
);
650 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
;
653 sum_epot(&(enerd
->foreign_grpp
), enerd
->foreign_term
);
654 enerd
->enerpart_lambda
[i
] += enerd
->foreign_term
[F_EPOT
];
658 wallcycle_sub_stop(wcycle
, ewcsNONBONDED
);
661 gmx_bool
use_GPU(const nonbonded_verlet_t
*nbv
)
663 return nbv
!= nullptr && nbv
->bUseGPU
;
666 static inline void clear_rvecs_omp(int n
, rvec v
[])
668 int nth
= gmx_omp_nthreads_get_simple_rvec_task(emntDefault
, n
);
670 /* Note that we would like to avoid this conditional by putting it
671 * into the omp pragma instead, but then we still take the full
672 * omp parallel for overhead (at least with gcc5).
676 for (int i
= 0; i
< n
; i
++)
683 #pragma omp parallel for num_threads(nth) schedule(static)
684 for (int i
= 0; i
< n
; i
++)
691 /*! \brief Return an estimate of the average kinetic energy or 0 when unreliable
693 * \param groupOptions Group options, containing T-coupling options
695 static real
averageKineticEnergyEstimate(const t_grpopts
&groupOptions
)
697 real nrdfCoupled
= 0;
698 real nrdfUncoupled
= 0;
699 real kineticEnergy
= 0;
700 for (int g
= 0; g
< groupOptions
.ngtc
; g
++)
702 if (groupOptions
.tau_t
[g
] >= 0)
704 nrdfCoupled
+= groupOptions
.nrdf
[g
];
705 kineticEnergy
+= groupOptions
.nrdf
[g
]*0.5*groupOptions
.ref_t
[g
]*BOLTZ
;
709 nrdfUncoupled
+= groupOptions
.nrdf
[g
];
713 /* This conditional with > also catches nrdf=0 */
714 if (nrdfCoupled
> nrdfUncoupled
)
716 return kineticEnergy
*(nrdfCoupled
+ nrdfUncoupled
)/nrdfCoupled
;
724 /*! \brief This routine checks that the potential energy is finite.
726 * Always checks that the potential energy is finite. If step equals
727 * inputrec.init_step also checks that the magnitude of the potential energy
728 * is reasonable. Terminates with a fatal error when a check fails.
729 * Note that passing this check does not guarantee finite forces,
730 * since those use slightly different arithmetics. But in most cases
731 * there is just a narrow coordinate range where forces are not finite
732 * and energies are finite.
734 * \param[in] step The step number, used for checking and printing
735 * \param[in] enerd The energy data; the non-bonded group energies need to be added to enerd.term[F_EPOT] before calling this routine
736 * \param[in] inputrec The input record
738 static void checkPotentialEnergyValidity(int64_t step
,
739 const gmx_enerdata_t
&enerd
,
740 const t_inputrec
&inputrec
)
742 /* Threshold valid for comparing absolute potential energy against
743 * the kinetic energy. Normally one should not consider absolute
744 * potential energy values, but with a factor of one million
745 * we should never get false positives.
747 constexpr real c_thresholdFactor
= 1e6
;
749 bool energyIsNotFinite
= !std::isfinite(enerd
.term
[F_EPOT
]);
750 real averageKineticEnergy
= 0;
751 /* We only check for large potential energy at the initial step,
752 * because that is by far the most likely step for this too occur
753 * and because computing the average kinetic energy is not free.
754 * Note: nstcalcenergy >> 1 often does not allow to catch large energies
755 * before they become NaN.
757 if (step
== inputrec
.init_step
&& EI_DYNAMICS(inputrec
.eI
))
759 averageKineticEnergy
= averageKineticEnergyEstimate(inputrec
.opts
);
762 if (energyIsNotFinite
|| (averageKineticEnergy
> 0 &&
763 enerd
.term
[F_EPOT
] > c_thresholdFactor
*averageKineticEnergy
))
765 gmx_fatal(FARGS
, "Step %" PRId64
": The total potential energy is %g, which is %s. The LJ and electrostatic contributions to the energy are %g and %g, respectively. A %s potential energy can be caused by overlapping interactions in bonded interactions or very large%s coordinate values. Usually this is caused by a badly- or non-equilibrated initial configuration, incorrect interactions or parameters in the topology.",
768 energyIsNotFinite
? "not finite" : "extremely high",
770 enerd
.term
[F_COUL_SR
],
771 energyIsNotFinite
? "non-finite" : "very high",
772 energyIsNotFinite
? " or Nan" : "");
776 /*! \brief Compute forces and/or energies for special algorithms
778 * The intention is to collect all calls to algorithms that compute
779 * forces on local atoms only and that do not contribute to the local
780 * virial sum (but add their virial contribution separately).
781 * Eventually these should likely all become ForceProviders.
782 * Within this function the intention is to have algorithms that do
783 * global communication at the end, so global barriers within the MD loop
784 * are as close together as possible.
786 * \param[in] fplog The log file
787 * \param[in] cr The communication record
788 * \param[in] inputrec The input record
789 * \param[in] awh The Awh module (nullptr if none in use).
790 * \param[in] enforcedRotation Enforced rotation module.
791 * \param[in] step The current MD step
792 * \param[in] t The current time
793 * \param[in,out] wcycle Wallcycle accounting struct
794 * \param[in,out] forceProviders Pointer to a list of force providers
795 * \param[in] box The unit cell
796 * \param[in] x The coordinates
797 * \param[in] mdatoms Per atom properties
798 * \param[in] lambda Array of free-energy lambda values
799 * \param[in] forceFlags Flags that tell whether we should compute forces/energies/virial
800 * \param[in,out] forceWithVirial Force and virial buffers
801 * \param[in,out] enerd Energy buffer
802 * \param[in,out] ed Essential dynamics pointer
803 * \param[in] bNS Tells if we did neighbor searching this step, used for ED sampling
805 * \todo Remove bNS, which is used incorrectly.
806 * \todo Convert all other algorithms called here to ForceProviders.
809 computeSpecialForces(FILE *fplog
,
811 const t_inputrec
*inputrec
,
813 gmx_enfrot
*enforcedRotation
,
816 gmx_wallcycle_t wcycle
,
817 ForceProviders
*forceProviders
,
819 gmx::ArrayRef
<const gmx::RVec
> x
,
820 const t_mdatoms
*mdatoms
,
823 gmx::ForceWithVirial
*forceWithVirial
,
824 gmx_enerdata_t
*enerd
,
828 const bool computeForces
= (forceFlags
& GMX_FORCE_FORCES
) != 0;
830 /* NOTE: Currently all ForceProviders only provide forces.
831 * When they also provide energies, remove this conditional.
835 gmx::ForceProviderInput
forceProviderInput(x
, *mdatoms
, t
, box
, *cr
);
836 gmx::ForceProviderOutput
forceProviderOutput(forceWithVirial
, enerd
);
838 /* Collect forces from modules */
839 forceProviders
->calculateForces(forceProviderInput
, &forceProviderOutput
);
842 if (inputrec
->bPull
&& pull_have_potential(inputrec
->pull_work
))
844 pull_potential_wrapper(cr
, inputrec
, box
, x
,
846 mdatoms
, enerd
, lambda
, t
,
851 enerd
->term
[F_COM_PULL
] +=
852 awh
->applyBiasForcesAndUpdateBias(inputrec
->ePBC
, *mdatoms
, box
,
854 t
, step
, wcycle
, fplog
);
858 rvec
*f
= as_rvec_array(forceWithVirial
->force_
.data());
860 /* Add the forces from enforced rotation potentials (if any) */
863 wallcycle_start(wcycle
, ewcROTadd
);
864 enerd
->term
[F_COM_PULL
] += add_rot_forces(enforcedRotation
, f
, cr
, step
, t
);
865 wallcycle_stop(wcycle
, ewcROTadd
);
870 /* Note that since init_edsam() is called after the initialization
871 * of forcerec, edsam doesn't request the noVirSum force buffer.
872 * Thus if no other algorithm (e.g. PME) requires it, the forces
873 * here will contribute to the virial.
875 do_flood(cr
, inputrec
, as_rvec_array(x
.data()), f
, ed
, box
, step
, bNS
);
878 /* Add forces from interactive molecular dynamics (IMD), if bIMD == TRUE. */
879 if (inputrec
->bIMD
&& computeForces
)
881 IMD_apply_forces(inputrec
->bIMD
, inputrec
->imd
, cr
, f
, wcycle
);
885 /*! \brief Launch the prepare_step and spread stages of PME GPU.
887 * \param[in] pmedata The PME structure
888 * \param[in] box The box matrix
889 * \param[in] x Coordinate array
890 * \param[in] flags Force flags
891 * \param[in] wcycle The wallcycle structure
893 static inline void launchPmeGpuSpread(gmx_pme_t
*pmedata
,
897 gmx_wallcycle_t wcycle
)
899 int pmeFlags
= GMX_PME_SPREAD
| GMX_PME_SOLVE
;
900 pmeFlags
|= (flags
& GMX_FORCE_FORCES
) ? GMX_PME_CALC_F
: 0;
901 pmeFlags
|= (flags
& GMX_FORCE_VIRIAL
) ? GMX_PME_CALC_ENER_VIR
: 0;
903 pme_gpu_prepare_computation(pmedata
, (flags
& GMX_FORCE_DYNAMICBOX
) != 0, box
, wcycle
, pmeFlags
);
904 pme_gpu_launch_spread(pmedata
, x
, wcycle
);
907 /*! \brief Launch the FFT and gather stages of PME GPU
909 * This function only implements setting the output forces (no accumulation).
911 * \param[in] pmedata The PME structure
912 * \param[in] wcycle The wallcycle structure
914 static void launchPmeGpuFftAndGather(gmx_pme_t
*pmedata
,
915 gmx_wallcycle_t wcycle
)
917 pme_gpu_launch_complex_transforms(pmedata
, wcycle
);
918 pme_gpu_launch_gather(pmedata
, wcycle
, PmeForceOutputHandling::Set
);
922 * Polling wait for either of the PME or nonbonded GPU tasks.
924 * Instead of a static order in waiting for GPU tasks, this function
925 * polls checking which of the two tasks completes first, and does the
926 * associated force buffer reduction overlapped with the other task.
927 * By doing that, unlike static scheduling order, it can always overlap
928 * one of the reductions, regardless of the GPU task completion order.
930 * \param[in] nbv Nonbonded verlet structure
931 * \param[in] pmedata PME module data
932 * \param[in,out] force Force array to reduce task outputs into.
933 * \param[in,out] forceWithVirial Force and virial buffers
934 * \param[in,out] fshift Shift force output vector results are reduced into
935 * \param[in,out] enerd Energy data structure results are reduced into
936 * \param[in] flags Force flags
937 * \param[in] wcycle The wallcycle structure
939 static void alternatePmeNbGpuWaitReduce(nonbonded_verlet_t
*nbv
,
940 const gmx_pme_t
*pmedata
,
941 gmx::PaddedArrayRef
<gmx::RVec
> *force
,
942 gmx::ForceWithVirial
*forceWithVirial
,
944 gmx_enerdata_t
*enerd
,
946 gmx_wallcycle_t wcycle
)
948 bool isPmeGpuDone
= false;
949 bool isNbGpuDone
= false;
952 gmx::ArrayRef
<const gmx::RVec
> pmeGpuForces
;
954 while (!isPmeGpuDone
|| !isNbGpuDone
)
961 GpuTaskCompletion completionType
= (isNbGpuDone
) ? GpuTaskCompletion::Wait
: GpuTaskCompletion::Check
;
962 isPmeGpuDone
= pme_gpu_try_finish_task(pmedata
, wcycle
, &pmeGpuForces
,
963 vir_Q
, &Vlr_q
, completionType
);
967 pme_gpu_reduce_outputs(wcycle
, forceWithVirial
, pmeGpuForces
,
968 enerd
, vir_Q
, Vlr_q
);
974 GpuTaskCompletion completionType
= (isPmeGpuDone
) ? GpuTaskCompletion::Wait
: GpuTaskCompletion::Check
;
975 wallcycle_start_nocount(wcycle
, ewcWAIT_GPU_NB_L
);
976 isNbGpuDone
= nbnxn_gpu_try_finish_task(nbv
->gpu_nbv
,
978 enerd
->grpp
.ener
[egLJSR
], enerd
->grpp
.ener
[egCOULSR
],
979 fshift
, completionType
);
980 wallcycle_stop(wcycle
, ewcWAIT_GPU_NB_L
);
981 // To get the call count right, when the task finished we
982 // issue a start/stop.
983 // TODO: move the ewcWAIT_GPU_NB_L cycle counting into nbnxn_gpu_try_finish_task()
984 // and ewcNB_XF_BUF_OPS counting into nbnxn_atomdata_add_nbat_f_to_f().
987 wallcycle_start(wcycle
, ewcWAIT_GPU_NB_L
);
988 wallcycle_stop(wcycle
, ewcWAIT_GPU_NB_L
);
990 nbnxn_atomdata_add_nbat_f_to_f(nbv
->nbs
.get(), eatLocal
,
991 nbv
->nbat
, as_rvec_array(force
->data()), wcycle
);
998 * Launch the dynamic rolling pruning GPU task.
1000 * We currently alternate local/non-local list pruning in odd-even steps
1001 * (only pruning every second step without DD).
1003 * \param[in] cr The communication record
1004 * \param[in] nbv Nonbonded verlet structure
1005 * \param[in] inputrec The input record
1006 * \param[in] step The current MD step
1008 static inline void launchGpuRollingPruning(const t_commrec
*cr
,
1009 const nonbonded_verlet_t
*nbv
,
1010 const t_inputrec
*inputrec
,
1013 /* We should not launch the rolling pruning kernel at a search
1014 * step or just before search steps, since that's useless.
1015 * Without domain decomposition we prune at even steps.
1016 * With domain decomposition we alternate local and non-local
1017 * pruning at even and odd steps.
1019 int numRollingParts
= nbv
->listParams
->numRollingParts
;
1020 GMX_ASSERT(numRollingParts
== nbv
->listParams
->nstlistPrune
/2, "Since we alternate local/non-local at even/odd steps, we need numRollingParts<=nstlistPrune/2 for correctness and == for efficiency");
1021 int stepWithCurrentList
= step
- nbv
->grp
[eintLocal
].nbl_lists
.outerListCreationStep
;
1022 bool stepIsEven
= ((stepWithCurrentList
& 1) == 0);
1023 if (stepWithCurrentList
> 0 &&
1024 stepWithCurrentList
< inputrec
->nstlist
- 1 &&
1025 (stepIsEven
|| DOMAINDECOMP(cr
)))
1027 nbnxn_gpu_launch_kernel_pruneonly(nbv
->gpu_nbv
,
1028 stepIsEven
? eintLocal
: eintNonlocal
,
1033 static void do_force_cutsVERLET(FILE *fplog
,
1034 const t_commrec
*cr
,
1035 const gmx_multisim_t
*ms
,
1036 const t_inputrec
*inputrec
,
1038 gmx_enfrot
*enforcedRotation
,
1041 gmx_wallcycle_t wcycle
,
1042 const gmx_localtop_t
*top
,
1043 const gmx_groups_t
* /* groups */,
1044 matrix box
, gmx::PaddedArrayRef
<gmx::RVec
> x
,
1046 gmx::PaddedArrayRef
<gmx::RVec
> force
,
1048 const t_mdatoms
*mdatoms
,
1049 gmx_enerdata_t
*enerd
, t_fcdata
*fcd
,
1053 interaction_const_t
*ic
,
1054 const gmx_vsite_t
*vsite
,
1059 DdOpenBalanceRegionBeforeForceComputation ddOpenBalanceRegion
,
1060 DdCloseBalanceRegionAfterForceComputation ddCloseBalanceRegion
)
1064 gmx_bool bStateChanged
, bNS
, bFillGrid
, bCalcCGCM
;
1065 gmx_bool bDoForces
, bUseGPU
, bUseOrEmulGPU
;
1066 rvec vzero
, box_diag
;
1067 float cycles_pme
, cycles_wait_gpu
;
1068 nonbonded_verlet_t
*nbv
= fr
->nbv
;
1070 bStateChanged
= ((flags
& GMX_FORCE_STATECHANGED
) != 0);
1071 bNS
= ((flags
& GMX_FORCE_NS
) != 0) && (!fr
->bAllvsAll
);
1072 bFillGrid
= (bNS
&& bStateChanged
);
1073 bCalcCGCM
= (bFillGrid
&& !DOMAINDECOMP(cr
));
1074 bDoForces
= ((flags
& GMX_FORCE_FORCES
) != 0);
1075 bUseGPU
= fr
->nbv
->bUseGPU
;
1076 bUseOrEmulGPU
= bUseGPU
|| (fr
->nbv
->emulateGpu
== EmulateGpuNonbonded::Yes
);
1078 const auto pmeRunMode
= fr
->pmedata
? pme_run_mode(fr
->pmedata
) : PmeRunMode::CPU
;
1079 // TODO slim this conditional down - inputrec and duty checks should mean the same in proper code!
1080 const bool useGpuPme
= EEL_PME(fr
->ic
->eeltype
) && thisRankHasDuty(cr
, DUTY_PME
) &&
1081 ((pmeRunMode
== PmeRunMode::GPU
) || (pmeRunMode
== PmeRunMode::Mixed
));
1083 /* At a search step we need to start the first balancing region
1084 * somewhere early inside the step after communication during domain
1085 * decomposition (and not during the previous step as usual).
1088 ddOpenBalanceRegion
== DdOpenBalanceRegionBeforeForceComputation::yes
)
1090 ddOpenBalanceRegionCpu(cr
->dd
, DdAllowBalanceRegionReopen::yes
);
1093 cycles_wait_gpu
= 0;
1095 const int start
= 0;
1096 const int homenr
= mdatoms
->homenr
;
1098 clear_mat(vir_force
);
1100 if (DOMAINDECOMP(cr
))
1102 cg1
= cr
->dd
->globalAtomGroupIndices
.size();
1115 update_forcerec(fr
, box
);
1117 if (inputrecNeedMutot(inputrec
))
1119 /* Calculate total (local) dipole moment in a temporary common array.
1120 * This makes it possible to sum them over nodes faster.
1122 calc_mu(start
, homenr
,
1123 x
, mdatoms
->chargeA
, mdatoms
->chargeB
, mdatoms
->nChargePerturbed
,
1128 if (fr
->ePBC
!= epbcNONE
)
1130 /* Compute shift vectors every step,
1131 * because of pressure coupling or box deformation!
1133 if ((flags
& GMX_FORCE_DYNAMICBOX
) && bStateChanged
)
1135 calc_shifts(box
, fr
->shift_vec
);
1140 put_atoms_in_box_omp(fr
->ePBC
, box
, x
.subArray(0, homenr
));
1141 inc_nrnb(nrnb
, eNR_SHIFTX
, homenr
);
1143 else if (EI_ENERGY_MINIMIZATION(inputrec
->eI
) && graph
)
1145 unshift_self(graph
, box
, as_rvec_array(x
.data()));
1149 nbnxn_atomdata_copy_shiftvec((flags
& GMX_FORCE_DYNAMICBOX
) != 0,
1150 fr
->shift_vec
, nbv
->nbat
);
1153 if (!thisRankHasDuty(cr
, DUTY_PME
))
1155 /* Send particle coordinates to the pme nodes.
1156 * Since this is only implemented for domain decomposition
1157 * and domain decomposition does not use the graph,
1158 * we do not need to worry about shifting.
1160 gmx_pme_send_coordinates(cr
, box
, as_rvec_array(x
.data()),
1161 lambda
[efptCOUL
], lambda
[efptVDW
],
1162 (flags
& (GMX_FORCE_VIRIAL
| GMX_FORCE_ENERGY
)) != 0,
1165 #endif /* GMX_MPI */
1169 launchPmeGpuSpread(fr
->pmedata
, box
, as_rvec_array(x
.data()), flags
, wcycle
);
1172 /* do gridding for pair search */
1175 if (graph
&& bStateChanged
)
1177 /* Calculate intramolecular shift vectors to make molecules whole */
1178 mk_mshift(fplog
, graph
, fr
->ePBC
, box
, as_rvec_array(x
.data()));
1182 box_diag
[XX
] = box
[XX
][XX
];
1183 box_diag
[YY
] = box
[YY
][YY
];
1184 box_diag
[ZZ
] = box
[ZZ
][ZZ
];
1186 wallcycle_start(wcycle
, ewcNS
);
1187 if (!DOMAINDECOMP(cr
))
1189 wallcycle_sub_start(wcycle
, ewcsNBS_GRID_LOCAL
);
1190 nbnxn_put_on_grid(nbv
->nbs
.get(), fr
->ePBC
, box
,
1192 nullptr, 0, mdatoms
->homenr
, -1,
1195 nbv
->grp
[eintLocal
].kernel_type
,
1197 wallcycle_sub_stop(wcycle
, ewcsNBS_GRID_LOCAL
);
1201 wallcycle_sub_start(wcycle
, ewcsNBS_GRID_NONLOCAL
);
1202 nbnxn_put_on_grid_nonlocal(nbv
->nbs
.get(), domdec_zones(cr
->dd
),
1204 nbv
->grp
[eintNonlocal
].kernel_type
,
1206 wallcycle_sub_stop(wcycle
, ewcsNBS_GRID_NONLOCAL
);
1209 nbnxn_atomdata_set(nbv
->nbat
, nbv
->nbs
.get(), mdatoms
, fr
->cginfo
);
1211 /* Now we put all atoms on the grid, we can assign bonded interactions
1212 * to the GPU, where the grid order is needed.
1214 if (fr
->gpuBondedLists
)
1216 assign_bondeds_to_gpu(fr
->gpuBondedLists
,
1217 nbnxn_get_gridindices(fr
->nbv
->nbs
.get()),
1221 wallcycle_stop(wcycle
, ewcNS
);
1224 /* initialize the GPU atom data and copy shift vector */
1227 wallcycle_start_nocount(wcycle
, ewcLAUNCH_GPU
);
1228 wallcycle_sub_start_nocount(wcycle
, ewcsLAUNCH_GPU_NONBONDED
);
1232 nbnxn_gpu_init_atomdata(nbv
->gpu_nbv
, nbv
->nbat
);
1235 nbnxn_gpu_upload_shiftvec(nbv
->gpu_nbv
, nbv
->nbat
);
1237 wallcycle_sub_stop(wcycle
, ewcsLAUNCH_GPU_NONBONDED
);
1238 wallcycle_stop(wcycle
, ewcLAUNCH_GPU
);
1241 /* do local pair search */
1244 wallcycle_start_nocount(wcycle
, ewcNS
);
1245 wallcycle_sub_start(wcycle
, ewcsNBS_SEARCH_LOCAL
);
1246 nbnxn_make_pairlist(nbv
->nbs
.get(), nbv
->nbat
,
1248 nbv
->listParams
->rlistOuter
,
1249 nbv
->min_ci_balanced
,
1250 &nbv
->grp
[eintLocal
].nbl_lists
,
1252 nbv
->grp
[eintLocal
].kernel_type
,
1254 nbv
->grp
[eintLocal
].nbl_lists
.outerListCreationStep
= step
;
1255 if (nbv
->listParams
->useDynamicPruning
&& !bUseGPU
)
1257 nbnxnPrepareListForDynamicPruning(&nbv
->grp
[eintLocal
].nbl_lists
);
1259 wallcycle_sub_stop(wcycle
, ewcsNBS_SEARCH_LOCAL
);
1263 /* initialize local pair-list on the GPU */
1264 nbnxn_gpu_init_pairlist(nbv
->gpu_nbv
,
1265 nbv
->grp
[eintLocal
].nbl_lists
.nbl
[0],
1268 wallcycle_stop(wcycle
, ewcNS
);
1272 nbnxn_atomdata_copy_x_to_nbat_x(nbv
->nbs
.get(), eatLocal
, FALSE
, as_rvec_array(x
.data()),
1278 if (DOMAINDECOMP(cr
))
1280 ddOpenBalanceRegionGpu(cr
->dd
);
1283 wallcycle_start(wcycle
, ewcLAUNCH_GPU
);
1284 wallcycle_sub_start(wcycle
, ewcsLAUNCH_GPU_NONBONDED
);
1285 /* launch local nonbonded work on GPU */
1286 do_nb_verlet(fr
, ic
, enerd
, flags
, eintLocal
, enbvClearFNo
,
1287 step
, nrnb
, wcycle
);
1288 wallcycle_sub_stop(wcycle
, ewcsLAUNCH_GPU_NONBONDED
);
1289 wallcycle_stop(wcycle
, ewcLAUNCH_GPU
);
1294 // In PME GPU and mixed mode we launch FFT / gather after the
1295 // X copy/transform to allow overlap as well as after the GPU NB
1296 // launch to avoid FFT launch overhead hijacking the CPU and delaying
1297 // the nonbonded kernel.
1298 launchPmeGpuFftAndGather(fr
->pmedata
, wcycle
);
1301 /* Communicate coordinates and sum dipole if necessary +
1302 do non-local pair search */
1303 if (DOMAINDECOMP(cr
))
1307 wallcycle_start_nocount(wcycle
, ewcNS
);
1308 wallcycle_sub_start(wcycle
, ewcsNBS_SEARCH_NONLOCAL
);
1310 nbnxn_make_pairlist(nbv
->nbs
.get(), nbv
->nbat
,
1312 nbv
->listParams
->rlistOuter
,
1313 nbv
->min_ci_balanced
,
1314 &nbv
->grp
[eintNonlocal
].nbl_lists
,
1316 nbv
->grp
[eintNonlocal
].kernel_type
,
1318 nbv
->grp
[eintNonlocal
].nbl_lists
.outerListCreationStep
= step
;
1319 if (nbv
->listParams
->useDynamicPruning
&& !bUseGPU
)
1321 nbnxnPrepareListForDynamicPruning(&nbv
->grp
[eintNonlocal
].nbl_lists
);
1323 wallcycle_sub_stop(wcycle
, ewcsNBS_SEARCH_NONLOCAL
);
1325 if (nbv
->grp
[eintNonlocal
].kernel_type
== nbnxnk8x8x8_GPU
)
1327 /* initialize non-local pair-list on the GPU */
1328 nbnxn_gpu_init_pairlist(nbv
->gpu_nbv
,
1329 nbv
->grp
[eintNonlocal
].nbl_lists
.nbl
[0],
1332 wallcycle_stop(wcycle
, ewcNS
);
1336 dd_move_x(cr
->dd
, box
, x
, wcycle
);
1338 nbnxn_atomdata_copy_x_to_nbat_x(nbv
->nbs
.get(), eatNonlocal
, FALSE
, as_rvec_array(x
.data()),
1344 wallcycle_start(wcycle
, ewcLAUNCH_GPU
);
1345 wallcycle_sub_start(wcycle
, ewcsLAUNCH_GPU_NONBONDED
);
1346 /* launch non-local nonbonded tasks on GPU */
1347 do_nb_verlet(fr
, ic
, enerd
, flags
, eintNonlocal
, enbvClearFNo
,
1348 step
, nrnb
, wcycle
);
1349 wallcycle_sub_stop(wcycle
, ewcsLAUNCH_GPU_NONBONDED
);
1350 wallcycle_stop(wcycle
, ewcLAUNCH_GPU
);
1356 /* launch D2H copy-back F */
1357 wallcycle_start_nocount(wcycle
, ewcLAUNCH_GPU
);
1358 wallcycle_sub_start_nocount(wcycle
, ewcsLAUNCH_GPU_NONBONDED
);
1359 if (DOMAINDECOMP(cr
))
1361 nbnxn_gpu_launch_cpyback(nbv
->gpu_nbv
, nbv
->nbat
,
1362 flags
, eatNonlocal
);
1364 nbnxn_gpu_launch_cpyback(nbv
->gpu_nbv
, nbv
->nbat
,
1366 wallcycle_sub_stop(wcycle
, ewcsLAUNCH_GPU_NONBONDED
);
1367 wallcycle_stop(wcycle
, ewcLAUNCH_GPU
);
1370 if (bStateChanged
&& inputrecNeedMutot(inputrec
))
1374 gmx_sumd(2*DIM
, mu
, cr
);
1375 ddReopenBalanceRegionCpu(cr
->dd
);
1378 for (i
= 0; i
< 2; i
++)
1380 for (j
= 0; j
< DIM
; j
++)
1382 fr
->mu_tot
[i
][j
] = mu
[i
*DIM
+ j
];
1386 if (fr
->efep
== efepNO
)
1388 copy_rvec(fr
->mu_tot
[0], mu_tot
);
1392 for (j
= 0; j
< DIM
; j
++)
1395 (1.0 - lambda
[efptCOUL
])*fr
->mu_tot
[0][j
] +
1396 lambda
[efptCOUL
]*fr
->mu_tot
[1][j
];
1400 /* Reset energies */
1401 reset_enerdata(enerd
);
1402 clear_rvecs(SHIFTS
, fr
->fshift
);
1404 if (DOMAINDECOMP(cr
) && !thisRankHasDuty(cr
, DUTY_PME
))
1406 wallcycle_start(wcycle
, ewcPPDURINGPME
);
1407 dd_force_flop_start(cr
->dd
, nrnb
);
1412 wallcycle_start(wcycle
, ewcROT
);
1413 do_rotation(cr
, enforcedRotation
, box
, as_rvec_array(x
.data()), t
, step
, bNS
);
1414 wallcycle_stop(wcycle
, ewcROT
);
1417 /* Temporary solution until all routines take PaddedRVecVector */
1418 rvec
*const f
= as_rvec_array(force
.data());
1420 /* Start the force cycle counter.
1421 * Note that a different counter is used for dynamic load balancing.
1423 wallcycle_start(wcycle
, ewcFORCE
);
1425 gmx::ArrayRef
<gmx::RVec
> forceRef
= force
;
1428 /* If we need to compute the virial, we might need a separate
1429 * force buffer for algorithms for which the virial is calculated
1430 * directly, such as PME.
1432 if ((flags
& GMX_FORCE_VIRIAL
) && fr
->haveDirectVirialContributions
)
1434 forceRef
= *fr
->forceBufferForDirectVirialContributions
;
1436 /* TODO: update comment
1437 * We only compute forces on local atoms. Note that vsites can
1438 * spread to non-local atoms, but that part of the buffer is
1439 * cleared separately in the vsite spreading code.
1441 clear_rvecs_omp(forceRef
.size(), as_rvec_array(forceRef
.data()));
1443 /* Clear the short- and long-range forces */
1444 clear_rvecs_omp(fr
->natoms_force_constr
, f
);
1447 /* forceWithVirial uses the local atom range only */
1448 gmx::ForceWithVirial
forceWithVirial(forceRef
, (flags
& GMX_FORCE_VIRIAL
) != 0);
1450 if (inputrec
->bPull
&& pull_have_constraint(inputrec
->pull_work
))
1452 clear_pull_forces(inputrec
->pull_work
);
1455 /* We calculate the non-bonded forces, when done on the CPU, here.
1456 * We do this before calling do_force_lowlevel, because in that
1457 * function, the listed forces are calculated before PME, which
1458 * does communication. With this order, non-bonded and listed
1459 * force calculation imbalance can be balanced out by the domain
1460 * decomposition load balancing.
1465 do_nb_verlet(fr
, ic
, enerd
, flags
, eintLocal
, enbvClearFYes
,
1466 step
, nrnb
, wcycle
);
1469 if (fr
->efep
!= efepNO
)
1471 /* Calculate the local and non-local free energy interactions here.
1472 * Happens here on the CPU both with and without GPU.
1474 if (fr
->nbv
->grp
[eintLocal
].nbl_lists
.nbl_fep
[0]->nrj
> 0)
1476 do_nb_verlet_fep(&fr
->nbv
->grp
[eintLocal
].nbl_lists
,
1477 fr
, as_rvec_array(x
.data()), f
, mdatoms
,
1478 inputrec
->fepvals
, lambda
,
1479 enerd
, flags
, nrnb
, wcycle
);
1482 if (DOMAINDECOMP(cr
) &&
1483 fr
->nbv
->grp
[eintNonlocal
].nbl_lists
.nbl_fep
[0]->nrj
> 0)
1485 do_nb_verlet_fep(&fr
->nbv
->grp
[eintNonlocal
].nbl_lists
,
1486 fr
, as_rvec_array(x
.data()), f
, mdatoms
,
1487 inputrec
->fepvals
, lambda
,
1488 enerd
, flags
, nrnb
, wcycle
);
1496 if (DOMAINDECOMP(cr
))
1498 do_nb_verlet(fr
, ic
, enerd
, flags
, eintNonlocal
, enbvClearFNo
,
1499 step
, nrnb
, wcycle
);
1508 aloc
= eintNonlocal
;
1511 /* Add all the non-bonded force to the normal force array.
1512 * This can be split into a local and a non-local part when overlapping
1513 * communication with calculation with domain decomposition.
1515 wallcycle_stop(wcycle
, ewcFORCE
);
1517 nbnxn_atomdata_add_nbat_f_to_f(nbv
->nbs
.get(), eatAll
, nbv
->nbat
, f
, wcycle
);
1519 wallcycle_start_nocount(wcycle
, ewcFORCE
);
1521 /* if there are multiple fshift output buffers reduce them */
1522 if ((flags
& GMX_FORCE_VIRIAL
) &&
1523 nbv
->grp
[aloc
].nbl_lists
.nnbl
> 1)
1525 /* This is not in a subcounter because it takes a
1526 negligible and constant-sized amount of time */
1527 nbnxn_atomdata_add_nbat_fshift_to_fshift(nbv
->nbat
,
1532 /* update QMMMrec, if necessary */
1535 update_QMMMrec(cr
, fr
, as_rvec_array(x
.data()), mdatoms
, box
);
1538 /* Compute the bonded and non-bonded energies and optionally forces */
1539 do_force_lowlevel(fr
, inputrec
, &(top
->idef
),
1540 cr
, ms
, nrnb
, wcycle
, mdatoms
,
1541 as_rvec_array(x
.data()), hist
, f
, &forceWithVirial
, enerd
, fcd
,
1542 box
, inputrec
->fepvals
, lambda
, graph
, &(top
->excls
), fr
->mu_tot
,
1543 flags
, &cycles_pme
);
1545 wallcycle_stop(wcycle
, ewcFORCE
);
1547 computeSpecialForces(fplog
, cr
, inputrec
, awh
, enforcedRotation
,
1549 fr
->forceProviders
, box
, x
, mdatoms
, lambda
,
1550 flags
, &forceWithVirial
, enerd
,
1555 /* wait for non-local forces (or calculate in emulation mode) */
1556 if (DOMAINDECOMP(cr
))
1560 wallcycle_start(wcycle
, ewcWAIT_GPU_NB_NL
);
1561 nbnxn_gpu_wait_finish_task(nbv
->gpu_nbv
,
1563 enerd
->grpp
.ener
[egLJSR
], enerd
->grpp
.ener
[egCOULSR
],
1565 cycles_wait_gpu
+= wallcycle_stop(wcycle
, ewcWAIT_GPU_NB_NL
);
1569 wallcycle_start_nocount(wcycle
, ewcFORCE
);
1570 do_nb_verlet(fr
, ic
, enerd
, flags
, eintNonlocal
, enbvClearFYes
,
1571 step
, nrnb
, wcycle
);
1572 wallcycle_stop(wcycle
, ewcFORCE
);
1575 /* skip the reduction if there was no non-local work to do */
1576 if (nbv
->grp
[eintNonlocal
].nbl_lists
.nbl
[0]->nsci
> 0)
1578 nbnxn_atomdata_add_nbat_f_to_f(nbv
->nbs
.get(), eatNonlocal
,
1579 nbv
->nbat
, f
, wcycle
);
1584 if (DOMAINDECOMP(cr
))
1586 /* We are done with the CPU compute.
1587 * We will now communicate the non-local forces.
1588 * If we use a GPU this will overlap with GPU work, so in that case
1589 * we do not close the DD force balancing region here.
1591 if (ddCloseBalanceRegion
== DdCloseBalanceRegionAfterForceComputation::yes
)
1593 ddCloseBalanceRegionCpu(cr
->dd
);
1597 dd_move_f(cr
->dd
, force
, fr
->fshift
, wcycle
);
1601 // With both nonbonded and PME offloaded a GPU on the same rank, we use
1602 // an alternating wait/reduction scheme.
1603 bool alternateGpuWait
= (!c_disableAlternatingWait
&& useGpuPme
&& bUseGPU
&& !DOMAINDECOMP(cr
));
1604 if (alternateGpuWait
)
1606 alternatePmeNbGpuWaitReduce(fr
->nbv
, fr
->pmedata
, &force
, &forceWithVirial
, fr
->fshift
, enerd
, flags
, wcycle
);
1609 if (!alternateGpuWait
&& useGpuPme
)
1611 gmx::ArrayRef
<const gmx::RVec
> pmeGpuForces
;
1614 pme_gpu_wait_finish_task(fr
->pmedata
, wcycle
, &pmeGpuForces
, vir_Q
, &Vlr_q
);
1615 pme_gpu_reduce_outputs(wcycle
, &forceWithVirial
, pmeGpuForces
, enerd
, vir_Q
, Vlr_q
);
1618 /* Wait for local GPU NB outputs on the non-alternating wait path */
1619 if (!alternateGpuWait
&& bUseGPU
)
1621 /* Measured overhead on CUDA and OpenCL with(out) GPU sharing
1622 * is between 0.5 and 1.5 Mcycles. So 2 MCycles is an overestimate,
1623 * but even with a step of 0.1 ms the difference is less than 1%
1626 const float gpuWaitApiOverheadMargin
= 2e6f
; /* cycles */
1628 wallcycle_start(wcycle
, ewcWAIT_GPU_NB_L
);
1629 nbnxn_gpu_wait_finish_task(nbv
->gpu_nbv
,
1631 enerd
->grpp
.ener
[egLJSR
], enerd
->grpp
.ener
[egCOULSR
],
1633 float cycles_tmp
= wallcycle_stop(wcycle
, ewcWAIT_GPU_NB_L
);
1635 if (ddCloseBalanceRegion
== DdCloseBalanceRegionAfterForceComputation::yes
)
1637 DdBalanceRegionWaitedForGpu waitedForGpu
= DdBalanceRegionWaitedForGpu::yes
;
1638 if (bDoForces
&& cycles_tmp
<= gpuWaitApiOverheadMargin
)
1640 /* We measured few cycles, it could be that the kernel
1641 * and transfer finished earlier and there was no actual
1642 * wait time, only API call overhead.
1643 * Then the actual time could be anywhere between 0 and
1644 * cycles_wait_est. We will use half of cycles_wait_est.
1646 waitedForGpu
= DdBalanceRegionWaitedForGpu::no
;
1648 ddCloseBalanceRegionGpu(cr
->dd
, cycles_wait_gpu
, waitedForGpu
);
1652 if (fr
->nbv
->emulateGpu
== EmulateGpuNonbonded::Yes
)
1654 // NOTE: emulation kernel is not included in the balancing region,
1655 // but emulation mode does not target performance anyway
1656 wallcycle_start_nocount(wcycle
, ewcFORCE
);
1657 do_nb_verlet(fr
, ic
, enerd
, flags
, eintLocal
,
1658 DOMAINDECOMP(cr
) ? enbvClearFNo
: enbvClearFYes
,
1659 step
, nrnb
, wcycle
);
1660 wallcycle_stop(wcycle
, ewcFORCE
);
1665 pme_gpu_reinit_computation(fr
->pmedata
, wcycle
);
1670 /* now clear the GPU outputs while we finish the step on the CPU */
1671 wallcycle_start_nocount(wcycle
, ewcLAUNCH_GPU
);
1672 wallcycle_sub_start_nocount(wcycle
, ewcsLAUNCH_GPU_NONBONDED
);
1673 nbnxn_gpu_clear_outputs(nbv
->gpu_nbv
, flags
);
1675 /* Is dynamic pair-list pruning activated? */
1676 if (nbv
->listParams
->useDynamicPruning
)
1678 launchGpuRollingPruning(cr
, nbv
, inputrec
, step
);
1680 wallcycle_sub_stop(wcycle
, ewcsLAUNCH_GPU_NONBONDED
);
1681 wallcycle_stop(wcycle
, ewcLAUNCH_GPU
);
1684 /* Do the nonbonded GPU (or emulation) force buffer reduction
1685 * on the non-alternating path. */
1686 if (bUseOrEmulGPU
&& !alternateGpuWait
)
1688 nbnxn_atomdata_add_nbat_f_to_f(nbv
->nbs
.get(), eatLocal
,
1689 nbv
->nbat
, f
, wcycle
);
1692 if (DOMAINDECOMP(cr
))
1694 dd_force_flop_stop(cr
->dd
, nrnb
);
1699 /* If we have NoVirSum forces, but we do not calculate the virial,
1700 * we sum fr->f_novirsum=f later.
1702 if (vsite
&& !(fr
->haveDirectVirialContributions
&& !(flags
& GMX_FORCE_VIRIAL
)))
1704 spread_vsite_f(vsite
, as_rvec_array(x
.data()), f
, fr
->fshift
, FALSE
, nullptr, nrnb
,
1705 &top
->idef
, fr
->ePBC
, fr
->bMolPBC
, graph
, box
, cr
, wcycle
);
1708 if (flags
& GMX_FORCE_VIRIAL
)
1710 /* Calculation of the virial must be done after vsites! */
1711 calc_virial(0, mdatoms
->homenr
, as_rvec_array(x
.data()), f
,
1712 vir_force
, graph
, box
, nrnb
, fr
, inputrec
->ePBC
);
1716 if (PAR(cr
) && !thisRankHasDuty(cr
, DUTY_PME
))
1718 /* In case of node-splitting, the PP nodes receive the long-range
1719 * forces, virial and energy from the PME nodes here.
1721 pme_receive_force_ener(cr
, &forceWithVirial
, enerd
, wcycle
);
1726 post_process_forces(cr
, step
, nrnb
, wcycle
,
1727 top
, box
, as_rvec_array(x
.data()), f
, &forceWithVirial
,
1728 vir_force
, mdatoms
, graph
, fr
, vsite
,
1732 if (flags
& GMX_FORCE_ENERGY
)
1734 /* Sum the potential energy terms from group contributions */
1735 sum_epot(&(enerd
->grpp
), enerd
->term
);
1737 if (!EI_TPI(inputrec
->eI
))
1739 checkPotentialEnergyValidity(step
, *enerd
, *inputrec
);
1744 static void do_force_cutsGROUP(FILE *fplog
,
1745 const t_commrec
*cr
,
1746 const gmx_multisim_t
*ms
,
1747 const t_inputrec
*inputrec
,
1749 gmx_enfrot
*enforcedRotation
,
1752 gmx_wallcycle_t wcycle
,
1753 gmx_localtop_t
*top
,
1754 const gmx_groups_t
*groups
,
1755 matrix box
, gmx::PaddedArrayRef
<gmx::RVec
> x
,
1757 gmx::PaddedArrayRef
<gmx::RVec
> force
,
1759 const t_mdatoms
*mdatoms
,
1760 gmx_enerdata_t
*enerd
,
1765 const gmx_vsite_t
*vsite
,
1770 DdOpenBalanceRegionBeforeForceComputation ddOpenBalanceRegion
,
1771 DdCloseBalanceRegionAfterForceComputation ddCloseBalanceRegion
)
1775 gmx_bool bStateChanged
, bNS
, bFillGrid
, bCalcCGCM
;
1779 const int start
= 0;
1780 const int homenr
= mdatoms
->homenr
;
1782 clear_mat(vir_force
);
1785 if (DOMAINDECOMP(cr
))
1787 cg1
= cr
->dd
->globalAtomGroupIndices
.size();
1798 bStateChanged
= ((flags
& GMX_FORCE_STATECHANGED
) != 0);
1799 bNS
= ((flags
& GMX_FORCE_NS
) != 0) && (!fr
->bAllvsAll
);
1800 /* Should we perform the long-range nonbonded evaluation inside the neighborsearching? */
1801 bFillGrid
= (bNS
&& bStateChanged
);
1802 bCalcCGCM
= (bFillGrid
&& !DOMAINDECOMP(cr
));
1803 bDoForces
= ((flags
& GMX_FORCE_FORCES
) != 0);
1807 update_forcerec(fr
, box
);
1809 if (inputrecNeedMutot(inputrec
))
1811 /* Calculate total (local) dipole moment in a temporary common array.
1812 * This makes it possible to sum them over nodes faster.
1814 calc_mu(start
, homenr
,
1815 x
, mdatoms
->chargeA
, mdatoms
->chargeB
, mdatoms
->nChargePerturbed
,
1820 if (fr
->ePBC
!= epbcNONE
)
1822 /* Compute shift vectors every step,
1823 * because of pressure coupling or box deformation!
1825 if ((flags
& GMX_FORCE_DYNAMICBOX
) && bStateChanged
)
1827 calc_shifts(box
, fr
->shift_vec
);
1832 put_charge_groups_in_box(fplog
, cg0
, cg1
, fr
->ePBC
, box
,
1833 &(top
->cgs
), as_rvec_array(x
.data()), fr
->cg_cm
);
1834 inc_nrnb(nrnb
, eNR_CGCM
, homenr
);
1835 inc_nrnb(nrnb
, eNR_RESETX
, cg1
-cg0
);
1837 else if (EI_ENERGY_MINIMIZATION(inputrec
->eI
) && graph
)
1839 unshift_self(graph
, box
, as_rvec_array(x
.data()));
1844 calc_cgcm(fplog
, cg0
, cg1
, &(top
->cgs
), as_rvec_array(x
.data()), fr
->cg_cm
);
1845 inc_nrnb(nrnb
, eNR_CGCM
, homenr
);
1848 if (bCalcCGCM
&& gmx_debug_at
)
1850 pr_rvecs(debug
, 0, "cgcm", fr
->cg_cm
, top
->cgs
.nr
);
1854 if (!thisRankHasDuty(cr
, DUTY_PME
))
1856 /* Send particle coordinates to the pme nodes.
1857 * Since this is only implemented for domain decomposition
1858 * and domain decomposition does not use the graph,
1859 * we do not need to worry about shifting.
1861 gmx_pme_send_coordinates(cr
, box
, as_rvec_array(x
.data()),
1862 lambda
[efptCOUL
], lambda
[efptVDW
],
1863 (flags
& (GMX_FORCE_VIRIAL
| GMX_FORCE_ENERGY
)) != 0,
1866 #endif /* GMX_MPI */
1868 /* Communicate coordinates and sum dipole if necessary */
1869 if (DOMAINDECOMP(cr
))
1871 dd_move_x(cr
->dd
, box
, x
, wcycle
);
1873 /* No GPU support, no move_x overlap, so reopen the balance region here */
1874 if (ddOpenBalanceRegion
== DdOpenBalanceRegionBeforeForceComputation::yes
)
1876 ddReopenBalanceRegionCpu(cr
->dd
);
1880 if (inputrecNeedMutot(inputrec
))
1886 gmx_sumd(2*DIM
, mu
, cr
);
1887 ddReopenBalanceRegionCpu(cr
->dd
);
1889 for (i
= 0; i
< 2; i
++)
1891 for (j
= 0; j
< DIM
; j
++)
1893 fr
->mu_tot
[i
][j
] = mu
[i
*DIM
+ j
];
1897 if (fr
->efep
== efepNO
)
1899 copy_rvec(fr
->mu_tot
[0], mu_tot
);
1903 for (j
= 0; j
< DIM
; j
++)
1906 (1.0 - lambda
[efptCOUL
])*fr
->mu_tot
[0][j
] + lambda
[efptCOUL
]*fr
->mu_tot
[1][j
];
1911 /* Reset energies */
1912 reset_enerdata(enerd
);
1913 clear_rvecs(SHIFTS
, fr
->fshift
);
1917 wallcycle_start(wcycle
, ewcNS
);
1919 if (graph
&& bStateChanged
)
1921 /* Calculate intramolecular shift vectors to make molecules whole */
1922 mk_mshift(fplog
, graph
, fr
->ePBC
, box
, as_rvec_array(x
.data()));
1925 /* Do the actual neighbour searching */
1927 groups
, top
, mdatoms
,
1928 cr
, nrnb
, bFillGrid
);
1930 wallcycle_stop(wcycle
, ewcNS
);
1933 if (DOMAINDECOMP(cr
) && !thisRankHasDuty(cr
, DUTY_PME
))
1935 wallcycle_start(wcycle
, ewcPPDURINGPME
);
1936 dd_force_flop_start(cr
->dd
, nrnb
);
1941 wallcycle_start(wcycle
, ewcROT
);
1942 do_rotation(cr
, enforcedRotation
, box
, as_rvec_array(x
.data()), t
, step
, bNS
);
1943 wallcycle_stop(wcycle
, ewcROT
);
1946 /* Temporary solution until all routines take PaddedRVecVector */
1947 rvec
*f
= as_rvec_array(force
.data());
1949 /* Start the force cycle counter.
1950 * Note that a different counter is used for dynamic load balancing.
1952 wallcycle_start(wcycle
, ewcFORCE
);
1954 gmx::ArrayRef
<gmx::RVec
> forceRef
= force
;
1957 /* If we need to compute the virial, we might need a separate
1958 * force buffer for algorithms for which the virial is calculated
1959 * separately, such as PME.
1961 if ((flags
& GMX_FORCE_VIRIAL
) && fr
->haveDirectVirialContributions
)
1963 forceRef
= *fr
->forceBufferForDirectVirialContributions
;
1965 clear_rvecs_omp(forceRef
.size(), as_rvec_array(forceRef
.data()));
1968 /* Clear the short- and long-range forces */
1969 clear_rvecs(fr
->natoms_force_constr
, f
);
1972 /* forceWithVirial might need the full force atom range */
1973 gmx::ForceWithVirial
forceWithVirial(forceRef
, (flags
& GMX_FORCE_VIRIAL
) != 0);
1975 if (inputrec
->bPull
&& pull_have_constraint(inputrec
->pull_work
))
1977 clear_pull_forces(inputrec
->pull_work
);
1980 /* update QMMMrec, if necessary */
1983 update_QMMMrec(cr
, fr
, as_rvec_array(x
.data()), mdatoms
, box
);
1986 /* Compute the bonded and non-bonded energies and optionally forces */
1987 do_force_lowlevel(fr
, inputrec
, &(top
->idef
),
1988 cr
, ms
, nrnb
, wcycle
, mdatoms
,
1989 as_rvec_array(x
.data()), hist
, f
, &forceWithVirial
, enerd
, fcd
,
1990 box
, inputrec
->fepvals
, lambda
,
1991 graph
, &(top
->excls
), fr
->mu_tot
,
1995 wallcycle_stop(wcycle
, ewcFORCE
);
1997 if (DOMAINDECOMP(cr
))
1999 dd_force_flop_stop(cr
->dd
, nrnb
);
2001 if (ddCloseBalanceRegion
== DdCloseBalanceRegionAfterForceComputation::yes
)
2003 ddCloseBalanceRegionCpu(cr
->dd
);
2007 computeSpecialForces(fplog
, cr
, inputrec
, awh
, enforcedRotation
,
2009 fr
->forceProviders
, box
, x
, mdatoms
, lambda
,
2010 flags
, &forceWithVirial
, enerd
,
2015 /* Communicate the forces */
2016 if (DOMAINDECOMP(cr
))
2018 dd_move_f(cr
->dd
, force
, fr
->fshift
, wcycle
);
2019 /* Do we need to communicate the separate force array
2020 * for terms that do not contribute to the single sum virial?
2021 * Position restraints and electric fields do not introduce
2022 * inter-cg forces, only full electrostatics methods do.
2023 * When we do not calculate the virial, fr->f_novirsum = f,
2024 * so we have already communicated these forces.
2026 if (EEL_FULL(fr
->ic
->eeltype
) && cr
->dd
->n_intercg_excl
&&
2027 (flags
& GMX_FORCE_VIRIAL
))
2029 dd_move_f(cr
->dd
, forceWithVirial
.force_
, nullptr, wcycle
);
2033 /* If we have NoVirSum forces, but we do not calculate the virial,
2034 * we sum fr->f_novirsum=f later.
2036 if (vsite
&& !(fr
->haveDirectVirialContributions
&& !(flags
& GMX_FORCE_VIRIAL
)))
2038 spread_vsite_f(vsite
, as_rvec_array(x
.data()), f
, fr
->fshift
, FALSE
, nullptr, nrnb
,
2039 &top
->idef
, fr
->ePBC
, fr
->bMolPBC
, graph
, box
, cr
, wcycle
);
2042 if (flags
& GMX_FORCE_VIRIAL
)
2044 /* Calculation of the virial must be done after vsites! */
2045 calc_virial(0, mdatoms
->homenr
, as_rvec_array(x
.data()), f
,
2046 vir_force
, graph
, box
, nrnb
, fr
, inputrec
->ePBC
);
2050 if (PAR(cr
) && !thisRankHasDuty(cr
, DUTY_PME
))
2052 /* In case of node-splitting, the PP nodes receive the long-range
2053 * forces, virial and energy from the PME nodes here.
2055 pme_receive_force_ener(cr
, &forceWithVirial
, enerd
, wcycle
);
2060 post_process_forces(cr
, step
, nrnb
, wcycle
,
2061 top
, box
, as_rvec_array(x
.data()), f
, &forceWithVirial
,
2062 vir_force
, mdatoms
, graph
, fr
, vsite
,
2066 if (flags
& GMX_FORCE_ENERGY
)
2068 /* Sum the potential energy terms from group contributions */
2069 sum_epot(&(enerd
->grpp
), enerd
->term
);
2071 if (!EI_TPI(inputrec
->eI
))
2073 checkPotentialEnergyValidity(step
, *enerd
, *inputrec
);
2079 void do_force(FILE *fplog
,
2080 const t_commrec
*cr
,
2081 const gmx_multisim_t
*ms
,
2082 const t_inputrec
*inputrec
,
2084 gmx_enfrot
*enforcedRotation
,
2087 gmx_wallcycle_t wcycle
,
2088 gmx_localtop_t
*top
,
2089 const gmx_groups_t
*groups
,
2091 gmx::PaddedArrayRef
<gmx::RVec
> x
,
2093 gmx::PaddedArrayRef
<gmx::RVec
> force
,
2095 const t_mdatoms
*mdatoms
,
2096 gmx_enerdata_t
*enerd
,
2098 gmx::ArrayRef
<real
> lambda
,
2101 const gmx_vsite_t
*vsite
,
2106 DdOpenBalanceRegionBeforeForceComputation ddOpenBalanceRegion
,
2107 DdCloseBalanceRegionAfterForceComputation ddCloseBalanceRegion
)
2109 /* modify force flag if not doing nonbonded */
2110 if (!fr
->bNonbonded
)
2112 flags
&= ~GMX_FORCE_NONBONDED
;
2115 switch (inputrec
->cutoff_scheme
)
2118 do_force_cutsVERLET(fplog
, cr
, ms
, inputrec
,
2119 awh
, enforcedRotation
, step
, nrnb
, wcycle
,
2126 lambda
.data(), graph
,
2131 ddOpenBalanceRegion
,
2132 ddCloseBalanceRegion
);
2135 do_force_cutsGROUP(fplog
, cr
, ms
, inputrec
,
2136 awh
, enforcedRotation
, step
, nrnb
, wcycle
,
2143 lambda
.data(), graph
,
2147 ddOpenBalanceRegion
,
2148 ddCloseBalanceRegion
);
2151 gmx_incons("Invalid cut-off scheme passed!");
2154 /* In case we don't have constraints and are using GPUs, the next balancing
2155 * region starts here.
2156 * Some "special" work at the end of do_force_cuts?, such as vsite spread,
2157 * virial calculation and COM pulling, is not thus not included in
2158 * the balance timing, which is ok as most tasks do communication.
2160 if (ddOpenBalanceRegion
== DdOpenBalanceRegionBeforeForceComputation::yes
)
2162 ddOpenBalanceRegionCpu(cr
->dd
, DdAllowBalanceRegionReopen::no
);
2167 void do_constrain_first(FILE *fplog
, gmx::Constraints
*constr
,
2168 const t_inputrec
*ir
, const t_mdatoms
*md
,
2171 int i
, m
, start
, end
;
2173 real dt
= ir
->delta_t
;
2177 /* We need to allocate one element extra, since we might use
2178 * (unaligned) 4-wide SIMD loads to access rvec entries.
2180 snew(savex
, state
->natoms
+ 1);
2187 fprintf(debug
, "vcm: start=%d, homenr=%d, end=%d\n",
2188 start
, md
->homenr
, end
);
2190 /* Do a first constrain to reset particles... */
2191 step
= ir
->init_step
;
2194 char buf
[STEPSTRSIZE
];
2195 fprintf(fplog
, "\nConstraining the starting coordinates (step %s)\n",
2196 gmx_step_str(step
, buf
));
2200 /* constrain the current position */
2201 constr
->apply(TRUE
, FALSE
,
2203 state
->x
.rvec_array(), state
->x
.rvec_array(), nullptr,
2205 state
->lambda
[efptBONDED
], &dvdl_dum
,
2206 nullptr, nullptr, gmx::ConstraintVariable::Positions
);
2209 /* constrain the inital velocity, and save it */
2210 /* also may be useful if we need the ekin from the halfstep for velocity verlet */
2211 constr
->apply(TRUE
, FALSE
,
2213 state
->x
.rvec_array(), state
->v
.rvec_array(), state
->v
.rvec_array(),
2215 state
->lambda
[efptBONDED
], &dvdl_dum
,
2216 nullptr, nullptr, gmx::ConstraintVariable::Velocities
);
2218 /* constrain the inital velocities at t-dt/2 */
2219 if (EI_STATE_VELOCITY(ir
->eI
) && ir
->eI
!= eiVV
)
2221 auto x
= makeArrayRef(state
->x
).subArray(start
, end
);
2222 auto v
= makeArrayRef(state
->v
).subArray(start
, end
);
2223 for (i
= start
; (i
< end
); i
++)
2225 for (m
= 0; (m
< DIM
); m
++)
2227 /* Reverse the velocity */
2229 /* Store the position at t-dt in buf */
2230 savex
[i
][m
] = x
[i
][m
] + dt
*v
[i
][m
];
2233 /* Shake the positions at t=-dt with the positions at t=0
2234 * as reference coordinates.
2238 char buf
[STEPSTRSIZE
];
2239 fprintf(fplog
, "\nConstraining the coordinates at t0-dt (step %s)\n",
2240 gmx_step_str(step
, buf
));
2243 constr
->apply(TRUE
, FALSE
,
2245 state
->x
.rvec_array(), savex
, nullptr,
2247 state
->lambda
[efptBONDED
], &dvdl_dum
,
2248 state
->v
.rvec_array(), nullptr, gmx::ConstraintVariable::Positions
);
2250 for (i
= start
; i
< end
; i
++)
2252 for (m
= 0; m
< DIM
; m
++)
2254 /* Re-reverse the velocities */
2264 integrate_table(const real vdwtab
[], real scale
, int offstart
, int rstart
, int rend
,
2265 double *enerout
, double *virout
)
2267 double enersum
, virsum
;
2268 double invscale
, invscale2
, invscale3
;
2269 double r
, ea
, eb
, ec
, pa
, pb
, pc
, pd
;
2274 invscale
= 1.0/scale
;
2275 invscale2
= invscale
*invscale
;
2276 invscale3
= invscale
*invscale2
;
2278 /* Following summation derived from cubic spline definition,
2279 * Numerical Recipies in C, second edition, p. 113-116. Exact for
2280 * the cubic spline. We first calculate the negative of the
2281 * energy from rvdw to rvdw_switch, assuming that g(r)=1, and then
2282 * add the more standard, abrupt cutoff correction to that result,
2283 * yielding the long-range correction for a switched function. We
2284 * perform both the pressure and energy loops at the same time for
2285 * simplicity, as the computational cost is low. */
2289 /* Since the dispersion table has been scaled down a factor
2290 * 6.0 and the repulsion a factor 12.0 to compensate for the
2291 * c6/c12 parameters inside nbfp[] being scaled up (to save
2292 * flops in kernels), we need to correct for this.
2303 for (ri
= rstart
; ri
< rend
; ++ri
)
2307 eb
= 2.0*invscale2
*r
;
2311 pb
= 3.0*invscale2
*r
;
2312 pc
= 3.0*invscale
*r
*r
;
2315 /* this "8" is from the packing in the vdwtab array - perhaps
2316 should be defined? */
2318 offset
= 8*ri
+ offstart
;
2319 y0
= vdwtab
[offset
];
2320 f
= vdwtab
[offset
+1];
2321 g
= vdwtab
[offset
+2];
2322 h
= vdwtab
[offset
+3];
2324 enersum
+= y0
*(ea
/3 + eb
/2 + ec
) + f
*(ea
/4 + eb
/3 + ec
/2) + g
*(ea
/5 + eb
/4 + ec
/3) + h
*(ea
/6 + eb
/5 + ec
/4);
2325 virsum
+= f
*(pa
/4 + pb
/3 + pc
/2 + pd
) + 2*g
*(pa
/5 + pb
/4 + pc
/3 + pd
/2) + 3*h
*(pa
/6 + pb
/5 + pc
/4 + pd
/3);
2327 *enerout
= 4.0*M_PI
*enersum
*tabfactor
;
2328 *virout
= 4.0*M_PI
*virsum
*tabfactor
;
2331 void calc_enervirdiff(FILE *fplog
, int eDispCorr
, t_forcerec
*fr
)
2333 double eners
[2], virs
[2], enersum
, virsum
;
2334 double r0
, rc3
, rc9
;
2336 real scale
, *vdwtab
;
2338 fr
->enershiftsix
= 0;
2339 fr
->enershifttwelve
= 0;
2340 fr
->enerdiffsix
= 0;
2341 fr
->enerdifftwelve
= 0;
2343 fr
->virdifftwelve
= 0;
2345 const interaction_const_t
*ic
= fr
->ic
;
2347 if (eDispCorr
!= edispcNO
)
2349 for (i
= 0; i
< 2; i
++)
2354 if ((ic
->vdw_modifier
== eintmodPOTSHIFT
) ||
2355 (ic
->vdw_modifier
== eintmodPOTSWITCH
) ||
2356 (ic
->vdw_modifier
== eintmodFORCESWITCH
) ||
2357 (ic
->vdwtype
== evdwSHIFT
) ||
2358 (ic
->vdwtype
== evdwSWITCH
))
2360 if (((ic
->vdw_modifier
== eintmodPOTSWITCH
) ||
2361 (ic
->vdw_modifier
== eintmodFORCESWITCH
) ||
2362 (ic
->vdwtype
== evdwSWITCH
)) && ic
->rvdw_switch
== 0)
2365 "With dispersion correction rvdw-switch can not be zero "
2366 "for vdw-type = %s", evdw_names
[ic
->vdwtype
]);
2369 /* TODO This code depends on the logic in tables.c that
2370 constructs the table layout, which should be made
2371 explicit in future cleanup. */
2372 GMX_ASSERT(fr
->dispersionCorrectionTable
->interaction
== GMX_TABLE_INTERACTION_VDWREP_VDWDISP
,
2373 "Dispersion-correction code needs a table with both repulsion and dispersion terms");
2374 scale
= fr
->dispersionCorrectionTable
->scale
;
2375 vdwtab
= fr
->dispersionCorrectionTable
->data
;
2377 /* Round the cut-offs to exact table values for precision */
2378 ri0
= static_cast<int>(std::floor(ic
->rvdw_switch
*scale
));
2379 ri1
= static_cast<int>(std::ceil(ic
->rvdw
*scale
));
2381 /* The code below has some support for handling force-switching, i.e.
2382 * when the force (instead of potential) is switched over a limited
2383 * region. This leads to a constant shift in the potential inside the
2384 * switching region, which we can handle by adding a constant energy
2385 * term in the force-switch case just like when we do potential-shift.
2387 * For now this is not enabled, but to keep the functionality in the
2388 * code we check separately for switch and shift. When we do force-switch
2389 * the shifting point is rvdw_switch, while it is the cutoff when we
2390 * have a classical potential-shift.
2392 * For a pure potential-shift the potential has a constant shift
2393 * all the way out to the cutoff, and that is it. For other forms
2394 * we need to calculate the constant shift up to the point where we
2395 * start modifying the potential.
2397 ri0
= (ic
->vdw_modifier
== eintmodPOTSHIFT
) ? ri1
: ri0
;
2403 if ((ic
->vdw_modifier
== eintmodFORCESWITCH
) ||
2404 (ic
->vdwtype
== evdwSHIFT
))
2406 /* Determine the constant energy shift below rvdw_switch.
2407 * Table has a scale factor since we have scaled it down to compensate
2408 * for scaling-up c6/c12 with the derivative factors to save flops in analytical kernels.
2410 fr
->enershiftsix
= static_cast<real
>(-1.0/(rc3
*rc3
)) - 6.0*vdwtab
[8*ri0
];
2411 fr
->enershifttwelve
= static_cast<real
>( 1.0/(rc9
*rc3
)) - 12.0*vdwtab
[8*ri0
+ 4];
2413 else if (ic
->vdw_modifier
== eintmodPOTSHIFT
)
2415 fr
->enershiftsix
= static_cast<real
>(-1.0/(rc3
*rc3
));
2416 fr
->enershifttwelve
= static_cast<real
>( 1.0/(rc9
*rc3
));
2419 /* Add the constant part from 0 to rvdw_switch.
2420 * This integration from 0 to rvdw_switch overcounts the number
2421 * of interactions by 1, as it also counts the self interaction.
2422 * We will correct for this later.
2424 eners
[0] += 4.0*M_PI
*fr
->enershiftsix
*rc3
/3.0;
2425 eners
[1] += 4.0*M_PI
*fr
->enershifttwelve
*rc3
/3.0;
2427 /* Calculate the contribution in the range [r0,r1] where we
2428 * modify the potential. For a pure potential-shift modifier we will
2429 * have ri0==ri1, and there will not be any contribution here.
2431 for (i
= 0; i
< 2; i
++)
2435 integrate_table(vdwtab
, scale
, (i
== 0 ? 0 : 4), ri0
, ri1
, &enersum
, &virsum
);
2436 eners
[i
] -= enersum
;
2440 /* Alright: Above we compensated by REMOVING the parts outside r0
2441 * corresponding to the ideal VdW 1/r6 and /r12 potentials.
2443 * Regardless of whether r0 is the point where we start switching,
2444 * or the cutoff where we calculated the constant shift, we include
2445 * all the parts we are missing out to infinity from r0 by
2446 * calculating the analytical dispersion correction.
2448 eners
[0] += -4.0*M_PI
/(3.0*rc3
);
2449 eners
[1] += 4.0*M_PI
/(9.0*rc9
);
2450 virs
[0] += 8.0*M_PI
/rc3
;
2451 virs
[1] += -16.0*M_PI
/(3.0*rc9
);
2453 else if (ic
->vdwtype
== evdwCUT
||
2454 EVDW_PME(ic
->vdwtype
) ||
2455 ic
->vdwtype
== evdwUSER
)
2457 if (ic
->vdwtype
== evdwUSER
&& fplog
)
2460 "WARNING: using dispersion correction with user tables\n");
2463 /* Note that with LJ-PME, the dispersion correction is multiplied
2464 * by the difference between the actual C6 and the value of C6
2465 * that would produce the combination rule.
2466 * This means the normal energy and virial difference formulas
2470 rc3
= ic
->rvdw
*ic
->rvdw
*ic
->rvdw
;
2472 /* Contribution beyond the cut-off */
2473 eners
[0] += -4.0*M_PI
/(3.0*rc3
);
2474 eners
[1] += 4.0*M_PI
/(9.0*rc9
);
2475 if (ic
->vdw_modifier
== eintmodPOTSHIFT
)
2477 /* Contribution within the cut-off */
2478 eners
[0] += -4.0*M_PI
/(3.0*rc3
);
2479 eners
[1] += 4.0*M_PI
/(3.0*rc9
);
2481 /* Contribution beyond the cut-off */
2482 virs
[0] += 8.0*M_PI
/rc3
;
2483 virs
[1] += -16.0*M_PI
/(3.0*rc9
);
2488 "Dispersion correction is not implemented for vdw-type = %s",
2489 evdw_names
[ic
->vdwtype
]);
2492 /* When we deprecate the group kernels the code below can go too */
2493 if (ic
->vdwtype
== evdwPME
&& fr
->cutoff_scheme
== ecutsGROUP
)
2495 /* Calculate self-interaction coefficient (assuming that
2496 * the reciprocal-space contribution is constant in the
2497 * region that contributes to the self-interaction).
2499 fr
->enershiftsix
= gmx::power6(ic
->ewaldcoeff_lj
) / 6.0;
2501 eners
[0] += -gmx::power3(std::sqrt(M_PI
)*ic
->ewaldcoeff_lj
)/3.0;
2502 virs
[0] += gmx::power3(std::sqrt(M_PI
)*ic
->ewaldcoeff_lj
);
2505 fr
->enerdiffsix
= eners
[0];
2506 fr
->enerdifftwelve
= eners
[1];
2507 /* The 0.5 is due to the Gromacs definition of the virial */
2508 fr
->virdiffsix
= 0.5*virs
[0];
2509 fr
->virdifftwelve
= 0.5*virs
[1];
2513 void calc_dispcorr(const t_inputrec
*ir
, const t_forcerec
*fr
,
2514 const matrix box
, real lambda
, tensor pres
, tensor virial
,
2515 real
*prescorr
, real
*enercorr
, real
*dvdlcorr
)
2517 gmx_bool bCorrAll
, bCorrPres
;
2518 real dvdlambda
, invvol
, dens
, ninter
, avcsix
, avctwelve
, enerdiff
, svir
= 0, spres
= 0;
2528 if (ir
->eDispCorr
!= edispcNO
)
2530 bCorrAll
= (ir
->eDispCorr
== edispcAllEner
||
2531 ir
->eDispCorr
== edispcAllEnerPres
);
2532 bCorrPres
= (ir
->eDispCorr
== edispcEnerPres
||
2533 ir
->eDispCorr
== edispcAllEnerPres
);
2535 invvol
= 1/det(box
);
2538 /* Only correct for the interactions with the inserted molecule */
2539 dens
= (fr
->numAtomsForDispersionCorrection
- fr
->n_tpi
)*invvol
;
2544 dens
= fr
->numAtomsForDispersionCorrection
*invvol
;
2545 ninter
= 0.5*fr
->numAtomsForDispersionCorrection
;
2548 if (ir
->efep
== efepNO
)
2550 avcsix
= fr
->avcsix
[0];
2551 avctwelve
= fr
->avctwelve
[0];
2555 avcsix
= (1 - lambda
)*fr
->avcsix
[0] + lambda
*fr
->avcsix
[1];
2556 avctwelve
= (1 - lambda
)*fr
->avctwelve
[0] + lambda
*fr
->avctwelve
[1];
2559 enerdiff
= ninter
*(dens
*fr
->enerdiffsix
- fr
->enershiftsix
);
2560 *enercorr
+= avcsix
*enerdiff
;
2562 if (ir
->efep
!= efepNO
)
2564 dvdlambda
+= (fr
->avcsix
[1] - fr
->avcsix
[0])*enerdiff
;
2568 enerdiff
= ninter
*(dens
*fr
->enerdifftwelve
- fr
->enershifttwelve
);
2569 *enercorr
+= avctwelve
*enerdiff
;
2570 if (fr
->efep
!= efepNO
)
2572 dvdlambda
+= (fr
->avctwelve
[1] - fr
->avctwelve
[0])*enerdiff
;
2578 svir
= ninter
*dens
*avcsix
*fr
->virdiffsix
/3.0;
2579 if (ir
->eDispCorr
== edispcAllEnerPres
)
2581 svir
+= ninter
*dens
*avctwelve
*fr
->virdifftwelve
/3.0;
2583 /* The factor 2 is because of the Gromacs virial definition */
2584 spres
= -2.0*invvol
*svir
*PRESFAC
;
2586 for (m
= 0; m
< DIM
; m
++)
2588 virial
[m
][m
] += svir
;
2589 pres
[m
][m
] += spres
;
2594 /* Can't currently control when it prints, for now, just print when degugging */
2599 fprintf(debug
, "Long Range LJ corr.: <C6> %10.4e, <C12> %10.4e\n",
2605 "Long Range LJ corr.: Epot %10g, Pres: %10g, Vir: %10g\n",
2606 *enercorr
, spres
, svir
);
2610 fprintf(debug
, "Long Range LJ corr.: Epot %10g\n", *enercorr
);
2614 if (fr
->efep
!= efepNO
)
2616 *dvdlcorr
+= dvdlambda
;
2621 static void low_do_pbc_mtop(FILE *fplog
, int ePBC
, const matrix box
,
2622 const gmx_mtop_t
*mtop
, rvec x
[],
2628 if (bFirst
&& fplog
)
2630 fprintf(fplog
, "Removing pbc first time\n");
2635 for (const gmx_molblock_t
&molb
: mtop
->molblock
)
2637 const gmx_moltype_t
&moltype
= mtop
->moltype
[molb
.type
];
2638 if (moltype
.atoms
.nr
== 1 ||
2639 (!bFirst
&& moltype
.cgs
.nr
== 1))
2641 /* Just one atom or charge group in the molecule, no PBC required */
2642 as
+= molb
.nmol
*moltype
.atoms
.nr
;
2646 mk_graph_moltype(moltype
, graph
);
2648 for (mol
= 0; mol
< molb
.nmol
; mol
++)
2650 mk_mshift(fplog
, graph
, ePBC
, box
, x
+as
);
2652 shift_self(graph
, box
, x
+as
);
2653 /* The molecule is whole now.
2654 * We don't need the second mk_mshift call as in do_pbc_first,
2655 * since we no longer need this graph.
2658 as
+= moltype
.atoms
.nr
;
2666 void do_pbc_first_mtop(FILE *fplog
, int ePBC
, const matrix box
,
2667 const gmx_mtop_t
*mtop
, rvec x
[])
2669 low_do_pbc_mtop(fplog
, ePBC
, box
, mtop
, x
, TRUE
);
2672 void do_pbc_mtop(FILE *fplog
, int ePBC
, const matrix box
,
2673 const gmx_mtop_t
*mtop
, rvec x
[])
2675 low_do_pbc_mtop(fplog
, ePBC
, box
, mtop
, x
, FALSE
);
2678 void put_atoms_in_box_omp(int ePBC
, const matrix box
, gmx::ArrayRef
<gmx::RVec
> x
)
2681 nth
= gmx_omp_nthreads_get(emntDefault
);
2683 #pragma omp parallel for num_threads(nth) schedule(static)
2684 for (t
= 0; t
< nth
; t
++)
2688 size_t natoms
= x
.size();
2689 size_t offset
= (natoms
*t
)/nth
;
2690 size_t len
= (natoms
*(t
+ 1))/nth
- offset
;
2691 put_atoms_in_box(ePBC
, box
, x
.subArray(offset
, len
));
2693 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
;
2697 // TODO This can be cleaned up a lot, and move back to runner.cpp
2698 void finish_run(FILE *fplog
, const gmx::MDLogger
&mdlog
, const t_commrec
*cr
,
2699 const t_inputrec
*inputrec
,
2700 t_nrnb nrnb
[], gmx_wallcycle_t wcycle
,
2701 gmx_walltime_accounting_t walltime_accounting
,
2702 nonbonded_verlet_t
*nbv
,
2703 const gmx_pme_t
*pme
,
2704 gmx_bool bWriteStat
)
2706 t_nrnb
*nrnb_tot
= nullptr;
2708 double nbfs
= 0, mflop
= 0;
2709 double elapsed_time
,
2710 elapsed_time_over_all_ranks
,
2711 elapsed_time_over_all_threads
,
2712 elapsed_time_over_all_threads_over_all_ranks
;
2713 /* Control whether it is valid to print a report. Only the
2714 simulation master may print, but it should not do so if the run
2715 terminated e.g. before a scheduled reset step. This is
2716 complicated by the fact that PME ranks are unaware of the
2717 reason why they were sent a pmerecvqxFINISH. To avoid
2718 communication deadlocks, we always do the communication for the
2719 report, even if we've decided not to write the report, because
2720 how long it takes to finish the run is not important when we've
2721 decided not to report on the simulation performance.
2723 Further, we only report performance for dynamical integrators,
2724 because those are the only ones for which we plan to
2725 consider doing any optimizations. */
2726 bool printReport
= EI_DYNAMICS(inputrec
->eI
) && SIMMASTER(cr
);
2728 if (printReport
&& !walltime_accounting_get_valid_finish(walltime_accounting
))
2730 GMX_LOG(mdlog
.warning
).asParagraph().appendText("Simulation ended prematurely, no performance report will be written.");
2731 printReport
= false;
2738 MPI_Allreduce(nrnb
->n
, nrnb_tot
->n
, eNRNB
, MPI_DOUBLE
, MPI_SUM
,
2739 cr
->mpi_comm_mysim
);
2747 elapsed_time
= walltime_accounting_get_time_since_reset(walltime_accounting
);
2748 elapsed_time_over_all_threads
= walltime_accounting_get_time_since_reset_over_all_threads(walltime_accounting
);
2752 /* reduce elapsed_time over all MPI ranks in the current simulation */
2753 MPI_Allreduce(&elapsed_time
,
2754 &elapsed_time_over_all_ranks
,
2755 1, MPI_DOUBLE
, MPI_SUM
,
2756 cr
->mpi_comm_mysim
);
2757 elapsed_time_over_all_ranks
/= cr
->nnodes
;
2758 /* Reduce elapsed_time_over_all_threads over all MPI ranks in the
2759 * current simulation. */
2760 MPI_Allreduce(&elapsed_time_over_all_threads
,
2761 &elapsed_time_over_all_threads_over_all_ranks
,
2762 1, MPI_DOUBLE
, MPI_SUM
,
2763 cr
->mpi_comm_mysim
);
2768 elapsed_time_over_all_ranks
= elapsed_time
;
2769 elapsed_time_over_all_threads_over_all_ranks
= elapsed_time_over_all_threads
;
2774 print_flop(fplog
, nrnb_tot
, &nbfs
, &mflop
);
2781 if (thisRankHasDuty(cr
, DUTY_PP
) && DOMAINDECOMP(cr
))
2783 print_dd_statistics(cr
, inputrec
, fplog
);
2786 /* TODO Move the responsibility for any scaling by thread counts
2787 * to the code that handled the thread region, so that there's a
2788 * mechanism to keep cycle counting working during the transition
2789 * to task parallelism. */
2790 int nthreads_pp
= gmx_omp_nthreads_get(emntNonbonded
);
2791 int nthreads_pme
= gmx_omp_nthreads_get(emntPME
);
2792 wallcycle_scale_by_num_threads(wcycle
, thisRankHasDuty(cr
, DUTY_PME
) && !thisRankHasDuty(cr
, DUTY_PP
), nthreads_pp
, nthreads_pme
);
2793 auto cycle_sum(wallcycle_sum(cr
, wcycle
));
2797 auto nbnxn_gpu_timings
= use_GPU(nbv
) ? nbnxn_gpu_get_timings(nbv
->gpu_nbv
) : nullptr;
2798 gmx_wallclock_gpu_pme_t pme_gpu_timings
= {};
2799 if (pme_gpu_task_enabled(pme
))
2801 pme_gpu_get_timings(pme
, &pme_gpu_timings
);
2803 wallcycle_print(fplog
, mdlog
, cr
->nnodes
, cr
->npmenodes
, nthreads_pp
, nthreads_pme
,
2804 elapsed_time_over_all_ranks
,
2809 if (EI_DYNAMICS(inputrec
->eI
))
2811 delta_t
= inputrec
->delta_t
;
2816 print_perf(fplog
, elapsed_time_over_all_threads_over_all_ranks
,
2817 elapsed_time_over_all_ranks
,
2818 walltime_accounting_get_nsteps_done_since_reset(walltime_accounting
),
2819 delta_t
, nbfs
, mflop
);
2823 print_perf(stderr
, elapsed_time_over_all_threads_over_all_ranks
,
2824 elapsed_time_over_all_ranks
,
2825 walltime_accounting_get_nsteps_done_since_reset(walltime_accounting
),
2826 delta_t
, nbfs
, mflop
);
2831 extern void initialize_lambdas(FILE *fplog
, t_inputrec
*ir
, int *fep_state
, gmx::ArrayRef
<real
> lambda
, double *lam0
)
2833 /* this function works, but could probably use a logic rewrite to keep all the different
2834 types of efep straight. */
2836 if ((ir
->efep
== efepNO
) && (!ir
->bSimTemp
))
2841 t_lambda
*fep
= ir
->fepvals
;
2842 *fep_state
= fep
->init_fep_state
; /* this might overwrite the checkpoint
2843 if checkpoint is set -- a kludge is in for now
2846 for (int i
= 0; i
< efptNR
; i
++)
2848 /* overwrite lambda state with init_lambda for now for backwards compatibility */
2849 if (fep
->init_lambda
>= 0) /* if it's -1, it was never initializd */
2851 lambda
[i
] = fep
->init_lambda
;
2854 lam0
[i
] = lambda
[i
];
2859 lambda
[i
] = fep
->all_lambda
[i
][*fep_state
];
2862 lam0
[i
] = lambda
[i
];
2868 /* need to rescale control temperatures to match current state */
2869 for (int i
= 0; i
< ir
->opts
.ngtc
; i
++)
2871 if (ir
->opts
.ref_t
[i
] > 0)
2873 ir
->opts
.ref_t
[i
] = ir
->simtempvals
->temperatures
[*fep_state
];
2878 /* Send to the log the information on the current lambdas */
2879 if (fplog
!= nullptr)
2881 fprintf(fplog
, "Initial vector of lambda components:[ ");
2882 for (const auto &l
: lambda
)
2884 fprintf(fplog
, "%10.4f ", l
);
2886 fprintf(fplog
, "]\n");
2891 void init_md(FILE *fplog
,
2892 const t_commrec
*cr
, gmx::IMDOutputProvider
*outputProvider
,
2893 t_inputrec
*ir
, const gmx_output_env_t
*oenv
,
2894 const MdrunOptions
&mdrunOptions
,
2895 double *t
, double *t0
,
2896 t_state
*globalState
, double *lam0
,
2897 t_nrnb
*nrnb
, gmx_mtop_t
*mtop
,
2899 gmx::BoxDeformation
*deform
,
2900 int nfile
, const t_filenm fnm
[],
2901 gmx_mdoutf_t
*outf
, t_mdebin
**mdebin
,
2902 tensor force_vir
, tensor shake_vir
,
2903 tensor total_vir
, tensor pres
, rvec mu_tot
,
2904 gmx_bool
*bSimAnn
, t_vcm
**vcm
,
2905 gmx_wallcycle_t wcycle
)
2909 /* Initial values */
2910 *t
= *t0
= ir
->init_t
;
2913 for (i
= 0; i
< ir
->opts
.ngtc
; i
++)
2915 /* set bSimAnn if any group is being annealed */
2916 if (ir
->opts
.annealing
[i
] != eannNO
)
2922 /* Initialize lambda variables */
2923 /* TODO: Clean up initialization of fep_state and lambda in t_state.
2924 * We currently need to call initialize_lambdas on non-master ranks
2925 * to initialize lam0.
2929 initialize_lambdas(fplog
, ir
, &globalState
->fep_state
, globalState
->lambda
, lam0
);
2934 std::array
<real
, efptNR
> tmpLambda
;
2935 initialize_lambdas(fplog
, ir
, &tmpFepState
, tmpLambda
, lam0
);
2938 // TODO upd is never NULL in practice, but the analysers don't know that
2941 *upd
= init_update(ir
, deform
);
2945 update_annealing_target_temp(ir
, ir
->init_t
, upd
? *upd
: nullptr);
2950 *vcm
= init_vcm(fplog
, &mtop
->groups
, ir
);
2953 if (EI_DYNAMICS(ir
->eI
) && !mdrunOptions
.continuationOptions
.appendFiles
)
2955 if (ir
->etc
== etcBERENDSEN
)
2957 please_cite(fplog
, "Berendsen84a");
2959 if (ir
->etc
== etcVRESCALE
)
2961 please_cite(fplog
, "Bussi2007a");
2963 if (ir
->eI
== eiSD1
)
2965 please_cite(fplog
, "Goga2012");
2972 *outf
= init_mdoutf(fplog
, nfile
, fnm
, mdrunOptions
, cr
, outputProvider
, ir
, mtop
, oenv
, wcycle
);
2974 *mdebin
= init_mdebin(mdrunOptions
.continuationOptions
.appendFiles
? nullptr : mdoutf_get_fp_ene(*outf
),
2975 mtop
, ir
, mdoutf_get_fp_dhdl(*outf
));
2978 /* Initiate variables */
2979 clear_mat(force_vir
);
2980 clear_mat(shake_vir
);
2982 clear_mat(total_vir
);
2986 void init_rerun(FILE *fplog
,
2987 const t_commrec
*cr
, gmx::IMDOutputProvider
*outputProvider
,
2988 t_inputrec
*ir
, const gmx_output_env_t
*oenv
,
2989 const MdrunOptions
&mdrunOptions
,
2990 t_state
*globalState
, double *lam0
,
2991 t_nrnb
*nrnb
, gmx_mtop_t
*mtop
,
2992 int nfile
, const t_filenm fnm
[],
2993 gmx_mdoutf_t
*outf
, t_mdebin
**mdebin
,
2994 gmx_wallcycle_t wcycle
)
2996 /* Initialize lambda variables */
2997 /* TODO: Clean up initialization of fep_state and lambda in t_state.
2998 * We currently need to call initialize_lambdas on non-master ranks
2999 * to initialize lam0.
3003 initialize_lambdas(fplog
, ir
, &globalState
->fep_state
, globalState
->lambda
, lam0
);
3008 std::array
<real
, efptNR
> tmpLambda
;
3009 initialize_lambdas(fplog
, ir
, &tmpFepState
, tmpLambda
, lam0
);
3016 *outf
= init_mdoutf(fplog
, nfile
, fnm
, mdrunOptions
, cr
, outputProvider
, ir
, mtop
, oenv
, wcycle
);
3017 *mdebin
= init_mdebin(mdrunOptions
.continuationOptions
.appendFiles
? nullptr : mdoutf_get_fp_ene(*outf
),
3018 mtop
, ir
, mdoutf_get_fp_dhdl(*outf
), true);