/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 * Copyright (c) 2001-2004, The GROMACS development team.
 * Copyright (c) 2013,2014,2015,2016,2017,2018, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
#include "gromacs/awh/awh.h"
#include "gromacs/domdec/dlbtiming.h"
#include "gromacs/domdec/domdec.h"
#include "gromacs/domdec/domdec_struct.h"
#include "gromacs/essentialdynamics/edsam.h"
#include "gromacs/ewald/pme.h"
#include "gromacs/gmxlib/chargegroup.h"
#include "gromacs/gmxlib/network.h"
#include "gromacs/gmxlib/nrnb.h"
#include "gromacs/gmxlib/nonbonded/nb_free_energy.h"
#include "gromacs/gmxlib/nonbonded/nb_kernel.h"
#include "gromacs/gmxlib/nonbonded/nonbonded.h"
#include "gromacs/gpu_utils/gpu_utils.h"
#include "gromacs/imd/imd.h"
#include "gromacs/listed-forces/bonded.h"
#include "gromacs/listed-forces/disre.h"
#include "gromacs/listed-forces/orires.h"
#include "gromacs/math/functions.h"
#include "gromacs/math/units.h"
#include "gromacs/math/vec.h"
#include "gromacs/math/vecdump.h"
#include "gromacs/mdlib/calcmu.h"
#include "gromacs/mdlib/constr.h"
#include "gromacs/mdlib/force.h"
#include "gromacs/mdlib/forcerec.h"
#include "gromacs/mdlib/gmx_omp_nthreads.h"
#include "gromacs/mdlib/mdrun.h"
#include "gromacs/mdlib/nb_verlet.h"
#include "gromacs/mdlib/nbnxn_atomdata.h"
#include "gromacs/mdlib/nbnxn_gpu_data_mgmt.h"
#include "gromacs/mdlib/nbnxn_grid.h"
#include "gromacs/mdlib/nbnxn_search.h"
#include "gromacs/mdlib/qmmm.h"
#include "gromacs/mdlib/update.h"
#include "gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_gpu_ref.h"
#include "gromacs/mdtypes/commrec.h"
#include "gromacs/mdtypes/forceoutput.h"
#include "gromacs/mdtypes/iforceprovider.h"
#include "gromacs/mdtypes/inputrec.h"
#include "gromacs/mdtypes/md_enums.h"
#include "gromacs/mdtypes/state.h"
#include "gromacs/pbcutil/ishift.h"
#include "gromacs/pbcutil/mshift.h"
#include "gromacs/pbcutil/pbc.h"
#include "gromacs/pulling/pull.h"
#include "gromacs/pulling/pull_rotation.h"
#include "gromacs/timing/cyclecounter.h"
#include "gromacs/timing/gpu_timing.h"
#include "gromacs/timing/wallcycle.h"
#include "gromacs/timing/wallcyclereporting.h"
#include "gromacs/timing/walltime_accounting.h"
#include "gromacs/topology/topology.h"
#include "gromacs/utility/arrayref.h"
#include "gromacs/utility/basedefinitions.h"
#include "gromacs/utility/cstringutil.h"
#include "gromacs/utility/exceptions.h"
#include "gromacs/utility/fatalerror.h"
#include "gromacs/utility/gmxassert.h"
#include "gromacs/utility/gmxmpi.h"
#include "gromacs/utility/logger.h"
#include "gromacs/utility/pleasecite.h"
#include "gromacs/utility/smalloc.h"
#include "gromacs/utility/sysinfo.h"

#include "nbnxn_gpu.h"
#include "nbnxn_kernels/nbnxn_kernel_cpu.h"
#include "nbnxn_kernels/nbnxn_kernel_prune.h"
// TODO: this environment variable allows us to verify before release
// that on less common architectures the total cost of polling is not larger than
// a blocking wait (so polling does not introduce overhead when the static
// PME-first ordering would suffice).
static const bool c_disableAlternatingWait = (getenv("GMX_DISABLE_ALTERNATING_GPU_WAIT") != nullptr);
void print_time(FILE                     *out,
                gmx_walltime_accounting_t walltime_accounting,
                gmx_int64_t               step,
                const t_inputrec         *ir,
                const t_commrec          *cr)
{
    time_t finish;
    char   timebuf[STRLEN];
    char   buf[STRLEN];
    double dt, elapsed_seconds, time_per_step;

    fprintf(out, "step %s", gmx_step_str(step, buf));

    if ((step >= ir->nstlist))
    {
        double seconds_since_epoch = gmx_gettime();

        elapsed_seconds = seconds_since_epoch - walltime_accounting_get_start_time_stamp(walltime_accounting);
        time_per_step   = elapsed_seconds/(step - ir->init_step + 1);
        dt              = (ir->nsteps + ir->init_step - step)*time_per_step;

        if (ir->nsteps >= 0)
        {
            if (dt >= 300)
            {
                /* Far from the end: print the estimated finish date */
                finish = (time_t) (seconds_since_epoch + dt);
                gmx_ctime_r(&finish, timebuf, STRLEN);
                sprintf(buf, "%s", timebuf);
                buf[strlen(buf)-1] = '\0';
                fprintf(out, ", will finish %s", buf);
            }
            else
            {
                fprintf(out, ", remaining wall clock time: %5d s          ", (int)dt);
            }
        }
        else
        {
            fprintf(out, " performance: %.1f ns/day    ",
                    ir->delta_t/1000*24*60*60/time_per_step);
        }
    }

    GMX_UNUSED_VALUE(cr);
}
void print_date_and_time(FILE *fplog, int nodeid, const char *title,
                         double the_time)
{
    char time_string[STRLEN];

    if (!fplog)
    {
        return;
    }

    {
        int    i;
        char   timebuf[STRLEN];
        time_t temp_time = (time_t) the_time;

        gmx_ctime_r(&temp_time, timebuf, STRLEN);
        for (i = 0; timebuf[i] >= ' '; i++)
        {
            time_string[i] = timebuf[i];
        }
        time_string[i] = '\0';
    }

    fprintf(fplog, "%s on rank %d %s\n", title, nodeid, time_string);
}
void print_start(FILE *fplog, const t_commrec *cr,
                 gmx_walltime_accounting_t walltime_accounting,
                 const char *name)
{
    char buf[STRLEN];

    sprintf(buf, "Started %s", name);
    print_date_and_time(fplog, cr->nodeid, buf,
                        walltime_accounting_get_start_time_stamp(walltime_accounting));
}
static void sum_forces(rvec f[], gmx::ArrayRef<const gmx::RVec> forceToAdd)
{
    const int      end = forceToAdd.size();

    // cppcheck-suppress unreadVariable
    int gmx_unused nt  = gmx_omp_nthreads_get(emntDefault);
#pragma omp parallel for num_threads(nt) schedule(static)
    for (int i = 0; i < end; i++)
    {
        rvec_inc(f[i], forceToAdd[i]);
    }
}
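/* A minimal, self-contained sketch of the accumulate-into-destination pattern
 * used by sum_forces() above, written for plain double arrays instead of
 * rvec/ArrayRef. The names here (addForcesExample) are illustrative only and
 * are not part of the GROMACS API; the block is kept under "#if 0" so it does
 * not affect the build.
 */
#if 0
static void addForcesExample(int n, double (*f)[3], const double (*forceToAdd)[3])
{
    /* Each thread handles a contiguous chunk of atoms; the per-atom additions
     * are independent, so no reduction clause or atomics are needed. */
#pragma omp parallel for schedule(static)
    for (int i = 0; i < n; i++)
    {
        f[i][0] += forceToAdd[i][0];
        f[i][1] += forceToAdd[i][1];
        f[i][2] += forceToAdd[i][2];
    }
}
#endif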
static void pme_gpu_reduce_outputs(gmx_wallcycle_t            wcycle,
                                   ForceWithVirial           *forceWithVirial,
                                   ArrayRef<const gmx::RVec>  pmeForces,
                                   gmx_enerdata_t            *enerd,
                                   const tensor               vir_Q,
                                   real                       Vlr_q)
{
    wallcycle_start(wcycle, ewcPME_GPU_F_REDUCTION);
    GMX_ASSERT(forceWithVirial, "Invalid force pointer");
    forceWithVirial->addVirialContribution(vir_Q);
    enerd->term[F_COUL_RECIP] += Vlr_q;
    sum_forces(as_rvec_array(forceWithVirial->force_.data()), pmeForces);
    wallcycle_stop(wcycle, ewcPME_GPU_F_REDUCTION);
}
static void calc_virial(int start, int homenr, rvec x[], rvec f[],
                        tensor vir_part, t_graph *graph, matrix box,
                        t_nrnb *nrnb, const t_forcerec *fr, int ePBC)
{
    /* The short-range virial from surrounding boxes */
    calc_vir(SHIFTS, fr->shift_vec, fr->fshift, vir_part, ePBC == epbcSCREW, box);
    inc_nrnb(nrnb, eNR_VIRIAL, SHIFTS);

    /* Calculate partial virial, for local atoms only, based on short range.
     * Total virial is computed in global_stat, called from do_md
     */
    f_calc_vir(start, start+homenr, x, f, vir_part, graph, box);
    inc_nrnb(nrnb, eNR_VIRIAL, homenr);

    /* Add wall contribution */
    for (int i = 0; i < DIM; i++)
    {
        vir_part[i][ZZ] += fr->vir_wall_z[i];
    }

    if (debug)
    {
        pr_rvecs(debug, 0, "vir_part", vir_part, DIM);
    }
}
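/* For reference, the single-sum virial accumulated above is essentially
 *     Xi(a,b) = -1/2 * sum_i x_i(a) * f_i(b)
 * over the local atoms, plus the shift-vector term for periodic images.
 * Below is a minimal sketch of that inner sum on plain arrays; it is an
 * illustration only (hypothetical names, no shift-vector or graph handling)
 * and is kept under "#if 0" so it does not affect the build.
 */
#if 0
static void partialVirialExample(int n, const double (*x)[3], const double (*f)[3],
                                 double vir[3][3])
{
    for (int i = 0; i < n; i++)
    {
        for (int a = 0; a < 3; a++)
        {
            for (int b = 0; b < 3; b++)
            {
                vir[a][b] += -0.5*x[i][a]*f[i][b];
            }
        }
    }
}
#endif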
static void pull_potential_wrapper(const t_commrec      *cr,
                                   const t_inputrec     *ir,
                                   matrix                box,
                                   ArrayRef<const RVec>  x,
                                   ForceWithVirial      *force,
                                   const t_mdatoms      *mdatoms,
                                   gmx_enerdata_t       *enerd,
                                   real                 *lambda,
                                   double                t,
                                   gmx_wallcycle_t       wcycle)
{
    t_pbc pbc;
    real  dvdl;

    /* Calculate the center of mass forces, this requires communication,
     * which is why pull_potential is called close to other communication.
     */
    wallcycle_start(wcycle, ewcPULLPOT);
    set_pbc(&pbc, ir->ePBC, box);
    dvdl                     = 0;
    enerd->term[F_COM_PULL] +=
        pull_potential(ir->pull_work, mdatoms, &pbc,
                       cr, t, lambda[efptRESTRAINT], as_rvec_array(x.data()), force, &dvdl);
    enerd->dvdl_lin[efptRESTRAINT] += dvdl;
    wallcycle_stop(wcycle, ewcPULLPOT);
}
static void pme_receive_force_ener(const t_commrec *cr,
                                   ForceWithVirial *forceWithVirial,
                                   gmx_enerdata_t  *enerd,
                                   gmx_wallcycle_t  wcycle)
{
    real  e_q, e_lj, dvdl_q, dvdl_lj;
    float cycles_ppdpme, cycles_seppme;

    cycles_ppdpme = wallcycle_stop(wcycle, ewcPPDURINGPME);
    dd_cycles_add(cr->dd, cycles_ppdpme, ddCyclPPduringPME);

    /* In case of node-splitting, the PP nodes receive the long-range
     * forces, virial and energy from the PME nodes here.
     */
    wallcycle_start(wcycle, ewcPP_PMEWAITRECVF);
    dvdl_q  = 0;
    dvdl_lj = 0;
    gmx_pme_receive_f(cr, forceWithVirial, &e_q, &e_lj, &dvdl_q, &dvdl_lj,
                      &cycles_seppme);
    enerd->term[F_COUL_RECIP] += e_q;
    enerd->term[F_LJ_RECIP]   += e_lj;
    enerd->dvdl_lin[efptCOUL] += dvdl_q;
    enerd->dvdl_lin[efptVDW]  += dvdl_lj;

    if (wcycle)
    {
        dd_cycles_add(cr->dd, cycles_seppme, ddCyclPME);
    }
    wallcycle_stop(wcycle, ewcPP_PMEWAITRECVF);
}
static void print_large_forces(FILE            *fp,
                               const t_mdatoms *md,
                               const t_commrec *cr,
                               gmx_int64_t      step,
                               real             forceTolerance,
                               const rvec      *x,
                               const rvec      *f)
{
    real           force2Tolerance = gmx::square(forceTolerance);
    std::uintmax_t numNonFinite    = 0;
    for (int i = 0; i < md->homenr; i++)
    {
        real force2    = norm2(f[i]);
        bool nonFinite = !std::isfinite(force2);
        if (force2 >= force2Tolerance || nonFinite)
        {
            fprintf(fp, "step %" GMX_PRId64 " atom %6d  x %8.3f %8.3f %8.3f  force %12.5e\n",
                    step,
                    ddglatnr(cr->dd, i), x[i][XX], x[i][YY], x[i][ZZ], std::sqrt(force2));
        }
        if (nonFinite)
        {
            numNonFinite++;
        }
    }
    if (numNonFinite > 0)
    {
        /* Note that with MPI this fatal call on one rank might interrupt
         * the printing on other ranks. But we can only avoid that with
         * an expensive MPI barrier that we would need at each step.
         */
        gmx_fatal(FARGS, "At step %" GMX_PRId64 " detected non-finite forces on %ju atoms", step, numNonFinite);
    }
}
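/* A small self-contained sketch of the same screening logic used above:
 * compare squared norms against a squared tolerance (so no square root is
 * needed per atom) and count non-finite entries separately. Hypothetical
 * names; kept under "#if 0" so it does not affect the build.
 */
#if 0
#include <cmath>
#include <cstdint>

static std::uintmax_t countSuspectForces(int n, const double (*f)[3], double forceTolerance)
{
    const double   tol2       = forceTolerance*forceTolerance;
    std::uintmax_t numSuspect = 0;
    for (int i = 0; i < n; i++)
    {
        const double force2 = f[i][0]*f[i][0] + f[i][1]*f[i][1] + f[i][2]*f[i][2];
        if (!std::isfinite(force2) || force2 >= tol2)
        {
            numSuspect++;
        }
    }
    return numSuspect;
}
#endif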
static void post_process_forces(const t_commrec      *cr,
                                gmx_int64_t           step,
                                t_nrnb               *nrnb,
                                gmx_wallcycle_t       wcycle,
                                const gmx_localtop_t *top,
                                matrix                box,
                                rvec                  x[],
                                rvec                  f[],
                                ForceWithVirial      *forceWithVirial,
                                tensor                vir_force,
                                const t_mdatoms      *mdatoms,
                                t_graph              *graph,
                                t_forcerec           *fr,
                                const gmx_vsite_t    *vsite,
                                int                   flags)
{
    if (fr->haveDirectVirialContributions)
    {
        rvec *fDirectVir = as_rvec_array(forceWithVirial->force_.data());

        if (vsite)
        {
            /* Spread the mesh force on virtual sites to the other particles...
             * This is parallellized. MPI communication is performed
             * if the constructing atoms aren't local.
             */
            matrix virial = { { 0 } };
            spread_vsite_f(vsite, x, fDirectVir, nullptr,
                           (flags & GMX_FORCE_VIRIAL), virial,
                           nrnb,
                           &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr, wcycle);
            forceWithVirial->addVirialContribution(virial);
        }

        if (flags & GMX_FORCE_VIRIAL)
        {
            /* Now add the forces, this is local */
            sum_forces(f, forceWithVirial->force_);

            /* Add the direct virial contributions */
            GMX_ASSERT(forceWithVirial->computeVirial_, "forceWithVirial should request virial computation when we request the virial");
            m_add(vir_force, forceWithVirial->getVirial(), vir_force);

            if (debug)
            {
                pr_rvecs(debug, 0, "vir_force", vir_force, DIM);
            }
        }
    }

    if (fr->print_force >= 0)
    {
        print_large_forces(stderr, mdatoms, cr, step, fr->print_force, x, f);
    }
}
static void do_nb_verlet(t_forcerec          *fr,
                         interaction_const_t *ic,
                         gmx_enerdata_t      *enerd,
                         int                  flags,
                         int                  ilocality,
                         int                  clearF,
                         gmx_int64_t          step,
                         t_nrnb              *nrnb,
                         gmx_wallcycle_t      wcycle)
{
    if (!(flags & GMX_FORCE_NONBONDED))
    {
        /* skip non-bonded calculation */
        return;
    }

    nonbonded_verlet_t       *nbv  = fr->nbv;
    nonbonded_verlet_group_t *nbvg = &nbv->grp[ilocality];

    /* GPU kernel launch overhead is already timed separately */
    if (fr->cutoff_scheme != ecutsVERLET)
    {
        gmx_incons("Invalid cut-off scheme passed!");
    }

    bool bUsingGpuKernels = (nbvg->kernel_type == nbnxnk8x8x8_GPU);

    if (!bUsingGpuKernels)
    {
        /* When dynamic pair-list pruning is requested, we need to prune
         * at nstlistPrune steps.
         */
        if (nbv->listParams->useDynamicPruning &&
            (step - nbvg->nbl_lists.outerListCreationStep) % nbv->listParams->nstlistPrune == 0)
        {
            /* Prune the pair-list beyond fr->ic->rlistPrune using
             * the current coordinates of the atoms.
             */
            wallcycle_sub_start(wcycle, ewcsNONBONDED_PRUNING);
            nbnxn_kernel_cpu_prune(nbvg, nbv->nbat, fr->shift_vec, nbv->listParams->rlistInner);
            wallcycle_sub_stop(wcycle, ewcsNONBONDED_PRUNING);
        }

        wallcycle_sub_start(wcycle, ewcsNONBONDED);
    }

    switch (nbvg->kernel_type)
    {
        case nbnxnk4x4_PlainC:
        case nbnxnk4xN_SIMD_4xN:
        case nbnxnk4xN_SIMD_2xNN:
            nbnxn_kernel_cpu(nbvg,
                             nbv->nbat,
                             ic,
                             fr->shift_vec,
                             flags,
                             clearF,
                             fr->fshift[0],
                             enerd->grpp.ener[egCOULSR],
                             fr->bBHAM ?
                             enerd->grpp.ener[egBHAMSR] :
                             enerd->grpp.ener[egLJSR]);
            break;

        case nbnxnk8x8x8_GPU:
            nbnxn_gpu_launch_kernel(nbv->gpu_nbv, nbv->nbat, flags, ilocality);
            break;

        case nbnxnk8x8x8_PlainC:
            nbnxn_kernel_gpu_ref(nbvg->nbl_lists.nbl[0],
                                 nbv->nbat, ic,
                                 fr->shift_vec,
                                 flags,
                                 clearF,
                                 fr->fshift[0],
                                 enerd->grpp.ener[egCOULSR],
                                 fr->bBHAM ?
                                 enerd->grpp.ener[egBHAMSR] :
                                 enerd->grpp.ener[egLJSR]);
            break;

        default:
            GMX_RELEASE_ASSERT(false, "Invalid nonbonded kernel type passed!");
    }
    if (!bUsingGpuKernels)
    {
        wallcycle_sub_stop(wcycle, ewcsNONBONDED);
    }

    int enr_nbnxn_kernel_ljc, enr_nbnxn_kernel_lj;
    if (EEL_RF(ic->eeltype) || ic->eeltype == eelCUT)
    {
        enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_RF;
    }
    else if ((!bUsingGpuKernels && nbvg->ewald_excl == ewaldexclAnalytical) ||
             (bUsingGpuKernels && nbnxn_gpu_is_kernel_ewald_analytical(nbv->gpu_nbv)))
    {
        enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_EWALD;
    }
    else
    {
        enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_TAB;
    }
    enr_nbnxn_kernel_lj = eNR_NBNXN_LJ;
    if (flags & GMX_FORCE_ENERGY)
    {
        /* In eNR_??? the nbnxn F+E kernels are always the F kernel + 1 */
        enr_nbnxn_kernel_ljc += 1;
        enr_nbnxn_kernel_lj  += 1;
    }

    inc_nrnb(nrnb, enr_nbnxn_kernel_ljc,
             nbvg->nbl_lists.natpair_ljq);
    inc_nrnb(nrnb, enr_nbnxn_kernel_lj,
             nbvg->nbl_lists.natpair_lj);
    /* The Coulomb-only kernels are offset -eNR_NBNXN_LJ_RF+eNR_NBNXN_RF */
    inc_nrnb(nrnb, enr_nbnxn_kernel_ljc-eNR_NBNXN_LJ_RF+eNR_NBNXN_RF,
             nbvg->nbl_lists.natpair_q);

    if (ic->vdw_modifier == eintmodFORCESWITCH)
    {
        /* We add up the switch cost separately */
        inc_nrnb(nrnb, eNR_NBNXN_ADD_LJ_FSW+((flags & GMX_FORCE_ENERGY) ? 1 : 0),
                 nbvg->nbl_lists.natpair_ljq + nbvg->nbl_lists.natpair_lj);
    }
    if (ic->vdw_modifier == eintmodPOTSWITCH)
    {
        /* We add up the switch cost separately */
        inc_nrnb(nrnb, eNR_NBNXN_ADD_LJ_PSW+((flags & GMX_FORCE_ENERGY) ? 1 : 0),
                 nbvg->nbl_lists.natpair_ljq + nbvg->nbl_lists.natpair_lj);
    }
    if (ic->vdwtype == evdwPME)
    {
        /* We add up the LJ Ewald cost separately */
        inc_nrnb(nrnb, eNR_NBNXN_ADD_LJ_EWALD+((flags & GMX_FORCE_ENERGY) ? 1 : 0),
                 nbvg->nbl_lists.natpair_ljq + nbvg->nbl_lists.natpair_lj);
    }
}
static void do_nb_verlet_fep(nbnxn_pairlist_set_t *nbl_lists,
                             t_forcerec           *fr,
                             rvec                  x[],
                             rvec                  f[],
                             const t_mdatoms      *mdatoms,
                             t_lambda             *fepvals,
                             real                 *lambda,
                             gmx_enerdata_t       *enerd,
                             int                   flags,
                             t_nrnb               *nrnb,
                             gmx_wallcycle_t       wcycle)
{
    int              donb_flags;
    nb_kernel_data_t kernel_data;
    real             lam_i[efptNR];
    real             dvdl_nb[efptNR];
    int              th;
    int              i, j;

    donb_flags = 0;
    /* Add short-range interactions */
    donb_flags |= GMX_NONBONDED_DO_SR;

    /* Currently all group scheme kernels always calculate (shift-)forces */
    if (flags & GMX_FORCE_FORCES)
    {
        donb_flags |= GMX_NONBONDED_DO_FORCE;
    }
    if (flags & GMX_FORCE_VIRIAL)
    {
        donb_flags |= GMX_NONBONDED_DO_SHIFTFORCE;
    }
    if (flags & GMX_FORCE_ENERGY)
    {
        donb_flags |= GMX_NONBONDED_DO_POTENTIAL;
    }

    kernel_data.flags  = donb_flags;
    kernel_data.lambda = lambda;
    kernel_data.dvdl   = dvdl_nb;

    kernel_data.energygrp_elec = enerd->grpp.ener[egCOULSR];
    kernel_data.energygrp_vdw  = enerd->grpp.ener[egLJSR];

    /* reset free energy components */
    for (i = 0; i < efptNR; i++)
    {
        dvdl_nb[i] = 0;
    }

    GMX_ASSERT(gmx_omp_nthreads_get(emntNonbonded) == nbl_lists->nnbl, "Number of lists should be same as number of NB threads");

    wallcycle_sub_start(wcycle, ewcsNONBONDED);
#pragma omp parallel for schedule(static) num_threads(nbl_lists->nnbl)
    for (th = 0; th < nbl_lists->nnbl; th++)
    {
        try
        {
            gmx_nb_free_energy_kernel(nbl_lists->nbl_fep[th],
                                      x, f, fr, mdatoms, &kernel_data, nrnb);
        }
        GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
    }

    if (fepvals->sc_alpha != 0)
    {
        enerd->dvdl_nonlin[efptVDW]  += dvdl_nb[efptVDW];
        enerd->dvdl_nonlin[efptCOUL] += dvdl_nb[efptCOUL];
    }
    else
    {
        enerd->dvdl_lin[efptVDW]  += dvdl_nb[efptVDW];
        enerd->dvdl_lin[efptCOUL] += dvdl_nb[efptCOUL];
    }

    /* If we do foreign lambda and we have soft-core interactions
     * we have to recalculate the (non-linear) energies contributions.
     */
    if (fepvals->n_lambda > 0 && (flags & GMX_FORCE_DHDL) && fepvals->sc_alpha != 0)
    {
        kernel_data.flags          = (donb_flags & ~(GMX_NONBONDED_DO_FORCE | GMX_NONBONDED_DO_SHIFTFORCE)) | GMX_NONBONDED_DO_FOREIGNLAMBDA;
        kernel_data.lambda         = lam_i;
        kernel_data.energygrp_elec = enerd->foreign_grpp.ener[egCOULSR];
        kernel_data.energygrp_vdw  = enerd->foreign_grpp.ener[egLJSR];
        /* Note that we add to kernel_data.dvdl, but ignore the result */

        for (i = 0; i < enerd->n_lambda; i++)
        {
            for (j = 0; j < efptNR; j++)
            {
                lam_i[j] = (i == 0 ? lambda[j] : fepvals->all_lambda[j][i-1]);
            }
            reset_foreign_enerdata(enerd);
#pragma omp parallel for schedule(static) num_threads(nbl_lists->nnbl)
            for (th = 0; th < nbl_lists->nnbl; th++)
            {
                try
                {
                    gmx_nb_free_energy_kernel(nbl_lists->nbl_fep[th],
                                              x, f, fr, mdatoms, &kernel_data, nrnb);
                }
                GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
            }

            sum_epot(&(enerd->foreign_grpp), enerd->foreign_term);
            enerd->enerpart_lambda[i] += enerd->foreign_term[F_EPOT];
        }
    }

    wallcycle_sub_stop(wcycle, ewcsNONBONDED);
}
gmx_bool use_GPU(const nonbonded_verlet_t *nbv)
{
    return nbv != nullptr && nbv->bUseGPU;
}
static inline void clear_rvecs_omp(int n, rvec v[])
{
    int nth = gmx_omp_nthreads_get_simple_rvec_task(emntDefault, n);

    /* Note that we would like to avoid this conditional by putting it
     * into the omp pragma instead, but then we still take the full
     * omp parallel for overhead (at least with gcc5).
     */
    if (nth == 1)
    {
        for (int i = 0; i < n; i++)
        {
            clear_rvec(v[i]);
        }
    }
    else
    {
#pragma omp parallel for num_threads(nth) schedule(static)
        for (int i = 0; i < n; i++)
        {
            clear_rvec(v[i]);
        }
    }
}
/*! \brief Return an estimate of the average kinetic energy or 0 when unreliable
 *
 * \param groupOptions  Group options, containing T-coupling options
 */
static real averageKineticEnergyEstimate(const t_grpopts &groupOptions)
{
    real nrdfCoupled   = 0;
    real nrdfUncoupled = 0;
    real kineticEnergy = 0;
    for (int g = 0; g < groupOptions.ngtc; g++)
    {
        if (groupOptions.tau_t[g] >= 0)
        {
            nrdfCoupled   += groupOptions.nrdf[g];
            kineticEnergy += groupOptions.nrdf[g]*0.5*groupOptions.ref_t[g]*BOLTZ;
        }
        else
        {
            nrdfUncoupled += groupOptions.nrdf[g];
        }
    }

    /* This conditional with > also catches nrdf=0 */
    if (nrdfCoupled > nrdfUncoupled)
    {
        return kineticEnergy*(nrdfCoupled + nrdfUncoupled)/nrdfCoupled;
    }
    else
    {
        return 0;
    }
}
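/* The estimate above is the equipartition value: each temperature-coupled
 * degree of freedom contributes (1/2)*k_B*T_ref, so a group g contributes
 * 0.5*ndf_g*k_B*T_g, and the sum is rescaled by
 * (ndf_coupled + ndf_uncoupled)/ndf_coupled to account for uncoupled groups.
 * Worked example (illustrative numbers only): 1e5 coupled degrees of freedom
 * at 300 K give roughly 0.5 * 1e5 * 0.0083145 kJ/(mol K) * 300 K ~ 1.25e5 kJ/mol.
 */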
/*! \brief This routine checks that the potential energy is finite.
 *
 * Always checks that the potential energy is finite. If step equals
 * inputrec.init_step also checks that the magnitude of the potential energy
 * is reasonable. Terminates with a fatal error when a check fails.
 * Note that passing this check does not guarantee finite forces,
 * since those use slightly different arithmetics. But in most cases
 * there is just a narrow coordinate range where forces are not finite
 * and energies are finite.
 *
 * \param[in] step      The step number, used for checking and printing
 * \param[in] enerd     The energy data; the non-bonded group energies need to be added to enerd.term[F_EPOT] before calling this routine
 * \param[in] inputrec  The input record
 */
static void checkPotentialEnergyValidity(gmx_int64_t           step,
                                         const gmx_enerdata_t &enerd,
                                         const t_inputrec     &inputrec)
{
    /* Threshold valid for comparing absolute potential energy against
     * the kinetic energy. Normally one should not consider absolute
     * potential energy values, but with a factor of one million
     * we should never get false positives.
     */
    constexpr real c_thresholdFactor = 1e6;

    bool           energyIsNotFinite    = !std::isfinite(enerd.term[F_EPOT]);
    real           averageKineticEnergy = 0;
    /* We only check for large potential energy at the initial step,
     * because that is by far the most likely step for this to occur
     * and because computing the average kinetic energy is not free.
     * Note: nstcalcenergy >> 1 often does not allow to catch large energies
     * before they become NaN.
     */
    if (step == inputrec.init_step && EI_DYNAMICS(inputrec.eI))
    {
        averageKineticEnergy = averageKineticEnergyEstimate(inputrec.opts);
    }

    if (energyIsNotFinite || (averageKineticEnergy > 0 &&
                              enerd.term[F_EPOT] > c_thresholdFactor*averageKineticEnergy))
    {
        gmx_fatal(FARGS, "Step %" GMX_PRId64 ": The total potential energy is %g, which is %s. The LJ and electrostatic contributions to the energy are %g and %g, respectively. A %s potential energy can be caused by overlapping interactions in bonded interactions or very large%s coordinate values. Usually this is caused by a badly- or non-equilibrated initial configuration, incorrect interactions or parameters in the topology.",
                  step,
                  enerd.term[F_EPOT],
                  energyIsNotFinite ? "not finite" : "extremely high",
                  enerd.term[F_LJ],
                  enerd.term[F_COUL_SR],
                  energyIsNotFinite ? "non-finite" : "very high",
                  energyIsNotFinite ? " or NaN" : "");
    }
}
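/* Worked example of the threshold above (illustrative numbers only): with the
 * ~1.25e5 kJ/mol average kinetic energy estimated for 1e5 coupled degrees of
 * freedom at 300 K, the c_thresholdFactor of 1e6 means this check only fires
 * when the total potential energy exceeds roughly 1.25e11 kJ/mol (or is not
 * finite), so physically reasonable configurations should never trip it.
 */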
/*! \brief Compute forces and/or energies for special algorithms
 *
 * The intention is to collect all calls to algorithms that compute
 * forces on local atoms only and that do not contribute to the local
 * virial sum (but add their virial contribution separately).
 * Eventually these should likely all become ForceProviders.
 * Within this function the intention is to have algorithms that do
 * global communication at the end, so global barriers within the MD loop
 * are as close together as possible.
 *
 * \param[in]     fplog            The log file
 * \param[in]     cr               The communication record
 * \param[in]     inputrec         The input record
 * \param[in]     step             The current MD step
 * \param[in]     t                The current time
 * \param[in,out] wcycle           Wallcycle accounting struct
 * \param[in,out] forceProviders   Pointer to a list of force providers
 * \param[in]     box              The unit cell
 * \param[in]     x                The coordinates
 * \param[in]     mdatoms          Per atom properties
 * \param[in]     lambda           Array of free-energy lambda values
 * \param[in]     forceFlags       Flags that tell whether we should compute forces/energies/virial
 * \param[in,out] forceWithVirial  Force and virial buffers
 * \param[in,out] enerd            Energy buffer
 * \param[in,out] ed               Essential dynamics pointer
 * \param[in]     bNS              Tells if we did neighbor searching this step, used for ED sampling
 *
 * \todo Remove bNS, which is used incorrectly.
 * \todo Convert all other algorithms called here to ForceProviders.
 */
static void
computeSpecialForces(FILE                 *fplog,
                     const t_commrec      *cr,
                     const t_inputrec     *inputrec,
                     gmx_int64_t           step,
                     double                t,
                     gmx_wallcycle_t       wcycle,
                     ForceProviders       *forceProviders,
                     matrix                box,
                     ArrayRef<const RVec>  x,
                     const t_mdatoms      *mdatoms,
                     real                 *lambda,
                     int                   forceFlags,
                     ForceWithVirial      *forceWithVirial,
                     gmx_enerdata_t       *enerd,
                     const gmx_edsam      *ed,
                     gmx_bool              bNS)
{
    const bool computeForces = (forceFlags & GMX_FORCE_FORCES);

    /* NOTE: Currently all ForceProviders only provide forces.
     *       When they also provide energies, remove this conditional.
     */
    if (computeForces)
    {
        /* Collect forces from modules */
        forceProviders->calculateForces(cr, mdatoms, box, t, x, forceWithVirial);
    }

    if (inputrec->bPull && pull_have_potential(inputrec->pull_work))
    {
        pull_potential_wrapper(cr, inputrec, box, x,
                               forceWithVirial,
                               mdatoms, enerd, lambda, t,
                               wcycle);

        if (inputrec->bDoAwh)
        {
            Awh &awh = *inputrec->awh;
            enerd->term[F_COM_PULL] +=
                awh.applyBiasForcesAndUpdateBias(inputrec->ePBC, *mdatoms, box,
                                                 forceWithVirial,
                                                 t, step, wcycle, fplog);
        }
    }

    rvec *f = as_rvec_array(forceWithVirial->force_.data());

    /* Add the forces from enforced rotation potentials (if any) */
    if (inputrec->bRot)
    {
        wallcycle_start(wcycle, ewcROTadd);
        enerd->term[F_COM_PULL] += add_rot_forces(inputrec->rot, f, cr, step, t);
        wallcycle_stop(wcycle, ewcROTadd);
    }

    if (ed)
    {
        /* Note that since init_edsam() is called after the initialization
         * of forcerec, edsam doesn't request the noVirSum force buffer.
         * Thus if no other algorithm (e.g. PME) requires it, the forces
         * here will contribute to the virial.
         */
        do_flood(cr, inputrec, as_rvec_array(x.data()), f, ed, box, step, bNS);
    }

    /* Add forces from interactive molecular dynamics (IMD), if bIMD == TRUE. */
    if (inputrec->bIMD && computeForces)
    {
        IMD_apply_forces(inputrec->bIMD, inputrec->imd, cr, f, wcycle);
    }
}
/*! \brief Launch the prepare_step and spread stages of PME GPU.
 *
 * \param[in]  pmedata  The PME structure
 * \param[in]  box      The box matrix
 * \param[in]  x        Coordinate array
 * \param[in]  flags    Force flags
 * \param[in]  wcycle   The wallcycle structure
 */
static inline void launchPmeGpuSpread(gmx_pme_t      *pmedata,
                                      matrix          box,
                                      rvec            x[],
                                      int             flags,
                                      gmx_wallcycle_t wcycle)
{
    int pmeFlags = GMX_PME_SPREAD | GMX_PME_SOLVE;
    pmeFlags |= (flags & GMX_FORCE_FORCES) ? GMX_PME_CALC_F : 0;
    pmeFlags |= (flags & GMX_FORCE_VIRIAL) ? GMX_PME_CALC_ENER_VIR : 0;

    pme_gpu_prepare_computation(pmedata, flags & GMX_FORCE_DYNAMICBOX, box, wcycle, pmeFlags);
    pme_gpu_launch_spread(pmedata, x, wcycle);
}
/*! \brief Launch the FFT and gather stages of PME GPU
 *
 * This function only implements setting the output forces (no accumulation).
 *
 * \param[in]  pmedata  The PME structure
 * \param[in]  wcycle   The wallcycle structure
 */
static void launchPmeGpuFftAndGather(gmx_pme_t       *pmedata,
                                     gmx_wallcycle_t  wcycle)
{
    pme_gpu_launch_complex_transforms(pmedata, wcycle);
    pme_gpu_launch_gather(pmedata, wcycle, PmeForceOutputHandling::Set);
}
/*! \brief
 *  Polling wait for either of the PME or nonbonded GPU tasks.
 *
 * Instead of a static order in waiting for GPU tasks, this function
 * polls checking which of the two tasks completes first, and does the
 * associated force buffer reduction overlapped with the other task.
 * By doing that, unlike static scheduling order, it can always overlap
 * one of the reductions, regardless of the GPU task completion order.
 *
 * \param[in]     nbv              Nonbonded verlet structure
 * \param[in]     pmedata          PME module data
 * \param[in,out] force            Force array to reduce task outputs into.
 * \param[in,out] forceWithVirial  Force and virial buffers
 * \param[in,out] fshift           Shift force output vector results are reduced into
 * \param[in,out] enerd            Energy data structure results are reduced into
 * \param[in]     flags            Force flags
 * \param[in]     wcycle           The wallcycle structure
 */
static void alternatePmeNbGpuWaitReduce(nonbonded_verlet_t             *nbv,
                                        const gmx_pme_t                *pmedata,
                                        gmx::PaddedArrayRef<gmx::RVec> *force,
                                        ForceWithVirial                *forceWithVirial,
                                        rvec                            fshift[],
                                        gmx_enerdata_t                 *enerd,
                                        int                             flags,
                                        gmx_wallcycle_t                 wcycle)
{
    bool isPmeGpuDone = false;
    bool isNbGpuDone  = false;

    gmx::ArrayRef<const gmx::RVec> pmeGpuForces;

    while (!isPmeGpuDone || !isNbGpuDone)
    {
        if (!isPmeGpuDone)
        {
            matrix            vir_Q;
            real              Vlr_q;

            GpuTaskCompletion completionType = (isNbGpuDone) ? GpuTaskCompletion::Wait : GpuTaskCompletion::Check;
            isPmeGpuDone = pme_gpu_try_finish_task(pmedata, wcycle, &pmeGpuForces,
                                                   vir_Q, &Vlr_q, completionType);

            if (isPmeGpuDone)
            {
                pme_gpu_reduce_outputs(wcycle, forceWithVirial, pmeGpuForces,
                                       enerd, vir_Q, Vlr_q);
            }
        }

        if (!isNbGpuDone)
        {
            GpuTaskCompletion completionType = (isPmeGpuDone) ? GpuTaskCompletion::Wait : GpuTaskCompletion::Check;
            wallcycle_start_nocount(wcycle, ewcWAIT_GPU_NB_L);
            isNbGpuDone = nbnxn_gpu_try_finish_task(nbv->gpu_nbv,
                                                    flags, eatLocal,
                                                    enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
                                                    fshift, completionType);
            wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
            // To get the call count right, when the task finished we
            // issue a start/stop.
            // TODO: move the ewcWAIT_GPU_NB_L cycle counting into nbnxn_gpu_try_finish_task()
            // and ewcNB_XF_BUF_OPS counting into nbnxn_atomdata_add_nbat_f_to_f().
            if (isNbGpuDone)
            {
                wallcycle_start(wcycle, ewcWAIT_GPU_NB_L);
                wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);

                nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs, eatLocal,
                                               nbv->nbat, as_rvec_array(force->data()), wcycle);
            }
        }
    }
}
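/* The structure of the polling loop above, reduced to its essentials: two
 * asynchronous tasks are checked in turn, each completed task is reduced
 * immediately, and the last outstanding task is waited on instead of polled.
 * This is a generic sketch with hypothetical task/reduce types, not the
 * GROMACS API; kept under "#if 0" so it does not affect the build.
 */
#if 0
template<typename Task, typename Reduce>
static void alternatingWaitReduce(Task &taskA, Reduce reduceA,
                                  Task &taskB, Reduce reduceB)
{
    bool doneA = false;
    bool doneB = false;
    while (!doneA || !doneB)
    {
        if (!doneA)
        {
            /* Block only if the other task has already finished */
            doneA = taskA.tryFinish(/* blocking = */ doneB);
            if (doneA)
            {
                reduceA();
            }
        }
        if (!doneB)
        {
            doneB = taskB.tryFinish(/* blocking = */ doneA);
            if (doneB)
            {
                reduceB();
            }
        }
    }
}
#endif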
/*! \brief
 * Launch the dynamic rolling pruning GPU task.
 *
 * We currently alternate local/non-local list pruning in odd-even steps
 * (only pruning every second step without DD).
 *
 * \param[in]  cr        The communication record
 * \param[in]  nbv       Nonbonded verlet structure
 * \param[in]  inputrec  The input record
 * \param[in]  step      The current MD step
 */
static inline void launchGpuRollingPruning(const t_commrec          *cr,
                                           const nonbonded_verlet_t *nbv,
                                           const t_inputrec         *inputrec,
                                           const gmx_int64_t         step)
{
    /* We should not launch the rolling pruning kernel at a search
     * step or just before search steps, since that's useless.
     * Without domain decomposition we prune at even steps.
     * With domain decomposition we alternate local and non-local
     * pruning at even and odd steps.
     */
    int  numRollingParts     = nbv->listParams->numRollingParts;
    GMX_ASSERT(numRollingParts == nbv->listParams->nstlistPrune/2, "Since we alternate local/non-local at even/odd steps, we need numRollingParts<=nstlistPrune/2 for correctness and == for efficiency");
    int  stepWithCurrentList = step - nbv->grp[eintLocal].nbl_lists.outerListCreationStep;
    bool stepIsEven          = ((stepWithCurrentList & 1) == 0);
    if (stepWithCurrentList > 0 &&
        stepWithCurrentList < inputrec->nstlist - 1 &&
        (stepIsEven || DOMAINDECOMP(cr)))
    {
        nbnxn_gpu_launch_kernel_pruneonly(nbv->gpu_nbv,
                                          stepIsEven ? eintLocal : eintNonlocal,
                                          numRollingParts);
    }
}
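/* Example of the scheduling arithmetic above (illustrative numbers only):
 * with nstlist = 20 and nstlistPrune = 10, numRollingParts is 5, and for a
 * pair list created at step 1000 the pruning kernel may be launched at steps
 * 1001-1018 only; even offsets prune the local list and odd offsets the
 * non-local list (the latter only with domain decomposition).
 */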
static void do_force_cutsVERLET(FILE *fplog,
                                const t_commrec *cr,
                                const gmx_multisim_t *ms,
                                const t_inputrec *inputrec,
                                gmx_int64_t step,
                                t_nrnb *nrnb,
                                gmx_wallcycle_t wcycle,
                                const gmx_localtop_t *top,
                                const gmx_groups_t * /* groups */,
                                matrix box, gmx::PaddedArrayRef<gmx::RVec> x,
                                history_t *hist,
                                gmx::PaddedArrayRef<gmx::RVec> force,
                                tensor vir_force,
                                const t_mdatoms *mdatoms,
                                gmx_enerdata_t *enerd, t_fcdata *fcd,
                                real *lambda,
                                t_graph *graph,
                                t_forcerec *fr,
                                interaction_const_t *ic,
                                const gmx_vsite_t *vsite,
                                rvec mu_tot,
                                double t,
                                const gmx_edsam *ed,
                                int flags,
                                DdOpenBalanceRegionBeforeForceComputation ddOpenBalanceRegion,
                                DdCloseBalanceRegionAfterForceComputation ddCloseBalanceRegion)
{
    int                 cg1, i, j;
    double              mu[2*DIM];
    gmx_bool            bStateChanged, bNS, bFillGrid, bCalcCGCM;
    gmx_bool            bDoForces, bUseGPU, bUseOrEmulGPU;
    rvec                vzero, box_diag;
    float               cycles_pme, cycles_wait_gpu;
    nonbonded_verlet_t *nbv = fr->nbv;

    bStateChanged = (flags & GMX_FORCE_STATECHANGED);
    bNS           = (flags & GMX_FORCE_NS) && (fr->bAllvsAll == FALSE);
    bFillGrid     = (bNS && bStateChanged);
    bCalcCGCM     = (bFillGrid && !DOMAINDECOMP(cr));
    bDoForces     = (flags & GMX_FORCE_FORCES);
    bUseGPU       = fr->nbv->bUseGPU;
    bUseOrEmulGPU = bUseGPU || (fr->nbv->emulateGpu == EmulateGpuNonbonded::Yes);

    const auto pmeRunMode = fr->pmedata ? pme_run_mode(fr->pmedata) : PmeRunMode::CPU;
    // TODO slim this conditional down - inputrec and duty checks should mean the same in proper code!
    const bool useGpuPme  = EEL_PME(fr->ic->eeltype) && thisRankHasDuty(cr, DUTY_PME) &&
        ((pmeRunMode == PmeRunMode::GPU) || (pmeRunMode == PmeRunMode::Mixed));
1093 /* At a search step we need to start the first balancing region
1094 * somewhere early inside the step after communication during domain
1095 * decomposition (and not during the previous step as usual).
1098 ddOpenBalanceRegion
== DdOpenBalanceRegionBeforeForceComputation::yes
)
1100 ddOpenBalanceRegionCpu(cr
->dd
, DdAllowBalanceRegionReopen::yes
);
1103 cycles_wait_gpu
= 0;
1105 const int start
= 0;
1106 const int homenr
= mdatoms
->homenr
;
1108 clear_mat(vir_force
);
1110 if (DOMAINDECOMP(cr
))
1112 cg1
= cr
->dd
->ncg_tot
;
1125 update_forcerec(fr
, box
);
1127 if (inputrecNeedMutot(inputrec
))
1129 /* Calculate total (local) dipole moment in a temporary common array.
1130 * This makes it possible to sum them over nodes faster.
1132 calc_mu(start
, homenr
,
1133 x
, mdatoms
->chargeA
, mdatoms
->chargeB
, mdatoms
->nChargePerturbed
,
1138 if (fr
->ePBC
!= epbcNONE
)
1140 /* Compute shift vectors every step,
1141 * because of pressure coupling or box deformation!
1143 if ((flags
& GMX_FORCE_DYNAMICBOX
) && bStateChanged
)
1145 calc_shifts(box
, fr
->shift_vec
);
1150 put_atoms_in_box_omp(fr
->ePBC
, box
, x
.subArray(0, homenr
));
1151 inc_nrnb(nrnb
, eNR_SHIFTX
, homenr
);
1153 else if (EI_ENERGY_MINIMIZATION(inputrec
->eI
) && graph
)
1155 unshift_self(graph
, box
, as_rvec_array(x
.data()));
1159 nbnxn_atomdata_copy_shiftvec(flags
& GMX_FORCE_DYNAMICBOX
,
1160 fr
->shift_vec
, nbv
->nbat
);
1163 if (!thisRankHasDuty(cr
, DUTY_PME
))
1165 /* Send particle coordinates to the pme nodes.
1166 * Since this is only implemented for domain decomposition
1167 * and domain decomposition does not use the graph,
1168 * we do not need to worry about shifting.
1170 gmx_pme_send_coordinates(cr
, box
, as_rvec_array(x
.data()),
1171 lambda
[efptCOUL
], lambda
[efptVDW
],
1172 (flags
& (GMX_FORCE_VIRIAL
| GMX_FORCE_ENERGY
)),
1175 #endif /* GMX_MPI */
1179 launchPmeGpuSpread(fr
->pmedata
, box
, as_rvec_array(x
.data()), flags
, wcycle
);
1182 /* do gridding for pair search */
1185 if (graph
&& bStateChanged
)
1187 /* Calculate intramolecular shift vectors to make molecules whole */
1188 mk_mshift(fplog
, graph
, fr
->ePBC
, box
, as_rvec_array(x
.data()));
1192 box_diag
[XX
] = box
[XX
][XX
];
1193 box_diag
[YY
] = box
[YY
][YY
];
1194 box_diag
[ZZ
] = box
[ZZ
][ZZ
];
1196 wallcycle_start(wcycle
, ewcNS
);
1199 wallcycle_sub_start(wcycle
, ewcsNBS_GRID_LOCAL
);
1200 nbnxn_put_on_grid(nbv
->nbs
, fr
->ePBC
, box
,
1202 0, mdatoms
->homenr
, -1, fr
->cginfo
, as_rvec_array(x
.data()),
1204 nbv
->grp
[eintLocal
].kernel_type
,
1206 wallcycle_sub_stop(wcycle
, ewcsNBS_GRID_LOCAL
);
1210 wallcycle_sub_start(wcycle
, ewcsNBS_GRID_NONLOCAL
);
1211 nbnxn_put_on_grid_nonlocal(nbv
->nbs
, domdec_zones(cr
->dd
),
1212 fr
->cginfo
, as_rvec_array(x
.data()),
1213 nbv
->grp
[eintNonlocal
].kernel_type
,
1215 wallcycle_sub_stop(wcycle
, ewcsNBS_GRID_NONLOCAL
);
1218 nbnxn_atomdata_set(nbv
->nbat
, nbv
->nbs
, mdatoms
, fr
->cginfo
);
1219 wallcycle_stop(wcycle
, ewcNS
);
1222 /* initialize the GPU atom data and copy shift vector */
1225 wallcycle_start_nocount(wcycle
, ewcLAUNCH_GPU
);
1226 wallcycle_sub_start_nocount(wcycle
, ewcsLAUNCH_GPU_NONBONDED
);
1230 nbnxn_gpu_init_atomdata(nbv
->gpu_nbv
, nbv
->nbat
);
1233 nbnxn_gpu_upload_shiftvec(nbv
->gpu_nbv
, nbv
->nbat
);
1235 wallcycle_sub_stop(wcycle
, ewcsLAUNCH_GPU_NONBONDED
);
1236 wallcycle_stop(wcycle
, ewcLAUNCH_GPU
);
1239 /* do local pair search */
1242 wallcycle_start_nocount(wcycle
, ewcNS
);
1243 wallcycle_sub_start(wcycle
, ewcsNBS_SEARCH_LOCAL
);
1244 nbnxn_make_pairlist(nbv
->nbs
, nbv
->nbat
,
1246 nbv
->listParams
->rlistOuter
,
1247 nbv
->min_ci_balanced
,
1248 &nbv
->grp
[eintLocal
].nbl_lists
,
1250 nbv
->grp
[eintLocal
].kernel_type
,
1252 nbv
->grp
[eintLocal
].nbl_lists
.outerListCreationStep
= step
;
1253 if (nbv
->listParams
->useDynamicPruning
&& !bUseGPU
)
1255 nbnxnPrepareListForDynamicPruning(&nbv
->grp
[eintLocal
].nbl_lists
);
1257 wallcycle_sub_stop(wcycle
, ewcsNBS_SEARCH_LOCAL
);
1261 /* initialize local pair-list on the GPU */
1262 nbnxn_gpu_init_pairlist(nbv
->gpu_nbv
,
1263 nbv
->grp
[eintLocal
].nbl_lists
.nbl
[0],
1266 wallcycle_stop(wcycle
, ewcNS
);
1270 nbnxn_atomdata_copy_x_to_nbat_x(nbv
->nbs
, eatLocal
, FALSE
, as_rvec_array(x
.data()),
1276 if (DOMAINDECOMP(cr
))
1278 ddOpenBalanceRegionGpu(cr
->dd
);
1281 wallcycle_start(wcycle
, ewcLAUNCH_GPU
);
1282 wallcycle_sub_start(wcycle
, ewcsLAUNCH_GPU_NONBONDED
);
1283 /* launch local nonbonded work on GPU */
1284 do_nb_verlet(fr
, ic
, enerd
, flags
, eintLocal
, enbvClearFNo
,
1285 step
, nrnb
, wcycle
);
1286 wallcycle_sub_stop(wcycle
, ewcsLAUNCH_GPU_NONBONDED
);
1287 wallcycle_stop(wcycle
, ewcLAUNCH_GPU
);
1292 // In PME GPU and mixed mode we launch FFT / gather after the
1293 // X copy/transform to allow overlap as well as after the GPU NB
1294 // launch to avoid FFT launch overhead hijacking the CPU and delaying
1295 // the nonbonded kernel.
1296 launchPmeGpuFftAndGather(fr
->pmedata
, wcycle
);
1299 /* Communicate coordinates and sum dipole if necessary +
1300 do non-local pair search */
1301 if (DOMAINDECOMP(cr
))
1305 wallcycle_start_nocount(wcycle
, ewcNS
);
1306 wallcycle_sub_start(wcycle
, ewcsNBS_SEARCH_NONLOCAL
);
1308 nbnxn_make_pairlist(nbv
->nbs
, nbv
->nbat
,
1310 nbv
->listParams
->rlistOuter
,
1311 nbv
->min_ci_balanced
,
1312 &nbv
->grp
[eintNonlocal
].nbl_lists
,
1314 nbv
->grp
[eintNonlocal
].kernel_type
,
1316 nbv
->grp
[eintNonlocal
].nbl_lists
.outerListCreationStep
= step
;
1317 if (nbv
->listParams
->useDynamicPruning
&& !bUseGPU
)
1319 nbnxnPrepareListForDynamicPruning(&nbv
->grp
[eintNonlocal
].nbl_lists
);
1321 wallcycle_sub_stop(wcycle
, ewcsNBS_SEARCH_NONLOCAL
);
1323 if (nbv
->grp
[eintNonlocal
].kernel_type
== nbnxnk8x8x8_GPU
)
1325 /* initialize non-local pair-list on the GPU */
1326 nbnxn_gpu_init_pairlist(nbv
->gpu_nbv
,
1327 nbv
->grp
[eintNonlocal
].nbl_lists
.nbl
[0],
1330 wallcycle_stop(wcycle
, ewcNS
);
1334 dd_move_x(cr
->dd
, box
, as_rvec_array(x
.data()), wcycle
);
1336 nbnxn_atomdata_copy_x_to_nbat_x(nbv
->nbs
, eatNonlocal
, FALSE
, as_rvec_array(x
.data()),
1342 wallcycle_start(wcycle
, ewcLAUNCH_GPU
);
1343 wallcycle_sub_start(wcycle
, ewcsLAUNCH_GPU_NONBONDED
);
1344 /* launch non-local nonbonded tasks on GPU */
1345 do_nb_verlet(fr
, ic
, enerd
, flags
, eintNonlocal
, enbvClearFNo
,
1346 step
, nrnb
, wcycle
);
1347 wallcycle_sub_stop(wcycle
, ewcsLAUNCH_GPU_NONBONDED
);
1348 wallcycle_stop(wcycle
, ewcLAUNCH_GPU
);
1354 /* launch D2H copy-back F */
1355 wallcycle_start_nocount(wcycle
, ewcLAUNCH_GPU
);
1356 wallcycle_sub_start_nocount(wcycle
, ewcsLAUNCH_GPU_NONBONDED
);
1357 if (DOMAINDECOMP(cr
))
1359 nbnxn_gpu_launch_cpyback(nbv
->gpu_nbv
, nbv
->nbat
,
1360 flags
, eatNonlocal
);
1362 nbnxn_gpu_launch_cpyback(nbv
->gpu_nbv
, nbv
->nbat
,
1364 wallcycle_sub_stop(wcycle
, ewcsLAUNCH_GPU_NONBONDED
);
1365 wallcycle_stop(wcycle
, ewcLAUNCH_GPU
);
1368 if (bStateChanged
&& inputrecNeedMutot(inputrec
))
1372 gmx_sumd(2*DIM
, mu
, cr
);
1373 ddReopenBalanceRegionCpu(cr
->dd
);
1376 for (i
= 0; i
< 2; i
++)
1378 for (j
= 0; j
< DIM
; j
++)
1380 fr
->mu_tot
[i
][j
] = mu
[i
*DIM
+ j
];
1384 if (fr
->efep
== efepNO
)
1386 copy_rvec(fr
->mu_tot
[0], mu_tot
);
1390 for (j
= 0; j
< DIM
; j
++)
1393 (1.0 - lambda
[efptCOUL
])*fr
->mu_tot
[0][j
] +
1394 lambda
[efptCOUL
]*fr
->mu_tot
[1][j
];
1398 /* Reset energies */
1399 reset_enerdata(enerd
);
1400 clear_rvecs(SHIFTS
, fr
->fshift
);
1402 if (DOMAINDECOMP(cr
) && !thisRankHasDuty(cr
, DUTY_PME
))
1404 wallcycle_start(wcycle
, ewcPPDURINGPME
);
1405 dd_force_flop_start(cr
->dd
, nrnb
);
1410 wallcycle_start(wcycle
, ewcROT
);
1411 do_rotation(cr
, inputrec
, box
, as_rvec_array(x
.data()), t
, step
, bNS
);
1412 wallcycle_stop(wcycle
, ewcROT
);
    /* Temporary solution until all routines take PaddedRVecVector */
    rvec *f = as_rvec_array(force.data());

    /* Start the force cycle counter.
     * Note that a different counter is used for dynamic load balancing.
     */
    wallcycle_start(wcycle, ewcFORCE);

    gmx::ArrayRef<gmx::RVec> forceRef = force;
    if (bDoForces)
    {
        /* If we need to compute the virial, we might need a separate
         * force buffer for algorithms for which the virial is calculated
         * directly, such as PME.
         */
        if ((flags & GMX_FORCE_VIRIAL) && fr->haveDirectVirialContributions)
        {
            forceRef = *fr->forceBufferForDirectVirialContributions;

            /* We only compute forces on local atoms. Note that vsites can
             * spread to non-local atoms, but that part of the buffer is
             * cleared separately in the vsite spreading code.
             */
            clear_rvecs_omp(mdatoms->homenr, as_rvec_array(forceRef.data()));
        }
        /* Clear the short- and long-range forces */
        clear_rvecs_omp(fr->natoms_force_constr, f);
    }

    /* forceWithVirial uses the local atom range only */
    ForceWithVirial forceWithVirial(forceRef, flags & GMX_FORCE_VIRIAL);
1447 if (inputrec
->bPull
&& pull_have_constraint(inputrec
->pull_work
))
1449 clear_pull_forces(inputrec
->pull_work
);
1452 /* We calculate the non-bonded forces, when done on the CPU, here.
1453 * We do this before calling do_force_lowlevel, because in that
1454 * function, the listed forces are calculated before PME, which
1455 * does communication. With this order, non-bonded and listed
1456 * force calculation imbalance can be balanced out by the domain
1457 * decomposition load balancing.
1462 do_nb_verlet(fr
, ic
, enerd
, flags
, eintLocal
, enbvClearFYes
,
1463 step
, nrnb
, wcycle
);
1466 if (fr
->efep
!= efepNO
)
1468 /* Calculate the local and non-local free energy interactions here.
1469 * Happens here on the CPU both with and without GPU.
1471 if (fr
->nbv
->grp
[eintLocal
].nbl_lists
.nbl_fep
[0]->nrj
> 0)
1473 do_nb_verlet_fep(&fr
->nbv
->grp
[eintLocal
].nbl_lists
,
1474 fr
, as_rvec_array(x
.data()), f
, mdatoms
,
1475 inputrec
->fepvals
, lambda
,
1476 enerd
, flags
, nrnb
, wcycle
);
1479 if (DOMAINDECOMP(cr
) &&
1480 fr
->nbv
->grp
[eintNonlocal
].nbl_lists
.nbl_fep
[0]->nrj
> 0)
1482 do_nb_verlet_fep(&fr
->nbv
->grp
[eintNonlocal
].nbl_lists
,
1483 fr
, as_rvec_array(x
.data()), f
, mdatoms
,
1484 inputrec
->fepvals
, lambda
,
1485 enerd
, flags
, nrnb
, wcycle
);
1493 if (DOMAINDECOMP(cr
))
1495 do_nb_verlet(fr
, ic
, enerd
, flags
, eintNonlocal
, enbvClearFNo
,
1496 step
, nrnb
, wcycle
);
1505 aloc
= eintNonlocal
;
1508 /* Add all the non-bonded force to the normal force array.
1509 * This can be split into a local and a non-local part when overlapping
1510 * communication with calculation with domain decomposition.
1512 wallcycle_stop(wcycle
, ewcFORCE
);
1514 nbnxn_atomdata_add_nbat_f_to_f(nbv
->nbs
, eatAll
, nbv
->nbat
, f
, wcycle
);
1516 wallcycle_start_nocount(wcycle
, ewcFORCE
);
1518 /* if there are multiple fshift output buffers reduce them */
1519 if ((flags
& GMX_FORCE_VIRIAL
) &&
1520 nbv
->grp
[aloc
].nbl_lists
.nnbl
> 1)
1522 /* This is not in a subcounter because it takes a
1523 negligible and constant-sized amount of time */
1524 nbnxn_atomdata_add_nbat_fshift_to_fshift(nbv
->nbat
,
1529 /* update QMMMrec, if necessary */
1532 update_QMMMrec(cr
, fr
, as_rvec_array(x
.data()), mdatoms
, box
);
1535 /* Compute the bonded and non-bonded energies and optionally forces */
1536 do_force_lowlevel(fr
, inputrec
, &(top
->idef
),
1537 cr
, ms
, nrnb
, wcycle
, mdatoms
,
1538 as_rvec_array(x
.data()), hist
, f
, &forceWithVirial
, enerd
, fcd
,
1539 box
, inputrec
->fepvals
, lambda
, graph
, &(top
->excls
), fr
->mu_tot
,
1540 flags
, &cycles_pme
);
1542 wallcycle_stop(wcycle
, ewcFORCE
);
1544 computeSpecialForces(fplog
, cr
, inputrec
, step
, t
, wcycle
,
1545 fr
->forceProviders
, box
, x
, mdatoms
, lambda
,
1546 flags
, &forceWithVirial
, enerd
,
1551 /* wait for non-local forces (or calculate in emulation mode) */
1552 if (DOMAINDECOMP(cr
))
1556 wallcycle_start(wcycle
, ewcWAIT_GPU_NB_NL
);
1557 nbnxn_gpu_wait_finish_task(nbv
->gpu_nbv
,
1559 enerd
->grpp
.ener
[egLJSR
], enerd
->grpp
.ener
[egCOULSR
],
1561 cycles_wait_gpu
+= wallcycle_stop(wcycle
, ewcWAIT_GPU_NB_NL
);
1565 wallcycle_start_nocount(wcycle
, ewcFORCE
);
1566 do_nb_verlet(fr
, ic
, enerd
, flags
, eintNonlocal
, enbvClearFYes
,
1567 step
, nrnb
, wcycle
);
1568 wallcycle_stop(wcycle
, ewcFORCE
);
1571 /* skip the reduction if there was no non-local work to do */
1572 if (nbv
->grp
[eintNonlocal
].nbl_lists
.nbl
[0]->nsci
> 0)
1574 nbnxn_atomdata_add_nbat_f_to_f(nbv
->nbs
, eatNonlocal
,
1575 nbv
->nbat
, f
, wcycle
);
1580 if (DOMAINDECOMP(cr
))
1582 /* We are done with the CPU compute.
1583 * We will now communicate the non-local forces.
1584 * If we use a GPU this will overlap with GPU work, so in that case
1585 * we do not close the DD force balancing region here.
1587 if (ddCloseBalanceRegion
== DdCloseBalanceRegionAfterForceComputation::yes
)
1589 ddCloseBalanceRegionCpu(cr
->dd
);
1593 dd_move_f(cr
->dd
, f
, fr
->fshift
, wcycle
);
    // With both nonbonded and PME offloaded to a GPU on the same rank, we use
    // an alternating wait/reduction scheme.
    bool alternateGpuWait = (!c_disableAlternatingWait && useGpuPme && bUseGPU && !DOMAINDECOMP(cr));
    if (alternateGpuWait)
    {
        alternatePmeNbGpuWaitReduce(fr->nbv, fr->pmedata, &force, &forceWithVirial, fr->fshift, enerd, flags, wcycle);
    }

    if (!alternateGpuWait && useGpuPme)
    {
        gmx::ArrayRef<const gmx::RVec> pmeGpuForces;
        matrix vir_Q;
        real   Vlr_q = 0.0;
        pme_gpu_wait_finish_task(fr->pmedata, wcycle, &pmeGpuForces, vir_Q, &Vlr_q);
        pme_gpu_reduce_outputs(wcycle, &forceWithVirial, pmeGpuForces, enerd, vir_Q, Vlr_q);
    }
1614 /* Wait for local GPU NB outputs on the non-alternating wait path */
1615 if (!alternateGpuWait
&& bUseGPU
)
1617 /* Measured overhead on CUDA and OpenCL with(out) GPU sharing
1618 * is between 0.5 and 1.5 Mcycles. So 2 MCycles is an overestimate,
1619 * but even with a step of 0.1 ms the difference is less than 1%
1622 const float gpuWaitApiOverheadMargin
= 2e6f
; /* cycles */
1624 wallcycle_start(wcycle
, ewcWAIT_GPU_NB_L
);
1625 nbnxn_gpu_wait_finish_task(nbv
->gpu_nbv
,
1627 enerd
->grpp
.ener
[egLJSR
], enerd
->grpp
.ener
[egCOULSR
],
1629 float cycles_tmp
= wallcycle_stop(wcycle
, ewcWAIT_GPU_NB_L
);
1631 if (ddCloseBalanceRegion
== DdCloseBalanceRegionAfterForceComputation::yes
)
1633 DdBalanceRegionWaitedForGpu waitedForGpu
= DdBalanceRegionWaitedForGpu::yes
;
1634 if (bDoForces
&& cycles_tmp
<= gpuWaitApiOverheadMargin
)
1636 /* We measured few cycles, it could be that the kernel
1637 * and transfer finished earlier and there was no actual
1638 * wait time, only API call overhead.
1639 * Then the actual time could be anywhere between 0 and
1640 * cycles_wait_est. We will use half of cycles_wait_est.
1642 waitedForGpu
= DdBalanceRegionWaitedForGpu::no
;
1644 ddCloseBalanceRegionGpu(cr
->dd
, cycles_wait_gpu
, waitedForGpu
);
1648 if (fr
->nbv
->emulateGpu
== EmulateGpuNonbonded::Yes
)
1650 // NOTE: emulation kernel is not included in the balancing region,
1651 // but emulation mode does not target performance anyway
1652 wallcycle_start_nocount(wcycle
, ewcFORCE
);
1653 do_nb_verlet(fr
, ic
, enerd
, flags
, eintLocal
,
1654 DOMAINDECOMP(cr
) ? enbvClearFNo
: enbvClearFYes
,
1655 step
, nrnb
, wcycle
);
1656 wallcycle_stop(wcycle
, ewcFORCE
);
1661 pme_gpu_reinit_computation(fr
->pmedata
, wcycle
);
1666 /* now clear the GPU outputs while we finish the step on the CPU */
1667 wallcycle_start_nocount(wcycle
, ewcLAUNCH_GPU
);
1668 wallcycle_sub_start_nocount(wcycle
, ewcsLAUNCH_GPU_NONBONDED
);
1669 nbnxn_gpu_clear_outputs(nbv
->gpu_nbv
, flags
);
1671 /* Is dynamic pair-list pruning activated? */
1672 if (nbv
->listParams
->useDynamicPruning
)
1674 launchGpuRollingPruning(cr
, nbv
, inputrec
, step
);
1676 wallcycle_sub_stop(wcycle
, ewcsLAUNCH_GPU_NONBONDED
);
1677 wallcycle_stop(wcycle
, ewcLAUNCH_GPU
);
1680 /* Do the nonbonded GPU (or emulation) force buffer reduction
1681 * on the non-alternating path. */
1682 if (bUseOrEmulGPU
&& !alternateGpuWait
)
1684 nbnxn_atomdata_add_nbat_f_to_f(nbv
->nbs
, eatLocal
,
1685 nbv
->nbat
, f
, wcycle
);
1688 if (DOMAINDECOMP(cr
))
1690 dd_force_flop_stop(cr
->dd
, nrnb
);
1695 /* If we have NoVirSum forces, but we do not calculate the virial,
1696 * we sum fr->f_novirsum=f later.
1698 if (vsite
&& !(fr
->haveDirectVirialContributions
&& !(flags
& GMX_FORCE_VIRIAL
)))
1700 spread_vsite_f(vsite
, as_rvec_array(x
.data()), f
, fr
->fshift
, FALSE
, nullptr, nrnb
,
1701 &top
->idef
, fr
->ePBC
, fr
->bMolPBC
, graph
, box
, cr
, wcycle
);
1704 if (flags
& GMX_FORCE_VIRIAL
)
1706 /* Calculation of the virial must be done after vsites! */
1707 calc_virial(0, mdatoms
->homenr
, as_rvec_array(x
.data()), f
,
1708 vir_force
, graph
, box
, nrnb
, fr
, inputrec
->ePBC
);
    if (PAR(cr) && !thisRankHasDuty(cr, DUTY_PME))
    {
        /* In case of node-splitting, the PP nodes receive the long-range
         * forces, virial and energy from the PME nodes here.
         */
        pme_receive_force_ener(cr, &forceWithVirial, enerd, wcycle);
    }

    if (bDoForces)
    {
        post_process_forces(cr, step, nrnb, wcycle,
                            top, box, as_rvec_array(x.data()), f, &forceWithVirial,
                            vir_force, mdatoms, graph, fr, vsite,
                            flags);
    }

    if (flags & GMX_FORCE_ENERGY)
    {
        /* Sum the potential energy terms from group contributions */
        sum_epot(&(enerd->grpp), enerd->term);

        if (!EI_TPI(inputrec->eI))
        {
            checkPotentialEnergyValidity(step, *enerd, *inputrec);
        }
    }
}
static void do_force_cutsGROUP(FILE *fplog,
                               const t_commrec *cr,
                               const gmx_multisim_t *ms,
                               const t_inputrec *inputrec,
                               gmx_int64_t step,
                               t_nrnb *nrnb,
                               gmx_wallcycle_t wcycle,
                               gmx_localtop_t *top,
                               const gmx_groups_t *groups,
                               matrix box, gmx::PaddedArrayRef<gmx::RVec> x,
                               history_t *hist,
                               gmx::PaddedArrayRef<gmx::RVec> force,
                               tensor vir_force,
                               const t_mdatoms *mdatoms,
                               gmx_enerdata_t *enerd,
                               t_fcdata *fcd,
                               real *lambda,
                               t_graph *graph,
                               t_forcerec *fr,
                               const gmx_vsite_t *vsite,
                               rvec mu_tot,
                               double t,
                               const gmx_edsam *ed,
                               int flags,
                               DdOpenBalanceRegionBeforeForceComputation ddOpenBalanceRegion,
                               DdCloseBalanceRegionAfterForceComputation ddCloseBalanceRegion)
{
    int       cg0, cg1, i, j;
    double    mu[2*DIM];
    gmx_bool  bStateChanged, bNS, bFillGrid, bCalcCGCM;
    gmx_bool  bDoForces;
    float     cycles_pme;

    const int start  = 0;
    const int homenr = mdatoms->homenr;

    clear_mat(vir_force);

    cg0 = 0;
    if (DOMAINDECOMP(cr))
    {
        cg1 = cr->dd->ncg_tot;
    }
1792 bStateChanged
= (flags
& GMX_FORCE_STATECHANGED
);
1793 bNS
= (flags
& GMX_FORCE_NS
) && (fr
->bAllvsAll
== FALSE
);
1794 /* Should we perform the long-range nonbonded evaluation inside the neighborsearching? */
1795 bFillGrid
= (bNS
&& bStateChanged
);
1796 bCalcCGCM
= (bFillGrid
&& !DOMAINDECOMP(cr
));
1797 bDoForces
= (flags
& GMX_FORCE_FORCES
);
1801 update_forcerec(fr
, box
);
1803 if (inputrecNeedMutot(inputrec
))
1805 /* Calculate total (local) dipole moment in a temporary common array.
1806 * This makes it possible to sum them over nodes faster.
1808 calc_mu(start
, homenr
,
1809 x
, mdatoms
->chargeA
, mdatoms
->chargeB
, mdatoms
->nChargePerturbed
,
1814 if (fr
->ePBC
!= epbcNONE
)
1816 /* Compute shift vectors every step,
1817 * because of pressure coupling or box deformation!
1819 if ((flags
& GMX_FORCE_DYNAMICBOX
) && bStateChanged
)
1821 calc_shifts(box
, fr
->shift_vec
);
1826 put_charge_groups_in_box(fplog
, cg0
, cg1
, fr
->ePBC
, box
,
1827 &(top
->cgs
), as_rvec_array(x
.data()), fr
->cg_cm
);
1828 inc_nrnb(nrnb
, eNR_CGCM
, homenr
);
1829 inc_nrnb(nrnb
, eNR_RESETX
, cg1
-cg0
);
1831 else if (EI_ENERGY_MINIMIZATION(inputrec
->eI
) && graph
)
1833 unshift_self(graph
, box
, as_rvec_array(x
.data()));
1838 calc_cgcm(fplog
, cg0
, cg1
, &(top
->cgs
), as_rvec_array(x
.data()), fr
->cg_cm
);
1839 inc_nrnb(nrnb
, eNR_CGCM
, homenr
);
1842 if (bCalcCGCM
&& gmx_debug_at
)
1844 pr_rvecs(debug
, 0, "cgcm", fr
->cg_cm
, top
->cgs
.nr
);
1848 if (!thisRankHasDuty(cr
, DUTY_PME
))
1850 /* Send particle coordinates to the pme nodes.
1851 * Since this is only implemented for domain decomposition
1852 * and domain decomposition does not use the graph,
1853 * we do not need to worry about shifting.
1855 gmx_pme_send_coordinates(cr
, box
, as_rvec_array(x
.data()),
1856 lambda
[efptCOUL
], lambda
[efptVDW
],
1857 (flags
& (GMX_FORCE_VIRIAL
| GMX_FORCE_ENERGY
)),
1860 #endif /* GMX_MPI */
1862 /* Communicate coordinates and sum dipole if necessary */
1863 if (DOMAINDECOMP(cr
))
1865 dd_move_x(cr
->dd
, box
, as_rvec_array(x
.data()), wcycle
);
1867 /* No GPU support, no move_x overlap, so reopen the balance region here */
1868 if (ddOpenBalanceRegion
== DdOpenBalanceRegionBeforeForceComputation::yes
)
1870 ddReopenBalanceRegionCpu(cr
->dd
);
1874 if (inputrecNeedMutot(inputrec
))
1880 gmx_sumd(2*DIM
, mu
, cr
);
1881 ddReopenBalanceRegionCpu(cr
->dd
);
1883 for (i
= 0; i
< 2; i
++)
1885 for (j
= 0; j
< DIM
; j
++)
1887 fr
->mu_tot
[i
][j
] = mu
[i
*DIM
+ j
];
1891 if (fr
->efep
== efepNO
)
1893 copy_rvec(fr
->mu_tot
[0], mu_tot
);
1897 for (j
= 0; j
< DIM
; j
++)
1900 (1.0 - lambda
[efptCOUL
])*fr
->mu_tot
[0][j
] + lambda
[efptCOUL
]*fr
->mu_tot
[1][j
];
1905 /* Reset energies */
1906 reset_enerdata(enerd
);
1907 clear_rvecs(SHIFTS
, fr
->fshift
);
1911 wallcycle_start(wcycle
, ewcNS
);
1913 if (graph
&& bStateChanged
)
1915 /* Calculate intramolecular shift vectors to make molecules whole */
1916 mk_mshift(fplog
, graph
, fr
->ePBC
, box
, as_rvec_array(x
.data()));
1919 /* Do the actual neighbour searching */
1921 groups
, top
, mdatoms
,
1922 cr
, nrnb
, bFillGrid
);
1924 wallcycle_stop(wcycle
, ewcNS
);
1927 if (DOMAINDECOMP(cr
) && !thisRankHasDuty(cr
, DUTY_PME
))
1929 wallcycle_start(wcycle
, ewcPPDURINGPME
);
1930 dd_force_flop_start(cr
->dd
, nrnb
);
1935 wallcycle_start(wcycle
, ewcROT
);
1936 do_rotation(cr
, inputrec
, box
, as_rvec_array(x
.data()), t
, step
, bNS
);
1937 wallcycle_stop(wcycle
, ewcROT
);
1940 /* Temporary solution until all routines take PaddedRVecVector */
1941 rvec
*f
= as_rvec_array(force
.data());
1943 /* Start the force cycle counter.
1944 * Note that a different counter is used for dynamic load balancing.
1946 wallcycle_start(wcycle
, ewcFORCE
);
1948 gmx::ArrayRef
<gmx::RVec
> forceRef
= force
;
1951 /* If we need to compute the virial, we might need a separate
1952 * force buffer for algorithms for which the virial is calculated
1953 * separately, such as PME.
1955 if ((flags
& GMX_FORCE_VIRIAL
) && fr
->haveDirectVirialContributions
)
1957 forceRef
= *fr
->forceBufferForDirectVirialContributions
;
1959 clear_rvecs_omp(forceRef
.size(), as_rvec_array(forceRef
.data()));
1962 /* Clear the short- and long-range forces */
1963 clear_rvecs(fr
->natoms_force_constr
, f
);
1966 /* forceWithVirial might need the full force atom range */
1967 ForceWithVirial
forceWithVirial(forceRef
, flags
& GMX_FORCE_VIRIAL
);
1969 if (inputrec
->bPull
&& pull_have_constraint(inputrec
->pull_work
))
1971 clear_pull_forces(inputrec
->pull_work
);
1974 /* update QMMMrec, if necessary */
1977 update_QMMMrec(cr
, fr
, as_rvec_array(x
.data()), mdatoms
, box
);
1980 /* Compute the bonded and non-bonded energies and optionally forces */
1981 do_force_lowlevel(fr
, inputrec
, &(top
->idef
),
1982 cr
, ms
, nrnb
, wcycle
, mdatoms
,
1983 as_rvec_array(x
.data()), hist
, f
, &forceWithVirial
, enerd
, fcd
,
1984 box
, inputrec
->fepvals
, lambda
,
1985 graph
, &(top
->excls
), fr
->mu_tot
,
1989 wallcycle_stop(wcycle
, ewcFORCE
);
1991 if (DOMAINDECOMP(cr
))
1993 dd_force_flop_stop(cr
->dd
, nrnb
);
1995 if (ddCloseBalanceRegion
== DdCloseBalanceRegionAfterForceComputation::yes
)
1997 ddCloseBalanceRegionCpu(cr
->dd
);
2001 computeSpecialForces(fplog
, cr
, inputrec
, step
, t
, wcycle
,
2002 fr
->forceProviders
, box
, x
, mdatoms
, lambda
,
2003 flags
, &forceWithVirial
, enerd
,
2008 /* Communicate the forces */
2009 if (DOMAINDECOMP(cr
))
2011 dd_move_f(cr
->dd
, f
, fr
->fshift
, wcycle
);
2012 /* Do we need to communicate the separate force array
2013 * for terms that do not contribute to the single sum virial?
2014 * Position restraints and electric fields do not introduce
2015 * inter-cg forces, only full electrostatics methods do.
2016 * When we do not calculate the virial, fr->f_novirsum = f,
2017 * so we have already communicated these forces.
2019 if (EEL_FULL(fr
->ic
->eeltype
) && cr
->dd
->n_intercg_excl
&&
2020 (flags
& GMX_FORCE_VIRIAL
))
2022 dd_move_f(cr
->dd
, as_rvec_array(forceWithVirial
.force_
.data()), nullptr, wcycle
);
    /* If we have NoVirSum forces, but we do not calculate the virial,
     * we sum fr->f_novirsum=f later.
     */
    if (vsite && !(fr->haveDirectVirialContributions && !(flags & GMX_FORCE_VIRIAL)))
    {
        spread_vsite_f(vsite, as_rvec_array(x.data()), f, fr->fshift, FALSE, nullptr, nrnb,
                       &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr, wcycle);
    }
    if (flags & GMX_FORCE_VIRIAL)
    {
        /* Calculation of the virial must be done after vsites! */
        calc_virial(0, mdatoms->homenr, as_rvec_array(x.data()), f,
                    vir_force, graph, box, nrnb, fr, inputrec->ePBC);
    }
    if (PAR(cr) && !thisRankHasDuty(cr, DUTY_PME))
    {
        /* In case of node-splitting, the PP nodes receive the long-range
         * forces, virial and energy from the PME nodes here.
         */
        pme_receive_force_ener(cr, &forceWithVirial, enerd, wcycle);
    }
    post_process_forces(cr, step, nrnb, wcycle,
                        top, box, as_rvec_array(x.data()), f, &forceWithVirial,
                        vir_force, mdatoms, graph, fr, vsite,
                        flags);
    if (flags & GMX_FORCE_ENERGY)
    {
        /* Sum the potential energy terms from group contributions */
        sum_epot(&(enerd->grpp), enerd->term);

        if (!EI_TPI(inputrec->eI))
        {
            checkPotentialEnergyValidity(step, *enerd, *inputrec);
        }
    }
}
void do_force(FILE                                     *fplog,
              const t_commrec                          *cr,
              const gmx_multisim_t                     *ms,
              const t_inputrec                         *inputrec,
              gmx_wallcycle_t                           wcycle,
              gmx_localtop_t                           *top,
              const gmx_groups_t                       *groups,
              gmx::PaddedArrayRef<gmx::RVec>            x,
              gmx::PaddedArrayRef<gmx::RVec>            force,
              const t_mdatoms                          *mdatoms,
              gmx_enerdata_t                           *enerd,
              gmx::ArrayRef<real>                       lambda,
              const gmx_vsite_t                        *vsite,
              const gmx_edsam                          *ed,
              DdOpenBalanceRegionBeforeForceComputation ddOpenBalanceRegion,
              DdCloseBalanceRegionAfterForceComputation ddCloseBalanceRegion)
{
    /* modify force flag if not doing nonbonded */
    if (!fr->bNonbonded)
    {
        flags &= ~GMX_FORCE_NONBONDED;
    }

    GMX_ASSERT(x.size()     >= gmx::paddedRVecVectorSize(fr->natoms_force), "coordinates should be padded");
    GMX_ASSERT(force.size() >= gmx::paddedRVecVectorSize(fr->natoms_force), "force should be padded");
    switch (inputrec->cutoff_scheme)
    {
        case ecutsVERLET:
            do_force_cutsVERLET(fplog, cr, ms, inputrec,
                                lambda.data(), graph,
                                ddOpenBalanceRegion,
                                ddCloseBalanceRegion);
            break;
        case ecutsGROUP:
            do_force_cutsGROUP(fplog, cr, ms, inputrec,
                               lambda.data(), graph,
                               ddOpenBalanceRegion,
                               ddCloseBalanceRegion);
            break;
        default:
            gmx_incons("Invalid cut-off scheme passed!");
    }
    /* In case we don't have constraints and are using GPUs, the next balancing
     * region starts here.
     * Some "special" work at the end of do_force_cuts?, such as vsite spread,
     * virial calculation and COM pulling, is thus not included in
     * the balance timing, which is ok as most tasks do communication.
     */
    if (ddOpenBalanceRegion == DdOpenBalanceRegionBeforeForceComputation::yes)
    {
        ddOpenBalanceRegionCpu(cr->dd, DdAllowBalanceRegionReopen::no);
    }
}
void do_constrain_first(FILE *fplog, Constraints *constr,
                        t_inputrec *ir, t_mdatoms *md,
                        t_state *state, const t_commrec *cr,
                        const gmx_multisim_t *ms,
                        t_forcerec *fr, gmx_localtop_t *top)
{
    int          i, m, start, end;
    gmx_int64_t  step;
    real         dt = ir->delta_t;
    real         dvdl_dum;
    rvec        *savex;

    /* We need to allocate one element extra, since we might use
     * (unaligned) 4-wide SIMD loads to access rvec entries.
     */
    snew(savex, state->natoms + 1);
    if (debug)
    {
        fprintf(debug, "vcm: start=%d, homenr=%d, end=%d\n",
                start, md->homenr, end);
    }

    /* Do a first constrain to reset particles... */
    step = ir->init_step;
    if (fplog)
    {
        char buf[STEPSTRSIZE];
        fprintf(fplog, "\nConstraining the starting coordinates (step %s)\n",
                gmx_step_str(step, buf));
    }
    /* constrain the current position */
    constrain(nullptr, TRUE, FALSE, constr, &(top->idef),
              ir, cr, ms, step, 0, 1.0, md,
              as_rvec_array(state->x.data()), as_rvec_array(state->x.data()), nullptr,
              fr->bMolPBC, state->box,
              state->lambda[efptBONDED], &dvdl_dum,
              nullptr, nullptr, nrnb, econqCoord);
    /* constrain the initial velocity, and save it */
    /* also may be useful if we need the ekin from the halfstep for velocity verlet */
    constrain(nullptr, TRUE, FALSE, constr, &(top->idef),
              ir, cr, ms, step, 0, 1.0, md,
              as_rvec_array(state->x.data()), as_rvec_array(state->v.data()), as_rvec_array(state->v.data()),
              fr->bMolPBC, state->box,
              state->lambda[efptBONDED], &dvdl_dum,
              nullptr, nullptr, nrnb, econqVeloc);
    /* constrain the initial velocities at t-dt/2 */
    if (EI_STATE_VELOCITY(ir->eI) && ir->eI != eiVV)
    {
        for (i = start; (i < end); i++)
        {
            for (m = 0; (m < DIM); m++)
            {
                /* Reverse the velocity */
                state->v[i][m] = -state->v[i][m];
                /* Store the position at t-dt in buf */
                savex[i][m] = state->x[i][m] + dt*state->v[i][m];
            }
        }
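
        /* At this point v holds -v(0), so savex = x(0) + dt*(-v(0)) = x(0) - dt*v(0),
         * i.e. a first-order estimate of the positions one full step back in time.
         * These estimates are constrained against x(0) below to give consistent
         * positions at t = -dt.
         */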
        /* Shake the positions at t=-dt with the positions at t=0
         * as reference coordinates.
         */
        if (fplog)
        {
            char buf[STEPSTRSIZE];
            fprintf(fplog, "\nConstraining the coordinates at t0-dt (step %s)\n",
                    gmx_step_str(step, buf));
        }
        constrain(nullptr, TRUE, FALSE, constr, &(top->idef),
                  ir, cr, ms, step, -1, 1.0, md,
                  as_rvec_array(state->x.data()), savex, nullptr,
                  fr->bMolPBC, state->box,
                  state->lambda[efptBONDED], &dvdl_dum,
                  as_rvec_array(state->v.data()), nullptr, nrnb, econqCoord);
        for (i = start; i < end; i++)
        {
            for (m = 0; m < DIM; m++)
            {
                /* Re-reverse the velocities */
                state->v[i][m] = -state->v[i][m];
            }
        }
    }

    sfree(savex);
}
static void integrate_table(real vdwtab[], real scale, int offstart, int rstart, int rend,
                            double *enerout, double *virout)
{
    double enersum, virsum;
    double invscale, invscale2, invscale3;
    double r, ea, eb, ec, pa, pb, pc, pd;
    double y0, f, g, h;
    double tabfactor;
    int    ri, offset;

    invscale  = 1.0/scale;
    invscale2 = invscale*invscale;
    invscale3 = invscale*invscale2;
    enersum   = 0.0;
    virsum    = 0.0;

    /* Following summation derived from cubic spline definition,
     * Numerical Recipes in C, second edition, p. 113-116. Exact for
     * the cubic spline. We first calculate the negative of the
     * energy from rvdw to rvdw_switch, assuming that g(r)=1, and then
     * add the more standard, abrupt cutoff correction to that result,
     * yielding the long-range correction for a switched function. We
     * perform both the pressure and energy loops at the same time for
     * simplicity, as the computational cost is low. */
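
    /* Where the (ea, eb, ec) and (pa, pb, pc, pd) weights used below come from:
     * within table bin ri the distance is r' = r + t/scale with t in [0,1] and
     * r = ri/scale, so r'^2 dr' expands to (ea*t^2 + eb*t + ec) dt with
     * ea = invscale^3, eb = 2*invscale^2*r and ec = invscale*r^2. Integrating
     * the per-bin cubic spline y0 + f*t + g*t^2 + h*t^3 against this weight
     * gives exactly the y0*(ea/3 + eb/2 + ec) + f*(ea/4 + ...) terms that are
     * accumulated into enersum in the loop below. The pa..pd weights are the
     * analogous expansion of r'^3, integrated against the spline derivative
     * f + 2*g*t + 3*h*t^2, i.e. the r^3*dV/dr integrand needed for the virial.
     */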

    /* Since the dispersion table has been scaled down a factor
     * 6.0 and the repulsion a factor 12.0 to compensate for the
     * c6/c12 parameters inside nbfp[] being scaled up (to save
     * flops in kernels), we need to correct for this.
     */
    tabfactor = (offstart == 0 ? 6.0 : 12.0);
    for (ri = rstart; ri < rend; ++ri)
    {
        r  = ri*invscale;
        ea = invscale3;
        eb = 2.0*invscale2*r;
        ec = invscale*r*r;

        pa = invscale3;
        pb = 3.0*invscale2*r;
        pc = 3.0*invscale*r*r;
        pd = r*r*r;

        /* this "8" is from the packing in the vdwtab array - perhaps
           should be defined? */
        offset = 8*ri + offstart;
        y0     = vdwtab[offset];
        f      = vdwtab[offset+1];
        g      = vdwtab[offset+2];
        h      = vdwtab[offset+3];

        enersum += y0*(ea/3 + eb/2 + ec) + f*(ea/4 + eb/3 + ec/2) + g*(ea/5 + eb/4 + ec/3) + h*(ea/6 + eb/5 + ec/4);
        virsum  += f*(pa/4 + pb/3 + pc/2 + pd) + 2*g*(pa/5 + pb/4 + pc/3 + pd/2) + 3*h*(pa/6 + pb/5 + pc/4 + pd/3);
    }

    *enerout = 4.0*M_PI*enersum*tabfactor;
    *virout  = 4.0*M_PI*virsum*tabfactor;
}
void calc_enervirdiff(FILE *fplog, int eDispCorr, t_forcerec *fr)
{
    double  eners[2], virs[2], enersum, virsum;
    double  r0, rc3, rc9;
    int     ri0, ri1, i;
    real    scale, *vdwtab;

    fr->enershiftsix    = 0;
    fr->enershifttwelve = 0;
    fr->enerdiffsix     = 0;
    fr->enerdifftwelve  = 0;
    fr->virdiffsix      = 0;
    fr->virdifftwelve   = 0;

    const interaction_const_t *ic = fr->ic;

    if (eDispCorr != edispcNO)
    {
        for (i = 0; i < 2; i++)
        {
            eners[i] = 0;
            virs[i]  = 0;
        }

        if ((ic->vdw_modifier == eintmodPOTSHIFT) ||
            (ic->vdw_modifier == eintmodPOTSWITCH) ||
            (ic->vdw_modifier == eintmodFORCESWITCH) ||
            (ic->vdwtype == evdwSHIFT) ||
            (ic->vdwtype == evdwSWITCH))
        {
            if (((ic->vdw_modifier == eintmodPOTSWITCH) ||
                 (ic->vdw_modifier == eintmodFORCESWITCH) ||
                 (ic->vdwtype == evdwSWITCH)) && ic->rvdw_switch == 0)
            {
                gmx_fatal(FARGS,
                          "With dispersion correction rvdw-switch can not be zero "
                          "for vdw-type = %s", evdw_names[ic->vdwtype]);
            }

            /* TODO This code depends on the logic in tables.c that
               constructs the table layout, which should be made
               explicit in future cleanup. */
            GMX_ASSERT(fr->dispersionCorrectionTable->interaction == GMX_TABLE_INTERACTION_VDWREP_VDWDISP,
                       "Dispersion-correction code needs a table with both repulsion and dispersion terms");
            scale  = fr->dispersionCorrectionTable->scale;
            vdwtab = fr->dispersionCorrectionTable->data;

            /* Round the cut-offs to exact table values for precision */
            ri0  = static_cast<int>(floor(ic->rvdw_switch*scale));
            ri1  = static_cast<int>(ceil(ic->rvdw*scale));

            /* The code below has some support for handling force-switching, i.e.
             * when the force (instead of potential) is switched over a limited
             * region. This leads to a constant shift in the potential inside the
             * switching region, which we can handle by adding a constant energy
             * term in the force-switch case just like when we do potential-shift.
             *
             * For now this is not enabled, but to keep the functionality in the
             * code we check separately for switch and shift. When we do force-switch
             * the shifting point is rvdw_switch, while it is the cutoff when we
             * have a classical potential-shift.
             *
             * For a pure potential-shift the potential has a constant shift
             * all the way out to the cutoff, and that is it. For other forms
             * we need to calculate the constant shift up to the point where we
             * start modifying the potential.
             */
            ri0  = (ic->vdw_modifier == eintmodPOTSHIFT) ? ri1 : ri0;

            r0   = ri0/scale;
            rc3  = r0*r0*r0;
            rc9  = rc3*rc3*rc3;

            if ((ic->vdw_modifier == eintmodFORCESWITCH) ||
                (ic->vdwtype == evdwSHIFT))
            {
                /* Determine the constant energy shift below rvdw_switch.
                 * Table has a scale factor since we have scaled it down to compensate
                 * for scaling-up c6/c12 with the derivative factors to save flops in analytical kernels.
                 */
                fr->enershiftsix    = (real)(-1.0/(rc3*rc3)) - 6.0*vdwtab[8*ri0];
                fr->enershifttwelve = (real)( 1.0/(rc9*rc3)) - 12.0*vdwtab[8*ri0 + 4];
            }
            else if (ic->vdw_modifier == eintmodPOTSHIFT)
            {
                fr->enershiftsix    = (real)(-1.0/(rc3*rc3));
                fr->enershifttwelve = (real)( 1.0/(rc9*rc3));
            }

            /* Add the constant part from 0 to rvdw_switch.
             * This integration from 0 to rvdw_switch overcounts the number
             * of interactions by 1, as it also counts the self interaction.
             * We will correct for this later.
             */
            eners[0] += 4.0*M_PI*fr->enershiftsix*rc3/3.0;
            eners[1] += 4.0*M_PI*fr->enershifttwelve*rc3/3.0;
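
            /* The factor 4*M_PI*rc3/3.0 is just the volume of a sphere of
             * radius r0 (rc3 holds r0 cubed at this point): integrating a
             * constant potential shift against 4*pi*r^2 dr from 0 to r0 gives
             * shift * (4/3)*pi*r0^3.
             */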

            /* Calculate the contribution in the range [r0,r1] where we
             * modify the potential. For a pure potential-shift modifier we will
             * have ri0==ri1, and there will not be any contribution here.
             */
            for (i = 0; i < 2; i++)
            {
                enersum = 0;
                virsum  = 0;
                integrate_table(vdwtab, scale, (i == 0 ? 0 : 4), ri0, ri1, &enersum, &virsum);
                eners[i] -= enersum;
                virs[i]  -= virsum;
            }

            /* Alright: Above we compensated by REMOVING the parts outside r0
             * corresponding to the ideal VdW 1/r6 and 1/r12 potentials.
             *
             * Regardless of whether r0 is the point where we start switching,
             * or the cutoff where we calculated the constant shift, we include
             * all the parts we are missing out to infinity from r0 by
             * calculating the analytical dispersion correction.
             */
            eners[0] += -4.0*M_PI/(3.0*rc3);
            eners[1] +=  4.0*M_PI/(9.0*rc9);
            virs[0]  +=  8.0*M_PI/rc3;
            virs[1]  += -16.0*M_PI/(3.0*rc9);
        }
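        /* The analytical tails used above, per unit C6/C12:
         *   integral from r0 to infinity of 4*pi*r^2 * (-1/r^6)  dr = -4*pi/(3*r0^3)
         *   integral from r0 to infinity of 4*pi*r^2 * ( 1/r^12) dr =  4*pi/(9*r0^9)
         * and the corresponding r*dU/dr integrals give the 8*pi/r0^3 and
         * -16*pi/(3*r0^9) virial terms.
         */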
        else if (ic->vdwtype == evdwCUT ||
                 EVDW_PME(ic->vdwtype) ||
                 ic->vdwtype == evdwUSER)
        {
            if (ic->vdwtype == evdwUSER && fplog)
            {
                fprintf(fplog,
                        "WARNING: using dispersion correction with user tables\n");
            }

            /* Note that with LJ-PME, the dispersion correction is multiplied
             * by the difference between the actual C6 and the value of C6
             * that would produce the combination rule.
             * This means the normal energy and virial difference formulas
             * can be used here.
             */

            rc3  = ic->rvdw*ic->rvdw*ic->rvdw;
            rc9  = rc3*rc3*rc3;

            /* Contribution beyond the cut-off */
            eners[0] += -4.0*M_PI/(3.0*rc3);
            eners[1] +=  4.0*M_PI/(9.0*rc9);
            if (ic->vdw_modifier == eintmodPOTSHIFT)
            {
                /* Contribution within the cut-off */
                eners[0] += -4.0*M_PI/(3.0*rc3);
                eners[1] +=  4.0*M_PI/(3.0*rc9);
            }
            /* Contribution beyond the cut-off */
            virs[0]  +=  8.0*M_PI/rc3;
            virs[1]  += -16.0*M_PI/(3.0*rc9);
        }
        else
        {
            gmx_fatal(FARGS,
                      "Dispersion correction is not implemented for vdw-type = %s",
                      evdw_names[ic->vdwtype]);
        }

        /* When we deprecate the group kernels the code below can go too */
        if (ic->vdwtype == evdwPME && fr->cutoff_scheme == ecutsGROUP)
        {
            /* Calculate self-interaction coefficient (assuming that
             * the reciprocal-space contribution is constant in the
             * region that contributes to the self-interaction).
             */
            fr->enershiftsix = gmx::power6(ic->ewaldcoeff_lj) / 6.0;

            eners[0] += -gmx::power3(std::sqrt(M_PI)*ic->ewaldcoeff_lj)/3.0;
            virs[0]  +=  gmx::power3(std::sqrt(M_PI)*ic->ewaldcoeff_lj);
        }

        fr->enerdiffsix    = eners[0];
        fr->enerdifftwelve = eners[1];
        /* The 0.5 is due to the Gromacs definition of the virial */
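        /* (GROMACS accumulates the virial with a factor -1/2 in front of the
         * sum over r (x) F, so corrections derived from plain r*dU/dr
         * integrals pick up this extra factor of one half here.)
         */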
        fr->virdiffsix     = 0.5*virs[0];
        fr->virdifftwelve  = 0.5*virs[1];
    }
}
void calc_dispcorr(t_inputrec *ir, t_forcerec *fr,
                   matrix box, real lambda, tensor pres, tensor virial,
                   real *prescorr, real *enercorr, real *dvdlcorr)
{
    gmx_bool bCorrAll, bCorrPres;
    real     dvdlambda, invvol, dens, ninter, avcsix, avctwelve, enerdiff, svir = 0, spres = 0;
    int      m;

    *prescorr = 0;
    *enercorr = 0;
    *dvdlcorr = 0;
    dvdlambda = 0.0;

    if (ir->eDispCorr != edispcNO)
    {
        bCorrAll  = (ir->eDispCorr == edispcAllEner ||
                     ir->eDispCorr == edispcAllEnerPres);
        bCorrPres = (ir->eDispCorr == edispcEnerPres ||
                     ir->eDispCorr == edispcAllEnerPres);

        invvol = 1/det(box);
        if (fr->n_tpi)
        {
            /* Only correct for the interactions with the inserted molecule */
            dens   = (fr->numAtomsForDispersionCorrection - fr->n_tpi)*invvol;
            ninter = fr->n_tpi;
        }
        else
        {
            dens   = fr->numAtomsForDispersionCorrection*invvol;
            ninter = 0.5*fr->numAtomsForDispersionCorrection;
        }

        if (ir->efep == efepNO)
        {
            avcsix    = fr->avcsix[0];
            avctwelve = fr->avctwelve[0];
        }
        else
        {
            avcsix    = (1 - lambda)*fr->avcsix[0]    + lambda*fr->avcsix[1];
            avctwelve = (1 - lambda)*fr->avctwelve[0] + lambda*fr->avctwelve[1];
        }
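
        /* With free-energy perturbation the average C6/C12 are interpolated
         * linearly between the A and B states, so d<C>/dlambda is simply the
         * difference of the two endpoint averages; that difference is what
         * enters the dvdlambda accumulation below.
         */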
        enerdiff   = ninter*(dens*fr->enerdiffsix - fr->enershiftsix);
        *enercorr += avcsix*enerdiff;
        if (ir->efep != efepNO)
        {
            dvdlambda += (fr->avcsix[1] - fr->avcsix[0])*enerdiff;
        }
        if (bCorrAll)
        {
            enerdiff   = ninter*(dens*fr->enerdifftwelve - fr->enershifttwelve);
            *enercorr += avctwelve*enerdiff;
            if (fr->efep != efepNO)
            {
                dvdlambda += (fr->avctwelve[1] - fr->avctwelve[0])*enerdiff;
            }
        }

        if (bCorrPres)
        {
            svir = ninter*dens*avcsix*fr->virdiffsix/3.0;
            if (ir->eDispCorr == edispcAllEnerPres)
            {
                svir += ninter*dens*avctwelve*fr->virdifftwelve/3.0;
            }
            /* The factor 2 is because of the Gromacs virial definition */
            spres = -2.0*invvol*svir*PRESFAC;
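
            /* PRESFAC converts the internal energy-density units
             * (kJ mol^-1 nm^-3) to bar; dividing the virial correction by the
             * volume turns it into a pressure correction.
             */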
            for (m = 0; m < DIM; m++)
            {
                virial[m][m] += svir;
                pres[m][m]   += spres;
            }
            *prescorr += spres;
        }

        /* Can't currently control when it prints, for now, just print when debugging */
        if (debug)
        {
            if (bCorrAll)
            {
                fprintf(debug, "Long Range LJ corr.: <C6> %10.4e, <C12> %10.4e\n",
                        avcsix, avctwelve);
            }
            if (bCorrPres)
            {
                fprintf(debug,
                        "Long Range LJ corr.: Epot %10g, Pres: %10g, Vir: %10g\n",
                        *enercorr, spres, svir);
            }
            else
            {
                fprintf(debug, "Long Range LJ corr.: Epot %10g\n", *enercorr);
            }
        }

        if (fr->efep != efepNO)
        {
            *dvdlcorr += dvdlambda;
        }
    }
}
static void low_do_pbc_mtop(FILE *fplog, int ePBC, const matrix box,
                            const gmx_mtop_t *mtop, rvec x[],
                            gmx_bool bFirst)
{
    t_graph *graph;
    int      as, mol;

    if (bFirst && fplog)
    {
        fprintf(fplog, "Removing pbc first time\n");
    }

    snew(graph, 1);
    as = 0;
    for (const gmx_molblock_t &molb : mtop->molblock)
    {
        const gmx_moltype_t &moltype = mtop->moltype[molb.type];
        if (moltype.atoms.nr == 1 ||
            (!bFirst && moltype.cgs.nr == 1))
        {
            /* Just one atom or charge group in the molecule, no PBC required */
            as += molb.nmol*moltype.atoms.nr;
        }
        else
        {
            /* Pass NULL instead of fplog to avoid graph prints for each molecule type */
            mk_graph_ilist(nullptr, moltype.ilist,
                           0, moltype.atoms.nr, FALSE, FALSE, graph);

            for (mol = 0; mol < molb.nmol; mol++)
            {
                mk_mshift(fplog, graph, ePBC, box, x+as);

                shift_self(graph, box, x+as);
                /* The molecule is whole now.
                 * We don't need the second mk_mshift call as in do_pbc_first,
                 * since we no longer need this graph.
                 */

                as += moltype.atoms.nr;
            }
        }
    }
    sfree(graph);
}

void do_pbc_first_mtop(FILE *fplog, int ePBC, const matrix box,
                       const gmx_mtop_t *mtop, rvec x[])
{
    low_do_pbc_mtop(fplog, ePBC, box, mtop, x, TRUE);
}

void do_pbc_mtop(FILE *fplog, int ePBC, const matrix box,
                 const gmx_mtop_t *mtop, rvec x[])
{
    low_do_pbc_mtop(fplog, ePBC, box, mtop, x, FALSE);
}

void put_atoms_in_box_omp(int ePBC, const matrix box, gmx::ArrayRef<gmx::RVec> x)
{
    int t, nth;

    nth = gmx_omp_nthreads_get(emntDefault);

#pragma omp parallel for num_threads(nth) schedule(static)
    for (t = 0; t < nth; t++)
    {
        try
        {
            size_t natoms = x.size();
            size_t offset = (natoms*t)/nth;
            size_t len    = (natoms*(t + 1))/nth - offset;
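
            /* Thread t handles the half-open index range
             * [natoms*t/nth, natoms*(t + 1)/nth): with integer division the
             * ranges tile [0, natoms) exactly, so every atom is wrapped once
             * and no two threads touch the same atom.
             */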
            put_atoms_in_box(ePBC, box, x.subArray(offset, len));
        }
        GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
    }
}

// TODO This can be cleaned up a lot, and move back to runner.cpp
void finish_run(FILE *fplog, const gmx::MDLogger &mdlog, const t_commrec *cr,
                const t_inputrec *inputrec,
                t_nrnb nrnb[], gmx_wallcycle_t wcycle,
                gmx_walltime_accounting_t walltime_accounting,
                nonbonded_verlet_t *nbv,
                const gmx_pme_t *pme,
                gmx_bool bWriteStat)
{
    t_nrnb *nrnb_tot = nullptr;
    double  delta_t  = 0;
    double  nbfs     = 0, mflop = 0;
    double  elapsed_time,
            elapsed_time_over_all_ranks,
            elapsed_time_over_all_threads,
            elapsed_time_over_all_threads_over_all_ranks;

    /* Control whether it is valid to print a report. Only the
       simulation master may print, but it should not do so if the run
       terminated e.g. before a scheduled reset step. This is
       complicated by the fact that PME ranks are unaware of the
       reason why they were sent a pmerecvqxFINISH. To avoid
       communication deadlocks, we always do the communication for the
       report, even if we've decided not to write the report, because
       how long it takes to finish the run is not important when we've
       decided not to report on the simulation performance.

       Further, we only report performance for dynamical integrators,
       because those are the only ones for which we plan to
       consider doing any optimizations. */
    bool printReport = EI_DYNAMICS(inputrec->eI) && SIMMASTER(cr);

    if (printReport && !walltime_accounting_get_valid_finish(walltime_accounting))
    {
        GMX_LOG(mdlog.warning).asParagraph().appendText("Simulation ended prematurely, no performance report will be written.");
        printReport = false;
    }

    if (cr->nnodes > 1)
    {
        snew(nrnb_tot, 1);
#if GMX_MPI
        MPI_Allreduce(nrnb->n, nrnb_tot->n, eNRNB, MPI_DOUBLE, MPI_SUM,
                      cr->mpi_comm_mysim);
#endif
    }
    else
    {
        nrnb_tot = nrnb;
    }

    elapsed_time                                 = walltime_accounting_get_elapsed_time(walltime_accounting);
    elapsed_time_over_all_ranks                  = elapsed_time;
    elapsed_time_over_all_threads                = walltime_accounting_get_elapsed_time_over_all_threads(walltime_accounting);
    elapsed_time_over_all_threads_over_all_ranks = elapsed_time_over_all_threads;
#if GMX_MPI
    if (cr->nnodes > 1)
    {
        /* reduce elapsed_time over all MPI ranks in the current simulation */
        MPI_Allreduce(&elapsed_time,
                      &elapsed_time_over_all_ranks,
                      1, MPI_DOUBLE, MPI_SUM,
                      cr->mpi_comm_mysim);
        elapsed_time_over_all_ranks /= cr->nnodes;
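        /* Summing the per-rank elapsed times and dividing by the rank count
         * gives the average wall-clock time per rank, which is what the
         * performance report below is based on.
         */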
        /* Reduce elapsed_time_over_all_threads over all MPI ranks in the
         * current simulation. */
        MPI_Allreduce(&elapsed_time_over_all_threads,
                      &elapsed_time_over_all_threads_over_all_ranks,
                      1, MPI_DOUBLE, MPI_SUM,
                      cr->mpi_comm_mysim);
    }
#endif

    if (printReport)
    {
        print_flop(fplog, nrnb_tot, &nbfs, &mflop);
    }

    if (thisRankHasDuty(cr, DUTY_PP) && DOMAINDECOMP(cr))
    {
        print_dd_statistics(cr, inputrec, fplog);
    }

    /* TODO Move the responsibility for any scaling by thread counts
     * to the code that handled the thread region, so that there's a
     * mechanism to keep cycle counting working during the transition
     * to task parallelism. */
    int  nthreads_pp  = gmx_omp_nthreads_get(emntNonbonded);
    int  nthreads_pme = gmx_omp_nthreads_get(emntPME);
    wallcycle_scale_by_num_threads(wcycle, thisRankHasDuty(cr, DUTY_PME) && !thisRankHasDuty(cr, DUTY_PP), nthreads_pp, nthreads_pme);
    auto cycle_sum(wallcycle_sum(cr, wcycle));

    if (printReport)
    {
        auto                    nbnxn_gpu_timings = use_GPU(nbv) ? nbnxn_gpu_get_timings(nbv->gpu_nbv) : nullptr;
        gmx_wallclock_gpu_pme_t pme_gpu_timings   = {};
        if (pme_gpu_task_enabled(pme))
        {
            pme_gpu_get_timings(pme, &pme_gpu_timings);
        }
        wallcycle_print(fplog, mdlog, cr->nnodes, cr->npmenodes, nthreads_pp, nthreads_pme,
                        elapsed_time_over_all_ranks,
                        wcycle, cycle_sum,
                        nbnxn_gpu_timings,
                        &pme_gpu_timings);

        if (EI_DYNAMICS(inputrec->eI))
        {
            delta_t = inputrec->delta_t;
        }

        if (fplog)
        {
            print_perf(fplog, elapsed_time_over_all_threads_over_all_ranks,
                       elapsed_time_over_all_ranks,
                       walltime_accounting_get_nsteps_done(walltime_accounting),
                       delta_t, nbfs, mflop);
        }
        if (bWriteStat)
        {
            print_perf(stderr, elapsed_time_over_all_threads_over_all_ranks,
                       elapsed_time_over_all_ranks,
                       walltime_accounting_get_nsteps_done(walltime_accounting),
                       delta_t, nbfs, mflop);
        }
    }
}

extern void initialize_lambdas(FILE *fplog, t_inputrec *ir, int *fep_state, gmx::ArrayRef<real> lambda, double *lam0)
{
    /* This function works, but could probably use a logic rewrite to keep all the different
       types of efep straight. */

    if ((ir->efep == efepNO) && (ir->bSimTemp == FALSE))
    {
        return;
    }

    t_lambda *fep = ir->fepvals;
    *fep_state    = fep->init_fep_state; /* this might overwrite the checkpoint
                                            if checkpoint is set -- a kludge is in for now
                                            to prevent this. */

    for (int i = 0; i < efptNR; i++)
    {
        /* overwrite lambda state with init_lambda for now for backwards compatibility */
        if (fep->init_lambda >= 0) /* if it's -1, it was never initialized */
        {
            lambda[i] = fep->init_lambda;
            if (lam0)
            {
                lam0[i] = lambda[i];
            }
        }
        else
        {
            lambda[i] = fep->all_lambda[i][*fep_state];
            if (lam0)
            {
                lam0[i] = lambda[i];
            }
        }
    }

    if (ir->bSimTemp)
    {
        /* need to rescale control temperatures to match current state */
        for (int i = 0; i < ir->opts.ngtc; i++)
        {
            if (ir->opts.ref_t[i] > 0)
            {
                ir->opts.ref_t[i] = ir->simtempvals->temperatures[*fep_state];
            }
        }
    }
2873 if (fplog
!= nullptr)
2875 fprintf(fplog
, "Initial vector of lambda components:[ ");
2876 for (const auto &l
: lambda
)
2878 fprintf(fplog
, "%10.4f ", l
);
2880 fprintf(fplog
, "]\n");

void init_md(FILE *fplog,
             const t_commrec *cr, gmx::IMDOutputProvider *outputProvider,
             t_inputrec *ir, const gmx_output_env_t *oenv,
             const MdrunOptions &mdrunOptions,
             double *t, double *t0,
             t_state *globalState, double *lam0,
             t_nrnb *nrnb, gmx_mtop_t *mtop,
             int nfile, const t_filenm fnm[],
             gmx_mdoutf_t *outf, t_mdebin **mdebin,
             tensor force_vir, tensor shake_vir, rvec mu_tot,
             gmx_bool *bSimAnn, t_vcm **vcm,
             gmx_wallcycle_t wcycle)
{
    int i;

    /* Initial values */
    *t = *t0 = ir->init_t;

    *bSimAnn = FALSE;
    for (i = 0; i < ir->opts.ngtc; i++)
    {
        /* set bSimAnn if any group is being annealed */
        if (ir->opts.annealing[i] != eannNO)
        {
            *bSimAnn = TRUE;
        }
    }

    /* Initialize lambda variables */
    /* TODO: Clean up initialization of fep_state and lambda in t_state.
     * We currently need to call initialize_lambdas on non-master ranks
     * to initialize lam0.
     */
    if (globalState != nullptr)
    {
        initialize_lambdas(fplog, ir, &globalState->fep_state, globalState->lambda, lam0);
    }
    else
    {
        int                      tmpFepState;
        std::array<real, efptNR> tmpLambda;
        initialize_lambdas(fplog, ir, &tmpFepState, tmpLambda, lam0);
    }

    // TODO upd is never NULL in practice, but the analysers don't know that
    if (upd)
    {
        *upd = init_update(ir);
    }
    if (*bSimAnn)
    {
        update_annealing_target_temp(ir, ir->init_t, upd ? *upd : nullptr);
    }

    if (vcm != nullptr)
    {
        *vcm = init_vcm(fplog, &mtop->groups, ir);
    }

    if (EI_DYNAMICS(ir->eI) && !mdrunOptions.continuationOptions.appendFiles)
    {
        if (ir->etc == etcBERENDSEN)
        {
            please_cite(fplog, "Berendsen84a");
        }
        if (ir->etc == etcVRESCALE)
        {
            please_cite(fplog, "Bussi2007a");
        }
        if (ir->eI == eiSD1)
        {
            please_cite(fplog, "Goga2012");
        }
    }

    *outf   = init_mdoutf(fplog, nfile, fnm, mdrunOptions, cr, outputProvider, ir, mtop, oenv, wcycle);

    *mdebin = init_mdebin(mdrunOptions.continuationOptions.appendFiles ? nullptr : mdoutf_get_fp_ene(*outf),
                          mtop, ir, mdoutf_get_fp_dhdl(*outf));

    /* Initiate variables */
    clear_mat(force_vir);
    clear_mat(shake_vir);