Introduce PpForceWorkload
[gromacs.git] / src / gromacs / mdlib / sim_util.cpp
blob 9b77e8a5aac5f6d8b8f7aec458ecedd559f2659d
1 /*
2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5 * Copyright (c) 2001-2004, The GROMACS development team.
6 * Copyright (c) 2013,2014,2015,2016,2017,2018, by the GROMACS development team, led by
7 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
8 * and including many others, as listed in the AUTHORS file in the
9 * top-level source directory and at http://www.gromacs.org.
11 * GROMACS is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public License
13 * as published by the Free Software Foundation; either version 2.1
14 * of the License, or (at your option) any later version.
16 * GROMACS is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * Lesser General Public License for more details.
21 * You should have received a copy of the GNU Lesser General Public
22 * License along with GROMACS; if not, see
23 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
24 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
26 * If you want to redistribute modifications to GROMACS, please
27 * consider that scientific software is very special. Version
28 * control is crucial - bugs must be traceable. We will be happy to
29 * consider code for inclusion in the official distribution, but
30 * derived work must not be called official GROMACS. Details are found
31 * in the README & COPYING files - if they are missing, get the
32 * official version at http://www.gromacs.org.
34 * To help us fund GROMACS development, we humbly ask that you cite
35 * the research papers on the package. Check out http://www.gromacs.org.
37 #include "gmxpre.h"
39 #include "sim_util.h"
41 #include "config.h"
43 #include <cmath>
44 #include <cstdint>
45 #include <cstdio>
46 #include <cstring>
48 #include <array>
50 #include "gromacs/awh/awh.h"
51 #include "gromacs/domdec/dlbtiming.h"
52 #include "gromacs/domdec/domdec_struct.h"
53 #include "gromacs/domdec/partition.h"
54 #include "gromacs/essentialdynamics/edsam.h"
55 #include "gromacs/ewald/pme.h"
56 #include "gromacs/gmxlib/chargegroup.h"
57 #include "gromacs/gmxlib/network.h"
58 #include "gromacs/gmxlib/nrnb.h"
59 #include "gromacs/gmxlib/nonbonded/nb_free_energy.h"
60 #include "gromacs/gmxlib/nonbonded/nb_kernel.h"
61 #include "gromacs/gmxlib/nonbonded/nonbonded.h"
62 #include "gromacs/gpu_utils/gpu_utils.h"
63 #include "gromacs/imd/imd.h"
64 #include "gromacs/listed-forces/bonded.h"
65 #include "gromacs/listed-forces/disre.h"
66 #include "gromacs/listed-forces/listed-forces.h"
67 #include "gromacs/listed-forces/manage-threading.h"
68 #include "gromacs/listed-forces/orires.h"
69 #include "gromacs/math/arrayrefwithpadding.h"
70 #include "gromacs/math/functions.h"
71 #include "gromacs/math/units.h"
72 #include "gromacs/math/vec.h"
73 #include "gromacs/math/vecdump.h"
74 #include "gromacs/mdlib/calcmu.h"
75 #include "gromacs/mdlib/calcvir.h"
76 #include "gromacs/mdlib/constr.h"
77 #include "gromacs/mdlib/force.h"
78 #include "gromacs/mdlib/forcerec.h"
79 #include "gromacs/mdlib/gmx_omp_nthreads.h"
80 #include "gromacs/mdlib/mdrun.h"
81 #include "gromacs/mdlib/nb_verlet.h"
82 #include "gromacs/mdlib/nbnxn_atomdata.h"
83 #include "gromacs/mdlib/nbnxn_gpu_data_mgmt.h"
84 #include "gromacs/mdlib/nbnxn_grid.h"
85 #include "gromacs/mdlib/nbnxn_search.h"
86 #include "gromacs/mdlib/ppforceworkload.h"
87 #include "gromacs/mdlib/qmmm.h"
88 #include "gromacs/mdlib/update.h"
89 #include "gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_gpu_ref.h"
90 #include "gromacs/mdtypes/commrec.h"
91 #include "gromacs/mdtypes/forceoutput.h"
92 #include "gromacs/mdtypes/iforceprovider.h"
93 #include "gromacs/mdtypes/inputrec.h"
94 #include "gromacs/mdtypes/md_enums.h"
95 #include "gromacs/mdtypes/state.h"
96 #include "gromacs/pbcutil/ishift.h"
97 #include "gromacs/pbcutil/mshift.h"
98 #include "gromacs/pbcutil/pbc.h"
99 #include "gromacs/pulling/pull.h"
100 #include "gromacs/pulling/pull_rotation.h"
101 #include "gromacs/timing/cyclecounter.h"
102 #include "gromacs/timing/gpu_timing.h"
103 #include "gromacs/timing/wallcycle.h"
104 #include "gromacs/timing/wallcyclereporting.h"
105 #include "gromacs/timing/walltime_accounting.h"
106 #include "gromacs/topology/topology.h"
107 #include "gromacs/utility/arrayref.h"
108 #include "gromacs/utility/basedefinitions.h"
109 #include "gromacs/utility/cstringutil.h"
110 #include "gromacs/utility/exceptions.h"
111 #include "gromacs/utility/fatalerror.h"
112 #include "gromacs/utility/gmxassert.h"
113 #include "gromacs/utility/gmxmpi.h"
114 #include "gromacs/utility/logger.h"
115 #include "gromacs/utility/pleasecite.h"
116 #include "gromacs/utility/smalloc.h"
117 #include "gromacs/utility/strconvert.h"
118 #include "gromacs/utility/sysinfo.h"
120 #include "nbnxn_gpu.h"
121 #include "nbnxn_kernels/nbnxn_kernel_cpu.h"
122 #include "nbnxn_kernels/nbnxn_kernel_prune.h"
124 // TODO: this environment variable allows us to verify before release
125 // that on less common architectures the total cost of polling is not larger than
126 // a blocking wait (so polling does not introduce overhead when the static
127 // PME-first ordering would suffice).
128 static const bool c_disableAlternatingWait = (getenv("GMX_DISABLE_ALTERNATING_GPU_WAIT") != nullptr);
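// A minimal usage sketch (hypothetical file names): setting this variable in the launch
// environment disables the polling-based alternating wait below and falls back to the
// static PME-first wait order, e.g.
//   GMX_DISABLE_ALTERNATING_GPU_WAIT=1 gmx mdrun -deffnm topol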
131 void print_time(FILE *out,
132 gmx_walltime_accounting_t walltime_accounting,
133 int64_t step,
134 const t_inputrec *ir,
135 const t_commrec *cr)
137 time_t finish;
138 double dt, elapsed_seconds, time_per_step;
140 #if !GMX_THREAD_MPI
141 if (!PAR(cr))
142 #endif
144 fprintf(out, "\r");
146 fputs("step ", out);
147 fputs(gmx::int64ToString(step).c_str(), out);
148 fflush(out);
150 if ((step >= ir->nstlist))
152 double seconds_since_epoch = gmx_gettime();
153 elapsed_seconds = seconds_since_epoch - walltime_accounting_get_start_time_stamp(walltime_accounting);
154 time_per_step = elapsed_seconds/(step - ir->init_step + 1);
155 dt = (ir->nsteps + ir->init_step - step) * time_per_step;
157 if (ir->nsteps >= 0)
159 if (dt >= 300)
161 finish = static_cast<time_t>(seconds_since_epoch + dt);
162 auto timebuf = gmx_ctime_r(&finish);
163 fputs(", will finish ", out);
164 fputs(timebuf.c_str(), out);
166 else
168 fprintf(out, ", remaining wall clock time: %5d s ", static_cast<int>(dt));
171 else
173 fprintf(out, " performance: %.1f ns/day ",
174 ir->delta_t/1000*24*60*60/time_per_step);
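/* ir->delta_t is in ps, so delta_t/1000 is ns simulated per step; multiplying by the
 * seconds in a day (24*60*60) and dividing by the measured wall-clock seconds per step
 * gives the performance in ns/day. */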
177 #if !GMX_THREAD_MPI
178 if (PAR(cr))
180 fprintf(out, "\n");
182 #else
183 GMX_UNUSED_VALUE(cr);
184 #endif
186 fflush(out);
189 void print_date_and_time(FILE *fplog, int nodeid, const char *title,
190 double the_time)
192 if (!fplog)
194 return;
197 time_t temp_time = static_cast<time_t>(the_time);
199 auto timebuf = gmx_ctime_r(&temp_time);
201 fprintf(fplog, "%s on rank %d %s\n", title, nodeid, timebuf.c_str());
204 void print_start(FILE *fplog, const t_commrec *cr,
205 gmx_walltime_accounting_t walltime_accounting,
206 const char *name)
208 char buf[STRLEN];
210 sprintf(buf, "Started %s", name);
211 print_date_and_time(fplog, cr->nodeid, buf,
212 walltime_accounting_get_start_time_stamp(walltime_accounting));
215 static void sum_forces(rvec f[], gmx::ArrayRef<const gmx::RVec> forceToAdd)
217 const int end = forceToAdd.size();
219 int gmx_unused nt = gmx_omp_nthreads_get(emntDefault);
220 #pragma omp parallel for num_threads(nt) schedule(static)
221 for (int i = 0; i < end; i++)
223 rvec_inc(f[i], forceToAdd[i]);
227 static void pme_gpu_reduce_outputs(gmx_wallcycle_t wcycle,
228 gmx::ForceWithVirial *forceWithVirial,
229 gmx::ArrayRef<const gmx::RVec> pmeForces,
230 gmx_enerdata_t *enerd,
231 const tensor vir_Q,
232 real Vlr_q)
234 wallcycle_start(wcycle, ewcPME_GPU_F_REDUCTION);
235 GMX_ASSERT(forceWithVirial, "Invalid force pointer");
236 forceWithVirial->addVirialContribution(vir_Q);
237 enerd->term[F_COUL_RECIP] += Vlr_q;
238 sum_forces(as_rvec_array(forceWithVirial->force_.data()), pmeForces);
239 wallcycle_stop(wcycle, ewcPME_GPU_F_REDUCTION);
242 static void calc_virial(int start, int homenr, const rvec x[], const rvec f[],
243 tensor vir_part, const t_graph *graph, const matrix box,
244 t_nrnb *nrnb, const t_forcerec *fr, int ePBC)
246 /* The short-range virial from surrounding boxes */
247 calc_vir(SHIFTS, fr->shift_vec, fr->fshift, vir_part, ePBC == epbcSCREW, box);
248 inc_nrnb(nrnb, eNR_VIRIAL, SHIFTS);
250 /* Calculate partial virial, for local atoms only, based on short range.
251 * Total virial is computed in global_stat, called from do_md
253 f_calc_vir(start, start+homenr, x, f, vir_part, graph, box);
254 inc_nrnb(nrnb, eNR_VIRIAL, homenr);
256 if (debug)
258 pr_rvecs(debug, 0, "vir_part", vir_part, DIM);
262 static void pull_potential_wrapper(const t_commrec *cr,
263 const t_inputrec *ir,
264 const matrix box, gmx::ArrayRef<const gmx::RVec> x,
265 gmx::ForceWithVirial *force,
266 const t_mdatoms *mdatoms,
267 gmx_enerdata_t *enerd,
268 const real *lambda,
269 double t,
270 gmx_wallcycle_t wcycle)
272 t_pbc pbc;
273 real dvdl;
275 /* Calculate the center of mass forces; this requires communication,
276 * which is why pull_potential is called close to other communication.
278 wallcycle_start(wcycle, ewcPULLPOT);
279 set_pbc(&pbc, ir->ePBC, box);
280 dvdl = 0;
281 enerd->term[F_COM_PULL] +=
282 pull_potential(ir->pull_work, mdatoms, &pbc,
283 cr, t, lambda[efptRESTRAINT], as_rvec_array(x.data()), force, &dvdl);
284 enerd->dvdl_lin[efptRESTRAINT] += dvdl;
285 wallcycle_stop(wcycle, ewcPULLPOT);
288 static void pme_receive_force_ener(const t_commrec *cr,
289 gmx::ForceWithVirial *forceWithVirial,
290 gmx_enerdata_t *enerd,
291 gmx_wallcycle_t wcycle)
293 real e_q, e_lj, dvdl_q, dvdl_lj;
294 float cycles_ppdpme, cycles_seppme;
296 cycles_ppdpme = wallcycle_stop(wcycle, ewcPPDURINGPME);
297 dd_cycles_add(cr->dd, cycles_ppdpme, ddCyclPPduringPME);
299 /* In case of node-splitting, the PP nodes receive the long-range
300 * forces, virial and energy from the PME nodes here.
302 wallcycle_start(wcycle, ewcPP_PMEWAITRECVF);
303 dvdl_q = 0;
304 dvdl_lj = 0;
305 gmx_pme_receive_f(cr, forceWithVirial, &e_q, &e_lj, &dvdl_q, &dvdl_lj,
306 &cycles_seppme);
307 enerd->term[F_COUL_RECIP] += e_q;
308 enerd->term[F_LJ_RECIP] += e_lj;
309 enerd->dvdl_lin[efptCOUL] += dvdl_q;
310 enerd->dvdl_lin[efptVDW] += dvdl_lj;
312 if (wcycle)
314 dd_cycles_add(cr->dd, cycles_seppme, ddCyclPME);
316 wallcycle_stop(wcycle, ewcPP_PMEWAITRECVF);
319 static void print_large_forces(FILE *fp,
320 const t_mdatoms *md,
321 const t_commrec *cr,
322 int64_t step,
323 real forceTolerance,
324 const rvec *x,
325 const rvec *f)
327 real force2Tolerance = gmx::square(forceTolerance);
328 gmx::index numNonFinite = 0;
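/* Note: the check below compares squared norms, so no per-atom sqrt is needed; the root
 * is only taken for the few atoms that are actually printed. */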
329 for (int i = 0; i < md->homenr; i++)
331 real force2 = norm2(f[i]);
332 bool nonFinite = !std::isfinite(force2);
333 if (force2 >= force2Tolerance || nonFinite)
335 fprintf(fp, "step %" PRId64 " atom %6d x %8.3f %8.3f %8.3f force %12.5e\n",
336 step,
337 ddglatnr(cr->dd, i), x[i][XX], x[i][YY], x[i][ZZ], std::sqrt(force2));
339 if (nonFinite)
341 numNonFinite++;
344 if (numNonFinite > 0)
346 /* Note that with MPI this fatal call on one rank might interrupt
347 * the printing on other ranks. But we can only avoid that with
348 * an expensive MPI barrier that we would need at each step.
350 gmx_fatal(FARGS, "At step %" PRId64 " detected non-finite forces on %td atoms", step, numNonFinite);
354 static void post_process_forces(const t_commrec *cr,
355 int64_t step,
356 t_nrnb *nrnb,
357 gmx_wallcycle_t wcycle,
358 const gmx_localtop_t *top,
359 const matrix box,
360 const rvec x[],
361 rvec f[],
362 gmx::ForceWithVirial *forceWithVirial,
363 tensor vir_force,
364 const t_mdatoms *mdatoms,
365 const t_graph *graph,
366 const t_forcerec *fr,
367 const gmx_vsite_t *vsite,
368 int flags)
370 if (fr->haveDirectVirialContributions)
372 rvec *fDirectVir = as_rvec_array(forceWithVirial->force_.data());
374 if (vsite)
376 /* Spread the mesh force on virtual sites to the other particles...
377 * This is parallelized. MPI communication is performed
378 * if the constructing atoms aren't local.
380 matrix virial = { { 0 } };
381 spread_vsite_f(vsite, x, fDirectVir, nullptr,
382 (flags & GMX_FORCE_VIRIAL) != 0, virial,
383 nrnb,
384 &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr, wcycle);
385 forceWithVirial->addVirialContribution(virial);
388 if (flags & GMX_FORCE_VIRIAL)
390 /* Now add the forces, this is local */
391 sum_forces(f, forceWithVirial->force_);
393 /* Add the direct virial contributions */
394 GMX_ASSERT(forceWithVirial->computeVirial_, "forceWithVirial should request virial computation when we request the virial");
395 m_add(vir_force, forceWithVirial->getVirial(), vir_force);
397 if (debug)
399 pr_rvecs(debug, 0, "vir_force", vir_force, DIM);
404 if (fr->print_force >= 0)
406 print_large_forces(stderr, mdatoms, cr, step, fr->print_force, x, f);
410 static void do_nb_verlet(const t_forcerec *fr,
411 const interaction_const_t *ic,
412 gmx_enerdata_t *enerd,
413 int flags, int ilocality,
414 int clearF,
415 int64_t step,
416 t_nrnb *nrnb,
417 gmx_wallcycle_t wcycle)
419 if (!(flags & GMX_FORCE_NONBONDED))
421 /* skip non-bonded calculation */
422 return;
425 nonbonded_verlet_t *nbv = fr->nbv;
426 nonbonded_verlet_group_t *nbvg = &nbv->grp[ilocality];
428 /* GPU kernel launch overhead is already timed separately */
429 if (fr->cutoff_scheme != ecutsVERLET)
431 gmx_incons("Invalid cut-off scheme passed!");
434 bool bUsingGpuKernels = (nbvg->kernel_type == nbnxnk8x8x8_GPU);
436 if (!bUsingGpuKernels)
438 /* When dynamic pair-list pruning is requested, we need to prune
439 * at nstlistPrune steps.
441 if (nbv->listParams->useDynamicPruning &&
442 (step - nbvg->nbl_lists.outerListCreationStep) % nbv->listParams->nstlistPrune == 0)
444 /* Prune the pair-list beyond fr->ic->rlistPrune using
445 * the current coordinates of the atoms.
447 wallcycle_sub_start(wcycle, ewcsNONBONDED_PRUNING);
448 nbnxn_kernel_cpu_prune(nbvg, nbv->nbat, fr->shift_vec, nbv->listParams->rlistInner);
449 wallcycle_sub_stop(wcycle, ewcsNONBONDED_PRUNING);
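/* Illustrative cadence (example values, not taken from this file): with nstlist = 100
 * and nstlistPrune = 10, this CPU prune kernel runs 10, 20, 30, ... steps after the
 * outer list was created, pruning pairs beyond rlistInner using the current coordinates. */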
452 wallcycle_sub_start(wcycle, ewcsNONBONDED);
455 switch (nbvg->kernel_type)
457 case nbnxnk4x4_PlainC:
458 case nbnxnk4xN_SIMD_4xN:
459 case nbnxnk4xN_SIMD_2xNN:
460 nbnxn_kernel_cpu(nbvg,
461 nbv->nbat,
463 fr->shift_vec,
464 flags,
465 clearF,
466 fr->fshift[0],
467 enerd->grpp.ener[egCOULSR],
468 fr->bBHAM ?
469 enerd->grpp.ener[egBHAMSR] :
470 enerd->grpp.ener[egLJSR]);
471 break;
473 case nbnxnk8x8x8_GPU:
474 nbnxn_gpu_launch_kernel(nbv->gpu_nbv, nbv->nbat, flags, ilocality);
475 break;
477 case nbnxnk8x8x8_PlainC:
478 nbnxn_kernel_gpu_ref(nbvg->nbl_lists.nbl[0],
479 nbv->nbat, ic,
480 fr->shift_vec,
481 flags,
482 clearF,
483 nbv->nbat->out[0].f,
484 fr->fshift[0],
485 enerd->grpp.ener[egCOULSR],
486 fr->bBHAM ?
487 enerd->grpp.ener[egBHAMSR] :
488 enerd->grpp.ener[egLJSR]);
489 break;
491 default:
492 GMX_RELEASE_ASSERT(false, "Invalid nonbonded kernel type passed!");
495 if (!bUsingGpuKernels)
497 wallcycle_sub_stop(wcycle, ewcsNONBONDED);
500 int enr_nbnxn_kernel_ljc, enr_nbnxn_kernel_lj;
501 if (EEL_RF(ic->eeltype) || ic->eeltype == eelCUT)
503 enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_RF;
505 else if ((!bUsingGpuKernels && nbvg->ewald_excl == ewaldexclAnalytical) ||
506 (bUsingGpuKernels && nbnxn_gpu_is_kernel_ewald_analytical(nbv->gpu_nbv)))
508 enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_EWALD;
510 else
512 enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_TAB;
514 enr_nbnxn_kernel_lj = eNR_NBNXN_LJ;
515 if (flags & GMX_FORCE_ENERGY)
517 /* In eNR_??? the nbnxn F+E kernels are always the F kernel + 1 */
518 enr_nbnxn_kernel_ljc += 1;
519 enr_nbnxn_kernel_lj += 1;
522 inc_nrnb(nrnb, enr_nbnxn_kernel_ljc,
523 nbvg->nbl_lists.natpair_ljq);
524 inc_nrnb(nrnb, enr_nbnxn_kernel_lj,
525 nbvg->nbl_lists.natpair_lj);
526 /* The Coulomb-only kernels are offset -eNR_NBNXN_LJ_RF+eNR_NBNXN_RF */
527 inc_nrnb(nrnb, enr_nbnxn_kernel_ljc-eNR_NBNXN_LJ_RF+eNR_NBNXN_RF,
528 nbvg->nbl_lists.natpair_q);
530 if (ic->vdw_modifier == eintmodFORCESWITCH)
532 /* We add up the switch cost separately */
533 inc_nrnb(nrnb, eNR_NBNXN_ADD_LJ_FSW+((flags & GMX_FORCE_ENERGY) ? 1 : 0),
534 nbvg->nbl_lists.natpair_ljq + nbvg->nbl_lists.natpair_lj);
536 if (ic->vdw_modifier == eintmodPOTSWITCH)
538 /* We add up the switch cost separately */
539 inc_nrnb(nrnb, eNR_NBNXN_ADD_LJ_PSW+((flags & GMX_FORCE_ENERGY) ? 1 : 0),
540 nbvg->nbl_lists.natpair_ljq + nbvg->nbl_lists.natpair_lj);
542 if (ic->vdwtype == evdwPME)
544 /* We add up the LJ Ewald cost separately */
545 inc_nrnb(nrnb, eNR_NBNXN_ADD_LJ_EWALD+((flags & GMX_FORCE_ENERGY) ? 1 : 0),
546 nbvg->nbl_lists.natpair_ljq + nbvg->nbl_lists.natpair_lj);
550 static void do_nb_verlet_fep(nbnxn_pairlist_set_t *nbl_lists,
551 t_forcerec *fr,
552 rvec x[],
553 rvec f[],
554 const t_mdatoms *mdatoms,
555 t_lambda *fepvals,
556 real *lambda,
557 gmx_enerdata_t *enerd,
558 int flags,
559 t_nrnb *nrnb,
560 gmx_wallcycle_t wcycle)
562 int donb_flags;
563 nb_kernel_data_t kernel_data;
564 real lam_i[efptNR];
565 real dvdl_nb[efptNR];
566 int th;
567 int i, j;
569 donb_flags = 0;
570 /* Add short-range interactions */
571 donb_flags |= GMX_NONBONDED_DO_SR;
573 /* Currently all group scheme kernels always calculate (shift-)forces */
574 if (flags & GMX_FORCE_FORCES)
576 donb_flags |= GMX_NONBONDED_DO_FORCE;
578 if (flags & GMX_FORCE_VIRIAL)
580 donb_flags |= GMX_NONBONDED_DO_SHIFTFORCE;
582 if (flags & GMX_FORCE_ENERGY)
584 donb_flags |= GMX_NONBONDED_DO_POTENTIAL;
587 kernel_data.flags = donb_flags;
588 kernel_data.lambda = lambda;
589 kernel_data.dvdl = dvdl_nb;
591 kernel_data.energygrp_elec = enerd->grpp.ener[egCOULSR];
592 kernel_data.energygrp_vdw = enerd->grpp.ener[egLJSR];
594 /* reset free energy components */
595 for (i = 0; i < efptNR; i++)
597 dvdl_nb[i] = 0;
600 GMX_ASSERT(gmx_omp_nthreads_get(emntNonbonded) == nbl_lists->nnbl, "Number of lists should be same as number of NB threads");
602 wallcycle_sub_start(wcycle, ewcsNONBONDED);
603 #pragma omp parallel for schedule(static) num_threads(nbl_lists->nnbl)
604 for (th = 0; th < nbl_lists->nnbl; th++)
608 gmx_nb_free_energy_kernel(nbl_lists->nbl_fep[th],
609 x, f, fr, mdatoms, &kernel_data, nrnb);
611 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
614 if (fepvals->sc_alpha != 0)
616 enerd->dvdl_nonlin[efptVDW] += dvdl_nb[efptVDW];
617 enerd->dvdl_nonlin[efptCOUL] += dvdl_nb[efptCOUL];
619 else
621 enerd->dvdl_lin[efptVDW] += dvdl_nb[efptVDW];
622 enerd->dvdl_lin[efptCOUL] += dvdl_nb[efptCOUL];
625 /* If we do foreign lambda and we have soft-core interactions
626 * we have to recalculate the (non-linear) energy contributions.
628 if (fepvals->n_lambda > 0 && (flags & GMX_FORCE_DHDL) && fepvals->sc_alpha != 0)
630 kernel_data.flags = (donb_flags & ~(GMX_NONBONDED_DO_FORCE | GMX_NONBONDED_DO_SHIFTFORCE)) | GMX_NONBONDED_DO_FOREIGNLAMBDA;
631 kernel_data.lambda = lam_i;
632 kernel_data.energygrp_elec = enerd->foreign_grpp.ener[egCOULSR];
633 kernel_data.energygrp_vdw = enerd->foreign_grpp.ener[egLJSR];
634 /* Note that we add to kernel_data.dvdl, but ignore the result */
636 for (i = 0; i < enerd->n_lambda; i++)
638 for (j = 0; j < efptNR; j++)
640 lam_i[j] = (i == 0 ? lambda[j] : fepvals->all_lambda[j][i-1]);
642 reset_foreign_enerdata(enerd);
643 #pragma omp parallel for schedule(static) num_threads(nbl_lists->nnbl)
644 for (th = 0; th < nbl_lists->nnbl; th++)
648 gmx_nb_free_energy_kernel(nbl_lists->nbl_fep[th],
649 x, f, fr, mdatoms, &kernel_data, nrnb);
651 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
654 sum_epot(&(enerd->foreign_grpp), enerd->foreign_term);
655 enerd->enerpart_lambda[i] += enerd->foreign_term[F_EPOT];
659 wallcycle_sub_stop(wcycle, ewcsNONBONDED);
662 gmx_bool use_GPU(const nonbonded_verlet_t *nbv)
664 return nbv != nullptr && nbv->bUseGPU;
667 static inline void clear_rvecs_omp(int n, rvec v[])
669 int nth = gmx_omp_nthreads_get_simple_rvec_task(emntDefault, n);
671 /* Note that we would like to avoid this conditional by putting it
672 * into the omp pragma instead, but then we still take the full
673 * omp parallel for overhead (at least with gcc5).
675 if (nth == 1)
677 for (int i = 0; i < n; i++)
679 clear_rvec(v[i]);
682 else
684 #pragma omp parallel for num_threads(nth) schedule(static)
685 for (int i = 0; i < n; i++)
687 clear_rvec(v[i]);
692 /*! \brief Return an estimate of the average kinetic energy or 0 when unreliable
694 * \param groupOptions Group options, containing T-coupling options
696 static real averageKineticEnergyEstimate(const t_grpopts &groupOptions)
698 real nrdfCoupled = 0;
699 real nrdfUncoupled = 0;
700 real kineticEnergy = 0;
701 for (int g = 0; g < groupOptions.ngtc; g++)
703 if (groupOptions.tau_t[g] >= 0)
705 nrdfCoupled += groupOptions.nrdf[g];
706 kineticEnergy += groupOptions.nrdf[g]*0.5*groupOptions.ref_t[g]*BOLTZ;
708 else
710 nrdfUncoupled += groupOptions.nrdf[g];
714 /* This conditional with > also catches nrdf=0 */
715 if (nrdfCoupled > nrdfUncoupled)
717 return kineticEnergy*(nrdfCoupled + nrdfUncoupled)/nrdfCoupled;
719 else
721 return 0;
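/* Rationale (standard equipartition estimate): each temperature-coupled degree of
 * freedom contributes k_B*T_ref/2, which is the 0.5*nrdf*BOLTZ*ref_t term summed above.
 * The coupled-group total is scaled by (nrdfCoupled + nrdfUncoupled)/nrdfCoupled to
 * approximate the whole system; when uncoupled degrees of freedom dominate, 0 is
 * returned to signal that the estimate is unreliable. */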
725 /*! \brief This routine checks that the potential energy is finite.
727 * Always checks that the potential energy is finite. If step equals
728 * inputrec.init_step also checks that the magnitude of the potential energy
729 * is reasonable. Terminates with a fatal error when a check fails.
730 * Note that passing this check does not guarantee finite forces,
731 * since those use slightly different arithmetic. But in most cases
732 * there is just a narrow coordinate range where forces are not finite
733 * and energies are finite.
735 * \param[in] step The step number, used for checking and printing
736 * \param[in] enerd The energy data; the non-bonded group energies need to be added to enerd.term[F_EPOT] before calling this routine
737 * \param[in] inputrec The input record
739 static void checkPotentialEnergyValidity(int64_t step,
740 const gmx_enerdata_t &enerd,
741 const t_inputrec &inputrec)
743 /* Threshold valid for comparing absolute potential energy against
744 * the kinetic energy. Normally one should not consider absolute
745 * potential energy values, but with a factor of one million
746 * we should never get false positives.
748 constexpr real c_thresholdFactor = 1e6;
750 bool energyIsNotFinite = !std::isfinite(enerd.term[F_EPOT]);
751 real averageKineticEnergy = 0;
752 /* We only check for large potential energy at the initial step,
753 * because that is by far the most likely step for this to occur
754 * and because computing the average kinetic energy is not free.
755 * Note: nstcalcenergy >> 1 often does not allow catching large energies
756 * before they become NaN.
758 if (step == inputrec.init_step && EI_DYNAMICS(inputrec.eI))
760 averageKineticEnergy = averageKineticEnergyEstimate(inputrec.opts);
763 if (energyIsNotFinite || (averageKineticEnergy > 0 &&
764 enerd.term[F_EPOT] > c_thresholdFactor*averageKineticEnergy))
766 gmx_fatal(FARGS, "Step %" PRId64 ": The total potential energy is %g, which is %s. The LJ and electrostatic contributions to the energy are %g and %g, respectively. A %s potential energy can be caused by overlapping interactions in bonded interactions or very large%s coordinate values. Usually this is caused by a badly- or non-equilibrated initial configuration, incorrect interactions or parameters in the topology.",
767 step,
768 enerd.term[F_EPOT],
769 energyIsNotFinite ? "not finite" : "extremely high",
770 enerd.term[F_LJ],
771 enerd.term[F_COUL_SR],
772 energyIsNotFinite ? "non-finite" : "very high",
773 energyIsNotFinite ? " or NaN" : "");
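/* Worked example (assumed numbers): if the coupled degrees of freedom give an average
 * kinetic energy of roughly 1e3 kJ/mol, then with c_thresholdFactor = 1e6 a potential
 * energy above ~1e9 kJ/mol at the initial step triggers this fatal error. */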
777 /*! \brief Compute forces and/or energies for special algorithms
779 * The intention is to collect all calls to algorithms that compute
780 * forces on local atoms only and that do not contribute to the local
781 * virial sum (but add their virial contribution separately).
782 * Eventually these should likely all become ForceProviders.
783 * Within this function the intention is to have algorithms that do
784 * global communication at the end, so global barriers within the MD loop
785 * are as close together as possible.
787 * \param[in] fplog The log file
788 * \param[in] cr The communication record
789 * \param[in] inputrec The input record
790 * \param[in] awh The Awh module (nullptr if none in use).
791 * \param[in] enforcedRotation Enforced rotation module.
792 * \param[in] step The current MD step
793 * \param[in] t The current time
794 * \param[in,out] wcycle Wallcycle accounting struct
795 * \param[in,out] forceProviders Pointer to a list of force providers
796 * \param[in] box The unit cell
797 * \param[in] x The coordinates
798 * \param[in] mdatoms Per atom properties
799 * \param[in] lambda Array of free-energy lambda values
800 * \param[in] forceFlags Flags that tell whether we should compute forces/energies/virial
801 * \param[in,out] forceWithVirial Force and virial buffers
802 * \param[in,out] enerd Energy buffer
803 * \param[in,out] ed Essential dynamics pointer
804 * \param[in] bNS Tells if we did neighbor searching this step, used for ED sampling
806 * \todo Remove bNS, which is used incorrectly.
807 * \todo Convert all other algorithms called here to ForceProviders.
809 static void
810 computeSpecialForces(FILE *fplog,
811 const t_commrec *cr,
812 const t_inputrec *inputrec,
813 gmx::Awh *awh,
814 gmx_enfrot *enforcedRotation,
815 int64_t step,
816 double t,
817 gmx_wallcycle_t wcycle,
818 ForceProviders *forceProviders,
819 matrix box,
820 gmx::ArrayRef<const gmx::RVec> x,
821 const t_mdatoms *mdatoms,
822 real *lambda,
823 int forceFlags,
824 gmx::ForceWithVirial *forceWithVirial,
825 gmx_enerdata_t *enerd,
826 gmx_edsam *ed,
827 gmx_bool bNS)
829 const bool computeForces = (forceFlags & GMX_FORCE_FORCES) != 0;
831 /* NOTE: Currently all ForceProviders only provide forces.
832 * When they also provide energies, remove this conditional.
834 if (computeForces)
836 gmx::ForceProviderInput forceProviderInput(x, *mdatoms, t, box, *cr);
837 gmx::ForceProviderOutput forceProviderOutput(forceWithVirial, enerd);
839 /* Collect forces from modules */
840 forceProviders->calculateForces(forceProviderInput, &forceProviderOutput);
843 if (inputrec->bPull && pull_have_potential(inputrec->pull_work))
845 pull_potential_wrapper(cr, inputrec, box, x,
846 forceWithVirial,
847 mdatoms, enerd, lambda, t,
848 wcycle);
850 if (awh)
852 enerd->term[F_COM_PULL] +=
853 awh->applyBiasForcesAndUpdateBias(inputrec->ePBC, *mdatoms, box,
854 forceWithVirial,
855 t, step, wcycle, fplog);
859 rvec *f = as_rvec_array(forceWithVirial->force_.data());
861 /* Add the forces from enforced rotation potentials (if any) */
862 if (inputrec->bRot)
864 wallcycle_start(wcycle, ewcROTadd);
865 enerd->term[F_COM_PULL] += add_rot_forces(enforcedRotation, f, cr, step, t);
866 wallcycle_stop(wcycle, ewcROTadd);
869 if (ed)
871 /* Note that since init_edsam() is called after the initialization
872 * of forcerec, edsam doesn't request the noVirSum force buffer.
873 * Thus if no other algorithm (e.g. PME) requires it, the forces
874 * here will contribute to the virial.
876 do_flood(cr, inputrec, as_rvec_array(x.data()), f, ed, box, step, bNS);
879 /* Add forces from interactive molecular dynamics (IMD), if bIMD == TRUE. */
880 if (inputrec->bIMD && computeForces)
882 IMD_apply_forces(inputrec->bIMD, inputrec->imd, cr, f, wcycle);
886 /*! \brief Launch the prepare_step and spread stages of PME GPU.
888 * \param[in] pmedata The PME structure
889 * \param[in] box The box matrix
890 * \param[in] x Coordinate array
891 * \param[in] flags Force flags
892 * \param[in] wcycle The wallcycle structure
894 static inline void launchPmeGpuSpread(gmx_pme_t *pmedata,
895 matrix box,
896 rvec x[],
897 int flags,
898 gmx_wallcycle_t wcycle)
900 int pmeFlags = GMX_PME_SPREAD | GMX_PME_SOLVE;
901 pmeFlags |= (flags & GMX_FORCE_FORCES) ? GMX_PME_CALC_F : 0;
902 pmeFlags |= (flags & GMX_FORCE_VIRIAL) ? GMX_PME_CALC_ENER_VIR : 0;
904 pme_gpu_prepare_computation(pmedata, (flags & GMX_FORCE_DYNAMICBOX) != 0, box, wcycle, pmeFlags);
905 pme_gpu_launch_spread(pmedata, x, wcycle);
908 /*! \brief Launch the FFT and gather stages of PME GPU
910 * This function only implements setting the output forces (no accumulation).
912 * \param[in] pmedata The PME structure
913 * \param[in] wcycle The wallcycle structure
915 static void launchPmeGpuFftAndGather(gmx_pme_t *pmedata,
916 gmx_wallcycle_t wcycle)
918 pme_gpu_launch_complex_transforms(pmedata, wcycle);
919 pme_gpu_launch_gather(pmedata, wcycle, PmeForceOutputHandling::Set);
922 /*! \brief
923 * Polling wait for either of the PME or nonbonded GPU tasks.
925 * Instead of a static order in waiting for GPU tasks, this function
926 * polls checking which of the two tasks completes first, and does the
927 * associated force buffer reduction overlapped with the other task.
928 * By doing that, unlike static scheduling order, it can always overlap
929 * one of the reductions, regardless of the GPU task completion order.
931 * \param[in] nbv Nonbonded verlet structure
932 * \param[in] pmedata PME module data
933 * \param[in,out] force Force array to reduce task outputs into.
934 * \param[in,out] forceWithVirial Force and virial buffers
935 * \param[in,out] fshift Shift force output vector results are reduced into
936 * \param[in,out] enerd Energy data structure results are reduced into
937 * \param[in] flags Force flags
938 * \param[in] haveOtherWork Tells whether there is other work than non-bonded in the stream(s)
939 * \param[in] wcycle The wallcycle structure
941 static void alternatePmeNbGpuWaitReduce(nonbonded_verlet_t *nbv,
942 const gmx_pme_t *pmedata,
943 gmx::ArrayRefWithPadding<gmx::RVec> *force,
944 gmx::ForceWithVirial *forceWithVirial,
945 rvec fshift[],
946 gmx_enerdata_t *enerd,
947 int flags,
948 bool haveOtherWork,
949 gmx_wallcycle_t wcycle)
951 bool isPmeGpuDone = false;
952 bool isNbGpuDone = false;
955 gmx::ArrayRef<const gmx::RVec> pmeGpuForces;
957 while (!isPmeGpuDone || !isNbGpuDone)
959 if (!isPmeGpuDone)
961 matrix vir_Q;
962 real Vlr_q;
964 GpuTaskCompletion completionType = (isNbGpuDone) ? GpuTaskCompletion::Wait : GpuTaskCompletion::Check;
965 isPmeGpuDone = pme_gpu_try_finish_task(pmedata, wcycle, &pmeGpuForces,
966 vir_Q, &Vlr_q, completionType);
968 if (isPmeGpuDone)
970 pme_gpu_reduce_outputs(wcycle, forceWithVirial, pmeGpuForces,
971 enerd, vir_Q, Vlr_q);
975 if (!isNbGpuDone)
977 GpuTaskCompletion completionType = (isPmeGpuDone) ? GpuTaskCompletion::Wait : GpuTaskCompletion::Check;
978 wallcycle_start_nocount(wcycle, ewcWAIT_GPU_NB_L);
979 isNbGpuDone = nbnxn_gpu_try_finish_task(nbv->gpu_nbv,
980 flags, eatLocal,
981 haveOtherWork,
982 enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
983 fshift, completionType);
984 wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
985 // To get the call count right, when the task finished we
986 // issue a start/stop.
987 // TODO: move the ewcWAIT_GPU_NB_L cycle counting into nbnxn_gpu_try_finish_task()
988 // and ewcNB_XF_BUF_OPS counting into nbnxn_atomdata_add_nbat_f_to_f().
989 if (isNbGpuDone)
991 wallcycle_start(wcycle, ewcWAIT_GPU_NB_L);
992 wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
994 nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), eatLocal,
995 nbv->nbat, as_rvec_array(force->unpaddedArrayRef().data()), wcycle);
1001 /*! \brief
1002 * Launch the dynamic rolling pruning GPU task.
1004 * We currently alternate local/non-local list pruning in odd-even steps
1005 * (only pruning every second step without DD).
1007 * \param[in] cr The communication record
1008 * \param[in] nbv Nonbonded verlet structure
1009 * \param[in] inputrec The input record
1010 * \param[in] step The current MD step
1012 static inline void launchGpuRollingPruning(const t_commrec *cr,
1013 const nonbonded_verlet_t *nbv,
1014 const t_inputrec *inputrec,
1015 const int64_t step)
1017 /* We should not launch the rolling pruning kernel at a search
1018 * step or just before search steps, since that's useless.
1019 * Without domain decomposition we prune at even steps.
1020 * With domain decomposition we alternate local and non-local
1021 * pruning at even and odd steps.
1023 int numRollingParts = nbv->listParams->numRollingParts;
1024 GMX_ASSERT(numRollingParts == nbv->listParams->nstlistPrune/2, "Since we alternate local/non-local at even/odd steps, we need numRollingParts<=nstlistPrune/2 for correctness and == for efficiency");
1025 int stepWithCurrentList = step - nbv->grp[eintLocal].nbl_lists.outerListCreationStep;
1026 bool stepIsEven = ((stepWithCurrentList & 1) == 0);
1027 if (stepWithCurrentList > 0 &&
1028 stepWithCurrentList < inputrec->nstlist - 1 &&
1029 (stepIsEven || DOMAINDECOMP(cr)))
1031 nbnxn_gpu_launch_kernel_pruneonly(nbv->gpu_nbv,
1032 stepIsEven ? eintLocal : eintNonlocal,
1033 numRollingParts);
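/* Example schedule (assumed nstlist = 20, values not taken from this file): rolling-prune
 * launches are possible at steps 1..18 after list creation; without domain decomposition
 * only the even steps prune (the local list), while with domain decomposition even steps
 * prune the local list and odd steps the non-local list. */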
1037 static void do_force_cutsVERLET(FILE *fplog,
1038 const t_commrec *cr,
1039 const gmx_multisim_t *ms,
1040 const t_inputrec *inputrec,
1041 gmx::Awh *awh,
1042 gmx_enfrot *enforcedRotation,
1043 int64_t step,
1044 t_nrnb *nrnb,
1045 gmx_wallcycle_t wcycle,
1046 const gmx_localtop_t *top,
1047 const gmx_groups_t * /* groups */,
1048 matrix box, gmx::ArrayRefWithPadding<gmx::RVec> x,
1049 history_t *hist,
1050 gmx::ArrayRefWithPadding<gmx::RVec> force,
1051 tensor vir_force,
1052 const t_mdatoms *mdatoms,
1053 gmx_enerdata_t *enerd, t_fcdata *fcd,
1054 real *lambda,
1055 t_graph *graph,
1056 t_forcerec *fr,
1057 gmx::PpForceWorkload *ppForceWorkload,
1058 interaction_const_t *ic,
1059 const gmx_vsite_t *vsite,
1060 rvec mu_tot,
1061 double t,
1062 gmx_edsam *ed,
1063 int flags,
1064 DdOpenBalanceRegionBeforeForceComputation ddOpenBalanceRegion,
1065 DdCloseBalanceRegionAfterForceComputation ddCloseBalanceRegion)
1067 int cg1, i, j;
1068 double mu[2*DIM];
1069 gmx_bool bStateChanged, bNS, bFillGrid, bCalcCGCM;
1070 gmx_bool bDoForces, bUseGPU, bUseOrEmulGPU;
1071 rvec vzero, box_diag;
1072 float cycles_pme, cycles_wait_gpu;
1073 nonbonded_verlet_t *nbv = fr->nbv;
1075 bStateChanged = ((flags & GMX_FORCE_STATECHANGED) != 0);
1076 bNS = ((flags & GMX_FORCE_NS) != 0) && (!fr->bAllvsAll);
1077 bFillGrid = (bNS && bStateChanged);
1078 bCalcCGCM = (bFillGrid && !DOMAINDECOMP(cr));
1079 bDoForces = ((flags & GMX_FORCE_FORCES) != 0);
1080 bUseGPU = fr->nbv->bUseGPU;
1081 bUseOrEmulGPU = bUseGPU || (fr->nbv->emulateGpu == EmulateGpuNonbonded::Yes);
1083 const auto pmeRunMode = fr->pmedata ? pme_run_mode(fr->pmedata) : PmeRunMode::CPU;
1084 // TODO slim this conditional down - inputrec and duty checks should mean the same in proper code!
1085 const bool useGpuPme = EEL_PME(fr->ic->eeltype) && thisRankHasDuty(cr, DUTY_PME) &&
1086 ((pmeRunMode == PmeRunMode::GPU) || (pmeRunMode == PmeRunMode::Mixed));
1088 /* At a search step we need to start the first balancing region
1089 * somewhere early inside the step after communication during domain
1090 * decomposition (and not during the previous step as usual).
1092 if (bNS &&
1093 ddOpenBalanceRegion == DdOpenBalanceRegionBeforeForceComputation::yes)
1095 ddOpenBalanceRegionCpu(cr->dd, DdAllowBalanceRegionReopen::yes);
1098 cycles_wait_gpu = 0;
1100 const int start = 0;
1101 const int homenr = mdatoms->homenr;
1103 clear_mat(vir_force);
1105 if (DOMAINDECOMP(cr))
1107 cg1 = cr->dd->globalAtomGroupIndices.size();
1109 else
1111 cg1 = top->cgs.nr;
1113 if (fr->n_tpi > 0)
1115 cg1--;
1118 if (bStateChanged)
1120 update_forcerec(fr, box);
1122 if (inputrecNeedMutot(inputrec))
1124 /* Calculate total (local) dipole moment in a temporary common array.
1125 * This makes it possible to sum them over nodes faster.
1127 calc_mu(start, homenr,
1128 x.unpaddedArrayRef(), mdatoms->chargeA, mdatoms->chargeB, mdatoms->nChargePerturbed,
1129 mu, mu+DIM);
1133 if (fr->ePBC != epbcNONE)
1135 /* Compute shift vectors every step,
1136 * because of pressure coupling or box deformation!
1138 if ((flags & GMX_FORCE_DYNAMICBOX) && bStateChanged)
1140 calc_shifts(box, fr->shift_vec);
1143 if (bCalcCGCM)
1145 put_atoms_in_box_omp(fr->ePBC, box, x.unpaddedArrayRef().subArray(0, homenr));
1146 inc_nrnb(nrnb, eNR_SHIFTX, homenr);
1148 else if (EI_ENERGY_MINIMIZATION(inputrec->eI) && graph)
1150 unshift_self(graph, box, as_rvec_array(x.unpaddedArrayRef().data()));
1154 nbnxn_atomdata_copy_shiftvec((flags & GMX_FORCE_DYNAMICBOX) != 0,
1155 fr->shift_vec, nbv->nbat);
1157 #if GMX_MPI
1158 if (!thisRankHasDuty(cr, DUTY_PME))
1160 /* Send particle coordinates to the pme nodes.
1161 * Since this is only implemented for domain decomposition
1162 * and domain decomposition does not use the graph,
1163 * we do not need to worry about shifting.
1165 gmx_pme_send_coordinates(cr, box, as_rvec_array(x.unpaddedArrayRef().data()),
1166 lambda[efptCOUL], lambda[efptVDW],
1167 (flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY)) != 0,
1168 step, wcycle);
1170 #endif /* GMX_MPI */
1172 if (useGpuPme)
1174 launchPmeGpuSpread(fr->pmedata, box, as_rvec_array(x.unpaddedArrayRef().data()), flags, wcycle);
1177 /* do gridding for pair search */
1178 if (bNS)
1180 if (graph && bStateChanged)
1182 /* Calculate intramolecular shift vectors to make molecules whole */
1183 mk_mshift(fplog, graph, fr->ePBC, box, as_rvec_array(x.unpaddedArrayRef().data()));
1186 clear_rvec(vzero);
1187 box_diag[XX] = box[XX][XX];
1188 box_diag[YY] = box[YY][YY];
1189 box_diag[ZZ] = box[ZZ][ZZ];
1191 wallcycle_start(wcycle, ewcNS);
1192 if (!DOMAINDECOMP(cr))
1194 wallcycle_sub_start(wcycle, ewcsNBS_GRID_LOCAL);
1195 nbnxn_put_on_grid(nbv->nbs.get(), fr->ePBC, box,
1196 0, vzero, box_diag,
1197 nullptr, 0, mdatoms->homenr, -1,
1198 fr->cginfo, x.unpaddedArrayRef(),
1199 0, nullptr,
1200 nbv->grp[eintLocal].kernel_type,
1201 nbv->nbat);
1202 wallcycle_sub_stop(wcycle, ewcsNBS_GRID_LOCAL);
1204 else
1206 wallcycle_sub_start(wcycle, ewcsNBS_GRID_NONLOCAL);
1207 nbnxn_put_on_grid_nonlocal(nbv->nbs.get(), domdec_zones(cr->dd),
1208 fr->cginfo, x.unpaddedArrayRef(),
1209 nbv->grp[eintNonlocal].kernel_type,
1210 nbv->nbat);
1211 wallcycle_sub_stop(wcycle, ewcsNBS_GRID_NONLOCAL);
1214 nbnxn_atomdata_set(nbv->nbat, nbv->nbs.get(), mdatoms, fr->cginfo);
1216 /* Now that we have put all atoms on the grid, we can assign bonded interactions
1217 * to the GPU, where the grid order is needed.
1219 if (fr->gpuBondedLists)
1221 assign_bondeds_to_gpu(fr->gpuBondedLists,
1222 nbnxn_get_gridindices(fr->nbv->nbs.get()),
1223 top->idef);
1225 update_gpu_bonded(fr->gpuBondedLists);
1226 ppForceWorkload->haveGpuBondedWork = bonded_gpu_have_interactions(fr->gpuBondedLists);
1229 wallcycle_stop(wcycle, ewcNS);
1232 /* initialize the GPU atom data and copy shift vector */
1233 if (bUseGPU)
1235 wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
1236 wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1238 if (bNS)
1240 nbnxn_gpu_init_atomdata(nbv->gpu_nbv, nbv->nbat);
1243 nbnxn_gpu_upload_shiftvec(nbv->gpu_nbv, nbv->nbat);
1245 wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1246 wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1249 /* do local pair search */
1250 if (bNS)
1252 wallcycle_start_nocount(wcycle, ewcNS);
1253 wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_LOCAL);
1254 nbnxn_make_pairlist(nbv->nbs.get(), nbv->nbat,
1255 &top->excls,
1256 nbv->listParams->rlistOuter,
1257 nbv->min_ci_balanced,
1258 &nbv->grp[eintLocal].nbl_lists,
1259 eintLocal,
1260 nbv->grp[eintLocal].kernel_type,
1261 nrnb);
1262 nbv->grp[eintLocal].nbl_lists.outerListCreationStep = step;
1263 if (nbv->listParams->useDynamicPruning && !bUseGPU)
1265 nbnxnPrepareListForDynamicPruning(&nbv->grp[eintLocal].nbl_lists);
1267 wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_LOCAL);
1269 if (bUseGPU)
1271 /* initialize local pair-list on the GPU */
1272 nbnxn_gpu_init_pairlist(nbv->gpu_nbv,
1273 nbv->grp[eintLocal].nbl_lists.nbl[0],
1274 eintLocal);
1276 wallcycle_stop(wcycle, ewcNS);
1278 else
1280 nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs.get(), eatLocal, FALSE, as_rvec_array(x.unpaddedArrayRef().data()),
1281 nbv->nbat, wcycle);
1284 if (bUseGPU)
1286 if (DOMAINDECOMP(cr))
1288 ddOpenBalanceRegionGpu(cr->dd);
1291 wallcycle_start(wcycle, ewcLAUNCH_GPU);
1292 wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1293 /* launch local nonbonded work on GPU */
1294 do_nb_verlet(fr, ic, enerd, flags, eintLocal, enbvClearFNo,
1295 step, nrnb, wcycle);
1296 wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1298 if (ppForceWorkload->haveGpuBondedWork && !DOMAINDECOMP(cr))
1300 do_bonded_gpu(fr, flags,
1301 nbnxn_gpu_get_xq(nbv->gpu_nbv), box,
1302 nbnxn_gpu_get_f(nbv->gpu_nbv),
1303 nbnxn_gpu_get_fshift(nbv->gpu_nbv));
1306 wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1309 if (useGpuPme)
1311 // In PME GPU and mixed mode we launch FFT / gather after the
1312 // X copy/transform to allow overlap as well as after the GPU NB
1313 // launch to avoid FFT launch overhead hijacking the CPU and delaying
1314 // the nonbonded kernel.
1315 launchPmeGpuFftAndGather(fr->pmedata, wcycle);
1318 /* Communicate coordinates and sum dipole if necessary +
1319 do non-local pair search */
1320 if (DOMAINDECOMP(cr))
1322 if (bNS)
1324 wallcycle_start_nocount(wcycle, ewcNS);
1325 wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_NONLOCAL);
1327 nbnxn_make_pairlist(nbv->nbs.get(), nbv->nbat,
1328 &top->excls,
1329 nbv->listParams->rlistOuter,
1330 nbv->min_ci_balanced,
1331 &nbv->grp[eintNonlocal].nbl_lists,
1332 eintNonlocal,
1333 nbv->grp[eintNonlocal].kernel_type,
1334 nrnb);
1335 nbv->grp[eintNonlocal].nbl_lists.outerListCreationStep = step;
1336 if (nbv->listParams->useDynamicPruning && !bUseGPU)
1338 nbnxnPrepareListForDynamicPruning(&nbv->grp[eintNonlocal].nbl_lists);
1340 wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_NONLOCAL);
1342 if (nbv->grp[eintNonlocal].kernel_type == nbnxnk8x8x8_GPU)
1344 /* initialize non-local pair-list on the GPU */
1345 nbnxn_gpu_init_pairlist(nbv->gpu_nbv,
1346 nbv->grp[eintNonlocal].nbl_lists.nbl[0],
1347 eintNonlocal);
1349 wallcycle_stop(wcycle, ewcNS);
1351 else
1353 dd_move_x(cr->dd, box, x.unpaddedArrayRef(), wcycle);
1355 nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs.get(), eatNonlocal, FALSE, as_rvec_array(x.unpaddedArrayRef().data()),
1356 nbv->nbat, wcycle);
1359 if (bUseGPU)
1361 wallcycle_start(wcycle, ewcLAUNCH_GPU);
1362 wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1363 /* launch non-local nonbonded tasks on GPU */
1364 do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFNo,
1365 step, nrnb, wcycle);
1366 wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1368 if (ppForceWorkload->haveGpuBondedWork)
1370 do_bonded_gpu(fr, flags,
1371 nbnxn_gpu_get_xq(nbv->gpu_nbv), box,
1372 nbnxn_gpu_get_f(nbv->gpu_nbv),
1373 nbnxn_gpu_get_fshift(nbv->gpu_nbv));
1376 wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1380 if (bUseGPU)
1382 /* launch D2H copy-back F */
1383 wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
1384 wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1385 if (DOMAINDECOMP(cr))
1387 nbnxn_gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat,
1388 flags, eatNonlocal, ppForceWorkload->haveGpuBondedWork);
1390 nbnxn_gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat,
1391 flags, eatLocal, ppForceWorkload->haveGpuBondedWork);
1392 wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1393 wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1396 if (bStateChanged && inputrecNeedMutot(inputrec))
1398 if (PAR(cr))
1400 gmx_sumd(2*DIM, mu, cr);
1401 ddReopenBalanceRegionCpu(cr->dd);
1404 for (i = 0; i < 2; i++)
1406 for (j = 0; j < DIM; j++)
1408 fr->mu_tot[i][j] = mu[i*DIM + j];
1412 if (fr->efep == efepNO)
1414 copy_rvec(fr->mu_tot[0], mu_tot);
1416 else
1418 for (j = 0; j < DIM; j++)
1420 mu_tot[j] =
1421 (1.0 - lambda[efptCOUL])*fr->mu_tot[0][j] +
1422 lambda[efptCOUL]*fr->mu_tot[1][j];
1426 /* Reset energies */
1427 reset_enerdata(enerd);
1428 clear_rvecs(SHIFTS, fr->fshift);
1430 if (DOMAINDECOMP(cr) && !thisRankHasDuty(cr, DUTY_PME))
1432 wallcycle_start(wcycle, ewcPPDURINGPME);
1433 dd_force_flop_start(cr->dd, nrnb);
1436 if (inputrec->bRot)
1438 wallcycle_start(wcycle, ewcROT);
1439 do_rotation(cr, enforcedRotation, box, as_rvec_array(x.unpaddedArrayRef().data()), t, step, bNS);
1440 wallcycle_stop(wcycle, ewcROT);
1443 /* Temporary solution until all routines take PaddedRVecVector */
1444 rvec *const f = as_rvec_array(force.unpaddedArrayRef().data());
1446 /* Start the force cycle counter.
1447 * Note that a different counter is used for dynamic load balancing.
1449 wallcycle_start(wcycle, ewcFORCE);
1451 gmx::ArrayRef<gmx::RVec> forceRef = force.unpaddedArrayRef();
1452 if (bDoForces)
1454 /* If we need to compute the virial, we might need a separate
1455 * force buffer for algorithms for which the virial is calculated
1456 * directly, such as PME.
1458 if ((flags & GMX_FORCE_VIRIAL) && fr->haveDirectVirialContributions)
1460 forceRef = *fr->forceBufferForDirectVirialContributions;
1462 /* TODO: update comment
1463 * We only compute forces on local atoms. Note that vsites can
1464 * spread to non-local atoms, but that part of the buffer is
1465 * cleared separately in the vsite spreading code.
1467 clear_rvecs_omp(forceRef.size(), as_rvec_array(forceRef.data()));
1469 /* Clear the short- and long-range forces */
1470 clear_rvecs_omp(fr->natoms_force_constr, f);
1473 /* forceWithVirial uses the local atom range only */
1474 gmx::ForceWithVirial forceWithVirial(forceRef, (flags & GMX_FORCE_VIRIAL) != 0);
1476 if (inputrec->bPull && pull_have_constraint(inputrec->pull_work))
1478 clear_pull_forces(inputrec->pull_work);
1481 /* We calculate the non-bonded forces, when done on the CPU, here.
1482 * We do this before calling do_force_lowlevel, because in that
1483 * function, the listed forces are calculated before PME, which
1484 * does communication. With this order, non-bonded and listed
1485 * force calculation imbalance can be balanced out by the domain
1486 * decomposition load balancing.
1489 if (!bUseOrEmulGPU)
1491 do_nb_verlet(fr, ic, enerd, flags, eintLocal, enbvClearFYes,
1492 step, nrnb, wcycle);
1495 if (fr->efep != efepNO)
1497 /* Calculate the local and non-local free energy interactions here.
1498 * Happens here on the CPU both with and without GPU.
1500 if (fr->nbv->grp[eintLocal].nbl_lists.nbl_fep[0]->nrj > 0)
1502 do_nb_verlet_fep(&fr->nbv->grp[eintLocal].nbl_lists,
1503 fr, as_rvec_array(x.unpaddedArrayRef().data()), f, mdatoms,
1504 inputrec->fepvals, lambda,
1505 enerd, flags, nrnb, wcycle);
1508 if (DOMAINDECOMP(cr) &&
1509 fr->nbv->grp[eintNonlocal].nbl_lists.nbl_fep[0]->nrj > 0)
1511 do_nb_verlet_fep(&fr->nbv->grp[eintNonlocal].nbl_lists,
1512 fr, as_rvec_array(x.unpaddedArrayRef().data()), f, mdatoms,
1513 inputrec->fepvals, lambda,
1514 enerd, flags, nrnb, wcycle);
1518 if (!bUseOrEmulGPU)
1520 int aloc;
1522 if (DOMAINDECOMP(cr))
1524 do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFNo,
1525 step, nrnb, wcycle);
1528 if (!bUseOrEmulGPU)
1530 aloc = eintLocal;
1532 else
1534 aloc = eintNonlocal;
1537 /* Add all the non-bonded forces to the normal force array.
1538 * This can be split into a local and a non-local part when overlapping
1539 * communication with calculation with domain decomposition.
1541 wallcycle_stop(wcycle, ewcFORCE);
1543 nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), eatAll, nbv->nbat, f, wcycle);
1545 wallcycle_start_nocount(wcycle, ewcFORCE);
1547 /* if there are multiple fshift output buffers reduce them */
1548 if ((flags & GMX_FORCE_VIRIAL) &&
1549 nbv->grp[aloc].nbl_lists.nnbl > 1)
1551 /* This is not in a subcounter because it takes a
1552 negligible and constant-sized amount of time */
1553 nbnxn_atomdata_add_nbat_fshift_to_fshift(nbv->nbat,
1554 fr->fshift);
1558 /* update QMMMrec, if necessary */
1559 if (fr->bQMMM)
1561 update_QMMMrec(cr, fr, as_rvec_array(x.unpaddedArrayRef().data()), mdatoms, box);
1564 /* Compute the bonded and non-bonded energies and optionally forces */
1565 do_force_lowlevel(fr, inputrec, &(top->idef),
1566 cr, ms, nrnb, wcycle, mdatoms,
1567 as_rvec_array(x.unpaddedArrayRef().data()), hist, f, &forceWithVirial, enerd, fcd,
1568 box, inputrec->fepvals, lambda, graph, &(top->excls), fr->mu_tot,
1569 flags, &cycles_pme);
1571 wallcycle_stop(wcycle, ewcFORCE);
1573 computeSpecialForces(fplog, cr, inputrec, awh, enforcedRotation,
1574 step, t, wcycle,
1575 fr->forceProviders, box, x.unpaddedArrayRef(), mdatoms, lambda,
1576 flags, &forceWithVirial, enerd,
1577 ed, bNS);
1579 if (bUseOrEmulGPU)
1581 /* wait for non-local forces (or calculate in emulation mode) */
1582 if (DOMAINDECOMP(cr))
1584 if (bUseGPU)
1586 wallcycle_start(wcycle, ewcWAIT_GPU_NB_NL);
1587 nbnxn_gpu_wait_finish_task(nbv->gpu_nbv,
1588 flags, eatNonlocal,
1589 ppForceWorkload->haveGpuBondedWork,
1590 enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
1591 fr->fshift);
1592 cycles_wait_gpu += wallcycle_stop(wcycle, ewcWAIT_GPU_NB_NL);
1594 else
1596 wallcycle_start_nocount(wcycle, ewcFORCE);
1597 do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFYes,
1598 step, nrnb, wcycle);
1599 wallcycle_stop(wcycle, ewcFORCE);
1602 /* skip the reduction if there was no non-local work to do */
1603 if (nbv->grp[eintNonlocal].nbl_lists.nbl[0]->nsci > 0)
1605 nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), eatNonlocal,
1606 nbv->nbat, f, wcycle);
1611 if (DOMAINDECOMP(cr))
1613 /* We are done with the CPU compute.
1614 * We will now communicate the non-local forces.
1615 * If we use a GPU this will overlap with GPU work, so in that case
1616 * we do not close the DD force balancing region here.
1618 if (ddCloseBalanceRegion == DdCloseBalanceRegionAfterForceComputation::yes)
1620 ddCloseBalanceRegionCpu(cr->dd);
1622 if (bDoForces)
1624 dd_move_f(cr->dd, force.unpaddedArrayRef(), fr->fshift, wcycle);
1628 // With both nonbonded and PME offloaded to a GPU on the same rank, we use
1629 // an alternating wait/reduction scheme.
1630 bool alternateGpuWait = (!c_disableAlternatingWait && useGpuPme && bUseGPU && !DOMAINDECOMP(cr));
1631 if (alternateGpuWait)
1633 alternatePmeNbGpuWaitReduce(fr->nbv, fr->pmedata, &force, &forceWithVirial, fr->fshift, enerd, flags, ppForceWorkload->haveGpuBondedWork, wcycle);
1636 if (!alternateGpuWait && useGpuPme)
1638 gmx::ArrayRef<const gmx::RVec> pmeGpuForces;
1639 matrix vir_Q;
1640 real Vlr_q = 0.0;
1641 pme_gpu_wait_finish_task(fr->pmedata, wcycle, &pmeGpuForces, vir_Q, &Vlr_q);
1642 pme_gpu_reduce_outputs(wcycle, &forceWithVirial, pmeGpuForces, enerd, vir_Q, Vlr_q);
1645 /* Wait for local GPU NB outputs on the non-alternating wait path */
1646 if (!alternateGpuWait && bUseGPU)
1648 /* Measured overhead on CUDA and OpenCL with(out) GPU sharing
1649 * is between 0.5 and 1.5 Mcycles. So 2 Mcycles is an overestimate,
1650 * but even with a step of 0.1 ms the difference is less than 1%
1651 * of the step time.
1653 const float gpuWaitApiOverheadMargin = 2e6f; /* cycles */
1655 wallcycle_start(wcycle, ewcWAIT_GPU_NB_L);
1656 nbnxn_gpu_wait_finish_task(nbv->gpu_nbv,
1657 flags, eatLocal, ppForceWorkload->haveGpuBondedWork,
1658 enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
1659 fr->fshift);
1660 float cycles_tmp = wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
1662 if (ddCloseBalanceRegion == DdCloseBalanceRegionAfterForceComputation::yes)
1664 DdBalanceRegionWaitedForGpu waitedForGpu = DdBalanceRegionWaitedForGpu::yes;
1665 if (bDoForces && cycles_tmp <= gpuWaitApiOverheadMargin)
1667 /* We measured few cycles, it could be that the kernel
1668 * and transfer finished earlier and there was no actual
1669 * wait time, only API call overhead.
1670 * Then the actual time could be anywhere between 0 and
1671 * cycles_wait_est. We will use half of cycles_wait_est.
1673 waitedForGpu = DdBalanceRegionWaitedForGpu::no;
1675 ddCloseBalanceRegionGpu(cr->dd, cycles_wait_gpu, waitedForGpu);
1679 if (fr->nbv->emulateGpu == EmulateGpuNonbonded::Yes)
1681 // NOTE: emulation kernel is not included in the balancing region,
1682 // but emulation mode does not target performance anyway
1683 wallcycle_start_nocount(wcycle, ewcFORCE);
1684 do_nb_verlet(fr, ic, enerd, flags, eintLocal,
1685 DOMAINDECOMP(cr) ? enbvClearFNo : enbvClearFYes,
1686 step, nrnb, wcycle);
1687 wallcycle_stop(wcycle, ewcFORCE);
1690 if (useGpuPme)
1692 pme_gpu_reinit_computation(fr->pmedata, wcycle);
1695 if (bUseGPU)
1697 /* now clear the GPU outputs while we finish the step on the CPU */
1698 wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
1699 wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1700 nbnxn_gpu_clear_outputs(nbv->gpu_nbv, flags);
1702 /* Is dynamic pair-list pruning activated? */
1703 if (nbv->listParams->useDynamicPruning)
1705 launchGpuRollingPruning(cr, nbv, inputrec, step);
1707 wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1708 wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1711 /* Do the nonbonded GPU (or emulation) force buffer reduction
1712 * on the non-alternating path. */
1713 if (bUseOrEmulGPU && !alternateGpuWait)
1715 nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), eatLocal,
1716 nbv->nbat, f, wcycle);
1719 if (ppForceWorkload->haveGpuBondedWork && (flags & GMX_FORCE_ENERGY))
1721 bonded_gpu_get_energies(fr, enerd);
1723 bonded_gpu_clear_energies(fr->gpuBondedLists);
1726 if (DOMAINDECOMP(cr))
1728 dd_force_flop_stop(cr->dd, nrnb);
1731 if (bDoForces)
1733 /* If we have NoVirSum forces, but we do not calculate the virial,
1734 * we sum fr->f_novirsum=f later.
1736 if (vsite && !(fr->haveDirectVirialContributions && !(flags & GMX_FORCE_VIRIAL)))
1738 spread_vsite_f(vsite, as_rvec_array(x.unpaddedArrayRef().data()), f, fr->fshift, FALSE, nullptr, nrnb,
1739 &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr, wcycle);
1742 if (flags & GMX_FORCE_VIRIAL)
1744 /* Calculation of the virial must be done after vsites! */
1745 calc_virial(0, mdatoms->homenr, as_rvec_array(x.unpaddedArrayRef().data()), f,
1746 vir_force, graph, box, nrnb, fr, inputrec->ePBC);
1750 if (PAR(cr) && !thisRankHasDuty(cr, DUTY_PME))
1752 /* In case of node-splitting, the PP nodes receive the long-range
1753 * forces, virial and energy from the PME nodes here.
1755 pme_receive_force_ener(cr, &forceWithVirial, enerd, wcycle);
1758 if (bDoForces)
1760 post_process_forces(cr, step, nrnb, wcycle,
1761 top, box, as_rvec_array(x.unpaddedArrayRef().data()), f, &forceWithVirial,
1762 vir_force, mdatoms, graph, fr, vsite,
1763 flags);
1766 if (flags & GMX_FORCE_ENERGY)
1768 /* Sum the potential energy terms from group contributions */
1769 sum_epot(&(enerd->grpp), enerd->term);
1771 if (!EI_TPI(inputrec->eI))
1773 checkPotentialEnergyValidity(step, *enerd, *inputrec);
1778 static void do_force_cutsGROUP(FILE *fplog,
1779 const t_commrec *cr,
1780 const gmx_multisim_t *ms,
1781 const t_inputrec *inputrec,
1782 gmx::Awh *awh,
1783 gmx_enfrot *enforcedRotation,
1784 int64_t step,
1785 t_nrnb *nrnb,
1786 gmx_wallcycle_t wcycle,
1787 gmx_localtop_t *top,
1788 const gmx_groups_t *groups,
1789 matrix box, gmx::ArrayRefWithPadding<gmx::RVec> x,
1790 history_t *hist,
1791 gmx::ArrayRefWithPadding<gmx::RVec> force,
1792 tensor vir_force,
1793 const t_mdatoms *mdatoms,
1794 gmx_enerdata_t *enerd,
1795 t_fcdata *fcd,
1796 real *lambda,
1797 t_graph *graph,
1798 t_forcerec *fr,
1799 const gmx_vsite_t *vsite,
1800 rvec mu_tot,
1801 double t,
1802 gmx_edsam *ed,
1803 int flags,
1804 DdOpenBalanceRegionBeforeForceComputation ddOpenBalanceRegion,
1805 DdCloseBalanceRegionAfterForceComputation ddCloseBalanceRegion)
1807 int cg0, cg1, i, j;
1808 double mu[2*DIM];
1809 gmx_bool bStateChanged, bNS, bFillGrid, bCalcCGCM;
1810 gmx_bool bDoForces;
1811 float cycles_pme;
1813 const int start = 0;
1814 const int homenr = mdatoms->homenr;
1816 clear_mat(vir_force);
1818 cg0 = 0;
1819 if (DOMAINDECOMP(cr))
1821 cg1 = cr->dd->globalAtomGroupIndices.size();
1823 else
1825 cg1 = top->cgs.nr;
1827 if (fr->n_tpi > 0)
1829 cg1--;
1832 bStateChanged = ((flags & GMX_FORCE_STATECHANGED) != 0);
1833 bNS = ((flags & GMX_FORCE_NS) != 0) && (!fr->bAllvsAll);
1834 /* Should we perform the long-range nonbonded evaluation inside the neighborsearching? */
1835 bFillGrid = (bNS && bStateChanged);
1836 bCalcCGCM = (bFillGrid && !DOMAINDECOMP(cr));
1837 bDoForces = ((flags & GMX_FORCE_FORCES) != 0);
1839 if (bStateChanged)
1841 update_forcerec(fr, box);
1843 if (inputrecNeedMutot(inputrec))
1845 /* Calculate total (local) dipole moment in a temporary common array.
1846 * This makes it possible to sum them over nodes faster.
1848 calc_mu(start, homenr,
1849 x.unpaddedArrayRef(), mdatoms->chargeA, mdatoms->chargeB, mdatoms->nChargePerturbed,
1850 mu, mu+DIM);
1854 if (fr->ePBC != epbcNONE)
1856 /* Compute shift vectors every step,
1857 * because of pressure coupling or box deformation!
1859 if ((flags & GMX_FORCE_DYNAMICBOX) && bStateChanged)
1861 calc_shifts(box, fr->shift_vec);
1864 if (bCalcCGCM)
1866 put_charge_groups_in_box(fplog, cg0, cg1, fr->ePBC, box,
1867 &(top->cgs), as_rvec_array(x.unpaddedArrayRef().data()), fr->cg_cm);
1868 inc_nrnb(nrnb, eNR_CGCM, homenr);
1869 inc_nrnb(nrnb, eNR_RESETX, cg1-cg0);
1871 else if (EI_ENERGY_MINIMIZATION(inputrec->eI) && graph)
1873 unshift_self(graph, box, as_rvec_array(x.unpaddedArrayRef().data()));
1876 else if (bCalcCGCM)
1878 calc_cgcm(fplog, cg0, cg1, &(top->cgs), as_rvec_array(x.unpaddedArrayRef().data()), fr->cg_cm);
1879 inc_nrnb(nrnb, eNR_CGCM, homenr);
1882 if (bCalcCGCM && gmx_debug_at)
1884 pr_rvecs(debug, 0, "cgcm", fr->cg_cm, top->cgs.nr);
1887 #if GMX_MPI
1888 if (!thisRankHasDuty(cr, DUTY_PME))
1890 /* Send particle coordinates to the pme nodes.
1891 * Since this is only implemented for domain decomposition
1892 * and domain decomposition does not use the graph,
1893 * we do not need to worry about shifting.
1895 gmx_pme_send_coordinates(cr, box, as_rvec_array(x.unpaddedArrayRef().data()),
1896 lambda[efptCOUL], lambda[efptVDW],
1897 (flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY)) != 0,
1898 step, wcycle);
1900 #endif /* GMX_MPI */
1902 /* Communicate coordinates and sum dipole if necessary */
1903 if (DOMAINDECOMP(cr))
1905 dd_move_x(cr->dd, box, x.unpaddedArrayRef(), wcycle);
1907 /* No GPU support, no move_x overlap, so reopen the balance region here */
1908 if (ddOpenBalanceRegion == DdOpenBalanceRegionBeforeForceComputation::yes)
1910 ddReopenBalanceRegionCpu(cr->dd);
1914 if (inputrecNeedMutot(inputrec))
1916 if (bStateChanged)
1918 if (PAR(cr))
1920 gmx_sumd(2*DIM, mu, cr);
1921 ddReopenBalanceRegionCpu(cr->dd);
1923 for (i = 0; i < 2; i++)
1925 for (j = 0; j < DIM; j++)
1927 fr->mu_tot[i][j] = mu[i*DIM + j];
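/* Without free-energy perturbation the A-state dipole is used as is;
 * otherwise the total dipole is interpolated linearly in the coulomb
 * lambda between the A- and B-state dipoles below.
 */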
1931 if (fr->efep == efepNO)
1933 copy_rvec(fr->mu_tot[0], mu_tot);
1935 else
1937 for (j = 0; j < DIM; j++)
1939 mu_tot[j] =
1940 (1.0 - lambda[efptCOUL])*fr->mu_tot[0][j] + lambda[efptCOUL]*fr->mu_tot[1][j];
1945 /* Reset energies */
1946 reset_enerdata(enerd);
1947 clear_rvecs(SHIFTS, fr->fshift);
1949 if (bNS)
1951 wallcycle_start(wcycle, ewcNS);
1953 if (graph && bStateChanged)
1955 /* Calculate intramolecular shift vectors to make molecules whole */
1956 mk_mshift(fplog, graph, fr->ePBC, box, as_rvec_array(x.unpaddedArrayRef().data()));
1959 /* Do the actual neighbour searching */
1960 ns(fplog, fr, box,
1961 groups, top, mdatoms,
1962 cr, nrnb, bFillGrid);
1964 wallcycle_stop(wcycle, ewcNS);
1967 if (DOMAINDECOMP(cr) && !thisRankHasDuty(cr, DUTY_PME))
1969 wallcycle_start(wcycle, ewcPPDURINGPME);
1970 dd_force_flop_start(cr->dd, nrnb);
1973 if (inputrec->bRot)
1975 wallcycle_start(wcycle, ewcROT);
1976 do_rotation(cr, enforcedRotation, box, as_rvec_array(x.unpaddedArrayRef().data()), t, step, bNS);
1977 wallcycle_stop(wcycle, ewcROT);
1980 /* Temporary solution until all routines take PaddedRVecVector */
1981 rvec *f = as_rvec_array(force.unpaddedArrayRef().data());
1983 /* Start the force cycle counter.
1984 * Note that a different counter is used for dynamic load balancing.
1986 wallcycle_start(wcycle, ewcFORCE);
1988 gmx::ArrayRef<gmx::RVec> forceRef = force.paddedArrayRef();
1989 if (bDoForces)
1991 /* If we need to compute the virial, we might need a separate
1992 * force buffer for algorithms for which the virial is calculated
1993 * separately, such as PME.
1995 if ((flags & GMX_FORCE_VIRIAL) && fr->haveDirectVirialContributions)
1997 forceRef = *fr->forceBufferForDirectVirialContributions;
1999 clear_rvecs_omp(forceRef.size(), as_rvec_array(forceRef.data()));
2002 /* Clear the short- and long-range forces */
2003 clear_rvecs(fr->natoms_force_constr, f);
2006 /* forceWithVirial might need the full force atom range */
2007 gmx::ForceWithVirial forceWithVirial(forceRef, (flags & GMX_FORCE_VIRIAL) != 0);
2009 if (inputrec->bPull && pull_have_constraint(inputrec->pull_work))
2011 clear_pull_forces(inputrec->pull_work);
2014 /* update QMMMrec, if necessary */
2015 if (fr->bQMMM)
2017 update_QMMMrec(cr, fr, as_rvec_array(x.unpaddedArrayRef().data()), mdatoms, box);
2020 /* Compute the bonded and non-bonded energies and optionally forces */
2021 do_force_lowlevel(fr, inputrec, &(top->idef),
2022 cr, ms, nrnb, wcycle, mdatoms,
2023 as_rvec_array(x.unpaddedArrayRef().data()), hist, f, &forceWithVirial, enerd, fcd,
2024 box, inputrec->fepvals, lambda,
2025 graph, &(top->excls), fr->mu_tot,
2026 flags,
2027 &cycles_pme);
2029 wallcycle_stop(wcycle, ewcFORCE);
2031 if (DOMAINDECOMP(cr))
2033 dd_force_flop_stop(cr->dd, nrnb);
2035 if (ddCloseBalanceRegion == DdCloseBalanceRegionAfterForceComputation::yes)
2037 ddCloseBalanceRegionCpu(cr->dd);
2041 computeSpecialForces(fplog, cr, inputrec, awh, enforcedRotation,
2042 step, t, wcycle,
2043 fr->forceProviders, box, x.unpaddedArrayRef(), mdatoms, lambda,
2044 flags, &forceWithVirial, enerd,
2045 ed, bNS);
2047 if (bDoForces)
2049 /* Communicate the forces */
2050 if (DOMAINDECOMP(cr))
2052 dd_move_f(cr->dd, force.unpaddedArrayRef(), fr->fshift, wcycle);
2053 /* Do we need to communicate the separate force array
2054 * for terms that do not contribute to the single sum virial?
2055 * Position restraints and electric fields do not introduce
2056 * inter-cg forces, only full electrostatics methods do.
2057 * When we do not calculate the virial, fr->f_novirsum = f,
2058 * so we have already communicated these forces.
2060 if (EEL_FULL(fr->ic->eeltype) && cr->dd->n_intercg_excl &&
2061 (flags & GMX_FORCE_VIRIAL))
2063 dd_move_f(cr->dd, forceWithVirial.force_, nullptr, wcycle);
2067 /* If we have NoVirSum forces, but we do not calculate the virial,
2068 * we sum fr->f_novirsum=f later.
2070 if (vsite && !(fr->haveDirectVirialContributions && !(flags & GMX_FORCE_VIRIAL)))
2072 spread_vsite_f(vsite, as_rvec_array(x.unpaddedArrayRef().data()), f, fr->fshift, FALSE, nullptr, nrnb,
2073 &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr, wcycle);
2076 if (flags & GMX_FORCE_VIRIAL)
2078 /* Calculation of the virial must be done after vsites! */
2079 calc_virial(0, mdatoms->homenr, as_rvec_array(x.unpaddedArrayRef().data()), f,
2080 vir_force, graph, box, nrnb, fr, inputrec->ePBC);
2084 if (PAR(cr) && !thisRankHasDuty(cr, DUTY_PME))
2086 /* In case of node-splitting, the PP nodes receive the long-range
2087 * forces, virial and energy from the PME nodes here.
2089 pme_receive_force_ener(cr, &forceWithVirial, enerd, wcycle);
2092 if (bDoForces)
2094 post_process_forces(cr, step, nrnb, wcycle,
2095 top, box, as_rvec_array(x.unpaddedArrayRef().data()), f, &forceWithVirial,
2096 vir_force, mdatoms, graph, fr, vsite,
2097 flags);
2100 if (flags & GMX_FORCE_ENERGY)
2102 /* Sum the potential energy terms from group contributions */
2103 sum_epot(&(enerd->grpp), enerd->term);
2105 if (!EI_TPI(inputrec->eI))
2107 checkPotentialEnergyValidity(step, *enerd, *inputrec);
2113 void do_force(FILE *fplog,
2114 const t_commrec *cr,
2115 const gmx_multisim_t *ms,
2116 const t_inputrec *inputrec,
2117 gmx::Awh *awh,
2118 gmx_enfrot *enforcedRotation,
2119 int64_t step,
2120 t_nrnb *nrnb,
2121 gmx_wallcycle_t wcycle,
2122 gmx_localtop_t *top,
2123 const gmx_groups_t *groups,
2124 matrix box,
2125 gmx::ArrayRefWithPadding<gmx::RVec> x, //NOLINT(performance-unnecessary-value-param)
2126 history_t *hist,
2127 gmx::ArrayRefWithPadding<gmx::RVec> force, //NOLINT(performance-unnecessary-value-param)
2128 tensor vir_force,
2129 const t_mdatoms *mdatoms,
2130 gmx_enerdata_t *enerd,
2131 t_fcdata *fcd,
2132 gmx::ArrayRef<real> lambda,
2133 t_graph *graph,
2134 t_forcerec *fr,
2135 gmx::PpForceWorkload *ppForceWorkload,
2136 const gmx_vsite_t *vsite,
2137 rvec mu_tot,
2138 double t,
2139 gmx_edsam *ed,
2140 int flags,
2141 DdOpenBalanceRegionBeforeForceComputation ddOpenBalanceRegion,
2142 DdCloseBalanceRegionAfterForceComputation ddCloseBalanceRegion)
2144 /* modify force flag if not doing nonbonded */
2145 if (!fr->bNonbonded)
2147 flags &= ~GMX_FORCE_NONBONDED;
2150 switch (inputrec->cutoff_scheme)
2152 case ecutsVERLET:
2153 do_force_cutsVERLET(fplog, cr, ms, inputrec,
2154 awh, enforcedRotation, step, nrnb, wcycle,
2155 top,
2156 groups,
2157 box, x, hist,
2158 force, vir_force,
2159 mdatoms,
2160 enerd, fcd,
2161 lambda.data(), graph,
2163 ppForceWorkload,
2164 fr->ic,
2165 vsite, mu_tot,
2166 t, ed,
2167 flags,
2168 ddOpenBalanceRegion,
2169 ddCloseBalanceRegion);
2170 break;
2171 case ecutsGROUP:
2172 do_force_cutsGROUP(fplog, cr, ms, inputrec,
2173 awh, enforcedRotation, step, nrnb, wcycle,
2174 top,
2175 groups,
2176 box, x, hist,
2177 force, vir_force,
2178 mdatoms,
2179 enerd, fcd,
2180 lambda.data(), graph,
2181 fr, vsite, mu_tot,
2182 t, ed,
2183 flags,
2184 ddOpenBalanceRegion,
2185 ddCloseBalanceRegion);
2186 break;
2187 default:
2188 gmx_incons("Invalid cut-off scheme passed!");
2191 /* In case we don't have constraints and are using GPUs, the next balancing
2192 * region starts here.
2193 * Some "special" work at the end of do_force_cuts?, such as vsite spread,
2194 virial calculation and COM pulling, is thus not included in
2195 * the balance timing, which is ok as most tasks do communication.
2197 if (ddOpenBalanceRegion == DdOpenBalanceRegionBeforeForceComputation::yes)
2199 ddOpenBalanceRegionCpu(cr->dd, DdAllowBalanceRegionReopen::no);
2204 void do_constrain_first(FILE *fplog, gmx::Constraints *constr,
2205 const t_inputrec *ir, const t_mdatoms *md,
2206 t_state *state)
2208 int i, m, start, end;
2209 int64_t step;
2210 real dt = ir->delta_t;
2211 real dvdl_dum;
2212 rvec *savex;
2214 /* We need to allocate one element extra, since we might use
2215 * (unaligned) 4-wide SIMD loads to access rvec entries.
2217 snew(savex, state->natoms + 1);
2219 start = 0;
2220 end = md->homenr;
2222 if (debug)
2224 fprintf(debug, "vcm: start=%d, homenr=%d, end=%d\n",
2225 start, md->homenr, end);
2227 /* Do a first constrain to reset particles... */
2228 step = ir->init_step;
2229 if (fplog)
2231 char buf[STEPSTRSIZE];
2232 fprintf(fplog, "\nConstraining the starting coordinates (step %s)\n",
2233 gmx_step_str(step, buf));
2235 dvdl_dum = 0;
2237 /* constrain the current position */
2238 constr->apply(TRUE, FALSE,
2239 step, 0, 1.0,
2240 state->x.rvec_array(), state->x.rvec_array(), nullptr,
2241 state->box,
2242 state->lambda[efptBONDED], &dvdl_dum,
2243 nullptr, nullptr, gmx::ConstraintVariable::Positions);
2244 if (EI_VV(ir->eI))
2246 /* constrain the initial velocity, and save it */
2247 /* also may be useful if we need the ekin from the halfstep for velocity verlet */
2248 constr->apply(TRUE, FALSE,
2249 step, 0, 1.0,
2250 state->x.rvec_array(), state->v.rvec_array(), state->v.rvec_array(),
2251 state->box,
2252 state->lambda[efptBONDED], &dvdl_dum,
2253 nullptr, nullptr, gmx::ConstraintVariable::Velocities);
2255 /* constrain the initial velocities at t-dt/2 */
2256 if (EI_STATE_VELOCITY(ir->eI) && ir->eI != eiVV)
2258 auto x = makeArrayRef(state->x).subArray(start, end);
2259 auto v = makeArrayRef(state->v).subArray(start, end);
2260 for (i = start; (i < end); i++)
2262 for (m = 0; (m < DIM); m++)
2264 /* Reverse the velocity */
2265 v[i][m] = -v[i][m];
2266 /* Store the position at t-dt in buf */
2267 savex[i][m] = x[i][m] + dt*v[i][m];
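/* With the velocity just reversed, savex now holds x(t0) - dt*v(t0),
 * an estimate of the position one step back in time, which is
 * constrained below using x(t0) as the reference.
 */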
2270 /* Shake the positions at t=-dt with the positions at t=0
2271 * as reference coordinates.
2273 if (fplog)
2275 char buf[STEPSTRSIZE];
2276 fprintf(fplog, "\nConstraining the coordinates at t0-dt (step %s)\n",
2277 gmx_step_str(step, buf));
2279 dvdl_dum = 0;
2280 constr->apply(TRUE, FALSE,
2281 step, -1, 1.0,
2282 state->x.rvec_array(), savex, nullptr,
2283 state->box,
2284 state->lambda[efptBONDED], &dvdl_dum,
2285 state->v.rvec_array(), nullptr, gmx::ConstraintVariable::Positions);
2287 for (i = start; i < end; i++)
2289 for (m = 0; m < DIM; m++)
2291 /* Re-reverse the velocities */
2292 v[i][m] = -v[i][m];
2296 sfree(savex);
2300 static void
2301 integrate_table(const real vdwtab[], real scale, int offstart, int rstart, int rend,
2302 double *enerout, double *virout)
2304 double enersum, virsum;
2305 double invscale, invscale2, invscale3;
2306 double r, ea, eb, ec, pa, pb, pc, pd;
2307 double y0, f, g, h;
2308 int ri, offset;
2309 double tabfactor;
2311 invscale = 1.0/scale;
2312 invscale2 = invscale*invscale;
2313 invscale3 = invscale*invscale2;
2315 /* Following summation derived from cubic spline definition,
2316 * Numerical Recipes in C, second edition, p. 113-116. Exact for
2317 * the cubic spline. We first calculate the negative of the
2318 * energy from rvdw to rvdw_switch, assuming that g(r)=1, and then
2319 * add the more standard, abrupt cutoff correction to that result,
2320 * yielding the long-range correction for a switched function. We
2321 * perform both the pressure and energy loops at the same time for
2322 * simplicity, as the computational cost is low. */
2324 if (offstart == 0)
2326 /* Since the dispersion table has been scaled down a factor
2327 * 6.0 and the repulsion a factor 12.0 to compensate for the
2328 * c6/c12 parameters inside nbfp[] being scaled up (to save
2329 * flops in kernels), we need to correct for this.
2331 tabfactor = 6.0;
2333 else
2335 tabfactor = 12.0;
2338 enersum = 0.0;
2339 virsum = 0.0;
2340 for (ri = rstart; ri < rend; ++ri)
2342 r = ri*invscale;
2343 ea = invscale3;
2344 eb = 2.0*invscale2*r;
2345 ec = invscale*r*r;
2347 pa = invscale3;
2348 pb = 3.0*invscale2*r;
2349 pc = 3.0*invscale*r*r;
2350 pd = r*r*r;
2352 /* this "8" is from the packing in the vdwtab array - perhaps
2353 should be defined? */
2355 offset = 8*ri + offstart;
2356 y0 = vdwtab[offset];
2357 f = vdwtab[offset+1];
2358 g = vdwtab[offset+2];
2359 h = vdwtab[offset+3];
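/* Each table point packs 8 reals: the cubic-spline coefficients
 * (y0, f, g, h) of the dispersion sub-table at offsets 0-3 and of the
 * repulsion sub-table at offsets 4-7; offstart selects which quadruplet
 * is integrated here, matching the tabfactor chosen above.
 */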
2361 enersum += y0*(ea/3 + eb/2 + ec) + f*(ea/4 + eb/3 + ec/2) + g*(ea/5 + eb/4 + ec/3) + h*(ea/6 + eb/5 + ec/4);
2362 virsum += f*(pa/4 + pb/3 + pc/2 + pd) + 2*g*(pa/5 + pb/4 + pc/3 + pd/2) + 3*h*(pa/6 + pb/5 + pc/4 + pd/3);
2364 *enerout = 4.0*M_PI*enersum*tabfactor;
2365 *virout = 4.0*M_PI*virsum*tabfactor;
2368 void calc_enervirdiff(FILE *fplog, int eDispCorr, t_forcerec *fr)
2370 double eners[2], virs[2], enersum, virsum;
2371 double r0, rc3, rc9;
2372 int ri0, ri1, i;
2373 real scale, *vdwtab;
2375 fr->enershiftsix = 0;
2376 fr->enershifttwelve = 0;
2377 fr->enerdiffsix = 0;
2378 fr->enerdifftwelve = 0;
2379 fr->virdiffsix = 0;
2380 fr->virdifftwelve = 0;
2382 const interaction_const_t *ic = fr->ic;
2384 if (eDispCorr != edispcNO)
2386 for (i = 0; i < 2; i++)
2388 eners[i] = 0;
2389 virs[i] = 0;
2391 if ((ic->vdw_modifier == eintmodPOTSHIFT) ||
2392 (ic->vdw_modifier == eintmodPOTSWITCH) ||
2393 (ic->vdw_modifier == eintmodFORCESWITCH) ||
2394 (ic->vdwtype == evdwSHIFT) ||
2395 (ic->vdwtype == evdwSWITCH))
2397 if (((ic->vdw_modifier == eintmodPOTSWITCH) ||
2398 (ic->vdw_modifier == eintmodFORCESWITCH) ||
2399 (ic->vdwtype == evdwSWITCH)) && ic->rvdw_switch == 0)
2401 gmx_fatal(FARGS,
2402 "With dispersion correction rvdw-switch can not be zero "
2403 "for vdw-type = %s", evdw_names[ic->vdwtype]);
2406 /* TODO This code depends on the logic in tables.c that
2407 constructs the table layout, which should be made
2408 explicit in future cleanup. */
2409 GMX_ASSERT(fr->dispersionCorrectionTable->interaction == GMX_TABLE_INTERACTION_VDWREP_VDWDISP,
2410 "Dispersion-correction code needs a table with both repulsion and dispersion terms");
2411 scale = fr->dispersionCorrectionTable->scale;
2412 vdwtab = fr->dispersionCorrectionTable->data;
2414 /* Round the cut-offs to exact table values for precision */
2415 ri0 = static_cast<int>(std::floor(ic->rvdw_switch*scale));
2416 ri1 = static_cast<int>(std::ceil(ic->rvdw*scale));
2418 /* The code below has some support for handling force-switching, i.e.
2419 * when the force (instead of potential) is switched over a limited
2420 * region. This leads to a constant shift in the potential inside the
2421 * switching region, which we can handle by adding a constant energy
2422 * term in the force-switch case just like when we do potential-shift.
2424 * For now this is not enabled, but to keep the functionality in the
2425 * code we check separately for switch and shift. When we do force-switch
2426 * the shifting point is rvdw_switch, while it is the cutoff when we
2427 * have a classical potential-shift.
2429 * For a pure potential-shift the potential has a constant shift
2430 * all the way out to the cutoff, and that is it. For other forms
2431 * we need to calculate the constant shift up to the point where we
2432 * start modifying the potential.
2434 ri0 = (ic->vdw_modifier == eintmodPOTSHIFT) ? ri1 : ri0;
2436 r0 = ri0/scale;
2437 rc3 = r0*r0*r0;
2438 rc9 = rc3*rc3*rc3;
2440 if ((ic->vdw_modifier == eintmodFORCESWITCH) ||
2441 (ic->vdwtype == evdwSHIFT))
2443 /* Determine the constant energy shift below rvdw_switch.
2444 * Table has a scale factor since we have scaled it down to compensate
2445 * for scaling-up c6/c12 with the derivative factors to save flops in analytical kernels.
2447 fr->enershiftsix = static_cast<real>(-1.0/(rc3*rc3)) - 6.0*vdwtab[8*ri0];
2448 fr->enershifttwelve = static_cast<real>( 1.0/(rc9*rc3)) - 12.0*vdwtab[8*ri0 + 4];
2450 else if (ic->vdw_modifier == eintmodPOTSHIFT)
2452 fr->enershiftsix = static_cast<real>(-1.0/(rc3*rc3));
2453 fr->enershifttwelve = static_cast<real>( 1.0/(rc9*rc3));
2456 /* Add the constant part from 0 to rvdw_switch.
2457 * This integration from 0 to rvdw_switch overcounts the number
2458 * of interactions by 1, as it also counts the self interaction.
2459 * We will correct for this later.
2461 eners[0] += 4.0*M_PI*fr->enershiftsix*rc3/3.0;
2462 eners[1] += 4.0*M_PI*fr->enershifttwelve*rc3/3.0;
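/* The constant shift integrated over a sphere of radius r0 contributes
 * shift * (4 pi/3) * r0^3, which is exactly what is added above for the
 * sixth- and twelfth-power terms.
 */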
2464 /* Calculate the contribution in the range [r0,r1] where we
2465 * modify the potential. For a pure potential-shift modifier we will
2466 * have ri0==ri1, and there will not be any contribution here.
2468 for (i = 0; i < 2; i++)
2470 enersum = 0;
2471 virsum = 0;
2472 integrate_table(vdwtab, scale, (i == 0 ? 0 : 4), ri0, ri1, &enersum, &virsum);
2473 eners[i] -= enersum;
2474 virs[i] -= virsum;
2477 /* Alright: Above we compensated by REMOVING the parts outside r0
2478 * corresponding to the ideal VdW 1/r6 and 1/r12 potentials.
2480 * Regardless of whether r0 is the point where we start switching,
2481 * or the cutoff where we calculated the constant shift, we include
2482 * all the parts we are missing out to infinity from r0 by
2483 * calculating the analytical dispersion correction.
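/* For g(r) = 1 these tails are the analytic integrals
 * int_{r0}^{inf} 4 pi r^2 (-1/r^6)  dr = -4 pi/(3 r0^3)  and
 * int_{r0}^{inf} 4 pi r^2 ( 1/r^12) dr =  4 pi/(9 r0^9),
 * which give the constants used just below.
 */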
2485 eners[0] += -4.0*M_PI/(3.0*rc3);
2486 eners[1] += 4.0*M_PI/(9.0*rc9);
2487 virs[0] += 8.0*M_PI/rc3;
2488 virs[1] += -16.0*M_PI/(3.0*rc9);
2490 else if (ic->vdwtype == evdwCUT ||
2491 EVDW_PME(ic->vdwtype) ||
2492 ic->vdwtype == evdwUSER)
2494 if (ic->vdwtype == evdwUSER && fplog)
2496 fprintf(fplog,
2497 "WARNING: using dispersion correction with user tables\n");
2500 /* Note that with LJ-PME, the dispersion correction is multiplied
2501 * by the difference between the actual C6 and the value of C6
2502 * that would produce the combination rule.
2503 * This means the normal energy and virial difference formulas
2504 * can be used here.
2507 rc3 = ic->rvdw*ic->rvdw*ic->rvdw;
2508 rc9 = rc3*rc3*rc3;
2509 /* Contribution beyond the cut-off */
2510 eners[0] += -4.0*M_PI/(3.0*rc3);
2511 eners[1] += 4.0*M_PI/(9.0*rc9);
2512 if (ic->vdw_modifier == eintmodPOTSHIFT)
2514 /* Contribution within the cut-off */
2515 eners[0] += -4.0*M_PI/(3.0*rc3);
2516 eners[1] += 4.0*M_PI/(3.0*rc9);
2518 /* Contribution beyond the cut-off */
2519 virs[0] += 8.0*M_PI/rc3;
2520 virs[1] += -16.0*M_PI/(3.0*rc9);
2522 else
2524 gmx_fatal(FARGS,
2525 "Dispersion correction is not implemented for vdw-type = %s",
2526 evdw_names[ic->vdwtype]);
2529 /* When we deprecate the group kernels the code below can go too */
2530 if (ic->vdwtype == evdwPME && fr->cutoff_scheme == ecutsGROUP)
2532 /* Calculate self-interaction coefficient (assuming that
2533 * the reciprocal-space contribution is constant in the
2534 * region that contributes to the self-interaction).
2536 fr->enershiftsix = gmx::power6(ic->ewaldcoeff_lj) / 6.0;
2538 eners[0] += -gmx::power3(std::sqrt(M_PI)*ic->ewaldcoeff_lj)/3.0;
2539 virs[0] += gmx::power3(std::sqrt(M_PI)*ic->ewaldcoeff_lj);
2542 fr->enerdiffsix = eners[0];
2543 fr->enerdifftwelve = eners[1];
2544 /* The 0.5 is due to the Gromacs definition of the virial */
2545 fr->virdiffsix = 0.5*virs[0];
2546 fr->virdifftwelve = 0.5*virs[1];
2550 void calc_dispcorr(const t_inputrec *ir, const t_forcerec *fr,
2551 const matrix box, real lambda, tensor pres, tensor virial,
2552 real *prescorr, real *enercorr, real *dvdlcorr)
2554 gmx_bool bCorrAll, bCorrPres;
2555 real dvdlambda, invvol, dens, ninter, avcsix, avctwelve, enerdiff, svir = 0, spres = 0;
2556 int m;
2558 *prescorr = 0;
2559 *enercorr = 0;
2560 *dvdlcorr = 0;
2562 clear_mat(virial);
2563 clear_mat(pres);
2565 if (ir->eDispCorr != edispcNO)
2567 bCorrAll = (ir->eDispCorr == edispcAllEner ||
2568 ir->eDispCorr == edispcAllEnerPres);
2569 bCorrPres = (ir->eDispCorr == edispcEnerPres ||
2570 ir->eDispCorr == edispcAllEnerPres);
2572 invvol = 1/det(box);
2573 if (fr->n_tpi)
2575 /* Only correct for the interactions with the inserted molecule */
2576 dens = (fr->numAtomsForDispersionCorrection - fr->n_tpi)*invvol;
2577 ninter = fr->n_tpi;
2579 else
2581 dens = fr->numAtomsForDispersionCorrection*invvol;
2582 ninter = 0.5*fr->numAtomsForDispersionCorrection;
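/* ninter counts each interacting pair once, hence the factor 0.5; with
 * test-particle insertion only the interactions of the inserted molecule
 * with the rest of the system are corrected for.
 */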
2585 if (ir->efep == efepNO)
2587 avcsix = fr->avcsix[0];
2588 avctwelve = fr->avctwelve[0];
2590 else
2592 avcsix = (1 - lambda)*fr->avcsix[0] + lambda*fr->avcsix[1];
2593 avctwelve = (1 - lambda)*fr->avctwelve[0] + lambda*fr->avctwelve[1];
2596 enerdiff = ninter*(dens*fr->enerdiffsix - fr->enershiftsix);
2597 *enercorr += avcsix*enerdiff;
2598 dvdlambda = 0.0;
2599 if (ir->efep != efepNO)
2601 dvdlambda += (fr->avcsix[1] - fr->avcsix[0])*enerdiff;
2603 if (bCorrAll)
2605 enerdiff = ninter*(dens*fr->enerdifftwelve - fr->enershifttwelve);
2606 *enercorr += avctwelve*enerdiff;
2607 if (fr->efep != efepNO)
2609 dvdlambda += (fr->avctwelve[1] - fr->avctwelve[0])*enerdiff;
2613 if (bCorrPres)
2615 svir = ninter*dens*avcsix*fr->virdiffsix/3.0;
2616 if (ir->eDispCorr == edispcAllEnerPres)
2618 svir += ninter*dens*avctwelve*fr->virdifftwelve/3.0;
2620 /* The factor 2 is because of the Gromacs virial definition */
2621 spres = -2.0*invvol*svir*PRESFAC;
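/* The virial correction is converted to a pressure correction here;
 * PRESFAC converts from internal units to bar.
 */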
2623 for (m = 0; m < DIM; m++)
2625 virial[m][m] += svir;
2626 pres[m][m] += spres;
2628 *prescorr += spres;
2631 /* Can't currently control when it prints; for now, just print when debugging */
2632 if (debug)
2634 if (bCorrAll)
2636 fprintf(debug, "Long Range LJ corr.: <C6> %10.4e, <C12> %10.4e\n",
2637 avcsix, avctwelve);
2639 if (bCorrPres)
2641 fprintf(debug,
2642 "Long Range LJ corr.: Epot %10g, Pres: %10g, Vir: %10g\n",
2643 *enercorr, spres, svir);
2645 else
2647 fprintf(debug, "Long Range LJ corr.: Epot %10g\n", *enercorr);
2651 if (fr->efep != efepNO)
2653 *dvdlcorr += dvdlambda;
2658 static void low_do_pbc_mtop(FILE *fplog, int ePBC, const matrix box,
2659 const gmx_mtop_t *mtop, rvec x[],
2660 gmx_bool bFirst)
2662 t_graph *graph;
2663 int as, mol;
2665 if (bFirst && fplog)
2667 fprintf(fplog, "Removing pbc first time\n");
2670 snew(graph, 1);
2671 as = 0;
2672 for (const gmx_molblock_t &molb : mtop->molblock)
2674 const gmx_moltype_t &moltype = mtop->moltype[molb.type];
2675 if (moltype.atoms.nr == 1 ||
2676 (!bFirst && moltype.cgs.nr == 1))
2678 /* Just one atom or charge group in the molecule, no PBC required */
2679 as += molb.nmol*moltype.atoms.nr;
2681 else
2683 mk_graph_moltype(moltype, graph);
2685 for (mol = 0; mol < molb.nmol; mol++)
2687 mk_mshift(fplog, graph, ePBC, box, x+as);
2689 shift_self(graph, box, x+as);
2690 /* The molecule is whole now.
2691 * We don't need the second mk_mshift call as in do_pbc_first,
2692 * since we no longer need this graph.
2695 as += moltype.atoms.nr;
2697 done_graph(graph);
2700 sfree(graph);
2703 void do_pbc_first_mtop(FILE *fplog, int ePBC, const matrix box,
2704 const gmx_mtop_t *mtop, rvec x[])
2706 low_do_pbc_mtop(fplog, ePBC, box, mtop, x, TRUE);
2709 void do_pbc_mtop(FILE *fplog, int ePBC, const matrix box,
2710 const gmx_mtop_t *mtop, rvec x[])
2712 low_do_pbc_mtop(fplog, ePBC, box, mtop, x, FALSE);
2715 void put_atoms_in_box_omp(int ePBC, const matrix box, gmx::ArrayRef<gmx::RVec> x)
2717 int t, nth;
2718 nth = gmx_omp_nthreads_get(emntDefault);
2720 #pragma omp parallel for num_threads(nth) schedule(static)
2721 for (t = 0; t < nth; t++)
2725 size_t natoms = x.size();
2726 size_t offset = (natoms*t )/nth;
2727 size_t len = (natoms*(t + 1))/nth - offset;
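/* Static partition of the atom range into nth contiguous chunks whose
 * sizes differ by at most one atom, one chunk per OpenMP thread.
 */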
2728 put_atoms_in_box(ePBC, box, x.subArray(offset, len));
2730 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
2734 // TODO This can be cleaned up a lot, and move back to runner.cpp
2735 void finish_run(FILE *fplog, const gmx::MDLogger &mdlog, const t_commrec *cr,
2736 const t_inputrec *inputrec,
2737 t_nrnb nrnb[], gmx_wallcycle_t wcycle,
2738 gmx_walltime_accounting_t walltime_accounting,
2739 nonbonded_verlet_t *nbv,
2740 const gmx_pme_t *pme,
2741 gmx_bool bWriteStat)
2743 t_nrnb *nrnb_tot = nullptr;
2744 double delta_t = 0;
2745 double nbfs = 0, mflop = 0;
2746 double elapsed_time,
2747 elapsed_time_over_all_ranks,
2748 elapsed_time_over_all_threads,
2749 elapsed_time_over_all_threads_over_all_ranks;
2750 /* Control whether it is valid to print a report. Only the
2751 simulation master may print, but it should not do so if the run
2752 terminated e.g. before a scheduled reset step. This is
2753 complicated by the fact that PME ranks are unaware of the
2754 reason why they were sent a pmerecvqxFINISH. To avoid
2755 communication deadlocks, we always do the communication for the
2756 report, even if we've decided not to write the report, because
2757 how long it takes to finish the run is not important when we've
2758 decided not to report on the simulation performance.
2760 Further, we only report performance for dynamical integrators,
2761 because those are the only ones for which we plan to
2762 consider doing any optimizations. */
2763 bool printReport = EI_DYNAMICS(inputrec->eI) && SIMMASTER(cr);
2765 if (printReport && !walltime_accounting_get_valid_finish(walltime_accounting))
2767 GMX_LOG(mdlog.warning).asParagraph().appendText("Simulation ended prematurely, no performance report will be written.");
2768 printReport = false;
2771 if (cr->nnodes > 1)
2773 snew(nrnb_tot, 1);
2774 #if GMX_MPI
2775 MPI_Allreduce(nrnb->n, nrnb_tot->n, eNRNB, MPI_DOUBLE, MPI_SUM,
2776 cr->mpi_comm_mysim);
2777 #endif
2779 else
2781 nrnb_tot = nrnb;
2784 elapsed_time = walltime_accounting_get_time_since_reset(walltime_accounting);
2785 elapsed_time_over_all_threads = walltime_accounting_get_time_since_reset_over_all_threads(walltime_accounting);
2786 if (cr->nnodes > 1)
2788 #if GMX_MPI
2789 /* reduce elapsed_time over all MPI ranks in the current simulation */
2790 MPI_Allreduce(&elapsed_time,
2791 &elapsed_time_over_all_ranks,
2792 1, MPI_DOUBLE, MPI_SUM,
2793 cr->mpi_comm_mysim);
2794 elapsed_time_over_all_ranks /= cr->nnodes;
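/* The MPI_Allreduce above sums over all ranks, so dividing by the number
 * of ranks yields the average elapsed time per rank.
 */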
2795 /* Reduce elapsed_time_over_all_threads over all MPI ranks in the
2796 * current simulation. */
2797 MPI_Allreduce(&elapsed_time_over_all_threads,
2798 &elapsed_time_over_all_threads_over_all_ranks,
2799 1, MPI_DOUBLE, MPI_SUM,
2800 cr->mpi_comm_mysim);
2801 #endif
2803 else
2805 elapsed_time_over_all_ranks = elapsed_time;
2806 elapsed_time_over_all_threads_over_all_ranks = elapsed_time_over_all_threads;
2809 if (printReport)
2811 print_flop(fplog, nrnb_tot, &nbfs, &mflop);
2813 if (cr->nnodes > 1)
2815 sfree(nrnb_tot);
2818 if (thisRankHasDuty(cr, DUTY_PP) && DOMAINDECOMP(cr))
2820 print_dd_statistics(cr, inputrec, fplog);
2823 /* TODO Move the responsibility for any scaling by thread counts
2824 * to the code that handled the thread region, so that there's a
2825 * mechanism to keep cycle counting working during the transition
2826 * to task parallelism. */
2827 int nthreads_pp = gmx_omp_nthreads_get(emntNonbonded);
2828 int nthreads_pme = gmx_omp_nthreads_get(emntPME);
2829 wallcycle_scale_by_num_threads(wcycle, thisRankHasDuty(cr, DUTY_PME) && !thisRankHasDuty(cr, DUTY_PP), nthreads_pp, nthreads_pme);
2830 auto cycle_sum(wallcycle_sum(cr, wcycle));
2832 if (printReport)
2834 auto nbnxn_gpu_timings = use_GPU(nbv) ? nbnxn_gpu_get_timings(nbv->gpu_nbv) : nullptr;
2835 gmx_wallclock_gpu_pme_t pme_gpu_timings = {};
2836 if (pme_gpu_task_enabled(pme))
2838 pme_gpu_get_timings(pme, &pme_gpu_timings);
2840 wallcycle_print(fplog, mdlog, cr->nnodes, cr->npmenodes, nthreads_pp, nthreads_pme,
2841 elapsed_time_over_all_ranks,
2842 wcycle, cycle_sum,
2843 nbnxn_gpu_timings,
2844 &pme_gpu_timings);
2846 if (EI_DYNAMICS(inputrec->eI))
2848 delta_t = inputrec->delta_t;
2851 if (fplog)
2853 print_perf(fplog, elapsed_time_over_all_threads_over_all_ranks,
2854 elapsed_time_over_all_ranks,
2855 walltime_accounting_get_nsteps_done_since_reset(walltime_accounting),
2856 delta_t, nbfs, mflop);
2858 if (bWriteStat)
2860 print_perf(stderr, elapsed_time_over_all_threads_over_all_ranks,
2861 elapsed_time_over_all_ranks,
2862 walltime_accounting_get_nsteps_done_since_reset(walltime_accounting),
2863 delta_t, nbfs, mflop);
2868 extern void initialize_lambdas(FILE *fplog, t_inputrec *ir, int *fep_state, gmx::ArrayRef<real> lambda, double *lam0)
2870 /* this function works, but could probably use a logic rewrite to keep all the different
2871 types of efep straight. */
2873 if ((ir->efep == efepNO) && (!ir->bSimTemp))
2875 return;
2878 t_lambda *fep = ir->fepvals;
2879 *fep_state = fep->init_fep_state; /* this might overwrite the checkpoint
2880 if checkpoint is set -- a kludge is in for now
2881 to prevent this.*/
2883 for (int i = 0; i < efptNR; i++)
2885 /* overwrite lambda state with init_lambda for now for backwards compatibility */
2886 if (fep->init_lambda >= 0) /* if it's -1, it was never initialized */
2888 lambda[i] = fep->init_lambda;
2889 if (lam0)
2891 lam0[i] = lambda[i];
2894 else
2896 lambda[i] = fep->all_lambda[i][*fep_state];
2897 if (lam0)
2899 lam0[i] = lambda[i];
2903 if (ir->bSimTemp)
2905 /* need to rescale control temperatures to match current state */
2906 for (int i = 0; i < ir->opts.ngtc; i++)
2908 if (ir->opts.ref_t[i] > 0)
2910 ir->opts.ref_t[i] = ir->simtempvals->temperatures[*fep_state];
2915 /* Send to the log the information on the current lambdas */
2916 if (fplog != nullptr)
2918 fprintf(fplog, "Initial vector of lambda components:[ ");
2919 for (const auto &l : lambda)
2921 fprintf(fplog, "%10.4f ", l);
2923 fprintf(fplog, "]\n");
2928 void init_md(FILE *fplog,
2929 const t_commrec *cr, gmx::IMDOutputProvider *outputProvider,
2930 t_inputrec *ir, const gmx_output_env_t *oenv,
2931 const MdrunOptions &mdrunOptions,
2932 double *t, double *t0,
2933 t_state *globalState, double *lam0,
2934 t_nrnb *nrnb, gmx_mtop_t *mtop,
2935 gmx_update_t **upd,
2936 gmx::BoxDeformation *deform,
2937 int nfile, const t_filenm fnm[],
2938 gmx_mdoutf_t *outf, t_mdebin **mdebin,
2939 tensor force_vir, tensor shake_vir,
2940 tensor total_vir, tensor pres, rvec mu_tot,
2941 gmx_bool *bSimAnn, t_vcm **vcm,
2942 gmx_wallcycle_t wcycle)
2944 int i;
2946 /* Initial values */
2947 *t = *t0 = ir->init_t;
2949 *bSimAnn = FALSE;
2950 for (i = 0; i < ir->opts.ngtc; i++)
2952 /* set bSimAnn if any group is being annealed */
2953 if (ir->opts.annealing[i] != eannNO)
2955 *bSimAnn = TRUE;
2959 /* Initialize lambda variables */
2960 /* TODO: Clean up initialization of fep_state and lambda in t_state.
2961 * We currently need to call initialize_lambdas on non-master ranks
2962 * to initialize lam0.
2964 if (MASTER(cr))
2966 initialize_lambdas(fplog, ir, &globalState->fep_state, globalState->lambda, lam0);
2968 else
2970 int tmpFepState;
2971 std::array<real, efptNR> tmpLambda;
2972 initialize_lambdas(fplog, ir, &tmpFepState, tmpLambda, lam0);
2975 // TODO upd is never NULL in practice, but the analysers don't know that
2976 if (upd)
2978 *upd = init_update(ir, deform);
2980 if (*bSimAnn)
2982 update_annealing_target_temp(ir, ir->init_t, upd ? *upd : nullptr);
2985 if (vcm != nullptr)
2987 *vcm = init_vcm(fplog, &mtop->groups, ir);
2990 if (EI_DYNAMICS(ir->eI) && !mdrunOptions.continuationOptions.appendFiles)
2992 if (ir->etc == etcBERENDSEN)
2994 please_cite(fplog, "Berendsen84a");
2996 if (ir->etc == etcVRESCALE)
2998 please_cite(fplog, "Bussi2007a");
3000 if (ir->eI == eiSD1)
3002 please_cite(fplog, "Goga2012");
3005 init_nrnb(nrnb);
3007 if (nfile != -1)
3009 *outf = init_mdoutf(fplog, nfile, fnm, mdrunOptions, cr, outputProvider, ir, mtop, oenv, wcycle);
3011 *mdebin = init_mdebin(mdrunOptions.continuationOptions.appendFiles ? nullptr : mdoutf_get_fp_ene(*outf),
3012 mtop, ir, mdoutf_get_fp_dhdl(*outf));
3015 /* Initiate variables */
3016 clear_mat(force_vir);
3017 clear_mat(shake_vir);
3018 clear_rvec(mu_tot);
3019 clear_mat(total_vir);
3020 clear_mat(pres);
3023 void init_rerun(FILE *fplog,
3024 const t_commrec *cr, gmx::IMDOutputProvider *outputProvider,
3025 t_inputrec *ir, const gmx_output_env_t *oenv,
3026 const MdrunOptions &mdrunOptions,
3027 t_state *globalState, double *lam0,
3028 t_nrnb *nrnb, gmx_mtop_t *mtop,
3029 int nfile, const t_filenm fnm[],
3030 gmx_mdoutf_t *outf, t_mdebin **mdebin,
3031 gmx_wallcycle_t wcycle)
3033 /* Initialize lambda variables */
3034 /* TODO: Clean up initialization of fep_state and lambda in t_state.
3035 * We currently need to call initialize_lambdas on non-master ranks
3036 * to initialize lam0.
3038 if (MASTER(cr))
3040 initialize_lambdas(fplog, ir, &globalState->fep_state, globalState->lambda, lam0);
3042 else
3044 int tmpFepState;
3045 std::array<real, efptNR> tmpLambda;
3046 initialize_lambdas(fplog, ir, &tmpFepState, tmpLambda, lam0);
3049 init_nrnb(nrnb);
3051 if (nfile != -1)
3053 *outf = init_mdoutf(fplog, nfile, fnm, mdrunOptions, cr, outputProvider, ir, mtop, oenv, wcycle);
3054 *mdebin = init_mdebin(mdrunOptions.continuationOptions.appendFiles ? nullptr : mdoutf_get_fp_ene(*outf),
3055 mtop, ir, mdoutf_get_fp_dhdl(*outf), true);