Merge branch release-2016 into release-2018
[gromacs.git] / src / gromacs / mdlib / sim_util.cpp
1 /*
2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5 * Copyright (c) 2001-2004, The GROMACS development team.
6 * Copyright (c) 2013,2014,2015,2016,2017, by the GROMACS development team, led by
7 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
8 * and including many others, as listed in the AUTHORS file in the
9 * top-level source directory and at http://www.gromacs.org.
11 * GROMACS is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public License
13 * as published by the Free Software Foundation; either version 2.1
14 * of the License, or (at your option) any later version.
16 * GROMACS is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * Lesser General Public License for more details.
21 * You should have received a copy of the GNU Lesser General Public
22 * License along with GROMACS; if not, see
23 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
24 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
26 * If you want to redistribute modifications to GROMACS, please
27 * consider that scientific software is very special. Version
28 * control is crucial - bugs must be traceable. We will be happy to
29 * consider code for inclusion in the official distribution, but
30 * derived work must not be called official GROMACS. Details are found
31 * in the README & COPYING files - if they are missing, get the
32 * official version at http://www.gromacs.org.
34 * To help us fund GROMACS development, we humbly ask that you cite
35 * the research papers on the package. Check out http://www.gromacs.org.
37 #include "gmxpre.h"
39 #include "sim_util.h"
41 #include "config.h"
43 #include <stdio.h>
44 #include <string.h>
46 #include <cmath>
47 #include <cstdint>
49 #include <array>
51 #include "gromacs/awh/awh.h"
52 #include "gromacs/domdec/dlbtiming.h"
53 #include "gromacs/domdec/domdec.h"
54 #include "gromacs/domdec/domdec_struct.h"
55 #include "gromacs/essentialdynamics/edsam.h"
56 #include "gromacs/ewald/pme.h"
57 #include "gromacs/gmxlib/chargegroup.h"
58 #include "gromacs/gmxlib/network.h"
59 #include "gromacs/gmxlib/nrnb.h"
60 #include "gromacs/gmxlib/nonbonded/nb_free_energy.h"
61 #include "gromacs/gmxlib/nonbonded/nb_kernel.h"
62 #include "gromacs/gmxlib/nonbonded/nonbonded.h"
63 #include "gromacs/gpu_utils/gpu_utils.h"
64 #include "gromacs/imd/imd.h"
65 #include "gromacs/listed-forces/bonded.h"
66 #include "gromacs/listed-forces/disre.h"
67 #include "gromacs/listed-forces/orires.h"
68 #include "gromacs/math/functions.h"
69 #include "gromacs/math/units.h"
70 #include "gromacs/math/vec.h"
71 #include "gromacs/math/vecdump.h"
72 #include "gromacs/mdlib/calcmu.h"
73 #include "gromacs/mdlib/constr.h"
74 #include "gromacs/mdlib/force.h"
75 #include "gromacs/mdlib/forcerec.h"
76 #include "gromacs/mdlib/genborn.h"
77 #include "gromacs/mdlib/gmx_omp_nthreads.h"
78 #include "gromacs/mdlib/mdrun.h"
79 #include "gromacs/mdlib/nb_verlet.h"
80 #include "gromacs/mdlib/nbnxn_atomdata.h"
81 #include "gromacs/mdlib/nbnxn_gpu_data_mgmt.h"
82 #include "gromacs/mdlib/nbnxn_grid.h"
83 #include "gromacs/mdlib/nbnxn_search.h"
84 #include "gromacs/mdlib/qmmm.h"
85 #include "gromacs/mdlib/update.h"
86 #include "gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_gpu_ref.h"
87 #include "gromacs/mdtypes/commrec.h"
88 #include "gromacs/mdtypes/forceoutput.h"
89 #include "gromacs/mdtypes/iforceprovider.h"
90 #include "gromacs/mdtypes/inputrec.h"
91 #include "gromacs/mdtypes/md_enums.h"
92 #include "gromacs/mdtypes/state.h"
93 #include "gromacs/pbcutil/ishift.h"
94 #include "gromacs/pbcutil/mshift.h"
95 #include "gromacs/pbcutil/pbc.h"
96 #include "gromacs/pulling/pull.h"
97 #include "gromacs/pulling/pull_rotation.h"
98 #include "gromacs/timing/cyclecounter.h"
99 #include "gromacs/timing/gpu_timing.h"
100 #include "gromacs/timing/wallcycle.h"
101 #include "gromacs/timing/wallcyclereporting.h"
102 #include "gromacs/timing/walltime_accounting.h"
103 #include "gromacs/utility/arrayref.h"
104 #include "gromacs/utility/basedefinitions.h"
105 #include "gromacs/utility/cstringutil.h"
106 #include "gromacs/utility/exceptions.h"
107 #include "gromacs/utility/fatalerror.h"
108 #include "gromacs/utility/gmxassert.h"
109 #include "gromacs/utility/gmxmpi.h"
110 #include "gromacs/utility/logger.h"
111 #include "gromacs/utility/pleasecite.h"
112 #include "gromacs/utility/smalloc.h"
113 #include "gromacs/utility/sysinfo.h"
115 #include "nbnxn_gpu.h"
116 #include "nbnxn_kernels/nbnxn_kernel_cpu.h"
117 #include "nbnxn_kernels/nbnxn_kernel_prune.h"
119 // TODO: this environment variable allows us to verify before release
120 // that on less common architectures the total cost of polling is not larger than
121 // a blocking wait (so polling does not introduce overhead when the static
122 // PME-first ordering would suffice).
123 static const bool c_disableAlternatingWait = (getenv("GMX_DISABLE_ALTERNATING_GPU_WAIT") != nullptr);
126 void print_time(FILE *out,
127 gmx_walltime_accounting_t walltime_accounting,
128 gmx_int64_t step,
129 t_inputrec *ir,
130 t_commrec gmx_unused *cr)
132 time_t finish;
133 char timebuf[STRLEN];
134 double dt, elapsed_seconds, time_per_step;
135 char buf[48];
137 #if !GMX_THREAD_MPI
138 if (!PAR(cr))
139 #endif
141 fprintf(out, "\r");
143 fprintf(out, "step %s", gmx_step_str(step, buf));
144 fflush(out);
146 if ((step >= ir->nstlist))
148 double seconds_since_epoch = gmx_gettime();
149 elapsed_seconds = seconds_since_epoch - walltime_accounting_get_start_time_stamp(walltime_accounting);
150 time_per_step = elapsed_seconds/(step - ir->init_step + 1);
151 dt = (ir->nsteps + ir->init_step - step) * time_per_step;
153 if (ir->nsteps >= 0)
155 if (dt >= 300)
157 finish = (time_t) (seconds_since_epoch + dt);
158 gmx_ctime_r(&finish, timebuf, STRLEN);
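/* gmx_ctime_r produces a ctime-style string ending in a newline; copy it
 * and chop that trailing newline off before printing. */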
159 sprintf(buf, "%s", timebuf);
160 buf[strlen(buf)-1] = '\0';
161 fprintf(out, ", will finish %s", buf);
163 else
165 fprintf(out, ", remaining wall clock time: %5d s ", (int)dt);
168 else
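/* delta_t is in ps, so delta_t/1000 is ns per step; dividing by the wall
 * time per step and scaling by 86400 (seconds per day) yields ns/day. */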
170 fprintf(out, " performance: %.1f ns/day ",
171 ir->delta_t/1000*24*60*60/time_per_step);
174 #if !GMX_THREAD_MPI
175 if (PAR(cr))
177 fprintf(out, "\n");
179 #endif
181 fflush(out);
184 void print_date_and_time(FILE *fplog, int nodeid, const char *title,
185 double the_time)
187 char time_string[STRLEN];
189 if (!fplog)
191 return;
195 int i;
196 char timebuf[STRLEN];
197 time_t temp_time = (time_t) the_time;
199 gmx_ctime_r(&temp_time, timebuf, STRLEN);
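/* Copy characters up to the first control character, which drops the
 * trailing newline of the ctime-style string. */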
200 for (i = 0; timebuf[i] >= ' '; i++)
202 time_string[i] = timebuf[i];
204 time_string[i] = '\0';
207 fprintf(fplog, "%s on rank %d %s\n", title, nodeid, time_string);
210 void print_start(FILE *fplog, t_commrec *cr,
211 gmx_walltime_accounting_t walltime_accounting,
212 const char *name)
214 char buf[STRLEN];
216 sprintf(buf, "Started %s", name);
217 print_date_and_time(fplog, cr->nodeid, buf,
218 walltime_accounting_get_start_time_stamp(walltime_accounting));
221 static void sum_forces(rvec f[], gmx::ArrayRef<const gmx::RVec> forceToAdd)
223 const int end = forceToAdd.size();
225 // cppcheck-suppress unreadVariable
226 int gmx_unused nt = gmx_omp_nthreads_get(emntDefault);
227 #pragma omp parallel for num_threads(nt) schedule(static)
228 for (int i = 0; i < end; i++)
230 rvec_inc(f[i], forceToAdd[i]);
234 static void pme_gpu_reduce_outputs(gmx_wallcycle_t wcycle,
235 ForceWithVirial *forceWithVirial,
236 ArrayRef<const gmx::RVec> pmeForces,
237 gmx_enerdata_t *enerd,
238 const tensor vir_Q,
239 real Vlr_q)
241 wallcycle_start(wcycle, ewcPME_GPU_F_REDUCTION);
242 GMX_ASSERT(forceWithVirial, "Invalid force pointer");
243 forceWithVirial->addVirialContribution(vir_Q);
244 enerd->term[F_COUL_RECIP] += Vlr_q;
245 sum_forces(as_rvec_array(forceWithVirial->force_.data()), pmeForces);
246 wallcycle_stop(wcycle, ewcPME_GPU_F_REDUCTION);
249 static void calc_virial(int start, int homenr, rvec x[], rvec f[],
250 tensor vir_part, t_graph *graph, matrix box,
251 t_nrnb *nrnb, const t_forcerec *fr, int ePBC)
253 int i;
255 /* The short-range virial from surrounding boxes */
256 calc_vir(SHIFTS, fr->shift_vec, fr->fshift, vir_part, ePBC == epbcSCREW, box);
257 inc_nrnb(nrnb, eNR_VIRIAL, SHIFTS);
259 /* Calculate partial virial, for local atoms only, based on short range.
260 * Total virial is computed in global_stat, called from do_md
262 f_calc_vir(start, start+homenr, x, f, vir_part, graph, box);
263 inc_nrnb(nrnb, eNR_VIRIAL, homenr);
265 /* Add wall contribution */
266 for (i = 0; i < DIM; i++)
268 vir_part[i][ZZ] += fr->vir_wall_z[i];
271 if (debug)
273 pr_rvecs(debug, 0, "vir_part", vir_part, DIM);
277 static void pull_potential_wrapper(t_commrec *cr,
278 t_inputrec *ir,
279 matrix box, rvec x[],
280 ForceWithVirial *force,
281 t_mdatoms *mdatoms,
282 gmx_enerdata_t *enerd,
283 real *lambda,
284 double t,
285 gmx_wallcycle_t wcycle)
287 t_pbc pbc;
288 real dvdl;
290 /* Calculate the center of mass forces, this requires communication,
291 * which is why pull_potential is called close to other communication.
293 wallcycle_start(wcycle, ewcPULLPOT);
294 set_pbc(&pbc, ir->ePBC, box);
295 dvdl = 0;
296 enerd->term[F_COM_PULL] +=
297 pull_potential(ir->pull_work, mdatoms, &pbc,
298 cr, t, lambda[efptRESTRAINT], x, force, &dvdl);
299 enerd->dvdl_lin[efptRESTRAINT] += dvdl;
300 wallcycle_stop(wcycle, ewcPULLPOT);
303 static void pme_receive_force_ener(t_commrec *cr,
304 ForceWithVirial *forceWithVirial,
305 gmx_enerdata_t *enerd,
306 gmx_wallcycle_t wcycle)
308 real e_q, e_lj, dvdl_q, dvdl_lj;
309 float cycles_ppdpme, cycles_seppme;
311 cycles_ppdpme = wallcycle_stop(wcycle, ewcPPDURINGPME);
312 dd_cycles_add(cr->dd, cycles_ppdpme, ddCyclPPduringPME);
314 /* In case of node-splitting, the PP nodes receive the long-range
315 * forces, virial and energy from the PME nodes here.
317 wallcycle_start(wcycle, ewcPP_PMEWAITRECVF);
318 dvdl_q = 0;
319 dvdl_lj = 0;
320 gmx_pme_receive_f(cr, forceWithVirial, &e_q, &e_lj, &dvdl_q, &dvdl_lj,
321 &cycles_seppme);
322 enerd->term[F_COUL_RECIP] += e_q;
323 enerd->term[F_LJ_RECIP] += e_lj;
324 enerd->dvdl_lin[efptCOUL] += dvdl_q;
325 enerd->dvdl_lin[efptVDW] += dvdl_lj;
327 if (wcycle)
329 dd_cycles_add(cr->dd, cycles_seppme, ddCyclPME);
331 wallcycle_stop(wcycle, ewcPP_PMEWAITRECVF);
334 static void print_large_forces(FILE *fp, t_mdatoms *md, t_commrec *cr,
335 gmx_int64_t step, real forceTolerance,
336 const rvec *x, const rvec *f)
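/* Compare squared norms so that a square root is only taken for atoms
 * that are actually reported. */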
338 real force2Tolerance = gmx::square(forceTolerance);
339 std::uintmax_t numNonFinite = 0;
340 for (int i = 0; i < md->homenr; i++)
342 real force2 = norm2(f[i]);
343 bool nonFinite = !std::isfinite(force2);
344 if (force2 >= force2Tolerance || nonFinite)
346 fprintf(fp, "step %" GMX_PRId64 " atom %6d x %8.3f %8.3f %8.3f force %12.5e\n",
347 step,
348 ddglatnr(cr->dd, i), x[i][XX], x[i][YY], x[i][ZZ], std::sqrt(force2));
350 if (nonFinite)
352 numNonFinite++;
355 if (numNonFinite > 0)
357 /* Note that with MPI this fatal call on one rank might interrupt
358 * the printing on other ranks. But we can only avoid that with
359 * an expensive MPI barrier that we would need at each step.
361 gmx_fatal(FARGS, "At step %" GMX_PRId64 " detected non-finite forces on %ju atoms", step, numNonFinite);
365 static void post_process_forces(t_commrec *cr,
366 gmx_int64_t step,
367 t_nrnb *nrnb, gmx_wallcycle_t wcycle,
368 gmx_localtop_t *top,
369 matrix box, rvec x[],
370 rvec f[],
371 ForceWithVirial *forceWithVirial,
372 tensor vir_force,
373 t_mdatoms *mdatoms,
374 t_graph *graph,
375 t_forcerec *fr, gmx_vsite_t *vsite,
376 int flags)
378 if (fr->haveDirectVirialContributions)
380 rvec *fDirectVir = as_rvec_array(forceWithVirial->force_.data());
382 if (vsite)
384 /* Spread the mesh force on virtual sites to the other particles...
385 * This is parallelized. MPI communication is performed
386 * if the constructing atoms aren't local.
388 wallcycle_start(wcycle, ewcVSITESPREAD);
389 matrix virial = { { 0 } };
390 spread_vsite_f(vsite, x, fDirectVir, nullptr,
391 (flags & GMX_FORCE_VIRIAL), virial,
392 nrnb,
393 &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr);
394 forceWithVirial->addVirialContribution(virial);
395 wallcycle_stop(wcycle, ewcVSITESPREAD);
398 if (flags & GMX_FORCE_VIRIAL)
400 /* Now add the forces, this is local */
401 sum_forces(f, forceWithVirial->force_);
403 /* Add the direct virial contributions */
404 GMX_ASSERT(forceWithVirial->computeVirial_, "forceWithVirial should request virial computation when we request the virial");
405 m_add(vir_force, forceWithVirial->getVirial(), vir_force);
407 if (debug)
409 pr_rvecs(debug, 0, "vir_force", vir_force, DIM);
414 if (fr->print_force >= 0)
416 print_large_forces(stderr, mdatoms, cr, step, fr->print_force, x, f);
420 static void do_nb_verlet(t_forcerec *fr,
421 interaction_const_t *ic,
422 gmx_enerdata_t *enerd,
423 int flags, int ilocality,
424 int clearF,
425 gmx_int64_t step,
426 t_nrnb *nrnb,
427 gmx_wallcycle_t wcycle)
429 if (!(flags & GMX_FORCE_NONBONDED))
431 /* skip non-bonded calculation */
432 return;
435 nonbonded_verlet_t *nbv = fr->nbv;
436 nonbonded_verlet_group_t *nbvg = &nbv->grp[ilocality];
438 /* GPU kernel launch overhead is already timed separately */
439 if (fr->cutoff_scheme != ecutsVERLET)
441 gmx_incons("Invalid cut-off scheme passed!");
444 bool bUsingGpuKernels = (nbvg->kernel_type == nbnxnk8x8x8_GPU);
446 if (!bUsingGpuKernels)
448 /* When dynamic pair-list pruning is requested, we need to prune
449 * at nstlistPrune steps.
451 if (nbv->listParams->useDynamicPruning &&
452 (step - nbvg->nbl_lists.outerListCreationStep) % nbv->listParams->nstlistPrune == 0)
454 /* Prune the pair-list beyond fr->ic->rlistPrune using
455 * the current coordinates of the atoms.
457 wallcycle_sub_start(wcycle, ewcsNONBONDED_PRUNING);
458 nbnxn_kernel_cpu_prune(nbvg, nbv->nbat, fr->shift_vec, nbv->listParams->rlistInner);
459 wallcycle_sub_stop(wcycle, ewcsNONBONDED_PRUNING);
462 wallcycle_sub_start(wcycle, ewcsNONBONDED);
465 switch (nbvg->kernel_type)
467 case nbnxnk4x4_PlainC:
468 case nbnxnk4xN_SIMD_4xN:
469 case nbnxnk4xN_SIMD_2xNN:
470 nbnxn_kernel_cpu(nbvg,
471 nbv->nbat,
473 fr->shift_vec,
474 flags,
475 clearF,
476 fr->fshift[0],
477 enerd->grpp.ener[egCOULSR],
478 fr->bBHAM ?
479 enerd->grpp.ener[egBHAMSR] :
480 enerd->grpp.ener[egLJSR]);
481 break;
483 case nbnxnk8x8x8_GPU:
484 nbnxn_gpu_launch_kernel(nbv->gpu_nbv, nbv->nbat, flags, ilocality);
485 break;
487 case nbnxnk8x8x8_PlainC:
488 nbnxn_kernel_gpu_ref(nbvg->nbl_lists.nbl[0],
489 nbv->nbat, ic,
490 fr->shift_vec,
491 flags,
492 clearF,
493 nbv->nbat->out[0].f,
494 fr->fshift[0],
495 enerd->grpp.ener[egCOULSR],
496 fr->bBHAM ?
497 enerd->grpp.ener[egBHAMSR] :
498 enerd->grpp.ener[egLJSR]);
499 break;
501 default:
502 GMX_RELEASE_ASSERT(false, "Invalid nonbonded kernel type passed!");
505 if (!bUsingGpuKernels)
507 wallcycle_sub_stop(wcycle, ewcsNONBONDED);
510 int enr_nbnxn_kernel_ljc, enr_nbnxn_kernel_lj;
511 if (EEL_RF(ic->eeltype) || ic->eeltype == eelCUT)
513 enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_RF;
515 else if ((!bUsingGpuKernels && nbvg->ewald_excl == ewaldexclAnalytical) ||
516 (bUsingGpuKernels && nbnxn_gpu_is_kernel_ewald_analytical(nbv->gpu_nbv)))
518 enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_EWALD;
520 else
522 enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_TAB;
524 enr_nbnxn_kernel_lj = eNR_NBNXN_LJ;
525 if (flags & GMX_FORCE_ENERGY)
527 /* In eNR_??? the nbnxn F+E kernels are always the F kernel + 1 */
528 enr_nbnxn_kernel_ljc += 1;
529 enr_nbnxn_kernel_lj += 1;
532 inc_nrnb(nrnb, enr_nbnxn_kernel_ljc,
533 nbvg->nbl_lists.natpair_ljq);
534 inc_nrnb(nrnb, enr_nbnxn_kernel_lj,
535 nbvg->nbl_lists.natpair_lj);
536 /* The Coulomb-only kernels are offset -eNR_NBNXN_LJ_RF+eNR_NBNXN_RF */
537 inc_nrnb(nrnb, enr_nbnxn_kernel_ljc-eNR_NBNXN_LJ_RF+eNR_NBNXN_RF,
538 nbvg->nbl_lists.natpair_q);
540 if (ic->vdw_modifier == eintmodFORCESWITCH)
542 /* We add up the switch cost separately */
543 inc_nrnb(nrnb, eNR_NBNXN_ADD_LJ_FSW+((flags & GMX_FORCE_ENERGY) ? 1 : 0),
544 nbvg->nbl_lists.natpair_ljq + nbvg->nbl_lists.natpair_lj);
546 if (ic->vdw_modifier == eintmodPOTSWITCH)
548 /* We add up the switch cost separately */
549 inc_nrnb(nrnb, eNR_NBNXN_ADD_LJ_PSW+((flags & GMX_FORCE_ENERGY) ? 1 : 0),
550 nbvg->nbl_lists.natpair_ljq + nbvg->nbl_lists.natpair_lj);
552 if (ic->vdwtype == evdwPME)
554 /* We add up the LJ Ewald cost separately */
555 inc_nrnb(nrnb, eNR_NBNXN_ADD_LJ_EWALD+((flags & GMX_FORCE_ENERGY) ? 1 : 0),
556 nbvg->nbl_lists.natpair_ljq + nbvg->nbl_lists.natpair_lj);
560 static void do_nb_verlet_fep(nbnxn_pairlist_set_t *nbl_lists,
561 t_forcerec *fr,
562 rvec x[],
563 rvec f[],
564 t_mdatoms *mdatoms,
565 t_lambda *fepvals,
566 real *lambda,
567 gmx_enerdata_t *enerd,
568 int flags,
569 t_nrnb *nrnb,
570 gmx_wallcycle_t wcycle)
572 int donb_flags;
573 nb_kernel_data_t kernel_data;
574 real lam_i[efptNR];
575 real dvdl_nb[efptNR];
576 int th;
577 int i, j;
579 donb_flags = 0;
580 /* Add short-range interactions */
581 donb_flags |= GMX_NONBONDED_DO_SR;
583 /* Currently all group scheme kernels always calculate (shift-)forces */
584 if (flags & GMX_FORCE_FORCES)
586 donb_flags |= GMX_NONBONDED_DO_FORCE;
588 if (flags & GMX_FORCE_VIRIAL)
590 donb_flags |= GMX_NONBONDED_DO_SHIFTFORCE;
592 if (flags & GMX_FORCE_ENERGY)
594 donb_flags |= GMX_NONBONDED_DO_POTENTIAL;
597 kernel_data.flags = donb_flags;
598 kernel_data.lambda = lambda;
599 kernel_data.dvdl = dvdl_nb;
601 kernel_data.energygrp_elec = enerd->grpp.ener[egCOULSR];
602 kernel_data.energygrp_vdw = enerd->grpp.ener[egLJSR];
604 /* reset free energy components */
605 for (i = 0; i < efptNR; i++)
607 dvdl_nb[i] = 0;
610 GMX_ASSERT(gmx_omp_nthreads_get(emntNonbonded) == nbl_lists->nnbl, "Number of lists should be same as number of NB threads");
612 wallcycle_sub_start(wcycle, ewcsNONBONDED);
613 #pragma omp parallel for schedule(static) num_threads(nbl_lists->nnbl)
614 for (th = 0; th < nbl_lists->nnbl; th++)
618 gmx_nb_free_energy_kernel(nbl_lists->nbl_fep[th],
619 x, f, fr, mdatoms, &kernel_data, nrnb);
621 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
624 if (fepvals->sc_alpha != 0)
626 enerd->dvdl_nonlin[efptVDW] += dvdl_nb[efptVDW];
627 enerd->dvdl_nonlin[efptCOUL] += dvdl_nb[efptCOUL];
629 else
631 enerd->dvdl_lin[efptVDW] += dvdl_nb[efptVDW];
632 enerd->dvdl_lin[efptCOUL] += dvdl_nb[efptCOUL];
635 /* If we do foreign lambda and we have soft-core interactions
636 * we have to recalculate the (non-linear) energy contributions.
638 if (fepvals->n_lambda > 0 && (flags & GMX_FORCE_DHDL) && fepvals->sc_alpha != 0)
640 kernel_data.flags = (donb_flags & ~(GMX_NONBONDED_DO_FORCE | GMX_NONBONDED_DO_SHIFTFORCE)) | GMX_NONBONDED_DO_FOREIGNLAMBDA;
641 kernel_data.lambda = lam_i;
642 kernel_data.energygrp_elec = enerd->foreign_grpp.ener[egCOULSR];
643 kernel_data.energygrp_vdw = enerd->foreign_grpp.ener[egLJSR];
644 /* Note that we add to kernel_data.dvdl, but ignore the result */
646 for (i = 0; i < enerd->n_lambda; i++)
648 for (j = 0; j < efptNR; j++)
650 lam_i[j] = (i == 0 ? lambda[j] : fepvals->all_lambda[j][i-1]);
652 reset_foreign_enerdata(enerd);
653 #pragma omp parallel for schedule(static) num_threads(nbl_lists->nnbl)
654 for (th = 0; th < nbl_lists->nnbl; th++)
658 gmx_nb_free_energy_kernel(nbl_lists->nbl_fep[th],
659 x, f, fr, mdatoms, &kernel_data, nrnb);
661 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
664 sum_epot(&(enerd->foreign_grpp), enerd->foreign_term);
665 enerd->enerpart_lambda[i] += enerd->foreign_term[F_EPOT];
669 wallcycle_sub_stop(wcycle, ewcsNONBONDED);
672 gmx_bool use_GPU(const nonbonded_verlet_t *nbv)
674 return nbv != nullptr && nbv->bUseGPU;
677 static gmx_inline void clear_rvecs_omp(int n, rvec v[])
679 int nth = gmx_omp_nthreads_get_simple_rvec_task(emntDefault, n);
681 /* Note that we would like to avoid this conditional by putting it
682 * into the omp pragma instead, but then we still take the full
683 * omp parallel for overhead (at least with gcc5).
685 if (nth == 1)
687 for (int i = 0; i < n; i++)
689 clear_rvec(v[i]);
692 else
694 #pragma omp parallel for num_threads(nth) schedule(static)
695 for (int i = 0; i < n; i++)
697 clear_rvec(v[i]);
702 /*! \brief Return an estimate of the average kinetic energy or 0 when unreliable
704 * \param groupOptions Group options, containing T-coupling options
706 static real averageKineticEnergyEstimate(const t_grpopts &groupOptions)
708 real nrdfCoupled = 0;
709 real nrdfUncoupled = 0;
710 real kineticEnergy = 0;
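/* By equipartition, each temperature-coupled degree of freedom contributes
 * 0.5*kB*T_ref to the average kinetic energy. */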
711 for (int g = 0; g < groupOptions.ngtc; g++)
713 if (groupOptions.tau_t[g] >= 0)
715 nrdfCoupled += groupOptions.nrdf[g];
716 kineticEnergy += groupOptions.nrdf[g]*0.5*groupOptions.ref_t[g]*BOLTZ;
718 else
720 nrdfUncoupled += groupOptions.nrdf[g];
724 /* This conditional with > also catches nrdf=0 */
725 if (nrdfCoupled > nrdfUncoupled)
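/* Scale up to all degrees of freedom, assuming the uncoupled ones are at
 * roughly the same temperature as the coupled ones. */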
727 return kineticEnergy*(nrdfCoupled + nrdfUncoupled)/nrdfCoupled;
729 else
731 return 0;
735 /*! \brief This routine checks that the potential energy is finite.
737 * Always checks that the potential energy is finite. If step equals
738 * inputrec.init_step also checks that the magnitude of the potential energy
739 * is reasonable. Terminates with a fatal error when a check fails.
740 * Note that passing this check does not guarantee finite forces,
741 * since those use slightly different arithmetic. But in most cases
742 * there is just a narrow coordinate range where forces are not finite
743 * and energies are finite.
745 * \param[in] step The step number, used for checking and printing
746 * \param[in] enerd The energy data; the non-bonded group energies need to be added to enerd.term[F_EPOT] before calling this routine
747 * \param[in] inputrec The input record
749 static void checkPotentialEnergyValidity(gmx_int64_t step,
750 const gmx_enerdata_t &enerd,
751 const t_inputrec &inputrec)
753 /* Threshold valid for comparing absolute potential energy against
754 * the kinetic energy. Normally one should not consider absolute
755 * potential energy values, but with a factor of one million
756 * we should never get false positives.
758 constexpr real c_thresholdFactor = 1e6;
760 bool energyIsNotFinite = !std::isfinite(enerd.term[F_EPOT]);
761 real averageKineticEnergy = 0;
762 /* We only check for large potential energy at the initial step,
763 * because that is by far the most likely step for this to occur
764 * and because computing the average kinetic energy is not free.
765 * Note: nstcalcenergy >> 1 often does not allow catching large energies
766 * before they become NaN.
768 if (step == inputrec.init_step && EI_DYNAMICS(inputrec.eI))
770 averageKineticEnergy = averageKineticEnergyEstimate(inputrec.opts);
773 if (energyIsNotFinite || (averageKineticEnergy > 0 &&
774 enerd.term[F_EPOT] > c_thresholdFactor*averageKineticEnergy))
776 gmx_fatal(FARGS, "Step %" GMX_PRId64 ": The total potential energy is %g, which is %s. The LJ and electrostatic contributions to the energy are %g and %g, respectively. A %s potential energy can be caused by overlapping atoms in bonded interactions or very large%s coordinate values. Usually this is caused by a badly- or non-equilibrated initial configuration, incorrect interactions or parameters in the topology.",
777 step,
778 enerd.term[F_EPOT],
779 energyIsNotFinite ? "not finite" : "extremely high",
780 enerd.term[F_LJ],
781 enerd.term[F_COUL_SR],
782 energyIsNotFinite ? "non-finite" : "very high",
783 energyIsNotFinite ? " or NaN" : "");
787 /*! \brief Compute forces and/or energies for special algorithms
789 * The intention is to collect all calls to algorithms that compute
790 * forces on local atoms only and that do not contribute to the local
791 * virial sum (but add their virial contribution separately).
792 * Eventually these should likely all become ForceProviders.
793 * Within this function the intention is to have algorithms that do
794 * global communication at the end, so global barriers within the MD loop
795 * are as close together as possible.
797 * \param[in] fplog The log file
798 * \param[in] cr The communication record
799 * \param[in] inputrec The input record
800 * \param[in] step The current MD step
801 * \param[in] t The current time
802 * \param[in,out] wcycle Wallcycle accounting struct
803 * \param[in,out] forceProviders Pointer to a list of force providers
804 * \param[in] box The unit cell
805 * \param[in] x The coordinates
806 * \param[in] mdatoms Per atom properties
807 * \param[in] lambda Array of free-energy lambda values
808 * \param[in] forceFlags Flags that tell whether we should compute forces/energies/virial
809 * \param[in,out] forceWithVirial Force and virial buffers
810 * \param[in,out] enerd Energy buffer
811 * \param[in,out] ed Essential dynamics pointer
812 * \param[in] bNS Tells if we did neighbor searching this step, used for ED sampling
814 * \todo Remove bNS, which is used incorrectly.
815 * \todo Convert all other algorithms called here to ForceProviders.
817 static void
818 computeSpecialForces(FILE *fplog,
819 t_commrec *cr,
820 t_inputrec *inputrec,
821 gmx_int64_t step,
822 double t,
823 gmx_wallcycle_t wcycle,
824 ForceProviders *forceProviders,
825 matrix box,
826 rvec *x,
827 t_mdatoms *mdatoms,
828 real *lambda,
829 int forceFlags,
830 ForceWithVirial *forceWithVirial,
831 gmx_enerdata_t *enerd,
832 gmx_edsam_t ed,
833 gmx_bool bNS)
835 const bool computeForces = (forceFlags & GMX_FORCE_FORCES);
837 /* NOTE: Currently all ForceProviders only provide forces.
838 * When they also provide energies, remove this conditional.
840 if (computeForces)
842 /* Collect forces from modules */
843 forceProviders->calculateForces(cr, mdatoms, box, t, x, forceWithVirial);
846 if (inputrec->bPull && pull_have_potential(inputrec->pull_work))
848 pull_potential_wrapper(cr, inputrec, box, x,
849 forceWithVirial,
850 mdatoms, enerd, lambda, t,
851 wcycle);
853 if (inputrec->bDoAwh)
855 Awh &awh = *inputrec->awh;
856 enerd->term[F_COM_PULL] +=
857 awh.applyBiasForcesAndUpdateBias(inputrec->ePBC, *mdatoms, box,
858 forceWithVirial,
859 t, step, wcycle, fplog);
863 rvec *f = as_rvec_array(forceWithVirial->force_.data());
865 /* Add the forces from enforced rotation potentials (if any) */
866 if (inputrec->bRot)
868 wallcycle_start(wcycle, ewcROTadd);
869 enerd->term[F_COM_PULL] += add_rot_forces(inputrec->rot, f, cr, step, t);
870 wallcycle_stop(wcycle, ewcROTadd);
873 if (ed)
875 /* Note that since init_edsam() is called after the initialization
876 * of forcerec, edsam doesn't request the noVirSum force buffer.
877 * Thus if no other algorithm (e.g. PME) requires it, the forces
878 * here will contribute to the virial.
880 do_flood(cr, inputrec, x, f, ed, box, step, bNS);
883 /* Add forces from interactive molecular dynamics (IMD), if bIMD == TRUE. */
884 if (inputrec->bIMD && computeForces)
886 IMD_apply_forces(inputrec->bIMD, inputrec->imd, cr, f, wcycle);
890 /*! \brief Launch the prepare_step and spread stages of PME GPU.
892 * \param[in] pmedata The PME structure
893 * \param[in] box The box matrix
894 * \param[in] x Coordinate array
895 * \param[in] flags Force flags
896 * \param[in] wcycle The wallcycle structure
898 static inline void launchPmeGpuSpread(gmx_pme_t *pmedata,
899 matrix box,
900 rvec x[],
901 int flags,
902 gmx_wallcycle_t wcycle)
904 int pmeFlags = GMX_PME_SPREAD | GMX_PME_SOLVE;
905 pmeFlags |= (flags & GMX_FORCE_FORCES) ? GMX_PME_CALC_F : 0;
906 pmeFlags |= (flags & GMX_FORCE_VIRIAL) ? GMX_PME_CALC_ENER_VIR : 0;
908 pme_gpu_prepare_computation(pmedata, flags & GMX_FORCE_DYNAMICBOX, box, wcycle, pmeFlags);
909 pme_gpu_launch_spread(pmedata, x, wcycle);
912 /*! \brief Launch the FFT and gather stages of PME GPU
914 * This function only implements setting the output forces (no accumulation).
916 * \param[in] pmedata The PME structure
917 * \param[in] wcycle The wallcycle structure
919 static void launchPmeGpuFftAndGather(gmx_pme_t *pmedata,
920 gmx_wallcycle_t wcycle)
922 pme_gpu_launch_complex_transforms(pmedata, wcycle);
923 pme_gpu_launch_gather(pmedata, wcycle, PmeForceOutputHandling::Set);
926 /*! \brief
927 * Polling wait for either of the PME or nonbonded GPU tasks.
929 * Instead of a static order in waiting for GPU tasks, this function
930 * polls to check which of the two tasks completes first, and does the
931 * associated force buffer reduction overlapped with the other task.
932 * By doing that, unlike static scheduling order, it can always overlap
933 * one of the reductions, regardless of the GPU task completion order.
935 * \param[in] nbv Nonbonded verlet structure
936 * \param[in] pmedata PME module data
937 * \param[in,out] force Force array to reduce task outputs into.
938 * \param[in,out] forceWithVirial Force and virial buffers
939 * \param[in,out] fshift Shift force output vector results are reduced into
940 * \param[in,out] enerd Energy data structure results are reduced into
941 * \param[in] flags Force flags
942 * \param[in] wcycle The wallcycle structure
944 static void alternatePmeNbGpuWaitReduce(nonbonded_verlet_t *nbv,
945 const gmx_pme_t *pmedata,
946 gmx::PaddedArrayRef<gmx::RVec> *force,
947 ForceWithVirial *forceWithVirial,
948 rvec fshift[],
949 gmx_enerdata_t *enerd,
950 int flags,
951 gmx_wallcycle_t wcycle)
953 bool isPmeGpuDone = false;
954 bool isNbGpuDone = false;
957 gmx::ArrayRef<const gmx::RVec> pmeGpuForces;
959 while (!isPmeGpuDone || !isNbGpuDone)
961 if (!isPmeGpuDone)
963 matrix vir_Q;
964 real Vlr_q;
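/* If the nonbonded task has already completed, there is nothing left to
 * overlap with, so a blocking wait is used instead of polling. */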
966 GpuTaskCompletion completionType = (isNbGpuDone) ? GpuTaskCompletion::Wait : GpuTaskCompletion::Check;
967 isPmeGpuDone = pme_gpu_try_finish_task(pmedata, wcycle, &pmeGpuForces,
968 vir_Q, &Vlr_q, completionType);
970 if (isPmeGpuDone)
972 pme_gpu_reduce_outputs(wcycle, forceWithVirial, pmeGpuForces,
973 enerd, vir_Q, Vlr_q);
977 if (!isNbGpuDone)
979 GpuTaskCompletion completionType = (isPmeGpuDone) ? GpuTaskCompletion::Wait : GpuTaskCompletion::Check;
980 wallcycle_start_nocount(wcycle, ewcWAIT_GPU_NB_L);
981 isNbGpuDone = nbnxn_gpu_try_finish_task(nbv->gpu_nbv,
982 flags, eatLocal,
983 enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
984 fshift, completionType);
985 wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
986 // To get the call count right, when the task finished we
987 // issue a start/stop.
988 // TODO: move the ewcWAIT_GPU_NB_L cycle counting into nbnxn_gpu_try_finish_task()
989 // and ewcNB_XF_BUF_OPS counting into nbnxn_atomdata_add_nbat_f_to_f().
990 if (isNbGpuDone)
992 wallcycle_start(wcycle, ewcWAIT_GPU_NB_L);
993 wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
995 wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
996 wallcycle_sub_start(wcycle, ewcsNB_F_BUF_OPS);
997 nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs, eatLocal,
998 nbv->nbat, as_rvec_array(force->data()));
999 wallcycle_sub_stop(wcycle, ewcsNB_F_BUF_OPS);
1000 wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
1006 /*! \brief
1007 * Launch the dynamic rolling pruning GPU task.
1009 * We currently alternate local/non-local list pruning in odd-even steps
1010 * (only pruning every second step without DD).
1012 * \param[in] cr The communication record
1013 * \param[in] nbv Nonbonded verlet structure
1014 * \param[in] inputrec The input record
1015 * \param[in] step The current MD step
1017 static inline void launchGpuRollingPruning(const t_commrec *cr,
1018 const nonbonded_verlet_t *nbv,
1019 const t_inputrec *inputrec,
1020 const gmx_int64_t step)
1022 /* We should not launch the rolling pruning kernel at a search
1023 * step or just before search steps, since that's useless.
1024 * Without domain decomposition we prune at even steps.
1025 * With domain decomposition we alternate local and non-local
1026 * pruning at even and odd steps.
1028 int numRollingParts = nbv->listParams->numRollingParts;
1029 GMX_ASSERT(numRollingParts == nbv->listParams->nstlistPrune/2, "Since we alternate local/non-local at even/odd steps, we need numRollingParts<=nstlistPrune/2 for correctness and == for efficiency");
1030 int stepWithCurrentList = step - nbv->grp[eintLocal].nbl_lists.outerListCreationStep;
1031 bool stepIsEven = ((stepWithCurrentList & 1) == 0);
1032 if (stepWithCurrentList > 0 &&
1033 stepWithCurrentList < inputrec->nstlist - 1 &&
1034 (stepIsEven || DOMAINDECOMP(cr)))
1036 nbnxn_gpu_launch_kernel_pruneonly(nbv->gpu_nbv,
1037 stepIsEven ? eintLocal : eintNonlocal,
1038 numRollingParts);
1042 static void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
1043 t_inputrec *inputrec,
1044 gmx_int64_t step, t_nrnb *nrnb, gmx_wallcycle_t wcycle,
1045 gmx_localtop_t *top,
1046 gmx_groups_t gmx_unused *groups,
1047 matrix box, gmx::PaddedArrayRef<gmx::RVec> x, history_t *hist,
1048 gmx::PaddedArrayRef<gmx::RVec> force,
1049 tensor vir_force,
1050 t_mdatoms *mdatoms,
1051 gmx_enerdata_t *enerd, t_fcdata *fcd,
1052 real *lambda, t_graph *graph,
1053 t_forcerec *fr, interaction_const_t *ic,
1054 gmx_vsite_t *vsite, rvec mu_tot,
1055 double t, gmx_edsam_t ed,
1056 gmx_bool bBornRadii,
1057 int flags,
1058 DdOpenBalanceRegionBeforeForceComputation ddOpenBalanceRegion,
1059 DdCloseBalanceRegionAfterForceComputation ddCloseBalanceRegion)
1061 int cg1, i, j;
1062 double mu[2*DIM];
1063 gmx_bool bStateChanged, bNS, bFillGrid, bCalcCGCM;
1064 gmx_bool bDoForces, bUseGPU, bUseOrEmulGPU;
1065 rvec vzero, box_diag;
1066 float cycles_pme, cycles_wait_gpu;
1067 nonbonded_verlet_t *nbv = fr->nbv;
1069 bStateChanged = (flags & GMX_FORCE_STATECHANGED);
1070 bNS = (flags & GMX_FORCE_NS) && (fr->bAllvsAll == FALSE);
1071 bFillGrid = (bNS && bStateChanged);
1072 bCalcCGCM = (bFillGrid && !DOMAINDECOMP(cr));
1073 bDoForces = (flags & GMX_FORCE_FORCES);
1074 bUseGPU = fr->nbv->bUseGPU;
1075 bUseOrEmulGPU = bUseGPU || (fr->nbv->emulateGpu == EmulateGpuNonbonded::Yes);
1077 const auto pmeRunMode = fr->pmedata ? pme_run_mode(fr->pmedata) : PmeRunMode::CPU;
1078 // TODO slim this conditional down - inputrec and duty checks should mean the same in proper code!
1079 const bool useGpuPme = EEL_PME(fr->ic->eeltype) && thisRankHasDuty(cr, DUTY_PME) &&
1080 ((pmeRunMode == PmeRunMode::GPU) || (pmeRunMode == PmeRunMode::Mixed));
1082 /* At a search step we need to start the first balancing region
1083 * somewhere early inside the step after communication during domain
1084 * decomposition (and not during the previous step as usual).
1086 if (bNS &&
1087 ddOpenBalanceRegion == DdOpenBalanceRegionBeforeForceComputation::yes)
1089 ddOpenBalanceRegionCpu(cr->dd, DdAllowBalanceRegionReopen::yes);
1092 cycles_wait_gpu = 0;
1094 const int start = 0;
1095 const int homenr = mdatoms->homenr;
1097 clear_mat(vir_force);
1099 if (DOMAINDECOMP(cr))
1101 cg1 = cr->dd->ncg_tot;
1103 else
1105 cg1 = top->cgs.nr;
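/* With test-particle insertion (n_tpi > 0) the last charge group is,
 * presumably, the molecule being inserted, which is handled separately. */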
1107 if (fr->n_tpi > 0)
1109 cg1--;
1112 if (bStateChanged)
1114 update_forcerec(fr, box);
1116 if (inputrecNeedMutot(inputrec))
1118 /* Calculate total (local) dipole moment in a temporary common array.
1119 * This makes it possible to sum them over nodes faster.
1121 calc_mu(start, homenr,
1122 x, mdatoms->chargeA, mdatoms->chargeB, mdatoms->nChargePerturbed,
1123 mu, mu+DIM);
1127 if (fr->ePBC != epbcNONE)
1129 /* Compute shift vectors every step,
1130 * because of pressure coupling or box deformation!
1132 if ((flags & GMX_FORCE_DYNAMICBOX) && bStateChanged)
1134 calc_shifts(box, fr->shift_vec);
1137 if (bCalcCGCM)
1139 put_atoms_in_box_omp(fr->ePBC, box, x.subArray(0, homenr));
1140 inc_nrnb(nrnb, eNR_SHIFTX, homenr);
1142 else if (EI_ENERGY_MINIMIZATION(inputrec->eI) && graph)
1144 unshift_self(graph, box, as_rvec_array(x.data()));
1148 nbnxn_atomdata_copy_shiftvec(flags & GMX_FORCE_DYNAMICBOX,
1149 fr->shift_vec, nbv->nbat);
1151 #if GMX_MPI
1152 if (!thisRankHasDuty(cr, DUTY_PME))
1154 /* Send particle coordinates to the pme nodes.
1155 * Since this is only implemented for domain decomposition
1156 * and domain decomposition does not use the graph,
1157 * we do not need to worry about shifting.
1159 wallcycle_start(wcycle, ewcPP_PMESENDX);
1160 gmx_pme_send_coordinates(cr, box, as_rvec_array(x.data()),
1161 lambda[efptCOUL], lambda[efptVDW],
1162 (flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY)),
1163 step);
1164 wallcycle_stop(wcycle, ewcPP_PMESENDX);
1166 #endif /* GMX_MPI */
1168 if (useGpuPme)
1170 launchPmeGpuSpread(fr->pmedata, box, as_rvec_array(x.data()), flags, wcycle);
1173 /* do gridding for pair search */
1174 if (bNS)
1176 if (graph && bStateChanged)
1178 /* Calculate intramolecular shift vectors to make molecules whole */
1179 mk_mshift(fplog, graph, fr->ePBC, box, as_rvec_array(x.data()));
1182 clear_rvec(vzero);
1183 box_diag[XX] = box[XX][XX];
1184 box_diag[YY] = box[YY][YY];
1185 box_diag[ZZ] = box[ZZ][ZZ];
1187 wallcycle_start(wcycle, ewcNS);
1188 if (!fr->bDomDec)
1190 wallcycle_sub_start(wcycle, ewcsNBS_GRID_LOCAL);
1191 nbnxn_put_on_grid(nbv->nbs, fr->ePBC, box,
1192 0, vzero, box_diag,
1193 0, mdatoms->homenr, -1, fr->cginfo, as_rvec_array(x.data()),
1194 0, nullptr,
1195 nbv->grp[eintLocal].kernel_type,
1196 nbv->nbat);
1197 wallcycle_sub_stop(wcycle, ewcsNBS_GRID_LOCAL);
1199 else
1201 wallcycle_sub_start(wcycle, ewcsNBS_GRID_NONLOCAL);
1202 nbnxn_put_on_grid_nonlocal(nbv->nbs, domdec_zones(cr->dd),
1203 fr->cginfo, as_rvec_array(x.data()),
1204 nbv->grp[eintNonlocal].kernel_type,
1205 nbv->nbat);
1206 wallcycle_sub_stop(wcycle, ewcsNBS_GRID_NONLOCAL);
1209 nbnxn_atomdata_set(nbv->nbat, nbv->nbs, mdatoms, fr->cginfo);
1210 wallcycle_stop(wcycle, ewcNS);
1213 /* initialize the GPU atom data and copy shift vector */
1214 if (bUseGPU)
1216 wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
1217 wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1219 if (bNS)
1221 nbnxn_gpu_init_atomdata(nbv->gpu_nbv, nbv->nbat);
1224 nbnxn_gpu_upload_shiftvec(nbv->gpu_nbv, nbv->nbat);
1226 wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1227 wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1230 /* do local pair search */
1231 if (bNS)
1233 wallcycle_start_nocount(wcycle, ewcNS);
1234 wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_LOCAL);
1235 nbnxn_make_pairlist(nbv->nbs, nbv->nbat,
1236 &top->excls,
1237 nbv->listParams->rlistOuter,
1238 nbv->min_ci_balanced,
1239 &nbv->grp[eintLocal].nbl_lists,
1240 eintLocal,
1241 nbv->grp[eintLocal].kernel_type,
1242 nrnb);
1243 nbv->grp[eintLocal].nbl_lists.outerListCreationStep = step;
1244 if (nbv->listParams->useDynamicPruning && !bUseGPU)
1246 nbnxnPrepareListForDynamicPruning(&nbv->grp[eintLocal].nbl_lists);
1248 wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_LOCAL);
1250 if (bUseGPU)
1252 /* initialize local pair-list on the GPU */
1253 nbnxn_gpu_init_pairlist(nbv->gpu_nbv,
1254 nbv->grp[eintLocal].nbl_lists.nbl[0],
1255 eintLocal);
1257 wallcycle_stop(wcycle, ewcNS);
1259 else
1261 wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
1262 wallcycle_sub_start(wcycle, ewcsNB_X_BUF_OPS);
1263 nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs, eatLocal, FALSE, as_rvec_array(x.data()),
1264 nbv->nbat);
1265 wallcycle_sub_stop(wcycle, ewcsNB_X_BUF_OPS);
1266 wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
1269 if (bUseGPU)
1271 if (DOMAINDECOMP(cr))
1273 ddOpenBalanceRegionGpu(cr->dd);
1276 wallcycle_start(wcycle, ewcLAUNCH_GPU);
1277 wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1278 /* launch local nonbonded work on GPU */
1279 do_nb_verlet(fr, ic, enerd, flags, eintLocal, enbvClearFNo,
1280 step, nrnb, wcycle);
1281 wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1282 wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1285 if (useGpuPme)
1287 // In PME GPU and mixed mode we launch FFT / gather after the
1288 // X copy/transform to allow overlap as well as after the GPU NB
1289 // launch to avoid FFT launch overhead hijacking the CPU and delaying
1290 // the nonbonded kernel.
1291 launchPmeGpuFftAndGather(fr->pmedata, wcycle);
1294 /* Communicate coordinates and sum dipole if necessary +
1295 do non-local pair search */
1296 if (DOMAINDECOMP(cr))
1298 if (bNS)
1300 wallcycle_start_nocount(wcycle, ewcNS);
1301 wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_NONLOCAL);
1303 nbnxn_make_pairlist(nbv->nbs, nbv->nbat,
1304 &top->excls,
1305 nbv->listParams->rlistOuter,
1306 nbv->min_ci_balanced,
1307 &nbv->grp[eintNonlocal].nbl_lists,
1308 eintNonlocal,
1309 nbv->grp[eintNonlocal].kernel_type,
1310 nrnb);
1311 nbv->grp[eintNonlocal].nbl_lists.outerListCreationStep = step;
1312 if (nbv->listParams->useDynamicPruning && !bUseGPU)
1314 nbnxnPrepareListForDynamicPruning(&nbv->grp[eintNonlocal].nbl_lists);
1316 wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_NONLOCAL);
1318 if (nbv->grp[eintNonlocal].kernel_type == nbnxnk8x8x8_GPU)
1320 /* initialize non-local pair-list on the GPU */
1321 nbnxn_gpu_init_pairlist(nbv->gpu_nbv,
1322 nbv->grp[eintNonlocal].nbl_lists.nbl[0],
1323 eintNonlocal);
1325 wallcycle_stop(wcycle, ewcNS);
1327 else
1329 wallcycle_start(wcycle, ewcMOVEX);
1330 dd_move_x(cr->dd, box, as_rvec_array(x.data()));
1331 wallcycle_stop(wcycle, ewcMOVEX);
1333 wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
1334 wallcycle_sub_start(wcycle, ewcsNB_X_BUF_OPS);
1335 nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs, eatNonlocal, FALSE, as_rvec_array(x.data()),
1336 nbv->nbat);
1337 wallcycle_sub_stop(wcycle, ewcsNB_X_BUF_OPS);
1338 wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
1341 if (bUseGPU)
1343 wallcycle_start(wcycle, ewcLAUNCH_GPU);
1344 wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1345 /* launch non-local nonbonded tasks on GPU */
1346 do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFNo,
1347 step, nrnb, wcycle);
1348 wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1349 wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1353 if (bUseGPU)
1355 /* launch D2H copy-back F */
1356 wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
1357 wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1358 if (DOMAINDECOMP(cr))
1360 nbnxn_gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat,
1361 flags, eatNonlocal);
1363 nbnxn_gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat,
1364 flags, eatLocal);
1365 wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1366 wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1369 if (bStateChanged && inputrecNeedMutot(inputrec))
1371 if (PAR(cr))
1373 gmx_sumd(2*DIM, mu, cr);
1374 ddReopenBalanceRegionCpu(cr->dd);
1377 for (i = 0; i < 2; i++)
1379 for (j = 0; j < DIM; j++)
1381 fr->mu_tot[i][j] = mu[i*DIM + j];
1385 if (fr->efep == efepNO)
1387 copy_rvec(fr->mu_tot[0], mu_tot);
1389 else
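/* With perturbed charges, interpolate the total dipole linearly between
 * the A and B states using the Coulomb lambda. */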
1391 for (j = 0; j < DIM; j++)
1393 mu_tot[j] =
1394 (1.0 - lambda[efptCOUL])*fr->mu_tot[0][j] +
1395 lambda[efptCOUL]*fr->mu_tot[1][j];
1399 /* Reset energies */
1400 reset_enerdata(enerd);
1401 clear_rvecs(SHIFTS, fr->fshift);
1403 if (DOMAINDECOMP(cr) && !thisRankHasDuty(cr, DUTY_PME))
1405 wallcycle_start(wcycle, ewcPPDURINGPME);
1406 dd_force_flop_start(cr->dd, nrnb);
1409 if (inputrec->bRot)
1411 wallcycle_start(wcycle, ewcROT);
1412 do_rotation(cr, inputrec, box, as_rvec_array(x.data()), t, step, bNS);
1413 wallcycle_stop(wcycle, ewcROT);
1416 /* Temporary solution until all routines take PaddedRVecVector */
1417 rvec *f = as_rvec_array(force.data());
1419 /* Start the force cycle counter.
1420 * Note that a different counter is used for dynamic load balancing.
1422 wallcycle_start(wcycle, ewcFORCE);
1424 gmx::ArrayRef<gmx::RVec> forceRef = force;
1425 if (bDoForces)
1427 /* If we need to compute the virial, we might need a separate
1428 * force buffer for algorithms for which the virial is calculated
1429 * directly, such as PME.
1431 if ((flags & GMX_FORCE_VIRIAL) && fr->haveDirectVirialContributions)
1433 forceRef = *fr->forceBufferForDirectVirialContributions;
1435 /* We only compute forces on local atoms. Note that vsites can
1436 * spread to non-local atoms, but that part of the buffer is
1437 * cleared separately in the vsite spreading code.
1439 clear_rvecs_omp(mdatoms->homenr, as_rvec_array(forceRef.data()));
1441 /* Clear the short- and long-range forces */
1442 clear_rvecs_omp(fr->natoms_force_constr, f);
1445 /* forceWithVirial uses the local atom range only */
1446 ForceWithVirial forceWithVirial(forceRef, flags & GMX_FORCE_VIRIAL);
1448 if (inputrec->bPull && pull_have_constraint(inputrec->pull_work))
1450 clear_pull_forces(inputrec->pull_work);
1453 /* Non-bonded forces that are computed on the CPU are calculated here.
1454 * We do this before calling do_force_lowlevel, because in that
1455 * function, the listed forces are calculated before PME, which
1456 * does communication. With this order, non-bonded and listed
1457 * force calculation imbalance can be balanced out by the domain
1458 * decomposition load balancing.
1461 if (!bUseOrEmulGPU)
1463 do_nb_verlet(fr, ic, enerd, flags, eintLocal, enbvClearFYes,
1464 step, nrnb, wcycle);
1467 if (fr->efep != efepNO)
1469 /* Calculate the local and non-local free energy interactions here.
1470 * Happens here on the CPU both with and without GPU.
1472 if (fr->nbv->grp[eintLocal].nbl_lists.nbl_fep[0]->nrj > 0)
1474 do_nb_verlet_fep(&fr->nbv->grp[eintLocal].nbl_lists,
1475 fr, as_rvec_array(x.data()), f, mdatoms,
1476 inputrec->fepvals, lambda,
1477 enerd, flags, nrnb, wcycle);
1480 if (DOMAINDECOMP(cr) &&
1481 fr->nbv->grp[eintNonlocal].nbl_lists.nbl_fep[0]->nrj > 0)
1483 do_nb_verlet_fep(&fr->nbv->grp[eintNonlocal].nbl_lists,
1484 fr, as_rvec_array(x.data()), f, mdatoms,
1485 inputrec->fepvals, lambda,
1486 enerd, flags, nrnb, wcycle);
1490 if (!bUseOrEmulGPU)
1492 int aloc;
1494 if (DOMAINDECOMP(cr))
1496 do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFNo,
1497 step, nrnb, wcycle);
1500 if (!bUseOrEmulGPU)
1502 aloc = eintLocal;
1504 else
1506 aloc = eintNonlocal;
1509 /* Add all the non-bonded force to the normal force array.
1510 * This can be split into a local and a non-local part when overlapping
1511 * communication with calculation with domain decomposition.
1513 wallcycle_stop(wcycle, ewcFORCE);
1514 wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
1515 wallcycle_sub_start(wcycle, ewcsNB_F_BUF_OPS);
1516 nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs, eatAll, nbv->nbat, f);
1517 wallcycle_sub_stop(wcycle, ewcsNB_F_BUF_OPS);
1518 wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
1519 wallcycle_start_nocount(wcycle, ewcFORCE);
1521 /* if there are multiple fshift output buffers reduce them */
1522 if ((flags & GMX_FORCE_VIRIAL) &&
1523 nbv->grp[aloc].nbl_lists.nnbl > 1)
1525 /* This is not in a subcounter because it takes a
1526 negligible and constant-sized amount of time */
1527 nbnxn_atomdata_add_nbat_fshift_to_fshift(nbv->nbat,
1528 fr->fshift);
1532 /* update QMMMrec, if necessary */
1533 if (fr->bQMMM)
1535 update_QMMMrec(cr, fr, as_rvec_array(x.data()), mdatoms, box);
1538 /* Compute the bonded and non-bonded energies and optionally forces */
1539 do_force_lowlevel(fr, inputrec, &(top->idef),
1540 cr, nrnb, wcycle, mdatoms,
1541 as_rvec_array(x.data()), hist, f, &forceWithVirial, enerd, fcd, top, fr->born,
1542 bBornRadii, box,
1543 inputrec->fepvals, lambda, graph, &(top->excls), fr->mu_tot,
1544 flags, &cycles_pme);
1546 wallcycle_stop(wcycle, ewcFORCE);
1548 computeSpecialForces(fplog, cr, inputrec, step, t, wcycle,
1549 fr->forceProviders, box, as_rvec_array(x.data()), mdatoms, lambda,
1550 flags, &forceWithVirial, enerd,
1551 ed, bNS);
1553 if (bUseOrEmulGPU)
1555 /* wait for non-local forces (or calculate in emulation mode) */
1556 if (DOMAINDECOMP(cr))
1558 if (bUseGPU)
1560 wallcycle_start(wcycle, ewcWAIT_GPU_NB_NL);
1561 nbnxn_gpu_wait_finish_task(nbv->gpu_nbv,
1562 flags, eatNonlocal,
1563 enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
1564 fr->fshift);
1565 cycles_wait_gpu += wallcycle_stop(wcycle, ewcWAIT_GPU_NB_NL);
1567 else
1569 wallcycle_start_nocount(wcycle, ewcFORCE);
1570 do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFYes,
1571 step, nrnb, wcycle);
1572 wallcycle_stop(wcycle, ewcFORCE);
1574 wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
1575 wallcycle_sub_start(wcycle, ewcsNB_F_BUF_OPS);
1576 /* skip the reduction if there was no non-local work to do */
1577 if (nbv->grp[eintNonlocal].nbl_lists.nbl[0]->nsci > 0)
1579 nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs, eatNonlocal,
1580 nbv->nbat, f);
1582 wallcycle_sub_stop(wcycle, ewcsNB_F_BUF_OPS);
1583 wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
1587 if (DOMAINDECOMP(cr))
1589 /* We are done with the CPU compute.
1590 * We will now communicate the non-local forces.
1591 * If we use a GPU this will overlap with GPU work, so in that case
1592 * we do not close the DD force balancing region here.
1594 if (ddCloseBalanceRegion == DdCloseBalanceRegionAfterForceComputation::yes)
1596 ddCloseBalanceRegionCpu(cr->dd);
1598 if (bDoForces)
1600 wallcycle_start(wcycle, ewcMOVEF);
1601 dd_move_f(cr->dd, f, fr->fshift);
1602 wallcycle_stop(wcycle, ewcMOVEF);
1606 // With both nonbonded and PME offloaded to a GPU on the same rank, we use
1607 // an alternating wait/reduction scheme.
1608 bool alternateGpuWait = (!c_disableAlternatingWait && useGpuPme && bUseGPU && !DOMAINDECOMP(cr));
1609 if (alternateGpuWait)
1611 alternatePmeNbGpuWaitReduce(fr->nbv, fr->pmedata, &force, &forceWithVirial, fr->fshift, enerd, flags, wcycle);
1614 if (!alternateGpuWait && useGpuPme)
1616 gmx::ArrayRef<const gmx::RVec> pmeGpuForces;
1617 matrix vir_Q;
1618 real Vlr_q;
1619 pme_gpu_wait_finish_task(fr->pmedata, wcycle, &pmeGpuForces, vir_Q, &Vlr_q);
1620 pme_gpu_reduce_outputs(wcycle, &forceWithVirial, pmeGpuForces, enerd, vir_Q, Vlr_q);
1623 /* Wait for local GPU NB outputs on the non-alternating wait path */
1624 if (!alternateGpuWait && bUseGPU)
1626 /* Measured overhead on CUDA and OpenCL with(out) GPU sharing
1627 * is between 0.5 and 1.5 Mcycles. So 2 MCycles is an overestimate,
1628 * but even with a step of 0.1 ms the difference is less than 1%
1629 * of the step time.
1631 const float gpuWaitApiOverheadMargin = 2e6f; /* cycles */
1633 wallcycle_start(wcycle, ewcWAIT_GPU_NB_L);
1634 nbnxn_gpu_wait_finish_task(nbv->gpu_nbv,
1635 flags, eatLocal,
1636 enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
1637 fr->fshift);
1638 float cycles_tmp = wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
1640 if (ddCloseBalanceRegion == DdCloseBalanceRegionAfterForceComputation::yes)
1642 DdBalanceRegionWaitedForGpu waitedForGpu = DdBalanceRegionWaitedForGpu::yes;
1643 if (bDoForces && cycles_tmp <= gpuWaitApiOverheadMargin)
1645 /* We measured only a few cycles, so it could be that the kernel
1646 * and transfer finished earlier and there was no actual
1647 * wait time, only API call overhead.
1648 * Then the actual time could be anywhere between 0 and
1649 * cycles_wait_est. We will use half of cycles_wait_est.
1651 waitedForGpu = DdBalanceRegionWaitedForGpu::no;
1653 ddCloseBalanceRegionGpu(cr->dd, cycles_wait_gpu, waitedForGpu);
1657 if (fr->nbv->emulateGpu == EmulateGpuNonbonded::Yes)
1659 // NOTE: emulation kernel is not included in the balancing region,
1660 // but emulation mode does not target performance anyway
1661 wallcycle_start_nocount(wcycle, ewcFORCE);
1662 do_nb_verlet(fr, ic, enerd, flags, eintLocal,
1663 DOMAINDECOMP(cr) ? enbvClearFNo : enbvClearFYes,
1664 step, nrnb, wcycle);
1665 wallcycle_stop(wcycle, ewcFORCE);
1668 if (bUseGPU)
1670 /* now clear the GPU outputs while we finish the step on the CPU */
1671 wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
1672 wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1673 nbnxn_gpu_clear_outputs(nbv->gpu_nbv, flags);
1675 /* Is dynamic pair-list pruning activated? */
1676 if (nbv->listParams->useDynamicPruning)
1678 launchGpuRollingPruning(cr, nbv, inputrec, step);
1680 wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1681 wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1683 // TODO: move here the PME buffer clearing call pme_gpu_reinit_computation()
1686 /* Do the nonbonded GPU (or emulation) force buffer reduction
1687 * on the non-alternating path. */
1688 if (bUseOrEmulGPU && !alternateGpuWait)
1690 wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
1691 wallcycle_sub_start(wcycle, ewcsNB_F_BUF_OPS);
1692 nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs, eatLocal,
1693 nbv->nbat, f);
1694 wallcycle_sub_stop(wcycle, ewcsNB_F_BUF_OPS);
1695 wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
1698 if (DOMAINDECOMP(cr))
1700 dd_force_flop_stop(cr->dd, nrnb);
1703 if (bDoForces)
1705 /* If we have NoVirSum forces, but we do not calculate the virial,
1706 * we sum fr->f_novirsum=f later.
1708 if (vsite && !(fr->haveDirectVirialContributions && !(flags & GMX_FORCE_VIRIAL)))
1710 wallcycle_start(wcycle, ewcVSITESPREAD);
1711 spread_vsite_f(vsite, as_rvec_array(x.data()), f, fr->fshift, FALSE, nullptr, nrnb,
1712 &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr);
1713 wallcycle_stop(wcycle, ewcVSITESPREAD);
1716 if (flags & GMX_FORCE_VIRIAL)
1718 /* Calculation of the virial must be done after vsites! */
1719 calc_virial(0, mdatoms->homenr, as_rvec_array(x.data()), f,
1720 vir_force, graph, box, nrnb, fr, inputrec->ePBC);
1724 if (PAR(cr) && !thisRankHasDuty(cr, DUTY_PME))
1726 /* In case of node-splitting, the PP nodes receive the long-range
1727 * forces, virial and energy from the PME nodes here.
1729 pme_receive_force_ener(cr, &forceWithVirial, enerd, wcycle);
1732 if (bDoForces)
1734 post_process_forces(cr, step, nrnb, wcycle,
1735 top, box, as_rvec_array(x.data()), f, &forceWithVirial,
1736 vir_force, mdatoms, graph, fr, vsite,
1737 flags);
1740 if (flags & GMX_FORCE_ENERGY)
1742 /* Sum the potential energy terms from group contributions */
1743 sum_epot(&(enerd->grpp), enerd->term);
1745 if (!EI_TPI(inputrec->eI))
1747 checkPotentialEnergyValidity(step, *enerd, *inputrec);
1752 static void do_force_cutsGROUP(FILE *fplog, t_commrec *cr,
1753 t_inputrec *inputrec,
1754 gmx_int64_t step, t_nrnb *nrnb, gmx_wallcycle_t wcycle,
1755 gmx_localtop_t *top,
1756 gmx_groups_t *groups,
1757 matrix box, gmx::PaddedArrayRef<gmx::RVec> x, history_t *hist,
1758 gmx::PaddedArrayRef<gmx::RVec> force,
1759 tensor vir_force,
1760 t_mdatoms *mdatoms,
1761 gmx_enerdata_t *enerd, t_fcdata *fcd,
1762 real *lambda, t_graph *graph,
1763 t_forcerec *fr, gmx_vsite_t *vsite, rvec mu_tot,
1764 double t, gmx_edsam_t ed,
1765 gmx_bool bBornRadii,
1766 int flags,
1767 DdOpenBalanceRegionBeforeForceComputation ddOpenBalanceRegion,
1768 DdCloseBalanceRegionAfterForceComputation ddCloseBalanceRegion)
1770 int cg0, cg1, i, j;
1771 double mu[2*DIM];
1772 gmx_bool bStateChanged, bNS, bFillGrid, bCalcCGCM;
1773 gmx_bool bDoForces;
1774 float cycles_pme;
1776 const int start = 0;
1777 const int homenr = mdatoms->homenr;
1779 clear_mat(vir_force);
1781 cg0 = 0;
1782 if (DOMAINDECOMP(cr))
1784 cg1 = cr->dd->ncg_tot;
1786 else
1788 cg1 = top->cgs.nr;
1790 if (fr->n_tpi > 0)
1792 cg1--;
1795 bStateChanged = (flags & GMX_FORCE_STATECHANGED);
1796 bNS = (flags & GMX_FORCE_NS) && (fr->bAllvsAll == FALSE);
1797 /* Should we perform the long-range nonbonded evaluation inside the neighborsearching? */
1798 bFillGrid = (bNS && bStateChanged);
1799 bCalcCGCM = (bFillGrid && !DOMAINDECOMP(cr));
1800 bDoForces = (flags & GMX_FORCE_FORCES);
1802 if (bStateChanged)
1804 update_forcerec(fr, box);
1806 if (inputrecNeedMutot(inputrec))
1808 /* Calculate total (local) dipole moment in a temporary common array.
1809 * This makes it possible to sum them over nodes faster.
1811 calc_mu(start, homenr,
1812 x, mdatoms->chargeA, mdatoms->chargeB, mdatoms->nChargePerturbed,
1813 mu, mu+DIM);
1817 if (fr->ePBC != epbcNONE)
1819 /* Compute shift vectors every step,
1820 * because of pressure coupling or box deformation!
1822 if ((flags & GMX_FORCE_DYNAMICBOX) && bStateChanged)
1824 calc_shifts(box, fr->shift_vec);
1827 if (bCalcCGCM)
1829 put_charge_groups_in_box(fplog, cg0, cg1, fr->ePBC, box,
1830 &(top->cgs), as_rvec_array(x.data()), fr->cg_cm);
1831 inc_nrnb(nrnb, eNR_CGCM, homenr);
1832 inc_nrnb(nrnb, eNR_RESETX, cg1-cg0);
1834 else if (EI_ENERGY_MINIMIZATION(inputrec->eI) && graph)
1836 unshift_self(graph, box, as_rvec_array(x.data()));
1839 else if (bCalcCGCM)
1841 calc_cgcm(fplog, cg0, cg1, &(top->cgs), as_rvec_array(x.data()), fr->cg_cm);
1842 inc_nrnb(nrnb, eNR_CGCM, homenr);
1845 if (bCalcCGCM && gmx_debug_at)
1847 pr_rvecs(debug, 0, "cgcm", fr->cg_cm, top->cgs.nr);
1850 #if GMX_MPI
1851 if (!thisRankHasDuty(cr, DUTY_PME))
1853 /* Send particle coordinates to the pme nodes.
1854 * Since this is only implemented for domain decomposition
1855 * and domain decomposition does not use the graph,
1856 * we do not need to worry about shifting.
1858 wallcycle_start(wcycle, ewcPP_PMESENDX);
1859 gmx_pme_send_coordinates(cr, box, as_rvec_array(x.data()),
1860 lambda[efptCOUL], lambda[efptVDW],
1861 (flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY)),
1862 step);
1863 wallcycle_stop(wcycle, ewcPP_PMESENDX);
1865 #endif /* GMX_MPI */
1867 /* Communicate coordinates and sum dipole if necessary */
1868 if (DOMAINDECOMP(cr))
1870 wallcycle_start(wcycle, ewcMOVEX);
1871 dd_move_x(cr->dd, box, as_rvec_array(x.data()));
1872 wallcycle_stop(wcycle, ewcMOVEX);
1873 /* No GPU support, no move_x overlap, so reopen the balance region here */
1874 if (ddOpenBalanceRegion == DdOpenBalanceRegionBeforeForceComputation::yes)
1876 ddReopenBalanceRegionCpu(cr->dd);
1880 if (inputrecNeedMutot(inputrec))
1882 if (bStateChanged)
1884 if (PAR(cr))
1886 gmx_sumd(2*DIM, mu, cr);
1887 ddReopenBalanceRegionCpu(cr->dd);
1889 for (i = 0; i < 2; i++)
1891 for (j = 0; j < DIM; j++)
1893 fr->mu_tot[i][j] = mu[i*DIM + j];
1897 if (fr->efep == efepNO)
1899 copy_rvec(fr->mu_tot[0], mu_tot);
1901 else
1903 for (j = 0; j < DIM; j++)
1905 mu_tot[j] =
1906 (1.0 - lambda[efptCOUL])*fr->mu_tot[0][j] + lambda[efptCOUL]*fr->mu_tot[1][j];
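/* e.g. with lambda[efptCOUL] = 0.25 the reported dipole is
 * 0.75*mu_tot(state A) + 0.25*mu_tot(state B). */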
1911 /* Reset energies */
1912 reset_enerdata(enerd);
1913 clear_rvecs(SHIFTS, fr->fshift);
1915 if (bNS)
1917 wallcycle_start(wcycle, ewcNS);
1919 if (graph && bStateChanged)
1921 /* Calculate intramolecular shift vectors to make molecules whole */
1922 mk_mshift(fplog, graph, fr->ePBC, box, as_rvec_array(x.data()));
1925 /* Do the actual neighbour searching */
1926 ns(fplog, fr, box,
1927 groups, top, mdatoms,
1928 cr, nrnb, bFillGrid);
1930 wallcycle_stop(wcycle, ewcNS);
1933 if (inputrec->implicit_solvent && bNS)
1935 make_gb_nblist(cr, inputrec->gb_algorithm,
1936 as_rvec_array(x.data()), box, fr, &top->idef, graph, fr->born);
1939 if (DOMAINDECOMP(cr) && !thisRankHasDuty(cr, DUTY_PME))
1941 wallcycle_start(wcycle, ewcPPDURINGPME);
1942 dd_force_flop_start(cr->dd, nrnb);
1945 if (inputrec->bRot)
1947 wallcycle_start(wcycle, ewcROT);
1948 do_rotation(cr, inputrec, box, as_rvec_array(x.data()), t, step, bNS);
1949 wallcycle_stop(wcycle, ewcROT);
1952 /* Temporary solution until all routines take PaddedRVecVector */
1953 rvec *f = as_rvec_array(force.data());
1955 /* Start the force cycle counter.
1956 * Note that a different counter is used for dynamic load balancing.
1958 wallcycle_start(wcycle, ewcFORCE);
1960 gmx::ArrayRef<gmx::RVec> forceRef = force;
1961 if (bDoForces)
1963 /* If we need to compute the virial, we might need a separate
1964 * force buffer for algorithms for which the virial is calculated
1965 * separately, such as PME.
1967 if ((flags & GMX_FORCE_VIRIAL) && fr->haveDirectVirialContributions)
1969 forceRef = *fr->forceBufferForDirectVirialContributions;
1971 clear_rvecs_omp(forceRef.size(), as_rvec_array(forceRef.data()));
1974 /* Clear the short- and long-range forces */
1975 clear_rvecs(fr->natoms_force_constr, f);
1978 /* forceWithVirial might need the full force atom range */
1979 ForceWithVirial forceWithVirial(forceRef, flags & GMX_FORCE_VIRIAL);
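/* Note (hedged): when a separate buffer was selected above (virial requested
 * and direct virial contributions present, e.g. PME), forceWithVirial wraps
 * that buffer rather than f; the two are presumably recombined later in
 * post_process_forces(). */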
1981 if (inputrec->bPull && pull_have_constraint(inputrec->pull_work))
1983 clear_pull_forces(inputrec->pull_work);
1986 /* update QMMMrec, if necessary */
1987 if (fr->bQMMM)
1989 update_QMMMrec(cr, fr, as_rvec_array(x.data()), mdatoms, box);
1992 /* Compute the bonded and non-bonded energies and optionally forces */
1993 do_force_lowlevel(fr, inputrec, &(top->idef),
1994 cr, nrnb, wcycle, mdatoms,
1995 as_rvec_array(x.data()), hist, f, &forceWithVirial, enerd, fcd, top, fr->born,
1996 bBornRadii, box,
1997 inputrec->fepvals, lambda,
1998 graph, &(top->excls), fr->mu_tot,
1999 flags,
2000 &cycles_pme);
2002 wallcycle_stop(wcycle, ewcFORCE);
2004 if (DOMAINDECOMP(cr))
2006 dd_force_flop_stop(cr->dd, nrnb);
2008 if (ddCloseBalanceRegion == DdCloseBalanceRegionAfterForceComputation::yes)
2010 ddCloseBalanceRegionCpu(cr->dd);
2014 computeSpecialForces(fplog, cr, inputrec, step, t, wcycle,
2015 fr->forceProviders, box, as_rvec_array(x.data()), mdatoms, lambda,
2016 flags, &forceWithVirial, enerd,
2017 ed, bNS);
2019 if (bDoForces)
2021 /* Communicate the forces */
2022 if (DOMAINDECOMP(cr))
2024 wallcycle_start(wcycle, ewcMOVEF);
2025 dd_move_f(cr->dd, f, fr->fshift);
2026 /* Do we need to communicate the separate force array
2027 * for terms that do not contribute to the single sum virial?
2028 * Position restraints and electric fields do not introduce
2029 * inter-cg forces, only full electrostatics methods do.
2030 * When we do not calculate the virial, fr->f_novirsum = f,
2031 * so we have already communicated these forces.
2033 if (EEL_FULL(fr->ic->eeltype) && cr->dd->n_intercg_excl &&
2034 (flags & GMX_FORCE_VIRIAL))
2036 dd_move_f(cr->dd, as_rvec_array(forceWithVirial.force_.data()), nullptr);
2038 wallcycle_stop(wcycle, ewcMOVEF);
2041 /* If we have NoVirSum forces, but we do not calculate the virial,
2042 * we sum fr->f_novirsum=f later.
2044 if (vsite && !(fr->haveDirectVirialContributions && !(flags & GMX_FORCE_VIRIAL)))
2046 wallcycle_start(wcycle, ewcVSITESPREAD);
2047 spread_vsite_f(vsite, as_rvec_array(x.data()), f, fr->fshift, FALSE, nullptr, nrnb,
2048 &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr);
2049 wallcycle_stop(wcycle, ewcVSITESPREAD);
2052 if (flags & GMX_FORCE_VIRIAL)
2054 /* Calculation of the virial must be done after vsites! */
2055 calc_virial(0, mdatoms->homenr, as_rvec_array(x.data()), f,
2056 vir_force, graph, box, nrnb, fr, inputrec->ePBC);
2060 if (PAR(cr) && !thisRankHasDuty(cr, DUTY_PME))
2062 /* In case of node-splitting, the PP nodes receive the long-range
2063 * forces, virial and energy from the PME nodes here.
2065 pme_receive_force_ener(cr, &forceWithVirial, enerd, wcycle);
2068 if (bDoForces)
2070 post_process_forces(cr, step, nrnb, wcycle,
2071 top, box, as_rvec_array(x.data()), f, &forceWithVirial,
2072 vir_force, mdatoms, graph, fr, vsite,
2073 flags);
2076 if (flags & GMX_FORCE_ENERGY)
2078 /* Sum the potential energy terms from group contributions */
2079 sum_epot(&(enerd->grpp), enerd->term);
2081 if (!EI_TPI(inputrec->eI))
2083 checkPotentialEnergyValidity(step, *enerd, *inputrec);
2089 void do_force(FILE *fplog, t_commrec *cr,
2090 t_inputrec *inputrec,
2091 gmx_int64_t step, t_nrnb *nrnb, gmx_wallcycle_t wcycle,
2092 gmx_localtop_t *top,
2093 gmx_groups_t *groups,
2094 matrix box, gmx::PaddedArrayRef<gmx::RVec> x, history_t *hist,
2095 gmx::PaddedArrayRef<gmx::RVec> force,
2096 tensor vir_force,
2097 t_mdatoms *mdatoms,
2098 gmx_enerdata_t *enerd, t_fcdata *fcd,
2099 gmx::ArrayRef<real> lambda, t_graph *graph,
2100 t_forcerec *fr,
2101 gmx_vsite_t *vsite, rvec mu_tot,
2102 double t, gmx_edsam_t ed,
2103 gmx_bool bBornRadii,
2104 int flags,
2105 DdOpenBalanceRegionBeforeForceComputation ddOpenBalanceRegion,
2106 DdCloseBalanceRegionAfterForceComputation ddCloseBalanceRegion)
2108 /* modify force flag if not doing nonbonded */
2109 if (!fr->bNonbonded)
2111 flags &= ~GMX_FORCE_NONBONDED;
2114 GMX_ASSERT(x.size() >= gmx::paddedRVecVectorSize(fr->natoms_force), "coordinates should be padded");
2115 GMX_ASSERT(force.size() >= gmx::paddedRVecVectorSize(fr->natoms_force), "force should be padded");
2117 switch (inputrec->cutoff_scheme)
2119 case ecutsVERLET:
2120 do_force_cutsVERLET(fplog, cr, inputrec,
2121 step, nrnb, wcycle,
2122 top,
2123 groups,
2124 box, x, hist,
2125 force, vir_force,
2126 mdatoms,
2127 enerd, fcd,
2128 lambda.data(), graph,
2129 fr, fr->ic,
2130 vsite, mu_tot,
2131 t, ed,
2132 bBornRadii,
2133 flags,
2134 ddOpenBalanceRegion,
2135 ddCloseBalanceRegion);
2136 break;
2137 case ecutsGROUP:
2138 do_force_cutsGROUP(fplog, cr, inputrec,
2139 step, nrnb, wcycle,
2140 top,
2141 groups,
2142 box, x, hist,
2143 force, vir_force,
2144 mdatoms,
2145 enerd, fcd,
2146 lambda.data(), graph,
2147 fr, vsite, mu_tot,
2148 t, ed,
2149 bBornRadii,
2150 flags,
2151 ddOpenBalanceRegion,
2152 ddCloseBalanceRegion);
2153 break;
2154 default:
2155 gmx_incons("Invalid cut-off scheme passed!");
2158 /* In case we don't have constraints and are using GPUs, the next balancing
2159 * region starts here.
2160 * Some "special" work at the end of do_force_cuts?, such as vsite spread,
2161 * virial calculation and COM pulling, is thus not included in
2162 * the balance timing, which is ok as most tasks do communication.
2164 if (ddOpenBalanceRegion == DdOpenBalanceRegionBeforeForceComputation::yes)
2166 ddOpenBalanceRegionCpu(cr->dd, DdAllowBalanceRegionReopen::no);
2171 void do_constrain_first(FILE *fplog, gmx_constr_t constr,
2172 t_inputrec *ir, t_mdatoms *md,
2173 t_state *state, t_commrec *cr, t_nrnb *nrnb,
2174 t_forcerec *fr, gmx_localtop_t *top)
2176 int i, m, start, end;
2177 gmx_int64_t step;
2178 real dt = ir->delta_t;
2179 real dvdl_dum;
2180 rvec *savex;
2182 /* We need to allocate one element extra, since we might use
2183 * (unaligned) 4-wide SIMD loads to access rvec entries.
2185 snew(savex, state->natoms + 1);
2187 start = 0;
2188 end = md->homenr;
2190 if (debug)
2192 fprintf(debug, "vcm: start=%d, homenr=%d, end=%d\n",
2193 start, md->homenr, end);
2195 /* Do a first constraining pass to reset the particles... */
2196 step = ir->init_step;
2197 if (fplog)
2199 char buf[STEPSTRSIZE];
2200 fprintf(fplog, "\nConstraining the starting coordinates (step %s)\n",
2201 gmx_step_str(step, buf));
2203 dvdl_dum = 0;
2205 /* constrain the current position */
2206 constrain(nullptr, TRUE, FALSE, constr, &(top->idef),
2207 ir, cr, step, 0, 1.0, md,
2208 as_rvec_array(state->x.data()), as_rvec_array(state->x.data()), nullptr,
2209 fr->bMolPBC, state->box,
2210 state->lambda[efptBONDED], &dvdl_dum,
2211 nullptr, nullptr, nrnb, econqCoord);
2212 if (EI_VV(ir->eI))
2214 /* constrain the initial velocity, and save it */
2215 /* this may also be useful if we need the ekin from the half step for velocity Verlet */
2216 constrain(nullptr, TRUE, FALSE, constr, &(top->idef),
2217 ir, cr, step, 0, 1.0, md,
2218 as_rvec_array(state->x.data()), as_rvec_array(state->v.data()), as_rvec_array(state->v.data()),
2219 fr->bMolPBC, state->box,
2220 state->lambda[efptBONDED], &dvdl_dum,
2221 nullptr, nullptr, nrnb, econqVeloc);
2223 /* constrain the initial velocities at t-dt/2 */
2224 if (EI_STATE_VELOCITY(ir->eI) && ir->eI != eiVV)
2226 for (i = start; (i < end); i++)
2228 for (m = 0; (m < DIM); m++)
2230 /* Reverse the velocity */
2231 state->v[i][m] = -state->v[i][m];
2232 /* Store the position at t-dt in savex */
2233 savex[i][m] = state->x[i][m] + dt*state->v[i][m];
2236 /* Shake the positions at t=-dt with the positions at t=0
2237 * as reference coordinates.
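/* Since v was negated above, savex holds x(t0) - dt*v(t0), i.e. the
 * unconstrained positions one step back in time; after the constraint call
 * below the velocities are re-reversed. */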
2239 if (fplog)
2241 char buf[STEPSTRSIZE];
2242 fprintf(fplog, "\nConstraining the coordinates at t0-dt (step %s)\n",
2243 gmx_step_str(step, buf));
2245 dvdl_dum = 0;
2246 constrain(nullptr, TRUE, FALSE, constr, &(top->idef),
2247 ir, cr, step, -1, 1.0, md,
2248 as_rvec_array(state->x.data()), savex, nullptr,
2249 fr->bMolPBC, state->box,
2250 state->lambda[efptBONDED], &dvdl_dum,
2251 as_rvec_array(state->v.data()), nullptr, nrnb, econqCoord);
2253 for (i = start; i < end; i++)
2255 for (m = 0; m < DIM; m++)
2257 /* Re-reverse the velocities */
2258 state->v[i][m] = -state->v[i][m];
2262 sfree(savex);
2266 static void
2267 integrate_table(real vdwtab[], real scale, int offstart, int rstart, int rend,
2268 double *enerout, double *virout)
2270 double enersum, virsum;
2271 double invscale, invscale2, invscale3;
2272 double r, ea, eb, ec, pa, pb, pc, pd;
2273 double y0, f, g, h;
2274 int ri, offset;
2275 double tabfactor;
2277 invscale = 1.0/scale;
2278 invscale2 = invscale*invscale;
2279 invscale3 = invscale*invscale2;
2281 /* Following summation derived from cubic spline definition,
2282 * Numerical Recipes in C, second edition, p. 113-116. Exact for
2283 * the cubic spline. We first calculate the negative of the
2284 * energy from rvdw to rvdw_switch, assuming that g(r)=1, and then
2285 * add the more standard, abrupt cutoff correction to that result,
2286 * yielding the long-range correction for a switched function. We
2287 * perform both the pressure and energy loops at the same time for
2288 * simplicity, as the computational cost is low. */
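/* Sketch of what the loop below accumulates (assuming the four-entries-per-point
 * cubic-spline layout y0, F, G, H that is actually read from vdwtab): within
 * bin ri the potential is y0 + F*t + G*t^2 + H*t^3 for t in [0,1] and
 * r' = r + t/scale, and ea/eb/ec (pa..pd) are the moments of r'^2 (r'^3)
 * including the dr' Jacobian. enersum is thus ~ sum over bins of
 * integral r'^2 V(r') dr' and virsum ~ the matching integral of r'^3 dV/dr';
 * the 4*pi and the 6/12 table-scaling factor are applied at the end. */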
2290 if (offstart == 0)
2292 /* Since the dispersion table has been scaled down a factor
2293 * 6.0 and the repulsion a factor 12.0 to compensate for the
2294 * c6/c12 parameters inside nbfp[] being scaled up (to save
2295 * flops in kernels), we need to correct for this.
2297 tabfactor = 6.0;
2299 else
2301 tabfactor = 12.0;
2304 enersum = 0.0;
2305 virsum = 0.0;
2306 for (ri = rstart; ri < rend; ++ri)
2308 r = ri*invscale;
2309 ea = invscale3;
2310 eb = 2.0*invscale2*r;
2311 ec = invscale*r*r;
2313 pa = invscale3;
2314 pb = 3.0*invscale2*r;
2315 pc = 3.0*invscale*r*r;
2316 pd = r*r*r;
2318 /* this "8" is from the packing in the vdwtab array - perhaps
2319 should be defined? */
2321 offset = 8*ri + offstart;
2322 y0 = vdwtab[offset];
2323 f = vdwtab[offset+1];
2324 g = vdwtab[offset+2];
2325 h = vdwtab[offset+3];
2327 enersum += y0*(ea/3 + eb/2 + ec) + f*(ea/4 + eb/3 + ec/2) + g*(ea/5 + eb/4 + ec/3) + h*(ea/6 + eb/5 + ec/4);
2328 virsum += f*(pa/4 + pb/3 + pc/2 + pd) + 2*g*(pa/5 + pb/4 + pc/3 + pd/2) + 3*h*(pa/6 + pb/5 + pc/4 + pd/3);
2330 *enerout = 4.0*M_PI*enersum*tabfactor;
2331 *virout = 4.0*M_PI*virsum*tabfactor;
2334 void calc_enervirdiff(FILE *fplog, int eDispCorr, t_forcerec *fr)
2336 double eners[2], virs[2], enersum, virsum;
2337 double r0, rc3, rc9;
2338 int ri0, ri1, i;
2339 real scale, *vdwtab;
2341 fr->enershiftsix = 0;
2342 fr->enershifttwelve = 0;
2343 fr->enerdiffsix = 0;
2344 fr->enerdifftwelve = 0;
2345 fr->virdiffsix = 0;
2346 fr->virdifftwelve = 0;
2348 const interaction_const_t *ic = fr->ic;
2350 if (eDispCorr != edispcNO)
2352 for (i = 0; i < 2; i++)
2354 eners[i] = 0;
2355 virs[i] = 0;
2357 if ((ic->vdw_modifier == eintmodPOTSHIFT) ||
2358 (ic->vdw_modifier == eintmodPOTSWITCH) ||
2359 (ic->vdw_modifier == eintmodFORCESWITCH) ||
2360 (ic->vdwtype == evdwSHIFT) ||
2361 (ic->vdwtype == evdwSWITCH))
2363 if (((ic->vdw_modifier == eintmodPOTSWITCH) ||
2364 (ic->vdw_modifier == eintmodFORCESWITCH) ||
2365 (ic->vdwtype == evdwSWITCH)) && ic->rvdw_switch == 0)
2367 gmx_fatal(FARGS,
2368 "With dispersion correction rvdw-switch can not be zero "
2369 "for vdw-type = %s", evdw_names[ic->vdwtype]);
2372 /* TODO This code depends on the logic in tables.c that
2373 constructs the table layout, which should be made
2374 explicit in future cleanup. */
2375 GMX_ASSERT(fr->dispersionCorrectionTable->interaction == GMX_TABLE_INTERACTION_VDWREP_VDWDISP,
2376 "Dispersion-correction code needs a table with both repulsion and dispersion terms");
2377 scale = fr->dispersionCorrectionTable->scale;
2378 vdwtab = fr->dispersionCorrectionTable->data;
2380 /* Round the cut-offs to exact table values for precision */
2381 ri0 = static_cast<int>(floor(ic->rvdw_switch*scale));
2382 ri1 = static_cast<int>(ceil(ic->rvdw*scale));
2384 /* The code below has some support for handling force-switching, i.e.
2385 * when the force (instead of potential) is switched over a limited
2386 * region. This leads to a constant shift in the potential inside the
2387 * switching region, which we can handle by adding a constant energy
2388 * term in the force-switch case just like when we do potential-shift.
2390 * For now this is not enabled, but to keep the functionality in the
2391 * code we check separately for switch and shift. When we do force-switch
2392 * the shifting point is rvdw_switch, while it is the cutoff when we
2393 * have a classical potential-shift.
2395 * For a pure potential-shift the potential has a constant shift
2396 * all the way out to the cutoff, and that is it. For other forms
2397 * we need to calculate the constant shift up to the point where we
2398 * start modifying the potential.
2400 ri0 = (ic->vdw_modifier == eintmodPOTSHIFT) ? ri1 : ri0;
2402 r0 = ri0/scale;
2403 rc3 = r0*r0*r0;
2404 rc9 = rc3*rc3*rc3;
2406 if ((ic->vdw_modifier == eintmodFORCESWITCH) ||
2407 (ic->vdwtype == evdwSHIFT))
2409 /* Determine the constant energy shift below rvdw_switch.
2410 * Table has a scale factor since we have scaled it down to compensate
2411 * for scaling-up c6/c12 with the derivative factors to save flops in analytical kernels.
2413 fr->enershiftsix = (real)(-1.0/(rc3*rc3)) - 6.0*vdwtab[8*ri0];
2414 fr->enershifttwelve = (real)( 1.0/(rc9*rc3)) - 12.0*vdwtab[8*ri0 + 4];
2416 else if (ic->vdw_modifier == eintmodPOTSHIFT)
2418 fr->enershiftsix = (real)(-1.0/(rc3*rc3));
2419 fr->enershifttwelve = (real)( 1.0/(rc9*rc3));
2422 /* Add the constant part from 0 to rvdw_switch.
2423 * This integration from 0 to rvdw_switch overcounts the number
2424 * of interactions by 1, as it also counts the self interaction.
2425 * We will correct for this later.
2427 eners[0] += 4.0*M_PI*fr->enershiftsix*rc3/3.0;
2428 eners[1] += 4.0*M_PI*fr->enershifttwelve*rc3/3.0;
2430 /* Calculate the contribution in the range [r0,r1] where we
2431 * modify the potential. For a pure potential-shift modifier we will
2432 * have ri0==ri1, and there will not be any contribution here.
2434 for (i = 0; i < 2; i++)
2436 enersum = 0;
2437 virsum = 0;
2438 integrate_table(vdwtab, scale, (i == 0 ? 0 : 4), ri0, ri1, &enersum, &virsum);
2439 eners[i] -= enersum;
2440 virs[i] -= virsum;
2443 /* Alright: Above we compensated by REMOVING the parts outside r0
2444 * corresponding to the ideal VdW 1/r6 and 1/r12 potentials.
2446 * Regardless of whether r0 is the point where we start switching,
2447 * or the cutoff where we calculated the constant shift, we include
2448 * all the parts we are missing out to infinity from r0 by
2449 * calculating the analytical dispersion correction.
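/* Consistency check on the constants below (tails of the plain r^-6 and r^-12
 * terms): integral_rc^inf 4*pi*r^2*(-r^-6) dr = -4*pi/(3*rc^3) and
 * integral_rc^inf 4*pi*r^2*(r^-12) dr = 4*pi/(9*rc^9); the matching virial
 * integrals over 4*pi*r^3*dV/dr give 8*pi/rc^3 and -16*pi/(3*rc^9)
 * (GROMACS' extra factor 0.5 is applied further down). */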
2451 eners[0] += -4.0*M_PI/(3.0*rc3);
2452 eners[1] += 4.0*M_PI/(9.0*rc9);
2453 virs[0] += 8.0*M_PI/rc3;
2454 virs[1] += -16.0*M_PI/(3.0*rc9);
2456 else if (ic->vdwtype == evdwCUT ||
2457 EVDW_PME(ic->vdwtype) ||
2458 ic->vdwtype == evdwUSER)
2460 if (ic->vdwtype == evdwUSER && fplog)
2462 fprintf(fplog,
2463 "WARNING: using dispersion correction with user tables\n");
2466 /* Note that with LJ-PME, the dispersion correction is multiplied
2467 * by the difference between the actual C6 and the value of C6
2468 * that would produce the combination rule.
2469 * This means the normal energy and virial difference formulas
2470 * can be used here.
2473 rc3 = ic->rvdw*ic->rvdw*ic->rvdw;
2474 rc9 = rc3*rc3*rc3;
2475 /* Contribution beyond the cut-off */
2476 eners[0] += -4.0*M_PI/(3.0*rc3);
2477 eners[1] += 4.0*M_PI/(9.0*rc9);
2478 if (ic->vdw_modifier == eintmodPOTSHIFT)
2480 /* Contribution within the cut-off */
2481 eners[0] += -4.0*M_PI/(3.0*rc3);
2482 eners[1] += 4.0*M_PI/(3.0*rc9);
2484 /* Contribution beyond the cut-off */
2485 virs[0] += 8.0*M_PI/rc3;
2486 virs[1] += -16.0*M_PI/(3.0*rc9);
2488 else
2490 gmx_fatal(FARGS,
2491 "Dispersion correction is not implemented for vdw-type = %s",
2492 evdw_names[ic->vdwtype]);
2495 /* When we deprecate the group kernels the code below can go too */
2496 if (ic->vdwtype == evdwPME && fr->cutoff_scheme == ecutsGROUP)
2498 /* Calculate self-interaction coefficient (assuming that
2499 * the reciprocal-space contribution is constant in the
2500 * region that contributes to the self-interaction).
2502 fr->enershiftsix = gmx::power6(ic->ewaldcoeff_lj) / 6.0;
2504 eners[0] += -gmx::power3(std::sqrt(M_PI)*ic->ewaldcoeff_lj)/3.0;
2505 virs[0] += gmx::power3(std::sqrt(M_PI)*ic->ewaldcoeff_lj);
2508 fr->enerdiffsix = eners[0];
2509 fr->enerdifftwelve = eners[1];
2510 /* The 0.5 is due to the Gromacs definition of the virial */
2511 fr->virdiffsix = 0.5*virs[0];
2512 fr->virdifftwelve = 0.5*virs[1];
2516 void calc_dispcorr(t_inputrec *ir, t_forcerec *fr,
2517 matrix box, real lambda, tensor pres, tensor virial,
2518 real *prescorr, real *enercorr, real *dvdlcorr)
2520 gmx_bool bCorrAll, bCorrPres;
2521 real dvdlambda, invvol, dens, ninter, avcsix, avctwelve, enerdiff, svir = 0, spres = 0;
2522 int m;
2524 *prescorr = 0;
2525 *enercorr = 0;
2526 *dvdlcorr = 0;
2528 clear_mat(virial);
2529 clear_mat(pres);
2531 if (ir->eDispCorr != edispcNO)
2533 bCorrAll = (ir->eDispCorr == edispcAllEner ||
2534 ir->eDispCorr == edispcAllEnerPres);
2535 bCorrPres = (ir->eDispCorr == edispcEnerPres ||
2536 ir->eDispCorr == edispcAllEnerPres);
2538 invvol = 1/det(box);
2539 if (fr->n_tpi)
2541 /* Only correct for the interactions with the inserted molecule */
2542 dens = (fr->numAtomsForDispersionCorrection - fr->n_tpi)*invvol;
2543 ninter = fr->n_tpi;
2545 else
2547 dens = fr->numAtomsForDispersionCorrection*invvol;
2548 ninter = 0.5*fr->numAtomsForDispersionCorrection;
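/* Sanity sketch for this (non-TPI) branch: ninter*dens = N*N/(2*V), so with
 * the plain-cutoff value enerdiffsix = -4*pi/(3*rc^3) from calc_enervirdiff,
 * the leading term avcsix*ninter*(dens*enerdiffsix) used below reproduces the
 * textbook tail correction -(2/3)*pi*(N^2/V)*<C6>/rc^3. */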
2551 if (ir->efep == efepNO)
2553 avcsix = fr->avcsix[0];
2554 avctwelve = fr->avctwelve[0];
2556 else
2558 avcsix = (1 - lambda)*fr->avcsix[0] + lambda*fr->avcsix[1];
2559 avctwelve = (1 - lambda)*fr->avctwelve[0] + lambda*fr->avctwelve[1];
2562 enerdiff = ninter*(dens*fr->enerdiffsix - fr->enershiftsix);
2563 *enercorr += avcsix*enerdiff;
2564 dvdlambda = 0.0;
2565 if (ir->efep != efepNO)
2567 dvdlambda += (fr->avcsix[1] - fr->avcsix[0])*enerdiff;
2569 if (bCorrAll)
2571 enerdiff = ninter*(dens*fr->enerdifftwelve - fr->enershifttwelve);
2572 *enercorr += avctwelve*enerdiff;
2573 if (fr->efep != efepNO)
2575 dvdlambda += (fr->avctwelve[1] - fr->avctwelve[0])*enerdiff;
2579 if (bCorrPres)
2581 svir = ninter*dens*avcsix*fr->virdiffsix/3.0;
2582 if (ir->eDispCorr == edispcAllEnerPres)
2584 svir += ninter*dens*avctwelve*fr->virdifftwelve/3.0;
2586 /* The factor 2 is because of the Gromacs virial definition */
2587 spres = -2.0*invvol*svir*PRESFAC;
2589 for (m = 0; m < DIM; m++)
2591 virial[m][m] += svir;
2592 pres[m][m] += spres;
2594 *prescorr += spres;
2597 /* Can't currently control when it prints; for now, just print when debugging */
2598 if (debug)
2600 if (bCorrAll)
2602 fprintf(debug, "Long Range LJ corr.: <C6> %10.4e, <C12> %10.4e\n",
2603 avcsix, avctwelve);
2605 if (bCorrPres)
2607 fprintf(debug,
2608 "Long Range LJ corr.: Epot %10g, Pres: %10g, Vir: %10g\n",
2609 *enercorr, spres, svir);
2611 else
2613 fprintf(debug, "Long Range LJ corr.: Epot %10g\n", *enercorr);
2617 if (fr->efep != efepNO)
2619 *dvdlcorr += dvdlambda;
2624 static void low_do_pbc_mtop(FILE *fplog, int ePBC, const matrix box,
2625 const gmx_mtop_t *mtop, rvec x[],
2626 gmx_bool bFirst)
2628 t_graph *graph;
2629 int mb, as, mol;
2630 gmx_molblock_t *molb;
2632 if (bFirst && fplog)
2634 fprintf(fplog, "Removing pbc first time\n");
2637 snew(graph, 1);
2638 as = 0;
2639 for (mb = 0; mb < mtop->nmolblock; mb++)
2641 molb = &mtop->molblock[mb];
2642 if (molb->natoms_mol == 1 ||
2643 (!bFirst && mtop->moltype[molb->type].cgs.nr == 1))
2645 /* Just one atom or charge group in the molecule, no PBC required */
2646 as += molb->nmol*molb->natoms_mol;
2648 else
2650 /* Pass NULL instead of fplog to avoid graph prints for each molecule type */
2651 mk_graph_ilist(nullptr, mtop->moltype[molb->type].ilist,
2652 0, molb->natoms_mol, FALSE, FALSE, graph);
2654 for (mol = 0; mol < molb->nmol; mol++)
2656 mk_mshift(fplog, graph, ePBC, box, x+as);
2658 shift_self(graph, box, x+as);
2659 /* The molecule is whole now.
2660 * We don't need the second mk_mshift call as in do_pbc_first,
2661 * since we no longer need this graph.
2664 as += molb->natoms_mol;
2666 done_graph(graph);
2669 sfree(graph);
2672 void do_pbc_first_mtop(FILE *fplog, int ePBC, const matrix box,
2673 const gmx_mtop_t *mtop, rvec x[])
2675 low_do_pbc_mtop(fplog, ePBC, box, mtop, x, TRUE);
2678 void do_pbc_mtop(FILE *fplog, int ePBC, const matrix box,
2679 const gmx_mtop_t *mtop, rvec x[])
2681 low_do_pbc_mtop(fplog, ePBC, box, mtop, x, FALSE);
2684 void put_atoms_in_box_omp(int ePBC, const matrix box, gmx::ArrayRef<gmx::RVec> x)
2686 int t, nth;
2687 nth = gmx_omp_nthreads_get(emntDefault);
2689 #pragma omp parallel for num_threads(nth) schedule(static)
2690 for (t = 0; t < nth; t++)
2694 size_t natoms = x.size();
2695 size_t offset = (natoms*t )/nth;
2696 size_t len = (natoms*(t + 1))/nth - offset;
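/* Example of the contiguous split: natoms = 10, nth = 4 gives chunks of
 * 2, 3, 2, 3 atoms at offsets 0, 2, 5, 7; the integer arithmetic above
 * tiles [0, natoms) exactly with no overlap. */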
2697 put_atoms_in_box(ePBC, box, x.subArray(offset, len));
2699 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
2703 // TODO This can be cleaned up a lot, and moved back to runner.cpp
2704 void finish_run(FILE *fplog, const gmx::MDLogger &mdlog, t_commrec *cr,
2705 const t_inputrec *inputrec,
2706 t_nrnb nrnb[], gmx_wallcycle_t wcycle,
2707 gmx_walltime_accounting_t walltime_accounting,
2708 nonbonded_verlet_t *nbv,
2709 gmx_pme_t *pme,
2710 gmx_bool bWriteStat)
2712 t_nrnb *nrnb_tot = nullptr;
2713 double delta_t = 0;
2714 double nbfs = 0, mflop = 0;
2715 double elapsed_time,
2716 elapsed_time_over_all_ranks,
2717 elapsed_time_over_all_threads,
2718 elapsed_time_over_all_threads_over_all_ranks;
2719 /* Control whether it is valid to print a report. Only the
2720 simulation master may print, but it should not do so if the run
2721 terminated e.g. before a scheduled reset step. This is
2722 complicated by the fact that PME ranks are unaware of the
2723 reason why they were sent a pmerecvqxFINISH. To avoid
2724 communication deadlocks, we always do the communication for the
2725 report, even if we've decided not to write the report, because
2726 how long it takes to finish the run is not important when we've
2727 decided not to report on the simulation performance.
2729 Further, we only report performance for dynamical integrators,
2730 because those are the only ones for which we plan to
2731 consider doing any optimizations. */
2732 bool printReport = EI_DYNAMICS(inputrec->eI) && SIMMASTER(cr);
2734 if (printReport && !walltime_accounting_get_valid_finish(walltime_accounting))
2736 GMX_LOG(mdlog.warning).asParagraph().appendText("Simulation ended prematurely, no performance report will be written.");
2737 printReport = false;
2740 if (cr->nnodes > 1)
2742 snew(nrnb_tot, 1);
2743 #if GMX_MPI
2744 MPI_Allreduce(nrnb->n, nrnb_tot->n, eNRNB, MPI_DOUBLE, MPI_SUM,
2745 cr->mpi_comm_mysim);
2746 #endif
2748 else
2750 nrnb_tot = nrnb;
2753 elapsed_time = walltime_accounting_get_elapsed_time(walltime_accounting);
2754 elapsed_time_over_all_ranks = elapsed_time;
2755 elapsed_time_over_all_threads = walltime_accounting_get_elapsed_time_over_all_threads(walltime_accounting);
2756 elapsed_time_over_all_threads_over_all_ranks = elapsed_time_over_all_threads;
2757 #if GMX_MPI
2758 if (cr->nnodes > 1)
2760 /* reduce elapsed_time over all MPI ranks in the current simulation */
2761 MPI_Allreduce(&elapsed_time,
2762 &elapsed_time_over_all_ranks,
2763 1, MPI_DOUBLE, MPI_SUM,
2764 cr->mpi_comm_mysim);
2765 elapsed_time_over_all_ranks /= cr->nnodes;
2766 /* Reduce elapsed_time_over_all_threads over all MPI ranks in the
2767 * current simulation. */
2768 MPI_Allreduce(&elapsed_time_over_all_threads,
2769 &elapsed_time_over_all_threads_over_all_ranks,
2770 1, MPI_DOUBLE, MPI_SUM,
2771 cr->mpi_comm_mysim);
2773 #endif
2775 if (printReport)
2777 print_flop(fplog, nrnb_tot, &nbfs, &mflop);
2779 if (cr->nnodes > 1)
2781 sfree(nrnb_tot);
2784 if (thisRankHasDuty(cr, DUTY_PP) && DOMAINDECOMP(cr))
2786 print_dd_statistics(cr, inputrec, fplog);
2789 /* TODO Move the responsibility for any scaling by thread counts
2790 * to the code that handled the thread region, so that there's a
2791 * mechanism to keep cycle counting working during the transition
2792 * to task parallelism. */
2793 int nthreads_pp = gmx_omp_nthreads_get(emntNonbonded);
2794 int nthreads_pme = gmx_omp_nthreads_get(emntPME);
2795 wallcycle_scale_by_num_threads(wcycle, thisRankHasDuty(cr, DUTY_PME) && !thisRankHasDuty(cr, DUTY_PP), nthreads_pp, nthreads_pme);
2796 auto cycle_sum(wallcycle_sum(cr, wcycle));
2798 if (printReport)
2800 auto nbnxn_gpu_timings = use_GPU(nbv) ? nbnxn_gpu_get_timings(nbv->gpu_nbv) : nullptr;
2801 gmx_wallclock_gpu_pme_t pme_gpu_timings = {};
2802 if (pme_gpu_task_enabled(pme))
2804 pme_gpu_get_timings(pme, &pme_gpu_timings);
2806 wallcycle_print(fplog, mdlog, cr->nnodes, cr->npmenodes, nthreads_pp, nthreads_pme,
2807 elapsed_time_over_all_ranks,
2808 wcycle, cycle_sum,
2809 nbnxn_gpu_timings,
2810 &pme_gpu_timings);
2812 if (EI_DYNAMICS(inputrec->eI))
2814 delta_t = inputrec->delta_t;
2817 if (fplog)
2819 print_perf(fplog, elapsed_time_over_all_threads_over_all_ranks,
2820 elapsed_time_over_all_ranks,
2821 walltime_accounting_get_nsteps_done(walltime_accounting),
2822 delta_t, nbfs, mflop);
2824 if (bWriteStat)
2826 print_perf(stderr, elapsed_time_over_all_threads_over_all_ranks,
2827 elapsed_time_over_all_ranks,
2828 walltime_accounting_get_nsteps_done(walltime_accounting),
2829 delta_t, nbfs, mflop);
2834 extern void initialize_lambdas(FILE *fplog, t_inputrec *ir, int *fep_state, gmx::ArrayRef<real> lambda, double *lam0)
2836 /* this function works, but could probably use a logic rewrite to keep all the different
2837 types of efep straight. */
2839 if ((ir->efep == efepNO) && (ir->bSimTemp == FALSE))
2841 return;
2844 t_lambda *fep = ir->fepvals;
2845 *fep_state = fep->init_fep_state; /* this might overwrite the checkpoint
2846 if checkpoint is set -- a kludge is in for now
2847 to prevent this.*/
2849 for (int i = 0; i < efptNR; i++)
2851 /* overwrite lambda state with init_lambda for now for backwards compatibility */
2852 if (fep->init_lambda >= 0) /* if it's -1, it was never initialized */
2854 lambda[i] = fep->init_lambda;
2855 if (lam0)
2857 lam0[i] = lambda[i];
2860 else
2862 lambda[i] = fep->all_lambda[i][*fep_state];
2863 if (lam0)
2865 lam0[i] = lambda[i];
2869 if (ir->bSimTemp)
2871 /* need to rescale control temperatures to match current state */
2872 for (int i = 0; i < ir->opts.ngtc; i++)
2874 if (ir->opts.ref_t[i] > 0)
2876 ir->opts.ref_t[i] = ir->simtempvals->temperatures[*fep_state];
2881 /* Send to the log the information on the current lambdas */
2882 if (fplog != nullptr)
2884 fprintf(fplog, "Initial vector of lambda components:[ ");
2885 for (const auto &l : lambda)
2887 fprintf(fplog, "%10.4f ", l);
2889 fprintf(fplog, "]\n");
2891 return;
2895 void init_md(FILE *fplog,
2896 t_commrec *cr, gmx::IMDOutputProvider *outputProvider,
2897 t_inputrec *ir, const gmx_output_env_t *oenv,
2898 const MdrunOptions &mdrunOptions,
2899 double *t, double *t0,
2900 t_state *globalState, double *lam0,
2901 t_nrnb *nrnb, gmx_mtop_t *mtop,
2902 gmx_update_t **upd,
2903 int nfile, const t_filenm fnm[],
2904 gmx_mdoutf_t *outf, t_mdebin **mdebin,
2905 tensor force_vir, tensor shake_vir, rvec mu_tot,
2906 gmx_bool *bSimAnn, t_vcm **vcm,
2907 gmx_wallcycle_t wcycle)
2909 int i;
2911 /* Initial values */
2912 *t = *t0 = ir->init_t;
2914 *bSimAnn = FALSE;
2915 for (i = 0; i < ir->opts.ngtc; i++)
2917 /* set bSimAnn if any group is being annealed */
2918 if (ir->opts.annealing[i] != eannNO)
2920 *bSimAnn = TRUE;
2924 /* Initialize lambda variables */
2925 /* TODO: Clean up initialization of fep_state and lambda in t_state.
2926 * We currently need to call initialize_lambdas on non-master ranks
2927 * to initialize lam0.
2929 if (MASTER(cr))
2931 initialize_lambdas(fplog, ir, &globalState->fep_state, globalState->lambda, lam0);
2933 else
2935 int tmpFepState;
2936 std::array<real, efptNR> tmpLambda;
2937 initialize_lambdas(fplog, ir, &tmpFepState, tmpLambda, lam0);
2940 // TODO upd is never NULL in practice, but the analysers don't know that
2941 if (upd)
2943 *upd = init_update(ir);
2945 if (*bSimAnn)
2947 update_annealing_target_temp(ir, ir->init_t, upd ? *upd : nullptr);
2950 if (vcm != nullptr)
2952 *vcm = init_vcm(fplog, &mtop->groups, ir);
2955 if (EI_DYNAMICS(ir->eI) && !mdrunOptions.continuationOptions.appendFiles)
2957 if (ir->etc == etcBERENDSEN)
2959 please_cite(fplog, "Berendsen84a");
2961 if (ir->etc == etcVRESCALE)
2963 please_cite(fplog, "Bussi2007a");
2965 if (ir->eI == eiSD1)
2967 please_cite(fplog, "Goga2012");
2970 init_nrnb(nrnb);
2972 if (nfile != -1)
2974 *outf = init_mdoutf(fplog, nfile, fnm, mdrunOptions, cr, outputProvider, ir, mtop, oenv, wcycle);
2976 *mdebin = init_mdebin(mdrunOptions.continuationOptions.appendFiles ? nullptr : mdoutf_get_fp_ene(*outf),
2977 mtop, ir, mdoutf_get_fp_dhdl(*outf));
2980 /* Initiate variables */
2981 clear_mat(force_vir);
2982 clear_mat(shake_vir);
2983 clear_rvec(mu_tot);