Update Awh initialization and lifetime management
[gromacs.git] / src / gromacs / mdlib / sim_util.cpp
blob 025d5339eca4b178d699953e61c1cf89da901017
1 /*
2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5 * Copyright (c) 2001-2004, The GROMACS development team.
6 * Copyright (c) 2013,2014,2015,2016,2017,2018, by the GROMACS development team, led by
7 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
8 * and including many others, as listed in the AUTHORS file in the
9 * top-level source directory and at http://www.gromacs.org.
11 * GROMACS is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public License
13 * as published by the Free Software Foundation; either version 2.1
14 * of the License, or (at your option) any later version.
16 * GROMACS is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * Lesser General Public License for more details.
21 * You should have received a copy of the GNU Lesser General Public
22 * License along with GROMACS; if not, see
23 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
24 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
26 * If you want to redistribute modifications to GROMACS, please
27 * consider that scientific software is very special. Version
28 * control is crucial - bugs must be traceable. We will be happy to
29 * consider code for inclusion in the official distribution, but
30 * derived work must not be called official GROMACS. Details are found
31 * in the README & COPYING files - if they are missing, get the
32 * official version at http://www.gromacs.org.
34 * To help us fund GROMACS development, we humbly ask that you cite
35 * the research papers on the package. Check out http://www.gromacs.org.
37 #include "gmxpre.h"
39 #include "sim_util.h"
41 #include "config.h"
43 #include <stdio.h>
44 #include <string.h>
46 #include <cmath>
47 #include <cstdint>
49 #include <array>
51 #include "gromacs/awh/awh.h"
52 #include "gromacs/domdec/dlbtiming.h"
53 #include "gromacs/domdec/domdec.h"
54 #include "gromacs/domdec/domdec_struct.h"
55 #include "gromacs/essentialdynamics/edsam.h"
56 #include "gromacs/ewald/pme.h"
57 #include "gromacs/gmxlib/chargegroup.h"
58 #include "gromacs/gmxlib/network.h"
59 #include "gromacs/gmxlib/nrnb.h"
60 #include "gromacs/gmxlib/nonbonded/nb_free_energy.h"
61 #include "gromacs/gmxlib/nonbonded/nb_kernel.h"
62 #include "gromacs/gmxlib/nonbonded/nonbonded.h"
63 #include "gromacs/gpu_utils/gpu_utils.h"
64 #include "gromacs/imd/imd.h"
65 #include "gromacs/listed-forces/bonded.h"
66 #include "gromacs/listed-forces/disre.h"
67 #include "gromacs/listed-forces/orires.h"
68 #include "gromacs/math/functions.h"
69 #include "gromacs/math/units.h"
70 #include "gromacs/math/vec.h"
71 #include "gromacs/math/vecdump.h"
72 #include "gromacs/mdlib/calcmu.h"
73 #include "gromacs/mdlib/calcvir.h"
74 #include "gromacs/mdlib/constr.h"
75 #include "gromacs/mdlib/force.h"
76 #include "gromacs/mdlib/forcerec.h"
77 #include "gromacs/mdlib/gmx_omp_nthreads.h"
78 #include "gromacs/mdlib/mdrun.h"
79 #include "gromacs/mdlib/nb_verlet.h"
80 #include "gromacs/mdlib/nbnxn_atomdata.h"
81 #include "gromacs/mdlib/nbnxn_gpu_data_mgmt.h"
82 #include "gromacs/mdlib/nbnxn_grid.h"
83 #include "gromacs/mdlib/nbnxn_search.h"
84 #include "gromacs/mdlib/qmmm.h"
85 #include "gromacs/mdlib/update.h"
86 #include "gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_gpu_ref.h"
87 #include "gromacs/mdtypes/commrec.h"
88 #include "gromacs/mdtypes/forceoutput.h"
89 #include "gromacs/mdtypes/iforceprovider.h"
90 #include "gromacs/mdtypes/inputrec.h"
91 #include "gromacs/mdtypes/md_enums.h"
92 #include "gromacs/mdtypes/state.h"
93 #include "gromacs/pbcutil/ishift.h"
94 #include "gromacs/pbcutil/mshift.h"
95 #include "gromacs/pbcutil/pbc.h"
96 #include "gromacs/pulling/pull.h"
97 #include "gromacs/pulling/pull_rotation.h"
98 #include "gromacs/timing/cyclecounter.h"
99 #include "gromacs/timing/gpu_timing.h"
100 #include "gromacs/timing/wallcycle.h"
101 #include "gromacs/timing/wallcyclereporting.h"
102 #include "gromacs/timing/walltime_accounting.h"
103 #include "gromacs/topology/topology.h"
104 #include "gromacs/utility/arrayref.h"
105 #include "gromacs/utility/basedefinitions.h"
106 #include "gromacs/utility/cstringutil.h"
107 #include "gromacs/utility/exceptions.h"
108 #include "gromacs/utility/fatalerror.h"
109 #include "gromacs/utility/gmxassert.h"
110 #include "gromacs/utility/gmxmpi.h"
111 #include "gromacs/utility/logger.h"
112 #include "gromacs/utility/pleasecite.h"
113 #include "gromacs/utility/smalloc.h"
114 #include "gromacs/utility/sysinfo.h"
116 #include "nbnxn_gpu.h"
117 #include "nbnxn_kernels/nbnxn_kernel_cpu.h"
118 #include "nbnxn_kernels/nbnxn_kernel_prune.h"
120 // TODO: this environment variable allows us to verify before release
121 // that on less common architectures the total cost of polling is not larger than
122 // a blocking wait (so polling does not introduce overhead when the static
123 // PME-first ordering would suffice).
124 static const bool c_disableAlternatingWait = (getenv("GMX_DISABLE_ALTERNATING_GPU_WAIT") != nullptr);
127 void print_time(FILE *out,
128 gmx_walltime_accounting_t walltime_accounting,
129 gmx_int64_t step,
130 t_inputrec *ir,
131 const t_commrec *cr)
133 time_t finish;
134 char timebuf[STRLEN];
135 double dt, elapsed_seconds, time_per_step;
136 char buf[48];
138 #if !GMX_THREAD_MPI
139 if (!PAR(cr))
140 #endif
142 fprintf(out, "\r");
144 fprintf(out, "step %s", gmx_step_str(step, buf));
145 fflush(out);
147 if ((step >= ir->nstlist))
149 double seconds_since_epoch = gmx_gettime();
150 elapsed_seconds = seconds_since_epoch - walltime_accounting_get_start_time_stamp(walltime_accounting);
151 time_per_step = elapsed_seconds/(step - ir->init_step + 1);
152 dt = (ir->nsteps + ir->init_step - step) * time_per_step;
154 if (ir->nsteps >= 0)
156 if (dt >= 300)
158 finish = (time_t) (seconds_since_epoch + dt);
159 gmx_ctime_r(&finish, timebuf, STRLEN);
160 sprintf(buf, "%s", timebuf);
161 buf[strlen(buf)-1] = '\0';
162 fprintf(out, ", will finish %s", buf);
164 else
166 fprintf(out, ", remaining wall clock time: %5d s ", (int)dt);
169 else
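/* delta_t is in ps: delta_t/1000 is ns per step, and multiplying by 24*60*60 s/day and dividing by the wall-clock seconds per step gives ns/day */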
171 fprintf(out, " performance: %.1f ns/day ",
172 ir->delta_t/1000*24*60*60/time_per_step);
175 #if !GMX_THREAD_MPI
176 if (PAR(cr))
178 fprintf(out, "\n");
180 #else
181 GMX_UNUSED_VALUE(cr);
182 #endif
184 fflush(out);
187 void print_date_and_time(FILE *fplog, int nodeid, const char *title,
188 double the_time)
190 char time_string[STRLEN];
192 if (!fplog)
194 return;
198 int i;
199 char timebuf[STRLEN];
200 time_t temp_time = (time_t) the_time;
202 gmx_ctime_r(&temp_time, timebuf, STRLEN);
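/* Copy only the printable part; gmx_ctime_r output ends with a newline, which is dropped here */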
203 for (i = 0; timebuf[i] >= ' '; i++)
205 time_string[i] = timebuf[i];
207 time_string[i] = '\0';
210 fprintf(fplog, "%s on rank %d %s\n", title, nodeid, time_string);
213 void print_start(FILE *fplog, const t_commrec *cr,
214 gmx_walltime_accounting_t walltime_accounting,
215 const char *name)
217 char buf[STRLEN];
219 sprintf(buf, "Started %s", name);
220 print_date_and_time(fplog, cr->nodeid, buf,
221 walltime_accounting_get_start_time_stamp(walltime_accounting));
224 static void sum_forces(rvec f[], gmx::ArrayRef<const gmx::RVec> forceToAdd)
226 const int end = forceToAdd.size();
228 // cppcheck-suppress unreadVariable
229 int gmx_unused nt = gmx_omp_nthreads_get(emntDefault);
230 #pragma omp parallel for num_threads(nt) schedule(static)
231 for (int i = 0; i < end; i++)
233 rvec_inc(f[i], forceToAdd[i]);
237 static void pme_gpu_reduce_outputs(gmx_wallcycle_t wcycle,
238 ForceWithVirial *forceWithVirial,
239 ArrayRef<const gmx::RVec> pmeForces,
240 gmx_enerdata_t *enerd,
241 const tensor vir_Q,
242 real Vlr_q)
244 wallcycle_start(wcycle, ewcPME_GPU_F_REDUCTION);
245 GMX_ASSERT(forceWithVirial, "Invalid force pointer");
246 forceWithVirial->addVirialContribution(vir_Q);
247 enerd->term[F_COUL_RECIP] += Vlr_q;
248 sum_forces(as_rvec_array(forceWithVirial->force_.data()), pmeForces);
249 wallcycle_stop(wcycle, ewcPME_GPU_F_REDUCTION);
252 static void calc_virial(int start, int homenr, rvec x[], rvec f[],
253 tensor vir_part, t_graph *graph, matrix box,
254 t_nrnb *nrnb, const t_forcerec *fr, int ePBC)
256 int i;
258 /* The short-range virial from surrounding boxes */
259 calc_vir(SHIFTS, fr->shift_vec, fr->fshift, vir_part, ePBC == epbcSCREW, box);
260 inc_nrnb(nrnb, eNR_VIRIAL, SHIFTS);
262 /* Calculate partial virial, for local atoms only, based on short range.
263 * Total virial is computed in global_stat, called from do_md
265 f_calc_vir(start, start+homenr, x, f, vir_part, graph, box);
266 inc_nrnb(nrnb, eNR_VIRIAL, homenr);
268 /* Add wall contribution */
269 for (i = 0; i < DIM; i++)
271 vir_part[i][ZZ] += fr->vir_wall_z[i];
274 if (debug)
276 pr_rvecs(debug, 0, "vir_part", vir_part, DIM);
280 static void pull_potential_wrapper(const t_commrec *cr,
281 const t_inputrec *ir,
282 matrix box, ArrayRef<const RVec> x,
283 ForceWithVirial *force,
284 const t_mdatoms *mdatoms,
285 gmx_enerdata_t *enerd,
286 real *lambda,
287 double t,
288 gmx_wallcycle_t wcycle)
290 t_pbc pbc;
291 real dvdl;
293 /* Calculate the center of mass forces, this requires communication,
294 * which is why pull_potential is called close to other communication.
296 wallcycle_start(wcycle, ewcPULLPOT);
297 set_pbc(&pbc, ir->ePBC, box);
298 dvdl = 0;
299 enerd->term[F_COM_PULL] +=
300 pull_potential(ir->pull_work, mdatoms, &pbc,
301 cr, t, lambda[efptRESTRAINT], as_rvec_array(x.data()), force, &dvdl);
302 enerd->dvdl_lin[efptRESTRAINT] += dvdl;
303 wallcycle_stop(wcycle, ewcPULLPOT);
306 static void pme_receive_force_ener(const t_commrec *cr,
307 ForceWithVirial *forceWithVirial,
308 gmx_enerdata_t *enerd,
309 gmx_wallcycle_t wcycle)
311 real e_q, e_lj, dvdl_q, dvdl_lj;
312 float cycles_ppdpme, cycles_seppme;
314 cycles_ppdpme = wallcycle_stop(wcycle, ewcPPDURINGPME);
315 dd_cycles_add(cr->dd, cycles_ppdpme, ddCyclPPduringPME);
317 /* In case of node-splitting, the PP nodes receive the long-range
318 * forces, virial and energy from the PME nodes here.
320 wallcycle_start(wcycle, ewcPP_PMEWAITRECVF);
321 dvdl_q = 0;
322 dvdl_lj = 0;
323 gmx_pme_receive_f(cr, forceWithVirial, &e_q, &e_lj, &dvdl_q, &dvdl_lj,
324 &cycles_seppme);
325 enerd->term[F_COUL_RECIP] += e_q;
326 enerd->term[F_LJ_RECIP] += e_lj;
327 enerd->dvdl_lin[efptCOUL] += dvdl_q;
328 enerd->dvdl_lin[efptVDW] += dvdl_lj;
330 if (wcycle)
332 dd_cycles_add(cr->dd, cycles_seppme, ddCyclPME);
334 wallcycle_stop(wcycle, ewcPP_PMEWAITRECVF);
337 static void print_large_forces(FILE *fp,
338 const t_mdatoms *md,
339 const t_commrec *cr,
340 gmx_int64_t step,
341 real forceTolerance,
342 const rvec *x,
343 const rvec *f)
345 real force2Tolerance = gmx::square(forceTolerance);
346 std::uintmax_t numNonFinite = 0;
347 for (int i = 0; i < md->homenr; i++)
349 real force2 = norm2(f[i]);
350 bool nonFinite = !std::isfinite(force2);
351 if (force2 >= force2Tolerance || nonFinite)
353 fprintf(fp, "step %" GMX_PRId64 " atom %6d x %8.3f %8.3f %8.3f force %12.5e\n",
354 step,
355 ddglatnr(cr->dd, i), x[i][XX], x[i][YY], x[i][ZZ], std::sqrt(force2));
357 if (nonFinite)
359 numNonFinite++;
362 if (numNonFinite > 0)
364 /* Note that with MPI this fatal call on one rank might interrupt
365 * the printing on other ranks. But we can only avoid that with
366 * an expensive MPI barrier that we would need at each step.
368 gmx_fatal(FARGS, "At step %" GMX_PRId64 " detected non-finite forces on %ju atoms", step, numNonFinite);
372 static void post_process_forces(const t_commrec *cr,
373 gmx_int64_t step,
374 t_nrnb *nrnb,
375 gmx_wallcycle_t wcycle,
376 const gmx_localtop_t *top,
377 matrix box,
378 rvec x[],
379 rvec f[],
380 ForceWithVirial *forceWithVirial,
381 tensor vir_force,
382 const t_mdatoms *mdatoms,
383 t_graph *graph,
384 t_forcerec *fr,
385 const gmx_vsite_t *vsite,
386 int flags)
388 if (fr->haveDirectVirialContributions)
390 rvec *fDirectVir = as_rvec_array(forceWithVirial->force_.data());
392 if (vsite)
394 /* Spread the mesh force on virtual sites to the other particles...
395 * This is parallelized. MPI communication is performed
396 * if the constructing atoms aren't local.
398 matrix virial = { { 0 } };
399 spread_vsite_f(vsite, x, fDirectVir, nullptr,
400 (flags & GMX_FORCE_VIRIAL), virial,
401 nrnb,
402 &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr, wcycle);
403 forceWithVirial->addVirialContribution(virial);
406 if (flags & GMX_FORCE_VIRIAL)
408 /* Now add the forces, this is local */
409 sum_forces(f, forceWithVirial->force_);
411 /* Add the direct virial contributions */
412 GMX_ASSERT(forceWithVirial->computeVirial_, "forceWithVirial should request virial computation when we request the virial");
413 m_add(vir_force, forceWithVirial->getVirial(), vir_force);
415 if (debug)
417 pr_rvecs(debug, 0, "vir_force", vir_force, DIM);
422 if (fr->print_force >= 0)
424 print_large_forces(stderr, mdatoms, cr, step, fr->print_force, x, f);
428 static void do_nb_verlet(t_forcerec *fr,
429 interaction_const_t *ic,
430 gmx_enerdata_t *enerd,
431 int flags, int ilocality,
432 int clearF,
433 gmx_int64_t step,
434 t_nrnb *nrnb,
435 gmx_wallcycle_t wcycle)
437 if (!(flags & GMX_FORCE_NONBONDED))
439 /* skip non-bonded calculation */
440 return;
443 nonbonded_verlet_t *nbv = fr->nbv;
444 nonbonded_verlet_group_t *nbvg = &nbv->grp[ilocality];
446 /* GPU kernel launch overhead is already timed separately */
447 if (fr->cutoff_scheme != ecutsVERLET)
449 gmx_incons("Invalid cut-off scheme passed!");
452 bool bUsingGpuKernels = (nbvg->kernel_type == nbnxnk8x8x8_GPU);
454 if (!bUsingGpuKernels)
456 /* When dynamic pair-list pruning is requested, we need to prune
457 * at nstlistPrune steps.
459 if (nbv->listParams->useDynamicPruning &&
460 (step - nbvg->nbl_lists.outerListCreationStep) % nbv->listParams->nstlistPrune == 0)
462 /* Prune the pair-list beyond fr->ic->rlistPrune using
463 * the current coordinates of the atoms.
465 wallcycle_sub_start(wcycle, ewcsNONBONDED_PRUNING);
466 nbnxn_kernel_cpu_prune(nbvg, nbv->nbat, fr->shift_vec, nbv->listParams->rlistInner);
467 wallcycle_sub_stop(wcycle, ewcsNONBONDED_PRUNING);
470 wallcycle_sub_start(wcycle, ewcsNONBONDED);
473 switch (nbvg->kernel_type)
475 case nbnxnk4x4_PlainC:
476 case nbnxnk4xN_SIMD_4xN:
477 case nbnxnk4xN_SIMD_2xNN:
478 nbnxn_kernel_cpu(nbvg,
479 nbv->nbat,
481 fr->shift_vec,
482 flags,
483 clearF,
484 fr->fshift[0],
485 enerd->grpp.ener[egCOULSR],
486 fr->bBHAM ?
487 enerd->grpp.ener[egBHAMSR] :
488 enerd->grpp.ener[egLJSR]);
489 break;
491 case nbnxnk8x8x8_GPU:
492 nbnxn_gpu_launch_kernel(nbv->gpu_nbv, nbv->nbat, flags, ilocality);
493 break;
495 case nbnxnk8x8x8_PlainC:
496 nbnxn_kernel_gpu_ref(nbvg->nbl_lists.nbl[0],
497 nbv->nbat, ic,
498 fr->shift_vec,
499 flags,
500 clearF,
501 nbv->nbat->out[0].f,
502 fr->fshift[0],
503 enerd->grpp.ener[egCOULSR],
504 fr->bBHAM ?
505 enerd->grpp.ener[egBHAMSR] :
506 enerd->grpp.ener[egLJSR]);
507 break;
509 default:
510 GMX_RELEASE_ASSERT(false, "Invalid nonbonded kernel type passed!");
513 if (!bUsingGpuKernels)
515 wallcycle_sub_stop(wcycle, ewcsNONBONDED);
518 int enr_nbnxn_kernel_ljc, enr_nbnxn_kernel_lj;
519 if (EEL_RF(ic->eeltype) || ic->eeltype == eelCUT)
521 enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_RF;
523 else if ((!bUsingGpuKernels && nbvg->ewald_excl == ewaldexclAnalytical) ||
524 (bUsingGpuKernels && nbnxn_gpu_is_kernel_ewald_analytical(nbv->gpu_nbv)))
526 enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_EWALD;
528 else
530 enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_TAB;
532 enr_nbnxn_kernel_lj = eNR_NBNXN_LJ;
533 if (flags & GMX_FORCE_ENERGY)
535 /* In eNR_??? the nbnxn F+E kernels are always the F kernel + 1 */
536 enr_nbnxn_kernel_ljc += 1;
537 enr_nbnxn_kernel_lj += 1;
540 inc_nrnb(nrnb, enr_nbnxn_kernel_ljc,
541 nbvg->nbl_lists.natpair_ljq);
542 inc_nrnb(nrnb, enr_nbnxn_kernel_lj,
543 nbvg->nbl_lists.natpair_lj);
544 /* The Coulomb-only kernels are offset -eNR_NBNXN_LJ_RF+eNR_NBNXN_RF */
545 inc_nrnb(nrnb, enr_nbnxn_kernel_ljc-eNR_NBNXN_LJ_RF+eNR_NBNXN_RF,
546 nbvg->nbl_lists.natpair_q);
548 if (ic->vdw_modifier == eintmodFORCESWITCH)
550 /* We add up the switch cost separately */
551 inc_nrnb(nrnb, eNR_NBNXN_ADD_LJ_FSW+((flags & GMX_FORCE_ENERGY) ? 1 : 0),
552 nbvg->nbl_lists.natpair_ljq + nbvg->nbl_lists.natpair_lj);
554 if (ic->vdw_modifier == eintmodPOTSWITCH)
556 /* We add up the switch cost separately */
557 inc_nrnb(nrnb, eNR_NBNXN_ADD_LJ_PSW+((flags & GMX_FORCE_ENERGY) ? 1 : 0),
558 nbvg->nbl_lists.natpair_ljq + nbvg->nbl_lists.natpair_lj);
560 if (ic->vdwtype == evdwPME)
562 /* We add up the LJ Ewald cost separately */
563 inc_nrnb(nrnb, eNR_NBNXN_ADD_LJ_EWALD+((flags & GMX_FORCE_ENERGY) ? 1 : 0),
564 nbvg->nbl_lists.natpair_ljq + nbvg->nbl_lists.natpair_lj);
568 static void do_nb_verlet_fep(nbnxn_pairlist_set_t *nbl_lists,
569 t_forcerec *fr,
570 rvec x[],
571 rvec f[],
572 const t_mdatoms *mdatoms,
573 t_lambda *fepvals,
574 real *lambda,
575 gmx_enerdata_t *enerd,
576 int flags,
577 t_nrnb *nrnb,
578 gmx_wallcycle_t wcycle)
580 int donb_flags;
581 nb_kernel_data_t kernel_data;
582 real lam_i[efptNR];
583 real dvdl_nb[efptNR];
584 int th;
585 int i, j;
587 donb_flags = 0;
588 /* Add short-range interactions */
589 donb_flags |= GMX_NONBONDED_DO_SR;
591 /* Currently all group scheme kernels always calculate (shift-)forces */
592 if (flags & GMX_FORCE_FORCES)
594 donb_flags |= GMX_NONBONDED_DO_FORCE;
596 if (flags & GMX_FORCE_VIRIAL)
598 donb_flags |= GMX_NONBONDED_DO_SHIFTFORCE;
600 if (flags & GMX_FORCE_ENERGY)
602 donb_flags |= GMX_NONBONDED_DO_POTENTIAL;
605 kernel_data.flags = donb_flags;
606 kernel_data.lambda = lambda;
607 kernel_data.dvdl = dvdl_nb;
609 kernel_data.energygrp_elec = enerd->grpp.ener[egCOULSR];
610 kernel_data.energygrp_vdw = enerd->grpp.ener[egLJSR];
612 /* reset free energy components */
613 for (i = 0; i < efptNR; i++)
615 dvdl_nb[i] = 0;
618 GMX_ASSERT(gmx_omp_nthreads_get(emntNonbonded) == nbl_lists->nnbl, "Number of lists should be same as number of NB threads");
620 wallcycle_sub_start(wcycle, ewcsNONBONDED);
621 #pragma omp parallel for schedule(static) num_threads(nbl_lists->nnbl)
622 for (th = 0; th < nbl_lists->nnbl; th++)
626 gmx_nb_free_energy_kernel(nbl_lists->nbl_fep[th],
627 x, f, fr, mdatoms, &kernel_data, nrnb);
629 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
632 if (fepvals->sc_alpha != 0)
634 enerd->dvdl_nonlin[efptVDW] += dvdl_nb[efptVDW];
635 enerd->dvdl_nonlin[efptCOUL] += dvdl_nb[efptCOUL];
637 else
639 enerd->dvdl_lin[efptVDW] += dvdl_nb[efptVDW];
640 enerd->dvdl_lin[efptCOUL] += dvdl_nb[efptCOUL];
643 /* If we do foreign lambda and we have soft-core interactions
644 * we have to recalculate the (non-linear) energy contributions.
646 if (fepvals->n_lambda > 0 && (flags & GMX_FORCE_DHDL) && fepvals->sc_alpha != 0)
648 kernel_data.flags = (donb_flags & ~(GMX_NONBONDED_DO_FORCE | GMX_NONBONDED_DO_SHIFTFORCE)) | GMX_NONBONDED_DO_FOREIGNLAMBDA;
649 kernel_data.lambda = lam_i;
650 kernel_data.energygrp_elec = enerd->foreign_grpp.ener[egCOULSR];
651 kernel_data.energygrp_vdw = enerd->foreign_grpp.ener[egLJSR];
652 /* Note that we add to kernel_data.dvdl, but ignore the result */
654 for (i = 0; i < enerd->n_lambda; i++)
656 for (j = 0; j < efptNR; j++)
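/* i == 0 is the current lambda state; i > 0 indexes the foreign lambda values */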
658 lam_i[j] = (i == 0 ? lambda[j] : fepvals->all_lambda[j][i-1]);
660 reset_foreign_enerdata(enerd);
661 #pragma omp parallel for schedule(static) num_threads(nbl_lists->nnbl)
662 for (th = 0; th < nbl_lists->nnbl; th++)
666 gmx_nb_free_energy_kernel(nbl_lists->nbl_fep[th],
667 x, f, fr, mdatoms, &kernel_data, nrnb);
669 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
672 sum_epot(&(enerd->foreign_grpp), enerd->foreign_term);
673 enerd->enerpart_lambda[i] += enerd->foreign_term[F_EPOT];
677 wallcycle_sub_stop(wcycle, ewcsNONBONDED);
680 gmx_bool use_GPU(const nonbonded_verlet_t *nbv)
682 return nbv != nullptr && nbv->bUseGPU;
685 static inline void clear_rvecs_omp(int n, rvec v[])
687 int nth = gmx_omp_nthreads_get_simple_rvec_task(emntDefault, n);
689 /* Note that we would like to avoid this conditional by putting it
690 * into the omp pragma instead, but then we still take the full
691 * omp parallel for overhead (at least with gcc5).
693 if (nth == 1)
695 for (int i = 0; i < n; i++)
697 clear_rvec(v[i]);
700 else
702 #pragma omp parallel for num_threads(nth) schedule(static)
703 for (int i = 0; i < n; i++)
705 clear_rvec(v[i]);
710 /*! \brief Return an estimate of the average kinetic energy or 0 when unreliable
712 * \param groupOptions Group options, containing T-coupling options
714 static real averageKineticEnergyEstimate(const t_grpopts &groupOptions)
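/* Equipartition gives kB*T_ref/2 per temperature-coupled degree of freedom; the coupled-group sum is scaled up below to also account for the uncoupled degrees of freedom */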
716 real nrdfCoupled = 0;
717 real nrdfUncoupled = 0;
718 real kineticEnergy = 0;
719 for (int g = 0; g < groupOptions.ngtc; g++)
721 if (groupOptions.tau_t[g] >= 0)
723 nrdfCoupled += groupOptions.nrdf[g];
724 kineticEnergy += groupOptions.nrdf[g]*0.5*groupOptions.ref_t[g]*BOLTZ;
726 else
728 nrdfUncoupled += groupOptions.nrdf[g];
732 /* This conditional with > also catches nrdf=0 */
733 if (nrdfCoupled > nrdfUncoupled)
735 return kineticEnergy*(nrdfCoupled + nrdfUncoupled)/nrdfCoupled;
737 else
739 return 0;
743 /*! \brief This routine checks that the potential energy is finite.
745 * Always checks that the potential energy is finite. If step equals
746 * inputrec.init_step also checks that the magnitude of the potential energy
747 * is reasonable. Terminates with a fatal error when a check fails.
748 * Note that passing this check does not guarantee finite forces,
749 * since those use slightly different arithmetic. But in most cases
750 * there is just a narrow coordinate range where forces are not finite
751 * and energies are finite.
753 * \param[in] step The step number, used for checking and printing
754 * \param[in] enerd The energy data; the non-bonded group energies need to be added to enerd.term[F_EPOT] before calling this routine
755 * \param[in] inputrec The input record
757 static void checkPotentialEnergyValidity(gmx_int64_t step,
758 const gmx_enerdata_t &enerd,
759 const t_inputrec &inputrec)
761 /* Threshold valid for comparing absolute potential energy against
762 * the kinetic energy. Normally one should not consider absolute
763 * potential energy values, but with a factor of one million
764 * we should never get false positives.
766 constexpr real c_thresholdFactor = 1e6;
768 bool energyIsNotFinite = !std::isfinite(enerd.term[F_EPOT]);
769 real averageKineticEnergy = 0;
770 /* We only check for large potential energy at the initial step,
771 * because that is by far the most likely step for this to occur
772 * and because computing the average kinetic energy is not free.
773 * Note: nstcalcenergy >> 1 often does not allow catching large energies
774 * before they become NaN.
776 if (step == inputrec.init_step && EI_DYNAMICS(inputrec.eI))
778 averageKineticEnergy = averageKineticEnergyEstimate(inputrec.opts);
781 if (energyIsNotFinite || (averageKineticEnergy > 0 &&
782 enerd.term[F_EPOT] > c_thresholdFactor*averageKineticEnergy))
784 gmx_fatal(FARGS, "Step %" GMX_PRId64 ": The total potential energy is %g, which is %s. The LJ and electrostatic contributions to the energy are %g and %g, respectively. A %s potential energy can be caused by overlapping interactions in bonded interactions or very large%s coordinate values. Usually this is caused by a badly- or non-equilibrated initial configuration, incorrect interactions or parameters in the topology.",
785 step,
786 enerd.term[F_EPOT],
787 energyIsNotFinite ? "not finite" : "extremely high",
788 enerd.term[F_LJ],
789 enerd.term[F_COUL_SR],
790 energyIsNotFinite ? "non-finite" : "very high",
791 energyIsNotFinite ? " or NaN" : "");
795 /*! \brief Compute forces and/or energies for special algorithms
797 * The intention is to collect all calls to algorithms that compute
798 * forces on local atoms only and that do not contribute to the local
799 * virial sum (but add their virial contribution separately).
800 * Eventually these should likely all become ForceProviders.
801 * Within this function the intention is to have algorithms that do
802 * global communication at the end, so global barriers within the MD loop
803 * are as close together as possible.
805 * \param[in] fplog The log file
806 * \param[in] cr The communication record
807 * \param[in] inputrec The input record
808 * \param[in] awh The Awh module (nullptr if none in use).
809 * \param[in] step The current MD step
810 * \param[in] t The current time
811 * \param[in,out] wcycle Wallcycle accounting struct
812 * \param[in,out] forceProviders Pointer to a list of force providers
813 * \param[in] box The unit cell
814 * \param[in] x The coordinates
815 * \param[in] mdatoms Per atom properties
816 * \param[in] lambda Array of free-energy lambda values
817 * \param[in] forceFlags Flags that tell whether we should compute forces/energies/virial
818 * \param[in,out] forceWithVirial Force and virial buffers
819 * \param[in,out] enerd Energy buffer
820 * \param[in,out] ed Essential dynamics pointer
821 * \param[in] bNS Tells if we did neighbor searching this step, used for ED sampling
823 * \todo Remove bNS, which is used incorrectly.
824 * \todo Convert all other algorithms called here to ForceProviders.
826 static void
827 computeSpecialForces(FILE *fplog,
828 const t_commrec *cr,
829 const t_inputrec *inputrec,
830 gmx::Awh *awh,
831 gmx_int64_t step,
832 double t,
833 gmx_wallcycle_t wcycle,
834 ForceProviders *forceProviders,
835 matrix box,
836 ArrayRef<const RVec> x,
837 const t_mdatoms *mdatoms,
838 real *lambda,
839 int forceFlags,
840 ForceWithVirial *forceWithVirial,
841 gmx_enerdata_t *enerd,
842 const gmx_edsam *ed,
843 gmx_bool bNS)
845 const bool computeForces = (forceFlags & GMX_FORCE_FORCES);
847 /* NOTE: Currently all ForceProviders only provide forces.
848 * When they also provide energies, remove this conditional.
850 if (computeForces)
852 ForceProviderInput forceProviderInput(x, *mdatoms, t, box, *cr);
853 ForceProviderOutput forceProviderOutput(forceWithVirial, enerd);
855 /* Collect forces from modules */
856 forceProviders->calculateForces(forceProviderInput, &forceProviderOutput);
859 if (inputrec->bPull && pull_have_potential(inputrec->pull_work))
861 pull_potential_wrapper(cr, inputrec, box, x,
862 forceWithVirial,
863 mdatoms, enerd, lambda, t,
864 wcycle);
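/* The AWH bias potential is accumulated into the COM pull energy term; this call also updates the bias */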
866 if (awh)
868 enerd->term[F_COM_PULL] +=
869 awh->applyBiasForcesAndUpdateBias(inputrec->ePBC, *mdatoms, box,
870 forceWithVirial,
871 t, step, wcycle, fplog);
875 rvec *f = as_rvec_array(forceWithVirial->force_.data());
877 /* Add the forces from enforced rotation potentials (if any) */
878 if (inputrec->bRot)
880 wallcycle_start(wcycle, ewcROTadd);
881 enerd->term[F_COM_PULL] += add_rot_forces(inputrec->rot, f, cr, step, t);
882 wallcycle_stop(wcycle, ewcROTadd);
885 if (ed)
887 /* Note that since init_edsam() is called after the initialization
888 * of forcerec, edsam doesn't request the noVirSum force buffer.
889 * Thus if no other algorithm (e.g. PME) requires it, the forces
890 * here will contribute to the virial.
892 do_flood(cr, inputrec, as_rvec_array(x.data()), f, ed, box, step, bNS);
895 /* Add forces from interactive molecular dynamics (IMD), if bIMD == TRUE. */
896 if (inputrec->bIMD && computeForces)
898 IMD_apply_forces(inputrec->bIMD, inputrec->imd, cr, f, wcycle);
902 /*! \brief Launch the prepare_step and spread stages of PME GPU.
904 * \param[in] pmedata The PME structure
905 * \param[in] box The box matrix
906 * \param[in] x Coordinate array
907 * \param[in] flags Force flags
908 * \param[in] wcycle The wallcycle structure
910 static inline void launchPmeGpuSpread(gmx_pme_t *pmedata,
911 matrix box,
912 rvec x[],
913 int flags,
914 gmx_wallcycle_t wcycle)
916 int pmeFlags = GMX_PME_SPREAD | GMX_PME_SOLVE;
917 pmeFlags |= (flags & GMX_FORCE_FORCES) ? GMX_PME_CALC_F : 0;
918 pmeFlags |= (flags & GMX_FORCE_VIRIAL) ? GMX_PME_CALC_ENER_VIR : 0;
920 pme_gpu_prepare_computation(pmedata, flags & GMX_FORCE_DYNAMICBOX, box, wcycle, pmeFlags);
921 pme_gpu_launch_spread(pmedata, x, wcycle);
924 /*! \brief Launch the FFT and gather stages of PME GPU
926 * This function only implements setting the output forces (no accumulation).
928 * \param[in] pmedata The PME structure
929 * \param[in] wcycle The wallcycle structure
931 static void launchPmeGpuFftAndGather(gmx_pme_t *pmedata,
932 gmx_wallcycle_t wcycle)
934 pme_gpu_launch_complex_transforms(pmedata, wcycle);
935 pme_gpu_launch_gather(pmedata, wcycle, PmeForceOutputHandling::Set);
938 /*! \brief
939 * Polling wait for either of the PME or nonbonded GPU tasks.
941 * Instead of a static order in waiting for GPU tasks, this function
942 * polls checking which of the two tasks completes first, and does the
943 * associated force buffer reduction overlapped with the other task.
944 * By doing that, unlike static scheduling order, it can always overlap
945 * one of the reductions, regardless of the GPU task completion order.
947 * \param[in] nbv Nonbonded verlet structure
948 * \param[in] pmedata PME module data
949 * \param[in,out] force Force array to reduce task outputs into.
950 * \param[in,out] forceWithVirial Force and virial buffers
951 * \param[in,out] fshift Shift force output vector results are reduced into
952 * \param[in,out] enerd Energy data structure results are reduced into
953 * \param[in] flags Force flags
954 * \param[in] wcycle The wallcycle structure
956 static void alternatePmeNbGpuWaitReduce(nonbonded_verlet_t *nbv,
957 const gmx_pme_t *pmedata,
958 gmx::PaddedArrayRef<gmx::RVec> *force,
959 ForceWithVirial *forceWithVirial,
960 rvec fshift[],
961 gmx_enerdata_t *enerd,
962 int flags,
963 gmx_wallcycle_t wcycle)
965 bool isPmeGpuDone = false;
966 bool isNbGpuDone = false;
969 gmx::ArrayRef<const gmx::RVec> pmeGpuForces;
971 while (!isPmeGpuDone || !isNbGpuDone)
973 if (!isPmeGpuDone)
975 matrix vir_Q;
976 real Vlr_q;
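/* Poll (Check) while the other GPU task is still outstanding; once it has completed there is nothing left to overlap, so block (Wait) */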
978 GpuTaskCompletion completionType = (isNbGpuDone) ? GpuTaskCompletion::Wait : GpuTaskCompletion::Check;
979 isPmeGpuDone = pme_gpu_try_finish_task(pmedata, wcycle, &pmeGpuForces,
980 vir_Q, &Vlr_q, completionType);
982 if (isPmeGpuDone)
984 pme_gpu_reduce_outputs(wcycle, forceWithVirial, pmeGpuForces,
985 enerd, vir_Q, Vlr_q);
989 if (!isNbGpuDone)
991 GpuTaskCompletion completionType = (isPmeGpuDone) ? GpuTaskCompletion::Wait : GpuTaskCompletion::Check;
992 wallcycle_start_nocount(wcycle, ewcWAIT_GPU_NB_L);
993 isNbGpuDone = nbnxn_gpu_try_finish_task(nbv->gpu_nbv,
994 flags, eatLocal,
995 enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
996 fshift, completionType);
997 wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
998 // To get the call count right, when the task finished we
999 // issue a start/stop.
1000 // TODO: move the ewcWAIT_GPU_NB_L cycle counting into nbnxn_gpu_try_finish_task()
1001 // and ewcNB_XF_BUF_OPS counting into nbnxn_atomdata_add_nbat_f_to_f().
1002 if (isNbGpuDone)
1004 wallcycle_start(wcycle, ewcWAIT_GPU_NB_L);
1005 wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
1007 nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), eatLocal,
1008 nbv->nbat, as_rvec_array(force->data()), wcycle);
1014 /*! \brief
1015 * Launch the dynamic rolling pruning GPU task.
1017 * We currently alternate local/non-local list pruning in odd-even steps
1018 * (only pruning every second step without DD).
1020 * \param[in] cr The communication record
1021 * \param[in] nbv Nonbonded verlet structure
1022 * \param[in] inputrec The input record
1023 * \param[in] step The current MD step
1025 static inline void launchGpuRollingPruning(const t_commrec *cr,
1026 const nonbonded_verlet_t *nbv,
1027 const t_inputrec *inputrec,
1028 const gmx_int64_t step)
1030 /* We should not launch the rolling pruning kernel at a search
1031 * step or just before search steps, since that's useless.
1032 * Without domain decomposition we prune at even steps.
1033 * With domain decomposition we alternate local and non-local
1034 * pruning at even and odd steps.
1036 int numRollingParts = nbv->listParams->numRollingParts;
1037 GMX_ASSERT(numRollingParts == nbv->listParams->nstlistPrune/2, "Since we alternate local/non-local at even/odd steps, we need numRollingParts<=nstlistPrune/2 for correctness and == for efficiency");
1038 int stepWithCurrentList = step - nbv->grp[eintLocal].nbl_lists.outerListCreationStep;
1039 bool stepIsEven = ((stepWithCurrentList & 1) == 0);
1040 if (stepWithCurrentList > 0 &&
1041 stepWithCurrentList < inputrec->nstlist - 1 &&
1042 (stepIsEven || DOMAINDECOMP(cr)))
1044 nbnxn_gpu_launch_kernel_pruneonly(nbv->gpu_nbv,
1045 stepIsEven ? eintLocal : eintNonlocal,
1046 numRollingParts);
1050 static void do_force_cutsVERLET(FILE *fplog,
1051 const t_commrec *cr,
1052 const gmx_multisim_t *ms,
1053 const t_inputrec *inputrec,
1054 gmx::Awh *awh,
1055 gmx_int64_t step,
1056 t_nrnb *nrnb,
1057 gmx_wallcycle_t wcycle,
1058 const gmx_localtop_t *top,
1059 const gmx_groups_t * /* groups */,
1060 matrix box, gmx::PaddedArrayRef<gmx::RVec> x,
1061 history_t *hist,
1062 gmx::PaddedArrayRef<gmx::RVec> force,
1063 tensor vir_force,
1064 const t_mdatoms *mdatoms,
1065 gmx_enerdata_t *enerd, t_fcdata *fcd,
1066 real *lambda,
1067 t_graph *graph,
1068 t_forcerec *fr,
1069 interaction_const_t *ic,
1070 const gmx_vsite_t *vsite,
1071 rvec mu_tot,
1072 double t,
1073 const gmx_edsam *ed,
1074 int flags,
1075 DdOpenBalanceRegionBeforeForceComputation ddOpenBalanceRegion,
1076 DdCloseBalanceRegionAfterForceComputation ddCloseBalanceRegion)
1078 int cg1, i, j;
1079 double mu[2*DIM];
1080 gmx_bool bStateChanged, bNS, bFillGrid, bCalcCGCM;
1081 gmx_bool bDoForces, bUseGPU, bUseOrEmulGPU;
1082 rvec vzero, box_diag;
1083 float cycles_pme, cycles_wait_gpu;
1084 nonbonded_verlet_t *nbv = fr->nbv;
1086 bStateChanged = (flags & GMX_FORCE_STATECHANGED);
1087 bNS = (flags & GMX_FORCE_NS) && (fr->bAllvsAll == FALSE);
1088 bFillGrid = (bNS && bStateChanged);
1089 bCalcCGCM = (bFillGrid && !DOMAINDECOMP(cr));
1090 bDoForces = (flags & GMX_FORCE_FORCES);
1091 bUseGPU = fr->nbv->bUseGPU;
1092 bUseOrEmulGPU = bUseGPU || (fr->nbv->emulateGpu == EmulateGpuNonbonded::Yes);
1094 const auto pmeRunMode = fr->pmedata ? pme_run_mode(fr->pmedata) : PmeRunMode::CPU;
1095 // TODO slim this conditional down - inputrec and duty checks should mean the same in proper code!
1096 const bool useGpuPme = EEL_PME(fr->ic->eeltype) && thisRankHasDuty(cr, DUTY_PME) &&
1097 ((pmeRunMode == PmeRunMode::GPU) || (pmeRunMode == PmeRunMode::Mixed));
1099 /* At a search step we need to start the first balancing region
1100 * somewhere early inside the step after communication during domain
1101 * decomposition (and not during the previous step as usual).
1103 if (bNS &&
1104 ddOpenBalanceRegion == DdOpenBalanceRegionBeforeForceComputation::yes)
1106 ddOpenBalanceRegionCpu(cr->dd, DdAllowBalanceRegionReopen::yes);
1109 cycles_wait_gpu = 0;
1111 const int start = 0;
1112 const int homenr = mdatoms->homenr;
1114 clear_mat(vir_force);
1116 if (DOMAINDECOMP(cr))
1118 cg1 = cr->dd->ncg_tot;
1120 else
1122 cg1 = top->cgs.nr;
1124 if (fr->n_tpi > 0)
1126 cg1--;
1129 if (bStateChanged)
1131 update_forcerec(fr, box);
1133 if (inputrecNeedMutot(inputrec))
1135 /* Calculate total (local) dipole moment in a temporary common array.
1136 * This makes it possible to sum them over nodes faster.
1138 calc_mu(start, homenr,
1139 x, mdatoms->chargeA, mdatoms->chargeB, mdatoms->nChargePerturbed,
1140 mu, mu+DIM);
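/* mu holds the state A dipole and mu+DIM the state B dipole; they are summed over ranks and copied into fr->mu_tot further down */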
1144 if (fr->ePBC != epbcNONE)
1146 /* Compute shift vectors every step,
1147 * because of pressure coupling or box deformation!
1149 if ((flags & GMX_FORCE_DYNAMICBOX) && bStateChanged)
1151 calc_shifts(box, fr->shift_vec);
1154 if (bCalcCGCM)
1156 put_atoms_in_box_omp(fr->ePBC, box, x.subArray(0, homenr));
1157 inc_nrnb(nrnb, eNR_SHIFTX, homenr);
1159 else if (EI_ENERGY_MINIMIZATION(inputrec->eI) && graph)
1161 unshift_self(graph, box, as_rvec_array(x.data()));
1165 nbnxn_atomdata_copy_shiftvec(flags & GMX_FORCE_DYNAMICBOX,
1166 fr->shift_vec, nbv->nbat);
1168 #if GMX_MPI
1169 if (!thisRankHasDuty(cr, DUTY_PME))
1171 /* Send particle coordinates to the pme nodes.
1172 * Since this is only implemented for domain decomposition
1173 * and domain decomposition does not use the graph,
1174 * we do not need to worry about shifting.
1176 gmx_pme_send_coordinates(cr, box, as_rvec_array(x.data()),
1177 lambda[efptCOUL], lambda[efptVDW],
1178 (flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY)),
1179 step, wcycle);
1181 #endif /* GMX_MPI */
1183 if (useGpuPme)
1185 launchPmeGpuSpread(fr->pmedata, box, as_rvec_array(x.data()), flags, wcycle);
1188 /* do gridding for pair search */
1189 if (bNS)
1191 if (graph && bStateChanged)
1193 /* Calculate intramolecular shift vectors to make molecules whole */
1194 mk_mshift(fplog, graph, fr->ePBC, box, as_rvec_array(x.data()));
1197 clear_rvec(vzero);
1198 box_diag[XX] = box[XX][XX];
1199 box_diag[YY] = box[YY][YY];
1200 box_diag[ZZ] = box[ZZ][ZZ];
1202 wallcycle_start(wcycle, ewcNS);
1203 if (!DOMAINDECOMP(cr))
1205 wallcycle_sub_start(wcycle, ewcsNBS_GRID_LOCAL);
1206 nbnxn_put_on_grid(nbv->nbs.get(), fr->ePBC, box,
1207 0, vzero, box_diag,
1208 0, mdatoms->homenr, -1, fr->cginfo, as_rvec_array(x.data()),
1209 0, nullptr,
1210 nbv->grp[eintLocal].kernel_type,
1211 nbv->nbat);
1212 wallcycle_sub_stop(wcycle, ewcsNBS_GRID_LOCAL);
1214 else
1216 wallcycle_sub_start(wcycle, ewcsNBS_GRID_NONLOCAL);
1217 nbnxn_put_on_grid_nonlocal(nbv->nbs.get(), domdec_zones(cr->dd),
1218 fr->cginfo, as_rvec_array(x.data()),
1219 nbv->grp[eintNonlocal].kernel_type,
1220 nbv->nbat);
1221 wallcycle_sub_stop(wcycle, ewcsNBS_GRID_NONLOCAL);
1224 nbnxn_atomdata_set(nbv->nbat, nbv->nbs.get(), mdatoms, fr->cginfo);
1225 wallcycle_stop(wcycle, ewcNS);
1228 /* initialize the GPU atom data and copy shift vector */
1229 if (bUseGPU)
1231 wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
1232 wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1234 if (bNS)
1236 nbnxn_gpu_init_atomdata(nbv->gpu_nbv, nbv->nbat);
1239 nbnxn_gpu_upload_shiftvec(nbv->gpu_nbv, nbv->nbat);
1241 wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1242 wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1245 /* do local pair search */
1246 if (bNS)
1248 wallcycle_start_nocount(wcycle, ewcNS);
1249 wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_LOCAL);
1250 nbnxn_make_pairlist(nbv->nbs.get(), nbv->nbat,
1251 &top->excls,
1252 nbv->listParams->rlistOuter,
1253 nbv->min_ci_balanced,
1254 &nbv->grp[eintLocal].nbl_lists,
1255 eintLocal,
1256 nbv->grp[eintLocal].kernel_type,
1257 nrnb);
1258 nbv->grp[eintLocal].nbl_lists.outerListCreationStep = step;
1259 if (nbv->listParams->useDynamicPruning && !bUseGPU)
1261 nbnxnPrepareListForDynamicPruning(&nbv->grp[eintLocal].nbl_lists);
1263 wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_LOCAL);
1265 if (bUseGPU)
1267 /* initialize local pair-list on the GPU */
1268 nbnxn_gpu_init_pairlist(nbv->gpu_nbv,
1269 nbv->grp[eintLocal].nbl_lists.nbl[0],
1270 eintLocal);
1272 wallcycle_stop(wcycle, ewcNS);
1274 else
1276 nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs.get(), eatLocal, FALSE, as_rvec_array(x.data()),
1277 nbv->nbat, wcycle);
1280 if (bUseGPU)
1282 if (DOMAINDECOMP(cr))
1284 ddOpenBalanceRegionGpu(cr->dd);
1287 wallcycle_start(wcycle, ewcLAUNCH_GPU);
1288 wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1289 /* launch local nonbonded work on GPU */
1290 do_nb_verlet(fr, ic, enerd, flags, eintLocal, enbvClearFNo,
1291 step, nrnb, wcycle);
1292 wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1293 wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1296 if (useGpuPme)
1298 // In PME GPU and mixed mode we launch FFT / gather after the
1299 // X copy/transform to allow overlap as well as after the GPU NB
1300 // launch to avoid FFT launch overhead hijacking the CPU and delaying
1301 // the nonbonded kernel.
1302 launchPmeGpuFftAndGather(fr->pmedata, wcycle);
1305 /* Communicate coordinates and sum dipole if necessary +
1306 do non-local pair search */
1307 if (DOMAINDECOMP(cr))
1309 if (bNS)
1311 wallcycle_start_nocount(wcycle, ewcNS);
1312 wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_NONLOCAL);
1314 nbnxn_make_pairlist(nbv->nbs.get(), nbv->nbat,
1315 &top->excls,
1316 nbv->listParams->rlistOuter,
1317 nbv->min_ci_balanced,
1318 &nbv->grp[eintNonlocal].nbl_lists,
1319 eintNonlocal,
1320 nbv->grp[eintNonlocal].kernel_type,
1321 nrnb);
1322 nbv->grp[eintNonlocal].nbl_lists.outerListCreationStep = step;
1323 if (nbv->listParams->useDynamicPruning && !bUseGPU)
1325 nbnxnPrepareListForDynamicPruning(&nbv->grp[eintNonlocal].nbl_lists);
1327 wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_NONLOCAL);
1329 if (nbv->grp[eintNonlocal].kernel_type == nbnxnk8x8x8_GPU)
1331 /* initialize non-local pair-list on the GPU */
1332 nbnxn_gpu_init_pairlist(nbv->gpu_nbv,
1333 nbv->grp[eintNonlocal].nbl_lists.nbl[0],
1334 eintNonlocal);
1336 wallcycle_stop(wcycle, ewcNS);
1338 else
1340 dd_move_x(cr->dd, box, as_rvec_array(x.data()), wcycle);
1342 nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs.get(), eatNonlocal, FALSE, as_rvec_array(x.data()),
1343 nbv->nbat, wcycle);
1346 if (bUseGPU)
1348 wallcycle_start(wcycle, ewcLAUNCH_GPU);
1349 wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1350 /* launch non-local nonbonded tasks on GPU */
1351 do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFNo,
1352 step, nrnb, wcycle);
1353 wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1354 wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1358 if (bUseGPU)
1360 /* launch D2H copy-back F */
1361 wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
1362 wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1363 if (DOMAINDECOMP(cr))
1365 nbnxn_gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat,
1366 flags, eatNonlocal);
1368 nbnxn_gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat,
1369 flags, eatLocal);
1370 wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1371 wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1374 if (bStateChanged && inputrecNeedMutot(inputrec))
1376 if (PAR(cr))
1378 gmx_sumd(2*DIM, mu, cr);
1379 ddReopenBalanceRegionCpu(cr->dd);
1382 for (i = 0; i < 2; i++)
1384 for (j = 0; j < DIM; j++)
1386 fr->mu_tot[i][j] = mu[i*DIM + j];
1390 if (fr->efep == efepNO)
1392 copy_rvec(fr->mu_tot[0], mu_tot);
1394 else
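/* With perturbed charges the total dipole is interpolated linearly in the coulomb lambda between the A- and B-state dipoles */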
1396 for (j = 0; j < DIM; j++)
1398 mu_tot[j] =
1399 (1.0 - lambda[efptCOUL])*fr->mu_tot[0][j] +
1400 lambda[efptCOUL]*fr->mu_tot[1][j];
1404 /* Reset energies */
1405 reset_enerdata(enerd);
1406 clear_rvecs(SHIFTS, fr->fshift);
1408 if (DOMAINDECOMP(cr) && !thisRankHasDuty(cr, DUTY_PME))
1410 wallcycle_start(wcycle, ewcPPDURINGPME);
1411 dd_force_flop_start(cr->dd, nrnb);
1414 if (inputrec->bRot)
1416 wallcycle_start(wcycle, ewcROT);
1417 do_rotation(cr, inputrec, box, as_rvec_array(x.data()), t, step, bNS);
1418 wallcycle_stop(wcycle, ewcROT);
1421 /* Temporary solution until all routines take PaddedRVecVector */
1422 rvec *f = as_rvec_array(force.data());
1424 /* Start the force cycle counter.
1425 * Note that a different counter is used for dynamic load balancing.
1427 wallcycle_start(wcycle, ewcFORCE);
1429 gmx::ArrayRef<gmx::RVec> forceRef = force;
1430 if (bDoForces)
1432 /* If we need to compute the virial, we might need a separate
1433 * force buffer for algorithms for which the virial is calculated
1434 * directly, such as PME.
1436 if ((flags & GMX_FORCE_VIRIAL) && fr->haveDirectVirialContributions)
1438 forceRef = *fr->forceBufferForDirectVirialContributions;
1440 /* We only compute forces on local atoms. Note that vsites can
1441 * spread to non-local atoms, but that part of the buffer is
1442 * cleared separately in the vsite spreading code.
1444 clear_rvecs_omp(mdatoms->homenr, as_rvec_array(forceRef.data()));
1446 /* Clear the short- and long-range forces */
1447 clear_rvecs_omp(fr->natoms_force_constr, f);
1450 /* forceWithVirial uses the local atom range only */
1451 ForceWithVirial forceWithVirial(forceRef, flags & GMX_FORCE_VIRIAL);
1453 if (inputrec->bPull && pull_have_constraint(inputrec->pull_work))
1455 clear_pull_forces(inputrec->pull_work);
1458 /* We calculate the non-bonded forces, when done on the CPU, here.
1459 * We do this before calling do_force_lowlevel, because in that
1460 * function, the listed forces are calculated before PME, which
1461 * does communication. With this order, non-bonded and listed
1462 * force calculation imbalance can be balanced out by the domain
1463 * decomposition load balancing.
1466 if (!bUseOrEmulGPU)
1468 do_nb_verlet(fr, ic, enerd, flags, eintLocal, enbvClearFYes,
1469 step, nrnb, wcycle);
1472 if (fr->efep != efepNO)
1474 /* Calculate the local and non-local free energy interactions here.
1475 * Happens here on the CPU both with and without GPU.
1477 if (fr->nbv->grp[eintLocal].nbl_lists.nbl_fep[0]->nrj > 0)
1479 do_nb_verlet_fep(&fr->nbv->grp[eintLocal].nbl_lists,
1480 fr, as_rvec_array(x.data()), f, mdatoms,
1481 inputrec->fepvals, lambda,
1482 enerd, flags, nrnb, wcycle);
1485 if (DOMAINDECOMP(cr) &&
1486 fr->nbv->grp[eintNonlocal].nbl_lists.nbl_fep[0]->nrj > 0)
1488 do_nb_verlet_fep(&fr->nbv->grp[eintNonlocal].nbl_lists,
1489 fr, as_rvec_array(x.data()), f, mdatoms,
1490 inputrec->fepvals, lambda,
1491 enerd, flags, nrnb, wcycle);
1495 if (!bUseOrEmulGPU)
1497 int aloc;
1499 if (DOMAINDECOMP(cr))
1501 do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFNo,
1502 step, nrnb, wcycle);
1505 if (!bUseOrEmulGPU)
1507 aloc = eintLocal;
1509 else
1511 aloc = eintNonlocal;
1514 /* Add all the non-bonded force to the normal force array.
1515 * This can be split into a local and a non-local part when overlapping
1516 * communication with calculation under domain decomposition.
1518 wallcycle_stop(wcycle, ewcFORCE);
1520 nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), eatAll, nbv->nbat, f, wcycle);
1522 wallcycle_start_nocount(wcycle, ewcFORCE);
1524 /* if there are multiple fshift output buffers reduce them */
1525 if ((flags & GMX_FORCE_VIRIAL) &&
1526 nbv->grp[aloc].nbl_lists.nnbl > 1)
1528 /* This is not in a subcounter because it takes a
1529 negligible and constant-sized amount of time */
1530 nbnxn_atomdata_add_nbat_fshift_to_fshift(nbv->nbat,
1531 fr->fshift);
1535 /* update QMMMrec, if necessary */
1536 if (fr->bQMMM)
1538 update_QMMMrec(cr, fr, as_rvec_array(x.data()), mdatoms, box);
1541 /* Compute the bonded and non-bonded energies and optionally forces */
1542 do_force_lowlevel(fr, inputrec, &(top->idef),
1543 cr, ms, nrnb, wcycle, mdatoms,
1544 as_rvec_array(x.data()), hist, f, &forceWithVirial, enerd, fcd,
1545 box, inputrec->fepvals, lambda, graph, &(top->excls), fr->mu_tot,
1546 flags, &cycles_pme);
1548 wallcycle_stop(wcycle, ewcFORCE);
1550 computeSpecialForces(fplog, cr, inputrec, awh, step, t, wcycle,
1551 fr->forceProviders, box, x, mdatoms, lambda,
1552 flags, &forceWithVirial, enerd,
1553 ed, bNS);
1555 if (bUseOrEmulGPU)
1557 /* wait for non-local forces (or calculate in emulation mode) */
1558 if (DOMAINDECOMP(cr))
1560 if (bUseGPU)
1562 wallcycle_start(wcycle, ewcWAIT_GPU_NB_NL);
1563 nbnxn_gpu_wait_finish_task(nbv->gpu_nbv,
1564 flags, eatNonlocal,
1565 enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
1566 fr->fshift);
1567 cycles_wait_gpu += wallcycle_stop(wcycle, ewcWAIT_GPU_NB_NL);
1569 else
1571 wallcycle_start_nocount(wcycle, ewcFORCE);
1572 do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFYes,
1573 step, nrnb, wcycle);
1574 wallcycle_stop(wcycle, ewcFORCE);
1577 /* skip the reduction if there was no non-local work to do */
1578 if (nbv->grp[eintNonlocal].nbl_lists.nbl[0]->nsci > 0)
1580 nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), eatNonlocal,
1581 nbv->nbat, f, wcycle);
1586 if (DOMAINDECOMP(cr))
1588 /* We are done with the CPU compute.
1589 * We will now communicate the non-local forces.
1590 * If we use a GPU this will overlap with GPU work, so in that case
1591 * we do not close the DD force balancing region here.
1593 if (ddCloseBalanceRegion == DdCloseBalanceRegionAfterForceComputation::yes)
1595 ddCloseBalanceRegionCpu(cr->dd);
1597 if (bDoForces)
1599 dd_move_f(cr->dd, f, fr->fshift, wcycle);
1603 // With both nonbonded and PME offloaded a GPU on the same rank, we use
1604 // an alternating wait/reduction scheme.
1605 bool alternateGpuWait = (!c_disableAlternatingWait && useGpuPme && bUseGPU && !DOMAINDECOMP(cr));
1606 if (alternateGpuWait)
1608 alternatePmeNbGpuWaitReduce(fr->nbv, fr->pmedata, &force, &forceWithVirial, fr->fshift, enerd, flags, wcycle);
1611 if (!alternateGpuWait && useGpuPme)
1613 gmx::ArrayRef<const gmx::RVec> pmeGpuForces;
1614 matrix vir_Q;
1615 real Vlr_q = 0.0;
1616 pme_gpu_wait_finish_task(fr->pmedata, wcycle, &pmeGpuForces, vir_Q, &Vlr_q);
1617 pme_gpu_reduce_outputs(wcycle, &forceWithVirial, pmeGpuForces, enerd, vir_Q, Vlr_q);
1620 /* Wait for local GPU NB outputs on the non-alternating wait path */
1621 if (!alternateGpuWait && bUseGPU)
1623 /* Measured overhead on CUDA and OpenCL with(out) GPU sharing
1624 * is between 0.5 and 1.5 Mcycles. So 2 MCycles is an overestimate,
1625 * but even with a step of 0.1 ms the difference is less than 1%
1626 * of the step time.
1628 const float gpuWaitApiOverheadMargin = 2e6f; /* cycles */
1630 wallcycle_start(wcycle, ewcWAIT_GPU_NB_L);
1631 nbnxn_gpu_wait_finish_task(nbv->gpu_nbv,
1632 flags, eatLocal,
1633 enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
1634 fr->fshift);
1635 float cycles_tmp = wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
1637 if (ddCloseBalanceRegion == DdCloseBalanceRegionAfterForceComputation::yes)
1639 DdBalanceRegionWaitedForGpu waitedForGpu = DdBalanceRegionWaitedForGpu::yes;
1640 if (bDoForces && cycles_tmp <= gpuWaitApiOverheadMargin)
1642 /* We measured few cycles, it could be that the kernel
1643 * and transfer finished earlier and there was no actual
1644 * wait time, only API call overhead.
1645 * Then the actual time could be anywhere between 0 and
1646 * cycles_wait_est. We will use half of cycles_wait_est.
1648 waitedForGpu = DdBalanceRegionWaitedForGpu::no;
1650 ddCloseBalanceRegionGpu(cr->dd, cycles_wait_gpu, waitedForGpu);
1654 if (fr->nbv->emulateGpu == EmulateGpuNonbonded::Yes)
1656 // NOTE: emulation kernel is not included in the balancing region,
1657 // but emulation mode does not target performance anyway
1658 wallcycle_start_nocount(wcycle, ewcFORCE);
1659 do_nb_verlet(fr, ic, enerd, flags, eintLocal,
1660 DOMAINDECOMP(cr) ? enbvClearFNo : enbvClearFYes,
1661 step, nrnb, wcycle);
1662 wallcycle_stop(wcycle, ewcFORCE);
1665 if (useGpuPme)
1667 pme_gpu_reinit_computation(fr->pmedata, wcycle);
1670 if (bUseGPU)
1672 /* now clear the GPU outputs while we finish the step on the CPU */
1673 wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
1674 wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1675 nbnxn_gpu_clear_outputs(nbv->gpu_nbv, flags);
1677 /* Is dynamic pair-list pruning activated? */
1678 if (nbv->listParams->useDynamicPruning)
1680 launchGpuRollingPruning(cr, nbv, inputrec, step);
1682 wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1683 wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1686 /* Do the nonbonded GPU (or emulation) force buffer reduction
1687 * on the non-alternating path. */
1688 if (bUseOrEmulGPU && !alternateGpuWait)
1690 nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), eatLocal,
1691 nbv->nbat, f, wcycle);
1694 if (DOMAINDECOMP(cr))
1696 dd_force_flop_stop(cr->dd, nrnb);
1699 if (bDoForces)
1701 /* If we have NoVirSum forces, but we do not calculate the virial,
1702 * we sum fr->f_novirsum=f later.
1704 if (vsite && !(fr->haveDirectVirialContributions && !(flags & GMX_FORCE_VIRIAL)))
1706 spread_vsite_f(vsite, as_rvec_array(x.data()), f, fr->fshift, FALSE, nullptr, nrnb,
1707 &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr, wcycle);
1710 if (flags & GMX_FORCE_VIRIAL)
1712 /* Calculation of the virial must be done after vsites! */
1713 calc_virial(0, mdatoms->homenr, as_rvec_array(x.data()), f,
1714 vir_force, graph, box, nrnb, fr, inputrec->ePBC);
1718 if (PAR(cr) && !thisRankHasDuty(cr, DUTY_PME))
1720 /* In case of node-splitting, the PP nodes receive the long-range
1721 * forces, virial and energy from the PME nodes here.
1723 pme_receive_force_ener(cr, &forceWithVirial, enerd, wcycle);
1726 if (bDoForces)
1728 post_process_forces(cr, step, nrnb, wcycle,
1729 top, box, as_rvec_array(x.data()), f, &forceWithVirial,
1730 vir_force, mdatoms, graph, fr, vsite,
1731 flags);
1734 if (flags & GMX_FORCE_ENERGY)
1736 /* Sum the potential energy terms from group contributions */
1737 sum_epot(&(enerd->grpp), enerd->term);
1739 if (!EI_TPI(inputrec->eI))
1741 checkPotentialEnergyValidity(step, *enerd, *inputrec);
1746 static void do_force_cutsGROUP(FILE *fplog,
1747 const t_commrec *cr,
1748 const gmx_multisim_t *ms,
1749 const t_inputrec *inputrec,
1750 gmx::Awh *awh,
1751 gmx_int64_t step,
1752 t_nrnb *nrnb,
1753 gmx_wallcycle_t wcycle,
1754 gmx_localtop_t *top,
1755 const gmx_groups_t *groups,
1756 matrix box, gmx::PaddedArrayRef<gmx::RVec> x,
1757 history_t *hist,
1758 gmx::PaddedArrayRef<gmx::RVec> force,
1759 tensor vir_force,
1760 const t_mdatoms *mdatoms,
1761 gmx_enerdata_t *enerd,
1762 t_fcdata *fcd,
1763 real *lambda,
1764 t_graph *graph,
1765 t_forcerec *fr,
1766 const gmx_vsite_t *vsite,
1767 rvec mu_tot,
1768 double t,
1769 const gmx_edsam *ed,
1770 int flags,
1771 DdOpenBalanceRegionBeforeForceComputation ddOpenBalanceRegion,
1772 DdCloseBalanceRegionAfterForceComputation ddCloseBalanceRegion)
1774 int cg0, cg1, i, j;
1775 double mu[2*DIM];
1776 gmx_bool bStateChanged, bNS, bFillGrid, bCalcCGCM;
1777 gmx_bool bDoForces;
1778 float cycles_pme;
1780 const int start = 0;
1781 const int homenr = mdatoms->homenr;
1783 clear_mat(vir_force);
1785 cg0 = 0;
1786 if (DOMAINDECOMP(cr))
1788 cg1 = cr->dd->ncg_tot;
1790 else
1792 cg1 = top->cgs.nr;
1794 if (fr->n_tpi > 0)
1796 cg1--;
1799 bStateChanged = (flags & GMX_FORCE_STATECHANGED);
1800 bNS = (flags & GMX_FORCE_NS) && (fr->bAllvsAll == FALSE);
1801 /* Should we perform the long-range nonbonded evaluation inside the neighborsearching? */
1802 bFillGrid = (bNS && bStateChanged);
1803 bCalcCGCM = (bFillGrid && !DOMAINDECOMP(cr));
1804 bDoForces = (flags & GMX_FORCE_FORCES);
1806 if (bStateChanged)
1808 update_forcerec(fr, box);
1810 if (inputrecNeedMutot(inputrec))
1812 /* Calculate total (local) dipole moment in a temporary common array.
1813 * This makes it possible to sum them over nodes faster.
1815 calc_mu(start, homenr,
1816 x, mdatoms->chargeA, mdatoms->chargeB, mdatoms->nChargePerturbed,
1817 mu, mu+DIM);
1821 if (fr->ePBC != epbcNONE)
1823 /* Compute shift vectors every step,
1824 * because of pressure coupling or box deformation!
1826 if ((flags & GMX_FORCE_DYNAMICBOX) && bStateChanged)
1828 calc_shifts(box, fr->shift_vec);
1831 if (bCalcCGCM)
1833 put_charge_groups_in_box(fplog, cg0, cg1, fr->ePBC, box,
1834 &(top->cgs), as_rvec_array(x.data()), fr->cg_cm);
1835 inc_nrnb(nrnb, eNR_CGCM, homenr);
1836 inc_nrnb(nrnb, eNR_RESETX, cg1-cg0);
1838 else if (EI_ENERGY_MINIMIZATION(inputrec->eI) && graph)
1840 unshift_self(graph, box, as_rvec_array(x.data()));
1843 else if (bCalcCGCM)
1845 calc_cgcm(fplog, cg0, cg1, &(top->cgs), as_rvec_array(x.data()), fr->cg_cm);
1846 inc_nrnb(nrnb, eNR_CGCM, homenr);
1849 if (bCalcCGCM && gmx_debug_at)
1851 pr_rvecs(debug, 0, "cgcm", fr->cg_cm, top->cgs.nr);
1854 #if GMX_MPI
1855 if (!thisRankHasDuty(cr, DUTY_PME))
1857 /* Send particle coordinates to the pme nodes.
1858 * Since this is only implemented for domain decomposition
1859 * and domain decomposition does not use the graph,
1860 * we do not need to worry about shifting.
1862 gmx_pme_send_coordinates(cr, box, as_rvec_array(x.data()),
1863 lambda[efptCOUL], lambda[efptVDW],
1864 (flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY)),
1865 step, wcycle);
1867 #endif /* GMX_MPI */
1869 /* Communicate coordinates and sum dipole if necessary */
1870 if (DOMAINDECOMP(cr))
1872 dd_move_x(cr->dd, box, as_rvec_array(x.data()), wcycle);
1874 /* No GPU support, no move_x overlap, so reopen the balance region here */
1875 if (ddOpenBalanceRegion == DdOpenBalanceRegionBeforeForceComputation::yes)
1877 ddReopenBalanceRegionCpu(cr->dd);
1881 if (inputrecNeedMutot(inputrec))
1883 if (bStateChanged)
1885 if (PAR(cr))
1887 gmx_sumd(2*DIM, mu, cr);
1888 ddReopenBalanceRegionCpu(cr->dd);
1890 for (i = 0; i < 2; i++)
1892 for (j = 0; j < DIM; j++)
1894 fr->mu_tot[i][j] = mu[i*DIM + j];
1898 if (fr->efep == efepNO)
1900 copy_rvec(fr->mu_tot[0], mu_tot);
1902 else
1904 for (j = 0; j < DIM; j++)
1906 mu_tot[j] =
1907 (1.0 - lambda[efptCOUL])*fr->mu_tot[0][j] + lambda[efptCOUL]*fr->mu_tot[1][j];
1912 /* Reset energies */
1913 reset_enerdata(enerd);
1914 clear_rvecs(SHIFTS, fr->fshift);
1916 if (bNS)
1918 wallcycle_start(wcycle, ewcNS);
1920 if (graph && bStateChanged)
1922 /* Calculate intramolecular shift vectors to make molecules whole */
1923 mk_mshift(fplog, graph, fr->ePBC, box, as_rvec_array(x.data()));
1926 /* Do the actual neighbour searching */
1927 ns(fplog, fr, box,
1928 groups, top, mdatoms,
1929 cr, nrnb, bFillGrid);
1931 wallcycle_stop(wcycle, ewcNS);
1934 if (DOMAINDECOMP(cr) && !thisRankHasDuty(cr, DUTY_PME))
1936 wallcycle_start(wcycle, ewcPPDURINGPME);
1937 dd_force_flop_start(cr->dd, nrnb);
1940 if (inputrec->bRot)
1942 wallcycle_start(wcycle, ewcROT);
1943 do_rotation(cr, inputrec, box, as_rvec_array(x.data()), t, step, bNS);
1944 wallcycle_stop(wcycle, ewcROT);
1947 /* Temporary solution until all routines take PaddedRVecVector */
1948 rvec *f = as_rvec_array(force.data());
1950 /* Start the force cycle counter.
1951 * Note that a different counter is used for dynamic load balancing.
1953 wallcycle_start(wcycle, ewcFORCE);
1955 gmx::ArrayRef<gmx::RVec> forceRef = force;
1956 if (bDoForces)
1958 /* If we need to compute the virial, we might need a separate
1959 * force buffer for algorithms for which the virial is calculated
1960 * separately, such as PME.
1962 if ((flags & GMX_FORCE_VIRIAL) && fr->haveDirectVirialContributions)
1964 forceRef = *fr->forceBufferForDirectVirialContributions;
1966 clear_rvecs_omp(forceRef.size(), as_rvec_array(forceRef.data()));
1969 /* Clear the short- and long-range forces */
1970 clear_rvecs(fr->natoms_force_constr, f);
1973 /* forceWithVirial might need the full force atom range */
1974 ForceWithVirial forceWithVirial(forceRef, flags & GMX_FORCE_VIRIAL);
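    /* forceWithVirial wraps either the normal force buffer or, when the virial
     * is requested and direct virial contributions are present (e.g. from PME),
     * the separate buffer cleared above, so that those contributions can have
     * their virial accounted for directly.
     */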
1976 if (inputrec->bPull && pull_have_constraint(inputrec->pull_work))
1978 clear_pull_forces(inputrec->pull_work);
1981 /* update QMMMrec, if necessary */
1982 if (fr->bQMMM)
1984 update_QMMMrec(cr, fr, as_rvec_array(x.data()), mdatoms, box);
1987 /* Compute the bonded and non-bonded energies and optionally forces */
1988 do_force_lowlevel(fr, inputrec, &(top->idef),
1989 cr, ms, nrnb, wcycle, mdatoms,
1990 as_rvec_array(x.data()), hist, f, &forceWithVirial, enerd, fcd,
1991 box, inputrec->fepvals, lambda,
1992 graph, &(top->excls), fr->mu_tot,
1993 flags,
1994 &cycles_pme);
1996 wallcycle_stop(wcycle, ewcFORCE);
1998 if (DOMAINDECOMP(cr))
2000 dd_force_flop_stop(cr->dd, nrnb);
2002 if (ddCloseBalanceRegion == DdCloseBalanceRegionAfterForceComputation::yes)
2004 ddCloseBalanceRegionCpu(cr->dd);
2008 computeSpecialForces(fplog, cr, inputrec, awh, step, t, wcycle,
2009 fr->forceProviders, box, x, mdatoms, lambda,
2010 flags, &forceWithVirial, enerd,
2011 ed, bNS);
2013 if (bDoForces)
2015 /* Communicate the forces */
2016 if (DOMAINDECOMP(cr))
2018 dd_move_f(cr->dd, f, fr->fshift, wcycle);
2019 /* Do we need to communicate the separate force array
2020 * for terms that do not contribute to the single sum virial?
2021 * Position restraints and electric fields do not introduce
2022 * inter-cg forces, only full electrostatics methods do.
2023 * When we do not calculate the virial, fr->f_novirsum = f,
2024 * so we have already communicated these forces.
2026 if (EEL_FULL(fr->ic->eeltype) && cr->dd->n_intercg_excl &&
2027 (flags & GMX_FORCE_VIRIAL))
2029 dd_move_f(cr->dd, as_rvec_array(forceWithVirial.force_.data()), nullptr, wcycle);
2033 /* If we have NoVirSum forces, but we do not calculate the virial,
2034 * we sum fr->f_novirsum=f later.
2036 if (vsite && !(fr->haveDirectVirialContributions && !(flags & GMX_FORCE_VIRIAL)))
2038 spread_vsite_f(vsite, as_rvec_array(x.data()), f, fr->fshift, FALSE, nullptr, nrnb,
2039 &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr, wcycle);
2042 if (flags & GMX_FORCE_VIRIAL)
2044 /* Calculation of the virial must be done after vsites! */
2045 calc_virial(0, mdatoms->homenr, as_rvec_array(x.data()), f,
2046 vir_force, graph, box, nrnb, fr, inputrec->ePBC);
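        /* Note that spread_vsite_f() above has already moved the forces from
         * the massless virtual sites onto their constructing atoms; computing
         * the r x F sum on the pre-spread forces would give a wrong virial.
         */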
2050 if (PAR(cr) && !thisRankHasDuty(cr, DUTY_PME))
2052 /* In case of node-splitting, the PP nodes receive the long-range
2053 * forces, virial and energy from the PME nodes here.
2055 pme_receive_force_ener(cr, &forceWithVirial, enerd, wcycle);
2058 if (bDoForces)
2060 post_process_forces(cr, step, nrnb, wcycle,
2061 top, box, as_rvec_array(x.data()), f, &forceWithVirial,
2062 vir_force, mdatoms, graph, fr, vsite,
2063 flags);
2066 if (flags & GMX_FORCE_ENERGY)
2068 /* Sum the potential energy terms from group contributions */
2069 sum_epot(&(enerd->grpp), enerd->term);
2071 if (!EI_TPI(inputrec->eI))
2073 checkPotentialEnergyValidity(step, *enerd, *inputrec);
2079 void do_force(FILE *fplog,
2080 const t_commrec *cr,
2081 const gmx_multisim_t *ms,
2082 const t_inputrec *inputrec,
2083 gmx::Awh *awh,
2084 gmx_int64_t step,
2085 t_nrnb *nrnb,
2086 gmx_wallcycle_t wcycle,
2087 gmx_localtop_t *top,
2088 const gmx_groups_t *groups,
2089 matrix box,
2090 gmx::PaddedArrayRef<gmx::RVec> x,
2091 history_t *hist,
2092 gmx::PaddedArrayRef<gmx::RVec> force,
2093 tensor vir_force,
2094 const t_mdatoms *mdatoms,
2095 gmx_enerdata_t *enerd,
2096 t_fcdata *fcd,
2097 gmx::ArrayRef<real> lambda,
2098 t_graph *graph,
2099 t_forcerec *fr,
2100 const gmx_vsite_t *vsite,
2101 rvec mu_tot,
2102 double t,
2103 const gmx_edsam *ed,
2104 int flags,
2105 DdOpenBalanceRegionBeforeForceComputation ddOpenBalanceRegion,
2106 DdCloseBalanceRegionAfterForceComputation ddCloseBalanceRegion)
2108 /* modify force flag if not doing nonbonded */
2109 if (!fr->bNonbonded)
2111 flags &= ~GMX_FORCE_NONBONDED;
2114 GMX_ASSERT(x.size() >= gmx::paddedRVecVectorSize(fr->natoms_force), "coordinates should be padded");
2115 GMX_ASSERT(force.size() >= gmx::paddedRVecVectorSize(fr->natoms_force), "force should be padded");
2117 switch (inputrec->cutoff_scheme)
2119 case ecutsVERLET:
2120 do_force_cutsVERLET(fplog, cr, ms, inputrec,
2121 awh, step, nrnb, wcycle,
2122 top,
2123 groups,
2124 box, x, hist,
2125 force, vir_force,
2126 mdatoms,
2127 enerd, fcd,
2128 lambda.data(), graph,
2129 fr, fr->ic,
2130 vsite, mu_tot,
2131 t, ed,
2132 flags,
2133 ddOpenBalanceRegion,
2134 ddCloseBalanceRegion);
2135 break;
2136 case ecutsGROUP:
2137 do_force_cutsGROUP(fplog, cr, ms, inputrec,
2138 awh, step, nrnb, wcycle,
2139 top,
2140 groups,
2141 box, x, hist,
2142 force, vir_force,
2143 mdatoms,
2144 enerd, fcd,
2145 lambda.data(), graph,
2146 fr, vsite, mu_tot,
2147 t, ed,
2148 flags,
2149 ddOpenBalanceRegion,
2150 ddCloseBalanceRegion);
2151 break;
2152 default:
2153 gmx_incons("Invalid cut-off scheme passed!");
2156 /* In case we don't have constraints and are using GPUs, the next balancing
2157 * region starts here.
2158 * Some "special" work at the end of do_force_cuts?, such as vsite spread,
2160 * virial calculation and COM pulling, is thus not included in
2160 * the balance timing, which is OK as most of these tasks involve communication.
2162 if (ddOpenBalanceRegion == DdOpenBalanceRegionBeforeForceComputation::yes)
2164 ddOpenBalanceRegionCpu(cr->dd, DdAllowBalanceRegionReopen::no);
2169 void do_constrain_first(FILE *fplog, Constraints *constr,
2170 t_inputrec *ir, t_mdatoms *md,
2171 t_state *state, const t_commrec *cr,
2172 const gmx_multisim_t *ms,
2173 t_nrnb *nrnb,
2174 t_forcerec *fr, gmx_localtop_t *top)
2176 int i, m, start, end;
2177 gmx_int64_t step;
2178 real dt = ir->delta_t;
2179 real dvdl_dum;
2180 rvec *savex;
2182 /* We need to allocate one element extra, since we might use
2183 * (unaligned) 4-wide SIMD loads to access rvec entries.
2185 snew(savex, state->natoms + 1);
2187 start = 0;
2188 end = md->homenr;
2190 if (debug)
2192 fprintf(debug, "vcm: start=%d, homenr=%d, end=%d\n",
2193 start, md->homenr, end);
2195 /* Do a first constraint pass to reset the particles... */
2196 step = ir->init_step;
2197 if (fplog)
2199 char buf[STEPSTRSIZE];
2200 fprintf(fplog, "\nConstraining the starting coordinates (step %s)\n",
2201 gmx_step_str(step, buf));
2203 dvdl_dum = 0;
2205 /* constrain the current position */
2206 constrain(nullptr, TRUE, FALSE, constr, &(top->idef),
2207 ir, cr, ms, step, 0, 1.0, md,
2208 as_rvec_array(state->x.data()), as_rvec_array(state->x.data()), nullptr,
2209 fr->bMolPBC, state->box,
2210 state->lambda[efptBONDED], &dvdl_dum,
2211 nullptr, nullptr, nrnb, econqCoord);
2212 if (EI_VV(ir->eI))
2215 /* constrain the initial velocity, and save it */
2215 /* also may be useful if we need the ekin from the halfstep for velocity verlet */
2216 constrain(nullptr, TRUE, FALSE, constr, &(top->idef),
2217 ir, cr, ms, step, 0, 1.0, md,
2218 as_rvec_array(state->x.data()), as_rvec_array(state->v.data()), as_rvec_array(state->v.data()),
2219 fr->bMolPBC, state->box,
2220 state->lambda[efptBONDED], &dvdl_dum,
2221 nullptr, nullptr, nrnb, econqVeloc);
2223 /* constrain the initial velocities at t-dt/2 */
2224 if (EI_STATE_VELOCITY(ir->eI) && ir->eI != eiVV)
2226 for (i = start; (i < end); i++)
2228 for (m = 0; (m < DIM); m++)
2230 /* Reverse the velocity */
2231 state->v[i][m] = -state->v[i][m];
2232 /* Store the position at t-dt in savex */
2233 savex[i][m] = state->x[i][m] + dt*state->v[i][m];
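                /* With v already reversed, x + dt*v equals x - dt*v_orig, so
                 * savex holds the unconstrained positions at t-dt, which are
                 * constrained below using the t=0 positions as reference.
                 */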
2236 /* Shake the positions at t=-dt with the positions at t=0
2237 * as reference coordinates.
2239 if (fplog)
2241 char buf[STEPSTRSIZE];
2242 fprintf(fplog, "\nConstraining the coordinates at t0-dt (step %s)\n",
2243 gmx_step_str(step, buf));
2245 dvdl_dum = 0;
2246 constrain(nullptr, TRUE, FALSE, constr, &(top->idef),
2247 ir, cr, ms, step, -1, 1.0, md,
2248 as_rvec_array(state->x.data()), savex, nullptr,
2249 fr->bMolPBC, state->box,
2250 state->lambda[efptBONDED], &dvdl_dum,
2251 as_rvec_array(state->v.data()), nullptr, nrnb, econqCoord);
2253 for (i = start; i < end; i++)
2255 for (m = 0; m < DIM; m++)
2257 /* Re-reverse the velocities */
2258 state->v[i][m] = -state->v[i][m];
2262 sfree(savex);
2266 static void
2267 integrate_table(real vdwtab[], real scale, int offstart, int rstart, int rend,
2268 double *enerout, double *virout)
2270 double enersum, virsum;
2271 double invscale, invscale2, invscale3;
2272 double r, ea, eb, ec, pa, pb, pc, pd;
2273 double y0, f, g, h;
2274 int ri, offset;
2275 double tabfactor;
2277 invscale = 1.0/scale;
2278 invscale2 = invscale*invscale;
2279 invscale3 = invscale*invscale2;
2281 /* Following summation derived from cubic spline definition,
2282 * Numerical Recipes in C, second edition, p. 113-116. Exact for
2283 * the cubic spline. We first calculate the negative of the
2284 * energy from rvdw to rvdw_switch, assuming that g(r)=1, and then
2285 * add the more standard, abrupt cutoff correction to that result,
2286 * yielding the long-range correction for a switched function. We
2287 * perform both the pressure and energy loops at the same time for
2288 * simplicity, as the computational cost is low. */
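    /* In terms of the radial coordinate this accumulates, per table bin,
     *   enersum += int_bin V(r) r^2 dr
     *   virsum  += int_bin r^3 dV/dr dr
     * with V given on each bin by the cubic spline y0 + f*t + g*t^2 + h*t^3
     * in the bin-local coordinate t in [0,1]; expanding the powers of
     * r = (ri + t)/scale yields the ea/eb/ec and pa/pb/pc/pd terms below.
     * The 4*pi prefactor applied at the end turns these into the full
     * three-dimensional integrals.
     */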
2290 if (offstart == 0)
2292 /* Since the dispersion table has been scaled down a factor
2293 * 6.0 and the repulsion a factor 12.0 to compensate for the
2294 * c6/c12 parameters inside nbfp[] being scaled up (to save
2295 * flops in kernels), we need to correct for this.
2297 tabfactor = 6.0;
2299 else
2301 tabfactor = 12.0;
2304 enersum = 0.0;
2305 virsum = 0.0;
2306 for (ri = rstart; ri < rend; ++ri)
2308 r = ri*invscale;
2309 ea = invscale3;
2310 eb = 2.0*invscale2*r;
2311 ec = invscale*r*r;
2313 pa = invscale3;
2314 pb = 3.0*invscale2*r;
2315 pc = 3.0*invscale*r*r;
2316 pd = r*r*r;
2318 /* this "8" is from the packing in the vdwtab array - perhaps
2319 should be defined? */
2321 offset = 8*ri + offstart;
2322 y0 = vdwtab[offset];
2323 f = vdwtab[offset+1];
2324 g = vdwtab[offset+2];
2325 h = vdwtab[offset+3];
2327 enersum += y0*(ea/3 + eb/2 + ec) + f*(ea/4 + eb/3 + ec/2) + g*(ea/5 + eb/4 + ec/3) + h*(ea/6 + eb/5 + ec/4);
2328 virsum += f*(pa/4 + pb/3 + pc/2 + pd) + 2*g*(pa/5 + pb/4 + pc/3 + pd/2) + 3*h*(pa/6 + pb/5 + pc/4 + pd/3);
2330 *enerout = 4.0*M_PI*enersum*tabfactor;
2331 *virout = 4.0*M_PI*virsum*tabfactor;
2334 void calc_enervirdiff(FILE *fplog, int eDispCorr, t_forcerec *fr)
2336 double eners[2], virs[2], enersum, virsum;
2337 double r0, rc3, rc9;
2338 int ri0, ri1, i;
2339 real scale, *vdwtab;
2341 fr->enershiftsix = 0;
2342 fr->enershifttwelve = 0;
2343 fr->enerdiffsix = 0;
2344 fr->enerdifftwelve = 0;
2345 fr->virdiffsix = 0;
2346 fr->virdifftwelve = 0;
2348 const interaction_const_t *ic = fr->ic;
2350 if (eDispCorr != edispcNO)
2352 for (i = 0; i < 2; i++)
2354 eners[i] = 0;
2355 virs[i] = 0;
2357 if ((ic->vdw_modifier == eintmodPOTSHIFT) ||
2358 (ic->vdw_modifier == eintmodPOTSWITCH) ||
2359 (ic->vdw_modifier == eintmodFORCESWITCH) ||
2360 (ic->vdwtype == evdwSHIFT) ||
2361 (ic->vdwtype == evdwSWITCH))
2363 if (((ic->vdw_modifier == eintmodPOTSWITCH) ||
2364 (ic->vdw_modifier == eintmodFORCESWITCH) ||
2365 (ic->vdwtype == evdwSWITCH)) && ic->rvdw_switch == 0)
2367 gmx_fatal(FARGS,
2368 "With dispersion correction rvdw-switch can not be zero "
2369 "for vdw-type = %s", evdw_names[ic->vdwtype]);
2372 /* TODO This code depends on the logic in tables.c that
2373 constructs the table layout, which should be made
2374 explicit in future cleanup. */
2375 GMX_ASSERT(fr->dispersionCorrectionTable->interaction == GMX_TABLE_INTERACTION_VDWREP_VDWDISP,
2376 "Dispersion-correction code needs a table with both repulsion and dispersion terms");
2377 scale = fr->dispersionCorrectionTable->scale;
2378 vdwtab = fr->dispersionCorrectionTable->data;
2380 /* Round the cut-offs to exact table values for precision */
2381 ri0 = static_cast<int>(floor(ic->rvdw_switch*scale));
2382 ri1 = static_cast<int>(ceil(ic->rvdw*scale));
2384 /* The code below has some support for handling force-switching, i.e.
2385 * when the force (instead of potential) is switched over a limited
2386 * region. This leads to a constant shift in the potential inside the
2387 * switching region, which we can handle by adding a constant energy
2388 * term in the force-switch case just like when we do potential-shift.
2390 * For now this is not enabled, but to keep the functionality in the
2391 * code we check separately for switch and shift. When we do force-switch
2392 * the shifting point is rvdw_switch, while it is the cutoff when we
2393 * have a classical potential-shift.
2395 * For a pure potential-shift the potential has a constant shift
2396 * all the way out to the cutoff, and that is it. For other forms
2397 * we need to calculate the constant shift up to the point where we
2398 * start modifying the potential.
2400 ri0 = (ic->vdw_modifier == eintmodPOTSHIFT) ? ri1 : ri0;
2402 r0 = ri0/scale;
2403 rc3 = r0*r0*r0;
2404 rc9 = rc3*rc3*rc3;
2406 if ((ic->vdw_modifier == eintmodFORCESWITCH) ||
2407 (ic->vdwtype == evdwSHIFT))
2409 /* Determine the constant energy shift below rvdw_switch.
2410 * Table has a scale factor since we have scaled it down to compensate
2411 * for scaling-up c6/c12 with the derivative factors to save flops in analytical kernels.
2413 fr->enershiftsix = (real)(-1.0/(rc3*rc3)) - 6.0*vdwtab[8*ri0];
2414 fr->enershifttwelve = (real)( 1.0/(rc9*rc3)) - 12.0*vdwtab[8*ri0 + 4];
2416 else if (ic->vdw_modifier == eintmodPOTSHIFT)
2418 fr->enershiftsix = (real)(-1.0/(rc3*rc3));
2419 fr->enershifttwelve = (real)( 1.0/(rc9*rc3));
2422 /* Add the constant part from 0 to rvdw_switch.
2423 * This integration from 0 to rvdw_switch overcounts the number
2424 * of interactions by 1, as it also counts the self interaction.
2425 * We will correct for this later.
2427 eners[0] += 4.0*M_PI*fr->enershiftsix*rc3/3.0;
2428 eners[1] += 4.0*M_PI*fr->enershifttwelve*rc3/3.0;
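            /* These terms are int_0^r0 4*pi*r^2*enershift dr = (4*pi/3)*enershift*r0^3,
             * i.e. inside r0 the modified potential differs from the plain
             * 1/r^6 and 1/r^12 forms only by the constant shifts computed above
             * (with g(r) = 1 assumed, as noted in integrate_table()).
             */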
2430 /* Calculate the contribution in the range [r0,r1] where we
2431 * modify the potential. For a pure potential-shift modifier we will
2432 * have ri0==ri1, and there will not be any contribution here.
2434 for (i = 0; i < 2; i++)
2436 enersum = 0;
2437 virsum = 0;
2438 integrate_table(vdwtab, scale, (i == 0 ? 0 : 4), ri0, ri1, &enersum, &virsum);
2439 eners[i] -= enersum;
2440 virs[i] -= virsum;
2443 /* Alright: Above we compensated by REMOVING the parts outside r0
2444 * corresponding to the ideal VdW 1/r6 and 1/r12 potentials.
2446 * Regardless of whether r0 is the point where we start switching,
2447 * or the cutoff where we calculated the constant shift, we include
2448 * all the parts we are missing out to infinity from r0 by
2449 * calculating the analytical dispersion correction.
2451 eners[0] += -4.0*M_PI/(3.0*rc3);
2452 eners[1] += 4.0*M_PI/(9.0*rc9);
2453 virs[0] += 8.0*M_PI/rc3;
2454 virs[1] += -16.0*M_PI/(3.0*rc9);
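            /* Analytic tail integrals for g(r) = 1:
             *   int_r0^inf 4*pi*r^2 * (-1/r^6)  dr = -4*pi/(3*r0^3)
             *   int_r0^inf 4*pi*r^2 * ( 1/r^12) dr =  4*pi/(9*r0^9)
             * and for the virial integrand r^3 * dV/dr:
             *   dispersion: int_r0^inf 4*pi*r^3 * ( 6/r^7)   dr =  8*pi/r0^3
             *   repulsion:  int_r0^inf 4*pi*r^3 * (-12/r^13) dr = -16*pi/(3*r0^9)
             */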
2456 else if (ic->vdwtype == evdwCUT ||
2457 EVDW_PME(ic->vdwtype) ||
2458 ic->vdwtype == evdwUSER)
2460 if (ic->vdwtype == evdwUSER && fplog)
2462 fprintf(fplog,
2463 "WARNING: using dispersion correction with user tables\n");
2466 /* Note that with LJ-PME, the dispersion correction is multiplied
2467 * by the difference between the actual C6 and the value of C6
2468 * that would produce the combination rule.
2469 * This means the normal energy and virial difference formulas
2470 * can be used here.
2473 rc3 = ic->rvdw*ic->rvdw*ic->rvdw;
2474 rc9 = rc3*rc3*rc3;
2475 /* Contribution beyond the cut-off */
2476 eners[0] += -4.0*M_PI/(3.0*rc3);
2477 eners[1] += 4.0*M_PI/(9.0*rc9);
2478 if (ic->vdw_modifier == eintmodPOTSHIFT)
2480 /* Contribution within the cut-off */
2481 eners[0] += -4.0*M_PI/(3.0*rc3);
2482 eners[1] += 4.0*M_PI/(3.0*rc9);
2484 /* Contribution beyond the cut-off */
2485 virs[0] += 8.0*M_PI/rc3;
2486 virs[1] += -16.0*M_PI/(3.0*rc9);
2488 else
2490 gmx_fatal(FARGS,
2491 "Dispersion correction is not implemented for vdw-type = %s",
2492 evdw_names[ic->vdwtype]);
2495 /* When we deprecate the group kernels the code below can go too */
2496 if (ic->vdwtype == evdwPME && fr->cutoff_scheme == ecutsGROUP)
2498 /* Calculate self-interaction coefficient (assuming that
2499 * the reciprocal-space contribution is constant in the
2500 * region that contributes to the self-interaction).
2502 fr->enershiftsix = gmx::power6(ic->ewaldcoeff_lj) / 6.0;
2504 eners[0] += -gmx::power3(std::sqrt(M_PI)*ic->ewaldcoeff_lj)/3.0;
2505 virs[0] += gmx::power3(std::sqrt(M_PI)*ic->ewaldcoeff_lj);
2508 fr->enerdiffsix = eners[0];
2509 fr->enerdifftwelve = eners[1];
2510 /* The 0.5 is due to the Gromacs definition of the virial */
2511 fr->virdiffsix = 0.5*virs[0];
2512 fr->virdifftwelve = 0.5*virs[1];
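        /* GROMACS defines the virial as Xi = -1/2 sum_i r_i (x) F_i, hence the
         * 0.5 here and the compensating factor 2 in the pressure correction in
         * calc_dispcorr() below.
         */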
2516 void calc_dispcorr(t_inputrec *ir, t_forcerec *fr,
2517 matrix box, real lambda, tensor pres, tensor virial,
2518 real *prescorr, real *enercorr, real *dvdlcorr)
2520 gmx_bool bCorrAll, bCorrPres;
2521 real dvdlambda, invvol, dens, ninter, avcsix, avctwelve, enerdiff, svir = 0, spres = 0;
2522 int m;
2524 *prescorr = 0;
2525 *enercorr = 0;
2526 *dvdlcorr = 0;
2528 clear_mat(virial);
2529 clear_mat(pres);
2531 if (ir->eDispCorr != edispcNO)
2533 bCorrAll = (ir->eDispCorr == edispcAllEner ||
2534 ir->eDispCorr == edispcAllEnerPres);
2535 bCorrPres = (ir->eDispCorr == edispcEnerPres ||
2536 ir->eDispCorr == edispcAllEnerPres);
2538 invvol = 1/det(box);
2539 if (fr->n_tpi)
2541 /* Only correct for the interactions with the inserted molecule */
2542 dens = (fr->numAtomsForDispersionCorrection - fr->n_tpi)*invvol;
2543 ninter = fr->n_tpi;
2545 else
2547 dens = fr->numAtomsForDispersionCorrection*invvol;
2548 ninter = 0.5*fr->numAtomsForDispersionCorrection;
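            /* Each atom interacts with the homogeneous density of all atoms;
             * the factor 0.5 avoids counting every pair twice. With test-particle
             * insertion (above) only the inserted atoms contribute interactions.
             */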
2551 if (ir->efep == efepNO)
2553 avcsix = fr->avcsix[0];
2554 avctwelve = fr->avctwelve[0];
2556 else
2558 avcsix = (1 - lambda)*fr->avcsix[0] + lambda*fr->avcsix[1];
2559 avctwelve = (1 - lambda)*fr->avctwelve[0] + lambda*fr->avctwelve[1];
2562 enerdiff = ninter*(dens*fr->enerdiffsix - fr->enershiftsix);
2563 *enercorr += avcsix*enerdiff;
2564 dvdlambda = 0.0;
2565 if (ir->efep != efepNO)
2567 dvdlambda += (fr->avcsix[1] - fr->avcsix[0])*enerdiff;
2569 if (bCorrAll)
2571 enerdiff = ninter*(dens*fr->enerdifftwelve - fr->enershifttwelve);
2572 *enercorr += avctwelve*enerdiff;
2573 if (fr->efep != efepNO)
2575 dvdlambda += (fr->avctwelve[1] - fr->avctwelve[0])*enerdiff;
2579 if (bCorrPres)
2581 svir = ninter*dens*avcsix*fr->virdiffsix/3.0;
2582 if (ir->eDispCorr == edispcAllEnerPres)
2584 svir += ninter*dens*avctwelve*fr->virdifftwelve/3.0;
2586 /* The factor 2 is because of the Gromacs virial definition */
2587 spres = -2.0*invvol*svir*PRESFAC;
2589 for (m = 0; m < DIM; m++)
2591 virial[m][m] += svir;
2592 pres[m][m] += spres;
2594 *prescorr += spres;
2597 /* Can't currently control when this prints; for now, just print when debugging */
2598 if (debug)
2600 if (bCorrAll)
2602 fprintf(debug, "Long Range LJ corr.: <C6> %10.4e, <C12> %10.4e\n",
2603 avcsix, avctwelve);
2605 if (bCorrPres)
2607 fprintf(debug,
2608 "Long Range LJ corr.: Epot %10g, Pres: %10g, Vir: %10g\n",
2609 *enercorr, spres, svir);
2611 else
2613 fprintf(debug, "Long Range LJ corr.: Epot %10g\n", *enercorr);
2617 if (fr->efep != efepNO)
2619 *dvdlcorr += dvdlambda;
2624 static void low_do_pbc_mtop(FILE *fplog, int ePBC, const matrix box,
2625 const gmx_mtop_t *mtop, rvec x[],
2626 gmx_bool bFirst)
2628 t_graph *graph;
2629 int as, mol;
2631 if (bFirst && fplog)
2633 fprintf(fplog, "Removing pbc first time\n");
2636 snew(graph, 1);
2637 as = 0;
2638 for (const gmx_molblock_t &molb : mtop->molblock)
2640 const gmx_moltype_t &moltype = mtop->moltype[molb.type];
2641 if (moltype.atoms.nr == 1 ||
2642 (!bFirst && moltype.cgs.nr == 1))
2644 /* Just one atom or charge group in the molecule, no PBC required */
2645 as += molb.nmol*moltype.atoms.nr;
2647 else
2649 /* Pass nullptr instead of fplog to avoid graph prints for each molecule type */
2650 mk_graph_ilist(nullptr, moltype.ilist,
2651 0, moltype.atoms.nr, FALSE, FALSE, graph);
2653 for (mol = 0; mol < molb.nmol; mol++)
2655 mk_mshift(fplog, graph, ePBC, box, x+as);
2657 shift_self(graph, box, x+as);
2658 /* The molecule is whole now.
2659 * We don't need the second mk_mshift call as in do_pbc_first,
2660 * since we no longer need this graph.
2663 as += moltype.atoms.nr;
2665 done_graph(graph);
2668 sfree(graph);
2671 void do_pbc_first_mtop(FILE *fplog, int ePBC, const matrix box,
2672 const gmx_mtop_t *mtop, rvec x[])
2674 low_do_pbc_mtop(fplog, ePBC, box, mtop, x, TRUE);
2677 void do_pbc_mtop(FILE *fplog, int ePBC, const matrix box,
2678 const gmx_mtop_t *mtop, rvec x[])
2680 low_do_pbc_mtop(fplog, ePBC, box, mtop, x, FALSE);
2683 void put_atoms_in_box_omp(int ePBC, const matrix box, gmx::ArrayRef<gmx::RVec> x)
2685 int t, nth;
2686 nth = gmx_omp_nthreads_get(emntDefault);
2688 #pragma omp parallel for num_threads(nth) schedule(static)
2689 for (t = 0; t < nth; t++)
2693 size_t natoms = x.size();
2694 size_t offset = (natoms*t )/nth;
2695 size_t len = (natoms*(t + 1))/nth - offset;
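            /* This integer arithmetic splits [0, natoms) into nth contiguous,
             * non-overlapping ranges that together cover all atoms, so each
             * thread handles a disjoint slice. */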
2696 put_atoms_in_box(ePBC, box, x.subArray(offset, len));
2698 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
2702 // TODO This can be cleaned up a lot and moved back to runner.cpp
2703 void finish_run(FILE *fplog, const gmx::MDLogger &mdlog, const t_commrec *cr,
2704 const t_inputrec *inputrec,
2705 t_nrnb nrnb[], gmx_wallcycle_t wcycle,
2706 gmx_walltime_accounting_t walltime_accounting,
2707 nonbonded_verlet_t *nbv,
2708 gmx_pme_t *pme,
2709 gmx_bool bWriteStat)
2711 t_nrnb *nrnb_tot = nullptr;
2712 double delta_t = 0;
2713 double nbfs = 0, mflop = 0;
2714 double elapsed_time,
2715 elapsed_time_over_all_ranks,
2716 elapsed_time_over_all_threads,
2717 elapsed_time_over_all_threads_over_all_ranks;
2718 /* Control whether it is valid to print a report. Only the
2719 simulation master may print, but it should not do so if the run
2720 terminated e.g. before a scheduled reset step. This is
2721 complicated by the fact that PME ranks are unaware of the
2722 reason why they were sent a pmerecvqxFINISH. To avoid
2723 communication deadlocks, we always do the communication for the
2724 report, even if we've decided not to write the report, because
2725 how long it takes to finish the run is not important when we've
2726 decided not to report on the simulation performance.
2728 Further, we only report performance for dynamical integrators,
2729 because those are the only ones for which we plan to
2730 consider doing any optimizations. */
2731 bool printReport = EI_DYNAMICS(inputrec->eI) && SIMMASTER(cr);
2733 if (printReport && !walltime_accounting_get_valid_finish(walltime_accounting))
2735 GMX_LOG(mdlog.warning).asParagraph().appendText("Simulation ended prematurely, no performance report will be written.");
2736 printReport = false;
2739 if (cr->nnodes > 1)
2741 snew(nrnb_tot, 1);
2742 #if GMX_MPI
2743 MPI_Allreduce(nrnb->n, nrnb_tot->n, eNRNB, MPI_DOUBLE, MPI_SUM,
2744 cr->mpi_comm_mysim);
2745 #endif
2747 else
2749 nrnb_tot = nrnb;
2752 elapsed_time = walltime_accounting_get_elapsed_time(walltime_accounting);
2753 elapsed_time_over_all_ranks = elapsed_time;
2754 elapsed_time_over_all_threads = walltime_accounting_get_elapsed_time_over_all_threads(walltime_accounting);
2755 elapsed_time_over_all_threads_over_all_ranks = elapsed_time_over_all_threads;
2756 #if GMX_MPI
2757 if (cr->nnodes > 1)
2759 /* reduce elapsed_time over all MPI ranks in the current simulation */
2760 MPI_Allreduce(&elapsed_time,
2761 &elapsed_time_over_all_ranks,
2762 1, MPI_DOUBLE, MPI_SUM,
2763 cr->mpi_comm_mysim);
2764 elapsed_time_over_all_ranks /= cr->nnodes;
2765 /* Reduce elapsed_time_over_all_threads over all MPI ranks in the
2766 * current simulation. */
2767 MPI_Allreduce(&elapsed_time_over_all_threads,
2768 &elapsed_time_over_all_threads_over_all_ranks,
2769 1, MPI_DOUBLE, MPI_SUM,
2770 cr->mpi_comm_mysim);
2772 #endif
2774 if (printReport)
2776 print_flop(fplog, nrnb_tot, &nbfs, &mflop);
2778 if (cr->nnodes > 1)
2780 sfree(nrnb_tot);
2783 if (thisRankHasDuty(cr, DUTY_PP) && DOMAINDECOMP(cr))
2785 print_dd_statistics(cr, inputrec, fplog);
2788 /* TODO Move the responsibility for any scaling by thread counts
2789 * to the code that handled the thread region, so that there's a
2790 * mechanism to keep cycle counting working during the transition
2791 * to task parallelism. */
2792 int nthreads_pp = gmx_omp_nthreads_get(emntNonbonded);
2793 int nthreads_pme = gmx_omp_nthreads_get(emntPME);
2794 wallcycle_scale_by_num_threads(wcycle, thisRankHasDuty(cr, DUTY_PME) && !thisRankHasDuty(cr, DUTY_PP), nthreads_pp, nthreads_pme);
2795 auto cycle_sum(wallcycle_sum(cr, wcycle));
2797 if (printReport)
2799 auto nbnxn_gpu_timings = use_GPU(nbv) ? nbnxn_gpu_get_timings(nbv->gpu_nbv) : nullptr;
2800 gmx_wallclock_gpu_pme_t pme_gpu_timings = {};
2801 if (pme_gpu_task_enabled(pme))
2803 pme_gpu_get_timings(pme, &pme_gpu_timings);
2805 wallcycle_print(fplog, mdlog, cr->nnodes, cr->npmenodes, nthreads_pp, nthreads_pme,
2806 elapsed_time_over_all_ranks,
2807 wcycle, cycle_sum,
2808 nbnxn_gpu_timings,
2809 &pme_gpu_timings);
2811 if (EI_DYNAMICS(inputrec->eI))
2813 delta_t = inputrec->delta_t;
2816 if (fplog)
2818 print_perf(fplog, elapsed_time_over_all_threads_over_all_ranks,
2819 elapsed_time_over_all_ranks,
2820 walltime_accounting_get_nsteps_done(walltime_accounting),
2821 delta_t, nbfs, mflop);
2823 if (bWriteStat)
2825 print_perf(stderr, elapsed_time_over_all_threads_over_all_ranks,
2826 elapsed_time_over_all_ranks,
2827 walltime_accounting_get_nsteps_done(walltime_accounting),
2828 delta_t, nbfs, mflop);
2833 extern void initialize_lambdas(FILE *fplog, t_inputrec *ir, int *fep_state, gmx::ArrayRef<real> lambda, double *lam0)
2835 /* this function works, but could probably use a logic rewrite to keep all the different
2836 types of efep straight. */
2838 if ((ir->efep == efepNO) && (ir->bSimTemp == FALSE))
2840 return;
2843 t_lambda *fep = ir->fepvals;
2844 *fep_state = fep->init_fep_state; /* this might overwrite the checkpoint
2845 if checkpoint is set -- a kludge is in for now
2846 to prevent this.*/
2848 for (int i = 0; i < efptNR; i++)
2850 /* overwrite lambda state with init_lambda for now for backwards compatibility */
2851 if (fep->init_lambda >= 0) /* if it's -1, it was never initialized */
2853 lambda[i] = fep->init_lambda;
2854 if (lam0)
2856 lam0[i] = lambda[i];
2859 else
2861 lambda[i] = fep->all_lambda[i][*fep_state];
2862 if (lam0)
2864 lam0[i] = lambda[i];
2868 if (ir->bSimTemp)
2870 /* need to rescale control temperatures to match current state */
2871 for (int i = 0; i < ir->opts.ngtc; i++)
2873 if (ir->opts.ref_t[i] > 0)
2875 ir->opts.ref_t[i] = ir->simtempvals->temperatures[*fep_state];
2880 /* Send to the log the information on the current lambdas */
2881 if (fplog != nullptr)
2883 fprintf(fplog, "Initial vector of lambda components:[ ");
2884 for (const auto &l : lambda)
2886 fprintf(fplog, "%10.4f ", l);
2888 fprintf(fplog, "]\n");
2890 return;
2894 void init_md(FILE *fplog,
2895 const t_commrec *cr, gmx::IMDOutputProvider *outputProvider,
2896 t_inputrec *ir, const gmx_output_env_t *oenv,
2897 const MdrunOptions &mdrunOptions,
2898 double *t, double *t0,
2899 t_state *globalState, double *lam0,
2900 t_nrnb *nrnb, gmx_mtop_t *mtop,
2901 gmx_update_t **upd,
2902 int nfile, const t_filenm fnm[],
2903 gmx_mdoutf_t *outf, t_mdebin **mdebin,
2904 tensor force_vir, tensor shake_vir, rvec mu_tot,
2905 gmx_bool *bSimAnn, t_vcm **vcm,
2906 gmx_wallcycle_t wcycle)
2908 int i;
2910 /* Initial values */
2911 *t = *t0 = ir->init_t;
2913 *bSimAnn = FALSE;
2914 for (i = 0; i < ir->opts.ngtc; i++)
2916 /* set bSimAnn if any group is being annealed */
2917 if (ir->opts.annealing[i] != eannNO)
2919 *bSimAnn = TRUE;
2923 /* Initialize lambda variables */
2924 /* TODO: Clean up initialization of fep_state and lambda in t_state.
2925 * We currently need to call initialize_lambdas on non-master ranks
2926 * to initialize lam0.
2928 if (MASTER(cr))
2930 initialize_lambdas(fplog, ir, &globalState->fep_state, globalState->lambda, lam0);
2932 else
2934 int tmpFepState;
2935 std::array<real, efptNR> tmpLambda;
2936 initialize_lambdas(fplog, ir, &tmpFepState, tmpLambda, lam0);
2939 // TODO upd is never NULL in practice, but the analysers don't know that
2940 if (upd)
2942 *upd = init_update(ir);
2944 if (*bSimAnn)
2946 update_annealing_target_temp(ir, ir->init_t, upd ? *upd : nullptr);
2949 if (vcm != nullptr)
2951 *vcm = init_vcm(fplog, &mtop->groups, ir);
2954 if (EI_DYNAMICS(ir->eI) && !mdrunOptions.continuationOptions.appendFiles)
2956 if (ir->etc == etcBERENDSEN)
2958 please_cite(fplog, "Berendsen84a");
2960 if (ir->etc == etcVRESCALE)
2962 please_cite(fplog, "Bussi2007a");
2964 if (ir->eI == eiSD1)
2966 please_cite(fplog, "Goga2012");
2969 init_nrnb(nrnb);
2971 if (nfile != -1)
2973 *outf = init_mdoutf(fplog, nfile, fnm, mdrunOptions, cr, outputProvider, ir, mtop, oenv, wcycle);
2975 *mdebin = init_mdebin(mdrunOptions.continuationOptions.appendFiles ? nullptr : mdoutf_get_fp_ene(*outf),
2976 mtop, ir, mdoutf_get_fp_dhdl(*outf));
2979 /* Initialize variables */
2980 clear_mat(force_vir);
2981 clear_mat(shake_vir);
2982 clear_rvec(mu_tot);