Implement assigning bondeds to the GPU
[gromacs.git] / src / gromacs / mdlib / sim_util.cpp
blob e0c6a813c8f30eb5f7f4c712b5ad6dfd749c1143
1 /*
2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5 * Copyright (c) 2001-2004, The GROMACS development team.
6 * Copyright (c) 2013,2014,2015,2016,2017,2018, by the GROMACS development team, led by
7 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
8 * and including many others, as listed in the AUTHORS file in the
9 * top-level source directory and at http://www.gromacs.org.
11 * GROMACS is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public License
13 * as published by the Free Software Foundation; either version 2.1
14 * of the License, or (at your option) any later version.
16 * GROMACS is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * Lesser General Public License for more details.
21 * You should have received a copy of the GNU Lesser General Public
22 * License along with GROMACS; if not, see
23 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
24 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
26 * If you want to redistribute modifications to GROMACS, please
27 * consider that scientific software is very special. Version
28 * control is crucial - bugs must be traceable. We will be happy to
29 * consider code for inclusion in the official distribution, but
30 * derived work must not be called official GROMACS. Details are found
31 * in the README & COPYING files - if they are missing, get the
32 * official version at http://www.gromacs.org.
34 * To help us fund GROMACS development, we humbly ask that you cite
35 * the research papers on the package. Check out http://www.gromacs.org.
37 #include "gmxpre.h"
39 #include "sim_util.h"
41 #include "config.h"
43 #include <cmath>
44 #include <cstdint>
45 #include <cstdio>
46 #include <cstring>
48 #include <array>
50 #include "gromacs/awh/awh.h"
51 #include "gromacs/domdec/dlbtiming.h"
52 #include "gromacs/domdec/domdec_struct.h"
53 #include "gromacs/domdec/partition.h"
54 #include "gromacs/essentialdynamics/edsam.h"
55 #include "gromacs/ewald/pme.h"
56 #include "gromacs/gmxlib/chargegroup.h"
57 #include "gromacs/gmxlib/network.h"
58 #include "gromacs/gmxlib/nrnb.h"
59 #include "gromacs/gmxlib/nonbonded/nb_free_energy.h"
60 #include "gromacs/gmxlib/nonbonded/nb_kernel.h"
61 #include "gromacs/gmxlib/nonbonded/nonbonded.h"
62 #include "gromacs/gpu_utils/gpu_utils.h"
63 #include "gromacs/imd/imd.h"
64 #include "gromacs/listed-forces/bonded.h"
65 #include "gromacs/listed-forces/disre.h"
66 #include "gromacs/listed-forces/manage-threading.h"
67 #include "gromacs/listed-forces/orires.h"
68 #include "gromacs/math/functions.h"
69 #include "gromacs/math/units.h"
70 #include "gromacs/math/vec.h"
71 #include "gromacs/math/vecdump.h"
72 #include "gromacs/mdlib/calcmu.h"
73 #include "gromacs/mdlib/calcvir.h"
74 #include "gromacs/mdlib/constr.h"
75 #include "gromacs/mdlib/force.h"
76 #include "gromacs/mdlib/forcerec.h"
77 #include "gromacs/mdlib/gmx_omp_nthreads.h"
78 #include "gromacs/mdlib/mdrun.h"
79 #include "gromacs/mdlib/nb_verlet.h"
80 #include "gromacs/mdlib/nbnxn_atomdata.h"
81 #include "gromacs/mdlib/nbnxn_gpu_data_mgmt.h"
82 #include "gromacs/mdlib/nbnxn_grid.h"
83 #include "gromacs/mdlib/nbnxn_search.h"
84 #include "gromacs/mdlib/qmmm.h"
85 #include "gromacs/mdlib/update.h"
86 #include "gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_gpu_ref.h"
87 #include "gromacs/mdtypes/commrec.h"
88 #include "gromacs/mdtypes/forceoutput.h"
89 #include "gromacs/mdtypes/iforceprovider.h"
90 #include "gromacs/mdtypes/inputrec.h"
91 #include "gromacs/mdtypes/md_enums.h"
92 #include "gromacs/mdtypes/state.h"
93 #include "gromacs/pbcutil/ishift.h"
94 #include "gromacs/pbcutil/mshift.h"
95 #include "gromacs/pbcutil/pbc.h"
96 #include "gromacs/pulling/pull.h"
97 #include "gromacs/pulling/pull_rotation.h"
98 #include "gromacs/timing/cyclecounter.h"
99 #include "gromacs/timing/gpu_timing.h"
100 #include "gromacs/timing/wallcycle.h"
101 #include "gromacs/timing/wallcyclereporting.h"
102 #include "gromacs/timing/walltime_accounting.h"
103 #include "gromacs/topology/topology.h"
104 #include "gromacs/utility/arrayref.h"
105 #include "gromacs/utility/basedefinitions.h"
106 #include "gromacs/utility/cstringutil.h"
107 #include "gromacs/utility/exceptions.h"
108 #include "gromacs/utility/fatalerror.h"
109 #include "gromacs/utility/gmxassert.h"
110 #include "gromacs/utility/gmxmpi.h"
111 #include "gromacs/utility/logger.h"
112 #include "gromacs/utility/pleasecite.h"
113 #include "gromacs/utility/smalloc.h"
114 #include "gromacs/utility/strconvert.h"
115 #include "gromacs/utility/sysinfo.h"
117 #include "nbnxn_gpu.h"
118 #include "nbnxn_kernels/nbnxn_kernel_cpu.h"
119 #include "nbnxn_kernels/nbnxn_kernel_prune.h"
121 // TODO: this environment variable allows us to verify before release
122 // that on less common architectures the total cost of polling is not larger than
123 // a blocking wait (so polling does not introduce overhead when the static
124 // PME-first ordering would suffice).
125 static const bool c_disableAlternatingWait = (getenv("GMX_DISABLE_ALTERNATING_GPU_WAIT") != nullptr);
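// As a usage note: only the presence of this variable is checked, so e.g. running
// "GMX_DISABLE_ALTERNATING_GPU_WAIT=1 gmx mdrun" (any value works) falls back to
// the static PME-first wait ordering mentioned above.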
128 void print_time(FILE *out,
129 gmx_walltime_accounting_t walltime_accounting,
130 int64_t step,
131 const t_inputrec *ir,
132 const t_commrec *cr)
134 time_t finish;
135 double dt, elapsed_seconds, time_per_step;
137 #if !GMX_THREAD_MPI
138 if (!PAR(cr))
139 #endif
141 fprintf(out, "\r");
143 fputs("step ", out);
144 fputs(gmx::int64ToString(step).c_str(), out);
145 fflush(out);
147 if ((step >= ir->nstlist))
149 double seconds_since_epoch = gmx_gettime();
150 elapsed_seconds = seconds_since_epoch - walltime_accounting_get_start_time_stamp(walltime_accounting);
151 time_per_step = elapsed_seconds/(step - ir->init_step + 1);
152 dt = (ir->nsteps + ir->init_step - step) * time_per_step;
154 if (ir->nsteps >= 0)
156 if (dt >= 300)
158 finish = static_cast<time_t>(seconds_since_epoch + dt);
159 auto timebuf = gmx_ctime_r(&finish);
160 fputs(", will finish ", out);
161 fputs(timebuf.c_str(), out);
163 else
165 fprintf(out, ", remaining wall clock time: %5d s ", static_cast<int>(dt));
168 else
170 fprintf(out, " performance: %.1f ns/day ",
171 ir->delta_t/1000*24*60*60/time_per_step);
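/* The expression above yields ns/day: ir->delta_t is the step size in ps, so
 * delta_t/1000 is the ns advanced per step, and 24*60*60/time_per_step is the
 * number of steps completed per day of wall time.
 */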
174 #if !GMX_THREAD_MPI
175 if (PAR(cr))
177 fprintf(out, "\n");
179 #else
180 GMX_UNUSED_VALUE(cr);
181 #endif
183 fflush(out);
186 void print_date_and_time(FILE *fplog, int nodeid, const char *title,
187 double the_time)
189 if (!fplog)
191 return;
194 time_t temp_time = static_cast<time_t>(the_time);
196 auto timebuf = gmx_ctime_r(&temp_time);
197 // Retain only the characters before the first space
198 timebuf.erase(std::find(timebuf.begin(), timebuf.end(), ' '), timebuf.end());
200 fprintf(fplog, "%s on rank %d %s\n", title, nodeid, timebuf.c_str());
203 void print_start(FILE *fplog, const t_commrec *cr,
204 gmx_walltime_accounting_t walltime_accounting,
205 const char *name)
207 char buf[STRLEN];
209 sprintf(buf, "Started %s", name);
210 print_date_and_time(fplog, cr->nodeid, buf,
211 walltime_accounting_get_start_time_stamp(walltime_accounting));
214 static void sum_forces(rvec f[], gmx::ArrayRef<const gmx::RVec> forceToAdd)
216 const int end = forceToAdd.size();
218 int gmx_unused nt = gmx_omp_nthreads_get(emntDefault);
219 #pragma omp parallel for num_threads(nt) schedule(static)
220 for (int i = 0; i < end; i++)
222 rvec_inc(f[i], forceToAdd[i]);
226 static void pme_gpu_reduce_outputs(gmx_wallcycle_t wcycle,
227 gmx::ForceWithVirial *forceWithVirial,
228 gmx::ArrayRef<const gmx::RVec> pmeForces,
229 gmx_enerdata_t *enerd,
230 const tensor vir_Q,
231 real Vlr_q)
233 wallcycle_start(wcycle, ewcPME_GPU_F_REDUCTION);
234 GMX_ASSERT(forceWithVirial, "Invalid force pointer");
235 forceWithVirial->addVirialContribution(vir_Q);
236 enerd->term[F_COUL_RECIP] += Vlr_q;
237 sum_forces(as_rvec_array(forceWithVirial->force_.data()), pmeForces);
238 wallcycle_stop(wcycle, ewcPME_GPU_F_REDUCTION);
241 static void calc_virial(int start, int homenr, const rvec x[], const rvec f[],
242 tensor vir_part, const t_graph *graph, const matrix box,
243 t_nrnb *nrnb, const t_forcerec *fr, int ePBC)
245 /* The short-range virial from surrounding boxes */
246 calc_vir(SHIFTS, fr->shift_vec, fr->fshift, vir_part, ePBC == epbcSCREW, box);
247 inc_nrnb(nrnb, eNR_VIRIAL, SHIFTS);
249 /* Calculate partial virial, for local atoms only, based on short range.
250 * Total virial is computed in global_stat, called from do_md
252 f_calc_vir(start, start+homenr, x, f, vir_part, graph, box);
253 inc_nrnb(nrnb, eNR_VIRIAL, homenr);
255 if (debug)
257 pr_rvecs(debug, 0, "vir_part", vir_part, DIM);
261 static void pull_potential_wrapper(const t_commrec *cr,
262 const t_inputrec *ir,
263 const matrix box, gmx::ArrayRef<const gmx::RVec> x,
264 gmx::ForceWithVirial *force,
265 const t_mdatoms *mdatoms,
266 gmx_enerdata_t *enerd,
267 const real *lambda,
268 double t,
269 gmx_wallcycle_t wcycle)
271 t_pbc pbc;
272 real dvdl;
274 /* Calculate the center of mass forces; this requires communication,
275 * which is why pull_potential is called close to other communication.
277 wallcycle_start(wcycle, ewcPULLPOT);
278 set_pbc(&pbc, ir->ePBC, box);
279 dvdl = 0;
280 enerd->term[F_COM_PULL] +=
281 pull_potential(ir->pull_work, mdatoms, &pbc,
282 cr, t, lambda[efptRESTRAINT], as_rvec_array(x.data()), force, &dvdl);
283 enerd->dvdl_lin[efptRESTRAINT] += dvdl;
284 wallcycle_stop(wcycle, ewcPULLPOT);
287 static void pme_receive_force_ener(const t_commrec *cr,
288 gmx::ForceWithVirial *forceWithVirial,
289 gmx_enerdata_t *enerd,
290 gmx_wallcycle_t wcycle)
292 real e_q, e_lj, dvdl_q, dvdl_lj;
293 float cycles_ppdpme, cycles_seppme;
295 cycles_ppdpme = wallcycle_stop(wcycle, ewcPPDURINGPME);
296 dd_cycles_add(cr->dd, cycles_ppdpme, ddCyclPPduringPME);
298 /* In case of node-splitting, the PP nodes receive the long-range
299 * forces, virial and energy from the PME nodes here.
301 wallcycle_start(wcycle, ewcPP_PMEWAITRECVF);
302 dvdl_q = 0;
303 dvdl_lj = 0;
304 gmx_pme_receive_f(cr, forceWithVirial, &e_q, &e_lj, &dvdl_q, &dvdl_lj,
305 &cycles_seppme);
306 enerd->term[F_COUL_RECIP] += e_q;
307 enerd->term[F_LJ_RECIP] += e_lj;
308 enerd->dvdl_lin[efptCOUL] += dvdl_q;
309 enerd->dvdl_lin[efptVDW] += dvdl_lj;
311 if (wcycle)
313 dd_cycles_add(cr->dd, cycles_seppme, ddCyclPME);
315 wallcycle_stop(wcycle, ewcPP_PMEWAITRECVF);
318 static void print_large_forces(FILE *fp,
319 const t_mdatoms *md,
320 const t_commrec *cr,
321 int64_t step,
322 real forceTolerance,
323 const rvec *x,
324 const rvec *f)
326 real force2Tolerance = gmx::square(forceTolerance);
327 gmx::index numNonFinite = 0;
328 for (int i = 0; i < md->homenr; i++)
330 real force2 = norm2(f[i]);
331 bool nonFinite = !std::isfinite(force2);
332 if (force2 >= force2Tolerance || nonFinite)
334 fprintf(fp, "step %" PRId64 " atom %6d x %8.3f %8.3f %8.3f force %12.5e\n",
335 step,
336 ddglatnr(cr->dd, i), x[i][XX], x[i][YY], x[i][ZZ], std::sqrt(force2));
338 if (nonFinite)
340 numNonFinite++;
343 if (numNonFinite > 0)
345 /* Note that with MPI this fatal call on one rank might interrupt
346 * the printing on other ranks. But we can only avoid that with
347 * an expensive MPI barrier that we would need at each step.
349 gmx_fatal(FARGS, "At step %" PRId64 " detected non-finite forces on %td atoms", step, numNonFinite);
353 static void post_process_forces(const t_commrec *cr,
354 int64_t step,
355 t_nrnb *nrnb,
356 gmx_wallcycle_t wcycle,
357 const gmx_localtop_t *top,
358 const matrix box,
359 const rvec x[],
360 rvec f[],
361 gmx::ForceWithVirial *forceWithVirial,
362 tensor vir_force,
363 const t_mdatoms *mdatoms,
364 const t_graph *graph,
365 const t_forcerec *fr,
366 const gmx_vsite_t *vsite,
367 int flags)
369 if (fr->haveDirectVirialContributions)
371 rvec *fDirectVir = as_rvec_array(forceWithVirial->force_.data());
373 if (vsite)
375 /* Spread the mesh force on virtual sites to the other particles...
376 * This is parallelized. MPI communication is performed
377 * if the constructing atoms aren't local.
379 matrix virial = { { 0 } };
380 spread_vsite_f(vsite, x, fDirectVir, nullptr,
381 (flags & GMX_FORCE_VIRIAL) != 0, virial,
382 nrnb,
383 &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr, wcycle);
384 forceWithVirial->addVirialContribution(virial);
387 if (flags & GMX_FORCE_VIRIAL)
389 /* Now add the forces, this is local */
390 sum_forces(f, forceWithVirial->force_);
392 /* Add the direct virial contributions */
393 GMX_ASSERT(forceWithVirial->computeVirial_, "forceWithVirial should request virial computation when we request the virial");
394 m_add(vir_force, forceWithVirial->getVirial(), vir_force);
396 if (debug)
398 pr_rvecs(debug, 0, "vir_force", vir_force, DIM);
403 if (fr->print_force >= 0)
405 print_large_forces(stderr, mdatoms, cr, step, fr->print_force, x, f);
409 static void do_nb_verlet(const t_forcerec *fr,
410 const interaction_const_t *ic,
411 gmx_enerdata_t *enerd,
412 int flags, int ilocality,
413 int clearF,
414 int64_t step,
415 t_nrnb *nrnb,
416 gmx_wallcycle_t wcycle)
418 if (!(flags & GMX_FORCE_NONBONDED))
420 /* skip non-bonded calculation */
421 return;
424 nonbonded_verlet_t *nbv = fr->nbv;
425 nonbonded_verlet_group_t *nbvg = &nbv->grp[ilocality];
427 /* GPU kernel launch overhead is already timed separately */
428 if (fr->cutoff_scheme != ecutsVERLET)
430 gmx_incons("Invalid cut-off scheme passed!");
433 bool bUsingGpuKernels = (nbvg->kernel_type == nbnxnk8x8x8_GPU);
435 if (!bUsingGpuKernels)
437 /* When dynamic pair-list pruning is requested, we need to prune
438 * at nstlistPrune steps.
440 if (nbv->listParams->useDynamicPruning &&
441 (step - nbvg->nbl_lists.outerListCreationStep) % nbv->listParams->nstlistPrune == 0)
443 /* Prune the pair-list beyond fr->ic->rlistPrune using
444 * the current coordinates of the atoms.
446 wallcycle_sub_start(wcycle, ewcsNONBONDED_PRUNING);
447 nbnxn_kernel_cpu_prune(nbvg, nbv->nbat, fr->shift_vec, nbv->listParams->rlistInner);
448 wallcycle_sub_stop(wcycle, ewcsNONBONDED_PRUNING);
451 wallcycle_sub_start(wcycle, ewcsNONBONDED);
454 switch (nbvg->kernel_type)
456 case nbnxnk4x4_PlainC:
457 case nbnxnk4xN_SIMD_4xN:
458 case nbnxnk4xN_SIMD_2xNN:
459 nbnxn_kernel_cpu(nbvg,
460 nbv->nbat,
462 fr->shift_vec,
463 flags,
464 clearF,
465 fr->fshift[0],
466 enerd->grpp.ener[egCOULSR],
467 fr->bBHAM ?
468 enerd->grpp.ener[egBHAMSR] :
469 enerd->grpp.ener[egLJSR]);
470 break;
472 case nbnxnk8x8x8_GPU:
473 nbnxn_gpu_launch_kernel(nbv->gpu_nbv, nbv->nbat, flags, ilocality);
474 break;
476 case nbnxnk8x8x8_PlainC:
477 nbnxn_kernel_gpu_ref(nbvg->nbl_lists.nbl[0],
478 nbv->nbat, ic,
479 fr->shift_vec,
480 flags,
481 clearF,
482 nbv->nbat->out[0].f,
483 fr->fshift[0],
484 enerd->grpp.ener[egCOULSR],
485 fr->bBHAM ?
486 enerd->grpp.ener[egBHAMSR] :
487 enerd->grpp.ener[egLJSR]);
488 break;
490 default:
491 GMX_RELEASE_ASSERT(false, "Invalid nonbonded kernel type passed!");
494 if (!bUsingGpuKernels)
496 wallcycle_sub_stop(wcycle, ewcsNONBONDED);
499 int enr_nbnxn_kernel_ljc, enr_nbnxn_kernel_lj;
500 if (EEL_RF(ic->eeltype) || ic->eeltype == eelCUT)
502 enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_RF;
504 else if ((!bUsingGpuKernels && nbvg->ewald_excl == ewaldexclAnalytical) ||
505 (bUsingGpuKernels && nbnxn_gpu_is_kernel_ewald_analytical(nbv->gpu_nbv)))
507 enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_EWALD;
509 else
511 enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_TAB;
513 enr_nbnxn_kernel_lj = eNR_NBNXN_LJ;
514 if (flags & GMX_FORCE_ENERGY)
516 /* In eNR_??? the nbnxn F+E kernels are always the F kernel + 1 */
517 enr_nbnxn_kernel_ljc += 1;
518 enr_nbnxn_kernel_lj += 1;
521 inc_nrnb(nrnb, enr_nbnxn_kernel_ljc,
522 nbvg->nbl_lists.natpair_ljq);
523 inc_nrnb(nrnb, enr_nbnxn_kernel_lj,
524 nbvg->nbl_lists.natpair_lj);
525 /* The Coulomb-only kernels are offset -eNR_NBNXN_LJ_RF+eNR_NBNXN_RF */
526 inc_nrnb(nrnb, enr_nbnxn_kernel_ljc-eNR_NBNXN_LJ_RF+eNR_NBNXN_RF,
527 nbvg->nbl_lists.natpair_q);
529 if (ic->vdw_modifier == eintmodFORCESWITCH)
531 /* We add up the switch cost separately */
532 inc_nrnb(nrnb, eNR_NBNXN_ADD_LJ_FSW+((flags & GMX_FORCE_ENERGY) ? 1 : 0),
533 nbvg->nbl_lists.natpair_ljq + nbvg->nbl_lists.natpair_lj);
535 if (ic->vdw_modifier == eintmodPOTSWITCH)
537 /* We add up the switch cost separately */
538 inc_nrnb(nrnb, eNR_NBNXN_ADD_LJ_PSW+((flags & GMX_FORCE_ENERGY) ? 1 : 0),
539 nbvg->nbl_lists.natpair_ljq + nbvg->nbl_lists.natpair_lj);
541 if (ic->vdwtype == evdwPME)
543 /* We add up the LJ Ewald cost separately */
544 inc_nrnb(nrnb, eNR_NBNXN_ADD_LJ_EWALD+((flags & GMX_FORCE_ENERGY) ? 1 : 0),
545 nbvg->nbl_lists.natpair_ljq + nbvg->nbl_lists.natpair_lj);
549 static void do_nb_verlet_fep(nbnxn_pairlist_set_t *nbl_lists,
550 t_forcerec *fr,
551 rvec x[],
552 rvec f[],
553 const t_mdatoms *mdatoms,
554 t_lambda *fepvals,
555 real *lambda,
556 gmx_enerdata_t *enerd,
557 int flags,
558 t_nrnb *nrnb,
559 gmx_wallcycle_t wcycle)
561 int donb_flags;
562 nb_kernel_data_t kernel_data;
563 real lam_i[efptNR];
564 real dvdl_nb[efptNR];
565 int th;
566 int i, j;
568 donb_flags = 0;
569 /* Add short-range interactions */
570 donb_flags |= GMX_NONBONDED_DO_SR;
572 /* Currently all group scheme kernels always calculate (shift-)forces */
573 if (flags & GMX_FORCE_FORCES)
575 donb_flags |= GMX_NONBONDED_DO_FORCE;
577 if (flags & GMX_FORCE_VIRIAL)
579 donb_flags |= GMX_NONBONDED_DO_SHIFTFORCE;
581 if (flags & GMX_FORCE_ENERGY)
583 donb_flags |= GMX_NONBONDED_DO_POTENTIAL;
586 kernel_data.flags = donb_flags;
587 kernel_data.lambda = lambda;
588 kernel_data.dvdl = dvdl_nb;
590 kernel_data.energygrp_elec = enerd->grpp.ener[egCOULSR];
591 kernel_data.energygrp_vdw = enerd->grpp.ener[egLJSR];
593 /* reset free energy components */
594 for (i = 0; i < efptNR; i++)
596 dvdl_nb[i] = 0;
599 GMX_ASSERT(gmx_omp_nthreads_get(emntNonbonded) == nbl_lists->nnbl, "Number of lists should be same as number of NB threads");
601 wallcycle_sub_start(wcycle, ewcsNONBONDED);
602 #pragma omp parallel for schedule(static) num_threads(nbl_lists->nnbl)
603 for (th = 0; th < nbl_lists->nnbl; th++)
607 gmx_nb_free_energy_kernel(nbl_lists->nbl_fep[th],
608 x, f, fr, mdatoms, &kernel_data, nrnb);
610 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
613 if (fepvals->sc_alpha != 0)
615 enerd->dvdl_nonlin[efptVDW] += dvdl_nb[efptVDW];
616 enerd->dvdl_nonlin[efptCOUL] += dvdl_nb[efptCOUL];
618 else
620 enerd->dvdl_lin[efptVDW] += dvdl_nb[efptVDW];
621 enerd->dvdl_lin[efptCOUL] += dvdl_nb[efptCOUL];
624 /* If we do foreign lambda and we have soft-core interactions
625 * we have to recalculate the (non-linear) energy contributions.
627 if (fepvals->n_lambda > 0 && (flags & GMX_FORCE_DHDL) && fepvals->sc_alpha != 0)
629 kernel_data.flags = (donb_flags & ~(GMX_NONBONDED_DO_FORCE | GMX_NONBONDED_DO_SHIFTFORCE)) | GMX_NONBONDED_DO_FOREIGNLAMBDA;
630 kernel_data.lambda = lam_i;
631 kernel_data.energygrp_elec = enerd->foreign_grpp.ener[egCOULSR];
632 kernel_data.energygrp_vdw = enerd->foreign_grpp.ener[egLJSR];
633 /* Note that we add to kernel_data.dvdl, but ignore the result */
635 for (i = 0; i < enerd->n_lambda; i++)
637 for (j = 0; j < efptNR; j++)
639 lam_i[j] = (i == 0 ? lambda[j] : fepvals->all_lambda[j][i-1]);
641 reset_foreign_enerdata(enerd);
642 #pragma omp parallel for schedule(static) num_threads(nbl_lists->nnbl)
643 for (th = 0; th < nbl_lists->nnbl; th++)
647 gmx_nb_free_energy_kernel(nbl_lists->nbl_fep[th],
648 x, f, fr, mdatoms, &kernel_data, nrnb);
650 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
653 sum_epot(&(enerd->foreign_grpp), enerd->foreign_term);
654 enerd->enerpart_lambda[i] += enerd->foreign_term[F_EPOT];
658 wallcycle_sub_stop(wcycle, ewcsNONBONDED);
661 gmx_bool use_GPU(const nonbonded_verlet_t *nbv)
663 return nbv != nullptr && nbv->bUseGPU;
666 static inline void clear_rvecs_omp(int n, rvec v[])
668 int nth = gmx_omp_nthreads_get_simple_rvec_task(emntDefault, n);
670 /* Note that we would like to avoid this conditional by putting it
671 * into the omp pragma instead, but then we still take the full
672 * omp parallel for overhead (at least with gcc5).
674 if (nth == 1)
676 for (int i = 0; i < n; i++)
678 clear_rvec(v[i]);
681 else
683 #pragma omp parallel for num_threads(nth) schedule(static)
684 for (int i = 0; i < n; i++)
686 clear_rvec(v[i]);
691 /*! \brief Return an estimate of the average kinetic energy or 0 when unreliable
693 * \param groupOptions Group options, containing T-coupling options
695 static real averageKineticEnergyEstimate(const t_grpopts &groupOptions)
697 real nrdfCoupled = 0;
698 real nrdfUncoupled = 0;
699 real kineticEnergy = 0;
700 for (int g = 0; g < groupOptions.ngtc; g++)
702 if (groupOptions.tau_t[g] >= 0)
704 nrdfCoupled += groupOptions.nrdf[g];
705 kineticEnergy += groupOptions.nrdf[g]*0.5*groupOptions.ref_t[g]*BOLTZ;
707 else
709 nrdfUncoupled += groupOptions.nrdf[g];
713 /* This conditional with > also catches nrdf=0 */
714 if (nrdfCoupled > nrdfUncoupled)
716 return kineticEnergy*(nrdfCoupled + nrdfUncoupled)/nrdfCoupled;
718 else
720 return 0;
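/* As a sketch of the estimate above: each coupled group g contributes the
 * equipartition value 0.5*nrdf_g*kB*Tref_g (BOLTZ is kB in GROMACS units), and
 * the sum is scaled by (nrdfCoupled + nrdfUncoupled)/nrdfCoupled on the
 * assumption that uncoupled degrees of freedom carry a similar amount of
 * kinetic energy per degree of freedom.
 */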
724 /*! \brief This routine checks that the potential energy is finite.
726 * Always checks that the potential energy is finite. If step equals
727 * inputrec.init_step also checks that the magnitude of the potential energy
728 * is reasonable. Terminates with a fatal error when a check fails.
729 * Note that passing this check does not guarantee finite forces,
730 * since those use slightly different arithmetic. But in most cases
731 * there is just a narrow coordinate range where forces are not finite
732 * and energies are finite.
734 * \param[in] step The step number, used for checking and printing
735 * \param[in] enerd The energy data; the non-bonded group energies need to be added to enerd.term[F_EPOT] before calling this routine
736 * \param[in] inputrec The input record
738 static void checkPotentialEnergyValidity(int64_t step,
739 const gmx_enerdata_t &enerd,
740 const t_inputrec &inputrec)
742 /* Threshold valid for comparing absolute potential energy against
743 * the kinetic energy. Normally one should not consider absolute
744 * potential energy values, but with a factor of one million
745 * we should never get false positives.
747 constexpr real c_thresholdFactor = 1e6;
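/* To give a rough sense of scale (assuming nrdf ~ 3N and ignoring constraints):
 * a 100000-atom system coupled at 300 K has an average kinetic energy of about
 * 0.5*300000*0.0083145*300 ~ 3.7e5 kJ/mol, so the magnitude check below only
 * triggers for potential energies beyond roughly 3.7e11 kJ/mol.
 */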
749 bool energyIsNotFinite = !std::isfinite(enerd.term[F_EPOT]);
750 real averageKineticEnergy = 0;
751 /* We only check for large potential energy at the initial step,
752 * because that is by far the most likely step for this to occur
753 * and because computing the average kinetic energy is not free.
754 * Note: nstcalcenergy >> 1 often does not allow catching large energies
755 * before they become NaN.
757 if (step == inputrec.init_step && EI_DYNAMICS(inputrec.eI))
759 averageKineticEnergy = averageKineticEnergyEstimate(inputrec.opts);
762 if (energyIsNotFinite || (averageKineticEnergy > 0 &&
763 enerd.term[F_EPOT] > c_thresholdFactor*averageKineticEnergy))
765 gmx_fatal(FARGS, "Step %" PRId64 ": The total potential energy is %g, which is %s. The LJ and electrostatic contributions to the energy are %g and %g, respectively. A %s potential energy can be caused by overlapping interactions in bonded interactions or very large%s coordinate values. Usually this is caused by a badly- or non-equilibrated initial configuration, incorrect interactions or parameters in the topology.",
766 step,
767 enerd.term[F_EPOT],
768 energyIsNotFinite ? "not finite" : "extremely high",
769 enerd.term[F_LJ],
770 enerd.term[F_COUL_SR],
771 energyIsNotFinite ? "non-finite" : "very high",
772 energyIsNotFinite ? " or NaN" : "");
776 /*! \brief Compute forces and/or energies for special algorithms
778 * The intention is to collect all calls to algorithms that compute
779 * forces on local atoms only and that do not contribute to the local
780 * virial sum (but add their virial contribution separately).
781 * Eventually these should likely all become ForceProviders.
782 * Within this function the intention is to have algorithms that do
783 * global communication at the end, so global barriers within the MD loop
784 * are as close together as possible.
786 * \param[in] fplog The log file
787 * \param[in] cr The communication record
788 * \param[in] inputrec The input record
789 * \param[in] awh The Awh module (nullptr if none in use).
790 * \param[in] enforcedRotation Enforced rotation module.
791 * \param[in] step The current MD step
792 * \param[in] t The current time
793 * \param[in,out] wcycle Wallcycle accounting struct
794 * \param[in,out] forceProviders Pointer to a list of force providers
795 * \param[in] box The unit cell
796 * \param[in] x The coordinates
797 * \param[in] mdatoms Per atom properties
798 * \param[in] lambda Array of free-energy lambda values
799 * \param[in] forceFlags Flags that tell whether we should compute forces/energies/virial
800 * \param[in,out] forceWithVirial Force and virial buffers
801 * \param[in,out] enerd Energy buffer
802 * \param[in,out] ed Essential dynamics pointer
803 * \param[in] bNS Tells if we did neighbor searching this step, used for ED sampling
805 * \todo Remove bNS, which is used incorrectly.
806 * \todo Convert all other algorithms called here to ForceProviders.
808 static void
809 computeSpecialForces(FILE *fplog,
810 const t_commrec *cr,
811 const t_inputrec *inputrec,
812 gmx::Awh *awh,
813 gmx_enfrot *enforcedRotation,
814 int64_t step,
815 double t,
816 gmx_wallcycle_t wcycle,
817 ForceProviders *forceProviders,
818 matrix box,
819 gmx::ArrayRef<const gmx::RVec> x,
820 const t_mdatoms *mdatoms,
821 real *lambda,
822 int forceFlags,
823 gmx::ForceWithVirial *forceWithVirial,
824 gmx_enerdata_t *enerd,
825 gmx_edsam *ed,
826 gmx_bool bNS)
828 const bool computeForces = (forceFlags & GMX_FORCE_FORCES) != 0;
830 /* NOTE: Currently all ForceProviders only provide forces.
831 * When they also provide energies, remove this conditional.
833 if (computeForces)
835 gmx::ForceProviderInput forceProviderInput(x, *mdatoms, t, box, *cr);
836 gmx::ForceProviderOutput forceProviderOutput(forceWithVirial, enerd);
838 /* Collect forces from modules */
839 forceProviders->calculateForces(forceProviderInput, &forceProviderOutput);
842 if (inputrec->bPull && pull_have_potential(inputrec->pull_work))
844 pull_potential_wrapper(cr, inputrec, box, x,
845 forceWithVirial,
846 mdatoms, enerd, lambda, t,
847 wcycle);
849 if (awh)
851 enerd->term[F_COM_PULL] +=
852 awh->applyBiasForcesAndUpdateBias(inputrec->ePBC, *mdatoms, box,
853 forceWithVirial,
854 t, step, wcycle, fplog);
858 rvec *f = as_rvec_array(forceWithVirial->force_.data());
860 /* Add the forces from enforced rotation potentials (if any) */
861 if (inputrec->bRot)
863 wallcycle_start(wcycle, ewcROTadd);
864 enerd->term[F_COM_PULL] += add_rot_forces(enforcedRotation, f, cr, step, t);
865 wallcycle_stop(wcycle, ewcROTadd);
868 if (ed)
870 /* Note that since init_edsam() is called after the initialization
871 * of forcerec, edsam doesn't request the noVirSum force buffer.
872 * Thus if no other algorithm (e.g. PME) requires it, the forces
873 * here will contribute to the virial.
875 do_flood(cr, inputrec, as_rvec_array(x.data()), f, ed, box, step, bNS);
878 /* Add forces from interactive molecular dynamics (IMD), if bIMD == TRUE. */
879 if (inputrec->bIMD && computeForces)
881 IMD_apply_forces(inputrec->bIMD, inputrec->imd, cr, f, wcycle);
885 /*! \brief Launch the prepare_step and spread stages of PME GPU.
887 * \param[in] pmedata The PME structure
888 * \param[in] box The box matrix
889 * \param[in] x Coordinate array
890 * \param[in] flags Force flags
891 * \param[in] wcycle The wallcycle structure
893 static inline void launchPmeGpuSpread(gmx_pme_t *pmedata,
894 matrix box,
895 rvec x[],
896 int flags,
897 gmx_wallcycle_t wcycle)
899 int pmeFlags = GMX_PME_SPREAD | GMX_PME_SOLVE;
900 pmeFlags |= (flags & GMX_FORCE_FORCES) ? GMX_PME_CALC_F : 0;
901 pmeFlags |= (flags & GMX_FORCE_VIRIAL) ? GMX_PME_CALC_ENER_VIR : 0;
903 pme_gpu_prepare_computation(pmedata, (flags & GMX_FORCE_DYNAMICBOX) != 0, box, wcycle, pmeFlags);
904 pme_gpu_launch_spread(pmedata, x, wcycle);
907 /*! \brief Launch the FFT and gather stages of PME GPU
909 * This function only implements setting the output forces (no accumulation).
911 * \param[in] pmedata The PME structure
912 * \param[in] wcycle The wallcycle structure
914 static void launchPmeGpuFftAndGather(gmx_pme_t *pmedata,
915 gmx_wallcycle_t wcycle)
917 pme_gpu_launch_complex_transforms(pmedata, wcycle);
918 pme_gpu_launch_gather(pmedata, wcycle, PmeForceOutputHandling::Set);
921 /*! \brief
922 * Polling wait for either of the PME or nonbonded GPU tasks.
924 * Instead of a static order in waiting for GPU tasks, this function
925 * polls checking which of the two tasks completes first, and does the
926 * associated force buffer reduction overlapped with the other task.
927 * By doing that, unlike static scheduling order, it can always overlap
928 * one of the reductions, regardless of the GPU task completion order.
930 * \param[in] nbv Nonbonded verlet structure
931 * \param[in] pmedata PME module data
932 * \param[in,out] force Force array to reduce task outputs into.
933 * \param[in,out] forceWithVirial Force and virial buffers
934 * \param[in,out] fshift Shift force output vector results are reduced into
935 * \param[in,out] enerd Energy data structure results are reduced into
936 * \param[in] flags Force flags
937 * \param[in] wcycle The wallcycle structure
939 static void alternatePmeNbGpuWaitReduce(nonbonded_verlet_t *nbv,
940 const gmx_pme_t *pmedata,
941 gmx::PaddedArrayRef<gmx::RVec> *force,
942 gmx::ForceWithVirial *forceWithVirial,
943 rvec fshift[],
944 gmx_enerdata_t *enerd,
945 int flags,
946 gmx_wallcycle_t wcycle)
948 bool isPmeGpuDone = false;
949 bool isNbGpuDone = false;
952 gmx::ArrayRef<const gmx::RVec> pmeGpuForces;
954 while (!isPmeGpuDone || !isNbGpuDone)
956 if (!isPmeGpuDone)
958 matrix vir_Q;
959 real Vlr_q;
961 GpuTaskCompletion completionType = (isNbGpuDone) ? GpuTaskCompletion::Wait : GpuTaskCompletion::Check;
962 isPmeGpuDone = pme_gpu_try_finish_task(pmedata, wcycle, &pmeGpuForces,
963 vir_Q, &Vlr_q, completionType);
965 if (isPmeGpuDone)
967 pme_gpu_reduce_outputs(wcycle, forceWithVirial, pmeGpuForces,
968 enerd, vir_Q, Vlr_q);
972 if (!isNbGpuDone)
974 GpuTaskCompletion completionType = (isPmeGpuDone) ? GpuTaskCompletion::Wait : GpuTaskCompletion::Check;
975 wallcycle_start_nocount(wcycle, ewcWAIT_GPU_NB_L);
976 isNbGpuDone = nbnxn_gpu_try_finish_task(nbv->gpu_nbv,
977 flags, eatLocal,
978 enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
979 fshift, completionType);
980 wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
981 // To get the call count right, when the task finished we
982 // issue a start/stop.
983 // TODO: move the ewcWAIT_GPU_NB_L cycle counting into nbnxn_gpu_try_finish_task()
984 // and ewcNB_XF_BUF_OPS counting into nbnxn_atomdata_add_nbat_f_to_f().
985 if (isNbGpuDone)
987 wallcycle_start(wcycle, ewcWAIT_GPU_NB_L);
988 wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
990 nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), eatLocal,
991 nbv->nbat, as_rvec_array(force->data()), wcycle);
997 /*! \brief
998 * Launch the dynamic rolling pruning GPU task.
1000 * We currently alternate local/non-local list pruning in odd-even steps
1001 * (only pruning every second step without DD).
1003 * \param[in] cr The communication record
1004 * \param[in] nbv Nonbonded verlet structure
1005 * \param[in] inputrec The input record
1006 * \param[in] step The current MD step
1008 static inline void launchGpuRollingPruning(const t_commrec *cr,
1009 const nonbonded_verlet_t *nbv,
1010 const t_inputrec *inputrec,
1011 const int64_t step)
1013 /* We should not launch the rolling pruning kernel at a search
1014 * step or just before search steps, since that's useless.
1015 * Without domain decomposition we prune at even steps.
1016 * With domain decomposition we alternate local and non-local
1017 * pruning at even and odd steps.
1019 int numRollingParts = nbv->listParams->numRollingParts;
1020 GMX_ASSERT(numRollingParts == nbv->listParams->nstlistPrune/2, "Since we alternate local/non-local at even/odd steps, we need numRollingParts<=nstlistPrune/2 for correctness and == for efficiency");
1021 int stepWithCurrentList = step - nbv->grp[eintLocal].nbl_lists.outerListCreationStep;
1022 bool stepIsEven = ((stepWithCurrentList & 1) == 0);
1023 if (stepWithCurrentList > 0 &&
1024 stepWithCurrentList < inputrec->nstlist - 1 &&
1025 (stepIsEven || DOMAINDECOMP(cr)))
1027 nbnxn_gpu_launch_kernel_pruneonly(nbv->gpu_nbv,
1028 stepIsEven ? eintLocal : eintNonlocal,
1029 numRollingParts);
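/* As an illustration of the schedule above: with nstlist = 10 and no domain
 * decomposition, pruning kernels are launched at stepWithCurrentList = 2, 4, 6
 * and 8 (local list only); with domain decomposition they are launched at every
 * stepWithCurrentList in 1..8, local at even and non-local at odd values.
 */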
1033 static void do_force_cutsVERLET(FILE *fplog,
1034 const t_commrec *cr,
1035 const gmx_multisim_t *ms,
1036 const t_inputrec *inputrec,
1037 gmx::Awh *awh,
1038 gmx_enfrot *enforcedRotation,
1039 int64_t step,
1040 t_nrnb *nrnb,
1041 gmx_wallcycle_t wcycle,
1042 const gmx_localtop_t *top,
1043 const gmx_groups_t * /* groups */,
1044 matrix box, gmx::PaddedArrayRef<gmx::RVec> x,
1045 history_t *hist,
1046 gmx::PaddedArrayRef<gmx::RVec> force,
1047 tensor vir_force,
1048 const t_mdatoms *mdatoms,
1049 gmx_enerdata_t *enerd, t_fcdata *fcd,
1050 real *lambda,
1051 t_graph *graph,
1052 t_forcerec *fr,
1053 interaction_const_t *ic,
1054 const gmx_vsite_t *vsite,
1055 rvec mu_tot,
1056 double t,
1057 gmx_edsam *ed,
1058 int flags,
1059 DdOpenBalanceRegionBeforeForceComputation ddOpenBalanceRegion,
1060 DdCloseBalanceRegionAfterForceComputation ddCloseBalanceRegion)
1062 int cg1, i, j;
1063 double mu[2*DIM];
1064 gmx_bool bStateChanged, bNS, bFillGrid, bCalcCGCM;
1065 gmx_bool bDoForces, bUseGPU, bUseOrEmulGPU;
1066 rvec vzero, box_diag;
1067 float cycles_pme, cycles_wait_gpu;
1068 nonbonded_verlet_t *nbv = fr->nbv;
1070 bStateChanged = ((flags & GMX_FORCE_STATECHANGED) != 0);
1071 bNS = ((flags & GMX_FORCE_NS) != 0) && (!fr->bAllvsAll);
1072 bFillGrid = (bNS && bStateChanged);
1073 bCalcCGCM = (bFillGrid && !DOMAINDECOMP(cr));
1074 bDoForces = ((flags & GMX_FORCE_FORCES) != 0);
1075 bUseGPU = fr->nbv->bUseGPU;
1076 bUseOrEmulGPU = bUseGPU || (fr->nbv->emulateGpu == EmulateGpuNonbonded::Yes);
1078 const auto pmeRunMode = fr->pmedata ? pme_run_mode(fr->pmedata) : PmeRunMode::CPU;
1079 // TODO slim this conditional down - inputrec and duty checks should mean the same in proper code!
1080 const bool useGpuPme = EEL_PME(fr->ic->eeltype) && thisRankHasDuty(cr, DUTY_PME) &&
1081 ((pmeRunMode == PmeRunMode::GPU) || (pmeRunMode == PmeRunMode::Mixed));
1083 /* At a search step we need to start the first balancing region
1084 * somewhere early inside the step after communication during domain
1085 * decomposition (and not during the previous step as usual).
1087 if (bNS &&
1088 ddOpenBalanceRegion == DdOpenBalanceRegionBeforeForceComputation::yes)
1090 ddOpenBalanceRegionCpu(cr->dd, DdAllowBalanceRegionReopen::yes);
1093 cycles_wait_gpu = 0;
1095 const int start = 0;
1096 const int homenr = mdatoms->homenr;
1098 clear_mat(vir_force);
1100 if (DOMAINDECOMP(cr))
1102 cg1 = cr->dd->globalAtomGroupIndices.size();
1104 else
1106 cg1 = top->cgs.nr;
1108 if (fr->n_tpi > 0)
1110 cg1--;
1113 if (bStateChanged)
1115 update_forcerec(fr, box);
1117 if (inputrecNeedMutot(inputrec))
1119 /* Calculate total (local) dipole moment in a temporary common array.
1120 * This makes it possible to sum them over nodes faster.
1122 calc_mu(start, homenr,
1123 x, mdatoms->chargeA, mdatoms->chargeB, mdatoms->nChargePerturbed,
1124 mu, mu+DIM);
1128 if (fr->ePBC != epbcNONE)
1130 /* Compute shift vectors every step,
1131 * because of pressure coupling or box deformation!
1133 if ((flags & GMX_FORCE_DYNAMICBOX) && bStateChanged)
1135 calc_shifts(box, fr->shift_vec);
1138 if (bCalcCGCM)
1140 put_atoms_in_box_omp(fr->ePBC, box, x.subArray(0, homenr));
1141 inc_nrnb(nrnb, eNR_SHIFTX, homenr);
1143 else if (EI_ENERGY_MINIMIZATION(inputrec->eI) && graph)
1145 unshift_self(graph, box, as_rvec_array(x.data()));
1149 nbnxn_atomdata_copy_shiftvec((flags & GMX_FORCE_DYNAMICBOX) != 0,
1150 fr->shift_vec, nbv->nbat);
1152 #if GMX_MPI
1153 if (!thisRankHasDuty(cr, DUTY_PME))
1155 /* Send particle coordinates to the pme nodes.
1156 * Since this is only implemented for domain decomposition
1157 * and domain decomposition does not use the graph,
1158 * we do not need to worry about shifting.
1160 gmx_pme_send_coordinates(cr, box, as_rvec_array(x.data()),
1161 lambda[efptCOUL], lambda[efptVDW],
1162 (flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY)) != 0,
1163 step, wcycle);
1165 #endif /* GMX_MPI */
1167 if (useGpuPme)
1169 launchPmeGpuSpread(fr->pmedata, box, as_rvec_array(x.data()), flags, wcycle);
1172 /* do gridding for pair search */
1173 if (bNS)
1175 if (graph && bStateChanged)
1177 /* Calculate intramolecular shift vectors to make molecules whole */
1178 mk_mshift(fplog, graph, fr->ePBC, box, as_rvec_array(x.data()));
1181 clear_rvec(vzero);
1182 box_diag[XX] = box[XX][XX];
1183 box_diag[YY] = box[YY][YY];
1184 box_diag[ZZ] = box[ZZ][ZZ];
1186 wallcycle_start(wcycle, ewcNS);
1187 if (!DOMAINDECOMP(cr))
1189 wallcycle_sub_start(wcycle, ewcsNBS_GRID_LOCAL);
1190 nbnxn_put_on_grid(nbv->nbs.get(), fr->ePBC, box,
1191 0, vzero, box_diag,
1192 nullptr, 0, mdatoms->homenr, -1,
1193 fr->cginfo, x,
1194 0, nullptr,
1195 nbv->grp[eintLocal].kernel_type,
1196 nbv->nbat);
1197 wallcycle_sub_stop(wcycle, ewcsNBS_GRID_LOCAL);
1199 else
1201 wallcycle_sub_start(wcycle, ewcsNBS_GRID_NONLOCAL);
1202 nbnxn_put_on_grid_nonlocal(nbv->nbs.get(), domdec_zones(cr->dd),
1203 fr->cginfo, x,
1204 nbv->grp[eintNonlocal].kernel_type,
1205 nbv->nbat);
1206 wallcycle_sub_stop(wcycle, ewcsNBS_GRID_NONLOCAL);
1209 nbnxn_atomdata_set(nbv->nbat, nbv->nbs.get(), mdatoms, fr->cginfo);
1211 /* Now we put all atoms on the grid, we can assign bonded interactions
1212 * to the GPU, where the grid order is needed.
1214 if (fr->gpuBondedLists)
1216 assign_bondeds_to_gpu(fr->gpuBondedLists,
1217 nbnxn_get_gridindices(fr->nbv->nbs.get()),
1218 top->idef);
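/* Note: nbnxn_get_gridindices() provides the grid-ordered atom indices from the
 * pair-search structure, so the bonded assignment can follow the same atom
 * layout that the GPU nonbonded kernels use (the "grid order" referred to in
 * the comment above).
 */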
1221 wallcycle_stop(wcycle, ewcNS);
1224 /* initialize the GPU atom data and copy shift vector */
1225 if (bUseGPU)
1227 wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
1228 wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1230 if (bNS)
1232 nbnxn_gpu_init_atomdata(nbv->gpu_nbv, nbv->nbat);
1235 nbnxn_gpu_upload_shiftvec(nbv->gpu_nbv, nbv->nbat);
1237 wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1238 wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1241 /* do local pair search */
1242 if (bNS)
1244 wallcycle_start_nocount(wcycle, ewcNS);
1245 wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_LOCAL);
1246 nbnxn_make_pairlist(nbv->nbs.get(), nbv->nbat,
1247 &top->excls,
1248 nbv->listParams->rlistOuter,
1249 nbv->min_ci_balanced,
1250 &nbv->grp[eintLocal].nbl_lists,
1251 eintLocal,
1252 nbv->grp[eintLocal].kernel_type,
1253 nrnb);
1254 nbv->grp[eintLocal].nbl_lists.outerListCreationStep = step;
1255 if (nbv->listParams->useDynamicPruning && !bUseGPU)
1257 nbnxnPrepareListForDynamicPruning(&nbv->grp[eintLocal].nbl_lists);
1259 wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_LOCAL);
1261 if (bUseGPU)
1263 /* initialize local pair-list on the GPU */
1264 nbnxn_gpu_init_pairlist(nbv->gpu_nbv,
1265 nbv->grp[eintLocal].nbl_lists.nbl[0],
1266 eintLocal);
1268 wallcycle_stop(wcycle, ewcNS);
1270 else
1272 nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs.get(), eatLocal, FALSE, as_rvec_array(x.data()),
1273 nbv->nbat, wcycle);
1276 if (bUseGPU)
1278 if (DOMAINDECOMP(cr))
1280 ddOpenBalanceRegionGpu(cr->dd);
1283 wallcycle_start(wcycle, ewcLAUNCH_GPU);
1284 wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1285 /* launch local nonbonded work on GPU */
1286 do_nb_verlet(fr, ic, enerd, flags, eintLocal, enbvClearFNo,
1287 step, nrnb, wcycle);
1288 wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1289 wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1292 if (useGpuPme)
1294 // In PME GPU and mixed mode we launch FFT / gather after the
1295 // X copy/transform to allow overlap as well as after the GPU NB
1296 // launch to avoid FFT launch overhead hijacking the CPU and delaying
1297 // the nonbonded kernel.
1298 launchPmeGpuFftAndGather(fr->pmedata, wcycle);
1301 /* Communicate coordinates and sum dipole if necessary +
1302 do non-local pair search */
1303 if (DOMAINDECOMP(cr))
1305 if (bNS)
1307 wallcycle_start_nocount(wcycle, ewcNS);
1308 wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_NONLOCAL);
1310 nbnxn_make_pairlist(nbv->nbs.get(), nbv->nbat,
1311 &top->excls,
1312 nbv->listParams->rlistOuter,
1313 nbv->min_ci_balanced,
1314 &nbv->grp[eintNonlocal].nbl_lists,
1315 eintNonlocal,
1316 nbv->grp[eintNonlocal].kernel_type,
1317 nrnb);
1318 nbv->grp[eintNonlocal].nbl_lists.outerListCreationStep = step;
1319 if (nbv->listParams->useDynamicPruning && !bUseGPU)
1321 nbnxnPrepareListForDynamicPruning(&nbv->grp[eintNonlocal].nbl_lists);
1323 wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_NONLOCAL);
1325 if (nbv->grp[eintNonlocal].kernel_type == nbnxnk8x8x8_GPU)
1327 /* initialize non-local pair-list on the GPU */
1328 nbnxn_gpu_init_pairlist(nbv->gpu_nbv,
1329 nbv->grp[eintNonlocal].nbl_lists.nbl[0],
1330 eintNonlocal);
1332 wallcycle_stop(wcycle, ewcNS);
1334 else
1336 dd_move_x(cr->dd, box, x, wcycle);
1338 nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs.get(), eatNonlocal, FALSE, as_rvec_array(x.data()),
1339 nbv->nbat, wcycle);
1342 if (bUseGPU)
1344 wallcycle_start(wcycle, ewcLAUNCH_GPU);
1345 wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1346 /* launch non-local nonbonded tasks on GPU */
1347 do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFNo,
1348 step, nrnb, wcycle);
1349 wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1350 wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1354 if (bUseGPU)
1356 /* launch D2H copy-back F */
1357 wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
1358 wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1359 if (DOMAINDECOMP(cr))
1361 nbnxn_gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat,
1362 flags, eatNonlocal);
1364 nbnxn_gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat,
1365 flags, eatLocal);
1366 wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1367 wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1370 if (bStateChanged && inputrecNeedMutot(inputrec))
1372 if (PAR(cr))
1374 gmx_sumd(2*DIM, mu, cr);
1375 ddReopenBalanceRegionCpu(cr->dd);
1378 for (i = 0; i < 2; i++)
1380 for (j = 0; j < DIM; j++)
1382 fr->mu_tot[i][j] = mu[i*DIM + j];
1386 if (fr->efep == efepNO)
1388 copy_rvec(fr->mu_tot[0], mu_tot);
1390 else
1392 for (j = 0; j < DIM; j++)
1394 mu_tot[j] =
1395 (1.0 - lambda[efptCOUL])*fr->mu_tot[0][j] +
1396 lambda[efptCOUL]*fr->mu_tot[1][j];
1400 /* Reset energies */
1401 reset_enerdata(enerd);
1402 clear_rvecs(SHIFTS, fr->fshift);
1404 if (DOMAINDECOMP(cr) && !thisRankHasDuty(cr, DUTY_PME))
1406 wallcycle_start(wcycle, ewcPPDURINGPME);
1407 dd_force_flop_start(cr->dd, nrnb);
1410 if (inputrec->bRot)
1412 wallcycle_start(wcycle, ewcROT);
1413 do_rotation(cr, enforcedRotation, box, as_rvec_array(x.data()), t, step, bNS);
1414 wallcycle_stop(wcycle, ewcROT);
1417 /* Temporary solution until all routines take PaddedRVecVector */
1418 rvec *const f = as_rvec_array(force.data());
1420 /* Start the force cycle counter.
1421 * Note that a different counter is used for dynamic load balancing.
1423 wallcycle_start(wcycle, ewcFORCE);
1425 gmx::ArrayRef<gmx::RVec> forceRef = force;
1426 if (bDoForces)
1428 /* If we need to compute the virial, we might need a separate
1429 * force buffer for algorithms for which the virial is calculated
1430 * directly, such as PME.
1432 if ((flags & GMX_FORCE_VIRIAL) && fr->haveDirectVirialContributions)
1434 forceRef = *fr->forceBufferForDirectVirialContributions;
1436 /* TODO: update comment
1437 * We only compute forces on local atoms. Note that vsites can
1438 * spread to non-local atoms, but that part of the buffer is
1439 * cleared separately in the vsite spreading code.
1441 clear_rvecs_omp(forceRef.size(), as_rvec_array(forceRef.data()));
1443 /* Clear the short- and long-range forces */
1444 clear_rvecs_omp(fr->natoms_force_constr, f);
1447 /* forceWithVirial uses the local atom range only */
1448 gmx::ForceWithVirial forceWithVirial(forceRef, (flags & GMX_FORCE_VIRIAL) != 0);
1450 if (inputrec->bPull && pull_have_constraint(inputrec->pull_work))
1452 clear_pull_forces(inputrec->pull_work);
1455 /* We calculate the non-bonded forces, when done on the CPU, here.
1456 * We do this before calling do_force_lowlevel, because in that
1457 * function, the listed forces are calculated before PME, which
1458 * does communication. With this order, non-bonded and listed
1459 * force calculation imbalance can be balanced out by the domain
1460 * decomposition load balancing.
1463 if (!bUseOrEmulGPU)
1465 do_nb_verlet(fr, ic, enerd, flags, eintLocal, enbvClearFYes,
1466 step, nrnb, wcycle);
1469 if (fr->efep != efepNO)
1471 /* Calculate the local and non-local free energy interactions here.
1472 * Happens here on the CPU both with and without GPU.
1474 if (fr->nbv->grp[eintLocal].nbl_lists.nbl_fep[0]->nrj > 0)
1476 do_nb_verlet_fep(&fr->nbv->grp[eintLocal].nbl_lists,
1477 fr, as_rvec_array(x.data()), f, mdatoms,
1478 inputrec->fepvals, lambda,
1479 enerd, flags, nrnb, wcycle);
1482 if (DOMAINDECOMP(cr) &&
1483 fr->nbv->grp[eintNonlocal].nbl_lists.nbl_fep[0]->nrj > 0)
1485 do_nb_verlet_fep(&fr->nbv->grp[eintNonlocal].nbl_lists,
1486 fr, as_rvec_array(x.data()), f, mdatoms,
1487 inputrec->fepvals, lambda,
1488 enerd, flags, nrnb, wcycle);
1492 if (!bUseOrEmulGPU)
1494 int aloc;
1496 if (DOMAINDECOMP(cr))
1498 do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFNo,
1499 step, nrnb, wcycle);
1502 if (!bUseOrEmulGPU)
1504 aloc = eintLocal;
1506 else
1508 aloc = eintNonlocal;
1511 /* Add all the non-bonded force to the normal force array.
1512 * This can be split into a local and a non-local part when overlapping
1513 * communication with calculation with domain decomposition.
1515 wallcycle_stop(wcycle, ewcFORCE);
1517 nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), eatAll, nbv->nbat, f, wcycle);
1519 wallcycle_start_nocount(wcycle, ewcFORCE);
1521 /* if there are multiple fshift output buffers reduce them */
1522 if ((flags & GMX_FORCE_VIRIAL) &&
1523 nbv->grp[aloc].nbl_lists.nnbl > 1)
1525 /* This is not in a subcounter because it takes a
1526 negligible and constant-sized amount of time */
1527 nbnxn_atomdata_add_nbat_fshift_to_fshift(nbv->nbat,
1528 fr->fshift);
1532 /* update QMMMrec, if necessary */
1533 if (fr->bQMMM)
1535 update_QMMMrec(cr, fr, as_rvec_array(x.data()), mdatoms, box);
1538 /* Compute the bonded and non-bonded energies and optionally forces */
1539 do_force_lowlevel(fr, inputrec, &(top->idef),
1540 cr, ms, nrnb, wcycle, mdatoms,
1541 as_rvec_array(x.data()), hist, f, &forceWithVirial, enerd, fcd,
1542 box, inputrec->fepvals, lambda, graph, &(top->excls), fr->mu_tot,
1543 flags, &cycles_pme);
1545 wallcycle_stop(wcycle, ewcFORCE);
1547 computeSpecialForces(fplog, cr, inputrec, awh, enforcedRotation,
1548 step, t, wcycle,
1549 fr->forceProviders, box, x, mdatoms, lambda,
1550 flags, &forceWithVirial, enerd,
1551 ed, bNS);
1553 if (bUseOrEmulGPU)
1555 /* wait for non-local forces (or calculate in emulation mode) */
1556 if (DOMAINDECOMP(cr))
1558 if (bUseGPU)
1560 wallcycle_start(wcycle, ewcWAIT_GPU_NB_NL);
1561 nbnxn_gpu_wait_finish_task(nbv->gpu_nbv,
1562 flags, eatNonlocal,
1563 enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
1564 fr->fshift);
1565 cycles_wait_gpu += wallcycle_stop(wcycle, ewcWAIT_GPU_NB_NL);
1567 else
1569 wallcycle_start_nocount(wcycle, ewcFORCE);
1570 do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFYes,
1571 step, nrnb, wcycle);
1572 wallcycle_stop(wcycle, ewcFORCE);
1575 /* skip the reduction if there was no non-local work to do */
1576 if (nbv->grp[eintNonlocal].nbl_lists.nbl[0]->nsci > 0)
1578 nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), eatNonlocal,
1579 nbv->nbat, f, wcycle);
1584 if (DOMAINDECOMP(cr))
1586 /* We are done with the CPU compute.
1587 * We will now communicate the non-local forces.
1588 * If we use a GPU this will overlap with GPU work, so in that case
1589 * we do not close the DD force balancing region here.
1591 if (ddCloseBalanceRegion == DdCloseBalanceRegionAfterForceComputation::yes)
1593 ddCloseBalanceRegionCpu(cr->dd);
1595 if (bDoForces)
1597 dd_move_f(cr->dd, force, fr->fshift, wcycle);
1601 // With both nonbonded and PME offloaded to a GPU on the same rank, we use
1602 // an alternating wait/reduction scheme.
1603 bool alternateGpuWait = (!c_disableAlternatingWait && useGpuPme && bUseGPU && !DOMAINDECOMP(cr));
1604 if (alternateGpuWait)
1606 alternatePmeNbGpuWaitReduce(fr->nbv, fr->pmedata, &force, &forceWithVirial, fr->fshift, enerd, flags, wcycle);
1609 if (!alternateGpuWait && useGpuPme)
1611 gmx::ArrayRef<const gmx::RVec> pmeGpuForces;
1612 matrix vir_Q;
1613 real Vlr_q = 0.0;
1614 pme_gpu_wait_finish_task(fr->pmedata, wcycle, &pmeGpuForces, vir_Q, &Vlr_q);
1615 pme_gpu_reduce_outputs(wcycle, &forceWithVirial, pmeGpuForces, enerd, vir_Q, Vlr_q);
1618 /* Wait for local GPU NB outputs on the non-alternating wait path */
1619 if (!alternateGpuWait && bUseGPU)
1621 /* Measured overhead on CUDA and OpenCL with(out) GPU sharing
1622 * is between 0.5 and 1.5 Mcycles. So 2 MCycles is an overestimate,
1623 * but even with a step of 0.1 ms the difference is less than 1%
1624 * of the step time.
1626 const float gpuWaitApiOverheadMargin = 2e6f; /* cycles */
1628 wallcycle_start(wcycle, ewcWAIT_GPU_NB_L);
1629 nbnxn_gpu_wait_finish_task(nbv->gpu_nbv,
1630 flags, eatLocal,
1631 enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
1632 fr->fshift);
1633 float cycles_tmp = wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
1635 if (ddCloseBalanceRegion == DdCloseBalanceRegionAfterForceComputation::yes)
1637 DdBalanceRegionWaitedForGpu waitedForGpu = DdBalanceRegionWaitedForGpu::yes;
1638 if (bDoForces && cycles_tmp <= gpuWaitApiOverheadMargin)
1640 /* We measured few cycles; it could be that the kernel
1641 * and transfer finished earlier and there was no actual
1642 * wait time, only API call overhead.
1643 * Then the actual time could be anywhere between 0 and
1644 * cycles_wait_est. We will use half of cycles_wait_est.
1646 waitedForGpu = DdBalanceRegionWaitedForGpu::no;
1648 ddCloseBalanceRegionGpu(cr->dd, cycles_wait_gpu, waitedForGpu);
1652 if (fr->nbv->emulateGpu == EmulateGpuNonbonded::Yes)
1654 // NOTE: emulation kernel is not included in the balancing region,
1655 // but emulation mode does not target performance anyway
1656 wallcycle_start_nocount(wcycle, ewcFORCE);
1657 do_nb_verlet(fr, ic, enerd, flags, eintLocal,
1658 DOMAINDECOMP(cr) ? enbvClearFNo : enbvClearFYes,
1659 step, nrnb, wcycle);
1660 wallcycle_stop(wcycle, ewcFORCE);
1663 if (useGpuPme)
1665 pme_gpu_reinit_computation(fr->pmedata, wcycle);
1668 if (bUseGPU)
1670 /* now clear the GPU outputs while we finish the step on the CPU */
1671 wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
1672 wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1673 nbnxn_gpu_clear_outputs(nbv->gpu_nbv, flags);
1675 /* Is dynamic pair-list pruning activated? */
1676 if (nbv->listParams->useDynamicPruning)
1678 launchGpuRollingPruning(cr, nbv, inputrec, step);
1680 wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1681 wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1684 /* Do the nonbonded GPU (or emulation) force buffer reduction
1685 * on the non-alternating path. */
1686 if (bUseOrEmulGPU && !alternateGpuWait)
1688 nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), eatLocal,
1689 nbv->nbat, f, wcycle);
1692 if (DOMAINDECOMP(cr))
1694 dd_force_flop_stop(cr->dd, nrnb);
1697 if (bDoForces)
1699 /* If we have NoVirSum forces, but we do not calculate the virial,
1700 * we sum fr->f_novirsum=f later.
1702 if (vsite && !(fr->haveDirectVirialContributions && !(flags & GMX_FORCE_VIRIAL)))
1704 spread_vsite_f(vsite, as_rvec_array(x.data()), f, fr->fshift, FALSE, nullptr, nrnb,
1705 &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr, wcycle);
1708 if (flags & GMX_FORCE_VIRIAL)
1710 /* Calculation of the virial must be done after vsites! */
1711 calc_virial(0, mdatoms->homenr, as_rvec_array(x.data()), f,
1712 vir_force, graph, box, nrnb, fr, inputrec->ePBC);
1716 if (PAR(cr) && !thisRankHasDuty(cr, DUTY_PME))
1718 /* In case of node-splitting, the PP nodes receive the long-range
1719 * forces, virial and energy from the PME nodes here.
1721 pme_receive_force_ener(cr, &forceWithVirial, enerd, wcycle);
1724 if (bDoForces)
1726 post_process_forces(cr, step, nrnb, wcycle,
1727 top, box, as_rvec_array(x.data()), f, &forceWithVirial,
1728 vir_force, mdatoms, graph, fr, vsite,
1729 flags);
1732 if (flags & GMX_FORCE_ENERGY)
1734 /* Sum the potential energy terms from group contributions */
1735 sum_epot(&(enerd->grpp), enerd->term);
1737 if (!EI_TPI(inputrec->eI))
1739 checkPotentialEnergyValidity(step, *enerd, *inputrec);
1744 static void do_force_cutsGROUP(FILE *fplog,
1745 const t_commrec *cr,
1746 const gmx_multisim_t *ms,
1747 const t_inputrec *inputrec,
1748 gmx::Awh *awh,
1749 gmx_enfrot *enforcedRotation,
1750 int64_t step,
1751 t_nrnb *nrnb,
1752 gmx_wallcycle_t wcycle,
1753 gmx_localtop_t *top,
1754 const gmx_groups_t *groups,
1755 matrix box, gmx::PaddedArrayRef<gmx::RVec> x,
1756 history_t *hist,
1757 gmx::PaddedArrayRef<gmx::RVec> force,
1758 tensor vir_force,
1759 const t_mdatoms *mdatoms,
1760 gmx_enerdata_t *enerd,
1761 t_fcdata *fcd,
1762 real *lambda,
1763 t_graph *graph,
1764 t_forcerec *fr,
1765 const gmx_vsite_t *vsite,
1766 rvec mu_tot,
1767 double t,
1768 gmx_edsam *ed,
1769 int flags,
1770 DdOpenBalanceRegionBeforeForceComputation ddOpenBalanceRegion,
1771 DdCloseBalanceRegionAfterForceComputation ddCloseBalanceRegion)
1773 int cg0, cg1, i, j;
1774 double mu[2*DIM];
1775 gmx_bool bStateChanged, bNS, bFillGrid, bCalcCGCM;
1776 gmx_bool bDoForces;
1777 float cycles_pme;
1779 const int start = 0;
1780 const int homenr = mdatoms->homenr;
1782 clear_mat(vir_force);
1784 cg0 = 0;
1785 if (DOMAINDECOMP(cr))
1787 cg1 = cr->dd->globalAtomGroupIndices.size();
1789 else
1791 cg1 = top->cgs.nr;
1793 if (fr->n_tpi > 0)
1795 cg1--;
1798 bStateChanged = ((flags & GMX_FORCE_STATECHANGED) != 0);
1799 bNS = ((flags & GMX_FORCE_NS) != 0) && (!fr->bAllvsAll);
1800 /* Should we perform the long-range nonbonded evaluation inside the neighborsearching? */
1801 bFillGrid = (bNS && bStateChanged);
1802 bCalcCGCM = (bFillGrid && !DOMAINDECOMP(cr));
1803 bDoForces = ((flags & GMX_FORCE_FORCES) != 0);
1805 if (bStateChanged)
1807 update_forcerec(fr, box);
1809 if (inputrecNeedMutot(inputrec))
1811 /* Calculate total (local) dipole moment in a temporary common array.
1812 * This makes it possible to sum them over nodes faster.
1814 calc_mu(start, homenr,
1815 x, mdatoms->chargeA, mdatoms->chargeB, mdatoms->nChargePerturbed,
1816 mu, mu+DIM);
1820 if (fr->ePBC != epbcNONE)
1822 /* Compute shift vectors every step,
1823 * because of pressure coupling or box deformation!
1825 if ((flags & GMX_FORCE_DYNAMICBOX) && bStateChanged)
1827 calc_shifts(box, fr->shift_vec);
1830 if (bCalcCGCM)
1832 put_charge_groups_in_box(fplog, cg0, cg1, fr->ePBC, box,
1833 &(top->cgs), as_rvec_array(x.data()), fr->cg_cm);
1834 inc_nrnb(nrnb, eNR_CGCM, homenr);
1835 inc_nrnb(nrnb, eNR_RESETX, cg1-cg0);
1837 else if (EI_ENERGY_MINIMIZATION(inputrec->eI) && graph)
1839 unshift_self(graph, box, as_rvec_array(x.data()));
1842 else if (bCalcCGCM)
1844 calc_cgcm(fplog, cg0, cg1, &(top->cgs), as_rvec_array(x.data()), fr->cg_cm);
1845 inc_nrnb(nrnb, eNR_CGCM, homenr);
1848 if (bCalcCGCM && gmx_debug_at)
1850 pr_rvecs(debug, 0, "cgcm", fr->cg_cm, top->cgs.nr);
1853 #if GMX_MPI
1854 if (!thisRankHasDuty(cr, DUTY_PME))
1856 /* Send particle coordinates to the pme nodes.
1857 * Since this is only implemented for domain decomposition
1858 * and domain decomposition does not use the graph,
1859 * we do not need to worry about shifting.
1861 gmx_pme_send_coordinates(cr, box, as_rvec_array(x.data()),
1862 lambda[efptCOUL], lambda[efptVDW],
1863 (flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY)) != 0,
1864 step, wcycle);
1866 #endif /* GMX_MPI */
1868 /* Communicate coordinates and sum dipole if necessary */
1869 if (DOMAINDECOMP(cr))
1871 dd_move_x(cr->dd, box, x, wcycle);
1873 /* No GPU support, no move_x overlap, so reopen the balance region here */
1874 if (ddOpenBalanceRegion == DdOpenBalanceRegionBeforeForceComputation::yes)
1876 ddReopenBalanceRegionCpu(cr->dd);
1880 if (inputrecNeedMutot(inputrec))
1882 if (bStateChanged)
1884 if (PAR(cr))
1886 gmx_sumd(2*DIM, mu, cr);
1887 ddReopenBalanceRegionCpu(cr->dd);
1889 for (i = 0; i < 2; i++)
1891 for (j = 0; j < DIM; j++)
1893 fr->mu_tot[i][j] = mu[i*DIM + j];
1897 if (fr->efep == efepNO)
1899 copy_rvec(fr->mu_tot[0], mu_tot);
1901 else
1903 for (j = 0; j < DIM; j++)
1905 mu_tot[j] =
1906 (1.0 - lambda[efptCOUL])*fr->mu_tot[0][j] + lambda[efptCOUL]*fr->mu_tot[1][j];
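/* Only the Coulomb lambda enters the interpolation: the dipole depends only on
 * the (perturbed) charges, so mu_tot is a linear mix of the A- and B-state
 * dipoles weighted by lambda[efptCOUL].
 */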
1911 /* Reset energies */
1912 reset_enerdata(enerd);
1913 clear_rvecs(SHIFTS, fr->fshift);
1915 if (bNS)
1917 wallcycle_start(wcycle, ewcNS);
1919 if (graph && bStateChanged)
1921 /* Calculate intramolecular shift vectors to make molecules whole */
1922 mk_mshift(fplog, graph, fr->ePBC, box, as_rvec_array(x.data()));
1925 /* Do the actual neighbour searching */
1926 ns(fplog, fr, box,
1927 groups, top, mdatoms,
1928 cr, nrnb, bFillGrid);
1930 wallcycle_stop(wcycle, ewcNS);
1933 if (DOMAINDECOMP(cr) && !thisRankHasDuty(cr, DUTY_PME))
1935 wallcycle_start(wcycle, ewcPPDURINGPME);
1936 dd_force_flop_start(cr->dd, nrnb);
1939 if (inputrec->bRot)
1941 wallcycle_start(wcycle, ewcROT);
1942 do_rotation(cr, enforcedRotation, box, as_rvec_array(x.data()), t, step, bNS);
1943 wallcycle_stop(wcycle, ewcROT);
1946 /* Temporary solution until all routines take PaddedRVecVector */
1947 rvec *f = as_rvec_array(force.data());
1949 /* Start the force cycle counter.
1950 * Note that a different counter is used for dynamic load balancing.
1952 wallcycle_start(wcycle, ewcFORCE);
1954 gmx::ArrayRef<gmx::RVec> forceRef = force;
1955 if (bDoForces)
1957 /* If we need to compute the virial, we might need a separate
1958 * force buffer for algorithms for which the virial is calculated
1959 * separately, such as PME.
1961 if ((flags & GMX_FORCE_VIRIAL) && fr->haveDirectVirialContributions)
1963 forceRef = *fr->forceBufferForDirectVirialContributions;
1965 clear_rvecs_omp(forceRef.size(), as_rvec_array(forceRef.data()));
1968 /* Clear the short- and long-range forces */
1969 clear_rvecs(fr->natoms_force_constr, f);
1972 /* forceWithVirial might need the full force atom range */
1973 gmx::ForceWithVirial forceWithVirial(forceRef, (flags & GMX_FORCE_VIRIAL) != 0);
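/* forceWithVirial wraps whichever buffer forceRef currently refers to: the main
 * force array, or the separate direct-virial-contribution buffer selected above
 * when GMX_FORCE_VIRIAL is requested. The second constructor argument records
 * whether these contributions should enter the virial.
 */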
1975 if (inputrec->bPull && pull_have_constraint(inputrec->pull_work))
1977 clear_pull_forces(inputrec->pull_work);
1980 /* update QMMMrec, if necessary */
1981 if (fr->bQMMM)
1983 update_QMMMrec(cr, fr, as_rvec_array(x.data()), mdatoms, box);
1986 /* Compute the bonded and non-bonded energies and optionally forces */
1987 do_force_lowlevel(fr, inputrec, &(top->idef),
1988 cr, ms, nrnb, wcycle, mdatoms,
1989 as_rvec_array(x.data()), hist, f, &forceWithVirial, enerd, fcd,
1990 box, inputrec->fepvals, lambda,
1991 graph, &(top->excls), fr->mu_tot,
1992 flags,
1993 &cycles_pme);
1995 wallcycle_stop(wcycle, ewcFORCE);
1997 if (DOMAINDECOMP(cr))
1999 dd_force_flop_stop(cr->dd, nrnb);
2001 if (ddCloseBalanceRegion == DdCloseBalanceRegionAfterForceComputation::yes)
2003 ddCloseBalanceRegionCpu(cr->dd);
2007 computeSpecialForces(fplog, cr, inputrec, awh, enforcedRotation,
2008 step, t, wcycle,
2009 fr->forceProviders, box, x, mdatoms, lambda,
2010 flags, &forceWithVirial, enerd,
2011 ed, bNS);
2013 if (bDoForces)
2015 /* Communicate the forces */
2016 if (DOMAINDECOMP(cr))
2018 dd_move_f(cr->dd, force, fr->fshift, wcycle);
2019 /* Do we need to communicate the separate force array
2020 * for terms that do not contribute to the single sum virial?
2021 * Position restraints and electric fields do not introduce
2022 * inter-cg forces, only full electrostatics methods do.
2023 * When we do not calculate the virial, fr->f_novirsum = f,
2024 * so we have already communicated these forces.
2026 if (EEL_FULL(fr->ic->eeltype) && cr->dd->n_intercg_excl &&
2027 (flags & GMX_FORCE_VIRIAL))
2029 dd_move_f(cr->dd, forceWithVirial.force_, nullptr, wcycle);
2033 /* If we have NoVirSum forces, but we do not calculate the virial,
2034 * we sum fr->f_novirsum=f later.
2036 if (vsite && !(fr->haveDirectVirialContributions && !(flags & GMX_FORCE_VIRIAL)))
2038 spread_vsite_f(vsite, as_rvec_array(x.data()), f, fr->fshift, FALSE, nullptr, nrnb,
2039 &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr, wcycle);
2042 if (flags & GMX_FORCE_VIRIAL)
2044 /* Calculation of the virial must be done after vsites! */
2045 calc_virial(0, mdatoms->homenr, as_rvec_array(x.data()), f,
2046 vir_force, graph, box, nrnb, fr, inputrec->ePBC);
2050 if (PAR(cr) && !thisRankHasDuty(cr, DUTY_PME))
2052 /* In case of node-splitting, the PP nodes receive the long-range
2053 * forces, virial and energy from the PME nodes here.
2055 pme_receive_force_ener(cr, &forceWithVirial, enerd, wcycle);
2058 if (bDoForces)
2060 post_process_forces(cr, step, nrnb, wcycle,
2061 top, box, as_rvec_array(x.data()), f, &forceWithVirial,
2062 vir_force, mdatoms, graph, fr, vsite,
2063 flags);
2066 if (flags & GMX_FORCE_ENERGY)
2068 /* Sum the potential energy terms from group contributions */
2069 sum_epot(&(enerd->grpp), enerd->term);
2071 if (!EI_TPI(inputrec->eI))
2073 checkPotentialEnergyValidity(step, *enerd, *inputrec);
2079 void do_force(FILE *fplog,
2080 const t_commrec *cr,
2081 const gmx_multisim_t *ms,
2082 const t_inputrec *inputrec,
2083 gmx::Awh *awh,
2084 gmx_enfrot *enforcedRotation,
2085 int64_t step,
2086 t_nrnb *nrnb,
2087 gmx_wallcycle_t wcycle,
2088 gmx_localtop_t *top,
2089 const gmx_groups_t *groups,
2090 matrix box,
2091 gmx::PaddedArrayRef<gmx::RVec> x,
2092 history_t *hist,
2093 gmx::PaddedArrayRef<gmx::RVec> force,
2094 tensor vir_force,
2095 const t_mdatoms *mdatoms,
2096 gmx_enerdata_t *enerd,
2097 t_fcdata *fcd,
2098 gmx::ArrayRef<real> lambda,
2099 t_graph *graph,
2100 t_forcerec *fr,
2101 const gmx_vsite_t *vsite,
2102 rvec mu_tot,
2103 double t,
2104 gmx_edsam *ed,
2105 int flags,
2106 DdOpenBalanceRegionBeforeForceComputation ddOpenBalanceRegion,
2107 DdCloseBalanceRegionAfterForceComputation ddCloseBalanceRegion)
2109 /* modify force flag if not doing nonbonded */
2110 if (!fr->bNonbonded)
2112 flags &= ~GMX_FORCE_NONBONDED;
2115 switch (inputrec->cutoff_scheme)
2117 case ecutsVERLET:
2118 do_force_cutsVERLET(fplog, cr, ms, inputrec,
2119 awh, enforcedRotation, step, nrnb, wcycle,
2120 top,
2121 groups,
2122 box, x, hist,
2123 force, vir_force,
2124 mdatoms,
2125 enerd, fcd,
2126 lambda.data(), graph,
2127 fr, fr->ic,
2128 vsite, mu_tot,
2129 t, ed,
2130 flags,
2131 ddOpenBalanceRegion,
2132 ddCloseBalanceRegion);
2133 break;
2134 case ecutsGROUP:
2135 do_force_cutsGROUP(fplog, cr, ms, inputrec,
2136 awh, enforcedRotation, step, nrnb, wcycle,
2137 top,
2138 groups,
2139 box, x, hist,
2140 force, vir_force,
2141 mdatoms,
2142 enerd, fcd,
2143 lambda.data(), graph,
2144 fr, vsite, mu_tot,
2145 t, ed,
2146 flags,
2147 ddOpenBalanceRegion,
2148 ddCloseBalanceRegion);
2149 break;
2150 default:
2151 gmx_incons("Invalid cut-off scheme passed!");
2154 /* In case we don't have constraints and are using GPUs, the next balancing
2155 * region starts here.
2156 * Some "special" work at the end of do_force_cuts?, such as vsite spread,
2157 * virial calculation and COM pulling, is thus not included in
2158 * the balance timing, which is ok as most tasks do communication.
2160 if (ddOpenBalanceRegion == DdOpenBalanceRegionBeforeForceComputation::yes)
2162 ddOpenBalanceRegionCpu(cr->dd, DdAllowBalanceRegionReopen::no);
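/* For orientation only, a minimal sketch of how a caller in the MD loop might
 * assemble the flags bitmask before calling do_force; bNS, bCalcVir and bCalcEner
 * stand for whatever step-type booleans the caller keeps, and the exact
 * combinations used by the integrators live in the callers (e.g. md.cpp), not here:
 *
 *   int flags = GMX_FORCE_STATECHANGED | GMX_FORCE_FORCES
 *               | (bNS       ? GMX_FORCE_NS     : 0)
 *               | (bCalcVir  ? GMX_FORCE_VIRIAL : 0)
 *               | (bCalcEner ? GMX_FORCE_ENERGY : 0);
 *   do_force(fplog, cr, ms, inputrec, awh, enforcedRotation, step, nrnb, wcycle,
 *            top, groups, box, x, hist, force, vir_force, mdatoms, enerd, fcd,
 *            lambda, graph, fr, vsite, mu_tot, t, ed, flags,
 *            ddOpenBalanceRegion, ddCloseBalanceRegion);
 */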
2167 void do_constrain_first(FILE *fplog, gmx::Constraints *constr,
2168 const t_inputrec *ir, const t_mdatoms *md,
2169 t_state *state)
2171 int i, m, start, end;
2172 int64_t step;
2173 real dt = ir->delta_t;
2174 real dvdl_dum;
2175 rvec *savex;
2177 /* We need to allocate one element extra, since we might use
2178 * (unaligned) 4-wide SIMD loads to access rvec entries.
2180 snew(savex, state->natoms + 1);
2182 start = 0;
2183 end = md->homenr;
2185 if (debug)
2187 fprintf(debug, "vcm: start=%d, homenr=%d, end=%d\n",
2188 start, md->homenr, end);
2190 /* Do a first constraint pass to reset the particles... */
2191 step = ir->init_step;
2192 if (fplog)
2194 char buf[STEPSTRSIZE];
2195 fprintf(fplog, "\nConstraining the starting coordinates (step %s)\n",
2196 gmx_step_str(step, buf));
2198 dvdl_dum = 0;
2200 /* constrain the current position */
2201 constr->apply(TRUE, FALSE,
2202 step, 0, 1.0,
2203 state->x.rvec_array(), state->x.rvec_array(), nullptr,
2204 state->box,
2205 state->lambda[efptBONDED], &dvdl_dum,
2206 nullptr, nullptr, gmx::ConstraintVariable::Positions);
2207 if (EI_VV(ir->eI))
2209 /* constrain the initial velocity, and save it */
2210 /* also may be useful if we need the ekin from the halfstep for velocity verlet */
2211 constr->apply(TRUE, FALSE,
2212 step, 0, 1.0,
2213 state->x.rvec_array(), state->v.rvec_array(), state->v.rvec_array(),
2214 state->box,
2215 state->lambda[efptBONDED], &dvdl_dum,
2216 nullptr, nullptr, gmx::ConstraintVariable::Velocities);
2218 /* constrain the initial velocities at t-dt/2 */
2219 if (EI_STATE_VELOCITY(ir->eI) && ir->eI != eiVV)
2221 auto x = makeArrayRef(state->x).subArray(start, end);
2222 auto v = makeArrayRef(state->v).subArray(start, end);
2223 for (i = start; (i < end); i++)
2225 for (m = 0; (m < DIM); m++)
2227 /* Reverse the velocity */
2228 v[i][m] = -v[i][m];
2229 /* Store the position at t-dt in buf */
2230 savex[i][m] = x[i][m] + dt*v[i][m];
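/* Because v was negated just above, x + dt*v evaluates to x(0) - dt*v(0),
 * i.e. the unconstrained position one time step back.
 */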
2233 /* Shake the positions at t=-dt with the positions at t=0
2234 * as reference coordinates.
2236 if (fplog)
2238 char buf[STEPSTRSIZE];
2239 fprintf(fplog, "\nConstraining the coordinates at t0-dt (step %s)\n",
2240 gmx_step_str(step, buf));
2242 dvdl_dum = 0;
2243 constr->apply(TRUE, FALSE,
2244 step, -1, 1.0,
2245 state->x.rvec_array(), savex, nullptr,
2246 state->box,
2247 state->lambda[efptBONDED], &dvdl_dum,
2248 state->v.rvec_array(), nullptr, gmx::ConstraintVariable::Positions);
2250 for (i = start; i < end; i++)
2252 for (m = 0; m < DIM; m++)
2254 /* Re-reverse the velocities */
2255 v[i][m] = -v[i][m];
2259 sfree(savex);
2263 static void
2264 integrate_table(const real vdwtab[], real scale, int offstart, int rstart, int rend,
2265 double *enerout, double *virout)
2267 double enersum, virsum;
2268 double invscale, invscale2, invscale3;
2269 double r, ea, eb, ec, pa, pb, pc, pd;
2270 double y0, f, g, h;
2271 int ri, offset;
2272 double tabfactor;
2274 invscale = 1.0/scale;
2275 invscale2 = invscale*invscale;
2276 invscale3 = invscale*invscale2;
2278 /* Following summation derived from cubic spline definition,
2279 * Numerical Recipes in C, second edition, p. 113-116. Exact for
2280 * the cubic spline. We first calculate the negative of the
2281 * energy from rvdw to rvdw_switch, assuming that g(r)=1, and then
2282 * add the more standard, abrupt cutoff correction to that result,
2283 * yielding the long-range correction for a switched function. We
2284 * perform both the pressure and energy loops at the same time for
2285 * simplicity, as the computational cost is low. */
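/* Within table bin ri the spline is y(eps) = y0 + f*eps + g*eps^2 + h*eps^3 with
 * eps in [0,1] and r = (ri + eps)/scale. Expanding the integration measures gives
 *   r^2 dr = (ec + eb*eps + ea*eps^2) deps          (energy integral)
 *   r^3 dV = (pd + pc*eps + pb*eps^2 + pa*eps^3) dy (virial integral)
 * and integrating each monomial product over the bin yields the 1/3, 1/4, ...
 * fractions that appear in the loop below.
 */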
2287 if (offstart == 0)
2289 /* Since the dispersion table has been scaled down a factor
2290 * 6.0 and the repulsion a factor 12.0 to compensate for the
2291 * c6/c12 parameters inside nbfp[] being scaled up (to save
2292 * flops in kernels), we need to correct for this.
2294 tabfactor = 6.0;
2296 else
2298 tabfactor = 12.0;
2301 enersum = 0.0;
2302 virsum = 0.0;
2303 for (ri = rstart; ri < rend; ++ri)
2305 r = ri*invscale;
2306 ea = invscale3;
2307 eb = 2.0*invscale2*r;
2308 ec = invscale*r*r;
2310 pa = invscale3;
2311 pb = 3.0*invscale2*r;
2312 pc = 3.0*invscale*r*r;
2313 pd = r*r*r;
2315 /* this "8" is from the packing in the vdwtab array - perhaps
2316 should be defined? */
2318 offset = 8*ri + offstart;
2319 y0 = vdwtab[offset];
2320 f = vdwtab[offset+1];
2321 g = vdwtab[offset+2];
2322 h = vdwtab[offset+3];
2324 enersum += y0*(ea/3 + eb/2 + ec) + f*(ea/4 + eb/3 + ec/2) + g*(ea/5 + eb/4 + ec/3) + h*(ea/6 + eb/5 + ec/4);
2325 virsum += f*(pa/4 + pb/3 + pc/2 + pd) + 2*g*(pa/5 + pb/4 + pc/3 + pd/2) + 3*h*(pa/6 + pb/5 + pc/4 + pd/3);
2327 *enerout = 4.0*M_PI*enersum*tabfactor;
2328 *virout = 4.0*M_PI*virsum*tabfactor;
2331 void calc_enervirdiff(FILE *fplog, int eDispCorr, t_forcerec *fr)
2333 double eners[2], virs[2], enersum, virsum;
2334 double r0, rc3, rc9;
2335 int ri0, ri1, i;
2336 real scale, *vdwtab;
2338 fr->enershiftsix = 0;
2339 fr->enershifttwelve = 0;
2340 fr->enerdiffsix = 0;
2341 fr->enerdifftwelve = 0;
2342 fr->virdiffsix = 0;
2343 fr->virdifftwelve = 0;
2345 const interaction_const_t *ic = fr->ic;
2347 if (eDispCorr != edispcNO)
2349 for (i = 0; i < 2; i++)
2351 eners[i] = 0;
2352 virs[i] = 0;
2354 if ((ic->vdw_modifier == eintmodPOTSHIFT) ||
2355 (ic->vdw_modifier == eintmodPOTSWITCH) ||
2356 (ic->vdw_modifier == eintmodFORCESWITCH) ||
2357 (ic->vdwtype == evdwSHIFT) ||
2358 (ic->vdwtype == evdwSWITCH))
2360 if (((ic->vdw_modifier == eintmodPOTSWITCH) ||
2361 (ic->vdw_modifier == eintmodFORCESWITCH) ||
2362 (ic->vdwtype == evdwSWITCH)) && ic->rvdw_switch == 0)
2364 gmx_fatal(FARGS,
2365 "With dispersion correction rvdw-switch can not be zero "
2366 "for vdw-type = %s", evdw_names[ic->vdwtype]);
2369 /* TODO This code depends on the logic in tables.c that
2370 constructs the table layout, which should be made
2371 explicit in future cleanup. */
2372 GMX_ASSERT(fr->dispersionCorrectionTable->interaction == GMX_TABLE_INTERACTION_VDWREP_VDWDISP,
2373 "Dispersion-correction code needs a table with both repulsion and dispersion terms");
2374 scale = fr->dispersionCorrectionTable->scale;
2375 vdwtab = fr->dispersionCorrectionTable->data;
2377 /* Round the cut-offs to exact table values for precision */
2378 ri0 = static_cast<int>(std::floor(ic->rvdw_switch*scale));
2379 ri1 = static_cast<int>(std::ceil(ic->rvdw*scale));
2381 /* The code below has some support for handling force-switching, i.e.
2382 * when the force (instead of potential) is switched over a limited
2383 * region. This leads to a constant shift in the potential inside the
2384 * switching region, which we can handle by adding a constant energy
2385 * term in the force-switch case just like when we do potential-shift.
2387 * For now this is not enabled, but to keep the functionality in the
2388 * code we check separately for switch and shift. When we do force-switch
2389 * the shifting point is rvdw_switch, while it is the cutoff when we
2390 * have a classical potential-shift.
2392 * For a pure potential-shift the potential has a constant shift
2393 * all the way out to the cutoff, and that is it. For other forms
2394 * we need to calculate the constant shift up to the point where we
2395 * start modifying the potential.
2397 ri0 = (ic->vdw_modifier == eintmodPOTSHIFT) ? ri1 : ri0;
2399 r0 = ri0/scale;
2400 rc3 = r0*r0*r0;
2401 rc9 = rc3*rc3*rc3;
2403 if ((ic->vdw_modifier == eintmodFORCESWITCH) ||
2404 (ic->vdwtype == evdwSHIFT))
2406 /* Determine the constant energy shift below rvdw_switch.
2407 * Table has a scale factor since we have scaled it down to compensate
2408 * for scaling-up c6/c12 with the derivative factors to save flops in analytical kernels.
2410 fr->enershiftsix = static_cast<real>(-1.0/(rc3*rc3)) - 6.0*vdwtab[8*ri0];
2411 fr->enershifttwelve = static_cast<real>( 1.0/(rc9*rc3)) - 12.0*vdwtab[8*ri0 + 4];
2413 else if (ic->vdw_modifier == eintmodPOTSHIFT)
2415 fr->enershiftsix = static_cast<real>(-1.0/(rc3*rc3));
2416 fr->enershifttwelve = static_cast<real>( 1.0/(rc9*rc3));
2419 /* Add the constant part from 0 to rvdw_switch.
2420 * This integration from 0 to rvdw_switch overcounts the number
2421 * of interactions by 1, as it also counts the self interaction.
2422 * We will correct for this later.
2424 eners[0] += 4.0*M_PI*fr->enershiftsix*rc3/3.0;
2425 eners[1] += 4.0*M_PI*fr->enershifttwelve*rc3/3.0;
2427 /* Calculate the contribution in the range [r0,r1] where we
2428 * modify the potential. For a pure potential-shift modifier we will
2429 * have ri0==ri1, and there will not be any contribution here.
2431 for (i = 0; i < 2; i++)
2433 enersum = 0;
2434 virsum = 0;
2435 integrate_table(vdwtab, scale, (i == 0 ? 0 : 4), ri0, ri1, &enersum, &virsum);
2436 eners[i] -= enersum;
2437 virs[i] -= virsum;
2440 /* Alright: Above we compensated by REMOVING the parts outside r0
2441 * corresponding to the ideal VdW 1/r6 and 1/r12 potentials.
2443 * Regardless of whether r0 is the point where we start switching,
2444 * or the cutoff where we calculated the constant shift, we include
2445 * all the parts we are missing out to infinity from r0 by
2446 * calculating the analytical dispersion correction.
2448 eners[0] += -4.0*M_PI/(3.0*rc3);
2449 eners[1] += 4.0*M_PI/(9.0*rc9);
2450 virs[0] += 8.0*M_PI/rc3;
2451 virs[1] += -16.0*M_PI/(3.0*rc9);
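/* The four terms just added are the analytic Lennard-Jones tail integrals beyond r0:
 *   energy: int_{r0}^{inf} 4*pi*r^2*(-1/r^6)  dr = -4*pi/(3*r0^3)
 *           int_{r0}^{inf} 4*pi*r^2*( 1/r^12) dr =  4*pi/(9*r0^9)
 *   virial: int_{r0}^{inf} 4*pi*r^2*(r*dV/dr) dr =  8*pi/r0^3 for dispersion
 *                                                  -16*pi/(3*r0^9) for repulsion
 */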
2453 else if (ic->vdwtype == evdwCUT ||
2454 EVDW_PME(ic->vdwtype) ||
2455 ic->vdwtype == evdwUSER)
2457 if (ic->vdwtype == evdwUSER && fplog)
2459 fprintf(fplog,
2460 "WARNING: using dispersion correction with user tables\n");
2463 /* Note that with LJ-PME, the dispersion correction is multiplied
2464 * by the difference between the actual C6 and the value of C6
2465 * that would produce the combination rule.
2466 * This means the normal energy and virial difference formulas
2467 * can be used here.
2470 rc3 = ic->rvdw*ic->rvdw*ic->rvdw;
2471 rc9 = rc3*rc3*rc3;
2472 /* Contribution beyond the cut-off */
2473 eners[0] += -4.0*M_PI/(3.0*rc3);
2474 eners[1] += 4.0*M_PI/(9.0*rc9);
2475 if (ic->vdw_modifier == eintmodPOTSHIFT)
2477 /* Contribution within the cut-off */
2478 eners[0] += -4.0*M_PI/(3.0*rc3);
2479 eners[1] += 4.0*M_PI/(3.0*rc9);
2481 /* Contribution beyond the cut-off */
2482 virs[0] += 8.0*M_PI/rc3;
2483 virs[1] += -16.0*M_PI/(3.0*rc9);
2485 else
2487 gmx_fatal(FARGS,
2488 "Dispersion correction is not implemented for vdw-type = %s",
2489 evdw_names[ic->vdwtype]);
2492 /* When we deprecate the group kernels the code below can go too */
2493 if (ic->vdwtype == evdwPME && fr->cutoff_scheme == ecutsGROUP)
2495 /* Calculate self-interaction coefficient (assuming that
2496 * the reciprocal-space contribution is constant in the
2497 * region that contributes to the self-interaction).
2499 fr->enershiftsix = gmx::power6(ic->ewaldcoeff_lj) / 6.0;
2501 eners[0] += -gmx::power3(std::sqrt(M_PI)*ic->ewaldcoeff_lj)/3.0;
2502 virs[0] += gmx::power3(std::sqrt(M_PI)*ic->ewaldcoeff_lj);
2505 fr->enerdiffsix = eners[0];
2506 fr->enerdifftwelve = eners[1];
2507 /* The 0.5 is due to the Gromacs definition of the virial */
2508 fr->virdiffsix = 0.5*virs[0];
2509 fr->virdifftwelve = 0.5*virs[1];
2513 void calc_dispcorr(const t_inputrec *ir, const t_forcerec *fr,
2514 const matrix box, real lambda, tensor pres, tensor virial,
2515 real *prescorr, real *enercorr, real *dvdlcorr)
2517 gmx_bool bCorrAll, bCorrPres;
2518 real dvdlambda, invvol, dens, ninter, avcsix, avctwelve, enerdiff, svir = 0, spres = 0;
2519 int m;
2521 *prescorr = 0;
2522 *enercorr = 0;
2523 *dvdlcorr = 0;
2525 clear_mat(virial);
2526 clear_mat(pres);
2528 if (ir->eDispCorr != edispcNO)
2530 bCorrAll = (ir->eDispCorr == edispcAllEner ||
2531 ir->eDispCorr == edispcAllEnerPres);
2532 bCorrPres = (ir->eDispCorr == edispcEnerPres ||
2533 ir->eDispCorr == edispcAllEnerPres);
2535 invvol = 1/det(box);
2536 if (fr->n_tpi)
2538 /* Only correct for the interactions with the inserted molecule */
2539 dens = (fr->numAtomsForDispersionCorrection - fr->n_tpi)*invvol;
2540 ninter = fr->n_tpi;
2542 else
2544 dens = fr->numAtomsForDispersionCorrection*invvol;
2545 ninter = 0.5*fr->numAtomsForDispersionCorrection;
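/* With test-particle insertion only the n_tpi inserted atoms interact with the
 * rest of the system, so ninter = n_tpi and the inserted atoms are left out of
 * the density; otherwise every pair is counted once, giving ninter = N/2.
 */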
2548 if (ir->efep == efepNO)
2550 avcsix = fr->avcsix[0];
2551 avctwelve = fr->avctwelve[0];
2553 else
2555 avcsix = (1 - lambda)*fr->avcsix[0] + lambda*fr->avcsix[1];
2556 avctwelve = (1 - lambda)*fr->avctwelve[0] + lambda*fr->avctwelve[1];
2559 enerdiff = ninter*(dens*fr->enerdiffsix - fr->enershiftsix);
2560 *enercorr += avcsix*enerdiff;
2561 dvdlambda = 0.0;
2562 if (ir->efep != efepNO)
2564 dvdlambda += (fr->avcsix[1] - fr->avcsix[0])*enerdiff;
2566 if (bCorrAll)
2568 enerdiff = ninter*(dens*fr->enerdifftwelve - fr->enershifttwelve);
2569 *enercorr += avctwelve*enerdiff;
2570 if (fr->efep != efepNO)
2572 dvdlambda += (fr->avctwelve[1] - fr->avctwelve[0])*enerdiff;
2576 if (bCorrPres)
2578 svir = ninter*dens*avcsix*fr->virdiffsix/3.0;
2579 if (ir->eDispCorr == edispcAllEnerPres)
2581 svir += ninter*dens*avctwelve*fr->virdifftwelve/3.0;
2583 /* The factor 2 is because of the Gromacs virial definition */
2584 spres = -2.0*invvol*svir*PRESFAC;
2586 for (m = 0; m < DIM; m++)
2588 virial[m][m] += svir;
2589 pres[m][m] += spres;
2591 *prescorr += spres;
2594 /* Can't currently control when it prints; for now, just print when debugging */
2595 if (debug)
2597 if (bCorrAll)
2599 fprintf(debug, "Long Range LJ corr.: <C6> %10.4e, <C12> %10.4e\n",
2600 avcsix, avctwelve);
2602 if (bCorrPres)
2604 fprintf(debug,
2605 "Long Range LJ corr.: Epot %10g, Pres: %10g, Vir: %10g\n",
2606 *enercorr, spres, svir);
2608 else
2610 fprintf(debug, "Long Range LJ corr.: Epot %10g\n", *enercorr);
2614 if (fr->efep != efepNO)
2616 *dvdlcorr += dvdlambda;
2621 static void low_do_pbc_mtop(FILE *fplog, int ePBC, const matrix box,
2622 const gmx_mtop_t *mtop, rvec x[],
2623 gmx_bool bFirst)
2625 t_graph *graph;
2626 int as, mol;
2628 if (bFirst && fplog)
2630 fprintf(fplog, "Removing pbc first time\n");
2633 snew(graph, 1);
2634 as = 0;
2635 for (const gmx_molblock_t &molb : mtop->molblock)
2637 const gmx_moltype_t &moltype = mtop->moltype[molb.type];
2638 if (moltype.atoms.nr == 1 ||
2639 (!bFirst && moltype.cgs.nr == 1))
2641 /* Just one atom or charge group in the molecule, no PBC required */
2642 as += molb.nmol*moltype.atoms.nr;
2644 else
2646 mk_graph_moltype(moltype, graph);
2648 for (mol = 0; mol < molb.nmol; mol++)
2650 mk_mshift(fplog, graph, ePBC, box, x+as);
2652 shift_self(graph, box, x+as);
2653 /* The molecule is whole now.
2654 * We don't need the second mk_mshift call as in do_pbc_first,
2655 * since we no longer need this graph.
2658 as += moltype.atoms.nr;
2660 done_graph(graph);
2663 sfree(graph);
2666 void do_pbc_first_mtop(FILE *fplog, int ePBC, const matrix box,
2667 const gmx_mtop_t *mtop, rvec x[])
2669 low_do_pbc_mtop(fplog, ePBC, box, mtop, x, TRUE);
2672 void do_pbc_mtop(FILE *fplog, int ePBC, const matrix box,
2673 const gmx_mtop_t *mtop, rvec x[])
2675 low_do_pbc_mtop(fplog, ePBC, box, mtop, x, FALSE);
2678 void put_atoms_in_box_omp(int ePBC, const matrix box, gmx::ArrayRef<gmx::RVec> x)
2680 int t, nth;
2681 nth = gmx_omp_nthreads_get(emntDefault);
2683 #pragma omp parallel for num_threads(nth) schedule(static)
2684 for (t = 0; t < nth; t++)
2688 size_t natoms = x.size();
2689 size_t offset = (natoms*t )/nth;
2690 size_t len = (natoms*(t + 1))/nth - offset;
2691 put_atoms_in_box(ePBC, box, x.subArray(offset, len));
2693 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
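/* The static partition above gives each thread a contiguous chunk of roughly
 * natoms/nth atoms; e.g. natoms = 10 with nth = 4 yields offsets 0, 2, 5, 7 and
 * lengths 2, 3, 2, 3, so every atom is wrapped exactly once.
 */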
2697 // TODO This can be cleaned up a lot, and move back to runner.cpp
2698 void finish_run(FILE *fplog, const gmx::MDLogger &mdlog, const t_commrec *cr,
2699 const t_inputrec *inputrec,
2700 t_nrnb nrnb[], gmx_wallcycle_t wcycle,
2701 gmx_walltime_accounting_t walltime_accounting,
2702 nonbonded_verlet_t *nbv,
2703 const gmx_pme_t *pme,
2704 gmx_bool bWriteStat)
2706 t_nrnb *nrnb_tot = nullptr;
2707 double delta_t = 0;
2708 double nbfs = 0, mflop = 0;
2709 double elapsed_time,
2710 elapsed_time_over_all_ranks,
2711 elapsed_time_over_all_threads,
2712 elapsed_time_over_all_threads_over_all_ranks;
2713 /* Control whether it is valid to print a report. Only the
2714 simulation master may print, but it should not do so if the run
2715 terminated e.g. before a scheduled reset step. This is
2716 complicated by the fact that PME ranks are unaware of the
2717 reason why they were sent a pmerecvqxFINISH. To avoid
2718 communication deadlocks, we always do the communication for the
2719 report, even if we've decided not to write the report, because
2720 how long it takes to finish the run is not important when we've
2721 decided not to report on the simulation performance.
2723 Further, we only report performance for dynamical integrators,
2724 because those are the only ones for which we plan to
2725 consider doing any optimizations. */
2726 bool printReport = EI_DYNAMICS(inputrec->eI) && SIMMASTER(cr);
2728 if (printReport && !walltime_accounting_get_valid_finish(walltime_accounting))
2730 GMX_LOG(mdlog.warning).asParagraph().appendText("Simulation ended prematurely, no performance report will be written.");
2731 printReport = false;
2734 if (cr->nnodes > 1)
2736 snew(nrnb_tot, 1);
2737 #if GMX_MPI
2738 MPI_Allreduce(nrnb->n, nrnb_tot->n, eNRNB, MPI_DOUBLE, MPI_SUM,
2739 cr->mpi_comm_mysim);
2740 #endif
2742 else
2744 nrnb_tot = nrnb;
2747 elapsed_time = walltime_accounting_get_time_since_reset(walltime_accounting);
2748 elapsed_time_over_all_threads = walltime_accounting_get_time_since_reset_over_all_threads(walltime_accounting);
2749 if (cr->nnodes > 1)
2751 #if GMX_MPI
2752 /* reduce elapsed_time over all MPI ranks in the current simulation */
2753 MPI_Allreduce(&elapsed_time,
2754 &elapsed_time_over_all_ranks,
2755 1, MPI_DOUBLE, MPI_SUM,
2756 cr->mpi_comm_mysim);
2757 elapsed_time_over_all_ranks /= cr->nnodes;
2758 /* Reduce elapsed_time_over_all_threads over all MPI ranks in the
2759 * current simulation. */
2760 MPI_Allreduce(&elapsed_time_over_all_threads,
2761 &elapsed_time_over_all_threads_over_all_ranks,
2762 1, MPI_DOUBLE, MPI_SUM,
2763 cr->mpi_comm_mysim);
2764 #endif
2766 else
2768 elapsed_time_over_all_ranks = elapsed_time;
2769 elapsed_time_over_all_threads_over_all_ranks = elapsed_time_over_all_threads;
2772 if (printReport)
2774 print_flop(fplog, nrnb_tot, &nbfs, &mflop);
2776 if (cr->nnodes > 1)
2778 sfree(nrnb_tot);
2781 if (thisRankHasDuty(cr, DUTY_PP) && DOMAINDECOMP(cr))
2783 print_dd_statistics(cr, inputrec, fplog);
2786 /* TODO Move the responsibility for any scaling by thread counts
2787 * to the code that handled the thread region, so that there's a
2788 * mechanism to keep cycle counting working during the transition
2789 * to task parallelism. */
2790 int nthreads_pp = gmx_omp_nthreads_get(emntNonbonded);
2791 int nthreads_pme = gmx_omp_nthreads_get(emntPME);
2792 wallcycle_scale_by_num_threads(wcycle, thisRankHasDuty(cr, DUTY_PME) && !thisRankHasDuty(cr, DUTY_PP), nthreads_pp, nthreads_pme);
2793 auto cycle_sum(wallcycle_sum(cr, wcycle));
2795 if (printReport)
2797 auto nbnxn_gpu_timings = use_GPU(nbv) ? nbnxn_gpu_get_timings(nbv->gpu_nbv) : nullptr;
2798 gmx_wallclock_gpu_pme_t pme_gpu_timings = {};
2799 if (pme_gpu_task_enabled(pme))
2801 pme_gpu_get_timings(pme, &pme_gpu_timings);
2803 wallcycle_print(fplog, mdlog, cr->nnodes, cr->npmenodes, nthreads_pp, nthreads_pme,
2804 elapsed_time_over_all_ranks,
2805 wcycle, cycle_sum,
2806 nbnxn_gpu_timings,
2807 &pme_gpu_timings);
2809 if (EI_DYNAMICS(inputrec->eI))
2811 delta_t = inputrec->delta_t;
2814 if (fplog)
2816 print_perf(fplog, elapsed_time_over_all_threads_over_all_ranks,
2817 elapsed_time_over_all_ranks,
2818 walltime_accounting_get_nsteps_done_since_reset(walltime_accounting),
2819 delta_t, nbfs, mflop);
2821 if (bWriteStat)
2823 print_perf(stderr, elapsed_time_over_all_threads_over_all_ranks,
2824 elapsed_time_over_all_ranks,
2825 walltime_accounting_get_nsteps_done_since_reset(walltime_accounting),
2826 delta_t, nbfs, mflop);
2831 extern void initialize_lambdas(FILE *fplog, t_inputrec *ir, int *fep_state, gmx::ArrayRef<real> lambda, double *lam0)
2833 /* this function works, but could probably use a logic rewrite to keep all the different
2834 types of efep straight. */
2836 if ((ir->efep == efepNO) && (!ir->bSimTemp))
2838 return;
2841 t_lambda *fep = ir->fepvals;
2842 *fep_state = fep->init_fep_state; /* this might overwrite the checkpoint
2843 if checkpoint is set -- a kludge is in for now
2844 to prevent this.*/
2846 for (int i = 0; i < efptNR; i++)
2848 /* overwrite lambda state with init_lambda for now for backwards compatibility */
2849 if (fep->init_lambda >= 0) /* if it's -1, it was never initialized */
2851 lambda[i] = fep->init_lambda;
2852 if (lam0)
2854 lam0[i] = lambda[i];
2857 else
2859 lambda[i] = fep->all_lambda[i][*fep_state];
2860 if (lam0)
2862 lam0[i] = lambda[i];
2866 if (ir->bSimTemp)
2868 /* need to rescale control temperatures to match current state */
2869 for (int i = 0; i < ir->opts.ngtc; i++)
2871 if (ir->opts.ref_t[i] > 0)
2873 ir->opts.ref_t[i] = ir->simtempvals->temperatures[*fep_state];
2878 /* Send to the log the information on the current lambdas */
2879 if (fplog != nullptr)
2881 fprintf(fplog, "Initial vector of lambda components:[ ");
2882 for (const auto &l : lambda)
2884 fprintf(fplog, "%10.4f ", l);
2886 fprintf(fplog, "]\n");
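/* Note on initialize_lambdas: the lambda vector has one entry per perturbation
 * component (the efpt* indices such as efptCOUL, efptVDW and efptBONDED used
 * elsewhere in this file). With init_lambda >= 0 every component is set to that
 * single value; otherwise each component is taken from all_lambda at the chosen
 * fep_state.
 */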
2891 void init_md(FILE *fplog,
2892 const t_commrec *cr, gmx::IMDOutputProvider *outputProvider,
2893 t_inputrec *ir, const gmx_output_env_t *oenv,
2894 const MdrunOptions &mdrunOptions,
2895 double *t, double *t0,
2896 t_state *globalState, double *lam0,
2897 t_nrnb *nrnb, gmx_mtop_t *mtop,
2898 gmx_update_t **upd,
2899 gmx::BoxDeformation *deform,
2900 int nfile, const t_filenm fnm[],
2901 gmx_mdoutf_t *outf, t_mdebin **mdebin,
2902 tensor force_vir, tensor shake_vir,
2903 tensor total_vir, tensor pres, rvec mu_tot,
2904 gmx_bool *bSimAnn, t_vcm **vcm,
2905 gmx_wallcycle_t wcycle)
2907 int i;
2909 /* Initial values */
2910 *t = *t0 = ir->init_t;
2912 *bSimAnn = FALSE;
2913 for (i = 0; i < ir->opts.ngtc; i++)
2915 /* set bSimAnn if any group is being annealed */
2916 if (ir->opts.annealing[i] != eannNO)
2918 *bSimAnn = TRUE;
2922 /* Initialize lambda variables */
2923 /* TODO: Clean up initialization of fep_state and lambda in t_state.
2924 * We currently need to call initialize_lambdas on non-master ranks
2925 * to initialize lam0.
2927 if (MASTER(cr))
2929 initialize_lambdas(fplog, ir, &globalState->fep_state, globalState->lambda, lam0);
2931 else
2933 int tmpFepState;
2934 std::array<real, efptNR> tmpLambda;
2935 initialize_lambdas(fplog, ir, &tmpFepState, tmpLambda, lam0);
2938 // TODO upd is never NULL in practice, but the analysers don't know that
2939 if (upd)
2941 *upd = init_update(ir, deform);
2943 if (*bSimAnn)
2945 update_annealing_target_temp(ir, ir->init_t, upd ? *upd : nullptr);
2948 if (vcm != nullptr)
2950 *vcm = init_vcm(fplog, &mtop->groups, ir);
2953 if (EI_DYNAMICS(ir->eI) && !mdrunOptions.continuationOptions.appendFiles)
2955 if (ir->etc == etcBERENDSEN)
2957 please_cite(fplog, "Berendsen84a");
2959 if (ir->etc == etcVRESCALE)
2961 please_cite(fplog, "Bussi2007a");
2963 if (ir->eI == eiSD1)
2965 please_cite(fplog, "Goga2012");
2968 init_nrnb(nrnb);
2970 if (nfile != -1)
2972 *outf = init_mdoutf(fplog, nfile, fnm, mdrunOptions, cr, outputProvider, ir, mtop, oenv, wcycle);
2974 *mdebin = init_mdebin(mdrunOptions.continuationOptions.appendFiles ? nullptr : mdoutf_get_fp_ene(*outf),
2975 mtop, ir, mdoutf_get_fp_dhdl(*outf));
2978 /* Initiate variables */
2979 clear_mat(force_vir);
2980 clear_mat(shake_vir);
2981 clear_rvec(mu_tot);
2982 clear_mat(total_vir);
2983 clear_mat(pres);
2986 void init_rerun(FILE *fplog,
2987 const t_commrec *cr, gmx::IMDOutputProvider *outputProvider,
2988 t_inputrec *ir, const gmx_output_env_t *oenv,
2989 const MdrunOptions &mdrunOptions,
2990 t_state *globalState, double *lam0,
2991 t_nrnb *nrnb, gmx_mtop_t *mtop,
2992 int nfile, const t_filenm fnm[],
2993 gmx_mdoutf_t *outf, t_mdebin **mdebin,
2994 gmx_wallcycle_t wcycle)
2996 /* Initialize lambda variables */
2997 /* TODO: Clean up initialization of fep_state and lambda in t_state.
2998 * We currently need to call initialize_lambdas on non-master ranks
2999 * to initialize lam0.
3001 if (MASTER(cr))
3003 initialize_lambdas(fplog, ir, &globalState->fep_state, globalState->lambda, lam0);
3005 else
3007 int tmpFepState;
3008 std::array<real, efptNR> tmpLambda;
3009 initialize_lambdas(fplog, ir, &tmpFepState, tmpLambda, lam0);
3012 init_nrnb(nrnb);
3014 if (nfile != -1)
3016 *outf = init_mdoutf(fplog, nfile, fnm, mdrunOptions, cr, outputProvider, ir, mtop, oenv, wcycle);
3017 *mdebin = init_mdebin(mdrunOptions.continuationOptions.appendFiles ? nullptr : mdoutf_get_fp_ene(*outf),
3018 mtop, ir, mdoutf_get_fp_dhdl(*outf), true);