src/gromacs/mdlib/sim_util.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   5  * Copyright (c) 2001-2004, The GROMACS development team.
   6  * Copyright (c) 2013,2014,2015,2016,2017,2018, by the GROMACS development team, led by
   7  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   8  * and including many others, as listed in the AUTHORS file in the
   9  * top-level source directory and at http://www.gromacs.org.
  10  *
  11  * GROMACS is free software; you can redistribute it and/or
  12  * modify it under the terms of the GNU Lesser General Public License
  13  * as published by the Free Software Foundation; either version 2.1
  14  * of the License, or (at your option) any later version.
  15  *
  16  * GROMACS is distributed in the hope that it will be useful,
  17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  19  * Lesser General Public License for more details.
  20  *
  21  * You should have received a copy of the GNU Lesser General Public
  22  * License along with GROMACS; if not, see
  23  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  24  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  25  *
  26  * If you want to redistribute modifications to GROMACS, please
  27  * consider that scientific software is very special. Version
  28  * control is crucial - bugs must be traceable. We will be happy to
  29  * consider code for inclusion in the official distribution, but
  30  * derived work must not be called official GROMACS. Details are found
  31  * in the README & COPYING files - if they are missing, get the
  32  * official version at http://www.gromacs.org.
  33  *
  34  * To help us fund GROMACS development, we humbly ask that you cite
  35  * the research papers on the package. Check out http://www.gromacs.org.
  36  */
  37 #include "gmxpre.h"
  38
  39 #include "sim_util.h"
  40
  41 #include "config.h"
  42
  43 #include <stdio.h>
  44 #include <string.h>
  45
  46 #include <cmath>
  47 #include <cstdint>
  48
  49 #include <array>
  50
  51 #include "gromacs/awh/awh.h"
  52 #include "gromacs/domdec/dlbtiming.h"
  53 #include "gromacs/domdec/domdec.h"
  54 #include "gromacs/domdec/domdec_struct.h"
  55 #include "gromacs/essentialdynamics/edsam.h"
  56 #include "gromacs/ewald/pme.h"
  57 #include "gromacs/gmxlib/chargegroup.h"
  58 #include "gromacs/gmxlib/network.h"
  59 #include "gromacs/gmxlib/nrnb.h"
  60 #include "gromacs/gmxlib/nonbonded/nb_free_energy.h"
  61 #include "gromacs/gmxlib/nonbonded/nb_kernel.h"
  62 #include "gromacs/gmxlib/nonbonded/nonbonded.h"
  63 #include "gromacs/gpu_utils/gpu_utils.h"
  64 #include "gromacs/imd/imd.h"
  65 #include "gromacs/listed-forces/bonded.h"
  66 #include "gromacs/listed-forces/disre.h"
  67 #include "gromacs/listed-forces/orires.h"
  68 #include "gromacs/math/functions.h"
  69 #include "gromacs/math/units.h"
  70 #include "gromacs/math/vec.h"
  71 #include "gromacs/math/vecdump.h"
  72 #include "gromacs/mdlib/calcmu.h"
  73 #include "gromacs/mdlib/calcvir.h"
  74 #include "gromacs/mdlib/constr.h"
  75 #include "gromacs/mdlib/force.h"
  76 #include "gromacs/mdlib/forcerec.h"
  77 #include "gromacs/mdlib/gmx_omp_nthreads.h"
  78 #include "gromacs/mdlib/mdrun.h"
  79 #include "gromacs/mdlib/nb_verlet.h"
  80 #include "gromacs/mdlib/nbnxn_atomdata.h"
  81 #include "gromacs/mdlib/nbnxn_gpu_data_mgmt.h"
  82 #include "gromacs/mdlib/nbnxn_grid.h"
  83 #include "gromacs/mdlib/nbnxn_search.h"
  84 #include "gromacs/mdlib/qmmm.h"
  85 #include "gromacs/mdlib/update.h"
  86 #include "gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_gpu_ref.h"
  87 #include "gromacs/mdtypes/commrec.h"
  88 #include "gromacs/mdtypes/forceoutput.h"
  89 #include "gromacs/mdtypes/iforceprovider.h"
  90 #include "gromacs/mdtypes/inputrec.h"
  91 #include "gromacs/mdtypes/md_enums.h"
  92 #include "gromacs/mdtypes/state.h"
  93 #include "gromacs/pbcutil/ishift.h"
  94 #include "gromacs/pbcutil/mshift.h"
  95 #include "gromacs/pbcutil/pbc.h"
  96 #include "gromacs/pulling/pull.h"
  97 #include "gromacs/pulling/pull_rotation.h"
  98 #include "gromacs/timing/cyclecounter.h"
  99 #include "gromacs/timing/gpu_timing.h"
 100 #include "gromacs/timing/wallcycle.h"
 101 #include "gromacs/timing/wallcyclereporting.h"
 102 #include "gromacs/timing/walltime_accounting.h"
 103 #include "gromacs/topology/topology.h"
 104 #include "gromacs/utility/arrayref.h"
 105 #include "gromacs/utility/basedefinitions.h"
 106 #include "gromacs/utility/cstringutil.h"
 107 #include "gromacs/utility/exceptions.h"
 108 #include "gromacs/utility/fatalerror.h"
 109 #include "gromacs/utility/gmxassert.h"
 110 #include "gromacs/utility/gmxmpi.h"
 111 #include "gromacs/utility/logger.h"
 112 #include "gromacs/utility/pleasecite.h"
 113 #include "gromacs/utility/smalloc.h"
 114 #include "gromacs/utility/sysinfo.h"
 115
 116 #include "nbnxn_gpu.h"
 117 #include "nbnxn_kernels/nbnxn_kernel_cpu.h"
 118 #include "nbnxn_kernels/nbnxn_kernel_prune.h"
 119
 120 // TODO: this environment variable allows us to verify before release
 121 // that on less common architectures the total cost of polling is not larger than
 122 // a blocking wait (so polling does not introduce overhead when the static
 123 // PME-first ordering would suffice).
 124 static const bool c_disableAlternatingWait = (getenv("GMX_DISABLE_ALTERNATING_GPU_WAIT") != nullptr);
 125
 126
 127 void print_time(FILE                     *out,
 128                 gmx_walltime_accounting_t walltime_accounting,
 129                 int64_t                   step,
 130                 t_inputrec               *ir,
 131                 const t_commrec          *cr)
 132 {
 133     time_t finish;
 134     char   timebuf[STRLEN];
 135     double dt, elapsed_seconds, time_per_step;
 136     char   buf[48];
 137
 138 #if !GMX_THREAD_MPI
 139     if (!PAR(cr))
 140 #endif
 141     {
 142         fprintf(out, "\r");
 143     }
 144     fprintf(out, "step %s", gmx_step_str(step, buf));
 145     fflush(out);
 146
 147     if ((step >= ir->nstlist))
 148     {
 149         double seconds_since_epoch = gmx_gettime();
 150         elapsed_seconds = seconds_since_epoch - walltime_accounting_get_start_time_stamp(walltime_accounting);
 151         time_per_step   = elapsed_seconds/(step - ir->init_step + 1);
 152         dt              = (ir->nsteps + ir->init_step - step) * time_per_step;
 153
 154         if (ir->nsteps >= 0)
 155         {
 156             if (dt >= 300)
 157             {
 158                 finish = (time_t) (seconds_since_epoch + dt);
 159                 gmx_ctime_r(&finish, timebuf, STRLEN);
 160                 sprintf(buf, "%s", timebuf);
 161                 buf[strlen(buf)-1] = '\0';
 162                 fprintf(out, ", will finish %s", buf);
 163             }
 164             else
 165             {
 166                 fprintf(out, ", remaining wall clock time: %5d s          ", (int)dt);
 167             }
 168         }
 169         else
 170         {
 171             fprintf(out, " performance: %.1f ns/day    ",
 172                     ir->delta_t/1000*24*60*60/time_per_step);
 173         }
 174     }
 175 #if !GMX_THREAD_MPI
 176     if (PAR(cr))
 177     {
 178         fprintf(out, "\n");
 179     }
 180 #else
 181     GMX_UNUSED_VALUE(cr);
 182 #endif
 183
 184     fflush(out);
 185 }
 186
 187 void print_date_and_time(FILE *fplog, int nodeid, const char *title,
 188                          double the_time)
 189 {
 190     char   time_string[STRLEN];
 191
 192     if (!fplog)
 193     {
 194         return;
 195     }
 196
 197     {
 198         int    i;
 199         char   timebuf[STRLEN];
 200         time_t temp_time = (time_t) the_time;
 201
 202         gmx_ctime_r(&temp_time, timebuf, STRLEN);
 203         for (i = 0; timebuf[i] >= ' '; i++)
 204         {
 205             time_string[i] = timebuf[i];
 206         }
 207         time_string[i] = '\0';
 208     }
 209
 210     fprintf(fplog, "%s on rank %d %s\n", title, nodeid, time_string);
 211 }
 212
 213 void print_start(FILE *fplog, const t_commrec *cr,
 214                  gmx_walltime_accounting_t walltime_accounting,
 215                  const char *name)
 216 {
 217     char buf[STRLEN];
 218
 219     sprintf(buf, "Started %s", name);
 220     print_date_and_time(fplog, cr->nodeid, buf,
 221                         walltime_accounting_get_start_time_stamp(walltime_accounting));
 222 }
 223
 224 static void sum_forces(rvec f[], gmx::ArrayRef<const gmx::RVec> forceToAdd)
 225 {
 226     const int end = forceToAdd.size();
 227
 228     // cppcheck-suppress unreadVariable
 229     int gmx_unused nt = gmx_omp_nthreads_get(emntDefault);
 230 #pragma omp parallel for num_threads(nt) schedule(static)
 231     for (int i = 0; i < end; i++)
 232     {
 233         rvec_inc(f[i], forceToAdd[i]);
 234     }
 235 }
 236
 237 static void pme_gpu_reduce_outputs(gmx_wallcycle_t                 wcycle,
 238                                    gmx::ForceWithVirial           *forceWithVirial,
 239                                    gmx::ArrayRef<const gmx::RVec>  pmeForces,
 240                                    gmx_enerdata_t                 *enerd,
 241                                    const tensor                    vir_Q,
 242                                    real                            Vlr_q)
 243 {
 244     wallcycle_start(wcycle, ewcPME_GPU_F_REDUCTION);
 245     GMX_ASSERT(forceWithVirial, "Invalid force pointer");
 246     forceWithVirial->addVirialContribution(vir_Q);
 247     enerd->term[F_COUL_RECIP] += Vlr_q;
 248     sum_forces(as_rvec_array(forceWithVirial->force_.data()), pmeForces);
 249     wallcycle_stop(wcycle, ewcPME_GPU_F_REDUCTION);
 250 }
 251
 252 static void calc_virial(int start, int homenr, rvec x[], rvec f[],
 253                         tensor vir_part, t_graph *graph, matrix box,
 254                         t_nrnb *nrnb, const t_forcerec *fr, int ePBC)
 255 {
 256     int    i;
 257
 258     /* The short-range virial from surrounding boxes */
 259     calc_vir(SHIFTS, fr->shift_vec, fr->fshift, vir_part, ePBC == epbcSCREW, box);
 260     inc_nrnb(nrnb, eNR_VIRIAL, SHIFTS);
 261
 262     /* Calculate partial virial, for local atoms only, based on short range.
 263      * Total virial is computed in global_stat, called from do_md
 264      */
 265     f_calc_vir(start, start+homenr, x, f, vir_part, graph, box);
 266     inc_nrnb(nrnb, eNR_VIRIAL, homenr);
 267
 268     /* Add wall contribution */
 269     for (i = 0; i < DIM; i++)
 270     {
 271         vir_part[i][ZZ] += fr->vir_wall_z[i];
 272     }
 273
 274     if (debug)
 275     {
 276         pr_rvecs(debug, 0, "vir_part", vir_part, DIM);
 277     }
 278 }
 279
 280 static void pull_potential_wrapper(const t_commrec *cr,
 281                                    const t_inputrec *ir,
 282                                    matrix box, gmx::ArrayRef<const gmx::RVec> x,
 283                                    gmx::ForceWithVirial *force,
 284                                    const t_mdatoms *mdatoms,
 285                                    gmx_enerdata_t *enerd,
 286                                    real *lambda,
 287                                    double t,
 288                                    gmx_wallcycle_t wcycle)
 289 {
 290     t_pbc  pbc;
 291     real   dvdl;
 292
 293     /* Calculate the center of mass forces, this requires communication,
 294      * which is why pull_potential is called close to other communication.
 295      */
 296     wallcycle_start(wcycle, ewcPULLPOT);
 297     set_pbc(&pbc, ir->ePBC, box);
 298     dvdl                     = 0;
 299     enerd->term[F_COM_PULL] +=
 300         pull_potential(ir->pull_work, mdatoms, &pbc,
 301                        cr, t, lambda[efptRESTRAINT], as_rvec_array(x.data()), force, &dvdl);
 302     enerd->dvdl_lin[efptRESTRAINT] += dvdl;
 303     wallcycle_stop(wcycle, ewcPULLPOT);
 304 }
 305
 306 static void pme_receive_force_ener(const t_commrec      *cr,
 307                                    gmx::ForceWithVirial *forceWithVirial,
 308                                    gmx_enerdata_t       *enerd,
 309                                    gmx_wallcycle_t       wcycle)
 310 {
 311     real   e_q, e_lj, dvdl_q, dvdl_lj;
 312     float  cycles_ppdpme, cycles_seppme;
 313
 314     cycles_ppdpme = wallcycle_stop(wcycle, ewcPPDURINGPME);
 315     dd_cycles_add(cr->dd, cycles_ppdpme, ddCyclPPduringPME);
 316
 317     /* In case of node-splitting, the PP nodes receive the long-range
 318      * forces, virial and energy from the PME nodes here.
 319      */
 320     wallcycle_start(wcycle, ewcPP_PMEWAITRECVF);
 321     dvdl_q  = 0;
 322     dvdl_lj = 0;
 323     gmx_pme_receive_f(cr, forceWithVirial, &e_q, &e_lj, &dvdl_q, &dvdl_lj,
 324                       &cycles_seppme);
 325     enerd->term[F_COUL_RECIP] += e_q;
 326     enerd->term[F_LJ_RECIP]   += e_lj;
 327     enerd->dvdl_lin[efptCOUL] += dvdl_q;
 328     enerd->dvdl_lin[efptVDW]  += dvdl_lj;
 329
 330     if (wcycle)
 331     {
 332         dd_cycles_add(cr->dd, cycles_seppme, ddCyclPME);
 333     }
 334     wallcycle_stop(wcycle, ewcPP_PMEWAITRECVF);
 335 }
 336
 337 static void print_large_forces(FILE            *fp,
 338                                const t_mdatoms *md,
 339                                const t_commrec *cr,
 340                                int64_t          step,
 341                                real             forceTolerance,
 342                                const rvec      *x,
 343                                const rvec      *f)
 344 {
 345     real           force2Tolerance = gmx::square(forceTolerance);
 346     std::uintmax_t numNonFinite    = 0;
 347     for (int i = 0; i < md->homenr; i++)
 348     {
 349         real force2    = norm2(f[i]);
 350         bool nonFinite = !std::isfinite(force2);
 351         if (force2 >= force2Tolerance || nonFinite)
 352         {
 353             fprintf(fp, "step %" PRId64 " atom %6d  x %8.3f %8.3f %8.3f  force %12.5e\n",
 354                     step,
 355                     ddglatnr(cr->dd, i), x[i][XX], x[i][YY], x[i][ZZ], std::sqrt(force2));
 356         }
 357         if (nonFinite)
 358         {
 359             numNonFinite++;
 360         }
 361     }
 362     if (numNonFinite > 0)
 363     {
 364         /* Note that with MPI this fatal call on one rank might interrupt
 365          * the printing on other ranks. But we can only avoid that with
 366          * an expensive MPI barrier that we would need at each step.
 367          */
 368         gmx_fatal(FARGS, "At step %" PRId64 " detected non-finite forces on %ju atoms", step, numNonFinite);
 369     }
 370 }
 371
 372 static void post_process_forces(const t_commrec           *cr,
 373                                 int64_t                    step,
 374                                 t_nrnb                    *nrnb,
 375                                 gmx_wallcycle_t            wcycle,
 376                                 const gmx_localtop_t      *top,
 377                                 matrix                     box,
 378                                 rvec                       x[],
 379                                 rvec                       f[],
 380                                 gmx::ForceWithVirial      *forceWithVirial,
 381                                 tensor                     vir_force,
 382                                 const t_mdatoms           *mdatoms,
 383                                 t_graph                   *graph,
 384                                 t_forcerec                *fr,
 385                                 const gmx_vsite_t         *vsite,
 386                                 int                        flags)
 387 {
 388     if (fr->haveDirectVirialContributions)
 389     {
 390         rvec *fDirectVir = as_rvec_array(forceWithVirial->force_.data());
 391
 392         if (vsite)
 393         {
 394             /* Spread the mesh force on virtual sites to the other particles...
 395              * This is parallellized. MPI communication is performed
 396              * if the constructing atoms aren't local.
 397              */
 398             matrix virial = { { 0 } };
 399             spread_vsite_f(vsite, x, fDirectVir, nullptr,
 400                            (flags & GMX_FORCE_VIRIAL), virial,
 401                            nrnb,
 402                            &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr, wcycle);
 403             forceWithVirial->addVirialContribution(virial);
 404         }
 405
 406         if (flags & GMX_FORCE_VIRIAL)
 407         {
 408             /* Now add the forces, this is local */
 409             sum_forces(f, forceWithVirial->force_);
 410
 411             /* Add the direct virial contributions */
 412             GMX_ASSERT(forceWithVirial->computeVirial_, "forceWithVirial should request virial computation when we request the virial");
 413             m_add(vir_force, forceWithVirial->getVirial(), vir_force);
 414
 415             if (debug)
 416             {
 417                 pr_rvecs(debug, 0, "vir_force", vir_force, DIM);
 418             }
 419         }
 420     }
 421
 422     if (fr->print_force >= 0)
 423     {
 424         print_large_forces(stderr, mdatoms, cr, step, fr->print_force, x, f);
 425     }
 426 }
 427
 428 static void do_nb_verlet(t_forcerec *fr,
 429                          interaction_const_t *ic,
 430                          gmx_enerdata_t *enerd,
 431                          int flags, int ilocality,
 432                          int clearF,
 433                          int64_t step,
 434                          t_nrnb *nrnb,
 435                          gmx_wallcycle_t wcycle)
 436 {
 437     if (!(flags & GMX_FORCE_NONBONDED))
 438     {
 439         /* skip non-bonded calculation */
 440         return;
 441     }
 442
 443     nonbonded_verlet_t       *nbv  = fr->nbv;
 444     nonbonded_verlet_group_t *nbvg = &nbv->grp[ilocality];
 445
 446     /* GPU kernel launch overhead is already timed separately */
 447     if (fr->cutoff_scheme != ecutsVERLET)
 448     {
 449         gmx_incons("Invalid cut-off scheme passed!");
 450     }
 451
 452     bool bUsingGpuKernels = (nbvg->kernel_type == nbnxnk8x8x8_GPU);
 453
 454     if (!bUsingGpuKernels)
 455     {
 456         /* When dynamic pair-list  pruning is requested, we need to prune
 457          * at nstlistPrune steps.
 458          */
 459         if (nbv->listParams->useDynamicPruning &&
 460             (step - nbvg->nbl_lists.outerListCreationStep) % nbv->listParams->nstlistPrune == 0)
 461         {
 462             /* Prune the pair-list beyond fr->ic->rlistPrune using
 463              * the current coordinates of the atoms.
 464              */
 465             wallcycle_sub_start(wcycle, ewcsNONBONDED_PRUNING);
 466             nbnxn_kernel_cpu_prune(nbvg, nbv->nbat, fr->shift_vec, nbv->listParams->rlistInner);
 467             wallcycle_sub_stop(wcycle, ewcsNONBONDED_PRUNING);
 468         }
 469
 470         wallcycle_sub_start(wcycle, ewcsNONBONDED);
 471     }
 472
 473     switch (nbvg->kernel_type)
 474     {
 475         case nbnxnk4x4_PlainC:
 476         case nbnxnk4xN_SIMD_4xN:
 477         case nbnxnk4xN_SIMD_2xNN:
 478             nbnxn_kernel_cpu(nbvg,
 479                              nbv->nbat,
 480                              ic,
 481                              fr->shift_vec,
 482                              flags,
 483                              clearF,
 484                              fr->fshift[0],
 485                              enerd->grpp.ener[egCOULSR],
 486                              fr->bBHAM ?
 487                              enerd->grpp.ener[egBHAMSR] :
 488                              enerd->grpp.ener[egLJSR]);
 489             break;
 490
 491         case nbnxnk8x8x8_GPU:
 492             nbnxn_gpu_launch_kernel(nbv->gpu_nbv, nbv->nbat, flags, ilocality);
 493             break;
 494
 495         case nbnxnk8x8x8_PlainC:
 496             nbnxn_kernel_gpu_ref(nbvg->nbl_lists.nbl[0],
 497                                  nbv->nbat, ic,
 498                                  fr->shift_vec,
 499                                  flags,
 500                                  clearF,
 501                                  nbv->nbat->out[0].f,
 502                                  fr->fshift[0],
 503                                  enerd->grpp.ener[egCOULSR],
 504                                  fr->bBHAM ?
 505                                  enerd->grpp.ener[egBHAMSR] :
 506                                  enerd->grpp.ener[egLJSR]);
 507             break;
 508
 509         default:
 510             GMX_RELEASE_ASSERT(false, "Invalid nonbonded kernel type passed!");
 511
 512     }
 513     if (!bUsingGpuKernels)
 514     {
 515         wallcycle_sub_stop(wcycle, ewcsNONBONDED);
 516     }
 517
 518     int enr_nbnxn_kernel_ljc, enr_nbnxn_kernel_lj;
 519     if (EEL_RF(ic->eeltype) || ic->eeltype == eelCUT)
 520     {
 521         enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_RF;
 522     }
 523     else if ((!bUsingGpuKernels && nbvg->ewald_excl == ewaldexclAnalytical) ||
 524              (bUsingGpuKernels && nbnxn_gpu_is_kernel_ewald_analytical(nbv->gpu_nbv)))
 525     {
 526         enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_EWALD;
 527     }
 528     else
 529     {
 530         enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_TAB;
 531     }
 532     enr_nbnxn_kernel_lj = eNR_NBNXN_LJ;
 533     if (flags & GMX_FORCE_ENERGY)
 534     {
 535         /* In eNR_??? the nbnxn F+E kernels are always the F kernel + 1 */
 536         enr_nbnxn_kernel_ljc += 1;
 537         enr_nbnxn_kernel_lj  += 1;
 538     }
 539
 540     inc_nrnb(nrnb, enr_nbnxn_kernel_ljc,
 541              nbvg->nbl_lists.natpair_ljq);
 542     inc_nrnb(nrnb, enr_nbnxn_kernel_lj,
 543              nbvg->nbl_lists.natpair_lj);
 544     /* The Coulomb-only kernels are offset -eNR_NBNXN_LJ_RF+eNR_NBNXN_RF */
 545     inc_nrnb(nrnb, enr_nbnxn_kernel_ljc-eNR_NBNXN_LJ_RF+eNR_NBNXN_RF,
 546              nbvg->nbl_lists.natpair_q);
 547
 548     if (ic->vdw_modifier == eintmodFORCESWITCH)
 549     {
 550         /* We add up the switch cost separately */
 551         inc_nrnb(nrnb, eNR_NBNXN_ADD_LJ_FSW+((flags & GMX_FORCE_ENERGY) ? 1 : 0),
 552                  nbvg->nbl_lists.natpair_ljq + nbvg->nbl_lists.natpair_lj);
 553     }
 554     if (ic->vdw_modifier == eintmodPOTSWITCH)
 555     {
 556         /* We add up the switch cost separately */
 557         inc_nrnb(nrnb, eNR_NBNXN_ADD_LJ_PSW+((flags & GMX_FORCE_ENERGY) ? 1 : 0),
 558                  nbvg->nbl_lists.natpair_ljq + nbvg->nbl_lists.natpair_lj);
 559     }
 560     if (ic->vdwtype == evdwPME)
 561     {
 562         /* We add up the LJ Ewald cost separately */
 563         inc_nrnb(nrnb, eNR_NBNXN_ADD_LJ_EWALD+((flags & GMX_FORCE_ENERGY) ? 1 : 0),
 564                  nbvg->nbl_lists.natpair_ljq + nbvg->nbl_lists.natpair_lj);
 565     }
 566 }
 567
 568 static void do_nb_verlet_fep(nbnxn_pairlist_set_t *nbl_lists,
 569                              t_forcerec           *fr,
 570                              rvec                  x[],
 571                              rvec                  f[],
 572                              const t_mdatoms      *mdatoms,
 573                              t_lambda             *fepvals,
 574                              real                 *lambda,
 575                              gmx_enerdata_t       *enerd,
 576                              int                   flags,
 577                              t_nrnb               *nrnb,
 578                              gmx_wallcycle_t       wcycle)
 579 {
 580     int              donb_flags;
 581     nb_kernel_data_t kernel_data;
 582     real             lam_i[efptNR];
 583     real             dvdl_nb[efptNR];
 584     int              th;
 585     int              i, j;
 586
 587     donb_flags = 0;
 588     /* Add short-range interactions */
 589     donb_flags |= GMX_NONBONDED_DO_SR;
 590
 591     /* Currently all group scheme kernels always calculate (shift-)forces */
 592     if (flags & GMX_FORCE_FORCES)
 593     {
 594         donb_flags |= GMX_NONBONDED_DO_FORCE;
 595     }
 596     if (flags & GMX_FORCE_VIRIAL)
 597     {
 598         donb_flags |= GMX_NONBONDED_DO_SHIFTFORCE;
 599     }
 600     if (flags & GMX_FORCE_ENERGY)
 601     {
 602         donb_flags |= GMX_NONBONDED_DO_POTENTIAL;
 603     }
 604
 605     kernel_data.flags  = donb_flags;
 606     kernel_data.lambda = lambda;
 607     kernel_data.dvdl   = dvdl_nb;
 608
 609     kernel_data.energygrp_elec = enerd->grpp.ener[egCOULSR];
 610     kernel_data.energygrp_vdw  = enerd->grpp.ener[egLJSR];
 611
 612     /* reset free energy components */
 613     for (i = 0; i < efptNR; i++)
 614     {
 615         dvdl_nb[i]  = 0;
 616     }
 617
 618     GMX_ASSERT(gmx_omp_nthreads_get(emntNonbonded) == nbl_lists->nnbl, "Number of lists should be same as number of NB threads");
 619
 620     wallcycle_sub_start(wcycle, ewcsNONBONDED);
 621 #pragma omp parallel for schedule(static) num_threads(nbl_lists->nnbl)
 622     for (th = 0; th < nbl_lists->nnbl; th++)
 623     {
 624         try
 625         {
 626             gmx_nb_free_energy_kernel(nbl_lists->nbl_fep[th],
 627                                       x, f, fr, mdatoms, &kernel_data, nrnb);
 628         }
 629         GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
 630     }
 631
 632     if (fepvals->sc_alpha != 0)
 633     {
 634         enerd->dvdl_nonlin[efptVDW]  += dvdl_nb[efptVDW];
 635         enerd->dvdl_nonlin[efptCOUL] += dvdl_nb[efptCOUL];
 636     }
 637     else
 638     {
 639         enerd->dvdl_lin[efptVDW]  += dvdl_nb[efptVDW];
 640         enerd->dvdl_lin[efptCOUL] += dvdl_nb[efptCOUL];
 641     }
 642
 643     /* If we do foreign lambda and we have soft-core interactions
 644      * we have to recalculate the (non-linear) energies contributions.
 645      */
 646     if (fepvals->n_lambda > 0 && (flags & GMX_FORCE_DHDL) && fepvals->sc_alpha != 0)
 647     {
 648         kernel_data.flags          = (donb_flags & ~(GMX_NONBONDED_DO_FORCE | GMX_NONBONDED_DO_SHIFTFORCE)) | GMX_NONBONDED_DO_FOREIGNLAMBDA;
 649         kernel_data.lambda         = lam_i;
 650         kernel_data.energygrp_elec = enerd->foreign_grpp.ener[egCOULSR];
 651         kernel_data.energygrp_vdw  = enerd->foreign_grpp.ener[egLJSR];
 652         /* Note that we add to kernel_data.dvdl, but ignore the result */
 653
 654         for (i = 0; i < enerd->n_lambda; i++)
 655         {
 656             for (j = 0; j < efptNR; j++)
 657             {
 658                 lam_i[j] = (i == 0 ? lambda[j] : fepvals->all_lambda[j][i-1]);
 659             }
 660             reset_foreign_enerdata(enerd);
 661 #pragma omp parallel for schedule(static) num_threads(nbl_lists->nnbl)
 662             for (th = 0; th < nbl_lists->nnbl; th++)
 663             {
 664                 try
 665                 {
 666                     gmx_nb_free_energy_kernel(nbl_lists->nbl_fep[th],
 667                                               x, f, fr, mdatoms, &kernel_data, nrnb);
 668                 }
 669                 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
 670             }
 671
 672             sum_epot(&(enerd->foreign_grpp), enerd->foreign_term);
 673             enerd->enerpart_lambda[i] += enerd->foreign_term[F_EPOT];
 674         }
 675     }
 676
 677     wallcycle_sub_stop(wcycle, ewcsNONBONDED);
 678 }
 679
 680 gmx_bool use_GPU(const nonbonded_verlet_t *nbv)
 681 {
 682     return nbv != nullptr && nbv->bUseGPU;
 683 }
 684
 685 static inline void clear_rvecs_omp(int n, rvec v[])
 686 {
 687     int nth = gmx_omp_nthreads_get_simple_rvec_task(emntDefault, n);
 688
 689     /* Note that we would like to avoid this conditional by putting it
 690      * into the omp pragma instead, but then we still take the full
 691      * omp parallel for overhead (at least with gcc5).
 692      */
 693     if (nth == 1)
 694     {
 695         for (int i = 0; i < n; i++)
 696         {
 697             clear_rvec(v[i]);
 698         }
 699     }
 700     else
 701     {
 702 #pragma omp parallel for num_threads(nth) schedule(static)
 703         for (int i = 0; i < n; i++)
 704         {
 705             clear_rvec(v[i]);
 706         }
 707     }
 708 }
 709
 710 /*! \brief Return an estimate of the average kinetic energy or 0 when unreliable
 711  *
 712  * \param groupOptions  Group options, containing T-coupling options
 713  */
 714 static real averageKineticEnergyEstimate(const t_grpopts &groupOptions)
 715 {
 716     real nrdfCoupled   = 0;
 717     real nrdfUncoupled = 0;
 718     real kineticEnergy = 0;
 719     for (int g = 0; g < groupOptions.ngtc; g++)
 720     {
 721         if (groupOptions.tau_t[g] >= 0)
 722         {
 723             nrdfCoupled   += groupOptions.nrdf[g];
 724             kineticEnergy += groupOptions.nrdf[g]*0.5*groupOptions.ref_t[g]*BOLTZ;
 725         }
 726         else
 727         {
 728             nrdfUncoupled += groupOptions.nrdf[g];
 729         }
 730     }
 731
 732     /* This conditional with > also catches nrdf=0 */
 733     if (nrdfCoupled > nrdfUncoupled)
 734     {
 735         return kineticEnergy*(nrdfCoupled + nrdfUncoupled)/nrdfCoupled;
 736     }
 737     else
 738     {
 739         return 0;
 740     }
 741 }
 742
 743 /*! \brief This routine checks that the potential energy is finite.
 744  *
 745  * Always checks that the potential energy is finite. If step equals
 746  * inputrec.init_step also checks that the magnitude of the potential energy
 747  * is reasonable. Terminates with a fatal error when a check fails.
 748  * Note that passing this check does not guarantee finite forces,
 749  * since those use slightly different arithmetics. But in most cases
 750  * there is just a narrow coordinate range where forces are not finite
 751  * and energies are finite.
 752  *
 753  * \param[in] step      The step number, used for checking and printing
 754  * \param[in] enerd     The energy data; the non-bonded group energies need to be added to enerd.term[F_EPOT] before calling this routine
 755  * \param[in] inputrec  The input record
 756  */
 757 static void checkPotentialEnergyValidity(int64_t               step,
 758                                          const gmx_enerdata_t &enerd,
 759                                          const t_inputrec     &inputrec)
 760 {
 761     /* Threshold valid for comparing absolute potential energy against
 762      * the kinetic energy. Normally one should not consider absolute
 763      * potential energy values, but with a factor of one million
 764      * we should never get false positives.
 765      */
 766     constexpr real c_thresholdFactor = 1e6;
 767
 768     bool           energyIsNotFinite    = !std::isfinite(enerd.term[F_EPOT]);
 769     real           averageKineticEnergy = 0;
 770     /* We only check for large potential energy at the initial step,
 771      * because that is by far the most likely step for this too occur
 772      * and because computing the average kinetic energy is not free.
 773      * Note: nstcalcenergy >> 1 often does not allow to catch large energies
 774      * before they become NaN.
 775      */
 776     if (step == inputrec.init_step && EI_DYNAMICS(inputrec.eI))
 777     {
 778         averageKineticEnergy = averageKineticEnergyEstimate(inputrec.opts);
 779     }
 780
 781     if (energyIsNotFinite || (averageKineticEnergy > 0 &&
 782                               enerd.term[F_EPOT] > c_thresholdFactor*averageKineticEnergy))
 783     {
 784         gmx_fatal(FARGS, "Step %" PRId64 ": The total potential energy is %g, which is %s. The LJ and electrostatic contributions to the energy are %g and %g, respectively. A %s potential energy can be caused by overlapping interactions in bonded interactions or very large%s coordinate values. Usually this is caused by a badly- or non-equilibrated initial configuration, incorrect interactions or parameters in the topology.",
 785                   step,
 786                   enerd.term[F_EPOT],
 787                   energyIsNotFinite ? "not finite" : "extremely high",
 788                   enerd.term[F_LJ],
 789                   enerd.term[F_COUL_SR],
 790                   energyIsNotFinite ? "non-finite" : "very high",
 791                   energyIsNotFinite ? " or Nan" : "");
 792     }
 793 }
 794
 795 /*! \brief Compute forces and/or energies for special algorithms
 796  *
 797  * The intention is to collect all calls to algorithms that compute
 798  * forces on local atoms only and that do not contribute to the local
 799  * virial sum (but add their virial contribution separately).
 800  * Eventually these should likely all become ForceProviders.
 801  * Within this function the intention is to have algorithms that do
 802  * global communication at the end, so global barriers within the MD loop
 803  * are as close together as possible.
 804  *
 805  * \param[in]     fplog            The log file
 806  * \param[in]     cr               The communication record
 807  * \param[in]     inputrec         The input record
 808  * \param[in]     awh              The Awh module (nullptr if none in use).
 809  * \param[in]     enforcedRotation Enforced rotation module.
 810  * \param[in]     step             The current MD step
 811  * \param[in]     t                The current time
 812  * \param[in,out] wcycle           Wallcycle accounting struct
 813  * \param[in,out] forceProviders   Pointer to a list of force providers
 814  * \param[in]     box              The unit cell
 815  * \param[in]     x                The coordinates
 816  * \param[in]     mdatoms          Per atom properties
 817  * \param[in]     lambda           Array of free-energy lambda values
 818  * \param[in]     forceFlags       Flags that tell whether we should compute forces/energies/virial
 819  * \param[in,out] forceWithVirial  Force and virial buffers
 820  * \param[in,out] enerd            Energy buffer
 821  * \param[in,out] ed               Essential dynamics pointer
 822  * \param[in]     bNS              Tells if we did neighbor searching this step, used for ED sampling
 823  *
 824  * \todo Remove bNS, which is used incorrectly.
 825  * \todo Convert all other algorithms called here to ForceProviders.
 826  */
 827 static void
 828 computeSpecialForces(FILE                          *fplog,
 829                      const t_commrec               *cr,
 830                      const t_inputrec              *inputrec,
 831                      gmx::Awh                      *awh,
 832                      gmx_enfrot                    *enforcedRotation,
 833                      int64_t                        step,
 834                      double                         t,
 835                      gmx_wallcycle_t                wcycle,
 836                      ForceProviders                *forceProviders,
 837                      matrix                         box,
 838                      gmx::ArrayRef<const gmx::RVec> x,
 839                      const t_mdatoms               *mdatoms,
 840                      real                          *lambda,
 841                      int                            forceFlags,
 842                      gmx::ForceWithVirial          *forceWithVirial,
 843                      gmx_enerdata_t                *enerd,
 844                      const gmx_edsam               *ed,
 845                      gmx_bool                       bNS)
 846 {
 847     const bool computeForces = (forceFlags & GMX_FORCE_FORCES);
 848
 849     /* NOTE: Currently all ForceProviders only provide forces.
 850      *       When they also provide energies, remove this conditional.
 851      */
 852     if (computeForces)
 853     {
 854         gmx::ForceProviderInput  forceProviderInput(x, *mdatoms, t, box, *cr);
 855         gmx::ForceProviderOutput forceProviderOutput(forceWithVirial, enerd);
 856
 857         /* Collect forces from modules */
 858         forceProviders->calculateForces(forceProviderInput, &forceProviderOutput);
 859     }
 860
 861     if (inputrec->bPull && pull_have_potential(inputrec->pull_work))
 862     {
 863         pull_potential_wrapper(cr, inputrec, box, x,
 864                                forceWithVirial,
 865                                mdatoms, enerd, lambda, t,
 866                                wcycle);
 867
 868         if (awh)
 869         {
 870             enerd->term[F_COM_PULL] +=
 871                 awh->applyBiasForcesAndUpdateBias(inputrec->ePBC, *mdatoms, box,
 872                                                   forceWithVirial,
 873                                                   t, step, wcycle, fplog);
 874         }
 875     }
 876
 877     rvec *f = as_rvec_array(forceWithVirial->force_.data());
 878
 879     /* Add the forces from enforced rotation potentials (if any) */
 880     if (inputrec->bRot)
 881     {
 882         wallcycle_start(wcycle, ewcROTadd);
 883         enerd->term[F_COM_PULL] += add_rot_forces(enforcedRotation, f, cr, step, t);
 884         wallcycle_stop(wcycle, ewcROTadd);
 885     }
 886
 887     if (ed)
 888     {
 889         /* Note that since init_edsam() is called after the initialization
 890          * of forcerec, edsam doesn't request the noVirSum force buffer.
 891          * Thus if no other algorithm (e.g. PME) requires it, the forces
 892          * here will contribute to the virial.
 893          */
 894         do_flood(cr, inputrec, as_rvec_array(x.data()), f, ed, box, step, bNS);
 895     }
 896
 897     /* Add forces from interactive molecular dynamics (IMD), if bIMD == TRUE. */
 898     if (inputrec->bIMD && computeForces)
 899     {
 900         IMD_apply_forces(inputrec->bIMD, inputrec->imd, cr, f, wcycle);
 901     }
 902 }
 903
 904 /*! \brief Launch the prepare_step and spread stages of PME GPU.
 905  *
 906  * \param[in]  pmedata       The PME structure
 907  * \param[in]  box           The box matrix
 908  * \param[in]  x             Coordinate array
 909  * \param[in]  flags         Force flags
 910  * \param[in]  wcycle        The wallcycle structure
 911  */
 912 static inline void launchPmeGpuSpread(gmx_pme_t      *pmedata,
 913                                       matrix          box,
 914                                       rvec            x[],
 915                                       int             flags,
 916                                       gmx_wallcycle_t wcycle)
 917 {
 918     int pmeFlags = GMX_PME_SPREAD | GMX_PME_SOLVE;
 919     pmeFlags |= (flags & GMX_FORCE_FORCES) ? GMX_PME_CALC_F : 0;
 920     pmeFlags |= (flags & GMX_FORCE_VIRIAL) ? GMX_PME_CALC_ENER_VIR : 0;
 921
 922     pme_gpu_prepare_computation(pmedata, flags & GMX_FORCE_DYNAMICBOX, box, wcycle, pmeFlags);
 923     pme_gpu_launch_spread(pmedata, x, wcycle);
 924 }
 925
 926 /*! \brief Launch the FFT and gather stages of PME GPU
 927  *
 928  * This function only implements setting the output forces (no accumulation).
 929  *
 930  * \param[in]  pmedata        The PME structure
 931  * \param[in]  wcycle         The wallcycle structure
 932  */
 933 static void launchPmeGpuFftAndGather(gmx_pme_t        *pmedata,
 934                                      gmx_wallcycle_t   wcycle)
 935 {
 936     pme_gpu_launch_complex_transforms(pmedata, wcycle);
 937     pme_gpu_launch_gather(pmedata, wcycle, PmeForceOutputHandling::Set);
 938 }
 939
 940 /*! \brief
 941  *  Polling wait for either of the PME or nonbonded GPU tasks.
 942  *
 943  * Instead of a static order in waiting for GPU tasks, this function
 944  * polls checking which of the two tasks completes first, and does the
 945  * associated force buffer reduction overlapped with the other task.
 946  * By doing that, unlike static scheduling order, it can always overlap
 947  * one of the reductions, regardless of the GPU task completion order.
 948  *
 949  * \param[in]     nbv              Nonbonded verlet structure
 950  * \param[in]     pmedata          PME module data
 951  * \param[in,out] force            Force array to reduce task outputs into.
 952  * \param[in,out] forceWithVirial  Force and virial buffers
 953  * \param[in,out] fshift           Shift force output vector results are reduced into
 954  * \param[in,out] enerd            Energy data structure results are reduced into
 955  * \param[in]     flags            Force flags
 956  * \param[in]     wcycle           The wallcycle structure
 957  */
 958 static void alternatePmeNbGpuWaitReduce(nonbonded_verlet_t                  *nbv,
 959                                         const gmx_pme_t                     *pmedata,
 960                                         gmx::PaddedArrayRef<gmx::RVec>      *force,
 961                                         gmx::ForceWithVirial                *forceWithVirial,
 962                                         rvec                                 fshift[],
 963                                         gmx_enerdata_t                      *enerd,
 964                                         int                                  flags,
 965                                         gmx_wallcycle_t                      wcycle)
 966 {
 967     bool isPmeGpuDone = false;
 968     bool isNbGpuDone  = false;
 969
 970
 971     gmx::ArrayRef<const gmx::RVec> pmeGpuForces;
 972
 973     while (!isPmeGpuDone || !isNbGpuDone)
 974     {
 975         if (!isPmeGpuDone)
 976         {
 977             matrix            vir_Q;
 978             real              Vlr_q;
 979
 980             GpuTaskCompletion completionType = (isNbGpuDone) ? GpuTaskCompletion::Wait : GpuTaskCompletion::Check;
 981             isPmeGpuDone = pme_gpu_try_finish_task(pmedata, wcycle, &pmeGpuForces,
 982                                                    vir_Q, &Vlr_q, completionType);
 983
 984             if (isPmeGpuDone)
 985             {
 986                 pme_gpu_reduce_outputs(wcycle, forceWithVirial, pmeGpuForces,
 987                                        enerd, vir_Q, Vlr_q);
 988             }
 989         }
 990
 991         if (!isNbGpuDone)
 992         {
 993             GpuTaskCompletion completionType = (isPmeGpuDone) ? GpuTaskCompletion::Wait : GpuTaskCompletion::Check;
 994             wallcycle_start_nocount(wcycle, ewcWAIT_GPU_NB_L);
 995             isNbGpuDone = nbnxn_gpu_try_finish_task(nbv->gpu_nbv,
 996                                                     flags, eatLocal,
 997                                                     enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
 998                                                     fshift, completionType);
 999             wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
1000             // To get the call count right, when the task finished we
1001             // issue a start/stop.
1002             // TODO: move the ewcWAIT_GPU_NB_L cycle counting into nbnxn_gpu_try_finish_task()
1003             // and ewcNB_XF_BUF_OPS counting into nbnxn_atomdata_add_nbat_f_to_f().
1004             if (isNbGpuDone)
1005             {
1006                 wallcycle_start(wcycle, ewcWAIT_GPU_NB_L);
1007                 wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
1008
1009                 nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), eatLocal,
1010                                                nbv->nbat, as_rvec_array(force->data()), wcycle);
1011             }
1012         }
1013     }
1014 }
1015
1016 /*! \brief
1017  *  Launch the dynamic rolling pruning GPU task.
1018  *
1019  *  We currently alternate local/non-local list pruning in odd-even steps
1020  *  (only pruning every second step without DD).
1021  *
1022  * \param[in]     cr               The communication record
1023  * \param[in]     nbv              Nonbonded verlet structure
1024  * \param[in]     inputrec         The input record
1025  * \param[in]     step             The current MD step
1026  */
1027 static inline void launchGpuRollingPruning(const t_commrec          *cr,
1028                                            const nonbonded_verlet_t *nbv,
1029                                            const t_inputrec         *inputrec,
1030                                            const int64_t             step)
1031 {
1032     /* We should not launch the rolling pruning kernel at a search
1033      * step or just before search steps, since that's useless.
1034      * Without domain decomposition we prune at even steps.
1035      * With domain decomposition we alternate local and non-local
1036      * pruning at even and odd steps.
1037      */
1038     int  numRollingParts     = nbv->listParams->numRollingParts;
1039     GMX_ASSERT(numRollingParts == nbv->listParams->nstlistPrune/2, "Since we alternate local/non-local at even/odd steps, we need numRollingParts<=nstlistPrune/2 for correctness and == for efficiency");
1040     int  stepWithCurrentList = step - nbv->grp[eintLocal].nbl_lists.outerListCreationStep;
1041     bool stepIsEven          = ((stepWithCurrentList & 1) == 0);
1042     if (stepWithCurrentList > 0 &&
1043         stepWithCurrentList < inputrec->nstlist - 1 &&
1044         (stepIsEven || DOMAINDECOMP(cr)))
1045     {
1046         nbnxn_gpu_launch_kernel_pruneonly(nbv->gpu_nbv,
1047                                           stepIsEven ? eintLocal : eintNonlocal,
1048                                           numRollingParts);
1049     }
1050 }
1051
1052 static void do_force_cutsVERLET(FILE *fplog,
1053                                 const t_commrec *cr,
1054                                 const gmx_multisim_t *ms,
1055                                 const t_inputrec *inputrec,
1056                                 gmx::Awh *awh,
1057                                 gmx_enfrot *enforcedRotation,
1058                                 int64_t step,
1059                                 t_nrnb *nrnb,
1060                                 gmx_wallcycle_t wcycle,
1061                                 const gmx_localtop_t *top,
1062                                 const gmx_groups_t * /* groups */,
1063                                 matrix box, gmx::PaddedArrayRef<gmx::RVec> x,
1064                                 history_t *hist,
1065                                 gmx::PaddedArrayRef<gmx::RVec> force,
1066                                 tensor vir_force,
1067                                 const t_mdatoms *mdatoms,
1068                                 gmx_enerdata_t *enerd, t_fcdata *fcd,
1069                                 real *lambda,
1070                                 t_graph *graph,
1071                                 t_forcerec *fr,
1072                                 interaction_const_t *ic,
1073                                 const gmx_vsite_t *vsite,
1074                                 rvec mu_tot,
1075                                 double t,
1076                                 const gmx_edsam *ed,
1077                                 int flags,
1078                                 DdOpenBalanceRegionBeforeForceComputation ddOpenBalanceRegion,
1079                                 DdCloseBalanceRegionAfterForceComputation ddCloseBalanceRegion)
1080 {
1081     int                 cg1, i, j;
1082     double              mu[2*DIM];
1083     gmx_bool            bStateChanged, bNS, bFillGrid, bCalcCGCM;
1084     gmx_bool            bDoForces, bUseGPU, bUseOrEmulGPU;
1085     rvec                vzero, box_diag;
1086     float               cycles_pme, cycles_wait_gpu;
1087     nonbonded_verlet_t *nbv = fr->nbv;
1088
1089     bStateChanged = (flags & GMX_FORCE_STATECHANGED);
1090     bNS           = (flags & GMX_FORCE_NS) && (fr->bAllvsAll == FALSE);
1091     bFillGrid     = (bNS && bStateChanged);
1092     bCalcCGCM     = (bFillGrid && !DOMAINDECOMP(cr));
1093     bDoForces     = (flags & GMX_FORCE_FORCES);
1094     bUseGPU       = fr->nbv->bUseGPU;
1095     bUseOrEmulGPU = bUseGPU || (fr->nbv->emulateGpu == EmulateGpuNonbonded::Yes);
1096
1097     const auto pmeRunMode = fr->pmedata ? pme_run_mode(fr->pmedata) : PmeRunMode::CPU;
1098     // TODO slim this conditional down - inputrec and duty checks should mean the same in proper code!
1099     const bool useGpuPme  = EEL_PME(fr->ic->eeltype) && thisRankHasDuty(cr, DUTY_PME) &&
1100         ((pmeRunMode == PmeRunMode::GPU) || (pmeRunMode == PmeRunMode::Mixed));
1101
1102     /* At a search step we need to start the first balancing region
1103      * somewhere early inside the step after communication during domain
1104      * decomposition (and not during the previous step as usual).
1105      */
1106     if (bNS &&
1107         ddOpenBalanceRegion == DdOpenBalanceRegionBeforeForceComputation::yes)
1108     {
1109         ddOpenBalanceRegionCpu(cr->dd, DdAllowBalanceRegionReopen::yes);
1110     }
1111
1112     cycles_wait_gpu = 0;
1113
1114     const int start  = 0;
1115     const int homenr = mdatoms->homenr;
1116
1117     clear_mat(vir_force);
1118
1119     if (DOMAINDECOMP(cr))
1120     {
1121         cg1 = cr->dd->globalAtomGroupIndices.size();
1122     }
1123     else
1124     {
1125         cg1 = top->cgs.nr;
1126     }
1127     if (fr->n_tpi > 0)
1128     {
1129         cg1--;
1130     }
1131
1132     if (bStateChanged)
1133     {
1134         update_forcerec(fr, box);
1135
1136         if (inputrecNeedMutot(inputrec))
1137         {
1138             /* Calculate total (local) dipole moment in a temporary common array.
1139              * This makes it possible to sum them over nodes faster.
1140              */
1141             calc_mu(start, homenr,
1142                     x, mdatoms->chargeA, mdatoms->chargeB, mdatoms->nChargePerturbed,
1143                     mu, mu+DIM);
1144         }
1145     }
1146
1147     if (fr->ePBC != epbcNONE)
1148     {
1149         /* Compute shift vectors every step,
1150          * because of pressure coupling or box deformation!
1151          */
1152         if ((flags & GMX_FORCE_DYNAMICBOX) && bStateChanged)
1153         {
1154             calc_shifts(box, fr->shift_vec);
1155         }
1156
1157         if (bCalcCGCM)
1158         {
1159             put_atoms_in_box_omp(fr->ePBC, box, x.subArray(0, homenr));
1160             inc_nrnb(nrnb, eNR_SHIFTX, homenr);
1161         }
1162         else if (EI_ENERGY_MINIMIZATION(inputrec->eI) && graph)
1163         {
1164             unshift_self(graph, box, as_rvec_array(x.data()));
1165         }
1166     }
1167
1168     nbnxn_atomdata_copy_shiftvec(flags & GMX_FORCE_DYNAMICBOX,
1169                                  fr->shift_vec, nbv->nbat);
1170
1171 #if GMX_MPI
1172     if (!thisRankHasDuty(cr, DUTY_PME))
1173     {
1174         /* Send particle coordinates to the pme nodes.
1175          * Since this is only implemented for domain decomposition
1176          * and domain decomposition does not use the graph,
1177          * we do not need to worry about shifting.
1178          */
1179         gmx_pme_send_coordinates(cr, box, as_rvec_array(x.data()),
1180                                  lambda[efptCOUL], lambda[efptVDW],
1181                                  (flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY)),
1182                                  step, wcycle);
1183     }
1184 #endif /* GMX_MPI */
1185
1186     if (useGpuPme)
1187     {
1188         launchPmeGpuSpread(fr->pmedata, box, as_rvec_array(x.data()), flags, wcycle);
1189     }
1190
1191     /* do gridding for pair search */
1192     if (bNS)
1193     {
1194         if (graph && bStateChanged)
1195         {
1196             /* Calculate intramolecular shift vectors to make molecules whole */
1197             mk_mshift(fplog, graph, fr->ePBC, box, as_rvec_array(x.data()));
1198         }
1199
1200         clear_rvec(vzero);
1201         box_diag[XX] = box[XX][XX];
1202         box_diag[YY] = box[YY][YY];
1203         box_diag[ZZ] = box[ZZ][ZZ];
1204
1205         wallcycle_start(wcycle, ewcNS);
1206         if (!DOMAINDECOMP(cr))
1207         {
1208             wallcycle_sub_start(wcycle, ewcsNBS_GRID_LOCAL);
1209             nbnxn_put_on_grid(nbv->nbs.get(), fr->ePBC, box,
1210                               0, vzero, box_diag,
1211                               0, mdatoms->homenr, -1, fr->cginfo, as_rvec_array(x.data()),
1212                               0, nullptr,
1213                               nbv->grp[eintLocal].kernel_type,
1214                               nbv->nbat);
1215             wallcycle_sub_stop(wcycle, ewcsNBS_GRID_LOCAL);
1216         }
1217         else
1218         {
1219             wallcycle_sub_start(wcycle, ewcsNBS_GRID_NONLOCAL);
1220             nbnxn_put_on_grid_nonlocal(nbv->nbs.get(), domdec_zones(cr->dd),
1221                                        fr->cginfo, as_rvec_array(x.data()),
1222                                        nbv->grp[eintNonlocal].kernel_type,
1223                                        nbv->nbat);
1224             wallcycle_sub_stop(wcycle, ewcsNBS_GRID_NONLOCAL);
1225         }
1226
1227         nbnxn_atomdata_set(nbv->nbat, nbv->nbs.get(), mdatoms, fr->cginfo);
1228         wallcycle_stop(wcycle, ewcNS);
1229     }
1230
1231     /* initialize the GPU atom data and copy shift vector */
1232     if (bUseGPU)
1233     {
1234         wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
1235         wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1236
1237         if (bNS)
1238         {
1239             nbnxn_gpu_init_atomdata(nbv->gpu_nbv, nbv->nbat);
1240         }
1241
1242         nbnxn_gpu_upload_shiftvec(nbv->gpu_nbv, nbv->nbat);
1243
1244         wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1245         wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1246     }
1247
1248     /* do local pair search */
1249     if (bNS)
1250     {
1251         wallcycle_start_nocount(wcycle, ewcNS);
1252         wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_LOCAL);
1253         nbnxn_make_pairlist(nbv->nbs.get(), nbv->nbat,
1254                             &top->excls,
1255                             nbv->listParams->rlistOuter,
1256                             nbv->min_ci_balanced,
1257                             &nbv->grp[eintLocal].nbl_lists,
1258                             eintLocal,
1259                             nbv->grp[eintLocal].kernel_type,
1260                             nrnb);
1261         nbv->grp[eintLocal].nbl_lists.outerListCreationStep = step;
1262         if (nbv->listParams->useDynamicPruning && !bUseGPU)
1263         {
1264             nbnxnPrepareListForDynamicPruning(&nbv->grp[eintLocal].nbl_lists);
1265         }
1266         wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_LOCAL);
1267
1268         if (bUseGPU)
1269         {
1270             /* initialize local pair-list on the GPU */
1271             nbnxn_gpu_init_pairlist(nbv->gpu_nbv,
1272                                     nbv->grp[eintLocal].nbl_lists.nbl[0],
1273                                     eintLocal);
1274         }
1275         wallcycle_stop(wcycle, ewcNS);
1276     }
1277     else
1278     {
1279         nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs.get(), eatLocal, FALSE, as_rvec_array(x.data()),
1280                                         nbv->nbat, wcycle);
1281     }
1282
1283     if (bUseGPU)
1284     {
1285         if (DOMAINDECOMP(cr))
1286         {
1287             ddOpenBalanceRegionGpu(cr->dd);
1288         }
1289
1290         wallcycle_start(wcycle, ewcLAUNCH_GPU);
1291         wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1292         /* launch local nonbonded work on GPU */
1293         do_nb_verlet(fr, ic, enerd, flags, eintLocal, enbvClearFNo,
1294                      step, nrnb, wcycle);
1295         wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1296         wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1297     }
1298
1299     if (useGpuPme)
1300     {
1301         // In PME GPU and mixed mode we launch FFT / gather after the
1302         // X copy/transform to allow overlap as well as after the GPU NB
1303         // launch to avoid FFT launch overhead hijacking the CPU and delaying
1304         // the nonbonded kernel.
1305         launchPmeGpuFftAndGather(fr->pmedata, wcycle);
1306     }
1307
1308     /* Communicate coordinates and sum dipole if necessary +
1309        do non-local pair search */
1310     if (DOMAINDECOMP(cr))
1311     {
1312         if (bNS)
1313         {
1314             wallcycle_start_nocount(wcycle, ewcNS);
1315             wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_NONLOCAL);
1316
1317             nbnxn_make_pairlist(nbv->nbs.get(), nbv->nbat,
1318                                 &top->excls,
1319                                 nbv->listParams->rlistOuter,
1320                                 nbv->min_ci_balanced,
1321                                 &nbv->grp[eintNonlocal].nbl_lists,
1322                                 eintNonlocal,
1323                                 nbv->grp[eintNonlocal].kernel_type,
1324                                 nrnb);
1325             nbv->grp[eintNonlocal].nbl_lists.outerListCreationStep = step;
1326             if (nbv->listParams->useDynamicPruning && !bUseGPU)
1327             {
1328                 nbnxnPrepareListForDynamicPruning(&nbv->grp[eintNonlocal].nbl_lists);
1329             }
1330             wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_NONLOCAL);
1331
1332             if (nbv->grp[eintNonlocal].kernel_type == nbnxnk8x8x8_GPU)
1333             {
1334                 /* initialize non-local pair-list on the GPU */
1335                 nbnxn_gpu_init_pairlist(nbv->gpu_nbv,
1336                                         nbv->grp[eintNonlocal].nbl_lists.nbl[0],
1337                                         eintNonlocal);
1338             }
1339             wallcycle_stop(wcycle, ewcNS);
1340         }
1341         else
1342         {
1343             dd_move_x(cr->dd, box, x, wcycle);
1344
1345             nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs.get(), eatNonlocal, FALSE, as_rvec_array(x.data()),
1346                                             nbv->nbat, wcycle);
1347         }
1348
1349         if (bUseGPU)
1350         {
1351             wallcycle_start(wcycle, ewcLAUNCH_GPU);
1352             wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1353             /* launch non-local nonbonded tasks on GPU */
1354             do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFNo,
1355                          step, nrnb, wcycle);
1356             wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1357             wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1358         }
1359     }
1360
1361     if (bUseGPU)
1362     {
1363         /* launch D2H copy-back F */
1364         wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
1365         wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1366         if (DOMAINDECOMP(cr))
1367         {
1368             nbnxn_gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat,
1369                                      flags, eatNonlocal);
1370         }
1371         nbnxn_gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat,
1372                                  flags, eatLocal);
1373         wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1374         wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1375     }
1376
1377     if (bStateChanged && inputrecNeedMutot(inputrec))
1378     {
1379         if (PAR(cr))
1380         {
1381             gmx_sumd(2*DIM, mu, cr);
1382             ddReopenBalanceRegionCpu(cr->dd);
1383         }
1384
1385         for (i = 0; i < 2; i++)
1386         {
1387             for (j = 0; j < DIM; j++)
1388             {
1389                 fr->mu_tot[i][j] = mu[i*DIM + j];
1390             }
1391         }
1392     }
1393     if (fr->efep == efepNO)
1394     {
1395         copy_rvec(fr->mu_tot[0], mu_tot);
1396     }
1397     else
1398     {
1399         for (j = 0; j < DIM; j++)
1400         {
1401             mu_tot[j] =
1402                 (1.0 - lambda[efptCOUL])*fr->mu_tot[0][j] +
1403                 lambda[efptCOUL]*fr->mu_tot[1][j];
1404         }
1405     }
1406
1407     /* Reset energies */
1408     reset_enerdata(enerd);
1409     clear_rvecs(SHIFTS, fr->fshift);
1410
1411     if (DOMAINDECOMP(cr) && !thisRankHasDuty(cr, DUTY_PME))
1412     {
1413         wallcycle_start(wcycle, ewcPPDURINGPME);
1414         dd_force_flop_start(cr->dd, nrnb);
1415     }
1416
1417     if (inputrec->bRot)
1418     {
1419         wallcycle_start(wcycle, ewcROT);
1420         do_rotation(cr, enforcedRotation, box, as_rvec_array(x.data()), t, step, bNS);
1421         wallcycle_stop(wcycle, ewcROT);
1422     }
1423
1424     /* Temporary solution until all routines take PaddedRVecVector */
1425     rvec *const f = as_rvec_array(force.data());
1426
1427     /* Start the force cycle counter.
1428      * Note that a different counter is used for dynamic load balancing.
1429      */
1430     wallcycle_start(wcycle, ewcFORCE);
1431
1432     gmx::ArrayRef<gmx::RVec> forceRef = force;
1433     if (bDoForces)
1434     {
1435         /* If we need to compute the virial, we might need a separate
1436          * force buffer for algorithms for which the virial is calculated
1437          * directly, such as PME.
1438          */
1439         if ((flags & GMX_FORCE_VIRIAL) && fr->haveDirectVirialContributions)
1440         {
1441             forceRef = *fr->forceBufferForDirectVirialContributions;
1442
1443             /* TODO: update comment
1444              * We only compute forces on local atoms. Note that vsites can
1445              * spread to non-local atoms, but that part of the buffer is
1446              * cleared separately in the vsite spreading code.
1447              */
1448             clear_rvecs_omp(forceRef.size(), as_rvec_array(forceRef.data()));
1449         }
1450         /* Clear the short- and long-range forces */
1451         clear_rvecs_omp(fr->natoms_force_constr, f);
1452     }
1453
1454     /* forceWithVirial uses the local atom range only */
1455     gmx::ForceWithVirial forceWithVirial(forceRef, flags & GMX_FORCE_VIRIAL);
1456
1457     if (inputrec->bPull && pull_have_constraint(inputrec->pull_work))
1458     {
1459         clear_pull_forces(inputrec->pull_work);
1460     }
1461
1462     /* We calculate the non-bonded forces, when done on the CPU, here.
1463      * We do this before calling do_force_lowlevel, because in that
1464      * function, the listed forces are calculated before PME, which
1465      * does communication.  With this order, non-bonded and listed
1466      * force calculation imbalance can be balanced out by the domain
1467      * decomposition load balancing.
1468      */
1469
1470     if (!bUseOrEmulGPU)
1471     {
1472         do_nb_verlet(fr, ic, enerd, flags, eintLocal, enbvClearFYes,
1473                      step, nrnb, wcycle);
1474     }
1475
1476     if (fr->efep != efepNO)
1477     {
1478         /* Calculate the local and non-local free energy interactions here.
1479          * Happens here on the CPU both with and without GPU.
1480          */
1481         if (fr->nbv->grp[eintLocal].nbl_lists.nbl_fep[0]->nrj > 0)
1482         {
1483             do_nb_verlet_fep(&fr->nbv->grp[eintLocal].nbl_lists,
1484                              fr, as_rvec_array(x.data()), f, mdatoms,
1485                              inputrec->fepvals, lambda,
1486                              enerd, flags, nrnb, wcycle);
1487         }
1488
1489         if (DOMAINDECOMP(cr) &&
1490             fr->nbv->grp[eintNonlocal].nbl_lists.nbl_fep[0]->nrj > 0)
1491         {
1492             do_nb_verlet_fep(&fr->nbv->grp[eintNonlocal].nbl_lists,
1493                              fr, as_rvec_array(x.data()), f, mdatoms,
1494                              inputrec->fepvals, lambda,
1495                              enerd, flags, nrnb, wcycle);
1496         }
1497     }
1498
1499     if (!bUseOrEmulGPU)
1500     {
1501         int aloc;
1502
1503         if (DOMAINDECOMP(cr))
1504         {
1505             do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFNo,
1506                          step, nrnb, wcycle);
1507         }
1508
1509         if (!bUseOrEmulGPU)
1510         {
1511             aloc = eintLocal;
1512         }
1513         else
1514         {
1515             aloc = eintNonlocal;
1516         }
1517
1518         /* Add all the non-bonded force to the normal force array.
1519          * This can be split into a local and a non-local part when overlapping
1520          * communication with calculation with domain decomposition.
1521          */
1522         wallcycle_stop(wcycle, ewcFORCE);
1523
1524         nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), eatAll, nbv->nbat, f, wcycle);
1525
1526         wallcycle_start_nocount(wcycle, ewcFORCE);
1527
1528         /* if there are multiple fshift output buffers reduce them */
1529         if ((flags & GMX_FORCE_VIRIAL) &&
1530             nbv->grp[aloc].nbl_lists.nnbl > 1)
1531         {
1532             /* This is not in a subcounter because it takes a
1533                negligible and constant-sized amount of time */
1534             nbnxn_atomdata_add_nbat_fshift_to_fshift(nbv->nbat,
1535                                                      fr->fshift);
1536         }
1537     }
1538
1539     /* update QMMMrec, if necessary */
1540     if (fr->bQMMM)
1541     {
1542         update_QMMMrec(cr, fr, as_rvec_array(x.data()), mdatoms, box);
1543     }
1544
1545     /* Compute the bonded and non-bonded energies and optionally forces */
1546     do_force_lowlevel(fr, inputrec, &(top->idef),
1547                       cr, ms, nrnb, wcycle, mdatoms,
1548                       as_rvec_array(x.data()), hist, f, &forceWithVirial, enerd, fcd,
1549                       box, inputrec->fepvals, lambda, graph, &(top->excls), fr->mu_tot,
1550                       flags, &cycles_pme);
1551
1552     wallcycle_stop(wcycle, ewcFORCE);
1553
1554     computeSpecialForces(fplog, cr, inputrec, awh, enforcedRotation,
1555                          step, t, wcycle,
1556                          fr->forceProviders, box, x, mdatoms, lambda,
1557                          flags, &forceWithVirial, enerd,
1558                          ed, bNS);
1559
1560     if (bUseOrEmulGPU)
1561     {
1562         /* wait for non-local forces (or calculate in emulation mode) */
1563         if (DOMAINDECOMP(cr))
1564         {
1565             if (bUseGPU)
1566             {
1567                 wallcycle_start(wcycle, ewcWAIT_GPU_NB_NL);
1568                 nbnxn_gpu_wait_finish_task(nbv->gpu_nbv,
1569                                            flags, eatNonlocal,
1570                                            enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
1571                                            fr->fshift);
1572                 cycles_wait_gpu += wallcycle_stop(wcycle, ewcWAIT_GPU_NB_NL);
1573             }
1574             else
1575             {
1576                 wallcycle_start_nocount(wcycle, ewcFORCE);
1577                 do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFYes,
1578                              step, nrnb, wcycle);
1579                 wallcycle_stop(wcycle, ewcFORCE);
1580             }
1581
1582             /* skip the reduction if there was no non-local work to do */
1583             if (nbv->grp[eintNonlocal].nbl_lists.nbl[0]->nsci > 0)
1584             {
1585                 nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), eatNonlocal,
1586                                                nbv->nbat, f, wcycle);
1587             }
1588         }
1589     }
1590
1591     if (DOMAINDECOMP(cr))
1592     {
1593         /* We are done with the CPU compute.
1594          * We will now communicate the non-local forces.
1595          * If we use a GPU this will overlap with GPU work, so in that case
1596          * we do not close the DD force balancing region here.
1597          */
1598         if (ddCloseBalanceRegion == DdCloseBalanceRegionAfterForceComputation::yes)
1599         {
1600             ddCloseBalanceRegionCpu(cr->dd);
1601         }
1602         if (bDoForces)
1603         {
1604             dd_move_f(cr->dd, force, fr->fshift, wcycle);
1605         }
1606     }
1607
1608     // With both nonbonded and PME offloaded a GPU on the same rank, we use
1609     // an alternating wait/reduction scheme.
1610     bool alternateGpuWait = (!c_disableAlternatingWait && useGpuPme && bUseGPU && !DOMAINDECOMP(cr));
1611     if (alternateGpuWait)
1612     {
1613         alternatePmeNbGpuWaitReduce(fr->nbv, fr->pmedata, &force, &forceWithVirial, fr->fshift, enerd, flags, wcycle);
1614     }
1615
1616     if (!alternateGpuWait && useGpuPme)
1617     {
1618         gmx::ArrayRef<const gmx::RVec> pmeGpuForces;
1619         matrix vir_Q;
1620         real   Vlr_q = 0.0;
1621         pme_gpu_wait_finish_task(fr->pmedata, wcycle, &pmeGpuForces, vir_Q, &Vlr_q);
1622         pme_gpu_reduce_outputs(wcycle, &forceWithVirial, pmeGpuForces, enerd, vir_Q, Vlr_q);
1623     }
1624
1625     /* Wait for local GPU NB outputs on the non-alternating wait path */
1626     if (!alternateGpuWait && bUseGPU)
1627     {
1628         /* Measured overhead on CUDA and OpenCL with(out) GPU sharing
1629          * is between 0.5 and 1.5 Mcycles. So 2 MCycles is an overestimate,
1630          * but even with a step of 0.1 ms the difference is less than 1%
1631          * of the step time.
1632          */
1633         const float gpuWaitApiOverheadMargin = 2e6f; /* cycles */
1634
1635         wallcycle_start(wcycle, ewcWAIT_GPU_NB_L);
1636         nbnxn_gpu_wait_finish_task(nbv->gpu_nbv,
1637                                    flags, eatLocal,
1638                                    enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
1639                                    fr->fshift);
1640         float cycles_tmp = wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
1641
1642         if (ddCloseBalanceRegion == DdCloseBalanceRegionAfterForceComputation::yes)
1643         {
1644             DdBalanceRegionWaitedForGpu waitedForGpu = DdBalanceRegionWaitedForGpu::yes;
1645             if (bDoForces && cycles_tmp <= gpuWaitApiOverheadMargin)
1646             {
1647                 /* We measured few cycles, it could be that the kernel
1648                  * and transfer finished earlier and there was no actual
1649                  * wait time, only API call overhead.
1650                  * Then the actual time could be anywhere between 0 and
1651                  * cycles_wait_est. We will use half of cycles_wait_est.
1652                  */
1653                 waitedForGpu = DdBalanceRegionWaitedForGpu::no;
1654             }
1655             ddCloseBalanceRegionGpu(cr->dd, cycles_wait_gpu, waitedForGpu);
1656         }
1657     }
1658
1659     if (fr->nbv->emulateGpu == EmulateGpuNonbonded::Yes)
1660     {
1661         // NOTE: emulation kernel is not included in the balancing region,
1662         // but emulation mode does not target performance anyway
1663         wallcycle_start_nocount(wcycle, ewcFORCE);
1664         do_nb_verlet(fr, ic, enerd, flags, eintLocal,
1665                      DOMAINDECOMP(cr) ? enbvClearFNo : enbvClearFYes,
1666                      step, nrnb, wcycle);
1667         wallcycle_stop(wcycle, ewcFORCE);
1668     }
1669
1670     if (useGpuPme)
1671     {
1672         pme_gpu_reinit_computation(fr->pmedata, wcycle);
1673     }
1674
1675     if (bUseGPU)
1676     {
1677         /* now clear the GPU outputs while we finish the step on the CPU */
1678         wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
1679         wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1680         nbnxn_gpu_clear_outputs(nbv->gpu_nbv, flags);
1681
1682         /* Is dynamic pair-list pruning activated? */
1683         if (nbv->listParams->useDynamicPruning)
1684         {
1685             launchGpuRollingPruning(cr, nbv, inputrec, step);
1686         }
1687         wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1688         wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1689     }
1690
1691     /* Do the nonbonded GPU (or emulation) force buffer reduction
1692      * on the non-alternating path. */
1693     if (bUseOrEmulGPU && !alternateGpuWait)
1694     {
1695         nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), eatLocal,
1696                                        nbv->nbat, f, wcycle);
1697     }
1698
1699     if (DOMAINDECOMP(cr))
1700     {
1701         dd_force_flop_stop(cr->dd, nrnb);
1702     }
1703
1704     if (bDoForces)
1705     {
1706         /* If we have NoVirSum forces, but we do not calculate the virial,
1707          * we sum fr->f_novirsum=f later.
1708          */
1709         if (vsite && !(fr->haveDirectVirialContributions && !(flags & GMX_FORCE_VIRIAL)))
1710         {
1711             spread_vsite_f(vsite, as_rvec_array(x.data()), f, fr->fshift, FALSE, nullptr, nrnb,
1712                            &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr, wcycle);
1713         }
1714
1715         if (flags & GMX_FORCE_VIRIAL)
1716         {
1717             /* Calculation of the virial must be done after vsites! */
1718             calc_virial(0, mdatoms->homenr, as_rvec_array(x.data()), f,
1719                         vir_force, graph, box, nrnb, fr, inputrec->ePBC);
1720         }
1721     }
1722
1723     if (PAR(cr) && !thisRankHasDuty(cr, DUTY_PME))
1724     {
1725         /* In case of node-splitting, the PP nodes receive the long-range
1726          * forces, virial and energy from the PME nodes here.
1727          */
1728         pme_receive_force_ener(cr, &forceWithVirial, enerd, wcycle);
1729     }
1730
1731     if (bDoForces)
1732     {
1733         post_process_forces(cr, step, nrnb, wcycle,
1734                             top, box, as_rvec_array(x.data()), f, &forceWithVirial,
1735                             vir_force, mdatoms, graph, fr, vsite,
1736                             flags);
1737     }
1738
1739     if (flags & GMX_FORCE_ENERGY)
1740     {
1741         /* Sum the potential energy terms from group contributions */
1742         sum_epot(&(enerd->grpp), enerd->term);
1743
1744         if (!EI_TPI(inputrec->eI))
1745         {
1746             checkPotentialEnergyValidity(step, *enerd, *inputrec);
1747         }
1748     }
1749 }
1750
1751 static void do_force_cutsGROUP(FILE *fplog,
1752                                const t_commrec *cr,
1753                                const gmx_multisim_t *ms,
1754                                const t_inputrec *inputrec,
1755                                gmx::Awh *awh,
1756                                gmx_enfrot *enforcedRotation,
1757                                int64_t step,
1758                                t_nrnb *nrnb,
1759                                gmx_wallcycle_t wcycle,
1760                                gmx_localtop_t *top,
1761                                const gmx_groups_t *groups,
1762                                matrix box, gmx::PaddedArrayRef<gmx::RVec> x,
1763                                history_t *hist,
1764                                gmx::PaddedArrayRef<gmx::RVec> force,
1765                                tensor vir_force,
1766                                const t_mdatoms *mdatoms,
1767                                gmx_enerdata_t *enerd,
1768                                t_fcdata *fcd,
1769                                real *lambda,
1770                                t_graph *graph,
1771                                t_forcerec *fr,
1772                                const gmx_vsite_t *vsite,
1773                                rvec mu_tot,
1774                                double t,
1775                                const gmx_edsam *ed,
1776                                int flags,
1777                                DdOpenBalanceRegionBeforeForceComputation ddOpenBalanceRegion,
1778                                DdCloseBalanceRegionAfterForceComputation ddCloseBalanceRegion)
1779 {
1780     int        cg0, cg1, i, j;
1781     double     mu[2*DIM];
1782     gmx_bool   bStateChanged, bNS, bFillGrid, bCalcCGCM;
1783     gmx_bool   bDoForces;
1784     float      cycles_pme;
1785
1786     const int  start  = 0;
1787     const int  homenr = mdatoms->homenr;
1788
1789     clear_mat(vir_force);
1790
1791     cg0 = 0;
1792     if (DOMAINDECOMP(cr))
1793     {
1794         cg1 = cr->dd->globalAtomGroupIndices.size();
1795     }
1796     else
1797     {
1798         cg1 = top->cgs.nr;
1799     }
1800     if (fr->n_tpi > 0)
1801     {
1802         cg1--;
1803     }
1804
1805     bStateChanged  = (flags & GMX_FORCE_STATECHANGED);
1806     bNS            = (flags & GMX_FORCE_NS) && (fr->bAllvsAll == FALSE);
1807     /* Should we perform the long-range nonbonded evaluation inside the neighborsearching? */
1808     bFillGrid      = (bNS && bStateChanged);
1809     bCalcCGCM      = (bFillGrid && !DOMAINDECOMP(cr));
1810     bDoForces      = (flags & GMX_FORCE_FORCES);
1811
1812     if (bStateChanged)
1813     {
1814         update_forcerec(fr, box);
1815
1816         if (inputrecNeedMutot(inputrec))
1817         {
1818             /* Calculate total (local) dipole moment in a temporary common array.
1819              * This makes it possible to sum them over nodes faster.
1820              */
1821             calc_mu(start, homenr,
1822                     x, mdatoms->chargeA, mdatoms->chargeB, mdatoms->nChargePerturbed,
1823                     mu, mu+DIM);
1824         }
1825     }
1826
1827     if (fr->ePBC != epbcNONE)
1828     {
1829         /* Compute shift vectors every step,
1830          * because of pressure coupling or box deformation!
1831          */
1832         if ((flags & GMX_FORCE_DYNAMICBOX) && bStateChanged)
1833         {
1834             calc_shifts(box, fr->shift_vec);
1835         }
1836
1837         if (bCalcCGCM)
1838         {
1839             put_charge_groups_in_box(fplog, cg0, cg1, fr->ePBC, box,
1840                                      &(top->cgs), as_rvec_array(x.data()), fr->cg_cm);
1841             inc_nrnb(nrnb, eNR_CGCM, homenr);
1842             inc_nrnb(nrnb, eNR_RESETX, cg1-cg0);
1843         }
1844         else if (EI_ENERGY_MINIMIZATION(inputrec->eI) && graph)
1845         {
1846             unshift_self(graph, box, as_rvec_array(x.data()));
1847         }
1848     }
1849     else if (bCalcCGCM)
1850     {
1851         calc_cgcm(fplog, cg0, cg1, &(top->cgs), as_rvec_array(x.data()), fr->cg_cm);
1852         inc_nrnb(nrnb, eNR_CGCM, homenr);
1853     }
1854
1855     if (bCalcCGCM && gmx_debug_at)
1856     {
1857         pr_rvecs(debug, 0, "cgcm", fr->cg_cm, top->cgs.nr);
1858     }
1859
1860 #if GMX_MPI
1861     if (!thisRankHasDuty(cr, DUTY_PME))
1862     {
1863         /* Send particle coordinates to the pme nodes.
1864          * Since this is only implemented for domain decomposition
1865          * and domain decomposition does not use the graph,
1866          * we do not need to worry about shifting.
1867          */
1868         gmx_pme_send_coordinates(cr, box, as_rvec_array(x.data()),
1869                                  lambda[efptCOUL], lambda[efptVDW],
1870                                  (flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY)),
1871                                  step, wcycle);
1872     }
1873 #endif /* GMX_MPI */
1874
1875     /* Communicate coordinates and sum dipole if necessary */
1876     if (DOMAINDECOMP(cr))
1877     {
1878         dd_move_x(cr->dd, box, x, wcycle);
1879
1880         /* No GPU support, no move_x overlap, so reopen the balance region here */
1881         if (ddOpenBalanceRegion == DdOpenBalanceRegionBeforeForceComputation::yes)
1882         {
1883             ddReopenBalanceRegionCpu(cr->dd);
1884         }
1885     }
1886
1887     if (inputrecNeedMutot(inputrec))
1888     {
1889         if (bStateChanged)
1890         {
1891             if (PAR(cr))
1892             {
1893                 gmx_sumd(2*DIM, mu, cr);
1894                 ddReopenBalanceRegionCpu(cr->dd);
1895             }
1896             for (i = 0; i < 2; i++)
1897             {
1898                 for (j = 0; j < DIM; j++)
1899                 {
1900                     fr->mu_tot[i][j] = mu[i*DIM + j];
1901                 }
1902             }
1903         }
1904         if (fr->efep == efepNO)
1905         {
1906             copy_rvec(fr->mu_tot[0], mu_tot);
1907         }
1908         else
1909         {
1910             for (j = 0; j < DIM; j++)
1911             {
1912                 mu_tot[j] =
1913                     (1.0 - lambda[efptCOUL])*fr->mu_tot[0][j] + lambda[efptCOUL]*fr->mu_tot[1][j];
1914             }
1915         }
1916     }
1917
1918     /* Reset energies */
1919     reset_enerdata(enerd);
1920     clear_rvecs(SHIFTS, fr->fshift);
1921
1922     if (bNS)
1923     {
1924         wallcycle_start(wcycle, ewcNS);
1925
1926         if (graph && bStateChanged)
1927         {
1928             /* Calculate intramolecular shift vectors to make molecules whole */
1929             mk_mshift(fplog, graph, fr->ePBC, box, as_rvec_array(x.data()));
1930         }
1931
1932         /* Do the actual neighbour searching */
1933         ns(fplog, fr, box,
1934            groups, top, mdatoms,
1935            cr, nrnb, bFillGrid);
1936
1937         wallcycle_stop(wcycle, ewcNS);
1938     }
1939
1940     if (DOMAINDECOMP(cr) && !thisRankHasDuty(cr, DUTY_PME))
1941     {
1942         wallcycle_start(wcycle, ewcPPDURINGPME);
1943         dd_force_flop_start(cr->dd, nrnb);
1944     }
1945
1946     if (inputrec->bRot)
1947     {
1948         wallcycle_start(wcycle, ewcROT);
1949         do_rotation(cr, enforcedRotation, box, as_rvec_array(x.data()), t, step, bNS);
1950         wallcycle_stop(wcycle, ewcROT);
1951     }
1952
1953     /* Temporary solution until all routines take PaddedRVecVector */
1954     rvec *f = as_rvec_array(force.data());
1955
1956     /* Start the force cycle counter.
1957      * Note that a different counter is used for dynamic load balancing.
1958      */
1959     wallcycle_start(wcycle, ewcFORCE);
1960
1961     gmx::ArrayRef<gmx::RVec> forceRef = force;
1962     if (bDoForces)
1963     {
1964         /* If we need to compute the virial, we might need a separate
1965          * force buffer for algorithms for which the virial is calculated
1966          * separately, such as PME.
1967          */
1968         if ((flags & GMX_FORCE_VIRIAL) && fr->haveDirectVirialContributions)
1969         {
1970             forceRef = *fr->forceBufferForDirectVirialContributions;
1971
1972             clear_rvecs_omp(forceRef.size(), as_rvec_array(forceRef.data()));
1973         }
1974
1975         /* Clear the short- and long-range forces */
1976         clear_rvecs(fr->natoms_force_constr, f);
1977     }
1978
1979     /* forceWithVirial might need the full force atom range */
1980     gmx::ForceWithVirial forceWithVirial(forceRef, flags & GMX_FORCE_VIRIAL);
1981
1982     if (inputrec->bPull && pull_have_constraint(inputrec->pull_work))
1983     {
1984         clear_pull_forces(inputrec->pull_work);
1985     }
1986
1987     /* update QMMMrec, if necessary */
1988     if (fr->bQMMM)
1989     {
1990         update_QMMMrec(cr, fr, as_rvec_array(x.data()), mdatoms, box);
1991     }
1992
1993     /* Compute the bonded and non-bonded energies and optionally forces */
1994     do_force_lowlevel(fr, inputrec, &(top->idef),
1995                       cr, ms, nrnb, wcycle, mdatoms,
1996                       as_rvec_array(x.data()), hist, f, &forceWithVirial, enerd, fcd,
1997                       box, inputrec->fepvals, lambda,
1998                       graph, &(top->excls), fr->mu_tot,
1999                       flags,
2000                       &cycles_pme);
2001
2002     wallcycle_stop(wcycle, ewcFORCE);
2003
2004     if (DOMAINDECOMP(cr))
2005     {
2006         dd_force_flop_stop(cr->dd, nrnb);
2007
2008         if (ddCloseBalanceRegion == DdCloseBalanceRegionAfterForceComputation::yes)
2009         {
2010             ddCloseBalanceRegionCpu(cr->dd);
2011         }
2012     }
2013
2014     computeSpecialForces(fplog, cr, inputrec, awh, enforcedRotation,
2015                          step, t, wcycle,
2016                          fr->forceProviders, box, x, mdatoms, lambda,
2017                          flags, &forceWithVirial, enerd,
2018                          ed, bNS);
2019
2020     if (bDoForces)
2021     {
2022         /* Communicate the forces */
2023         if (DOMAINDECOMP(cr))
2024         {
2025             dd_move_f(cr->dd, force, fr->fshift, wcycle);
2026             /* Do we need to communicate the separate force array
2027              * for terms that do not contribute to the single sum virial?
2028              * Position restraints and electric fields do not introduce
2029              * inter-cg forces, only full electrostatics methods do.
2030              * When we do not calculate the virial, fr->f_novirsum = f,
2031              * so we have already communicated these forces.
2032              */
2033             if (EEL_FULL(fr->ic->eeltype) && cr->dd->n_intercg_excl &&
2034                 (flags & GMX_FORCE_VIRIAL))
2035             {
2036                 dd_move_f(cr->dd, forceWithVirial.force_, nullptr, wcycle);
2037             }
2038         }
2039
2040         /* If we have NoVirSum forces, but we do not calculate the virial,
2041          * we sum fr->f_novirsum=f later.
2042          */
2043         if (vsite && !(fr->haveDirectVirialContributions && !(flags & GMX_FORCE_VIRIAL)))
2044         {
2045             spread_vsite_f(vsite, as_rvec_array(x.data()), f, fr->fshift, FALSE, nullptr, nrnb,
2046                            &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr, wcycle);
2047         }
2048
2049         if (flags & GMX_FORCE_VIRIAL)
2050         {
2051             /* Calculation of the virial must be done after vsites! */
2052             calc_virial(0, mdatoms->homenr, as_rvec_array(x.data()), f,
2053                         vir_force, graph, box, nrnb, fr, inputrec->ePBC);
2054         }
2055     }
2056
2057     if (PAR(cr) && !thisRankHasDuty(cr, DUTY_PME))
2058     {
2059         /* In case of node-splitting, the PP nodes receive the long-range
2060          * forces, virial and energy from the PME nodes here.
2061          */
2062         pme_receive_force_ener(cr, &forceWithVirial, enerd, wcycle);
2063     }
2064
2065     if (bDoForces)
2066     {
2067         post_process_forces(cr, step, nrnb, wcycle,
2068                             top, box, as_rvec_array(x.data()), f, &forceWithVirial,
2069                             vir_force, mdatoms, graph, fr, vsite,
2070                             flags);
2071     }
2072
2073     if (flags & GMX_FORCE_ENERGY)
2074     {
2075         /* Sum the potential energy terms from group contributions */
2076         sum_epot(&(enerd->grpp), enerd->term);
2077
2078         if (!EI_TPI(inputrec->eI))
2079         {
2080             checkPotentialEnergyValidity(step, *enerd, *inputrec);
2081         }
2082     }
2083
2084 }
2085
2086 void do_force(FILE                                     *fplog,
2087               const t_commrec                          *cr,
2088               const gmx_multisim_t                     *ms,
2089               const t_inputrec                         *inputrec,
2090               gmx::Awh                                 *awh,
2091               gmx_enfrot                               *enforcedRotation,
2092               int64_t                                   step,
2093               t_nrnb                                   *nrnb,
2094               gmx_wallcycle_t                           wcycle,
2095               gmx_localtop_t                           *top,
2096               const gmx_groups_t                       *groups,
2097               matrix                                    box,
2098               gmx::PaddedArrayRef<gmx::RVec>            x,
2099               history_t                                *hist,
2100               gmx::PaddedArrayRef<gmx::RVec>            force,
2101               tensor                                    vir_force,
2102               const t_mdatoms                          *mdatoms,
2103               gmx_enerdata_t                           *enerd,
2104               t_fcdata                                 *fcd,
2105               gmx::ArrayRef<real>                       lambda,
2106               t_graph                                  *graph,
2107               t_forcerec                               *fr,
2108               const gmx_vsite_t                        *vsite,
2109               rvec                                      mu_tot,
2110               double                                    t,
2111               const gmx_edsam                          *ed,
2112               int                                       flags,
2113               DdOpenBalanceRegionBeforeForceComputation ddOpenBalanceRegion,
2114               DdCloseBalanceRegionAfterForceComputation ddCloseBalanceRegion)
2115 {
2116     /* modify force flag if not doing nonbonded */
2117     if (!fr->bNonbonded)
2118     {
2119         flags &= ~GMX_FORCE_NONBONDED;
2120     }
2121
2122     GMX_ASSERT(x.size()     >= gmx::paddedRVecVectorSize(fr->natoms_force), "coordinates should be padded");
2123     GMX_ASSERT(force.size() >= gmx::paddedRVecVectorSize(fr->natoms_force), "force should be padded");
2124
2125     switch (inputrec->cutoff_scheme)
2126     {
2127         case ecutsVERLET:
2128             do_force_cutsVERLET(fplog, cr, ms, inputrec,
2129                                 awh, enforcedRotation, step, nrnb, wcycle,
2130                                 top,
2131                                 groups,
2132                                 box, x, hist,
2133                                 force, vir_force,
2134                                 mdatoms,
2135                                 enerd, fcd,
2136                                 lambda.data(), graph,
2137                                 fr, fr->ic,
2138                                 vsite, mu_tot,
2139                                 t, ed,
2140                                 flags,
2141                                 ddOpenBalanceRegion,
2142                                 ddCloseBalanceRegion);
2143             break;
2144         case ecutsGROUP:
2145             do_force_cutsGROUP(fplog, cr, ms, inputrec,
2146                                awh, enforcedRotation, step, nrnb, wcycle,
2147                                top,
2148                                groups,
2149                                box, x, hist,
2150                                force, vir_force,
2151                                mdatoms,
2152                                enerd, fcd,
2153                                lambda.data(), graph,
2154                                fr, vsite, mu_tot,
2155                                t, ed,
2156                                flags,
2157                                ddOpenBalanceRegion,
2158                                ddCloseBalanceRegion);
2159             break;
2160         default:
2161             gmx_incons("Invalid cut-off scheme passed!");
2162     }
2163
2164     /* In case we don't have constraints and are using GPUs, the next balancing
2165      * region starts here.
2166      * Some "special" work at the end of do_force_cuts?, such as vsite spread,
2167      * virial calculation and COM pulling, is not thus not included in
2168      * the balance timing, which is ok as most tasks do communication.
2169      */
2170     if (ddOpenBalanceRegion == DdOpenBalanceRegionBeforeForceComputation::yes)
2171     {
2172         ddOpenBalanceRegionCpu(cr->dd, DdAllowBalanceRegionReopen::no);
2173     }
2174 }
2175
2176
2177 void do_constrain_first(FILE *fplog, gmx::Constraints *constr,
2178                         t_inputrec *ir, t_mdatoms *md,
2179                         t_state *state)
2180 {
2181     int             i, m, start, end;
2182     int64_t         step;
2183     real            dt = ir->delta_t;
2184     real            dvdl_dum;
2185     rvec           *savex;
2186
2187     /* We need to allocate one element extra, since we might use
2188      * (unaligned) 4-wide SIMD loads to access rvec entries.
2189      */
2190     snew(savex, state->natoms + 1);
2191
2192     start = 0;
2193     end   = md->homenr;
2194
2195     if (debug)
2196     {
2197         fprintf(debug, "vcm: start=%d, homenr=%d, end=%d\n",
2198                 start, md->homenr, end);
2199     }
2200     /* Do a first constrain to reset particles... */
2201     step = ir->init_step;
2202     if (fplog)
2203     {
2204         char buf[STEPSTRSIZE];
2205         fprintf(fplog, "\nConstraining the starting coordinates (step %s)\n",
2206                 gmx_step_str(step, buf));
2207     }
2208     dvdl_dum = 0;
2209
2210     /* constrain the current position */
2211     constr->apply(TRUE, FALSE,
2212                   step, 0, 1.0,
2213                   as_rvec_array(state->x.data()), as_rvec_array(state->x.data()), nullptr,
2214                   state->box,
2215                   state->lambda[efptBONDED], &dvdl_dum,
2216                   nullptr, nullptr, gmx::ConstraintVariable::Positions);
2217     if (EI_VV(ir->eI))
2218     {
2219         /* constrain the inital velocity, and save it */
2220         /* also may be useful if we need the ekin from the halfstep for velocity verlet */
2221         constr->apply(TRUE, FALSE,
2222                       step, 0, 1.0,
2223                       as_rvec_array(state->x.data()), as_rvec_array(state->v.data()), as_rvec_array(state->v.data()),
2224                       state->box,
2225                       state->lambda[efptBONDED], &dvdl_dum,
2226                       nullptr, nullptr, gmx::ConstraintVariable::Velocities);
2227     }
2228     /* constrain the inital velocities at t-dt/2 */
2229     if (EI_STATE_VELOCITY(ir->eI) && ir->eI != eiVV)
2230     {
2231         for (i = start; (i < end); i++)
2232         {
2233             for (m = 0; (m < DIM); m++)
2234             {
2235                 /* Reverse the velocity */
2236                 state->v[i][m] = -state->v[i][m];
2237                 /* Store the position at t-dt in buf */
2238                 savex[i][m] = state->x[i][m] + dt*state->v[i][m];
2239             }
2240         }
2241         /* Shake the positions at t=-dt with the positions at t=0
2242          * as reference coordinates.
2243          */
2244         if (fplog)
2245         {
2246             char buf[STEPSTRSIZE];
2247             fprintf(fplog, "\nConstraining the coordinates at t0-dt (step %s)\n",
2248                     gmx_step_str(step, buf));
2249         }
2250         dvdl_dum = 0;
2251         constr->apply(TRUE, FALSE,
2252                       step, -1, 1.0,
2253                       as_rvec_array(state->x.data()), savex, nullptr,
2254                       state->box,
2255                       state->lambda[efptBONDED], &dvdl_dum,
2256                       as_rvec_array(state->v.data()), nullptr, gmx::ConstraintVariable::Positions);
2257
2258         for (i = start; i < end; i++)
2259         {
2260             for (m = 0; m < DIM; m++)
2261             {
2262                 /* Re-reverse the velocities */
2263                 state->v[i][m] = -state->v[i][m];
2264             }
2265         }
2266     }
2267     sfree(savex);
2268 }
2269
2270
2271 static void
2272 integrate_table(const real vdwtab[], real scale, int offstart, int rstart, int rend,
2273                 double *enerout, double *virout)
2274 {
2275     double enersum, virsum;
2276     double invscale, invscale2, invscale3;
2277     double r, ea, eb, ec, pa, pb, pc, pd;
2278     double y0, f, g, h;
2279     int    ri, offset;
2280     double tabfactor;
2281
2282     invscale  = 1.0/scale;
2283     invscale2 = invscale*invscale;
2284     invscale3 = invscale*invscale2;
2285
2286     /* Following summation derived from cubic spline definition,
2287      * Numerical Recipies in C, second edition, p. 113-116.  Exact for
2288      * the cubic spline.  We first calculate the negative of the
2289      * energy from rvdw to rvdw_switch, assuming that g(r)=1, and then
2290      * add the more standard, abrupt cutoff correction to that result,
2291      * yielding the long-range correction for a switched function.  We
2292      * perform both the pressure and energy loops at the same time for
2293      * simplicity, as the computational cost is low. */
2294
2295     if (offstart == 0)
2296     {
2297         /* Since the dispersion table has been scaled down a factor
2298          * 6.0 and the repulsion a factor 12.0 to compensate for the
2299          * c6/c12 parameters inside nbfp[] being scaled up (to save
2300          * flops in kernels), we need to correct for this.
2301          */
2302         tabfactor = 6.0;
2303     }
2304     else
2305     {
2306         tabfactor = 12.0;
2307     }
2308
2309     enersum = 0.0;
2310     virsum  = 0.0;
2311     for (ri = rstart; ri < rend; ++ri)
2312     {
2313         r  = ri*invscale;
2314         ea = invscale3;
2315         eb = 2.0*invscale2*r;
2316         ec = invscale*r*r;
2317
2318         pa = invscale3;
2319         pb = 3.0*invscale2*r;
2320         pc = 3.0*invscale*r*r;
2321         pd = r*r*r;
2322
2323         /* this "8" is from the packing in the vdwtab array - perhaps
2324            should be defined? */
2325
2326         offset = 8*ri + offstart;
2327         y0     = vdwtab[offset];
2328         f      = vdwtab[offset+1];
2329         g      = vdwtab[offset+2];
2330         h      = vdwtab[offset+3];
2331
2332         enersum += y0*(ea/3 + eb/2 + ec) + f*(ea/4 + eb/3 + ec/2) + g*(ea/5 + eb/4 + ec/3) + h*(ea/6 + eb/5 + ec/4);
2333         virsum  +=  f*(pa/4 + pb/3 + pc/2 + pd) + 2*g*(pa/5 + pb/4 + pc/3 + pd/2) + 3*h*(pa/6 + pb/5 + pc/4 + pd/3);
2334     }
2335     *enerout = 4.0*M_PI*enersum*tabfactor;
2336     *virout  = 4.0*M_PI*virsum*tabfactor;
2337 }
2338
2339 void calc_enervirdiff(FILE *fplog, int eDispCorr, t_forcerec *fr)
2340 {
2341     double   eners[2], virs[2], enersum, virsum;
2342     double   r0, rc3, rc9;
2343     int      ri0, ri1, i;
2344     real     scale, *vdwtab;
2345
2346     fr->enershiftsix    = 0;
2347     fr->enershifttwelve = 0;
2348     fr->enerdiffsix     = 0;
2349     fr->enerdifftwelve  = 0;
2350     fr->virdiffsix      = 0;
2351     fr->virdifftwelve   = 0;
2352
2353     const interaction_const_t *ic = fr->ic;
2354
2355     if (eDispCorr != edispcNO)
2356     {
2357         for (i = 0; i < 2; i++)
2358         {
2359             eners[i] = 0;
2360             virs[i]  = 0;
2361         }
2362         if ((ic->vdw_modifier == eintmodPOTSHIFT) ||
2363             (ic->vdw_modifier == eintmodPOTSWITCH) ||
2364             (ic->vdw_modifier == eintmodFORCESWITCH) ||
2365             (ic->vdwtype == evdwSHIFT) ||
2366             (ic->vdwtype == evdwSWITCH))
2367         {
2368             if (((ic->vdw_modifier == eintmodPOTSWITCH) ||
2369                  (ic->vdw_modifier == eintmodFORCESWITCH) ||
2370                  (ic->vdwtype == evdwSWITCH)) && ic->rvdw_switch == 0)
2371             {
2372                 gmx_fatal(FARGS,
2373                           "With dispersion correction rvdw-switch can not be zero "
2374                           "for vdw-type = %s", evdw_names[ic->vdwtype]);
2375             }
2376
2377             /* TODO This code depends on the logic in tables.c that
2378                constructs the table layout, which should be made
2379                explicit in future cleanup. */
2380             GMX_ASSERT(fr->dispersionCorrectionTable->interaction == GMX_TABLE_INTERACTION_VDWREP_VDWDISP,
2381                        "Dispersion-correction code needs a table with both repulsion and dispersion terms");
2382             scale  = fr->dispersionCorrectionTable->scale;
2383             vdwtab = fr->dispersionCorrectionTable->data;
2384
2385             /* Round the cut-offs to exact table values for precision */
2386             ri0  = static_cast<int>(std::floor(ic->rvdw_switch*scale));
2387             ri1  = static_cast<int>(std::ceil(ic->rvdw*scale));
2388
2389             /* The code below has some support for handling force-switching, i.e.
2390              * when the force (instead of potential) is switched over a limited
2391              * region. This leads to a constant shift in the potential inside the
2392              * switching region, which we can handle by adding a constant energy
2393              * term in the force-switch case just like when we do potential-shift.
2394              *
2395              * For now this is not enabled, but to keep the functionality in the
2396              * code we check separately for switch and shift. When we do force-switch
2397              * the shifting point is rvdw_switch, while it is the cutoff when we
2398              * have a classical potential-shift.
2399              *
2400              * For a pure potential-shift the potential has a constant shift
2401              * all the way out to the cutoff, and that is it. For other forms
2402              * we need to calculate the constant shift up to the point where we
2403              * start modifying the potential.
2404              */
2405             ri0  = (ic->vdw_modifier == eintmodPOTSHIFT) ? ri1 : ri0;
2406
2407             r0   = ri0/scale;
2408             rc3  = r0*r0*r0;
2409             rc9  = rc3*rc3*rc3;
2410
2411             if ((ic->vdw_modifier == eintmodFORCESWITCH) ||
2412                 (ic->vdwtype == evdwSHIFT))
2413             {
2414                 /* Determine the constant energy shift below rvdw_switch.
2415                  * Table has a scale factor since we have scaled it down to compensate
2416                  * for scaling-up c6/c12 with the derivative factors to save flops in analytical kernels.
2417                  */
2418                 fr->enershiftsix    = (real)(-1.0/(rc3*rc3)) - 6.0*vdwtab[8*ri0];
2419                 fr->enershifttwelve = (real)( 1.0/(rc9*rc3)) - 12.0*vdwtab[8*ri0 + 4];
2420             }
2421             else if (ic->vdw_modifier == eintmodPOTSHIFT)
2422             {
2423                 fr->enershiftsix    = (real)(-1.0/(rc3*rc3));
2424                 fr->enershifttwelve = (real)( 1.0/(rc9*rc3));
2425             }
2426
2427             /* Add the constant part from 0 to rvdw_switch.
2428              * This integration from 0 to rvdw_switch overcounts the number
2429              * of interactions by 1, as it also counts the self interaction.
2430              * We will correct for this later.
2431              */
2432             eners[0] += 4.0*M_PI*fr->enershiftsix*rc3/3.0;
2433             eners[1] += 4.0*M_PI*fr->enershifttwelve*rc3/3.0;
2434
2435             /* Calculate the contribution in the range [r0,r1] where we
2436              * modify the potential. For a pure potential-shift modifier we will
2437              * have ri0==ri1, and there will not be any contribution here.
2438              */
2439             for (i = 0; i < 2; i++)
2440             {
2441                 enersum = 0;
2442                 virsum  = 0;
2443                 integrate_table(vdwtab, scale, (i == 0 ? 0 : 4), ri0, ri1, &enersum, &virsum);
2444                 eners[i] -= enersum;
2445                 virs[i]  -= virsum;
2446             }
2447
2448             /* Alright: Above we compensated by REMOVING the parts outside r0
2449              * corresponding to the ideal VdW 1/r6 and /r12 potentials.
2450              *
2451              * Regardless of whether r0 is the point where we start switching,
2452              * or the cutoff where we calculated the constant shift, we include
2453              * all the parts we are missing out to infinity from r0 by
2454              * calculating the analytical dispersion correction.
2455              */
2456             eners[0] += -4.0*M_PI/(3.0*rc3);
2457             eners[1] +=  4.0*M_PI/(9.0*rc9);
2458             virs[0]  +=  8.0*M_PI/rc3;
2459             virs[1]  += -16.0*M_PI/(3.0*rc9);
2460         }
2461         else if (ic->vdwtype == evdwCUT ||
2462                  EVDW_PME(ic->vdwtype) ||
2463                  ic->vdwtype == evdwUSER)
2464         {
2465             if (ic->vdwtype == evdwUSER && fplog)
2466             {
2467                 fprintf(fplog,
2468                         "WARNING: using dispersion correction with user tables\n");
2469             }
2470
2471             /* Note that with LJ-PME, the dispersion correction is multiplied
2472              * by the difference between the actual C6 and the value of C6
2473              * that would produce the combination rule.
2474              * This means the normal energy and virial difference formulas
2475              * can be used here.
2476              */
2477
2478             rc3  = ic->rvdw*ic->rvdw*ic->rvdw;
2479             rc9  = rc3*rc3*rc3;
2480             /* Contribution beyond the cut-off */
2481             eners[0] += -4.0*M_PI/(3.0*rc3);
2482             eners[1] +=  4.0*M_PI/(9.0*rc9);
2483             if (ic->vdw_modifier == eintmodPOTSHIFT)
2484             {
2485                 /* Contribution within the cut-off */
2486                 eners[0] += -4.0*M_PI/(3.0*rc3);
2487                 eners[1] +=  4.0*M_PI/(3.0*rc9);
2488             }
2489             /* Contribution beyond the cut-off */
2490             virs[0]  +=  8.0*M_PI/rc3;
2491             virs[1]  += -16.0*M_PI/(3.0*rc9);
2492         }
2493         else
2494         {
2495             gmx_fatal(FARGS,
2496                       "Dispersion correction is not implemented for vdw-type = %s",
2497                       evdw_names[ic->vdwtype]);
2498         }
2499
2500         /* When we deprecate the group kernels the code below can go too */
2501         if (ic->vdwtype == evdwPME && fr->cutoff_scheme == ecutsGROUP)
2502         {
2503             /* Calculate self-interaction coefficient (assuming that
2504              * the reciprocal-space contribution is constant in the
2505              * region that contributes to the self-interaction).
2506              */
2507             fr->enershiftsix = gmx::power6(ic->ewaldcoeff_lj) / 6.0;
2508
2509             eners[0] += -gmx::power3(std::sqrt(M_PI)*ic->ewaldcoeff_lj)/3.0;
2510             virs[0]  +=  gmx::power3(std::sqrt(M_PI)*ic->ewaldcoeff_lj);
2511         }
2512
2513         fr->enerdiffsix    = eners[0];
2514         fr->enerdifftwelve = eners[1];
2515         /* The 0.5 is due to the Gromacs definition of the virial */
2516         fr->virdiffsix     = 0.5*virs[0];
2517         fr->virdifftwelve  = 0.5*virs[1];
2518     }
2519 }
2520
2521 void calc_dispcorr(t_inputrec *ir, t_forcerec *fr,
2522                    matrix box, real lambda, tensor pres, tensor virial,
2523                    real *prescorr, real *enercorr, real *dvdlcorr)
2524 {
2525     gmx_bool bCorrAll, bCorrPres;
2526     real     dvdlambda, invvol, dens, ninter, avcsix, avctwelve, enerdiff, svir = 0, spres = 0;
2527     int      m;
2528
2529     *prescorr = 0;
2530     *enercorr = 0;
2531     *dvdlcorr = 0;
2532
2533     clear_mat(virial);
2534     clear_mat(pres);
2535
2536     if (ir->eDispCorr != edispcNO)
2537     {
2538         bCorrAll  = (ir->eDispCorr == edispcAllEner ||
2539                      ir->eDispCorr == edispcAllEnerPres);
2540         bCorrPres = (ir->eDispCorr == edispcEnerPres ||
2541                      ir->eDispCorr == edispcAllEnerPres);
2542
2543         invvol = 1/det(box);
2544         if (fr->n_tpi)
2545         {
2546             /* Only correct for the interactions with the inserted molecule */
2547             dens   = (fr->numAtomsForDispersionCorrection - fr->n_tpi)*invvol;
2548             ninter = fr->n_tpi;
2549         }
2550         else
2551         {
2552             dens   = fr->numAtomsForDispersionCorrection*invvol;
2553             ninter = 0.5*fr->numAtomsForDispersionCorrection;
2554         }
2555
2556         if (ir->efep == efepNO)
2557         {
2558             avcsix    = fr->avcsix[0];
2559             avctwelve = fr->avctwelve[0];
2560         }
2561         else
2562         {
2563             avcsix    = (1 - lambda)*fr->avcsix[0]    + lambda*fr->avcsix[1];
2564             avctwelve = (1 - lambda)*fr->avctwelve[0] + lambda*fr->avctwelve[1];
2565         }
2566
2567         enerdiff   = ninter*(dens*fr->enerdiffsix - fr->enershiftsix);
2568         *enercorr += avcsix*enerdiff;
2569         dvdlambda  = 0.0;
2570         if (ir->efep != efepNO)
2571         {
2572             dvdlambda += (fr->avcsix[1] - fr->avcsix[0])*enerdiff;
2573         }
2574         if (bCorrAll)
2575         {
2576             enerdiff   = ninter*(dens*fr->enerdifftwelve - fr->enershifttwelve);
2577             *enercorr += avctwelve*enerdiff;
2578             if (fr->efep != efepNO)
2579             {
2580                 dvdlambda += (fr->avctwelve[1] - fr->avctwelve[0])*enerdiff;
2581             }
2582         }
2583
2584         if (bCorrPres)
2585         {
2586             svir = ninter*dens*avcsix*fr->virdiffsix/3.0;
2587             if (ir->eDispCorr == edispcAllEnerPres)
2588             {
2589                 svir += ninter*dens*avctwelve*fr->virdifftwelve/3.0;
2590             }
2591             /* The factor 2 is because of the Gromacs virial definition */
2592             spres = -2.0*invvol*svir*PRESFAC;
2593
2594             for (m = 0; m < DIM; m++)
2595             {
2596                 virial[m][m] += svir;
2597                 pres[m][m]   += spres;
2598             }
2599             *prescorr += spres;
2600         }
2601
2602         /* Can't currently control when it prints, for now, just print when degugging */
2603         if (debug)
2604         {
2605             if (bCorrAll)
2606             {
2607                 fprintf(debug, "Long Range LJ corr.: <C6> %10.4e, <C12> %10.4e\n",
2608                         avcsix, avctwelve);
2609             }
2610             if (bCorrPres)
2611             {
2612                 fprintf(debug,
2613                         "Long Range LJ corr.: Epot %10g, Pres: %10g, Vir: %10g\n",
2614                         *enercorr, spres, svir);
2615             }
2616             else
2617             {
2618                 fprintf(debug, "Long Range LJ corr.: Epot %10g\n", *enercorr);
2619             }
2620         }
2621
2622         if (fr->efep != efepNO)
2623         {
2624             *dvdlcorr += dvdlambda;
2625         }
2626     }
2627 }
2628
2629 static void low_do_pbc_mtop(FILE *fplog, int ePBC, const matrix box,
2630                             const gmx_mtop_t *mtop, rvec x[],
2631                             gmx_bool bFirst)
2632 {
2633     t_graph        *graph;
2634     int             as, mol;
2635
2636     if (bFirst && fplog)
2637     {
2638         fprintf(fplog, "Removing pbc first time\n");
2639     }
2640
2641     snew(graph, 1);
2642     as = 0;
2643     for (const gmx_molblock_t &molb : mtop->molblock)
2644     {
2645         const gmx_moltype_t &moltype = mtop->moltype[molb.type];
2646         if (moltype.atoms.nr == 1 ||
2647             (!bFirst && moltype.cgs.nr == 1))
2648         {
2649             /* Just one atom or charge group in the molecule, no PBC required */
2650             as += molb.nmol*moltype.atoms.nr;
2651         }
2652         else
2653         {
2654             /* Pass NULL iso fplog to avoid graph prints for each molecule type */
2655             mk_graph_ilist(nullptr, moltype.ilist,
2656                            0, moltype.atoms.nr, FALSE, FALSE, graph);
2657
2658             for (mol = 0; mol < molb.nmol; mol++)
2659             {
2660                 mk_mshift(fplog, graph, ePBC, box, x+as);
2661
2662                 shift_self(graph, box, x+as);
2663                 /* The molecule is whole now.
2664                  * We don't need the second mk_mshift call as in do_pbc_first,
2665                  * since we no longer need this graph.
2666                  */
2667
2668                 as += moltype.atoms.nr;
2669             }
2670             done_graph(graph);
2671         }
2672     }
2673     sfree(graph);
2674 }
2675
2676 void do_pbc_first_mtop(FILE *fplog, int ePBC, const matrix box,
2677                        const gmx_mtop_t *mtop, rvec x[])
2678 {
2679     low_do_pbc_mtop(fplog, ePBC, box, mtop, x, TRUE);
2680 }
2681
2682 void do_pbc_mtop(FILE *fplog, int ePBC, const matrix box,
2683                  const gmx_mtop_t *mtop, rvec x[])
2684 {
2685     low_do_pbc_mtop(fplog, ePBC, box, mtop, x, FALSE);
2686 }
2687
2688 void put_atoms_in_box_omp(int ePBC, const matrix box, gmx::ArrayRef<gmx::RVec> x)
2689 {
2690     int t, nth;
2691     nth = gmx_omp_nthreads_get(emntDefault);
2692
2693 #pragma omp parallel for num_threads(nth) schedule(static)
2694     for (t = 0; t < nth; t++)
2695     {
2696         try
2697         {
2698             size_t natoms = x.size();
2699             size_t offset = (natoms*t    )/nth;
2700             size_t len    = (natoms*(t + 1))/nth - offset;
2701             put_atoms_in_box(ePBC, box, x.subArray(offset, len));
2702         }
2703         GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
2704     }
2705 }
2706
2707 // TODO This can be cleaned up a lot, and move back to runner.cpp
2708 void finish_run(FILE *fplog, const gmx::MDLogger &mdlog, const t_commrec *cr,
2709                 const t_inputrec *inputrec,
2710                 t_nrnb nrnb[], gmx_wallcycle_t wcycle,
2711                 gmx_walltime_accounting_t walltime_accounting,
2712                 nonbonded_verlet_t *nbv,
2713                 gmx_pme_t *pme,
2714                 gmx_bool bWriteStat)
2715 {
2716     t_nrnb *nrnb_tot = nullptr;
2717     double  delta_t  = 0;
2718     double  nbfs     = 0, mflop = 0;
2719     double  elapsed_time,
2720             elapsed_time_over_all_ranks,
2721             elapsed_time_over_all_threads,
2722             elapsed_time_over_all_threads_over_all_ranks;
2723     /* Control whether it is valid to print a report. Only the
2724        simulation master may print, but it should not do so if the run
2725        terminated e.g. before a scheduled reset step. This is
2726        complicated by the fact that PME ranks are unaware of the
2727        reason why they were sent a pmerecvqxFINISH. To avoid
2728        communication deadlocks, we always do the communication for the
2729        report, even if we've decided not to write the report, because
2730        how long it takes to finish the run is not important when we've
2731        decided not to report on the simulation performance.
2732
2733        Further, we only report performance for dynamical integrators,
2734        because those are the only ones for which we plan to
2735        consider doing any optimizations. */
2736     bool printReport = EI_DYNAMICS(inputrec->eI) && SIMMASTER(cr);
2737
2738     if (printReport && !walltime_accounting_get_valid_finish(walltime_accounting))
2739     {
2740         GMX_LOG(mdlog.warning).asParagraph().appendText("Simulation ended prematurely, no performance report will be written.");
2741         printReport = false;
2742     }
2743
2744     if (cr->nnodes > 1)
2745     {
2746         snew(nrnb_tot, 1);
2747 #if GMX_MPI
2748         MPI_Allreduce(nrnb->n, nrnb_tot->n, eNRNB, MPI_DOUBLE, MPI_SUM,
2749                       cr->mpi_comm_mysim);
2750 #endif
2751     }
2752     else
2753     {
2754         nrnb_tot = nrnb;
2755     }
2756
2757     elapsed_time                                 = walltime_accounting_get_elapsed_time(walltime_accounting);
2758     elapsed_time_over_all_ranks                  = elapsed_time;
2759     elapsed_time_over_all_threads                = walltime_accounting_get_elapsed_time_over_all_threads(walltime_accounting);
2760     elapsed_time_over_all_threads_over_all_ranks = elapsed_time_over_all_threads;
2761 #if GMX_MPI
2762     if (cr->nnodes > 1)
2763     {
2764         /* reduce elapsed_time over all MPI ranks in the current simulation */
2765         MPI_Allreduce(&elapsed_time,
2766                       &elapsed_time_over_all_ranks,
2767                       1, MPI_DOUBLE, MPI_SUM,
2768                       cr->mpi_comm_mysim);
2769         elapsed_time_over_all_ranks /= cr->nnodes;
2770         /* Reduce elapsed_time_over_all_threads over all MPI ranks in the
2771          * current simulation. */
2772         MPI_Allreduce(&elapsed_time_over_all_threads,
2773                       &elapsed_time_over_all_threads_over_all_ranks,
2774                       1, MPI_DOUBLE, MPI_SUM,
2775                       cr->mpi_comm_mysim);
2776     }
2777 #endif
2778
2779     if (printReport)
2780     {
2781         print_flop(fplog, nrnb_tot, &nbfs, &mflop);
2782     }
2783     if (cr->nnodes > 1)
2784     {
2785         sfree(nrnb_tot);
2786     }
2787
2788     if (thisRankHasDuty(cr, DUTY_PP) && DOMAINDECOMP(cr))
2789     {
2790         print_dd_statistics(cr, inputrec, fplog);
2791     }
2792
2793     /* TODO Move the responsibility for any scaling by thread counts
2794      * to the code that handled the thread region, so that there's a
2795      * mechanism to keep cycle counting working during the transition
2796      * to task parallelism. */
2797     int nthreads_pp  = gmx_omp_nthreads_get(emntNonbonded);
2798     int nthreads_pme = gmx_omp_nthreads_get(emntPME);
2799     wallcycle_scale_by_num_threads(wcycle, thisRankHasDuty(cr, DUTY_PME) && !thisRankHasDuty(cr, DUTY_PP), nthreads_pp, nthreads_pme);
2800     auto cycle_sum(wallcycle_sum(cr, wcycle));
2801
2802     if (printReport)
2803     {
2804         auto                    nbnxn_gpu_timings = use_GPU(nbv) ? nbnxn_gpu_get_timings(nbv->gpu_nbv) : nullptr;
2805         gmx_wallclock_gpu_pme_t pme_gpu_timings   = {};
2806         if (pme_gpu_task_enabled(pme))
2807         {
2808             pme_gpu_get_timings(pme, &pme_gpu_timings);
2809         }
2810         wallcycle_print(fplog, mdlog, cr->nnodes, cr->npmenodes, nthreads_pp, nthreads_pme,
2811                         elapsed_time_over_all_ranks,
2812                         wcycle, cycle_sum,
2813                         nbnxn_gpu_timings,
2814                         &pme_gpu_timings);
2815
2816         if (EI_DYNAMICS(inputrec->eI))
2817         {
2818             delta_t = inputrec->delta_t;
2819         }
2820
2821         if (fplog)
2822         {
2823             print_perf(fplog, elapsed_time_over_all_threads_over_all_ranks,
2824                        elapsed_time_over_all_ranks,
2825                        walltime_accounting_get_nsteps_done(walltime_accounting),
2826                        delta_t, nbfs, mflop);
2827         }
2828         if (bWriteStat)
2829         {
2830             print_perf(stderr, elapsed_time_over_all_threads_over_all_ranks,
2831                        elapsed_time_over_all_ranks,
2832                        walltime_accounting_get_nsteps_done(walltime_accounting),
2833                        delta_t, nbfs, mflop);
2834         }
2835     }
2836 }
2837
2838 extern void initialize_lambdas(FILE *fplog, t_inputrec *ir, int *fep_state, gmx::ArrayRef<real> lambda, double *lam0)
2839 {
2840     /* this function works, but could probably use a logic rewrite to keep all the different
2841        types of efep straight. */
2842
2843     if ((ir->efep == efepNO) && (ir->bSimTemp == FALSE))
2844     {
2845         return;
2846     }
2847
2848     t_lambda *fep = ir->fepvals;
2849     *fep_state    = fep->init_fep_state; /* this might overwrite the checkpoint
2850                                             if checkpoint is set -- a kludge is in for now
2851                                             to prevent this.*/
2852
2853     for (int i = 0; i < efptNR; i++)
2854     {
2855         /* overwrite lambda state with init_lambda for now for backwards compatibility */
2856         if (fep->init_lambda >= 0) /* if it's -1, it was never initializd */
2857         {
2858             lambda[i] = fep->init_lambda;
2859             if (lam0)
2860             {
2861                 lam0[i] = lambda[i];
2862             }
2863         }
2864         else
2865         {
2866             lambda[i] = fep->all_lambda[i][*fep_state];
2867             if (lam0)
2868             {
2869                 lam0[i] = lambda[i];
2870             }
2871         }
2872     }
2873     if (ir->bSimTemp)
2874     {
2875         /* need to rescale control temperatures to match current state */
2876         for (int i = 0; i < ir->opts.ngtc; i++)
2877         {
2878             if (ir->opts.ref_t[i] > 0)
2879             {
2880                 ir->opts.ref_t[i] = ir->simtempvals->temperatures[*fep_state];
2881             }
2882         }
2883     }
2884
2885     /* Send to the log the information on the current lambdas */
2886     if (fplog != nullptr)
2887     {
2888         fprintf(fplog, "Initial vector of lambda components:[ ");
2889         for (const auto &l : lambda)
2890         {
2891             fprintf(fplog, "%10.4f ", l);
2892         }
2893         fprintf(fplog, "]\n");
2894     }
2895 }
2896
2897
2898 void init_md(FILE *fplog,
2899              const t_commrec *cr, gmx::IMDOutputProvider *outputProvider,
2900              t_inputrec *ir, const gmx_output_env_t *oenv,
2901              const MdrunOptions &mdrunOptions,
2902              double *t, double *t0,
2903              t_state *globalState, double *lam0,
2904              t_nrnb *nrnb, gmx_mtop_t *mtop,
2905              gmx_update_t **upd,
2906              gmx::BoxDeformation *deform,
2907              int nfile, const t_filenm fnm[],
2908              gmx_mdoutf_t *outf, t_mdebin **mdebin,
2909              tensor force_vir, tensor shake_vir, rvec mu_tot,
2910              gmx_bool *bSimAnn, t_vcm **vcm,
2911              gmx_wallcycle_t wcycle)
2912 {
2913     int  i;
2914
2915     /* Initial values */
2916     *t = *t0       = ir->init_t;
2917
2918     *bSimAnn = FALSE;
2919     for (i = 0; i < ir->opts.ngtc; i++)
2920     {
2921         /* set bSimAnn if any group is being annealed */
2922         if (ir->opts.annealing[i] != eannNO)
2923         {
2924             *bSimAnn = TRUE;
2925         }
2926     }
2927
2928     /* Initialize lambda variables */
2929     /* TODO: Clean up initialization of fep_state and lambda in t_state.
2930      * We currently need to call initialize_lambdas on non-master ranks
2931      * to initialize lam0.
2932      */
2933     if (MASTER(cr))
2934     {
2935         initialize_lambdas(fplog, ir, &globalState->fep_state, globalState->lambda, lam0);
2936     }
2937     else
2938     {
2939         int                      tmpFepState;
2940         std::array<real, efptNR> tmpLambda;
2941         initialize_lambdas(fplog, ir, &tmpFepState, tmpLambda, lam0);
2942     }
2943
2944     // TODO upd is never NULL in practice, but the analysers don't know that
2945     if (upd)
2946     {
2947         *upd = init_update(ir, deform);
2948     }
2949     if (*bSimAnn)
2950     {
2951         update_annealing_target_temp(ir, ir->init_t, upd ? *upd : nullptr);
2952     }
2953
2954     if (vcm != nullptr)
2955     {
2956         *vcm = init_vcm(fplog, &mtop->groups, ir);
2957     }
2958
2959     if (EI_DYNAMICS(ir->eI) && !mdrunOptions.continuationOptions.appendFiles)
2960     {
2961         if (ir->etc == etcBERENDSEN)
2962         {
2963             please_cite(fplog, "Berendsen84a");
2964         }
2965         if (ir->etc == etcVRESCALE)
2966         {
2967             please_cite(fplog, "Bussi2007a");
2968         }
2969         if (ir->eI == eiSD1)
2970         {
2971             please_cite(fplog, "Goga2012");
2972         }
2973     }
2974     init_nrnb(nrnb);
2975
2976     if (nfile != -1)
2977     {
2978         *outf = init_mdoutf(fplog, nfile, fnm, mdrunOptions, cr, outputProvider, ir, mtop, oenv, wcycle);
2979
2980         *mdebin = init_mdebin(mdrunOptions.continuationOptions.appendFiles ? nullptr : mdoutf_get_fp_ene(*outf),
2981                               mtop, ir, mdoutf_get_fp_dhdl(*outf));
2982     }
2983
2984     /* Initiate variables */
2985     clear_mat(force_vir);
2986     clear_mat(shake_vir);
2987     clear_rvec(mu_tot);
2988 }