/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 * Copyright (c) 2001-2004, The GROMACS development team.
 * Copyright (c) 2013,2014,2015,2016,2017,2018, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
#include "gromacs/awh/awh.h"
#include "gromacs/domdec/dlbtiming.h"
#include "gromacs/domdec/domdec.h"
#include "gromacs/domdec/domdec_struct.h"
#include "gromacs/essentialdynamics/edsam.h"
#include "gromacs/ewald/pme.h"
#include "gromacs/gmxlib/chargegroup.h"
#include "gromacs/gmxlib/network.h"
#include "gromacs/gmxlib/nrnb.h"
#include "gromacs/gmxlib/nonbonded/nb_free_energy.h"
#include "gromacs/gmxlib/nonbonded/nb_kernel.h"
#include "gromacs/gmxlib/nonbonded/nonbonded.h"
#include "gromacs/gpu_utils/gpu_utils.h"
#include "gromacs/imd/imd.h"
#include "gromacs/listed-forces/bonded.h"
#include "gromacs/listed-forces/disre.h"
#include "gromacs/listed-forces/orires.h"
#include "gromacs/math/functions.h"
#include "gromacs/math/units.h"
#include "gromacs/math/vec.h"
#include "gromacs/math/vecdump.h"
#include "gromacs/mdlib/calcmu.h"
#include "gromacs/mdlib/constr.h"
#include "gromacs/mdlib/force.h"
#include "gromacs/mdlib/forcerec.h"
#include "gromacs/mdlib/gmx_omp_nthreads.h"
#include "gromacs/mdlib/mdrun.h"
#include "gromacs/mdlib/nb_verlet.h"
#include "gromacs/mdlib/nbnxn_atomdata.h"
#include "gromacs/mdlib/nbnxn_gpu_data_mgmt.h"
#include "gromacs/mdlib/nbnxn_grid.h"
#include "gromacs/mdlib/nbnxn_search.h"
#include "gromacs/mdlib/qmmm.h"
#include "gromacs/mdlib/update.h"
#include "gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_gpu_ref.h"
#include "gromacs/mdtypes/commrec.h"
#include "gromacs/mdtypes/forceoutput.h"
#include "gromacs/mdtypes/iforceprovider.h"
#include "gromacs/mdtypes/inputrec.h"
#include "gromacs/mdtypes/md_enums.h"
#include "gromacs/mdtypes/state.h"
#include "gromacs/pbcutil/ishift.h"
#include "gromacs/pbcutil/mshift.h"
#include "gromacs/pbcutil/pbc.h"
#include "gromacs/pulling/pull.h"
#include "gromacs/pulling/pull_rotation.h"
#include "gromacs/timing/cyclecounter.h"
#include "gromacs/timing/gpu_timing.h"
#include "gromacs/timing/wallcycle.h"
#include "gromacs/timing/wallcyclereporting.h"
#include "gromacs/timing/walltime_accounting.h"
#include "gromacs/topology/topology.h"
#include "gromacs/utility/arrayref.h"
#include "gromacs/utility/basedefinitions.h"
#include "gromacs/utility/cstringutil.h"
#include "gromacs/utility/exceptions.h"
#include "gromacs/utility/fatalerror.h"
#include "gromacs/utility/gmxassert.h"
#include "gromacs/utility/gmxmpi.h"
#include "gromacs/utility/logger.h"
#include "gromacs/utility/pleasecite.h"
#include "gromacs/utility/smalloc.h"
#include "gromacs/utility/sysinfo.h"

#include "nbnxn_gpu.h"
#include "nbnxn_kernels/nbnxn_kernel_cpu.h"
#include "nbnxn_kernels/nbnxn_kernel_prune.h"
// TODO: this environment variable allows us to verify before release
// that on less common architectures the total cost of polling is not larger than
// a blocking wait (so polling does not introduce overhead when the static
// PME-first ordering would suffice).
static const bool c_disableAlternatingWait = (getenv("GMX_DISABLE_ALTERNATING_GPU_WAIT") != nullptr);
void print_time(FILE                     *out,
                gmx_walltime_accounting_t walltime_accounting,
                gmx_int64_t               step,
                const t_inputrec         *ir,
                const t_commrec          *cr)
{
    time_t finish;
    char   timebuf[STRLEN];
    char   buf[STRLEN];
    double dt, elapsed_seconds, time_per_step;

    fprintf(out, "step %s", gmx_step_str(step, buf));

    if ((step >= ir->nstlist))
    {
        double seconds_since_epoch = gmx_gettime();

        elapsed_seconds = seconds_since_epoch - walltime_accounting_get_start_time_stamp(walltime_accounting);
        time_per_step   = elapsed_seconds/(step - ir->init_step + 1);
        dt              = (ir->nsteps + ir->init_step - step)*time_per_step;

        if (ir->nsteps >= 0)
        {
            if (dt >= 300)
            {
                /* Far from the end: print the estimated finish date */
                finish = (time_t) (seconds_since_epoch + dt);
                gmx_ctime_r(&finish, timebuf, STRLEN);
                sprintf(buf, "%s", timebuf);
                buf[strlen(buf)-1] = '\0';
                fprintf(out, ", will finish %s", buf);
            }
            else
            {
                fprintf(out, ", remaining wall clock time: %5d s          ", (int)dt);
            }
        }
        else
        {
            fprintf(out, " performance: %.1f ns/day    ",
                    ir->delta_t/1000*24*60*60/time_per_step);
        }
    }

    GMX_UNUSED_VALUE(cr);
}
void print_date_and_time(FILE *fplog, int nodeid, const char *title,
                         double the_time)
{
    char time_string[STRLEN];

    if (!fplog)
    {
        return;
    }

    {
        int    i;
        char   timebuf[STRLEN];
        time_t temp_time = (time_t) the_time;

        gmx_ctime_r(&temp_time, timebuf, STRLEN);
        for (i = 0; timebuf[i] >= ' '; i++)
        {
            time_string[i] = timebuf[i];
        }
        time_string[i] = '\0';
    }

    fprintf(fplog, "%s on rank %d %s\n", title, nodeid, time_string);
}
void print_start(FILE *fplog, const t_commrec *cr,
                 gmx_walltime_accounting_t walltime_accounting,
                 const char *name)
{
    char buf[STRLEN];

    sprintf(buf, "Started %s", name);
    print_date_and_time(fplog, cr->nodeid, buf,
                        walltime_accounting_get_start_time_stamp(walltime_accounting));
}
static void sum_forces(rvec f[], gmx::ArrayRef<const gmx::RVec> forceToAdd)
{
    const int      end = forceToAdd.size();

    // cppcheck-suppress unreadVariable
    int gmx_unused nt  = gmx_omp_nthreads_get(emntDefault);
#pragma omp parallel for num_threads(nt) schedule(static)
    for (int i = 0; i < end; i++)
    {
        rvec_inc(f[i], forceToAdd[i]);
    }
}
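/* A minimal, self-contained sketch of the accumulate-into-destination pattern
 * used by sum_forces() above, written for plain double arrays instead of
 * rvec/ArrayRef. The names here (addForcesExample) are illustrative only and
 * are not part of the GROMACS API; the block is kept under "#if 0" so it does
 * not affect the build.
 */
#if 0
static void addForcesExample(int n, double (*f)[3], const double (*forceToAdd)[3])
{
    /* Each thread handles a contiguous chunk of atoms; the per-atom additions
     * are independent, so no reduction clause or atomics are needed. */
#pragma omp parallel for schedule(static)
    for (int i = 0; i < n; i++)
    {
        f[i][0] += forceToAdd[i][0];
        f[i][1] += forceToAdd[i][1];
        f[i][2] += forceToAdd[i][2];
    }
}
#endif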
static void pme_gpu_reduce_outputs(gmx_wallcycle_t            wcycle,
                                   ForceWithVirial           *forceWithVirial,
                                   ArrayRef<const gmx::RVec>  pmeForces,
                                   gmx_enerdata_t            *enerd,
                                   const tensor               vir_Q,
                                   real                       Vlr_q)
{
    wallcycle_start(wcycle, ewcPME_GPU_F_REDUCTION);
    GMX_ASSERT(forceWithVirial, "Invalid force pointer");
    forceWithVirial->addVirialContribution(vir_Q);
    enerd->term[F_COUL_RECIP] += Vlr_q;
    sum_forces(as_rvec_array(forceWithVirial->force_.data()), pmeForces);
    wallcycle_stop(wcycle, ewcPME_GPU_F_REDUCTION);
}
static void calc_virial(int start, int homenr, rvec x[], rvec f[],
                        tensor vir_part, t_graph *graph, matrix box,
                        t_nrnb *nrnb, const t_forcerec *fr, int ePBC)
{
    /* The short-range virial from surrounding boxes */
    calc_vir(SHIFTS, fr->shift_vec, fr->fshift, vir_part, ePBC == epbcSCREW, box);
    inc_nrnb(nrnb, eNR_VIRIAL, SHIFTS);

    /* Calculate partial virial, for local atoms only, based on short range.
     * Total virial is computed in global_stat, called from do_md
     */
    f_calc_vir(start, start+homenr, x, f, vir_part, graph, box);
    inc_nrnb(nrnb, eNR_VIRIAL, homenr);

    /* Add wall contribution */
    for (int i = 0; i < DIM; i++)
    {
        vir_part[i][ZZ] += fr->vir_wall_z[i];
    }

    if (debug)
    {
        pr_rvecs(debug, 0, "vir_part", vir_part, DIM);
    }
}
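/* For reference, the single-sum virial accumulated above is essentially
 *     Xi(a,b) = -1/2 * sum_i x_i(a) * f_i(b)
 * over the local atoms, plus the shift-vector term for periodic images.
 * Below is a minimal sketch of that inner sum on plain arrays; it is an
 * illustration only (hypothetical names, no shift-vector or graph handling)
 * and is kept under "#if 0" so it does not affect the build.
 */
#if 0
static void partialVirialExample(int n, const double (*x)[3], const double (*f)[3],
                                 double vir[3][3])
{
    for (int i = 0; i < n; i++)
    {
        for (int a = 0; a < 3; a++)
        {
            for (int b = 0; b < 3; b++)
            {
                vir[a][b] += -0.5*x[i][a]*f[i][b];
            }
        }
    }
}
#endif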
static void pull_potential_wrapper(const t_commrec      *cr,
                                   const t_inputrec     *ir,
                                   matrix                box,
                                   ArrayRef<const RVec>  x,
                                   ForceWithVirial      *force,
                                   const t_mdatoms      *mdatoms,
                                   gmx_enerdata_t       *enerd,
                                   real                 *lambda,
                                   double                t,
                                   gmx_wallcycle_t       wcycle)
{
    t_pbc pbc;
    real  dvdl;

    /* Calculate the center of mass forces, this requires communication,
     * which is why pull_potential is called close to other communication.
     */
    wallcycle_start(wcycle, ewcPULLPOT);
    set_pbc(&pbc, ir->ePBC, box);
    dvdl                     = 0;
    enerd->term[F_COM_PULL] +=
        pull_potential(ir->pull_work, mdatoms, &pbc,
                       cr, t, lambda[efptRESTRAINT], as_rvec_array(x.data()), force, &dvdl);
    enerd->dvdl_lin[efptRESTRAINT] += dvdl;
    wallcycle_stop(wcycle, ewcPULLPOT);
}
static void pme_receive_force_ener(const t_commrec *cr,
                                   ForceWithVirial *forceWithVirial,
                                   gmx_enerdata_t  *enerd,
                                   gmx_wallcycle_t  wcycle)
{
    real  e_q, e_lj, dvdl_q, dvdl_lj;
    float cycles_ppdpme, cycles_seppme;

    cycles_ppdpme = wallcycle_stop(wcycle, ewcPPDURINGPME);
    dd_cycles_add(cr->dd, cycles_ppdpme, ddCyclPPduringPME);

    /* In case of node-splitting, the PP nodes receive the long-range
     * forces, virial and energy from the PME nodes here.
     */
    wallcycle_start(wcycle, ewcPP_PMEWAITRECVF);
    dvdl_q  = 0;
    dvdl_lj = 0;
    gmx_pme_receive_f(cr, forceWithVirial, &e_q, &e_lj, &dvdl_q, &dvdl_lj,
                      &cycles_seppme);
    enerd->term[F_COUL_RECIP] += e_q;
    enerd->term[F_LJ_RECIP]   += e_lj;
    enerd->dvdl_lin[efptCOUL] += dvdl_q;
    enerd->dvdl_lin[efptVDW]  += dvdl_lj;

    if (wcycle)
    {
        dd_cycles_add(cr->dd, cycles_seppme, ddCyclPME);
    }
    wallcycle_stop(wcycle, ewcPP_PMEWAITRECVF);
}
static void print_large_forces(FILE            *fp,
                               const t_mdatoms *md,
                               const t_commrec *cr,
                               gmx_int64_t      step,
                               real             forceTolerance,
                               const rvec      *x,
                               const rvec      *f)
{
    real           force2Tolerance = gmx::square(forceTolerance);
    std::uintmax_t numNonFinite    = 0;
    for (int i = 0; i < md->homenr; i++)
    {
        real force2    = norm2(f[i]);
        bool nonFinite = !std::isfinite(force2);
        if (force2 >= force2Tolerance || nonFinite)
        {
            fprintf(fp, "step %" GMX_PRId64 " atom %6d  x %8.3f %8.3f %8.3f  force %12.5e\n",
                    step,
                    ddglatnr(cr->dd, i), x[i][XX], x[i][YY], x[i][ZZ], std::sqrt(force2));
        }
        if (nonFinite)
        {
            numNonFinite++;
        }
    }
    if (numNonFinite > 0)
    {
        /* Note that with MPI this fatal call on one rank might interrupt
         * the printing on other ranks. But we can only avoid that with
         * an expensive MPI barrier that we would need at each step.
         */
        gmx_fatal(FARGS, "At step %" GMX_PRId64 " detected non-finite forces on %ju atoms", step, numNonFinite);
    }
}
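/* A small self-contained sketch of the same screening logic used above:
 * compare squared norms against a squared tolerance (so no square root is
 * needed per atom) and count non-finite entries separately. Hypothetical
 * names; kept under "#if 0" so it does not affect the build.
 */
#if 0
#include <cmath>
#include <cstdint>

static std::uintmax_t countSuspectForces(int n, const double (*f)[3], double forceTolerance)
{
    const double   tol2       = forceTolerance*forceTolerance;
    std::uintmax_t numSuspect = 0;
    for (int i = 0; i < n; i++)
    {
        const double force2 = f[i][0]*f[i][0] + f[i][1]*f[i][1] + f[i][2]*f[i][2];
        if (!std::isfinite(force2) || force2 >= tol2)
        {
            numSuspect++;
        }
    }
    return numSuspect;
}
#endif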
static void post_process_forces(const t_commrec      *cr,
                                gmx_int64_t           step,
                                t_nrnb               *nrnb,
                                gmx_wallcycle_t       wcycle,
                                const gmx_localtop_t *top,
                                matrix                box,
                                rvec                  x[],
                                rvec                  f[],
                                ForceWithVirial      *forceWithVirial,
                                tensor                vir_force,
                                const t_mdatoms      *mdatoms,
                                t_graph              *graph,
                                t_forcerec           *fr,
                                const gmx_vsite_t    *vsite,
                                int                   flags)
{
    if (fr->haveDirectVirialContributions)
    {
        rvec *fDirectVir = as_rvec_array(forceWithVirial->force_.data());

        if (vsite)
        {
            /* Spread the mesh force on virtual sites to the other particles...
             * This is parallellized. MPI communication is performed
             * if the constructing atoms aren't local.
             */
            matrix virial = { { 0 } };
            spread_vsite_f(vsite, x, fDirectVir, nullptr,
                           (flags & GMX_FORCE_VIRIAL), virial,
                           nrnb,
                           &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr, wcycle);
            forceWithVirial->addVirialContribution(virial);
        }

        if (flags & GMX_FORCE_VIRIAL)
        {
            /* Now add the forces, this is local */
            sum_forces(f, forceWithVirial->force_);

            /* Add the direct virial contributions */
            GMX_ASSERT(forceWithVirial->computeVirial_, "forceWithVirial should request virial computation when we request the virial");
            m_add(vir_force, forceWithVirial->getVirial(), vir_force);

            if (debug)
            {
                pr_rvecs(debug, 0, "vir_force", vir_force, DIM);
            }
        }
    }

    if (fr->print_force >= 0)
    {
        print_large_forces(stderr, mdatoms, cr, step, fr->print_force, x, f);
    }
}
static void do_nb_verlet(t_forcerec          *fr,
                         interaction_const_t *ic,
                         gmx_enerdata_t      *enerd,
                         int                  flags,
                         int                  ilocality,
                         int                  clearF,
                         gmx_int64_t          step,
                         t_nrnb              *nrnb,
                         gmx_wallcycle_t      wcycle)
{
    if (!(flags & GMX_FORCE_NONBONDED))
    {
        /* skip non-bonded calculation */
        return;
    }

    nonbonded_verlet_t       *nbv  = fr->nbv;
    nonbonded_verlet_group_t *nbvg = &nbv->grp[ilocality];

    /* GPU kernel launch overhead is already timed separately */
    if (fr->cutoff_scheme != ecutsVERLET)
    {
        gmx_incons("Invalid cut-off scheme passed!");
    }

    bool bUsingGpuKernels = (nbvg->kernel_type == nbnxnk8x8x8_GPU);

    if (!bUsingGpuKernels)
    {
        /* When dynamic pair-list pruning is requested, we need to prune
         * at nstlistPrune steps.
         */
        if (nbv->listParams->useDynamicPruning &&
            (step - nbvg->nbl_lists.outerListCreationStep) % nbv->listParams->nstlistPrune == 0)
        {
            /* Prune the pair-list beyond fr->ic->rlistPrune using
             * the current coordinates of the atoms.
             */
            wallcycle_sub_start(wcycle, ewcsNONBONDED_PRUNING);
            nbnxn_kernel_cpu_prune(nbvg, nbv->nbat, fr->shift_vec, nbv->listParams->rlistInner);
            wallcycle_sub_stop(wcycle, ewcsNONBONDED_PRUNING);
        }

        wallcycle_sub_start(wcycle, ewcsNONBONDED);
    }

    switch (nbvg->kernel_type)
    {
        case nbnxnk4x4_PlainC:
        case nbnxnk4xN_SIMD_4xN:
        case nbnxnk4xN_SIMD_2xNN:
            nbnxn_kernel_cpu(nbvg,
                             nbv->nbat,
                             ic,
                             fr->shift_vec,
                             flags,
                             clearF,
                             fr->fshift[0],
                             enerd->grpp.ener[egCOULSR],
                             fr->bBHAM ?
                             enerd->grpp.ener[egBHAMSR] :
                             enerd->grpp.ener[egLJSR]);
            break;

        case nbnxnk8x8x8_GPU:
            nbnxn_gpu_launch_kernel(nbv->gpu_nbv, nbv->nbat, flags, ilocality);
            break;

        case nbnxnk8x8x8_PlainC:
            nbnxn_kernel_gpu_ref(nbvg->nbl_lists.nbl[0],
                                 nbv->nbat, ic,
                                 fr->shift_vec,
                                 flags,
                                 clearF,
                                 fr->fshift[0],
                                 enerd->grpp.ener[egCOULSR],
                                 fr->bBHAM ?
                                 enerd->grpp.ener[egBHAMSR] :
                                 enerd->grpp.ener[egLJSR]);
            break;

        default:
            GMX_RELEASE_ASSERT(false, "Invalid nonbonded kernel type passed!");
    }
    if (!bUsingGpuKernels)
    {
        wallcycle_sub_stop(wcycle, ewcsNONBONDED);
    }

    int enr_nbnxn_kernel_ljc, enr_nbnxn_kernel_lj;
    if (EEL_RF(ic->eeltype) || ic->eeltype == eelCUT)
    {
        enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_RF;
    }
    else if ((!bUsingGpuKernels && nbvg->ewald_excl == ewaldexclAnalytical) ||
             (bUsingGpuKernels && nbnxn_gpu_is_kernel_ewald_analytical(nbv->gpu_nbv)))
    {
        enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_EWALD;
    }
    else
    {
        enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_TAB;
    }
    enr_nbnxn_kernel_lj = eNR_NBNXN_LJ;
    if (flags & GMX_FORCE_ENERGY)
    {
        /* In eNR_??? the nbnxn F+E kernels are always the F kernel + 1 */
        enr_nbnxn_kernel_ljc += 1;
        enr_nbnxn_kernel_lj  += 1;
    }

    inc_nrnb(nrnb, enr_nbnxn_kernel_ljc,
             nbvg->nbl_lists.natpair_ljq);
    inc_nrnb(nrnb, enr_nbnxn_kernel_lj,
             nbvg->nbl_lists.natpair_lj);
    /* The Coulomb-only kernels are offset -eNR_NBNXN_LJ_RF+eNR_NBNXN_RF */
    inc_nrnb(nrnb, enr_nbnxn_kernel_ljc-eNR_NBNXN_LJ_RF+eNR_NBNXN_RF,
             nbvg->nbl_lists.natpair_q);

    if (ic->vdw_modifier == eintmodFORCESWITCH)
    {
        /* We add up the switch cost separately */
        inc_nrnb(nrnb, eNR_NBNXN_ADD_LJ_FSW+((flags & GMX_FORCE_ENERGY) ? 1 : 0),
                 nbvg->nbl_lists.natpair_ljq + nbvg->nbl_lists.natpair_lj);
    }
    if (ic->vdw_modifier == eintmodPOTSWITCH)
    {
        /* We add up the switch cost separately */
        inc_nrnb(nrnb, eNR_NBNXN_ADD_LJ_PSW+((flags & GMX_FORCE_ENERGY) ? 1 : 0),
                 nbvg->nbl_lists.natpair_ljq + nbvg->nbl_lists.natpair_lj);
    }
    if (ic->vdwtype == evdwPME)
    {
        /* We add up the LJ Ewald cost separately */
        inc_nrnb(nrnb, eNR_NBNXN_ADD_LJ_EWALD+((flags & GMX_FORCE_ENERGY) ? 1 : 0),
                 nbvg->nbl_lists.natpair_ljq + nbvg->nbl_lists.natpair_lj);
    }
}
static void do_nb_verlet_fep(nbnxn_pairlist_set_t *nbl_lists,
                             t_forcerec           *fr,
                             rvec                  x[],
                             rvec                  f[],
                             const t_mdatoms      *mdatoms,
                             t_lambda             *fepvals,
                             real                 *lambda,
                             gmx_enerdata_t       *enerd,
                             int                   flags,
                             t_nrnb               *nrnb,
                             gmx_wallcycle_t       wcycle)
{
    int              donb_flags;
    nb_kernel_data_t kernel_data;
    real             lam_i[efptNR];
    real             dvdl_nb[efptNR];
    int              th;
    int              i, j;

    donb_flags = 0;
    /* Add short-range interactions */
    donb_flags |= GMX_NONBONDED_DO_SR;

    /* Currently all group scheme kernels always calculate (shift-)forces */
    if (flags & GMX_FORCE_FORCES)
    {
        donb_flags |= GMX_NONBONDED_DO_FORCE;
    }
    if (flags & GMX_FORCE_VIRIAL)
    {
        donb_flags |= GMX_NONBONDED_DO_SHIFTFORCE;
    }
    if (flags & GMX_FORCE_ENERGY)
    {
        donb_flags |= GMX_NONBONDED_DO_POTENTIAL;
    }

    kernel_data.flags  = donb_flags;
    kernel_data.lambda = lambda;
    kernel_data.dvdl   = dvdl_nb;

    kernel_data.energygrp_elec = enerd->grpp.ener[egCOULSR];
    kernel_data.energygrp_vdw  = enerd->grpp.ener[egLJSR];

    /* reset free energy components */
    for (i = 0; i < efptNR; i++)
    {
        dvdl_nb[i] = 0;
    }

    GMX_ASSERT(gmx_omp_nthreads_get(emntNonbonded) == nbl_lists->nnbl, "Number of lists should be same as number of NB threads");

    wallcycle_sub_start(wcycle, ewcsNONBONDED);
#pragma omp parallel for schedule(static) num_threads(nbl_lists->nnbl)
    for (th = 0; th < nbl_lists->nnbl; th++)
    {
        try
        {
            gmx_nb_free_energy_kernel(nbl_lists->nbl_fep[th],
                                      x, f, fr, mdatoms, &kernel_data, nrnb);
        }
        GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
    }

    if (fepvals->sc_alpha != 0)
    {
        enerd->dvdl_nonlin[efptVDW]  += dvdl_nb[efptVDW];
        enerd->dvdl_nonlin[efptCOUL] += dvdl_nb[efptCOUL];
    }
    else
    {
        enerd->dvdl_lin[efptVDW]  += dvdl_nb[efptVDW];
        enerd->dvdl_lin[efptCOUL] += dvdl_nb[efptCOUL];
    }

    /* If we do foreign lambda and we have soft-core interactions
     * we have to recalculate the (non-linear) energies contributions.
     */
    if (fepvals->n_lambda > 0 && (flags & GMX_FORCE_DHDL) && fepvals->sc_alpha != 0)
    {
        kernel_data.flags          = (donb_flags & ~(GMX_NONBONDED_DO_FORCE | GMX_NONBONDED_DO_SHIFTFORCE)) | GMX_NONBONDED_DO_FOREIGNLAMBDA;
        kernel_data.lambda         = lam_i;
        kernel_data.energygrp_elec = enerd->foreign_grpp.ener[egCOULSR];
        kernel_data.energygrp_vdw  = enerd->foreign_grpp.ener[egLJSR];
        /* Note that we add to kernel_data.dvdl, but ignore the result */

        for (i = 0; i < enerd->n_lambda; i++)
        {
            for (j = 0; j < efptNR; j++)
            {
                lam_i[j] = (i == 0 ? lambda[j] : fepvals->all_lambda[j][i-1]);
            }
            reset_foreign_enerdata(enerd);
#pragma omp parallel for schedule(static) num_threads(nbl_lists->nnbl)
            for (th = 0; th < nbl_lists->nnbl; th++)
            {
                try
                {
                    gmx_nb_free_energy_kernel(nbl_lists->nbl_fep[th],
                                              x, f, fr, mdatoms, &kernel_data, nrnb);
                }
                GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
            }

            sum_epot(&(enerd->foreign_grpp), enerd->foreign_term);
            enerd->enerpart_lambda[i] += enerd->foreign_term[F_EPOT];
        }
    }

    wallcycle_sub_stop(wcycle, ewcsNONBONDED);
}
gmx_bool use_GPU(const nonbonded_verlet_t *nbv)
{
    return nbv != nullptr && nbv->bUseGPU;
}
static inline void clear_rvecs_omp(int n, rvec v[])
{
    int nth = gmx_omp_nthreads_get_simple_rvec_task(emntDefault, n);

    /* Note that we would like to avoid this conditional by putting it
     * into the omp pragma instead, but then we still take the full
     * omp parallel for overhead (at least with gcc5).
     */
    if (nth == 1)
    {
        for (int i = 0; i < n; i++)
        {
            clear_rvec(v[i]);
        }
    }
    else
    {
#pragma omp parallel for num_threads(nth) schedule(static)
        for (int i = 0; i < n; i++)
        {
            clear_rvec(v[i]);
        }
    }
}
/*! \brief Return an estimate of the average kinetic energy or 0 when unreliable
 *
 * \param groupOptions  Group options, containing T-coupling options
 */
static real averageKineticEnergyEstimate(const t_grpopts &groupOptions)
{
    real nrdfCoupled   = 0;
    real nrdfUncoupled = 0;
    real kineticEnergy = 0;
    for (int g = 0; g < groupOptions.ngtc; g++)
    {
        if (groupOptions.tau_t[g] >= 0)
        {
            nrdfCoupled   += groupOptions.nrdf[g];
            kineticEnergy += groupOptions.nrdf[g]*0.5*groupOptions.ref_t[g]*BOLTZ;
        }
        else
        {
            nrdfUncoupled += groupOptions.nrdf[g];
        }
    }

    /* This conditional with > also catches nrdf=0 */
    if (nrdfCoupled > nrdfUncoupled)
    {
        return kineticEnergy*(nrdfCoupled + nrdfUncoupled)/nrdfCoupled;
    }
    else
    {
        return 0;
    }
}
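/* The estimate above is the equipartition value: each temperature-coupled
 * degree of freedom contributes (1/2)*k_B*T_ref, so a group g contributes
 * 0.5*ndf_g*k_B*T_g, and the sum is rescaled by
 * (ndf_coupled + ndf_uncoupled)/ndf_coupled to account for uncoupled groups.
 * Worked example (illustrative numbers only): 1e5 coupled degrees of freedom
 * at 300 K give roughly 0.5 * 1e5 * 0.0083145 kJ/(mol K) * 300 K ~ 1.25e5 kJ/mol.
 */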
/*! \brief This routine checks that the potential energy is finite.
 *
 * Always checks that the potential energy is finite. If step equals
 * inputrec.init_step also checks that the magnitude of the potential energy
 * is reasonable. Terminates with a fatal error when a check fails.
 * Note that passing this check does not guarantee finite forces,
 * since those use slightly different arithmetics. But in most cases
 * there is just a narrow coordinate range where forces are not finite
 * and energies are finite.
 *
 * \param[in] step      The step number, used for checking and printing
 * \param[in] enerd     The energy data; the non-bonded group energies need to be added to enerd.term[F_EPOT] before calling this routine
 * \param[in] inputrec  The input record
 */
static void checkPotentialEnergyValidity(gmx_int64_t           step,
                                         const gmx_enerdata_t &enerd,
                                         const t_inputrec     &inputrec)
{
    /* Threshold valid for comparing absolute potential energy against
     * the kinetic energy. Normally one should not consider absolute
     * potential energy values, but with a factor of one million
     * we should never get false positives.
     */
    constexpr real c_thresholdFactor = 1e6;

    bool           energyIsNotFinite    = !std::isfinite(enerd.term[F_EPOT]);
    real           averageKineticEnergy = 0;
    /* We only check for large potential energy at the initial step,
     * because that is by far the most likely step for this to occur
     * and because computing the average kinetic energy is not free.
     * Note: nstcalcenergy >> 1 often does not allow to catch large energies
     * before they become NaN.
     */
    if (step == inputrec.init_step && EI_DYNAMICS(inputrec.eI))
    {
        averageKineticEnergy = averageKineticEnergyEstimate(inputrec.opts);
    }

    if (energyIsNotFinite || (averageKineticEnergy > 0 &&
                              enerd.term[F_EPOT] > c_thresholdFactor*averageKineticEnergy))
    {
        gmx_fatal(FARGS, "Step %" GMX_PRId64 ": The total potential energy is %g, which is %s. The LJ and electrostatic contributions to the energy are %g and %g, respectively. A %s potential energy can be caused by overlapping interactions in bonded interactions or very large%s coordinate values. Usually this is caused by a badly- or non-equilibrated initial configuration, incorrect interactions or parameters in the topology.",
                  step,
                  enerd.term[F_EPOT],
                  energyIsNotFinite ? "not finite" : "extremely high",
                  enerd.term[F_LJ],
                  enerd.term[F_COUL_SR],
                  energyIsNotFinite ? "non-finite" : "very high",
                  energyIsNotFinite ? " or NaN" : "");
    }
}
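/* Worked example of the threshold above (illustrative numbers only): with the
 * ~1.25e5 kJ/mol average kinetic energy estimated for 1e5 coupled degrees of
 * freedom at 300 K, the c_thresholdFactor of 1e6 means this check only fires
 * when the total potential energy exceeds roughly 1.25e11 kJ/mol (or is not
 * finite), so physically reasonable configurations should never trip it.
 */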
/*! \brief Compute forces and/or energies for special algorithms
 *
 * The intention is to collect all calls to algorithms that compute
 * forces on local atoms only and that do not contribute to the local
 * virial sum (but add their virial contribution separately).
 * Eventually these should likely all become ForceProviders.
 * Within this function the intention is to have algorithms that do
 * global communication at the end, so global barriers within the MD loop
 * are as close together as possible.
 *
 * \param[in]     fplog            The log file
 * \param[in]     cr               The communication record
 * \param[in]     inputrec         The input record
 * \param[in]     step             The current MD step
 * \param[in]     t                The current time
 * \param[in,out] wcycle           Wallcycle accounting struct
 * \param[in,out] forceProviders   Pointer to a list of force providers
 * \param[in]     box              The unit cell
 * \param[in]     x                The coordinates
 * \param[in]     mdatoms          Per atom properties
 * \param[in]     lambda           Array of free-energy lambda values
 * \param[in]     forceFlags       Flags that tell whether we should compute forces/energies/virial
 * \param[in,out] forceWithVirial  Force and virial buffers
 * \param[in,out] enerd            Energy buffer
 * \param[in,out] ed               Essential dynamics pointer
 * \param[in]     bNS              Tells if we did neighbor searching this step, used for ED sampling
 *
 * \todo Remove bNS, which is used incorrectly.
 * \todo Convert all other algorithms called here to ForceProviders.
 */
static void
computeSpecialForces(FILE                 *fplog,
                     const t_commrec      *cr,
                     const t_inputrec     *inputrec,
                     gmx_int64_t           step,
                     double                t,
                     gmx_wallcycle_t       wcycle,
                     ForceProviders       *forceProviders,
                     matrix                box,
                     ArrayRef<const RVec>  x,
                     const t_mdatoms      *mdatoms,
                     real                 *lambda,
                     int                   forceFlags,
                     ForceWithVirial      *forceWithVirial,
                     gmx_enerdata_t       *enerd,
                     const gmx_edsam      *ed,
                     gmx_bool              bNS)
{
    const bool computeForces = (forceFlags & GMX_FORCE_FORCES);

    /* NOTE: Currently all ForceProviders only provide forces.
     *       When they also provide energies, remove this conditional.
     */
    if (computeForces)
    {
        /* Collect forces from modules */
        forceProviders->calculateForces(cr, mdatoms, box, t, x, forceWithVirial);
    }

    if (inputrec->bPull && pull_have_potential(inputrec->pull_work))
    {
        pull_potential_wrapper(cr, inputrec, box, x,
                               forceWithVirial,
                               mdatoms, enerd, lambda, t,
                               wcycle);

        if (inputrec->bDoAwh)
        {
            Awh &awh = *inputrec->awh;
            enerd->term[F_COM_PULL] +=
                awh.applyBiasForcesAndUpdateBias(inputrec->ePBC, *mdatoms, box,
                                                 forceWithVirial,
                                                 t, step, wcycle, fplog);
        }
    }

    rvec *f = as_rvec_array(forceWithVirial->force_.data());

    /* Add the forces from enforced rotation potentials (if any) */
    if (inputrec->bRot)
    {
        wallcycle_start(wcycle, ewcROTadd);
        enerd->term[F_COM_PULL] += add_rot_forces(inputrec->rot, f, cr, step, t);
        wallcycle_stop(wcycle, ewcROTadd);
    }

    if (ed)
    {
        /* Note that since init_edsam() is called after the initialization
         * of forcerec, edsam doesn't request the noVirSum force buffer.
         * Thus if no other algorithm (e.g. PME) requires it, the forces
         * here will contribute to the virial.
         */
        do_flood(cr, inputrec, as_rvec_array(x.data()), f, ed, box, step, bNS);
    }

    /* Add forces from interactive molecular dynamics (IMD), if bIMD == TRUE. */
    if (inputrec->bIMD && computeForces)
    {
        IMD_apply_forces(inputrec->bIMD, inputrec->imd, cr, f, wcycle);
    }
}
/*! \brief Launch the prepare_step and spread stages of PME GPU.
 *
 * \param[in]  pmedata  The PME structure
 * \param[in]  box      The box matrix
 * \param[in]  x        Coordinate array
 * \param[in]  flags    Force flags
 * \param[in]  wcycle   The wallcycle structure
 */
static inline void launchPmeGpuSpread(gmx_pme_t      *pmedata,
                                      matrix          box,
                                      rvec            x[],
                                      int             flags,
                                      gmx_wallcycle_t wcycle)
{
    int pmeFlags = GMX_PME_SPREAD | GMX_PME_SOLVE;
    pmeFlags |= (flags & GMX_FORCE_FORCES) ? GMX_PME_CALC_F : 0;
    pmeFlags |= (flags & GMX_FORCE_VIRIAL) ? GMX_PME_CALC_ENER_VIR : 0;

    pme_gpu_prepare_computation(pmedata, flags & GMX_FORCE_DYNAMICBOX, box, wcycle, pmeFlags);
    pme_gpu_launch_spread(pmedata, x, wcycle);
}
/*! \brief Launch the FFT and gather stages of PME GPU
 *
 * This function only implements setting the output forces (no accumulation).
 *
 * \param[in]  pmedata  The PME structure
 * \param[in]  wcycle   The wallcycle structure
 */
static void launchPmeGpuFftAndGather(gmx_pme_t       *pmedata,
                                     gmx_wallcycle_t  wcycle)
{
    pme_gpu_launch_complex_transforms(pmedata, wcycle);
    pme_gpu_launch_gather(pmedata, wcycle, PmeForceOutputHandling::Set);
}
/*! \brief
 *  Polling wait for either of the PME or nonbonded GPU tasks.
 *
 * Instead of a static order in waiting for GPU tasks, this function
 * polls checking which of the two tasks completes first, and does the
 * associated force buffer reduction overlapped with the other task.
 * By doing that, unlike static scheduling order, it can always overlap
 * one of the reductions, regardless of the GPU task completion order.
 *
 * \param[in]     nbv              Nonbonded verlet structure
 * \param[in]     pmedata          PME module data
 * \param[in,out] force            Force array to reduce task outputs into.
 * \param[in,out] forceWithVirial  Force and virial buffers
 * \param[in,out] fshift           Shift force output vector results are reduced into
 * \param[in,out] enerd            Energy data structure results are reduced into
 * \param[in]     flags            Force flags
 * \param[in]     wcycle           The wallcycle structure
 */
static void alternatePmeNbGpuWaitReduce(nonbonded_verlet_t             *nbv,
                                        const gmx_pme_t                *pmedata,
                                        gmx::PaddedArrayRef<gmx::RVec> *force,
                                        ForceWithVirial                *forceWithVirial,
                                        rvec                            fshift[],
                                        gmx_enerdata_t                 *enerd,
                                        int                             flags,
                                        gmx_wallcycle_t                 wcycle)
{
    bool isPmeGpuDone = false;
    bool isNbGpuDone  = false;

    gmx::ArrayRef<const gmx::RVec> pmeGpuForces;

    while (!isPmeGpuDone || !isNbGpuDone)
    {
        if (!isPmeGpuDone)
        {
            matrix            vir_Q;
            real              Vlr_q;

            GpuTaskCompletion completionType = (isNbGpuDone) ? GpuTaskCompletion::Wait : GpuTaskCompletion::Check;
            isPmeGpuDone = pme_gpu_try_finish_task(pmedata, wcycle, &pmeGpuForces,
                                                   vir_Q, &Vlr_q, completionType);

            if (isPmeGpuDone)
            {
                pme_gpu_reduce_outputs(wcycle, forceWithVirial, pmeGpuForces,
                                       enerd, vir_Q, Vlr_q);
            }
        }

        if (!isNbGpuDone)
        {
            GpuTaskCompletion completionType = (isPmeGpuDone) ? GpuTaskCompletion::Wait : GpuTaskCompletion::Check;
            wallcycle_start_nocount(wcycle, ewcWAIT_GPU_NB_L);
            isNbGpuDone = nbnxn_gpu_try_finish_task(nbv->gpu_nbv,
                                                    flags, eatLocal,
                                                    enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
                                                    fshift, completionType);
            wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
            // To get the call count right, when the task finished we
            // issue a start/stop.
            // TODO: move the ewcWAIT_GPU_NB_L cycle counting into nbnxn_gpu_try_finish_task()
            // and ewcNB_XF_BUF_OPS counting into nbnxn_atomdata_add_nbat_f_to_f().
            if (isNbGpuDone)
            {
                wallcycle_start(wcycle, ewcWAIT_GPU_NB_L);
                wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);

                nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs, eatLocal,
                                               nbv->nbat, as_rvec_array(force->data()), wcycle);
            }
        }
    }
}
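/* The structure of the polling loop above, reduced to its essentials: two
 * asynchronous tasks are checked in turn, each completed task is reduced
 * immediately, and the last outstanding task is waited on instead of polled.
 * This is a generic sketch with hypothetical task/reduce types, not the
 * GROMACS API; kept under "#if 0" so it does not affect the build.
 */
#if 0
template<typename Task, typename Reduce>
static void alternatingWaitReduce(Task &taskA, Reduce reduceA,
                                  Task &taskB, Reduce reduceB)
{
    bool doneA = false;
    bool doneB = false;
    while (!doneA || !doneB)
    {
        if (!doneA)
        {
            /* Block only if the other task has already finished */
            doneA = taskA.tryFinish(/* blocking = */ doneB);
            if (doneA)
            {
                reduceA();
            }
        }
        if (!doneB)
        {
            doneB = taskB.tryFinish(/* blocking = */ doneA);
            if (doneB)
            {
                reduceB();
            }
        }
    }
}
#endif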
/*! \brief
 * Launch the dynamic rolling pruning GPU task.
 *
 * We currently alternate local/non-local list pruning in odd-even steps
 * (only pruning every second step without DD).
 *
 * \param[in]  cr        The communication record
 * \param[in]  nbv       Nonbonded verlet structure
 * \param[in]  inputrec  The input record
 * \param[in]  step      The current MD step
 */
static inline void launchGpuRollingPruning(const t_commrec          *cr,
                                           const nonbonded_verlet_t *nbv,
                                           const t_inputrec         *inputrec,
                                           const gmx_int64_t         step)
{
    /* We should not launch the rolling pruning kernel at a search
     * step or just before search steps, since that's useless.
     * Without domain decomposition we prune at even steps.
     * With domain decomposition we alternate local and non-local
     * pruning at even and odd steps.
     */
    int  numRollingParts     = nbv->listParams->numRollingParts;
    GMX_ASSERT(numRollingParts == nbv->listParams->nstlistPrune/2, "Since we alternate local/non-local at even/odd steps, we need numRollingParts<=nstlistPrune/2 for correctness and == for efficiency");
    int  stepWithCurrentList = step - nbv->grp[eintLocal].nbl_lists.outerListCreationStep;
    bool stepIsEven          = ((stepWithCurrentList & 1) == 0);
    if (stepWithCurrentList > 0 &&
        stepWithCurrentList < inputrec->nstlist - 1 &&
        (stepIsEven || DOMAINDECOMP(cr)))
    {
        nbnxn_gpu_launch_kernel_pruneonly(nbv->gpu_nbv,
                                          stepIsEven ? eintLocal : eintNonlocal,
                                          numRollingParts);
    }
}
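/* Example of the scheduling arithmetic above (illustrative numbers only):
 * with nstlist = 20 and nstlistPrune = 10, numRollingParts is 5, and for a
 * pair list created at step 1000 the pruning kernel may be launched at steps
 * 1001-1018 only; even offsets prune the local list and odd offsets the
 * non-local list (the latter only with domain decomposition).
 */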
static void do_force_cutsVERLET(FILE *fplog,
                                const t_commrec *cr,
                                const gmx_multisim_t *ms,
                                const t_inputrec *inputrec,
                                gmx_int64_t step,
                                t_nrnb *nrnb,
                                gmx_wallcycle_t wcycle,
                                const gmx_localtop_t *top,
                                const gmx_groups_t * /* groups */,
                                matrix box, gmx::PaddedArrayRef<gmx::RVec> x,
                                history_t *hist,
                                gmx::PaddedArrayRef<gmx::RVec> force,
                                tensor vir_force,
                                const t_mdatoms *mdatoms,
                                gmx_enerdata_t *enerd, t_fcdata *fcd,
                                real *lambda,
                                t_graph *graph,
                                t_forcerec *fr,
                                interaction_const_t *ic,
                                const gmx_vsite_t *vsite,
                                rvec mu_tot,
                                double t,
                                const gmx_edsam *ed,
                                int flags,
                                DdOpenBalanceRegionBeforeForceComputation ddOpenBalanceRegion,
                                DdCloseBalanceRegionAfterForceComputation ddCloseBalanceRegion)
{
    int                 cg1, i, j;
    double              mu[2*DIM];
    gmx_bool            bStateChanged, bNS, bFillGrid, bCalcCGCM;
    gmx_bool            bDoForces, bUseGPU, bUseOrEmulGPU;
    rvec                vzero, box_diag;
    float               cycles_pme, cycles_wait_gpu;
    nonbonded_verlet_t *nbv = fr->nbv;

    bStateChanged = (flags & GMX_FORCE_STATECHANGED);
    bNS           = (flags & GMX_FORCE_NS) && (fr->bAllvsAll == FALSE);
    bFillGrid     = (bNS && bStateChanged);
    bCalcCGCM     = (bFillGrid && !DOMAINDECOMP(cr));
    bDoForces     = (flags & GMX_FORCE_FORCES);
    bUseGPU       = fr->nbv->bUseGPU;
    bUseOrEmulGPU = bUseGPU || (fr->nbv->emulateGpu == EmulateGpuNonbonded::Yes);

    const auto pmeRunMode = fr->pmedata ? pme_run_mode(fr->pmedata) : PmeRunMode::CPU;
    // TODO slim this conditional down - inputrec and duty checks should mean the same in proper code!
    const bool useGpuPme  = EEL_PME(fr->ic->eeltype) && thisRankHasDuty(cr, DUTY_PME) &&
        ((pmeRunMode == PmeRunMode::GPU) || (pmeRunMode == PmeRunMode::Mixed));
1093 /* At a search step we need to start the first balancing region
1094 * somewhere early inside the step after communication during domain
1095 * decomposition (and not during the previous step as usual).
1098 ddOpenBalanceRegion
== DdOpenBalanceRegionBeforeForceComputation::yes
)
1100 ddOpenBalanceRegionCpu(cr
->dd
, DdAllowBalanceRegionReopen::yes
);
1103 cycles_wait_gpu
= 0;
1105 const int start
= 0;
1106 const int homenr
= mdatoms
->homenr
;
1108 clear_mat(vir_force
);
1110 if (DOMAINDECOMP(cr
))
1112 cg1
= cr
->dd
->ncg_tot
;
1125 update_forcerec(fr
, box
);
1127 if (inputrecNeedMutot(inputrec
))
1129 /* Calculate total (local) dipole moment in a temporary common array.
1130 * This makes it possible to sum them over nodes faster.
1132 calc_mu(start
, homenr
,
1133 x
, mdatoms
->chargeA
, mdatoms
->chargeB
, mdatoms
->nChargePerturbed
,
1138 if (fr
->ePBC
!= epbcNONE
)
1140 /* Compute shift vectors every step,
1141 * because of pressure coupling or box deformation!
1143 if ((flags
& GMX_FORCE_DYNAMICBOX
) && bStateChanged
)
1145 calc_shifts(box
, fr
->shift_vec
);
1150 put_atoms_in_box_omp(fr
->ePBC
, box
, x
.subArray(0, homenr
));
1151 inc_nrnb(nrnb
, eNR_SHIFTX
, homenr
);
1153 else if (EI_ENERGY_MINIMIZATION(inputrec
->eI
) && graph
)
1155 unshift_self(graph
, box
, as_rvec_array(x
.data()));
1159 nbnxn_atomdata_copy_shiftvec(flags
& GMX_FORCE_DYNAMICBOX
,
1160 fr
->shift_vec
, nbv
->nbat
);
1163 if (!thisRankHasDuty(cr
, DUTY_PME
))
1165 /* Send particle coordinates to the pme nodes.
1166 * Since this is only implemented for domain decomposition
1167 * and domain decomposition does not use the graph,
1168 * we do not need to worry about shifting.
1170 gmx_pme_send_coordinates(cr
, box
, as_rvec_array(x
.data()),
1171 lambda
[efptCOUL
], lambda
[efptVDW
],
1172 (flags
& (GMX_FORCE_VIRIAL
| GMX_FORCE_ENERGY
)),
1175 #endif /* GMX_MPI */
1179 launchPmeGpuSpread(fr
->pmedata
, box
, as_rvec_array(x
.data()), flags
, wcycle
);
1182 /* do gridding for pair search */
1185 if (graph
&& bStateChanged
)
1187 /* Calculate intramolecular shift vectors to make molecules whole */
1188 mk_mshift(fplog
, graph
, fr
->ePBC
, box
, as_rvec_array(x
.data()));
1192 box_diag
[XX
] = box
[XX
][XX
];
1193 box_diag
[YY
] = box
[YY
][YY
];
1194 box_diag
[ZZ
] = box
[ZZ
][ZZ
];
1196 wallcycle_start(wcycle
, ewcNS
);
1199 wallcycle_sub_start(wcycle
, ewcsNBS_GRID_LOCAL
);
1200 nbnxn_put_on_grid(nbv
->nbs
, fr
->ePBC
, box
,
1202 0, mdatoms
->homenr
, -1, fr
->cginfo
, as_rvec_array(x
.data()),
1204 nbv
->grp
[eintLocal
].kernel_type
,
1206 wallcycle_sub_stop(wcycle
, ewcsNBS_GRID_LOCAL
);
1210 wallcycle_sub_start(wcycle
, ewcsNBS_GRID_NONLOCAL
);
1211 nbnxn_put_on_grid_nonlocal(nbv
->nbs
, domdec_zones(cr
->dd
),
1212 fr
->cginfo
, as_rvec_array(x
.data()),
1213 nbv
->grp
[eintNonlocal
].kernel_type
,
1215 wallcycle_sub_stop(wcycle
, ewcsNBS_GRID_NONLOCAL
);
1218 nbnxn_atomdata_set(nbv
->nbat
, nbv
->nbs
, mdatoms
, fr
->cginfo
);
1219 wallcycle_stop(wcycle
, ewcNS
);
1222 /* initialize the GPU atom data and copy shift vector */
1225 wallcycle_start_nocount(wcycle
, ewcLAUNCH_GPU
);
1226 wallcycle_sub_start_nocount(wcycle
, ewcsLAUNCH_GPU_NONBONDED
);
1230 nbnxn_gpu_init_atomdata(nbv
->gpu_nbv
, nbv
->nbat
);
1233 nbnxn_gpu_upload_shiftvec(nbv
->gpu_nbv
, nbv
->nbat
);
1235 wallcycle_sub_stop(wcycle
, ewcsLAUNCH_GPU_NONBONDED
);
1236 wallcycle_stop(wcycle
, ewcLAUNCH_GPU
);
1239 /* do local pair search */
1242 wallcycle_start_nocount(wcycle
, ewcNS
);
1243 wallcycle_sub_start(wcycle
, ewcsNBS_SEARCH_LOCAL
);
1244 nbnxn_make_pairlist(nbv
->nbs
, nbv
->nbat
,
1246 nbv
->listParams
->rlistOuter
,
1247 nbv
->min_ci_balanced
,
1248 &nbv
->grp
[eintLocal
].nbl_lists
,
1250 nbv
->grp
[eintLocal
].kernel_type
,
1252 nbv
->grp
[eintLocal
].nbl_lists
.outerListCreationStep
= step
;
1253 if (nbv
->listParams
->useDynamicPruning
&& !bUseGPU
)
1255 nbnxnPrepareListForDynamicPruning(&nbv
->grp
[eintLocal
].nbl_lists
);
1257 wallcycle_sub_stop(wcycle
, ewcsNBS_SEARCH_LOCAL
);
1261 /* initialize local pair-list on the GPU */
1262 nbnxn_gpu_init_pairlist(nbv
->gpu_nbv
,
1263 nbv
->grp
[eintLocal
].nbl_lists
.nbl
[0],
1266 wallcycle_stop(wcycle
, ewcNS
);
1270 nbnxn_atomdata_copy_x_to_nbat_x(nbv
->nbs
, eatLocal
, FALSE
, as_rvec_array(x
.data()),
1276 if (DOMAINDECOMP(cr
))
1278 ddOpenBalanceRegionGpu(cr
->dd
);
1281 wallcycle_start(wcycle
, ewcLAUNCH_GPU
);
1282 wallcycle_sub_start(wcycle
, ewcsLAUNCH_GPU_NONBONDED
);
1283 /* launch local nonbonded work on GPU */
1284 do_nb_verlet(fr
, ic
, enerd
, flags
, eintLocal
, enbvClearFNo
,
1285 step
, nrnb
, wcycle
);
1286 wallcycle_sub_stop(wcycle
, ewcsLAUNCH_GPU_NONBONDED
);
1287 wallcycle_stop(wcycle
, ewcLAUNCH_GPU
);
1292 // In PME GPU and mixed mode we launch FFT / gather after the
1293 // X copy/transform to allow overlap as well as after the GPU NB
1294 // launch to avoid FFT launch overhead hijacking the CPU and delaying
1295 // the nonbonded kernel.
1296 launchPmeGpuFftAndGather(fr
->pmedata
, wcycle
);
1299 /* Communicate coordinates and sum dipole if necessary +
1300 do non-local pair search */
1301 if (DOMAINDECOMP(cr
))
1305 wallcycle_start_nocount(wcycle
, ewcNS
);
1306 wallcycle_sub_start(wcycle
, ewcsNBS_SEARCH_NONLOCAL
);
1308 nbnxn_make_pairlist(nbv
->nbs
, nbv
->nbat
,
1310 nbv
->listParams
->rlistOuter
,
1311 nbv
->min_ci_balanced
,
1312 &nbv
->grp
[eintNonlocal
].nbl_lists
,
1314 nbv
->grp
[eintNonlocal
].kernel_type
,
1316 nbv
->grp
[eintNonlocal
].nbl_lists
.outerListCreationStep
= step
;
1317 if (nbv
->listParams
->useDynamicPruning
&& !bUseGPU
)
1319 nbnxnPrepareListForDynamicPruning(&nbv
->grp
[eintNonlocal
].nbl_lists
);
1321 wallcycle_sub_stop(wcycle
, ewcsNBS_SEARCH_NONLOCAL
);
1323 if (nbv
->grp
[eintNonlocal
].kernel_type
== nbnxnk8x8x8_GPU
)
1325 /* initialize non-local pair-list on the GPU */
1326 nbnxn_gpu_init_pairlist(nbv
->gpu_nbv
,
1327 nbv
->grp
[eintNonlocal
].nbl_lists
.nbl
[0],
1330 wallcycle_stop(wcycle
, ewcNS
);
1334 dd_move_x(cr
->dd
, box
, as_rvec_array(x
.data()), wcycle
);
1336 nbnxn_atomdata_copy_x_to_nbat_x(nbv
->nbs
, eatNonlocal
, FALSE
, as_rvec_array(x
.data()),
1342 wallcycle_start(wcycle
, ewcLAUNCH_GPU
);
1343 wallcycle_sub_start(wcycle
, ewcsLAUNCH_GPU_NONBONDED
);
1344 /* launch non-local nonbonded tasks on GPU */
1345 do_nb_verlet(fr
, ic
, enerd
, flags
, eintNonlocal
, enbvClearFNo
,
1346 step
, nrnb
, wcycle
);
1347 wallcycle_sub_stop(wcycle
, ewcsLAUNCH_GPU_NONBONDED
);
1348 wallcycle_stop(wcycle
, ewcLAUNCH_GPU
);
1354 /* launch D2H copy-back F */
1355 wallcycle_start_nocount(wcycle
, ewcLAUNCH_GPU
);
1356 wallcycle_sub_start_nocount(wcycle
, ewcsLAUNCH_GPU_NONBONDED
);
1357 if (DOMAINDECOMP(cr
))
1359 nbnxn_gpu_launch_cpyback(nbv
->gpu_nbv
, nbv
->nbat
,
1360 flags
, eatNonlocal
);
1362 nbnxn_gpu_launch_cpyback(nbv
->gpu_nbv
, nbv
->nbat
,
1364 wallcycle_sub_stop(wcycle
, ewcsLAUNCH_GPU_NONBONDED
);
1365 wallcycle_stop(wcycle
, ewcLAUNCH_GPU
);
1368 if (bStateChanged
&& inputrecNeedMutot(inputrec
))
1372 gmx_sumd(2*DIM
, mu
, cr
);
1373 ddReopenBalanceRegionCpu(cr
->dd
);
1376 for (i
= 0; i
< 2; i
++)
1378 for (j
= 0; j
< DIM
; j
++)
1380 fr
->mu_tot
[i
][j
] = mu
[i
*DIM
+ j
];
1384 if (fr
->efep
== efepNO
)
1386 copy_rvec(fr
->mu_tot
[0], mu_tot
);
1390 for (j
= 0; j
< DIM
; j
++)
1393 (1.0 - lambda
[efptCOUL
])*fr
->mu_tot
[0][j
] +
1394 lambda
[efptCOUL
]*fr
->mu_tot
[1][j
];
1398 /* Reset energies */
1399 reset_enerdata(enerd
);
1400 clear_rvecs(SHIFTS
, fr
->fshift
);
1402 if (DOMAINDECOMP(cr
) && !thisRankHasDuty(cr
, DUTY_PME
))
1404 wallcycle_start(wcycle
, ewcPPDURINGPME
);
1405 dd_force_flop_start(cr
->dd
, nrnb
);
1410 wallcycle_start(wcycle
, ewcROT
);
1411 do_rotation(cr
, inputrec
, box
, as_rvec_array(x
.data()), t
, step
, bNS
);
1412 wallcycle_stop(wcycle
, ewcROT
);
    /* Temporary solution until all routines take PaddedRVecVector */
    rvec *f = as_rvec_array(force.data());

    /* Start the force cycle counter.
     * Note that a different counter is used for dynamic load balancing.
     */
    wallcycle_start(wcycle, ewcFORCE);

    gmx::ArrayRef<gmx::RVec> forceRef = force;
    if (bDoForces)
    {
        /* If we need to compute the virial, we might need a separate
         * force buffer for algorithms for which the virial is calculated
         * directly, such as PME.
         */
        if ((flags & GMX_FORCE_VIRIAL) && fr->haveDirectVirialContributions)
        {
            forceRef = *fr->forceBufferForDirectVirialContributions;

            /* We only compute forces on local atoms. Note that vsites can
             * spread to non-local atoms, but that part of the buffer is
             * cleared separately in the vsite spreading code.
             */
            clear_rvecs_omp(mdatoms->homenr, as_rvec_array(forceRef.data()));
        }
        /* Clear the short- and long-range forces */
        clear_rvecs_omp(fr->natoms_force_constr, f);
    }

    /* forceWithVirial uses the local atom range only */
    ForceWithVirial forceWithVirial(forceRef, flags & GMX_FORCE_VIRIAL);
1447 if (inputrec
->bPull
&& pull_have_constraint(inputrec
->pull_work
))
1449 clear_pull_forces(inputrec
->pull_work
);
1452 /* We calculate the non-bonded forces, when done on the CPU, here.
1453 * We do this before calling do_force_lowlevel, because in that
1454 * function, the listed forces are calculated before PME, which
1455 * does communication. With this order, non-bonded and listed
1456 * force calculation imbalance can be balanced out by the domain
1457 * decomposition load balancing.
1462 do_nb_verlet(fr
, ic
, enerd
, flags
, eintLocal
, enbvClearFYes
,
1463 step
, nrnb
, wcycle
);
1466 if (fr
->efep
!= efepNO
)
1468 /* Calculate the local and non-local free energy interactions here.
1469 * Happens here on the CPU both with and without GPU.
1471 if (fr
->nbv
->grp
[eintLocal
].nbl_lists
.nbl_fep
[0]->nrj
> 0)
1473 do_nb_verlet_fep(&fr
->nbv
->grp
[eintLocal
].nbl_lists
,
1474 fr
, as_rvec_array(x
.data()), f
, mdatoms
,
1475 inputrec
->fepvals
, lambda
,
1476 enerd
, flags
, nrnb
, wcycle
);
1479 if (DOMAINDECOMP(cr
) &&
1480 fr
->nbv
->grp
[eintNonlocal
].nbl_lists
.nbl_fep
[0]->nrj
> 0)
1482 do_nb_verlet_fep(&fr
->nbv
->grp
[eintNonlocal
].nbl_lists
,
1483 fr
, as_rvec_array(x
.data()), f
, mdatoms
,
1484 inputrec
->fepvals
, lambda
,
1485 enerd
, flags
, nrnb
, wcycle
);
1493 if (DOMAINDECOMP(cr
))
1495 do_nb_verlet(fr
, ic
, enerd
, flags
, eintNonlocal
, enbvClearFNo
,
1496 step
, nrnb
, wcycle
);
1505 aloc
= eintNonlocal
;
1508 /* Add all the non-bonded force to the normal force array.
1509 * This can be split into a local and a non-local part when overlapping
1510 * communication with calculation with domain decomposition.
1512 wallcycle_stop(wcycle
, ewcFORCE
);
1514 nbnxn_atomdata_add_nbat_f_to_f(nbv
->nbs
, eatAll
, nbv
->nbat
, f
, wcycle
);
1516 wallcycle_start_nocount(wcycle
, ewcFORCE
);
1518 /* if there are multiple fshift output buffers reduce them */
1519 if ((flags
& GMX_FORCE_VIRIAL
) &&
1520 nbv
->grp
[aloc
].nbl_lists
.nnbl
> 1)
1522 /* This is not in a subcounter because it takes a
1523 negligible and constant-sized amount of time */
1524 nbnxn_atomdata_add_nbat_fshift_to_fshift(nbv
->nbat
,
1529 /* update QMMMrec, if necessary */
1532 update_QMMMrec(cr
, fr
, as_rvec_array(x
.data()), mdatoms
, box
);
1535 /* Compute the bonded and non-bonded energies and optionally forces */
1536 do_force_lowlevel(fr
, inputrec
, &(top
->idef
),
1537 cr
, ms
, nrnb
, wcycle
, mdatoms
,
1538 as_rvec_array(x
.data()), hist
, f
, &forceWithVirial
, enerd
, fcd
,
1539 box
, inputrec
->fepvals
, lambda
, graph
, &(top
->excls
), fr
->mu_tot
,
1540 flags
, &cycles_pme
);
1542 wallcycle_stop(wcycle
, ewcFORCE
);
1544 computeSpecialForces(fplog
, cr
, inputrec
, step
, t
, wcycle
,
1545 fr
->forceProviders
, box
, x
, mdatoms
, lambda
,
1546 flags
, &forceWithVirial
, enerd
,
1551 /* wait for non-local forces (or calculate in emulation mode) */
1552 if (DOMAINDECOMP(cr
))
1556 wallcycle_start(wcycle
, ewcWAIT_GPU_NB_NL
);
1557 nbnxn_gpu_wait_finish_task(nbv
->gpu_nbv
,
1559 enerd
->grpp
.ener
[egLJSR
], enerd
->grpp
.ener
[egCOULSR
],
1561 cycles_wait_gpu
+= wallcycle_stop(wcycle
, ewcWAIT_GPU_NB_NL
);
1565 wallcycle_start_nocount(wcycle
, ewcFORCE
);
1566 do_nb_verlet(fr
, ic
, enerd
, flags
, eintNonlocal
, enbvClearFYes
,
1567 step
, nrnb
, wcycle
);
1568 wallcycle_stop(wcycle
, ewcFORCE
);
1571 /* skip the reduction if there was no non-local work to do */
1572 if (nbv
->grp
[eintNonlocal
].nbl_lists
.nbl
[0]->nsci
> 0)
1574 nbnxn_atomdata_add_nbat_f_to_f(nbv
->nbs
, eatNonlocal
,
1575 nbv
->nbat
, f
, wcycle
);
1580 if (DOMAINDECOMP(cr
))
1582 /* We are done with the CPU compute.
1583 * We will now communicate the non-local forces.
1584 * If we use a GPU this will overlap with GPU work, so in that case
1585 * we do not close the DD force balancing region here.
1587 if (ddCloseBalanceRegion
== DdCloseBalanceRegionAfterForceComputation::yes
)
1589 ddCloseBalanceRegionCpu(cr
->dd
);
1593 dd_move_f(cr
->dd
, f
, fr
->fshift
, wcycle
);
    // With both nonbonded and PME offloaded to a GPU on the same rank, we use
    // an alternating wait/reduction scheme.
    bool alternateGpuWait = (!c_disableAlternatingWait && useGpuPme && bUseGPU && !DOMAINDECOMP(cr));
    if (alternateGpuWait)
    {
        alternatePmeNbGpuWaitReduce(fr->nbv, fr->pmedata, &force, &forceWithVirial, fr->fshift, enerd, flags, wcycle);
    }

    if (!alternateGpuWait && useGpuPme)
    {
        gmx::ArrayRef<const gmx::RVec> pmeGpuForces;
        matrix vir_Q;
        real   Vlr_q = 0.0;
        pme_gpu_wait_finish_task(fr->pmedata, wcycle, &pmeGpuForces, vir_Q, &Vlr_q);
        pme_gpu_reduce_outputs(wcycle, &forceWithVirial, pmeGpuForces, enerd, vir_Q, Vlr_q);
    }
1614 /* Wait for local GPU NB outputs on the non-alternating wait path */
1615 if (!alternateGpuWait
&& bUseGPU
)
1617 /* Measured overhead on CUDA and OpenCL with(out) GPU sharing
1618 * is between 0.5 and 1.5 Mcycles. So 2 MCycles is an overestimate,
1619 * but even with a step of 0.1 ms the difference is less than 1%
1622 const float gpuWaitApiOverheadMargin
= 2e6f
; /* cycles */
1624 wallcycle_start(wcycle
, ewcWAIT_GPU_NB_L
);
1625 nbnxn_gpu_wait_finish_task(nbv
->gpu_nbv
,
1627 enerd
->grpp
.ener
[egLJSR
], enerd
->grpp
.ener
[egCOULSR
],
1629 float cycles_tmp
= wallcycle_stop(wcycle
, ewcWAIT_GPU_NB_L
);
1631 if (ddCloseBalanceRegion
== DdCloseBalanceRegionAfterForceComputation::yes
)
1633 DdBalanceRegionWaitedForGpu waitedForGpu
= DdBalanceRegionWaitedForGpu::yes
;
1634 if (bDoForces
&& cycles_tmp
<= gpuWaitApiOverheadMargin
)
1636 /* We measured few cycles, it could be that the kernel
1637 * and transfer finished earlier and there was no actual
1638 * wait time, only API call overhead.
1639 * Then the actual time could be anywhere between 0 and
1640 * cycles_wait_est. We will use half of cycles_wait_est.
1642 waitedForGpu
= DdBalanceRegionWaitedForGpu::no
;
1644 ddCloseBalanceRegionGpu(cr
->dd
, cycles_wait_gpu
, waitedForGpu
);
1648 if (fr
->nbv
->emulateGpu
== EmulateGpuNonbonded::Yes
)
1650 // NOTE: emulation kernel is not included in the balancing region,
1651 // but emulation mode does not target performance anyway
1652 wallcycle_start_nocount(wcycle
, ewcFORCE
);
1653 do_nb_verlet(fr
, ic
, enerd
, flags
, eintLocal
,
1654 DOMAINDECOMP(cr
) ? enbvClearFNo
: enbvClearFYes
,
1655 step
, nrnb
, wcycle
);
1656 wallcycle_stop(wcycle
, ewcFORCE
);
1661 pme_gpu_reinit_computation(fr
->pmedata
, wcycle
);
1666 /* now clear the GPU outputs while we finish the step on the CPU */
1667 wallcycle_start_nocount(wcycle
, ewcLAUNCH_GPU
);
1668 wallcycle_sub_start_nocount(wcycle
, ewcsLAUNCH_GPU_NONBONDED
);
1669 nbnxn_gpu_clear_outputs(nbv
->gpu_nbv
, flags
);
1671 /* Is dynamic pair-list pruning activated? */
1672 if (nbv
->listParams
->useDynamicPruning
)
1674 launchGpuRollingPruning(cr
, nbv
, inputrec
, step
);
1676 wallcycle_sub_stop(wcycle
, ewcsLAUNCH_GPU_NONBONDED
);
1677 wallcycle_stop(wcycle
, ewcLAUNCH_GPU
);
1680 /* Do the nonbonded GPU (or emulation) force buffer reduction
1681 * on the non-alternating path. */
1682 if (bUseOrEmulGPU
&& !alternateGpuWait
)
1684 nbnxn_atomdata_add_nbat_f_to_f(nbv
->nbs
, eatLocal
,
1685 nbv
->nbat
, f
, wcycle
);
1688 if (DOMAINDECOMP(cr
))
1690 dd_force_flop_stop(cr
->dd
, nrnb
);
1695 /* If we have NoVirSum forces, but we do not calculate the virial,
1696 * we sum fr->f_novirsum=f later.
1698 if (vsite
&& !(fr
->haveDirectVirialContributions
&& !(flags
& GMX_FORCE_VIRIAL
)))
1700 spread_vsite_f(vsite
, as_rvec_array(x
.data()), f
, fr
->fshift
, FALSE
, nullptr, nrnb
,
1701 &top
->idef
, fr
->ePBC
, fr
->bMolPBC
, graph
, box
, cr
, wcycle
);
1704 if (flags
& GMX_FORCE_VIRIAL
)
1706 /* Calculation of the virial must be done after vsites! */
1707 calc_virial(0, mdatoms
->homenr
, as_rvec_array(x
.data()), f
,
1708 vir_force
, graph
, box
, nrnb
, fr
, inputrec
->ePBC
);
    if (PAR(cr) && !thisRankHasDuty(cr, DUTY_PME))
    {
        /* In case of node-splitting, the PP nodes receive the long-range
         * forces, virial and energy from the PME nodes here.
         */
        pme_receive_force_ener(cr, &forceWithVirial, enerd, wcycle);
    }

    if (bDoForces)
    {
        post_process_forces(cr, step, nrnb, wcycle,
                            top, box, as_rvec_array(x.data()), f, &forceWithVirial,
                            vir_force, mdatoms, graph, fr, vsite,
                            flags);
    }

    if (flags & GMX_FORCE_ENERGY)
    {
        /* Sum the potential energy terms from group contributions */
        sum_epot(&(enerd->grpp), enerd->term);

        if (!EI_TPI(inputrec->eI))
        {
            checkPotentialEnergyValidity(step, *enerd, *inputrec);
        }
    }
}
static void do_force_cutsGROUP(FILE *fplog,
                               const t_commrec *cr,
                               const gmx_multisim_t *ms,
                               const t_inputrec *inputrec,
                               gmx_int64_t step,
                               t_nrnb *nrnb,
                               gmx_wallcycle_t wcycle,
                               gmx_localtop_t *top,
                               const gmx_groups_t *groups,
                               matrix box, gmx::PaddedArrayRef<gmx::RVec> x,
                               history_t *hist,
                               gmx::PaddedArrayRef<gmx::RVec> force,
                               tensor vir_force,
                               const t_mdatoms *mdatoms,
                               gmx_enerdata_t *enerd,
                               t_fcdata *fcd,
                               real *lambda,
                               t_graph *graph,
                               t_forcerec *fr,
                               const gmx_vsite_t *vsite,
                               rvec mu_tot,
                               double t,
                               const gmx_edsam *ed,
                               int flags,
                               DdOpenBalanceRegionBeforeForceComputation ddOpenBalanceRegion,
                               DdCloseBalanceRegionAfterForceComputation ddCloseBalanceRegion)
{
    int       cg0, cg1, i, j;
    double    mu[2*DIM];
    gmx_bool  bStateChanged, bNS, bFillGrid, bCalcCGCM;
    gmx_bool  bDoForces;
    float     cycles_pme;

    const int start  = 0;
    const int homenr = mdatoms->homenr;

    clear_mat(vir_force);

    cg0 = 0;
    if (DOMAINDECOMP(cr))
    {
        cg1 = cr->dd->ncg_tot;
    }
1792 bStateChanged
= (flags
& GMX_FORCE_STATECHANGED
);
1793 bNS
= (flags
& GMX_FORCE_NS
) && (fr
->bAllvsAll
== FALSE
);
1794 /* Should we perform the long-range nonbonded evaluation inside the neighborsearching? */
1795 bFillGrid
= (bNS
&& bStateChanged
);
1796 bCalcCGCM
= (bFillGrid
&& !DOMAINDECOMP(cr
));
1797 bDoForces
= (flags
& GMX_FORCE_FORCES
);
1801 update_forcerec(fr
, box
);
1803 if (inputrecNeedMutot(inputrec
))
1805 /* Calculate total (local) dipole moment in a temporary common array.
1806 * This makes it possible to sum them over nodes faster.
1808 calc_mu(start
, homenr
,
1809 x
, mdatoms
->chargeA
, mdatoms
->chargeB
, mdatoms
->nChargePerturbed
,
1814 if (fr
->ePBC
!= epbcNONE
)
1816 /* Compute shift vectors every step,
1817 * because of pressure coupling or box deformation!
1819 if ((flags
& GMX_FORCE_DYNAMICBOX
) && bStateChanged
)
1821 calc_shifts(box
, fr
->shift_vec
);
1826 put_charge_groups_in_box(fplog
, cg0
, cg1
, fr
->ePBC
, box
,
1827 &(top
->cgs
), as_rvec_array(x
.data()), fr
->cg_cm
);
1828 inc_nrnb(nrnb
, eNR_CGCM
, homenr
);
1829 inc_nrnb(nrnb
, eNR_RESETX
, cg1
-cg0
);
1831 else if (EI_ENERGY_MINIMIZATION(inputrec
->eI
) && graph
)
1833 unshift_self(graph
, box
, as_rvec_array(x
.data()));
1838 calc_cgcm(fplog
, cg0
, cg1
, &(top
->cgs
), as_rvec_array(x
.data()), fr
->cg_cm
);
1839 inc_nrnb(nrnb
, eNR_CGCM
, homenr
);
1842 if (bCalcCGCM
&& gmx_debug_at
)
1844 pr_rvecs(debug
, 0, "cgcm", fr
->cg_cm
, top
->cgs
.nr
);
1848 if (!thisRankHasDuty(cr
, DUTY_PME
))
1850 /* Send particle coordinates to the pme nodes.
1851 * Since this is only implemented for domain decomposition
1852 * and domain decomposition does not use the graph,
1853 * we do not need to worry about shifting.
1855 gmx_pme_send_coordinates(cr
, box
, as_rvec_array(x
.data()),
1856 lambda
[efptCOUL
], lambda
[efptVDW
],
1857 (flags
& (GMX_FORCE_VIRIAL
| GMX_FORCE_ENERGY
)),
1860 #endif /* GMX_MPI */
1862 /* Communicate coordinates and sum dipole if necessary */
1863 if (DOMAINDECOMP(cr
))
1865 dd_move_x(cr
->dd
, box
, as_rvec_array(x
.data()), wcycle
);
1867 /* No GPU support, no move_x overlap, so reopen the balance region here */
1868 if (ddOpenBalanceRegion
== DdOpenBalanceRegionBeforeForceComputation::yes
)
1870 ddReopenBalanceRegionCpu(cr
->dd
);
1874 if (inputrecNeedMutot(inputrec
))
1880 gmx_sumd(2*DIM
, mu
, cr
);
1881 ddReopenBalanceRegionCpu(cr
->dd
);
1883 for (i
= 0; i
< 2; i
++)
1885 for (j
= 0; j
< DIM
; j
++)
1887 fr
->mu_tot
[i
][j
] = mu
[i
*DIM
+ j
];
1891 if (fr
->efep
== efepNO
)
1893 copy_rvec(fr
->mu_tot
[0], mu_tot
);
1897 for (j
= 0; j
< DIM
; j
++)
1900 (1.0 - lambda
[efptCOUL
])*fr
->mu_tot
[0][j
] + lambda
[efptCOUL
]*fr
->mu_tot
[1][j
];
1905 /* Reset energies */
1906 reset_enerdata(enerd
);
1907 clear_rvecs(SHIFTS
, fr
->fshift
);
1911 wallcycle_start(wcycle
, ewcNS
);
1913 if (graph
&& bStateChanged
)
1915 /* Calculate intramolecular shift vectors to make molecules whole */
1916 mk_mshift(fplog
, graph
, fr
->ePBC
, box
, as_rvec_array(x
.data()));
1919 /* Do the actual neighbour searching */
1921 groups
, top
, mdatoms
,
1922 cr
, nrnb
, bFillGrid
);
1924 wallcycle_stop(wcycle
, ewcNS
);
1927 if (DOMAINDECOMP(cr
) && !thisRankHasDuty(cr
, DUTY_PME
))
1929 wallcycle_start(wcycle
, ewcPPDURINGPME
);
1930 dd_force_flop_start(cr
->dd
, nrnb
);
1935 wallcycle_start(wcycle
, ewcROT
);
1936 do_rotation(cr
, inputrec
, box
, as_rvec_array(x
.data()), t
, step
, bNS
);
1937 wallcycle_stop(wcycle
, ewcROT
);
1940 /* Temporary solution until all routines take PaddedRVecVector */
1941 rvec
*f
= as_rvec_array(force
.data());
1943 /* Start the force cycle counter.
1944 * Note that a different counter is used for dynamic load balancing.
1946 wallcycle_start(wcycle
, ewcFORCE
);
1948 gmx::ArrayRef
<gmx::RVec
> forceRef
= force
;
1951 /* If we need to compute the virial, we might need a separate
1952 * force buffer for algorithms for which the virial is calculated
1953 * separately, such as PME.
1955 if ((flags
& GMX_FORCE_VIRIAL
) && fr
->haveDirectVirialContributions
)
1957 forceRef
= *fr
->forceBufferForDirectVirialContributions
;
1959 clear_rvecs_omp(forceRef
.size(), as_rvec_array(forceRef
.data()));
1962 /* Clear the short- and long-range forces */
1963 clear_rvecs(fr
->natoms_force_constr
, f
);
1966 /* forceWithVirial might need the full force atom range */
1967 ForceWithVirial
forceWithVirial(forceRef
, flags
& GMX_FORCE_VIRIAL
);
1969 if (inputrec
->bPull
&& pull_have_constraint(inputrec
->pull_work
))
1971 clear_pull_forces(inputrec
->pull_work
);
1974 /* update QMMMrec, if necessary */
1977 update_QMMMrec(cr
, fr
, as_rvec_array(x
.data()), mdatoms
, box
);
1980 /* Compute the bonded and non-bonded energies and optionally forces */
1981 do_force_lowlevel(fr
, inputrec
, &(top
->idef
),
1982 cr
, ms
, nrnb
, wcycle
, mdatoms
,
1983 as_rvec_array(x
.data()), hist
, f
, &forceWithVirial
, enerd
, fcd
,
1984 box
, inputrec
->fepvals
, lambda
,
1985 graph
, &(top
->excls
), fr
->mu_tot
,
1989 wallcycle_stop(wcycle
, ewcFORCE
);
1991 if (DOMAINDECOMP(cr
))
1993 dd_force_flop_stop(cr
->dd
, nrnb
);
1995 if (ddCloseBalanceRegion
== DdCloseBalanceRegionAfterForceComputation::yes
)
1997 ddCloseBalanceRegionCpu(cr
->dd
);
2001 computeSpecialForces(fplog
, cr
, inputrec
, step
, t
, wcycle
,
2002 fr
->forceProviders
, box
, x
, mdatoms
, lambda
,
2003 flags
, &forceWithVirial
, enerd
,
2008 /* Communicate the forces */
2009 if (DOMAINDECOMP(cr
))
2011 dd_move_f(cr
->dd
, f
, fr
->fshift
, wcycle
);
2012 /* Do we need to communicate the separate force array
2013 * for terms that do not contribute to the single sum virial?
2014 * Position restraints and electric fields do not introduce
2015 * inter-cg forces, only full electrostatics methods do.
2016 * When we do not calculate the virial, fr->f_novirsum = f,
2017 * so we have already communicated these forces.
2019 if (EEL_FULL(fr
->ic
->eeltype
) && cr
->dd
->n_intercg_excl
&&
2020 (flags
& GMX_FORCE_VIRIAL
))
2022 dd_move_f(cr
->dd
, as_rvec_array(forceWithVirial
.force_
.data()), nullptr, wcycle
);
    /* If we have NoVirSum forces, but we do not calculate the virial,
     * we sum fr->f_novirsum=f later.
     */
    if (vsite && !(fr->haveDirectVirialContributions && !(flags & GMX_FORCE_VIRIAL)))
    {
        spread_vsite_f(vsite, as_rvec_array(x.data()), f, fr->fshift, FALSE, nullptr, nrnb,
                       &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr, wcycle);
    }
    if (flags & GMX_FORCE_VIRIAL)
    {
        /* Calculation of the virial must be done after vsites! */
        calc_virial(0, mdatoms->homenr, as_rvec_array(x.data()), f,
                    vir_force, graph, box, nrnb, fr, inputrec->ePBC);
    }
    if (PAR(cr) && !thisRankHasDuty(cr, DUTY_PME))
    {
        /* In case of node-splitting, the PP nodes receive the long-range
         * forces, virial and energy from the PME nodes here.
         */
        pme_receive_force_ener(cr, &forceWithVirial, enerd, wcycle);
    }
    post_process_forces(cr, step, nrnb, wcycle,
                        top, box, as_rvec_array(x.data()), f, &forceWithVirial,
                        vir_force, mdatoms, graph, fr, vsite,
                        flags);
    if (flags & GMX_FORCE_ENERGY)
    {
        /* Sum the potential energy terms from group contributions */
        sum_epot(&(enerd->grpp), enerd->term);

        if (!EI_TPI(inputrec->eI))
        {
            checkPotentialEnergyValidity(step, *enerd, *inputrec);
        }
    }
}
void do_force(FILE                                     *fplog,
              const t_commrec                          *cr,
              const gmx_multisim_t                     *ms,
              const t_inputrec                         *inputrec,
              gmx_wallcycle_t                           wcycle,
              gmx_localtop_t                           *top,
              const gmx_groups_t                       *groups,
              gmx::PaddedArrayRef<gmx::RVec>            x,
              gmx::PaddedArrayRef<gmx::RVec>            force,
              const t_mdatoms                          *mdatoms,
              gmx_enerdata_t                           *enerd,
              gmx::ArrayRef<real>                       lambda,
              const gmx_vsite_t                        *vsite,
              const gmx_edsam                          *ed,
              DdOpenBalanceRegionBeforeForceComputation ddOpenBalanceRegion,
              DdCloseBalanceRegionAfterForceComputation ddCloseBalanceRegion)
{
    /* modify force flag if not doing nonbonded */
    if (!fr->bNonbonded)
    {
        flags &= ~GMX_FORCE_NONBONDED;
    }

    GMX_ASSERT(x.size()     >= gmx::paddedRVecVectorSize(fr->natoms_force), "coordinates should be padded");
    GMX_ASSERT(force.size() >= gmx::paddedRVecVectorSize(fr->natoms_force), "force should be padded");
    switch (inputrec->cutoff_scheme)
    {
        case ecutsVERLET:
            do_force_cutsVERLET(fplog, cr, ms, inputrec,
                                lambda.data(), graph,
                                ddOpenBalanceRegion,
                                ddCloseBalanceRegion);
            break;
        case ecutsGROUP:
            do_force_cutsGROUP(fplog, cr, ms, inputrec,
                               lambda.data(), graph,
                               ddOpenBalanceRegion,
                               ddCloseBalanceRegion);
            break;
        default:
            gmx_incons("Invalid cut-off scheme passed!");
    }
    /* In case we don't have constraints and are using GPUs, the next balancing
     * region starts here.
     * Some "special" work at the end of do_force_cuts?, such as vsite spread,
     * virial calculation and COM pulling, is thus not included in
     * the balance timing, which is ok as most tasks do communication.
     */
    if (ddOpenBalanceRegion == DdOpenBalanceRegionBeforeForceComputation::yes)
    {
        ddOpenBalanceRegionCpu(cr->dd, DdAllowBalanceRegionReopen::no);
    }
}
void do_constrain_first(FILE *fplog, Constraints *constr,
                        t_inputrec *ir, t_mdatoms *md,
                        t_state *state, const t_commrec *cr,
                        const gmx_multisim_t *ms,
                        t_forcerec *fr, gmx_localtop_t *top)
{
    int          i, m, start, end;
    gmx_int64_t  step;
    real         dt = ir->delta_t;
    real         dvdl_dum;
    rvec        *savex;

    /* We need to allocate one element extra, since we might use
     * (unaligned) 4-wide SIMD loads to access rvec entries.
     */
    snew(savex, state->natoms + 1);
    if (debug)
    {
        fprintf(debug, "vcm: start=%d, homenr=%d, end=%d\n",
                start, md->homenr, end);
    }

    /* Do a first constrain to reset particles... */
    step = ir->init_step;
    if (fplog)
    {
        char buf[STEPSTRSIZE];
        fprintf(fplog, "\nConstraining the starting coordinates (step %s)\n",
                gmx_step_str(step, buf));
    }
    /* constrain the current position */
    constrain(nullptr, TRUE, FALSE, constr, &(top->idef),
              ir, cr, ms, step, 0, 1.0, md,
              as_rvec_array(state->x.data()), as_rvec_array(state->x.data()), nullptr,
              fr->bMolPBC, state->box,
              state->lambda[efptBONDED], &dvdl_dum,
              nullptr, nullptr, nrnb, econqCoord);
    /* constrain the initial velocity, and save it */
    /* also may be useful if we need the ekin from the halfstep for velocity verlet */
    constrain(nullptr, TRUE, FALSE, constr, &(top->idef),
              ir, cr, ms, step, 0, 1.0, md,
              as_rvec_array(state->x.data()), as_rvec_array(state->v.data()), as_rvec_array(state->v.data()),
              fr->bMolPBC, state->box,
              state->lambda[efptBONDED], &dvdl_dum,
              nullptr, nullptr, nrnb, econqVeloc);
    /* constrain the initial velocities at t-dt/2 */
    if (EI_STATE_VELOCITY(ir->eI) && ir->eI != eiVV)
    {
        for (i = start; (i < end); i++)
        {
            for (m = 0; (m < DIM); m++)
            {
                /* Reverse the velocity */
                state->v[i][m] = -state->v[i][m];
                /* Store the position at t-dt in buf */
                savex[i][m] = state->x[i][m] + dt*state->v[i][m];
            }
        }
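
        /* At this point v holds -v(0), so savex = x(0) + dt*(-v(0)) = x(0) - dt*v(0),
         * i.e. a first-order estimate of the positions one full step back in time.
         * These estimates are constrained against x(0) below to give consistent
         * positions at t = -dt.
         */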
        /* Shake the positions at t=-dt with the positions at t=0
         * as reference coordinates.
         */
        if (fplog)
        {
            char buf[STEPSTRSIZE];
            fprintf(fplog, "\nConstraining the coordinates at t0-dt (step %s)\n",
                    gmx_step_str(step, buf));
        }
        constrain(nullptr, TRUE, FALSE, constr, &(top->idef),
                  ir, cr, ms, step, -1, 1.0, md,
                  as_rvec_array(state->x.data()), savex, nullptr,
                  fr->bMolPBC, state->box,
                  state->lambda[efptBONDED], &dvdl_dum,
                  as_rvec_array(state->v.data()), nullptr, nrnb, econqCoord);
        for (i = start; i < end; i++)
        {
            for (m = 0; m < DIM; m++)
            {
                /* Re-reverse the velocities */
                state->v[i][m] = -state->v[i][m];
            }
        }
    }

    sfree(savex);
}
static void integrate_table(real vdwtab[], real scale, int offstart, int rstart, int rend,
                            double *enerout, double *virout)
{
    double enersum, virsum;
    double invscale, invscale2, invscale3;
    double r, ea, eb, ec, pa, pb, pc, pd;
    double y0, f, g, h;
    double tabfactor;
    int    ri, offset;

    invscale  = 1.0/scale;
    invscale2 = invscale*invscale;
    invscale3 = invscale*invscale2;
    enersum   = 0.0;
    virsum    = 0.0;

    /* Following summation derived from cubic spline definition,
     * Numerical Recipes in C, second edition, p. 113-116. Exact for
     * the cubic spline. We first calculate the negative of the
     * energy from rvdw to rvdw_switch, assuming that g(r)=1, and then
     * add the more standard, abrupt cutoff correction to that result,
     * yielding the long-range correction for a switched function. We
     * perform both the pressure and energy loops at the same time for
     * simplicity, as the computational cost is low. */
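
    /* Where the (ea, eb, ec) and (pa, pb, pc, pd) weights used below come from:
     * within table bin ri the distance is r' = r + t/scale with t in [0,1] and
     * r = ri/scale, so r'^2 dr' expands to (ea*t^2 + eb*t + ec) dt with
     * ea = invscale^3, eb = 2*invscale^2*r and ec = invscale*r^2. Integrating
     * the per-bin cubic spline y0 + f*t + g*t^2 + h*t^3 against this weight
     * gives exactly the y0*(ea/3 + eb/2 + ec) + f*(ea/4 + ...) terms that are
     * accumulated into enersum in the loop below. The pa..pd weights are the
     * analogous expansion of r'^3, integrated against the spline derivative
     * f + 2*g*t + 3*h*t^2, i.e. the r^3*dV/dr integrand needed for the virial.
     */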

    /* Since the dispersion table has been scaled down a factor
     * 6.0 and the repulsion a factor 12.0 to compensate for the
     * c6/c12 parameters inside nbfp[] being scaled up (to save
     * flops in kernels), we need to correct for this.
     */
    tabfactor = (offstart == 0 ? 6.0 : 12.0);
    for (ri = rstart; ri < rend; ++ri)
    {
        r  = ri*invscale;
        ea = invscale3;
        eb = 2.0*invscale2*r;
        ec = invscale*r*r;

        pa = invscale3;
        pb = 3.0*invscale2*r;
        pc = 3.0*invscale*r*r;
        pd = r*r*r;

        /* this "8" is from the packing in the vdwtab array - perhaps
           should be defined? */
        offset = 8*ri + offstart;
        y0     = vdwtab[offset];
        f      = vdwtab[offset+1];
        g      = vdwtab[offset+2];
        h      = vdwtab[offset+3];

        enersum += y0*(ea/3 + eb/2 + ec) + f*(ea/4 + eb/3 + ec/2) + g*(ea/5 + eb/4 + ec/3) + h*(ea/6 + eb/5 + ec/4);
        virsum  += f*(pa/4 + pb/3 + pc/2 + pd) + 2*g*(pa/5 + pb/4 + pc/3 + pd/2) + 3*h*(pa/6 + pb/5 + pc/4 + pd/3);
    }

    *enerout = 4.0*M_PI*enersum*tabfactor;
    *virout  = 4.0*M_PI*virsum*tabfactor;
}
void calc_enervirdiff(FILE *fplog, int eDispCorr, t_forcerec *fr)
{
    double  eners[2], virs[2], enersum, virsum;
    double  r0, rc3, rc9;
    int     ri0, ri1, i;
    real    scale, *vdwtab;

    fr->enershiftsix    = 0;
    fr->enershifttwelve = 0;
    fr->enerdiffsix     = 0;
    fr->enerdifftwelve  = 0;
    fr->virdiffsix      = 0;
    fr->virdifftwelve   = 0;

    const interaction_const_t *ic = fr->ic;

    if (eDispCorr != edispcNO)
    {
        for (i = 0; i < 2; i++)
        {
            eners[i] = 0;
            virs[i]  = 0;
        }

        if ((ic->vdw_modifier == eintmodPOTSHIFT) ||
            (ic->vdw_modifier == eintmodPOTSWITCH) ||
            (ic->vdw_modifier == eintmodFORCESWITCH) ||
            (ic->vdwtype == evdwSHIFT) ||
            (ic->vdwtype == evdwSWITCH))
        {
            if (((ic->vdw_modifier == eintmodPOTSWITCH) ||
                 (ic->vdw_modifier == eintmodFORCESWITCH) ||
                 (ic->vdwtype == evdwSWITCH)) && ic->rvdw_switch == 0)
            {
                gmx_fatal(FARGS,
                          "With dispersion correction rvdw-switch can not be zero "
                          "for vdw-type = %s", evdw_names[ic->vdwtype]);
            }

            /* TODO This code depends on the logic in tables.c that
               constructs the table layout, which should be made
               explicit in future cleanup. */
            GMX_ASSERT(fr->dispersionCorrectionTable->interaction == GMX_TABLE_INTERACTION_VDWREP_VDWDISP,
                       "Dispersion-correction code needs a table with both repulsion and dispersion terms");
            scale  = fr->dispersionCorrectionTable->scale;
            vdwtab = fr->dispersionCorrectionTable->data;

            /* Round the cut-offs to exact table values for precision */
            ri0  = static_cast<int>(floor(ic->rvdw_switch*scale));
            ri1  = static_cast<int>(ceil(ic->rvdw*scale));

            /* The code below has some support for handling force-switching, i.e.
             * when the force (instead of potential) is switched over a limited
             * region. This leads to a constant shift in the potential inside the
             * switching region, which we can handle by adding a constant energy
             * term in the force-switch case just like when we do potential-shift.
             *
             * For now this is not enabled, but to keep the functionality in the
             * code we check separately for switch and shift. When we do force-switch
             * the shifting point is rvdw_switch, while it is the cutoff when we
             * have a classical potential-shift.
             *
             * For a pure potential-shift the potential has a constant shift
             * all the way out to the cutoff, and that is it. For other forms
             * we need to calculate the constant shift up to the point where we
             * start modifying the potential.
             */
            ri0  = (ic->vdw_modifier == eintmodPOTSHIFT) ? ri1 : ri0;

            r0   = ri0/scale;
            rc3  = r0*r0*r0;
            rc9  = rc3*rc3*rc3;

            if ((ic->vdw_modifier == eintmodFORCESWITCH) ||
                (ic->vdwtype == evdwSHIFT))
            {
                /* Determine the constant energy shift below rvdw_switch.
                 * Table has a scale factor since we have scaled it down to compensate
                 * for scaling-up c6/c12 with the derivative factors to save flops in analytical kernels.
                 */
                fr->enershiftsix    = (real)(-1.0/(rc3*rc3)) - 6.0*vdwtab[8*ri0];
                fr->enershifttwelve = (real)( 1.0/(rc9*rc3)) - 12.0*vdwtab[8*ri0 + 4];
            }
            else if (ic->vdw_modifier == eintmodPOTSHIFT)
            {
                fr->enershiftsix    = (real)(-1.0/(rc3*rc3));
                fr->enershifttwelve = (real)( 1.0/(rc9*rc3));
            }

            /* Add the constant part from 0 to rvdw_switch.
             * This integration from 0 to rvdw_switch overcounts the number
             * of interactions by 1, as it also counts the self interaction.
             * We will correct for this later.
             */
            eners[0] += 4.0*M_PI*fr->enershiftsix*rc3/3.0;
            eners[1] += 4.0*M_PI*fr->enershifttwelve*rc3/3.0;
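
            /* The factor 4*M_PI*rc3/3.0 is just the volume of a sphere of
             * radius r0 (rc3 holds r0 cubed at this point): integrating a
             * constant potential shift against 4*pi*r^2 dr from 0 to r0 gives
             * shift * (4/3)*pi*r0^3.
             */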

            /* Calculate the contribution in the range [r0,r1] where we
             * modify the potential. For a pure potential-shift modifier we will
             * have ri0==ri1, and there will not be any contribution here.
             */
            for (i = 0; i < 2; i++)
            {
                enersum = 0;
                virsum  = 0;
                integrate_table(vdwtab, scale, (i == 0 ? 0 : 4), ri0, ri1, &enersum, &virsum);
                eners[i] -= enersum;
                virs[i]  -= virsum;
            }

            /* Alright: Above we compensated by REMOVING the parts outside r0
             * corresponding to the ideal VdW 1/r6 and 1/r12 potentials.
             *
             * Regardless of whether r0 is the point where we start switching,
             * or the cutoff where we calculated the constant shift, we include
             * all the parts we are missing out to infinity from r0 by
             * calculating the analytical dispersion correction.
             */
            eners[0] += -4.0*M_PI/(3.0*rc3);
            eners[1] +=  4.0*M_PI/(9.0*rc9);
            virs[0]  +=  8.0*M_PI/rc3;
            virs[1]  += -16.0*M_PI/(3.0*rc9);
        }
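        /* The analytical tails used above, per unit C6/C12:
         *   integral from r0 to infinity of 4*pi*r^2 * (-1/r^6)  dr = -4*pi/(3*r0^3)
         *   integral from r0 to infinity of 4*pi*r^2 * ( 1/r^12) dr =  4*pi/(9*r0^9)
         * and the corresponding r*dU/dr integrals give the 8*pi/r0^3 and
         * -16*pi/(3*r0^9) virial terms.
         */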
        else if (ic->vdwtype == evdwCUT ||
                 EVDW_PME(ic->vdwtype) ||
                 ic->vdwtype == evdwUSER)
        {
            if (ic->vdwtype == evdwUSER && fplog)
            {
                fprintf(fplog,
                        "WARNING: using dispersion correction with user tables\n");
            }

            /* Note that with LJ-PME, the dispersion correction is multiplied
             * by the difference between the actual C6 and the value of C6
             * that would produce the combination rule.
             * This means the normal energy and virial difference formulas
             * can be used here.
             */

            rc3  = ic->rvdw*ic->rvdw*ic->rvdw;
            rc9  = rc3*rc3*rc3;

            /* Contribution beyond the cut-off */
            eners[0] += -4.0*M_PI/(3.0*rc3);
            eners[1] +=  4.0*M_PI/(9.0*rc9);
            if (ic->vdw_modifier == eintmodPOTSHIFT)
            {
                /* Contribution within the cut-off */
                eners[0] += -4.0*M_PI/(3.0*rc3);
                eners[1] +=  4.0*M_PI/(3.0*rc9);
            }
            /* Contribution beyond the cut-off */
            virs[0]  +=  8.0*M_PI/rc3;
            virs[1]  += -16.0*M_PI/(3.0*rc9);
        }
        else
        {
            gmx_fatal(FARGS,
                      "Dispersion correction is not implemented for vdw-type = %s",
                      evdw_names[ic->vdwtype]);
        }

        /* When we deprecate the group kernels the code below can go too */
        if (ic->vdwtype == evdwPME && fr->cutoff_scheme == ecutsGROUP)
        {
            /* Calculate self-interaction coefficient (assuming that
             * the reciprocal-space contribution is constant in the
             * region that contributes to the self-interaction).
             */
            fr->enershiftsix = gmx::power6(ic->ewaldcoeff_lj) / 6.0;

            eners[0] += -gmx::power3(std::sqrt(M_PI)*ic->ewaldcoeff_lj)/3.0;
            virs[0]  +=  gmx::power3(std::sqrt(M_PI)*ic->ewaldcoeff_lj);
        }

        fr->enerdiffsix    = eners[0];
        fr->enerdifftwelve = eners[1];
        /* The 0.5 is due to the Gromacs definition of the virial */
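        /* (GROMACS accumulates the virial with a factor -1/2 in front of the
         * sum over r (x) F, so corrections derived from plain r*dU/dr
         * integrals pick up this extra factor of one half here.)
         */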
        fr->virdiffsix     = 0.5*virs[0];
        fr->virdifftwelve  = 0.5*virs[1];
    }
}
void calc_dispcorr(t_inputrec *ir, t_forcerec *fr,
                   matrix box, real lambda, tensor pres, tensor virial,
                   real *prescorr, real *enercorr, real *dvdlcorr)
{
    gmx_bool bCorrAll, bCorrPres;
    real     dvdlambda, invvol, dens, ninter, avcsix, avctwelve, enerdiff, svir = 0, spres = 0;
    int      m;

    *prescorr = 0;
    *enercorr = 0;
    *dvdlcorr = 0;
    dvdlambda = 0.0;

    if (ir->eDispCorr != edispcNO)
    {
        bCorrAll  = (ir->eDispCorr == edispcAllEner ||
                     ir->eDispCorr == edispcAllEnerPres);
        bCorrPres = (ir->eDispCorr == edispcEnerPres ||
                     ir->eDispCorr == edispcAllEnerPres);

        invvol = 1/det(box);
        if (fr->n_tpi)
        {
            /* Only correct for the interactions with the inserted molecule */
            dens   = (fr->numAtomsForDispersionCorrection - fr->n_tpi)*invvol;
            ninter = fr->n_tpi;
        }
        else
        {
            dens   = fr->numAtomsForDispersionCorrection*invvol;
            ninter = 0.5*fr->numAtomsForDispersionCorrection;
        }

        if (ir->efep == efepNO)
        {
            avcsix    = fr->avcsix[0];
            avctwelve = fr->avctwelve[0];
        }
        else
        {
            avcsix    = (1 - lambda)*fr->avcsix[0]    + lambda*fr->avcsix[1];
            avctwelve = (1 - lambda)*fr->avctwelve[0] + lambda*fr->avctwelve[1];
        }
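
        /* With free-energy perturbation the average C6/C12 are interpolated
         * linearly between the A and B states, so d<C>/dlambda is simply the
         * difference of the two endpoint averages; that difference is what
         * enters the dvdlambda accumulation below.
         */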
        enerdiff   = ninter*(dens*fr->enerdiffsix - fr->enershiftsix);
        *enercorr += avcsix*enerdiff;
        if (ir->efep != efepNO)
        {
            dvdlambda += (fr->avcsix[1] - fr->avcsix[0])*enerdiff;
        }
        if (bCorrAll)
        {
            enerdiff   = ninter*(dens*fr->enerdifftwelve - fr->enershifttwelve);
            *enercorr += avctwelve*enerdiff;
            if (fr->efep != efepNO)
            {
                dvdlambda += (fr->avctwelve[1] - fr->avctwelve[0])*enerdiff;
            }
        }

        if (bCorrPres)
        {
            svir = ninter*dens*avcsix*fr->virdiffsix/3.0;
            if (ir->eDispCorr == edispcAllEnerPres)
            {
                svir += ninter*dens*avctwelve*fr->virdifftwelve/3.0;
            }
            /* The factor 2 is because of the Gromacs virial definition */
            spres = -2.0*invvol*svir*PRESFAC;
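
            /* PRESFAC converts the internal energy-density units
             * (kJ mol^-1 nm^-3) to bar; dividing the virial correction by the
             * volume turns it into a pressure correction.
             */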
            for (m = 0; m < DIM; m++)
            {
                virial[m][m] += svir;
                pres[m][m]   += spres;
            }
            *prescorr += spres;
        }

        /* Can't currently control when it prints, for now, just print when debugging */
        if (debug)
        {
            if (bCorrAll)
            {
                fprintf(debug, "Long Range LJ corr.: <C6> %10.4e, <C12> %10.4e\n",
                        avcsix, avctwelve);
            }
            if (bCorrPres)
            {
                fprintf(debug,
                        "Long Range LJ corr.: Epot %10g, Pres: %10g, Vir: %10g\n",
                        *enercorr, spres, svir);
            }
            else
            {
                fprintf(debug, "Long Range LJ corr.: Epot %10g\n", *enercorr);
            }
        }

        if (fr->efep != efepNO)
        {
            *dvdlcorr += dvdlambda;
        }
    }
}
static void low_do_pbc_mtop(FILE *fplog, int ePBC, const matrix box,
                            const gmx_mtop_t *mtop, rvec x[],
                            gmx_bool bFirst)
{
    t_graph *graph;
    int      as, mol;

    if (bFirst && fplog)
    {
        fprintf(fplog, "Removing pbc first time\n");
    }

    snew(graph, 1);
    as = 0;
    for (const gmx_molblock_t &molb : mtop->molblock)
    {
        const gmx_moltype_t &moltype = mtop->moltype[molb.type];
        if (moltype.atoms.nr == 1 ||
            (!bFirst && moltype.cgs.nr == 1))
        {
            /* Just one atom or charge group in the molecule, no PBC required */
            as += molb.nmol*moltype.atoms.nr;
        }
        else
        {
            /* Pass NULL instead of fplog to avoid graph prints for each molecule type */
            mk_graph_ilist(nullptr, moltype.ilist,
                           0, moltype.atoms.nr, FALSE, FALSE, graph);

            for (mol = 0; mol < molb.nmol; mol++)
            {
                mk_mshift(fplog, graph, ePBC, box, x+as);

                shift_self(graph, box, x+as);
                /* The molecule is whole now.
                 * We don't need the second mk_mshift call as in do_pbc_first,
                 * since we no longer need this graph.
                 */

                as += moltype.atoms.nr;
            }
        }
    }
    sfree(graph);
}

void do_pbc_first_mtop(FILE *fplog, int ePBC, const matrix box,
                       const gmx_mtop_t *mtop, rvec x[])
{
    low_do_pbc_mtop(fplog, ePBC, box, mtop, x, TRUE);
}

void do_pbc_mtop(FILE *fplog, int ePBC, const matrix box,
                 const gmx_mtop_t *mtop, rvec x[])
{
    low_do_pbc_mtop(fplog, ePBC, box, mtop, x, FALSE);
}

void put_atoms_in_box_omp(int ePBC, const matrix box, gmx::ArrayRef<gmx::RVec> x)
{
    int t, nth;

    nth = gmx_omp_nthreads_get(emntDefault);

#pragma omp parallel for num_threads(nth) schedule(static)
    for (t = 0; t < nth; t++)
    {
        try
        {
            size_t natoms = x.size();
            size_t offset = (natoms*t)/nth;
            size_t len    = (natoms*(t + 1))/nth - offset;
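
            /* Thread t handles the half-open index range
             * [natoms*t/nth, natoms*(t + 1)/nth): with integer division the
             * ranges tile [0, natoms) exactly, so every atom is wrapped once
             * and no two threads touch the same atom.
             */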
            put_atoms_in_box(ePBC, box, x.subArray(offset, len));
        }
        GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
    }
}

// TODO This can be cleaned up a lot, and move back to runner.cpp
void finish_run(FILE *fplog, const gmx::MDLogger &mdlog, const t_commrec *cr,
                const t_inputrec *inputrec,
                t_nrnb nrnb[], gmx_wallcycle_t wcycle,
                gmx_walltime_accounting_t walltime_accounting,
                nonbonded_verlet_t *nbv,
                const gmx_pme_t *pme,
                gmx_bool bWriteStat)
{
    t_nrnb *nrnb_tot = nullptr;
    double  delta_t  = 0;
    double  nbfs     = 0, mflop = 0;
    double  elapsed_time,
            elapsed_time_over_all_ranks,
            elapsed_time_over_all_threads,
            elapsed_time_over_all_threads_over_all_ranks;

    /* Control whether it is valid to print a report. Only the
       simulation master may print, but it should not do so if the run
       terminated e.g. before a scheduled reset step. This is
       complicated by the fact that PME ranks are unaware of the
       reason why they were sent a pmerecvqxFINISH. To avoid
       communication deadlocks, we always do the communication for the
       report, even if we've decided not to write the report, because
       how long it takes to finish the run is not important when we've
       decided not to report on the simulation performance.

       Further, we only report performance for dynamical integrators,
       because those are the only ones for which we plan to
       consider doing any optimizations. */
    bool printReport = EI_DYNAMICS(inputrec->eI) && SIMMASTER(cr);

    if (printReport && !walltime_accounting_get_valid_finish(walltime_accounting))
    {
        GMX_LOG(mdlog.warning).asParagraph().appendText("Simulation ended prematurely, no performance report will be written.");
        printReport = false;
    }

    if (cr->nnodes > 1)
    {
        snew(nrnb_tot, 1);
#if GMX_MPI
        MPI_Allreduce(nrnb->n, nrnb_tot->n, eNRNB, MPI_DOUBLE, MPI_SUM,
                      cr->mpi_comm_mysim);
#endif
    }
    else
    {
        nrnb_tot = nrnb;
    }

    elapsed_time                                 = walltime_accounting_get_elapsed_time(walltime_accounting);
    elapsed_time_over_all_ranks                  = elapsed_time;
    elapsed_time_over_all_threads                = walltime_accounting_get_elapsed_time_over_all_threads(walltime_accounting);
    elapsed_time_over_all_threads_over_all_ranks = elapsed_time_over_all_threads;
#if GMX_MPI
    if (cr->nnodes > 1)
    {
        /* reduce elapsed_time over all MPI ranks in the current simulation */
        MPI_Allreduce(&elapsed_time,
                      &elapsed_time_over_all_ranks,
                      1, MPI_DOUBLE, MPI_SUM,
                      cr->mpi_comm_mysim);
        elapsed_time_over_all_ranks /= cr->nnodes;
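        /* Summing the per-rank elapsed times and dividing by the rank count
         * gives the average wall-clock time per rank, which is what the
         * performance report below is based on.
         */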
        /* Reduce elapsed_time_over_all_threads over all MPI ranks in the
         * current simulation. */
        MPI_Allreduce(&elapsed_time_over_all_threads,
                      &elapsed_time_over_all_threads_over_all_ranks,
                      1, MPI_DOUBLE, MPI_SUM,
                      cr->mpi_comm_mysim);
    }
#endif

    if (printReport)
    {
        print_flop(fplog, nrnb_tot, &nbfs, &mflop);
    }

    if (thisRankHasDuty(cr, DUTY_PP) && DOMAINDECOMP(cr))
    {
        print_dd_statistics(cr, inputrec, fplog);
    }

    /* TODO Move the responsibility for any scaling by thread counts
     * to the code that handled the thread region, so that there's a
     * mechanism to keep cycle counting working during the transition
     * to task parallelism. */
    int  nthreads_pp  = gmx_omp_nthreads_get(emntNonbonded);
    int  nthreads_pme = gmx_omp_nthreads_get(emntPME);
    wallcycle_scale_by_num_threads(wcycle, thisRankHasDuty(cr, DUTY_PME) && !thisRankHasDuty(cr, DUTY_PP), nthreads_pp, nthreads_pme);
    auto cycle_sum(wallcycle_sum(cr, wcycle));

    if (printReport)
    {
        auto                    nbnxn_gpu_timings = use_GPU(nbv) ? nbnxn_gpu_get_timings(nbv->gpu_nbv) : nullptr;
        gmx_wallclock_gpu_pme_t pme_gpu_timings   = {};
        if (pme_gpu_task_enabled(pme))
        {
            pme_gpu_get_timings(pme, &pme_gpu_timings);
        }
        wallcycle_print(fplog, mdlog, cr->nnodes, cr->npmenodes, nthreads_pp, nthreads_pme,
                        elapsed_time_over_all_ranks,
                        wcycle, cycle_sum,
                        nbnxn_gpu_timings,
                        &pme_gpu_timings);

        if (EI_DYNAMICS(inputrec->eI))
        {
            delta_t = inputrec->delta_t;
        }

        if (fplog)
        {
            print_perf(fplog, elapsed_time_over_all_threads_over_all_ranks,
                       elapsed_time_over_all_ranks,
                       walltime_accounting_get_nsteps_done(walltime_accounting),
                       delta_t, nbfs, mflop);
        }
        if (bWriteStat)
        {
            print_perf(stderr, elapsed_time_over_all_threads_over_all_ranks,
                       elapsed_time_over_all_ranks,
                       walltime_accounting_get_nsteps_done(walltime_accounting),
                       delta_t, nbfs, mflop);
        }
    }
}

extern void initialize_lambdas(FILE *fplog, t_inputrec *ir, int *fep_state, gmx::ArrayRef<real> lambda, double *lam0)
{
    /* This function works, but could probably use a logic rewrite to keep all the different
       types of efep straight. */

    if ((ir->efep == efepNO) && (ir->bSimTemp == FALSE))
    {
        return;
    }

    t_lambda *fep = ir->fepvals;
    *fep_state    = fep->init_fep_state; /* this might overwrite the checkpoint
                                            if checkpoint is set -- a kludge is in for now
                                            to prevent this. */

    for (int i = 0; i < efptNR; i++)
    {
        /* overwrite lambda state with init_lambda for now for backwards compatibility */
        if (fep->init_lambda >= 0) /* if it's -1, it was never initialized */
        {
            lambda[i] = fep->init_lambda;
            if (lam0)
            {
                lam0[i] = lambda[i];
            }
        }
        else
        {
            lambda[i] = fep->all_lambda[i][*fep_state];
            if (lam0)
            {
                lam0[i] = lambda[i];
            }
        }
    }

    if (ir->bSimTemp)
    {
        /* need to rescale control temperatures to match current state */
        for (int i = 0; i < ir->opts.ngtc; i++)
        {
            if (ir->opts.ref_t[i] > 0)
            {
                ir->opts.ref_t[i] = ir->simtempvals->temperatures[*fep_state];
            }
        }
    }
2873 if (fplog
!= nullptr)
2875 fprintf(fplog
, "Initial vector of lambda components:[ ");
2876 for (const auto &l
: lambda
)
2878 fprintf(fplog
, "%10.4f ", l
);
2880 fprintf(fplog
, "]\n");

void init_md(FILE *fplog,
             const t_commrec *cr, gmx::IMDOutputProvider *outputProvider,
             t_inputrec *ir, const gmx_output_env_t *oenv,
             const MdrunOptions &mdrunOptions,
             double *t, double *t0,
             t_state *globalState, double *lam0,
             t_nrnb *nrnb, gmx_mtop_t *mtop,
             int nfile, const t_filenm fnm[],
             gmx_mdoutf_t *outf, t_mdebin **mdebin,
             tensor force_vir, tensor shake_vir, rvec mu_tot,
             gmx_bool *bSimAnn, t_vcm **vcm,
             gmx_wallcycle_t wcycle)
{
    int i;

    /* Initial values */
    *t = *t0 = ir->init_t;

    *bSimAnn = FALSE;
    for (i = 0; i < ir->opts.ngtc; i++)
    {
        /* set bSimAnn if any group is being annealed */
        if (ir->opts.annealing[i] != eannNO)
        {
            *bSimAnn = TRUE;
        }
    }

    /* Initialize lambda variables */
    /* TODO: Clean up initialization of fep_state and lambda in t_state.
     * We currently need to call initialize_lambdas on non-master ranks
     * to initialize lam0.
     */
    if (globalState != nullptr)
    {
        initialize_lambdas(fplog, ir, &globalState->fep_state, globalState->lambda, lam0);
    }
    else
    {
        int                      tmpFepState;
        std::array<real, efptNR> tmpLambda;
        initialize_lambdas(fplog, ir, &tmpFepState, tmpLambda, lam0);
    }

    // TODO upd is never NULL in practice, but the analysers don't know that
    if (upd)
    {
        *upd = init_update(ir);
    }
    if (*bSimAnn)
    {
        update_annealing_target_temp(ir, ir->init_t, upd ? *upd : nullptr);
    }

    if (vcm != nullptr)
    {
        *vcm = init_vcm(fplog, &mtop->groups, ir);
    }

    if (EI_DYNAMICS(ir->eI) && !mdrunOptions.continuationOptions.appendFiles)
    {
        if (ir->etc == etcBERENDSEN)
        {
            please_cite(fplog, "Berendsen84a");
        }
        if (ir->etc == etcVRESCALE)
        {
            please_cite(fplog, "Bussi2007a");
        }
        if (ir->eI == eiSD1)
        {
            please_cite(fplog, "Goga2012");
        }
    }

    *outf   = init_mdoutf(fplog, nfile, fnm, mdrunOptions, cr, outputProvider, ir, mtop, oenv, wcycle);

    *mdebin = init_mdebin(mdrunOptions.continuationOptions.appendFiles ? nullptr : mdoutf_get_fp_ene(*outf),
                          mtop, ir, mdoutf_get_fp_dhdl(*outf));

    /* Initiate variables */
    clear_mat(force_vir);
    clear_mat(shake_vir);