Continue removing -nb gpu_cpu
[gromacs.git] / src / gromacs / mdlib / sim_util.cpp
blob 815b1d964d81dcfb0b8ccae37ee46ac068f5cb05
1 /*
2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5 * Copyright (c) 2001-2004, The GROMACS development team.
6 * Copyright (c) 2013,2014,2015,2016,2017, by the GROMACS development team, led by
7 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
8 * and including many others, as listed in the AUTHORS file in the
9 * top-level source directory and at http://www.gromacs.org.
11 * GROMACS is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public License
13 * as published by the Free Software Foundation; either version 2.1
14 * of the License, or (at your option) any later version.
16 * GROMACS is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * Lesser General Public License for more details.
21 * You should have received a copy of the GNU Lesser General Public
22 * License along with GROMACS; if not, see
23 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
24 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
26 * If you want to redistribute modifications to GROMACS, please
27 * consider that scientific software is very special. Version
28 * control is crucial - bugs must be traceable. We will be happy to
29 * consider code for inclusion in the official distribution, but
30 * derived work must not be called official GROMACS. Details are found
31 * in the README & COPYING files - if they are missing, get the
32 * official version at http://www.gromacs.org.
34 * To help us fund GROMACS development, we humbly ask that you cite
35 * the research papers on the package. Check out http://www.gromacs.org.
37 #include "gmxpre.h"
39 #include "sim_util.h"
41 #include "config.h"
43 #include <assert.h>
44 #include <math.h>
45 #include <stdio.h>
46 #include <string.h>
48 #include <cstdint>
50 #include <array>
52 #include "gromacs/domdec/dlbtiming.h"
53 #include "gromacs/domdec/domdec.h"
54 #include "gromacs/domdec/domdec_struct.h"
55 #include "gromacs/essentialdynamics/edsam.h"
56 #include "gromacs/ewald/pme.h"
57 #include "gromacs/gmxlib/chargegroup.h"
58 #include "gromacs/gmxlib/network.h"
59 #include "gromacs/gmxlib/nrnb.h"
60 #include "gromacs/gmxlib/nonbonded/nb_free_energy.h"
61 #include "gromacs/gmxlib/nonbonded/nb_kernel.h"
62 #include "gromacs/gmxlib/nonbonded/nonbonded.h"
63 #include "gromacs/imd/imd.h"
64 #include "gromacs/listed-forces/bonded.h"
65 #include "gromacs/listed-forces/disre.h"
66 #include "gromacs/listed-forces/orires.h"
67 #include "gromacs/math/functions.h"
68 #include "gromacs/math/units.h"
69 #include "gromacs/math/vec.h"
70 #include "gromacs/math/vecdump.h"
71 #include "gromacs/mdlib/calcmu.h"
72 #include "gromacs/mdlib/constr.h"
73 #include "gromacs/mdlib/force.h"
74 #include "gromacs/mdlib/forcerec.h"
75 #include "gromacs/mdlib/genborn.h"
76 #include "gromacs/mdlib/gmx_omp_nthreads.h"
77 #include "gromacs/mdlib/mdrun.h"
78 #include "gromacs/mdlib/nb_verlet.h"
79 #include "gromacs/mdlib/nbnxn_atomdata.h"
80 #include "gromacs/mdlib/nbnxn_gpu_data_mgmt.h"
81 #include "gromacs/mdlib/nbnxn_grid.h"
82 #include "gromacs/mdlib/nbnxn_search.h"
83 #include "gromacs/mdlib/qmmm.h"
84 #include "gromacs/mdlib/update.h"
85 #include "gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_gpu_ref.h"
86 #include "gromacs/mdtypes/commrec.h"
87 #include "gromacs/mdtypes/iforceprovider.h"
88 #include "gromacs/mdtypes/inputrec.h"
89 #include "gromacs/mdtypes/md_enums.h"
90 #include "gromacs/mdtypes/state.h"
91 #include "gromacs/pbcutil/ishift.h"
92 #include "gromacs/pbcutil/mshift.h"
93 #include "gromacs/pbcutil/pbc.h"
94 #include "gromacs/pulling/pull.h"
95 #include "gromacs/pulling/pull_rotation.h"
96 #include "gromacs/timing/cyclecounter.h"
97 #include "gromacs/timing/gpu_timing.h"
98 #include "gromacs/timing/wallcycle.h"
99 #include "gromacs/timing/wallcyclereporting.h"
100 #include "gromacs/timing/walltime_accounting.h"
101 #include "gromacs/utility/arrayref.h"
102 #include "gromacs/utility/basedefinitions.h"
103 #include "gromacs/utility/cstringutil.h"
104 #include "gromacs/utility/exceptions.h"
105 #include "gromacs/utility/fatalerror.h"
106 #include "gromacs/utility/gmxassert.h"
107 #include "gromacs/utility/gmxmpi.h"
108 #include "gromacs/utility/logger.h"
109 #include "gromacs/utility/pleasecite.h"
110 #include "gromacs/utility/smalloc.h"
111 #include "gromacs/utility/sysinfo.h"
113 #include "nbnxn_gpu.h"
114 #include "nbnxn_kernels/nbnxn_kernel_cpu.h"
115 #include "nbnxn_kernels/nbnxn_kernel_prune.h"
117 void print_time(FILE *out,
118 gmx_walltime_accounting_t walltime_accounting,
119 gmx_int64_t step,
120 t_inputrec *ir,
121 t_commrec gmx_unused *cr)
123 time_t finish;
124 char timebuf[STRLEN];
125 double dt, elapsed_seconds, time_per_step;
126 char buf[48];
128 #if !GMX_THREAD_MPI
129 if (!PAR(cr))
130 #endif
132 fprintf(out, "\r");
134 fprintf(out, "step %s", gmx_step_str(step, buf));
135 fflush(out);
137 if ((step >= ir->nstlist))
139 double seconds_since_epoch = gmx_gettime();
140 elapsed_seconds = seconds_since_epoch - walltime_accounting_get_start_time_stamp(walltime_accounting);
141 time_per_step = elapsed_seconds/(step - ir->init_step + 1);
142 dt = (ir->nsteps + ir->init_step - step) * time_per_step;
144 if (ir->nsteps >= 0)
146 if (dt >= 300)
148 finish = (time_t) (seconds_since_epoch + dt);
149 gmx_ctime_r(&finish, timebuf, STRLEN);
150 sprintf(buf, "%s", timebuf);
151 buf[strlen(buf)-1] = '\0';
152 fprintf(out, ", will finish %s", buf);
154 else
156 fprintf(out, ", remaining wall clock time: %5d s ", (int)dt);
159 else
161 fprintf(out, " performance: %.1f ns/day ",
162 ir->delta_t/1000*24*60*60/time_per_step);
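/* Note on the estimate printed above: ir->delta_t is in ps, so delta_t/1000 is the
 * number of ns simulated per MD step; dividing by the wall-clock seconds per step
 * and scaling by 24*60*60 (seconds per day) gives the reported ns/day.
 */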
165 #if !GMX_THREAD_MPI
166 if (PAR(cr))
168 fprintf(out, "\n");
170 #endif
172 fflush(out);
175 void print_date_and_time(FILE *fplog, int nodeid, const char *title,
176 double the_time)
178 char time_string[STRLEN];
180 if (!fplog)
182 return;
186 int i;
187 char timebuf[STRLEN];
188 time_t temp_time = (time_t) the_time;
190 gmx_ctime_r(&temp_time, timebuf, STRLEN);
191 for (i = 0; timebuf[i] >= ' '; i++)
193 time_string[i] = timebuf[i];
195 time_string[i] = '\0';
198 fprintf(fplog, "%s on rank %d %s\n", title, nodeid, time_string);
201 void print_start(FILE *fplog, t_commrec *cr,
202 gmx_walltime_accounting_t walltime_accounting,
203 const char *name)
205 char buf[STRLEN];
207 sprintf(buf, "Started %s", name);
208 print_date_and_time(fplog, cr->nodeid, buf,
209 walltime_accounting_get_start_time_stamp(walltime_accounting));
212 static void sum_forces(rvec f[], const PaddedRVecVector *forceToAdd)
214 /* TODO: remove this - 1 when padding is properly implemented */
215 int end = forceToAdd->size() - 1;
216 const rvec *fAdd = as_rvec_array(forceToAdd->data());
218 // cppcheck-suppress unreadVariable
219 int gmx_unused nt = gmx_omp_nthreads_get(emntDefault);
220 #pragma omp parallel for num_threads(nt) schedule(static)
221 for (int i = 0; i < end; i++)
223 rvec_inc(f[i], fAdd[i]);
227 static void calc_virial(int start, int homenr, rvec x[], rvec f[],
228 tensor vir_part, t_graph *graph, matrix box,
229 t_nrnb *nrnb, const t_forcerec *fr, int ePBC)
231 int i;
233 /* The short-range virial from surrounding boxes */
234 calc_vir(SHIFTS, fr->shift_vec, fr->fshift, vir_part, ePBC == epbcSCREW, box);
235 inc_nrnb(nrnb, eNR_VIRIAL, SHIFTS);
237 /* Calculate partial virial, for local atoms only, based on short range.
238 * Total virial is computed in global_stat, called from do_md
240 f_calc_vir(start, start+homenr, x, f, vir_part, graph, box);
241 inc_nrnb(nrnb, eNR_VIRIAL, homenr);
243 /* Add position restraint contribution */
244 for (i = 0; i < DIM; i++)
246 vir_part[i][i] += fr->vir_diag_posres[i];
249 /* Add wall contribution */
250 for (i = 0; i < DIM; i++)
252 vir_part[i][ZZ] += fr->vir_wall_z[i];
255 if (debug)
257 pr_rvecs(debug, 0, "vir_part", vir_part, DIM);
261 static void pull_potential_wrapper(t_commrec *cr,
262 t_inputrec *ir,
263 matrix box, rvec x[],
264 rvec f[],
265 tensor vir_force,
266 t_mdatoms *mdatoms,
267 gmx_enerdata_t *enerd,
268 real *lambda,
269 double t,
270 gmx_wallcycle_t wcycle)
272 t_pbc pbc;
273 real dvdl;
275 /* Calculate the center of mass forces; this requires communication,
276 * which is why pull_potential is called close to other communication.
277 * The virial contribution is calculated directly,
278 * which is why we call pull_potential after calc_virial.
280 wallcycle_start(wcycle, ewcPULLPOT);
281 set_pbc(&pbc, ir->ePBC, box);
282 dvdl = 0;
283 enerd->term[F_COM_PULL] +=
284 pull_potential(ir->pull_work, mdatoms, &pbc,
285 cr, t, lambda[efptRESTRAINT], x, f, vir_force, &dvdl);
286 enerd->dvdl_lin[efptRESTRAINT] += dvdl;
287 wallcycle_stop(wcycle, ewcPULLPOT);
290 static void pme_receive_force_ener(t_commrec *cr,
291 gmx_wallcycle_t wcycle,
292 gmx_enerdata_t *enerd,
293 t_forcerec *fr)
295 real e_q, e_lj, dvdl_q, dvdl_lj;
296 float cycles_ppdpme, cycles_seppme;
298 cycles_ppdpme = wallcycle_stop(wcycle, ewcPPDURINGPME);
299 dd_cycles_add(cr->dd, cycles_ppdpme, ddCyclPPduringPME);
301 /* In case of node-splitting, the PP nodes receive the long-range
302 * forces, virial and energy from the PME nodes here.
304 wallcycle_start(wcycle, ewcPP_PMEWAITRECVF);
305 dvdl_q = 0;
306 dvdl_lj = 0;
307 gmx_pme_receive_f(cr, as_rvec_array(fr->f_novirsum->data()), fr->vir_el_recip, &e_q,
308 fr->vir_lj_recip, &e_lj, &dvdl_q, &dvdl_lj,
309 &cycles_seppme);
310 enerd->term[F_COUL_RECIP] += e_q;
311 enerd->term[F_LJ_RECIP] += e_lj;
312 enerd->dvdl_lin[efptCOUL] += dvdl_q;
313 enerd->dvdl_lin[efptVDW] += dvdl_lj;
315 if (wcycle)
317 dd_cycles_add(cr->dd, cycles_seppme, ddCyclPME);
319 wallcycle_stop(wcycle, ewcPP_PMEWAITRECVF);
322 static void print_large_forces(FILE *fp, t_mdatoms *md, t_commrec *cr,
323 gmx_int64_t step, real forceTolerance,
324 const rvec *x, const rvec *f)
326 real force2Tolerance = gmx::square(forceTolerance);
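/* Comparing squared norms against the squared tolerance avoids a sqrt per atom;
 * the sqrt is only taken below for the few atoms that are actually printed.
 */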
327 std::uintmax_t numNonFinite = 0;
328 for (int i = 0; i < md->homenr; i++)
330 real force2 = norm2(f[i]);
331 bool nonFinite = !std::isfinite(force2);
332 if (force2 >= force2Tolerance || nonFinite)
334 fprintf(fp, "step %" GMX_PRId64 " atom %6d x %8.3f %8.3f %8.3f force %12.5e\n",
335 step,
336 ddglatnr(cr->dd, i), x[i][XX], x[i][YY], x[i][ZZ], std::sqrt(force2));
338 if (nonFinite)
340 numNonFinite++;
343 if (numNonFinite > 0)
345 /* Note that with MPI this fatal call on one rank might interrupt
346 * the printing on other ranks. But we can only avoid that with
347 * an expensive MPI barrier that we would need at each step.
349 gmx_fatal(FARGS, "At step %" GMX_PRId64 " detected non-finite forces on %ju atoms", step, numNonFinite);
353 static void post_process_forces(t_commrec *cr,
354 gmx_int64_t step,
355 t_nrnb *nrnb, gmx_wallcycle_t wcycle,
356 gmx_localtop_t *top,
357 matrix box, rvec x[],
358 rvec f[],
359 tensor vir_force,
360 t_mdatoms *mdatoms,
361 t_graph *graph,
362 t_forcerec *fr, gmx_vsite_t *vsite,
363 int flags)
365 if (fr->bF_NoVirSum)
367 if (vsite)
369 /* Spread the mesh force on virtual sites to the other particles...
370 * This is parallelized. MPI communication is performed
371 * if the constructing atoms aren't local.
373 wallcycle_start(wcycle, ewcVSITESPREAD);
374 spread_vsite_f(vsite, x, as_rvec_array(fr->f_novirsum->data()), nullptr,
375 (flags & GMX_FORCE_VIRIAL), fr->vir_el_recip,
376 nrnb,
377 &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr);
378 wallcycle_stop(wcycle, ewcVSITESPREAD);
380 if (flags & GMX_FORCE_VIRIAL)
382 /* Now add the forces, this is local */
383 sum_forces(f, fr->f_novirsum);
385 if (EEL_FULL(fr->ic->eeltype))
387 /* Add the mesh contribution to the virial */
388 m_add(vir_force, fr->vir_el_recip, vir_force);
390 if (EVDW_PME(fr->ic->vdwtype))
392 /* Add the mesh contribution to the virial */
393 m_add(vir_force, fr->vir_lj_recip, vir_force);
395 if (debug)
397 pr_rvecs(debug, 0, "vir_force", vir_force, DIM);
402 if (fr->print_force >= 0)
404 print_large_forces(stderr, mdatoms, cr, step, fr->print_force, x, f);
408 static void do_nb_verlet(t_forcerec *fr,
409 interaction_const_t *ic,
410 gmx_enerdata_t *enerd,
411 int flags, int ilocality,
412 int clearF,
413 gmx_int64_t step,
414 t_nrnb *nrnb,
415 gmx_wallcycle_t wcycle)
417 if (!(flags & GMX_FORCE_NONBONDED))
419 /* skip non-bonded calculation */
420 return;
423 nonbonded_verlet_t *nbv = fr->nbv;
424 nonbonded_verlet_group_t *nbvg = &nbv->grp[ilocality];
426 /* GPU kernel launch overhead is already timed separately */
427 if (fr->cutoff_scheme != ecutsVERLET)
429 gmx_incons("Invalid cut-off scheme passed!");
432 bool bUsingGpuKernels = (nbvg->kernel_type == nbnxnk8x8x8_GPU);
434 if (!bUsingGpuKernels)
436 /* When dynamic pair-list pruning is requested, we need to prune
437 * at nstlistPrune steps.
439 if (nbv->listParams->useDynamicPruning &&
440 (step - nbvg->nbl_lists.outerListCreationStep) % nbv->listParams->nstlistPrune == 0)
442 /* Prune the pair-list beyond fr->ic->rlistPrune using
443 * the current coordinates of the atoms.
445 wallcycle_sub_start(wcycle, ewcsNONBONDED_PRUNING);
446 nbnxn_kernel_cpu_prune(nbvg, nbv->nbat, fr->shift_vec, nbv->listParams->rlistInner);
447 wallcycle_sub_stop(wcycle, ewcsNONBONDED_PRUNING);
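/* Illustrative example for the pruning pass above (settings assumed, not taken
 * from this code): with an outer list rebuilt every nstlist = 100 steps and
 * nstlistPrune = 10, the outer list is re-filtered against rlistInner every 10
 * steps using the current coordinates, so pairs that have drifted beyond
 * rlistInner are left out of the list the force kernels loop over.
 */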
450 wallcycle_sub_start(wcycle, ewcsNONBONDED);
453 switch (nbvg->kernel_type)
455 case nbnxnk4x4_PlainC:
456 case nbnxnk4xN_SIMD_4xN:
457 case nbnxnk4xN_SIMD_2xNN:
458 nbnxn_kernel_cpu(nbvg,
459 nbv->nbat,
461 fr->shift_vec,
462 flags,
463 clearF,
464 fr->fshift[0],
465 enerd->grpp.ener[egCOULSR],
466 fr->bBHAM ?
467 enerd->grpp.ener[egBHAMSR] :
468 enerd->grpp.ener[egLJSR]);
469 break;
471 case nbnxnk8x8x8_GPU:
472 nbnxn_gpu_launch_kernel(nbv->gpu_nbv, nbv->nbat, flags, ilocality);
473 break;
475 case nbnxnk8x8x8_PlainC:
476 nbnxn_kernel_gpu_ref(nbvg->nbl_lists.nbl[0],
477 nbv->nbat, ic,
478 fr->shift_vec,
479 flags,
480 clearF,
481 nbv->nbat->out[0].f,
482 fr->fshift[0],
483 enerd->grpp.ener[egCOULSR],
484 fr->bBHAM ?
485 enerd->grpp.ener[egBHAMSR] :
486 enerd->grpp.ener[egLJSR]);
487 break;
489 default:
490 GMX_RELEASE_ASSERT(false, "Invalid nonbonded kernel type passed!");
493 if (!bUsingGpuKernels)
495 wallcycle_sub_stop(wcycle, ewcsNONBONDED);
498 int enr_nbnxn_kernel_ljc, enr_nbnxn_kernel_lj;
499 if (EEL_RF(ic->eeltype) || ic->eeltype == eelCUT)
501 enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_RF;
503 else if ((!bUsingGpuKernels && nbvg->ewald_excl == ewaldexclAnalytical) ||
504 (bUsingGpuKernels && nbnxn_gpu_is_kernel_ewald_analytical(nbv->gpu_nbv)))
506 enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_EWALD;
508 else
510 enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_TAB;
512 enr_nbnxn_kernel_lj = eNR_NBNXN_LJ;
513 if (flags & GMX_FORCE_ENERGY)
515 /* In eNR_??? the nbnxn F+E kernels are always the F kernel + 1 */
516 enr_nbnxn_kernel_ljc += 1;
517 enr_nbnxn_kernel_lj += 1;
520 inc_nrnb(nrnb, enr_nbnxn_kernel_ljc,
521 nbvg->nbl_lists.natpair_ljq);
522 inc_nrnb(nrnb, enr_nbnxn_kernel_lj,
523 nbvg->nbl_lists.natpair_lj);
524 /* The Coulomb-only kernels are offset -eNR_NBNXN_LJ_RF+eNR_NBNXN_RF */
525 inc_nrnb(nrnb, enr_nbnxn_kernel_ljc-eNR_NBNXN_LJ_RF+eNR_NBNXN_RF,
526 nbvg->nbl_lists.natpair_q);
528 if (ic->vdw_modifier == eintmodFORCESWITCH)
530 /* We add up the switch cost separately */
531 inc_nrnb(nrnb, eNR_NBNXN_ADD_LJ_FSW+((flags & GMX_FORCE_ENERGY) ? 1 : 0),
532 nbvg->nbl_lists.natpair_ljq + nbvg->nbl_lists.natpair_lj);
534 if (ic->vdw_modifier == eintmodPOTSWITCH)
536 /* We add up the switch cost separately */
537 inc_nrnb(nrnb, eNR_NBNXN_ADD_LJ_PSW+((flags & GMX_FORCE_ENERGY) ? 1 : 0),
538 nbvg->nbl_lists.natpair_ljq + nbvg->nbl_lists.natpair_lj);
540 if (ic->vdwtype == evdwPME)
542 /* We add up the LJ Ewald cost separately */
543 inc_nrnb(nrnb, eNR_NBNXN_ADD_LJ_EWALD+((flags & GMX_FORCE_ENERGY) ? 1 : 0),
544 nbvg->nbl_lists.natpair_ljq + nbvg->nbl_lists.natpair_lj);
548 static void do_nb_verlet_fep(nbnxn_pairlist_set_t *nbl_lists,
549 t_forcerec *fr,
550 rvec x[],
551 rvec f[],
552 t_mdatoms *mdatoms,
553 t_lambda *fepvals,
554 real *lambda,
555 gmx_enerdata_t *enerd,
556 int flags,
557 t_nrnb *nrnb,
558 gmx_wallcycle_t wcycle)
560 int donb_flags;
561 nb_kernel_data_t kernel_data;
562 real lam_i[efptNR];
563 real dvdl_nb[efptNR];
564 int th;
565 int i, j;
567 donb_flags = 0;
568 /* Add short-range interactions */
569 donb_flags |= GMX_NONBONDED_DO_SR;
571 /* Currently all group scheme kernels always calculate (shift-)forces */
572 if (flags & GMX_FORCE_FORCES)
574 donb_flags |= GMX_NONBONDED_DO_FORCE;
576 if (flags & GMX_FORCE_VIRIAL)
578 donb_flags |= GMX_NONBONDED_DO_SHIFTFORCE;
580 if (flags & GMX_FORCE_ENERGY)
582 donb_flags |= GMX_NONBONDED_DO_POTENTIAL;
585 kernel_data.flags = donb_flags;
586 kernel_data.lambda = lambda;
587 kernel_data.dvdl = dvdl_nb;
589 kernel_data.energygrp_elec = enerd->grpp.ener[egCOULSR];
590 kernel_data.energygrp_vdw = enerd->grpp.ener[egLJSR];
592 /* reset free energy components */
593 for (i = 0; i < efptNR; i++)
595 dvdl_nb[i] = 0;
598 assert(gmx_omp_nthreads_get(emntNonbonded) == nbl_lists->nnbl);
600 wallcycle_sub_start(wcycle, ewcsNONBONDED);
601 #pragma omp parallel for schedule(static) num_threads(nbl_lists->nnbl)
602 for (th = 0; th < nbl_lists->nnbl; th++)
606 gmx_nb_free_energy_kernel(nbl_lists->nbl_fep[th],
607 x, f, fr, mdatoms, &kernel_data, nrnb);
609 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
612 if (fepvals->sc_alpha != 0)
614 enerd->dvdl_nonlin[efptVDW] += dvdl_nb[efptVDW];
615 enerd->dvdl_nonlin[efptCOUL] += dvdl_nb[efptCOUL];
617 else
619 enerd->dvdl_lin[efptVDW] += dvdl_nb[efptVDW];
620 enerd->dvdl_lin[efptCOUL] += dvdl_nb[efptCOUL];
623 /* If we do foreign lambda and we have soft-core interactions
624 * we have to recalculate the (non-linear) energy contributions.
626 if (fepvals->n_lambda > 0 && (flags & GMX_FORCE_DHDL) && fepvals->sc_alpha != 0)
628 kernel_data.flags = (donb_flags & ~(GMX_NONBONDED_DO_FORCE | GMX_NONBONDED_DO_SHIFTFORCE)) | GMX_NONBONDED_DO_FOREIGNLAMBDA;
629 kernel_data.lambda = lam_i;
630 kernel_data.energygrp_elec = enerd->foreign_grpp.ener[egCOULSR];
631 kernel_data.energygrp_vdw = enerd->foreign_grpp.ener[egLJSR];
632 /* Note that we add to kernel_data.dvdl, but ignore the result */
634 for (i = 0; i < enerd->n_lambda; i++)
636 for (j = 0; j < efptNR; j++)
638 lam_i[j] = (i == 0 ? lambda[j] : fepvals->all_lambda[j][i-1]);
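/* Entry i == 0 of the foreign-lambda loop re-evaluates the current lambda state
 * itself; entries i > 0 use foreign lambda i-1 from all_lambda.
 */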
640 reset_foreign_enerdata(enerd);
641 #pragma omp parallel for schedule(static) num_threads(nbl_lists->nnbl)
642 for (th = 0; th < nbl_lists->nnbl; th++)
646 gmx_nb_free_energy_kernel(nbl_lists->nbl_fep[th],
647 x, f, fr, mdatoms, &kernel_data, nrnb);
649 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
652 sum_epot(&(enerd->foreign_grpp), enerd->foreign_term);
653 enerd->enerpart_lambda[i] += enerd->foreign_term[F_EPOT];
657 wallcycle_sub_stop(wcycle, ewcsNONBONDED);
660 gmx_bool use_GPU(const nonbonded_verlet_t *nbv)
662 return nbv != nullptr && nbv->bUseGPU;
665 static gmx_inline void clear_rvecs_omp(int n, rvec v[])
667 int nth = gmx_omp_nthreads_get_simple_rvec_task(emntDefault, n);
669 /* Note that we would like to avoid this conditional by putting it
670 * into the omp pragma instead, but then we still take the full
671 * omp parallel for overhead (at least with gcc5).
673 if (nth == 1)
675 for (int i = 0; i < n; i++)
677 clear_rvec(v[i]);
680 else
682 #pragma omp parallel for num_threads(nth) schedule(static)
683 for (int i = 0; i < n; i++)
685 clear_rvec(v[i]);
690 /*! \brief This routine checks if the potential energy is finite.
692 * Note that passing this check does not guarantee finite forces,
693 * since those use slightly different arithmetic. But in most cases
694 * there is just a narrow coordinate range where forces are not finite
695 * and energies are finite.
697 * \param[in] enerd The energy data; the non-bonded group energies need to be added in here before calling this routine
699 static void checkPotentialEnergyValidity(const gmx_enerdata_t *enerd)
701 if (!std::isfinite(enerd->term[F_EPOT]))
703 gmx_fatal(FARGS, "The total potential energy is %g, which is not finite. The LJ and electrostatic contributions to the energy are %g and %g, respectively. A non-finite potential energy can be caused by overlapping interactions in bonded interactions or very large or NaN coordinate values. Usually this is caused by a badly or non-equilibrated initial configuration or incorrect interactions or parameters in the topology.",
704 enerd->term[F_EPOT],
705 enerd->term[F_LJ],
706 enerd->term[F_COUL_SR]);
710 /*! \brief Compute forces and/or energies for special algorithms
712 * The intention is to collect all calls to algorithms that compute
713 * forces on local atoms only and that do not contribute to the local
714 * virial sum (but add their virial contribution separately).
715 * Eventually these should likely all become ForceProviders.
716 * Within this function the intention is to have algorithms that do
717 * global communication at the end, so global barriers within the MD loop
718 * are as close together as possible.
720 * \param[in] cr The communication record
721 * \param[in] inputrec The input record
722 * \param[in] step The current MD step
723 * \param[in] t The current time
724 * \param[in,out] wcycle Wallcycle accounting struct
725 * \param[in,out] forceProviders Pointer to a list of force providers
726 * \param[in] box The unit cell
727 * \param[in] x The coordinates
728 * \param[in] mdatoms Per atom properties
729 * \param[in] lambda Array of free-energy lambda values
730 * \param[in] forceFlags Flags that tell whether we should compute forces/energies/virial
731 * \param[in,out] force Force buffer; does not contribute to the virial
732 * \param[in,out] virial Virial buffer
733 * \param[in,out] enerd Energy buffer
734 * \param[in,out] ed Essential dynamics pointer
735 * \param[in] bNS Tells if we did neighbor searching this step, used for ED sampling
737 * \todo Remove bNS, which is used incorrectly.
738 * \todo Convert all other algorithms called here to ForceProviders.
740 static void
741 computeSpecialForces(t_commrec *cr,
742 t_inputrec *inputrec,
743 gmx_int64_t step,
744 double t,
745 gmx_wallcycle_t wcycle,
746 ForceProviders *forceProviders,
747 matrix box,
748 rvec *x,
749 t_mdatoms *mdatoms,
750 real *lambda,
751 int forceFlags,
752 PaddedRVecVector *force,
753 tensor virial,
754 gmx_enerdata_t *enerd,
755 gmx_edsam_t ed,
756 gmx_bool bNS)
758 const bool computeForces = (forceFlags & GMX_FORCE_FORCES);
760 /* NOTE: Currently all ForceProviders only provide forces.
761 * When they also provide energies, remove this conditional.
763 if (computeForces)
765 /* Collect forces from modules */
766 gmx::ArrayRef<gmx::RVec> f = *force;
768 forceProviders->calculateForces(cr, mdatoms, box, t, x, f);
771 rvec *f = as_rvec_array(force->data());
773 if (inputrec->bPull && pull_have_potential(inputrec->pull_work))
775 pull_potential_wrapper(cr, inputrec, box, x,
776 f, virial, mdatoms, enerd, lambda, t,
777 wcycle);
780 /* Add the forces from enforced rotation potentials (if any) */
781 if (inputrec->bRot)
783 wallcycle_start(wcycle, ewcROTadd);
784 enerd->term[F_COM_PULL] += add_rot_forces(inputrec->rot, f, cr, step, t);
785 wallcycle_stop(wcycle, ewcROTadd);
788 if (ed)
790 /* Note that since init_edsam() is called after the initialization
791 * of forcerec, edsam doesn't request the noVirSum force buffer.
792 * Thus if no other algorithm (e.g. PME) requires it, the forces
793 * here will contribute to the virial.
795 do_flood(cr, inputrec, x, f, ed, box, step, bNS);
798 /* Add forces from interactive molecular dynamics (IMD), if bIMD == TRUE. */
799 if (inputrec->bIMD && computeForces)
801 IMD_apply_forces(inputrec->bIMD, inputrec->imd, cr, f, wcycle);
805 static void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
806 t_inputrec *inputrec,
807 gmx_int64_t step, t_nrnb *nrnb, gmx_wallcycle_t wcycle,
808 gmx_localtop_t *top,
809 gmx_groups_t gmx_unused *groups,
810 matrix box, rvec x[], history_t *hist,
811 PaddedRVecVector *force,
812 tensor vir_force,
813 t_mdatoms *mdatoms,
814 gmx_enerdata_t *enerd, t_fcdata *fcd,
815 real *lambda, t_graph *graph,
816 t_forcerec *fr, interaction_const_t *ic,
817 gmx_vsite_t *vsite, rvec mu_tot,
818 double t, gmx_edsam_t ed,
819 gmx_bool bBornRadii,
820 int flags,
821 DdOpenBalanceRegionBeforeForceComputation ddOpenBalanceRegion,
822 DdCloseBalanceRegionAfterForceComputation ddCloseBalanceRegion)
824 int cg1, i, j;
825 double mu[2*DIM];
826 gmx_bool bStateChanged, bNS, bFillGrid, bCalcCGCM;
827 gmx_bool bDoForces, bUseGPU, bUseOrEmulGPU;
828 rvec vzero, box_diag;
829 float cycles_pme, cycles_wait_gpu;
830 nonbonded_verlet_t *nbv = fr->nbv;
832 bStateChanged = (flags & GMX_FORCE_STATECHANGED);
833 bNS = (flags & GMX_FORCE_NS) && (fr->bAllvsAll == FALSE);
834 bFillGrid = (bNS && bStateChanged);
835 bCalcCGCM = (bFillGrid && !DOMAINDECOMP(cr));
836 bDoForces = (flags & GMX_FORCE_FORCES);
837 bUseGPU = fr->nbv->bUseGPU;
838 bUseOrEmulGPU = bUseGPU || (fr->nbv->emulateGpu == EmulateGpuNonbonded::Yes);
840 /* At a search step we need to start the first balancing region
841 * somewhere early inside the step after communication during domain
842 * decomposition (and not during the previous step as usual).
844 if (bNS &&
845 ddOpenBalanceRegion == DdOpenBalanceRegionBeforeForceComputation::yes)
847 ddOpenBalanceRegionCpu(cr->dd, DdAllowBalanceRegionReopen::yes);
850 cycles_wait_gpu = 0;
852 const int start = 0;
853 const int homenr = mdatoms->homenr;
855 clear_mat(vir_force);
857 if (DOMAINDECOMP(cr))
859 cg1 = cr->dd->ncg_tot;
861 else
863 cg1 = top->cgs.nr;
865 if (fr->n_tpi > 0)
867 cg1--;
870 if (bStateChanged)
872 update_forcerec(fr, box);
874 if (inputrecNeedMutot(inputrec))
876 /* Calculate total (local) dipole moment in a temporary common array.
877 * This makes it possible to sum them over nodes faster.
879 calc_mu(start, homenr,
880 x, mdatoms->chargeA, mdatoms->chargeB, mdatoms->nChargePerturbed,
881 mu, mu+DIM);
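/* mu[0..DIM-1] holds the A-state dipole and mu[DIM..2*DIM-1] the B-state dipole,
 * so a single reduction of 2*DIM doubles (the gmx_sumd call further down)
 * suffices when running in parallel.
 */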
885 if (fr->ePBC != epbcNONE)
887 /* Compute shift vectors every step,
888 * because of pressure coupling or box deformation!
890 if ((flags & GMX_FORCE_DYNAMICBOX) && bStateChanged)
892 calc_shifts(box, fr->shift_vec);
895 if (bCalcCGCM)
897 put_atoms_in_box_omp(fr->ePBC, box, homenr, x);
898 inc_nrnb(nrnb, eNR_SHIFTX, homenr);
900 else if (EI_ENERGY_MINIMIZATION(inputrec->eI) && graph)
902 unshift_self(graph, box, x);
906 nbnxn_atomdata_copy_shiftvec(flags & GMX_FORCE_DYNAMICBOX,
907 fr->shift_vec, nbv->nbat);
909 #if GMX_MPI
910 if (!(cr->duty & DUTY_PME))
912 gmx_bool bBS;
913 matrix boxs;
915 /* Send particle coordinates to the pme nodes.
916 * Since this is only implemented for domain decomposition
917 * and domain decomposition does not use the graph,
918 * we do not need to worry about shifting.
921 wallcycle_start(wcycle, ewcPP_PMESENDX);
923 bBS = (inputrec->nwall == 2);
924 if (bBS)
926 copy_mat(box, boxs);
927 svmul(inputrec->wall_ewald_zfac, boxs[ZZ], boxs[ZZ]);
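/* With two walls the box sent to PME is elongated along z by wall_ewald_zfac
 * (the wall-ewald-zfac mdp option), which is intended to keep the periodic
 * images along z well separated in the reciprocal-space sum.
 */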
930 gmx_pme_send_coordinates(cr, bBS ? boxs : box, x,
931 lambda[efptCOUL], lambda[efptVDW],
932 (flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY)),
933 step);
935 wallcycle_stop(wcycle, ewcPP_PMESENDX);
937 #endif /* GMX_MPI */
939 /* do gridding for pair search */
940 if (bNS)
942 if (graph && bStateChanged)
944 /* Calculate intramolecular shift vectors to make molecules whole */
945 mk_mshift(fplog, graph, fr->ePBC, box, x);
948 clear_rvec(vzero);
949 box_diag[XX] = box[XX][XX];
950 box_diag[YY] = box[YY][YY];
951 box_diag[ZZ] = box[ZZ][ZZ];
953 wallcycle_start(wcycle, ewcNS);
954 if (!fr->bDomDec)
956 wallcycle_sub_start(wcycle, ewcsNBS_GRID_LOCAL);
957 nbnxn_put_on_grid(nbv->nbs, fr->ePBC, box,
958 0, vzero, box_diag,
959 0, mdatoms->homenr, -1, fr->cginfo, x,
960 0, nullptr,
961 nbv->grp[eintLocal].kernel_type,
962 nbv->nbat);
963 wallcycle_sub_stop(wcycle, ewcsNBS_GRID_LOCAL);
965 else
967 wallcycle_sub_start(wcycle, ewcsNBS_GRID_NONLOCAL);
968 nbnxn_put_on_grid_nonlocal(nbv->nbs, domdec_zones(cr->dd),
969 fr->cginfo, x,
970 nbv->grp[eintNonlocal].kernel_type,
971 nbv->nbat);
972 wallcycle_sub_stop(wcycle, ewcsNBS_GRID_NONLOCAL);
975 nbnxn_atomdata_set(nbv->nbat, nbv->nbs, mdatoms, fr->cginfo);
976 wallcycle_stop(wcycle, ewcNS);
979 /* initialize the GPU atom data and copy shift vector */
980 if (bUseGPU)
982 wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
983 wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
985 if (bNS)
987 nbnxn_gpu_init_atomdata(nbv->gpu_nbv, nbv->nbat);
990 nbnxn_gpu_upload_shiftvec(nbv->gpu_nbv, nbv->nbat);
992 wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
993 wallcycle_stop(wcycle, ewcLAUNCH_GPU);
996 /* do local pair search */
997 if (bNS)
999 wallcycle_start_nocount(wcycle, ewcNS);
1000 wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_LOCAL);
1001 nbnxn_make_pairlist(nbv->nbs, nbv->nbat,
1002 &top->excls,
1003 nbv->listParams->rlistOuter,
1004 nbv->min_ci_balanced,
1005 &nbv->grp[eintLocal].nbl_lists,
1006 eintLocal,
1007 nbv->grp[eintLocal].kernel_type,
1008 nrnb);
1009 nbv->grp[eintLocal].nbl_lists.outerListCreationStep = step;
1010 if (nbv->listParams->useDynamicPruning && !bUseGPU)
1012 nbnxnPrepareListForDynamicPruning(&nbv->grp[eintLocal].nbl_lists);
1014 wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_LOCAL);
1016 if (bUseGPU)
1018 /* initialize local pair-list on the GPU */
1019 nbnxn_gpu_init_pairlist(nbv->gpu_nbv,
1020 nbv->grp[eintLocal].nbl_lists.nbl[0],
1021 eintLocal);
1023 wallcycle_stop(wcycle, ewcNS);
1025 else
1027 wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
1028 wallcycle_sub_start(wcycle, ewcsNB_X_BUF_OPS);
1029 nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs, eatLocal, FALSE, x,
1030 nbv->nbat);
1031 wallcycle_sub_stop(wcycle, ewcsNB_X_BUF_OPS);
1032 wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
1035 if (bUseGPU)
1037 if (DOMAINDECOMP(cr))
1039 ddOpenBalanceRegionGpu(cr->dd);
1042 wallcycle_start(wcycle, ewcLAUNCH_GPU);
1043 wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1044 /* launch local nonbonded work on GPU */
1045 do_nb_verlet(fr, ic, enerd, flags, eintLocal, enbvClearFNo,
1046 step, nrnb, wcycle);
1047 wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1048 wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1051 /* Communicate coordinates and sum dipole if necessary +
1052 do non-local pair search */
1053 if (DOMAINDECOMP(cr))
1055 if (bNS)
1057 wallcycle_start_nocount(wcycle, ewcNS);
1058 wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_NONLOCAL);
1060 nbnxn_make_pairlist(nbv->nbs, nbv->nbat,
1061 &top->excls,
1062 nbv->listParams->rlistOuter,
1063 nbv->min_ci_balanced,
1064 &nbv->grp[eintNonlocal].nbl_lists,
1065 eintNonlocal,
1066 nbv->grp[eintNonlocal].kernel_type,
1067 nrnb);
1068 nbv->grp[eintNonlocal].nbl_lists.outerListCreationStep = step;
1069 if (nbv->listParams->useDynamicPruning && !bUseGPU)
1071 nbnxnPrepareListForDynamicPruning(&nbv->grp[eintNonlocal].nbl_lists);
1073 wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_NONLOCAL);
1075 if (nbv->grp[eintNonlocal].kernel_type == nbnxnk8x8x8_GPU)
1077 /* initialize non-local pair-list on the GPU */
1078 nbnxn_gpu_init_pairlist(nbv->gpu_nbv,
1079 nbv->grp[eintNonlocal].nbl_lists.nbl[0],
1080 eintNonlocal);
1082 wallcycle_stop(wcycle, ewcNS);
1084 else
1086 wallcycle_start(wcycle, ewcMOVEX);
1087 dd_move_x(cr->dd, box, x);
1088 wallcycle_stop(wcycle, ewcMOVEX);
1090 wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
1091 wallcycle_sub_start(wcycle, ewcsNB_X_BUF_OPS);
1092 nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs, eatNonlocal, FALSE, x,
1093 nbv->nbat);
1094 wallcycle_sub_stop(wcycle, ewcsNB_X_BUF_OPS);
1095 wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
1098 if (bUseGPU)
1100 wallcycle_start(wcycle, ewcLAUNCH_GPU);
1101 wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1102 /* launch non-local nonbonded tasks on GPU */
1103 do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFNo,
1104 step, nrnb, wcycle);
1105 wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1106 wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1110 if (bUseGPU)
1112 /* launch D2H copy-back F */
1113 wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
1114 wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1115 if (DOMAINDECOMP(cr))
1117 nbnxn_gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat,
1118 flags, eatNonlocal);
1120 nbnxn_gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat,
1121 flags, eatLocal);
1122 wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1123 wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1126 if (bStateChanged && inputrecNeedMutot(inputrec))
1128 if (PAR(cr))
1130 gmx_sumd(2*DIM, mu, cr);
1131 ddReopenBalanceRegionCpu(cr->dd);
1134 for (i = 0; i < 2; i++)
1136 for (j = 0; j < DIM; j++)
1138 fr->mu_tot[i][j] = mu[i*DIM + j];
1142 if (fr->efep == efepNO)
1144 copy_rvec(fr->mu_tot[0], mu_tot);
1146 else
1148 for (j = 0; j < DIM; j++)
1150 mu_tot[j] =
1151 (1.0 - lambda[efptCOUL])*fr->mu_tot[0][j] +
1152 lambda[efptCOUL]*fr->mu_tot[1][j];
1156 /* Reset energies */
1157 reset_enerdata(enerd);
1158 clear_rvecs(SHIFTS, fr->fshift);
1160 if (DOMAINDECOMP(cr) && !(cr->duty & DUTY_PME))
1162 wallcycle_start(wcycle, ewcPPDURINGPME);
1163 dd_force_flop_start(cr->dd, nrnb);
1166 if (inputrec->bRot)
1168 wallcycle_start(wcycle, ewcROT);
1169 do_rotation(cr, inputrec, box, x, t, step, bNS);
1170 wallcycle_stop(wcycle, ewcROT);
1173 /* Temporary solution until all routines take PaddedRVecVector */
1174 rvec *f = as_rvec_array(force->data());
1176 /* Start the force cycle counter.
1177 * Note that a different counter is used for dynamic load balancing.
1179 wallcycle_start(wcycle, ewcFORCE);
1180 fr->f_novirsum = force;
1181 if (bDoForces)
1183 /* If we need to compute the virial, we might need a separate
1184 * force buffer for algorithms for which the virial is calculated
1185 * separately, such as PME.
1187 if (fr->bF_NoVirSum && (flags & GMX_FORCE_VIRIAL))
1189 fr->f_novirsum = fr->forceBufferNoVirialSummation;
1191 /* TODO: remove this - 1 when padding is properly implemented */
1192 clear_rvecs_omp(fr->f_novirsum->size() - 1,
1193 as_rvec_array(fr->f_novirsum->data()));
1195 /* Clear the short- and long-range forces */
1196 clear_rvecs_omp(fr->natoms_force_constr, f);
1198 clear_rvec(fr->vir_diag_posres);
1201 if (inputrec->bPull && pull_have_constraint(inputrec->pull_work))
1203 clear_pull_forces(inputrec->pull_work);
1206 /* We calculate the non-bonded forces, when done on the CPU, here.
1207 * We do this before calling do_force_lowlevel, because in that
1208 * function, the listed forces are calculated before PME, which
1209 * does communication. With this order, non-bonded and listed
1210 * force calculation imbalance can be balanced out by the domain
1211 * decomposition load balancing.
1214 if (!bUseOrEmulGPU)
1216 do_nb_verlet(fr, ic, enerd, flags, eintLocal, enbvClearFYes,
1217 step, nrnb, wcycle);
1220 if (fr->efep != efepNO)
1222 /* Calculate the local and non-local free energy interactions here.
1223 * Happens here on the CPU both with and without GPU.
1225 if (fr->nbv->grp[eintLocal].nbl_lists.nbl_fep[0]->nrj > 0)
1227 do_nb_verlet_fep(&fr->nbv->grp[eintLocal].nbl_lists,
1228 fr, x, f, mdatoms,
1229 inputrec->fepvals, lambda,
1230 enerd, flags, nrnb, wcycle);
1233 if (DOMAINDECOMP(cr) &&
1234 fr->nbv->grp[eintNonlocal].nbl_lists.nbl_fep[0]->nrj > 0)
1236 do_nb_verlet_fep(&fr->nbv->grp[eintNonlocal].nbl_lists,
1237 fr, x, f, mdatoms,
1238 inputrec->fepvals, lambda,
1239 enerd, flags, nrnb, wcycle);
1243 if (!bUseOrEmulGPU)
1245 int aloc;
1247 if (DOMAINDECOMP(cr))
1249 do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFNo,
1250 step, nrnb, wcycle);
1253 if (!bUseOrEmulGPU)
1255 aloc = eintLocal;
1257 else
1259 aloc = eintNonlocal;
1262 /* Add all the non-bonded force to the normal force array.
1263 * This can be split into a local and a non-local part when overlapping
1264 * communication with calculation with domain decomposition.
1266 wallcycle_stop(wcycle, ewcFORCE);
1267 wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
1268 wallcycle_sub_start(wcycle, ewcsNB_F_BUF_OPS);
1269 nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs, eatAll, nbv->nbat, f);
1270 wallcycle_sub_stop(wcycle, ewcsNB_F_BUF_OPS);
1271 wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
1272 wallcycle_start_nocount(wcycle, ewcFORCE);
1274 /* if there are multiple fshift output buffers reduce them */
1275 if ((flags & GMX_FORCE_VIRIAL) &&
1276 nbv->grp[aloc].nbl_lists.nnbl > 1)
1278 /* This is not in a subcounter because it takes a
1279 negligible and constant-sized amount of time */
1280 nbnxn_atomdata_add_nbat_fshift_to_fshift(nbv->nbat,
1281 fr->fshift);
1285 /* update QMMMrec, if necessary */
1286 if (fr->bQMMM)
1288 update_QMMMrec(cr, fr, x, mdatoms, box);
1291 /* Compute the bonded and non-bonded energies and optionally forces */
1292 do_force_lowlevel(fr, inputrec, &(top->idef),
1293 cr, nrnb, wcycle, mdatoms,
1294 x, hist, f, enerd, fcd, top, fr->born,
1295 bBornRadii, box,
1296 inputrec->fepvals, lambda, graph, &(top->excls), fr->mu_tot,
1297 flags, &cycles_pme);
1299 wallcycle_stop(wcycle, ewcFORCE);
1301 computeSpecialForces(cr, inputrec, step, t, wcycle,
1302 fr->forceProviders, box, x, mdatoms, lambda,
1303 flags, fr->f_novirsum, vir_force, enerd,
1304 ed, bNS);
1306 if (bUseOrEmulGPU)
1308 /* wait for non-local forces (or calculate in emulation mode) */
1309 if (DOMAINDECOMP(cr))
1311 if (bUseGPU)
1313 wallcycle_start(wcycle, ewcWAIT_GPU_NB_NL);
1314 nbnxn_gpu_wait_for_gpu(nbv->gpu_nbv,
1315 flags, eatNonlocal,
1316 enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
1317 fr->fshift);
1318 cycles_wait_gpu += wallcycle_stop(wcycle, ewcWAIT_GPU_NB_NL);
1320 else
1322 wallcycle_start_nocount(wcycle, ewcFORCE);
1323 do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFYes,
1324 step, nrnb, wcycle);
1325 wallcycle_stop(wcycle, ewcFORCE);
1327 wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
1328 wallcycle_sub_start(wcycle, ewcsNB_F_BUF_OPS);
1329 /* skip the reduction if there was no non-local work to do */
1330 if (nbv->grp[eintNonlocal].nbl_lists.nbl[0]->nsci > 0)
1332 nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs, eatNonlocal,
1333 nbv->nbat, f);
1335 wallcycle_sub_stop(wcycle, ewcsNB_F_BUF_OPS);
1336 wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
1340 if (DOMAINDECOMP(cr))
1342 /* We are done with the CPU compute.
1343 * We will now communicate the non-local forces.
1344 * If we use a GPU this will overlap with GPU work, so in that case
1345 * we do not close the DD force balancing region here.
1347 if (ddCloseBalanceRegion == DdCloseBalanceRegionAfterForceComputation::yes)
1349 ddCloseBalanceRegionCpu(cr->dd);
1351 if (bDoForces)
1353 wallcycle_start(wcycle, ewcMOVEF);
1354 dd_move_f(cr->dd, f, fr->fshift);
1355 wallcycle_stop(wcycle, ewcMOVEF);
1359 if (bUseOrEmulGPU)
1361 /* wait for local forces (or calculate in emulation mode) */
1362 if (bUseGPU)
1364 /* Measured overhead on CUDA and OpenCL with(out) GPU sharing
1365 * is between 0.5 and 1.5 Mcycles. So 2 MCycles is an overestimate,
1366 * but even with a step of 0.1 ms the difference is less than 1%
1367 * of the step time.
1369 const float gpuWaitApiOverheadMargin = 2e6f; /* cycles */
1371 wallcycle_start(wcycle, ewcWAIT_GPU_NB_L);
1372 nbnxn_gpu_wait_for_gpu(nbv->gpu_nbv,
1373 flags, eatLocal,
1374 enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
1375 fr->fshift);
1376 float cycles_tmp = wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
1378 if (ddCloseBalanceRegion == DdCloseBalanceRegionAfterForceComputation::yes)
1380 DdBalanceRegionWaitedForGpu waitedForGpu = DdBalanceRegionWaitedForGpu::yes;
1381 if (bDoForces && cycles_tmp <= gpuWaitApiOverheadMargin)
1383 /* We measured few cycles, it could be that the kernel
1384 * and transfer finished earlier and there was no actual
1385 * wait time, only API call overhead.
1386 * Then the actual time could be anywhere between 0 and
1387 * cycles_wait_est. We will use half of cycles_wait_est.
1389 waitedForGpu = DdBalanceRegionWaitedForGpu::no;
1391 ddCloseBalanceRegionGpu(cr->dd, cycles_wait_gpu, waitedForGpu);
1394 /* now clear the GPU outputs while we finish the step on the CPU */
1395 wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
1396 wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1397 nbnxn_gpu_clear_outputs(nbv->gpu_nbv, flags);
1399 /* Is dynamic pair-list pruning activated? */
1400 if (nbv->listParams->useDynamicPruning)
1402 /* We should not launch the rolling pruning kernel at a search
1403 * step or just before search steps, since that's useless.
1404 * Without domain decomposition we prune at even steps.
1405 * With domain decomposition we alternate local and non-local
1406 * pruning at even and odd steps.
1408 int numRollingParts = nbv->listParams->numRollingParts;
1409 GMX_ASSERT(numRollingParts == nbv->listParams->nstlistPrune/2, "Since we alternate local/non-local at even/odd steps, we need numRollingParts<=nstlistPrune/2 for correctness and == for efficiency");
1410 int stepWithCurrentList = step - nbv->grp[eintLocal].nbl_lists.outerListCreationStep;
1411 bool stepIsEven = ((stepWithCurrentList & 1) == 0);
1412 if (stepWithCurrentList > 0 &&
1413 stepWithCurrentList < inputrec->nstlist - 1 &&
1414 (stepIsEven || DOMAINDECOMP(cr)))
1416 nbnxn_gpu_launch_kernel_pruneonly(fr->nbv->gpu_nbv,
1417 stepIsEven ? eintLocal : eintNonlocal,
1418 numRollingParts);
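/* Illustrative example for the rolling prune launch above (assumed settings):
 * with nstlistPrune = 10 the assertion requires numRollingParts = 5. With domain
 * decomposition the local list is then pruned in 5 chunks on the even steps after
 * list creation and the non-local list in 5 chunks on the odd steps, so each list
 * is pruned once per nstlistPrune interval without extra work at (or right
 * before) search steps.
 */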
1421 wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1422 wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1424 else
1426 wallcycle_start_nocount(wcycle, ewcFORCE);
1427 do_nb_verlet(fr, ic, enerd, flags, eintLocal,
1428 DOMAINDECOMP(cr) ? enbvClearFNo : enbvClearFYes,
1429 step, nrnb, wcycle);
1430 wallcycle_stop(wcycle, ewcFORCE);
1432 wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
1433 wallcycle_sub_start(wcycle, ewcsNB_F_BUF_OPS);
1434 nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs, eatLocal,
1435 nbv->nbat, f);
1436 wallcycle_sub_stop(wcycle, ewcsNB_F_BUF_OPS);
1437 wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
1440 if (DOMAINDECOMP(cr))
1442 dd_force_flop_stop(cr->dd, nrnb);
1445 if (bDoForces)
1447 /* If we have NoVirSum forces, but we do not calculate the virial,
1448 * we sum fr->f_novirsum=f later.
1450 if (vsite && !(fr->bF_NoVirSum && !(flags & GMX_FORCE_VIRIAL)))
1452 wallcycle_start(wcycle, ewcVSITESPREAD);
1453 spread_vsite_f(vsite, x, f, fr->fshift, FALSE, nullptr, nrnb,
1454 &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr);
1455 wallcycle_stop(wcycle, ewcVSITESPREAD);
1458 if (flags & GMX_FORCE_VIRIAL)
1460 /* Calculation of the virial must be done after vsites! */
1461 calc_virial(0, mdatoms->homenr, x, f,
1462 vir_force, graph, box, nrnb, fr, inputrec->ePBC);
1466 if (PAR(cr) && !(cr->duty & DUTY_PME))
1468 /* In case of node-splitting, the PP nodes receive the long-range
1469 * forces, virial and energy from the PME nodes here.
1471 pme_receive_force_ener(cr, wcycle, enerd, fr);
1474 if (bDoForces)
1476 post_process_forces(cr, step, nrnb, wcycle,
1477 top, box, x, f, vir_force, mdatoms, graph, fr, vsite,
1478 flags);
1481 if (flags & GMX_FORCE_ENERGY)
1483 /* Sum the potential energy terms from group contributions */
1484 sum_epot(&(enerd->grpp), enerd->term);
1486 if (!EI_TPI(inputrec->eI))
1488 checkPotentialEnergyValidity(enerd);
1493 static void do_force_cutsGROUP(FILE *fplog, t_commrec *cr,
1494 t_inputrec *inputrec,
1495 gmx_int64_t step, t_nrnb *nrnb, gmx_wallcycle_t wcycle,
1496 gmx_localtop_t *top,
1497 gmx_groups_t *groups,
1498 matrix box, rvec x[], history_t *hist,
1499 PaddedRVecVector *force,
1500 tensor vir_force,
1501 t_mdatoms *mdatoms,
1502 gmx_enerdata_t *enerd, t_fcdata *fcd,
1503 real *lambda, t_graph *graph,
1504 t_forcerec *fr, gmx_vsite_t *vsite, rvec mu_tot,
1505 double t, gmx_edsam_t ed,
1506 gmx_bool bBornRadii,
1507 int flags,
1508 DdOpenBalanceRegionBeforeForceComputation ddOpenBalanceRegion,
1509 DdCloseBalanceRegionAfterForceComputation ddCloseBalanceRegion)
1511 int cg0, cg1, i, j;
1512 double mu[2*DIM];
1513 gmx_bool bStateChanged, bNS, bFillGrid, bCalcCGCM;
1514 gmx_bool bDoForces;
1515 float cycles_pme;
1517 const int start = 0;
1518 const int homenr = mdatoms->homenr;
1520 clear_mat(vir_force);
1522 cg0 = 0;
1523 if (DOMAINDECOMP(cr))
1525 cg1 = cr->dd->ncg_tot;
1527 else
1529 cg1 = top->cgs.nr;
1531 if (fr->n_tpi > 0)
1533 cg1--;
1536 bStateChanged = (flags & GMX_FORCE_STATECHANGED);
1537 bNS = (flags & GMX_FORCE_NS) && (fr->bAllvsAll == FALSE);
1538 /* Should we perform the long-range nonbonded evaluation inside the neighbor searching? */
1539 bFillGrid = (bNS && bStateChanged);
1540 bCalcCGCM = (bFillGrid && !DOMAINDECOMP(cr));
1541 bDoForces = (flags & GMX_FORCE_FORCES);
1543 if (bStateChanged)
1545 update_forcerec(fr, box);
1547 if (inputrecNeedMutot(inputrec))
1549 /* Calculate total (local) dipole moment in a temporary common array.
1550 * This makes it possible to sum them over nodes faster.
1552 calc_mu(start, homenr,
1553 x, mdatoms->chargeA, mdatoms->chargeB, mdatoms->nChargePerturbed,
1554 mu, mu+DIM);
1558 if (fr->ePBC != epbcNONE)
1560 /* Compute shift vectors every step,
1561 * because of pressure coupling or box deformation!
1563 if ((flags & GMX_FORCE_DYNAMICBOX) && bStateChanged)
1565 calc_shifts(box, fr->shift_vec);
1568 if (bCalcCGCM)
1570 put_charge_groups_in_box(fplog, cg0, cg1, fr->ePBC, box,
1571 &(top->cgs), x, fr->cg_cm);
1572 inc_nrnb(nrnb, eNR_CGCM, homenr);
1573 inc_nrnb(nrnb, eNR_RESETX, cg1-cg0);
1575 else if (EI_ENERGY_MINIMIZATION(inputrec->eI) && graph)
1577 unshift_self(graph, box, x);
1580 else if (bCalcCGCM)
1582 calc_cgcm(fplog, cg0, cg1, &(top->cgs), x, fr->cg_cm);
1583 inc_nrnb(nrnb, eNR_CGCM, homenr);
1586 if (bCalcCGCM && gmx_debug_at)
1588 pr_rvecs(debug, 0, "cgcm", fr->cg_cm, top->cgs.nr);
1591 #if GMX_MPI
1592 if (!(cr->duty & DUTY_PME))
1594 gmx_bool bBS;
1595 matrix boxs;
1597 /* Send particle coordinates to the pme nodes.
1598 * Since this is only implemented for domain decomposition
1599 * and domain decomposition does not use the graph,
1600 * we do not need to worry about shifting.
1603 wallcycle_start(wcycle, ewcPP_PMESENDX);
1605 bBS = (inputrec->nwall == 2);
1606 if (bBS)
1608 copy_mat(box, boxs);
1609 svmul(inputrec->wall_ewald_zfac, boxs[ZZ], boxs[ZZ]);
1612 gmx_pme_send_coordinates(cr, bBS ? boxs : box, x,
1613 lambda[efptCOUL], lambda[efptVDW],
1614 (flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY)),
1615 step);
1617 wallcycle_stop(wcycle, ewcPP_PMESENDX);
1619 #endif /* GMX_MPI */
1621 /* Communicate coordinates and sum dipole if necessary */
1622 if (DOMAINDECOMP(cr))
1624 wallcycle_start(wcycle, ewcMOVEX);
1625 dd_move_x(cr->dd, box, x);
1626 wallcycle_stop(wcycle, ewcMOVEX);
1627 /* No GPU support, no move_x overlap, so reopen the balance region here */
1628 if (ddOpenBalanceRegion == DdOpenBalanceRegionBeforeForceComputation::yes)
1630 ddReopenBalanceRegionCpu(cr->dd);
1634 if (inputrecNeedMutot(inputrec))
1636 if (bStateChanged)
1638 if (PAR(cr))
1640 gmx_sumd(2*DIM, mu, cr);
1641 ddReopenBalanceRegionCpu(cr->dd);
1643 for (i = 0; i < 2; i++)
1645 for (j = 0; j < DIM; j++)
1647 fr->mu_tot[i][j] = mu[i*DIM + j];
1651 if (fr->efep == efepNO)
1653 copy_rvec(fr->mu_tot[0], mu_tot);
1655 else
1657 for (j = 0; j < DIM; j++)
1659 mu_tot[j] =
1660 (1.0 - lambda[efptCOUL])*fr->mu_tot[0][j] + lambda[efptCOUL]*fr->mu_tot[1][j];
1665 /* Reset energies */
1666 reset_enerdata(enerd);
1667 clear_rvecs(SHIFTS, fr->fshift);
1669 if (bNS)
1671 wallcycle_start(wcycle, ewcNS);
1673 if (graph && bStateChanged)
1675 /* Calculate intramolecular shift vectors to make molecules whole */
1676 mk_mshift(fplog, graph, fr->ePBC, box, x);
1679 /* Do the actual neighbour searching */
1680 ns(fplog, fr, box,
1681 groups, top, mdatoms,
1682 cr, nrnb, bFillGrid);
1684 wallcycle_stop(wcycle, ewcNS);
1687 if (inputrec->implicit_solvent && bNS)
1689 make_gb_nblist(cr, inputrec->gb_algorithm,
1690 x, box, fr, &top->idef, graph, fr->born);
1693 if (DOMAINDECOMP(cr) && !(cr->duty & DUTY_PME))
1695 wallcycle_start(wcycle, ewcPPDURINGPME);
1696 dd_force_flop_start(cr->dd, nrnb);
1699 if (inputrec->bRot)
1701 wallcycle_start(wcycle, ewcROT);
1702 do_rotation(cr, inputrec, box, x, t, step, bNS);
1703 wallcycle_stop(wcycle, ewcROT);
1706 /* Temporary solution until all routines take PaddedRVecVector */
1707 rvec *f = as_rvec_array(force->data());
1709 /* Start the force cycle counter.
1710 * Note that a different counter is used for dynamic load balancing.
1712 wallcycle_start(wcycle, ewcFORCE);
1714 fr->f_novirsum = force;
1715 if (bDoForces)
1717 /* If we need to compute the virial, we might need a separate
1718 * force buffer for algorithms for which the virial is calculated
1719 * separately, such as PME.
1721 if (fr->bF_NoVirSum && (flags & GMX_FORCE_VIRIAL))
1723 fr->f_novirsum = fr->forceBufferNoVirialSummation;
1725 /* TODO: remove this - 1 when padding is properly implemented */
1726 clear_rvecs(fr->f_novirsum->size() - 1,
1727 as_rvec_array(fr->f_novirsum->data()));
1730 /* Clear the short- and long-range forces */
1731 clear_rvecs(fr->natoms_force_constr, f);
1733 clear_rvec(fr->vir_diag_posres);
1735 if (inputrec->bPull && pull_have_constraint(inputrec->pull_work))
1737 clear_pull_forces(inputrec->pull_work);
1740 /* update QMMMrec, if necessary */
1741 if (fr->bQMMM)
1743 update_QMMMrec(cr, fr, x, mdatoms, box);
1746 /* Compute the bonded and non-bonded energies and optionally forces */
1747 do_force_lowlevel(fr, inputrec, &(top->idef),
1748 cr, nrnb, wcycle, mdatoms,
1749 x, hist, f, enerd, fcd, top, fr->born,
1750 bBornRadii, box,
1751 inputrec->fepvals, lambda,
1752 graph, &(top->excls), fr->mu_tot,
1753 flags,
1754 &cycles_pme);
1756 wallcycle_stop(wcycle, ewcFORCE);
1758 if (DOMAINDECOMP(cr))
1760 dd_force_flop_stop(cr->dd, nrnb);
1762 if (ddCloseBalanceRegion == DdCloseBalanceRegionAfterForceComputation::yes)
1764 ddCloseBalanceRegionCpu(cr->dd);
1768 computeSpecialForces(cr, inputrec, step, t, wcycle,
1769 fr->forceProviders, box, x, mdatoms, lambda,
1770 flags, fr->f_novirsum, vir_force, enerd,
1771 ed, bNS);
1773 if (bDoForces)
1775 /* Communicate the forces */
1776 if (DOMAINDECOMP(cr))
1778 wallcycle_start(wcycle, ewcMOVEF);
1779 dd_move_f(cr->dd, f, fr->fshift);
1780 /* Do we need to communicate the separate force array
1781 * for terms that do not contribute to the single sum virial?
1782 * Position restraints and electric fields do not introduce
1783 * inter-cg forces, only full electrostatics methods do.
1784 * When we do not calculate the virial, fr->f_novirsum = f,
1785 * so we have already communicated these forces.
1787 if (EEL_FULL(fr->ic->eeltype) && cr->dd->n_intercg_excl &&
1788 (flags & GMX_FORCE_VIRIAL))
1790 dd_move_f(cr->dd, as_rvec_array(fr->f_novirsum->data()), nullptr);
1792 wallcycle_stop(wcycle, ewcMOVEF);
1795 /* If we have NoVirSum forces, but we do not calculate the virial,
1796 * we sum fr->f_novirsum=f later.
1798 if (vsite && !(fr->bF_NoVirSum && !(flags & GMX_FORCE_VIRIAL)))
1800 wallcycle_start(wcycle, ewcVSITESPREAD);
1801 spread_vsite_f(vsite, x, f, fr->fshift, FALSE, nullptr, nrnb,
1802 &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr);
1803 wallcycle_stop(wcycle, ewcVSITESPREAD);
1806 if (flags & GMX_FORCE_VIRIAL)
1808 /* Calculation of the virial must be done after vsites! */
1809 calc_virial(0, mdatoms->homenr, x, f,
1810 vir_force, graph, box, nrnb, fr, inputrec->ePBC);
1814 if (PAR(cr) && !(cr->duty & DUTY_PME))
1816 /* In case of node-splitting, the PP nodes receive the long-range
1817 * forces, virial and energy from the PME nodes here.
1819 pme_receive_force_ener(cr, wcycle, enerd, fr);
1822 if (bDoForces)
1824 post_process_forces(cr, step, nrnb, wcycle,
1825 top, box, x, f, vir_force, mdatoms, graph, fr, vsite,
1826 flags);
1829 if (flags & GMX_FORCE_ENERGY)
1831 /* Sum the potential energy terms from group contributions */
1832 sum_epot(&(enerd->grpp), enerd->term);
1834 if (!EI_TPI(inputrec->eI))
1836 checkPotentialEnergyValidity(enerd);
1842 void do_force(FILE *fplog, t_commrec *cr,
1843 t_inputrec *inputrec,
1844 gmx_int64_t step, t_nrnb *nrnb, gmx_wallcycle_t wcycle,
1845 gmx_localtop_t *top,
1846 gmx_groups_t *groups,
1847 matrix box, PaddedRVecVector *coordinates, history_t *hist,
1848 PaddedRVecVector *force,
1849 tensor vir_force,
1850 t_mdatoms *mdatoms,
1851 gmx_enerdata_t *enerd, t_fcdata *fcd,
1852 gmx::ArrayRef<real> lambda, t_graph *graph,
1853 t_forcerec *fr,
1854 gmx_vsite_t *vsite, rvec mu_tot,
1855 double t, gmx_edsam_t ed,
1856 gmx_bool bBornRadii,
1857 int flags,
1858 DdOpenBalanceRegionBeforeForceComputation ddOpenBalanceRegion,
1859 DdCloseBalanceRegionAfterForceComputation ddCloseBalanceRegion)
1861 /* modify force flag if not doing nonbonded */
1862 if (!fr->bNonbonded)
1864 flags &= ~GMX_FORCE_NONBONDED;
1867 GMX_ASSERT(coordinates->size() >= static_cast<unsigned int>(fr->natoms_force + 1) ||
1868 fr->natoms_force == 0, "We might need 1 element extra for SIMD");
1869 GMX_ASSERT(force->size() >= static_cast<unsigned int>(fr->natoms_force + 1) ||
1870 fr->natoms_force == 0, "We might need 1 element extra for SIMD");
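/* The "+ 1" matches the padding convention noted in do_constrain_first below:
 * unaligned 4-wide SIMD loads of 3-element rvecs may read one element past the
 * last atom, so one extra rvec of padding must be available.
 */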
1872 rvec *x = as_rvec_array(coordinates->data());
1874 switch (inputrec->cutoff_scheme)
1876 case ecutsVERLET:
1877 do_force_cutsVERLET(fplog, cr, inputrec,
1878 step, nrnb, wcycle,
1879 top,
1880 groups,
1881 box, x, hist,
1882 force, vir_force,
1883 mdatoms,
1884 enerd, fcd,
1885 lambda.data(), graph,
1886 fr, fr->ic,
1887 vsite, mu_tot,
1888 t, ed,
1889 bBornRadii,
1890 flags,
1891 ddOpenBalanceRegion,
1892 ddCloseBalanceRegion);
1893 break;
1894 case ecutsGROUP:
1895 do_force_cutsGROUP(fplog, cr, inputrec,
1896 step, nrnb, wcycle,
1897 top,
1898 groups,
1899 box, x, hist,
1900 force, vir_force,
1901 mdatoms,
1902 enerd, fcd,
1903 lambda.data(), graph,
1904 fr, vsite, mu_tot,
1905 t, ed,
1906 bBornRadii,
1907 flags,
1908 ddOpenBalanceRegion,
1909 ddCloseBalanceRegion);
1910 break;
1911 default:
1912 gmx_incons("Invalid cut-off scheme passed!");
1915 /* In case we don't have constraints and are using GPUs, the next balancing
1916 * region starts here.
1917 * Some "special" work at the end of do_force_cuts?, such as vsite spread,
1918 * virial calculation and COM pulling, is thus not included in
1919 * the balance timing, which is ok as most tasks do communication.
1921 if (ddOpenBalanceRegion == DdOpenBalanceRegionBeforeForceComputation::yes)
1923 ddOpenBalanceRegionCpu(cr->dd, DdAllowBalanceRegionReopen::no);
1928 void do_constrain_first(FILE *fplog, gmx_constr_t constr,
1929 t_inputrec *ir, t_mdatoms *md,
1930 t_state *state, t_commrec *cr, t_nrnb *nrnb,
1931 t_forcerec *fr, gmx_localtop_t *top)
1933 int i, m, start, end;
1934 gmx_int64_t step;
1935 real dt = ir->delta_t;
1936 real dvdl_dum;
1937 rvec *savex;
1939 /* We need to allocate one element extra, since we might use
1940 * (unaligned) 4-wide SIMD loads to access rvec entries.
1942 snew(savex, state->natoms + 1);
1944 start = 0;
1945 end = md->homenr;
1947 if (debug)
1949 fprintf(debug, "vcm: start=%d, homenr=%d, end=%d\n",
1950 start, md->homenr, end);
1952 /* Do a first constrain to reset particles... */
1953 step = ir->init_step;
1954 if (fplog)
1956 char buf[STEPSTRSIZE];
1957 fprintf(fplog, "\nConstraining the starting coordinates (step %s)\n",
1958 gmx_step_str(step, buf));
1960 dvdl_dum = 0;
1962 /* constrain the current position */
1963 constrain(nullptr, TRUE, FALSE, constr, &(top->idef),
1964 ir, cr, step, 0, 1.0, md,
1965 as_rvec_array(state->x.data()), as_rvec_array(state->x.data()), nullptr,
1966 fr->bMolPBC, state->box,
1967 state->lambda[efptBONDED], &dvdl_dum,
1968 nullptr, nullptr, nrnb, econqCoord);
1969 if (EI_VV(ir->eI))
1971 /* constrain the initial velocity, and save it */
1972 /* also may be useful if we need the ekin from the halfstep for velocity verlet */
1973 constrain(nullptr, TRUE, FALSE, constr, &(top->idef),
1974 ir, cr, step, 0, 1.0, md,
1975 as_rvec_array(state->x.data()), as_rvec_array(state->v.data()), as_rvec_array(state->v.data()),
1976 fr->bMolPBC, state->box,
1977 state->lambda[efptBONDED], &dvdl_dum,
1978 nullptr, nullptr, nrnb, econqVeloc);
1980 /* constrain the initial velocities at t-dt/2 */
1981 if (EI_STATE_VELOCITY(ir->eI) && ir->eI != eiVV)
1983 for (i = start; (i < end); i++)
1985 for (m = 0; (m < DIM); m++)
1987 /* Reverse the velocity */
1988 state->v[i][m] = -state->v[i][m];
1989 /* Store the position at t-dt in buf */
1990 savex[i][m] = state->x[i][m] + dt*state->v[i][m];
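/* With the velocity sign flipped, x + dt*v is the unconstrained position at
 * t = -dt. Constraining that position below, with the t = 0 coordinates as
 * reference and the velocities passed to constrain(), yields coordinates at -dt
 * and constraint-corrected velocities for the leap-frog half step; the
 * velocities are flipped back afterwards.
 */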
1993 /* Shake the positions at t=-dt with the positions at t=0
1994 * as reference coordinates.
1996 if (fplog)
1998 char buf[STEPSTRSIZE];
1999 fprintf(fplog, "\nConstraining the coordinates at t0-dt (step %s)\n",
2000 gmx_step_str(step, buf));
2002 dvdl_dum = 0;
2003 constrain(nullptr, TRUE, FALSE, constr, &(top->idef),
2004 ir, cr, step, -1, 1.0, md,
2005 as_rvec_array(state->x.data()), savex, nullptr,
2006 fr->bMolPBC, state->box,
2007 state->lambda[efptBONDED], &dvdl_dum,
2008 as_rvec_array(state->v.data()), nullptr, nrnb, econqCoord);
2010 for (i = start; i < end; i++)
2012 for (m = 0; m < DIM; m++)
2014 /* Re-reverse the velocities */
2015 state->v[i][m] = -state->v[i][m];
2019 sfree(savex);
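/* Editorial note, not part of the original source: as the code above reads,
 * the first loop builds trial coordinates for t = -dt by reversing the
 * velocities and taking savex = x(0) + dt*(-v) = x(0) - dt*v, then constrains
 * savex with x(0) as the reference (which also lets constrain() correct the
 * velocities), and the final loop restores the original sign of v. A minimal
 * sketch of the same idea for one unconstrained degree of freedom:
 *
 *     v     = -v;           // reverse the velocity
 *     savex =  x + dt*v;    // equals x - dt*v_original, i.e. x at t = -dt
 *     // ... constrain savex against x, projecting the change onto v ...
 *     v     = -v;           // restore the velocity
 */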
2023 static void
2024 integrate_table(real vdwtab[], real scale, int offstart, int rstart, int rend,
2025 double *enerout, double *virout)
2027 double enersum, virsum;
2028 double invscale, invscale2, invscale3;
2029 double r, ea, eb, ec, pa, pb, pc, pd;
2030 double y0, f, g, h;
2031 int ri, offset;
2032 double tabfactor;
2034 invscale = 1.0/scale;
2035 invscale2 = invscale*invscale;
2036 invscale3 = invscale*invscale2;
2038 /* Following summation derived from cubic spline definition,
2039 * Numerical Recipes in C, second edition, p. 113-116. Exact for
2040 * the cubic spline. We first calculate the negative of the
2041 * energy from rvdw to rvdw_switch, assuming that g(r)=1, and then
2042 * add the more standard, abrupt cutoff correction to that result,
2043 * yielding the long-range correction for a switched function. We
2044 * perform both the pressure and energy loops at the same time for
2045 * simplicity, as the computational cost is low. */
2047 if (offstart == 0)
2049 /* Since the dispersion table has been scaled down by a factor of
2050 * 6.0 and the repulsion by a factor of 12.0 to compensate for the
2051 * c6/c12 parameters inside nbfp[] being scaled up (to save
2052 * flops in kernels), we need to correct for this.
2054 tabfactor = 6.0;
2056 else
2058 tabfactor = 12.0;
2061 enersum = 0.0;
2062 virsum = 0.0;
2063 for (ri = rstart; ri < rend; ++ri)
2065 r = ri*invscale;
2066 ea = invscale3;
2067 eb = 2.0*invscale2*r;
2068 ec = invscale*r*r;
2070 pa = invscale3;
2071 pb = 3.0*invscale2*r;
2072 pc = 3.0*invscale*r*r;
2073 pd = r*r*r;
2075 /* this "8" is from the packing in the vdwtab array - perhaps
2076 should be defined? */
2078 offset = 8*ri + offstart;
2079 y0 = vdwtab[offset];
2080 f = vdwtab[offset+1];
2081 g = vdwtab[offset+2];
2082 h = vdwtab[offset+3];
2084 enersum += y0*(ea/3 + eb/2 + ec) + f*(ea/4 + eb/3 + ec/2) + g*(ea/5 + eb/4 + ec/3) + h*(ea/6 + eb/5 + ec/4);
2085 virsum += f*(pa/4 + pb/3 + pc/2 + pd) + 2*g*(pa/5 + pb/4 + pc/3 + pd/2) + 3*h*(pa/6 + pb/5 + pc/4 + pd/3);
2087 *enerout = 4.0*M_PI*enersum*tabfactor;
2088 *virout = 4.0*M_PI*virsum*tabfactor;
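/* Editorial note, not part of the original source: within each table segment
 * the tabulated potential is a cubic spline in eps = (r' - r)*scale,
 * V(r') = y0 + f*eps + g*eps^2 + h*eps^3, so the loop above appears to
 * accumulate integral_0^{1/scale} (r + x)^2 V(r + x) dx term by term. For
 * example, the weight multiplying f is
 *
 *     integral_0^{1/scale} (r + x)^2 (scale*x) dx = ea/4 + eb/3 + ec/2
 *
 * with ea = invscale^3, eb = 2*r*invscale^2 and ec = r^2*invscale, which is
 * exactly the combination used for enersum; the pa..pd weights play the same
 * role for the r'^3 * dV/dr' integrand of the virial. The trailing factors
 * 4*pi and tabfactor (6 or 12) undo the c6/c12 prescaling described above.
 */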
2091 void calc_enervirdiff(FILE *fplog, int eDispCorr, t_forcerec *fr)
2093 double eners[2], virs[2], enersum, virsum;
2094 double r0, rc3, rc9;
2095 int ri0, ri1, i;
2096 real scale, *vdwtab;
2098 fr->enershiftsix = 0;
2099 fr->enershifttwelve = 0;
2100 fr->enerdiffsix = 0;
2101 fr->enerdifftwelve = 0;
2102 fr->virdiffsix = 0;
2103 fr->virdifftwelve = 0;
2105 const interaction_const_t *ic = fr->ic;
2107 if (eDispCorr != edispcNO)
2109 for (i = 0; i < 2; i++)
2111 eners[i] = 0;
2112 virs[i] = 0;
2114 if ((ic->vdw_modifier == eintmodPOTSHIFT) ||
2115 (ic->vdw_modifier == eintmodPOTSWITCH) ||
2116 (ic->vdw_modifier == eintmodFORCESWITCH) ||
2117 (ic->vdwtype == evdwSHIFT) ||
2118 (ic->vdwtype == evdwSWITCH))
2120 if (((ic->vdw_modifier == eintmodPOTSWITCH) ||
2121 (ic->vdw_modifier == eintmodFORCESWITCH) ||
2122 (ic->vdwtype == evdwSWITCH)) && ic->rvdw_switch == 0)
2124 gmx_fatal(FARGS,
2125 "With dispersion correction rvdw-switch can not be zero "
2126 "for vdw-type = %s", evdw_names[ic->vdwtype]);
2129 /* TODO This code depends on the logic in tables.c that
2130 constructs the table layout, which should be made
2131 explicit in future cleanup. */
2132 GMX_ASSERT(fr->dispersionCorrectionTable->interaction == GMX_TABLE_INTERACTION_VDWREP_VDWDISP,
2133 "Dispersion-correction code needs a table with both repulsion and dispersion terms");
2134 scale = fr->dispersionCorrectionTable->scale;
2135 vdwtab = fr->dispersionCorrectionTable->data;
2137 /* Round the cut-offs to exact table values for precision */
2138 ri0 = static_cast<int>(floor(ic->rvdw_switch*scale));
2139 ri1 = static_cast<int>(ceil(ic->rvdw*scale));
2141 /* The code below has some support for handling force-switching, i.e.
2142 * when the force (instead of potential) is switched over a limited
2143 * region. This leads to a constant shift in the potential inside the
2144 * switching region, which we can handle by adding a constant energy
2145 * term in the force-switch case just like when we do potential-shift.
2147 * For now this is not enabled, but to keep the functionality in the
2148 * code we check separately for switch and shift. When we do force-switch
2149 * the shifting point is rvdw_switch, while it is the cutoff when we
2150 * have a classical potential-shift.
2152 * For a pure potential-shift the potential has a constant shift
2153 * all the way out to the cutoff, and that is it. For other forms
2154 * we need to calculate the constant shift up to the point where we
2155 * start modifying the potential.
2157 ri0 = (ic->vdw_modifier == eintmodPOTSHIFT) ? ri1 : ri0;
2159 r0 = ri0/scale;
2160 rc3 = r0*r0*r0;
2161 rc9 = rc3*rc3*rc3;
2163 if ((ic->vdw_modifier == eintmodFORCESWITCH) ||
2164 (ic->vdwtype == evdwSHIFT))
2166 /* Determine the constant energy shift below rvdw_switch.
2167 * Table has a scale factor since we have scaled it down to compensate
2168 * for scaling-up c6/c12 with the derivative factors to save flops in analytical kernels.
2170 fr->enershiftsix = (real)(-1.0/(rc3*rc3)) - 6.0*vdwtab[8*ri0];
2171 fr->enershifttwelve = (real)( 1.0/(rc9*rc3)) - 12.0*vdwtab[8*ri0 + 4];
2173 else if (ic->vdw_modifier == eintmodPOTSHIFT)
2175 fr->enershiftsix = (real)(-1.0/(rc3*rc3));
2176 fr->enershifttwelve = (real)( 1.0/(rc9*rc3));
2179 /* Add the constant part from 0 to rvdw_switch.
2180 * This integration from 0 to rvdw_switch overcounts the number
2181 * of interactions by 1, as it also counts the self interaction.
2182 * We will correct for this later.
2184 eners[0] += 4.0*M_PI*fr->enershiftsix*rc3/3.0;
2185 eners[1] += 4.0*M_PI*fr->enershifttwelve*rc3/3.0;
2187 /* Calculate the contribution in the range [r0,r1] where we
2188 * modify the potential. For a pure potential-shift modifier we will
2189 * have ri0==ri1, and there will not be any contribution here.
2191 for (i = 0; i < 2; i++)
2193 enersum = 0;
2194 virsum = 0;
2195 integrate_table(vdwtab, scale, (i == 0 ? 0 : 4), ri0, ri1, &enersum, &virsum);
2196 eners[i] -= enersum;
2197 virs[i] -= virsum;
2200 /* Alright: Above we compensated by REMOVING the parts outside r0
2201 * corresponding to the ideal VdW 1/r6 and 1/r12 potentials.
2203 * Regardless of whether r0 is the point where we start switching,
2204 * or the cutoff where we calculated the constant shift, we include
2205 * all the parts we are missing out to infinity from r0 by
2206 * calculating the analytical dispersion correction.
2208 eners[0] += -4.0*M_PI/(3.0*rc3);
2209 eners[1] += 4.0*M_PI/(9.0*rc9);
2210 virs[0] += 8.0*M_PI/rc3;
2211 virs[1] += -16.0*M_PI/(3.0*rc9);
2213 else if (ic->vdwtype == evdwCUT ||
2214 EVDW_PME(ic->vdwtype) ||
2215 ic->vdwtype == evdwUSER)
2217 if (ic->vdwtype == evdwUSER && fplog)
2219 fprintf(fplog,
2220 "WARNING: using dispersion correction with user tables\n");
2223 /* Note that with LJ-PME, the dispersion correction is multiplied
2224 * by the difference between the actual C6 and the value of C6
2225 * that the combination rule would produce.
2226 * This means the normal energy and virial difference formulas
2227 * can be used here.
2230 rc3 = ic->rvdw*ic->rvdw*ic->rvdw;
2231 rc9 = rc3*rc3*rc3;
2232 /* Contribution beyond the cut-off */
2233 eners[0] += -4.0*M_PI/(3.0*rc3);
2234 eners[1] += 4.0*M_PI/(9.0*rc9);
2235 if (ic->vdw_modifier == eintmodPOTSHIFT)
2237 /* Contribution within the cut-off */
2238 eners[0] += -4.0*M_PI/(3.0*rc3);
2239 eners[1] += 4.0*M_PI/(3.0*rc9);
2241 /* Contribution beyond the cut-off */
2242 virs[0] += 8.0*M_PI/rc3;
2243 virs[1] += -16.0*M_PI/(3.0*rc9);
2245 else
2247 gmx_fatal(FARGS,
2248 "Dispersion correction is not implemented for vdw-type = %s",
2249 evdw_names[ic->vdwtype]);
2252 /* When we deprecate the group kernels the code below can go too */
2253 if (ic->vdwtype == evdwPME && fr->cutoff_scheme == ecutsGROUP)
2255 /* Calculate self-interaction coefficient (assuming that
2256 * the reciprocal-space contribution is constant in the
2257 * region that contributes to the self-interaction).
2259 fr->enershiftsix = gmx::power6(ic->ewaldcoeff_lj) / 6.0;
2261 eners[0] += -gmx::power3(std::sqrt(M_PI)*ic->ewaldcoeff_lj)/3.0;
2262 virs[0] += gmx::power3(std::sqrt(M_PI)*ic->ewaldcoeff_lj);
2265 fr->enerdiffsix = eners[0];
2266 fr->enerdifftwelve = eners[1];
2267 /* The 0.5 is due to the Gromacs definition of the virial */
2268 fr->virdiffsix = 0.5*virs[0];
2269 fr->virdifftwelve = 0.5*virs[1];
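/* Editorial note, not part of the original source: the analytical tail terms
 * added above follow from integrating the plain r^-6 and r^-12 forms beyond
 * r0 with g(r) = 1 (the actual c6/c12 averages are applied later in
 * calc_dispcorr):
 *
 *     energy:  -integral_{r0}^{inf} 4*pi*r^2 * r^-6  dr = -4*pi/(3*r0^3)
 *               integral_{r0}^{inf} 4*pi*r^2 * r^-12 dr =  4*pi/(9*r0^9)
 *     virial:   integral_{r0}^{inf} 4*pi*r^2 * (r*dV/dr) dr
 *                 =  8*pi/r0^3      for the dispersion term
 *                 = -16*pi/(3*r0^9) for the repulsion term
 *
 * and the potential-shift contribution inside the cut-off is just the shift
 * constant times the sphere volume, e.g. (1/rc^12)*(4*pi*rc^3/3)
 * = 4*pi/(3*rc^9) for the repulsive part. The 0.5 applied to virs[] reflects
 * the GROMACS virial convention noted in the comment above.
 */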
2273 void calc_dispcorr(t_inputrec *ir, t_forcerec *fr,
2274 matrix box, real lambda, tensor pres, tensor virial,
2275 real *prescorr, real *enercorr, real *dvdlcorr)
2277 gmx_bool bCorrAll, bCorrPres;
2278 real dvdlambda, invvol, dens, ninter, avcsix, avctwelve, enerdiff, svir = 0, spres = 0;
2279 int m;
2281 *prescorr = 0;
2282 *enercorr = 0;
2283 *dvdlcorr = 0;
2285 clear_mat(virial);
2286 clear_mat(pres);
2288 if (ir->eDispCorr != edispcNO)
2290 bCorrAll = (ir->eDispCorr == edispcAllEner ||
2291 ir->eDispCorr == edispcAllEnerPres);
2292 bCorrPres = (ir->eDispCorr == edispcEnerPres ||
2293 ir->eDispCorr == edispcAllEnerPres);
2295 invvol = 1/det(box);
2296 if (fr->n_tpi)
2298 /* Only correct for the interactions with the inserted molecule */
2299 dens = (fr->numAtomsForDispersionCorrection - fr->n_tpi)*invvol;
2300 ninter = fr->n_tpi;
2302 else
2304 dens = fr->numAtomsForDispersionCorrection*invvol;
2305 ninter = 0.5*fr->numAtomsForDispersionCorrection;
2308 if (ir->efep == efepNO)
2310 avcsix = fr->avcsix[0];
2311 avctwelve = fr->avctwelve[0];
2313 else
2315 avcsix = (1 - lambda)*fr->avcsix[0] + lambda*fr->avcsix[1];
2316 avctwelve = (1 - lambda)*fr->avctwelve[0] + lambda*fr->avctwelve[1];
2319 enerdiff = ninter*(dens*fr->enerdiffsix - fr->enershiftsix);
2320 *enercorr += avcsix*enerdiff;
2321 dvdlambda = 0.0;
2322 if (ir->efep != efepNO)
2324 dvdlambda += (fr->avcsix[1] - fr->avcsix[0])*enerdiff;
2326 if (bCorrAll)
2328 enerdiff = ninter*(dens*fr->enerdifftwelve - fr->enershifttwelve);
2329 *enercorr += avctwelve*enerdiff;
2330 if (fr->efep != efepNO)
2332 dvdlambda += (fr->avctwelve[1] - fr->avctwelve[0])*enerdiff;
2336 if (bCorrPres)
2338 svir = ninter*dens*avcsix*fr->virdiffsix/3.0;
2339 if (ir->eDispCorr == edispcAllEnerPres)
2341 svir += ninter*dens*avctwelve*fr->virdifftwelve/3.0;
2343 /* The factor 2 is because of the Gromacs virial definition */
2344 spres = -2.0*invvol*svir*PRESFAC;
2346 for (m = 0; m < DIM; m++)
2348 virial[m][m] += svir;
2349 pres[m][m] += spres;
2351 *prescorr += spres;
2354 /* We can't currently control when this prints; for now, just print when debugging */
2355 if (debug)
2357 if (bCorrAll)
2359 fprintf(debug, "Long Range LJ corr.: <C6> %10.4e, <C12> %10.4e\n",
2360 avcsix, avctwelve);
2362 if (bCorrPres)
2364 fprintf(debug,
2365 "Long Range LJ corr.: Epot %10g, Pres: %10g, Vir: %10g\n",
2366 *enercorr, spres, svir);
2368 else
2370 fprintf(debug, "Long Range LJ corr.: Epot %10g\n", *enercorr);
2374 if (fr->efep != efepNO)
2376 *dvdlcorr += dvdlambda;
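/* Editorial illustration, not part of the original source: for a plain
 * cut-off (no potential shift, no free-energy coupling, no TPI) the energy
 * correction assembled above reduces to the familiar tail formula. A minimal
 * sketch with hypothetical names, assuming <C6> is given in GROMACS units:
 */
static double exampleLjTailEnergyCorrection(double numAtoms, double volume,
                                            double avgC6, double rvdw)
{
    /* ninter*dens*enerdiffsix with ninter = N/2, dens = N/V and
     * enerdiffsix = -4*pi/(3*rvdw^3) gives -(2*pi/3) * N^2/V * <C6> / rvdw^3. */
    double dens = numAtoms/volume;
    return -2.0*M_PI/3.0*numAtoms*dens*avgC6/(rvdw*rvdw*rvdw);
}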
2381 static void low_do_pbc_mtop(FILE *fplog, int ePBC, const matrix box,
2382 const gmx_mtop_t *mtop, rvec x[],
2383 gmx_bool bFirst)
2385 t_graph *graph;
2386 int mb, as, mol;
2387 gmx_molblock_t *molb;
2389 if (bFirst && fplog)
2391 fprintf(fplog, "Removing pbc first time\n");
2394 snew(graph, 1);
2395 as = 0;
2396 for (mb = 0; mb < mtop->nmolblock; mb++)
2398 molb = &mtop->molblock[mb];
2399 if (molb->natoms_mol == 1 ||
2400 (!bFirst && mtop->moltype[molb->type].cgs.nr == 1))
2402 /* Just one atom or charge group in the molecule, no PBC required */
2403 as += molb->nmol*molb->natoms_mol;
2405 else
2407 /* Pass nullptr instead of fplog to avoid graph prints for each molecule type */
2408 mk_graph_ilist(nullptr, mtop->moltype[molb->type].ilist,
2409 0, molb->natoms_mol, FALSE, FALSE, graph);
2411 for (mol = 0; mol < molb->nmol; mol++)
2413 mk_mshift(fplog, graph, ePBC, box, x+as);
2415 shift_self(graph, box, x+as);
2416 /* The molecule is whole now.
2417 * We don't need the second mk_mshift call as in do_pbc_first,
2418 * since we no longer need this graph.
2421 as += molb->natoms_mol;
2423 done_graph(graph);
2426 sfree(graph);
2429 void do_pbc_first_mtop(FILE *fplog, int ePBC, const matrix box,
2430 const gmx_mtop_t *mtop, rvec x[])
2432 low_do_pbc_mtop(fplog, ePBC, box, mtop, x, TRUE);
2435 void do_pbc_mtop(FILE *fplog, int ePBC, const matrix box,
2436 const gmx_mtop_t *mtop, rvec x[])
2438 low_do_pbc_mtop(fplog, ePBC, box, mtop, x, FALSE);
2441 void put_atoms_in_box_omp(int ePBC, const matrix box, int natoms, rvec x[])
2443 int t, nth;
2444 nth = gmx_omp_nthreads_get(emntDefault);
2446 #pragma omp parallel for num_threads(nth) schedule(static)
2447 for (t = 0; t < nth; t++)
2451 int offset, len;
2453 offset = (natoms*t )/nth;
2454 len = (natoms*(t + 1))/nth - offset;
2455 put_atoms_in_box(ePBC, box, len, x + offset);
2457 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
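/* Editorial note, not part of the original source: the static partitioning
 * above gives each of the nth threads one contiguous chunk whose lengths
 * differ by at most one atom. For example, natoms = 10 and nth = 4 yields
 * (offset, len) pairs (0,2), (2,3), (5,2) and (7,3), covering every atom
 * exactly once. */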
2461 // TODO This can be cleaned up a lot, and move back to runner.cpp
2462 void finish_run(FILE *fplog, const gmx::MDLogger &mdlog, t_commrec *cr,
2463 const t_inputrec *inputrec,
2464 t_nrnb nrnb[], gmx_wallcycle_t wcycle,
2465 gmx_walltime_accounting_t walltime_accounting,
2466 nonbonded_verlet_t *nbv,
2467 gmx_bool bWriteStat)
2469 t_nrnb *nrnb_tot = nullptr;
2470 double delta_t = 0;
2471 double nbfs = 0, mflop = 0;
2472 double elapsed_time,
2473 elapsed_time_over_all_ranks,
2474 elapsed_time_over_all_threads,
2475 elapsed_time_over_all_threads_over_all_ranks;
2476 /* Control whether it is valid to print a report. Only the
2477 simulation master may print, but it should not do so if the run
2478 terminated e.g. before a scheduled reset step. This is
2479 complicated by the fact that PME ranks are unaware of the
2480 reason why they were sent a pmerecvqxFINISH. To avoid
2481 communication deadlocks, we always do the communication for the
2482 report, even if we've decided not to write the report, because
2483 how long it takes to finish the run is not important when we've
2484 decided not to report on the simulation performance. */
2485 bool printReport = SIMMASTER(cr);
2487 if (!walltime_accounting_get_valid_finish(walltime_accounting))
2489 GMX_LOG(mdlog.warning).asParagraph().appendText("Simulation ended prematurely, no performance report will be written.");
2490 printReport = false;
2493 if (cr->nnodes > 1)
2495 snew(nrnb_tot, 1);
2496 #if GMX_MPI
2497 MPI_Allreduce(nrnb->n, nrnb_tot->n, eNRNB, MPI_DOUBLE, MPI_SUM,
2498 cr->mpi_comm_mysim);
2499 #endif
2501 else
2503 nrnb_tot = nrnb;
2506 elapsed_time = walltime_accounting_get_elapsed_time(walltime_accounting);
2507 elapsed_time_over_all_ranks = elapsed_time;
2508 elapsed_time_over_all_threads = walltime_accounting_get_elapsed_time_over_all_threads(walltime_accounting);
2509 elapsed_time_over_all_threads_over_all_ranks = elapsed_time_over_all_threads;
2510 #if GMX_MPI
2511 if (cr->nnodes > 1)
2513 /* reduce elapsed_time over all MPI ranks in the current simulation */
2514 MPI_Allreduce(&elapsed_time,
2515 &elapsed_time_over_all_ranks,
2516 1, MPI_DOUBLE, MPI_SUM,
2517 cr->mpi_comm_mysim);
2518 elapsed_time_over_all_ranks /= cr->nnodes;
2519 /* Reduce elapsed_time_over_all_threads over all MPI ranks in the
2520 * current simulation. */
2521 MPI_Allreduce(&elapsed_time_over_all_threads,
2522 &elapsed_time_over_all_threads_over_all_ranks,
2523 1, MPI_DOUBLE, MPI_SUM,
2524 cr->mpi_comm_mysim);
2526 #endif
2528 if (printReport)
2530 print_flop(fplog, nrnb_tot, &nbfs, &mflop);
2532 if (cr->nnodes > 1)
2534 sfree(nrnb_tot);
2537 if ((cr->duty & DUTY_PP) && DOMAINDECOMP(cr))
2539 print_dd_statistics(cr, inputrec, fplog);
2542 /* TODO Move the responsibility for any scaling by thread counts
2543 * to the code that handled the thread region, so that there's a
2544 * mechanism to keep cycle counting working during the transition
2545 * to task parallelism. */
2546 int nthreads_pp = gmx_omp_nthreads_get(emntNonbonded);
2547 int nthreads_pme = gmx_omp_nthreads_get(emntPME);
2548 wallcycle_scale_by_num_threads(wcycle, cr->duty == DUTY_PME, nthreads_pp, nthreads_pme);
2549 auto cycle_sum(wallcycle_sum(cr, wcycle));
2551 if (printReport)
2553 struct gmx_wallclock_gpu_t* gputimes = use_GPU(nbv) ? nbnxn_gpu_get_timings(nbv->gpu_nbv) : nullptr;
2555 wallcycle_print(fplog, mdlog, cr->nnodes, cr->npmenodes, nthreads_pp, nthreads_pme,
2556 elapsed_time_over_all_ranks,
2557 wcycle, cycle_sum, gputimes);
2559 if (EI_DYNAMICS(inputrec->eI))
2561 delta_t = inputrec->delta_t;
2564 if (fplog)
2566 print_perf(fplog, elapsed_time_over_all_threads_over_all_ranks,
2567 elapsed_time_over_all_ranks,
2568 walltime_accounting_get_nsteps_done(walltime_accounting),
2569 delta_t, nbfs, mflop);
2571 if (bWriteStat)
2573 print_perf(stderr, elapsed_time_over_all_threads_over_all_ranks,
2574 elapsed_time_over_all_ranks,
2575 walltime_accounting_get_nsteps_done(walltime_accounting),
2576 delta_t, nbfs, mflop);
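/* Editorial note, not part of the original source: as reduced above,
 * elapsed_time_over_all_ranks is the per-rank wall time averaged over ranks
 * (the MPI_Allreduce sum divided by cr->nnodes), while the "over all threads"
 * quantities are plain sums, so print_perf() is given both an aggregate
 * core-time figure and an averaged wall-clock figure. */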
2581 extern void initialize_lambdas(FILE *fplog, t_inputrec *ir, int *fep_state, gmx::ArrayRef<real> lambda, double *lam0)
2583 /* this function works, but could probably use a logic rewrite to keep all the different
2584 types of efep straight. */
2586 if ((ir->efep == efepNO) && (ir->bSimTemp == FALSE))
2588 return;
2591 t_lambda *fep = ir->fepvals;
2592 *fep_state = fep->init_fep_state; /* this might overwrite the value read from
2593 a checkpoint if one is set -- a kludge is in
2594 place for now to prevent this. */
2596 for (int i = 0; i < efptNR; i++)
2598 /* overwrite lambda state with init_lambda for now for backwards compatibility */
2599 if (fep->init_lambda >= 0) /* if it's -1, it was never initialized */
2601 lambda[i] = fep->init_lambda;
2602 if (lam0)
2604 lam0[i] = lambda[i];
2607 else
2609 lambda[i] = fep->all_lambda[i][*fep_state];
2610 if (lam0)
2612 lam0[i] = lambda[i];
2616 if (ir->bSimTemp)
2618 /* need to rescale control temperatures to match current state */
2619 for (int i = 0; i < ir->opts.ngtc; i++)
2621 if (ir->opts.ref_t[i] > 0)
2623 ir->opts.ref_t[i] = ir->simtempvals->temperatures[*fep_state];
2628 /* Send to the log the information on the current lambdas */
2629 if (fplog != nullptr)
2631 fprintf(fplog, "Initial vector of lambda components:[ ");
2632 for (const auto &l : lambda)
2634 fprintf(fplog, "%10.4f ", l);
2636 fprintf(fplog, "]\n");
2638 return;
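/* Editorial note, not part of the original source: as the logic above reads,
 * a non-negative init_lambda overrides the lambda vector and sets every
 * component to that single value; otherwise component i is taken from column
 * *fep_state of all_lambda, e.g. (hypothetical input) with init-lambda unset
 * and init-fep-state = 2, lambda[i] becomes fep->all_lambda[i][2]. */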
2642 void init_md(FILE *fplog,
2643 t_commrec *cr, gmx::IMDOutputProvider *outputProvider,
2644 t_inputrec *ir, const gmx_output_env_t *oenv,
2645 const MdrunOptions &mdrunOptions,
2646 double *t, double *t0,
2647 t_state *globalState, double *lam0,
2648 t_nrnb *nrnb, gmx_mtop_t *mtop,
2649 gmx_update_t **upd,
2650 int nfile, const t_filenm fnm[],
2651 gmx_mdoutf_t *outf, t_mdebin **mdebin,
2652 tensor force_vir, tensor shake_vir, rvec mu_tot,
2653 gmx_bool *bSimAnn, t_vcm **vcm,
2654 gmx_wallcycle_t wcycle)
2656 int i;
2658 /* Initial values */
2659 *t = *t0 = ir->init_t;
2661 *bSimAnn = FALSE;
2662 for (i = 0; i < ir->opts.ngtc; i++)
2664 /* set bSimAnn if any group is being annealed */
2665 if (ir->opts.annealing[i] != eannNO)
2667 *bSimAnn = TRUE;
2671 /* Initialize lambda variables */
2672 /* TODO: Clean up initialization of fep_state and lambda in t_state.
2673 * We currently need to call initialize_lambdas on non-master ranks
2674 * to initialize lam0.
2676 if (MASTER(cr))
2678 initialize_lambdas(fplog, ir, &globalState->fep_state, globalState->lambda, lam0);
2680 else
2682 int tmpFepState;
2683 std::array<real, efptNR> tmpLambda;
2684 initialize_lambdas(fplog, ir, &tmpFepState, tmpLambda, lam0);
2687 // TODO upd is never NULL in practice, but the analysers don't know that
2688 if (upd)
2690 *upd = init_update(ir);
2692 if (*bSimAnn)
2694 update_annealing_target_temp(ir, ir->init_t, upd ? *upd : nullptr);
2697 if (vcm != nullptr)
2699 *vcm = init_vcm(fplog, &mtop->groups, ir);
2702 if (EI_DYNAMICS(ir->eI) && !mdrunOptions.continuationOptions.appendFiles)
2704 if (ir->etc == etcBERENDSEN)
2706 please_cite(fplog, "Berendsen84a");
2708 if (ir->etc == etcVRESCALE)
2710 please_cite(fplog, "Bussi2007a");
2712 if (ir->eI == eiSD1)
2714 please_cite(fplog, "Goga2012");
2717 init_nrnb(nrnb);
2719 if (nfile != -1)
2721 *outf = init_mdoutf(fplog, nfile, fnm, mdrunOptions, cr, outputProvider, ir, mtop, oenv, wcycle);
2723 *mdebin = init_mdebin(mdrunOptions.continuationOptions.appendFiles ? nullptr : mdoutf_get_fp_ene(*outf),
2724 mtop, ir, mdoutf_get_fp_dhdl(*outf));
2727 /* Initialize variables */
2728 clear_mat(force_vir);
2729 clear_mat(shake_vir);
2730 clear_rvec(mu_tot);